コード例 #1
0
 def getDatasetMetaData(self,datasetName):
     """Fetch the metadata of a dataset through the DQ2 API.

     Every attribute reported by ``listMetaDataAttributes()`` is requested
     and the ``state`` entry is replaced by one of the strings ``'closed'``,
     ``'open'`` or ``'unknown'``.

     :param datasetName: name of the dataset to look up
     :return: ``(self.SC_SUCCEEDED, metadata)`` on success, otherwise
              ``(code from self.checkError, error message)``
     """
     # logger labelled with the method name and the dataset
     label = '{0} datasetName={1}'.format('getDatasetMetaData',datasetName)
     tmpLog = MsgWrapper(logger,label)
     try:
         client = DQ2()
         # ask for all known attributes
         attrNames = client.listMetaDataAttributes()
         metaData = client.getMetaDataAttribute(datasetName,attrNames)
         # translate the dataset state into a plain string
         rawState = metaData['state']
         if rawState in [DatasetState.CLOSED,DatasetState.FROZEN]:
             stateStr = 'closed'
         elif rawState == DatasetState.OPEN:
             stateStr = 'open'
         else:
             stateStr = 'unknown'
         metaData['state'] = stateStr
         tmpLog.debug(str(metaData))
         return self.SC_SUCCEEDED,metaData
     except:
         excType,excValue = sys.exc_info()[:2]
         detail = 'failed with {0} {1}'.format(excType.__name__,excValue)
         tmpLog.error(detail)
         retCode = self.checkError(excType)
         return retCode,'{0}.{1} {2}'.format(self.__class__.__name__,label,detail)
コード例 #2
0
ファイル: GenWatchDog.py プロジェクト: PanDAWMS/panda-jedi
 def doAction(self):
     """Log a start and a done marker; no other action is performed.

     :return: ``self.SC_SUCCEEDED``
     """
     log = MsgWrapper(logger)
     # nothing happens between the two markers
     for marker in ('start', 'done'):
         log.debug(marker)
     return self.SC_SUCCEEDED
コード例 #3
0
 def doAction(self):
     """Run the watchdog actions in sequence and log any failure.

     The actions are priority boost, reassignment, throttling, the
     high-priority pending handler (called once per
     ``(minPriority, timeoutVal)`` pair) and setting scout job data
     without scouts.  An exception raised by any of them is logged with
     its traceback and the remaining actions are skipped.

     :return: ``self.SC_SUCCEEDED`` in all cases, including after a
              logged failure
     """
     # create the logger before the try block so that the exception
     # handler and the final 'done' message can always use it
     tmpLog = MsgWrapper(logger)
     try:
         tmpLog.debug('start')
         # action for priority boost
         self.doActionForPriorityBoost(tmpLog)
         # action for reassign
         self.doActionForReassgin(tmpLog)
         # action for throttled
         self.doActionForThrottled(tmpLog)
         # action for high prio pending
         for minPriority, timeoutVal in [
             (950, 10),
             (900, 30),
         ]:
             self.doActionForHighPrioPending(tmpLog, minPriority,
                                             timeoutVal)
         # action to set scout job data w/o scouts
         self.doActionToSetScoutJobData(tmpLog)
     except Exception:
         # SystemExit/KeyboardInterrupt are deliberately not swallowed
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0}:{1} {2}'.format(
             errtype.__name__, errvalue, traceback.format_exc()))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
コード例 #4
0
 def start(self):
     """Start the base classes and run the task-refinement loop forever.

     In each cycle, for every combination of ``self.vos`` and
     ``self.prodSourceLabels`` the list of tasks to refine is fetched and
     handed to ``jedi_config.taskrefine.nWorkers`` ``TaskRefinerThread``
     workers, which are joined before moving on.  Errors are logged and
     the loop continues.  The cycle is padded with a sleep so that it
     lasts at least ``jedi_config.taskrefine.loopCycle`` seconds, followed
     by ``self.randomSleep()``.

     This method does not return.
     """
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         # make the logger before the try block so that the exception
         # handler below can always use it
         tmpLog = MsgWrapper(logger)
         try:
             tmpLog.debug('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # get the list of tasks to refine
                     tmpList = self.taskBufferIF.getTasksToRefine_JEDI(vo,prodSourceLabel)
                     if tmpList is None:
                         # failed
                         tmpLog.error('failed to get the list of tasks to refine')
                         continue
                     tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                     # put to a locked list
                     taskList = ListWithLock(tmpList)
                     # make thread pool
                     threadPool = ThreadPool()
                     # get work queue mapper
                     workQueueMapper = self.taskBufferIF.getWorkQueueMap()
                     # make workers
                     nWorker = jedi_config.taskrefine.nWorkers
                     for _ in range(nWorker):
                         thr = TaskRefinerThread(taskList,threadPool,
                                                 self.taskBufferIF,
                                                 self.ddmIF,
                                                 self,workQueueMapper)
                         thr.start()
                     # join
                     threadPool.join()
         except Exception:
             # SystemExit/KeyboardInterrupt are deliberately not swallowed
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
         # sleep if needed
         loopCycle = jedi_config.taskrefine.loopCycle
         timeDelta = datetime.datetime.utcnow() - startTime
         # total_seconds() covers the whole elapsed time; .seconds alone
         # drops the days component of the timedelta
         sleepPeriod = loopCycle - timeDelta.total_seconds()
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
         # randomize cycle
         self.randomSleep()
コード例 #5
0
 def checkDatasetConsistency(self,location,datasetName):
     """Run the DQ2 consistency check for a dataset at a location.

     :param location: location passed to ``DQ2.checkDatasetConsistency``
     :param datasetName: name of the dataset to check
     :return: ``(self.SC_SUCCEEDED, result of the DQ2 call)`` on success,
              otherwise ``(code from self.checkError, error message)``
     """
     # make logger
     methodName = 'checkDatasetConsistency'
     methodName = '{0} datasetName={1} location={2}'.format(methodName,datasetName,location)
     tmpLog = MsgWrapper(logger,methodName)
     try:
         # get DQ2 API
         dq2=DQ2()
         # check
         tmpRet = dq2.checkDatasetConsistency(location,datasetName)
         tmpLog.debug(str(tmpRet))
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         errMsg = 'failed with {0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         errCode = self.checkError(errtype)
         return errCode,'{0}.{1} {2}'.format(self.__class__.__name__,methodName,errMsg)
     # return a (code, value) pair on success too, matching the error path
     return self.SC_SUCCEEDED,tmpRet
コード例 #6
0
ファイル: AtlasProdWatchDog.py プロジェクト: RRCKI/panda-jedi
 def doAction(self):
     """Run the watchdog actions in sequence and log any failure.

     The actions are priority boost, reassignment, throttling, the
     high-priority pending handler (called once per
     ``(minPriority, timeoutVal)`` pair) and setting scout job data
     without scouts.  An exception raised by any of them is logged with
     its traceback and the remaining actions are skipped.

     :return: ``self.SC_SUCCEEDED`` in all cases, including after a
              logged failure
     """
     # create the logger before the try block so that the exception
     # handler and the final 'done' message can always use it
     tmpLog = MsgWrapper(logger)
     try:
         tmpLog.debug('start')
         # action for priority boost
         self.doActionForPriorityBoost(tmpLog)
         # action for reassign
         self.doActionForReassgin(tmpLog)
         # action for throttled
         self.doActionForThrottled(tmpLog)
         # action for high prio pending
         for minPriority,timeoutVal in [(950,10),
                                        (900,30),
                                        ]:
             self.doActionForHighPrioPending(tmpLog,minPriority,timeoutVal)
         # action to set scout job data w/o scouts
         self.doActionToSetScoutJobData(tmpLog)
     except Exception:
         # SystemExit/KeyboardInterrupt are deliberately not swallowed
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                       traceback.format_exc()))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
コード例 #7
0
 def freezeDataset(self,datasetName,ignoreUnknown=False):
     """Freeze a dataset through the DQ2 API.

     A dataset that is already frozen counts as success.  An unknown
     dataset counts as success only when ``ignoreUnknown`` is true.

     :param datasetName: name of the dataset to freeze
     :param ignoreUnknown: treat ``DQUnknownDatasetException`` as success
     :return: ``(self.SC_SUCCEEDED, True)`` on success, otherwise
              ``(code from self.checkError, error message)``
     """
     methodName = 'freezeDataset'
     methodName = '{0} datasetName={1}'.format(methodName,datasetName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     # (type, value) of the failure; it must be captured inside the
     # handlers because sys.exc_info() is cleared once the except clause
     # has finished on Python 3
     errInfo = None
     try:
         # get DQ2 API
         dq2=DQ2()
         # freeze
         dq2.freezeDataset(datasetName)
     except DQFrozenDatasetException:
         # already frozen
         pass
     except DQUnknownDatasetException:
         if not ignoreUnknown:
             errInfo = sys.exc_info()[:2]
     except Exception:
         errInfo = sys.exc_info()[:2]
     if errInfo is None:
         tmpLog.info('done')
         return self.SC_SUCCEEDED,True
     errtype,errvalue = errInfo
     errCode = self.checkError(errtype)
     errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
     tmpLog.error(errMsg)
     return errCode,'{0} : {1}'.format(methodName,errMsg)
コード例 #8
0
 def registerDatasetSubscription(self,datasetName,location,activity=None,ignoreUnknown=False):
     """Register a subscription of a dataset to a location through DQ2.

     An already existing subscription counts as success.  An unknown
     dataset counts as success only when ``ignoreUnknown`` is true.

     :param datasetName: name of the dataset to subscribe
     :param location: destination passed to the DQ2 call
     :param activity: forwarded to DQ2 as the ``activity`` keyword
     :param ignoreUnknown: treat ``DQUnknownDatasetException`` as success
     :return: ``(self.SC_SUCCEEDED, True)`` on success, otherwise
              ``(code from self.checkError, error message)``
     """
     methodName = 'registerDatasetSubscription'
     methodName = '{0} datasetName={1} location={2}'.format(methodName,datasetName,location)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     # (type, value) of the failure; it must be captured inside the
     # handlers because sys.exc_info() is cleared once the except clause
     # has finished on Python 3
     errInfo = None
     try:
         # get DQ2 API
         dq2 = DQ2()
         # call
         dq2.registerDatasetSubscription(datasetName,location,activity=activity)
     except DQSubscriptionExistsException:
         # subscription is already there
         pass
     except DQUnknownDatasetException:
         if not ignoreUnknown:
             errInfo = sys.exc_info()[:2]
     except Exception:
         errInfo = sys.exc_info()[:2]
     if errInfo is not None:
         errtype,errvalue = errInfo
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,True
コード例 #9
0
 def doAction(self):
     """Run the watchdog actions in sequence and log any failure.

     The actions are priority boost, reassignment, throttling, the
     high-priority pending handler (called once per
     ``(minPriority, timeoutVal)`` pair), setting scout job data without
     scouts, throttling jobs in paused tasks and a ``JumboWatchDog`` run
     for ``'atlas'`` / ``'managed'``.  An exception raised by any of them
     is logged with its traceback and the remaining actions are skipped.

     :return: ``self.SC_SUCCEEDED`` in all cases, including after a
              logged failure
     """
     # create the logger before the try block so that the exception
     # handler and the final 'done' message can always use it
     tmpLog = MsgWrapper(logger)
     try:
         tmpLog.debug('start')
         # action for priority boost
         self.doActionForPriorityBoost(tmpLog)
         # action for reassign
         self.doActionForReassgin(tmpLog)
         # action for throttled
         self.doActionForThrottled(tmpLog)
         # action for high prio pending
         for minPriority,timeoutVal in [(950,10),
                                        (900,30),
                                        ]:
             self.doActionForHighPrioPending(tmpLog,minPriority,timeoutVal)
         # action to set scout job data w/o scouts
         self.doActionToSetScoutJobData(tmpLog)
         # action to throttle jobs in paused tasks
         self.doActionToThrottleJobInPausedTasks(tmpLog)
         # action for jumbo
         jumbo = JumboWatchDog(self.taskBufferIF, self.ddmIF, tmpLog, 'atlas', 'managed')
         jumbo.run()
     except Exception:
         # SystemExit/KeyboardInterrupt are deliberately not swallowed
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                       traceback.format_exc()))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
コード例 #10
0
 def start(self):
     """Start the base classes and run the task-refinement loop forever.

     In each cycle, for every combination of ``self.vos`` and
     ``self.prodSourceLabels`` the list of tasks to refine is fetched and
     handed to ``jedi_config.taskrefine.nWorkers`` ``TaskRefinerThread``
     workers, which are joined before moving on.  Errors are logged and
     the loop continues.  The cycle is padded with a sleep so that it
     lasts at least ``jedi_config.taskrefine.loopCycle`` seconds, followed
     by ``self.randomSleep()``.

     This method does not return.
     """
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         # make the logger before the try block so that the exception
         # handler below can always use it
         tmpLog = MsgWrapper(logger)
         try:
             tmpLog.debug('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # get the list of tasks to refine
                     tmpList = self.taskBufferIF.getTasksToRefine_JEDI(vo,prodSourceLabel)
                     if tmpList is None:
                         # failed
                         tmpLog.error('failed to get the list of tasks to refine')
                         continue
                     tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                     # put to a locked list
                     taskList = ListWithLock(tmpList)
                     # make thread pool
                     threadPool = ThreadPool()
                     # get work queue mapper
                     workQueueMapper = self.taskBufferIF.getWorkQueueMap()
                     # make workers
                     nWorker = jedi_config.taskrefine.nWorkers
                     for _ in range(nWorker):
                         thr = TaskRefinerThread(taskList,threadPool,
                                                 self.taskBufferIF,
                                                 self.ddmIF,
                                                 self,workQueueMapper)
                         thr.start()
                     # join
                     threadPool.join()
         except Exception:
             # SystemExit/KeyboardInterrupt are deliberately not swallowed
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
         # sleep if needed
         loopCycle = jedi_config.taskrefine.loopCycle
         timeDelta = datetime.datetime.utcnow() - startTime
         # total_seconds() covers the whole elapsed time; .seconds alone
         # drops the days component of the timedelta
         sleepPeriod = loopCycle - timeDelta.total_seconds()
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
         # randomize cycle
         self.randomSleep()
コード例 #11
0
 def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                   resourceType):
     """Decide whether a work queue is throttled from its ``throttled`` flag.

     :param vo: used only in the log message
     :param prodSourceLabel: used only in the log message
     :param cloudName: used only in the log message
     :param workQueue: object providing ``queue_name`` and ``throttled``
     :param resourceType: not used by this implementation
     :return: ``self.retThrottled`` when ``workQueue.throttled`` is truthy,
              otherwise ``self.retUnThrottled``
     """
     log = MsgWrapper(logger)
     startMsg = 'start vo={0} label={1} cloud={2} workQueue={3}'.format(
         vo, prodSourceLabel, cloudName, workQueue.queue_name)
     log.debug(startMsg)
     if workQueue.throttled:
         log.debug("  done : SKIP")
         return self.retThrottled
     # the queue is explicitly marked as not throttled
     log.debug("  done : unthrottled since throttled is False")
     return self.retUnThrottled
コード例 #12
0
 def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                   jobStat):
     """Decide whether a work queue is throttled from its ``queue_share``.

     :param vo: used only in the log message
     :param prodSourceLabel: used only in the log message
     :param cloudName: used only in the log message
     :param workQueue: object providing ``queue_name`` and ``queue_share``
     :param jobStat: not used by this implementation
     :return: ``self.retUnThrottled`` when ``workQueue.queue_share`` is
              ``None``, otherwise ``self.retThrottled``
     """
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start vo={0} label={1} cloud={2} workQueue={3}'.format(
         vo, prodSourceLabel, cloudName, workQueue.queue_name))
     # check if unthrottled; identity test so a custom __eq__ cannot interfere
     if workQueue.queue_share is None:
         tmpLog.debug("  done : unthrottled since share=None")
         return self.retUnThrottled
     tmpLog.debug("  done : SKIP")
     return self.retThrottled
コード例 #13
0
ファイル: GenWatchDog.py プロジェクト: PanDAWMS/panda-jedi
 def doAction(self):
     """Log a start and a done marker; no other action is performed.

     :return: ``self.SC_SUCCEEDED``
     """
     log = MsgWrapper(logger)
     # nothing happens between the two markers
     for marker in ('start', 'done'):
         log.debug(marker)
     return self.SC_SUCCEEDED
コード例 #14
0
 def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resourceType):
     """Decide whether a work queue is throttled from its ``queue_share``.

     :param vo: used only in the log message
     :param prodSourceLabel: used only in the log message
     :param cloudName: used only in the log message
     :param workQueue: object providing ``queue_name`` and ``queue_share``
     :param resourceType: not used by this implementation
     :return: ``self.retUnThrottled`` when ``workQueue.queue_share`` is
              ``None``, otherwise ``self.retThrottled``
     """
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start vo={0} label={1} cloud={2} workQueue={3}'.format(vo,prodSourceLabel,cloudName,
                                                                          workQueue.queue_name))
     # check if unthrottled; identity test so a custom __eq__ cannot interfere
     if workQueue.queue_share is None:
         tmpLog.debug("  done : unthrottled since share=None")
         return self.retUnThrottled
     tmpLog.debug("  done : SKIP")
     return self.retThrottled
コード例 #15
0
 def doAction(self):
     """Run ``do_for_data_locality`` and log any failure with its traceback.

     :return: ``self.SC_SUCCEEDED`` in all cases, including after a
              logged failure
     """
     # create the logger before the try block so that the exception
     # handler and the final 'done' message can always use it
     origTmpLog = MsgWrapper(logger)
     try:
         origTmpLog.debug('start')
         # make tasks pending under certain conditions
         self.do_for_data_locality()
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         err_str = traceback.format_exc()
         origTmpLog.error('failed with {0} {1} ; {2}'.format(
             errtype, errvalue, err_str))
     # return
     origTmpLog.debug('done')
     return self.SC_SUCCEEDED
コード例 #16
0
 def doAction(self):
     """Clean up and then update data locality, logging any failure.

     ``doCleanDataLocality`` runs first, then ``doUpdateDataLocality``; an
     exception from the first skips the second.

     :return: ``self.SC_SUCCEEDED`` in all cases, including after a
              logged failure
     """
     # create the logger before the try block so that the exception
     # handler and the final 'done' message can always use it
     origTmpLog = MsgWrapper(logger)
     try:
         origTmpLog.debug('start')
         # clean up data locality
         self.doCleanDataLocality()
         # update data locality
         self.doUpdateDataLocality()
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         origTmpLog.error('failed with {0} {1}'.format(errtype, errvalue))
     # return
     origTmpLog.debug('done')
     return self.SC_SUCCEEDED
コード例 #17
0
 def doAction(self):
     """Run the priority-boost and reassign actions, logging any failure.

     An exception from the first action skips the second.

     :return: ``self.SC_SUCCEEDED`` in all cases, including after a
              logged failure
     """
     # create the logger before the try block so that the exception
     # handler and the final 'done' message can always use it
     tmpLog = MsgWrapper(logger)
     try:
         tmpLog.debug('start')
         # action for priority boost
         self.doActionForPriorityBoost(tmpLog)
         # action for reassign
         self.doActionForReassgin(tmpLog)
     except Exception:
         # SystemExit/KeyboardInterrupt are deliberately not swallowed
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1}'.format(errtype,errvalue))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
コード例 #18
0
 def doAction(self):
     """Run the watchdog actions in sequence and log any failure.

     The actions are, in order: ``doForWaitingJobs``, ``doForPreStaging``,
     ``doForPriorityMassage`` and ``doForRedoStalledJobs``.  An exception
     raised by one of them is logged and the remaining ones are skipped.

     :return: ``self.SC_SUCCEEDED`` in all cases, including after a
              logged failure
     """
     # create the logger before the try block so that the exception
     # handler and the final 'done' message can always use it
     origTmpLog = MsgWrapper(logger)
     try:
         origTmpLog.debug('start')
         # handle waiting jobs
         self.doForWaitingJobs()
         # throttle tasks if so many prestaging requests
         self.doForPreStaging()
         # priority massage
         self.doForPriorityMassage()
         # redo stalled analysis jobs
         self.doForRedoStalledJobs()
         # throttle WAN data access (currently disabled)
         #self.doForThrottleWAN()
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         origTmpLog.error('failed with {0} {1}'.format(errtype,errvalue))
     # return
     origTmpLog.debug('done')
     return self.SC_SUCCEEDED
コード例 #19
0
 def finger(self,userName):
     """Look a user up via ``infoClient().finger`` after cleaning the DN.

     :param userName: user name or DN; it is passed through ``parse_dn``
                      before the lookup
     :return: ``(self.SC_SUCCEEDED, lookup result)`` on success, otherwise
              ``(code from self.checkError, error message)``
     """
     # the label keeps the name exactly as it was passed in
     label = '{0} userName={1}'.format('finger',userName)
     tmpLog = MsgWrapper(logger,label)
     tmpLog.info('start')
     try:
         cleanName = parse_dn(userName)
         fingerInfo = infoClient().finger(cleanName)
     except:
         excType,excValue = sys.exc_info()[:2]
         retCode = self.checkError(excType)
         detail = '{0} {1}'.format(excType.__name__,excValue)
         tmpLog.error(detail)
         return retCode,'{0}:{1}'.format(label,detail)
     else:
         tmpLog.info('done')
         return self.SC_SUCCEEDED,fingerInfo
コード例 #20
0
 def setDatasetOwner(self,datasetName,userName):
     """Set the ``owner`` metadata attribute of a dataset through DQ2.

     :param datasetName: name of the dataset to update
     :param userName: user name or DN; it is passed through ``parse_dn``
                      before being stored
     :return: ``(self.SC_SUCCEEDED, True)`` on success, otherwise
              ``(code from self.checkError, error message)``
     """
     # the label keeps the name exactly as it was passed in
     label = '{0} datasetName={1} userName={2}'.format('setDatasetOwner',datasetName,userName)
     tmpLog = MsgWrapper(logger,label)
     tmpLog.info('start')
     try:
         ownerName = parse_dn(userName)
         client = DQ2()
         client.setMetaDataAttribute(datasetName,'owner',ownerName)
     except:
         excType,excValue = sys.exc_info()[:2]
         retCode = self.checkError(excType)
         detail = '{0} {1}'.format(excType.__name__,excValue)
         tmpLog.error(detail)
         return retCode,'{0} : {1}'.format(label,detail)
     else:
         tmpLog.info('done')
         return self.SC_SUCCEEDED,True
コード例 #21
0
    def doAction(self):
        """Run the watchdog actions in sequence and log any failure.

        The actions are priority boost, reassignment, throttling, the
        high-priority pending handler (called once per
        ``(minPriority, timeoutVal)`` pair), setting scout job data without
        scouts, throttling jobs in paused tasks and a ``JumboWatchDog`` run
        for ``'atlas'`` / ``'managed'``.  An exception raised by any of them
        is logged with its traceback and the remaining actions are skipped.

        :return: ``self.SC_SUCCEEDED`` in all cases, including after a
                 logged failure
        """
        # create the logger before the try block so that the exception
        # handler and the final 'done' message can always use it
        tmpLog = MsgWrapper(logger)
        try:
            tmpLog.debug('start')

            # action for priority boost
            self.doActionForPriorityBoost(tmpLog)

            # action for reassign
            self.doActionForReassign(tmpLog)

            # action for throttled
            self.doActionForThrottled(tmpLog)

            # action for high prio pending
            for minPriority, timeoutVal in [(950, 10),
                                            (900, 30),
                                            ]:
                self.doActionForHighPrioPending(tmpLog, minPriority, timeoutVal)

            # action to set scout job data w/o scouts
            self.doActionToSetScoutJobData(tmpLog)

            # action to throttle jobs in paused tasks
            self.doActionToThrottleJobInPausedTasks(tmpLog)

            # action for jumbo
            jumbo = JumboWatchDog(self.taskBufferIF, self.ddmIF, tmpLog, 'atlas', 'managed')
            jumbo.run()
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__, errvalue, traceback.format_exc()))
        # return
        tmpLog.debug('done')
        return self.SC_SUCCEEDED
コード例 #22
0
 def reassign_jobs(self, to_reassign_map):
     """Close a third of the jobs to fill for each preassigned task.

     For every entry, ``int(n_jobs_to_fill / 3)`` is passed to
     ``reassignJobsInPreassignedTask_JEDI`` together with the task ID and
     the site, and the outcome is logged at debug level.

     :param to_reassign_map: mapping of jediTaskID to a dict with the keys
                             ``'site'`` and ``'n_jobs_to_fill'``
     :return: ``None``
     """
     tmp_log = MsgWrapper(logger, 'reassign_jobs')
     for task_id, entry in to_reassign_map.items():
         target_site = entry['site']
         # only a third of the jobs to fill are closed
         n_to_close = int(entry['n_jobs_to_fill'] / 3)
         n_closed = self.taskBufferIF.reassignJobsInPreassignedTask_JEDI(
             task_id, target_site, n_to_close)
         if n_closed is None:
             message = 'jediTaskID={0} no longer ready/running or not assigned to {1} , skipped'.format(
                 task_id, target_site)
         else:
             message = 'jediTaskID={0} to {1} , closed {2} jobs'.format(
                 task_id, target_site, n_closed)
         tmp_log.debug(message)
コード例 #23
0
 def registerDatasetLocation(self,datasetName,location,lifetime=None,owner=None):
     """Register a dataset at a location through DQ2 and record its owner.

     The location is registered first; the ``owner`` replica metadata
     attribute is set afterwards.

     :param datasetName: name of the dataset to register
     :param location: location passed to both DQ2 calls
     :param lifetime: forwarded to DQ2 as the ``lifetime`` keyword
     :param owner: passed through ``parse_dn`` and stored as the replica
                   ``owner`` attribute
     :return: ``(self.SC_SUCCEEDED, True)`` on success, otherwise
              ``(code from self.checkError, error message)``
     """
     label = '{0} datasetName={1} location={2}'.format('registerDatasetLocation',datasetName,location)
     tmpLog = MsgWrapper(logger,label)
     tmpLog.info('start')
     try:
         ownerName = parse_dn(owner)
         client = DQ2()
         client.registerDatasetLocation(datasetName,location,lifetime=lifetime)
         client.setReplicaMetaDataAttribute(datasetName,location,'owner',ownerName)
     except:
         excType,excValue = sys.exc_info()[:2]
         retCode = self.checkError(excType)
         detail = '{0} {1}'.format(excType.__name__,excValue)
         tmpLog.error(detail)
         return retCode,'{0} : {1}'.format(label,detail)
     else:
         tmpLog.info('done')
         return self.SC_SUCCEEDED,True
コード例 #24
0
 def setDatasetMetadata(self,datasetName,metadataName,metadaValue):
     """Set one metadata attribute of a dataset through DQ2.

     An unknown dataset is ignored and reported as success.

     :param datasetName: name of the dataset to update
     :param metadataName: name of the attribute to set
     :param metadaValue: value to store for the attribute
     :return: ``(self.SC_SUCCEEDED, True)`` on success or when the dataset
              is unknown, otherwise
              ``(code from self.checkError, error message)``
     """
     label = '{0} datasetName={1} metadataName={2} metadaValue={3}'.format(
         'setDatasetMetadata',datasetName,metadataName,metadaValue)
     tmpLog = MsgWrapper(logger,label)
     tmpLog.info('start')
     try:
         client = DQ2()
         client.setMetaDataAttribute(datasetName,metadataName,metadaValue)
     except DQUnknownDatasetException:
         # an unknown dataset is not treated as an error
         pass
     except:
         excType,excValue = sys.exc_info()[:2]
         retCode = self.checkError(excType)
         detail = '{0} {1}'.format(excType.__name__,excValue)
         tmpLog.error(detail)
         return retCode,'{0} : {1}'.format(label,detail)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,True
コード例 #25
0
 def doAction(self):
     """Run the watchdog actions in sequence and log any failure.

     The actions are priority boost, reassignment, throttling and the
     high-priority pending handler (called once per
     ``(minPriority, timeoutVal)`` pair).  An exception raised by any of
     them is logged and the remaining actions are skipped.

     :return: ``self.SC_SUCCEEDED`` in all cases, including after a
              logged failure
     """
     # create the logger before the try block so that the exception
     # handler and the final 'done' message can always use it
     tmpLog = MsgWrapper(logger)
     try:
         tmpLog.debug('start')
         # action for priority boost
         self.doActionForPriorityBoost(tmpLog)
         # action for reassign
         self.doActionForReassgin(tmpLog)
         # action for throttled
         self.doActionForThrottled(tmpLog)
         # action for high prio pending
         for minPriority,timeoutVal in [(950,10),
                                        (900,30),
                                        ]:
             self.doActionForHighPrioPending(tmpLog,minPriority,timeoutVal)
     except Exception:
         # SystemExit/KeyboardInterrupt are deliberately not swallowed
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1}'.format(errtype,errvalue))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
コード例 #26
0
 def deleteDataset(self,datasetName,emptyOnly,ignoreUnknown=False):
     """Erase a dataset through DQ2, optionally only when it has no files.

     With ``emptyOnly`` set, the number of files is queried first and the
     dataset is erased only when that number is 0; otherwise it is kept.
     An unknown dataset counts as success only when ``ignoreUnknown`` is
     true, in which case the returned string is empty.

     :param datasetName: name of the dataset to delete
     :param emptyOnly: erase only when the dataset contains no files
     :param ignoreUnknown: treat ``DQUnknownDatasetException`` as success
     :return: ``(self.SC_SUCCEEDED, description of what was done)`` on
              success, otherwise
              ``(code from self.checkError, error message)``
     """
     methodName = 'deleteDataset'
     methodName = '{0} datasetName={1}'.format(methodName,datasetName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     # (type, value) of the failure; it must be captured inside the
     # handlers because sys.exc_info() is cleared once the except clause
     # has finished on Python 3
     errInfo = None
     retStr = ''
     nFiles = -1
     try:
         # get DQ2 API
         dq2=DQ2()
         # get the number of files
         if emptyOnly:
             nFiles = dq2.getNumberOfFiles(datasetName)
         # erase
         if not emptyOnly or nFiles == 0:
             dq2.eraseDataset(datasetName)
             retStr = 'deleted {0}'.format(datasetName)
         else:
             retStr = 'keep {0} where {1} files are available'.format(datasetName,nFiles)
     except DQUnknownDatasetException:
         if not ignoreUnknown:
             errInfo = sys.exc_info()[:2]
     except Exception:
         errInfo = sys.exc_info()[:2]
     if errInfo is None:
         tmpLog.info('done')
         return self.SC_SUCCEEDED,retStr
     errtype,errvalue = errInfo
     errCode = self.checkError(errtype)
     errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
     tmpLog.error(errMsg)
     return errCode,'{0} : {1}'.format(methodName,errMsg)
コード例 #27
0
ファイル: TaskBroker.py プロジェクト: PanDAWMS/panda-jedi
 def runImpl(self):
     """Process tasks from ``self.taskList`` in batches until it is empty.

     Each iteration takes up to 100 task items, fetches their input chunks
     one task at a time, obtains the brokerage implementation for
     ``self.vo`` / ``self.prodSourceLabel`` and runs ``doBrokerage`` on the
     collected chunks.  ``tmpStat`` carries the status between stages, so
     a failed stage causes the following stages to be skipped for that
     batch.  The method returns when ``self.taskList.get`` yields an empty
     list; an unexpected exception is logged and the loop moves on to the
     next iteration.
     """
     while True:
         try:
             # get a part of list
             nTasks = 100
             taskList = self.taskList.get(nTasks)
             totalTasks, idxTasks = self.taskList.stat()
             # no more tasks: stop this thread
             if len(taskList) == 0:
                 self.logger.debug(
                     '{0} terminating since no more items'.format(
                         self.__class__.__name__))
                 return
             # make logger
             tmpLog = MsgWrapper(self.logger)
             tmpLog.info(
                 'start TaskBrokerThread {0}/{1} for jediTaskID={2}'.format(
                     idxTasks, totalTasks, taskList))
             tmpStat = Interaction.SC_SUCCEEDED
             # get TaskSpecs
             tmpListToAssign = []
             for tmpTaskItem in taskList:
                 # one lookup per task item, passed via simTasks
                 tmpListItem = self.taskBufferIF.getTasksToBeProcessed_JEDI(
                     None,
                     None,
                     None,
                     None,
                     None,
                     simTasks=[tmpTaskItem],
                     readMinFiles=True)
                 if tmpListItem is None:
                     # failed: give up on the whole batch
                     tmpLog.error(
                         'failed to get the input chunks for jediTaskID={0}'
                         .format(tmpTaskItem))
                     tmpStat = Interaction.SC_FAILED
                     break
                 tmpListToAssign += tmpListItem
             # get impl (only when the previous stage succeeded)
             if tmpStat == Interaction.SC_SUCCEEDED:
                 tmpLog.info('getting Impl')
                 try:
                     impl = self.implFactory.getImpl(
                         self.vo, self.prodSourceLabel)
                     if impl is None:
                         # task broker is undefined
                         tmpLog.error(
                             'task broker is undefined for vo={0} sourceLabel={1}'
                             .format(self.vo, self.prodSourceLabel))
                         tmpStat = Interaction.SC_FAILED
                 except Exception:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error('getImpl failed with {0}:{1}'.format(
                         errtype.__name__, errvalue))
                     tmpStat = Interaction.SC_FAILED
             # brokerage (impl is guaranteed to be bound and not None here)
             if tmpStat == Interaction.SC_SUCCEEDED:
                 tmpLog.info('brokerage with {0} for {1} tasks '.format(
                     impl.__class__.__name__, len(tmpListToAssign)))
                 try:
                     tmpStat = impl.doBrokerage(tmpListToAssign, self.vo,
                                                self.prodSourceLabel,
                                                self.workQueue,
                                                self.resource_name)
                 except Exception:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error('doBrokerage failed with {0}:{1}'.format(
                         errtype.__name__, errvalue))
                     tmpStat = Interaction.SC_FAILED
             # log the final status of this batch
             if tmpStat != Interaction.SC_SUCCEEDED:
                 tmpLog.error('failed')
             else:
                 tmpLog.info('done')
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             # NOTE(review): this handler uses the module-level logger while
             # the rest of the method uses self.logger - confirm intended
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
コード例 #28
0
    def start(self):
        """Start the base classes and run the watchdog loop forever.

        In each cycle, for every combination of ``self.vos`` and
        ``self.prodSourceLabels``: tasks with picked files are rescued,
        pending tasks are reactivated, and the vo/label specific
        implementation returned by ``self.getImpl`` (if any) has its
        ``doAction`` called.  Errors are logged and the loop continues.
        The cycle is padded with a sleep so that it lasts at least
        ``jedi_config.watchdog.loopCycle`` seconds, followed by
        ``self.randomSleep()``.

        This method does not return.
        """
        # start base classes
        JediKnight.start(self)
        FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
        # go into main loop
        while True:
            startTime = datetime.datetime.utcnow()
            # make the logger before the try block so that the exception
            # handler below can always use it
            tmpLog = MsgWrapper(logger)
            try:
                tmpLog.info('start')
                # loop over all vos
                for vo in self.vos:
                    # loop over all sourceLabels
                    for prodSourceLabel in self.prodSourceLabels:
                        # rescue picked files
                        tmpLog.info('rescue tasks with picked files for vo={0} label={1}'.format(vo,prodSourceLabel))
                        tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI(vo,prodSourceLabel,
                                                                          jedi_config.watchdog.waitForPicked)
                        if tmpRet is None:
                            # failed
                            tmpLog.error('failed to rescue')
                        else:
                            tmpLog.info('rescued {0} tasks'.format(tmpRet))

                        # reactivate pending tasks
                        tmpLog.info('reactivate pending tasks for vo={0} label={1}'.format(vo,prodSourceLabel))
                        tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI(vo,prodSourceLabel,
                                                                               jedi_config.watchdog.waitForPending,
                                                                               jedi_config.watchdog.timeoutForPending)
                        if tmpRet is None:
                            # failed
                            tmpLog.error('failed to reactivate')
                        else:
                            tmpLog.info('reactivated {0} tasks'.format(tmpRet))
                        # vo/prodSourceLabel specific action
                        impl = self.getImpl(vo,prodSourceLabel)
                        if impl is not None:
                            tmpLog.info('special action for vo={0} label={1} with {2}'.format(vo,prodSourceLabel,impl.__class__.__name__))
                            tmpStat = impl.doAction()
                            if tmpStat !=  Interaction.SC_SUCCEEDED:
                                tmpLog.error('failed to run special acction for vo={0} label={1}'.format(vo,prodSourceLabel))
                            else:
                                tmpLog.info('done for vo={0} label={1}'.format(vo,prodSourceLabel))
                tmpLog.info('done')
            except Exception:
                # SystemExit/KeyboardInterrupt are deliberately not swallowed
                errtype,errvalue = sys.exc_info()[:2]
                tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
            # sleep if needed
            loopCycle = jedi_config.watchdog.loopCycle
            timeDelta = datetime.datetime.utcnow() - startTime
            # total_seconds() covers the whole elapsed time; .seconds alone
            # drops the days component of the timedelta
            sleepPeriod = loopCycle - timeDelta.total_seconds()
            if sleepPeriod > 0:
                time.sleep(sleepPeriod)
            # randomize cycle
            self.randomSleep()
コード例 #29
0
 def doActionForReassgin(self,gTmpLog):
     """Subscribe the output/log datasets of reassigned tasks and reactivate them.

     For every task returned by ``getTasksToReassign_JEDI`` the cloud task
     is set to ``'assigned'``, the task's output and log datasets are
     subscribed to the DDM endpoint of the site named by the cloud's
     ``'dest'`` entry, and on success the task status is restored
     (``'ready'`` when the old status was ``'assigning'`` or
     ``'exhausted'``, otherwise the old status).  A task for which any
     step fails is logged and skipped.

     :param gTmpLog: logger used for messages that are not task specific
     :return: ``None``
     """
     # get DDM I/F
     ddmIF = self.ddmIF.getInterface(self.vo)
     # get site mapper
     siteMapper = self.taskBufferIF.getSiteMapper()
     # get tasks to get reassigned
     taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)
     if taskList is None:
         # NOTE(review): assumes None signals a failed lookup, as it does for
         # getTasksToRefine_JEDI elsewhere - confirm against taskBufferIF
         gTmpLog.error('failed to get tasks to reassign')
         return
     gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
     for taskSpec in taskList:
         # per-task logger, prefixed with <jediTaskID=N>
         tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
         tmpLog.debug('start to reassign')
         # DDM backend
         ddmBackEnd = taskSpec.getDdmBackEnd()
         # update cloudtasks
         tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,taskSpec.cloud,'assigned',True)
         if tmpStat != 'SUCCEEDED':
             tmpLog.error('failed to update CloudTasks')
             continue
         # get datasets
         tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,['output','log'])
         if tmpStat != True:
             tmpLog.error('failed to get datasets')
             continue
         # check cloud
         if not siteMapper.checkCloud(taskSpec.cloud):
             tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
             continue
         # get T1
         t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
         t1Site = siteMapper.getSite(t1SiteName)
         # loop over all datasets
         isOK = True
         for datasetSpec in datasetSpecList:
             tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
             # get location
             location = siteMapper.getDdmEndpoint(t1Site.sitename,datasetSpec.storageToken)
             # make subscription
             tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,
                                                                                    ddmBackEnd))
             tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location,
                                                         activity='Production',ignoreUnknown=True,
                                                         backEnd=ddmBackEnd)
             if tmpStat != True:
                 # stop at the first dataset that cannot be subscribed
                 tmpLog.error("failed to make subscription")
                 isOK = False
                 break
         # succeeded
         if isOK:
             # activate task
             if taskSpec.oldStatus in ['assigning','exhausted']:
                 taskSpec.status = 'ready'
             else:
                 taskSpec.status = taskSpec.oldStatus
             taskSpec.oldStatus = None
             self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID})
             tmpLog.debug('finished to reassign')
コード例 #30
0
 def findMissingFiles(self,jediTaskID,cloudName):
     """Record input files of a task which are not available in a cloud.

     Only master input datasets are checked. For each of them the files
     available at the source site of the cloud and at the T2 sites holding
     the data are collected, and the fileIDs of files not found there are
     passed to setMissingFiles_JEDI.

     :param jediTaskID: ID of the task whose input files are checked
     :param cloudName: name of the cloud where the files have to be available
     :return: self.SC_SUCCEEDED when all master datasets were processed,
              self.SC_FAILED as soon as any step fails
     """
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(jediTaskID))
     tmpLog.debug('start findMissingFiles')
     # return for failure
     retError = self.SC_FAILED
     # get datasets
     tmpSt,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(jediTaskID,['input'],True)
     if not tmpSt:
         tmpLog.error('failed to get the list of datasets')
         return retError
     # loop over all datasets
     for datasetSpec in datasetSpecList:
         # check only master dataset
         if not datasetSpec.isMaster():
             continue
         tmpLog.debug('checking {0}'.format(datasetSpec.datasetName))
         # get ddmIF
         ddmIF = self.ddmIF.getInterface(datasetSpec.vo)
         if ddmIF is None:
             tmpLog.error('failed to get DDM I/F for vo={0}'.format(datasetSpec.vo))
             return retError
         # get the list of sites where data is available
         tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,ddmIF,
                                                          datasetSpec.datasetName)
         if tmpSt != self.SC_SUCCEEDED:
             tmpLog.error('failed to get the list of sites where {0} is available, since {1}'.format(datasetSpec.datasetName,
                                                                                                     tmpRet))
             return retError
         dataSiteMap = tmpRet
         # data is unavailable in cloud
         if cloudName not in dataSiteMap:
             tmpLog.error('{0} is unavailable in cloud={1} map={2}'.format(datasetSpec.datasetName,cloudName,str(dataSiteMap)))
             return retError
         # mapping between sites and storage endpoints
         checkedSites = [self.siteMapper.getCloud(cloudName)['source']]+dataSiteMap[cloudName]['t2']
         siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(checkedSites,self.siteMapper)
         # get available files per site/endpoint
         tmpAvFileMap = ddmIF.getAvailableFiles(datasetSpec,
                                                siteStorageEP,
                                                self.siteMapper,
                                                ngGroup=[1],
                                                checkLFC=True)
         if tmpAvFileMap is None:
             tmpLog.error('failed to get available file list for {0}'.format(datasetSpec.datasetName))
             return retError
         # collect the LFNs available at any site/storage type once, so that
         # each file is checked with a set lookup instead of a nested scan
         availableLFNs = set()
         for tmpSiteName,availableFilesMap in tmpAvFileMap.iteritems():
             for tmpStorageType,availableFiles in availableFilesMap.iteritems():
                 for availableFile in availableFiles:
                     availableLFNs.add(availableFile.lfn)
         # check availability
         missingFiles = []
         for fileSpec in datasetSpec.Files:
             if fileSpec.lfn not in availableLFNs:
                 missingFiles.append(fileSpec.fileID)
                 tmpLog.debug('{0} missing'.format(fileSpec.lfn))
         # update contents
         if missingFiles:
             tmpSt = self.taskBufferIF.setMissingFiles_JEDI(jediTaskID,datasetSpec.datasetID,missingFiles)
             if not tmpSt:
                 tmpLog.error('failed to set missing files in {0}'.format(datasetSpec.datasetName))
                 return retError
     tmpLog.debug('done findMissingFiles')
     return self.SC_SUCCEEDED
コード例 #31
0
 def runImpl(self):
     """Choose a nucleus for every task pulled from the input queue.

     Chunks of (taskSpec, inputChunk) pairs are taken from self.inputList
     until an empty chunk is returned, which terminates the method.
     A task whose nucleus is already one of the known nuclei keeps it.
     Otherwise the candidates are filtered by nucleus status, endpoint
     space, input data availability and ability to run jobs, weighted by
     RW and data availability, and one of them is drawn at random.
     The chosen nucleus is stored with setCloudToTasks_JEDI and the RW of
     the task is added to the shared self.prioRW table.

     Any exception is logged together with the last jediTaskID and the
     loop goes on with the next chunk.
     """
     # cutoff for free disk space at an endpoint (the original note says TB;
     # 5 * 1024 suggests 5 TB expressed in GB - TODO confirm the unit)
     diskThreshold = 5 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # a nucleus is skipped when the total input exceeds thrInputSize (or
     # thrInputNum files) and less than the given fraction is available there
     thrInputSize = 1024*1024*1024
     thrInputNum = 100
     thrInputSizeFrac = 0.1
     thrInputNumFrac = 0.1
     # RW below this value does not reduce the weight of a nucleus
     cutOffRW = 50
     # factor applied to the weight when ava_size_any exceeds ava_size_disk
     negWeightTape = 0.001
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                 return
             # loop over all tasks
             for taskSpec,inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='{0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in nucleusList:
                     candidateNucleus = taskSpec.nucleus
                     # RW of the task is needed below to update the RW table;
                     # without this it would be undefined (or left over from
                     # the previous task) for pre-assigned tasks
                     taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                 else:
                     tmpLog.debug('got {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleusSpec.state not in ['ACTIVE']:
                             tmpLog.debug('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                         tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed status check'.format(len(nucleusList)))
                     if not nucleusList:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check endpoint
                     newNucleusList = {}
                     tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output','log'])
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) is not None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                             if tmpEP is None:
                                 tmpLog.debug('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                     tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # NOTE: a check of the endpoint state (tmpEP['state'] in ['ACTIVE'],
                             # criteria=-epstatus) used to be here and is currently disabled
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             if tmpSpaceSize < diskThreshold:
                                 # report the endpoint name, not its state, as the message says
                                 tmpLog.debug('  skip nucleus={0} since disk shortage ({1}<{2}) at endpoint {3} criteria=-space'.format(tmpNucleus,
                                                                                                                                        tmpSpaceSize,
                                                                                                                                        diskThreshold,
                                                                                                                                        tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed endpoint check'.format(len(nucleusList)))
                     if not nucleusList:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # get nuclei where data is available
                         tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys())
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum the values of all datasets per nucleus
                         for tmpNucleus,tmpVals in tmpRet.iteritems():
                             if tmpNucleus not in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                     if toSkip:
                         continue
                     if availableData:
                         newNucleusList = {}
                         # skip if no data
                         for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                             tmpData = availableData[tmpNucleus]
                             if tmpData['tot_size'] > thrInputSize and \
                                     tmpData['ava_size_any'] < tmpData['tot_size'] * thrInputSizeFrac:
                                 tmpLog.debug('  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                         tmpData['ava_size_any'],
                                                                                                                                         tmpData['tot_size'],
                                                                                                                                         thrInputSizeFrac))
                             elif tmpData['tot_num'] > thrInputNum and \
                                     tmpData['ava_num_any'] < tmpData['tot_num'] * thrInputNumFrac:
                                 tmpLog.debug('  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                           tmpData['ava_num_any'],
                                                                                                                                           tmpData['tot_num'],
                                                                                                                                           thrInputNumFrac))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.debug('{0} candidates passed data check'.format(len(nucleusList)))
                         if not nucleusList:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                     tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                          tmpSiteList,tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.debug('failed to get sites where jobs can run. Use any nuclei where input is available')
                         # use any nuclei where input is available if no sites can run jobs
                         tmpRet = tmpSiteList
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.debug('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed job check'.format(len(nucleusList)))
                     if not nucleusList:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # RW
                     taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                     ######################################
                     # weight
                     # release the lock even when the priority is not in the table
                     self.prioRW.acquire()
                     try:
                         nucleusRW = self.prioRW[taskSpec.currentPriority]
                     finally:
                         self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleus not in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         tmpRW = nucleusRW[tmpNucleus]
                         wStr = '1'
                         # with RW
                         if tmpRW >= cutOffRW:
                             weight = 1 / float(tmpRW)
                             wStr += '/({0}=RW)'.format(tmpRW)
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(tmpRW,cutOffRW)
                         # with data
                         if availableData:
                             tmpData = availableData[tmpNucleus]
                             weight *= float(tmpData['ava_size_any'])
                             weight /= float(tmpData['tot_size'])
                             wStr += '*({0}=available input size on DISK/TAPE)'.format(tmpData['ava_size_any'])
                             wStr += '/({0}=total input size)'.format(tmpData['tot_size'])
                             # negative weight for tape
                             if tmpData['ava_size_any'] > tmpData['ava_size_disk']:
                                 weight *= negWeightTape
                                 wStr += '*({0}=weight for TAPE)'.format(negWeightTape)
                         tmpLog.debug('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus,weight))
                     tmpLog.debug('final {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # final selection: weighted random draw
                     tgtWeight = random.uniform(0,totalWeight)
                     candidateNucleus = None
                     for tmpNucleus,weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     # fall back to the last candidate if nothing was drawn
                     if candidateNucleus is None:
                         candidateNucleus = nucleusweights[-1][0]
                 ######################################
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output','log'])
                 # get destinations
                 retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                 # update RW table for this priority and all lower ones;
                 # the lock is released even if the update fails
                 self.prioRW.acquire()
                 try:
                     for prio,rwMap in self.prioRW.iteritems():
                         if prio > taskSpec.currentPriority:
                             continue
                         if candidateNucleus in rwMap:
                             rwMap[candidateNucleus] += taskRW
                         else:
                             rwMap[candidateNucleus] = taskRW
                 finally:
                     self.prioRW.release()
         except:
             # log and go on with the next chunk of tasks
             errtype,errvalue = sys.exc_info()[:2]
             errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
コード例 #32
0
 def doCheck(self, taskSpecList):
     """Ask PanDA which core each task was assigned to.

     :param taskSpecList: task specs to be checked
     :return: tuple of status code and map. The map is keyed by jediTaskID
              and holds the core name, or, for tasks using the WORLD cloud,
              a list of dicts with "datasetID", "token" and "destination"
              for each output/log dataset. Tasks without a core name, and
              WORLD-cloud tasks whose datasets cannot be retrieved, are not
              in the map. (self.SC_FAILED, {}) is returned when PanDA
              cannot be queried.
     """
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug("start doCheck")
     # return for temporary failure
     retTmpError = self.SC_FAILED, {}
     # get list of jediTaskIDs and the map to look up specs by ID
     taskIdList = [taskSpec.jediTaskID for taskSpec in taskSpecList]
     taskSpecMap = dict((taskSpec.jediTaskID, taskSpec) for taskSpec in taskSpecList)
     # check with panda
     tmpLog.debug("check with panda")
     tmpPandaStatus, cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
     if tmpPandaStatus != 0:
         tmpLog.error("failed to see clouds")
         return retTmpError
     # make return map
     retMap = {}
     for tmpTaskID, tmpCoreName in cloudsInPanda.iteritems():
         tmpLog.debug("jediTaskID={0} -> {1}".format(tmpTaskID, tmpCoreName))
         # not assigned yet
         if tmpCoreName in ["NULL", "", None]:
             continue
         taskSpec = taskSpecMap[tmpTaskID]
         if not taskSpec.useWorldCloud():
             retMap[tmpTaskID] = tmpCoreName
             continue
         # get destinations for WORLD cloud
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # get site
         siteSpec = self.siteMapper.getSite(tmpCoreName)
         # get output/log datasets
         tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
             tmpTaskID, ["output", "log"]
         )
         if not tmpStat:
             # leave the task out of the map instead of iterating over an invalid result
             tmpLog.error("failed to get output/log datasets for jediTaskID={0}".format(tmpTaskID))
             continue
         # get destinations
         retMap[tmpTaskID] = []
         for datasetSpec in tmpDatasetSpecs:
             token = ddmIF.convertTokenToEndpoint(siteSpec.ddm, datasetSpec.storageToken)
             # use default endpoint
             if token is None:
                 token = siteSpec.ddm
             retMap[tmpTaskID].append(
                 {
                     "datasetID": datasetSpec.datasetID,
                     "token": "dst:{0}".format(token),
                     "destination": tmpCoreName,
                 }
             )
     tmpLog.debug("ret {0}".format(str(retMap)))
     # return
     tmpLog.debug("done")
     return self.SC_SUCCEEDED, retMap
コード例 #33
0
 def doAction(self):
     """Kill or activate jobs which are waiting for a lib.tgz file.

     For every lib.tgz returned by getLibForWaitingRunJob_JEDI:
     when the file is failed the build job is passed to updateJobs to kill
     downstream jobs, when it is finished its metadata is set and the
     downstream jobs are activated through Activator, otherwise nothing is
     done. Failures are only logged.

     :return: always self.SC_SUCCEEDED
     """
     # logger for messages not bound to a task; created before the try block
     # so that it is always available in the exception handler
     gTmpLog = MsgWrapper(logger)
     gTmpLog.debug('start')
     try:
         # check every 60 min
         checkInterval = 60
         # get lib.tgz for waiting jobs
         libList = self.taskBufferIF.getLibForWaitingRunJob_JEDI(self.vo,self.prodSourceLabel,checkInterval)
         gTmpLog.debug('got {0} lib.tgz files'.format(len(libList)))
         # activate or kill orphan jobs which were submitted to use lib.tgz when the lib.tgz was being produced
         for prodUserName,datasetName,tmpFileSpec in libList:
             # per-task logger
             tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(tmpFileSpec.jediTaskID))
             tmpLog.debug('start')
             # check status of lib.tgz
             if tmpFileSpec.status == 'failed':
                 # get buildJob
                 pandaJobSpecs = self.taskBufferIF.peekJobs([tmpFileSpec.PandaID],
                                                            fromDefined=False,
                                                            fromActive=False,
                                                            fromWaiting=False)
                 # an empty result is treated like a job which was not found
                 pandaJobSpec = pandaJobSpecs[0] if pandaJobSpecs else None
                 if pandaJobSpec is not None:
                     # kill
                     self.taskBufferIF.updateJobs([pandaJobSpec],False)
                     tmpLog.debug('  killed downstream jobs for user="******" with libDS={1}'.format(prodUserName,datasetName))
                 else:
                     # PandaJobSpec not found
                     tmpLog.error('  cannot find PandaJobSpec for user="******" with PandaID={1}'.format(prodUserName,
                                                                                                      tmpFileSpec.PandaID))
             elif tmpFileSpec.status == 'finished':
                 # set metadata
                 self.taskBufferIF.setGUIDs([{'guid':tmpFileSpec.GUID,
                                              'lfn':tmpFileSpec.lfn,
                                              'checksum':tmpFileSpec.checksum,
                                              'fsize':tmpFileSpec.fsize,
                                              'scope':tmpFileSpec.scope,
                                              }])
                 # get lib dataset
                 dataset = self.taskBufferIF.queryDatasetWithMap({'name':datasetName})
                 if dataset is not None:
                     # activate jobs; the thread is joined right away, i.e. run synchronously
                     aThr = Activator(self.taskBufferIF,dataset)
                     aThr.start()
                     aThr.join()
                     tmpLog.debug('  activated downstream jobs for user="******" with libDS={1}'.format(prodUserName,datasetName))
                 else:
                     # datasetSpec not found
                     tmpLog.error('  cannot find datasetSpec for user="******" with libDS={1}'.format(prodUserName,datasetName))
             else:
                 # lib.tgz is not ready
                 tmpLog.debug('  keep waiting for user="******" libDS={1}'.format(prodUserName,datasetName))
     except:
         # failures are only logged so that SC_SUCCEEDED is always returned
         errtype,errvalue = sys.exc_info()[:2]
         gTmpLog.error('failed with {0} {1}'.format(errtype,errvalue))
     # return
     gTmpLog.debug('done')
     return self.SC_SUCCEEDED
コード例 #34
0
 def findMissingFiles(self, jediTaskID, cloudName):
     """Record input files of a task which are not available in a cloud.

     Only master input datasets are checked. For each of them the files
     available at the source site of the cloud and at the T2 sites holding
     the data are collected, and the fileIDs of files not found there are
     passed to setMissingFiles_JEDI.

     :param jediTaskID: ID of the task whose input files are checked
     :param cloudName: name of the cloud where the files have to be available
     :return: self.SC_SUCCEEDED when all master datasets were processed,
              self.SC_FAILED as soon as any step fails
     """
     tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(jediTaskID))
     tmpLog.debug('start findMissingFiles')
     # return for failure
     retError = self.SC_FAILED
     # get datasets
     tmpSt, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
         jediTaskID, ['input'], True)
     if not tmpSt:
         tmpLog.error('failed to get the list of datasets')
         return retError
     # loop over all datasets
     for datasetSpec in datasetSpecList:
         # check only master dataset
         if not datasetSpec.isMaster():
             continue
         tmpLog.debug('checking {0}'.format(datasetSpec.datasetName))
         # get ddmIF
         ddmIF = self.ddmIF.getInterface(datasetSpec.vo)
         if ddmIF is None:
             tmpLog.error('failed to get DDM I/F for vo={0}'.format(
                 datasetSpec.vo))
             return retError
         # get the list of sites where data is available
         tmpSt, tmpRet = AtlasBrokerUtils.getSitesWithData(
             self.siteMapper, ddmIF, datasetSpec.datasetName)
         if tmpSt != self.SC_SUCCEEDED:
             tmpLog.error(
                 'failed to get the list of sites where {0} is available, since {1}'
                 .format(datasetSpec.datasetName, tmpRet))
             return retError
         dataSiteMap = tmpRet
         # data is unavailable in cloud
         if cloudName not in dataSiteMap:
             tmpLog.error('{0} is unavailable in cloud={1} map={2}'.format(
                 datasetSpec.datasetName, cloudName, str(dataSiteMap)))
             return retError
         # mapping between sites and storage endpoints
         checkedSites = [self.siteMapper.getCloud(cloudName)['source']
                         ] + dataSiteMap[cloudName]['t2']
         siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(
             checkedSites, self.siteMapper)
         # get available files per site/endpoint
         tmpAvFileMap = ddmIF.getAvailableFiles(datasetSpec,
                                                siteStorageEP,
                                                self.siteMapper,
                                                ngGroup=[1],
                                                checkLFC=True)
         if tmpAvFileMap is None:
             tmpLog.error(
                 'failed to get available file list for {0}'.format(
                     datasetSpec.datasetName))
             return retError
         # collect the LFNs available at any site/storage type once, so that
         # each file is checked with a set lookup instead of a nested scan
         availableLFNs = set()
         for tmpSiteName, availableFilesMap in tmpAvFileMap.iteritems():
             for tmpStorageType, availableFiles in availableFilesMap.iteritems():
                 for availableFile in availableFiles:
                     availableLFNs.add(availableFile.lfn)
         # check availability
         missingFiles = []
         for fileSpec in datasetSpec.Files:
             if fileSpec.lfn not in availableLFNs:
                 missingFiles.append(fileSpec.fileID)
                 tmpLog.debug('{0} missing'.format(fileSpec.lfn))
         # update contents
         if missingFiles:
             tmpSt = self.taskBufferIF.setMissingFiles_JEDI(
                 jediTaskID, datasetSpec.datasetID, missingFiles)
             if not tmpSt:
                 tmpLog.error('failed to set missing files in {0}'.format(
                     datasetSpec.datasetName))
                 return retError
     tmpLog.debug('done findMissingFiles')
     return self.SC_SUCCEEDED
コード例 #35
0
 def doCheck(self, taskSpecList):
     """Ask PanDA which core each task was assigned to.

     :param taskSpecList: task specs to be checked
     :return: tuple of status code and map. The map is keyed by jediTaskID
              and holds the core name, or, for tasks using the WORLD cloud,
              a list of dicts with 'datasetID', 'token' and 'destination'
              for each output/log dataset. Tasks without a core name, and
              WORLD-cloud tasks whose datasets cannot be retrieved, are not
              in the map. (self.SC_FAILED, {}) is returned when PanDA
              cannot be queried.
     """
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doCheck')
     # return for temporary failure
     retTmpError = self.SC_FAILED, {}
     # get list of jediTaskIDs and the map to look up specs by ID
     taskIdList = [taskSpec.jediTaskID for taskSpec in taskSpecList]
     taskSpecMap = dict(
         (taskSpec.jediTaskID, taskSpec) for taskSpec in taskSpecList)
     # check with panda
     tmpLog.debug('check with panda')
     tmpPandaStatus, cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
     if tmpPandaStatus != 0:
         tmpLog.error('failed to see clouds')
         return retTmpError
     # make return map
     retMap = {}
     for tmpTaskID, tmpCoreName in cloudsInPanda.iteritems():
         tmpLog.debug('jediTaskID={0} -> {1}'.format(
             tmpTaskID, tmpCoreName))
         # not assigned yet
         if tmpCoreName in ['NULL', '', None]:
             continue
         taskSpec = taskSpecMap[tmpTaskID]
         if not taskSpec.useWorldCloud():
             retMap[tmpTaskID] = tmpCoreName
             continue
         # get destinations for WORLD cloud
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # get site
         siteSpec = self.siteMapper.getSite(tmpCoreName)
         # get output/log datasets
         tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
             tmpTaskID, ['output', 'log'])
         if not tmpStat:
             # leave the task out of the map instead of iterating over an invalid result
             tmpLog.error(
                 'failed to get output/log datasets for jediTaskID={0}'.format(
                     tmpTaskID))
             continue
         # get destinations
         retMap[tmpTaskID] = []
         for datasetSpec in tmpDatasetSpecs:
             token = ddmIF.convertTokenToEndpoint(
                 siteSpec.ddm, datasetSpec.storageToken)
             # use default endpoint
             if token is None:
                 token = siteSpec.ddm
             retMap[tmpTaskID].append({
                 'datasetID':
                 datasetSpec.datasetID,
                 'token':
                 'dst:{0}'.format(token),
                 'destination':
                 tmpCoreName
             })
     tmpLog.debug('ret {0}'.format(str(retMap)))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, retMap
コード例 #36
0
ファイル: TaskCommando.py プロジェクト: tertychnyy/panda-jedi
 def runImpl(self):
     """Execute queued task commands until the shared task list is empty.

     Items are fetched from self.taskList in batches; each item is a
     (jediTaskID, commandMap) pair where commandMap provides 'command',
     'comment' and 'oldStatus'.

     * 'kill', 'finish', 'reassign': active jobs of the task are killed
       (or, for a 'soft finish' comment, left running) and once no job is
       left the task record is updated.
     * 'retry', 'incexec': for 'incexec' the stored task parameters are
       first patched with the JSON given in the comment, then the task is
       retried.
     * anything else is logged as an unknown command.

     The method returns when the task list hands back an empty batch.
     Any exception escaping a batch is logged and the loop carries on
     with the next batch.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                         if pandaIDs is None:
                             tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     # the comment is split on ':' into kind, value and a 'y' flag
                                     if commentStr is not None:
                                         tmpItems = commentStr.split(':')
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1])
                                         tmpLog.sendMsg(tmpMsg,self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate('oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                 tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                                 tmpLog.info('done with {0}'.format(str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs
                                 # guard against a missing comment, which would make
                                 # the substring test raise TypeError
                                 if commentStr is not None and 'soft finish' in commentStr:
                                     tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg,self.msgType)
                                     if commandStr in ['reassign','finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params; they come back only if the
                             # new parameters below provide them again
                             for newKey in ['nFiles','fixedSandbox']:
                                 taskParamMap.pop(newKey,None)
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.iteritems():
                                 if newVal is None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) is not None:
                                         # the format string must be filled through
                                         # str.format(); attribute access on the
                                         # literal raised AttributeError
                                         tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if 'buildSpec' in taskParamMap:
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if 'mergeSpec' in taskParamMap:
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             # best effort: log and move on to the next task
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry failed files
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                     if tmpRet == True:
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             # keep the worker alive; report the failure with a traceback
             errtype,errvalue = sys.exc_info()[:2]
             errStr  = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errStr += traceback.format_exc()
             logger.error(errStr)
コード例 #37
0
ファイル: GenJobBroker.py プロジェクト: tertychnyy/panda-jedi
 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     """Select candidate sites for inputChunk and attach them to it.

     The initial site list (pre-assigned site of the task, pre-assigned
     site of the chunk, or all sites of cloudName) is narrowed down by
     successive checks on site status, memory, scratch disk, free space
     in the SE and walltime.  The survivors are weighted by job
     statistics and at most five of them (sites already used by the task
     are always kept) are added to inputChunk as site candidates.

     Returns (self.SC_SUCCEEDED, inputChunk) on success.  Whenever a step
     leaves no candidate, or a lookup in the task buffer fails, the log
     is uploaded and set as the task's error diagnostics and
     (self.SC_FAILED, inputChunk) is returned.
     """
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # value returned on any temporary failure
     retTmpError = self.SC_FAILED, inputChunk

     def giveUp(errMsg):
         # log the reason, attach the uploaded log to the task and fail
         tmpLog.error(errMsg)
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError

     ######################################
     # initial candidates
     if taskSpec.site not in ['', None]:
         scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() is not None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(
             inputChunk.getPreassignedSite()))
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' %
                      (cloudName, len(scanSiteList)))
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     ######################################
     # site status: only online sites survive
     onlineSites = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         if tmpSiteSpec.status == 'online':
             onlineSites.append(tmpSiteName)
             continue
         tmpLog.debug('  skip %s due to status=%s' %
                      (tmpSiteName, tmpSiteSpec.status))
     scanSiteList = onlineSites
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if not scanSiteList:
         return giveUp('no candidates')
     ######################################
     # memory: a site limit of 0 means the limit is not enforced
     minRamCount = max(taskSpec.ramCount, inputChunk.ramCount)
     if minRamCount not in [0, None]:
         sitesWithEnoughRam = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             if tmpSiteSpec.maxmemory != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug(
                     '  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxmemory,
                             minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug(
                     '  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.minmemory,
                             minRamCount))
                 continue
             sitesWithEnoughRam.append(tmpSiteName)
         scanSiteList = sitesWithEnoughRam
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
             len(scanSiteList), minRamCount, taskSpec.ramUnit))
         if not scanSiteList:
             return giveUp('no candidates')
     ######################################
     # scratch disk
     # S: output + work area + largest input atom, R: without the input
     minDiskCountS = (taskSpec.getOutDiskSize() +
                      taskSpec.getWorkDiskSize() +
                      inputChunk.getMaxAtomSize()) / 1024 / 1024
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = (taskSpec.getOutDiskSize() +
                          taskSpec.getWorkDiskSize()) / 1024 / 1024
     sitesWithScratch = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # maxwdir of 0 means the site is not checked
         if tmpSiteSpec.maxwdir != 0:
             # direct IO sites are checked against the size without input
             minDiskCount = minDiskCountR if tmpSiteSpec.isDirectIO() else minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip {0} due to small scratch disk={1} < {2}'.
                     format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                 continue
         sitesWithScratch.append(tmpSiteName)
     scanSiteList = sitesWithScratch
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if not scanSiteList:
         return giveUp('no candidates')
     ######################################
     # available space in SE: free space must be >= 200GB
     diskThreshold = 200
     sitesWithSpace = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # space of 0 means the site is not checked
         if tmpSiteSpec.space != 0 and tmpSiteSpec.space < diskThreshold:
             tmpLog.debug(
                 '  skip {0} due to disk shortage in SE = {1} < {2}GB'.
                 format(tmpSiteName, tmpSiteSpec.space, diskThreshold))
             continue
         sitesWithSpace.append(tmpSiteName)
     scanSiteList = sitesWithSpace
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if not scanSiteList:
         return giveUp('no candidates')
     ######################################
     # walltime: a site limit of 0 means the limit is not enforced
     minWalltime = taskSpec.walltime
     if minWalltime not in [0, None]:
         sitesWithWalltime = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip {0} due to short site walltime={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip {0} due to short job walltime={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             sitesWithWalltime.append(tmpSiteName)
         scanSiteList = sitesWithWalltime
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if not scanSiteList:
             return giveUp('no candidates')
     ######################################
     # pilot activity: sites without pilots are only reported, every
     # site is kept
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     sitesAfterPilotCheck = []
     for tmpSiteName in scanSiteList:
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                 'updateJob']
         else:
             nPilot = 0
         if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
         sitesAfterPilotCheck.append(tmpSiteName)
     scanSiteList = sitesAfterPilotCheck
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if not scanSiteList:
         return giveUp('no candidates')
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
         return giveUp('failed to get sites which already used by task')
     ######################################
     # job statistics used for the weights
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
         taskSpec.vo, taskSpec.prodSourceLabel, taskSpec.currentPriority)
     if not tmpSt:
         return giveUp('failed to get job statistics with priority')
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(
             jobStatPrioMap, tmpSiteName, 'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(
             jobStatPrioMap, tmpSiteName, 'defined', None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(
             jobStatPrioMap, tmpSiteName, 'activated', None, None)
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              1) / float(nAssigned + 1)
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         siteCandidateSpec.weight = weight
         if tmpSiteName in sitesUsedByTask:
             # sites the task already ran on are always candidates
             candidateSpecList.append(siteCandidateSpec)
         else:
             # the others are grouped by weight for the selection below
             weightMap.setdefault(weight, []).append(siteCandidateSpec)
     # fill up to the maximum, highest weight first, random among equals
     maxNumSites = 5
     for weightVal in sorted(weightMap, reverse=True):
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         nFree = maxNumSites - len(candidateSpecList)
         candidateSpecList.extend(sitesWithWeight[:nFree])
     # hand the candidates to the input chunk
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         inputChunk.addSiteCandidate(siteCandidateSpec)
         scanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {0} with weight={1}'.format(
             siteCandidateSpec.siteName, siteCandidateSpec.weight))
     if not scanSiteList:
         return giveUp('no candidates')
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk
コード例 #38
0
from pandajedi.jedicore.MsgWrapper import MsgWrapper

from pandajedi.jedicore.JediTaskBufferInterface import JediTaskBufferInterface

from pandajedi.jediddm.DDMInterface import DDMInterface

from pandajedi.jediorder.JobBroker import JobBroker
from pandajedi.jediorder.JobSplitter import JobSplitter
from pandajedi.jediorder.JobGenerator import JobGeneratorThread
from pandajedi.jedicore.ThreadUtils import ThreadPool
from pandajedi.jediorder.TaskSetupper import TaskSetupper

import sys

# Script-level setup: everything below runs at import time, in this order.
# NOTE(review): PandaLogger is not imported in the visible part of this
# script - confirm the import exists above.
logger = PandaLogger().getLogger('JobGenerator')
tmpLog = MsgWrapper(logger)

# task buffer interface
tbIF = JediTaskBufferInterface()
tbIF.setupInterface()

# site mapper obtained through the task buffer
siteMapper = tbIF.getSiteMapper()

# DDM interface
ddmIF = DDMInterface()
ddmIF.setupInterface()

# first command-line argument (required): the jediTaskID as an integer
jediTaskID = int(sys.argv[1])

# second command-line argument (optional): a single dataset ID, kept as a
# one-element list; None when it is not given
datasetIDs = None
if len(sys.argv) > 2:
    datasetIDs = [int(sys.argv[2])]
コード例 #39
0
 def start(self):
     """Run the main loop of the post-processor; this never returns.

     After starting the base classes, each cycle walks over every
     vo / prodSourceLabel combination, asks the task buffer to prepare
     and then fetch the tasks to be finished, and processes the fetched
     tasks with a pool of PostProcessorThread workers.  A cycle is
     padded with a sleep so that it lasts at least 60 seconds.
     Exceptions raised inside a cycle are logged and the loop goes on.
     """
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
     # main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             tmpLog = MsgWrapper(logger)
             tmpLog.info('start')
             for vo in self.vos:
                 for prodSourceLabel in self.prodSourceLabels:
                     # step 1: prepare tasks to be finished
                     tmpLog.info(
                         'preparing tasks to be finished for vo={0} label={1}'
                         .format(vo, prodSourceLabel))
                     prepared = self.taskBufferIF.prepareTasksToBeFinished_JEDI(
                         vo,
                         prodSourceLabel,
                         jedi_config.postprocessor.nTasks,
                         pid=self.pid)
                     if prepared is None:
                         # a failed preparation is only reported
                         tmpLog.error('failed to prepare tasks')
                     # step 2: get tasks to be finished
                     tmpLog.info('getting tasks to be finished')
                     fetched = self.taskBufferIF.getTasksToBeFinished_JEDI(
                         vo, prodSourceLabel, self.pid,
                         jedi_config.postprocessor.nTasks)
                     if fetched is None:
                         tmpLog.error('failed to get tasks to be finished')
                         continue
                     tmpLog.info('got {0} tasks'.format(len(fetched)))
                     # step 3: let the workers drain a locked list
                     taskList = ListWithLock(fetched)
                     threadPool = ThreadPool()
                     nWorker = jedi_config.postprocessor.nWorkers
                     for _ in range(nWorker):
                         worker = PostProcessorThread(
                             taskList, threadPool, self.taskBufferIF,
                             self.ddmIF, self)
                         worker.start()
                     # wait for all workers
                     threadPool.join()
             tmpLog.info('done')
         except:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
         # sleep for whatever is left of the cycle
         loopCycle = 60
         elapsed = datetime.datetime.utcnow() - startTime
         remaining = loopCycle - elapsed.seconds
         if remaining > 0:
             time.sleep(remaining)
コード例 #40
0
 def runImpl(self):
     """Post-process tasks until the shared task list is empty.

     Task specs are fetched from self.taskList in batches.  For each
     task the implementation matching its vo and prodSourceLabel is
     instantiated and its doPostProcess() is called.  When there is no
     implementation, or doPostProcess() raises, the task is set to
     'broken' and updated in the task buffer.  doFinalProcedure() of the
     implementation is called afterwards; its failures are only logged.

     The method returns when the task list hands back an empty batch.
     Any exception escaping a batch is logged and the loop carries on
     with the next batch.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug(
                     '{0} terminating since no more items'.format(
                         self.__class__.__name__))
                 return
             # loop over all tasks
             for taskSpec in taskList:
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger,
                     '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
                 tmpLog.info('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 # get impl
                 impl = self.implFactory.instantiateImpl(
                     taskSpec.vo, taskSpec.prodSourceLabel, None,
                     self.taskBufferIF, self.ddmIF)
                 if impl is None:
                     # post processor is undefined
                     tmpLog.error(
                         'post-processor is undefined for vo={0} sourceLabel={1}'
                         .format(taskSpec.vo, taskSpec.prodSourceLabel))
                     tmpStat = Interaction.SC_FATAL
                 # execute
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('post-process with {0}'.format(
                         impl.__class__.__name__))
                     try:
                         impl.doPostProcess(taskSpec, tmpLog)
                     except:
                         errtype, errvalue = sys.exc_info()[:2]
                         tmpLog.error(
                             'doPostProcess failed with {0}:{1}'.format(
                                 errtype.__name__, errvalue))
                         tmpStat = Interaction.SC_FATAL
                 # done
                 if tmpStat == Interaction.SC_FATAL:
                     # task is broken
                     tmpErrStr = 'post-process failed'
                     tmpLog.error(tmpErrStr)
                     taskSpec.status = 'broken'
                     taskSpec.setErrDiag(tmpErrStr)
                     taskSpec.lockedBy = None
                     self.taskBufferIF.updateTask_JEDI(
                         taskSpec, {'jediTaskID': taskSpec.jediTaskID})
                 elif tmpStat == Interaction.SC_FAILED:
                     # NOTE(review): nothing in this method sets SC_FAILED,
                     # so this branch looks unreachable - confirm
                     tmpErrStr = 'post processing failed'
                     taskSpec.setOnHold()
                     taskSpec.setErrDiag(tmpErrStr, True)
                     taskSpec.lockedBy = None
                     self.taskBufferIF.updateTask_JEDI(
                         taskSpec, {'jediTaskID': taskSpec.jediTaskID})
                     tmpLog.info('set task.status={0} since {1}'.format(
                         taskSpec.status, taskSpec.errorDialog))
                     continue
                 # final procedure
                 # skipped when there is no implementation: calling it on
                 # None only produced a misleading AttributeError log entry
                 if impl is not None:
                     try:
                         impl.doFinalProcedure(taskSpec, tmpLog)
                     except:
                         errtype, errvalue = sys.exc_info()[:2]
                         tmpLog.error(
                             'doFinalProcedure failed with {0}:{1}'.format(
                                 errtype.__name__, errvalue))
                 # done
                 tmpLog.info('done')
         except:
             # keep the worker alive; report the failure
             errtype, errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
コード例 #41
0
 def doActionForReassgin(self,gTmpLog):
     """Move tasks that are waiting for reassignment to their new destination.

     For each task returned by getTasksToReassign_JEDI the method records the
     new cloud (tasks not using the WORLD cloud) or nucleus (WORLD cloud
     tasks), registers a subscription to the DDM endpoint of the destination
     site for every output/log dataset that has no distributed destination
     and, when no subscription failed, sets the task status to 'ready' or to
     its previous status. A failure is logged and only skips the task
     concerned.

     :param gTmpLog: logger for the message that is not tied to a single task
     :return: None
     """
     # get DDM I/F
     ddmIF = self.ddmIF.getInterface(self.vo)
     # get site mapper
     siteMapper = self.taskBufferIF.getSiteMapper()
     # get tasks to get reassigned
     taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)
     gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
     for taskSpec in taskList:
         # per-task logger; the tag is closed with '>' like the other task loggers
         tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
         tmpLog.debug('start to reassign')
         # DDM backend (only reported in the log message below)
         ddmBackEnd = taskSpec.getDdmBackEnd()
         # get datasets
         tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                    ['output','log'])
         if tmpStat != True:
             tmpLog.error('failed to get datasets')
             continue
         # update DB and work out the destination site
         if not taskSpec.useWorldCloud():
             # update cloudtasks
             tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,taskSpec.cloud,
                                                            'assigned',True)
             if tmpStat != 'SUCCEEDED':
                 tmpLog.error('failed to update CloudTasks')
                 continue
             # check cloud
             if not siteMapper.checkCloud(taskSpec.cloud):
                 tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                 continue
             # get T1
             t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
         else:
             # re-run task brokerage since there is no nucleus to send the task to
             if taskSpec.nucleus in [None,'']:
                 taskSpec.status = 'assigning'
                 taskSpec.oldStatus = None
                 taskSpec.setToRegisterDatasets()
                 self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                                   setOldModTime=True)
                 tmpLog.debug('set task_status={0} to trigger task brokerage again'.format(taskSpec.status))
                 continue
             # get nucleus
             nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
             if nucleusSpec is None:
                 tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                 continue
             # set nucleus
             retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,datasetSpecList)}
             # NOTE(review): the return value was never inspected - confirm whether a failure must skip the task
             self.taskBufferIF.setCloudToTasks_JEDI(retMap)
             # get one site of the nucleus
             t1SiteName = nucleusSpec.getOnePandaSite()
         t1Site = siteMapper.getSite(t1SiteName)
         # loop over all datasets; stop at the first subscription that cannot be made
         isOK = True
         for datasetSpec in datasetSpecList:
             tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
             if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                 tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                 continue
             # get location
             location = siteMapper.getDdmEndpoint(t1Site.sitename,datasetSpec.storageToken)
             # make subscription
             try:
                 tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,
                                                                                        ddmBackEnd))
                 tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location,
                                                             'Production Output',asynchronous=True)
                 if tmpStat != True:
                     tmpLog.error("failed to make subscription")
                     isOK = False
                     break
             except:
                 # catch-all as elsewhere in this file: the task is left untouched and the loop goes on
                 errtype,errvalue = sys.exc_info()[:2]
                 tmpLog.warning('failed to make subscription with {0}:{1}'.format(errtype.__name__,errvalue))
                 isOK = False
                 break
         # succeeded
         if isOK:
             # activate task
             if taskSpec.oldStatus in ['assigning','exhausted',None]:
                 taskSpec.status = 'ready'
             else:
                 taskSpec.status = taskSpec.oldStatus
             taskSpec.oldStatus = None
             self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                               setOldModTime=True)
             tmpLog.debug('finished to reassign')
コード例 #42
0
 def doAction(self):
     """Activate or kill jobs that were waiting for a lib.tgz file.

     The lib.tgz files are taken from getLibForWaitingRunJob_JEDI. When a
     file has status 'failed' the build job is looked up and passed to
     updateJobs; when it has status 'finished' the file metadata is
     registered and an Activator is run on the lib dataset; any other status
     is only logged. Any exception is caught and logged, so the method does
     not raise.

     :return: self.SC_SUCCEEDED, also when an error was caught and logged
     """
     # logger for messages that do not belong to a single task. It stays None
     # until it has been created so that the error handler never touches an
     # unbound name when the failure happens very early.
     mainLog = None
     try:
         # get logger
         mainLog = MsgWrapper(logger)
         mainLog.debug('start')
         # check every 60 min
         checkInterval = 60
         # get lib.tgz for waiting jobs
         libList = self.taskBufferIF.getLibForWaitingRunJob_JEDI(
             self.vo, self.prodSourceLabel, checkInterval)
         mainLog.debug('got {0} lib.tgz files'.format(len(libList)))
         # activate or kill orphan jobs which were submitted to use lib.tgz when the lib.tgz was being produced
         # NOTE(review): the message templates below never reference {0}, so the
         # user name passed to format() is not printed - looks like it was masked; confirm
         for prodUserName, datasetName, tmpFileSpec in libList:
             tmpLog = MsgWrapper(
                 logger, '<jediTaskID={0}>'.format(tmpFileSpec.jediTaskID))
             tmpLog.debug('start')
             # check status of lib.tgz
             if tmpFileSpec.status == 'failed':
                 # get buildJob
                 pandaJobSpecs = self.taskBufferIF.peekJobs(
                     [tmpFileSpec.PandaID],
                     fromDefined=False,
                     fromActive=False,
                     fromWaiting=False)
                 pandaJobSpec = pandaJobSpecs[0]
                 if pandaJobSpec is not None:
                     # kill
                     self.taskBufferIF.updateJobs([pandaJobSpec], False)
                     tmpLog.debug(
                         '  killed downstream jobs for user="******" with libDS={1}'
                         .format(prodUserName, datasetName))
                 else:
                     # PandaJobSpec not found
                     tmpLog.error(
                         '  cannot find PandaJobSpec for user="******" with PandaID={1}'
                         .format(prodUserName, tmpFileSpec.PandaID))
             elif tmpFileSpec.status == 'finished':
                 # set metadata
                 self.taskBufferIF.setGUIDs([{
                     'guid': tmpFileSpec.GUID,
                     'lfn': tmpFileSpec.lfn,
                     'checksum': tmpFileSpec.checksum,
                     'fsize': tmpFileSpec.fsize,
                     'scope': tmpFileSpec.scope,
                 }])
                 # get lib dataset
                 dataset = self.taskBufferIF.queryDatasetWithMap(
                     {'name': datasetName})
                 if dataset is not None:
                     # activate jobs; the thread is joined right away, i.e. it runs synchronously
                     aThr = Activator(self.taskBufferIF, dataset)
                     aThr.start()
                     aThr.join()
                     tmpLog.debug(
                         '  activated downstream jobs for user="******" with libDS={1}'
                         .format(prodUserName, datasetName))
                 else:
                     # datasetSpec not found
                     tmpLog.error(
                         '  cannot find datasetSpec for user="******" with libDS={1}'
                         .format(prodUserName, datasetName))
             else:
                 # lib.tgz is not ready
                 tmpLog.debug(
                     '  keep waiting for user="******" libDS={1}'.format(
                         prodUserName, datasetName))
     except:
         # catch-all as elsewhere in this file: the error is logged and the method still returns
         errtype, errvalue = sys.exc_info()[:2]
         if mainLog is None:
             # the wrapper could not even be created; use the module logger instead
             mainLog = logger
         mainLog.error('failed with {0} {1}'.format(errtype, errvalue))
     # return
     mainLog.debug('done')
     return self.SC_SUCCEEDED
コード例 #43
0
    def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                      resource_name):
        """Decide whether job generation for a work queue must be throttled.

        The decision compares the numbers of running, queued, defined and
        waiting jobs (at resource type level, or at global share level for
        queues listed in non_rt_wqs) with the limits from the central
        configuration or with defaults derived from the local parameters.
        When a limit is exceeded but tasks with a higher priority than the
        jobs already in PanDA are waiting, submission goes on restricted to
        that priority.

        Side effects: calls setMaxNumJobs, and possibly setMinPriority and
        notEnoughJobsQueued, on self.

        :param vo: virtual organisation, used in log messages and DB lookups
        :param prodSourceLabel: source label, used in log messages and DB lookups
        :param cloudName: cloud name, used in log messages and DB lookups
        :param workQueue: work queue object (getID(), queue_name and throttled are read)
        :param resource_name: resource type name
        :return: self.retUnThrottled when jobs may be submitted,
                 self.retMergeUnThr when a limit is exceeded,
                 self.retTmpError when the priority information is unavailable
        """
        # params
        nBunch = 4
        threshold = 2.0
        nJobsInBunchMax = 600
        nJobsInBunchMin = 500
        minTotalWalltime = 50 * 1000 * 1000
        nWaitingLimit = 4
        nWaitingBunchLimit = 2
        nParallel = 2
        nParallelCap = 5
        # make logger
        tmpLog = MsgWrapper(logger)

        workQueueID = workQueue.getID()
        # blanks in the queue name are replaced with underscores for the message header
        workQueueName = '_'.join(workQueue.queue_name.split(' '))
        msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(
            vo, prodSourceLabel, cloudName, workQueueName, resource_name)
        tmpLog.debug('{0} start workQueueID={1}'.format(
            msgHeader, workQueueID))

        def _skipSubmission(msgBody):
            """Log the reason as a warning, send it as a message and return self.retMergeUnThr."""
            message = "{0} {1}".format(msgHeader, msgBody)
            tmpLog.warning(message)
            tmpLog.sendMsg(message,
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr

        # get central configuration values
        config_map = self.__getConfiguration(vo, workQueue.queue_name,
                                             resource_name)
        configQueueLimit = config_map[NQUEUELIMIT]['value']
        configQueueCap = config_map[NQUEUECAP]['value']
        configRunningCap = config_map[NRUNNINGCAP]['value']

        tmpLog.debug(
            msgHeader +
            ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
            .format(configQueueLimit, configQueueCap, configRunningCap))

        # check if unthrottled
        if not workQueue.throttled:
            msgBody = "PASS unthrottled since GS_throttled is False"
            tmpLog.info(msgHeader + " " + msgBody)
            return self.retUnThrottled

        # get the jobs statistics for our wq/gs and expand the stats map
        jobstats_map = self.__prepareJobStats(workQueue, resource_name,
                                              config_map)
        nRunning_rt = jobstats_map['nRunning_rt']
        nRunning_gs = jobstats_map['nRunning_gs']
        nRunning_runningcap = jobstats_map['nRunning_runningcap']
        nNotRun_rt = jobstats_map['nNotRun_rt']
        nNotRun_gs = jobstats_map['nNotRun_gs']
        nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
        nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
        nDefine_rt = jobstats_map['nDefine_rt']
        nDefine_gs = jobstats_map['nDefine_gs']
        nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
        nDefine_queuecap = jobstats_map['nDefine_queuecap']
        nWaiting_rt = jobstats_map['nWaiting_rt']
        # number of queued + defined jobs that is compared with the queue limit
        nQueued_queuelimit = nNotRun_queuelimit + nDefine_queuelimit

        # queues in non_rt_wqs are evaluated at global share level and
        # without the resource type, all others at resource type level
        isNonRtQueue = workQueue.queue_name in non_rt_wqs

        # check if higher prio tasks are waiting
        if isNonRtQueue:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName)
        else:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue, resource_name)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName, resource_name)

        # the status of the lookup used to be ignored, which ended in a
        # TypeError below when no statistics were returned
        # NOTE(review): assumes a false status means failure, as for the other tmpStat checks in this file - confirm
        if not tmpStat or highestPrioJobStat is None:
            msgBody = 'failed to get the highest priority of defined jobs'
            tmpLog.error("{0} {1}".format(msgHeader, msgBody))
            return self.retTmpError
        highestPrioInPandaDB = highestPrioJobStat['highestPrio']
        nNotRunHighestPrio = highestPrioJobStat['nNotRun']
        if highestPrioWaiting is None:
            msgBody = 'failed to get the highest priority of waiting tasks'
            tmpLog.error("{0} {1}".format(msgHeader, msgBody))
            return self.retTmpError

        # high priority tasks are waiting
        highPrioQueued = highestPrioWaiting > highestPrioInPandaDB \
            or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin)
        tmpLog.debug(
            "{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}"
            .format(msgHeader, highestPrioWaiting, highestPrioInPandaDB,
                    nNotRunHighestPrio, highPrioQueued))
        # set maximum number of jobs to be submitted
        if isNonRtQueue:
            tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
        else:
            tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot),
                           nJobsInBunchMax)

        if configQueueLimit is not None:
            nQueueLimit = configQueueLimit
        else:
            nQueueLimit = nJobsInBunch * nBunch

        # use nPrestage for reprocessing
        if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
            # reset nJobsInBunch
            if nQueueLimit > nQueued_queuelimit:
                tmpRemainingSlot = nQueueLimit - nQueued_queuelimit
                if tmpRemainingSlot > nJobsInBunch:
                    nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)

        # get cap
        # set number of jobs to be submitted
        # NOTE(review): '/' yields an integer only under Python 2 division rules - confirm the interpreter
        if configQueueCap is None:
            self.setMaxNumJobs(nJobsInBunch / nParallel)
        else:
            self.setMaxNumJobs(configQueueCap / nParallelCap)

        # get total walltime
        totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(
            vo, prodSourceLabel, workQueue, resource_name, cloudName)

        # log the current situation and limits
        tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(
            msgHeader, nQueueLimit, configRunningCap, configQueueCap))
        tmpLog.info(
            "{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".
            format(msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs,
                   nRunning_gs))
        tmpLog.info(
            "{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}"
            .format(msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt,
                    nRunning_rt, totWalltime))

        # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
        # Only the first matching condition is acted on. Except for the cap on
        # running jobs, a match skips submission only when no high priority
        # task is waiting; otherwise submission continues with a priority limit.
        limitPriority = False
        if not isNonRtQueue \
                and nRunning_rt == 0 and nQueued_queuelimit > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                return _skipSubmission(
                    "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                        nQueued_queuelimit, nQueueLimit,
                        totWalltime, minTotalWalltime))

        elif isNonRtQueue \
                and nRunning_gs == 0 and nQueued_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                return _skipSubmission(
                    "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                        nQueued_queuelimit, nQueueLimit,
                        totWalltime, minTotalWalltime))

        elif not isNonRtQueue and nRunning_rt != 0 \
                and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold \
                and nQueued_queuelimit > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                return _skipSubmission(
                    "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                        nNotRun_rt + nDefine_rt, nRunning_rt, threshold,
                        nQueued_queuelimit, nQueueLimit,
                        totWalltime, minTotalWalltime))

        elif isNonRtQueue and nRunning_gs != 0 \
                and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold \
                and nQueued_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                return _skipSubmission(
                    "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                        nNotRun_gs + nDefine_gs, nRunning_gs, threshold,
                        nQueued_queuelimit, nQueueLimit))

        elif nDefine_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # brokerage is stuck
                return _skipSubmission(
                    "SKIP too many nDefined_queuelimit({0})>{1}".format(
                        nDefine_queuelimit, nQueueLimit))

        elif nWaiting_rt > max(nRunning_rt * nWaitingLimit,
                               nJobsInBunch * nWaitingBunchLimit):
            limitPriority = True
            if not highPrioQueued:
                # too many waiting
                return _skipSubmission(
                    "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                        nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch,
                        nWaitingBunchLimit))

        elif configRunningCap and nRunning_runningcap > configRunningCap:
            # cap on running: applies even when high priority tasks are waiting
            return _skipSubmission(
                "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(
                    nRunning_runningcap, configRunningCap))

        elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
            limitPriority = True
            if not highPrioQueued:
                # cap on queued
                return _skipSubmission(
                    "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(
                        nNotRun_queuecap + nDefine_queuecap, configQueueCap))

        # get jobs from prodDB
        limitPriorityValue = None
        if limitPriority:
            # a limit was hit while high priority tasks are waiting
            limitPriorityValue = highestPrioWaiting
            self.setMinPriority(limitPriorityValue)
        else:
            # not enough jobs are queued
            if (nQueued_queuelimit < nQueueLimit * 0.9) \
                    or (isNonRtQueue and nNotRun_gs + nDefine_gs < nRunning_gs) \
                    or (not isNonRtQueue and nNotRun_rt + nDefine_rt < nRunning_rt):
                tmpLog.debug(msgHeader + " not enough jobs queued")
                if not isNonRtQueue:
                    self.notEnoughJobsQueued()
                self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))

        msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(
            limitPriorityValue, self.maxNumJobs)
        tmpLog.info(msgHeader + " " + msgBody)
        return self.retUnThrottled
コード例 #44
0
ファイル: ContentsFeeder.py プロジェクト: PanDAWMS/panda-jedi
    def runImpl(self):
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID,dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    datasetsIdxConsistency = []

                    # get task
                    tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10)
                    if not tmpStat or taskSpec == None:
                        self.logger.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                        continue

                    # make logger
                    try:
                        gshare = '_'.join(taskSpec.gshare.split(' '))
                    except:
                        gshare = 'Undefined'
                    tmpLog = MsgWrapper(self.logger,'<jediTaskID={0} gshare={1}>'.format(jediTaskID, gshare))

                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if taskParamMap.has_key('nEventsPerInputFile'):
                        taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = taskSpec.getNumFilesPerJob()
                    # the number of chunks used by scout 
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
                    noWaitParent = False
                    parentOutDatasets = set()
                    if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None,taskSpec.jediTaskID]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat,tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.parent_tid,
                                                                                                                  ['output','log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo) 
                        origNumFiles = None
                        if taskParamMap.has_key('nFiles'):
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID))
                            # index consistency
                            if datasetSpec.indexConsistent():
                                datasetsIdxConsistency.append(datasetSpec.datasetID)
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()                    
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state':'closed'}
                                # set mutable when and the dataset is open and parent is running or task is configured to run until the dataset is closed 
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state':'mutable'}
                                gotMetadata = True
                            except:
                                errtype,errvalue = sys.exc_info()[:2]
                                tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(self.__class__.__name__,
                                                                                            errtype.__name__,errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status    
                                    self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing 
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName)   
                                # get the number of events in metadata
                                if taskParamMap.has_key('getNumEventsInMetadata'):
                                    getNumEvents = True
                                else:
                                    getNumEvents = False
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                                not datasetSpec.containerName in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock() or taskSpec.orderByLB():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                         getNumEvents=getNumEvents,
                                                                         skipDuplicate=skipDuplicate,
                                                                         longFormat=longFormat
                                                                         )
                                        tmpLog.debug('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName))
                                            for tmpListGUID,tmpLostLFN in tmpLostFiles.iteritems():
                                                tmpLog.debug('removed {0}'.format(tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
                                            if datasetSpec.getNumRecords() != None:
                                                nPFN = datasetSpec.getNumRecords()
                                            elif origNumFiles != None:
                                                nPFN = origNumFiles
                                                if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerJob']
                                                elif taskParamMap.has_key('nEventsPerFile') and taskParamMap.has_key('nEventsPerRange'):
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and taskSpec.getNumFilesPerJob() is not None:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / taskSpec.getNumFilesPerJob()
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(datasetSpec.jediTaskID,
                                                                                                           datasetSpec.masterID,
                                                                                                           ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt['nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy 
                                                if skipFilesUsedBy != None:
                                                    for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(tmpJediTaskID,
                                                                                                                          {'datasetName':datasetSpec.datasetName},
                                                                                                                          ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                            nPFN += tmpParentAtt['nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':iPFN+tmpOffset,
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn',
                                                                         'scope':None,
                                                                         'filesize':0,
                                                                         'checksum':None,
                                                                         }
                                                      }
                                        else:
                                            # make dummy file list for PFN list
                                            if taskParamMap.has_key('nFiles'):
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                except:
                                    errtype,errvalue = sys.exc_info()[:2]
                                    tmpLog.error('failed to get files due to {0}:{1} {2}'.format(self.__class__.__name__,
                                                                                                 errtype.__name__,errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status    
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents()
                                    # the number of events per file
                                    nEventsPerFile  = None
                                    nEventsPerJob   = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()):
                                        if taskParamMap.has_key('nEventsPerFile'):
                                            nEventsPerFile = taskParamMap['nEventsPerFile']
                                        elif datasetSpec.isMaster() and datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'):
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap['nEvents']
                                        if taskParamMap.has_key('nEventsPerJob'):
                                            nEventsPerJob = taskParamMap['nEventsPerJob']
                                        elif taskParamMap.has_key('nEventsPerRange'):
                                            nEventsPerRange = taskParamMap['nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                        # max attempts 
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry 
                                            maxAttempt = 1
                                        elif taskParamMap.has_key('maxAttempt'):
                                            maxAttempt = taskParamMap['maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap['maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                    # nMaxEvents
                                    nMaxEvents = None 
                                    if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'):
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if taskParamMap.has_key('nFiles'):
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                            # multipled by the number of jobs per file for event-level splitting
                                            if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'):
                                                if taskParamMap.has_key('nEventsPerJob'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                                elif taskParamMap.has_key('nEventsPerRange'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                    # use scout
                                    useScout = False    
                                    if datasetSpec.isMaster() and taskSpec.useScout() and (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers    
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key('useInFilesWithNewAttemptNr'):
                                        useFilesWithNewAttemptNr = True
                                    # ramCount
                                    ramCount = 0
                                    # skip short input
                                    if datasetSpec.isMaster() and not datasetSpec.isPseudo() \
                                            and nEventsPerFile is not None and nEventsPerJob is not None \
                                            and nEventsPerFile >= nEventsPerJob \
                                            and 'skipShortInput' in taskParamMap and taskParamMap['skipShortInput'] == True:
                                        skipShortInput = True
                                    else:
                                        skipShortInput = False
                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
                                    retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec,tmpRet,
                                                                                                                              tmpMetadata['state'],
                                                                                                                              stateUpdateTime,
                                                                                                                              nEventsPerFile,
                                                                                                                              nEventsPerJob,
                                                                                                                              maxAttempt,
                                                                                                                              firstEventNumber,
                                                                                                                              nMaxFiles,
                                                                                                                              nMaxEvents,
                                                                                                                              useScout,
                                                                                                                              fileList,
                                                                                                                              useFilesWithNewAttemptNr,
                                                                                                                              nFilesPerJob,
                                                                                                                              nEventsPerRange,
                                                                                                                              nChunksForScout,
                                                                                                                              includePatt,
                                                                                                                              excludePatt,
                                                                                                                              xmlConfig,
                                                                                                                              noWaitParent,
                                                                                                                              taskSpec.parent_tid,
                                                                                                                              self.pid,
                                                                                                                              maxFailure,
                                                                                                                              useRealNumEvents,
                                                                                                                              respectLB,
                                                                                                                              tgtNumEventsPerJob,
                                                                                                                              skipFilesUsedBy,
                                                                                                                              ramCount,
                                                                                                                              taskSpec,
                                                                                                                              skipShortInput)
                                    if retDB == False:
                                        taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                         diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB == None:
                                        # the dataset is locked by another or status is not applicable
                                        allUpdated = False
                                        tmpLog.debug('escape since task or dataset is locked')
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec,
                                                                               'missingFiles':missingFileList} 
                                    else:
                                        # reduce the number of files to be read
                                        if taskParamMap.has_key('nFiles'):
                                            if datasetSpec.isMaster():
                                                taskParamMap['nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap['nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout <= 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                    # no mater input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr,None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # index consistency
                    if not taskOnHold and not taskBroken and len(datasetsIdxConsistency) > 0:
                        self.taskBufferIF.removeFilesIndexInconsistent_JEDI(jediTaskID,datasetsIdxConsistency)
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg,self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if not taskSpec.status in ['broken','tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,pid=self.pid,
                                                                                                       useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except:
                errtype,errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
コード例 #45
0
 def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue):
     """Build a JobSpec per input chunk and hand them to the task assigner.

     For every (taskSpec, cloudName, inputChunk) entry a JobSpec is filled
     from the task attributes and from the files of the chunk's datasets.
     RW values obtained from the task buffer are then encoded into each
     job's metadata string, and the jobs are submitted through
     PandaClient.runTaskAssignment in bunches of at most 100.

     :param inputList: iterable of (jediTaskID, list of
                       (taskSpec, cloudName, inputChunk)) pairs
     :param vo: logged and passed to calculateRWwithPrio_JEDI
     :param prodSourceLabel: logged and passed to calculateRWwithPrio_JEDI
     :param workQueue: its queue_name is logged; the object itself is
                       passed to calculateRWwithPrio_JEDI
     :return: self.SC_FAILED as soon as any RW calculation returns None,
              self.SC_SUCCEEDED otherwise
     """
     # maximum number of jobs given to the task assigner in one call
     maxBunchTask = 100
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug("start doBrokerage")
     # return code for temporary failures
     retTmpError = self.SC_FAILED
     tmpLog.debug("vo={0} label={1} queue={2}".format(vo, prodSourceLabel, workQueue.queue_name))
     # bookkeeping shared by all jobs
     allRwMap = {}  # priority -> RW returned by calculateRWwithPrio_JEDI
     prioMap = {}   # taskID -> current priority
     tt2Map = {}    # taskID -> processing type
     expRWs = {}    # taskID -> RW returned by calculateTaskRW_JEDI
     jobSpecList = []
     # loop over all tasks
     for tmpJediTaskID, tmpInputList in inputList:
         for taskSpec, cloudName, inputChunk in tmpInputList:
             # make JobSpec to be submitted for TaskAssigner
             jobSpec = JobSpec()
             jobSpec.taskID = taskSpec.jediTaskID
             jobSpec.jediTaskID = taskSpec.jediTaskID
             # set managed to trigger TA
             jobSpec.prodSourceLabel = "managed"
             jobSpec.processingType = taskSpec.processingType
             jobSpec.workingGroup = taskSpec.workingGroup
             # initial metadata; the RW information is appended further below
             jobSpec.metadata = taskSpec.processingType
             jobSpec.assignedPriority = taskSpec.taskPriority
             jobSpec.currentPriority = taskSpec.currentPriority
             jobSpec.maxDiskCount = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()) / 1024 / 1024
             if taskSpec.useWorldCloud():
                 # use destinationSE to trigger task brokerage in WORLD cloud
                 jobSpec.destinationSE = taskSpec.cloud
             # name of the last dataset seen, used when there is no master
             prodDBlock = None
             setProdDBlock = False
             for datasetSpec in inputChunk.getDatasets():
                 prodDBlock = datasetSpec.datasetName
                 if datasetSpec.isMaster():
                     jobSpec.prodDBlock = datasetSpec.datasetName
                     setProdDBlock = True
                 for fileSpec in datasetSpec.Files:
                     tmpInFileSpec = fileSpec.convertToJobFileSpec(datasetSpec)
                     jobSpec.addFile(tmpInFileSpec)
             # use secondary dataset name as prodDBlock
             if not setProdDBlock and prodDBlock is not None:
                 jobSpec.prodDBlock = prodDBlock
             # append
             jobSpecList.append(jobSpec)
             prioMap[jobSpec.taskID] = jobSpec.currentPriority
             tt2Map[jobSpec.taskID] = jobSpec.processingType
             # get RW for a priority, only once per distinct priority
             if jobSpec.currentPriority not in allRwMap:
                 tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(
                     vo, prodSourceLabel, workQueue, jobSpec.currentPriority
                 )
                 if tmpRW is None:
                     tmpLog.error("failed to calculate RW with prio={0}".format(jobSpec.currentPriority))
                     return retTmpError
                 allRwMap[jobSpec.currentPriority] = tmpRW
             # get expected RW
             expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
             if expRW is None:
                 tmpLog.error("failed to calculate RW for jediTaskID={0}".format(jobSpec.jediTaskID))
                 return retTmpError
             expRWs[jobSpec.taskID] = expRW
     # get fullRWs
     fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(vo, prodSourceLabel, None, None)
     if fullRWs is None:
         tmpLog.error("failed to calculate full RW")
         return retTmpError
     # set metadata; the maps are identical for every job, so they are
     # converted to strings only once
     strExpRWs = str(expRWs)
     strPrioMap = str(prioMap)
     strFullRWs = str(fullRWs)
     strTt2Map = str(tt2Map)
     for jobSpec in jobSpecList:
         jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (
             jobSpec.metadata,
             str(allRwMap[jobSpec.currentPriority]),
             strExpRWs,
             strPrioMap,
             strFullRWs,
             strTt2Map,
         )
     tmpLog.debug("run task assigner for {0} tasks".format(len(jobSpecList)))
     for nBunchTask in range(0, len(jobSpecList), maxBunchTask):
         # get a bunch
         jobsBunch = jobSpecList[nBunchTask : nBunchTask + maxBunchTask]
         strIDs = "jediTaskID=" + ",".join("{0}".format(tmpJobSpec.taskID) for tmpJobSpec in jobsBunch)
         tmpLog.debug(strIDs)
         # run task brokerage; the outcome is only logged, not checked
         stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
         tmpLog.debug("{0}:{1}".format(stS, str(outSs)))
     # return
     tmpLog.debug("done")
     return self.SC_SUCCEEDED
コード例 #46
0
    def doActionForReassign(self,gTmpLog):
        """Process the tasks returned by getTasksToReassign_JEDI.

        For each task the output and log datasets are fetched, the cloud
        (or, for tasks using the WORLD cloud, the nucleus) is set in the
        database, and a subscription is registered for every dataset that
        has no distributed destination.  When every subscription succeeds
        the task status is restored and the task is updated.  A task with
        any failure is skipped without a status change.

        :param gTmpLog: logger used for the summary message; per-task
                        messages go to a MsgWrapper created for each task
        """
        # interfaces shared by all tasks
        ddmIF = self.ddmIF.getInterface(self.vo)
        siteMapper = self.taskBufferIF.getSiteMapper()
        # tasks waiting to get reassigned
        tasksToReassign = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)

        gTmpLog.debug('got {0} tasks to reassign'.format(len(tasksToReassign)))
        for taskSpec in tasksToReassign:
            tmpLog = MsgWrapper(logger, '< jediTaskID={0} >'.format(taskSpec.jediTaskID))
            tmpLog.debug('start to reassign')
            # DDM backend, only reported in the subscription log message
            ddmBackEnd = taskSpec.getDdmBackEnd()
            # output and log datasets of the task
            retCode,outDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                   ['output','log'])
            if retCode is not True:
                tmpLog.error('failed to get datasets')
                continue

            # update DB
            if taskSpec.useWorldCloud():
                if taskSpec.nucleus in (None,''):
                    # no nucleus yet: send the task back to brokerage
                    taskSpec.status = 'assigning'
                    taskSpec.oldStatus = None
                    taskSpec.setToRegisterDatasets()
                    self.taskBufferIF.updateTask_JEDI(taskSpec,
                                                      {'jediTaskID': taskSpec.jediTaskID},
                                                      setOldModTime=True)
                    tmpLog.debug('#ATM #KV label=managed action=trigger_new_brokerage by setting task_status={0}'.
                                 format(taskSpec.status))
                    continue
                # look up the nucleus
                nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
                if nucleusSpec is None:
                    tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                    continue
                # set nucleus
                nucleusMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,outDatasets)}
                self.taskBufferIF.setCloudToTasks_JEDI(nucleusMap)
            else:
                # update cloudtasks
                retCode = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,
                                                               taskSpec.cloud,'assigned',True)
                if retCode != 'SUCCEEDED':
                    tmpLog.error('failed to update CloudTasks')
                    continue
                # the cloud must be known to the site mapper
                if not siteMapper.checkCloud(taskSpec.cloud):
                    tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                    continue

            # site of the nucleus or of the T1
            if taskSpec.useWorldCloud():
                t1SiteName = nucleusSpec.getOnePandaSite()
            else:
                t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
            t1Site = siteMapper.getSite(t1SiteName)

            # subscribe every dataset; the else clause runs only if no break occurred
            for datasetSpec in outDatasets:
                tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
                if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                    tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                    continue
                # destination endpoint
                location = siteMapper.getDdmEndpoint(
                    t1Site.sitename,
                    datasetSpec.storageToken,
                    taskSpec.prodSourceLabel,
                    JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                # make subscription
                try:
                    tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,ddmBackEnd))
                    subscribed = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,
                                                                   location,
                                                                   'Production Output',
                                                                   asynchronous=True)
                    if subscribed is not True:
                        tmpLog.error("failed to make subscription")
                        break
                except Exception as excObj:
                    tmpLog.warning('failed to make subscription with {0}:{1}'.format(type(excObj).__name__,excObj))
                    break
            else:
                # all subscriptions were made: activate the task
                previousStatus = taskSpec.oldStatus
                taskSpec.status = 'ready' if previousStatus in ('assigning','exhausted',None) else previousStatus
                taskSpec.oldStatus = None
                self.taskBufferIF.updateTask_JEDI(taskSpec,
                                                  {'jediTaskID':taskSpec.jediTaskID},
                                                  setOldModTime=True)
                tmpLog.debug('finished to reassign')
コード例 #47
0
ファイル: WatchDog.py プロジェクト: PanDAWMS/panda-jedi
 def start(self):
     """Run the watchdog main loop.

     Starts the base classes, then loops indefinitely (the loop has no
     ``break`` or ``return``). In each cycle, for every combination of
     ``self.vos`` and ``self.prodSourceLabels``, the implementation returned
     by ``self.getImpl`` (if any) gets its ``pre_action`` and ``doAction``
     called, and the outcome is logged. Exceptions raised while running the
     plugins are logged and do not stop the loop.

     After each cycle the method sleeps for whatever remains of the cycle
     length (``self.period`` if set, otherwise
     ``jedi_config.watchdog.loopCycle``) and then calls ``self.randomSleep``.
     """
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.info('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # vo/prodSourceLabel specific action
                     impl = self.getImpl(vo,
                                         prodSourceLabel,
                                         subType=self.subStr)
                     if impl is not None:
                         plugin_name = impl.__class__.__name__
                         tmpLog.info(
                             'pre-action for vo={} label={} cls={}'.format(
                                 vo, prodSourceLabel, plugin_name))
                         impl.pre_action(tmpLog, vo, prodSourceLabel,
                                         self.pid)
                         tmpLog.info(
                             'do action for vo={} label={} cls={}'.format(
                                 vo, prodSourceLabel, plugin_name))
                         tmpStat = impl.doAction()
                         if tmpStat != Interaction.SC_SUCCEEDED:
                             tmpLog.error(
                                 'failed to run special action for vo={} label={} cls={}'
                                 .format(vo, prodSourceLabel, plugin_name))
                         else:
                             tmpLog.info(
                                 'done for vo={} label={} cls={}'.format(
                                     vo, prodSourceLabel, plugin_name))
             tmpLog.info('done')
         except Exception:
             # keep the loop alive: report the failure and go on to the next cycle
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
         # sleep if needed
         loopCycle = jedi_config.watchdog.loopCycle if self.period is None else self.period
         timeDelta = datetime.datetime.utcnow() - startTime
         # total_seconds() covers the whole elapsed time; the .seconds attribute
         # is only the seconds component and silently drops whole days
         sleepPeriod = loopCycle - int(timeDelta.total_seconds())
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
         # randomize cycle
         self.randomSleep(max_val=loopCycle)
コード例 #48
0
    def runImpl(self):
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' %
                                      self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID, dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    # make logger
                    tmpLog = MsgWrapper(
                        self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                    # get task
                    tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                        jediTaskID, False, True, self.pid, 10)
                    if not tmpStat or taskSpec == None:
                        tmpLog.error(
                            'failed to get taskSpec for jediTaskID={0}'.format(
                                jediTaskID))
                        continue
                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                            jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpLog.error(
                            'task param conversion from json failed with {0}:{1}'
                            .format(errtype.__name__, errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if taskParamMap.has_key('nEventsPerInputFile'):
                        taskParamMap['nEventsPerFile'] = taskParamMap[
                            'nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = None
                    if taskParamMap.has_key('nFilesPerJob'):
                        nFilesPerJob = taskParamMap['nFilesPerJob']
                    # the number of chunks used by scout
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
                    noWaitParent = False
                    parentOutDatasets = set()
                    if taskSpec.noWaitParent() and not taskSpec.parent_tid in [
                            None, taskSpec.jediTaskID
                    ]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(
                            taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat, tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                                taskSpec.parent_tid, ['output', 'log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(
                                    tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                        origNumFiles = None
                        if taskParamMap.has_key('nFiles'):
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(
                                datasetSpec.datasetName,
                                datasetSpec.datasetID))
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(
                                        datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state': 'closed'}
                                # set mutable when and the dataset is open and parent is running or task is configured to run until the dataset is closed
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state': 'mutable'}
                                gotMetadata = True
                            except:
                                errtype, errvalue = sys.exc_info()[:2]
                                tmpLog.error(
                                    '{0} failed to get metadata to {1}:{2}'.
                                    format(self.__class__.__name__,
                                           errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(
                                        datasetSpec, datasetStatus, tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                taskSpec.setErrDiag(
                                    'failed to get metadata for {0}'.format(
                                        datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                    taskParamMap, datasetSpec.datasetName)
                                # get the number of events in metadata
                                if taskParamMap.has_key(
                                        'getNumEventsInMetadata'):
                                    getNumEvents = True
                                else:
                                    getNumEvents = False
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles(
                                    )
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                                not datasetSpec.containerName in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(
                                            tmpDatasetName,
                                            getNumEvents=getNumEvents,
                                            skipDuplicate=skipDuplicate,
                                            longFormat=longFormat)
                                        tmpLog.debug(
                                            'got {0} files in {1}'.format(
                                                len(tmpRet), tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(
                                            tmpDatasetName, tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug(
                                                'found {0} lost files in {1}'.
                                                format(len(tmpLostFiles),
                                                       tmpDatasetName))
                                            for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems(
                                            ):
                                                tmpLog.debug(
                                                    'removed {0}'.format(
                                                        tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
                                            if datasetSpec.getNumRecords(
                                            ) != None:
                                                nPFN = datasetSpec.getNumRecords(
                                                )
                                            elif origNumFiles != None:
                                                nPFN = origNumFiles
                                                if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nEventsPerJob']
                                                elif taskParamMap.has_key(
                                                        'nEventsPerFile'
                                                ) and taskParamMap.has_key(
                                                        'nEventsPerRange'):
                                                    nPFN = nPFN * taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap[
                                                    'nEvents'] / taskParamMap[
                                                        'nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and 'nFilesPerJob' in taskParamMap:
                                                nPFN = taskParamMap[
                                                    'nEvents'] / taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nFilesPerJob']
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(
                                                    datasetSpec.jediTaskID,
                                                    datasetSpec.masterID,
                                                    ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt[
                                                        'nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt[
                                                        'nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy
                                                if skipFilesUsedBy != None:
                                                    for tmpJediTaskID in str(
                                                            skipFilesUsedBy
                                                    ).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(
                                                            tmpJediTaskID, {
                                                                'datasetName':
                                                                datasetSpec.
                                                                datasetName
                                                            }, ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt[
                                                                'nFiles']:
                                                            nPFN += tmpParentAtt[
                                                                'nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {
                                                    'lfn': iPFN + tmpOffset,
                                                    'scope': None,
                                                    'filesize': 0,
                                                    'checksum': None,
                                                }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {
                                                str(uuid.uuid4()): {
                                                    'lfn': 'pseudo_lfn',
                                                    'scope': None,
                                                    'filesize': 0,
                                                    'checksum': None,
                                                }
                                            }
                                        else:
                                            # make dummy file list for PFN list
                                            if taskParamMap.has_key('nFiles'):
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {
                                                    'lfn':
                                                    '{0:06d}:{1}'.format(
                                                        iPFN,
                                                        taskParamMap['pfnList']
                                                        [iPFN].split('/')[-1]),
                                                    'scope':
                                                    None,
                                                    'filesize':
                                                    0,
                                                    'checksum':
                                                    None,
                                                }
                                except:
                                    errtype, errvalue = sys.exc_info()[:2]
                                    tmpLog.error(
                                        'failed to get files due to {0}:{1} {2}'
                                        .format(self.__class__.__name__,
                                                errtype.__name__, errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag(
                                        'failed to get files for {0}'.format(
                                            datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents(
                                        )
                                    # the number of events per file
                                    nEventsPerFile = None
                                    nEventsPerJob = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()):
                                        if taskParamMap.has_key(
                                                'nEventsPerFile'):
                                            nEventsPerFile = taskParamMap[
                                                'nEventsPerFile']
                                        elif datasetSpec.isMaster(
                                        ) and datasetSpec.isPseudo(
                                        ) and taskParamMap.has_key('nEvents'):
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap[
                                                'nEvents']
                                        if taskParamMap.has_key(
                                                'nEventsPerJob'):
                                            nEventsPerJob = taskParamMap[
                                                'nEventsPerJob']
                                        elif taskParamMap.has_key(
                                                'nEventsPerRange'):
                                            nEventsPerRange = taskParamMap[
                                                'nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap[
                                                'tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster(
                                    ) or datasetSpec.toKeepTrack():
                                        # max attempts
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry
                                            maxAttempt = 1
                                        elif taskParamMap.has_key(
                                                'maxAttempt'):
                                            maxAttempt = taskParamMap[
                                                'maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap[
                                                'maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset(
                                        )
                                    # nMaxEvents
                                    nMaxEvents = None
                                    if datasetSpec.isMaster(
                                    ) and taskParamMap.has_key('nEvents'):
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if taskParamMap.has_key('nFiles'):
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(
                                                origNumFiles)
                                            # multipled by the number of jobs per file for event-level splitting
                                            if nMaxFiles != None and taskParamMap.has_key(
                                                    'nEventsPerFile'):
                                                if taskParamMap.has_key(
                                                        'nEventsPerJob'):
                                                    if taskParamMap[
                                                            'nEventsPerFile'] > taskParamMap[
                                                                'nEventsPerJob']:
                                                        nMaxFiles *= float(
                                                            taskParamMap[
                                                                'nEventsPerFile']
                                                        ) / float(taskParamMap[
                                                            'nEventsPerJob'])
                                                        nMaxFiles = int(
                                                            math.ceil(
                                                                nMaxFiles))
                                                elif taskParamMap.has_key(
                                                        'nEventsPerRange'):
                                                    if taskParamMap[
                                                            'nEventsPerFile'] > taskParamMap[
                                                                'nEventsPerRange']:
                                                        nMaxFiles *= float(
                                                            taskParamMap[
                                                                'nEventsPerFile']
                                                        ) / float(taskParamMap[
                                                            'nEventsPerRange'])
                                                        nMaxFiles = int(
                                                            math.ceil(
                                                                nMaxFiles))
                                    # use scout
                                    useScout = False
                                    if datasetSpec.isMaster(
                                    ) and taskSpec.useScout() and (
                                            datasetSpec.status != 'toupdate'
                                            or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo(
                                    ) and fileList != [] and taskParamMap.has_key(
                                            'useInFilesWithNewAttemptNr'):
                                        useFilesWithNewAttemptNr = True
                                    #ramCount
                                    ramCount = 0

                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
                                    retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                        datasetSpec, tmpRet,
                                        tmpMetadata['state'], stateUpdateTime,
                                        nEventsPerFile, nEventsPerJob,
                                        maxAttempt, firstEventNumber,
                                        nMaxFiles, nMaxEvents, useScout,
                                        fileList, useFilesWithNewAttemptNr,
                                        nFilesPerJob, nEventsPerRange,
                                        nChunksForScout, includePatt,
                                        excludePatt, xmlConfig, noWaitParent,
                                        taskSpec.parent_tid, self.pid,
                                        maxFailure, useRealNumEvents,
                                        respectLB, tgtNumEventsPerJob,
                                        skipFilesUsedBy, ramCount)
                                    if retDB == False:
                                        taskSpec.setErrDiag(
                                            'failed to insert files for {0}. {1}'
                                            .format(datasetSpec.datasetName,
                                                    diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB == None:
                                        # the dataset is locked by another or status is not applicable
                                        allUpdated = False
                                        tmpLog.debug(
                                            'escape since task or dataset is locked'
                                        )
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(
                                            len(missingFileList),
                                            datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {
                                            'datasetSpec': datasetSpec,
                                            'missingFiles': missingFileList
                                        }
                                    else:
                                        # reduce the number of files to be read
                                        if taskParamMap.has_key('nFiles'):
                                            if datasetSpec.isMaster():
                                                taskParamMap[
                                                    'nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap[
                                                'nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                    # no master input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr, None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, taskSpec, pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if not taskSpec.status in ['broken', 'tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task.status={0}'.format(
                                taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                taskSpec,
                                pid=self.pid,
                                setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                getTaskStatus=True,
                                pid=self.pid,
                                useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task.status={0}'.format(
                                newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(
                            retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except:
                errtype, errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                    self.__class__.__name__, errtype.__name__, errvalue))
コード例 #49
0
 def doCheck(self,taskSpecList):
     """Ask PanDA where the given tasks have been assigned.

     :param taskSpecList: task specs to look up; jediTaskID, vo and
                          useWorldCloud() are read from each of them
     :return: (self.SC_FAILED, {}) when PandaClient.seeCloudTask reports a
              non-zero status, otherwise (self.SC_SUCCEEDED, retMap).
              retMap is keyed by jediTaskID and only contains tasks for
              which PanDA returned something other than 'NULL', '' or
              None. The value is the returned name itself for ordinary
              tasks, or {'datasets': [...], 'nucleus': ...} for tasks
              using the WORLD cloud.
     """
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doCheck')
     # return for failure
     retTmpError = self.SC_FAILED,{}
     # get list of jediTaskIDs
     taskIdList = []
     taskSpecMap = {}
     for taskSpec in taskSpecList:
         taskIdList.append(taskSpec.jediTaskID)
         taskSpecMap[taskSpec.jediTaskID] = taskSpec
     # check with panda
     tmpLog.debug('check with panda')
     tmpPandaStatus,cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
     if tmpPandaStatus != 0:
         tmpLog.error('failed to see clouds')
         return retTmpError
     # make return map
     retMap = {}
     # items() works on both Python 2 and 3, unlike iteritems()
     for tmpTaskID,tmpCoreName in cloudsInPanda.items():
         tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID,tmpCoreName))
         # skip tasks for which PanDA returned no name
         if tmpCoreName in ['NULL','',None]:
             continue
         taskSpec = taskSpecMap[tmpTaskID]
         if not taskSpec.useWorldCloud():
             retMap[tmpTaskID] = tmpCoreName
             continue
         # get destinations for WORLD cloud
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # get site
         siteSpec = self.siteMapper.getSite(tmpCoreName)
         # get nucleus
         nucleus = siteSpec.pandasite
         # get output/log datasets
         # NOTE(review): the returned status (tmpStat) is never checked
         tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID,['output','log'])
         # get destinations
         retMap[tmpTaskID] = {'datasets':[],'nucleus':nucleus}
         for datasetSpec in tmpDatasetSpecs:
             # skip distributed datasets
             if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                 continue
             # get token
             token = ddmIF.convertTokenToEndpoint(siteSpec.ddm,datasetSpec.storageToken)
             # use default endpoint
             if token is None:
                 token = siteSpec.ddm
             # add original token
             if datasetSpec.storageToken not in ['',None]:
                 token += '/{0}'.format(datasetSpec.storageToken)
             retMap[tmpTaskID]['datasets'].append({'datasetID':datasetSpec.datasetID,
                                                   'token':'dst:{0}'.format(token),
                                                   'destination':tmpCoreName})
     tmpLog.debug('ret {0}'.format(str(retMap)))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED,retMap
コード例 #50
0
 def toBeThrottled(self,vo,prodSourceLabel,cloudName,workQueue,jobStat):
     """Decide whether job submission for a cloud/work queue is throttled.

     Unless one of the early returns is taken, the number of jobs per
     submission is passed to self.setMaxNumJobs() before the queue-depth
     checks are evaluated.

     :param vo: virtual organization; used in log messages and in the
                waiting-task priority lookup
     :param prodSourceLabel: only used in log messages here; the task
                             buffer lookups are made with the literal
                             'managed'
     :param cloudName: name of the cloud to check
     :param workQueue: work queue; getIDs(), queue_name and queue_share
                       are read from it
     :param jobStat: nested mapping cloud -> workQueueID -> job status ->
                     number of jobs
     :return: self.retThrottled if the cloud is undefined, offline, or in
              'test' status while the queue is not 'test';
              self.retUnThrottled if the queue has no share or no limit
              is exceeded; self.retTmpError if the highest priority of
              waiting tasks cannot be obtained; self.retMergeUnThr if one
              of the queue-depth limits is exceeded while no higher
              priority tasks are waiting
     """
     # params
     nBunch = 4
     threshold = 2.0
     nJobsInBunchMax = 500
     nJobsInBunchMin = 300
     nJobsInBunchMaxES = 1000
     nWaitingLimit = 4
     nWaitingBunchLimit = 2
     nParallel = 8
     # make logger
     tmpLog = MsgWrapper(logger)
     workQueueIDs = workQueue.getIDs()
     msgHeader = '{0}:{1} cloud={2} queue={3}:'.format(vo,prodSourceLabel,cloudName,workQueue.queue_name)
     tmpLog.debug(msgHeader+' start workQueueID={0}'.format(str(workQueueIDs)))
     # check cloud status
     if not self.siteMapper.checkCloud(cloudName):
         msgBody = "SKIP cloud={0} undefined".format(cloudName)
         tmpLog.debug(msgHeader+" "+msgBody)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
         return self.retThrottled
     cloudSpec = self.siteMapper.getCloud(cloudName)
     if cloudSpec['status'] in ['offline']:
         msgBody = "SKIP cloud.status={0}".format(cloudSpec['status'])
         tmpLog.debug(msgHeader+" "+msgBody)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
         return self.retThrottled
     if cloudSpec['status'] in ['test'] and workQueue.queue_name != 'test':
         msgBody = "SKIP cloud.status={0} for non test queue ({1})".format(cloudSpec['status'],
                                                                           workQueue.queue_name)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
         tmpLog.debug(msgHeader+" "+msgBody)
         return self.retThrottled
     # check if unthrottled
     if workQueue.queue_share is None:
         msgBody = "PASS unthrottled since share=None"
         tmpLog.debug(msgHeader+" "+msgBody)
         return self.retUnThrottled
     # count number of jobs in each status
     nRunning = 0
     nNotRun  = 0
     nDefine  = 0
     nWaiting = 0
     for workQueueID in workQueueIDs:
         if cloudName in jobStat and workQueueID in jobStat[cloudName]:
             tmpLog.debug(msgHeader+" "+str(jobStat[cloudName][workQueueID]))
             for pState,pNumber in jobStat[cloudName][workQueueID].items():
                 if pState in ['running']:
                     nRunning += pNumber
                 elif pState in ['assigned','activated','starting']:
                     nNotRun  += pNumber
                 elif pState in ['defined']:
                     nDefine  += pNumber
                 elif pState in ['waiting']:
                     nWaiting += pNumber
     # jobs queued in Panda = not yet running + defined
     nQueued = nNotRun+nDefine
     # check if higher prio tasks are waiting
     # NOTE(review): the returned status (tmpStat) is never checked
     tmpStat,highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed',cloudName,workQueue)
     highestPrioInPandaDB = highestPrioJobStat['highestPrio']
     nNotRunHighestPrio   = highestPrioJobStat['nNotRun']
     # the highest priority of waiting tasks
     highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo,workQueue,
                                                                      'managed',cloudName)
     if highestPrioWaiting is None:
         msgBody = 'failed to get the highest priority of waiting tasks'
         tmpLog.error(msgHeader+" "+msgBody)
         return self.retTmpError
     # high priority tasks are waiting
     highPrioQueued = False
     if highestPrioWaiting > highestPrioInPandaDB or \
            (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
         highPrioQueued = True
     tmpLog.debug(msgHeader+" highestPrio waiting:{0} inPanda:{1} numNotRun:{2} -> highPrioQueued={3}".format(highestPrioWaiting,
                                                                                                              highestPrioInPandaDB,
                                                                                                              nNotRunHighestPrio,
                                                                                                              highPrioQueued))
     # set maximum number of jobs to be submitted
     tmpRemainingSlot = int(nRunning*threshold-nNotRun)
     if tmpRemainingSlot < nJobsInBunchMin:
         # use the lower limit to avoid creating too many _sub/_dis datasets
         nJobsInBunch = nJobsInBunchMin
     else:
         # use higher limit for evgensimul
         if workQueue.queue_name in ['evgensimul']:
             nJobsInBunchUpper = nJobsInBunchMaxES
         else:
             nJobsInBunchUpper = nJobsInBunchMax
         nJobsInBunch = min(tmpRemainingSlot,nJobsInBunchUpper)
     nQueueLimit = nJobsInBunch*nBunch
     # use special limit for CERN
     if cloudName == 'CERN':
         nQueueLimit = 2000
     # use nPrestage for reprocessing
     if workQueue.queue_name in ['reprocessing']:
         if 'nprestage' in cloudSpec and cloudSpec['nprestage'] > 0:
             nQueueLimit = cloudSpec['nprestage']
             # reset nJobsInBunch; it is only ever raised here, never lowered
             if nQueueLimit > nQueued:
                 tmpRemainingSlot = nQueueLimit - nQueued
                 if tmpRemainingSlot >= nJobsInBunch:
                     nJobsInBunch = min(tmpRemainingSlot,nJobsInBunchMax)
     # set number of jobs to be submitted
     # floor division keeps the value an integer on Python 3 as well
     self.setMaxNumJobs(nJobsInBunch//nParallel)
     # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
     tmpLog.debug(msgHeader+" nQueueLimit:{0} nQueued:{1} nDefine:{2} nRunning:{3}".format(nQueueLimit,
                                                                                           nQueued,
                                                                                           nDefine,
                                                                                           nRunning))
     # check when high prio tasks are not waiting
     if not highPrioQueued:
         msgBody = None
         if nRunning == 0 and nQueued > nQueueLimit:
             # pilot is not running or DDM has a problem
             msgBody = "SKIP no running and enough nQueued({0})>{1}".format(nQueued,nQueueLimit)
         elif nRunning != 0 and float(nNotRun)/float(nRunning) > threshold and nQueued > nQueueLimit:
             # enough jobs in Panda
             msgBody = "SKIP nQueued({0})/nRunning({1})>{2} & nQueued+Defined({3})>{4}".format(nNotRun,nRunning,
                                                                                               threshold,nQueued,
                                                                                               nQueueLimit)
         elif nDefine > nQueueLimit:
             # brokerage is stuck
             msgBody = "SKIP too many nDefined({0})>{1}".format(nDefine,nQueueLimit)
         elif nWaiting > nRunning*nWaitingLimit and nWaiting > nJobsInBunch*nWaitingBunchLimit:
             # too many waiting
             msgBody = "SKIP too many nWaiting({0})>max(nRunning({1})x{2},{3}x{4})".format(nWaiting,nRunning,nWaitingLimit,
                                                                                           nJobsInBunch,nWaitingBunchLimit)
         if msgBody is not None:
             tmpLog.debug(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
             return self.retMergeUnThr
     # NOTE(review): this method used to keep a limitPriority flag that was
     # only set right before returning retMergeUnThr above, so the
     # self.setMinPriority(highestPrioInPandaDB) call guarded by it could
     # never run. The dead branch is dropped; confirm whether limiting the
     # priority instead of skipping was the intended behaviour.
     limitPriorityValue = None
     msgBody = "PASS - priority limit={0}".format(limitPriorityValue)
     tmpLog.debug(msgHeader+" "+msgBody)
     return self.retUnThrottled
コード例 #51
0
 def doBrokerage(self,inputList,vo,prodSourceLabel,workQueue):
     """Run task brokerage for the given tasks.

     Tasks using the WORLD cloud are collected and handed to
     AtlasProdTaskBrokerThread workers. For every other task a JobSpec is
     built and sent in bunches to PandaClient.runTaskAssignment.

     :param inputList: iterable of (jediTaskID, tmpInputList) pairs where
                       tmpInputList yields (taskSpec, cloudName,
                       inputChunk) tuples
     :param vo: virtual organization passed to the RW calculations and
                used to get the DDM interface
     :param prodSourceLabel: label passed to the RW calculations
     :param workQueue: work queue passed to the per-priority RW
                       calculations
     :return: self.SC_FAILED as soon as one of the RW calculations
              returns None, otherwise self.SC_SUCCEEDED
     """
     # list with a lock
     inputListWorld = ListWithLock([])
     # variables for submission
     maxBunchTask = 100
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doBrokerage')
     # return for failure
     retTmpError = self.SC_FAILED
     tmpLog.debug('vo={0} label={1} queue={2} nTasks={3}'.format(vo,prodSourceLabel,
                                                                 workQueue.queue_name,
                                                                 len(inputList)))
     # loop over all tasks
     allRwMap    = {}
     prioMap     = {}
     tt2Map      = {}
     expRWs      = {}
     jobSpecList = []
     for tmpJediTaskID,tmpInputList in inputList:
         for taskSpec,cloudName,inputChunk in tmpInputList:
             # collect tasks for WORLD
             if taskSpec.useWorldCloud():
                 inputListWorld.append((taskSpec,inputChunk))
                 continue
             # make JobSpec to be submitted for TaskAssigner
             # (WORLD tasks never get here, so no destinationSE is set)
             jobSpec = JobSpec()
             jobSpec.taskID     = taskSpec.jediTaskID
             jobSpec.jediTaskID = taskSpec.jediTaskID
             # set managed to trigger TA
             jobSpec.prodSourceLabel  = 'managed'
             jobSpec.processingType   = taskSpec.processingType
             jobSpec.workingGroup     = taskSpec.workingGroup
             jobSpec.metadata         = taskSpec.processingType
             jobSpec.assignedPriority = taskSpec.taskPriority
             jobSpec.currentPriority  = taskSpec.currentPriority
             jobSpec.maxDiskCount     = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()) / 1024 / 1024
             prodDBlock = None
             setProdDBlock = False
             for datasetSpec in inputChunk.getDatasets():
                 prodDBlock = datasetSpec.datasetName
                 if datasetSpec.isMaster():
                     jobSpec.prodDBlock = datasetSpec.datasetName
                     setProdDBlock = True
                 for fileSpec in datasetSpec.Files:
                     tmpInFileSpec = fileSpec.convertToJobFileSpec(datasetSpec)
                     jobSpec.addFile(tmpInFileSpec)
             # use secondary dataset name as prodDBlock
             if not setProdDBlock and prodDBlock is not None:
                 jobSpec.prodDBlock = prodDBlock
             # append
             jobSpecList.append(jobSpec)
             prioMap[jobSpec.taskID] = jobSpec.currentPriority
             tt2Map[jobSpec.taskID]  = jobSpec.processingType
             # get RW for a priority
             if jobSpec.currentPriority not in allRwMap:
                 tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(vo,prodSourceLabel,workQueue,
                                                                    jobSpec.currentPriority)
                 if tmpRW is None:
                     tmpLog.error('failed to calculate RW with prio={0}'.format(jobSpec.currentPriority))
                     return retTmpError
                 allRwMap[jobSpec.currentPriority] = tmpRW
             # get expected RW
             expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
             if expRW is None:
                 tmpLog.error('failed to calculate RW for jediTaskID={0}'.format(jobSpec.jediTaskID))
                 return retTmpError
             expRWs[jobSpec.taskID] = expRW
     # for old clouds
     if jobSpecList:
         # get fullRWs
         fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(vo,prodSourceLabel,None,None)
         if fullRWs is None:
             tmpLog.error('failed to calculate full RW')
             return retTmpError
         # set metadata
         for jobSpec in jobSpecList:
             rwValues = allRwMap[jobSpec.currentPriority]
             jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (jobSpec.metadata,
                                                       str(rwValues),str(expRWs),
                                                       str(prioMap),str(fullRWs),
                                                       str(tt2Map))
         tmpLog.debug('run task assigner for {0} tasks'.format(len(jobSpecList)))
         for nBunchTask in range(0,len(jobSpecList),maxBunchTask):
             # get a bunch
             jobsBunch = jobSpecList[nBunchTask:nBunchTask+maxBunchTask]
             strIDs = 'jediTaskID=' + ','.join('{0}'.format(tmpJobSpec.taskID)
                                               for tmpJobSpec in jobsBunch)
             tmpLog.debug(strIDs)
             # run task brokerage; the outcome is only logged, not checked
             stS,outSs = PandaClient.runTaskAssignment(jobsBunch)
             tmpLog.debug('{0}:{1}'.format(stS,str(outSs)))
     # for WORLD
     if len(inputListWorld) > 0:
         # thread pool
         threadPool = ThreadPool()
         # get full RW for WORLD
         fullRWs = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo,prodSourceLabel,None,None)
         if fullRWs is None:
             tmpLog.error('failed to calculate full WORLD RW')
             return retTmpError
         # get RW per priority
         # NOTE(review): allRwMap is shared with the pass above, so a
         # priority already filled by calculateRWwithPrio_JEDI is not
         # recomputed with calculateWorldRWwithPrio_JEDI - confirm this
         # is intended
         for taskSpec,inputChunk in inputListWorld:
             if taskSpec.currentPriority not in allRwMap:
                 tmpRW = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo,prodSourceLabel,workQueue,
                                                                         taskSpec.currentPriority)
                 if tmpRW is None:
                     tmpLog.error('failed to calculate RW with prio={0}'.format(taskSpec.currentPriority))
                     return retTmpError
                 allRwMap[taskSpec.currentPriority] = tmpRW
         # live counter for RWs
         liveCounter = MapWithLock(allRwMap)
         # make workers
         ddmIF = self.ddmIF.getInterface(vo)
         for iWorker in range(4):
             thr = AtlasProdTaskBrokerThread(inputListWorld,threadPool,
                                             self.taskBufferIF,ddmIF,
                                             fullRWs,liveCounter)
             thr.start()
         # join the pool with a 60*10 argument
         threadPool.join(60*10)
     # return
     tmpLog.debug('doBrokerage done')
     return self.SC_SUCCEEDED
コード例 #52
0
 def doCheck(self, taskSpecList):
     """Look up in PanDA where each of the given tasks was assigned.

     :param taskSpecList: task specs to look up; jediTaskID, vo,
                          prodSourceLabel, taskType and useWorldCloud()
                          are read from them
     :return: (self.SC_FAILED, {}) when PandaClient.seeCloudTask reports a
              non-zero status, otherwise (self.SC_SUCCEEDED, mapping).
              The mapping is keyed by jediTaskID and skips tasks for
              which PanDA returned 'NULL', '' or None. Ordinary tasks map
              to the returned name; tasks using the WORLD cloud map to
              {'datasets': [...], 'nucleus': ...}.
     """
     log = MsgWrapper(logger)
     log.debug('start doCheck')
     # canned failure results; the fatal one is not returned by any path below
     fatalResult = self.SC_FATAL, {}
     tmpErrorResult = self.SC_FAILED, {}
     # index the task specs by jediTaskID
     jediTaskIDs = []
     specByTaskID = {}
     for spec in taskSpecList:
         jediTaskIDs.append(spec.jediTaskID)
         specByTaskID[spec.jediTaskID] = spec
     # ask panda
     log.debug('check with panda')
     pandaStatus, assignedNames = PandaClient.seeCloudTask(jediTaskIDs)
     if pandaStatus != 0:
         log.error('failed to see clouds')
         return tmpErrorResult
     # build the result, one entry per assigned task
     result = {}
     for taskID, coreName in iteritems(assignedNames):
         log.debug('jediTaskID={0} -> {1}'.format(taskID, coreName))
         # no name returned for this task
         if coreName in ['NULL', '', None]:
             continue
         spec = specByTaskID[taskID]
         # ordinary task: report the name as is
         if not spec.useWorldCloud():
             result[taskID] = coreName
             continue
         # WORLD cloud: work out a destination for each output/log dataset
         ddm = self.ddmIF.getInterface(spec.vo)
         site = self.siteMapper.getSite(coreName)
         jobType = JobUtils.translate_tasktype_to_jobtype(spec.taskType)
         _inputScope, outputScope = select_scope(site, spec.prodSourceLabel,
                                                 jobType)
         nucleusName = site.pandasite
         _status, outLogDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
             taskID, ['output', 'log'])
         destinations = []
         result[taskID] = {'datasets': destinations, 'nucleus': nucleusName}
         for dsSpec in outLogDatasets:
             storageToken = dsSpec.storageToken
             # distributed datasets get no destination
             if DataServiceUtils.getDistributedDestination(
                     storageToken) is not None:
                 continue
             # convert the token; fall back to the site's output endpoint
             endpoint = ddmIF_endpoint = ddm.convertTokenToEndpoint(
                 site.ddm_output[outputScope], storageToken)
             if ddmIF_endpoint is None:
                 endpoint = site.ddm_output[outputScope]
             # keep the original token as a suffix
             if storageToken not in ['', None]:
                 endpoint += '/{0}'.format(storageToken)
             destinations.append({
                 'datasetID': dsSpec.datasetID,
                 'token': 'dst:{0}'.format(endpoint),
                 'destination': coreName,
             })
     log.debug('ret {0}'.format(str(result)))
     log.debug('done')
     return self.SC_SUCCEEDED, result
コード例 #53
0
 def runImpl(self):
     # cutoff for disk in TB
     diskThreshold = self.taskBufferIF.getConfigValue(
         self.msgType,
         'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 'jedi',
         'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024 * 1024 * 1024
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType,
                                                    'INPUT_NUM_THRESHOLD',
                                                    'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
     cutOffRW = 50
     negWeightTape = 0.001
     minIoIntensityWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_IO_INTENSITY_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minIoIntensityWithLD is None:
         minIoIntensityWithLD = 200
     minInputSizeWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_INPUT_SIZE_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minInputSizeWithLD is None:
         minInputSizeWithLD = 10000
     maxTaskPrioWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MAX_TASK_PRIO_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if maxTaskPrioWithLD is None:
         maxTaskPrioWithLD = 800
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug(
                     '{0} terminating after processing {1} tasks since no more inputs '
                     .format(self.__class__.__name__, self.numTasks))
                 return
             # loop over all tasks
             for taskSpec, inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger,
                     '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                     monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info(
                     'thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'
                     .format(thrInputSize, thrInputNum, thrInputSizeFrac,
                             thrInputNumFrac))
                 # read task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                         taskSpec.jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except Exception:
                     tmpLog.error('failed to read task params')
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     continue
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(
                     taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in siteMapper.nuclei:
                     candidateNucleus = taskSpec.nucleus
                 elif taskSpec.nucleus in siteMapper.satellites:
                     nucleusList = siteMapper.satellites
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleusSpec.state not in ['ACTIVE']:
                             tmpLog.info(
                                 '  skip nucleus={0} due to status={1} criteria=-status'
                                 .format(tmpNucleus, tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed status check'.format(
                             len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info(
                             'skip transfer backlog check due to negative T1Weight'
                         )
                     else:
                         newNucleusList = {}
                         backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei(
                         )
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info(
                                     '  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'
                                     .format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info(
                             '{0} candidates passed transfer backlog check'.
                             format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                         taskSpec.jediTaskID, ['output', 'log'])
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(
                                     tmpDatasetSpec.storageToken
                             ) is not None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssociatedEndpoint(
                                 tmpDatasetSpec.storageToken)
                             if tmpEP is None:
                                 tmpLog.info(
                                     '  skip nucleus={0} since no endpoint with {1} criteria=-match'
                                     .format(tmpNucleus,
                                             tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if tmpEP['state'] not in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP[
                                 'space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                 tmpSpaceToUse = long(
                                     self.fullRW[tmpNucleus] / 10 / 24 /
                                     3600 * 0.25)
                             if tmpSpaceSize - tmpSpaceToUse < diskThreshold:
                                 tmpLog.info(
                                     '  skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space'
                                     .format(tmpNucleus, tmpSpaceSize,
                                             tmpSpaceToUse, diskThreshold,
                                             tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
                             if tmpNucleus not in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': 0,
                                     'free': 0
                                 }
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize -
                                                tmpSpaceToUse) / float(
                                                    tmpEP['space_total'])
                             except Exception:
                                 tmpNew = None
                             if tmpNew is not None and (tmpOld is None
                                                        or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': tmpEP['space_total'],
                                     'free': tmpSpaceSize - tmpSpaceToUse
                                 }
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed endpoint check {1} TB'.
                         format(len(nucleusList), diskThreshold / 1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,
                                                    self.taskBufferIF)
                     tmpSt, tmpRet = jobBroker.doBrokerage(
                         taskSpec, taskSpec.cloud, inputChunk, None, True,
                         tmpSiteList, tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info(
                                 '  skip nucleus={0} due to missing ability to run jobs criteria=-job'
                                 .format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(
                         len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(
                                 datasetSpec.datasetName
                         ) in datasetTypeToSkipCheck:
                             continue
                         # primary only
                         if taskParamMap.get(
                                 'taskBrokerOnMaster'
                         ) is True and not datasetSpec.isMaster():
                             continue
                         # use deep scan for primary dataset unless data carousel
                         if datasetSpec.isMaster(
                         ) and not taskSpec.inputPreStaging():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(
                             siteMapper, self.ddmIF,
                             datasetSpec.datasetName,
                             list(nucleusList.keys()), deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error(
                                 'failed to get nuclei where data is available, since {0}'
                                 .format(tmpRet))
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus, tmpVals in iteritems(tmpRet):
                             if tmpNucleus not in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict(
                                     (k, v + tmpVals[k])
                                     for (k, v) in iteritems(
                                         availableData[tmpNucleus]))
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if taskSpec.inputPreStaging(
                             ) and availableData[tmpNucleus][
                                     'ava_num_any'] > 0:
                                 # use incomplete replicas for data carousel since the completeness is guaranteed
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_size_any'],
                                     availableData[tmpNucleus]['tot_size'],
                                     thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpMsg = '  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_num_any'],
                                     availableData[tmpNucleus]['tot_num'],
                                     thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         totInputSize = list(availableData.values(
                         ))[0]['tot_size'] / 1024 / 1024 / 1024
                         data_locality_check_str = (
                             '(ioIntensity ({0}) is None or less than {1} kBPerS '
                             'and input size ({2} GB) is less than {3}) '
                             'or task.currentPriority ({4}) is higher than or equal to {5}'
                         ).format(taskSpec.ioIntensity,
                                  minIoIntensityWithLD, int(totInputSize),
                                  minInputSizeWithLD,
                                  taskSpec.currentPriority,
                                  maxTaskPrioWithLD)
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         elif ((taskSpec.ioIntensity is None
                               or taskSpec.ioIntensity <= minIoIntensityWithLD)
                               and totInputSize <= minInputSizeWithLD) \
                               or taskSpec.currentPriority >= maxTaskPrioWithLD:
                             availableData = {}
                             tmpLog.info(
                                 '  disable data locality check since no nucleus has input data, {}'
                                 .format(data_locality_check_str))
                         else:
                             # no candidate + unavoidable data locality check
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                             tmpLog.info(
                                 '  the following conditions required to disable data locality check: {}'
                                 .format(data_locality_check_str))
                         tmpLog.info(
                             '{0} candidates passed data check'.format(
                                 len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus not in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[
                                 tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(
                                 nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(
                                 nucleusRW[tmpNucleus], cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                 weight *= float(availableData[tmpNucleus]
                                                 ['ava_size_any'])
                                 weight /= float(
                                     availableData[tmpNucleus]['tot_size'])
                                 wStr += '* ( available_input_size_DISKTAPE={0} )'.format(
                                     availableData[tmpNucleus]
                                     ['ava_size_any'])
                                 wStr += '/ ( total_input_size={0} )'.format(
                                     availableData[tmpNucleus]['tot_size'])
                                 # negative weight for tape
                                 if availableData[tmpNucleus][
                                         'ava_size_any'] > availableData[
                                             tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(
                                         negWeightTape)
                         # fraction of free space
                         if tmpNucleus in fractionFreeSpace:
                             try:
                                 tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                                 weight *= tmpFrac
                                 wStr += '*( free_space={0} )/( total_space={1} )'.format(
                                     fractionFreeSpace[tmpNucleus]['free'],
                                     fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 pass
                         tmpLog.info(
                             '  use nucleus={0} weight={1} {2} criteria=+use'
                             .format(tmpNucleus, weight, wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus, weight))
                     tmpLog.info('final {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # final selection
                     tgtWeight = random.uniform(0, totalWeight)
                     candidateNucleus = None
                     for tmpNucleus, weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus is None:
                         candidateNucleus = nucleusweights[-1][0]
                 ######################################
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                     taskSpec.jediTaskID, ['output', 'log'])
                 # get destinations
                 retMap = {
                     taskSpec.jediTaskID:
                     AtlasBrokerUtils.getDictToSetNucleus(
                         nucleusSpec, tmpDatasetSpecs)
                 }
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info(
                     '  set nucleus={0} with {1} criteria=+set'.format(
                         candidateNucleus, tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task_status=ready'
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                 # update RW table
                 self.prioRW.acquire()
                 for prio, rwMap in iteritems(self.prioRW):
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             errMsg = '{0}.runImpl() failed with {1} {2} '.format(
                 self.__class__.__name__, errtype.__name__, errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
コード例 #54
0
 def runImpl(self):
     """Execute queued task commands until the shared task list is exhausted.

     Repeatedly pulls batches of ``(jediTaskID, commandMap)`` pairs from
     ``self.taskList`` and handles each one according to
     ``commandMap['command']``:

     * ``kill`` / ``finish`` / ``reassign``: look up the PandaIDs for the
       task; when the list is empty the task record is updated, otherwise
       a kill request is sent for those jobs.
     * ``retry`` / ``incexec``: for ``incexec`` the stored task parameters
       are first merged with the JSON document in ``commandMap['comment']``
       and written back; then the task is retried.
     * anything else is logged as an unknown command.

     Returns ``None`` once ``self.taskList.get()`` yields an empty batch.
     Any exception raised while handling a batch is logged and the loop
     moves on to the next batch.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     # get active PandaIDs to be killed
                     pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                     if pandaIDs is None:
                         tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                         tmpStat = Interaction.SC_FAILED
                     # kill jobs or update task
                     if tmpStat == Interaction.SC_SUCCEEDED:
                         if pandaIDs == []:
                             # done since no active jobs
                             tmpLog.info('completed the command')
                             tmpTaskSpec = JediTaskSpec()
                             tmpTaskSpec.jediTaskID = jediTaskID
                             updateTaskStatus = True
                             if commandStr != 'reassign':
                                 # keep oldStatus for task reassignment since it is reset when actually reassigned
                                 tmpTaskSpec.forceUpdate('oldStatus')
                             else:
                                 # extract cloud or site
                                 # NOTE(review): the comment is indexed as
                                 # '<cloud|site>:<name>:<y|...>'; a shorter
                                 # comment raises IndexError, which the outer
                                 # handler logs - confirm the producer always
                                 # sends three fields.
                                 tmpItems = commentStr.split(':')
                                 if tmpItems[0] == 'cloud':
                                     tmpTaskSpec.cloud = tmpItems[1]
                                 else:
                                     tmpTaskSpec.site = tmpItems[1]
                                 # back to oldStatus if necessary
                                 if tmpItems[2] == 'y':
                                     tmpTaskSpec.status = oldStatus
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                     updateTaskStatus = False
                             if updateTaskStatus:
                                 tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                             tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                         else:
                             tmpLog.info('sending kill command')
                             tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                         tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params; they only come back if the
                             # new parameters below supply them again
                             for newKey in ['nFiles','fixedSandbox']:
                                 taskParamMap.pop(newKey,None)
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params (items() works on Python 2 and 3)
                             for newKey,newVal in newParamMap.items():
                                 if newVal is None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild: point the '-a <archive>' constant at the new sandbox
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) is not None:
                                         tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if 'buildSpec' in taskParamMap:
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if 'mergeSpec' in taskParamMap:
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except Exception:
                             # skip this task only; the rest of the batch is still processed
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry failed files
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                     if tmpRet == True:
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except Exception:
             # log and carry on with the next batch; SystemExit and
             # KeyboardInterrupt are deliberately not trapped here
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
コード例 #55
0
ファイル: TaskBroker.py プロジェクト: PanDAWMS/panda-jedi
 def runImpl(self):
     """Check the cloud assignment of queued tasks, one batch at a time.

     Each pass takes up to 100 jediTaskIDs from ``self.taskList``, loads the
     matching task specs, lets the brokerage implementation obtained from
     ``self.implFactory`` check them, and hands the resulting map to
     ``setCloudToTasks_JEDI``. The method returns ``None`` as soon as the list
     hands back no more items. An exception raised while processing a batch
     is logged and the loop carries on with the next batch.
     """
     batchSize = 100
     while True:
         try:
             # take the next slice of the shared list
             jediTaskIDs = self.taskList.get(batchSize)
             nTotal, idxNow = self.taskList.stat()
             # nothing left: this is the only way out of the loop
             if not len(jediTaskIDs):
                 endMsg = '{0} terminating since no more items'.format(
                     self.__class__.__name__)
                 self.logger.debug(endMsg)
                 return
             # per-batch logger
             tmpLog = MsgWrapper(self.logger)
             startMsg = 'start TaskCheckerThread {0}/{1} for jediTaskID={2}'
             tmpLog.info(startMsg.format(idxNow, nTotal, jediTaskIDs))
             # load the TaskSpecs, reporting the IDs that cannot be resolved
             taskSpecList = []
             for jediTaskID in jediTaskIDs:
                 isOK, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                     jediTaskID, False)
                 if not isOK or taskSpec is None:
                     tmpLog.error(
                         'failed to get taskSpec for jediTaskID={0}'.format(
                             jediTaskID))
                     continue
                 taskSpecList.append(taskSpec)
             # no usable spec in this batch
             if not taskSpecList:
                 continue
             # look up the brokerage implementation
             status = Interaction.SC_SUCCEEDED
             tmpLog.info('getting Impl')
             try:
                 impl = self.implFactory.getImpl(self.vo,
                                                 self.prodSourceLabel)
                 if impl is None:
                     # no brokerage is defined for this vo / label
                     undefMsg = 'task broker is undefined for vo={0} sourceLabel={1}'
                     tmpLog.error(
                         undefMsg.format(self.vo, self.prodSourceLabel))
                     status = Interaction.SC_FAILED
             except Exception as exc:
                 tmpLog.error('getImpl failed with {0}:{1}'.format(
                     type(exc).__name__, exc))
                 status = Interaction.SC_FAILED
             # run the check only when an implementation was obtained
             if status == Interaction.SC_SUCCEEDED:
                 tmpLog.info('check with {0}'.format(
                     impl.__class__.__name__))
                 try:
                     status, taskCloudMap = impl.doCheck(taskSpecList)
                 except Exception as exc:
                     tmpLog.error('doCheck failed with {0}:{1}'.format(
                         type(exc).__name__, exc))
                     status = Interaction.SC_FAILED
             # give up on this batch if anything above failed
             if status != Interaction.SC_SUCCEEDED:
                 tmpLog.error('failed to check assignment')
                 continue
             # store the task-to-cloud map returned by the check
             updateRet = self.taskBufferIF.setCloudToTasks_JEDI(taskCloudMap)
             tmpLog.info('done with {0} for {1}'.format(
                 updateRet, str(taskCloudMap)))
         except Exception as exc:
             # reported through the module-level logger
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                 self.__class__.__name__, type(exc).__name__, exc))
コード例 #56
0
ファイル: JobSplitter.py プロジェクト: PanDAWMS/panda-jedi
 def doSplit(self,taskSpec,inputChunk,siteMapper):
     """Split an input chunk into per-site groups of sub-chunks.

     Splitting parameters come from ``taskSpec``; when ``inputChunk.isMerging``
     is set the merge-specific getters and fixed merge defaults are used
     instead. Sub-chunks are then pulled from ``inputChunk`` one per loop
     iteration. A new site candidate is chosen every ``nSubChunks`` (25)
     iterations or right after an empty sub-chunk, and the sub-chunks
     collected for the previous site are flushed to the result at that point.
     The loop stops when no site candidate or no sub-chunk is returned.

     :param taskSpec: task specification supplying the splitting rules
     :param inputChunk: provider of site candidates and sub-chunks
     :param siteMapper: used to look up the site spec of the chosen candidate
     :return: ``(self.SC_SUCCEEDED, returnList)``; every item of ``returnList``
              is a dict with the keys ``siteName``, ``subChunks`` and
              ``siteCandidate``
     """
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0} datasetID={1}>'.format(taskSpec.jediTaskID,inputChunk.masterIndexName))
     tmpLog.debug('start')
     if not inputChunk.isMerging:
         # max number of files per job as given by taskSpec
         maxNumFiles = taskSpec.getMaxNumFilesPerJob()
         # fsize gradients from taskSpec
         sizeGradients  = taskSpec.getOutDiskSize()
         # fsize intercepts from taskSpec
         sizeIntercepts = taskSpec.getWorkDiskSize()
         # walltime gradient: cpuTime when the task uses HS06, walltime otherwise
         if not taskSpec.useHS06():
             walltimeGradient = taskSpec.walltime
         else:
             walltimeGradient = taskSpec.cpuTime
         # number of events per job if defined
         nEventsPerJob = taskSpec.getNumEventsPerJob()
         # number of files per job, ignored for dynamic number of events
         if not taskSpec.dynamicNumEvents():
             nFilesPerJob = taskSpec.getNumFilesPerJob()
         else:
             nFilesPerJob = None
         # fall back to one file per job for scouts when nothing else is
         # requested and the task neither loads XML nor respects split rules
         if nFilesPerJob is None and nEventsPerJob is None and inputChunk.useScout() \
                 and not taskSpec.useLoadXML() and not taskSpec.respectSplitRule():
             nFilesPerJob = 1
         # grouping with boundaryID
         useBoundary = taskSpec.useGroupWithBoundaryID()
         # no gradient per input size outside merging
         sizeGradientsPerInSize = None
         # no cap on the primary output size outside merging
         maxOutSize = None
         # max size per job, enlarged by the default output size when set
         maxSizePerJob = taskSpec.getMaxSizePerJob()
         if maxSizePerJob is not None:
             maxSizePerJob += InputChunk.defaultOutputSize
         # dynamic number of events
         dynNumEvents = taskSpec.dynamicNumEvents()
         # max number of event ranges, filled per site below when dynNumEvents
         maxNumEventRanges = None
         # multiplicity of jobs
         if taskSpec.useJobCloning():
             multiplicity = 1
         else:
             multiplicity = taskSpec.getNumEventServiceConsumer()
         # split with fields
         if taskSpec.getFieldNumToLFN() is not None and taskSpec.useFileAsSourceLFN():
             splitByFields = taskSpec.getFieldNumToLFN()
         else:
             splitByFields = None
     else:
         # set parameters for merging
         maxNumFiles = taskSpec.getMaxNumFilesPerMergeJob()
         sizeGradients = 0
         walltimeGradient = 0
         nFilesPerJob = taskSpec.getNumFilesPerMergeJob()
         nEventsPerJob = taskSpec.getNumEventsPerMergeJob()
         maxSizePerJob = None
         useBoundary = {'inSplit':3}
         dynNumEvents = False
         maxNumEventRanges = None
         multiplicity = None
         # gradient per input size configured for merging
         sizeGradientsPerInSize = self.sizeGradientsPerInSizeForMerge
         # intercepts from taskSpec, raised to the margin configured for merging
         sizeIntercepts = taskSpec.getWorkDiskSize()
         interceptsMergin = self.interceptsMerginForMerge
         if sizeIntercepts < interceptsMergin:
             sizeIntercepts = interceptsMergin
         maxOutSize = taskSpec.getMaxSizePerMergeJob()
         if maxOutSize is None:
             # max output size is 5GB for merging by default
             maxOutSize = 5 * 1024 * 1024 * 1024
         # split with fields
         if taskSpec.getFieldNumToLFN() is not None and taskSpec.useFileAsSourceLFN():
             # materialised as a list so that the value logged below and passed
             # to getSubChunk is a concrete sequence on Python 3 as well
             splitByFields = list(range(4+1,4+1+len(taskSpec.getFieldNumToLFN())))
         else:
             splitByFields = None
     # LB
     respectLB = taskSpec.respectLumiblock()
     # dump
     tmpLog.debug('maxNumFiles={0} sizeGradients={1} sizeIntercepts={2} useBoundary={3}'.format(maxNumFiles,
                                                                                                sizeGradients,
                                                                                                sizeIntercepts,
                                                                                                useBoundary))
     tmpLog.debug('walltimeGradient={0} nFilesPerJob={1} nEventsPerJob={2}'.format(walltimeGradient,
                                                                                     nFilesPerJob,
                                                                                     nEventsPerJob))
     tmpLog.debug('sizeGradientsPerInSize={0} maxOutSize={1} respectLB={2} dynNumEvents={3}'.format(sizeGradientsPerInSize,
                                                                                                    maxOutSize,
                                                                                                    respectLB,
                                                                                                    dynNumEvents))
     tmpLog.debug('multiplicity={0} splitByFields={1} nFiles={2}'.format(multiplicity,str(splitByFields),
                                                                         inputChunk.getNumFilesInMaster()))
     # split
     returnList = []
     subChunks  = []
     iSubChunks = 0
     nSubChunks = 25
     subChunk   = None
     while True:
         # change site on the first pass, every nSubChunks passes, and right
         # after an empty sub-chunk
         if iSubChunks % nSubChunks == 0 or subChunk == []:
             # flush what was collected for the previous site
             if subChunks != []:
                 # get site names for parallel execution
                 if taskSpec.getNumSitesPerJob() > 1 and not inputChunk.isMerging and inputChunk.useJumbo != 'fake':
                     siteName = inputChunk.getParallelSites(taskSpec.getNumSitesPerJob(),
                                                            nSubChunks,[siteName])
                 returnList.append({'siteName':siteName,
                                    'subChunks':subChunks,
                                    'siteCandidate':siteCandidate,
                                    })
                 tmpLog.debug('split to %s subchunks' % len(subChunks))
                 # reset
                 subChunks = []
             # skip unavailable files in distributed datasets
             nSkip = inputChunk.skipUnavailableFiles()
             tmpLog.debug('skipped {0} files'.format(nSkip))
             # new candidate; stop when there is none
             siteCandidate = inputChunk.getOneSiteCandidate(nSubChunks)
             if siteCandidate is None:
                 break
             siteName = siteCandidate.siteName
             siteSpec = siteMapper.getSite(siteName)
             # directIO only when none of the four conditions rules it out
             useDirectIO = not (taskSpec.useLocalIO() or not siteSpec.isDirectIO()
                                or taskSpec.allowInputLAN() is None
                                or inputChunk.isMerging)
             # get maxSize if it is set in taskSpec
             maxSize = maxSizePerJob
             if maxSize is None:
                 # use maxwdir (scaled by 1024*1024) as the default maxSize
                 if not useDirectIO:
                     maxSize = siteSpec.maxwdir * 1024 * 1024
                 elif nEventsPerJob is not None or nFilesPerJob is not None:
                     # directIO with an explicit per-job count: no size limit
                     maxSize = None
                 else:
                     maxSize = max(50000, siteSpec.maxwdir) * 1024 * 1024
             else:
                 # add offset
                 maxSize += sizeIntercepts
             # max disk size
             maxDiskSize = siteSpec.maxwdir * 1024 * 1024
             # max walltime: task value unless merging, else the site value
             maxWalltime = None
             if not inputChunk.isMerging:
                 maxWalltime = taskSpec.getMaxWalltime()
             if maxWalltime is None:
                 maxWalltime = siteSpec.maxtime
             # core count, at least 1
             coreCount = siteSpec.coreCount if siteSpec.coreCount > 0 else 1
             # core power
             corePower = siteSpec.corepower
             # max num of event ranges for dynNumEvents, at least 1
             if dynNumEvents:
                 maxNumEventRanges = int(siteSpec.get_n_sim_events() / taskSpec.get_min_granularity())
                 if maxNumEventRanges == 0:
                     maxNumEventRanges = 1
             tmpLog.debug('chosen {0}'.format(siteName))
             tmpLog.debug('new weight {0}'.format(siteCandidate.weight))
             tmpLog.debug('maxSize={0} maxWalltime={1} coreCount={2} corePower={3} maxNumEventRanges={4} maxDisk={5}'.format(maxSize,maxWalltime,
                                                                                                                             coreCount,corePower,
                                                                                                                             maxNumEventRanges,
                                                                                                                             maxDiskSize))
             tmpLog.debug('useDirectIO={0} label={1}'.format(useDirectIO, taskSpec.prodSourceLabel))
         # get sub chunk
         subChunk = inputChunk.getSubChunk(siteName,maxSize=maxSize,
                                           maxNumFiles=maxNumFiles,
                                           sizeGradients=sizeGradients,
                                           sizeIntercepts=sizeIntercepts,
                                           nFilesPerJob=nFilesPerJob,
                                           walltimeGradient=walltimeGradient,
                                           maxWalltime=maxWalltime,
                                           nEventsPerJob=nEventsPerJob,
                                           useBoundary=useBoundary,
                                           sizeGradientsPerInSize=sizeGradientsPerInSize,
                                           maxOutSize=maxOutSize,
                                           coreCount=coreCount,
                                           respectLB=respectLB,
                                           corePower=corePower,
                                           dynNumEvents=dynNumEvents,
                                           maxNumEventRanges=maxNumEventRanges,
                                           multiplicity=multiplicity,
                                           splitByFields=splitByFields,
                                           tmpLog=tmpLog,
                                           useDirectIO=useDirectIO,
                                           maxDiskSize=maxDiskSize)
         # None ends the split; an empty list only triggers a site change
         if subChunk is None:
             break
         if subChunk != []:
             # append
             subChunks.append(subChunk)
         # counted even when the sub-chunk was empty
         iSubChunks += 1
     # append to return map if remain
     if subChunks != []:
         # get site names for parallel execution
         # NOTE(review): unlike the in-loop flush, this one does not test
         # inputChunk.useJumbo != 'fake' -- confirm whether that is intended
         if taskSpec.getNumSitesPerJob() > 1 and not inputChunk.isMerging:
             siteName = inputChunk.getParallelSites(taskSpec.getNumSitesPerJob(),
                                                    nSubChunks,[siteName])
         returnList.append({'siteName':siteName,
                            'subChunks':subChunks,
                            'siteCandidate':siteCandidate,
                            })
         tmpLog.debug('split to %s subchunks' % len(subChunks))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED,returnList
コード例 #57
0
ファイル: TaskBroker.py プロジェクト: PanDAWMS/panda-jedi
    def start(self):
        """Run the TaskBroker main loop.

        After starting the base classes, loops indefinitely. Each cycle walks
        every combination of vo, prodSourceLabel, aligned work queue and
        resource type and, for each one:

        1. fetches the tasks whose assignment must be checked and processes
           them with a pool of ``TaskCheckerThread`` workers;
        2. fetches the tasks to assign and processes them with a pool of
           ``TaskBrokerThread`` workers.

        An exception raised during a cycle is logged and the cycle ends early.
        The loop then sleeps for what is left of
        ``jedi_config.taskbroker.loopCycle`` and calls ``self.randomSleep``.
        """
        # start base classes
        JediKnight.start(self)
        FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)

        def runWorkers(threadClass, tmpList, *extraArgs):
            # spread tmpList over nWorkers threads of threadClass and wait
            # until the pool has finished
            taskList = ListWithLock(tmpList)
            threadPool = ThreadPool()
            nWorker = jedi_config.taskbroker.nWorkers
            for _ in range(nWorker):
                thr = threadClass(taskList, threadPool, self.taskBufferIF,
                                  self.ddmIF, self, *extraArgs)
                thr.start()
            threadPool.join()

        # go into main loop
        while True:
            startTime = datetime.datetime.utcnow()
            try:
                # get logger
                tmpLog = MsgWrapper(logger)
                tmpLog.debug('start TaskBroker')
                # get work queue mapper
                workQueueMapper = self.taskBufferIF.getWorkQueueMap()
                resource_types = self.taskBufferIF.load_resource_types()

                # loop over all vos
                for vo in self.vos:
                    # loop over all sourceLabels
                    for prodSourceLabel in self.prodSourceLabels:
                        # loop over all work queues
                        for workQueue in workQueueMapper.getAlignedQueueList(
                                vo, prodSourceLabel):
                            for resource_type in resource_types:
                                # queue name with spaces replaced by underscores
                                wq_name = '_'.join(
                                    workQueue.queue_name.split(' '))
                                msgLabel = 'vo={0} label={1} queue={2} resource_type={3}: '.\
                                    format(vo, prodSourceLabel, wq_name, resource_type.resource_name)
                                tmpLog.debug(msgLabel + 'start')
                                # get the list of tasks to check
                                tmpList = self.taskBufferIF.getTasksToCheckAssignment_JEDI(
                                    vo, prodSourceLabel, workQueue,
                                    resource_type.resource_name)
                                if tmpList is None:
                                    # failed
                                    tmpLog.error(
                                        msgLabel +
                                        'failed to get the list of tasks to check'
                                    )
                                else:
                                    tmpLog.debug(msgLabel +
                                                 'got tasks_to_check={0}'.
                                                 format(len(tmpList)))
                                    runWorkers(TaskCheckerThread, tmpList,
                                               vo, prodSourceLabel)
                                # get the list of tasks to assign; this runs
                                # even when the check list could not be fetched
                                tmpList = self.taskBufferIF.getTasksToAssign_JEDI(
                                    vo, prodSourceLabel, workQueue,
                                    resource_type.resource_name)
                                if tmpList is None:
                                    # failed
                                    tmpLog.error(
                                        msgLabel +
                                        'failed to get the list of tasks to assign'
                                    )
                                else:
                                    tmpLog.debug(msgLabel +
                                                 'got tasks_to_assign={0}'.
                                                 format(len(tmpList)))
                                    runWorkers(TaskBrokerThread, tmpList,
                                               vo, prodSourceLabel, workQueue,
                                               resource_type.resource_name)
                                tmpLog.debug(msgLabel + 'done')
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                    self.__class__.__name__, errtype.__name__, errvalue))
            tmpLog.debug('done')
            # sleep if needed
            loopCycle = jedi_config.taskbroker.loopCycle
            timeDelta = datetime.datetime.utcnow() - startTime
            # total_seconds() includes whole days, whereas .seconds is only the
            # sub-day remainder; int() keeps the whole-second granularity
            sleepPeriod = loopCycle - int(timeDelta.total_seconds())
            if sleepPeriod > 0:
                time.sleep(sleepPeriod)
            # randomize cycle
            self.randomSleep(max_val=loopCycle)
コード例 #58
0
    def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
        # make logger
        tmpLog = MsgWrapper(logger,"< jediTaskID={0} >".format(taskSpec.jediTaskID))
        tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
        # returns
        retFatal    = self.SC_FATAL
        retTmpError = self.SC_FAILED
        retOK       = self.SC_SUCCEEDED
        try:
            # get DDM I/F
            ddmIF = self.ddmIF.getInterface(taskSpec.vo)
            # register datasets
            if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
                # prod vs anal
                userSetup = False
                if taskSpec.prodSourceLabel in ['user']:
                    userSetup = True
                    # collect datasetID to register datasets/containers just in case
                    for tmpPandaJob in pandaJobs:
                        if not tmpPandaJob.produceUnMerge():
                            for tmpFileSpec in tmpPandaJob.Files:
                                if tmpFileSpec.type in ['output','log']:
                                    if tmpFileSpec.datasetID not in datasetToRegister:
                                        datasetToRegister.append(tmpFileSpec.datasetID)
                tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
                # get site mapper
                siteMapper = self.taskBufferIF.getSiteMapper()

                # loop over all datasets
                avDatasetList = []
                cnDatasetMap  = {}
                for datasetID in datasetToRegister:
                    # get output and log datasets
                    tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                    tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                                  datasetID)
                    if not tmpStat:
                        tmpLog.error('failed to get output and log datasets')
                        return retFatal
                    if datasetSpec.isPseudo():
                        tmpLog.info('skip pseudo dataset')
                        continue
                    # DDM backend
                    ddmBackEnd = taskSpec.getDdmBackEnd()
                    tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                    # check if dataset and container are available in DDM
                    for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                        if targetName is None:
                            continue
                        if targetName not in avDatasetList:
                            # set lifetime
                            if targetName.startswith('panda'):
                                if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed':
                                    lifetime = 365
                                else:
                                    lifetime = 14
                            else:
                                lifetime = None
                            # check dataset/container in DDM
                            tmpList = ddmIF.listDatasets(targetName)
                            if tmpList == []:
                                # get location
                                location = None
                                locForRule = None
                                if targetName == datasetSpec.datasetName:
                                    # dataset
                                    if datasetSpec.site in ['',None]:
                                        if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                            locForRule = datasetSpec.destination
                                        elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None:
                                            location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                        elif taskSpec.cloud is not None:
                                            # use T1 SE
                                            tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                            location = siteMapper.getDdmEndpoint(tmpT1Name, datasetSpec.storageToken,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                    else:
                                        tmpLog.info('site={0} token={1}'.format(datasetSpec.site, datasetSpec.storageToken))
                                        location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken,
                                                                             taskSpec.prodSourceLabel,
                                                                             JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                if locForRule is None:
                                    locForRule = location
                                # set metadata
                                if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                                    metaData = {}
                                    metaData['task_id'] = taskSpec.jediTaskID
                                    if taskSpec.campaign not in [None,'']:
                                        metaData['campaign'] = taskSpec.campaign
                                    if datasetSpec.getTransient() is not None:
                                        metaData['transient'] = datasetSpec.getTransient()
                                else:
                                    metaData = None
                                # register dataset/container
                                tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                                         location,
                                                                                                                         ddmBackEnd,
                                                                                                                         lifetime,
                                                                                                                         str(metaData)))
                                tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                                   lifetime=lifetime,metaData=metaData)
                                if not tmpStat:
                                    tmpLog.error('failed to register {0}'.format(targetName))
                                    return retFatal
                                # procedures for user
                                if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                    # register location
                                    tmpToRegister = False
                                    if userSetup and targetName == datasetSpec.datasetName and datasetSpec.site not in ['',None]:
                                        if taskSpec.workingGroup:
                                            userName = taskSpec.workingGroup
                                        else:
                                            userName = taskSpec.userName
                                        grouping = None
                                        tmpToRegister = True
                                    elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                        userName = None
                                        grouping = 'NONE'
                                        tmpToRegister = True
                                    if tmpToRegister:
                                        activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                        tmpLog.info('registering location={} lifetime={} days activity={} grouping={} '
                                                    'owner={}'.format(locForRule, lifetime, activity, grouping,
                                                                      userName))
                                        tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                                lifetime=lifetime,backEnd=ddmBackEnd,
                                                                                activity=activity,grouping=grouping)
                                        if not tmpStat:
                                            tmpLog.error('failed to register location {0} for {1}'.format(locForRule,
                                                                                                          targetName))
                                            return retFatal
                                        # double copy
                                        if userSetup and datasetSpec.type == 'output':
                                            if datasetSpec.destination != datasetSpec.site:
                                                tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination,
                                                                                                                                datasetSpec.site))
                                            else:

                                                second_copy = True
                                                try:
                                                    if taskSpec.site:
                                                        panda_site = siteMapper.getSite(taskSpec.site)
                                                        if panda_site.catchall and 'skip_2nd_copy' in panda_site.catchall:
                                                            tmpLog.info('skip making double copy as specified in {0} catchall'.format(panda_site))
                                                            second_copy = False
                                                except Exception:
                                                    second_copy = True

                                                if second_copy:
                                                    locForDouble = '(type=SCRATCHDISK)\\notforextracopy=True'
                                                    tmpMsg  = 'registering double copy '
                                                    tmpMsg += 'location="{0}" lifetime={1}days activity={2} for dataset={3}'.format(locForDouble,lifetime,
                                                                                                                                    activity,targetName)
                                                    tmpLog.info(tmpMsg)
                                                    tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName,
                                                                                            lifetime=lifetime,activity=activity,
                                                                                            grouping='NONE',weight='freespace',
                                                                                            ignore_availability=False)
                                                    if not tmpStat:
                                                        tmpLog.error('failed to register double copylocation {0} for {1}'.format(locForDouble,
                                                                                                                               targetName))
                                                        return retFatal
                                avDatasetList.append(targetName)
                            else:
                                tmpLog.info('{0} already registered'.format(targetName))
                    # check if dataset is in the container
                    if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName:
                        # get list of constituent datasets in the container
                        if datasetSpec.containerName not in cnDatasetMap:
                            cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                        # add dataset
                        if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]:
                            tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                            tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                                   backEnd=ddmBackEnd)
                            if not tmpStat:
                                tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                               datasetSpec.containerName))
                                return retFatal
                            cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                        else:
                            tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                    # update dataset
                    datasetSpec.status = 'registered'
                    self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                      'datasetID':datasetID})
            # register ES datasets
            if taskSpec.registerEsFiles():
                targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID)
                location = None
                metaData = {}
                metaData['task_id'] = taskSpec.jediTaskID
                metaData['hidden']  = True
                tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName,
                                                                                           location,
                                                                                           str(metaData)))
                tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData,
                                                   resurrect=True)
                if not tmpStat:
                    tmpLog.error('failed to register ES dataset {0}'.format(targetName))
                    return retFatal
                # register rule
                location = 'type=DATADISK'
                activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                grouping = 'NONE'
                tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location,
                                                                                        activity,
                                                                                        grouping))
                tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity,
                                                        grouping=grouping)
                if not tmpStat:
                    tmpLog.error('failed to register location {0} with {2} for {1}'.format(location,
                                                                                           targetName,
                                                                                           activity))
                    return retFatal
            # open datasets
            if taskSpec.prodSourceLabel in ['managed','test']:
                # get the list of output/log datasets
                outDatasetList = []
                for tmpPandaJob in pandaJobs:
                    for tmpFileSpec in tmpPandaJob.Files:
                        if tmpFileSpec.type in ['output','log']:
                            if tmpFileSpec.destinationDBlock not in outDatasetList:
                                outDatasetList.append(tmpFileSpec.destinationDBlock)
                # open datasets
                for outDataset in outDatasetList:
                    tmpLog.info('open {0}'.format(outDataset))
                    ddmIF.openDataset(outDataset)
                    # unset lifetime
                    ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
            # return
            tmpLog.info('done')
            return retOK
        except Exception:
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retFatal
コード例 #59
0
 def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue):
     """Build one JobSpec per task and submit them to the task assigner.

     For every (taskSpec, cloudName, inputChunk) entry a JobSpec is filled
     from the task attributes and the input files of the chunk. RW values
     are fetched through ``self.taskBufferIF`` and serialized, together
     with priority and processing-type maps, into ``jobSpec.metadata``.
     The JobSpecs are then passed to ``PandaClient.runTaskAssignment`` in
     bunches of at most ``maxBunchTask``.

     Args:
         inputList: iterable of ``(jediTaskID, tmpInputList)`` pairs, where
             ``tmpInputList`` is an iterable of
             ``(taskSpec, cloudName, inputChunk)`` tuples.
         vo: forwarded to ``calculateRWwithPrio_JEDI`` and logged.
         prodSourceLabel: forwarded to ``calculateRWwithPrio_JEDI`` and
             logged. Note that the JobSpecs themselves are always labelled
             'managed'.
         workQueue: object exposing ``queue_name``; forwarded to
             ``calculateRWwithPrio_JEDI`` for the per-priority lookup.

     Returns:
         ``self.SC_FAILED`` if any RW calculation returns None, otherwise
         ``self.SC_SUCCEEDED``. The status returned by
         ``PandaClient.runTaskAssignment`` is only logged and does not
         affect the return value.
     """
     # max number of tasks handed to the task assigner per call
     maxBunchTask = 100
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doBrokerage')
     # return for failure
     retTmpError = self.SC_FAILED
     tmpLog.debug('vo={0} label={1} queue={2}'.format(
         vo, prodSourceLabel, workQueue.queue_name))
     # loop over all tasks
     allRwMap = {}   # currentPriority -> RW returned for that priority
     prioMap = {}    # taskID -> currentPriority
     tt2Map = {}     # taskID -> processingType
     expRWs = {}     # taskID -> expected RW of the task
     jobSpecList = []
     for tmpJediTaskID, tmpInputList in inputList:
         for taskSpec, cloudName, inputChunk in tmpInputList:
             # make JobSpec to be submitted for TaskAssigner
             jobSpec = JobSpec()
             jobSpec.taskID = taskSpec.jediTaskID
             jobSpec.jediTaskID = taskSpec.jediTaskID
             # set managed to trigger TA
             jobSpec.prodSourceLabel = 'managed'
             jobSpec.processingType = taskSpec.processingType
             jobSpec.workingGroup = taskSpec.workingGroup
             # placeholder; replaced by the full metadata string below
             jobSpec.metadata = taskSpec.processingType
             jobSpec.assignedPriority = taskSpec.taskPriority
             jobSpec.currentPriority = taskSpec.currentPriority
             # NOTE(review): presumably converts bytes to MB - confirm.
             # Under Python 3 this division yields a float.
             jobSpec.maxDiskCount = (
                 taskSpec.getOutDiskSize() +
                 taskSpec.getWorkDiskSize()) / 1024 / 1024
             if taskSpec.useWorldCloud():
                 # use destinationSE to trigger task brokerage in WORLD cloud
                 jobSpec.destinationSE = taskSpec.cloud
             # name of the last dataset seen; fallback when there is no master
             prodDBlock = None
             setProdDBlock = False
             for datasetSpec in inputChunk.getDatasets():
                 prodDBlock = datasetSpec.datasetName
                 if datasetSpec.isMaster():
                     jobSpec.prodDBlock = datasetSpec.datasetName
                     setProdDBlock = True
                 for fileSpec in datasetSpec.Files:
                     tmpInFileSpec = fileSpec.convertToJobFileSpec(
                         datasetSpec)
                     jobSpec.addFile(tmpInFileSpec)
             # use secondary dataset name as prodDBlock
             if not setProdDBlock and prodDBlock is not None:
                 jobSpec.prodDBlock = prodDBlock
             # append
             jobSpecList.append(jobSpec)
             prioMap[jobSpec.taskID] = jobSpec.currentPriority
             tt2Map[jobSpec.taskID] = jobSpec.processingType
             # get RW for a priority (computed once per distinct priority)
             if jobSpec.currentPriority not in allRwMap:
                 tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(
                     vo, prodSourceLabel, workQueue,
                     jobSpec.currentPriority)
                 if tmpRW is None:
                     tmpLog.error(
                         'failed to calculate RW with prio={0}'.format(
                             jobSpec.currentPriority))
                     return retTmpError
                 allRwMap[jobSpec.currentPriority] = tmpRW
             # get expected RW
             expRW = self.taskBufferIF.calculateTaskRW_JEDI(
                 jobSpec.jediTaskID)
             if expRW is None:
                 tmpLog.error(
                     'failed to calculate RW for jediTaskID={0}'.format(
                         jobSpec.jediTaskID))
                 return retTmpError
             expRWs[jobSpec.taskID] = expRW
     # get fullRWs
     fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(
         vo, prodSourceLabel, None, None)
     if fullRWs is None:
         tmpLog.error('failed to calculate full RW')
         return retTmpError
     # set metadata; the maps are identical for every job, so they are
     # serialized once outside the loop
     strExpRWs = str(expRWs)
     strPrioMap = str(prioMap)
     strFullRWs = str(fullRWs)
     strTt2Map = str(tt2Map)
     for jobSpec in jobSpecList:
         rwValues = allRwMap[jobSpec.currentPriority]
         jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (
             jobSpec.metadata, str(rwValues), strExpRWs, strPrioMap,
             strFullRWs, strTt2Map)
     tmpLog.debug('run task assigner for {0} tasks'.format(
         len(jobSpecList)))
     for nBunchTask in range(0, len(jobSpecList), maxBunchTask):
         # get a bunch
         jobsBunch = jobSpecList[nBunchTask:nBunchTask + maxBunchTask]
         strIDs = 'jediTaskID=' + ','.join(
             '{0}'.format(tmpJobSpec.taskID) for tmpJobSpec in jobsBunch)
         tmpLog.debug(strIDs)
         # run task brokerage
         stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
         tmpLog.debug('{0}:{1}'.format(stS, str(outSs)))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
コード例 #60
0
 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(
                             taskSpec.jediTaskID,
                             datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # get primary site candidates
     sitePreAssigned = False
     excludeList = []
     includeList = None
     scanSiteList = []
     # get list of site access
     siteAccessList = self.taskBufferIF.listSiteAccess(
         None, taskSpec.userName)
     siteAccessMap = {}
     for tmpSiteName, tmpAccess in siteAccessList:
         siteAccessMap[tmpSiteName] = tmpAccess
     # site limitation
     if taskSpec.useLimitedSites():
         if 'excludedSite' in taskParamMap:
             excludeList = taskParamMap['excludedSite']
             # str to list for task retry
             try:
                 if type(excludeList) != types.ListType:
                     excludeList = excludeList.split(',')
             except:
                 pass
         if 'includedSite' in taskParamMap:
             includeList = taskParamMap['includedSite']
             # str to list for task retry
             if includeList == '':
                 includeList = None
             try:
                 if type(includeList) != types.ListType:
                     includeList = includeList.split(',')
             except:
                 pass
     # loop over all sites
     for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
         if tmpSiteSpec.type == 'analysis':
             scanSiteList.append(siteName)
     # preassigned
     if not taskSpec.site in ['', None]:
         # site is pre-assigned
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         sitePreAssigned = True
         if not taskSpec.site in scanSiteList:
             scanSiteList.append(taskSpec.site)
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     # allowed remote access protocol
     allowedRemoteProtocol = 'fax'
     # MP
     if taskSpec.coreCount != None and taskSpec.coreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskSpec.coreCount == 0:
         # use MCORE and normal
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status in ['offline']:
             skipFlag = True
         elif tmpSiteSpec.status in ['brokeroff', 'test']:
             if not sitePreAssigned:
                 skipFlag = True
             elif tmpSiteName != taskSpec.site:
                 skipFlag = True
         if not skipFlag:
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug(
                 '  skip site=%s due to status=%s criteria=-status' %
                 (tmpSiteName, tmpSiteSpec.status))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                     (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(
             len(scanSiteList), useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if taskSpec.transHome.startswith('ROOT'):
             # hack until x86_64-slc6-gcc47-opt is published in installedsw
             if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                 tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
             else:
                 tmpCmtConfig = taskSpec.architecture
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                 scanSiteList, cmtConfig=tmpCmtConfig, onlyCmtConfig=True)
         elif 'AthAnalysis' in taskSpec.transHome or re.search(
                 'Ath[a-zA-Z]+Base', taskSpec.transHome) != None:
             # AthAnalysis
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                 scanSiteList,
                 cmtConfig=taskSpec.architecture,
                 onlyCmtConfig=True)
         else:
             # remove AnalysisTransforms-
             transHome = re.sub('^[^-]+-*', '', taskSpec.transHome)
             transHome = re.sub('_', '-', transHome)
             if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \
                     re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None :
                 # cache is checked
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList,
                     caches=transHome,
                     cmtConfig=taskSpec.architecture)
             elif transHome == '' and taskSpec.transUses != None:
                 # remove Atlas-
                 transUses = taskSpec.transUses.split('-')[-1]
                 # release is checked
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList,
                     releases=transUses,
                     cmtConfig=taskSpec.architecture)
             else:
                 # nightlies
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList, releases='CVMFS')
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format(
             len(scanSiteList), taskSpec.transUses, taskSpec.transHome,
             taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount = inputChunk.getMaxRamCount()
     minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
     if not minRamCount in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # site max memory requirement
             if not tmpSiteSpec.maxrss in [0, None]:
                 site_maxmemory = tmpSiteSpec.maxrss
             else:
                 site_maxmemory = tmpSiteSpec.maxmemory
             if not site_maxmemory in [
                     0, None
             ] and minRamCount != 0 and minRamCount > site_maxmemory:
                 tmpLog.debug(
                     '  skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'
                     .format(tmpSiteName, site_maxmemory, minRamCount))
                 continue
             # site min memory requirement
             if not tmpSiteSpec.minrss in [0, None]:
                 site_minmemory = tmpSiteSpec.minrss
             else:
                 site_minmemory = tmpSiteSpec.minmemory
             if not site_minmemory in [
                     0, None
             ] and minRamCount != 0 and minRamCount < site_minmemory:
                 tmpLog.debug(
                     '  skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'
                     .format(tmpSiteName, site_minmemory, minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
             len(scanSiteList), minRamCount, taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     tmpMaxAtomSize = inputChunk.getMaxAtomSize()
     tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
     tmpOutDiskSize = taskSpec.getOutDiskSize()
     tmpWorkDiskSize = taskSpec.getWorkDiskSize()
     minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize
         minDiskCountR = minDiskCountR / 1024 / 1024
     tmpLog.debug(
         'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'
         .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize,
                 tmpWorkDiskSize))
     tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format(
         minDiskCountS, minDiskCountR))
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip site={0} due to small scratch disk={1} < {2} criteria=-disk'
                     .format(tmpSiteName, tmpSiteSpec.maxwdir,
                             minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check endpoint
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint(
             tmpSiteSpec.ddm)
         if tmpEndPoint is not None:
             # free space must be >= 200GB
             diskThreshold = 200
             tmpSpaceSize = 0
             if tmpEndPoint['space_expired'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_expired']
             if tmpEndPoint['space_free'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_free']
             if tmpSpaceSize < diskThreshold:
                 tmpLog.debug(
                     '  skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'
                     .format(tmpSiteName, tmpSpaceSize, diskThreshold))
                 continue
             # check if blacklisted
             if tmpEndPoint['blacklisted'] == 'Y':
                 tmpLog.debug(
                     '  skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'
                     .format(tmpSiteName, tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0, None] and minWalltime > 0:
         minWalltime *= tmpEffAtomSize
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                 'updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug(
                 '  skip site=%s due to no pilot criteria=-nopilot' %
                 tmpSiteName)
             if not self.testMode:
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # check inclusion and exclusion
     newScanSiteList = []
     sitesForANY = []
     for tmpSiteName in scanSiteList:
         autoSite = False
         # check exclusion
         if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList):
             tmpLog.debug(
                 '  skip site={0} excluded criteria=-excluded'.format(
                     tmpSiteName))
             continue
         # check inclusion
         if includeList != None and not AtlasBrokerUtils.isMatched(
                 tmpSiteName, includeList):
             if 'AUTO' in includeList:
                 autoSite = True
             else:
                 tmpLog.debug(
                     '  skip site={0} not included criteria=-notincluded'.
                     format(tmpSiteName))
                 continue
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limited access
         if tmpSiteSpec.accesscontrol == 'grouplist':
             if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                     siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                 tmpLog.debug(
                     '  skip site={0} limited access criteria=-limitedaccess'
                     .format(tmpSiteName))
                 continue
         # check cloud
         if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]:
             tmpLog.debug(
                 '  skip site={0} cloud mismatch criteria=-cloudmismatch'.
                 format(tmpSiteName))
             continue
         if autoSite:
             sitesForANY.append(tmpSiteName)
         else:
             newScanSiteList.append(tmpSiteName)
     # use AUTO sites if no sites are included
     if newScanSiteList == []:
         newScanSiteList = sitesForANY
     else:
         for tmpSiteName in sitesForANY:
             tmpLog.debug(
                 '  skip site={0} not included criteria=-notincluded'.
                 format(tmpSiteName))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for data availability
     hasDDS = False
     dataWeight = {}
     remoteSourceList = {}
     if inputChunk.getDatasets() != []:
         oldScanSiteList = copy.copy(scanSiteList)
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug(
                     'getting the list of sites where {0} is available'.
                     format(datasetName))
                 tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData(
                     scanSiteList, self.siteMapper, self.ddmIF, datasetName)
                 if tmpSt in [
                         Interaction.JEDITemporaryError,
                         Interaction.JEDITimeoutError
                 ]:
                     tmpLog.error(
                         'temporary failed to get the list of sites where data is available, since %s'
                         % tmpRet)
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == Interaction.JEDIFatalError:
                     tmpLog.error(
                         'fatal error when getting the list of sites where data is available, since %s'
                         % tmpRet)
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 if datasetName.startswith('ddo'):
                     tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                 else:
                     tmpLog.debug(' {0} sites : {1}'.format(
                         len(tmpRet), str(tmpRet)))
                     # check if distributed
                     if tmpRet != {}:
                         isDistributed = True
                         for tmpMap in tmpRet.values():
                             for tmpVal in tmpMap.values():
                                 if tmpVal['state'] == 'complete':
                                     isDistributed = False
                                     break
                             if not isDistributed:
                                 break
                         if isDistributed:
                             # check if really distributed
                             isDistributed = self.ddmIF.isDistributedDataset(
                                 datasetName)
                             if isDistributed:
                                 hasDDS = True
                                 datasetSpec.setDistributed()
                                 tmpLog.debug(' {0} is distributed'.format(
                                     datasetName))
             # check if the data is available at somewhere
             if self.dataSiteMap[datasetName] == {}:
                 tmpLog.error(
                     '{0} is unavailable at any site'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 # send info to logger
                 self.sendLogMessage(tmpLog)
                 return retFatal
         # get the list of sites where data is available
         scanSiteList = None
         scanSiteListOnDisk = None
         normFactor = 0
         for datasetName, tmpDataSite in self.dataSiteMap.iteritems():
             normFactor += 1
             # get sites where replica is available
             tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                 tmpDataSite, includeTape=True)
             tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                 tmpDataSite, includeTape=False)
             # get sites which can remotely access source sites
             if inputChunk.isMerging:
                 # disable remote access for merging
                 tmpSatelliteSites = {}
             elif (not sitePreAssigned) or (
                     sitePreAssigned and not taskSpec.site in tmpSiteList):
                 tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(
                     tmpDiskSiteList,
                     self.taskBufferIF,
                     self.siteMapper,
                     nSites=50,
                     protocol=allowedRemoteProtocol)
             else:
                 tmpSatelliteSites = {}
             # make weight map for local
             for tmpSiteName in tmpSiteList:
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = 0
                 # give more weight to disk
                 if tmpSiteName in tmpDiskSiteList:
                     dataWeight[tmpSiteName] += 1
                 else:
                     dataWeight[tmpSiteName] += 0.001
             # make weight map for remote
             for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems(
             ):
                 # skip since local data is available
                 if tmpSiteName in tmpSiteList:
                     continue
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # negative weight for remote access
                 wRemote = 50.0
                 if not tmpSiteSpec.wansinklimit in [0, None]:
                     wRemote /= float(tmpSiteSpec.wansinklimit)
                 # sum weight
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = float(
                         tmpWeightSrcMap['weight']) / wRemote
                 else:
                     dataWeight[tmpSiteName] += float(
                         tmpWeightSrcMap['weight']) / wRemote
                 # make remote source list
                 if not remoteSourceList.has_key(tmpSiteName):
                     remoteSourceList[tmpSiteName] = {}
                 remoteSourceList[tmpSiteName][
                     datasetName] = tmpWeightSrcMap['source']
             # first list
             if scanSiteList == None:
                 scanSiteList = []
                 for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     if not tmpSiteName in scanSiteList:
                         scanSiteList.append(tmpSiteName)
                 scanSiteListOnDisk = set()
                 for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys(
                 ):
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     scanSiteListOnDisk.add(tmpSiteName)
                 continue
             # pickup sites which have all data
             newScanList = []
             for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                     newScanList.append(tmpSiteName)
             scanSiteList = newScanList
             tmpLog.debug('{0} is available at {1} sites'.format(
                 datasetName, len(scanSiteList)))
             # pickup sites which have all data on DISK
             newScanListOnDisk = set()
             for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteListOnDisk:
                     newScanListOnDisk.add(tmpSiteName)
             scanSiteListOnDisk = newScanListOnDisk
             tmpLog.debug('{0} is available at {1} sites on DISK'.format(
                 datasetName, len(scanSiteListOnDisk)))
         # check for preassigned
         if sitePreAssigned and not taskSpec.site in scanSiteList:
             scanSiteList = []
             tmpLog.debug(
                 'data is unavailable locally or remotely at preassigned site {0}'
                 .format(taskSpec.site))
         elif len(scanSiteListOnDisk) > 0:
             # use only disk sites
             scanSiteList = list(scanSiteListOnDisk)
         tmpLog.debug('{0} candidates have input data'.format(
             len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retFatal
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # calculate weight
     fqans = taskSpec.makeFQANs()
     """
     tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                               taskSpec.workingGroup,True)
     currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
     currentPriority -= 500
     tmpLog.debug('currentPriority={0}'.format(currentPriority))
     """
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
         taskSpec.vo, taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # check for preassigned
     if sitePreAssigned and not taskSpec.site in scanSiteList:
         tmpLog.debug("preassigned site {0} did not pass all tests".format(
             taskSpec.site))
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retFatal
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     timeWindowForFC = 6
     preSiteCandidateSpec = None
     failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(
         taskSpec.jediTaskID, timeWindowForFC)
     problematicSites = set()
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
         nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'starting',
                                                 None, None)
         nFailed = 0
         nClosed = 0
         nFinished = 0
         if tmpSiteName in failureCounts:
             if 'failed' in failureCounts[tmpSiteName]:
                 nFailed = failureCounts[tmpSiteName]['failed']
             if 'closed' in failureCounts[tmpSiteName]:
                 nClosed = failureCounts[tmpSiteName]['closed']
             if 'finished' in failureCounts[tmpSiteName]:
                 nFinished = failureCounts[tmpSiteName]['finished']
         # problematic sites
         if nFailed + nClosed > 2 * nFinished:
             problematicSites.add(tmpSiteName)
         # calculate weight
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              nStarting + 1)
         nThrottled = 0
         if remoteSourceList.has_key(tmpSiteName):
             nThrottled = AtlasBrokerUtils.getNumJobs(
                 jobStatPrioMap, tmpSiteName, 'throttled', None, None)
             weight /= float(nThrottled + 1)
         # noramize weights by taking data availability into account
         tmpDataWeight = 1
         if dataWeight.has_key(tmpSiteName):
             weight = weight * dataWeight[tmpSiteName]
             tmpDataWeight = dataWeight[tmpSiteName]
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # preassigned
         if sitePreAssigned and tmpSiteName == taskSpec.site:
             preSiteCandidateSpec = siteCandidateSpec
         # set weight
         siteCandidateSpec.weight = weight
         tmpStr = '  site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(
             tmpSiteName, nRunning, nAssigned, nActivated, nStarting)
         tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(
             nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight)
         tmpLog.debug(tmpStr)
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # sort candidates by weights
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight
     # limit the number of sites. use all sites for distributed datasets
     if not hasDDS:
         maxNumSites = 10
         # remove problematic sites
         candidateSpecList = AtlasBrokerUtils.skipProblematicSites(
             candidateSpecList, problematicSites, sitesUsedByTask,
             preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog)
     # append preassigned
     if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList:
         candidateSpecList.append(preSiteCandidateSpec)
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # get list of available files
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             fileScanSiteList = []
             for tmpSiteName in scanSiteList:
                 fileScanSiteList.append(tmpSiteName)
                 if remoteSourceList.has_key(
                         tmpSiteName
                 ) and remoteSourceList[tmpSiteName].has_key(
                         datasetSpec.datasetName):
                     for tmpRemoteSite in remoteSourceList[tmpSiteName][
                             datasetSpec.datasetName]:
                         if not tmpRemoteSite in fileScanSiteList:
                             fileScanSiteList.append(tmpRemoteSite)
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(
                 fileScanSiteList, self.siteMapper)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(
                 datasetSpec,
                 siteStorageEP,
                 self.siteMapper,
                 ngGroup=[2],
                 checkCompleteness=checkCompleteness)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError, 'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' %
                          (errtype.__name__, errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # preassigned
         if sitePreAssigned and tmpSiteName != taskSpec.site:
             tmpLog.debug(
                 '  skip site={0} non pre-assigned site criteria=-nonpreassigned'
                 .format(tmpSiteName))
             continue
         # set available files
         if inputChunk.getDatasets() == []:
             isAvailable = True
         else:
             isAvailable = False
         for tmpDatasetName, availableFiles in availableFileMap.iteritems():
             tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
             # check remote files
             if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[
                     tmpSiteName].has_key(tmpDatasetName):
                 for tmpRemoteSite in remoteSourceList[tmpSiteName][
                         tmpDatasetName]:
                     if availableFiles.has_key(tmpRemoteSite) and \
                             len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                         # use only remote disk files
                         siteCandidateSpec.remoteFiles += availableFiles[
                             tmpRemoteSite]['localdisk']
                         # set remote site and access protocol
                         siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                         siteCandidateSpec.remoteSource = tmpRemoteSite
                         isAvailable = True
                         break
             # local files
             if availableFiles.has_key(tmpSiteName):
                 if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                         (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                     siteCandidateSpec.localDiskFiles += availableFiles[
                         tmpSiteName]['localdisk']
                     # add cached files to local list since cached files go to pending when reassigned
                     siteCandidateSpec.localDiskFiles += availableFiles[
                         tmpSiteName]['cache']
                     siteCandidateSpec.localTapeFiles += availableFiles[
                         tmpSiteName]['localtape']
                     siteCandidateSpec.cacheFiles += availableFiles[
                         tmpSiteName]['cache']
                     siteCandidateSpec.remoteFiles += availableFiles[
                         tmpSiteName]['remote']
                     siteCandidateSpec.addAvailableFiles(
                         availableFiles[tmpSiteName]['all'])
                     isAvailable = True
                 else:
                     tmpMsg = '{0} is incompete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                     tmpLog.debug(
                         tmpMsg.format(
                             tmpDatasetName,
                             tmpSiteName,
                             len(tmpDatasetSpec.Files),
                             len(availableFiles[tmpSiteName]['localdisk']),
                             len(availableFiles[tmpSiteName]['cache']),
                             len(availableFiles[tmpSiteName]['localtape']),
                         ))
             if not isAvailable:
                 break
         # append
         if not isAvailable:
             tmpLog.debug(
                 '  skip site={0} file unavailable criteria=-fileunavailable'
                 .format(siteCandidateSpec.siteName))
             continue
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug(
             '  use site={0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5} criteria=+use'
             .format(
                 siteCandidateSpec.siteName,
                 siteCandidateSpec.weight,
                 len(siteCandidateSpec.localDiskFiles),
                 len(siteCandidateSpec.localTapeFiles),
                 len(siteCandidateSpec.cacheFiles),
                 len(siteCandidateSpec.remoteFiles),
             ))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # send info to logger
     self.sendLogMessage(tmpLog)
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk