def doActionForReassign(self, gTmpLog):
    # get DDM I/F
    ddmIF = self.ddmIF.getInterface(self.vo)
    # get site mapper
    siteMapper = self.taskBufferIF.getSiteMapper()
    # get tasks to get reassigned
    taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo, self.prodSourceLabel)
    gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
    for taskSpec in taskList:
        tmpLog = MsgWrapper(logger, '< jediTaskID={0} >'.format(taskSpec.jediTaskID))
        tmpLog.debug('start to reassign')
        # DDM backend
        ddmBackEnd = taskSpec.getDdmBackEnd()
        # get datasets
        tmpStat, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                    ['output', 'log'])
        if tmpStat is not True:
            tmpLog.error('failed to get datasets')
            continue
        # update DB
        if not taskSpec.useWorldCloud():
            # update cloudtasks
            tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi', taskSpec.jediTaskID, taskSpec.cloud,
                                                           'assigned', True)
            if tmpStat != 'SUCCEEDED':
                tmpLog.error('failed to update CloudTasks')
                continue
            # check cloud
            if not siteMapper.checkCloud(taskSpec.cloud):
                tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                continue
        else:
            # re-run task brokerage
            if taskSpec.nucleus in [None, '']:
                taskSpec.status = 'assigning'
                taskSpec.oldStatus = None
                taskSpec.setToRegisterDatasets()
                self.taskBufferIF.updateTask_JEDI(taskSpec, {'jediTaskID': taskSpec.jediTaskID},
                                                  setOldModTime=True)
                tmpLog.debug('#ATM #KV label=managed action=trigger_new_brokerage by setting task_status={0}'.
                             format(taskSpec.status))
                continue
            # get nucleus
            nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
            if nucleusSpec is None:
                tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                continue
            # set nucleus
            retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec, datasetSpecList)}
            tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
        # get T1/nucleus
        if not taskSpec.useWorldCloud():
            t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
        else:
            t1SiteName = nucleusSpec.getOnePandaSite()
        t1Site = siteMapper.getSite(t1SiteName)
        # loop over all datasets
        isOK = True
        for datasetSpec in datasetSpecList:
            tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
            if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                continue
            # get location
            location = siteMapper.getDdmEndpoint(t1Site.sitename, datasetSpec.storageToken,
                                                 taskSpec.prodSourceLabel,
                                                 JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
            # make subscription
            try:
                tmpLog.debug('registering subscription to {0} with backend={1}'.format(location, ddmBackEnd))
                tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName, location,
                                                            'Production Output', asynchronous=True)
                if tmpStat is not True:
                    tmpLog.error("failed to make subscription")
                    isOK = False
                    break
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                tmpLog.warning('failed to make subscription with {0}:{1}'.format(errtype.__name__, errvalue))
                isOK = False
                break
        # succeeded
        if isOK:
            # activate task
            if taskSpec.oldStatus in ['assigning', 'exhausted', None]:
                taskSpec.status = 'ready'
            else:
                taskSpec.status = taskSpec.oldStatus
            taskSpec.oldStatus = None
            self.taskBufferIF.updateTask_JEDI(taskSpec, {'jediTaskID': taskSpec.jediTaskID},
                                              setOldModTime=True)
            tmpLog.debug('finished to reassign')
def doActionForReassgin(self, gTmpLog):
    # get DDM I/F
    ddmIF = self.ddmIF.getInterface(self.vo)
    # get site mapper
    siteMapper = self.taskBufferIF.getSiteMapper()
    # get tasks to get reassigned
    taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo, self.prodSourceLabel)
    gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
    for taskSpec in taskList:
        tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
        tmpLog.debug('start to reassign')
        # DDM backend
        ddmBackEnd = taskSpec.getDdmBackEnd()
        # get datasets
        tmpStat, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                    ['output', 'log'])
        if tmpStat is not True:
            tmpLog.error('failed to get datasets')
            continue
        # update DB
        if not taskSpec.useWorldCloud():
            # update cloudtasks
            tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi', taskSpec.jediTaskID, taskSpec.cloud,
                                                           'assigned', True)
            if tmpStat != 'SUCCEEDED':
                tmpLog.error('failed to update CloudTasks')
                continue
            # check cloud
            if not siteMapper.checkCloud(taskSpec.cloud):
                tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                continue
        else:
            # re-run task brokerage
            if taskSpec.nucleus in [None, '']:
                taskSpec.status = 'assigning'
                taskSpec.oldStatus = None
                taskSpec.setToRegisterDatasets()
                self.taskBufferIF.updateTask_JEDI(taskSpec, {'jediTaskID': taskSpec.jediTaskID},
                                                  setOldModTime=True)
                tmpLog.debug('set task_status={0} to trigger task brokerage again'.format(taskSpec.status))
                continue
            # get nucleus
            nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
            if nucleusSpec is None:
                tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                continue
            # set nucleus
            retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec, datasetSpecList)}
            tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
        # get T1/nucleus
        if not taskSpec.useWorldCloud():
            t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
        else:
            t1SiteName = nucleusSpec.getOnePandaSite()
        t1Site = siteMapper.getSite(t1SiteName)
        # loop over all datasets
        isOK = True
        for datasetSpec in datasetSpecList:
            tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
            if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                continue
            # get location
            location = siteMapper.getDdmEndpoint(t1Site.sitename, datasetSpec.storageToken)
            # make subscription
            try:
                tmpLog.debug('registering subscription to {0} with backend={1}'.format(location, ddmBackEnd))
                tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName, location,
                                                            'Production Output', asynchronous=True)
                if tmpStat is not True:
                    tmpLog.error("failed to make subscription")
                    isOK = False
                    break
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                tmpLog.warning('failed to make subscription with {0}:{1}'.format(errtype.__name__, errvalue))
                isOK = False
                break
        # succeeded
        if isOK:
            # activate task
            if taskSpec.oldStatus in ['assigning', 'exhausted', None]:
                taskSpec.status = 'ready'
            else:
                taskSpec.status = taskSpec.oldStatus
            taskSpec.oldStatus = None
            self.taskBufferIF.updateTask_JEDI(taskSpec, {'jediTaskID': taskSpec.jediTaskID},
                                              setOldModTime=True)
            tmpLog.debug('finished to reassign')
def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resource_name):
    # params
    nBunch = 4
    threshold = 2.0
    nJobsInBunchMax = 600
    nJobsInBunchMin = 500
    minTotalWalltime = 50 * 1000 * 1000
    nWaitingLimit = 4
    nWaitingBunchLimit = 2
    nParallel = 2
    nParallelCap = 5
    # make logger
    tmpLog = MsgWrapper(logger)
    workQueueID = workQueue.getID()
    workQueueName = workQueue.queue_name
    workQueueName = '_'.join(workQueue.queue_name.split(' '))
    msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(vo, prodSourceLabel, cloudName,
                                                                        workQueueName, resource_name)
    tmpLog.debug('{0} start workQueueID={1}'.format(msgHeader, workQueueID))

    # get central configuration values
    config_map = self.__getConfiguration(vo, workQueue.queue_name, resource_name)
    configQueueLimit = config_map[NQUEUELIMIT]['value']
    configQueueCap = config_map[NQUEUECAP]['value']
    configRunningCap = config_map[NRUNNINGCAP]['value']
    tmpLog.debug(msgHeader + ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
                 .format(configQueueLimit, configQueueCap, configRunningCap))

    # check if unthrottled
    if not workQueue.throttled:
        msgBody = "PASS unthrottled since GS_throttled is False"
        tmpLog.info(msgHeader + " " + msgBody)
        return self.retUnThrottled

    # get the jobs statistics for our wq/gs and expand the stats map
    jobstats_map = self.__prepareJobStats(workQueue, resource_name, config_map)
    nRunning_rt = jobstats_map['nRunning_rt']
    nRunning_gs = jobstats_map['nRunning_gs']
    nRunning_runningcap = jobstats_map['nRunning_runningcap']
    nNotRun_rt = jobstats_map['nNotRun_rt']
    nNotRun_gs = jobstats_map['nNotRun_gs']
    nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
    nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
    nDefine_rt = jobstats_map['nDefine_rt']
    nDefine_gs = jobstats_map['nDefine_gs']
    nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
    nDefine_queuecap = jobstats_map['nDefine_queuecap']
    nWaiting_rt = jobstats_map['nWaiting_rt']
    nWaiting_gs = jobstats_map['nWaiting_gs']

    # check if higher prio tasks are waiting
    if workQueue.queue_name in non_rt_wqs:
        # find highest priority of currently defined jobs
        tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName, workQueue)
        # the highest priority of waiting tasks
        highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName)
    else:
        # find highest priority of currently defined jobs
        tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName, workQueue,
                                                                                   resource_name)
        # the highest priority of waiting tasks
        highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName,
                                                                         resource_name)

    highestPrioInPandaDB = highestPrioJobStat['highestPrio']
    nNotRunHighestPrio = highestPrioJobStat['nNotRun']
    if highestPrioWaiting is None:
        msgBody = 'failed to get the highest priority of waiting tasks'
        tmpLog.error("{0} {1}".format(msgHeader, msgBody))
        return self.retTmpError

    # high priority tasks are waiting
    highPrioQueued = False
    if highestPrioWaiting > highestPrioInPandaDB \
            or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
        highPrioQueued = True
    tmpLog.debug("{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}".format(
        msgHeader, highestPrioWaiting, highestPrioInPandaDB, nNotRunHighestPrio, highPrioQueued))

    # set maximum number of jobs to be submitted
    if workQueue.queue_name in non_rt_wqs:
        tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
    else:
        tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
    # use the lower limit to avoid creating too many _sub/_dis datasets
    nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot), nJobsInBunchMax)

    if configQueueLimit is not None:
        nQueueLimit = configQueueLimit
    else:
        nQueueLimit = nJobsInBunch * nBunch

    # use nPrestage for reprocessing
    if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
        # reset nJobsInBunch
        if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
            tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit + nDefine_queuelimit)
            if tmpRemainingSlot > nJobsInBunch:
                nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)

    # get cap
    # set number of jobs to be submitted
    if configQueueCap is None:
        self.setMaxNumJobs(nJobsInBunch / nParallel)
    else:
        self.setMaxNumJobs(configQueueCap / nParallelCap)

    # get total walltime
    totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo, prodSourceLabel, workQueue, resource_name, cloudName)

    # log the current situation and limits
    tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(msgHeader, nQueueLimit,
                                                                       configRunningCap, configQueueCap))
    tmpLog.info("{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".format(
        msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs, nRunning_gs))
    tmpLog.info("{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}".format(
        msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt, nRunning_rt, totWalltime))

    # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
    limitPriority = False
    if workQueue.queue_name not in non_rt_wqs \
            and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
            and (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # pilot is not running or DDM has a problem
            msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr

    elif workQueue.queue_name in non_rt_wqs \
            and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # pilot is not running or DDM has a problem
            msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr

    elif workQueue.queue_name not in non_rt_wqs and nRunning_rt != 0 \
            and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold \
            and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
            and (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # enough jobs in Panda
            msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                nNotRun_rt + nDefine_rt, nRunning_rt, threshold, nNotRun_queuelimit + nDefine_queuelimit,
                nQueueLimit, totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr

    elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \
            and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold \
            and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # enough jobs in Panda
            msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                nNotRun_gs + nDefine_gs, nRunning_gs, threshold, nNotRun_queuelimit + nDefine_queuelimit,
                nQueueLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr

    elif nDefine_queuelimit > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # brokerage is stuck
            msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(nDefine_queuelimit, nQueueLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr

    elif nWaiting_rt > max(nRunning_rt * nWaitingLimit, nJobsInBunch * nWaitingBunchLimit):
        limitPriority = True
        if not highPrioQueued:
            # too many waiting
            msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch, nWaitingBunchLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr

    elif configRunningCap and nRunning_runningcap > configRunningCap:
        # cap on running
        msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(nRunning_runningcap, configRunningCap)
        tmpLog.warning('{0} {1}'.format(msgHeader, msgBody))
        tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
        return self.retMergeUnThr

    elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
        limitPriority = True
        if not highPrioQueued:
            # cap on queued
            msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(nNotRun_queuecap + nDefine_queuecap,
                                                                         configQueueCap)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr

    # get jobs from prodDB
    limitPriorityValue = None
    if limitPriority:
        limitPriorityValue = highestPrioWaiting
        self.setMinPriority(limitPriorityValue)
    else:
        # not enough jobs are queued
        if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \
                or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt):
            tmpLog.debug(msgHeader + " not enough jobs queued")
            if workQueue.queue_name not in non_rt_wqs:
                self.notEnoughJobsQueued()
            self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))

    msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(limitPriorityValue, self.maxNumJobs)
    tmpLog.info(msgHeader + " " + msgBody)
    return self.retUnThrottled
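# Note: the slot arithmetic above is compact and easy to misread, so the following is a
# minimal standalone sketch of just the bunch-size clamping, using the same constants
# (threshold=2.0, bounds 500/600). The helper name compute_bunch_size and the example
# numbers are illustrative only and are not part of JEDI.
def compute_bunch_size(n_running, n_not_run, threshold=2.0,
                       n_jobs_in_bunch_min=500, n_jobs_in_bunch_max=600):
    # jobs that could still be queued before exceeding threshold x running
    remaining_slot = int(n_running * threshold - n_not_run)
    # clamp to [min, max] so the queue is neither starved nor flooded with
    # too many _sub/_dis datasets in one cycle
    return min(max(n_jobs_in_bunch_min, remaining_slot), n_jobs_in_bunch_max)


if __name__ == '__main__':
    # nothing running yet: fall back to the lower limit
    assert compute_bunch_size(n_running=0, n_not_run=0) == 500
    # plenty of headroom: capped at the upper limit
    assert compute_bunch_size(n_running=1000, n_not_run=200) == 600
    # moderate headroom: use the computed slot count
    assert compute_bunch_size(n_running=300, n_not_run=50) == 550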
def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, jobStat):
    # component name
    compName = 'prod_job_throttler'
    # params
    nBunch = 4
    threshold = 2.0
    thresholdForSite = threshold - 1.0
    nJobsInBunchMax = 600
    nJobsInBunchMin = 500
    nJobsInBunchMaxES = 1000
    if workQueue.criteria is not None and 'site' in workQueue.criteria:
        minTotalWalltime = 10 * 1000 * 1000
    else:
        minTotalWalltime = 50 * 1000 * 1000
    nWaitingLimit = 4
    nWaitingBunchLimit = 2
    nParallel = 2
    # make logger
    tmpLog = MsgWrapper(logger)
    workQueueIDs = workQueue.getIDs()
    msgHeader = '{0}:{1} cloud={2} queue={3}:'.format(vo, prodSourceLabel, cloudName, workQueue.queue_name)
    tmpLog.debug(msgHeader + ' start workQueueID={0}'.format(str(workQueueIDs)))
    # change threshold
    if workQueue.queue_name in ['mcore']:
        threshold = 5.0
    # check cloud status
    if not self.siteMapper.checkCloud(cloudName):
        msgBody = "SKIP cloud={0} undefined".format(cloudName)
        tmpLog.warning(msgHeader + " " + msgBody)
        tmpLog.sendMsg(msgHeader + ' ' + msgBody, self.msgType, msgLevel='warning')
        return self.retThrottled
    cloudSpec = self.siteMapper.getCloud(cloudName)
    if cloudSpec['status'] in ['offline']:
        msgBody = "SKIP cloud.status={0}".format(cloudSpec['status'])
        tmpLog.warning(msgHeader + " " + msgBody)
        tmpLog.sendMsg(msgHeader + ' ' + msgBody, self.msgType, msgLevel='warning')
        return self.retThrottled
    if cloudSpec['status'] in ['test']:
        if workQueue.queue_name != 'test':
            msgBody = "SKIP cloud.status={0} for non test queue ({1})".format(cloudSpec['status'],
                                                                              workQueue.queue_name)
            tmpLog.sendMsg(msgHeader + ' ' + msgBody, self.msgType, msgLevel='warning')
            tmpLog.warning(msgHeader + " " + msgBody)
            return self.retThrottled
    # check if unthrottled
    if workQueue.queue_share is None:
        msgBody = "PASS unthrottled since share=None"
        tmpLog.debug(msgHeader + " " + msgBody)
        return self.retUnThrottled
    # count number of jobs in each status
    nRunning = 0
    nNotRun = 0
    nDefine = 0
    nWaiting = 0
    for workQueueID in workQueueIDs:
        if cloudName in jobStat and workQueueID in jobStat[cloudName]:
            tmpLog.debug(msgHeader + " " + str(jobStat[cloudName][workQueueID]))
            for pState, pNumber in jobStat[cloudName][workQueueID].items():
                if pState in ['running']:
                    nRunning += pNumber
                elif pState in ['assigned', 'activated', 'starting']:
                    nNotRun += pNumber
                elif pState in ['defined']:
                    nDefine += pNumber
                elif pState in ['waiting']:
                    nWaiting += pNumber
    # check if higher prio tasks are waiting
    tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName, workQueue)
    highestPrioInPandaDB = highestPrioJobStat['highestPrio']
    nNotRunHighestPrio = highestPrioJobStat['nNotRun']
    # the highest priority of waiting tasks
    highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName)
    if highestPrioWaiting is None:
        msgBody = 'failed to get the highest priority of waiting tasks'
        tmpLog.error(msgHeader + " " + msgBody)
        return self.retTmpError
    # high priority tasks are waiting
    highPrioQueued = False
    if highestPrioWaiting > highestPrioInPandaDB or \
            (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
        highPrioQueued = True
    tmpLog.debug(msgHeader + " highestPrio waiting:{0} inPanda:{1} numNotRun:{2} -> highPrioQueued={3}".format(
        highestPrioWaiting, highestPrioInPandaDB, nNotRunHighestPrio, highPrioQueued))
    # set maximum number of jobs to be submitted
    tmpRemainingSlot = int(nRunning * threshold - nNotRun)
    if tmpRemainingSlot < nJobsInBunchMin:
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = nJobsInBunchMin
    else:
        if workQueue.queue_name in ['evgensimul']:
            # use higher limit for evgensimul
            if tmpRemainingSlot < nJobsInBunchMaxES:
                nJobsInBunch = tmpRemainingSlot
            else:
                nJobsInBunch = nJobsInBunchMaxES
        else:
            if tmpRemainingSlot < nJobsInBunchMax:
                nJobsInBunch = tmpRemainingSlot
            else:
                nJobsInBunch = nJobsInBunchMax
    nQueueLimit = nJobsInBunch * nBunch
    # use special nQueueLimit
    tmpVal = self.taskBufferIF.getConfigValue(compName, 'NQUEUELIMIT_{0}'.format(workQueue.queue_name),
                                              'jedi', 'atlas')
    if tmpVal is not None:
        nQueueLimit = tmpVal
    # use nPrestage for reprocessing
    if workQueue.queue_name in ['reprocessing', 'mcore_repro']:
        # reset nJobsInBunch
        if nQueueLimit > (nNotRun + nDefine):
            tmpRemainingSlot = nQueueLimit - (nNotRun + nDefine)
            if tmpRemainingSlot < nJobsInBunch:
                pass
            elif tmpRemainingSlot < nJobsInBunchMax:
                nJobsInBunch = tmpRemainingSlot
            else:
                nJobsInBunch = nJobsInBunchMax
    # get cap
    nRunningCap = self.taskBufferIF.getConfigValue(compName, 'NRUNNINGCAP_{0}'.format(workQueue.queue_name),
                                                   'jedi', 'atlas')
    nQueueCap = self.taskBufferIF.getConfigValue(compName, 'NQUEUECAP_{0}'.format(workQueue.queue_name),
                                                 'jedi', 'atlas')
    # set number of jobs to be submitted
    self.setMaxNumJobs(nJobsInBunch / nParallel)
    # get total walltime
    totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo, prodSourceLabel, workQueue, cloudName)
    # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
    limitPriority = False
    tmpStr = msgHeader + " nQueueLimit:{0} nQueued:{1} nDefine:{2} nRunning:{3} totWalltime:{4} nRunCap:{5} nQueueCap:{6}"
    tmpLog.debug(tmpStr.format(nQueueLimit, nNotRun + nDefine, nDefine, nRunning,
                               totWalltime, nRunningCap, nQueueCap))
    # check
    if nRunning == 0 and (nNotRun + nDefine) > nQueueLimit and \
            (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # pilot is not running or DDM has a problem
            msgBody = "SKIP no running and enough nQueued({0})>{1} totWalltime({2})>{3} ".format(
                nNotRun + nDefine, nQueueLimit, totWalltime, minTotalWalltime)
            tmpLog.warning(msgHeader + " " + msgBody)
            tmpLog.sendMsg(msgHeader + ' ' + msgBody, self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    elif nRunning != 0 and float(nNotRun + nDefine) / float(nRunning) > threshold and \
            (nNotRun + nDefine) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # enough jobs in Panda
            msgBody = "SKIP nQueued({0})/nRunning({1})>{2} & nQueued({3})>{4} totWalltime({5})>{6}".format(
                nNotRun + nDefine, nRunning, threshold, nNotRun + nDefine, nQueueLimit,
                totWalltime, minTotalWalltime)
            tmpLog.warning(msgHeader + " " + msgBody)
            tmpLog.sendMsg(msgHeader + ' ' + msgBody, self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    elif nDefine > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # brokerage is stuck
            msgBody = "SKIP too many nDefined({0})>{1}".format(nDefine, nQueueLimit)
            tmpLog.warning(msgHeader + " " + msgBody)
            tmpLog.sendMsg(msgHeader + ' ' + msgBody, self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    elif nWaiting > nRunning * nWaitingLimit and nWaiting > nJobsInBunch * nWaitingBunchLimit:
        limitPriority = True
        if not highPrioQueued:
            # too many waiting
            msgBody = "SKIP too many nWaiting({0})>max(nRunning({1})x{2},{3}x{4})".format(
                nWaiting, nRunning, nWaitingLimit, nJobsInBunch, nWaitingBunchLimit)
            tmpLog.warning(msgHeader + " " + msgBody)
            tmpLog.sendMsg(msgHeader + ' ' + msgBody, self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    elif nRunningCap is not None and nRunning > nRunningCap:
        limitPriority = True
        if not highPrioQueued:
            # cap on running
            msgBody = "SKIP nRunning({0})>nRunningCap({1})".format(nRunning, nRunningCap)
            tmpLog.warning(msgHeader + " " + msgBody)
            tmpLog.sendMsg(msgHeader + ' ' + msgBody, self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    elif nQueueCap is not None and nNotRun + nDefine > nQueueCap:
        limitPriority = True
        if not highPrioQueued:
            # cap on queued
            msgBody = "SKIP nQueue({0})>nQueueCap({1})".format(nNotRun + nDefine, nQueueCap)
            tmpLog.warning(msgHeader + " " + msgBody)
            tmpLog.sendMsg(msgHeader + ' ' + msgBody, self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    # get jobs from prodDB
    limitPriorityValue = None
    if limitPriority:
        limitPriorityValue = highestPrioWaiting
        self.setMinPriority(limitPriorityValue)
    else:
        # not enough jobs are queued
        if nNotRun + nDefine < max(nQueueLimit, nRunning) or \
                (totWalltime is not None and totWalltime < minTotalWalltime):
            tmpLog.debug(msgHeader + " not enough jobs queued")
            self.notEnoughJobsQueued()
            self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))
    msgBody = "PASS - priority limit={0}".format(limitPriorityValue)
    tmpLog.debug(msgHeader + " " + msgBody)
    return self.retUnThrottled
def getLatestDBRelease(self):
    methodName = 'getLatestDBRelease'
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('trying to get the latest version number of DBR')
    # get ddo datasets
    tmpStat, ddoDatasets = self.listDatasets('ddo.*')
    if tmpStat != self.SC_SUCCEEDED or ddoDatasets == {}:
        tmpLog.error('failed to get a list of DBRelease datasets from DQ2')
        return self.SC_FAILED, None
    # reverse sort to avoid redundant lookup
    ddoDatasets.sort()
    ddoDatasets.reverse()
    # extract version number
    latestVerMajor = 0
    latestVerMinor = 0
    latestVerBuild = 0
    latestVerRev = 0
    latestDBR = ''
    for tmpName in ddoDatasets:
        # ignore CDRelease
        if ".CDRelease." in tmpName:
            continue
        # ignore user
        if tmpName.startswith('ddo.user'):
            continue
        # use Atlas.Ideal
        if ".Atlas.Ideal." not in tmpName:
            continue
        match = re.search(r'\.v(\d+)(_*[^\.]*)$', tmpName)
        if match is None:
            tmpLog.warning('cannot extract version number from %s' % tmpName)
            continue
        # ignore special DBRs
        if match.group(2) != '':
            continue
        # get major,minor,build,revision numbers
        tmpVerStr = match.group(1)
        tmpVerMajor = 0
        tmpVerMinor = 0
        tmpVerBuild = 0
        tmpVerRev = 0
        try:
            tmpVerMajor = int(tmpVerStr[0:2])
        except Exception:
            pass
        try:
            tmpVerMinor = int(tmpVerStr[2:4])
        except Exception:
            pass
        try:
            tmpVerBuild = int(tmpVerStr[4:6])
        except Exception:
            pass
        try:
            tmpVerRev = int(tmpVerStr[6:])
            # use only three digit DBR
            continue
        except Exception:
            pass
        # compare
        if latestVerMajor > tmpVerMajor:
            continue
        elif latestVerMajor == tmpVerMajor:
            if latestVerMinor > tmpVerMinor:
                continue
            elif latestVerMinor == tmpVerMinor:
                if latestVerBuild > tmpVerBuild:
                    continue
                elif latestVerBuild == tmpVerBuild:
                    if latestVerRev > tmpVerRev:
                        continue
        # check if well replicated
        tmpStat, ddoReplicas = self.listDatasetReplicas(tmpName)
        if len(ddoReplicas) < 10:
            continue
        # higher or equal version
        latestVerMajor = tmpVerMajor
        latestVerMinor = tmpVerMinor
        latestVerBuild = tmpVerBuild
        latestVerRev = tmpVerRev
        latestDBR = tmpName
    # failed
    if latestDBR == '':
        tmpLog.error('failed to get the latest version of DBRelease dataset from DQ2')
        return self.SC_FAILED, None
    tmpLog.info('use {0}'.format(latestDBR))
    return self.SC_SUCCEEDED, latestDBR
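# Note: for readability, the following is a self-contained sketch of the version-string
# handling in getLatestDBRelease: the trailing ".vNNNNNN" field is split into two-digit
# major/minor/build parts, datasets with an extra revision or suffix are skipped, and
# versions are compared numerically. The helper name parse_dbr_version and the sample
# dataset names are made up for illustration; this is not a DDM client API.
import re


def parse_dbr_version(dataset_name):
    """Return (major, minor, build) for a plain three-field DBR name, else None."""
    match = re.search(r'\.v(\d+)(_*[^\.]*)$', dataset_name)
    if match is None or match.group(2) != '':
        return None  # no version field, or a special DBR with a suffix
    ver = match.group(1)
    if len(ver) > 6:
        return None  # four-field (revision) DBRs are skipped, as in the method above

    def field(chunk):
        return int(chunk) if chunk else 0

    return (field(ver[0:2]), field(ver[2:4]), field(ver[4:6]))


if __name__ == '__main__':
    names = ['ddo.000001.Atlas.Ideal.DBRelease.v220701',
             'ddo.000001.Atlas.Ideal.DBRelease.v310801',
             'ddo.000001.Atlas.Ideal.DBRelease.v31080101']   # has a revision -> skipped
    usable = [(parse_dbr_version(n), n) for n in names if parse_dbr_version(n) is not None]
    print(max(usable))   # ((31, 8, 1), 'ddo.000001.Atlas.Ideal.DBRelease.v310801')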
def getFilesInDataset(self, datasetName, getNumEvents=False, skipDuplicate=True):
    methodName = 'getFilesInDataset'
    methodName += ' <datasetName={0}>'.format(datasetName)
    tmpLog = MsgWrapper(logger, methodName)
    try:
        # get DQ2 API
        dq2 = DQ2()
        # get file list
        tmpRet = dq2.listFilesInDataset(datasetName)
        if tmpRet == ():
            fileMap = {}
        else:
            fileMap = tmpRet[0]
        # skip duplicated files
        if skipDuplicate:
            newFileMap = {}
            baseLFNmap = {}
            for tmpGUID, valMap in fileMap.items():
                # extract base LFN and attempt number
                lfn = valMap['lfn']
                baseLFN = re.sub(r'(\.(\d+))$', '', lfn)
                attNr = re.sub(baseLFN + r'\.*', '', lfn)
                if attNr == '':
                    # without attempt number
                    attNr = -1
                else:
                    attNr = int(attNr)
                # compare attempt numbers
                addMap = False
                if baseLFN in baseLFNmap:
                    # use larger attempt number
                    oldMap = baseLFNmap[baseLFN]
                    if oldMap['attNr'] < attNr:
                        del newFileMap[oldMap['guid']]
                        addMap = True
                else:
                    addMap = True
                # append
                if addMap:
                    baseLFNmap[baseLFN] = {'guid': tmpGUID,
                                           'attNr': attNr}
                    newFileMap[tmpGUID] = valMap
            # use new map
            fileMap = newFileMap
        # get number of events in each file
        if getNumEvents:
            try:
                amiDatasetName = re.sub(r'(_tid\d+)*(_\d+)*/$', '', datasetName)
                amiclient = AMIClient()
                for amiItem in amiquery.get_files(amiclient, amiDatasetName):
                    amiGUID = amiItem['fileGUID']
                    if amiGUID in fileMap:
                        fileMap[amiGUID]['nevents'] = int(amiItem['events'])
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                errStr = '{0} AMI failed with {1} {2}'.format(methodName, errtype.__name__, errvalue)
                tmpLog.warning(errStr)
        return self.SC_SUCCEEDED, fileMap
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errStr = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errStr)
        return errCode, '{0} : {1}'.format(methodName, errStr)
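# Note: the attempt-number handling in getFilesInDataset is the subtle part: files that
# share a base LFN differ only by a trailing ".N" attempt suffix, and only the highest
# attempt is kept. The following is an illustrative standalone sketch of that selection
# with a made-up LFN list; keep_latest_attempts is a hypothetical helper, not a DDM call,
# and it uses string slicing instead of the second regex substitution used above.
import re


def keep_latest_attempts(lfns):
    """Keep only the highest attempt number per base LFN."""
    best = {}
    for lfn in lfns:
        base = re.sub(r'(\.(\d+))$', '', lfn)        # strip a trailing ".N" if present
        suffix = lfn[len(base):].lstrip('.')
        att_nr = int(suffix) if suffix else -1       # -1 means "no attempt number"
        if base not in best or best[base][0] < att_nr:
            best[base] = (att_nr, lfn)
    return sorted(lfn for _, lfn in best.values())


if __name__ == '__main__':
    sample = ['EVNT.01234._000001.pool.root.1',
              'EVNT.01234._000001.pool.root.2',
              'EVNT.01234._000002.pool.root.1']
    print(keep_latest_attempts(sample))
    # ['EVNT.01234._000001.pool.root.2', 'EVNT.01234._000002.pool.root.1']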