def start(self):
    """Run the post-processing main loop; this method does not return.

    Every cycle iterates over all ``self.vos`` x ``self.prodSourceLabels``
    combinations, asks the task buffer to prepare and then to fetch the tasks
    to be finished, and processes the fetched tasks with
    ``jedi_config.postprocessor.nWorkers`` PostProcessorThread workers.

    Exceptions raised inside a cycle are logged and the loop carries on;
    KeyboardInterrupt and SystemExit are deliberately not swallowed so the
    process can still be stopped.
    """
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # minimum duration of one cycle in seconds
    loopCycle = 60
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.info('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # prepare tasks to be finished
                    tmpLog.info('preparing tasks to be finished for vo={0} label={1}'.format(
                        vo, prodSourceLabel))
                    tmpRet = self.taskBufferIF.prepareTasksToBeFinished_JEDI(
                        vo, prodSourceLabel,
                        jedi_config.postprocessor.nTasks,
                        pid=self.pid)
                    if tmpRet is None:
                        # failed; still try to fetch what is already prepared
                        tmpLog.error('failed to prepare tasks')
                    # get tasks to be finished
                    tmpLog.info('getting tasks to be finished')
                    tmpList = self.taskBufferIF.getTasksToBeFinished_JEDI(
                        vo, prodSourceLabel, self.pid,
                        jedi_config.postprocessor.nTasks)
                    if tmpList is None:
                        # failed
                        tmpLog.error('failed to get tasks to be finished')
                        continue
                    tmpLog.info('got {0} tasks'.format(len(tmpList)))
                    # put to a locked list shared by the workers
                    taskList = ListWithLock(tmpList)
                    # make thread pool
                    threadPool = ThreadPool()
                    # make workers
                    for _ in range(jedi_config.postprocessor.nWorkers):
                        thr = PostProcessorThread(taskList, threadPool,
                                                  self.taskBufferIF,
                                                  self.ddmIF,
                                                  self)
                        thr.start()
                    # join
                    threadPool.join()
            tmpLog.info('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
        # sleep if needed
        timeDelta = datetime.datetime.utcnow() - startTime
        # include whole days: timedelta.seconds alone wraps around after 24h
        elapsed = timeDelta.days * 24 * 60 * 60 + timeDelta.seconds
        sleepPeriod = loopCycle - elapsed
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
def start(self):
    """Run the task-refinement main loop; this method does not return.

    Every cycle iterates over all ``self.vos`` x ``self.prodSourceLabels``
    combinations, fetches the tasks to refine from the task buffer and hands
    them to ``jedi_config.taskrefine.nWorkers`` TaskRefinerThread workers.
    Exceptions raised inside a cycle are logged together with their traceback
    and the loop carries on.  Each cycle lasts at least
    ``jedi_config.taskrefine.loopCycle`` seconds and is followed by
    ``self.randomSleep``.
    """
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.debug('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # get the list of tasks to refine
                    tmpList = self.taskBufferIF.getTasksToRefine_JEDI(
                        vo, prodSourceLabel)
                    if tmpList is None:
                        # failed
                        tmpLog.error('failed to get the list of tasks to refine')
                        continue
                    tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                    # put to a locked list shared by the workers
                    taskList = ListWithLock(tmpList)
                    # make thread pool
                    threadPool = ThreadPool()
                    # get work queue mapper
                    workQueueMapper = self.taskBufferIF.getWorkQueueMap()
                    # make workers
                    for _ in range(jedi_config.taskrefine.nWorkers):
                        thr = TaskRefinerThread(taskList, threadPool,
                                                self.taskBufferIF,
                                                self.ddmIF,
                                                self, workQueueMapper)
                        thr.start()
                    # join
                    threadPool.join()
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
            tmpLog.error('Traceback: {0}'.format(traceback.format_exc()))
        # sleep if needed
        loopCycle = jedi_config.taskrefine.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # include whole days: timedelta.seconds alone wraps around after 24h
        elapsed = timeDelta.days * 24 * 60 * 60 + timeDelta.seconds
        sleepPeriod = loopCycle - elapsed
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep(max_val=loopCycle)
def start(self):
    """Run the task-refinement main loop; this method does not return.

    Every cycle iterates over all ``self.vos`` x ``self.prodSourceLabels``
    combinations, fetches the tasks to refine from the task buffer and hands
    them to ``jedi_config.taskrefine.nWorkers`` TaskRefinerThread workers.

    Exceptions raised inside a cycle are logged together with their traceback
    and the loop carries on; KeyboardInterrupt and SystemExit are deliberately
    not swallowed so the process can still be stopped.
    """
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.debug('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # get the list of tasks to refine
                    tmpList = self.taskBufferIF.getTasksToRefine_JEDI(vo, prodSourceLabel)
                    if tmpList is None:
                        # failed
                        tmpLog.error('failed to get the list of tasks to refine')
                        continue
                    tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                    # put to a locked list shared by the workers
                    taskList = ListWithLock(tmpList)
                    # make thread pool
                    threadPool = ThreadPool()
                    # get work queue mapper
                    workQueueMapper = self.taskBufferIF.getWorkQueueMap()
                    # make workers
                    for _ in range(jedi_config.taskrefine.nWorkers):
                        thr = TaskRefinerThread(taskList, threadPool,
                                                self.taskBufferIF,
                                                self.ddmIF,
                                                self, workQueueMapper)
                        thr.start()
                    # join
                    threadPool.join()
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
            tmpLog.error('Traceback: {0}'.format(traceback.format_exc()))
        # sleep if needed
        loopCycle = jedi_config.taskrefine.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # include whole days: timedelta.seconds alone wraps around after 24h
        elapsed = timeDelta.days * 24 * 60 * 60 + timeDelta.seconds
        sleepPeriod = loopCycle - elapsed
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()
def start(self):
    """Run the task-command main loop; this method does not return.

    Every cycle iterates over all ``self.vos`` x ``self.prodSourceLabels``
    combinations, fetches the tasks that have a command to execute and hands
    them to a pool of TaskCommandoThread workers.

    Exceptions raised inside a cycle are logged and the loop carries on;
    KeyboardInterrupt and SystemExit are deliberately not swallowed so the
    process can still be stopped.  Each cycle lasts at least
    ``jedi_config.tcommando.loopCycle`` seconds and is followed by
    ``self.randomSleep``.
    """
    # start base classes
    JediKnight.start(self)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.debug('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # get the list of tasks to exec command
                    tmpList = self.taskBufferIF.getTasksToExecCommand_JEDI(
                        vo, prodSourceLabel)
                    if tmpList is None:
                        # failed
                        tmpLog.error('failed to get the task list for vo={0} label={1}'.format(
                            vo, prodSourceLabel))
                        continue
                    tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                    # put to a locked list shared by the workers
                    taskList = ListWithLock(tmpList)
                    # make thread pool
                    threadPool = ThreadPool()
                    # make workers
                    # NOTE(review): the worker count is read from the taskrefine
                    # section while loopCycle comes from tcommando -- confirm
                    # this is intended and not a copy/paste leftover.
                    for _ in range(jedi_config.taskrefine.nWorkers):
                        thr = TaskCommandoThread(taskList, threadPool,
                                                 self.taskBufferIF,
                                                 self.ddmIF,
                                                 self.pid)
                        thr.start()
                    # join
                    threadPool.join()
            tmpLog.debug('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
        # sleep if needed
        loopCycle = jedi_config.tcommando.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # include whole days: timedelta.seconds alone wraps around after 24h
        elapsed = timeDelta.days * 24 * 60 * 60 + timeDelta.seconds
        sleepPeriod = loopCycle - elapsed
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()
def start(self):
    """Run the contents-feeding main loop; this method does not return.

    Every cycle iterates over all ``self.vos`` x ``self.prodSourceLabels``
    combinations, fetches the datasets whose contents are to be fed to the DB
    and hands them to ``jedi_config.confeeder.nWorkers`` ContentsFeederThread
    workers.

    Exceptions raised inside a cycle are logged and the loop carries on;
    KeyboardInterrupt and SystemExit are deliberately not swallowed so the
    process can still be stopped.  Each cycle lasts at least
    ``jedi_config.confeeder.loopCycle`` seconds and is followed by
    ``self.randomSleep``.
    """
    # start base class
    JediKnight.start(self)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # get the list of datasets to feed contents to DB
                    tmpList = self.taskBufferIF.getDatasetsToFeedContents_JEDI(
                        vo, prodSourceLabel)
                    if tmpList is None:
                        # failed
                        logger.error('failed to get the list of datasets to feed contents')
                        continue
                    logger.debug('got %s datasets' % len(tmpList))
                    # put to a locked list shared by the workers
                    dsList = ListWithLock(tmpList)
                    # make thread pool
                    threadPool = ThreadPool()
                    # make workers
                    for _ in range(jedi_config.confeeder.nWorkers):
                        thr = ContentsFeederThread(dsList, threadPool,
                                                   self.taskBufferIF,
                                                   self.ddmIF,
                                                   self.pid)
                        thr.start()
                    # join
                    threadPool.join()
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('failed in %s.start() with %s %s' %
                         (self.__class__.__name__, errtype.__name__, errvalue))
        # sleep if needed
        loopCycle = jedi_config.confeeder.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # include whole days: timedelta.seconds alone wraps around after 24h
        elapsed = timeDelta.days * 24 * 60 * 60 + timeDelta.seconds
        sleepPeriod = loopCycle - elapsed
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()
def start(self):
    """Run the contents-feeding main loop; this method does not return.

    For each (vo, prodSourceLabel) pair the datasets to feed are fetched from
    the task buffer, wrapped in a ListWithLock and consumed by
    ``jedi_config.confeeder.nWorkers`` ContentsFeederThread workers that are
    joined before the next pair is handled.

    Exceptions raised inside a cycle are logged and the loop carries on;
    KeyboardInterrupt and SystemExit are deliberately not swallowed so the
    process can still be stopped.
    """
    # start base class
    JediKnight.start(self)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # get the list of datasets to feed contents to DB
                    tmpList = self.taskBufferIF.getDatasetsToFeedContents_JEDI(vo, prodSourceLabel)
                    if tmpList is None:
                        # failed
                        logger.error('failed to get the list of datasets to feed contents')
                    else:
                        logger.debug('got %s datasets' % len(tmpList))
                        # put to a locked list shared by the workers
                        dsList = ListWithLock(tmpList)
                        # make thread pool
                        threadPool = ThreadPool()
                        # make workers
                        nWorker = jedi_config.confeeder.nWorkers
                        for _ in range(nWorker):
                            thr = ContentsFeederThread(dsList, threadPool,
                                                       self.taskBufferIF, self.ddmIF,
                                                       self.pid)
                            thr.start()
                        # join
                        threadPool.join()
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('failed in %s.start() with %s %s' %
                         (self.__class__.__name__, errtype.__name__, errvalue))
        # sleep if needed
        loopCycle = jedi_config.confeeder.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # include whole days: timedelta.seconds alone wraps around after 24h
        elapsed = timeDelta.days * 24 * 60 * 60 + timeDelta.seconds
        sleepPeriod = loopCycle - elapsed
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()
def process(self, msg_obj, decoded_data=None):
    """Handle one message; a 'generate_job' message triggers job generation.

    Args:
        msg_obj: message object; its ``data`` attribute is JSON-decoded when
            ``decoded_data`` is not supplied.
        decoded_data: already-decoded message dictionary, or None.

    Raises:
        Exception: any error from JSON decoding or from handling the message
            is logged and then re-raised unchanged.
    """
    # logger
    tmp_log = logger_utils.make_logger(base_logger, method_name='process')
    tmp_log.info('start')
    # obtain the message dictionary, decoding the raw payload only if needed
    if decoded_data is not None:
        msg_dict = decoded_data
    else:
        try:
            msg_dict = json.loads(msg_obj.data)
        except Exception as e:
            tmp_log.error(
                'failed to parse message json {2} , skipped. {0} : {1}'.format(
                    e.__class__.__name__, e, msg_obj.data))
            raise
    # run
    try:
        tmp_log.debug('got message {0}'.format(msg_dict))
        if msg_dict['msg_type'] != 'generate_job':
            tmp_log.debug('unknown message type : {}'.format(msg_dict['msg_type']))
        else:
            # get task to generate jobs
            jediTaskID = int(msg_dict['taskid'])
            _, taskSpec = self.tbIF.getTaskWithID_JEDI(jediTaskID)
            if taskSpec:
                vo = taskSpec.vo
                prodSourceLabel = taskSpec.prodSourceLabel
                # get the work queue of the task
                queue_map = self.tbIF.getWorkQueueMap()
                workQueue = queue_map.getQueueWithIDGshare(taskSpec.workQueue_ID,
                                                           taskSpec.gshare)
                # get inputs for this task only
                task_inputs = self.tbIF.getTasksToBeProcessed_JEDI(
                    self.pid, None, workQueue, None, None,
                    nFiles=1000, target_tasks=[jediTaskID])
                if task_inputs:
                    inputList = ListWithLock(task_inputs)
                    # create the generator thread and wait for it to finish
                    threadPool = ThreadPool()
                    siteMapper = self.tbIF.getSiteMapper()
                    taskSetupper = TaskSetupper(vo, prodSourceLabel)
                    taskSetupper.initializeMods(self.tbIF, self.ddmIF)
                    generator = JobGeneratorThread(
                        inputList, threadPool, self.tbIF, self.ddmIF,
                        siteMapper, True, taskSetupper, self.pid, workQueue,
                        'pjmsg', None, None, None, False)
                    generator.start()
                    generator.join()
            else:
                tmp_log.debug('unknown task {}'.format(jediTaskID))
    except Exception as e:
        tmp_log.error('failed to run, skipped. {0} : {1}'.format(e.__class__.__name__, e))
        raise
    # done
    tmp_log.info('done')
def doUpdateDataLocality(self):
    """Update data locality for the datasets from ``self.get_datasets_list()``.

    The work runs only when the process lock for this component is obtained;
    otherwise the call logs that it is skipped and returns.  Four
    DataLocalityUpdaterThread workers are started on the dataset list and
    joined.  Exceptions are logged with their traceback and not propagated.
    """
    tmpLog = MsgWrapper(logger, ' #ATM #KV doUpdateDataLocality')
    tmpLog.debug('start')
    try:
        # lock
        lock_kwargs = dict(
            vo=self.vo,
            prodSourceLabel='default',
            cloud=None,
            workqueue_id=None,
            resource_name=None,
            component='AtlasDataLocalityUpdaterWatchDog.doUpdateDataLocality',
            pid=self.pid,
            timeLimit=240,
        )
        if not self.taskBufferIF.lockProcess_JEDI(**lock_kwargs):
            tmpLog.debug('locked by another process. Skipped')
            return
        tmpLog.debug('got lock')
        # get list of datasets
        datasets_list = self.get_datasets_list()
        tmpLog.debug('got {0} datasets to update'.format(len(datasets_list)))
        # make thread pool
        thread_pool = ThreadPool()
        # make workers; each one is started as soon as it is created
        n_workers = 4
        started = 0
        while started < n_workers:
            worker = DataLocalityUpdaterThread(taskDsList=datasets_list,
                                               threadPool=thread_pool,
                                               taskbufferIF=self.taskBufferIF,
                                               ddmIF=self.ddmIF,
                                               pid=self.pid,
                                               loggerObj=tmpLog)
            worker.start()
            started += 1
        tmpLog.debug('started {0} updater workers'.format(n_workers))
        # join
        thread_pool.join()
        # done
        tmpLog.debug('done')
    except Exception:
        exc_details = sys.exc_info()
        tmpLog.error('failed with {0} {1} {2}'.format(
            exc_details[0], exc_details[1], traceback.format_exc()))
def start(self):
    """Run the task-brokerage main loop; this method does not return.

    Every cycle loads the work-queue map and the resource types, then for each
    (vo, prodSourceLabel, work queue, resource type) combination:

    1. fetches the tasks whose assignment is to be checked and runs
       TaskCheckerThread workers on them;
    2. fetches the tasks to be assigned and runs TaskBrokerThread workers on
       them.

    Each step uses ``jedi_config.taskbroker.nWorkers`` workers that are joined
    before the next step.  Exceptions raised inside a cycle are logged and the
    loop carries on.  Each cycle lasts at least
    ``jedi_config.taskbroker.loopCycle`` seconds and is followed by
    ``self.randomSleep``.
    """
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.debug('start TaskBroker')
            # get work queue mapper
            workQueueMapper = self.taskBufferIF.getWorkQueueMap()
            resource_types = self.taskBufferIF.load_resource_types()
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # loop over all work queues
                    for workQueue in workQueueMapper.getAlignedQueueList(vo, prodSourceLabel):
                        for resource_type in resource_types:
                            # queue name with spaces replaced, for the log label
                            wq_name = '_'.join(workQueue.queue_name.split(' '))
                            msgLabel = 'vo={0} label={1} queue={2} resource_type={3}: '.format(
                                vo, prodSourceLabel, wq_name, resource_type.resource_name)
                            tmpLog.debug(msgLabel + 'start')
                            # get the list of tasks to check
                            tmpList = self.taskBufferIF.getTasksToCheckAssignment_JEDI(
                                vo, prodSourceLabel, workQueue, resource_type.resource_name)
                            if tmpList is None:
                                # failed
                                tmpLog.error(msgLabel + 'failed to get the list of tasks to check')
                            else:
                                tmpLog.debug(msgLabel + 'got tasks_to_check={0}'.format(len(tmpList)))
                                # put to a locked list
                                taskList = ListWithLock(tmpList)
                                # make thread pool
                                threadPool = ThreadPool()
                                # make workers
                                for _ in range(jedi_config.taskbroker.nWorkers):
                                    thr = TaskCheckerThread(taskList, threadPool,
                                                            self.taskBufferIF,
                                                            self.ddmIF, self,
                                                            vo, prodSourceLabel)
                                    thr.start()
                                # join
                                threadPool.join()
                            # get the list of tasks to assign
                            tmpList = self.taskBufferIF.getTasksToAssign_JEDI(
                                vo, prodSourceLabel, workQueue, resource_type.resource_name)
                            if tmpList is None:
                                # failed
                                tmpLog.error(msgLabel + 'failed to get the list of tasks to assign')
                            else:
                                tmpLog.debug(msgLabel + 'got tasks_to_assign={0}'.format(len(tmpList)))
                                # put to a locked list
                                taskList = ListWithLock(tmpList)
                                # make thread pool
                                threadPool = ThreadPool()
                                # make workers
                                for _ in range(jedi_config.taskbroker.nWorkers):
                                    thr = TaskBrokerThread(taskList, threadPool,
                                                           self.taskBufferIF,
                                                           self.ddmIF, self,
                                                           vo, prodSourceLabel,
                                                           workQueue,
                                                           resource_type.resource_name)
                                    thr.start()
                                # join
                                threadPool.join()
                            tmpLog.debug(msgLabel + 'done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
        tmpLog.debug('done')
        # sleep if needed
        loopCycle = jedi_config.taskbroker.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # include whole days: timedelta.seconds alone wraps around after 24h
        elapsed = timeDelta.days * 24 * 60 * 60 + timeDelta.seconds
        sleepPeriod = loopCycle - elapsed
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep(max_val=loopCycle)
def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue):
    """Broker the given tasks, via the task assigner or WORLD worker threads.

    Tasks for which ``taskSpec.useWorldCloud()`` is true are collected and
    processed by four AtlasProdTaskBrokerThread workers.  For every other
    task a JobSpec is built and submitted in bunches of up to 100 through
    ``PandaClient.runTaskAssignment``.

    Args:
        inputList: iterable of ``(jediTaskID, [(taskSpec, cloudName,
            inputChunk), ...])`` pairs.
        vo: VO name passed to the RW calculations and the DDM interface lookup.
        prodSourceLabel: source label passed to the RW calculations.
        workQueue: work queue object; ``queue_name`` is logged and the object
            is passed to the per-priority RW calculations.

    Returns:
        ``self.SC_SUCCEEDED`` on completion, or ``self.SC_FAILED`` as soon as
        one of the RW calculations returns None.
    """
    # list with a lock
    inputListWorld = ListWithLock([])
    # variables for submission
    maxBunchTask = 100
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doBrokerage')
    # return for failure
    retTmpError = self.SC_FAILED
    tmpLog.debug('vo={0} label={1} queue={2} nTasks={3}'.format(vo, prodSourceLabel,
                                                                workQueue.queue_name,
                                                                len(inputList)))
    # loop over all tasks
    allRwMap = {}
    prioMap = {}
    tt2Map = {}
    expRWs = {}
    jobSpecList = []
    for _, tmpInputList in inputList:
        for taskSpec, _, inputChunk in tmpInputList:
            # collect tasks for WORLD
            if taskSpec.useWorldCloud():
                inputListWorld.append((taskSpec, inputChunk))
                continue
            # make JobSpec to be submitted for TaskAssigner
            jobSpec = JobSpec()
            jobSpec.taskID = taskSpec.jediTaskID
            jobSpec.jediTaskID = taskSpec.jediTaskID
            # set managed to trigger TA
            jobSpec.prodSourceLabel = 'managed'
            jobSpec.processingType = taskSpec.processingType
            jobSpec.workingGroup = taskSpec.workingGroup
            jobSpec.metadata = taskSpec.processingType
            jobSpec.assignedPriority = taskSpec.taskPriority
            jobSpec.currentPriority = taskSpec.currentPriority
            # floor division keeps the value integral on Python 3 as well
            jobSpec.maxDiskCount = (taskSpec.getOutDiskSize() +
                                    taskSpec.getWorkDiskSize()) // 1024 // 1024
            # NOTE(review): WORLD tasks were already skipped above, so this
            # branch looks unreachable -- confirm before removing it.
            if taskSpec.useWorldCloud():
                # use destinationSE to trigger task brokerage in WORLD cloud
                jobSpec.destinationSE = taskSpec.cloud
            prodDBlock = None
            setProdDBlock = False
            for datasetSpec in inputChunk.getDatasets():
                prodDBlock = datasetSpec.datasetName
                if datasetSpec.isMaster():
                    jobSpec.prodDBlock = datasetSpec.datasetName
                    setProdDBlock = True
                for fileSpec in datasetSpec.Files:
                    tmpInFileSpec = fileSpec.convertToJobFileSpec(datasetSpec)
                    jobSpec.addFile(tmpInFileSpec)
            # use secondary dataset name as prodDBlock
            if not setProdDBlock and prodDBlock is not None:
                jobSpec.prodDBlock = prodDBlock
            # append
            jobSpecList.append(jobSpec)
            prioMap[jobSpec.taskID] = jobSpec.currentPriority
            tt2Map[jobSpec.taskID] = jobSpec.processingType
            # get RW for a priority (dict.has_key is not available on Python 3)
            if jobSpec.currentPriority not in allRwMap:
                tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(vo, prodSourceLabel, workQueue,
                                                                   jobSpec.currentPriority)
                if tmpRW is None:
                    tmpLog.error('failed to calculate RW with prio={0}'.format(jobSpec.currentPriority))
                    return retTmpError
                allRwMap[jobSpec.currentPriority] = tmpRW
            # get expected RW
            expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
            if expRW is None:
                tmpLog.error('failed to calculate RW for jediTaskID={0}'.format(jobSpec.jediTaskID))
                return retTmpError
            expRWs[jobSpec.taskID] = expRW
    # for old clouds
    if jobSpecList:
        # get fullRWs
        fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(vo, prodSourceLabel, None, None)
        if fullRWs is None:
            tmpLog.error('failed to calculate full RW')
            return retTmpError
        # set metadata
        for jobSpec in jobSpecList:
            rwValues = allRwMap[jobSpec.currentPriority]
            jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (jobSpec.metadata,
                                                      str(rwValues), str(expRWs),
                                                      str(prioMap), str(fullRWs),
                                                      str(tt2Map))
        tmpLog.debug('run task assigner for {0} tasks'.format(len(jobSpecList)))
        # submit in bunches of at most maxBunchTask
        for nBunchTask in range(0, len(jobSpecList), maxBunchTask):
            # get a bunch
            jobsBunch = jobSpecList[nBunchTask:nBunchTask + maxBunchTask]
            strIDs = 'jediTaskID=' + ','.join(
                ['{0}'.format(tmpJobSpec.taskID) for tmpJobSpec in jobsBunch])
            tmpLog.debug(strIDs)
            # run task brokerage
            stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
            tmpLog.debug('{0}:{1}'.format(stS, str(outSs)))
    # for WORLD
    if len(inputListWorld) > 0:
        # thread pool
        threadPool = ThreadPool()
        # get full RW for WORLD
        fullRWs = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo, prodSourceLabel, None, None)
        if fullRWs is None:
            tmpLog.error('failed to calculate full WORLD RW')
            return retTmpError
        # get RW per priority
        for taskSpec, inputChunk in inputListWorld:
            if taskSpec.currentPriority not in allRwMap:
                tmpRW = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo, prodSourceLabel, workQueue,
                                                                        taskSpec.currentPriority)
                if tmpRW is None:
                    tmpLog.error('failed to calculate RW with prio={0}'.format(taskSpec.currentPriority))
                    return retTmpError
                allRwMap[taskSpec.currentPriority] = tmpRW
        # live counter for RWs
        liveCounter = MapWithLock(allRwMap)
        # make workers
        ddmIF = self.ddmIF.getInterface(vo)
        for _ in range(4):
            thr = AtlasProdTaskBrokerThread(inputListWorld, threadPool,
                                            self.taskBufferIF, ddmIF,
                                            fullRWs, liveCounter)
            thr.start()
        threadPool.join(60 * 10)
    # return
    tmpLog.debug('doBrokerage done')
    return self.SC_SUCCEEDED
datasetIDs = None if len(sys.argv) > 2: datasetIDs = [int(sys.argv[2])] s, taskSpec = tbIF.getTaskWithID_JEDI(jediTaskID) cloudName = taskSpec.cloud vo = taskSpec.vo prodSourceLabel = taskSpec.prodSourceLabel queueID = taskSpec.workQueue_ID gshare_name = taskSpec.gshare workQueue = tbIF.getWorkQueueMap().getQueueWithID(queueID, gshare_name) threadPool = ThreadPool() # get typical number of files #typicalNumFilesMap = tbIF.getTypicalNumInput_JEDI(vo,prodSourceLabel,workQueue, # useResultCache=600) typicalNumFilesMap = {} tmpListList = tbIF.getTasksToBeProcessed_JEDI( None, vo, workQueue, prodSourceLabel, cloudName, nFiles=10, simTasks=[jediTaskID],
def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue,
                resource_name):
    """Broker the given tasks, via the task assigner or WORLD worker threads.

    Tasks for which ``taskSpec.useWorldCloud()`` is true are collected and
    processed by four AtlasProdTaskBrokerThread workers.  For every other
    task a JobSpec is built and submitted in bunches of up to 100 through
    ``PandaClient.runTaskAssignment``.

    Args:
        inputList: iterable of ``(jediTaskID, [(taskSpec, cloudName,
            inputChunk), ...])`` pairs.
        vo: VO name passed to the RW calculations and the DDM interface lookup.
        prodSourceLabel: source label passed to the RW calculations.
        workQueue: work queue object, passed to the per-priority RW
            calculations and to the WORLD workers.
        resource_name: only written to the debug log by this method.

    Returns:
        ``self.SC_SUCCEEDED`` on completion, or ``self.SC_FAILED`` as soon as
        one of the RW calculations returns None.
    """
    # WORLD tasks are gathered in a lock-protected list for the workers
    inputListWorld = ListWithLock([])
    # number of JobSpecs per task-assignment call
    maxBunchTask = 100
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doBrokerage')
    # return codes for failure
    retFatal = self.SC_FATAL
    retTmpError = self.SC_FAILED
    tmpLog.debug(
        'vo={0} label={1} queue={2} resource_name={3} nTasks={4}'.format(
            vo, prodSourceLabel, workQueue.queue_name, resource_name,
            len(inputList)))
    # bookkeeping keyed by priority (allRwMap) or by task ID (the rest)
    allRwMap, prioMap, tt2Map, expRWs = {}, {}, {}, {}
    jobSpecList = []
    for tmpJediTaskID, tmpInputList in inputList:
        for taskSpec, cloudName, inputChunk in tmpInputList:
            if taskSpec.useWorldCloud():
                # handled later by the WORLD workers
                inputListWorld.append((taskSpec, inputChunk))
                continue
            # build the JobSpec submitted to the TaskAssigner
            jobSpec = JobSpec()
            jobSpec.taskID = taskSpec.jediTaskID
            jobSpec.jediTaskID = taskSpec.jediTaskID
            # 'managed' triggers TA
            jobSpec.prodSourceLabel = 'managed'
            jobSpec.processingType = taskSpec.processingType
            jobSpec.workingGroup = taskSpec.workingGroup
            jobSpec.metadata = taskSpec.processingType
            jobSpec.assignedPriority = taskSpec.taskPriority
            jobSpec.currentPriority = taskSpec.currentPriority
            diskSize = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
            jobSpec.maxDiskCount = diskSize // 1024 // 1024
            if taskSpec.useWorldCloud():
                # use destinationSE to trigger task brokerage in WORLD cloud
                jobSpec.destinationSE = taskSpec.cloud
            # the master dataset name wins; otherwise the last dataset seen
            lastDatasetName = None
            masterSeen = False
            for datasetSpec in inputChunk.getDatasets():
                lastDatasetName = datasetSpec.datasetName
                if datasetSpec.isMaster():
                    jobSpec.prodDBlock = datasetSpec.datasetName
                    masterSeen = True
                for fileSpec in datasetSpec.Files:
                    jobSpec.addFile(fileSpec.convertToJobFileSpec(datasetSpec))
            if masterSeen is False and lastDatasetName is not None:
                jobSpec.prodDBlock = lastDatasetName
            jobSpecList.append(jobSpec)
            prioMap[jobSpec.taskID] = jobSpec.currentPriority
            tt2Map[jobSpec.taskID] = jobSpec.processingType
            # RW for this priority, computed once per priority
            if jobSpec.currentPriority not in allRwMap:
                prioRW = self.taskBufferIF.calculateRWwithPrio_JEDI(
                    vo, prodSourceLabel, workQueue, jobSpec.currentPriority)
                if prioRW is None:
                    tmpLog.error('failed to calculate RW with prio={0}'.format(
                        jobSpec.currentPriority))
                    return retTmpError
                allRwMap[jobSpec.currentPriority] = prioRW
            # expected RW of the task itself
            taskRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
            if taskRW is None:
                tmpLog.error('failed to calculate RW for jediTaskID={0}'.format(
                    jobSpec.jediTaskID))
                return retTmpError
            expRWs[jobSpec.taskID] = taskRW
    # for old clouds
    if jobSpecList != []:
        fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(
            vo, prodSourceLabel, None, None)
        if fullRWs is None:
            tmpLog.error('failed to calculate full RW')
            return retTmpError
        # set metadata
        for jobSpec in jobSpecList:
            metaFields = (jobSpec.metadata,
                          str(allRwMap[jobSpec.currentPriority]),
                          str(expRWs),
                          str(prioMap),
                          str(fullRWs),
                          str(tt2Map))
            jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % metaFields
        tmpLog.debug('run task assigner for {0} tasks'.format(len(jobSpecList)))
        for bunchStart in range(0, len(jobSpecList), maxBunchTask):
            # get a bunch
            jobsBunch = jobSpecList[bunchStart:bunchStart + maxBunchTask]
            idStrings = ['{0}'.format(tmpJobSpec.taskID) for tmpJobSpec in jobsBunch]
            tmpLog.debug('jediTaskID=' + ','.join(idStrings))
            # run task brokerage
            stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
            tmpLog.debug('{0}:{1}'.format(stS, str(outSs)))
    # for WORLD
    if len(inputListWorld) > 0:
        threadPool = ThreadPool()
        # get full RW for WORLD
        fullRWs = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(
            vo, prodSourceLabel, None, None)
        if fullRWs is None:
            tmpLog.error('failed to calculate full WORLD RW')
            return retTmpError
        # get RW per priority
        for taskSpec, inputChunk in inputListWorld:
            if taskSpec.currentPriority in allRwMap:
                continue
            prioRW = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(
                vo, prodSourceLabel, workQueue, taskSpec.currentPriority)
            if prioRW is None:
                tmpLog.error('failed to calculate RW with prio={0}'.format(
                    taskSpec.currentPriority))
                return retTmpError
            allRwMap[taskSpec.currentPriority] = prioRW
        # live counter for RWs
        liveCounter = MapWithLock(allRwMap)
        # make workers
        ddmIF = self.ddmIF.getInterface(vo)
        nWorldWorkers = 4
        for iWorker in range(nWorldWorkers):
            worker = AtlasProdTaskBrokerThread(inputListWorld, threadPool,
                                               self.taskBufferIF, ddmIF,
                                               fullRWs, liveCounter,
                                               workQueue)
            worker.start()
        threadPool.join(60 * 10)
    # return
    tmpLog.debug('doBrokerage done')
    return self.SC_SUCCEEDED
def start(self):
    """Run the task-brokerage main loop; this method does not return.

    Every cycle loads the work-queue map, then for each
    (vo, prodSourceLabel, work queue) combination:

    1. fetches the tasks whose assignment is to be checked and runs
       TaskCheckerThread workers on them;
    2. fetches the tasks to be assigned and runs TaskBrokerThread workers on
       them.

    Each step uses ``jedi_config.taskbroker.nWorkers`` workers that are joined
    before the next step.  Exceptions raised inside a cycle are logged and the
    loop carries on; KeyboardInterrupt and SystemExit are deliberately not
    swallowed so the process can still be stopped.
    """
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.debug('start TaskBroker')
            # get work queue mapper
            workQueueMapper = self.taskBufferIF.getWorkQueueMap()
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # loop over all work queues
                    for workQueue in workQueueMapper.getQueueListWithVoType(vo, prodSourceLabel):
                        msgLabel = 'vo={0} label={1} queue={2}: '.format(
                            vo, prodSourceLabel, workQueue.queue_name)
                        tmpLog.debug(msgLabel + 'start')
                        # get the list of tasks to check
                        tmpList = self.taskBufferIF.getTasksToCheckAssignment_JEDI(
                            vo, prodSourceLabel, workQueue)
                        if tmpList is None:
                            # failed
                            tmpLog.error(msgLabel + 'failed to get the list of tasks to check')
                        else:
                            tmpLog.debug(msgLabel + 'got {0} tasks to check'.format(len(tmpList)))
                            # put to a locked list
                            taskList = ListWithLock(tmpList)
                            # make thread pool
                            threadPool = ThreadPool()
                            # make workers
                            for _ in range(jedi_config.taskbroker.nWorkers):
                                thr = TaskCheckerThread(taskList, threadPool,
                                                        self.taskBufferIF,
                                                        self.ddmIF, self,
                                                        vo, prodSourceLabel)
                                thr.start()
                            # join
                            threadPool.join()
                        # get the list of tasks to assign
                        tmpList = self.taskBufferIF.getTasksToAssign_JEDI(
                            vo, prodSourceLabel, workQueue)
                        if tmpList is None:
                            # failed
                            tmpLog.error(msgLabel + 'failed to get the list of tasks to assign')
                        else:
                            tmpLog.debug(msgLabel + 'got {0} tasks to assign'.format(len(tmpList)))
                            # put to a locked list
                            taskList = ListWithLock(tmpList)
                            # make thread pool
                            threadPool = ThreadPool()
                            # make workers
                            for _ in range(jedi_config.taskbroker.nWorkers):
                                thr = TaskBrokerThread(taskList, threadPool,
                                                       self.taskBufferIF,
                                                       self.ddmIF, self,
                                                       vo, prodSourceLabel,
                                                       workQueue)
                                thr.start()
                            # join
                            threadPool.join()
                        tmpLog.debug(msgLabel + 'done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
        tmpLog.debug('done')
        # sleep if needed
        loopCycle = jedi_config.taskbroker.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # include whole days: timedelta.seconds alone wraps around after 24h
        elapsed = timeDelta.days * 24 * 60 * 60 + timeDelta.seconds
        sleepPeriod = loopCycle - elapsed
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()