def getDatasetMetaData(self, datasetName):
    # make logger
    methodName = 'getDatasetMetaData'
    methodName = '{0} datasetName={1}'.format(methodName, datasetName)
    tmpLog = MsgWrapper(logger, methodName)
    try:
        # get DQ2 API
        dq2 = DQ2()
        # get metadata attributes
        tmpRet = dq2.getMetaDataAttribute(datasetName, dq2.listMetaDataAttributes())
        # change dataset state to string
        if tmpRet['state'] in [DatasetState.CLOSED, DatasetState.FROZEN]:
            tmpRet['state'] = 'closed'
        elif tmpRet['state'] == DatasetState.OPEN:
            tmpRet['state'] = 'open'
        else:
            tmpRet['state'] = 'unknown'
        tmpLog.debug(str(tmpRet))
        return self.SC_SUCCEEDED, tmpRet
    except:
        errtype, errvalue = sys.exc_info()[:2]
        errMsg = 'failed with {0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        errCode = self.checkError(errtype)
        return errCode, '{0}.{1} {2}'.format(self.__class__.__name__, methodName, errMsg)
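A minimal usage sketch (not part of the original module) showing how a caller might consume the (status, metadata) pair returned by getDatasetMetaData above; the ddmIF handle and the SC_SUCCEEDED value passed in are assumptions for illustration only.

# Hypothetical caller: check the status code first, only then trust the metadata dict.
def is_dataset_frozen(ddmIF, datasetName, SC_SUCCEEDED=0):
    # ddmIF is assumed to expose getDatasetMetaData() as defined above
    tmpStat, tmpOut = ddmIF.getDatasetMetaData(datasetName)
    if tmpStat != SC_SUCCEEDED:
        # on failure tmpOut is an error string, not a dict
        raise RuntimeError('getDatasetMetaData failed: {0}'.format(tmpOut))
    # on success tmpOut carries the normalized state: 'closed', 'open' or 'unknown'
    return tmpOut['state'] == 'closed'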
def doAction(self):
    # get logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start')
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def doAction(self):
    try:
        # get logger
        tmpLog = MsgWrapper(logger)
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassgin(tmpLog)
        # action for throttled
        self.doActionForThrottled(tmpLog)
        # action for high prio pending
        for minPriority, timeoutVal in [(950, 10),
                                        (900, 30),
                                        ]:
            self.doActionForHighPrioPending(tmpLog, minPriority, timeoutVal)
        # action to set scout job data w/o scouts
        self.doActionToSetScoutJobData(tmpLog)
    except:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__, errvalue,
                                                      traceback.format_exc()))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def start(self):
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.debug('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # get the list of tasks to refine
                    tmpList = self.taskBufferIF.getTasksToRefine_JEDI(vo, prodSourceLabel)
                    if tmpList == None:
                        # failed
                        tmpLog.error('failed to get the list of tasks to refine')
                    else:
                        tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                        # put to a locked list
                        taskList = ListWithLock(tmpList)
                        # make thread pool
                        threadPool = ThreadPool()
                        # get work queue mapper
                        workQueueMapper = self.taskBufferIF.getWorkQueueMap()
                        # make workers
                        nWorker = jedi_config.taskrefine.nWorkers
                        for iWorker in range(nWorker):
                            thr = TaskRefinerThread(taskList, threadPool,
                                                    self.taskBufferIF,
                                                    self.ddmIF,
                                                    self, workQueueMapper)
                            thr.start()
                        # join
                        threadPool.join()
        except:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
        # sleep if needed
        loopCycle = jedi_config.taskrefine.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        sleepPeriod = loopCycle - timeDelta.seconds
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()
def checkDatasetConsistency(self, location, datasetName):
    # make logger
    methodName = 'checkDatasetConsistency'
    methodName = '{0} datasetName={1} location={2}'.format(methodName, datasetName, location)
    tmpLog = MsgWrapper(logger, methodName)
    try:
        # get DQ2 API
        dq2 = DQ2()
        # check
        tmpRet = dq2.checkDatasetConsistency(location, datasetName)
        tmpLog.debug(str(tmpRet))
        # return the check result on success
        return self.SC_SUCCEEDED, tmpRet
    except:
        errtype, errvalue = sys.exc_info()[:2]
        errMsg = 'failed with {0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        errCode = self.checkError(errtype)
        return errCode, '{0}.{1} {2}'.format(self.__class__.__name__, methodName, errMsg)
def doAction(self):
    try:
        # get logger
        tmpLog = MsgWrapper(logger)
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassgin(tmpLog)
        # action for throttled
        self.doActionForThrottled(tmpLog)
        # action for high prio pending
        for minPriority, timeoutVal in [(950, 10),
                                        (900, 30),
                                        ]:
            self.doActionForHighPrioPending(tmpLog, minPriority, timeoutVal)
        # action to set scout job data w/o scouts
        self.doActionToSetScoutJobData(tmpLog)
    except:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__, errvalue,
                                                      traceback.format_exc()))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def freezeDataset(self, datasetName, ignoreUnknown=False):
    methodName = 'freezeDataset'
    methodName = '{0} datasetName={1}'.format(methodName, datasetName)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    isOK = True
    try:
        # get DQ2 API
        dq2 = DQ2()
        # freeze
        dq2.freezeDataset(datasetName)
    except DQFrozenDatasetException:
        pass
    except DQUnknownDatasetException:
        if ignoreUnknown:
            pass
        else:
            isOK = False
    except:
        isOK = False
    if isOK:
        tmpLog.info('done')
        return self.SC_SUCCEEDED, True
    else:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
def registerDatasetSubscription(self, datasetName, location, activity=None, ignoreUnknown=False):
    methodName = 'registerDatasetSubscription'
    methodName = '{0} datasetName={1} location={2}'.format(methodName, datasetName, location)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    isOK = True
    try:
        # get DQ2 API
        dq2 = DQ2()
        # call
        dq2.registerDatasetSubscription(datasetName, location, activity=activity)
    except DQSubscriptionExistsException:
        pass
    except DQUnknownDatasetException:
        if ignoreUnknown:
            pass
        else:
            isOK = False
    except:
        isOK = False
    if not isOK:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED, True
def doAction(self):
    try:
        # get logger
        tmpLog = MsgWrapper(logger)
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassgin(tmpLog)
        # action for throttled
        self.doActionForThrottled(tmpLog)
        # action for high prio pending
        for minPriority, timeoutVal in [(950, 10),
                                        (900, 30),
                                        ]:
            self.doActionForHighPrioPending(tmpLog, minPriority, timeoutVal)
        # action to set scout job data w/o scouts
        self.doActionToSetScoutJobData(tmpLog)
        # action to throttle jobs in paused tasks
        self.doActionToThrottleJobInPausedTasks(tmpLog)
        # action for jumbo
        jumbo = JumboWatchDog(self.taskBufferIF, self.ddmIF, tmpLog, 'atlas', 'managed')
        jumbo.run()
    except:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__, errvalue,
                                                      traceback.format_exc()))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resourceType):
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start vo={0} label={1} cloud={2} workQueue={3}'.format(
        vo, prodSourceLabel, cloudName, workQueue.queue_name))
    # check if unthrottled
    if not workQueue.throttled:
        tmpLog.debug("  done : unthrottled since throttled is False")
        return self.retUnThrottled
    tmpLog.debug("  done : SKIP")
    return self.retThrottled
def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, jobStat):
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start vo={0} label={1} cloud={2} workQueue={3}'.format(
        vo, prodSourceLabel, cloudName, workQueue.queue_name))
    # check if unthrottled
    if workQueue.queue_share == None:
        tmpLog.debug("  done : unthrottled since share=None")
        return self.retUnThrottled
    tmpLog.debug("  done : SKIP")
    return self.retThrottled
def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resourceType):
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start vo={0} label={1} cloud={2} workQueue={3}'.format(vo, prodSourceLabel, cloudName,
                                                                         workQueue.queue_name))
    # check if unthrottled
    if workQueue.queue_share == None:
        tmpLog.debug("  done : unthrottled since share=None")
        return self.retUnThrottled
    tmpLog.debug("  done : SKIP")
    return self.retThrottled
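The three toBeThrottled variants above differ only in the attribute they test (the throttled flag versus queue_share). Below is a stand-alone sketch of that branching pattern with a stub work queue; the return constants are placeholders, since the real retThrottled/retUnThrottled values come from the throttler base class and are not shown in this section.

# Self-contained sketch of the throttle decision pattern; constants are made up.
RET_UNTHROTTLED = ('SC_SUCCEEDED', False)
RET_THROTTLED = ('SC_SUCCEEDED', True)

class StubWorkQueue(object):
    def __init__(self, queue_name, queue_share=None):
        self.queue_name = queue_name
        self.queue_share = queue_share

def to_be_throttled(workQueue):
    # unthrottled when no share is defined, as in the variants above
    if workQueue.queue_share is None:
        return RET_UNTHROTTLED
    return RET_THROTTLED

print(to_be_throttled(StubWorkQueue('queue1')))        # unthrottled
print(to_be_throttled(StubWorkQueue('queue2', 0.5)))   # throttled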
def doAction(self):
    try:
        # get logger
        origTmpLog = MsgWrapper(logger)
        origTmpLog.debug('start')
        # make tasks pending under certain conditions
        self.do_for_data_locality()
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        err_str = traceback.format_exc()
        origTmpLog.error('failed with {0} {1} ; {2}'.format(errtype, errvalue, err_str))
    # return
    origTmpLog.debug('done')
    return self.SC_SUCCEEDED
def doAction(self):
    try:
        # get logger
        origTmpLog = MsgWrapper(logger)
        origTmpLog.debug('start')
        # clean up data locality
        self.doCleanDataLocality()
        # update data locality
        self.doUpdateDataLocality()
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        origTmpLog.error('failed with {0} {1}'.format(errtype, errvalue))
    # return
    origTmpLog.debug('done')
    return self.SC_SUCCEEDED
def doAction(self):
    try:
        # get logger
        tmpLog = MsgWrapper(logger)
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassgin(tmpLog)
    except:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0} {1}'.format(errtype, errvalue))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def doAction(self):
    try:
        # get logger
        origTmpLog = MsgWrapper(logger)
        origTmpLog.debug('start')
        # handle waiting jobs
        self.doForWaitingJobs()
        # throttle tasks if so many prestaging requests
        self.doForPreStaging()
        # priority massage
        self.doForPriorityMassage()
        # redo stalled analysis jobs
        self.doForRedoStalledJobs()
        # throttle WAN data access
        # self.doForThrottleWAN()
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        origTmpLog.error('failed with {0} {1}'.format(errtype, errvalue))
    # return
    origTmpLog.debug('done')
    return self.SC_SUCCEEDED
def finger(self, userName):
    methodName = 'finger'
    methodName = '{0} userName={1}'.format(methodName, userName)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    try:
        # cleanup DN
        userName = parse_dn(userName)
        # exec
        tmpRet = infoClient().finger(userName)
    except:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0}:{1}'.format(methodName, errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED, tmpRet
def setDatasetOwner(self, datasetName, userName):
    methodName = 'setDatasetOwner'
    methodName = '{0} datasetName={1} userName={2}'.format(methodName, datasetName, userName)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    try:
        # cleanup DN
        userName = parse_dn(userName)
        # get DQ2 API
        dq2 = DQ2()
        # set
        dq2.setMetaDataAttribute(datasetName, 'owner', userName)
    except:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED, True
def doAction(self):
    try:
        # get logger
        tmpLog = MsgWrapper(logger)
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassign(tmpLog)
        # action for throttled
        self.doActionForThrottled(tmpLog)
        # action for high prio pending
        for minPriority, timeoutVal in [(950, 10),
                                        (900, 30),
                                        ]:
            self.doActionForHighPrioPending(tmpLog, minPriority, timeoutVal)
        # action to set scout job data w/o scouts
        self.doActionToSetScoutJobData(tmpLog)
        # action to throttle jobs in paused tasks
        self.doActionToThrottleJobInPausedTasks(tmpLog)
        # action for jumbo
        jumbo = JumboWatchDog(self.taskBufferIF, self.ddmIF, tmpLog, 'atlas', 'managed')
        jumbo.run()
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__, errvalue,
                                                      traceback.format_exc()))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def reassign_jobs(self, to_reassign_map):
    tmp_log = MsgWrapper(logger, 'reassign_jobs')
    for jedi_taskid, value_map in to_reassign_map.items():
        site = value_map['site']
        n_jobs_to_fill = value_map['n_jobs_to_fill']
        # compute n_jobs_to_close from n_jobs_to_fill
        n_jobs_to_close = int(n_jobs_to_fill / 3)
        # reassign
        n_jobs_closed = self.taskBufferIF.reassignJobsInPreassignedTask_JEDI(jedi_taskid, site, n_jobs_to_close)
        if n_jobs_closed is None:
            tmp_log.debug('jediTaskID={0} no longer ready/running or not assigned to {1} , skipped'.format(jedi_taskid, site))
        else:
            tmp_log.debug('jediTaskID={0} to {1} , closed {2} jobs'.format(jedi_taskid, site, n_jobs_closed))
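The shape of to_reassign_map is implied by the loop above; here is a small illustrative example (task IDs and site names are made up) showing that roughly one job is closed for every three jobs to be filled.

# Illustrative input for reassign_jobs(); IDs and site names are hypothetical.
to_reassign_map = {
    12345: {'site': 'SITE_A', 'n_jobs_to_fill': 9},   # -> n_jobs_to_close = 3
    12346: {'site': 'SITE_B', 'n_jobs_to_fill': 2},   # -> n_jobs_to_close = 0
}
for jedi_taskid, value_map in to_reassign_map.items():
    n_jobs_to_close = int(value_map['n_jobs_to_fill'] / 3)
    print(jedi_taskid, value_map['site'], n_jobs_to_close)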
def registerDatasetLocation(self, datasetName, location, lifetime=None, owner=None):
    methodName = 'registerDatasetLocation'
    methodName = '{0} datasetName={1} location={2}'.format(methodName, datasetName, location)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    try:
        # cleanup DN
        owner = parse_dn(owner)
        # get DQ2 API
        dq2 = DQ2()
        # set
        dq2.registerDatasetLocation(datasetName, location, lifetime=lifetime)
        dq2.setReplicaMetaDataAttribute(datasetName, location, 'owner', owner)
    except:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED, True
def setDatasetMetadata(self, datasetName, metadataName, metadataValue):
    methodName = 'setDatasetMetadata'
    methodName = '{0} datasetName={1} metadataName={2} metadataValue={3}'.format(methodName, datasetName,
                                                                                 metadataName, metadataValue)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    try:
        # get DQ2 API
        dq2 = DQ2()
        # set
        dq2.setMetaDataAttribute(datasetName, metadataName, metadataValue)
    except DQUnknownDatasetException:
        pass
    except:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED, True
def doAction(self):
    try:
        # get logger
        tmpLog = MsgWrapper(logger)
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassgin(tmpLog)
        # action for throttled
        self.doActionForThrottled(tmpLog)
        # action for high prio pending
        for minPriority, timeoutVal in [(950, 10),
                                        (900, 30),
                                        ]:
            self.doActionForHighPrioPending(tmpLog, minPriority, timeoutVal)
    except:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0} {1}'.format(errtype, errvalue))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def deleteDataset(self, datasetName, emptyOnly, ignoreUnknown=False):
    methodName = 'deleteDataset'
    methodName = '{0} datasetName={1}'.format(methodName, datasetName)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    isOK = True
    retStr = ''
    nFiles = -1
    try:
        # get DQ2 API
        dq2 = DQ2()
        # get the number of files
        if emptyOnly:
            nFiles = dq2.getNumberOfFiles(datasetName)
        # erase
        if not emptyOnly or nFiles == 0:
            dq2.eraseDataset(datasetName)
            retStr = 'deleted {0}'.format(datasetName)
        else:
            retStr = 'keep {0} where {1} files are available'.format(datasetName, nFiles)
    except DQUnknownDatasetException:
        if ignoreUnknown:
            pass
        else:
            isOK = False
    except:
        isOK = False
    if isOK:
        tmpLog.info('done')
        return self.SC_SUCCEEDED, retStr
    else:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 100
            taskList = self.taskList.get(nTasks)
            totalTasks, idxTasks = self.taskList.stat()
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # make logger
            tmpLog = MsgWrapper(self.logger)
            tmpLog.info('start TaskBrokerThread {0}/{1} for jediTaskID={2}'.format(idxTasks, totalTasks, taskList))
            tmpStat = Interaction.SC_SUCCEEDED
            # get TaskSpecs
            tmpListToAssign = []
            for tmpTaskItem in taskList:
                tmpListItem = self.taskBufferIF.getTasksToBeProcessed_JEDI(None, None, None, None, None,
                                                                           simTasks=[tmpTaskItem],
                                                                           readMinFiles=True)
                if tmpListItem is None:
                    # failed
                    tmpLog.error('failed to get the input chunks for jediTaskID={0}'.format(tmpTaskItem))
                    tmpStat = Interaction.SC_FAILED
                    break
                tmpListToAssign += tmpListItem
            # get impl
            if tmpStat == Interaction.SC_SUCCEEDED:
                tmpLog.info('getting Impl')
                try:
                    impl = self.implFactory.getImpl(self.vo, self.prodSourceLabel)
                    if impl is None:
                        # task refiner is undefined
                        tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(self.vo, self.prodSourceLabel))
                        tmpStat = Interaction.SC_FAILED
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('getImpl failed with {0}:{1}'.format(errtype.__name__, errvalue))
                    tmpStat = Interaction.SC_FAILED
            # brokerage
            if tmpStat == Interaction.SC_SUCCEEDED:
                tmpLog.info('brokerage with {0} for {1} tasks '.format(impl.__class__.__name__, len(tmpListToAssign)))
                try:
                    tmpStat = impl.doBrokerage(tmpListToAssign, self.vo, self.prodSourceLabel,
                                               self.workQueue, self.resource_name)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('doBrokerage failed with {0}:{1}'.format(errtype.__name__, errvalue))
                    tmpStat = Interaction.SC_FAILED
            # register
            if tmpStat != Interaction.SC_SUCCEEDED:
                tmpLog.error('failed')
            else:
                tmpLog.info('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def start(self):
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.info('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # rescue picked files
                    tmpLog.info('rescue tasks with picked files for vo={0} label={1}'.format(vo, prodSourceLabel))
                    tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI(vo, prodSourceLabel,
                                                                      jedi_config.watchdog.waitForPicked)
                    if tmpRet == None:
                        # failed
                        tmpLog.error('failed to rescue')
                    else:
                        tmpLog.info('rescued {0} tasks'.format(tmpRet))
                    # reactivate pending tasks
                    tmpLog.info('reactivate pending tasks for vo={0} label={1}'.format(vo, prodSourceLabel))
                    tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI(vo, prodSourceLabel,
                                                                           jedi_config.watchdog.waitForPending,
                                                                           jedi_config.watchdog.timeoutForPending)
                    if tmpRet == None:
                        # failed
                        tmpLog.error('failed to reactivate')
                    else:
                        tmpLog.info('reactivated {0} tasks'.format(tmpRet))
                    # vo/prodSourceLabel specific action
                    impl = self.getImpl(vo, prodSourceLabel)
                    if impl != None:
                        tmpLog.info('special action for vo={0} label={1} with {2}'.format(vo, prodSourceLabel, impl.__class__.__name__))
                        tmpStat = impl.doAction()
                        if tmpStat != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to run special action for vo={0} label={1}'.format(vo, prodSourceLabel))
                        else:
                            tmpLog.info('done for vo={0} label={1}'.format(vo, prodSourceLabel))
            tmpLog.info('done')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
        # sleep if needed
        loopCycle = jedi_config.watchdog.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        sleepPeriod = loopCycle - timeDelta.seconds
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()
def doActionForReassgin(self, gTmpLog):
    # get DDM I/F
    ddmIF = self.ddmIF.getInterface(self.vo)
    # get site mapper
    siteMapper = self.taskBufferIF.getSiteMapper()
    # get tasks to get reassigned
    taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo, self.prodSourceLabel)
    gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
    for taskSpec in taskList:
        tmpLog = MsgWrapper(logger, '<jediTaskID={0}'.format(taskSpec.jediTaskID))
        tmpLog.debug('start to reassign')
        # DDM backend
        ddmBackEnd = taskSpec.getDdmBackEnd()
        # update cloudtasks
        tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi', taskSpec.jediTaskID, taskSpec.cloud, 'assigned', True)
        if tmpStat != 'SUCCEEDED':
            tmpLog.error('failed to update CloudTasks')
            continue
        # get datasets
        tmpStat, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID, ['output', 'log'])
        if tmpStat != True:
            tmpLog.error('failed to get datasets')
            continue
        # check cloud
        if not siteMapper.checkCloud(taskSpec.cloud):
            tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
            continue
        # get T1
        t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
        t1Site = siteMapper.getSite(t1SiteName)
        # loop over all datasets
        isOK = True
        for datasetSpec in datasetSpecList:
            tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
            # get location
            location = siteMapper.getDdmEndpoint(t1Site.sitename, datasetSpec.storageToken)
            # make subscription
            tmpLog.debug('registering subscription to {0} with backend={1}'.format(location, ddmBackEnd))
            tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName, location,
                                                        activity='Production', ignoreUnknown=True,
                                                        backEnd=ddmBackEnd)
            if tmpStat != True:
                tmpLog.error("failed to make subscription")
                isOK = False
                break
        # succeeded
        if isOK:
            # activate task
            if taskSpec.oldStatus in ['assigning', 'exhausted']:
                taskSpec.status = 'ready'
            else:
                taskSpec.status = taskSpec.oldStatus
            taskSpec.oldStatus = None
            self.taskBufferIF.updateTask_JEDI(taskSpec, {'jediTaskID': taskSpec.jediTaskID})
            tmpLog.debug('finished to reassign')
def findMissingFiles(self, jediTaskID, cloudName):
    tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(jediTaskID))
    tmpLog.debug('start findMissingFiles')
    # return for failure
    retError = self.SC_FAILED
    # get datasets
    tmpSt, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(jediTaskID, ['input'], True)
    if not tmpSt:
        tmpLog.error('failed to get the list of datasets')
        return retError
    # loop over all datasets
    for datasetSpec in datasetSpecList:
        # check only master dataset
        if not datasetSpec.isMaster():
            continue
        tmpLog.debug('checking {0}'.format(datasetSpec.datasetName))
        # get ddmIF
        ddmIF = self.ddmIF.getInterface(datasetSpec.vo)
        if ddmIF == None:
            tmpLog.error('failed to get DDM I/F for vo={0}'.format(datasetSpec.vo))
            return retError
        # get the list of sites where data is available
        tmpSt, tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper, ddmIF,
                                                          datasetSpec.datasetName)
        if tmpSt != self.SC_SUCCEEDED:
            tmpLog.error('failed to get the list of sites where {0} is available, since {1}'.format(datasetSpec.datasetName,
                                                                                                    tmpRet))
            return retError
        dataSiteMap = tmpRet
        # data is unavailable in cloud
        if not dataSiteMap.has_key(cloudName):
            tmpLog.error('{0} is unavailable in cloud={1} map={2}'.format(datasetSpec.datasetName, cloudName, str(dataSiteMap)))
            return retError
        # mapping between sites and storage endpoints
        checkedSites = [self.siteMapper.getCloud(cloudName)['source']] + dataSiteMap[cloudName]['t2']
        siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(checkedSites, self.siteMapper)
        # get available files per site/endpoint
        tmpAvFileMap = ddmIF.getAvailableFiles(datasetSpec,
                                               siteStorageEP,
                                               self.siteMapper,
                                               ngGroup=[1],
                                               checkLFC=True)
        if tmpAvFileMap == None:
            tmpLog.error('failed to get available file list for {0}'.format(datasetSpec.datasetName))
            return retError
        # check availability
        missingFiles = []
        for fileSpec in datasetSpec.Files:
            fileFound = False
            for tmpSiteName, availableFilesMap in tmpAvFileMap.iteritems():
                for tmpStorageType, availableFiles in availableFilesMap.iteritems():
                    for availableFile in availableFiles:
                        if fileSpec.lfn == availableFile.lfn:
                            fileFound = True
                            break
                    if fileFound:
                        break
                if fileFound:
                    break
            # missing
            if not fileFound:
                missingFiles.append(fileSpec.fileID)
                tmpLog.debug('{0} missing'.format(fileSpec.lfn))
        # update contents
        if missingFiles != []:
            tmpSt = self.taskBufferIF.setMissingFiles_JEDI(jediTaskID, datasetSpec.datasetID, missingFiles)
            if not tmpSt:
                tmpLog.error('failed to set missing files in {0}'.format(datasetSpec.datasetName))
                return retError
    tmpLog.debug('done findMissingFiles')
    return self.SC_SUCCEEDED
def runImpl(self):
    # cutoff for disk in TB
    diskThreshold = 5 * 1024
    # dataset type to ignore file availability check
    datasetTypeToSkipCheck = ['log']
    thrInputSize = 1024 * 1024 * 1024
    thrInputNum = 100
    thrInputSizeFrac = 0.1
    thrInputNumFrac = 0.1
    cutOffRW = 50
    negWeightTape = 0.001
    # main
    lastJediTaskID = None
    siteMapper = self.taskBufferIF.getSiteMapper()
    while True:
        try:
            taskInputList = self.inputList.get(1)
            # no more datasets
            if len(taskInputList) == 0:
                self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__, self.numTasks))
                return
            # loop over all tasks
            for taskSpec, inputChunk in taskInputList:
                lastJediTaskID = taskSpec.jediTaskID
                # make logger
                tmpLog = MsgWrapper(self.logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID), monToken='{0}'.format(taskSpec.jediTaskID))
                tmpLog.debug('start')
                # get nuclei
                nucleusList = siteMapper.nuclei
                if taskSpec.nucleus in nucleusList:
                    candidateNucleus = taskSpec.nucleus
                else:
                    tmpLog.debug('got {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # check status
                    newNucleusList = {}
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        if not tmpNucleusSpec.state in ['ACTIVE']:
                            tmpLog.debug('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus, tmpNucleusSpec.state))
                        else:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed status check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # check endpoint
                    newNucleusList = {}
                    tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output', 'log'])
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        toSkip = False
                        for tmpDatasetSpec in tmpDatasetSpecList:
                            # ignore distributed datasets
                            if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                continue
                            # get endpoint with the pattern
                            tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                            if tmpEP == None:
                                tmpLog.debug('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                    tmpDatasetSpec.storageToken))
                                toSkip = True
                                break
                            # check state
                            """
                            if not tmpEP['state'] in ['ACTIVE']:
                                tmpLog.debug('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                         tmpEP['ddm_endpoint_name'],
                                                                                                                         tmpEP['state']))
                                toSkip = True
                                break
                            """
                            # check space
                            tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                            if tmpSpaceSize < diskThreshold:
                                tmpLog.debug('  skip nucleus={0} since disk shortage ({1}<{2}) at endpoint {3} criteria=-space'.format(tmpNucleus,
                                                                                                                                       tmpSpaceSize,
                                                                                                                                       diskThreshold,
                                                                                                                                       tmpEP['state']))
                                toSkip = True
                                break
                        if not toSkip:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed endpoint check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # data locality
                    toSkip = False
                    availableData = {}
                    for datasetSpec in inputChunk.getDatasets():
                        # only for real datasets
                        if datasetSpec.isPseudo():
                            continue
                        # ignore DBR
                        if DataServiceUtils.isDBR(datasetSpec.datasetName):
                            continue
                        # skip locality check
                        if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                            continue
                        # get nuclei where data is available
                        tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper, self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys())
                        if tmpSt != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            toSkip = True
                            break
                        # sum
                        for tmpNucleus, tmpVals in tmpRet.iteritems():
                            if not tmpNucleus in availableData:
                                availableData[tmpNucleus] = tmpVals
                            else:
                                availableData[tmpNucleus] = dict((k, v + tmpVals[k]) for (k, v) in availableData[tmpNucleus].iteritems())
                    if toSkip:
                        continue
                    if availableData != {}:
                        newNucleusList = {}
                        # skip if no data
                        for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                            if availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                    availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                tmpLog.debug('  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                        availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                        availableData[tmpNucleus]['tot_size'],
                                                                                                                                        thrInputSizeFrac))
                            elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                    availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                tmpLog.debug('  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                          availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                          availableData[tmpNucleus]['tot_num'],
                                                                                                                                          thrInputNumFrac))
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        nucleusList = newNucleusList
                        tmpLog.debug('{0} candidates passed data check'.format(len(nucleusList)))
                        if nucleusList == {}:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # ability to execute jobs
                    newNucleusList = {}
                    # get all panda sites
                    tmpSiteList = []
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        tmpSiteList += tmpNucleusSpec.allPandaSites
                    tmpSiteList = list(set(tmpSiteList))
                    tmpLog.debug('===== start for job check')
                    jobBroker = AtlasProdJobBroker(self.ddmIF, self.taskBufferIF)
                    tmpSt, tmpRet = jobBroker.doBrokerage(taskSpec, taskSpec.cloud, inputChunk, None, True,
                                                          tmpSiteList, tmpLog)
                    tmpLog.debug('===== done for job check')
                    if tmpSt != Interaction.SC_SUCCEEDED:
                        tmpLog.debug('failed to get sites where jobs can run. Use any nuclei where input is available')
                        # use any nuclei where input is available if no sites can run jobs
                        tmpRet = tmpSiteList
                    okNuclei = set()
                    for tmpSite in tmpRet:
                        siteSpec = siteMapper.getSite(tmpSite)
                        okNuclei.add(siteSpec.pandasite)
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleus in okNuclei:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                        else:
                            tmpLog.debug('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed job check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # RW
                    taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                    ######################################
                    # weight
                    self.prioRW.acquire()
                    nucleusRW = self.prioRW[taskSpec.currentPriority]
                    self.prioRW.release()
                    totalWeight = 0
                    nucleusweights = []
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        if not tmpNucleus in nucleusRW:
                            nucleusRW[tmpNucleus] = 0
                        wStr = '1'
                        # with RW
                        if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                            weight = 1 / float(nucleusRW[tmpNucleus])
                            wStr += '/({0}=RW)'.format(nucleusRW[tmpNucleus])
                        else:
                            weight = 1
                            wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus], cutOffRW)
                        # with data
                        if availableData != {}:
                            weight *= float(availableData[tmpNucleus]['ava_size_any'])
                            weight /= float(availableData[tmpNucleus]['tot_size'])
                            wStr += '*({0}=available input size on DISK/TAPE)'.format(availableData[tmpNucleus]['ava_size_any'])
                            wStr += '/({0}=total input size)'.format(availableData[tmpNucleus]['tot_size'])
                            # negative weight for tape
                            if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                weight *= negWeightTape
                                wStr += '*({0}=weight for TAPE)'.format(negWeightTape)
                        tmpLog.debug('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus, weight, wStr))
                        totalWeight += weight
                        nucleusweights.append((tmpNucleus, weight))
                    tmpLog.debug('final {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # final selection
                    tgtWeight = random.uniform(0, totalWeight)
                    candidateNucleus = None
                    for tmpNucleus, weight in nucleusweights:
                        tgtWeight -= weight
                        if tgtWeight <= 0:
                            candidateNucleus = tmpNucleus
                            break
                    if candidateNucleus == None:
                        candidateNucleus = nucleusweights[-1][0]
                ######################################
                # update
                nucleusSpec = nucleusList[candidateNucleus]
                # get output/log datasets
                tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output', 'log'])
                # get destinations
                retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec, tmpDatasetSpecs)}
                tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus, tmpRet))
                # update RW table
                self.prioRW.acquire()
                for prio, rwMap in self.prioRW.iteritems():
                    if prio > taskSpec.currentPriority:
                        continue
                    if candidateNucleus in rwMap:
                        rwMap[candidateNucleus] += taskRW
                    else:
                        rwMap[candidateNucleus] = taskRW
                self.prioRW.release()
        except:
            errtype, errvalue = sys.exc_info()[:2]
            errMsg = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__, errtype.__name__, errvalue)
            errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
            errMsg += traceback.format_exc()
            logger.error(errMsg)
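The final selection above is a standard weighted random draw over (nucleus, weight) pairs. Extracted here as a self-contained helper for clarity; this is only a sketch of the same logic, and the nucleus names are made up.

import random

def pick_weighted(nucleus_weights):
    """Pick one nucleus with probability proportional to its weight,
    mirroring the final-selection loop above."""
    total_weight = sum(weight for _, weight in nucleus_weights)
    tgt_weight = random.uniform(0, total_weight)
    for nucleus, weight in nucleus_weights:
        tgt_weight -= weight
        if tgt_weight <= 0:
            return nucleus
    # fall back to the last candidate, as the original loop does
    return nucleus_weights[-1][0]

print(pick_weighted([('NUCLEUS_A', 0.7), ('NUCLEUS_B', 0.2), ('NUCLEUS_C', 0.1)]))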
def doCheck(self, taskSpecList):
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug("start doCheck")
    # return for failure
    retFatal = self.SC_FATAL, {}
    retTmpError = self.SC_FAILED, {}
    # get list of jediTaskIDs
    taskIdList = []
    taskSpecMap = {}
    for taskSpec in taskSpecList:
        taskIdList.append(taskSpec.jediTaskID)
        taskSpecMap[taskSpec.jediTaskID] = taskSpec
    # check with panda
    tmpLog.debug("check with panda")
    tmpPandaStatus, cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
    if tmpPandaStatus != 0:
        tmpLog.error("failed to see clouds")
        return retTmpError
    # make return map
    retMap = {}
    for tmpTaskID, tmpCoreName in cloudsInPanda.iteritems():
        tmpLog.debug("jediTaskID={0} -> {1}".format(tmpTaskID, tmpCoreName))
        if not tmpCoreName in ["NULL", "", None]:
            taskSpec = taskSpecMap[tmpTaskID]
            if taskSpec.useWorldCloud():
                # get destinations for WORLD cloud
                ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                # get site
                siteSpec = self.siteMapper.getSite(tmpCoreName)
                # get output/log datasets
                tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                    tmpTaskID, ["output", "log"]
                )
                # get destinations
                retMap[tmpTaskID] = []
                for datasetSpec in tmpDatasetSpecs:
                    token = ddmIF.convertTokenToEndpoint(siteSpec.ddm, datasetSpec.storageToken)
                    # use default endpoint
                    if token == None:
                        token = siteSpec.ddm
                    retMap[tmpTaskID].append(
                        {
                            "datasetID": datasetSpec.datasetID,
                            "token": "dst:{0}".format(token),
                            "destination": tmpCoreName,
                        }
                    )
            else:
                retMap[tmpTaskID] = tmpCoreName
    tmpLog.debug("ret {0}".format(str(retMap)))
    # return
    tmpLog.debug("done")
    return self.SC_SUCCEEDED, retMap
def doAction(self):
    try:
        # get logger
        tmpLog = MsgWrapper(logger)
        tmpLog.debug('start')
        origTmpLog = tmpLog
        # check every 60 min
        checkInterval = 60
        # get lib.tgz for waiting jobs
        libList = self.taskBufferIF.getLibForWaitingRunJob_JEDI(self.vo, self.prodSourceLabel, checkInterval)
        tmpLog.debug('got {0} lib.tgz files'.format(len(libList)))
        # activate or kill orphan jobs which were submitted to use lib.tgz when the lib.tgz was being produced
        for prodUserName, datasetName, tmpFileSpec in libList:
            tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(tmpFileSpec.jediTaskID))
            tmpLog.debug('start')
            # check status of lib.tgz
            if tmpFileSpec.status == 'failed':
                # get buildJob
                pandaJobSpecs = self.taskBufferIF.peekJobs([tmpFileSpec.PandaID],
                                                           fromDefined=False,
                                                           fromActive=False,
                                                           fromWaiting=False)
                pandaJobSpec = pandaJobSpecs[0]
                if pandaJobSpec != None:
                    # kill
                    self.taskBufferIF.updateJobs([pandaJobSpec], False)
                    tmpLog.debug('  killed downstream jobs for user="{0}" with libDS={1}'.format(prodUserName, datasetName))
                else:
                    # PandaJobSpec not found
                    tmpLog.error('  cannot find PandaJobSpec for user="{0}" with PandaID={1}'.format(prodUserName,
                                                                                                     tmpFileSpec.PandaID))
            elif tmpFileSpec.status == 'finished':
                # set metadata
                self.taskBufferIF.setGUIDs([{'guid': tmpFileSpec.GUID,
                                             'lfn': tmpFileSpec.lfn,
                                             'checksum': tmpFileSpec.checksum,
                                             'fsize': tmpFileSpec.fsize,
                                             'scope': tmpFileSpec.scope,
                                             }])
                # get lib dataset
                dataset = self.taskBufferIF.queryDatasetWithMap({'name': datasetName})
                if dataset != None:
                    # activate jobs
                    aThr = Activator(self.taskBufferIF, dataset)
                    aThr.start()
                    aThr.join()
                    tmpLog.debug('  activated downstream jobs for user="{0}" with libDS={1}'.format(prodUserName, datasetName))
                else:
                    # datasetSpec not found
                    tmpLog.error('  cannot find datasetSpec for user="{0}" with libDS={1}'.format(prodUserName, datasetName))
            else:
                # lib.tgz is not ready
                tmpLog.debug('  keep waiting for user="{0}" libDS={1}'.format(prodUserName, datasetName))
    except:
        tmpLog = origTmpLog
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0} {1}'.format(errtype, errvalue))
    # return
    tmpLog = origTmpLog
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def findMissingFiles(self, jediTaskID, cloudName):
    tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(jediTaskID))
    tmpLog.debug('start findMissingFiles')
    # return for failure
    retError = self.SC_FAILED
    # get datasets
    tmpSt, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(jediTaskID, ['input'], True)
    if not tmpSt:
        tmpLog.error('failed to get the list of datasets')
        return retError
    # loop over all datasets
    for datasetSpec in datasetSpecList:
        # check only master dataset
        if not datasetSpec.isMaster():
            continue
        tmpLog.debug('checking {0}'.format(datasetSpec.datasetName))
        # get ddmIF
        ddmIF = self.ddmIF.getInterface(datasetSpec.vo)
        if ddmIF == None:
            tmpLog.error('failed to get DDM I/F for vo={0}'.format(datasetSpec.vo))
            return retError
        # get the list of sites where data is available
        tmpSt, tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper, ddmIF, datasetSpec.datasetName)
        if tmpSt != self.SC_SUCCEEDED:
            tmpLog.error('failed to get the list of sites where {0} is available, since {1}'.format(datasetSpec.datasetName, tmpRet))
            return retError
        dataSiteMap = tmpRet
        # data is unavailable in cloud
        if not dataSiteMap.has_key(cloudName):
            tmpLog.error('{0} is unavailable in cloud={1} map={2}'.format(datasetSpec.datasetName, cloudName, str(dataSiteMap)))
            return retError
        # mapping between sites and storage endpoints
        checkedSites = [self.siteMapper.getCloud(cloudName)['source']] + dataSiteMap[cloudName]['t2']
        siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(checkedSites, self.siteMapper)
        # get available files per site/endpoint
        tmpAvFileMap = ddmIF.getAvailableFiles(datasetSpec,
                                               siteStorageEP,
                                               self.siteMapper,
                                               ngGroup=[1],
                                               checkLFC=True)
        if tmpAvFileMap == None:
            tmpLog.error('failed to get available file list for {0}'.format(datasetSpec.datasetName))
            return retError
        # check availability
        missingFiles = []
        for fileSpec in datasetSpec.Files:
            fileFound = False
            for tmpSiteName, availableFilesMap in tmpAvFileMap.iteritems():
                for tmpStorageType, availableFiles in availableFilesMap.iteritems():
                    for availableFile in availableFiles:
                        if fileSpec.lfn == availableFile.lfn:
                            fileFound = True
                            break
                    if fileFound:
                        break
                if fileFound:
                    break
            # missing
            if not fileFound:
                missingFiles.append(fileSpec.fileID)
                tmpLog.debug('{0} missing'.format(fileSpec.lfn))
        # update contents
        if missingFiles != []:
            tmpSt = self.taskBufferIF.setMissingFiles_JEDI(jediTaskID, datasetSpec.datasetID, missingFiles)
            if not tmpSt:
                tmpLog.error('failed to set missing files in {0}'.format(datasetSpec.datasetName))
                return retError
    tmpLog.debug('done findMissingFiles')
    return self.SC_SUCCEEDED
def doCheck(self, taskSpecList):
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doCheck')
    # return for failure
    retFatal = self.SC_FATAL, {}
    retTmpError = self.SC_FAILED, {}
    # get list of jediTaskIDs
    taskIdList = []
    taskSpecMap = {}
    for taskSpec in taskSpecList:
        taskIdList.append(taskSpec.jediTaskID)
        taskSpecMap[taskSpec.jediTaskID] = taskSpec
    # check with panda
    tmpLog.debug('check with panda')
    tmpPandaStatus, cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
    if tmpPandaStatus != 0:
        tmpLog.error('failed to see clouds')
        return retTmpError
    # make return map
    retMap = {}
    for tmpTaskID, tmpCoreName in cloudsInPanda.iteritems():
        tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID, tmpCoreName))
        if not tmpCoreName in ['NULL', '', None]:
            taskSpec = taskSpecMap[tmpTaskID]
            if taskSpec.useWorldCloud():
                # get destinations for WORLD cloud
                ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                # get site
                siteSpec = self.siteMapper.getSite(tmpCoreName)
                # get output/log datasets
                tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID, ['output', 'log'])
                # get destinations
                retMap[tmpTaskID] = []
                for datasetSpec in tmpDatasetSpecs:
                    token = ddmIF.convertTokenToEndpoint(siteSpec.ddm, datasetSpec.storageToken)
                    # use default endpoint
                    if token == None:
                        token = siteSpec.ddm
                    retMap[tmpTaskID].append({'datasetID': datasetSpec.datasetID,
                                              'token': 'dst:{0}'.format(token),
                                              'destination': tmpCoreName})
            else:
                retMap[tmpTaskID] = tmpCoreName
    tmpLog.debug('ret {0}'.format(str(retMap)))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, retMap
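The retMap built by doCheck above maps each jediTaskID either to a plain cloud/site name or, for WORLD-cloud tasks, to a list of per-dataset destination dicts. A made-up example of both shapes (IDs, sites and endpoint tokens are hypothetical):

# Illustrative return value of doCheck(); all values are invented for the example.
retMap = {
    101: 'CERN',                                  # non-WORLD task: plain site/cloud name
    102: [                                        # WORLD-cloud task: one entry per output/log dataset
        {'datasetID': 2001, 'token': 'dst:CERN-PROD_DATADISK', 'destination': 'CERN'},
        {'datasetID': 2002, 'token': 'dst:CERN-PROD_DATADISK', 'destination': 'CERN'},
    ],
}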
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, ' <jediTaskID={0}>'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill', 'finish', 'reassign']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # loop twice to see immediate result
                    for iLoop in range(2):
                        # get active PandaIDs to be killed
                        if commandStr == 'reassign' and commentStr != None and 'soft reassign' in commentStr:
                            pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                        else:
                            pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID, True)
                        if pandaIDs == None:
                            tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                            tmpStat = Interaction.SC_FAILED
                        # kill jobs or update task
                        if tmpStat == Interaction.SC_SUCCEEDED:
                            if pandaIDs == []:
                                # done since no active jobs
                                tmpMsg = 'completed cleaning jobs'
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpTaskSpec = JediTaskSpec()
                                tmpTaskSpec.jediTaskID = jediTaskID
                                updateTaskStatus = True
                                if commandStr != 'reassign':
                                    # reset oldStatus
                                    # keep oldStatus for task reassignment since it is reset when actually reassigned
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                else:
                                    # extract cloud or site
                                    if commentStr != None:
                                        tmpItems = commentStr.split(':')
                                        if tmpItems[0] == 'cloud':
                                            tmpTaskSpec.cloud = tmpItems[1]
                                        else:
                                            tmpTaskSpec.site = tmpItems[1]
                                        tmpMsg = 'set {0}={1}'.format(tmpItems[0], tmpItems[1])
                                        tmpLog.sendMsg(tmpMsg, self.msgType)
                                        tmpLog.info(tmpMsg)
                                        # back to oldStatus if necessary
                                        if tmpItems[2] == 'y':
                                            tmpTaskSpec.status = oldStatus
                                            tmpTaskSpec.forceUpdate('oldStatus')
                                            updateTaskStatus = False
                                if commandStr == 'reassign':
                                    tmpTaskSpec.forceUpdate('errorDialog')
                                if updateTaskStatus:
                                    tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status)
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': jediTaskID})
                                tmpLog.info('done with {0}'.format(str(tmpRet)))
                                break
                            else:
                                # kill only in the first loop
                                if iLoop > 0:
                                    break
                                # wait or kill jobs
                                if 'soft finish' in commentStr:
                                    tmpMsg = "waiting {0} jobs for soft finish".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = True
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                                    break
                                else:
                                    tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpLog.sendMsg(tmpMsg, self.msgType)
                                    if commandStr in ['reassign', 'finish']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '52', True)
                                    else:
                                        # normal kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '50', True)
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry', 'incexec']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles', 'fixedSandbox']:
                                try:
                                    del taskParamMap[newKey]
                                except:
                                    pass
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey, newVal in newParamMap.iteritems():
                                if newVal == None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$', tmpParam['value']) != None:
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if taskParamMap.has_key('buildSpec'):
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if taskParamMap.has_key('mergeSpec'):
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+', '-a {0}'.format(taskParamMap['fixedSandbox']), taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID, strTaskParams)
                            if tmpRet != True:
                                tmpLog.error('failed to update task params')
                                continue
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__, errvalue))
                            continue
                    # retry failed files
                    tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID, commandStr)
                    if tmpRet == True:
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            errStr = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__, errtype.__name__, errvalue)
            errStr += traceback.format_exc()
            logger.error(errStr)
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
    # make logger
    tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    tmpLog.debug('start')
    # return for failure
    retFatal = self.SC_FATAL, inputChunk
    retTmpError = self.SC_FAILED, inputChunk
    # get sites in the cloud
    if not taskSpec.site in ['', None]:
        scanSiteList = [taskSpec.site]
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
    elif inputChunk.getPreassignedSite() != None:
        scanSiteList = [inputChunk.getPreassignedSite()]
        tmpLog.debug('site={0} is pre-assigned in masterDS'.format(inputChunk.getPreassignedSite()))
    else:
        scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
        tmpLog.debug('cloud=%s has %s candidates' % (cloudName, len(scanSiteList)))
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check site status
        skipFlag = False
        if tmpSiteSpec.status != 'online':
            skipFlag = True
        if not skipFlag:
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug('  skip %s due to status=%s' % (tmpSiteName, tmpSiteSpec.status))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for memory
    minRamCount = max(taskSpec.ramCount, inputChunk.ramCount)
    if not minRamCount in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                tmpLog.debug('  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.maxmemory,
                                                                                                      minRamCount))
                continue
            if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                tmpLog.debug('  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                     tmpSiteSpec.minmemory,
                                                                                                     minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                         minRamCount, taskSpec.ramUnit))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retTmpError
    ######################################
    # selection for scratch disk
    minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
    minDiskCountS = minDiskCountS / 1024 / 1024
    # size for direct IO sites
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
        minDiskCountR = minDiskCountR / 1024 / 1024
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site
        if tmpSiteSpec.maxwdir != 0:
            if tmpSiteSpec.isDirectIO():
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug('  skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                     tmpSiteSpec.maxwdir,
                                                                                     minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for available space in SE
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # free space must be >= 200GB
        diskThreshold = 200
        tmpSpaceSize = tmpSiteSpec.space
        if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
            tmpLog.debug('  skip {0} due to disk shortage in SE = {1} < {2}GB'.format(tmpSiteName,
                                                                                      tmpSiteSpec.space,
                                                                                      diskThreshold))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if not minWalltime in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug('  skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.maxtime,
                                                                                                        minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug('  skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.mintime,
                                                                                                       minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),
                                                                           minWalltime, taskSpec.walltimeUnit))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retTmpError
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        nPilot = 0
        if nWNmap.has_key(tmpSiteName):
            nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
            tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
            # continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # sites already used by task
    tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
    if not tmpSt:
        tmpLog.error('failed to get sites which already used by task')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # calculate weight
    tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 taskSpec.currentPriority)
    if not tmpSt:
        tmpLog.error('failed to get job statistics with priority')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    weightMap = {}
    candidateSpecList = []
    preSiteCandidateSpec = None
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'running', None, None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'defined', None, None)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'activated', None, None)
        weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        # set weight
        siteCandidateSpec.weight = weight
        # append
        if tmpSiteName in sitesUsedByTask:
            candidateSpecList.append(siteCandidateSpec)
        else:
            if not weightMap.has_key(weight):
                weightMap[weight] = []
            weightMap[weight].append(siteCandidateSpec)
    # limit the number of sites
    maxNumSites = 5
    weightList = weightMap.keys()
    weightList.sort()
    weightList.reverse()
    for weightVal in weightList:
        if len(candidateSpecList) >= maxNumSites:
            break
        sitesWithWeight = weightMap[weightVal]
        random.shuffle(sitesWithWeight)
        candidateSpecList += sitesWithWeight[:(maxNumSites - len(candidateSpecList))]
    # collect site names
    scanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        scanSiteList.append(siteCandidateSpec.siteName)
    # append candidates
    newScanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        tmpSiteName = siteCandidateSpec.siteName
        # append
        inputChunk.addSiteCandidate(siteCandidateSpec)
        newScanSiteList.append(siteCandidateSpec.siteName)
        tmpLog.debug('  use {0} with weight={1}'.format(siteCandidateSpec.siteName,
                                                        siteCandidateSpec.weight))
    scanSiteList = newScanSiteList
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, inputChunk
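The per-site weight used in the final procedure above favours sites with many running jobs and few assigned or activated ones. A tiny numeric illustration of the same formula (the job counts are made up):

def site_weight(n_running, n_activated, n_assigned):
    # weight = (running + 1) / (activated + assigned + 1) / (assigned + 1),
    # as computed in the final procedure above
    return float(n_running + 1) / float(n_activated + n_assigned + 1) / float(n_assigned + 1)

# made-up job counts: a busy site with a short queue gets a much larger weight
print(site_weight(n_running=100, n_activated=10, n_assigned=5))    # ~1.05
print(site_weight(n_running=100, n_activated=200, n_assigned=50))  # ~0.008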
from pandacommon.pandalogger.PandaLogger import PandaLogger

from pandajedi.jedicore.MsgWrapper import MsgWrapper
from pandajedi.jedicore.JediTaskBufferInterface import JediTaskBufferInterface
from pandajedi.jediddm.DDMInterface import DDMInterface
from pandajedi.jediorder.JobBroker import JobBroker
from pandajedi.jediorder.JobSplitter import JobSplitter
from pandajedi.jediorder.JobGenerator import JobGeneratorThread
from pandajedi.jedicore.ThreadUtils import ThreadPool
from pandajedi.jediorder.TaskSetupper import TaskSetupper

import sys

logger = PandaLogger().getLogger('JobGenerator')
tmpLog = MsgWrapper(logger)

tbIF = JediTaskBufferInterface()
tbIF.setupInterface()

siteMapper = tbIF.getSiteMapper()

ddmIF = DDMInterface()
ddmIF.setupInterface()

jediTaskID = int(sys.argv[1])
datasetIDs = None
if len(sys.argv) > 2:
    datasetIDs = [int(sys.argv[2])]
def start(self):
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.info('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # prepare tasks to be finished
                    tmpLog.info('preparing tasks to be finished for vo={0} label={1}'.format(vo, prodSourceLabel))
                    tmpRet = self.taskBufferIF.prepareTasksToBeFinished_JEDI(vo, prodSourceLabel,
                                                                             jedi_config.postprocessor.nTasks,
                                                                             pid=self.pid)
                    if tmpRet == None:
                        # failed
                        tmpLog.error('failed to prepare tasks')
                    # get tasks to be finished
                    tmpLog.info('getting tasks to be finished')
                    tmpList = self.taskBufferIF.getTasksToBeFinished_JEDI(vo, prodSourceLabel,
                                                                          self.pid,
                                                                          jedi_config.postprocessor.nTasks)
                    if tmpList == None:
                        # failed
                        tmpLog.error('failed to get tasks to be finished')
                    else:
                        tmpLog.info('got {0} tasks'.format(len(tmpList)))
                        # put to a locked list
                        taskList = ListWithLock(tmpList)
                        # make thread pool
                        threadPool = ThreadPool()
                        # make workers
                        nWorker = jedi_config.postprocessor.nWorkers
                        for iWorker in range(nWorker):
                            thr = PostProcessorThread(taskList, threadPool,
                                                      self.taskBufferIF,
                                                      self.ddmIF,
                                                      self)
                            thr.start()
                        # join
                        threadPool.join()
            tmpLog.info('done')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
        # sleep if needed
        loopCycle = 60
        timeDelta = datetime.datetime.utcnow() - startTime
        sleepPeriod = loopCycle - timeDelta.seconds
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
def runImpl(self): while True: try: # get a part of list nTasks = 10 taskList = self.taskList.get(nTasks) # no more datasets if len(taskList) == 0: self.logger.debug( '{0} terminating since no more items'.format( self.__class__.__name__)) return # loop over all tasks for taskSpec in taskList: # make logger tmpLog = MsgWrapper( self.logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID)) tmpLog.info('start') tmpStat = Interaction.SC_SUCCEEDED # get impl impl = self.implFactory.instantiateImpl( taskSpec.vo, taskSpec.prodSourceLabel, None, self.taskBufferIF, self.ddmIF) if impl == None: # post processor is undefined tmpLog.error( 'post-processor is undefined for vo={0} sourceLabel={1}' .format(taskSpec.vo, taskSpec.prodSourceLabel)) tmpStat = Interaction.SC_FATAL # execute if tmpStat == Interaction.SC_SUCCEEDED: tmpLog.info('post-process with {0}'.format( impl.__class__.__name__)) try: impl.doPostProcess(taskSpec, tmpLog) except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error( 'doPostProcess failed with {0}:{1}'.format( errtype.__name__, errvalue)) tmpStat = Interaction.SC_FATAL # done if tmpStat == Interaction.SC_FATAL: # task is broken tmpErrStr = 'post-process failed' tmpLog.error(tmpErrStr) taskSpec.status = 'broken' taskSpec.setErrDiag(tmpErrStr) taskSpec.lockedBy = None self.taskBufferIF.updateTask_JEDI( taskSpec, {'jediTaskID': taskSpec.jediTaskID}) elif tmpStat == Interaction.SC_FAILED: tmpErrStr = 'post processing failed' taskSpec.setOnHold() taskSpec.setErrDiag(tmpErrStr, True) taskSpec.lockedBy = None self.taskBufferIF.updateTask_JEDI( taskSpec, {'jediTaskID': taskSpec.jediTaskID}) tmpLog.info('set task.status={0} since {1}'.format( taskSpec.status, taskSpec.errorDialog)) continue # final procedure try: impl.doFinalProcedure(taskSpec, tmpLog) except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error( 'doFinalProcedure failed with {0}:{1}'.format( errtype.__name__, errvalue)) # done tmpLog.info('done') except: errtype, errvalue = sys.exc_info()[:2] logger.error('{0} failed in runImpl() with {1}:{2}'.format( self.__class__.__name__, errtype.__name__, errvalue))
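runImpl() follows a worker-pull pattern: each thread repeatedly takes a small slice from a shared ListWithLock until the list is exhausted. The sketch below mimics that pattern with a plain list guarded by threading.Lock; LockedList and worker are illustrative names, not the JEDI classes.

import threading

class LockedList(object):
    def __init__(self, items):
        self._items = list(items)
        self._lock = threading.Lock()

    def get(self, n):
        # atomically take up to n items from the front of the list
        with self._lock:
            taken, self._items = self._items[:n], self._items[n:]
            return taken

def worker(locked_list, process, n_tasks=10):
    # pull slices until the shared list is empty, then terminate
    while True:
        chunk = locked_list.get(n_tasks)
        if not chunk:
            return
        for item in chunk:
            process(item)

# usage example: one worker draining 25 dummy task IDs in slices of 10
worker(LockedList(range(25)), lambda task_id: None)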
def doActionForReassgin(self,gTmpLog): # get DDM I/F ddmIF = self.ddmIF.getInterface(self.vo) # get site mapper siteMapper = self.taskBufferIF.getSiteMapper() # get tasks to get reassigned taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel) gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList))) for taskSpec in taskList: tmpLog = MsgWrapper(logger,'<jediTaskID={0}'.format(taskSpec.jediTaskID)) tmpLog.debug('start to reassign') # DDM backend ddmBackEnd = taskSpec.getDdmBackEnd() # get datasets tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,['output','log']) if tmpStat != True: tmpLog.error('failed to get datasets') continue # update DB if not taskSpec.useWorldCloud(): # update cloudtasks tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,taskSpec.cloud,'assigned',True) if tmpStat != 'SUCCEEDED': tmpLog.error('failed to update CloudTasks') continue # check cloud if not siteMapper.checkCloud(taskSpec.cloud): tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud)) continue else: # re-run task brokerage if taskSpec.nucleus in [None,'']: taskSpec.status = 'assigning' taskSpec.oldStatus = None taskSpec.setToRegisterDatasets() self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID}, setOldModTime=True) tmpLog.debug('set task_status={0} to trigger task brokerage again'.format(taskSpec.status)) continue # get nucleus nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus) if nucleusSpec == None: tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus)) continue # set nucleus retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,datasetSpecList)} tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap) # get T1/nucleus if not taskSpec.useWorldCloud(): t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest'] else: t1SiteName = nucleusSpec.getOnePandaSite() t1Site = siteMapper.getSite(t1SiteName) # loop over all datasets isOK = True for datasetSpec in datasetSpecList: tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName)) if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None: tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName)) continue # get location location = siteMapper.getDdmEndpoint(t1Site.sitename,datasetSpec.storageToken) # make subscription try: tmpLog.debug('registering subscription to {0} with backend={1}'.format(location, ddmBackEnd)) tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location, 'Production Output',asynchronous=True) if tmpStat != True: tmpLog.error("failed to make subscription") isOK = False break except: errtype,errvalue = sys.exc_info()[:2] tmpLog.warning('failed to make subscription with {0}:{1}'.format(errtype.__name__,errvalue)) isOK = False break # succeeded if isOK: # activate task if taskSpec.oldStatus in ['assigning','exhausted',None]: taskSpec.status = 'ready' else: taskSpec.status = taskSpec.oldStatus taskSpec.oldStatus = None self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID}, setOldModTime=True) tmpLog.debug('finished to reassign')
def doAction(self): try: # get logger tmpLog = MsgWrapper(logger) tmpLog.debug('start') origTmpLog = tmpLog # check every 60 min checkInterval = 60 # get lib.tgz for waiting jobs libList = self.taskBufferIF.getLibForWaitingRunJob_JEDI( self.vo, self.prodSourceLabel, checkInterval) tmpLog.debug('got {0} lib.tgz files'.format(len(libList))) # activate or kill orphan jobs which were submitted to use lib.tgz when the lib.tgz was being produced for prodUserName, datasetName, tmpFileSpec in libList: tmpLog = MsgWrapper( logger, '<jediTaskID={0}>'.format(tmpFileSpec.jediTaskID)) tmpLog.debug('start') # check status of lib.tgz if tmpFileSpec.status == 'failed': # get buildJob pandaJobSpecs = self.taskBufferIF.peekJobs( [tmpFileSpec.PandaID], fromDefined=False, fromActive=False, fromWaiting=False) pandaJobSpec = pandaJobSpecs[0] if pandaJobSpec != None: # kill self.taskBufferIF.updateJobs([pandaJobSpec], False) tmpLog.debug( ' killed downstream jobs for user="******" with libDS={1}' .format(prodUserName, datasetName)) else: # PandaJobSpec not found tmpLog.error( ' cannot find PandaJobSpec for user="******" with PandaID={1}' .format(prodUserName, tmpFileSpec.PandaID)) elif tmpFileSpec.status == 'finished': # set metadata self.taskBufferIF.setGUIDs([{ 'guid': tmpFileSpec.GUID, 'lfn': tmpFileSpec.lfn, 'checksum': tmpFileSpec.checksum, 'fsize': tmpFileSpec.fsize, 'scope': tmpFileSpec.scope, }]) # get lib dataset dataset = self.taskBufferIF.queryDatasetWithMap( {'name': datasetName}) if dataset != None: # activate jobs aThr = Activator(self.taskBufferIF, dataset) aThr.start() aThr.join() tmpLog.debug( ' activated downstream jobs for user="******" with libDS={1}' .format(prodUserName, datasetName)) else: # datasetSpec not found tmpLog.error( ' cannot find datasetSpec for user="******" with libDS={1}' .format(prodUserName, datasetName)) else: # lib.tgz is not ready tmpLog.debug( ' keep waiting for user="******" libDS={1}'.format( prodUserName, datasetName)) except: tmpLog = origTmpLog errtype, errvalue = sys.exc_info()[:2] tmpLog.error('failed with {0} {1}'.format(errtype, errvalue)) # return tmpLog = origTmpLog tmpLog.debug('done') return self.SC_SUCCEEDED
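The core decision in the lib.tgz handling above is a three-way branch on the build file's status. A trivial standalone summary of that dispatch is sketched below; action_for_lib and the returned labels are illustrative, the real code issues taskBuffer and Activator calls instead.

def action_for_lib(status):
    # mirror of the branch on tmpFileSpec.status in doAction()
    if status == 'failed':
        return 'kill downstream jobs'
    elif status == 'finished':
        return 'register GUID metadata and activate downstream jobs'
    return 'keep waiting'

assert action_for_lib('failed') == 'kill downstream jobs'
assert action_for_lib('running') == 'keep waiting'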
def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resource_name): # params nBunch = 4 threshold = 2.0 nJobsInBunchMax = 600 nJobsInBunchMin = 500 minTotalWalltime = 50 * 1000 * 1000 nWaitingLimit = 4 nWaitingBunchLimit = 2 nParallel = 2 nParallelCap = 5 # make logger tmpLog = MsgWrapper(logger) workQueueID = workQueue.getID() workQueueName = workQueue.queue_name workQueueName = '_'.join(workQueue.queue_name.split(' ')) msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format( vo, prodSourceLabel, cloudName, workQueueName, resource_name) tmpLog.debug('{0} start workQueueID={1}'.format( msgHeader, workQueueID)) # get central configuration values config_map = self.__getConfiguration(vo, workQueue.queue_name, resource_name) configQueueLimit = config_map[NQUEUELIMIT]['value'] configQueueCap = config_map[NQUEUECAP]['value'] configRunningCap = config_map[NRUNNINGCAP]['value'] tmpLog.debug( msgHeader + ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}' .format(configQueueLimit, configQueueCap, configRunningCap)) # check if unthrottled if not workQueue.throttled: msgBody = "PASS unthrottled since GS_throttled is False" tmpLog.info(msgHeader + " " + msgBody) return self.retUnThrottled # get the jobs statistics for our wq/gs and expand the stats map jobstats_map = self.__prepareJobStats(workQueue, resource_name, config_map) nRunning_rt = jobstats_map['nRunning_rt'] nRunning_gs = jobstats_map['nRunning_gs'] nRunning_runningcap = jobstats_map['nRunning_runningcap'] nNotRun_rt = jobstats_map['nNotRun_rt'] nNotRun_gs = jobstats_map['nNotRun_gs'] nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit'] nNotRun_queuecap = jobstats_map['nNotRun_queuecap'] nDefine_rt = jobstats_map['nDefine_rt'] nDefine_gs = jobstats_map['nDefine_gs'] nDefine_queuelimit = jobstats_map['nDefine_queuelimit'] nDefine_queuecap = jobstats_map['nDefine_queuecap'] nWaiting_rt = jobstats_map['nWaiting_rt'] nWaiting_gs = jobstats_map['nWaiting_gs'] # check if higher prio tasks are waiting if workQueue.queue_name in non_rt_wqs: # find highest priority of currently defined jobs tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI( 'managed', cloudName, workQueue) # the highest priority of waiting tasks highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI( vo, workQueue, 'managed', cloudName) else: # find highest priority of currently defined jobs tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI( 'managed', cloudName, workQueue, resource_name) # the highest priority of waiting tasks highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI( vo, workQueue, 'managed', cloudName, resource_name) highestPrioInPandaDB = highestPrioJobStat['highestPrio'] nNotRunHighestPrio = highestPrioJobStat['nNotRun'] if highestPrioWaiting is None: msgBody = 'failed to get the highest priority of waiting tasks' tmpLog.error("{0} {1}".format(msgHeader, msgBody)) return self.retTmpError # high priority tasks are waiting highPrioQueued = False if highestPrioWaiting > highestPrioInPandaDB \ or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin): highPrioQueued = True tmpLog.debug( "{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}" .format(msgHeader, highestPrioWaiting, highestPrioInPandaDB, nNotRunHighestPrio, highPrioQueued)) # set maximum number of jobs to be submitted if workQueue.queue_name in non_rt_wqs: tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs) else: 
tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt) # use the lower limit to avoid creating too many _sub/_dis datasets nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot), nJobsInBunchMax) if configQueueLimit is not None: nQueueLimit = configQueueLimit else: nQueueLimit = nJobsInBunch * nBunch # use nPrestage for reprocessing if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']: # reset nJobsInBunch if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit): tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit + nDefine_queuelimit) if tmpRemainingSlot > nJobsInBunch: nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax) # get cap # set number of jobs to be submitted if configQueueCap is None: self.setMaxNumJobs(nJobsInBunch / nParallel) else: self.setMaxNumJobs(configQueueCap / nParallelCap) # get total walltime totWalltime = self.taskBufferIF.getTotalWallTime_JEDI( vo, prodSourceLabel, workQueue, resource_name, cloudName) # log the current situation and limits tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format( msgHeader, nQueueLimit, configRunningCap, configQueueCap)) tmpLog.info( "{0} at global share level: nQueued={1} nDefine={2} nRunning={3}". format(msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs, nRunning_gs)) tmpLog.info( "{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}" .format(msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt, nRunning_rt, totWalltime)) # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling limitPriority = False if workQueue.queue_name not in non_rt_wqs \ and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \ and (totWalltime is None or totWalltime > minTotalWalltime): limitPriority = True if not highPrioQueued: # pilot is not running or DDM has a problem msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format( nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, minTotalWalltime) tmpLog.warning("{0} {1}".format(msgHeader, msgBody)) tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True) return self.retMergeUnThr elif workQueue.queue_name in non_rt_wqs \ and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit: limitPriority = True if not highPrioQueued: # pilot is not running or DDM has a problem msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format( nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, minTotalWalltime) tmpLog.warning("{0} {1}".format(msgHeader, msgBody)) tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True) return self.retMergeUnThr elif workQueue.queue_name not in non_rt_wqs and nRunning_rt != 0 \ and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold and \ (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime): limitPriority = True if not highPrioQueued: # enough jobs in Panda msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format( nNotRun_rt + nDefine_rt, nRunning_rt, threshold, nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, minTotalWalltime) tmpLog.warning("{0} {1}".format(msgHeader, msgBody)) tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True) return 
self.retMergeUnThr elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \ and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold and \ (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit: limitPriority = True if not highPrioQueued: # enough jobs in Panda msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format( nNotRun_gs + nDefine_gs, nRunning_gs, threshold, nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit) tmpLog.warning("{0} {1}".format(msgHeader, msgBody)) tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True) return self.retMergeUnThr elif nDefine_queuelimit > nQueueLimit: limitPriority = True if not highPrioQueued: # brokerage is stuck msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format( nDefine_queuelimit, nQueueLimit) tmpLog.warning("{0} {1}".format(msgHeader, msgBody)) tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True) return self.retMergeUnThr elif nWaiting_rt > max(nRunning_rt * nWaitingLimit, nJobsInBunch * nWaitingBunchLimit): limitPriority = True if not highPrioQueued: # too many waiting msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format( nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch, nWaitingBunchLimit) tmpLog.warning("{0} {1}".format(msgHeader, msgBody)) tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True) return self.retMergeUnThr elif configRunningCap and nRunning_runningcap > configRunningCap: # cap on running msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format( nRunning_runningcap, configRunningCap) tmpLog.warning('{0} {1}'.format(msgHeader, msgBody)) tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True) return self.retMergeUnThr elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap: limitPriority = True if not highPrioQueued: # cap on queued msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format( nNotRun_queuecap + nDefine_queuecap, configQueueCap) tmpLog.warning("{0} {1}".format(msgHeader, msgBody)) tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True) return self.retMergeUnThr # get jobs from prodDB limitPriorityValue = None if limitPriority: limitPriorityValue = highestPrioWaiting self.setMinPriority(limitPriorityValue) else: # not enough jobs are queued if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \ or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \ or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt): tmpLog.debug(msgHeader + " not enough jobs queued") if not workQueue.queue_name in non_rt_wqs: self.notEnoughJobsQueued() self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20)) msgBody = "PASS - priority limit={0} maxNumJobs={1}".format( limitPriorityValue, self.maxNumJobs) tmpLog.info(msgHeader + " " + msgBody) return self.retUnThrottled
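The submission-size arithmetic in toBeThrottled() reduces to: compute the head room from the running jobs and the threshold, clamp it between the bunch limits, and derive the queue limit as a multiple of the bunch size unless a configured limit overrides it. A standalone sketch with the same constants as above (threshold=2.0, bunch limits 500/600, nBunch=4) follows; bunch_size is an illustrative name.

def bunch_size(n_running, n_not_run, threshold=2.0,
               n_min=500, n_max=600, n_bunch=4, config_queue_limit=None):
    # head room based on running jobs, clamped to [n_min, n_max]
    remaining_slot = int(n_running * threshold - n_not_run)
    n_jobs_in_bunch = min(max(n_min, remaining_slot), n_max)
    # queue limit: configured value if present, else a multiple of the bunch size
    if config_queue_limit is not None:
        n_queue_limit = config_queue_limit
    else:
        n_queue_limit = n_jobs_in_bunch * n_bunch
    return n_jobs_in_bunch, n_queue_limit

print(bunch_size(n_running=1000, n_not_run=1500))   # head room 500  -> (500, 2000)
print(bunch_size(n_running=1000, n_not_run=100))    # head room 1900 -> capped at (600, 2400)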
def runImpl(self): while True: try: # get a part of list nTasks = 10 taskDsList = self.taskDsList.get(nTasks) # no more datasets if len(taskDsList) == 0: self.logger.debug('%s terminating since no more items' % self.__class__.__name__) return # loop over all tasks for jediTaskID,dsList in taskDsList: allUpdated = True taskBroken = False taskOnHold = False runningTask = False missingMap = {} datasetsIdxConsistency = [] # get task tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10) if not tmpStat or taskSpec == None: self.logger.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID)) continue # make logger try: gshare = '_'.join(taskSpec.gshare.split(' ')) except: gshare = 'Undefined' tmpLog = MsgWrapper(self.logger,'<jediTaskID={0} gshare={1}>'.format(jediTaskID, gshare)) try: # get task parameters taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue)) taskBroken = True # renaming of parameters if taskParamMap.has_key('nEventsPerInputFile'): taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile'] # the number of files per job nFilesPerJob = taskSpec.getNumFilesPerJob() # the number of chunks used by scout nChunksForScout = 10 # load XML if taskSpec.useLoadXML(): xmlConfig = taskParamMap['loadXML'] else: xmlConfig = None # skip files used by another task if 'skipFilesUsedBy' in taskParamMap: skipFilesUsedBy = taskParamMap['skipFilesUsedBy'] else: skipFilesUsedBy = None # check no wait noWaitParent = False parentOutDatasets = set() if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None,taskSpec.jediTaskID]: tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid) if tmpStat == 'running': noWaitParent = True # get output datasets from parent task tmpParentStat,tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.parent_tid, ['output','log']) # collect dataset names for tmpParentOutDataset in tmpParentOutDatasets: parentOutDatasets.add(tmpParentOutDataset.datasetName) # loop over all datasets nFilesMaster = 0 checkedMaster = False setFrozenTime = True if not taskBroken: ddmIF = self.ddmIF.getInterface(taskSpec.vo) origNumFiles = None if taskParamMap.has_key('nFiles'): origNumFiles = taskParamMap['nFiles'] for datasetSpec in dsList: tmpLog.debug('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID)) # index consistency if datasetSpec.indexConsistent(): datasetsIdxConsistency.append(datasetSpec.datasetID) # get dataset metadata tmpLog.debug('get metadata') gotMetadata = False stateUpdateTime = datetime.datetime.utcnow() try: if not datasetSpec.isPseudo(): tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName) else: # dummy metadata for pseudo dataset tmpMetadata = {'state':'closed'} # set mutable when and the dataset is open and parent is running or task is configured to run until the dataset is closed if (noWaitParent or taskSpec.runUntilClosed()) and \ (tmpMetadata['state'] == 'open' \ or datasetSpec.datasetName in parentOutDatasets \ or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets): # dummy metadata when parent is running tmpMetadata = {'state':'mutable'} gotMetadata = True except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(self.__class__.__name__, 
errtype.__name__,errvalue)) if errtype == Interaction.JEDIFatalError: # fatal error datasetStatus = 'broken' taskBroken = True # update dataset status self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog) else: if not taskSpec.ignoreMissingInDS(): # temporary error taskOnHold = True else: # ignore missing datasetStatus = 'failed' # update dataset status self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog) taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName)) if not taskSpec.ignoreMissingInDS(): allUpdated = False else: # get file list specified in task parameters fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName) # get the number of events in metadata if taskParamMap.has_key('getNumEventsInMetadata'): getNumEvents = True else: getNumEvents = False # get file list from DDM tmpLog.debug('get files') try: useInFilesWithNewAttemptNr = False skipDuplicate = not datasetSpec.useDuplicatedFiles() if not datasetSpec.isPseudo(): if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \ not datasetSpec.containerName in ['',None]: # read files from container if file list is specified in task parameters tmpDatasetName = datasetSpec.containerName else: tmpDatasetName = datasetSpec.datasetName # use long format for LB longFormat = False if taskSpec.respectLumiblock() or taskSpec.orderByLB(): longFormat = True tmpRet = ddmIF.getFilesInDataset(tmpDatasetName, getNumEvents=getNumEvents, skipDuplicate=skipDuplicate, longFormat=longFormat ) tmpLog.debug('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName)) # remove lost files tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet) if tmpLostFiles != {}: tmpLog.debug('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName)) for tmpListGUID,tmpLostLFN in tmpLostFiles.iteritems(): tmpLog.debug('removed {0}'.format(tmpLostLFN)) del tmpRet[tmpListGUID] else: if datasetSpec.isSeqNumber(): # make dummy files for seq_number if datasetSpec.getNumRecords() != None: nPFN = datasetSpec.getNumRecords() elif origNumFiles != None: nPFN = origNumFiles if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \ and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']: nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerJob'] elif taskParamMap.has_key('nEventsPerFile') and taskParamMap.has_key('nEventsPerRange'): nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerRange'] elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap: nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob'] elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \ and taskSpec.getNumFilesPerJob() is not None: nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / taskSpec.getNumFilesPerJob() else: # the default number of records for seq_number seqDefNumRecords = 10000 # get nFiles of the master tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(datasetSpec.jediTaskID, datasetSpec.masterID, ['nFiles']) # use nFiles of the master as the number of records if it is larger than the default if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords: nPFN = tmpMasterAtt['nFiles'] else: nPFN = seqDefNumRecords # check usedBy if skipFilesUsedBy != None: for tmpJediTaskID in str(skipFilesUsedBy).split(','): tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(tmpJediTaskID, {'datasetName':datasetSpec.datasetName}, ['nFiles']) if 
'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']: nPFN += tmpParentAtt['nFiles'] tmpRet = {} # get offset tmpOffset = datasetSpec.getOffset() tmpOffset += 1 for iPFN in range(nPFN): tmpRet[str(uuid.uuid4())] = {'lfn':iPFN+tmpOffset, 'scope':None, 'filesize':0, 'checksum':None, } elif not taskSpec.useListPFN(): # dummy file list for pseudo dataset tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn', 'scope':None, 'filesize':0, 'checksum':None, } } else: # make dummy file list for PFN list if taskParamMap.has_key('nFiles'): nPFN = taskParamMap['nFiles'] else: nPFN = 1 tmpRet = {} for iPFN in range(nPFN): tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]), 'scope':None, 'filesize':0, 'checksum':None, } except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('failed to get files due to {0}:{1} {2}'.format(self.__class__.__name__, errtype.__name__,errvalue)) if errtype == Interaction.JEDIFatalError: # fatal error datasetStatus = 'broken' taskBroken = True # update dataset status self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog) else: # temporary error taskOnHold = True taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName)) allUpdated = False else: # parameters for master input respectLB = False useRealNumEvents = False if datasetSpec.isMaster(): # respect LB boundaries respectLB = taskSpec.respectLumiblock() # use real number of events useRealNumEvents = taskSpec.useRealNumEvents() # the number of events per file nEventsPerFile = None nEventsPerJob = None nEventsPerRange = None tgtNumEventsPerJob = None if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \ (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()): if taskParamMap.has_key('nEventsPerFile'): nEventsPerFile = taskParamMap['nEventsPerFile'] elif datasetSpec.isMaster() and datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'): # use nEvents as nEventsPerFile for pseudo input nEventsPerFile = taskParamMap['nEvents'] if taskParamMap.has_key('nEventsPerJob'): nEventsPerJob = taskParamMap['nEventsPerJob'] elif taskParamMap.has_key('nEventsPerRange'): nEventsPerRange = taskParamMap['nEventsPerRange'] if 'tgtNumEventsPerJob' in taskParamMap: tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob'] # reset nEventsPerJob nEventsPerJob = None # max attempts maxAttempt = None maxFailure = None if datasetSpec.isMaster() or datasetSpec.toKeepTrack(): # max attempts if taskSpec.disableAutoRetry(): # disable auto retry maxAttempt = 1 elif taskParamMap.has_key('maxAttempt'): maxAttempt = taskParamMap['maxAttempt'] else: # use default value maxAttempt = 3 # max failure if 'maxFailure' in taskParamMap: maxFailure = taskParamMap['maxFailure'] # first event number firstEventNumber = None if datasetSpec.isMaster(): # first event number firstEventNumber = 1 + taskSpec.getFirstEventOffset() # nMaxEvents nMaxEvents = None if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'): nMaxEvents = taskParamMap['nEvents'] # nMaxFiles nMaxFiles = None if taskParamMap.has_key('nFiles'): if datasetSpec.isMaster(): nMaxFiles = taskParamMap['nFiles'] else: # calculate for secondary nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles) # multipled by the number of jobs per file for event-level splitting if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'): if taskParamMap.has_key('nEventsPerJob'): if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']: nMaxFiles 
*= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob']) nMaxFiles = int(math.ceil(nMaxFiles)) elif taskParamMap.has_key('nEventsPerRange'): if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']: nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange']) nMaxFiles = int(math.ceil(nMaxFiles)) # use scout useScout = False if datasetSpec.isMaster() and taskSpec.useScout() and (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()): useScout = True # use files with new attempt numbers useFilesWithNewAttemptNr = False if not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key('useInFilesWithNewAttemptNr'): useFilesWithNewAttemptNr = True # ramCount ramCount = 0 # skip short input if datasetSpec.isMaster() and not datasetSpec.isPseudo() \ and nEventsPerFile is not None and nEventsPerJob is not None \ and nEventsPerFile >= nEventsPerJob \ and 'skipShortInput' in taskParamMap and taskParamMap['skipShortInput'] == True: skipShortInput = True else: skipShortInput = False # feed files to the contents table tmpLog.debug('update contents') retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec,tmpRet, tmpMetadata['state'], stateUpdateTime, nEventsPerFile, nEventsPerJob, maxAttempt, firstEventNumber, nMaxFiles, nMaxEvents, useScout, fileList, useFilesWithNewAttemptNr, nFilesPerJob, nEventsPerRange, nChunksForScout, includePatt, excludePatt, xmlConfig, noWaitParent, taskSpec.parent_tid, self.pid, maxFailure, useRealNumEvents, respectLB, tgtNumEventsPerJob, skipFilesUsedBy, ramCount, taskSpec, skipShortInput) if retDB == False: taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName, diagMap['errMsg'])) allUpdated = False taskBroken = True break elif retDB == None: # the dataset is locked by another or status is not applicable allUpdated = False tmpLog.debug('escape since task or dataset is locked') break elif missingFileList != []: # files are missing tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName) tmpLog.debug(tmpErrStr) taskSpec.setErrDiag(tmpErrStr) allUpdated = False taskOnHold = True missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec, 'missingFiles':missingFileList} else: # reduce the number of files to be read if taskParamMap.has_key('nFiles'): if datasetSpec.isMaster(): taskParamMap['nFiles'] -= nFilesUnique # reduce the number of files for scout if useScout: nChunksForScout = diagMap['nChunksForScout'] # number of master input files if datasetSpec.isMaster(): checkedMaster = True nFilesMaster += nFilesUnique # running task if diagMap['isRunningTask']: runningTask = True # no activated pending input for noWait if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout <= 0) \ and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster(): tmpErrStr = 'insufficient inputs are ready. ' tmpErrStr += diagMap['errMsg'] tmpLog.debug(tmpErrStr) taskSpec.setErrDiag(tmpErrStr) taskOnHold = True setFrozenTime = False break tmpLog.debug('end loop') # no mater input if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster: tmpErrStr = 'no master input files. 
input dataset is empty' tmpLog.error(tmpErrStr) taskSpec.setErrDiag(tmpErrStr,None) if taskSpec.allowEmptyInput() or noWaitParent: taskOnHold = True else: taskBroken = True # index consistency if not taskOnHold and not taskBroken and len(datasetsIdxConsistency) > 0: self.taskBufferIF.removeFilesIndexInconsistent_JEDI(jediTaskID,datasetsIdxConsistency) # update task status if taskBroken: # task is broken taskSpec.status = 'tobroken' tmpMsg = 'set task_status={0}'.format(taskSpec.status) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid) # change task status unless the task is running if not runningTask: if taskOnHold: # go to pending state if not taskSpec.status in ['broken','tobroken']: taskSpec.setOnHold() tmpMsg = 'set task_status={0}'.format(taskSpec.status) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime) elif allUpdated: # all OK allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,pid=self.pid, useWorldCloud=taskSpec.useWorldCloud()) tmpMsg = 'set task_status={0}'.format(newTaskStatus) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) # just unlock retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid) tmpLog.debug('unlock not-running task with {0}'.format(retUnlock)) else: # just unlock retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid) tmpLog.debug('unlock task with {0}'.format(retUnlock)) tmpLog.debug('done') except: errtype,errvalue = sys.exc_info()[:2] logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
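For seq_number datasets the contents feeder above fabricates one pseudo "file" per sequence number, applying the dataset offset and keying each record by a random GUID. A minimal standalone version of that record generation is sketched below; make_seq_number_records is an illustrative name and the field names follow the snippet above.

import uuid

def make_seq_number_records(n_records, offset=0):
    # one dummy record per sequence number, starting at offset + 1
    records = {}
    for i in range(n_records):
        records[str(uuid.uuid4())] = {'lfn': i + offset + 1,
                                      'scope': None,
                                      'filesize': 0,
                                      'checksum': None}
    return records

# usage example: 3 records starting at sequence number 11
print(sorted(rec['lfn'] for rec in make_seq_number_records(3, offset=10).values()))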
def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue): # variables for submission maxBunchTask = 100 # make logger tmpLog = MsgWrapper(logger) tmpLog.debug("start doBrokerage") # return for failure retFatal = self.SC_FATAL retTmpError = self.SC_FAILED tmpLog.debug("vo={0} label={1} queue={2}".format(vo, prodSourceLabel, workQueue.queue_name)) # loop over all tasks allRwMap = {} prioMap = {} tt2Map = {} expRWs = {} jobSpecList = [] for tmpJediTaskID, tmpInputList in inputList: for taskSpec, cloudName, inputChunk in tmpInputList: # make JobSpec to be submitted for TaskAssigner jobSpec = JobSpec() jobSpec.taskID = taskSpec.jediTaskID jobSpec.jediTaskID = taskSpec.jediTaskID # set managed to trigger TA jobSpec.prodSourceLabel = "managed" jobSpec.processingType = taskSpec.processingType jobSpec.workingGroup = taskSpec.workingGroup jobSpec.metadata = taskSpec.processingType jobSpec.assignedPriority = taskSpec.taskPriority jobSpec.currentPriority = taskSpec.currentPriority jobSpec.maxDiskCount = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()) / 1024 / 1024 if taskSpec.useWorldCloud(): # use destinationSE to trigger task brokerage in WORLD cloud jobSpec.destinationSE = taskSpec.cloud prodDBlock = None setProdDBlock = False for datasetSpec in inputChunk.getDatasets(): prodDBlock = datasetSpec.datasetName if datasetSpec.isMaster(): jobSpec.prodDBlock = datasetSpec.datasetName setProdDBlock = True for fileSpec in datasetSpec.Files: tmpInFileSpec = fileSpec.convertToJobFileSpec(datasetSpec) jobSpec.addFile(tmpInFileSpec) # use secondary dataset name as prodDBlock if setProdDBlock == False and prodDBlock != None: jobSpec.prodDBlock = prodDBlock # append jobSpecList.append(jobSpec) prioMap[jobSpec.taskID] = jobSpec.currentPriority tt2Map[jobSpec.taskID] = jobSpec.processingType # get RW for a priority if not allRwMap.has_key(jobSpec.currentPriority): tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI( vo, prodSourceLabel, workQueue, jobSpec.currentPriority ) if tmpRW == None: tmpLog.error("failed to calculate RW with prio={0}".format(jobSpec.currentPriority)) return retTmpError allRwMap[jobSpec.currentPriority] = tmpRW # get expected RW expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID) if expRW == None: tmpLog.error("failed to calculate RW for jediTaskID={0}".format(jobSpec.jediTaskID)) return retTmpError expRWs[jobSpec.taskID] = expRW # get fullRWs fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(vo, prodSourceLabel, None, None) if fullRWs == None: tmpLog.error("failed to calculate full RW") return retTmpError # set metadata for jobSpec in jobSpecList: rwValues = allRwMap[jobSpec.currentPriority] jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % ( jobSpec.metadata, str(rwValues), str(expRWs), str(prioMap), str(fullRWs), str(tt2Map), ) tmpLog.debug("run task assigner for {0} tasks".format(len(jobSpecList))) nBunchTask = 0 while nBunchTask < len(jobSpecList): # get a bunch jobsBunch = jobSpecList[nBunchTask : nBunchTask + maxBunchTask] strIDs = "jediTaskID=" for tmpJobSpec in jobsBunch: strIDs += "{0},".format(tmpJobSpec.taskID) strIDs = strIDs[:-1] tmpLog.debug(strIDs) # increment index nBunchTask += maxBunchTask # run task brokerge stS, outSs = PandaClient.runTaskAssignment(jobsBunch) tmpLog.debug("{0}:{1}".format(stS, str(outSs))) # return tmpLog.debug("done") return self.SC_SUCCEEDED
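The tail of doBrokerage() submits the collected jobSpecs to the task assigner in slices of at most maxBunchTask. The generic slicing loop is sketched below as a standalone generator; iter_bunches is an illustrative name, the real code advances an index over jobSpecList.

def iter_bunches(items, max_bunch=100):
    # yield consecutive slices of at most max_bunch items
    n = 0
    while n < len(items):
        yield items[n:n + max_bunch]
        n += max_bunch

# usage example: 250 items come out as bunches of 100, 100 and 50
for bunch in iter_bunches(list(range(250)), max_bunch=100):
    print(len(bunch))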
def doActionForReassign(self,gTmpLog): # get DDM I/F ddmIF = self.ddmIF.getInterface(self.vo) # get site mapper siteMapper = self.taskBufferIF.getSiteMapper() # get tasks to get reassigned taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel) gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList))) for taskSpec in taskList: tmpLog = MsgWrapper(logger, '< jediTaskID={0} >'.format(taskSpec.jediTaskID)) tmpLog.debug('start to reassign') # DDM backend ddmBackEnd = taskSpec.getDdmBackEnd() # get datasets tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,['output','log']) if tmpStat is not True: tmpLog.error('failed to get datasets') continue # update DB if not taskSpec.useWorldCloud(): # update cloudtasks tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,taskSpec.cloud,'assigned',True) if tmpStat != 'SUCCEEDED': tmpLog.error('failed to update CloudTasks') continue # check cloud if not siteMapper.checkCloud(taskSpec.cloud): tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud)) continue else: # re-run task brokerage if taskSpec.nucleus in [None,'']: taskSpec.status = 'assigning' taskSpec.oldStatus = None taskSpec.setToRegisterDatasets() self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID': taskSpec.jediTaskID}, setOldModTime=True) tmpLog.debug('#ATM #KV label=managed action=trigger_new_brokerage by setting task_status={0}'. format(taskSpec.status)) continue # get nucleus nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus) if nucleusSpec is None: tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus)) continue # set nucleus retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,datasetSpecList)} tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap) # get T1/nucleus if not taskSpec.useWorldCloud(): t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest'] else: t1SiteName = nucleusSpec.getOnePandaSite() t1Site = siteMapper.getSite(t1SiteName) # loop over all datasets isOK = True for datasetSpec in datasetSpecList: tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName)) if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName)) continue # get location location = siteMapper.getDdmEndpoint(t1Site.sitename, datasetSpec.storageToken, taskSpec.prodSourceLabel, JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType)) # make subscription try: tmpLog.debug('registering subscription to {0} with backend={1}'.format(location, ddmBackEnd)) tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location, 'Production Output',asynchronous=True) if tmpStat is not True: tmpLog.error("failed to make subscription") isOK = False break except Exception: errtype,errvalue = sys.exc_info()[:2] tmpLog.warning('failed to make subscription with {0}:{1}'.format(errtype.__name__,errvalue)) isOK = False break # succeeded if isOK: # activate task if taskSpec.oldStatus in ['assigning','exhausted',None]: taskSpec.status = 'ready' else: taskSpec.status = taskSpec.oldStatus taskSpec.oldStatus = None self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID}, setOldModTime=True) tmpLog.debug('finished to reassign')
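Once all subscriptions succeed, the task status is restored: the previous status is reinstated unless the task was in a transient state, in which case it becomes 'ready'. A tiny standalone restatement of that rule is sketched below; restored_status is an illustrative helper, not part of JEDI.

def restored_status(old_status):
    # mirror of the oldStatus handling at the end of doActionForReassign()
    if old_status in ['assigning', 'exhausted', None]:
        return 'ready'
    return old_status

assert restored_status('assigning') == 'ready'
assert restored_status('running') == 'running'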
def start(self): # start base classes JediKnight.start(self) FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF) # go into main loop while True: startTime = datetime.datetime.utcnow() try: # get logger tmpLog = MsgWrapper(logger) tmpLog.info('start') # loop over all vos for vo in self.vos: # loop over all sourceLabels for prodSourceLabel in self.prodSourceLabels: # vo/prodSourceLabel specific action impl = self.getImpl(vo, prodSourceLabel, subType=self.subStr) if impl is not None: plugin_name = impl.__class__.__name__ tmpLog.info( 'pre-action for vo={} label={} cls={}'.format( vo, prodSourceLabel, plugin_name)) impl.pre_action(tmpLog, vo, prodSourceLabel, self.pid) tmpLog.info( 'do action for vo={} label={} cls={}'.format( vo, prodSourceLabel, plugin_name)) tmpStat = impl.doAction() if tmpStat != Interaction.SC_SUCCEEDED: tmpLog.error( 'failed to run special action for vo={} label={} cls={}' .format(vo, prodSourceLabel, plugin_name)) else: tmpLog.info( 'done for vo={} label={} cls={}'.format( vo, prodSourceLabel, plugin_name)) tmpLog.info('done') except Exception: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('failed in {0}.start() with {1} {2}'.format( self.__class__.__name__, errtype.__name__, errvalue)) # sleep if needed loopCycle = jedi_config.watchdog.loopCycle if self.period is None else self.period timeDelta = datetime.datetime.utcnow() - startTime sleepPeriod = loopCycle - timeDelta.seconds if sleepPeriod > 0: time.sleep(sleepPeriod) # randomize cycle self.randomSleep(max_val=loopCycle)
def runImpl(self): while True: try: # get a part of list nTasks = 10 taskDsList = self.taskDsList.get(nTasks) # no more datasets if len(taskDsList) == 0: self.logger.debug('%s terminating since no more items' % self.__class__.__name__) return # loop over all tasks for jediTaskID, dsList in taskDsList: allUpdated = True taskBroken = False taskOnHold = False runningTask = False missingMap = {} # make logger tmpLog = MsgWrapper( self.logger, '< jediTaskID={0} >'.format(jediTaskID)) # get task tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI( jediTaskID, False, True, self.pid, 10) if not tmpStat or taskSpec == None: tmpLog.error( 'failed to get taskSpec for jediTaskID={0}'.format( jediTaskID)) continue try: # get task parameters taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI( jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error( 'task param conversion from json failed with {0}:{1}' .format(errtype.__name__, errvalue)) taskBroken = True # renaming of parameters if taskParamMap.has_key('nEventsPerInputFile'): taskParamMap['nEventsPerFile'] = taskParamMap[ 'nEventsPerInputFile'] # the number of files per job nFilesPerJob = None if taskParamMap.has_key('nFilesPerJob'): nFilesPerJob = taskParamMap['nFilesPerJob'] # the number of chunks used by scout nChunksForScout = 10 # load XML if taskSpec.useLoadXML(): xmlConfig = taskParamMap['loadXML'] else: xmlConfig = None # skip files used by another task if 'skipFilesUsedBy' in taskParamMap: skipFilesUsedBy = taskParamMap['skipFilesUsedBy'] else: skipFilesUsedBy = None # check no wait noWaitParent = False parentOutDatasets = set() if taskSpec.noWaitParent() and not taskSpec.parent_tid in [ None, taskSpec.jediTaskID ]: tmpStat = self.taskBufferIF.checkParentTask_JEDI( taskSpec.parent_tid) if tmpStat == 'running': noWaitParent = True # get output datasets from parent task tmpParentStat, tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI( taskSpec.parent_tid, ['output', 'log']) # collect dataset names for tmpParentOutDataset in tmpParentOutDatasets: parentOutDatasets.add( tmpParentOutDataset.datasetName) # loop over all datasets nFilesMaster = 0 checkedMaster = False setFrozenTime = True if not taskBroken: ddmIF = self.ddmIF.getInterface(taskSpec.vo) origNumFiles = None if taskParamMap.has_key('nFiles'): origNumFiles = taskParamMap['nFiles'] for datasetSpec in dsList: tmpLog.debug('start loop for {0}(id={1})'.format( datasetSpec.datasetName, datasetSpec.datasetID)) # get dataset metadata tmpLog.debug('get metadata') gotMetadata = False stateUpdateTime = datetime.datetime.utcnow() try: if not datasetSpec.isPseudo(): tmpMetadata = ddmIF.getDatasetMetaData( datasetSpec.datasetName) else: # dummy metadata for pseudo dataset tmpMetadata = {'state': 'closed'} # set mutable when and the dataset is open and parent is running or task is configured to run until the dataset is closed if (noWaitParent or taskSpec.runUntilClosed()) and \ (tmpMetadata['state'] == 'open' \ or datasetSpec.datasetName in parentOutDatasets \ or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets): # dummy metadata when parent is running tmpMetadata = {'state': 'mutable'} gotMetadata = True except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error( '{0} failed to get metadata to {1}:{2}'. 
format(self.__class__.__name__, errtype.__name__, errvalue)) if errtype == Interaction.JEDIFatalError: # fatal error datasetStatus = 'broken' taskBroken = True # update dataset status self.updateDatasetStatus( datasetSpec, datasetStatus, tmpLog) else: if not taskSpec.ignoreMissingInDS(): # temporary error taskOnHold = True else: # ignore missing datasetStatus = 'failed' # update dataset status self.updateDatasetStatus( datasetSpec, datasetStatus, tmpLog) taskSpec.setErrDiag( 'failed to get metadata for {0}'.format( datasetSpec.datasetName)) if not taskSpec.ignoreMissingInDS(): allUpdated = False else: # get file list specified in task parameters fileList, includePatt, excludePatt = RefinerUtils.extractFileList( taskParamMap, datasetSpec.datasetName) # get the number of events in metadata if taskParamMap.has_key( 'getNumEventsInMetadata'): getNumEvents = True else: getNumEvents = False # get file list from DDM tmpLog.debug('get files') try: useInFilesWithNewAttemptNr = False skipDuplicate = not datasetSpec.useDuplicatedFiles( ) if not datasetSpec.isPseudo(): if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \ not datasetSpec.containerName in ['',None]: # read files from container if file list is specified in task parameters tmpDatasetName = datasetSpec.containerName else: tmpDatasetName = datasetSpec.datasetName # use long format for LB longFormat = False if taskSpec.respectLumiblock(): longFormat = True tmpRet = ddmIF.getFilesInDataset( tmpDatasetName, getNumEvents=getNumEvents, skipDuplicate=skipDuplicate, longFormat=longFormat) tmpLog.debug( 'got {0} files in {1}'.format( len(tmpRet), tmpDatasetName)) # remove lost files tmpLostFiles = ddmIF.findLostFiles( tmpDatasetName, tmpRet) if tmpLostFiles != {}: tmpLog.debug( 'found {0} lost files in {1}'. 
format(len(tmpLostFiles), tmpDatasetName)) for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems( ): tmpLog.debug( 'removed {0}'.format( tmpLostLFN)) del tmpRet[tmpListGUID] else: if datasetSpec.isSeqNumber(): # make dummy files for seq_number if datasetSpec.getNumRecords( ) != None: nPFN = datasetSpec.getNumRecords( ) elif origNumFiles != None: nPFN = origNumFiles if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \ and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']: nPFN = nPFN * taskParamMap[ 'nEventsPerFile'] / taskParamMap[ 'nEventsPerJob'] elif taskParamMap.has_key( 'nEventsPerFile' ) and taskParamMap.has_key( 'nEventsPerRange'): nPFN = nPFN * taskParamMap[ 'nEventsPerFile'] / taskParamMap[ 'nEventsPerRange'] elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap: nPFN = taskParamMap[ 'nEvents'] / taskParamMap[ 'nEventsPerJob'] elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \ and 'nFilesPerJob' in taskParamMap: nPFN = taskParamMap[ 'nEvents'] / taskParamMap[ 'nEventsPerFile'] / taskParamMap[ 'nFilesPerJob'] else: # the default number of records for seq_number seqDefNumRecords = 10000 # get nFiles of the master tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI( datasetSpec.jediTaskID, datasetSpec.masterID, ['nFiles']) # use nFiles of the master as the number of records if it is larger than the default if 'nFiles' in tmpMasterAtt and tmpMasterAtt[ 'nFiles'] > seqDefNumRecords: nPFN = tmpMasterAtt[ 'nFiles'] else: nPFN = seqDefNumRecords # check usedBy if skipFilesUsedBy != None: for tmpJediTaskID in str( skipFilesUsedBy ).split(','): tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI( tmpJediTaskID, { 'datasetName': datasetSpec. datasetName }, ['nFiles']) if 'nFiles' in tmpParentAtt and tmpParentAtt[ 'nFiles']: nPFN += tmpParentAtt[ 'nFiles'] tmpRet = {} # get offset tmpOffset = datasetSpec.getOffset() tmpOffset += 1 for iPFN in range(nPFN): tmpRet[str(uuid.uuid4())] = { 'lfn': iPFN + tmpOffset, 'scope': None, 'filesize': 0, 'checksum': None, } elif not taskSpec.useListPFN(): # dummy file list for pseudo dataset tmpRet = { str(uuid.uuid4()): { 'lfn': 'pseudo_lfn', 'scope': None, 'filesize': 0, 'checksum': None, } } else: # make dummy file list for PFN list if taskParamMap.has_key('nFiles'): nPFN = taskParamMap['nFiles'] else: nPFN = 1 tmpRet = {} for iPFN in range(nPFN): tmpRet[str(uuid.uuid4())] = { 'lfn': '{0:06d}:{1}'.format( iPFN, taskParamMap['pfnList'] [iPFN].split('/')[-1]), 'scope': None, 'filesize': 0, 'checksum': None, } except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error( 'failed to get files due to {0}:{1} {2}' .format(self.__class__.__name__, errtype.__name__, errvalue)) if errtype == Interaction.JEDIFatalError: # fatal error datasetStatus = 'broken' taskBroken = True # update dataset status self.updateDatasetStatus( datasetSpec, datasetStatus, tmpLog) else: # temporary error taskOnHold = True taskSpec.setErrDiag( 'failed to get files for {0}'.format( datasetSpec.datasetName)) allUpdated = False else: # parameters for master input respectLB = False useRealNumEvents = False if datasetSpec.isMaster(): # respect LB boundaries respectLB = taskSpec.respectLumiblock() # use real number of events useRealNumEvents = taskSpec.useRealNumEvents( ) # the number of events per file nEventsPerFile = None nEventsPerJob = None nEventsPerRange = None tgtNumEventsPerJob = None if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \ 
(datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()): if taskParamMap.has_key( 'nEventsPerFile'): nEventsPerFile = taskParamMap[ 'nEventsPerFile'] elif datasetSpec.isMaster( ) and datasetSpec.isPseudo( ) and taskParamMap.has_key('nEvents'): # use nEvents as nEventsPerFile for pseudo input nEventsPerFile = taskParamMap[ 'nEvents'] if taskParamMap.has_key( 'nEventsPerJob'): nEventsPerJob = taskParamMap[ 'nEventsPerJob'] elif taskParamMap.has_key( 'nEventsPerRange'): nEventsPerRange = taskParamMap[ 'nEventsPerRange'] if 'tgtNumEventsPerJob' in taskParamMap: tgtNumEventsPerJob = taskParamMap[ 'tgtNumEventsPerJob'] # reset nEventsPerJob nEventsPerJob = None # max attempts maxAttempt = None maxFailure = None if datasetSpec.isMaster( ) or datasetSpec.toKeepTrack(): # max attempts if taskSpec.disableAutoRetry(): # disable auto retry maxAttempt = 1 elif taskParamMap.has_key( 'maxAttempt'): maxAttempt = taskParamMap[ 'maxAttempt'] else: # use default value maxAttempt = 3 # max failure if 'maxFailure' in taskParamMap: maxFailure = taskParamMap[ 'maxFailure'] # first event number firstEventNumber = None if datasetSpec.isMaster(): # first event number firstEventNumber = 1 + taskSpec.getFirstEventOffset( ) # nMaxEvents nMaxEvents = None if datasetSpec.isMaster( ) and taskParamMap.has_key('nEvents'): nMaxEvents = taskParamMap['nEvents'] # nMaxFiles nMaxFiles = None if taskParamMap.has_key('nFiles'): if datasetSpec.isMaster(): nMaxFiles = taskParamMap['nFiles'] else: # calculate for secondary nMaxFiles = datasetSpec.getNumMultByRatio( origNumFiles) # multipled by the number of jobs per file for event-level splitting if nMaxFiles != None and taskParamMap.has_key( 'nEventsPerFile'): if taskParamMap.has_key( 'nEventsPerJob'): if taskParamMap[ 'nEventsPerFile'] > taskParamMap[ 'nEventsPerJob']: nMaxFiles *= float( taskParamMap[ 'nEventsPerFile'] ) / float(taskParamMap[ 'nEventsPerJob']) nMaxFiles = int( math.ceil( nMaxFiles)) elif taskParamMap.has_key( 'nEventsPerRange'): if taskParamMap[ 'nEventsPerFile'] > taskParamMap[ 'nEventsPerRange']: nMaxFiles *= float( taskParamMap[ 'nEventsPerFile'] ) / float(taskParamMap[ 'nEventsPerRange']) nMaxFiles = int( math.ceil( nMaxFiles)) # use scout useScout = False if datasetSpec.isMaster( ) and taskSpec.useScout() and ( datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()): useScout = True # use files with new attempt numbers useFilesWithNewAttemptNr = False if not datasetSpec.isPseudo( ) and fileList != [] and taskParamMap.has_key( 'useInFilesWithNewAttemptNr'): useFilesWithNewAttemptNr = True #ramCount ramCount = 0 # feed files to the contents table tmpLog.debug('update contents') retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI( datasetSpec, tmpRet, tmpMetadata['state'], stateUpdateTime, nEventsPerFile, nEventsPerJob, maxAttempt, firstEventNumber, nMaxFiles, nMaxEvents, useScout, fileList, useFilesWithNewAttemptNr, nFilesPerJob, nEventsPerRange, nChunksForScout, includePatt, excludePatt, xmlConfig, noWaitParent, taskSpec.parent_tid, self.pid, maxFailure, useRealNumEvents, respectLB, tgtNumEventsPerJob, skipFilesUsedBy, ramCount) if retDB == False: taskSpec.setErrDiag( 'failed to insert files for {0}. 
{1}' .format(datasetSpec.datasetName, diagMap['errMsg'])) allUpdated = False taskBroken = True break elif retDB == None: # the dataset is locked by another or status is not applicable allUpdated = False tmpLog.debug( 'escape since task or dataset is locked' ) break elif missingFileList != []: # files are missing tmpErrStr = '{0} files missing in {1}'.format( len(missingFileList), datasetSpec.datasetName) tmpLog.debug(tmpErrStr) taskSpec.setErrDiag(tmpErrStr) allUpdated = False taskOnHold = True missingMap[datasetSpec.datasetName] = { 'datasetSpec': datasetSpec, 'missingFiles': missingFileList } else: # reduce the number of files to be read if taskParamMap.has_key('nFiles'): if datasetSpec.isMaster(): taskParamMap[ 'nFiles'] -= nFilesUnique # reduce the number of files for scout if useScout: nChunksForScout = diagMap[ 'nChunksForScout'] # number of master input files if datasetSpec.isMaster(): checkedMaster = True nFilesMaster += nFilesUnique # running task if diagMap['isRunningTask']: runningTask = True # no activated pending input for noWait if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0) \ and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster(): tmpErrStr = 'insufficient inputs are ready. ' tmpErrStr += diagMap['errMsg'] tmpLog.debug(tmpErrStr) taskSpec.setErrDiag(tmpErrStr) taskOnHold = True setFrozenTime = False break tmpLog.debug('end loop') # no mater input if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster: tmpErrStr = 'no master input files. input dataset is empty' tmpLog.error(tmpErrStr) taskSpec.setErrDiag(tmpErrStr, None) if taskSpec.allowEmptyInput() or noWaitParent: taskOnHold = True else: taskBroken = True # update task status if taskBroken: # task is broken taskSpec.status = 'tobroken' tmpMsg = 'set task.status={0}'.format(taskSpec.status) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI( jediTaskID, taskSpec, pid=self.pid) # change task status unless the task is running if not runningTask: if taskOnHold: # go to pending state if not taskSpec.status in ['broken', 'tobroken']: taskSpec.setOnHold() tmpMsg = 'set task.status={0}'.format( taskSpec.status) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI( jediTaskID, taskSpec, pid=self.pid, setFrozenTime=setFrozenTime) elif allUpdated: # all OK allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI( jediTaskID, getTaskStatus=True, pid=self.pid, useWorldCloud=taskSpec.useWorldCloud()) tmpMsg = 'set task.status={0}'.format( newTaskStatus) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) # just unlock retUnlock = self.taskBufferIF.unlockSingleTask_JEDI( jediTaskID, self.pid) tmpLog.debug('unlock not-running task with {0}'.format( retUnlock)) else: # just unlock retUnlock = self.taskBufferIF.unlockSingleTask_JEDI( jediTaskID, self.pid) tmpLog.debug('unlock task with {0}'.format(retUnlock)) tmpLog.debug('done') except: errtype, errvalue = sys.exc_info()[:2] logger.error('{0} failed in runImpl() with {1}:{2}'.format( self.__class__.__name__, errtype.__name__, errvalue))
def doCheck(self,taskSpecList): # make logger tmpLog = MsgWrapper(logger) tmpLog.debug('start doCheck') # return for failure retFatal = self.SC_FATAL,{} retTmpError = self.SC_FAILED,{} # get list of jediTaskIDs taskIdList = [] taskSpecMap = {} for taskSpec in taskSpecList: taskIdList.append(taskSpec.jediTaskID) taskSpecMap[taskSpec.jediTaskID] = taskSpec # check with panda tmpLog.debug('check with panda') tmpPandaStatus,cloudsInPanda = PandaClient.seeCloudTask(taskIdList) if tmpPandaStatus != 0: tmpLog.error('failed to see clouds') return retTmpError # make return map retMap = {} for tmpTaskID,tmpCoreName in cloudsInPanda.iteritems(): tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID,tmpCoreName)) if not tmpCoreName in ['NULL','',None]: taskSpec = taskSpecMap[tmpTaskID] if taskSpec.useWorldCloud(): # get destinations for WORLD cloud ddmIF = self.ddmIF.getInterface(taskSpec.vo) # get site siteSpec = self.siteMapper.getSite(tmpCoreName) # get nucleus nucleus = siteSpec.pandasite # get output/log datasets tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID,['output','log']) # get destinations retMap[tmpTaskID] = {'datasets':[],'nucleus':nucleus} for datasetSpec in tmpDatasetSpecs: # skip distributed datasets if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None: continue # get token token = ddmIF.convertTokenToEndpoint(siteSpec.ddm,datasetSpec.storageToken) # use default endpoint if token == None: token = siteSpec.ddm # add origianl token if not datasetSpec.storageToken in ['',None]: token += '/{0}'.format(datasetSpec.storageToken) retMap[tmpTaskID]['datasets'].append({'datasetID':datasetSpec.datasetID, 'token':'dst:{0}'.format(token), 'destination':tmpCoreName}) else: retMap[tmpTaskID] = tmpCoreName tmpLog.debug('ret {0}'.format(str(retMap))) # return tmpLog.debug('done') return self.SC_SUCCEEDED,retMap
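# A minimal, hypothetical sketch of the per-dataset destination entry built in
# doCheck above for WORLD-cloud tasks: the DDM endpoint is prefixed with 'dst:'
# and the storage token, when set, is appended after a slash.
# build_dataset_destination and its arguments are illustrative names.
def build_dataset_destination(dataset_id, endpoint, storage_token, site_name):
    token = endpoint
    if storage_token not in ['', None]:
        token += '/{0}'.format(storage_token)
    return {'datasetID': dataset_id,
            'token': 'dst:{0}'.format(token),
            'destination': site_name}

# example values are made up for illustration
print(build_dataset_destination(123, 'CERN-PROD_DATADISK', 'ATLASDATADISK', 'CERN-PROD'))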
def toBeThrottled(self,vo,prodSourceLabel,cloudName,workQueue,jobStat): # params nBunch = 4 threshold = 2.0 thresholdForSite = threshold - 1.0 nJobsInBunchMax = 500 nJobsInBunchMin = 300 nJobsInBunchMaxES = 1000 nWaitingLimit = 4 nWaitingBunchLimit = 2 nParallel = 8 # make logger tmpLog = MsgWrapper(logger) workQueueIDs = workQueue.getIDs() msgHeader = '{0}:{1} cloud={2} queue={3}:'.format(vo,prodSourceLabel,cloudName,workQueue.queue_name) tmpLog.debug(msgHeader+' start workQueueID={0}'.format(str(workQueueIDs))) # check cloud status if not self.siteMapper.checkCloud(cloudName): msgBody = "SKIP cloud={0} undefined".format(cloudName) tmpLog.debug(msgHeader+" "+msgBody) tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning') return self.retThrottled cloudSpec = self.siteMapper.getCloud(cloudName) if cloudSpec['status'] in ['offline']: msgBody = "SKIP cloud.status={0}".format(cloudSpec['status']) tmpLog.debug(msgHeader+" "+msgBody) tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning') return self.retThrottled if cloudSpec['status'] in ['test']: if workQueue.queue_name != 'test': msgBody = "SKIP cloud.status={0} for non test queue ({1})".format(cloudSpec['status'], workQueue.queue_name) tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning') tmpLog.debug(msgHeader+" "+msgBody) return self.retThrottled # check if unthrottled if workQueue.queue_share == None: msgBody = "PASS unthrottled since share=None" tmpLog.debug(msgHeader+" "+msgBody) return self.retUnThrottled # count number of jobs in each status nRunning = 0 nNotRun = 0 nDefine = 0 nWaiting = 0 for workQueueID in workQueueIDs: if jobStat.has_key(cloudName) and \ jobStat[cloudName].has_key(workQueueID): tmpLog.debug(msgHeader+" "+str(jobStat[cloudName][workQueueID])) for pState,pNumber in jobStat[cloudName][workQueueID].iteritems(): if pState in ['running']: nRunning += pNumber elif pState in ['assigned','activated','starting']: nNotRun += pNumber elif pState in ['defined']: nDefine += pNumber elif pState in ['waiting']: nWaiting += pNumber # check if higher prio tasks are waiting tmpStat,highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed',cloudName,workQueue) highestPrioInPandaDB = highestPrioJobStat['highestPrio'] nNotRunHighestPrio = highestPrioJobStat['nNotRun'] # the highest priority of waiting tasks highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo,workQueue, 'managed',cloudName) if highestPrioWaiting == None: msgBody = 'failed to get the highest priority of waiting tasks' tmpLog.error(msgHeader+" "+msgBody) return self.retTmpError # high priority tasks are waiting highPrioQueued = False if highestPrioWaiting > highestPrioInPandaDB or (highestPrioWaiting == highestPrioInPandaDB and \ nNotRunHighestPrio < nJobsInBunchMin): highPrioQueued = True tmpLog.debug(msgHeader+" highestPrio waiting:{0} inPanda:{1} numNotRun:{2} -> highPrioQueued={3}".format(highestPrioWaiting, highestPrioInPandaDB, nNotRunHighestPrio, highPrioQueued)) # set maximum number of jobs to be submitted tmpRemainingSlot = int(nRunning*threshold-nNotRun) if tmpRemainingSlot < nJobsInBunchMin: # use the lower limit to avoid creating too many _sub/_dis datasets nJobsInBunch = nJobsInBunchMin else: if workQueue.queue_name in ['evgensimul']: # use higher limit for evgensimul if tmpRemainingSlot < nJobsInBunchMaxES: nJobsInBunch = tmpRemainingSlot else: nJobsInBunch = nJobsInBunchMaxES else: if tmpRemainingSlot < nJobsInBunchMax: nJobsInBunch = tmpRemainingSlot else: nJobsInBunch = 
nJobsInBunchMax nQueueLimit = nJobsInBunch*nBunch # use special limit for CERN if cloudName == 'CERN': nQueueLimit = 2000 # use nPrestage for reprocessing if workQueue.queue_name in ['reprocessing']: if cloudSpec.has_key('nprestage') and cloudSpec['nprestage'] > 0: nQueueLimit = cloudSpec['nprestage'] # reset nJobsInBunch if nQueueLimit > (nNotRun+nDefine): tmpRemainingSlot = nQueueLimit - (nNotRun+nDefine) if tmpRemainingSlot < nJobsInBunch: pass elif tmpRemainingSlot < nJobsInBunchMax: nJobsInBunch = tmpRemainingSlot else: nJobsInBunch = nJobsInBunchMax # set number of jobs to be submitted self.setMaxNumJobs(nJobsInBunch/nParallel) # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling limitPriority = False tmpLog.debug(msgHeader+" nQueueLimit:{0} nQueued:{1} nDefine:{2} nRunning:{3}".format(nQueueLimit, nNotRun+nDefine, nDefine, nRunning)) # check when high prio tasks are not waiting if not highPrioQueued: if nRunning == 0 and (nNotRun+nDefine) > nQueueLimit: limitPriority = True # pilot is not running or DDM has a problem msgBody = "SKIP no running and enough nQueued({0})>{1}".format(nNotRun+nDefine,nQueueLimit) tmpLog.debug(msgHeader+" "+msgBody) tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning') return self.retMergeUnThr elif nRunning != 0 and float(nNotRun)/float(nRunning) > threshold and (nNotRun+nDefine) > nQueueLimit: limitPriority = True # enough jobs in Panda msgBody = "SKIP nQueued({0})/nRunning({1})>{2} & nQueued+Defined({3})>{4}".format(nNotRun,nRunning, threshold,nNotRun+nDefine, nQueueLimit) tmpLog.debug(msgHeader+" "+msgBody) tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning') return self.retMergeUnThr elif nDefine > nQueueLimit: limitPriority = True # brokerage is stuck msgBody = "SKIP too many nDefined({0})>{1}".format(nDefine,nQueueLimit) tmpLog.debug(msgHeader+" "+msgBody) tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning') return self.retMergeUnThr elif nWaiting > nRunning*nWaitingLimit and nWaiting > nJobsInBunch*nWaitingBunchLimit: limitPriority = True # too many waiting msgBody = "SKIP too many nWaiting({0})>max(nRunning({1})x{2},{3}x{4})".format(nWaiting,nRunning,nWaitingLimit, nJobsInBunch,nWaitingBunchLimit) tmpLog.debug(msgHeader+" "+msgBody) tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning') return self.retMergeUnThr # get jobs from prodDB limitPriorityValue = None if limitPriority: limitPriorityValue = highestPrioInPandaDB self.setMinPriority(limitPriorityValue) msgBody = "PASS - priority limit={0}".format(limitPriorityValue) tmpLog.debug(msgHeader+" "+msgBody) return self.retUnThrottled
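# A minimal, hypothetical sketch of the queue-level throttle decision made in
# toBeThrottled above: submission is skipped when nothing is running but the
# queue is already full, when the queued/running ratio exceeds the threshold
# while the queue is full, when brokerage has piled up defined jobs, or when
# too many jobs are waiting. should_throttle and its arguments are
# illustrative; the real method also handles clouds, shares and priorities.
def should_throttle(n_running, n_not_run, n_define, n_waiting,
                    n_queue_limit, n_jobs_in_bunch,
                    threshold=2.0, n_waiting_limit=4, n_waiting_bunch_limit=2):
    n_queued = n_not_run + n_define
    if n_running == 0 and n_queued > n_queue_limit:
        return True, 'no running and enough queued'
    if n_running != 0 and float(n_not_run) / float(n_running) > threshold \
            and n_queued > n_queue_limit:
        return True, 'enough jobs in Panda'
    if n_define > n_queue_limit:
        return True, 'brokerage is stuck'
    if n_waiting > n_running * n_waiting_limit \
            and n_waiting > n_jobs_in_bunch * n_waiting_bunch_limit:
        return True, 'too many waiting'
    return False, 'pass'

# e.g. a healthy queue: 100 running, 550 queued against a limit of 1200 -> pass
print(should_throttle(100, 500, 50, 10, n_queue_limit=1200, n_jobs_in_bunch=300))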
def doBrokerage(self,inputList,vo,prodSourceLabel,workQueue): # list with a lock inputListWorld = ListWithLock([]) # variables for submission maxBunchTask = 100 # make logger tmpLog = MsgWrapper(logger) tmpLog.debug('start doBrokerage') # return for failure retFatal = self.SC_FATAL retTmpError = self.SC_FAILED tmpLog.debug('vo={0} label={1} queue={2} nTasks={3}'.format(vo,prodSourceLabel, workQueue.queue_name, len(inputList))) # loop over all tasks allRwMap = {} prioMap = {} tt2Map = {} expRWs = {} jobSpecList = [] for tmpJediTaskID,tmpInputList in inputList: for taskSpec,cloudName,inputChunk in tmpInputList: # collect tasks for WORLD if taskSpec.useWorldCloud(): inputListWorld.append((taskSpec,inputChunk)) continue # make JobSpec to be submitted for TaskAssigner jobSpec = JobSpec() jobSpec.taskID = taskSpec.jediTaskID jobSpec.jediTaskID = taskSpec.jediTaskID # set managed to trigger TA jobSpec.prodSourceLabel = 'managed' jobSpec.processingType = taskSpec.processingType jobSpec.workingGroup = taskSpec.workingGroup jobSpec.metadata = taskSpec.processingType jobSpec.assignedPriority = taskSpec.taskPriority jobSpec.currentPriority = taskSpec.currentPriority jobSpec.maxDiskCount = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()) / 1024 / 1024 if taskSpec.useWorldCloud(): # use destinationSE to trigger task brokerage in WORLD cloud jobSpec.destinationSE = taskSpec.cloud prodDBlock = None setProdDBlock = False for datasetSpec in inputChunk.getDatasets(): prodDBlock = datasetSpec.datasetName if datasetSpec.isMaster(): jobSpec.prodDBlock = datasetSpec.datasetName setProdDBlock = True for fileSpec in datasetSpec.Files: tmpInFileSpec = fileSpec.convertToJobFileSpec(datasetSpec) jobSpec.addFile(tmpInFileSpec) # use secondary dataset name as prodDBlock if setProdDBlock == False and prodDBlock != None: jobSpec.prodDBlock = prodDBlock # append jobSpecList.append(jobSpec) prioMap[jobSpec.taskID] = jobSpec.currentPriority tt2Map[jobSpec.taskID] = jobSpec.processingType # get RW for a priority if not allRwMap.has_key(jobSpec.currentPriority): tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(vo,prodSourceLabel,workQueue, jobSpec.currentPriority) if tmpRW == None: tmpLog.error('failed to calculate RW with prio={0}'.format(jobSpec.currentPriority)) return retTmpError allRwMap[jobSpec.currentPriority] = tmpRW # get expected RW expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID) if expRW == None: tmpLog.error('failed to calculate RW for jediTaskID={0}'.format(jobSpec.jediTaskID)) return retTmpError expRWs[jobSpec.taskID] = expRW # for old clouds if jobSpecList != []: # get fullRWs fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(vo,prodSourceLabel,None,None) if fullRWs == None: tmpLog.error('failed to calculate full RW') return retTmpError # set metadata for jobSpec in jobSpecList: rwValues = allRwMap[jobSpec.currentPriority] jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (jobSpec.metadata, str(rwValues),str(expRWs), str(prioMap),str(fullRWs), str(tt2Map)) tmpLog.debug('run task assigner for {0} tasks'.format(len(jobSpecList))) nBunchTask = 0 while nBunchTask < len(jobSpecList): # get a bunch jobsBunch = jobSpecList[nBunchTask:nBunchTask+maxBunchTask] strIDs = 'jediTaskID=' for tmpJobSpec in jobsBunch: strIDs += '{0},'.format(tmpJobSpec.taskID) strIDs = strIDs[:-1] tmpLog.debug(strIDs) # increment index nBunchTask += maxBunchTask # run task brokerge stS,outSs = PandaClient.runTaskAssignment(jobsBunch) tmpLog.debug('{0}:{1}'.format(stS,str(outSs))) # for WORLD if len(inputListWorld) > 0: # 
thread pool threadPool = ThreadPool() # get full RW for WORLD fullRWs = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo,prodSourceLabel,None,None) if fullRWs == None: tmpLog.error('failed to calculate full WORLD RW') return retTmpError # get RW per priority for taskSpec,inputChunk in inputListWorld: if not taskSpec.currentPriority in allRwMap: tmpRW = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo,prodSourceLabel,workQueue, taskSpec.currentPriority) if tmpRW == None: tmpLog.error('failed to calculate RW with prio={0}'.format(taskSpec.currentPriority)) return retTmpError allRwMap[taskSpec.currentPriority] = tmpRW # live counter for RWs liveCounter = MapWithLock(allRwMap) # make workers ddmIF = self.ddmIF.getInterface(vo) for iWorker in range(4): thr = AtlasProdTaskBrokerThread(inputListWorld,threadPool, self.taskBufferIF,ddmIF, fullRWs,liveCounter) thr.start() threadPool.join(60*10) # return tmpLog.debug('doBrokerage done') return self.SC_SUCCEEDED
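# A minimal, hypothetical sketch of the bunching loop used in doBrokerage
# above: task-assignment requests are sent in slices of at most maxBunchTask
# specs. submit_in_bunches and submit_func are illustrative stand-ins for the
# PandaClient.runTaskAssignment call.
def submit_in_bunches(job_specs, submit_func, max_bunch=100):
    results = []
    idx = 0
    while idx < len(job_specs):
        bunch = job_specs[idx:idx + max_bunch]
        idx += max_bunch
        results.append(submit_func(bunch))
    return results

# e.g. 250 dummy specs are submitted as bunches of 100, 100 and 50
print(submit_in_bunches(list(range(250)), lambda bunch: len(bunch)))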
def doCheck(self, taskSpecList): # make logger tmpLog = MsgWrapper(logger) tmpLog.debug('start doCheck') # return for failure retFatal = self.SC_FATAL, {} retTmpError = self.SC_FAILED, {} # get list of jediTaskIDs taskIdList = [] taskSpecMap = {} for taskSpec in taskSpecList: taskIdList.append(taskSpec.jediTaskID) taskSpecMap[taskSpec.jediTaskID] = taskSpec # check with panda tmpLog.debug('check with panda') tmpPandaStatus, cloudsInPanda = PandaClient.seeCloudTask(taskIdList) if tmpPandaStatus != 0: tmpLog.error('failed to see clouds') return retTmpError # make return map retMap = {} for tmpTaskID, tmpCoreName in iteritems(cloudsInPanda): tmpLog.debug('jediTaskID={0} -> {1}'.format( tmpTaskID, tmpCoreName)) if tmpCoreName not in ['NULL', '', None]: taskSpec = taskSpecMap[tmpTaskID] if taskSpec.useWorldCloud(): # get destinations for WORLD cloud ddmIF = self.ddmIF.getInterface(taskSpec.vo) # get site siteSpec = self.siteMapper.getSite(tmpCoreName) scopeSiteSpec_input, scopeSiteSpec_output = select_scope( siteSpec, taskSpec.prodSourceLabel, JobUtils.translate_tasktype_to_jobtype( taskSpec.taskType)) # get nucleus nucleus = siteSpec.pandasite # get output/log datasets tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI( tmpTaskID, ['output', 'log']) # get destinations retMap[tmpTaskID] = {'datasets': [], 'nucleus': nucleus} for datasetSpec in tmpDatasetSpecs: # skip distributed datasets if DataServiceUtils.getDistributedDestination( datasetSpec.storageToken) is not None: continue # get token token = ddmIF.convertTokenToEndpoint( siteSpec.ddm_output[scopeSiteSpec_output], datasetSpec.storageToken) # use default endpoint if token is None: token = siteSpec.ddm_output[scopeSiteSpec_output] # add original token if datasetSpec.storageToken not in ['', None]: token += '/{0}'.format(datasetSpec.storageToken) retMap[tmpTaskID]['datasets'].append({ 'datasetID': datasetSpec.datasetID, 'token': 'dst:{0}'.format(token), 'destination': tmpCoreName }) else: retMap[tmpTaskID] = tmpCoreName tmpLog.debug('ret {0}'.format(str(retMap))) # return tmpLog.debug('done') return self.SC_SUCCEEDED, retMap
def runImpl(self): # cutoff for disk in TB diskThreshold = self.taskBufferIF.getConfigValue( self.msgType, 'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 'jedi', 'atlas') if diskThreshold is None: diskThreshold = 100 * 1024 # dataset type to ignore file availability check datasetTypeToSkipCheck = ['log'] # thresholds for data availability check thrInputSize = self.taskBufferIF.getConfigValue( self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas') if thrInputSize is None: thrInputSize = 1 thrInputSize *= 1024 * 1024 * 1024 thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas') if thrInputNum is None: thrInputNum = 100 thrInputSizeFrac = self.taskBufferIF.getConfigValue( self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas') if thrInputSizeFrac is None: thrInputSizeFrac = 10 thrInputSizeFrac = float(thrInputSizeFrac) / 100 thrInputNumFrac = self.taskBufferIF.getConfigValue( self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas') if thrInputNumFrac is None: thrInputNumFrac = 10 thrInputNumFrac = float(thrInputNumFrac) / 100 cutOffRW = 50 negWeightTape = 0.001 minIoIntensityWithLD = self.taskBufferIF.getConfigValue( self.msgType, 'MIN_IO_INTENSITY_WITH_LOCAL_DATA', 'jedi', 'atlas') if minIoIntensityWithLD is None: minIoIntensityWithLD = 200 minInputSizeWithLD = self.taskBufferIF.getConfigValue( self.msgType, 'MIN_INPUT_SIZE_WITH_LOCAL_DATA', 'jedi', 'atlas') if minInputSizeWithLD is None: minInputSizeWithLD = 10000 maxTaskPrioWithLD = self.taskBufferIF.getConfigValue( self.msgType, 'MAX_TASK_PRIO_WITH_LOCAL_DATA', 'jedi', 'atlas') if maxTaskPrioWithLD is None: maxTaskPrioWithLD = 800 # main lastJediTaskID = None siteMapper = self.taskBufferIF.getSiteMapper() while True: try: taskInputList = self.inputList.get(1) # no more datasets if len(taskInputList) == 0: self.logger.debug( '{0} terminating after processing {1} tasks since no more inputs ' .format(self.__class__.__name__, self.numTasks)) return # loop over all tasks for taskSpec, inputChunk in taskInputList: lastJediTaskID = taskSpec.jediTaskID # make logger tmpLog = MsgWrapper( self.logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID), monToken='jediTaskID={0}'.format(taskSpec.jediTaskID)) tmpLog.debug('start') tmpLog.info( 'thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}' .format(thrInputSize, thrInputNum, thrInputSizeFrac, thrInputNumFrac)) # read task parameters try: taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI( taskSpec.jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) except Exception: tmpLog.error('failed to read task params') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue # RW taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI( taskSpec.jediTaskID) # get nuclei nucleusList = siteMapper.nuclei if taskSpec.nucleus in siteMapper.nuclei: candidateNucleus = taskSpec.nucleus elif taskSpec.nucleus in siteMapper.satellites: nucleusList = siteMapper.satellites candidateNucleus = taskSpec.nucleus else: tmpLog.info('got {0} candidates'.format( len(nucleusList))) ###################################### # check status newNucleusList = {} for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): if tmpNucleusSpec.state not in ['ACTIVE']: tmpLog.info( ' skip nucleus={0} due to status={1} criteria=-status' .format(tmpNucleus, tmpNucleusSpec.state)) else: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.info( '{0} candidates passed status check'.format( 
len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # check status of transfer backlog t1Weight = taskSpec.getT1Weight() if t1Weight < 0: tmpLog.info( 'skip transfer backlog check due to negative T1Weight' ) else: newNucleusList = {} backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei( ) for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): if tmpNucleus in backlogged_nuclei: tmpLog.info( ' skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog' .format(tmpNucleus)) else: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.info( '{0} candidates passed transfer backlog check'. format(len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # check endpoint fractionFreeSpace = {} newNucleusList = {} tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI( taskSpec.jediTaskID, ['output', 'log']) for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): toSkip = False for tmpDatasetSpec in tmpDatasetSpecList: # ignore distributed datasets if DataServiceUtils.getDistributedDestination( tmpDatasetSpec.storageToken ) is not None: continue # get endpoint with the pattern tmpEP = tmpNucleusSpec.getAssociatedEndpoint( tmpDatasetSpec.storageToken) if tmpEP is None: tmpLog.info( ' skip nucleus={0} since no endpoint with {1} criteria=-match' .format(tmpNucleus, tmpDatasetSpec.storageToken)) toSkip = True break # check state """ if tmpEP['state'] not in ['ACTIVE']: tmpLog.info(' skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus, tmpEP['ddm_endpoint_name'], tmpEP['state'])) toSkip = True break """ # check space tmpSpaceSize = tmpEP['space_free'] + tmpEP[ 'space_expired'] tmpSpaceToUse = 0 if tmpNucleus in self.fullRW: # 0.25GB per cpuTime/corePower/day tmpSpaceToUse = long( self.fullRW[tmpNucleus] / 10 / 24 / 3600 * 0.25) if tmpSpaceSize - tmpSpaceToUse < diskThreshold: tmpLog.info( ' skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space' .format(tmpNucleus, tmpSpaceSize, tmpSpaceToUse, diskThreshold, tmpEP['ddm_endpoint_name'])) toSkip = True break # keep fraction of free space if tmpNucleus not in fractionFreeSpace: fractionFreeSpace[tmpNucleus] = { 'total': 0, 'free': 0 } try: tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \ float(fractionFreeSpace[tmpNucleus]['total']) except Exception: tmpOld = None try: tmpNew = float(tmpSpaceSize - tmpSpaceToUse) / float( tmpEP['space_total']) except Exception: tmpNew = None if tmpNew is not None and (tmpOld is None or tmpNew < tmpOld): fractionFreeSpace[tmpNucleus] = { 'total': tmpEP['space_total'], 'free': tmpSpaceSize - tmpSpaceToUse } if not toSkip: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.info( '{0} candidates passed endpoint check {1} TB'. 
format(len(nucleusList), diskThreshold / 1024)) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # ability to execute jobs newNucleusList = {} # get all panda sites tmpSiteList = [] for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): tmpSiteList += tmpNucleusSpec.allPandaSites tmpSiteList = list(set(tmpSiteList)) tmpLog.debug('===== start for job check') jobBroker = AtlasProdJobBroker(self.ddmIF, self.taskBufferIF) tmpSt, tmpRet = jobBroker.doBrokerage( taskSpec, taskSpec.cloud, inputChunk, None, True, tmpSiteList, tmpLog) tmpLog.debug('===== done for job check') if tmpSt != Interaction.SC_SUCCEEDED: tmpLog.error('no sites can run jobs') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue okNuclei = set() for tmpSite in tmpRet: siteSpec = siteMapper.getSite(tmpSite) okNuclei.add(siteSpec.pandasite) for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): if tmpNucleus in okNuclei: newNucleusList[tmpNucleus] = tmpNucleusSpec else: tmpLog.info( ' skip nucleus={0} due to missing ability to run jobs criteria=-job' .format(tmpNucleus)) nucleusList = newNucleusList tmpLog.info('{0} candidates passed job check'.format( len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # data locality toSkip = False availableData = {} for datasetSpec in inputChunk.getDatasets(): # only for real datasets if datasetSpec.isPseudo(): continue # ignore DBR if DataServiceUtils.isDBR(datasetSpec.datasetName): continue # skip locality check if DataServiceUtils.getDatasetType( datasetSpec.datasetName ) in datasetTypeToSkipCheck: continue # primary only if taskParamMap.get( 'taskBrokerOnMaster' ) is True and not datasetSpec.isMaster(): continue # use deep scan for primary dataset unless data carousel if datasetSpec.isMaster( ) and not taskSpec.inputPreStaging(): deepScan = True else: deepScan = False # get nuclei where data is available tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData( siteMapper, self.ddmIF, datasetSpec.datasetName, list(nucleusList.keys()), deepScan) if tmpSt != Interaction.SC_SUCCEEDED: tmpLog.error( 'failed to get nuclei where data is available, since {0}' .format(tmpRet)) taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) toSkip = True break # sum for tmpNucleus, tmpVals in iteritems(tmpRet): if tmpNucleus not in availableData: availableData[tmpNucleus] = tmpVals else: availableData[tmpNucleus] = dict( (k, v + tmpVals[k]) for (k, v) in iteritems( availableData[tmpNucleus])) if toSkip: continue if availableData != {}: newNucleusList = {} # skip if no data skipMsgList = [] for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): if taskSpec.inputPreStaging( ) and availableData[tmpNucleus][ 'ava_num_any'] > 0: # use incomplete replicas for data carousel since the completeness is guaranteed newNucleusList[tmpNucleus] = tmpNucleusSpec elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \ availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac: tmpMsg = ' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format( tmpNucleus, availableData[tmpNucleus] ['ava_size_any'], availableData[tmpNucleus]['tot_size'], thrInputSizeFrac) 
skipMsgList.append(tmpMsg) elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \ availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac: tmpMsg = ' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format( tmpNucleus, availableData[tmpNucleus] ['ava_num_any'], availableData[tmpNucleus]['tot_num'], thrInputNumFrac) skipMsgList.append(tmpMsg) else: newNucleusList[tmpNucleus] = tmpNucleusSpec totInputSize = list(availableData.values( ))[0]['tot_size'] / 1024 / 1024 / 1024 data_locality_check_str = ( '(ioIntensity ({0}) is None or less than {1} kBPerS ' 'and input size ({2} GB) is less than {3}) ' 'or task.currentPriority ({4}) is higher than or equal to {5}' ).format(taskSpec.ioIntensity, minIoIntensityWithLD, int(totInputSize), minInputSizeWithLD, taskSpec.currentPriority, maxTaskPrioWithLD) if len(newNucleusList) > 0: nucleusList = newNucleusList for tmpMsg in skipMsgList: tmpLog.info(tmpMsg) elif ((taskSpec.ioIntensity is None or taskSpec.ioIntensity <= minIoIntensityWithLD) and totInputSize <= minInputSizeWithLD) \ or taskSpec.currentPriority >= maxTaskPrioWithLD: availableData = {} tmpLog.info( ' disable data locality check since no nucleus has input data, {}' .format(data_locality_check_str)) else: # no candidate + unavoidable data locality check nucleusList = newNucleusList for tmpMsg in skipMsgList: tmpLog.info(tmpMsg) tmpLog.info( ' the following conditions required to disable data locality check: {}' .format(data_locality_check_str)) tmpLog.info( '{0} candidates passed data check'.format( len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # weight self.prioRW.acquire() nucleusRW = self.prioRW[taskSpec.currentPriority] self.prioRW.release() totalWeight = 0 nucleusweights = [] for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): if tmpNucleus not in nucleusRW: nucleusRW[tmpNucleus] = 0 wStr = '1' # with RW if tmpNucleus in nucleusRW and nucleusRW[ tmpNucleus] >= cutOffRW: weight = 1 / float(nucleusRW[tmpNucleus]) wStr += '/( RW={0} )'.format( nucleusRW[tmpNucleus]) else: weight = 1 wStr += '/(1 : RW={0}<{1})'.format( nucleusRW[tmpNucleus], cutOffRW) # with data if availableData != {}: if availableData[tmpNucleus]['tot_size'] > 0: weight *= float(availableData[tmpNucleus] ['ava_size_any']) weight /= float( availableData[tmpNucleus]['tot_size']) wStr += '* ( available_input_size_DISKTAPE={0} )'.format( availableData[tmpNucleus] ['ava_size_any']) wStr += '/ ( total_input_size={0} )'.format( availableData[tmpNucleus]['tot_size']) # negative weight for tape if availableData[tmpNucleus][ 'ava_size_any'] > availableData[ tmpNucleus]['ava_size_disk']: weight *= negWeightTape wStr += '*( weight_TAPE={0} )'.format( negWeightTape) # fraction of free space if tmpNucleus in fractionFreeSpace: try: tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \ float(fractionFreeSpace[tmpNucleus]['total']) weight *= tmpFrac wStr += '*( free_space={0} )/( total_space={1} )'.format( fractionFreeSpace[tmpNucleus]['free'], fractionFreeSpace[tmpNucleus]['total']) except Exception: pass tmpLog.info( ' use nucleus={0} weight={1} {2} criteria=+use' .format(tmpNucleus, weight, wStr)) totalWeight += weight nucleusweights.append((tmpNucleus, weight)) tmpLog.info('final {0} candidates'.format( len(nucleusList))) ###################################### # final selection 
tgtWeight = random.uniform(0, totalWeight) candidateNucleus = None for tmpNucleus, weight in nucleusweights: tgtWeight -= weight if tgtWeight <= 0: candidateNucleus = tmpNucleus break if candidateNucleus is None: candidateNucleus = nucleusweights[-1][0] ###################################### # update nucleusSpec = nucleusList[candidateNucleus] # get output/log datasets tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI( taskSpec.jediTaskID, ['output', 'log']) # get destinations retMap = { taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus( nucleusSpec, tmpDatasetSpecs) } tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap) tmpLog.info( ' set nucleus={0} with {1} criteria=+set'.format( candidateNucleus, tmpRet)) self.sendLogMessage(tmpLog) if tmpRet: tmpMsg = 'set task_status=ready' tmpLog.sendMsg(tmpMsg, self.msgType) # update RW table self.prioRW.acquire() for prio, rwMap in iteritems(self.prioRW): if prio > taskSpec.currentPriority: continue if candidateNucleus in rwMap: rwMap[candidateNucleus] += taskRW else: rwMap[candidateNucleus] = taskRW self.prioRW.release() except Exception: errtype, errvalue = sys.exc_info()[:2] errMsg = '{0}.runImpl() failed with {1} {2} '.format( self.__class__.__name__, errtype.__name__, errvalue) errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID) errMsg += traceback.format_exc() logger.error(errMsg)
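# A minimal, hypothetical sketch of the roulette-wheel selection done at the
# end of runImpl above: a random point in [0, totalWeight) is drawn and the
# candidate whose cumulative weight first covers it is chosen, falling back
# to the last candidate. pick_weighted is an illustrative name.
import random

def pick_weighted(candidates):
    """candidates: list of (name, weight) pairs with non-negative weights."""
    total_weight = sum(weight for _, weight in candidates)
    target = random.uniform(0, total_weight)
    for name, weight in candidates:
        target -= weight
        if target <= 0:
            return name
    return candidates[-1][0]

# nuclei and weights are made up for illustration
print(pick_weighted([('CERN', 0.5), ('BNL', 0.3), ('FZK', 0.2)]))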
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID,commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill','finish','reassign']:
                    # get active PandaIDs to be killed
                    pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                    if pandaIDs == None:
                        tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                        tmpStat = Interaction.SC_FAILED
                    # kill jobs or update task
                    if tmpStat == Interaction.SC_SUCCEEDED:
                        if pandaIDs == []:
                            # done since no active jobs
                            tmpLog.info('completed the command')
                            tmpTaskSpec = JediTaskSpec()
                            tmpTaskSpec.jediTaskID = jediTaskID
                            updateTaskStatus = True
                            if commandStr != 'reassign':
                                # keep oldStatus for task reassignment since it is reset when actually reassigned
                                tmpTaskSpec.forceUpdate('oldStatus')
                            else:
                                # extract cloud or site
                                tmpItems = commentStr.split(':')
                                if tmpItems[0] == 'cloud':
                                    tmpTaskSpec.cloud = tmpItems[1]
                                else:
                                    tmpTaskSpec.site = tmpItems[1]
                                # back to oldStatus if necessary
                                if tmpItems[2] == 'y':
                                    tmpTaskSpec.status = oldStatus
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                    updateTaskStatus = False
                            if updateTaskStatus:
                                tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                            tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                        else:
                            tmpLog.info('sending kill command')
                            tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                        tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry','incexec']:
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles','fixedSandbox']:
                                try:
                                    del taskParamMap[newKey]
                                except:
                                    pass
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey,newVal in newParamMap.iteritems():
                                if newVal == None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if taskParamMap.has_key('buildSpec'):
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if taskParamMap.has_key('mergeSpec'):
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                            if tmpRet != True:
                                tmpLog.error('failed to update task params')
                                continue
                        except:
                            errtype,errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                            continue
                    # retry failed files
                    tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                    if tmpRet == True:
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg,self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except:
            errtype,errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
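# A minimal, hypothetical sketch of the incexec parameter merge performed in
# runImpl above: keys mapped to None in the new parameters are removed from
# the task parameters, everything else overwrites the old value.
# merge_task_params is an illustrative name; the real code additionally
# rewrites the sandbox references afterwards.
def merge_task_params(task_param_map, new_param_map):
    for key, value in new_param_map.items():
        if value is None:
            task_param_map.pop(key, None)
        else:
            task_param_map[key] = value
    return task_param_map

params = {'nFilesPerJob': 5, 'nEventsPerJob': 1000}
print(merge_task_params(params, {'nEventsPerJob': None, 'ramCount': 2000}))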
def runImpl(self): while True: try: # get a part of list nTasks = 100 taskList = self.taskList.get(nTasks) totalTasks, idxTasks = self.taskList.stat() # no more datasets if len(taskList) == 0: self.logger.debug( '{0} terminating since no more items'.format( self.__class__.__name__)) return # make logger tmpLog = MsgWrapper(self.logger) tmpLog.info( 'start TaskCheckerThread {0}/{1} for jediTaskID={2}'. format(idxTasks, totalTasks, taskList)) tmpStat = Interaction.SC_SUCCEEDED # get TaskSpecs taskSpecList = [] for jediTaskID in taskList: tmpRet, taskSpec = self.taskBufferIF.getTaskWithID_JEDI( jediTaskID, False) if tmpRet and taskSpec is not None: taskSpecList.append(taskSpec) else: tmpLog.error( 'failed to get taskSpec for jediTaskID={0}'.format( jediTaskID)) if taskSpecList != []: # get impl if tmpStat == Interaction.SC_SUCCEEDED: tmpLog.info('getting Impl') try: impl = self.implFactory.getImpl( self.vo, self.prodSourceLabel) if impl is None: # task brokerage is undefined tmpLog.error( 'task broker is undefined for vo={0} sourceLabel={1}' .format(self.vo, self.prodSourceLabel)) tmpStat = Interaction.SC_FAILED except Exception: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('getImpl failed with {0}:{1}'.format( errtype.__name__, errvalue)) tmpStat = Interaction.SC_FAILED # check if tmpStat == Interaction.SC_SUCCEEDED: tmpLog.info('check with {0}'.format( impl.__class__.__name__)) try: tmpStat, taskCloudMap = impl.doCheck(taskSpecList) except Exception: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('doCheck failed with {0}:{1}'.format( errtype.__name__, errvalue)) tmpStat = Interaction.SC_FAILED # update if tmpStat != Interaction.SC_SUCCEEDED: tmpLog.error('failed to check assignment') else: tmpRet = self.taskBufferIF.setCloudToTasks_JEDI( taskCloudMap) tmpLog.info('done with {0} for {1}'.format( tmpRet, str(taskCloudMap))) except Exception: errtype, errvalue = sys.exc_info()[:2] logger.error('{0} failed in runImpl() with {1}:{2}'.format( self.__class__.__name__, errtype.__name__, errvalue))
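# A minimal, hypothetical sketch of the worker pattern shared by the runImpl
# methods above: each thread repeatedly takes a bunch of items from a shared,
# lock-protected list and terminates once the list is drained.
# SimpleListWithLock and worker are illustrative stand-ins for the real
# ListWithLock and thread classes.
import threading

class SimpleListWithLock:
    def __init__(self, items):
        self._items = list(items)
        self._lock = threading.Lock()

    def get(self, n_items):
        with self._lock:
            taken = self._items[:n_items]
            del self._items[:n_items]
            return taken

def worker(shared_list, n_items=10):
    while True:
        bunch = shared_list.get(n_items)
        if len(bunch) == 0:
            return  # no more items
        for item in bunch:
            pass  # process one task here

tasks = SimpleListWithLock(range(25))
threads = [threading.Thread(target=worker, args=(tasks,)) for _ in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()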
def doSplit(self,taskSpec,inputChunk,siteMapper): # return for failure retFatal = self.SC_FATAL,[] retTmpError = self.SC_FAILED,[] # make logger tmpLog = MsgWrapper(logger,'<jediTaskID={0} datasetID={1}>'.format(taskSpec.jediTaskID,inputChunk.masterIndexName)) tmpLog.debug('start') if not inputChunk.isMerging: # set maxNumFiles using taskSpec if specified maxNumFiles = taskSpec.getMaxNumFilesPerJob() # set fsize gradients using taskSpec sizeGradients = taskSpec.getOutDiskSize() # set fsize intercepts using taskSpec sizeIntercepts = taskSpec.getWorkDiskSize() # walltime if not taskSpec.useHS06(): walltimeGradient = taskSpec.walltime else: walltimeGradient = taskSpec.cpuTime # number of events per job if defined nEventsPerJob = taskSpec.getNumEventsPerJob() # number of files per job if defined if not taskSpec.dynamicNumEvents(): nFilesPerJob = taskSpec.getNumFilesPerJob() else: nFilesPerJob = None if nFilesPerJob == None and nEventsPerJob == None and inputChunk.useScout() \ and not taskSpec.useLoadXML() and not taskSpec.respectSplitRule(): nFilesPerJob = 1 # grouping with boundaryID useBoundary = taskSpec.useGroupWithBoundaryID() # fsize intercepts per input size sizeGradientsPerInSize = None # max primay output size maxOutSize = None # max size per job maxSizePerJob = taskSpec.getMaxSizePerJob() if maxSizePerJob is not None: maxSizePerJob += InputChunk.defaultOutputSize # dynamic number of events dynNumEvents = taskSpec.dynamicNumEvents() # max number of event ranges maxNumEventRanges = None # multiplicity of jobs if taskSpec.useJobCloning(): multiplicity = 1 else: multiplicity = taskSpec.getNumEventServiceConsumer() # split with fields if taskSpec.getFieldNumToLFN() != None and taskSpec.useFileAsSourceLFN(): splitByFields = taskSpec.getFieldNumToLFN() else: splitByFields = None else: # set parameters for merging maxNumFiles = taskSpec.getMaxNumFilesPerMergeJob() sizeGradients = 0 walltimeGradient = 0 nFilesPerJob = taskSpec.getNumFilesPerMergeJob() nEventsPerJob = taskSpec.getNumEventsPerMergeJob() maxSizePerJob = None useBoundary = {'inSplit':3} dynNumEvents = False maxNumEventRanges = None multiplicity = None # gradients per input size is 1 + margin sizeGradientsPerInSize = self.sizeGradientsPerInSizeForMerge # intercepts for libDS sizeIntercepts = taskSpec.getWorkDiskSize() # mergein of 500MB interceptsMergin = self.interceptsMerginForMerge if sizeIntercepts < interceptsMergin: sizeIntercepts = interceptsMergin maxOutSize = taskSpec.getMaxSizePerMergeJob() if maxOutSize == None: # max output size is 5GB for merging by default maxOutSize = 5 * 1024 * 1024 * 1024 # split with fields if taskSpec.getFieldNumToLFN() != None and taskSpec.useFileAsSourceLFN(): splitByFields = range(4+1,4+1+len(taskSpec.getFieldNumToLFN())) else: splitByFields = None # LB respectLB = taskSpec.respectLumiblock() # dump tmpLog.debug('maxNumFiles={0} sizeGradients={1} sizeIntercepts={2} useBoundary={3}'.format(maxNumFiles, sizeGradients, sizeIntercepts, useBoundary)) tmpLog.debug('walltimeGradient={0} nFilesPerJob={1} nEventsPerJob={2}'.format(walltimeGradient, nFilesPerJob, nEventsPerJob)) tmpLog.debug('sizeGradientsPerInSize={0} maxOutSize={1} respectLB={2} dynNumEvents={3}'.format(sizeGradientsPerInSize, maxOutSize, respectLB, dynNumEvents)) tmpLog.debug('multiplicity={0} splitByFields={1} nFiles={2}'.format(multiplicity,str(splitByFields), inputChunk.getNumFilesInMaster())) # split returnList = [] subChunks = [] iSubChunks = 0 nSubChunks = 25 subChunk = None while True: # change site if iSubChunks % nSubChunks 
== 0 or subChunk == []: # append to return map if subChunks != []: # get site names for parallel execution if taskSpec.getNumSitesPerJob() > 1 and not inputChunk.isMerging and inputChunk.useJumbo != 'fake': siteName = inputChunk.getParallelSites(taskSpec.getNumSitesPerJob(), nSubChunks,[siteName]) returnList.append({'siteName':siteName, 'subChunks':subChunks, 'siteCandidate':siteCandidate, }) tmpLog.debug('split to %s subchunks' % len(subChunks)) # reset subChunks = [] # skip unavailable files in distributed datasets nSkip = inputChunk.skipUnavailableFiles() tmpLog.debug('skipped {0} files'.format(nSkip)) # new candidate siteCandidate = inputChunk.getOneSiteCandidate(nSubChunks) if siteCandidate == None: break siteName = siteCandidate.siteName siteSpec = siteMapper.getSite(siteName) # directIO if taskSpec.useLocalIO() or not siteSpec.isDirectIO() or taskSpec.allowInputLAN() is None \ or inputChunk.isMerging: useDirectIO = False else: useDirectIO = True # get maxSize if it is set in taskSpec maxSize = maxSizePerJob if maxSize == None: # use maxwdir as the default maxSize if not useDirectIO: maxSize = siteSpec.maxwdir * 1024 * 1024 elif nEventsPerJob is not None or nFilesPerJob is not None: maxSize = None else: maxSize = max(50000, siteSpec.maxwdir) * 1024 * 1024 else: # add offset maxSize += sizeIntercepts # max disk size maxDiskSize = siteSpec.maxwdir * 1024 * 1024 # max walltime maxWalltime = None if not inputChunk.isMerging: maxWalltime = taskSpec.getMaxWalltime() if maxWalltime is None: maxWalltime = siteSpec.maxtime # core count if siteSpec.coreCount > 0: coreCount = siteSpec.coreCount else: coreCount = 1 # core power corePower = siteSpec.corepower # max num of event ranges for dynNumEvents if dynNumEvents: maxNumEventRanges = int(siteSpec.get_n_sim_events() / taskSpec.get_min_granularity()) if maxNumEventRanges == 0: maxNumEventRanges = 1 tmpLog.debug('chosen {0}'.format(siteName)) tmpLog.debug('new weight {0}'.format(siteCandidate.weight)) tmpLog.debug('maxSize={0} maxWalltime={1} coreCount={2} corePower={3} maxNumEventRanges={4} maxDisk={5}'.format(maxSize,maxWalltime, coreCount,corePower, maxNumEventRanges, maxDiskSize)) tmpLog.debug('useDirectIO={0} label={1}'.format(useDirectIO, taskSpec.prodSourceLabel)) # get sub chunk subChunk = inputChunk.getSubChunk(siteName,maxSize=maxSize, maxNumFiles=maxNumFiles, sizeGradients=sizeGradients, sizeIntercepts=sizeIntercepts, nFilesPerJob=nFilesPerJob, walltimeGradient=walltimeGradient, maxWalltime=maxWalltime, nEventsPerJob=nEventsPerJob, useBoundary=useBoundary, sizeGradientsPerInSize=sizeGradientsPerInSize, maxOutSize=maxOutSize, coreCount=coreCount, respectLB=respectLB, corePower=corePower, dynNumEvents=dynNumEvents, maxNumEventRanges=maxNumEventRanges, multiplicity=multiplicity, splitByFields=splitByFields, tmpLog=tmpLog, useDirectIO=useDirectIO, maxDiskSize=maxDiskSize) if subChunk == None: break if subChunk != []: # append subChunks.append(subChunk) iSubChunks += 1 # append to return map if remain if subChunks != []: # get site names for parallel execution if taskSpec.getNumSitesPerJob() > 1 and not inputChunk.isMerging: siteName = inputChunk.getParallelSites(taskSpec.getNumSitesPerJob(), nSubChunks,[siteName]) returnList.append({'siteName':siteName, 'subChunks':subChunks, 'siteCandidate':siteCandidate, }) tmpLog.debug('split to %s subchunks' % len(subChunks)) # return tmpLog.debug('done') return self.SC_SUCCEEDED,returnList
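# A minimal, hypothetical sketch of how the per-subchunk size cap is chosen in
# doSplit above: an explicit maxSizePerJob gets the work-disk intercept added
# as an offset; otherwise the site's maxwdir (in MB) is used, with a large
# floor when direct I/O makes the local scratch limit less relevant.
# choose_max_size and its arguments are illustrative names.
def choose_max_size(max_size_per_job, size_intercepts, maxwdir_mb,
                    use_direct_io, n_events_per_job=None, n_files_per_job=None):
    if max_size_per_job is not None:
        return max_size_per_job + size_intercepts
    if not use_direct_io:
        return maxwdir_mb * 1024 * 1024
    if n_events_per_job is not None or n_files_per_job is not None:
        return None
    return max(50000, maxwdir_mb) * 1024 * 1024

# e.g. no explicit cap, direct I/O disabled, 20000 MB scratch -> cap in bytes
print(choose_max_size(None, 500 * 1024 * 1024, 20000, False))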
def start(self): # start base classes JediKnight.start(self) FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF) # go into main loop while True: startTime = datetime.datetime.utcnow() try: # get logger tmpLog = MsgWrapper(logger) tmpLog.debug('start TaskBroker') # get work queue mapper workQueueMapper = self.taskBufferIF.getWorkQueueMap() resource_types = self.taskBufferIF.load_resource_types() # loop over all vos for vo in self.vos: # loop over all sourceLabels for prodSourceLabel in self.prodSourceLabels: # loop over all work queues for workQueue in workQueueMapper.getAlignedQueueList( vo, prodSourceLabel): for resource_type in resource_types: wq_name = '_'.join( workQueue.queue_name.split(' ')) msgLabel = 'vo={0} label={1} queue={2} resource_type={3}: '.\ format(vo, prodSourceLabel, wq_name, resource_type.resource_name) tmpLog.debug(msgLabel + 'start') # get the list of tasks to check tmpList = self.taskBufferIF.getTasksToCheckAssignment_JEDI( vo, prodSourceLabel, workQueue, resource_type.resource_name) if tmpList is None: # failed tmpLog.error( msgLabel + 'failed to get the list of tasks to check' ) else: tmpLog.debug(msgLabel + 'got tasks_to_check={0}'. format(len(tmpList))) # put to a locked list taskList = ListWithLock(tmpList) # make thread pool threadPool = ThreadPool() # make workers nWorker = jedi_config.taskbroker.nWorkers for iWorker in range(nWorker): thr = TaskCheckerThread( taskList, threadPool, self.taskBufferIF, self.ddmIF, self, vo, prodSourceLabel) thr.start() # join threadPool.join() # get the list of tasks to assign tmpList = self.taskBufferIF.getTasksToAssign_JEDI( vo, prodSourceLabel, workQueue, resource_type.resource_name) if tmpList is None: # failed tmpLog.error( msgLabel + 'failed to get the list of tasks to assign' ) else: tmpLog.debug(msgLabel + 'got tasks_to_assign={0}'. format(len(tmpList))) # put to a locked list taskList = ListWithLock(tmpList) # make thread pool threadPool = ThreadPool() # make workers nWorker = jedi_config.taskbroker.nWorkers for iWorker in range(nWorker): thr = TaskBrokerThread( taskList, threadPool, self.taskBufferIF, self.ddmIF, self, vo, prodSourceLabel, workQueue, resource_type.resource_name) thr.start() # join threadPool.join() tmpLog.debug(msgLabel + 'done') except Exception: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('failed in {0}.start() with {1} {2}'.format( self.__class__.__name__, errtype.__name__, errvalue)) tmpLog.debug('done') # sleep if needed loopCycle = jedi_config.taskbroker.loopCycle timeDelta = datetime.datetime.utcnow() - startTime sleepPeriod = loopCycle - timeDelta.seconds if sleepPeriod > 0: time.sleep(sleepPeriod) # randomize cycle self.randomSleep(max_val=loopCycle)
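# A minimal, hypothetical sketch of the pacing used at the end of start()
# above: each pass sleeps for whatever remains of the configured loop cycle
# after the work already done. remaining_sleep is an illustrative name.
import datetime

def remaining_sleep(start_time, loop_cycle_seconds):
    elapsed = datetime.datetime.utcnow() - start_time
    sleep_period = loop_cycle_seconds - elapsed.seconds
    return sleep_period if sleep_period > 0 else 0

start = datetime.datetime.utcnow() - datetime.timedelta(seconds=45)
print(remaining_sleep(start, 60))  # roughly 15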
def doSetup(self,taskSpec,datasetToRegister,pandaJobs): # make logger tmpLog = MsgWrapper(logger,"< jediTaskID={0} >".format(taskSpec.jediTaskID)) tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType)) # returns retFatal = self.SC_FATAL retTmpError = self.SC_FAILED retOK = self.SC_SUCCEEDED try: # get DDM I/F ddmIF = self.ddmIF.getInterface(taskSpec.vo) # register datasets if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']: # prod vs anal userSetup = False if taskSpec.prodSourceLabel in ['user']: userSetup = True # collect datasetID to register datasets/containers just in case for tmpPandaJob in pandaJobs: if not tmpPandaJob.produceUnMerge(): for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if tmpFileSpec.datasetID not in datasetToRegister: datasetToRegister.append(tmpFileSpec.datasetID) tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister))) # get site mapper siteMapper = self.taskBufferIF.getSiteMapper() # loop over all datasets avDatasetList = [] cnDatasetMap = {} for datasetID in datasetToRegister: # get output and log datasets tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID)) tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID, datasetID) if not tmpStat: tmpLog.error('failed to get output and log datasets') return retFatal if datasetSpec.isPseudo(): tmpLog.info('skip pseudo dataset') continue # DDM backend ddmBackEnd = taskSpec.getDdmBackEnd() tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) # check if dataset and container are available in DDM for targetName in [datasetSpec.datasetName,datasetSpec.containerName]: if targetName is None: continue if targetName not in avDatasetList: # set lifetime if targetName.startswith('panda'): if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed': lifetime = 365 else: lifetime = 14 else: lifetime = None # check dataset/container in DDM tmpList = ddmIF.listDatasets(targetName) if tmpList == []: # get location location = None locForRule = None if targetName == datasetSpec.datasetName: # dataset if datasetSpec.site in ['',None]: if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: locForRule = datasetSpec.destination elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None: location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken) elif taskSpec.cloud is not None: # use T1 SE tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source'] location = siteMapper.getDdmEndpoint(tmpT1Name, datasetSpec.storageToken, taskSpec.prodSourceLabel, JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType)) else: tmpLog.info('site={0} token={1}'.format(datasetSpec.site, datasetSpec.storageToken)) location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken, taskSpec.prodSourceLabel, JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType)) if locForRule is None: locForRule = location # set metadata if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName: metaData = {} metaData['task_id'] = taskSpec.jediTaskID if taskSpec.campaign not in [None,'']: metaData['campaign'] = taskSpec.campaign if datasetSpec.getTransient() is not None: metaData['transient'] = datasetSpec.getTransient() else: metaData = None # register dataset/container tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName, location, ddmBackEnd, lifetime, 
str(metaData))) tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location, lifetime=lifetime,metaData=metaData) if not tmpStat: tmpLog.error('failed to register {0}'.format(targetName)) return retFatal # procedures for user if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: # register location tmpToRegister = False if userSetup and targetName == datasetSpec.datasetName and datasetSpec.site not in ['',None]: if taskSpec.workingGroup: userName = taskSpec.workingGroup else: userName = taskSpec.userName grouping = None tmpToRegister = True elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: userName = None grouping = 'NONE' tmpToRegister = True if tmpToRegister: activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel) tmpLog.info('registering location={} lifetime={} days activity={} grouping={} ' 'owner={}'.format(locForRule, lifetime, activity, grouping, userName)) tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName, lifetime=lifetime,backEnd=ddmBackEnd, activity=activity,grouping=grouping) if not tmpStat: tmpLog.error('failed to register location {0} for {1}'.format(locForRule, targetName)) return retFatal # double copy if userSetup and datasetSpec.type == 'output': if datasetSpec.destination != datasetSpec.site: tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination, datasetSpec.site)) else: second_copy = True try: if taskSpec.site: panda_site = siteMapper.getSite(taskSpec.site) if panda_site.catchall and 'skip_2nd_copy' in panda_site.catchall: tmpLog.info('skip making double copy as specified in {0} catchall'.format(panda_site)) second_copy = False except Exception: second_copy = True if second_copy: locForDouble = '(type=SCRATCHDISK)\\notforextracopy=True' tmpMsg = 'registering double copy ' tmpMsg += 'location="{0}" lifetime={1}days activity={2} for dataset={3}'.format(locForDouble,lifetime, activity,targetName) tmpLog.info(tmpMsg) tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName, lifetime=lifetime,activity=activity, grouping='NONE',weight='freespace', ignore_availability=False) if not tmpStat: tmpLog.error('failed to register double copylocation {0} for {1}'.format(locForDouble, targetName)) return retFatal avDatasetList.append(targetName) else: tmpLog.info('{0} already registered'.format(targetName)) # check if dataset is in the container if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName: # get list of constituent datasets in the container if datasetSpec.containerName not in cnDatasetMap: cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName) # add dataset if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]: tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName], backEnd=ddmBackEnd) if not tmpStat: tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName, datasetSpec.containerName)) return retFatal cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName) else: tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) # update dataset datasetSpec.status = 'registered' self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID, 
'datasetID':datasetID}) # register ES datasets if taskSpec.registerEsFiles(): targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID) location = None metaData = {} metaData['task_id'] = taskSpec.jediTaskID metaData['hidden'] = True tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName, location, str(metaData))) tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData, resurrect=True) if not tmpStat: tmpLog.error('failed to register ES dataset {0}'.format(targetName)) return retFatal # register rule location = 'type=DATADISK' activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel) grouping = 'NONE' tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location, activity, grouping)) tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity, grouping=grouping) if not tmpStat: tmpLog.error('failed to register location {0} with {2} for {1}'.format(location, targetName, activity)) return retFatal # open datasets if taskSpec.prodSourceLabel in ['managed','test']: # get the list of output/log datasets outDatasetList = [] for tmpPandaJob in pandaJobs: for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if tmpFileSpec.destinationDBlock not in outDatasetList: outDatasetList.append(tmpFileSpec.destinationDBlock) # open datasets for outDataset in outDatasetList: tmpLog.info('open {0}'.format(outDataset)) ddmIF.openDataset(outDataset) # unset lifetime ddmIF.setDatasetMetadata(outDataset,'lifetime',None) # return tmpLog.info('done') return retOK except Exception: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retFatal
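# A minimal, hypothetical sketch of the lifetime rule applied in doSetup above
# when registering new datasets: transient panda* names live 14 days, except
# managed trn_log datasets which are kept for a year; everything else gets no
# explicit lifetime. choose_lifetime is an illustrative name and the example
# dataset names are made up.
def choose_lifetime(target_name, dataset_type, prod_source_label):
    if target_name.startswith('panda'):
        if dataset_type == 'trn_log' and prod_source_label == 'managed':
            return 365
        return 14
    return None

print(choose_lifetime('panda.jeditest.log', 'trn_log', 'managed'))   # 365
print(choose_lifetime('mc.simul.HITS.container', 'output', 'managed'))  # None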
def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue): # variables for submission maxBunchTask = 100 # make logger tmpLog = MsgWrapper(logger) tmpLog.debug('start doBrokerage') # return for failure retFatal = self.SC_FATAL retTmpError = self.SC_FAILED tmpLog.debug('vo={0} label={1} queue={2}'.format( vo, prodSourceLabel, workQueue.queue_name)) # loop over all tasks allRwMap = {} prioMap = {} tt2Map = {} expRWs = {} jobSpecList = [] for tmpJediTaskID, tmpInputList in inputList: for taskSpec, cloudName, inputChunk in tmpInputList: # make JobSpec to be submitted for TaskAssigner jobSpec = JobSpec() jobSpec.taskID = taskSpec.jediTaskID jobSpec.jediTaskID = taskSpec.jediTaskID # set managed to trigger TA jobSpec.prodSourceLabel = 'managed' jobSpec.processingType = taskSpec.processingType jobSpec.workingGroup = taskSpec.workingGroup jobSpec.metadata = taskSpec.processingType jobSpec.assignedPriority = taskSpec.taskPriority jobSpec.currentPriority = taskSpec.currentPriority jobSpec.maxDiskCount = ( taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()) / 1024 / 1024 if taskSpec.useWorldCloud(): # use destinationSE to trigger task brokerage in WORLD cloud jobSpec.destinationSE = taskSpec.cloud prodDBlock = None setProdDBlock = False for datasetSpec in inputChunk.getDatasets(): prodDBlock = datasetSpec.datasetName if datasetSpec.isMaster(): jobSpec.prodDBlock = datasetSpec.datasetName setProdDBlock = True for fileSpec in datasetSpec.Files: tmpInFileSpec = fileSpec.convertToJobFileSpec( datasetSpec) jobSpec.addFile(tmpInFileSpec) # use secondary dataset name as prodDBlock if setProdDBlock == False and prodDBlock != None: jobSpec.prodDBlock = prodDBlock # append jobSpecList.append(jobSpec) prioMap[jobSpec.taskID] = jobSpec.currentPriority tt2Map[jobSpec.taskID] = jobSpec.processingType # get RW for a priority if not allRwMap.has_key(jobSpec.currentPriority): tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI( vo, prodSourceLabel, workQueue, jobSpec.currentPriority) if tmpRW == None: tmpLog.error( 'failed to calculate RW with prio={0}'.format( jobSpec.currentPriority)) return retTmpError allRwMap[jobSpec.currentPriority] = tmpRW # get expected RW expRW = self.taskBufferIF.calculateTaskRW_JEDI( jobSpec.jediTaskID) if expRW == None: tmpLog.error( 'failed to calculate RW for jediTaskID={0}'.format( jobSpec.jediTaskID)) return retTmpError expRWs[jobSpec.taskID] = expRW # get fullRWs fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI( vo, prodSourceLabel, None, None) if fullRWs == None: tmpLog.error('failed to calculate full RW') return retTmpError # set metadata for jobSpec in jobSpecList: rwValues = allRwMap[jobSpec.currentPriority] jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % ( jobSpec.metadata, str(rwValues), str(expRWs), str(prioMap), str(fullRWs), str(tt2Map)) tmpLog.debug('run task assigner for {0} tasks'.format( len(jobSpecList))) nBunchTask = 0 while nBunchTask < len(jobSpecList): # get a bunch jobsBunch = jobSpecList[nBunchTask:nBunchTask + maxBunchTask] strIDs = 'jediTaskID=' for tmpJobSpec in jobsBunch: strIDs += '{0},'.format(tmpJobSpec.taskID) strIDs = strIDs[:-1] tmpLog.debug(strIDs) # increment index nBunchTask += maxBunchTask # run task brokerge stS, outSs = PandaClient.runTaskAssignment(jobsBunch) tmpLog.debug('{0}:{1}'.format(stS, str(outSs))) # return tmpLog.debug('done') return self.SC_SUCCEEDED
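# A minimal, hypothetical sketch of the metadata packing done at the end of
# doBrokerage above: the per-priority RW map, expected RWs, priorities, full
# RWs and processing types are serialised into one semicolon-separated string
# per job before task assignment. pack_brokerage_metadata is an illustrative
# name and the example values are made up.
def pack_brokerage_metadata(base_metadata, rw_values, exp_rws, prio_map,
                            full_rws, tt2_map):
    return "%s;%s;%s;%s;%s;%s" % (base_metadata, str(rw_values), str(exp_rws),
                                  str(prio_map), str(full_rws), str(tt2_map))

print(pack_brokerage_metadata('simul', {900: 1.5}, {1234: 0.2},
                              {1234: 900}, {'ALL': 3.7}, {1234: 'simul'}))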
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap): # make logger tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID), monToken='<jediTaskID={0} {1}>'.format( taskSpec.jediTaskID, datetime.datetime.utcnow().isoformat('/'))) tmpLog.debug('start') # return for failure retFatal = self.SC_FATAL, inputChunk retTmpError = self.SC_FAILED, inputChunk # get primary site candidates sitePreAssigned = False excludeList = [] includeList = None scanSiteList = [] # get list of site access siteAccessList = self.taskBufferIF.listSiteAccess( None, taskSpec.userName) siteAccessMap = {} for tmpSiteName, tmpAccess in siteAccessList: siteAccessMap[tmpSiteName] = tmpAccess # site limitation if taskSpec.useLimitedSites(): if 'excludedSite' in taskParamMap: excludeList = taskParamMap['excludedSite'] # str to list for task retry try: if type(excludeList) != types.ListType: excludeList = excludeList.split(',') except: pass if 'includedSite' in taskParamMap: includeList = taskParamMap['includedSite'] # str to list for task retry if includeList == '': includeList = None try: if type(includeList) != types.ListType: includeList = includeList.split(',') except: pass # loop over all sites for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems(): if tmpSiteSpec.type == 'analysis': scanSiteList.append(siteName) # preassigned if not taskSpec.site in ['', None]: # site is pre-assigned tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site)) sitePreAssigned = True if not taskSpec.site in scanSiteList: scanSiteList.append(taskSpec.site) tmpLog.debug('initial {0} candidates'.format(len(scanSiteList))) # allowed remote access protocol allowedRemoteProtocol = 'fax' # MP if taskSpec.coreCount != None and taskSpec.coreCount > 1: # use MCORE only useMP = 'only' elif taskSpec.coreCount == 0: # use MCORE and normal useMP = 'any' else: # not use MCORE useMP = 'unuse' ###################################### # selection for status newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check site status skipFlag = False if tmpSiteSpec.status in ['offline']: skipFlag = True elif tmpSiteSpec.status in ['brokeroff', 'test']: if not sitePreAssigned: skipFlag = True elif tmpSiteName != taskSpec.site: skipFlag = True if not skipFlag: newScanSiteList.append(tmpSiteName) else: tmpLog.debug( ' skip site=%s due to status=%s criteria=-status' % (tmpSiteName, tmpSiteSpec.status)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed site status check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for MP if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \ (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]): newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \ (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for useMP={1}'.format( len(scanSiteList), useMP)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to 
logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for release if taskSpec.transHome != None: if taskSpec.transHome.startswith('ROOT'): # hack until x86_64-slc6-gcc47-opt is published in installedsw if taskSpec.architecture == 'x86_64-slc6-gcc47-opt': tmpCmtConfig = 'x86_64-slc6-gcc46-opt' else: tmpCmtConfig = taskSpec.architecture siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, cmtConfig=tmpCmtConfig, onlyCmtConfig=True) elif 'AthAnalysis' in taskSpec.transHome or re.search( 'Ath[a-zA-Z]+Base', taskSpec.transHome) != None: # AthAnalysis siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, cmtConfig=taskSpec.architecture, onlyCmtConfig=True) else: # remove AnalysisTransforms- transHome = re.sub('^[^-]+-*', '', taskSpec.transHome) transHome = re.sub('_', '-', transHome) if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \ re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None : # cache is checked siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, caches=transHome, cmtConfig=taskSpec.architecture) elif transHome == '' and taskSpec.transUses != None: # remove Atlas- transUses = taskSpec.transUses.split('-')[-1] # release is checked siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, releases=transUses, cmtConfig=taskSpec.architecture) else: # nightlies siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, releases='CVMFS') newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # release check is disabled or release is available if tmpSiteSpec.releases == ['ANY']: newScanSiteList.append(tmpSiteName) elif tmpSiteName in siteListWithSW: newScanSiteList.append(tmpSiteName) else: # release is unavailable tmpLog.debug(' skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \ (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format( len(scanSiteList), taskSpec.transUses, taskSpec.transHome, taskSpec.architecture)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for memory minRamCount = inputChunk.getMaxRamCount() minRamCount = JediCoreUtils.compensateRamCount(minRamCount) if not minRamCount in [0, None]: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # site max memory requirement if not tmpSiteSpec.maxrss in [0, None]: site_maxmemory = tmpSiteSpec.maxrss else: site_maxmemory = tmpSiteSpec.maxmemory if not site_maxmemory in [ 0, None ] and minRamCount != 0 and minRamCount > site_maxmemory: tmpLog.debug( ' skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory' .format(tmpSiteName, site_maxmemory, minRamCount)) continue # site min memory requirement if not tmpSiteSpec.minrss in [0, None]: site_minmemory = tmpSiteSpec.minrss else: site_minmemory = tmpSiteSpec.minmemory if not site_minmemory in [ 0, None ] and minRamCount != 0 and minRamCount < site_minmemory: tmpLog.debug( ' skip site={0} due to job RAM shortage. 
site_minmemory={1} > job_minramcount={2} criteria=-highmemory' .format(tmpSiteName, site_minmemory, minRamCount)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format( len(scanSiteList), minRamCount, taskSpec.ramUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for scratch disk tmpMaxAtomSize = inputChunk.getMaxAtomSize() tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True) tmpOutDiskSize = taskSpec.getOutDiskSize() tmpWorkDiskSize = taskSpec.getWorkDiskSize() minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize minDiskCountS = minDiskCountS / 1024 / 1024 # size for direct IO sites if taskSpec.useLocalIO(): minDiskCountR = minDiskCountS else: minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize minDiskCountR = minDiskCountR / 1024 / 1024 tmpLog.debug( 'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}' .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize, tmpWorkDiskSize)) tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format( minDiskCountS, minDiskCountR)) newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxwdir != 0: if tmpSiteSpec.isDirectIO(): minDiskCount = minDiskCountR else: minDiskCount = minDiskCountS if minDiskCount > tmpSiteSpec.maxwdir: tmpLog.debug( ' skip site={0} due to small scratch disk={1} < {2} criteria=-disk' .format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed scratch disk check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for available space in SE newScanSiteList = [] for tmpSiteName in scanSiteList: # check endpoint tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint( tmpSiteSpec.ddm) if tmpEndPoint is not None: # free space must be >= 200GB diskThreshold = 200 tmpSpaceSize = 0 if tmpEndPoint['space_expired'] is not None: tmpSpaceSize += tmpEndPoint['space_expired'] if tmpEndPoint['space_free'] is not None: tmpSpaceSize += tmpEndPoint['space_free'] if tmpSpaceSize < diskThreshold: tmpLog.debug( ' skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk' .format(tmpSiteName, tmpSpaceSize, diskThreshold)) continue # check if blacklisted if tmpEndPoint['blacklisted'] == 'Y': tmpLog.debug( ' skip site={0} since {1} is blacklisted in DDM criteria=-blacklist' .format(tmpSiteName, tmpSiteSpec.ddm)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed SE space check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for walltime minWalltime = taskSpec.walltime if not minWalltime in [0, None] and minWalltime > 0: minWalltime *= tmpEffAtomSize newScanSiteList = [] for 
tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime: tmpLog.debug( ' skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime' .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime)) continue if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime: tmpLog.debug( ' skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime' .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format( len(scanSiteList), minWalltime, taskSpec.walltimeUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for nPilot nWNmap = self.taskBufferIF.getCurrentSiteData() newScanSiteList = [] for tmpSiteName in scanSiteList: # check at the site nPilot = 0 if nWNmap.has_key(tmpSiteName): nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][ 'updateJob'] if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']: tmpLog.debug( ' skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName) if not self.testMode: continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed pilot activity check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # check inclusion and exclusion newScanSiteList = [] sitesForANY = [] for tmpSiteName in scanSiteList: autoSite = False # check exclusion if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList): tmpLog.debug( ' skip site={0} excluded criteria=-excluded'.format( tmpSiteName)) continue # check inclusion if includeList != None and not AtlasBrokerUtils.isMatched( tmpSiteName, includeList): if 'AUTO' in includeList: autoSite = True else: tmpLog.debug( ' skip site={0} not included criteria=-notincluded'. format(tmpSiteName)) continue tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # limited access if tmpSiteSpec.accesscontrol == 'grouplist': if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \ siteAccessMap[tmpSiteSpec.sitename] != 'approved': tmpLog.debug( ' skip site={0} limited access criteria=-limitedaccess' .format(tmpSiteName)) continue # check cloud if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]: tmpLog.debug( ' skip site={0} cloud mismatch criteria=-cloudmismatch'. format(tmpSiteName)) continue if autoSite: sitesForANY.append(tmpSiteName) else: newScanSiteList.append(tmpSiteName) # use AUTO sites if no sites are included if newScanSiteList == []: newScanSiteList = sitesForANY else: for tmpSiteName in sitesForANY: tmpLog.debug( ' skip site={0} not included criteria=-notincluded'. 
format(tmpSiteName)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for data availability hasDDS = False dataWeight = {} remoteSourceList = {} if inputChunk.getDatasets() != []: oldScanSiteList = copy.copy(scanSiteList) for datasetSpec in inputChunk.getDatasets(): datasetName = datasetSpec.datasetName if not self.dataSiteMap.has_key(datasetName): # get the list of sites where data is available tmpLog.debug( 'getting the list of sites where {0} is available'. format(datasetName)) tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData( scanSiteList, self.siteMapper, self.ddmIF, datasetName) if tmpSt in [ Interaction.JEDITemporaryError, Interaction.JEDITimeoutError ]: tmpLog.error( 'temporary failed to get the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError if tmpSt == Interaction.JEDIFatalError: tmpLog.error( 'fatal error when getting the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal # append self.dataSiteMap[datasetName] = tmpRet if datasetName.startswith('ddo'): tmpLog.debug(' {0} sites'.format(len(tmpRet))) else: tmpLog.debug(' {0} sites : {1}'.format( len(tmpRet), str(tmpRet))) # check if distributed if tmpRet != {}: isDistributed = True for tmpMap in tmpRet.values(): for tmpVal in tmpMap.values(): if tmpVal['state'] == 'complete': isDistributed = False break if not isDistributed: break if isDistributed: # check if really distributed isDistributed = self.ddmIF.isDistributedDataset( datasetName) if isDistributed: hasDDS = True datasetSpec.setDistributed() tmpLog.debug(' {0} is distributed'.format( datasetName)) # check if the data is available at somewhere if self.dataSiteMap[datasetName] == {}: tmpLog.error( '{0} is unavailable at any site'.format(datasetName)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal # get the list of sites where data is available scanSiteList = None scanSiteListOnDisk = None normFactor = 0 for datasetName, tmpDataSite in self.dataSiteMap.iteritems(): normFactor += 1 # get sites where replica is available tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk( tmpDataSite, includeTape=True) tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk( tmpDataSite, includeTape=False) # get sites which can remotely access source sites if inputChunk.isMerging: # disable remote access for merging tmpSatelliteSites = {} elif (not sitePreAssigned) or ( sitePreAssigned and not taskSpec.site in tmpSiteList): tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites( tmpDiskSiteList, self.taskBufferIF, self.siteMapper, nSites=50, protocol=allowedRemoteProtocol) else: tmpSatelliteSites = {} # make weight map for local for tmpSiteName in tmpSiteList: if not dataWeight.has_key(tmpSiteName): dataWeight[tmpSiteName] = 0 # give more weight to disk if tmpSiteName in tmpDiskSiteList: dataWeight[tmpSiteName] += 1 else: dataWeight[tmpSiteName] += 0.001 # make weight map for remote for tmpSiteName, tmpWeightSrcMap in 
tmpSatelliteSites.iteritems( ): # skip since local data is available if tmpSiteName in tmpSiteList: continue tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # negative weight for remote access wRemote = 50.0 if not tmpSiteSpec.wansinklimit in [0, None]: wRemote /= float(tmpSiteSpec.wansinklimit) # sum weight if not dataWeight.has_key(tmpSiteName): dataWeight[tmpSiteName] = float( tmpWeightSrcMap['weight']) / wRemote else: dataWeight[tmpSiteName] += float( tmpWeightSrcMap['weight']) / wRemote # make remote source list if not remoteSourceList.has_key(tmpSiteName): remoteSourceList[tmpSiteName] = {} remoteSourceList[tmpSiteName][ datasetName] = tmpWeightSrcMap['source'] # first list if scanSiteList == None: scanSiteList = [] for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys(): if not tmpSiteName in oldScanSiteList: continue if not tmpSiteName in scanSiteList: scanSiteList.append(tmpSiteName) scanSiteListOnDisk = set() for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys( ): if not tmpSiteName in oldScanSiteList: continue scanSiteListOnDisk.add(tmpSiteName) continue # pickup sites which have all data newScanList = [] for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys(): if tmpSiteName in scanSiteList and not tmpSiteName in newScanList: newScanList.append(tmpSiteName) scanSiteList = newScanList tmpLog.debug('{0} is available at {1} sites'.format( datasetName, len(scanSiteList))) # pickup sites which have all data on DISK newScanListOnDisk = set() for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys(): if tmpSiteName in scanSiteListOnDisk: newScanListOnDisk.add(tmpSiteName) scanSiteListOnDisk = newScanListOnDisk tmpLog.debug('{0} is available at {1} sites on DISK'.format( datasetName, len(scanSiteListOnDisk))) # check for preassigned if sitePreAssigned and not taskSpec.site in scanSiteList: scanSiteList = [] tmpLog.debug( 'data is unavailable locally or remotely at preassigned site {0}' .format(taskSpec.site)) elif len(scanSiteListOnDisk) > 0: # use only disk sites scanSiteList = list(scanSiteListOnDisk) tmpLog.debug('{0} candidates have input data'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal ###################################### # sites already used by task tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI( taskSpec.jediTaskID) if not tmpSt: tmpLog.error('failed to get sites which already used by task') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # calculate weight fqans = taskSpec.makeFQANs() """ tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans, taskSpec.workingGroup,True) currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight) currentPriority -= 500 tmpLog.debug('currentPriority={0}'.format(currentPriority)) """ tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI( taskSpec.vo, taskSpec.prodSourceLabel) if not tmpSt: tmpLog.error('failed to get job statistics with priority') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # check for preassigned if sitePreAssigned and not taskSpec.site in scanSiteList: tmpLog.debug("preassigned site {0} did not pass all tests".format( 
taskSpec.site)) tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal ###################################### # final procedure tmpLog.debug('final {0} candidates'.format(len(scanSiteList))) weightMap = {} candidateSpecList = [] timeWindowForFC = 6 preSiteCandidateSpec = None failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI( taskSpec.jediTaskID, timeWindowForFC) problematicSites = set() for tmpSiteName in scanSiteList: # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'running', None, None) nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'defined', None, None) nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \ AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None) nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'starting', None, None) nFailed = 0 nClosed = 0 nFinished = 0 if tmpSiteName in failureCounts: if 'failed' in failureCounts[tmpSiteName]: nFailed = failureCounts[tmpSiteName]['failed'] if 'closed' in failureCounts[tmpSiteName]: nClosed = failureCounts[tmpSiteName]['closed'] if 'finished' in failureCounts[tmpSiteName]: nFinished = failureCounts[tmpSiteName]['finished'] # problematic sites if nFailed + nClosed > 2 * nFinished: problematicSites.add(tmpSiteName) # calculate weight weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + 1) nThrottled = 0 if remoteSourceList.has_key(tmpSiteName): nThrottled = AtlasBrokerUtils.getNumJobs( jobStatPrioMap, tmpSiteName, 'throttled', None, None) weight /= float(nThrottled + 1) # normalize weights by taking data availability into account tmpDataWeight = 1 if dataWeight.has_key(tmpSiteName): weight = weight * dataWeight[tmpSiteName] tmpDataWeight = dataWeight[tmpSiteName] # make candidate siteCandidateSpec = SiteCandidate(tmpSiteName) # preassigned if sitePreAssigned and tmpSiteName == taskSpec.site: preSiteCandidateSpec = siteCandidateSpec # set weight siteCandidateSpec.weight = weight tmpStr = ' site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format( tmpSiteName, nRunning, nAssigned, nActivated, nStarting) tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format( nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight) tmpLog.debug(tmpStr) # append if tmpSiteName in sitesUsedByTask: candidateSpecList.append(siteCandidateSpec) else: if not weightMap.has_key(weight): weightMap[weight] = [] weightMap[weight].append(siteCandidateSpec) # sort candidates by weights weightList = weightMap.keys() weightList.sort() weightList.reverse() for weightVal in weightList: sitesWithWeight = weightMap[weightVal] random.shuffle(sitesWithWeight) candidateSpecList += sitesWithWeight # limit the number of sites.
use all sites for distributed datasets if not hasDDS: maxNumSites = 10 # remove problematic sites candidateSpecList = AtlasBrokerUtils.skipProblematicSites( candidateSpecList, problematicSites, sitesUsedByTask, preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog) # append preassigned if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList: candidateSpecList.append(preSiteCandidateSpec) # collect site names scanSiteList = [] for siteCandidateSpec in candidateSpecList: scanSiteList.append(siteCandidateSpec.siteName) # get list of available files availableFileMap = {} for datasetSpec in inputChunk.getDatasets(): try: # get list of site to be scanned fileScanSiteList = [] for tmpSiteName in scanSiteList: fileScanSiteList.append(tmpSiteName) if remoteSourceList.has_key( tmpSiteName ) and remoteSourceList[tmpSiteName].has_key( datasetSpec.datasetName): for tmpRemoteSite in remoteSourceList[tmpSiteName][ datasetSpec.datasetName]: if not tmpRemoteSite in fileScanSiteList: fileScanSiteList.append(tmpRemoteSite) # mapping between sites and storage endpoints siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap( fileScanSiteList, self.siteMapper) # disable file lookup for merge jobs if inputChunk.isMerging: checkCompleteness = False else: checkCompleteness = True # get available files per site/endpoint tmpAvFileMap = self.ddmIF.getAvailableFiles( datasetSpec, siteStorageEP, self.siteMapper, ngGroup=[2], checkCompleteness=checkCompleteness) if tmpAvFileMap == None: raise Interaction.JEDITemporaryError, 'ddmIF.getAvailableFiles failed' availableFileMap[datasetSpec.datasetName] = tmpAvFileMap except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('failed to get available files with %s %s' % (errtype.__name__, errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # append candidates newScanSiteList = [] for siteCandidateSpec in candidateSpecList: tmpSiteName = siteCandidateSpec.siteName # preassigned if sitePreAssigned and tmpSiteName != taskSpec.site: tmpLog.debug( ' skip site={0} non pre-assigned site criteria=-nonpreassigned' .format(tmpSiteName)) continue # set available files if inputChunk.getDatasets() == []: isAvailable = True else: isAvailable = False for tmpDatasetName, availableFiles in availableFileMap.iteritems(): tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName) # check remote files if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[ tmpSiteName].has_key(tmpDatasetName): for tmpRemoteSite in remoteSourceList[tmpSiteName][ tmpDatasetName]: if availableFiles.has_key(tmpRemoteSite) and \ len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']): # use only remote disk files siteCandidateSpec.remoteFiles += availableFiles[ tmpRemoteSite]['localdisk'] # set remote site and access protocol siteCandidateSpec.remoteProtocol = allowedRemoteProtocol siteCandidateSpec.remoteSource = tmpRemoteSite isAvailable = True break # local files if availableFiles.has_key(tmpSiteName): if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \ len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \ len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \ (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0): siteCandidateSpec.localDiskFiles += availableFiles[ tmpSiteName]['localdisk'] # add cached files to local list since cached 
files go to pending when reassigned siteCandidateSpec.localDiskFiles += availableFiles[ tmpSiteName]['cache'] siteCandidateSpec.localTapeFiles += availableFiles[ tmpSiteName]['localtape'] siteCandidateSpec.cacheFiles += availableFiles[ tmpSiteName]['cache'] siteCandidateSpec.remoteFiles += availableFiles[ tmpSiteName]['remote'] siteCandidateSpec.addAvailableFiles( availableFiles[tmpSiteName]['all']) isAvailable = True else: tmpMsg = '{0} is incomplete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}' tmpLog.debug( tmpMsg.format( tmpDatasetName, tmpSiteName, len(tmpDatasetSpec.Files), len(availableFiles[tmpSiteName]['localdisk']), len(availableFiles[tmpSiteName]['cache']), len(availableFiles[tmpSiteName]['localtape']), )) if not isAvailable: break # append if not isAvailable: tmpLog.debug( ' skip site={0} file unavailable criteria=-fileunavailable' .format(siteCandidateSpec.siteName)) continue inputChunk.addSiteCandidate(siteCandidateSpec) newScanSiteList.append(siteCandidateSpec.siteName) tmpLog.debug( ' use site={0} with weight={1} nLocalDisk={2} nLocalTape={3} nCache={4} nRemote={5} criteria=+use' .format( siteCandidateSpec.siteName, siteCandidateSpec.weight, len(siteCandidateSpec.localDiskFiles), len(siteCandidateSpec.localTapeFiles), len(siteCandidateSpec.cacheFiles), len(siteCandidateSpec.remoteFiles), )) scanSiteList = newScanSiteList if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # send info to logger self.sendLogMessage(tmpLog) # return tmpLog.debug('done') return self.SC_SUCCEEDED, inputChunk
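# --------------------------------------------------------------------------
# A condensed sketch of how the analysis brokerage above ranks the surviving
# sites. Assuming the per-site job counts and the data-locality weight have
# already been collected, the weight is roughly
#     w = (nRunning + 1) / (nActivated + nAssigned + nStarting + 1)
#         / (nThrottled + 1) * dataWeight
# and candidates are ordered by descending weight, with ties shuffled.
# Slightly simplified: in the real code nActivated also folds in throttled
# jobs, the throttle factor only applies when remote sources are used, and
# sites already used by the task are prepended to the ranked list.
# compute_site_weight()/rank_sites() are illustrative helpers, not part of
# the real AtlasBrokerUtils/SiteCandidate API.

import random

def compute_site_weight(counts, data_weight=1.0):
    # counts is a dict of job counts per status for one site
    w = float(counts.get('running', 0) + 1) / \
        float(counts.get('activated', 0) + counts.get('defined', 0) +
              counts.get('starting', 0) + 1)
    w /= float(counts.get('throttled', 0) + 1)
    return w * data_weight

def rank_sites(site_counts, data_weights):
    # group sites by weight, then emit groups from heaviest to lightest
    weight_map = {}
    for site, counts in site_counts.items():
        w = compute_site_weight(counts, data_weights.get(site, 1.0))
        weight_map.setdefault(w, []).append(site)
    ranked = []
    for w in sorted(weight_map.keys(), reverse=True):
        group = weight_map[w]
        random.shuffle(group)  # break ties randomly, as the real code does
        ranked += group
    return ranked

# usage: a busy site with good data locality outranks an idle one without data
_counts = {'SITE_A': {'running': 50, 'activated': 10},
           'SITE_B': {'running': 0, 'activated': 0}}
print(rank_sites(_counts, {'SITE_A': 1.0, 'SITE_B': 0.001}))
# --------------------------------------------------------------------------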