def doFinalProcedure(self, taskSpec, tmpLog):
    """Send the final e-mail notification for a task unless it is suppressed.

    The recipient is looked up with ``self.getEmail`` and the task parameters
    are (re)loaded into ``self.taskParamMap``.  No mail is sent when there is
    no recipient or when the task parameters contain ``noEmail == True``.

    :param taskSpec: task specification; ``userName``, ``vo`` and
        ``jediTaskID`` are read here.
    :param tmpLog: logger used for info/error messages.
    :return: ``self.SC_SUCCEEDED`` in every case.
    """
    # check email address
    toAdd = self.getEmail(taskSpec.userName, taskSpec.vo, tmpLog)
    # read task parameters
    try:
        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(taskSpec.jediTaskID)
        self.taskParamMap = RefinerUtils.decodeJSON(taskParam)
    except:
        # best effort: a failure is only logged and the notification logic
        # below still runs
        # NOTE(review): self.taskParamMap keeps whatever value it had before
        # this call when decoding fails -- confirm that is intended
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__, errvalue))
    paramMap = self.taskParamMap
    # "in" replaces dict.has_key, which does not exist on Python 3 dicts
    noEmail = paramMap is not None and 'noEmail' in paramMap and paramMap['noEmail'] == True
    if toAdd is None or noEmail:
        tmpLog.info('email notification is suppressed')
    else:
        # send email notification
        fromAdd = self.senderAddress()
        msgBody = self.composeMessage(taskSpec, fromAdd, toAdd)
        self.sendMail(taskSpec.jediTaskID, fromAdd, toAdd, msgBody, 3, False, tmpLog)
    return self.SC_SUCCEEDED
def doFinalProcedure(self, taskSpec, tmpLog):
    """Send the final e-mail notification for a task unless it is suppressed.

    The recipient is looked up with ``self.getEmail`` and the task parameters
    are (re)loaded into ``self.taskParamMap``.  No mail is sent when there is
    no recipient or when the task parameters contain ``noEmail == True``; the
    suppression is reported at debug level.

    :param taskSpec: task specification; ``userName``, ``vo`` and
        ``jediTaskID`` are read here.
    :param tmpLog: logger used for debug/error messages.
    :return: ``self.SC_SUCCEEDED`` in every case.
    """
    # check email address
    toAdd = self.getEmail(taskSpec.userName, taskSpec.vo, tmpLog)
    # read task parameters
    try:
        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(taskSpec.jediTaskID)
        self.taskParamMap = RefinerUtils.decodeJSON(taskParam)
    except:
        # best effort: a failure is only logged and the notification logic
        # below still runs
        # NOTE(review): self.taskParamMap keeps whatever value it had before
        # this call when decoding fails -- confirm that is intended
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__, errvalue))
    paramMap = self.taskParamMap
    # "in" replaces dict.has_key, which does not exist on Python 3 dicts
    noEmail = paramMap is not None and 'noEmail' in paramMap and paramMap['noEmail'] == True
    if toAdd is None or noEmail:
        tmpLog.debug('email notification is suppressed')
    else:
        # send email notification
        fromAdd = self.senderAddress()
        msgBody = self.composeMessage(taskSpec, fromAdd, toAdd)
        self.sendMail(taskSpec.jediTaskID, fromAdd, toAdd, msgBody, 3, False, tmpLog)
    return self.SC_SUCCEEDED
def runImpl(self):
    """Feed input-file records into the contents table for batches of tasks.

    Pulls up to 10 ``(jediTaskID, datasetSpecs)`` items at a time from
    ``self.taskDsList`` until it is empty.  For every task the task
    parameters are decoded, each dataset's metadata and file list are
    obtained (from DDM, or synthesized for pseudo / seq_number / PFN-list
    datasets) and passed to ``insertFilesForDataset_JEDI``.  Depending on the
    outcome the task is marked ``tobroken``, put on hold, updated, or simply
    unlocked.

    Returns ``None`` when the list is exhausted.  Any exception escaping the
    per-batch processing is logged through the module-level ``logger`` and
    the loop continues with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskDsList = self.taskDsList.get(nTasks)
            # no more datasets
            if len(taskDsList) == 0:
                self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                return
            # loop over all tasks
            for jediTaskID, dsList in taskDsList:
                allUpdated = True
                taskBroken = False
                taskOnHold = False
                runningTask = False
                missingMap = {}
                datasetsIdxConsistency = []
                # get task
                tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, self.pid, 10)
                if not tmpStat or taskSpec is None:
                    self.logger.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                    continue
                # make logger
                try:
                    gshare = '_'.join(taskSpec.gshare.split(' '))
                except:
                    gshare = 'Undefined'
                tmpLog = MsgWrapper(self.logger, '<jediTaskID={0} gshare={1}>'.format(jediTaskID, gshare))
                # start from an empty map for every task so that a decode
                # failure neither raises NameError below nor reuses the
                # parameters of the previously processed task
                taskParamMap = {}
                try:
                    # get task parameters
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__, errvalue))
                    taskBroken = True
                # renaming of parameters
                if 'nEventsPerInputFile' in taskParamMap:
                    taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                # the number of files per job
                nFilesPerJob = taskSpec.getNumFilesPerJob()
                # the number of chunks used by scout
                nChunksForScout = 10
                # load XML (skipped for a broken task: its parameters are unavailable)
                if not taskBroken and taskSpec.useLoadXML():
                    xmlConfig = taskParamMap['loadXML']
                else:
                    xmlConfig = None
                # skip files used by another task
                if 'skipFilesUsedBy' in taskParamMap:
                    skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                else:
                    skipFilesUsedBy = None
                # check no wait
                noWaitParent = False
                parentOutDatasets = set()
                if taskSpec.noWaitParent() and taskSpec.parent_tid not in [None, taskSpec.jediTaskID]:
                    tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                    if tmpStat == 'running':
                        noWaitParent = True
                        # get output datasets from parent task
                        tmpParentStat, tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.parent_tid,
                                                                                                               ['output', 'log'])
                        # collect dataset names
                        for tmpParentOutDataset in tmpParentOutDatasets:
                            parentOutDatasets.add(tmpParentOutDataset.datasetName)
                # loop over all datasets
                nFilesMaster = 0
                checkedMaster = False
                setFrozenTime = True
                if not taskBroken:
                    ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                    origNumFiles = None
                    if 'nFiles' in taskParamMap:
                        origNumFiles = taskParamMap['nFiles']
                    for datasetSpec in dsList:
                        tmpLog.debug('start loop for {0}(id={1})'.format(datasetSpec.datasetName, datasetSpec.datasetID))
                        # index consistency
                        if datasetSpec.indexConsistent():
                            datasetsIdxConsistency.append(datasetSpec.datasetID)
                        # get dataset metadata
                        tmpLog.debug('get metadata')
                        stateUpdateTime = datetime.datetime.utcnow()
                        try:
                            if not datasetSpec.isPseudo():
                                tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                            else:
                                # dummy metadata for pseudo dataset
                                tmpMetadata = {'state':'closed'}
                            # set mutable when the dataset is open and parent is running
                            # or task is configured to run until the dataset is closed
                            if (noWaitParent or taskSpec.runUntilClosed()) and \
                                    (tmpMetadata['state'] == 'open' \
                                         or datasetSpec.datasetName in parentOutDatasets \
                                         or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                # dummy metadata when parent is running
                                tmpMetadata = {'state':'mutable'}
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(self.__class__.__name__,
                                                                                        errtype.__name__, errvalue))
                            if errtype == Interaction.JEDIFatalError:
                                # fatal error
                                datasetStatus = 'broken'
                                taskBroken = True
                                # update dataset status
                                self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            else:
                                if not taskSpec.ignoreMissingInDS():
                                    # temporary error
                                    taskOnHold = True
                                else:
                                    # ignore missing
                                    datasetStatus = 'failed'
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                            if not taskSpec.ignoreMissingInDS():
                                allUpdated = False
                        else:
                            # get file list specified in task parameters
                            fileList, includePatt, excludePatt = RefinerUtils.extractFileList(taskParamMap, datasetSpec.datasetName)
                            # get the number of events in metadata
                            getNumEvents = 'getNumEventsInMetadata' in taskParamMap
                            # get file list from DDM
                            tmpLog.debug('get files')
                            try:
                                skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                if not datasetSpec.isPseudo():
                                    if fileList != [] and 'useInFilesInContainer' in taskParamMap and \
                                            datasetSpec.containerName not in ['', None]:
                                        # read files from container if file list is specified in task parameters
                                        tmpDatasetName = datasetSpec.containerName
                                    else:
                                        tmpDatasetName = datasetSpec.datasetName
                                    # use long format for LB
                                    longFormat = False
                                    if taskSpec.respectLumiblock() or taskSpec.orderByLB():
                                        longFormat = True
                                    tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                     getNumEvents=getNumEvents,
                                                                     skipDuplicate=skipDuplicate,
                                                                     longFormat=longFormat
                                                                     )
                                    tmpLog.debug('got {0} files in {1}'.format(len(tmpRet), tmpDatasetName))
                                    # remove lost files
                                    tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                    if tmpLostFiles != {}:
                                        tmpLog.debug('found {0} lost files in {1}'.format(len(tmpLostFiles), tmpDatasetName))
                                        for tmpListGUID, tmpLostLFN in tmpLostFiles.items():
                                            tmpLog.debug('removed {0}'.format(tmpLostLFN))
                                            del tmpRet[tmpListGUID]
                                else:
                                    if datasetSpec.isSeqNumber():
                                        # make dummy files for seq_number
                                        # (floor division keeps the counts integral so that
                                        #  range() below accepts them)
                                        if datasetSpec.getNumRecords() is not None:
                                            nPFN = datasetSpec.getNumRecords()
                                        elif origNumFiles is not None:
                                            nPFN = origNumFiles
                                            if 'nEventsPerJob' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                nPFN = nPFN * taskParamMap['nEventsPerFile'] // taskParamMap['nEventsPerJob']
                                            elif 'nEventsPerFile' in taskParamMap and 'nEventsPerRange' in taskParamMap:
                                                nPFN = nPFN * taskParamMap['nEventsPerFile'] // taskParamMap['nEventsPerRange']
                                        elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                            nPFN = taskParamMap['nEvents'] // taskParamMap['nEventsPerJob']
                                        elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                and taskSpec.getNumFilesPerJob() is not None:
                                            nPFN = taskParamMap['nEvents'] // taskParamMap['nEventsPerFile'] // taskSpec.getNumFilesPerJob()
                                        else:
                                            # the default number of records for seq_number
                                            seqDefNumRecords = 10000
                                            # get nFiles of the master
                                            tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(datasetSpec.jediTaskID,
                                                                                                       datasetSpec.masterID,
                                                                                                       ['nFiles'])
                                            # use nFiles of the master as the number of records if it is larger than the default
                                            if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords:
                                                nPFN = tmpMasterAtt['nFiles']
                                            else:
                                                nPFN = seqDefNumRecords
                                            # check usedBy
                                            if skipFilesUsedBy is not None:
                                                for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                    tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(tmpJediTaskID,
                                                                                                                      {'datasetName':datasetSpec.datasetName},
                                                                                                                      ['nFiles'])
                                                    if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                        nPFN += tmpParentAtt['nFiles']
                                        tmpRet = {}
                                        # get offset
                                        tmpOffset = datasetSpec.getOffset()
                                        tmpOffset += 1
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {'lfn':iPFN+tmpOffset,
                                                                         'scope':None,
                                                                         'filesize':0,
                                                                         'checksum':None,
                                                                         }
                                    elif not taskSpec.useListPFN():
                                        # dummy file list for pseudo dataset
                                        tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn',
                                                                     'scope':None,
                                                                     'filesize':0,
                                                                     'checksum':None,
                                                                     }
                                                  }
                                    else:
                                        # make dummy file list for PFN list
                                        if 'nFiles' in taskParamMap:
                                            nPFN = taskParamMap['nFiles']
                                        else:
                                            nPFN = 1
                                        tmpRet = {}
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                         'scope':None,
                                                                         'filesize':0,
                                                                         'checksum':None,
                                                                         }
                            except:
                                errtype, errvalue = sys.exc_info()[:2]
                                tmpLog.error('failed to get files due to {0}:{1} {2}'.format(self.__class__.__name__,
                                                                                             errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                else:
                                    # temporary error
                                    taskOnHold = True
                                taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                allUpdated = False
                            else:
                                # parameters for master input
                                respectLB = False
                                useRealNumEvents = False
                                if datasetSpec.isMaster():
                                    # respect LB boundaries
                                    respectLB = taskSpec.respectLumiblock()
                                    # use real number of events
                                    useRealNumEvents = taskSpec.useRealNumEvents()
                                # the number of events per file
                                nEventsPerFile = None
                                nEventsPerJob = None
                                nEventsPerRange = None
                                tgtNumEventsPerJob = None
                                if (datasetSpec.isMaster() and ('nEventsPerFile' in taskParamMap or useRealNumEvents)) or \
                                        (datasetSpec.isPseudo() and 'nEvents' in taskParamMap and not datasetSpec.isSeqNumber()):
                                    if 'nEventsPerFile' in taskParamMap:
                                        nEventsPerFile = taskParamMap['nEventsPerFile']
                                    elif datasetSpec.isMaster() and datasetSpec.isPseudo() and 'nEvents' in taskParamMap:
                                        # use nEvents as nEventsPerFile for pseudo input
                                        nEventsPerFile = taskParamMap['nEvents']
                                    if 'nEventsPerJob' in taskParamMap:
                                        nEventsPerJob = taskParamMap['nEventsPerJob']
                                    elif 'nEventsPerRange' in taskParamMap:
                                        nEventsPerRange = taskParamMap['nEventsPerRange']
                                    if 'tgtNumEventsPerJob' in taskParamMap:
                                        tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                        # reset nEventsPerJob
                                        nEventsPerJob = None
                                # max attempts
                                maxAttempt = None
                                maxFailure = None
                                if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                    # max attempts
                                    if taskSpec.disableAutoRetry():
                                        # disable auto retry
                                        maxAttempt = 1
                                    elif 'maxAttempt' in taskParamMap:
                                        maxAttempt = taskParamMap['maxAttempt']
                                    else:
                                        # use default value
                                        maxAttempt = 3
                                    # max failure
                                    if 'maxFailure' in taskParamMap:
                                        maxFailure = taskParamMap['maxFailure']
                                # first event number
                                firstEventNumber = None
                                if datasetSpec.isMaster():
                                    # first event number
                                    firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                # nMaxEvents
                                nMaxEvents = None
                                if datasetSpec.isMaster() and 'nEvents' in taskParamMap:
                                    nMaxEvents = taskParamMap['nEvents']
                                # nMaxFiles
                                nMaxFiles = None
                                if 'nFiles' in taskParamMap:
                                    if datasetSpec.isMaster():
                                        nMaxFiles = taskParamMap['nFiles']
                                    else:
                                        # calculate for secondary
                                        nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                        # multiplied by the number of jobs per file for event-level splitting
                                        if nMaxFiles is not None and 'nEventsPerFile' in taskParamMap:
                                            if 'nEventsPerJob' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                            elif 'nEventsPerRange' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                # use scout
                                useScout = False
                                if datasetSpec.isMaster() and taskSpec.useScout() and (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                    useScout = True
                                # use files with new attempt numbers
                                useFilesWithNewAttemptNr = False
                                if not datasetSpec.isPseudo() and fileList != [] and 'useInFilesWithNewAttemptNr' in taskParamMap:
                                    useFilesWithNewAttemptNr = True
                                # ramCount
                                ramCount = 0
                                # skip short input
                                if datasetSpec.isMaster() and not datasetSpec.isPseudo() \
                                        and nEventsPerFile is not None and nEventsPerJob is not None \
                                        and nEventsPerFile >= nEventsPerJob \
                                        and 'skipShortInput' in taskParamMap and taskParamMap['skipShortInput'] == True:
                                    skipShortInput = True
                                else:
                                    skipShortInput = False
                                # feed files to the contents table
                                tmpLog.debug('update contents')
                                retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec, tmpRet,
                                                                                                                             tmpMetadata['state'],
                                                                                                                             stateUpdateTime,
                                                                                                                             nEventsPerFile,
                                                                                                                             nEventsPerJob,
                                                                                                                             maxAttempt,
                                                                                                                             firstEventNumber,
                                                                                                                             nMaxFiles,
                                                                                                                             nMaxEvents,
                                                                                                                             useScout,
                                                                                                                             fileList,
                                                                                                                             useFilesWithNewAttemptNr,
                                                                                                                             nFilesPerJob,
                                                                                                                             nEventsPerRange,
                                                                                                                             nChunksForScout,
                                                                                                                             includePatt,
                                                                                                                             excludePatt,
                                                                                                                             xmlConfig,
                                                                                                                             noWaitParent,
                                                                                                                             taskSpec.parent_tid,
                                                                                                                             self.pid,
                                                                                                                             maxFailure,
                                                                                                                             useRealNumEvents,
                                                                                                                             respectLB,
                                                                                                                             tgtNumEventsPerJob,
                                                                                                                             skipFilesUsedBy,
                                                                                                                             ramCount,
                                                                                                                             taskSpec,
                                                                                                                             skipShortInput)
                                if retDB == False:
                                    taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                     diagMap['errMsg']))
                                    allUpdated = False
                                    taskBroken = True
                                    break
                                elif retDB is None:
                                    # the dataset is locked by another or status is not applicable
                                    allUpdated = False
                                    tmpLog.debug('escape since task or dataset is locked')
                                    break
                                elif missingFileList != []:
                                    # files are missing
                                    tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList), datasetSpec.datasetName)
                                    tmpLog.debug(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    allUpdated = False
                                    taskOnHold = True
                                    missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec,
                                                                           'missingFiles':missingFileList}
                                else:
                                    # reduce the number of files to be read
                                    if 'nFiles' in taskParamMap:
                                        if datasetSpec.isMaster():
                                            taskParamMap['nFiles'] -= nFilesUnique
                                    # reduce the number of files for scout
                                    if useScout:
                                        nChunksForScout = diagMap['nChunksForScout']
                                    # number of master input files
                                    if datasetSpec.isMaster():
                                        checkedMaster = True
                                        nFilesMaster += nFilesUnique
                                # running task
                                if diagMap['isRunningTask']:
                                    runningTask = True
                                # no activated pending input for noWait
                                if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout <= 0) \
                                        and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                    tmpErrStr = 'insufficient inputs are ready. '
                                    tmpErrStr += diagMap['errMsg']
                                    tmpLog.debug(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    taskOnHold = True
                                    setFrozenTime = False
                                    break
                        tmpLog.debug('end loop')
                # no master input
                if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                    tmpErrStr = 'no master input files. input dataset is empty'
                    tmpLog.error(tmpErrStr)
                    taskSpec.setErrDiag(tmpErrStr, None)
                    if taskSpec.allowEmptyInput() or noWaitParent:
                        taskOnHold = True
                    else:
                        taskBroken = True
                # index consistency
                if not taskOnHold and not taskBroken and len(datasetsIdxConsistency) > 0:
                    self.taskBufferIF.removeFilesIndexInconsistent_JEDI(jediTaskID, datasetsIdxConsistency)
                # update task status
                if taskBroken:
                    # task is broken
                    taskSpec.status = 'tobroken'
                    tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec, pid=self.pid)
                # change task status unless the task is running
                if not runningTask:
                    if taskOnHold:
                        # go to pending state
                        if taskSpec.status not in ['broken', 'tobroken']:
                            taskSpec.setOnHold()
                        tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec, pid=self.pid, setFrozenTime=setFrozenTime)
                    elif allUpdated:
                        # all OK
                        allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, getTaskStatus=True, pid=self.pid,
                                                                                                    useWorldCloud=taskSpec.useWorldCloud())
                        tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                    # just unlock
                    retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID, self.pid)
                    tmpLog.debug('unlock not-running task with {0}'.format(retUnlock))
                else:
                    # just unlock
                    retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID, self.pid)
                    tmpLog.debug('unlock task with {0}'.format(retUnlock))
                tmpLog.debug('done')
        except:
            # catch-all so that one failing batch does not stop the loop;
            # note this reports through the module-level logger
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def runImpl(self):
    """Execute queued task commands in batches.

    Pulls up to 10 ``(jediTaskID, commandMap)`` items at a time from
    ``self.taskList`` until it is empty.  ``commandMap`` supplies
    ``command``, ``comment`` and ``oldStatus``.

    * ``kill`` / ``finish`` / ``reassign``: look up the task's jobs (at most
      twice).  When none are left the task record is updated; otherwise, on
      the first pass only, the jobs are killed, or just waited for when the
      comment contains ``soft finish``.
    * ``retry`` / ``incexec``: for ``incexec`` the stored task parameters are
      merged with the JSON in the comment first, then ``retryTask_JEDI`` is
      called.
    * anything else is logged as an unknown command.

    Returns ``None`` when the list is exhausted.  Any exception escaping the
    per-batch processing is logged (with traceback) through the module-level
    ``logger`` and the loop continues with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, ' <jediTaskID={0}>'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill', 'finish', 'reassign']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # loop twice to see immediate result
                    for iLoop in range(2):
                        # get active PandaIDs to be killed
                        if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                            pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                        else:
                            pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID, True)
                        if pandaIDs is None:
                            tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                            tmpStat = Interaction.SC_FAILED
                        # kill jobs or update task
                        if tmpStat == Interaction.SC_SUCCEEDED:
                            if pandaIDs == []:
                                # done since no active jobs
                                tmpMsg = 'completed cleaning jobs'
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpTaskSpec = JediTaskSpec()
                                tmpTaskSpec.jediTaskID = jediTaskID
                                updateTaskStatus = True
                                if commandStr != 'reassign':
                                    # reset oldStatus
                                    # keep oldStatus for task reassignment since it is reset when actually reassigned
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                else:
                                    # extract cloud or site
                                    if commentStr is not None:
                                        tmpItems = commentStr.split(':')
                                        if tmpItems[0] == 'cloud':
                                            tmpTaskSpec.cloud = tmpItems[1]
                                        else:
                                            tmpTaskSpec.site = tmpItems[1]
                                        tmpMsg = 'set {0}={1}'.format(tmpItems[0], tmpItems[1])
                                        tmpLog.sendMsg(tmpMsg, self.msgType)
                                        tmpLog.info(tmpMsg)
                                        # back to oldStatus if necessary
                                        if tmpItems[2] == 'y':
                                            tmpTaskSpec.status = oldStatus
                                            tmpTaskSpec.forceUpdate('oldStatus')
                                            updateTaskStatus = False
                                if commandStr == 'reassign':
                                    tmpTaskSpec.forceUpdate('errorDialog')
                                if updateTaskStatus:
                                    tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status)
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID':jediTaskID})
                                tmpLog.info('done with {0}'.format(str(tmpRet)))
                                break
                            else:
                                # kill only in the first loop
                                if iLoop > 0:
                                    break
                                # wait or kill jobs
                                # (a missing comment must not raise TypeError here;
                                #  the 'soft reassign' test above is guarded the same way)
                                if commentStr is not None and 'soft finish' in commentStr:
                                    tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = True
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                                    break
                                else:
                                    tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpLog.sendMsg(tmpMsg, self.msgType)
                                    if commandStr in ['reassign', 'finish']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '52', True)
                                    else:
                                        # normal kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '50', True)
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry', 'incexec']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles', 'fixedSandbox']:
                                taskParamMap.pop(newKey, None)
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey, newVal in newParamMap.items():
                                if newVal is None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$', tmpParam['value']) is not None:
                                        # the original accessed ".taskParamMap" on the str
                                        # (always an AttributeError) instead of formatting it
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if 'buildSpec' in taskParamMap:
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if 'mergeSpec' in taskParamMap:
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+', '-a {0}'.format(taskParamMap['fixedSandbox']), taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID, strTaskParams)
                            if tmpRet != True:
                                tmpLog.error('failed to update task params')
                                continue
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__, errvalue))
                            continue
                    # retry failed files
                    tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID, commandStr)
                    if tmpRet == True:
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except:
            # catch-all so that one failing batch does not stop the loop;
            # note this reports through the module-level logger
            errtype, errvalue = sys.exc_info()[:2]
            errStr = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__, errtype.__name__, errvalue)
            errStr += traceback.format_exc()
            logger.error(errStr)
def runImpl(self):
    """Refine and register queued tasks in batches.

    Pulls up to 10 ``(jediTaskID, splitRule, taskStatus, parent_tid)`` items
    at a time from ``self.taskList`` until it is empty.  For every task the
    steps are: decode the task parameters, instantiate the refiner
    implementation for the task's vo / prodSourceLabel / taskType, extract
    the common parameters, check the parent task, run ``doRefine`` and
    finally register the result (full registration when ``taskStatus`` is
    ``'registered'``, otherwise datasets are appended).

    A task whose parent is still running is put on hold and skipped.  When
    any step fails the task is set to ``tobroken`` with the collected error
    message.

    Returns ``None`` when the list is exhausted.  Any exception escaping the
    per-batch processing is logged through the module-level ``logger`` and
    the loop continues with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, splitRule, taskStatus, parent_tid in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, '<jediTaskID={0}>'.format(jediTaskID))
                tmpLog.info('start')
                tmpStat = Interaction.SC_SUCCEEDED
                errStr = ''
                # reset per task: without this an early failure would reach the
                # "register" step below with either an unbound name or the
                # refiner (and taskSpec) of the previously processed task
                impl = None
                # read task parameters
                try:
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__, errvalue)
                    tmpLog.error(errStr)
                    tmpStat = Interaction.SC_FAILED
                # get impl
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('getting Impl')
                    try:
                        # get VO and sourceLabel
                        vo = taskParamMap['vo']
                        prodSourceLabel = taskParamMap['prodSourceLabel']
                        taskType = taskParamMap['taskType']
                        tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo, prodSourceLabel, taskType))
                        # get impl
                        impl = self.implFactory.instantiateImpl(vo, prodSourceLabel, taskType,
                                                                self.taskBufferIF, self.ddmIF)
                        if impl is None:
                            # task refiner is undefined
                            errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo, prodSourceLabel)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # extract common parameters
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('extracting common')
                    try:
                        # initialize impl
                        impl.initializeRefiner(tmpLog)
                        # extract common parameters
                        impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to extract common parameters with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # check parent
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if parent_tid not in [None, jediTaskID]:
                        tmpLog.info('check parent task')
                        try:
                            tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                            if tmpStat == 'completed':
                                # parent is done
                                tmpStat = Interaction.SC_SUCCEEDED
                            elif tmpStat == 'running':
                                if not impl.taskSpec.noWaitParent():
                                    # parent is running
                                    errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                    impl.taskSpec.status = taskStatus
                                    impl.taskSpec.setOnHold()
                                    impl.taskSpec.setErrDiag(errStr)
                                    tmpLog.info(errStr)
                                    self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID':impl.taskSpec.jediTaskID})
                                    continue
                                else:
                                    # not wait for parent
                                    tmpStat = Interaction.SC_SUCCEEDED
                            else:
                                # parent is corrupted
                                tmpStat = Interaction.SC_FAILED
                                tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                impl.taskSpec.setErrDiag(tmpErrStr)
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # refine
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                    try:
                        tmpStat = impl.doRefine(jediTaskID, taskParamMap)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        # no wait for parent
                        if impl.taskSpec.noWaitParent() and errtype == JediException.UnknownDatasetError:
                            impl.taskSpec.status = taskStatus
                            impl.taskSpec.setOnHold()
                            errStr = 'pending until parent produces input'
                            tmpLog.info(errStr)
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID':impl.taskSpec.jediTaskID})
                            continue
                        else:
                            errStr = 'failed to refine task'
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # register
                if tmpStat != Interaction.SC_SUCCEEDED:
                    tmpLog.error('failed to refine the task')
                    if impl is None or impl.taskSpec is None:
                        tmpTaskSpec = JediTaskSpec()
                        tmpTaskSpec.jediTaskID = jediTaskID
                    else:
                        tmpTaskSpec = impl.taskSpec
                    tmpTaskSpec.status = 'tobroken'
                    if errStr != '':
                        tmpTaskSpec.setErrDiag(errStr, True)
                    self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID':tmpTaskSpec.jediTaskID})
                else:
                    tmpLog.info('registering')
                    # fill JEDI tables
                    try:
                        # enable protection against task duplication
                        if 'uniqueTaskName' in taskParamMap and taskParamMap['uniqueTaskName'] and \
                                not impl.taskSpec.checkPreProcessed():
                            uniqueTaskName = True
                        else:
                            uniqueTaskName = False
                        strTaskParams = None
                        if impl.updatedTaskParams is not None:
                            strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                        if taskStatus == 'registered':
                            # unset pre-process flag
                            if impl.taskSpec.checkPreProcessed():
                                impl.taskSpec.setPostPreProcess()
                            # full registration
                            tmpStat, newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID, impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName)
                            if not tmpStat:
                                tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                tmpLog.error(tmpErrStr)
                                impl.taskSpec.status = 'tobroken'
                                impl.taskSpec.setErrDiag(tmpErrStr, True)
                                self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID':impl.taskSpec.jediTaskID})
                            tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        else:
                            # appending for incremental execution
                            tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID, impl.inMasterDatasetSpec,
                                                                            impl.inSecDatasetSpecList)
                            if not tmpStat:
                                tmpLog.error('failed to append datasets for incexec')
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(tmpErrStr)
                    else:
                        tmpLog.info('done')
        except:
            # catch-all so that one failing batch does not stop the loop;
            # note this reports through the module-level logger
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def runImpl(self):
    """Feed input file records into the JEDI contents table for queued tasks.

    Repeatedly pulls up to ten ``(jediTaskID, dsList)`` items from
    ``self.taskDsList`` and returns when the list is exhausted.  For every
    task the method reads the task parameters, looks up the files of each
    dataset (from DDM, or synthesised for pseudo / seq-number / PFN-list
    datasets), inserts them with ``insertFilesForDataset_JEDI`` and finally
    updates the task status (``tobroken``, on hold, or the status returned
    by the task buffer) before unlocking the task.

    Any exception escaping the per-batch processing is logged and the loop
    continues with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskDsList = self.taskDsList.get(nTasks)
            # no more datasets
            if len(taskDsList) == 0:
                self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                return
            # loop over all tasks
            for jediTaskID, dsList in taskDsList:
                allUpdated = True
                taskBroken = False
                taskOnHold = False
                runningTask = False
                missingMap = {}
                # make logger
                tmpLog = MsgWrapper(self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                # get task
                tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                    jediTaskID, False, True, self.pid, 10)
                if not tmpStat or taskSpec is None:
                    tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                    continue
                # get task parameters
                taskParamMap = None
                try:
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpErrStr = 'task param conversion from json failed with {0}:{1}'.format(
                        errtype.__name__, errvalue)
                    tmpLog.error(tmpErrStr)
                    taskSpec.setErrDiag(tmpErrStr)
                    taskBroken = True
                # defaults used when the parameters are not available
                nFilesPerJob = None
                # the number of chunks used by scout
                nChunksForScout = 10
                xmlConfig = None
                skipFilesUsedBy = None
                noWaitParent = False
                parentOutDatasets = set()
                # Everything below reads taskParamMap, which is unusable when the
                # decoding failed; skipping it lets the task reach the 'tobroken'
                # update and the unlock at the end of the iteration.
                if not taskBroken:
                    # renaming of parameters
                    if 'nEventsPerInputFile' in taskParamMap:
                        taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                    # the number of files per job
                    if 'nFilesPerJob' in taskParamMap:
                        nFilesPerJob = taskParamMap['nFilesPerJob']
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    # check no wait
                    if taskSpec.noWaitParent() and \
                            taskSpec.parent_tid not in [None, taskSpec.jediTaskID]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat, tmpParentOutDatasets = \
                                self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                                    taskSpec.parent_tid, ['output', 'log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(tmpParentOutDataset.datasetName)
                # loop over all datasets
                nFilesMaster = 0
                checkedMaster = False
                setFrozenTime = True
                if not taskBroken:
                    ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                    origNumFiles = None
                    if 'nFiles' in taskParamMap:
                        origNumFiles = taskParamMap['nFiles']
                    for datasetSpec in dsList:
                        tmpLog.debug('start loop for {0}(id={1})'.format(
                            datasetSpec.datasetName, datasetSpec.datasetID))
                        # get dataset metadata
                        tmpLog.debug('get metadata')
                        stateUpdateTime = datetime.datetime.utcnow()
                        try:
                            if not datasetSpec.isPseudo():
                                tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                            else:
                                # dummy metadata for pseudo dataset
                                tmpMetadata = {'state': 'closed'}
                            # set mutable when the dataset is open and parent is running,
                            # or task is configured to run until the dataset is closed
                            if (noWaitParent or taskSpec.runUntilClosed()) and \
                                    (tmpMetadata['state'] == 'open'
                                     or datasetSpec.datasetName in parentOutDatasets
                                     or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                # dummy metadata when parent is running
                                tmpMetadata = {'state': 'mutable'}
                        except Exception:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(
                                self.__class__.__name__, errtype.__name__, errvalue))
                            if errtype == Interaction.JEDIFatalError:
                                # fatal error
                                datasetStatus = 'broken'
                                taskBroken = True
                                # update dataset status
                                self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            else:
                                if not taskSpec.ignoreMissingInDS():
                                    # temporary error
                                    taskOnHold = True
                                else:
                                    # ignore missing
                                    datasetStatus = 'failed'
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            taskSpec.setErrDiag('failed to get metadata for {0}'.format(
                                datasetSpec.datasetName))
                            if not taskSpec.ignoreMissingInDS():
                                allUpdated = False
                        else:
                            # get file list specified in task parameters
                            fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                taskParamMap, datasetSpec.datasetName)
                            # get the number of events in metadata
                            getNumEvents = 'getNumEventsInMetadata' in taskParamMap
                            # get file list from DDM
                            tmpLog.debug('get files')
                            try:
                                skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                if not datasetSpec.isPseudo():
                                    if fileList != [] and 'useInFilesInContainer' in taskParamMap and \
                                            datasetSpec.containerName not in ['', None]:
                                        # read files from container if file list is
                                        # specified in task parameters
                                        tmpDatasetName = datasetSpec.containerName
                                    else:
                                        tmpDatasetName = datasetSpec.datasetName
                                    # use long format for LB
                                    longFormat = False
                                    if taskSpec.respectLumiblock():
                                        longFormat = True
                                    tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                     getNumEvents=getNumEvents,
                                                                     skipDuplicate=skipDuplicate,
                                                                     longFormat=longFormat)
                                    tmpLog.debug('got {0} files in {1}'.format(
                                        len(tmpRet), tmpDatasetName))
                                    # remove lost files
                                    tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                    if tmpLostFiles != {}:
                                        tmpLog.debug('found {0} lost files in {1}'.format(
                                            len(tmpLostFiles), tmpDatasetName))
                                        for tmpListGUID, tmpLostLFN in tmpLostFiles.items():
                                            tmpLog.debug('removed {0}'.format(tmpLostLFN))
                                            del tmpRet[tmpListGUID]
                                else:
                                    if datasetSpec.isSeqNumber():
                                        # make dummy files for seq_number
                                        if datasetSpec.getNumRecords() is not None:
                                            nPFN = datasetSpec.getNumRecords()
                                        elif origNumFiles is not None:
                                            nPFN = origNumFiles
                                            if 'nEventsPerJob' in taskParamMap and \
                                                    'nEventsPerFile' in taskParamMap and \
                                                    taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                nPFN = nPFN * taskParamMap['nEventsPerFile'] / \
                                                    taskParamMap['nEventsPerJob']
                                            elif 'nEventsPerFile' in taskParamMap and \
                                                    'nEventsPerRange' in taskParamMap:
                                                nPFN = nPFN * taskParamMap['nEventsPerFile'] / \
                                                    taskParamMap['nEventsPerRange']
                                        elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                            nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob']
                                        elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                and 'nFilesPerJob' in taskParamMap:
                                            nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / \
                                                taskParamMap['nFilesPerJob']
                                        else:
                                            # the default number of records for seq_number
                                            seqDefNumRecords = 10000
                                            # get nFiles of the master
                                            tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(
                                                datasetSpec.jediTaskID, datasetSpec.masterID, ['nFiles'])
                                            # use nFiles of the master as the number of records
                                            # if it is larger than the default
                                            if 'nFiles' in tmpMasterAtt and \
                                                    tmpMasterAtt['nFiles'] > seqDefNumRecords:
                                                nPFN = tmpMasterAtt['nFiles']
                                            else:
                                                nPFN = seqDefNumRecords
                                            # check usedBy
                                            if skipFilesUsedBy is not None:
                                                for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                    tmpParentAtt = \
                                                        self.taskBufferIF.getDatasetAttributesWithMap_JEDI(
                                                            tmpJediTaskID,
                                                            {'datasetName': datasetSpec.datasetName},
                                                            ['nFiles'])
                                                    if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                        nPFN += tmpParentAtt['nFiles']
                                        tmpRet = {}
                                        # get offset
                                        tmpOffset = datasetSpec.getOffset()
                                        tmpOffset += 1
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {
                                                'lfn': iPFN + tmpOffset,
                                                'scope': None,
                                                'filesize': 0,
                                                'checksum': None,
                                            }
                                    elif not taskSpec.useListPFN():
                                        # dummy file list for pseudo dataset
                                        tmpRet = {
                                            str(uuid.uuid4()): {
                                                'lfn': 'pseudo_lfn',
                                                'scope': None,
                                                'filesize': 0,
                                                'checksum': None,
                                            }
                                        }
                                    else:
                                        # make dummy file list for PFN list
                                        if 'nFiles' in taskParamMap:
                                            nPFN = taskParamMap['nFiles']
                                        else:
                                            nPFN = 1
                                        tmpRet = {}
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {
                                                'lfn': '{0:06d}:{1}'.format(
                                                    iPFN, taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                'scope': None,
                                                'filesize': 0,
                                                'checksum': None,
                                            }
                            except Exception:
                                errtype, errvalue = sys.exc_info()[:2]
                                tmpLog.error('failed to get files due to {0}:{1} {2}'.format(
                                    self.__class__.__name__, errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                else:
                                    # temporary error
                                    taskOnHold = True
                                taskSpec.setErrDiag('failed to get files for {0}'.format(
                                    datasetSpec.datasetName))
                                allUpdated = False
                            else:
                                # parameters for master input
                                respectLB = False
                                useRealNumEvents = False
                                if datasetSpec.isMaster():
                                    # respect LB boundaries
                                    respectLB = taskSpec.respectLumiblock()
                                    # use real number of events
                                    useRealNumEvents = taskSpec.useRealNumEvents()
                                # the number of events per file
                                nEventsPerFile = None
                                nEventsPerJob = None
                                nEventsPerRange = None
                                tgtNumEventsPerJob = None
                                if (datasetSpec.isMaster() and
                                        ('nEventsPerFile' in taskParamMap or useRealNumEvents)) or \
                                        (datasetSpec.isPseudo() and 'nEvents' in taskParamMap and
                                         not datasetSpec.isSeqNumber()):
                                    if 'nEventsPerFile' in taskParamMap:
                                        nEventsPerFile = taskParamMap['nEventsPerFile']
                                    elif datasetSpec.isMaster() and datasetSpec.isPseudo() and \
                                            'nEvents' in taskParamMap:
                                        # use nEvents as nEventsPerFile for pseudo input
                                        nEventsPerFile = taskParamMap['nEvents']
                                    if 'nEventsPerJob' in taskParamMap:
                                        nEventsPerJob = taskParamMap['nEventsPerJob']
                                    elif 'nEventsPerRange' in taskParamMap:
                                        nEventsPerRange = taskParamMap['nEventsPerRange']
                                    if 'tgtNumEventsPerJob' in taskParamMap:
                                        tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                        # reset nEventsPerJob
                                        nEventsPerJob = None
                                # max attempts
                                maxAttempt = None
                                maxFailure = None
                                if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                    # max attempts
                                    if taskSpec.disableAutoRetry():
                                        # disable auto retry
                                        maxAttempt = 1
                                    elif 'maxAttempt' in taskParamMap:
                                        maxAttempt = taskParamMap['maxAttempt']
                                    else:
                                        # use default value
                                        maxAttempt = 3
                                    # max failure
                                    if 'maxFailure' in taskParamMap:
                                        maxFailure = taskParamMap['maxFailure']
                                # first event number
                                firstEventNumber = None
                                if datasetSpec.isMaster():
                                    firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                # nMaxEvents
                                nMaxEvents = None
                                if datasetSpec.isMaster() and 'nEvents' in taskParamMap:
                                    nMaxEvents = taskParamMap['nEvents']
                                # nMaxFiles
                                nMaxFiles = None
                                if 'nFiles' in taskParamMap:
                                    if datasetSpec.isMaster():
                                        nMaxFiles = taskParamMap['nFiles']
                                    else:
                                        # calculate for secondary
                                        nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                        # multiplied by the number of jobs per file for
                                        # event-level splitting
                                        if nMaxFiles is not None and 'nEventsPerFile' in taskParamMap:
                                            if 'nEventsPerJob' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile']) / \
                                                        float(taskParamMap['nEventsPerJob'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                            elif 'nEventsPerRange' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile']) / \
                                                        float(taskParamMap['nEventsPerRange'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                # use scout
                                useScout = False
                                if datasetSpec.isMaster() and taskSpec.useScout() and \
                                        (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                    useScout = True
                                # use files with new attempt numbers
                                useFilesWithNewAttemptNr = False
                                if not datasetSpec.isPseudo() and fileList != [] and \
                                        'useInFilesWithNewAttemptNr' in taskParamMap:
                                    useFilesWithNewAttemptNr = True
                                # ramCount
                                ramCount = 0
                                # feed files to the contents table
                                tmpLog.debug('update contents')
                                retDB, missingFileList, nFilesUnique, diagMap = \
                                    self.taskBufferIF.insertFilesForDataset_JEDI(
                                        datasetSpec, tmpRet, tmpMetadata['state'],
                                        stateUpdateTime, nEventsPerFile, nEventsPerJob,
                                        maxAttempt, firstEventNumber, nMaxFiles, nMaxEvents,
                                        useScout, fileList, useFilesWithNewAttemptNr,
                                        nFilesPerJob, nEventsPerRange, nChunksForScout,
                                        includePatt, excludePatt, xmlConfig, noWaitParent,
                                        taskSpec.parent_tid, self.pid, maxFailure,
                                        useRealNumEvents, respectLB, tgtNumEventsPerJob,
                                        skipFilesUsedBy, ramCount)
                                # equality (not identity) comparisons are kept on purpose
                                # so the accepted return values stay exactly as before
                                if retDB == False:
                                    taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(
                                        datasetSpec.datasetName, diagMap['errMsg']))
                                    allUpdated = False
                                    taskBroken = True
                                    break
                                elif retDB == None:
                                    # the dataset is locked by another or status is not applicable
                                    allUpdated = False
                                    tmpLog.debug('escape since task or dataset is locked')
                                    break
                                elif missingFileList != []:
                                    # files are missing
                                    tmpErrStr = '{0} files missing in {1}'.format(
                                        len(missingFileList), datasetSpec.datasetName)
                                    tmpLog.debug(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    allUpdated = False
                                    taskOnHold = True
                                    missingMap[datasetSpec.datasetName] = {
                                        'datasetSpec': datasetSpec,
                                        'missingFiles': missingFileList
                                    }
                                else:
                                    # reduce the number of files to be read
                                    if 'nFiles' in taskParamMap:
                                        if datasetSpec.isMaster():
                                            taskParamMap['nFiles'] -= nFilesUnique
                                    # reduce the number of files for scout
                                    if useScout:
                                        nChunksForScout = diagMap['nChunksForScout']
                                    # number of master input files
                                    if datasetSpec.isMaster():
                                        checkedMaster = True
                                        nFilesMaster += nFilesUnique
                                # running task
                                if diagMap['isRunningTask']:
                                    runningTask = True
                                # no activated pending input for noWait
                                if noWaitParent and diagMap['nActivatedPending'] == 0 and \
                                        not (useScout and nChunksForScout == 0) and \
                                        tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                    tmpErrStr = 'insufficient inputs are ready. '
                                    tmpErrStr += diagMap['errMsg']
                                    tmpLog.debug(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    taskOnHold = True
                                    setFrozenTime = False
                                    break
                        tmpLog.debug('end loop')
                # no master input
                if not taskOnHold and not taskBroken and allUpdated and \
                        nFilesMaster == 0 and checkedMaster:
                    tmpErrStr = 'no master input files. input dataset is empty'
                    tmpLog.error(tmpErrStr)
                    taskSpec.setErrDiag(tmpErrStr, None)
                    if taskSpec.allowEmptyInput() or noWaitParent:
                        taskOnHold = True
                    else:
                        taskBroken = True
                # update task status
                if taskBroken:
                    # task is broken
                    taskSpec.status = 'tobroken'
                    tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                        jediTaskID, taskSpec, pid=self.pid)
                # change task status unless the task is running
                if not runningTask:
                    if taskOnHold:
                        # go to pending state
                        if taskSpec.status not in ['broken', 'tobroken']:
                            taskSpec.setOnHold()
                        tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, taskSpec, pid=self.pid, setFrozenTime=setFrozenTime)
                    elif allUpdated:
                        # all OK
                        allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, getTaskStatus=True, pid=self.pid,
                            useWorldCloud=taskSpec.useWorldCloud())
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                    # just unlock
                    retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID, self.pid)
                    tmpLog.debug('unlock not-running task with {0}'.format(retUnlock))
                else:
                    # just unlock
                    retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID, self.pid)
                    tmpLog.debug('unlock task with {0}'.format(retUnlock))
                tmpLog.debug('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
def runImpl(self):
    """Choose a nucleus for every task taken from ``self.inputList``.

    Thresholds are read once from the task buffer configuration (with
    built-in defaults).  Then, until the input list is empty, one
    ``(taskSpec, inputChunk)`` item is fetched at a time.  If the task
    already names a known nucleus or satellite it is used directly;
    otherwise the candidate nuclei are filtered by status, transfer
    backlog, endpoint free space, ability to run jobs and input data
    locality, weighted, and one is drawn at random in proportion to its
    weight.  The selection is stored with ``setCloudToTasks_JEDI`` and the
    shared RW table ``self.prioRW`` is updated.

    Exceptions raised while handling a batch are logged together with the
    last task ID and the traceback; the loop then continues.
    """

    def _config_or_default(key, default):
        # read one JEDI config value, falling back when it is not defined
        value = self.taskBufferIF.getConfigValue(self.msgType, key, 'jedi', 'atlas')
        if value is None:
            return default
        return value

    # cutoff for disk in TB
    diskThreshold = _config_or_default(
        'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 100 * 1024)
    # dataset type to ignore file availability check
    datasetTypeToSkipCheck = ['log']
    # thresholds for data availability check
    thrInputSize = _config_or_default('INPUT_SIZE_THRESHOLD', 1)
    thrInputSize *= 1024 * 1024 * 1024
    thrInputNum = _config_or_default('INPUT_NUM_THRESHOLD', 100)
    thrInputSizeFrac = float(_config_or_default('INPUT_SIZE_FRACTION', 10)) / 100
    thrInputNumFrac = float(_config_or_default('INPUT_NUM_FRACTION', 10)) / 100
    cutOffRW = 50
    negWeightTape = 0.001
    minIoIntensityWithLD = _config_or_default('MIN_IO_INTENSITY_WITH_LOCAL_DATA', 200)
    minInputSizeWithLD = _config_or_default('MIN_INPUT_SIZE_WITH_LOCAL_DATA', 10000)
    maxTaskPrioWithLD = _config_or_default('MAX_TASK_PRIO_WITH_LOCAL_DATA', 800)
    # main
    lastJediTaskID = None
    siteMapper = self.taskBufferIF.getSiteMapper()
    while True:
        try:
            taskInputList = self.inputList.get(1)
            # no more datasets
            if len(taskInputList) == 0:
                self.logger.debug(
                    '{0} terminating after processing {1} tasks since no more inputs '
                    .format(self.__class__.__name__, self.numTasks))
                return
            # loop over all tasks
            for taskSpec, inputChunk in taskInputList:
                lastJediTaskID = taskSpec.jediTaskID
                # make logger
                tmpLog = MsgWrapper(
                    self.logger,
                    '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                    monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                tmpLog.debug('start')
                tmpLog.info(
                    'thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'
                    .format(thrInputSize, thrInputNum, thrInputSizeFrac, thrInputNumFrac))
                # read task parameters
                try:
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(taskSpec.jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except Exception:
                    tmpLog.error('failed to read task params')
                    taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                    self.sendLogMessage(tmpLog)
                    continue
                # RW
                taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                # get nuclei
                nucleusList = siteMapper.nuclei
                if taskSpec.nucleus in siteMapper.nuclei:
                    candidateNucleus = taskSpec.nucleus
                elif taskSpec.nucleus in siteMapper.satellites:
                    nucleusList = siteMapper.satellites
                    candidateNucleus = taskSpec.nucleus
                else:
                    tmpLog.info('got {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # check status
                    newNucleusList = {}
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        if tmpNucleusSpec.state not in ['ACTIVE']:
                            tmpLog.info(
                                ' skip nucleus={0} due to status={1} criteria=-status'
                                .format(tmpNucleus, tmpNucleusSpec.state))
                        else:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed status check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # check status of transfer backlog
                    t1Weight = taskSpec.getT1Weight()
                    if t1Weight < 0:
                        tmpLog.info('skip transfer backlog check due to negative T1Weight')
                    else:
                        newNucleusList = {}
                        backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei()
                        for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                            if tmpNucleus in backlogged_nuclei:
                                tmpLog.info(
                                    ' skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'
                                    .format(tmpNucleus))
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        nucleusList = newNucleusList
                        tmpLog.info('{0} candidates passed transfer backlog check'.format(
                            len(nucleusList)))
                        if nucleusList == {}:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # check endpoint
                    fractionFreeSpace = {}
                    newNucleusList = {}
                    tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                        taskSpec.jediTaskID, ['output', 'log'])
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        toSkip = False
                        for tmpDatasetSpec in tmpDatasetSpecList:
                            # ignore distributed datasets
                            if DataServiceUtils.getDistributedDestination(
                                    tmpDatasetSpec.storageToken) is not None:
                                continue
                            # get endpoint with the pattern
                            tmpEP = tmpNucleusSpec.getAssociatedEndpoint(tmpDatasetSpec.storageToken)
                            if tmpEP is None:
                                tmpLog.info(
                                    ' skip nucleus={0} since no endpoint with {1} criteria=-match'
                                    .format(tmpNucleus, tmpDatasetSpec.storageToken))
                                toSkip = True
                                break
                            # NOTE: the endpoint state (tmpEP['state']) is deliberately
                            # not checked here; that check was disabled in the original.
                            # check space
                            tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                            tmpSpaceToUse = 0
                            if tmpNucleus in self.fullRW:
                                # 0.25GB per cpuTime/corePower/day
                                tmpSpaceToUse = long(
                                    self.fullRW[tmpNucleus] / 10 / 24 / 3600 * 0.25)
                            if tmpSpaceSize - tmpSpaceToUse < diskThreshold:
                                tmpLog.info(
                                    ' skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space'
                                    .format(tmpNucleus, tmpSpaceSize, tmpSpaceToUse,
                                            diskThreshold, tmpEP['ddm_endpoint_name']))
                                toSkip = True
                                break
                            # keep fraction of free space
                            if tmpNucleus not in fractionFreeSpace:
                                fractionFreeSpace[tmpNucleus] = {'total': 0, 'free': 0}
                            # a zero or malformed total yields None rather than an error
                            try:
                                tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                    float(fractionFreeSpace[tmpNucleus]['total'])
                            except Exception:
                                tmpOld = None
                            try:
                                tmpNew = float(tmpSpaceSize - tmpSpaceToUse) / \
                                    float(tmpEP['space_total'])
                            except Exception:
                                tmpNew = None
                            # remember the endpoint with the smallest free fraction
                            if tmpNew is not None and (tmpOld is None or tmpNew < tmpOld):
                                fractionFreeSpace[tmpNucleus] = {
                                    'total': tmpEP['space_total'],
                                    'free': tmpSpaceSize - tmpSpaceToUse
                                }
                        if not toSkip:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed endpoint check {1} TB'.format(
                        len(nucleusList), diskThreshold / 1024))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # ability to execute jobs
                    newNucleusList = {}
                    # get all panda sites
                    tmpSiteList = []
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        tmpSiteList += tmpNucleusSpec.allPandaSites
                    tmpSiteList = list(set(tmpSiteList))
                    tmpLog.debug('===== start for job check')
                    jobBroker = AtlasProdJobBroker(self.ddmIF, self.taskBufferIF)
                    tmpSt, tmpRet = jobBroker.doBrokerage(
                        taskSpec, taskSpec.cloud, inputChunk, None, True, tmpSiteList, tmpLog)
                    tmpLog.debug('===== done for job check')
                    if tmpSt != Interaction.SC_SUCCEEDED:
                        tmpLog.error('no sites can run jobs')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    okNuclei = set()
                    for tmpSite in tmpRet:
                        siteSpec = siteMapper.getSite(tmpSite)
                        okNuclei.add(siteSpec.pandasite)
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        if tmpNucleus in okNuclei:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                        else:
                            tmpLog.info(
                                ' skip nucleus={0} due to missing ability to run jobs criteria=-job'
                                .format(tmpNucleus))
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed job check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # data locality
                    toSkip = False
                    availableData = {}
                    for datasetSpec in inputChunk.getDatasets():
                        # only for real datasets
                        if datasetSpec.isPseudo():
                            continue
                        # ignore DBR
                        if DataServiceUtils.isDBR(datasetSpec.datasetName):
                            continue
                        # skip locality check
                        if DataServiceUtils.getDatasetType(
                                datasetSpec.datasetName) in datasetTypeToSkipCheck:
                            continue
                        # primary only
                        if taskParamMap.get('taskBrokerOnMaster') is True and \
                                not datasetSpec.isMaster():
                            continue
                        # use deep scan for primary dataset unless data carousel
                        if datasetSpec.isMaster() and not taskSpec.inputPreStaging():
                            deepScan = True
                        else:
                            deepScan = False
                        # get nuclei where data is available
                        tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(
                            siteMapper, self.ddmIF, datasetSpec.datasetName,
                            list(nucleusList.keys()), deepScan)
                        if tmpSt != Interaction.SC_SUCCEEDED:
                            tmpLog.error(
                                'failed to get nuclei where data is available, since {0}'
                                .format(tmpRet))
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            toSkip = True
                            break
                        # sum
                        for tmpNucleus, tmpVals in iteritems(tmpRet):
                            if tmpNucleus not in availableData:
                                availableData[tmpNucleus] = tmpVals
                            else:
                                availableData[tmpNucleus] = dict(
                                    (k, v + tmpVals[k])
                                    for (k, v) in iteritems(availableData[tmpNucleus]))
                    if toSkip:
                        continue
                    if availableData != {}:
                        newNucleusList = {}
                        # skip if no data
                        skipMsgList = []
                        for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                            if taskSpec.inputPreStaging() and \
                                    availableData[tmpNucleus]['ava_num_any'] > 0:
                                # use incomplete replicas for data carousel since the
                                # completeness is guaranteed
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                            elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                    availableData[tmpNucleus]['ava_size_any'] < \
                                    availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                tmpMsg = ' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(
                                    tmpNucleus,
                                    availableData[tmpNucleus]['ava_size_any'],
                                    availableData[tmpNucleus]['tot_size'],
                                    thrInputSizeFrac)
                                skipMsgList.append(tmpMsg)
                            elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                    availableData[tmpNucleus]['ava_num_any'] < \
                                    availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                tmpMsg = ' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(
                                    tmpNucleus,
                                    availableData[tmpNucleus]['ava_num_any'],
                                    availableData[tmpNucleus]['tot_num'],
                                    thrInputNumFrac)
                                skipMsgList.append(tmpMsg)
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        totInputSize = list(availableData.values())[0]['tot_size'] / 1024 / 1024 / 1024
                        data_locality_check_str = (
                            '(ioIntensity ({0}) is None or less than {1} kBPerS '
                            'and input size ({2} GB) is less than {3}) '
                            'or task.currentPriority ({4}) is higher than or equal to {5}'
                        ).format(taskSpec.ioIntensity, minIoIntensityWithLD,
                                 int(totInputSize), minInputSizeWithLD,
                                 taskSpec.currentPriority, maxTaskPrioWithLD)
                        if len(newNucleusList) > 0:
                            nucleusList = newNucleusList
                            for tmpMsg in skipMsgList:
                                tmpLog.info(tmpMsg)
                        elif ((taskSpec.ioIntensity is None or
                               taskSpec.ioIntensity <= minIoIntensityWithLD) and
                              totInputSize <= minInputSizeWithLD) \
                                or taskSpec.currentPriority >= maxTaskPrioWithLD:
                            # clearing availableData also disables the data term
                            # in the weight calculation below
                            availableData = {}
                            tmpLog.info(
                                ' disable data locality check since no nucleus has input data, {}'
                                .format(data_locality_check_str))
                        else:
                            # no candidate + unavoidable data locality check
                            nucleusList = newNucleusList
                            for tmpMsg in skipMsgList:
                                tmpLog.info(tmpMsg)
                            tmpLog.info(
                                ' the following conditions required to disable data locality check: {}'
                                .format(data_locality_check_str))
                        tmpLog.info('{0} candidates passed data check'.format(len(nucleusList)))
                        if nucleusList == {}:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # weight
                    # release the shared lock even when the priority is unknown,
                    # otherwise every other worker would block on it forever
                    self.prioRW.acquire()
                    try:
                        nucleusRW = self.prioRW[taskSpec.currentPriority]
                    finally:
                        self.prioRW.release()
                    totalWeight = 0
                    nucleusweights = []
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        if tmpNucleus not in nucleusRW:
                            nucleusRW[tmpNucleus] = 0
                        wStr = '1'
                        # with RW
                        if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                            weight = 1 / float(nucleusRW[tmpNucleus])
                            wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus])
                        else:
                            weight = 1
                            wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus], cutOffRW)
                        # with data
                        if availableData != {}:
                            if availableData[tmpNucleus]['tot_size'] > 0:
                                weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                weight /= float(availableData[tmpNucleus]['tot_size'])
                                wStr += '* ( available_input_size_DISKTAPE={0} )'.format(
                                    availableData[tmpNucleus]['ava_size_any'])
                                wStr += '/ ( total_input_size={0} )'.format(
                                    availableData[tmpNucleus]['tot_size'])
                                # negative weight for tape
                                if availableData[tmpNucleus]['ava_size_any'] > \
                                        availableData[tmpNucleus]['ava_size_disk']:
                                    weight *= negWeightTape
                                    wStr += '*( weight_TAPE={0} )'.format(negWeightTape)
                        # fraction of free space
                        if tmpNucleus in fractionFreeSpace:
                            try:
                                tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                    float(fractionFreeSpace[tmpNucleus]['total'])
                                weight *= tmpFrac
                                wStr += '*( free_space={0} )/( total_space={1} )'.format(
                                    fractionFreeSpace[tmpNucleus]['free'],
                                    fractionFreeSpace[tmpNucleus]['total'])
                            except Exception:
                                # best effort: leave the weight untouched when the
                                # fraction cannot be computed (e.g. zero total)
                                pass
                        tmpLog.info(
                            ' use nucleus={0} weight={1} {2} criteria=+use'
                            .format(tmpNucleus, weight, wStr))
                        totalWeight += weight
                        nucleusweights.append((tmpNucleus, weight))
                    tmpLog.info('final {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # final selection
                    tgtWeight = random.uniform(0, totalWeight)
                    candidateNucleus = None
                    for tmpNucleus, weight in nucleusweights:
                        tgtWeight -= weight
                        if tgtWeight <= 0:
                            candidateNucleus = tmpNucleus
                            break
                    if candidateNucleus is None:
                        candidateNucleus = nucleusweights[-1][0]
                ######################################
                # update
                nucleusSpec = nucleusList[candidateNucleus]
                # get output/log datasets
                tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                    taskSpec.jediTaskID, ['output', 'log'])
                # get destinations
                retMap = {
                    taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(
                        nucleusSpec, tmpDatasetSpecs)
                }
                tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                tmpLog.info(' set nucleus={0} with {1} criteria=+set'.format(
                    candidateNucleus, tmpRet))
                self.sendLogMessage(tmpLog)
                if tmpRet:
                    tmpMsg = 'set task_status=ready'
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                # update RW table
                self.prioRW.acquire()
                try:
                    for prio, rwMap in iteritems(self.prioRW):
                        if prio > taskSpec.currentPriority:
                            continue
                        if candidateNucleus in rwMap:
                            rwMap[candidateNucleus] += taskRW
                        else:
                            rwMap[candidateNucleus] = taskRW
                finally:
                    self.prioRW.release()
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            errMsg = '{0}.runImpl() failed with {1} {2} '.format(
                self.__class__.__name__, errtype.__name__, errvalue)
            errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
            errMsg += traceback.format_exc()
            logger.error(errMsg)
tbIF = JediTaskBufferInterface() tbIF.setupInterface() siteMapper = tbIF.getSiteMapper() ddmIF = DDMInterface() ddmIF.setupInterface() jediTaskID = int(sys.argv[1]) s, taskSpec = tbIF.getTaskWithID_JEDI(jediTaskID, False) body = TaskBroker(None, tbIF, ddmIF, taskSpec.vo, taskSpec.prodSourceLabel) body.initializeMods(tbIF, ddmIF) taskParam = tbIF.getTaskParamsWithID_JEDI(jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) vo = taskParamMap['vo'] prodSourceLabel = taskParamMap['prodSourceLabel'] taskType = taskParamMap['taskType'] workQueueMapper = tbIF.getWorkQueueMap() workQueue = workQueueMapper.getQueueWithIDGshare(taskSpec.workQueue_ID, taskSpec.gshare) impl = body.getImpl(vo, prodSourceLabel) tmpListItem = tbIF.getTasksToBeProcessed_JEDI(None, None, None, None,
def runImpl(self):
    """Refine tasks taken from ``self.taskList`` until the list is exhausted.

    Tasks are fetched in batches of 10. Each entry is a tuple
    ``(jediTaskID, splitRule, taskStatus, parent_tid)``. For every task the
    method decodes the task parameters, instantiates a refiner
    implementation through ``self.implFactory``, extracts common
    parameters, checks attribute length, staging state and parent state,
    calls ``impl.doRefine`` and finally registers the result through
    ``self.taskBufferIF``. A task that fails a step is updated with
    status ``'tobroken'``.

    Returns None once ``self.taskList.get`` yields an empty batch. Any
    exception escaping a batch is logged with the module-level ``logger``
    and the loop moves on to the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, splitRule, taskStatus, parent_tid in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                tmpLog.debug('start')
                tmpStat = Interaction.SC_SUCCEEDED
                errStr = ''
                # reset per task: otherwise a failure before the refiner is
                # instantiated leaves impl unbound (first task) or pointing
                # at the previous task's refiner, and the register step
                # below would then update the wrong task
                impl = None
                # read task parameters
                try:
                    taskParam = None
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__, errvalue)
                    tmpLog.debug(taskParam)
                    tmpLog.error(errStr)
                    # the task is skipped without any status update
                    continue
                # get impl
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('getting Impl')
                    try:
                        # get VO and sourceLabel
                        vo = taskParamMap['vo']
                        prodSourceLabel = taskParamMap['prodSourceLabel']
                        taskType = taskParamMap['taskType']
                        tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo, prodSourceLabel, taskType))
                        # get impl
                        impl = self.implFactory.instantiateImpl(vo, prodSourceLabel, taskType,
                                                                self.taskBufferIF, self.ddmIF)
                        if impl is None:
                            # task refiner is undefined
                            errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo, prodSourceLabel)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # extract common parameters
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('extracting common')
                    try:
                        # initalize impl
                        impl.initializeRefiner(tmpLog)
                        impl.oldTaskStatus = taskStatus
                        # extract common parameters
                        impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                        # set parent tid
                        if parent_tid not in [None, jediTaskID]:
                            impl.taskSpec.parent_tid = parent_tid
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__, errvalue,
                                                                                               traceback.format_exc())
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # check attribute length
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('checking attribute length')
                    if not impl.taskSpec.checkAttrLength():
                        tmpLog.error(impl.taskSpec.errorDialog)
                        tmpStat = Interaction.SC_FAILED
                # staging
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if 'toStaging' in taskParamMap and taskStatus != 'staged':
                        errStr = 'wait until staging is done'
                        impl.taskSpec.status = 'staging'
                        impl.taskSpec.oldStatus = taskStatus
                        impl.taskSpec.setErrDiag(errStr)
                        # not to update some task attributes
                        impl.taskSpec.resetRefinedAttrs()
                        tmpLog.info(errStr)
                        self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                          oldStatus=[taskStatus], updateDEFT=False, setFrozenTime=False)
                        continue
                # check parent
                noWaitParent = False
                parentState = None
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if parent_tid not in [None, jediTaskID]:
                        tmpLog.info('check parent task')
                        try:
                            tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                            parentState = tmpStat
                            if tmpStat == 'completed':
                                # parent is done
                                tmpStat = Interaction.SC_SUCCEEDED
                            elif tmpStat == 'running':
                                if not impl.taskSpec.noWaitParent():
                                    # parent is running
                                    errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                    impl.taskSpec.status = taskStatus
                                    impl.taskSpec.setOnHold()
                                    impl.taskSpec.setErrDiag(errStr)
                                    # not to update some task attributes
                                    impl.taskSpec.resetRefinedAttrs()
                                    tmpLog.info(errStr)
                                    self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                                      oldStatus=[taskStatus], setFrozenTime=False)
                                    continue
                                else:
                                    # not wait for parent
                                    tmpStat = Interaction.SC_SUCCEEDED
                                    noWaitParent = True
                            else:
                                # parent is corrupted
                                tmpStat = Interaction.SC_FAILED
                                tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                impl.taskSpec.setErrDiag(tmpErrStr)
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # refine
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                    try:
                        tmpStat = impl.doRefine(jediTaskID, taskParamMap)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        # wait unknown input if noWaitParent or waitInput
                        if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput())
                                and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                or errtype == Interaction.JEDITemporaryError:
                            if impl.taskSpec.noWaitParent() or parentState == 'running':
                                tmpErrStr = 'pending until parent produces input'
                                setFrozenTime = False
                            elif errtype == Interaction.JEDITemporaryError:
                                tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                setFrozenTime = True
                            else:
                                tmpErrStr = 'pending until input is staged'
                                setFrozenTime = True
                            impl.taskSpec.status = taskStatus
                            impl.taskSpec.setOnHold()
                            impl.taskSpec.setErrDiag(tmpErrStr)
                            # not to update some task attributes
                            impl.taskSpec.resetRefinedAttrs()
                            tmpLog.info(tmpErrStr)
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                              oldStatus=[taskStatus],
                                                              insertUnknown=impl.unknownDatasetList,
                                                              setFrozenTime=setFrozenTime)
                            continue
                        else:
                            errStr = 'failed to refine task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # register
                if tmpStat != Interaction.SC_SUCCEEDED:
                    tmpLog.error('failed to refine the task')
                    if impl is None or impl.taskSpec is None:
                        # no usable spec from the refiner: build a bare one
                        tmpTaskSpec = JediTaskSpec()
                        tmpTaskSpec.jediTaskID = jediTaskID
                    else:
                        tmpTaskSpec = impl.taskSpec
                    tmpTaskSpec.status = 'tobroken'
                    if errStr != '':
                        tmpTaskSpec.setErrDiag(errStr, True)
                    self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': tmpTaskSpec.jediTaskID},
                                                      oldStatus=[taskStatus])
                else:
                    tmpLog.info('registering')
                    # fill JEDI tables
                    try:
                        # enable protection against task duplication
                        if 'uniqueTaskName' in taskParamMap and taskParamMap['uniqueTaskName'] and \
                                not impl.taskSpec.checkPreProcessed():
                            uniqueTaskName = True
                        else:
                            uniqueTaskName = False
                        strTaskParams = None
                        if impl.updatedTaskParams is not None:
                            strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                        if taskStatus in ['registered', 'staged']:
                            # unset pre-process flag
                            if impl.taskSpec.checkPreProcessed():
                                impl.taskSpec.setPostPreProcess()
                            # full registration
                            tmpStat, newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID, impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus)
                            if not tmpStat:
                                tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                tmpLog.error(tmpErrStr)
                                impl.taskSpec.status = newTaskStatus
                                impl.taskSpec.setErrDiag(tmpErrStr, True)
                                self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                                  oldStatus=[taskStatus])
                            tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        else:
                            # disable scouts if previous attempt didn't use it
                            if not impl.taskSpec.useScout(splitRule):
                                impl.taskSpec.setUseScout(False)
                            # disallow to reset some attributes
                            for attName in ['ramCount', 'walltime', 'cpuTime', 'startTime']:
                                impl.taskSpec.resetChangedAttr(attName)
                            # update task with new params
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                              oldStatus=[taskStatus])
                            # appending for incremental execution
                            tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID, impl.inMasterDatasetSpec,
                                                                            impl.inSecDatasetSpecList)
                            if not tmpStat:
                                tmpLog.error('failed to append datasets for incexec')
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(tmpErrStr)
                    else:
                        tmpLog.info('done')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def runImpl(self):
    """Execute pending task commands taken from ``self.taskList``.

    Commands are fetched in batches of 10. Each entry is a pair
    ``(jediTaskID, commandMap)`` where ``commandMap`` provides the keys
    ``'command'``, ``'comment'`` and ``'oldStatus'``.

    * ``kill`` / ``finish`` / ``reassign``: looks up the task's PandaIDs
      (up to two passes). When none are left the task record is updated;
      otherwise the jobs are killed on the first pass only.
    * ``retry`` / ``incexec``: for ``incexec`` the stored task parameters
      are first merged with the JSON carried in the comment, then
      ``retryTask_JEDI`` is called with flags derived from the comment.
    * anything else is logged as an unknown command.

    Returns None once ``self.taskList.get`` yields an empty batch. Any
    exception escaping a batch is logged with the module-level ``logger``
    and the loop moves on to the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug(
                    '{0} terminating since no more items'.format(
                        self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(
                    self.logger, ' < jediTaskID={0} >'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill', 'finish', 'reassign']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # loop twice to see immediate result
                    for iLoop in range(2):
                        # get active PandaIDs to be killed
                        if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                            pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                jediTaskID)
                        elif commandStr == 'reassign' and commentStr is not None and 'nokill reassign' in commentStr:
                            pandaIDs = []
                        else:
                            pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(
                                jediTaskID, True)
                        if pandaIDs is None:
                            tmpLog.error(
                                'failed to get PandaIDs for jediTaskID={0}'
                                .format(jediTaskID))
                            tmpStat = Interaction.SC_FAILED
                        # kill jobs or update task
                        if tmpStat == Interaction.SC_SUCCEEDED:
                            if pandaIDs == []:
                                # done since no active jobs
                                tmpMsg = 'completed cleaning jobs'
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpTaskSpec = JediTaskSpec()
                                tmpTaskSpec.jediTaskID = jediTaskID
                                updateTaskStatus = True
                                if commandStr != 'reassign':
                                    # reset oldStatus
                                    # keep oldStatus for task reassignment since it is reset when actually reassigned
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                else:
                                    # extract cloud or site
                                    if commentStr is not None:
                                        # comment is split on ':' into
                                        # <kind>:<value>:<back-to-oldStatus flag>
                                        tmpItems = commentStr.split(':')
                                        if tmpItems[0] == 'cloud':
                                            tmpTaskSpec.cloud = tmpItems[1]
                                        elif tmpItems[0] == 'nucleus':
                                            tmpTaskSpec.nucleus = tmpItems[1]
                                        else:
                                            tmpTaskSpec.site = tmpItems[1]
                                        tmpMsg = 'set {0}={1}'.format(
                                            tmpItems[0], tmpItems[1])
                                        tmpLog.sendMsg(tmpMsg, self.msgType)
                                        tmpLog.info(tmpMsg)
                                        # back to oldStatus if necessary
                                        if tmpItems[2] == 'y':
                                            tmpTaskSpec.status = oldStatus
                                            tmpTaskSpec.forceUpdate('oldStatus')
                                            updateTaskStatus = False
                                if commandStr == 'reassign':
                                    tmpTaskSpec.forceUpdate('errorDialog')
                                if commandStr == 'finish':
                                    # update datasets
                                    tmpLog.info('updating datasets to finish')
                                    tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(
                                        jediTaskID, self.pid)
                                    if not tmpStat:
                                        tmpLog.info(
                                            'wait until datasets are updated to finish'
                                        )
                                    # ignore failGoalUnreached when manually finished
                                    tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                                        jediTaskID)
                                    tmpTaskSpec.splitRule = taskSpec.splitRule
                                    tmpTaskSpec.unsetFailGoalUnreached()
                                if updateTaskStatus:
                                    tmpTaskSpec.status = JediTaskSpec.commandStatusMap(
                                    )[commandStr]['done']
                                tmpMsg = 'set task_status={0}'.format(
                                    tmpTaskSpec.status)
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpRet = self.taskBufferIF.updateTask_JEDI(
                                    tmpTaskSpec, {'jediTaskID': jediTaskID},
                                    setOldModTime=True)
                                tmpLog.info('done with {0}'.format(str(tmpRet)))
                                break
                            else:
                                # kill only in the first loop
                                if iLoop > 0:
                                    break
                                # wait or kill jobs
                                if commentStr and 'soft finish' in commentStr:
                                    queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                        jediTaskID)
                                    tmpMsg = "trying to kill {0} queued jobs for soft finish".format(
                                        len(queuedPandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = self.taskBufferIF.killJobs(
                                        queuedPandaIDs, commentStr, '52', True)
                                    tmpMsg = "wating {0} jobs for soft finish".format(
                                        len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    # the kill result is deliberately not
                                    # reported; True is logged instead
                                    tmpRet = True
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                                    break
                                else:
                                    tmpMsg = "trying to kill {0} jobs".format(
                                        len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpLog.sendMsg(tmpMsg, self.msgType)
                                    if commandStr in ['finish']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(
                                            pandaIDs, commentStr, '52', True)
                                    elif commandStr in ['reassign']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(
                                            pandaIDs, commentStr, '51', True)
                                    else:
                                        # normal kill
                                        tmpRet = self.taskBufferIF.killJobs(
                                            pandaIDs, commentStr, '50', True)
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry', 'incexec']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                                jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles', 'fixedSandbox']:
                                taskParamMap.pop(newKey, None)
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey, newVal in iteritems(newParamMap):
                                if newVal is None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and re.search(
                                            '^-a [^ ]+$', tmpParam['value']) is not None:
                                        tmpParam['value'] = '-a {0}'.format(
                                            taskParamMap['fixedSandbox'])
                                # build
                                if 'buildSpec' in taskParamMap:
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if 'mergeSpec' in taskParamMap:
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+', '-a {0}'.format(taskParamMap['fixedSandbox']),
                                               taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(
                                jediTaskID, strTaskParams)
                            if tmpRet is not True:
                                tmpLog.error('failed to update task params')
                                continue
                        except Exception as e:
                            tmpLog.error(
                                'failed to change task params with {} {}'.
                                format(str(e), traceback.format_exc()))
                            continue
                    # the comment may be None (the kill/finish/reassign
                    # branch guards for that); treat it as empty so the
                    # substring checks below cannot raise TypeError and
                    # abort the rest of the batch
                    flagSource = commentStr if commentStr is not None else ''
                    # retry child tasks
                    retryChildTasks = 'sole ' not in flagSource
                    # discard events
                    discardEvents = 'discard ' in flagSource
                    # release un-staged files
                    releaseUnstaged = 'staged ' in flagSource
                    tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(
                        jediTaskID,
                        commandStr,
                        retryChildTasks=retryChildTasks,
                        discardEvents=discardEvents,
                        release_unstaged=releaseUnstaged)
                    if tmpRet is True:
                        tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except Exception as e:
            errStr = '{} failed in runImpl() with {} {} '.format(
                self.__class__.__name__, str(e), traceback.format_exc())
            logger.error(errStr)
def runImpl(self):
    """Feed dataset contents for tasks taken from ``self.taskDsList``.

    Tasks are fetched in batches of 10. Each entry is a pair
    ``(jediTaskID, dsList)``. For every task the method loads the task
    spec and task parameters, then for each dataset in ``dsList`` obtains
    metadata and the file list (from the DDM interface, or a generated
    dummy list for pseudo datasets) and inserts the files through
    ``self.taskBufferIF.insertFilesForDataset_JEDI``. Afterwards the task
    status is updated: ``'tobroken'`` when a fatal problem was seen,
    on hold when inputs are missing or temporarily unavailable, or the
    regular update when every dataset was processed.

    Returns None once ``self.taskDsList.get`` yields an empty batch. Any
    exception escaping a batch is logged with the module-level ``logger``
    and the loop moves on to the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskDsList = self.taskDsList.get(nTasks)
            # no more datasets
            if len(taskDsList) == 0:
                self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                return
            # loop over all tasks
            for jediTaskID, dsList in taskDsList:
                allUpdated = True
                taskBroken = False
                taskOnHold = False
                runningTask = False
                missingMap = {}
                # make logger
                tmpLog = MsgWrapper(self.logger, '<jediTaskID={0}>'.format(jediTaskID))
                # get task
                tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, self.pid, 10)
                if not tmpStat or taskSpec is None:
                    tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                    continue
                try:
                    # get task parameters
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__, errvalue))
                    taskBroken = True
                    # without this the lookups below would hit an unbound
                    # name (first task) or silently reuse the previous
                    # task's parameters
                    taskParamMap = {}
                # renaming of parameters
                if 'nEventsPerInputFile' in taskParamMap:
                    taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                # the number of files per job
                nFilesPerJob = None
                if 'nFilesPerJob' in taskParamMap:
                    nFilesPerJob = taskParamMap['nFilesPerJob']
                # the number of chunks used by scout
                nChunksForScout = 10
                # load XML (skipped for a broken task: the parameter map is empty then)
                if not taskBroken and taskSpec.useLoadXML():
                    xmlConfig = taskParamMap['loadXML']
                else:
                    xmlConfig = None
                # check no wait
                noWaitParent = False
                if taskSpec.noWaitParent() and taskSpec.parent_tid not in [None, taskSpec.jediTaskID]:
                    tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                    if tmpStat == 'running':
                        noWaitParent = True
                # loop over all datasets
                nFilesMaster = 0
                checkedMaster = False
                setFrozenTime = True
                if not taskBroken:
                    ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                    origNumFiles = None
                    if 'nFiles' in taskParamMap:
                        origNumFiles = taskParamMap['nFiles']
                    for datasetSpec in dsList:
                        tmpLog.info('start loop for {0}(id={1})'.format(datasetSpec.datasetName, datasetSpec.datasetID))
                        # get dataset metadata
                        tmpLog.info('get metadata')
                        gotMetadata = False
                        stateUpdateTime = datetime.datetime.utcnow()
                        try:
                            if not datasetSpec.isPseudo():
                                tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                            else:
                                # dummy metadata for pseudo dataset
                                tmpMetadata = {'state': 'closed'}
                            # set mutable when parent is running and the dataset is open
                            if noWaitParent and tmpMetadata['state'] == 'open':
                                # dummy metadata when parent is running
                                tmpMetadata = {'state': 'mutable'}
                            gotMetadata = True
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(self.__class__.__name__,
                                                                                        errtype.__name__, errvalue))
                            if errtype == Interaction.JEDIFatalError:
                                # fatal error
                                datasetStatus = 'broken'
                                taskBroken = True
                                # update dataset status
                                self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            else:
                                # temporary error
                                taskOnHold = True
                            taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                            allUpdated = False
                        else:
                            # get file list specified in task parameters
                            fileList, includePatt, excludePatt = RefinerUtils.extractFileList(taskParamMap,
                                                                                              datasetSpec.datasetName)
                            # get the number of events in metadata
                            if 'getNumEventsInMetadata' in taskParamMap:
                                getNumEvents = True
                            else:
                                getNumEvents = False
                            # get file list from DDM
                            tmpLog.info('get files')
                            try:
                                useInFilesWithNewAttemptNr = False
                                skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                if not datasetSpec.isPseudo():
                                    if fileList != [] and 'useInFilesInContainer' in taskParamMap and \
                                            datasetSpec.containerName not in ['', None]:
                                        # read files from container if file list is specified in task parameters
                                        tmpDatasetName = datasetSpec.containerName
                                    else:
                                        tmpDatasetName = datasetSpec.datasetName
                                    tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                     getNumEvents=getNumEvents,
                                                                     skipDuplicate=skipDuplicate
                                                                     )
                                    tmpLog.info('got {0} files in {1}'.format(len(tmpRet), tmpDatasetName))
                                    # remove lost files
                                    tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                    if tmpLostFiles != {}:
                                        tmpLog.info('found {0} lost files in {1}'.format(len(tmpLostFiles), tmpDatasetName))
                                        for tmpListGUID, tmpLostLFN in tmpLostFiles.items():
                                            tmpLog.info('removed {0}'.format(tmpLostLFN))
                                            del tmpRet[tmpListGUID]
                                else:
                                    if not taskSpec.useListPFN():
                                        # dummy file list for pseudo dataset
                                        tmpRet = {str(uuid.uuid4()): {'lfn': 'pseudo_lfn',
                                                                      'scope': None,
                                                                      'filesize': 0,
                                                                      'checksum': None,
                                                                      }
                                                  }
                                    else:
                                        # make dummy file list for PFN list
                                        if 'nFiles' in taskParamMap:
                                            nPFN = taskParamMap['nFiles']
                                        else:
                                            nPFN = 1
                                        tmpRet = {}
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {'lfn': '{0:06d}:{1}'.format(iPFN, taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                         'scope': None,
                                                                         'filesize': 0,
                                                                         'checksum': None,
                                                                         }
                            except:
                                errtype, errvalue = sys.exc_info()[:2]
                                # three placeholders for three arguments, so that
                                # the error value is no longer dropped
                                tmpLog.error('{0} failed to get files due to {1}:{2}'.format(self.__class__.__name__,
                                                                                             errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                else:
                                    # temporary error
                                    taskOnHold = True
                                taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                allUpdated = False
                            else:
                                # the number of events per file
                                nEventsPerFile = None
                                nEventsPerJob = None
                                nEventsPerRange = None
                                if (datasetSpec.isMaster() and 'nEventsPerFile' in taskParamMap) or \
                                        (datasetSpec.isPseudo() and 'nEvents' in taskParamMap):
                                    if 'nEventsPerFile' in taskParamMap:
                                        nEventsPerFile = taskParamMap['nEventsPerFile']
                                    elif datasetSpec.isPseudo() and 'nEvents' in taskParamMap:
                                        # use nEvents as nEventsPerFile for pseudo input
                                        nEventsPerFile = taskParamMap['nEvents']
                                    if 'nEventsPerJob' in taskParamMap:
                                        nEventsPerJob = taskParamMap['nEventsPerJob']
                                    elif 'nEventsPerRange' in taskParamMap:
                                        nEventsPerRange = taskParamMap['nEventsPerRange']
                                # max attempts
                                maxAttempt = None
                                if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                    # max attempts
                                    if taskSpec.disableAutoRetry():
                                        # disable auto retry
                                        maxAttempt = 1
                                    elif 'maxAttempt' in taskParamMap:
                                        maxAttempt = taskParamMap['maxAttempt']
                                    else:
                                        # use default value
                                        maxAttempt = 3
                                # first event number
                                firstEventNumber = None
                                if datasetSpec.isMaster():
                                    # first event number
                                    firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                # nMaxEvents
                                nMaxEvents = None
                                if datasetSpec.isMaster() and 'nEvents' in taskParamMap:
                                    nMaxEvents = taskParamMap['nEvents']
                                # nMaxFiles
                                nMaxFiles = None
                                if 'nFiles' in taskParamMap:
                                    if datasetSpec.isMaster():
                                        nMaxFiles = taskParamMap['nFiles']
                                    else:
                                        # calculate for secondary
                                        nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                        # multipled by the number of jobs per file for event-level splitting
                                        if nMaxFiles is not None and 'nEventsPerFile' in taskParamMap:
                                            if 'nEventsPerJob' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                            elif 'nEventsPerRange' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                # use scout
                                useScout = False
                                if datasetSpec.isMaster() and taskSpec.useScout() and datasetSpec.status != 'toupdate':
                                    useScout = True
                                # use files with new attempt numbers
                                useFilesWithNewAttemptNr = False
                                if not datasetSpec.isPseudo() and fileList != [] and 'useInFilesWithNewAttemptNr' in taskParamMap:
                                    useFilesWithNewAttemptNr = True
                                # feed files to the contents table
                                tmpLog.info('update contents')
                                retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec, tmpRet,
                                                                                                                             tmpMetadata['state'],
                                                                                                                             stateUpdateTime,
                                                                                                                             nEventsPerFile,
                                                                                                                             nEventsPerJob,
                                                                                                                             maxAttempt,
                                                                                                                             firstEventNumber,
                                                                                                                             nMaxFiles,
                                                                                                                             nMaxEvents,
                                                                                                                             useScout,
                                                                                                                             fileList,
                                                                                                                             useFilesWithNewAttemptNr,
                                                                                                                             nFilesPerJob,
                                                                                                                             nEventsPerRange,
                                                                                                                             nChunksForScout,
                                                                                                                             includePatt,
                                                                                                                             excludePatt,
                                                                                                                             xmlConfig,
                                                                                                                             noWaitParent,
                                                                                                                             taskSpec.parent_tid,
                                                                                                                             self.pid)
                                if retDB == False:
                                    taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                     diagMap['errMsg']))
                                    allUpdated = False
                                    taskBroken = True
                                    break
                                elif retDB is None:
                                    # the dataset is locked by another or status is not applicable
                                    allUpdated = False
                                    tmpLog.info('escape since task or dataset is locked')
                                    break
                                elif missingFileList != []:
                                    # files are missing
                                    tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList), datasetSpec.datasetName)
                                    tmpLog.info(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    allUpdated = False
                                    taskOnHold = True
                                    missingMap[datasetSpec.datasetName] = {'datasetSpec': datasetSpec,
                                                                           'missingFiles': missingFileList}
                                else:
                                    # reduce the number of files to be read
                                    if 'nFiles' in taskParamMap:
                                        if datasetSpec.isMaster():
                                            taskParamMap['nFiles'] -= nFilesUnique
                                    # reduce the number of files for scout
                                    if useScout:
                                        nChunksForScout = diagMap['nChunksForScout']
                                    # number of master input files
                                    if datasetSpec.isMaster():
                                        checkedMaster = True
                                        nFilesMaster += nFilesUnique
                                # running task
                                if diagMap['isRunningTask']:
                                    runningTask = True
                                # no activated pending input for noWait
                                if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0):
                                    tmpErrStr = 'insufficient inputs are ready'
                                    tmpLog.info(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    taskOnHold = True
                                    setFrozenTime = False
                                    break
                        tmpLog.info('end loop')
                # no master input
                if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                    tmpErrStr = 'no master input files. input dataset is empty'
                    tmpLog.error(tmpErrStr)
                    taskSpec.setErrDiag(tmpErrStr, None)
                    if taskSpec.allowEmptyInput() or noWaitParent:
                        taskOnHold = True
                    else:
                        taskBroken = True
                # update task status
                if taskBroken:
                    # task is broken
                    taskSpec.status = 'tobroken'
                    tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec, pid=self.pid)
                # change task status unless the task is running
                if not runningTask:
                    if taskOnHold:
                        if not noWaitParent:
                            # initialize task generator
                            taskGenerator = TaskGenerator(taskSpec.vo, taskSpec.prodSourceLabel)
                            tmpStat = taskGenerator.initializeMods(self.taskBufferIF,
                                                                   self.ddmIF.getInterface(taskSpec.vo))
                            if not tmpStat:
                                tmpErrStr = 'failed to initialize TaskGenerator'
                                tmpLog.error(tmpErrStr)
                                taskSpec.status = 'tobroken'
                                taskSpec.setErrDiag(tmpErrStr)
                            else:
                                # make parent tasks if necessary
                                tmpLog.info('make parent tasks with {0} (if necessary)'.format(taskGenerator.getClassName(taskSpec.vo,
                                                                                                                          taskSpec.prodSourceLabel)))
                                tmpStat = taskGenerator.doGenerate(taskSpec, taskParamMap, missingFilesMap=missingMap)
                                if tmpStat == Interaction.SC_FATAL:
                                    # failed to make parent tasks
                                    taskSpec.status = 'tobroken'
                                    tmpLog.error('failed to make parent tasks')
                        # go to pending state
                        if taskSpec.status not in ['broken', 'tobroken']:
                            taskSpec.setOnHold()
                        tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec, pid=self.pid,
                                                                                     setFrozenTime=setFrozenTime)
                    elif allUpdated:
                        # all OK
                        allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, getTaskStatus=True,
                                                                                                    pid=self.pid)
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                tmpLog.info('done')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def runImpl(self):
    """Refine tasks taken from ``self.taskList`` until the list is exhausted.

    Tasks are fetched in batches of 10. Each entry is a tuple
    ``(jediTaskID, splitRule, taskStatus, parent_tid)``. For every task the
    method decodes the task parameters, instantiates a refiner
    implementation through ``self.implFactory``, extracts common
    parameters, checks attribute length and parent state, calls
    ``impl.doRefine`` and finally registers the result through
    ``self.taskBufferIF``. A task that fails a step is updated with
    status ``'tobroken'``.

    Returns None once ``self.taskList.get`` yields an empty batch. Any
    exception escaping a batch is logged with the module-level ``logger``
    and the loop moves on to the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, splitRule, taskStatus, parent_tid in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                tmpLog.debug('start')
                tmpStat = Interaction.SC_SUCCEEDED
                errStr = ''
                # reset per task: otherwise a failure before the refiner is
                # instantiated leaves impl unbound (first task) or pointing
                # at the previous task's refiner, and the register step
                # below would then update the wrong task
                impl = None
                # read task parameters
                try:
                    taskParam = None
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__, errvalue)
                    tmpLog.debug(taskParam)
                    tmpLog.error(errStr)
                    # the task is skipped without any status update
                    continue
                # get impl
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('getting Impl')
                    try:
                        # get VO and sourceLabel
                        vo = taskParamMap['vo']
                        prodSourceLabel = taskParamMap['prodSourceLabel']
                        taskType = taskParamMap['taskType']
                        tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo, prodSourceLabel, taskType))
                        # get impl
                        impl = self.implFactory.instantiateImpl(vo, prodSourceLabel, taskType,
                                                                self.taskBufferIF, self.ddmIF)
                        if impl is None:
                            # task refiner is undefined
                            errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo, prodSourceLabel)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # extract common parameters
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('extracting common')
                    try:
                        # initalize impl
                        impl.initializeRefiner(tmpLog)
                        impl.oldTaskStatus = taskStatus
                        # extract common parameters
                        impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                        # set parent tid
                        if parent_tid not in [None, jediTaskID]:
                            impl.taskSpec.parent_tid = parent_tid
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__, errvalue,
                                                                                               traceback.format_exc())
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # check attribute length
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('checking attribute length')
                    if not impl.taskSpec.checkAttrLength():
                        tmpLog.error(impl.taskSpec.errorDialog)
                        tmpStat = Interaction.SC_FAILED
                # check parent
                noWaitParent = False
                parentState = None
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if parent_tid not in [None, jediTaskID]:
                        tmpLog.info('check parent task')
                        try:
                            tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                            parentState = tmpStat
                            if tmpStat == 'completed':
                                # parent is done
                                tmpStat = Interaction.SC_SUCCEEDED
                            elif tmpStat == 'running':
                                if not impl.taskSpec.noWaitParent():
                                    # parent is running
                                    errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                    impl.taskSpec.status = taskStatus
                                    impl.taskSpec.setOnHold()
                                    impl.taskSpec.setErrDiag(errStr)
                                    tmpLog.info(errStr)
                                    self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                                      oldStatus=[taskStatus], setFrozenTime=False)
                                    continue
                                else:
                                    # not wait for parent
                                    tmpStat = Interaction.SC_SUCCEEDED
                                    noWaitParent = True
                            else:
                                # parent is corrupted
                                tmpStat = Interaction.SC_FAILED
                                tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                impl.taskSpec.setErrDiag(tmpErrStr)
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # refine
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                    try:
                        tmpStat = impl.doRefine(jediTaskID, taskParamMap)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        # wait unknown input if noWaitParent or waitInput
                        if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput())
                                and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                or errtype == Interaction.JEDITemporaryError:
                            if impl.taskSpec.noWaitParent() or parentState == 'running':
                                tmpErrStr = 'pending until parent produces input'
                                setFrozenTime = False
                            elif errtype == Interaction.JEDITemporaryError:
                                tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                setFrozenTime = True
                            else:
                                tmpErrStr = 'pending until input is staged'
                                setFrozenTime = True
                            impl.taskSpec.status = taskStatus
                            impl.taskSpec.setOnHold()
                            impl.taskSpec.setErrDiag(tmpErrStr)
                            tmpLog.info(tmpErrStr)
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                              oldStatus=[taskStatus],
                                                              insertUnknown=impl.unknownDatasetList,
                                                              setFrozenTime=setFrozenTime)
                            continue
                        else:
                            errStr = 'failed to refine task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # register
                if tmpStat != Interaction.SC_SUCCEEDED:
                    tmpLog.error('failed to refine the task')
                    if impl is None or impl.taskSpec is None:
                        # no usable spec from the refiner: build a bare one
                        tmpTaskSpec = JediTaskSpec()
                        tmpTaskSpec.jediTaskID = jediTaskID
                    else:
                        tmpTaskSpec = impl.taskSpec
                    tmpTaskSpec.status = 'tobroken'
                    if errStr != '':
                        tmpTaskSpec.setErrDiag(errStr, True)
                    self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': tmpTaskSpec.jediTaskID},
                                                      oldStatus=[taskStatus])
                else:
                    tmpLog.info('registering')
                    # fill JEDI tables
                    try:
                        # enable protection against task duplication
                        if 'uniqueTaskName' in taskParamMap and taskParamMap['uniqueTaskName'] and \
                                not impl.taskSpec.checkPreProcessed():
                            uniqueTaskName = True
                        else:
                            uniqueTaskName = False
                        strTaskParams = None
                        if impl.updatedTaskParams is not None:
                            strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                        if taskStatus == 'registered':
                            # unset pre-process flag
                            if impl.taskSpec.checkPreProcessed():
                                impl.taskSpec.setPostPreProcess()
                            # full registration
                            tmpStat, newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID, impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus)
                            if not tmpStat:
                                tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                tmpLog.error(tmpErrStr)
                                impl.taskSpec.status = newTaskStatus
                                impl.taskSpec.setErrDiag(tmpErrStr, True)
                                self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                                  oldStatus=[taskStatus])
                            tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        else:
                            # disable scouts if previous attempt didn't use it
                            if not impl.taskSpec.useScout(splitRule):
                                impl.taskSpec.setUseScout(False)
                            # update task with new params
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                              oldStatus=[taskStatus])
                            # appending for incremental execution
                            tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID, impl.inMasterDatasetSpec,
                                                                            impl.inSecDatasetSpecList)
                            if not tmpStat:
                                tmpLog.error('failed to append datasets for incexec')
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(tmpErrStr)
                    else:
                        tmpLog.info('done')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def run(self):
    """Run one cycle of jumbo-job management for this component.

    Steps, in order:

    1. Take the process lock through ``lockProcess_JEDI``; return at once
       when the lock is not obtained.
    2. Read the ``JUMBO_*`` settings through ``getConfigValue``, using the
       built-in defaults below for values that come back as ``None``.
    3. For every task returned by ``getTaskWithJumbo_JEDI``: disable jumbo
       when too few events remain, raise the priority of tasks whose jumbo
       is disabled, boost tasks that pass the progress check, kick tasks
       without active jumbo jobs, and reset inputs of boosted tasks.
    4. Check the tasks returned by ``getTaskToEnableJumbo_JEDI`` and enable
       jumbo while the task/event budgets allow it.  With ``self.dryRun``
       set nothing is enabled; suitable tasks are only counted and logged.

    Returns ``None``.  Any exception is caught and written to ``self.log``
    together with its traceback.
    """

    def configValue(key, default):
        # one config lookup; fall back to the default when nothing is set
        value = self.taskBufferIF.getConfigValue(self.component, key, 'jedi', self.vo)
        return default if value is None else value

    # job states counted as "active" when deciding whether to kick a task
    activeJobStates = ('defined', 'assigned', 'activated', 'sent',
                       'starting', 'running', 'transferring', 'holding')
    try:
        # get process lock
        locked = self.taskBufferIF.lockProcess_JEDI(
            vo=self.vo,
            prodSourceLabel=self.prodSourceLabel,
            cloud=None,
            workqueue_id=None,
            resource_name=None,
            component=self.component,
            pid=self.pid,
            timeLimit=10)
        if not locked:
            self.log.debug('component={0} skipped since locked by another'.format(self.component))
            return
        # get parameters for conversion
        self.log.debug('component={0} start'.format(self.component))
        maxTasks = configValue('JUMBO_MAX_TASKS', 1)
        nEventsToDisable = configValue('JUMBO_MIN_EVENTS_DISABLE', 100000)
        # the next two defaults are derived from values read above, so they
        # are computed only when the setting is really missing
        nEventsToEnable = configValue('JUMBO_MIN_EVENTS_ENABLE', None)
        if nEventsToEnable is None:
            nEventsToEnable = nEventsToDisable * 10
        maxEvents = configValue('JUMBO_MAX_EVENTS', None)
        if maxEvents is None:
            maxEvents = maxTasks * nEventsToEnable // 2
        nJumboPerTask = configValue('JUMBO_PER_TASK', 1)
        nJumboPerSite = configValue('JUMBO_PER_SITE', 1)
        maxPrio = configValue('JUMBO_MAX_CURR_PRIO', 500)
        progressToBoost = configValue('JUMBO_PROG_TO_BOOST', 95)
        maxFilesToBoost = configValue('JUMBO_MAX_FILES_TO_BOOST', 500)
        prioToBoost = 900
        prioWhenDisabled = configValue('JUMBO_PRIO_DISABLED', 500)
        # get current info
        tasksWithJumbo = self.taskBufferIF.getTaskWithJumbo_JEDI(self.vo, self.prodSourceLabel)
        totEvents = 0
        doneEvents = 0
        nTasks = 0
        disabledFlag = JediTaskSpec.enum_useJumbo['disabled']
        for jediTaskID, taskData in iteritems(tasksWithJumbo):
            nEventsToProcess = taskData['nEvents'] - taskData['nEventsDone']
            nFilesRemaining = taskData['nFiles'] - taskData['nFilesDone']
            # disable jumbo
            if taskData['useJumbo'] != disabledFlag and taskData['site'] is None:
                if nEventsToProcess < nEventsToDisable:
                    # disable
                    self.log.info(
                        'component={0} disable jumbo in jediTaskID={1} due to n_events_to_process={2} < {3}'
                        .format(self.component, jediTaskID, nEventsToProcess, nEventsToDisable))
                    self.taskBufferIF.enableJumboJobs(jediTaskID, 0, 0)
                else:
                    # wait
                    nTasks += 1
                    totEvents += taskData['nEvents']
                    doneEvents += taskData['nEventsDone']
                    self.log.info(
                        'component={0} keep jumbo in jediTaskID={1} due to n_events_to_process={2} > {3}'
                        .format(self.component, jediTaskID, nEventsToProcess, nEventsToDisable))
            # increase priority for jumbo disabled
            if taskData['useJumbo'] == disabledFlag and taskData['currentPriority'] < prioWhenDisabled:
                self.taskBufferIF.changeTaskPriorityPanda(jediTaskID, prioWhenDisabled)
                self.log.info(
                    'component={0} priority boost to {1} after disabing jumbo in in jediTaskID={2}'
                    .format(self.component, prioWhenDisabled, jediTaskID))
            # increase priority when close to completion
            # NOTE(review): the test compares the percentage of *remaining*
            # events with progressToBoost, while the log message below talks
            # about events *done* exceeding that percentage - confirm which
            # one is intended before changing either.
            if taskData['nEvents'] > 0 \
                    and nEventsToProcess * 100 // taskData['nEvents'] < progressToBoost \
                    and taskData['currentPriority'] < prioToBoost \
                    and nFilesRemaining < maxFilesToBoost:
                # boost
                tmpStr = 'component={0} priority boost to {5} for jediTaskID={1} due to n_events_done={2} > {3}*{4}% '.format(
                    self.component, jediTaskID, taskData['nEventsDone'],
                    taskData['nEvents'], progressToBoost, prioToBoost)
                tmpStr += 'n_files_remaining={0} < {1}'.format(nFilesRemaining, maxFilesToBoost)
                self.log.info(tmpStr)
                self.taskBufferIF.changeTaskPriorityPanda(jediTaskID, prioToBoost)
            # kick pending
            if taskData['taskStatus'] in ['pending', 'running'] \
                    and taskData['useJumbo'] in [JediTaskSpec.enum_useJumbo['pending'],
                                                 JediTaskSpec.enum_useJumbo['running']]:
                nActiveJumbo = 0
                for computingSite, jobStatusMap in iteritems(taskData['jumboJobs']):
                    for jobStatus, nJobs in iteritems(jobStatusMap):
                        if jobStatus in activeJobStates:
                            nActiveJumbo += nJobs
                if nActiveJumbo == 0:
                    self.log.info(
                        'component={0} kick jumbo in {2} jediTaskID={1}'.format(
                            self.component, jediTaskID, taskData['taskStatus']))
                    self.taskBufferIF.kickPendingTasksWithJumbo_JEDI(jediTaskID)
            # reset input to re-generate co-jumbo
            if taskData['currentPriority'] >= prioToBoost:
                nReset = self.taskBufferIF.resetInputToReGenCoJumbo_JEDI(jediTaskID)
                if nReset is not None and nReset > 0:
                    self.log.info(
                        'component={0} reset {1} inputs to regenerate co-jumbo for jediTaskID={2}'
                        .format(self.component, nReset, jediTaskID))
                else:
                    self.log.debug(
                        'component={0} tried to reset inputs to regenerate co-jumbo with {1} for jediTaskID={2}'
                        .format(self.component, nReset, jediTaskID))
        self.log.info(
            'component={0} total_events={1} n_events_to_process={2} n_tasks={3} available for jumbo'
            .format(self.component, totEvents, totEvents - doneEvents, nTasks))
        # get list of releases and caches available at jumbo job enabled PQs
        jumboRels, jumboCaches = self.taskBufferIF.getRelCacheForJumbo_JEDI()
        # look for tasks to enable jumbo
        if self.dryRun:
            self.log.info(
                'component={0} look for tasks to enable jumbo in dry run mode'
                .format(self.component))
        else:
            self.log.info(
                'component={0} look for tasks to enable jumbo due to lack of tasks and events to meet max_tasks={1} max_events={2}'
                .format(self.component, maxTasks, maxEvents))
        tasksToEnableJumbo = self.taskBufferIF.getTaskToEnableJumbo_JEDI(
            self.vo, self.prodSourceLabel, maxPrio, nEventsToEnable)
        nGoodTasks = 0
        self.log.debug('component={0} got {1} tasks to check'.format(
            self.component, len(tasksToEnableJumbo)))
        # sort by nevents, largest first; sort-then-reverse is kept (rather
        # than reverse=True) so tasks with equal nEvents keep their old order
        nEventsMap = dict()
        for jediTaskID, taskData in iteritems(tasksToEnableJumbo):
            nEventsMap[jediTaskID] = taskData['nEvents']
        sortedList = sorted(nEventsMap.items(), key=operator.itemgetter(1))
        sortedList.reverse()
        for jediTaskID, _ in sortedList:
            taskData = tasksToEnableJumbo[jediTaskID]
            nEventsToProcess = taskData['nEvents'] - taskData['nEventsDone']
            # get task parameters
            try:
                taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                taskParamMap = RefinerUtils.decodeJSON(taskParam)
            except Exception:
                self.log.error(
                    'component={0} failed to get task params for jediTaskID={1}'
                    .format(self.component, jediTaskID))
                continue
            # a bare spec carrying only the split rule is enough for the
            # split-rule based checks below
            taskSpec = JediTaskSpec()
            taskSpec.splitRule = taskData['splitRule']
            # check if good for jumbo
            if 'esConvertible' not in taskParamMap or taskParamMap['esConvertible'] is False:
                self.log.info(
                    'component={0} skip to enable jumbo for jediTaskID={1} since not ES-convertible'
                    .format(self.component, jediTaskID))
                continue
            goodForPosEvtNum = (
                taskSpec.inFilePosEvtNum()
                or taskSpec.getNumFilesPerJob() == 1
                or (taskSpec.getNumEventsPerJob() is not None
                    and 'nEventsPerInputFile' in taskParamMap
                    and taskSpec.getNumEventsPerJob() <= taskParamMap['nEventsPerInputFile']))
            if not goodForPosEvtNum:
                self.log.info(
                    'component={0} skip to enable jumbo for jediTaskID={1} since not good for in-file positional event numbers'
                    .format(self.component, jediTaskID))
                continue
            # check software: when the last dash-separated field of transHome
            # is a plain x.y.z version it is looked up among the releases,
            # otherwise the full transHome is looked up among the caches
            transHome = taskData['transHome']
            cmtConfig = taskData['architecture']
            if re.search(r'^\d+\.\d+\.\d+$', transHome.split('-')[-1]) is not None:
                transHome = transHome.split('-')[-1]
                swDict = jumboRels
            else:
                swDict = jumboCaches
            key = (transHome, cmtConfig)
            if key not in swDict:
                self.log.info(
                    'component={0} skip to enable jumbo for jediTaskID={1} since {2}:{3} is unavailable at jumbo job enabled PQs'
                    .format(self.component, jediTaskID, transHome, cmtConfig))
                continue
            if not self.dryRun and nTasks < maxTasks and (totEvents - doneEvents) < maxEvents:
                self.log.info(
                    'component={0} enable jumbo in jediTaskID={1} with n_events_to_process={2}'
                    .format(self.component, jediTaskID, nEventsToProcess))
                if taskData['eventService'] == 0:
                    tmpS, tmpO = self.taskBufferIF.enableEventService(taskData['jediTaskID'])
                    if tmpS != 0:
                        self.log.error(
                            'component={0} failed to enable ES in jediTaskID={1} with {2}'
                            .format(self.component, jediTaskID, tmpO))
                        continue
                self.taskBufferIF.enableJumboJobs(taskData['jediTaskID'], nJumboPerTask, nJumboPerSite)
                # account for the new task so later candidates see the budget
                nTasks += 1
                totEvents += taskData['nEvents']
                doneEvents += taskData['nEventsDone']
            else:
                nGoodTasks += 1
                self.log.info(
                    'component={0} good to enable jumbo in jediTaskID={1} with n_events_to_process={2}'
                    .format(self.component, jediTaskID, nEventsToProcess))
        self.log.info(
            'component={0} there are n_good_tasks={1} tasks good for jumbo'
            .format(self.component, nGoodTasks))
        self.log.debug('component={0} done'.format(self.component))
    except Exception:
        # error
        errtype, errvalue = sys.exc_info()[:2]
        errStr = ": %s %s" % (errtype.__name__, errvalue)
        errStr += traceback.format_exc()
        self.log.error(errStr)
def runImpl(self):
    """Feed dataset contents into the task buffer for batches of tasks.

    Repeatedly takes up to 10 ``(jediTaskID, dsList)`` items from
    ``self.taskDsList`` and returns when an empty batch comes back.  For each
    task it reads the task spec and task parameters, then for every dataset
    fetches metadata and the file list through the DDM interface and passes
    them to ``insertFilesForDataset_JEDI``.  Afterwards the task status is
    updated through ``updateTaskStatusByContFeeder_JEDI``: ``tobroken`` when
    something failed fatally, on-hold when inputs are missing or temporarily
    unavailable, or the regular transition when every dataset was updated.

    Returns ``None``.  Exceptions escaping a batch are logged and the loop
    moves on to the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskDsList = self.taskDsList.get(nTasks)
            # no more datasets
            if len(taskDsList) == 0:
                self.logger.debug("%s terminating since no more items" % self.__class__.__name__)
                return
            # loop over all tasks
            for jediTaskID, dsList in taskDsList:
                allUpdated = True
                taskBroken = False
                taskOnHold = False
                runningTask = False
                missingMap = {}
                # make logger
                tmpLog = MsgWrapper(self.logger, "<jediTaskID={0}>".format(jediTaskID))
                # get task
                tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, None, 10)
                if not tmpStat or taskSpec is None:
                    tmpLog.error("failed to get taskSpec for jediTaskID={0}".format(jediTaskID))
                    continue
                # Start from an empty map: when decoding fails the lookups
                # below must neither hit an unbound name nor reuse the
                # parameters of the previous task, so that this task can
                # still be set to tobroken further down.
                taskParamMap = {}
                try:
                    # get task parameters
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error(
                        "task param conversion from json failed with {0}:{1}".format(errtype.__name__, errvalue)
                    )
                    taskBroken = True
                # renaming of parameters
                if "nEventsPerInputFile" in taskParamMap:
                    taskParamMap["nEventsPerFile"] = taskParamMap["nEventsPerInputFile"]
                # the number of files per job
                nFilesPerJob = taskParamMap.get("nFilesPerJob")
                # the number of files used by scout
                if nFilesPerJob is not None:
                    nFilesForScout = 10 * nFilesPerJob
                else:
                    nFilesForScout = 10
                # load XML (pointless when the parameters could not be read)
                xmlConfig = None
                if not taskBroken and taskSpec.useLoadXML():
                    try:
                        loadXML = taskParamMap["loadXML"]
                        xmlConfig = ParseJobXML.dom_parser(xmlStr=loadXML)
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpLog.error("failed to load XML config with {0}:{1}".format(errtype.__name__, errvalue))
                        taskBroken = True
                # check no wait
                noWaitParent = False
                if taskSpec.noWaitParent() and taskSpec.parent_tid not in [None, taskSpec.jediTaskID]:
                    tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                    if tmpStat == "running":
                        noWaitParent = True
                # loop over all datasets
                nFilesMaster = 0
                if not taskBroken:
                    ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                    # remember the requested number before it is decremented
                    # per dataset below
                    origNumFiles = taskParamMap.get("nFiles")
                    for datasetSpec in dsList:
                        tmpLog.info(
                            "start loop for {0}(id={1})".format(datasetSpec.datasetName, datasetSpec.datasetID)
                        )
                        # get dataset metadata
                        tmpLog.info("get metadata")
                        stateUpdateTime = datetime.datetime.utcnow()
                        try:
                            if not datasetSpec.isPseudo():
                                tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                            else:
                                # dummy metadata for pseudo dataset
                                tmpMetadata = {"state": "closed"}
                            # set mutable when parent is running and the dataset is open
                            if noWaitParent and tmpMetadata["state"] == "open":
                                # dummy metadata when parent is running
                                tmpMetadata = {"state": "mutable"}
                        except Exception:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error(
                                "{0} failed to get metadata to {1}:{2}".format(
                                    self.__class__.__name__, errtype.__name__, errvalue
                                )
                            )
                            if errtype == Interaction.JEDIFatalError:
                                # fatal error
                                datasetStatus = "broken"
                                taskBroken = True
                                # update dataset status
                                self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            else:
                                # temporary error
                                taskOnHold = True
                            taskSpec.setErrDiag("failed to get metadata for {0}".format(datasetSpec.datasetName))
                            allUpdated = False
                        else:
                            # get file list specified in task parameters
                            fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                taskParamMap, datasetSpec.datasetName
                            )
                            # get the number of events in metadata
                            getNumEvents = "getNumEventsInMetadata" in taskParamMap
                            # get file list from DDM
                            tmpLog.info("get files")
                            try:
                                skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                if not datasetSpec.isPseudo():
                                    if (
                                        fileList != []
                                        and "useInFilesInContainer" in taskParamMap
                                        and datasetSpec.containerName not in ["", None]
                                    ):
                                        # read files from container if file list is specified in task parameters
                                        tmpDatasetName = datasetSpec.containerName
                                    else:
                                        tmpDatasetName = datasetSpec.datasetName
                                    tmpRet = ddmIF.getFilesInDataset(
                                        tmpDatasetName, getNumEvents=getNumEvents, skipDuplicate=skipDuplicate
                                    )
                                    # remove lost files
                                    tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                    if tmpLostFiles != {}:
                                        tmpLog.info(
                                            "found {0} lost files in {1}".format(len(tmpLostFiles), tmpDatasetName)
                                        )
                                        for tmpListGUID, tmpLostLFN in tmpLostFiles.items():
                                            tmpLog.info("removed {0}".format(tmpLostLFN))
                                            del tmpRet[tmpListGUID]
                                elif not taskSpec.useListPFN():
                                    # dummy file list for pseudo dataset
                                    tmpRet = {
                                        str(uuid.uuid4()): {
                                            "lfn": "pseudo_lfn",
                                            "scope": None,
                                            "filesize": 0,
                                            "checksum": None,
                                        }
                                    }
                                else:
                                    # make dummy file list for PFN list
                                    nPFN = taskParamMap.get("nFiles", 1)
                                    tmpRet = {}
                                    for iPFN in range(nPFN):
                                        tmpRet[str(uuid.uuid4())] = {
                                            "lfn": "{0:06d}:{1}".format(
                                                iPFN, taskParamMap["pfnList"][iPFN].split("/")[-1]
                                            ),
                                            "scope": None,
                                            "filesize": 0,
                                            "checksum": None,
                                        }
                            except Exception:
                                errtype, errvalue = sys.exc_info()[:2]
                                # two placeholders, two arguments: the error
                                # value must not be dropped from the message
                                tmpLog.error(
                                    "failed to get files due to {0}:{1}".format(errtype.__name__, errvalue)
                                )
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = "broken"
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                else:
                                    # temporary error
                                    taskOnHold = True
                                taskSpec.setErrDiag("failed to get files for {0}".format(datasetSpec.datasetName))
                                allUpdated = False
                            else:
                                # the number of events per file
                                nEventsPerFile = None
                                nEventsPerJob = None
                                nEventsPerRange = None
                                if (datasetSpec.isMaster() and "nEventsPerFile" in taskParamMap) or (
                                    datasetSpec.isPseudo() and "nEvents" in taskParamMap
                                ):
                                    if "nEventsPerFile" in taskParamMap:
                                        nEventsPerFile = taskParamMap["nEventsPerFile"]
                                    elif datasetSpec.isPseudo() and "nEvents" in taskParamMap:
                                        # use nEvents as nEventsPerFile for pseudo input
                                        nEventsPerFile = taskParamMap["nEvents"]
                                    if "nEventsPerJob" in taskParamMap:
                                        nEventsPerJob = taskParamMap["nEventsPerJob"]
                                    elif "nEventsPerRange" in taskParamMap:
                                        nEventsPerRange = taskParamMap["nEventsPerRange"]
                                # max attempts and first event number
                                maxAttempt = None
                                firstEventNumber = None
                                if datasetSpec.isMaster():
                                    # max attempts
                                    if taskSpec.disableAutoRetry():
                                        # disable auto retry
                                        maxAttempt = 1
                                    elif "maxAttempt" in taskParamMap:
                                        maxAttempt = taskParamMap["maxAttempt"]
                                    else:
                                        # use default value
                                        maxAttempt = 3
                                    # first event number
                                    firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                # nMaxEvents
                                nMaxEvents = None
                                if datasetSpec.isMaster() and "nEvents" in taskParamMap:
                                    nMaxEvents = taskParamMap["nEvents"]
                                # nMaxFiles
                                nMaxFiles = None
                                if "nFiles" in taskParamMap:
                                    if datasetSpec.isMaster():
                                        nMaxFiles = taskParamMap["nFiles"]
                                    else:
                                        # calculate for secondary
                                        nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                        # multipled by the number of jobs per file for event-level splitting
                                        # NOTE(review): scaling applied to secondary datasets only - confirm
                                        if nMaxFiles is not None and "nEventsPerFile" in taskParamMap:
                                            if "nEventsPerJob" in taskParamMap:
                                                if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerJob"]:
                                                    nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                        taskParamMap["nEventsPerJob"]
                                                    )
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                            elif "nEventsPerRange" in taskParamMap:
                                                if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerRange"]:
                                                    nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                        taskParamMap["nEventsPerRange"]
                                                    )
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                # use scout
                                useScout = False
                                if datasetSpec.isMaster() and taskSpec.useScout():
                                    useScout = True
                                # use files with new attempt numbers
                                useFilesWithNewAttemptNr = False
                                if (
                                    not datasetSpec.isPseudo()
                                    and fileList != []
                                    and "useInFilesWithNewAttemptNr" in taskParamMap
                                ):
                                    useFilesWithNewAttemptNr = True
                                # feed files to the contents table
                                tmpLog.info("update contents")
                                retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                    datasetSpec,
                                    tmpRet,
                                    tmpMetadata["state"],
                                    stateUpdateTime,
                                    nEventsPerFile,
                                    nEventsPerJob,
                                    maxAttempt,
                                    firstEventNumber,
                                    nMaxFiles,
                                    nMaxEvents,
                                    useScout,
                                    fileList,
                                    useFilesWithNewAttemptNr,
                                    nFilesPerJob,
                                    nEventsPerRange,
                                    nFilesForScout,
                                    includePatt,
                                    excludePatt,
                                    xmlConfig,
                                    noWaitParent,
                                    taskSpec.parent_tid,
                                )
                                # equality tests (not identity) are kept on
                                # purpose: the exact return type is not
                                # visible from here
                                if retDB == False:
                                    taskSpec.setErrDiag(
                                        "failed to insert files for {0}. {1}".format(
                                            datasetSpec.datasetName, diagMap["errMsg"]
                                        )
                                    )
                                    allUpdated = False
                                    taskBroken = True
                                    break
                                elif retDB == None:
                                    # the dataset is locked by another or status is not applicable
                                    allUpdated = False
                                elif missingFileList != []:
                                    # files are missing
                                    tmpErrStr = "{0} files missing in {1}".format(
                                        len(missingFileList), datasetSpec.datasetName
                                    )
                                    tmpLog.info(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    allUpdated = False
                                    taskOnHold = True
                                    missingMap[datasetSpec.datasetName] = {
                                        "datasetSpec": datasetSpec,
                                        "missingFiles": missingFileList,
                                    }
                                else:
                                    # reduce the number of files to be read
                                    if "nFiles" in taskParamMap:
                                        if datasetSpec.isMaster():
                                            taskParamMap["nFiles"] -= nFilesUnique
                                    # reduce the number of files for scout
                                    if useScout:
                                        nFilesForScout = diagMap["nFilesForScout"]
                                    # number of master input files
                                    if datasetSpec.isMaster():
                                        nFilesMaster += nFilesUnique
                                # running task
                                if diagMap["isRunningTask"]:
                                    runningTask = True
                                # no activated pending input for noWait
                                if noWaitParent and diagMap["nActivatedPending"] == 0:
                                    tmpErrStr = "insufficient inputs are ready"
                                    tmpLog.info(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    taskOnHold = True
                        tmpLog.info("end loop")
                # no mater input
                if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0:
                    tmpErrStr = "no master input files. input dataset is empty"
                    tmpLog.error(tmpErrStr)
                    taskSpec.setErrDiag(tmpErrStr, None)
                    if taskSpec.allowEmptyInput() or noWaitParent:
                        taskOnHold = True
                    else:
                        taskBroken = True
                # update task status
                if taskBroken:
                    # task is broken
                    taskSpec.status = "tobroken"
                    tmpMsg = "set task.status={0}".format(taskSpec.status)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                # change task status unless the task is running
                if not runningTask:
                    if taskOnHold:
                        if not noWaitParent:
                            # initialize task generator
                            taskGenerator = TaskGenerator(taskSpec.vo, taskSpec.prodSourceLabel)
                            tmpStat = taskGenerator.initializeMods(
                                self.taskBufferIF, self.ddmIF.getInterface(taskSpec.vo)
                            )
                            if not tmpStat:
                                tmpErrStr = "failed to initialize TaskGenerator"
                                tmpLog.error(tmpErrStr)
                                taskSpec.status = "tobroken"
                                taskSpec.setErrDiag(tmpErrStr)
                            else:
                                # make parent tasks if necessary
                                tmpLog.info(
                                    "make parent tasks with {0} (if necessary)".format(
                                        taskGenerator.getClassName(taskSpec.vo, taskSpec.prodSourceLabel)
                                    )
                                )
                                tmpStat = taskGenerator.doGenerate(
                                    taskSpec, taskParamMap, missingFilesMap=missingMap
                                )
                                if tmpStat == Interaction.SC_FATAL:
                                    # failed to make parent tasks
                                    taskSpec.status = "tobroken"
                                    tmpLog.error("failed to make parent tasks")
                        # go to pending state
                        if taskSpec.status not in ["broken", "tobroken"]:
                            taskSpec.setOnHold()
                        tmpMsg = "set task.status={0}".format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                    elif allUpdated:
                        # all OK
                        allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, getTaskStatus=True
                        )
                        tmpMsg = "set task.status={0}".format(newTaskStatus)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                tmpLog.info("done")
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error(
                "{0} failed in runImpl() with {1}:{2}".format(self.__class__.__name__, errtype.__name__, errvalue)
            )
def runImpl(self):
    """Execute queued task commands until the shared list is empty.

    Repeatedly takes up to 10 ``(jediTaskID, commandMap)`` items from
    ``self.taskList`` and returns when an empty batch comes back.
    ``commandMap`` supplies ``command``, ``comment`` and ``oldStatus``.

    * ``kill`` / ``finish`` / ``reassign``: when the task still has active
      PandaIDs a kill command is sent for them; when none are left the task
      record is updated (for ``reassign`` the comment is split on ``:`` into
      target kind, target name and a ``y`` flag that sends the task back to
      ``oldStatus``).
    * ``retry`` / ``incexec``: for ``incexec`` the stored task parameters
      are first merged with the JSON carried in the comment, then
      ``retryTask_JEDI`` is called.

    Returns ``None``.  Exceptions escaping a batch are logged and the loop
    moves on to the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, ' <jediTaskID={0}>'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill', 'finish', 'reassign']:
                    # get active PandaIDs to be killed
                    pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID, True)
                    if pandaIDs is None:
                        tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                        tmpStat = Interaction.SC_FAILED
                    # kill jobs or update task
                    if tmpStat == Interaction.SC_SUCCEEDED:
                        if pandaIDs == []:
                            # done since no active jobs
                            tmpLog.info('completed the command')
                            tmpTaskSpec = JediTaskSpec()
                            tmpTaskSpec.jediTaskID = jediTaskID
                            updateTaskStatus = True
                            if commandStr != 'reassign':
                                # keep oldStatus for task reassignment since it is reset when actually reassigned
                                tmpTaskSpec.forceUpdate('oldStatus')
                            else:
                                # extract cloud or site
                                tmpItems = commentStr.split(':')
                                if tmpItems[0] == 'cloud':
                                    tmpTaskSpec.cloud = tmpItems[1]
                                else:
                                    tmpTaskSpec.site = tmpItems[1]
                                # back to oldStatus if necessary
                                if tmpItems[2] == 'y':
                                    tmpTaskSpec.status = oldStatus
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                    updateTaskStatus = False
                            if updateTaskStatus:
                                tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                            tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': jediTaskID})
                        else:
                            tmpLog.info('sending kill command')
                            tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '50', True)
                        tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry', 'incexec']:
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles', 'fixedSandbox']:
                                taskParamMap.pop(newKey, None)
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey, newVal in newParamMap.items():
                                if newVal is None:
                                    # delete
                                    taskParamMap.pop(newKey, None)
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                fixedSandbox = taskParamMap['fixedSandbox']
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' \
                                            and re.search('^-a [^ ]+$', tmpParam['value']) is not None:
                                        # the template has to go through
                                        # .format(); accessing the map as an
                                        # attribute of the string raised
                                        # AttributeError every time
                                        tmpParam['value'] = '-a {0}'.format(fixedSandbox)
                                # build
                                if 'buildSpec' in taskParamMap:
                                    taskParamMap['buildSpec']['archiveName'] = fixedSandbox
                                # merge
                                if 'mergeSpec' in taskParamMap:
                                    taskParamMap['mergeSpec']['jobParameters'] = re.sub(
                                        '-a [^ ]+',
                                        '-a {0}'.format(fixedSandbox),
                                        taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID, strTaskParams)
                            if tmpRet != True:
                                tmpLog.error('failed to update task params')
                                continue
                        except Exception:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__, errvalue))
                            continue
                    # retry failed files
                    tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID, commandStr)
                    if tmpRet == True:
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def runImpl(self):
    """Execute queued task commands until the shared list is empty.

    Repeatedly takes up to 10 ``(jediTaskID, commandMap)`` items from
    ``self.taskList`` and returns when an empty batch comes back.
    ``commandMap`` supplies ``command``, ``comment`` and ``oldStatus``.

    * ``kill`` / ``finish`` / ``reassign``: runs at most two passes.  The
      first pass kills the relevant jobs (only queued ones for a comment
      containing ``soft reassign`` or ``soft finish``, none for ``nokill
      reassign``); a pass that finds no jobs updates the task record and
      stops.  For ``reassign`` the comment is split on ``:`` into target
      kind (``cloud``, ``nucleus`` or site), target name and a ``y`` flag
      that sends the task back to ``oldStatus``.
    * ``retry`` / ``incexec``: for ``incexec`` the stored task parameters
      are first merged with the JSON carried in the comment, then
      ``retryTask_JEDI`` is called; ``sole `` in the comment turns off the
      retry of child tasks and ``discard `` turns on discarding of events.

    Returns ``None``.  Exceptions escaping a batch are logged with their
    traceback and the loop moves on to the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, ' < jediTaskID={0} >'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                # the comment may be None; every keyword test goes through this
                hasComment = commentStr is not None
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill', 'finish', 'reassign']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # loop twice to see immediate result
                    for iLoop in range(2):
                        # get active PandaIDs to be killed
                        if commandStr == 'reassign' and hasComment and 'soft reassign' in commentStr:
                            pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                        elif commandStr == 'reassign' and hasComment and 'nokill reassign' in commentStr:
                            pandaIDs = []
                        else:
                            pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID, True)
                        if pandaIDs is None:
                            tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                            tmpStat = Interaction.SC_FAILED
                            # nothing below runs once the status is failed,
                            # so a second lookup would only repeat the error
                            break
                        # kill jobs or update task
                        if tmpStat == Interaction.SC_SUCCEEDED:
                            if pandaIDs == []:
                                # done since no active jobs
                                tmpMsg = 'completed cleaning jobs'
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpTaskSpec = JediTaskSpec()
                                tmpTaskSpec.jediTaskID = jediTaskID
                                updateTaskStatus = True
                                if commandStr != 'reassign':
                                    # reset oldStatus
                                    # keep oldStatus for task reassignment since it is reset when actually reassigned
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                else:
                                    # extract cloud or site
                                    if hasComment:
                                        tmpItems = commentStr.split(':')
                                        if tmpItems[0] == 'cloud':
                                            tmpTaskSpec.cloud = tmpItems[1]
                                        elif tmpItems[0] == 'nucleus':
                                            tmpTaskSpec.nucleus = tmpItems[1]
                                        else:
                                            tmpTaskSpec.site = tmpItems[1]
                                        tmpMsg = 'set {0}={1}'.format(tmpItems[0], tmpItems[1])
                                        tmpLog.sendMsg(tmpMsg, self.msgType)
                                        tmpLog.info(tmpMsg)
                                        # back to oldStatus if necessary
                                        if tmpItems[2] == 'y':
                                            tmpTaskSpec.status = oldStatus
                                            tmpTaskSpec.forceUpdate('oldStatus')
                                            updateTaskStatus = False
                                if commandStr == 'reassign':
                                    tmpTaskSpec.forceUpdate('errorDialog')
                                if commandStr == 'finish':
                                    # update datasets
                                    tmpLog.info('updating datasets to finish')
                                    tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(jediTaskID, self.pid)
                                    if not tmpStat:
                                        tmpLog.info('wait until datasets are updated to finish')
                                    # ignore failGoalUnreached when manually finished
                                    tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID)
                                    tmpTaskSpec.splitRule = taskSpec.splitRule
                                    tmpTaskSpec.unsetFailGoalUnreached()
                                if updateTaskStatus:
                                    tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                tmpMsg = 'set task_status={0}'.format(tmpTaskSpec.status)
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': jediTaskID},
                                                                           setOldModTime=True)
                                tmpLog.info('done with {0}'.format(str(tmpRet)))
                                break
                            else:
                                # kill only in the first loop
                                if iLoop > 0:
                                    break
                                # wait or kill jobs
                                if hasComment and 'soft finish' in commentStr:
                                    queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                                    tmpMsg = "trying to kill {0} queued jobs for soft finish".format(len(queuedPandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = self.taskBufferIF.killJobs(queuedPandaIDs, commentStr, '52', True)
                                    tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    # the remaining jobs are left to finish,
                                    # so the outcome is reported as True
                                    tmpRet = True
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                                    break
                                else:
                                    tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpLog.sendMsg(tmpMsg, self.msgType)
                                    # kill code: finish and reassign force
                                    # the kill, anything else is a normal kill
                                    killCode = {'finish': '52', 'reassign': '51'}.get(commandStr, '50')
                                    tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, killCode, True)
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry', 'incexec']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles', 'fixedSandbox']:
                                taskParamMap.pop(newKey, None)
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey, newVal in newParamMap.items():
                                if newVal is None:
                                    # delete
                                    taskParamMap.pop(newKey, None)
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                fixedSandbox = taskParamMap['fixedSandbox']
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' \
                                            and re.search('^-a [^ ]+$', tmpParam['value']) is not None:
                                        # the template has to go through
                                        # .format(); accessing the map as an
                                        # attribute of the string raised
                                        # AttributeError every time
                                        tmpParam['value'] = '-a {0}'.format(fixedSandbox)
                                # build
                                if 'buildSpec' in taskParamMap:
                                    taskParamMap['buildSpec']['archiveName'] = fixedSandbox
                                # merge
                                if 'mergeSpec' in taskParamMap:
                                    taskParamMap['mergeSpec']['jobParameters'] = re.sub(
                                        '-a [^ ]+',
                                        '-a {0}'.format(fixedSandbox),
                                        taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID, strTaskParams)
                            if tmpRet != True:
                                tmpLog.error('failed to update task params')
                                continue
                        except Exception:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__, errvalue))
                            continue
                    # retry child tasks
                    retryChildTasks = not (hasComment and 'sole ' in commentStr)
                    # discard events
                    discardEvents = hasComment and 'discard ' in commentStr
                    tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID, commandStr,
                                                                             retryChildTasks=retryChildTasks,
                                                                             discardEvents=discardEvents)
                    if tmpRet == True:
                        tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            errStr = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__, errtype.__name__, errvalue)
            errStr += traceback.format_exc()
            logger.error(errStr)
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
    """Choose candidate sites for an input chunk and register them on it.

    The initial site list (pre-assigned site, site pre-assigned in the
    master dataset, or all sites of the task's cloud) is narrowed by a
    sequence of checks: site status / PandaSite, scratch disk, free space
    in the SE, walltime, memory and pilot activity.  The survivors are
    weighted by their job statistics and the selected candidates are added
    to ``inputChunk`` via ``addSiteCandidate``.

    Args:
        taskSpec: task specification; ``cloud`` may be filled in from the
            task parameters and ``setErrDiag`` is called on failure.
        cloudName: not used by this method; kept for interface
            compatibility.
        inputChunk: chunk of input the candidates are attached to.
        taskParamMap: decoded task parameters; when empty or ``None`` they
            are read through ``taskBufferIF`` (best effort).

    Returns:
        ``(self.SC_SUCCEEDED, inputChunk)`` on success, otherwise
        ``(self.SC_FAILED, inputChunk)``.
    """
    # make logger
    tmpLog = MsgWrapper(logger,
                        '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    tmpLog.debug('start')
    # return for failure
    retTmpError = self.SC_FAILED, inputChunk

    def giveUp(message):
        # log the reason, attach the uploaded log to the task and hand back
        # the temporary-error return value
        tmpLog.error(message)
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError

    # set cloud: best effort, a failure here must not abort the brokerage
    try:
        if not taskParamMap:
            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                taskSpec.jediTaskID)
            taskParamMap = RefinerUtils.decodeJSON(taskParam)
        if not taskSpec.cloud and 'cloud' in taskParamMap:
            taskSpec.cloud = taskParamMap['cloud']
    except Exception as e:
        # keep going, but leave a trace instead of swallowing silently
        tmpLog.error('failed to read task params with {0}:{1}'.format(
            type(e).__name__, e))
    if taskParamMap is None:
        # the lookups below use "in taskParamMap", which would raise
        # TypeError on None
        taskParamMap = {}
    # get sites in the cloud
    site_preassigned = True
    if taskSpec.site not in ['', None]:
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
        if self.siteMapper.checkSite(taskSpec.site):
            scanSiteList = [taskSpec.site]
        else:
            # unknown as an exact name: use it as a pattern against the
            # sites of the cloud
            scanSiteList = [
                tmpSite
                for tmpSite in self.siteMapper.getCloud(
                    taskSpec.cloud)['sites']
                if re.search(taskSpec.site, tmpSite)
            ]
            if not scanSiteList:
                return giveUp('unknown site={}'.format(taskSpec.site))
    elif inputChunk.getPreassignedSite() is not None:
        scanSiteList = [inputChunk.getPreassignedSite()]
        tmpLog.debug('site={0} is pre-assigned in masterDS'.format(
            inputChunk.getPreassignedSite()))
    else:
        site_preassigned = False
        # remove NA; build a new list so that the list object returned by
        # siteMapper is not modified in place
        scanSiteList = [
            tmpSite
            for tmpSite in self.siteMapper.getCloud(taskSpec.cloud)['sites']
            if tmpSite != 'NA'
        ]
        tmpLog.debug('cloud=%s has %s candidates' %
                     (taskSpec.cloud, len(scanSiteList)))
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
    ######################################
    # selection for status and PandaSite
    if 'PandaSite' in taskParamMap:
        requiredPandaSite = taskParamMap['PandaSite']
    else:
        requiredPandaSite = None
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check site status (a pre-assigned site is used whatever its status)
        if tmpSiteSpec.status != 'online' and not site_preassigned:
            tmpLog.debug(' skip %s due to status=%s' %
                         (tmpSiteName, tmpSiteSpec.status))
            continue
        # check PandaSite
        if requiredPandaSite and tmpSiteSpec.pandasite != requiredPandaSite:
            tmpLog.debug(' skip %s due to wrong PandaSite=%s <> %s' %
                         (tmpSiteName, tmpSiteSpec.pandasite,
                          requiredPandaSite))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # selection for scratch disk
    minDiskCountS = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
                     + inputChunk.getMaxAtomSize())
    minDiskCountS = minDiskCountS // 1024 // 1024
    # size for direct IO sites
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        minDiskCountR = (taskSpec.getOutDiskSize()
                         + taskSpec.getWorkDiskSize())
        minDiskCountR = minDiskCountR // 1024 // 1024
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site; a falsy maxwdir means no limit is applied
        if tmpSiteSpec.maxwdir:
            if JediCoreUtils.use_direct_io_for_job(taskSpec, tmpSiteSpec,
                                                   inputChunk):
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug(
                    ' skip {0} due to small scratch disk={1} < {2}'.format(
                        tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # selection for available space in SE
    # free space must be >= 200GB
    diskThreshold = 200
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site; a falsy space value is not checked
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        tmpSpaceSize = tmpSiteSpec.space
        if tmpSpaceSize and tmpSpaceSize < diskThreshold:
            tmpLog.debug(
                ' skip {0} due to disk shortage in SE = {1} < {2}GB'.format(
                    tmpSiteName, tmpSpaceSize, diskThreshold))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if minWalltime not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site; a limit of 0 means the limit is not applied
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(
                    ' skip {0} due to short site walltime={1}(site upper limit) < {2}'
                    .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(
                    ' skip {0} due to short job walltime={1}(site lower limit) > {2}'
                    .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
            len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
        if not scanSiteList:
            return giveUp('no candidates')
    ######################################
    # selection for memory (not applied to pre-assigned sites)
    origMinRamCount = inputChunk.getMaxRamCount()
    if not site_preassigned and origMinRamCount:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # job memory requirement
            if taskSpec.ramPerCore():
                # scale by the site core count and add the base amount
                minRamCount = origMinRamCount * (
                    tmpSiteSpec.coreCount if tmpSiteSpec.coreCount else 1)
                minRamCount += (taskSpec.baseRamCount
                                if taskSpec.baseRamCount else 0)
            else:
                minRamCount = origMinRamCount
            # site max memory requirement
            site_maxmemory = tmpSiteSpec.maxrss if tmpSiteSpec.maxrss else 0
            # check at the site
            if site_maxmemory and minRamCount and minRamCount > site_maxmemory:
                tmpMsg = ' skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(
                    tmpSiteName, site_maxmemory, minRamCount)
                tmpLog.debug(tmpMsg)
                continue
            # site min memory requirement
            site_minmemory = tmpSiteSpec.minrss if tmpSiteSpec.minrss else 0
            if site_minmemory and minRamCount and minRamCount < site_minmemory:
                tmpMsg = ' skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(
                    tmpSiteName, site_minmemory, minRamCount)
                tmpLog.info(tmpMsg)
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check'.format(
            len(scanSiteList)))
        if not scanSiteList:
            return giveUp('no candidates')
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        nPilot = 0
        if tmpSiteName in nWNmap:
            nPilot = (nWNmap[tmpSiteName]['getJob']
                      + nWNmap[tmpSiteName]['updateJob'])
        if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
            # NOTE(review): the site is only reported here, not dropped; the
            # original had the skip disabled (its "continue" was commented
            # out), so that behaviour is kept
            tmpLog.debug(' skip %s due to no pilot' % tmpSiteName)
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # sites already used by task
    tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
        taskSpec.jediTaskID)
    if not tmpSt:
        return giveUp('failed to get sites which already used by task')
    ######################################
    # get list of available files
    availableFileMap = {}
    for datasetSpec in inputChunk.getDatasets():
        try:
            # get list of site to be scanned
            tmpLog.debug(
                'getting the list of available files for {0}'.format(
                    datasetSpec.datasetName))
            # unified site names, without duplicates, in scan order
            fileScanSiteList = []
            for tmpPseudoSiteName in scanSiteList:
                tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
                tmpSiteName = tmpSiteSpec.get_unified_name()
                if tmpSiteName in fileScanSiteList:
                    continue
                fileScanSiteList.append(tmpSiteName)
            # mapping between sites and input storage endpoints
            siteStorageEP = AtlasBrokerUtils.getSiteInputStorageEndpointMap(
                fileScanSiteList, self.siteMapper, taskSpec.prodSourceLabel,
                None)
            # disable file lookup for merge jobs
            checkCompleteness = not inputChunk.isMerging
            # only complete replicas for non-master datasets
            useCompleteOnly = not datasetSpec.isMaster()
            # get available files per site/endpoint
            tmpAvFileMap = self.ddmIF.getAvailableFiles(
                datasetSpec,
                siteStorageEP,
                self.siteMapper,
                check_completeness=checkCompleteness,
                file_scan_in_container=False,
                complete_only=useCompleteOnly)
            if tmpAvFileMap is None:
                raise Interaction.JEDITemporaryError(
                    'ddmIF.getAvailableFiles failed')
            availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
        except Exception as e:
            return giveUp(
                'failed to get available files with {}'.format(e))
    ######################################
    # calculate weight
    tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsByGlobalShare(
        taskSpec.vo)
    if not tmpSt:
        return giveUp('failed to get job statistics with priority')
    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    weightMap = {}
    candidateSpecList = []
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                               'running', None, None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'defined', None, None)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                 'activated', None, None)
        weight = (float(nRunning + 1) / float(nActivated + nAssigned + 1)
                  / float(nAssigned + 1))
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        # set weight
        siteCandidateSpec.weight = weight
        # files
        for availableFiles in six.itervalues(availableFileMap):
            if tmpSiteName in availableFiles:
                siteCandidateSpec.add_local_disk_files(
                    availableFiles[tmpSiteName]['localdisk'])
        # append: sites already used by the task are always kept, the others
        # are grouped by weight for the selection below
        if tmpSiteName in sitesUsedByTask:
            candidateSpecList.append(siteCandidateSpec)
        else:
            weightMap.setdefault(weight, []).append(siteCandidateSpec)
    # limit the number of sites, taking the largest weights first
    maxNumSites = 5
    for weightVal in sorted(weightMap, reverse=True):
        if len(candidateSpecList) >= maxNumSites:
            break
        sitesWithWeight = weightMap[weightVal]
        # random order among sites with the same weight
        random.shuffle(sitesWithWeight)
        candidateSpecList += sitesWithWeight[:(maxNumSites
                                               - len(candidateSpecList))]
    # append candidates
    scanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        inputChunk.addSiteCandidate(siteCandidateSpec)
        scanSiteList.append(siteCandidateSpec.siteName)
        tmpLog.debug(' use {} with weight={} nFiles={}'.format(
            siteCandidateSpec.siteName, siteCandidateSpec.weight,
            len(siteCandidateSpec.localDiskFiles)))
    if not scanSiteList:
        return giveUp('no candidates')
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, inputChunk