def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID,commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill','finish','reassign']:
                    # get active PandaIDs to be killed
                    pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                    if pandaIDs == None:
                        tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                        tmpStat = Interaction.SC_FAILED
                    # kill jobs or update task
                    if tmpStat == Interaction.SC_SUCCEEDED:
                        if pandaIDs == []:
                            # done since no active jobs
                            tmpLog.info('completed the command')
                            tmpTaskSpec = JediTaskSpec()
                            tmpTaskSpec.jediTaskID = jediTaskID
                            updateTaskStatus = True
                            if commandStr != 'reassign':
                                # keep oldStatus for task reassignment since it is reset when actually reassigned
                                tmpTaskSpec.forceUpdate('oldStatus')
                            else:
                                # extract cloud or site
                                tmpItems = commentStr.split(':')
                                if tmpItems[0] == 'cloud':
                                    tmpTaskSpec.cloud = tmpItems[1]
                                else:
                                    tmpTaskSpec.site = tmpItems[1]
                                # back to oldStatus if necessary
                                if tmpItems[2] == 'y':
                                    tmpTaskSpec.status = oldStatus
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                    updateTaskStatus = False
                            if updateTaskStatus:
                                tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                            tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                        else:
                            tmpLog.info('sending kill command')
                            tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                        tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry','incexec']:
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles','fixedSandbox']:
                                try:
                                    del taskParamMap[newKey]
                                except:
                                    pass
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey,newVal in newParamMap.iteritems():
                                if newVal == None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if taskParamMap.has_key('buildSpec'):
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if taskParamMap.has_key('mergeSpec'):
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),
                                               taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                            if tmpRet != True:
                                tmpLog.error('failed to update task params')
                                continue
                        except:
                            errtype,errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                            continue
                    # retry failed files
                    tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                    if tmpRet == True:
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg,self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except:
            errtype,errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
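# Note on the 'reassign' comment format handled above: commentStr is parsed
# as '<cloud|site>:<value>:<y|n>', e.g. a hypothetical 'cloud:CERN:y', where
# the first field selects the attribute to overwrite, the second is its new
# value, and 'y' in the third field sends the task back to its oldStatus.
# This is inferred from the split(':') handling in the reassign branch.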
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID,commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill','finish','reassign']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.sendMsg(tmpMsg,self.msgType)
                    # loop twice to see immediate result
                    for iLoop in range(2):
                        # get active PandaIDs to be killed
                        if commandStr == 'reassign' and commentStr != None and 'soft reassign' in commentStr:
                            pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                        else:
                            pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                        if pandaIDs == None:
                            tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                            tmpStat = Interaction.SC_FAILED
                        # kill jobs or update task
                        if tmpStat == Interaction.SC_SUCCEEDED:
                            if pandaIDs == []:
                                # done since no active jobs
                                tmpMsg = 'completed cleaning jobs'
                                tmpLog.sendMsg(tmpMsg,self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpTaskSpec = JediTaskSpec()
                                tmpTaskSpec.jediTaskID = jediTaskID
                                updateTaskStatus = True
                                if commandStr != 'reassign':
                                    # reset oldStatus
                                    # keep oldStatus for task reassignment since it is reset when actually reassigned
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                else:
                                    # extract cloud or site
                                    if commentStr != None:
                                        tmpItems = commentStr.split(':')
                                        if tmpItems[0] == 'cloud':
                                            tmpTaskSpec.cloud = tmpItems[1]
                                        else:
                                            tmpTaskSpec.site = tmpItems[1]
                                        tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1])
                                        tmpLog.sendMsg(tmpMsg,self.msgType)
                                        tmpLog.info(tmpMsg)
                                        # back to oldStatus if necessary
                                        if tmpItems[2] == 'y':
                                            tmpTaskSpec.status = oldStatus
                                            tmpTaskSpec.forceUpdate('oldStatus')
                                            updateTaskStatus = False
                                if commandStr == 'reassign':
                                    tmpTaskSpec.forceUpdate('errorDialog')
                                if updateTaskStatus:
                                    tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status)
                                tmpLog.sendMsg(tmpMsg,self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                                tmpLog.info('done with {0}'.format(str(tmpRet)))
                                break
                            else:
                                # kill only in the first loop
                                if iLoop > 0:
                                    break
                                # wait or kill jobs
                                if 'soft finish' in commentStr:
                                    tmpMsg = "waiting {0} jobs for soft finish".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = True
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                                    break
                                else:
                                    tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpLog.sendMsg(tmpMsg,self.msgType)
                                    if commandStr in ['reassign','finish']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True)
                                    else:
                                        # normal kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry','incexec']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.sendMsg(tmpMsg,self.msgType)
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles','fixedSandbox']:
                                try:
                                    del taskParamMap[newKey]
                                except:
                                    pass
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey,newVal in newParamMap.iteritems():
                                if newVal == None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if taskParamMap.has_key('buildSpec'):
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if taskParamMap.has_key('mergeSpec'):
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),
                                               taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                            if tmpRet != True:
                                tmpLog.error('failed to update task params')
                                continue
                        except:
                            errtype,errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                            continue
                    # retry failed files
                    tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                    if tmpRet == True:
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg,self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except:
            errtype,errvalue = sys.exc_info()[:2]
            errStr = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
            errStr += traceback.format_exc()
            logger.error(errStr)
def extractCommon(self, jediTaskID, taskParamMap, workQueueMapper, splitRule):
    # make task spec
    taskSpec = JediTaskSpec()
    taskSpec.jediTaskID = jediTaskID
    taskSpec.taskName = taskParamMap['taskName']
    taskSpec.userName = taskParamMap['userName']
    taskSpec.vo = taskParamMap['vo']
    taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel']
    taskSpec.taskPriority = taskParamMap['taskPriority']
    taskSpec.currentPriority = taskSpec.taskPriority
    taskSpec.architecture = taskParamMap['architecture']
    taskSpec.transUses = taskParamMap['transUses']
    taskSpec.transHome = taskParamMap['transHome']
    taskSpec.transPath = taskParamMap['transPath']
    taskSpec.processingType = taskParamMap['processingType']
    taskSpec.taskType = taskParamMap['taskType']
    taskSpec.splitRule = splitRule
    taskSpec.startTime = datetime.datetime.utcnow()
    if taskParamMap.has_key('workingGroup'):
        taskSpec.workingGroup = taskParamMap['workingGroup']
    if taskParamMap.has_key('countryGroup'):
        taskSpec.countryGroup = taskParamMap['countryGroup']
    if taskParamMap.has_key('ticketID'):
        taskSpec.ticketID = taskParamMap['ticketID']
    if taskParamMap.has_key('ticketSystemType'):
        taskSpec.ticketSystemType = taskParamMap['ticketSystemType']
    if taskParamMap.has_key('reqID'):
        taskSpec.reqID = taskParamMap['reqID']
    else:
        taskSpec.reqID = jediTaskID
    if taskParamMap.has_key('coreCount'):
        taskSpec.coreCount = taskParamMap['coreCount']
    else:
        taskSpec.coreCount = 1
    if taskParamMap.has_key('walltime'):
        taskSpec.walltime = taskParamMap['walltime']
    else:
        taskSpec.walltime = 0
    if not taskParamMap.has_key('walltimeUnit'):
        # force to set NULL so that retried tasks get data from scouts again
        taskSpec.forceUpdate('walltimeUnit')
    if taskParamMap.has_key('outDiskCount'):
        taskSpec.outDiskCount = taskParamMap['outDiskCount']
    else:
        taskSpec.outDiskCount = 0
    if 'outDiskUnit' in taskParamMap:
        taskSpec.outDiskUnit = taskParamMap['outDiskUnit']
    if taskParamMap.has_key('workDiskCount'):
        taskSpec.workDiskCount = taskParamMap['workDiskCount']
    else:
        taskSpec.workDiskCount = 0
    if taskParamMap.has_key('workDiskUnit'):
        taskSpec.workDiskUnit = taskParamMap['workDiskUnit']
    if taskParamMap.has_key('ramCount'):
        taskSpec.ramCount = taskParamMap['ramCount']
    else:
        taskSpec.ramCount = 0
    if taskParamMap.has_key('ramUnit'):
        taskSpec.ramUnit = taskParamMap['ramUnit']
    if taskParamMap.has_key('baseRamCount'):
        taskSpec.baseRamCount = taskParamMap['baseRamCount']
    else:
        taskSpec.baseRamCount = 0
    # HS06 stuff
    if 'cpuTimeUnit' in taskParamMap:
        taskSpec.cpuTimeUnit = taskParamMap['cpuTimeUnit']
    if 'cpuTime' in taskParamMap:
        taskSpec.cpuTime = taskParamMap['cpuTime']
    if 'cpuEfficiency' in taskParamMap:
        taskSpec.cpuEfficiency = taskParamMap['cpuEfficiency']
    else:
        # 90% of cpu efficiency by default
        taskSpec.cpuEfficiency = 90
    if 'baseWalltime' in taskParamMap:
        taskSpec.baseWalltime = taskParamMap['baseWalltime']
    else:
        # 10min of offset by default
        taskSpec.baseWalltime = 10 * 60
    # for merge
    if 'mergeRamCount' in taskParamMap:
        taskSpec.mergeRamCount = taskParamMap['mergeRamCount']
    if 'mergeCoreCount' in taskParamMap:
        taskSpec.mergeCoreCount = taskParamMap['mergeCoreCount']
    # scout
    if not taskParamMap.has_key('skipScout') and not taskSpec.isPostScout():
        taskSpec.setUseScout(True)
    # cloud
    if taskParamMap.has_key('cloud'):
        self.cloudName = taskParamMap['cloud']
        taskSpec.cloud = self.cloudName
    else:
        # set dummy to force update
        taskSpec.cloud = 'dummy'
        taskSpec.cloud = None
    # site
    if taskParamMap.has_key('site'):
        self.siteName = taskParamMap['site']
        taskSpec.site = self.siteName
    else:
        # set dummy to force update
        taskSpec.site = 'dummy'
        taskSpec.site = None
    # nucleus
    if 'nucleus' in taskParamMap:
        taskSpec.nucleus = taskParamMap['nucleus']
    # preset some parameters for job cloning
    if 'useJobCloning' in taskParamMap:
        # set implicit parameters
        if not 'nEventsPerWorker' in taskParamMap:
            taskParamMap['nEventsPerWorker'] = 1
        if not 'nSitesPerJob' in taskParamMap:
            taskParamMap['nSitesPerJob'] = 2
        if not 'nEsConsumers' in taskParamMap:
            taskParamMap['nEsConsumers'] = taskParamMap['nSitesPerJob']
    # event service flag
    if 'useJobCloning' in taskParamMap:
        taskSpec.eventService = 2
    elif taskParamMap.has_key('nEventsPerWorker'):
        taskSpec.eventService = 1
    else:
        taskSpec.eventService = 0
    # ttcr: requested time to completion
    if taskParamMap.has_key('ttcrTimestamp'):
        try:
            # get rid of the +00:00 timezone string and parse the timestamp
            taskSpec.ttcRequested = datetime.datetime.strptime(taskParamMap['ttcrTimestamp'].split('+')[0],
                                                               '%Y-%m-%d %H:%M:%S.%f')
        except (IndexError, ValueError):
            pass
    # goal
    if 'goal' in taskParamMap:
        try:
            taskSpec.goal = int(float(taskParamMap['goal']) * 10)
            if taskSpec.goal >= 1000:
                taskSpec.goal = None
        except:
            pass
    # campaign
    if taskParamMap.has_key('campaign'):
        taskSpec.campaign = taskParamMap['campaign']
    # request type
    if 'requestType' in taskParamMap:
        taskSpec.requestType = taskParamMap['requestType']
    self.taskSpec = taskSpec
    # set split rule
    if 'tgtNumEventsPerJob' in taskParamMap:
        # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used
        if not 'nFilesPerJob' in taskParamMap:
            self.setSplitRule(None, taskParamMap['tgtNumEventsPerJob'], JediTaskSpec.splitRuleToken['nEventsPerJob'])
    self.setSplitRule(taskParamMap, 'nFilesPerJob', JediTaskSpec.splitRuleToken['nFilesPerJob'])
    self.setSplitRule(taskParamMap, 'nEventsPerJob', JediTaskSpec.splitRuleToken['nEventsPerJob'])
    self.setSplitRule(taskParamMap, 'nGBPerJob', JediTaskSpec.splitRuleToken['nGBPerJob'])
    self.setSplitRule(taskParamMap, 'nMaxFilesPerJob', JediTaskSpec.splitRuleToken['nMaxFilesPerJob'])
    self.setSplitRule(taskParamMap, 'nEventsPerWorker', JediTaskSpec.splitRuleToken['nEventsPerWorker'])
    self.setSplitRule(taskParamMap, 'useLocalIO', JediTaskSpec.splitRuleToken['useLocalIO'])
    self.setSplitRule(taskParamMap, 'disableAutoRetry', JediTaskSpec.splitRuleToken['disableAutoRetry'])
    self.setSplitRule(taskParamMap, 'nEsConsumers', JediTaskSpec.splitRuleToken['nEsConsumers'])
    self.setSplitRule(taskParamMap, 'waitInput', JediTaskSpec.splitRuleToken['waitInput'])
    self.setSplitRule(taskParamMap, 'addNthFieldToLFN', JediTaskSpec.splitRuleToken['addNthFieldToLFN'])
    self.setSplitRule(taskParamMap, 'scoutSuccessRate', JediTaskSpec.splitRuleToken['scoutSuccessRate'])
    self.setSplitRule(taskParamMap, 't1Weight', JediTaskSpec.splitRuleToken['t1Weight'])
    self.setSplitRule(taskParamMap, 'maxAttemptES', JediTaskSpec.splitRuleToken['maxAttemptES'])
    self.setSplitRule(taskParamMap, 'nSitesPerJob', JediTaskSpec.splitRuleToken['nSitesPerJob'])
    self.setSplitRule(taskParamMap, 'nJumboJobs', JediTaskSpec.splitRuleToken['nJumboJobs'])
    self.setSplitRule(taskParamMap, 'nEventsPerMergeJob', JediTaskSpec.splitRuleToken['nEventsPerMergeJob'])
    self.setSplitRule(taskParamMap, 'nFilesPerMergeJob', JediTaskSpec.splitRuleToken['nFilesPerMergeJob'])
    self.setSplitRule(taskParamMap, 'nGBPerMergeJob', JediTaskSpec.splitRuleToken['nGBPerMergeJob'])
    self.setSplitRule(taskParamMap, 'nMaxFilesPerMergeJob', JediTaskSpec.splitRuleToken['nMaxFilesPerMergeJob'])
    if taskParamMap.has_key('loadXML'):
        self.setSplitRule(None, 3, JediTaskSpec.splitRuleToken['loadXML'])
        self.setSplitRule(None, 4, JediTaskSpec.splitRuleToken['groupBoundaryID'])
    if taskParamMap.has_key('pfnList'):
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['pfnList'])
    if taskParamMap.has_key('noWaitParent') and taskParamMap['noWaitParent'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['noWaitParent'])
    if 'respectLB' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['respectLB'])
    if taskParamMap.has_key('reuseSecOnDemand'):
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['reuseSecOnDemand'])
    if 'ddmBackEnd' in taskParamMap:
        self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd'])
    if 'disableReassign' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['disableReassign'])
    if 'allowPartialFinish' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['allowPartialFinish'])
    if 'useExhausted' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useExhausted'])
    if 'useRealNumEvents' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useRealNumEvents'])
    if 'ipConnectivity' in taskParamMap:
        self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity'])
    if 'altStageOut' in taskParamMap:
        self.taskSpec.setAltStageOut(taskParamMap['altStageOut'])
    if 'allowInputLAN' in taskParamMap:
        self.taskSpec.setAllowInputLAN(taskParamMap['allowInputLAN'])
    if 'runUntilClosed' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['runUntilClosed'])
    if 'stayOutputOnSite' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['stayOutputOnSite'])
    if 'useJobCloning' in taskParamMap:
        scValue = EventServiceUtils.getJobCloningValue(taskParamMap['useJobCloning'])
        self.setSplitRule(None, scValue, JediTaskSpec.splitRuleToken['useJobCloning'])
    if 'failWhenGoalUnreached' in taskParamMap and taskParamMap['failWhenGoalUnreached'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['failGoalUnreached'])
    if 'switchEStoNormal' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['switchEStoNormal'])
    if 'nEventsPerRange' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['dynamicNumEvents'])
    if 'allowInputWAN' in taskParamMap and taskParamMap['allowInputWAN'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['allowInputWAN'])
    if 'putLogToOS' in taskParamMap and taskParamMap['putLogToOS'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['putLogToOS'])
    if 'mergeEsOnOS' in taskParamMap and taskParamMap['mergeEsOnOS'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['mergeEsOnOS'])
    if 'writeInputToFile' in taskParamMap and taskParamMap['writeInputToFile'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['writeInputToFile'])
    if 'useFileAsSourceLFN' in taskParamMap and taskParamMap['useFileAsSourceLFN'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useFileAsSourceLFN'])
    if 'ignoreMissingInDS' in taskParamMap and taskParamMap['ignoreMissingInDS'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['ignoreMissingInDS'])
    # work queue
    workQueue = None
    if 'workQueueName' in taskParamMap:
        # work queue is specified
        workQueue = workQueueMapper.getQueueWithName(taskSpec.vo, taskSpec.prodSourceLabel,
                                                     taskParamMap['workQueueName'])
    if workQueue is None:
        # get work queue based on task attributes
        workQueue, tmpStr = workQueueMapper.getQueueWithSelParams(taskSpec.vo,
                                                                  taskSpec.prodSourceLabel,
                                                                  processingType=taskSpec.processingType,
                                                                  workingGroup=taskSpec.workingGroup,
                                                                  coreCount=taskSpec.coreCount,
                                                                  site=taskSpec.site,
                                                                  eventService=taskSpec.eventService,
                                                                  splitRule=taskSpec.splitRule,
                                                                  campaign=taskSpec.campaign)
    if workQueue is None:
        errStr = 'workqueue is undefined for vo={0} label={1} '.format(taskSpec.vo, taskSpec.prodSourceLabel)
        errStr += 'processingType={0} workingGroup={1} coreCount={2} eventService={3} '.format(taskSpec.processingType,
                                                                                               taskSpec.workingGroup,
                                                                                               taskSpec.coreCount,
                                                                                               taskSpec.eventService)
        errStr += 'splitRule={0} campaign={1}'.format(taskSpec.splitRule, taskSpec.campaign)
        raise RuntimeError, errStr
    self.taskSpec.workQueue_ID = workQueue.queue_id
    # Initialize the global share
    gshare = None
    if 'gshare' in taskParamMap and self.taskBufferIF.is_valid_share(taskParamMap['gshare']):
        # global share is specified
        gshare = taskParamMap['gshare']
    else:
        # get share based on definition
        gshare = self.taskBufferIF.get_share_for_task(self.taskSpec)
        if gshare is None:
            gshare = 'No match'
            # errStr = 'share is undefined for vo={0} label={1} '.format(taskSpec.vo,taskSpec.prodSourceLabel)
            # errStr += 'workingGroup={0} campaign={1} '.format(taskSpec.workingGroup, taskSpec.campaign)
            # raise RuntimeError,errStr
    self.taskSpec.gshare = gshare
    # return
    return
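# A minimal sketch (assumption, for illustration only) of a taskParamMap that
# extractCommon() above can consume. The mandatory keys are the ones the
# method reads unconditionally; the values are borrowed from the test script
# below or are hypothetical. Everything else (coreCount, walltime, ramCount,
# cloud, site, nFilesPerJob, workQueueName, gshare, ...) is optional.
exampleTaskParamMap = {
    'taskName': 'pandatest.sometask',        # hypothetical
    'userName': 'someuser',                  # hypothetical
    'vo': 'atlas',
    'prodSourceLabel': 'managed',
    'taskPriority': 100,
    'architecture': 'i686-slc5-gcc43-opt',
    'transUses': 'Atlas-17.2.7',
    'transHome': 'AtlasProduction-17.2.8.10',
    'transPath': 'Reco_trf.py',
    'processingType': 'reco',                # hypothetical
    'taskType': 'prod',                      # hypothetical
}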
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(jediTaskID))
                tmpLog.info('start')
                tmpStat = Interaction.SC_SUCCEEDED
                errStr = ''
                # read task parameters
                try:
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype,errvalue = sys.exc_info()[:2]
                    errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                    tmpLog.error(errStr)
                    tmpStat = Interaction.SC_FAILED
                # get impl
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('getting Impl')
                    try:
                        # get VO and sourceLabel
                        vo = taskParamMap['vo']
                        prodSourceLabel = taskParamMap['prodSourceLabel']
                        taskType = taskParamMap['taskType']
                        tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                        # get impl
                        impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                self.taskBufferIF,self.ddmIF)
                        if impl == None:
                            # task refiner is undefined
                            errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # extract common parameters
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('extracting common')
                    try:
                        # initialize impl
                        impl.initializeRefiner(tmpLog)
                        # extract common parameters
                        impl.extractCommon(jediTaskID,taskParamMap,self.workQueueMapper,splitRule)
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        errStr = 'failed to extract common parameters with {0}:{1}'.format(errtype.__name__,errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # check parent
                noWaitParent = False
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if not parent_tid in [None,jediTaskID]:
                        tmpLog.info('check parent task')
                        try:
                            tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                            if tmpStat == 'completed':
                                # parent is done
                                tmpStat = Interaction.SC_SUCCEEDED
                            elif tmpStat == 'running':
                                if not impl.taskSpec.noWaitParent():
                                    # parent is running
                                    errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                    impl.taskSpec.status = taskStatus
                                    impl.taskSpec.setOnHold()
                                    impl.taskSpec.setErrDiag(errStr)
                                    tmpLog.info(errStr)
                                    self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                                    continue
                                else:
                                    # not wait for parent
                                    tmpStat = Interaction.SC_SUCCEEDED
                                    noWaitParent = True
                            else:
                                # parent is corrupted
                                tmpStat = Interaction.SC_FAILED
                                tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                impl.taskSpec.setErrDiag(tmpErrStr)
                        except:
                            errtype,errvalue = sys.exc_info()[:2]
                            errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # refine
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                    try:
                        tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        # no wait for parent
                        if impl.taskSpec.noWaitParent() and errtype == JediException.UnknownDatasetError:
                            impl.taskSpec.status = taskStatus
                            impl.taskSpec.setOnHold()
                            errStr = 'pending until parent produces input'
                            tmpLog.info(errStr)
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                            continue
                        else:
                            errStr = 'failed to refine task'
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # register
                if tmpStat != Interaction.SC_SUCCEEDED:
                    tmpLog.error('failed to refine the task')
                    if impl == None or impl.taskSpec == None:
                        tmpTaskSpec = JediTaskSpec()
                        tmpTaskSpec.jediTaskID = jediTaskID
                    else:
                        tmpTaskSpec = impl.taskSpec
                    tmpTaskSpec.status = 'tobroken'
                    if errStr != '':
                        tmpTaskSpec.setErrDiag(errStr,True)
                    self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID})
                else:
                    tmpLog.info('registering')
                    # fill JEDI tables
                    try:
                        # enable protection against task duplication
                        if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                not impl.taskSpec.checkPreProcessed():
                            uniqueTaskName = True
                        else:
                            uniqueTaskName = False
                        strTaskParams = None
                        if impl.updatedTaskParams != None:
                            strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                        if taskStatus == 'registered':
                            # unset pre-process flag
                            if impl.taskSpec.checkPreProcessed():
                                impl.taskSpec.setPostPreProcess()
                            # full registration
                            tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                 impl.inMasterDatasetSpec,
                                                                                                 impl.inSecDatasetSpecList,
                                                                                                 impl.outDatasetSpecList,
                                                                                                 impl.outputTemplateMap,
                                                                                                 impl.jobParamsTemplate,
                                                                                                 strTaskParams,
                                                                                                 impl.unmergeMasterDatasetSpec,
                                                                                                 impl.unmergeDatasetSpecMap,
                                                                                                 uniqueTaskName)
                            if not tmpStat:
                                tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                tmpLog.error(tmpErrStr)
                                impl.taskSpec.status = 'tobroken'
                                impl.taskSpec.setErrDiag(tmpErrStr,True)
                                self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                            tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                        else:
                            # appending for incremental execution
                            tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                            impl.inSecDatasetSpecList)
                            if not tmpStat:
                                tmpLog.error('failed to append datasets for incexec')
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                        tmpLog.error(tmpErrStr)
                    else:
                        tmpLog.info('done')
        except:
            errtype,errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
import sys
import uuid

from pandajedi.jedicore.JediTaskBufferInterface import JediTaskBufferInterface

tbIF = JediTaskBufferInterface()
tbIF.setupInterface()

from pandajedi.jedicore.JediTaskSpec import JediTaskSpec

task = JediTaskSpec()
task.jediTaskID = sys.argv[1]
task.taskName = 'pandatest.{0}'.format(uuid.uuid4())
task.status = 'defined'
task.userName = '******'
task.vo = 'atlas'
task.prodSourceLabel = 'managed'
task.taskPriority = 100
task.currentPriority = task.taskPriority
task.architecture = 'i686-slc5-gcc43-opt'
task.transUses = 'Atlas-17.2.7'
task.transHome = 'AtlasProduction-17.2.8.10'
task.transPath = 'Reco_trf.py'
task.workQueue_ID = 3
tbIF.insertTask_JEDI(task)

from pandajedi.jedicore.JediDatasetSpec import JediDatasetSpec

ds = JediDatasetSpec()
ds.jediTaskID = task.jediTaskID
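# The script above ends after ds.jediTaskID is set. A minimal sketch of how
# the dataset spec might be completed and stored; the attribute names follow
# JediDatasetSpec usage elsewhere in this file, while the concrete values and
# the insertDataset_JEDI() call are assumptions for illustration only.
ds.datasetName = 'pandatest.input.{0}'.format(uuid.uuid4())  # hypothetical name
ds.type = 'input'      # assumed dataset type
ds.status = 'defined'
tbIF.insertDataset_JEDI(ds)  # assumed taskbuffer call, mirroring insertTask_JEDI above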
def extractCommon(self, jediTaskID, taskParamMap, workQueueMapper, splitRule):
    # make task spec
    taskSpec = JediTaskSpec()
    taskSpec.jediTaskID = jediTaskID
    taskSpec.taskName = taskParamMap['taskName']
    taskSpec.userName = taskParamMap['userName']
    taskSpec.vo = taskParamMap['vo']
    taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel']
    taskSpec.taskPriority = taskParamMap['taskPriority']
    taskSpec.currentPriority = taskSpec.taskPriority
    taskSpec.architecture = taskParamMap['architecture']
    taskSpec.transUses = taskParamMap['transUses']
    taskSpec.transHome = taskParamMap['transHome']
    taskSpec.transPath = taskParamMap['transPath']
    taskSpec.processingType = taskParamMap['processingType']
    taskSpec.taskType = taskParamMap['taskType']
    taskSpec.splitRule = splitRule
    taskSpec.startTime = datetime.datetime.utcnow()
    if taskParamMap.has_key('workingGroup'):
        taskSpec.workingGroup = taskParamMap['workingGroup']
    if taskParamMap.has_key('countryGroup'):
        taskSpec.countryGroup = taskParamMap['countryGroup']
    if taskParamMap.has_key('ticketID'):
        taskSpec.ticketID = taskParamMap['ticketID']
    if taskParamMap.has_key('ticketSystemType'):
        taskSpec.ticketSystemType = taskParamMap['ticketSystemType']
    if taskParamMap.has_key('reqID'):
        taskSpec.reqID = taskParamMap['reqID']
    else:
        taskSpec.reqID = jediTaskID
    if taskParamMap.has_key('coreCount'):
        taskSpec.coreCount = taskParamMap['coreCount']
    else:
        taskSpec.coreCount = 1
    if taskParamMap.has_key('walltime'):
        taskSpec.walltime = taskParamMap['walltime']
    else:
        taskSpec.walltime = 0
    if taskParamMap.has_key('walltimeUnit'):
        taskSpec.walltimeUnit = taskParamMap['walltimeUnit']
    if taskParamMap.has_key('outDiskCount'):
        taskSpec.outDiskCount = taskParamMap['outDiskCount']
    else:
        taskSpec.outDiskCount = 0
    if 'outDiskUnit' in taskParamMap:
        taskSpec.outDiskUnit = taskParamMap['outDiskUnit']
    if taskParamMap.has_key('workDiskCount'):
        taskSpec.workDiskCount = taskParamMap['workDiskCount']
    else:
        taskSpec.workDiskCount = 0
    if taskParamMap.has_key('workDiskUnit'):
        taskSpec.workDiskUnit = taskParamMap['workDiskUnit']
    if taskParamMap.has_key('ramCount'):
        taskSpec.ramCount = taskParamMap['ramCount']
    else:
        taskSpec.ramCount = 0
    # HS06 stuff
    if 'cpuTimeUnit' in taskParamMap:
        taskSpec.cpuTimeUnit = taskParamMap['cpuTimeUnit']
    if 'cpuTime' in taskParamMap:
        taskSpec.cpuTime = taskParamMap['cpuTime']
    if 'cpuEfficiency' in taskParamMap:
        taskSpec.cpuEfficiency = taskParamMap['cpuEfficiency']
    else:
        # 90% of cpu efficiency by default
        taskSpec.cpuEfficiency = 90
    if 'baseWalltime' in taskParamMap:
        taskSpec.baseWalltime = taskParamMap['baseWalltime']
    else:
        # 10min of offset by default
        taskSpec.baseWalltime = 10 * 60
    # for merge
    if 'mergeRamCount' in taskParamMap:
        taskSpec.mergeRamCount = taskParamMap['mergeRamCount']
    if 'mergeCoreCount' in taskParamMap:
        taskSpec.mergeCoreCount = taskParamMap['mergeCoreCount']
    # scout
    if not taskParamMap.has_key('skipScout') and not taskSpec.isPostScout():
        taskSpec.setUseScout(True)
    # cloud
    if taskParamMap.has_key('cloud'):
        self.cloudName = taskParamMap['cloud']
        taskSpec.cloud = self.cloudName
    else:
        # set dummy to force update
        taskSpec.cloud = 'dummy'
        taskSpec.cloud = None
    # site
    if taskParamMap.has_key('site'):
        self.siteName = taskParamMap['site']
        taskSpec.site = self.siteName
    else:
        # set dummy to force update
        taskSpec.site = 'dummy'
        taskSpec.site = None
    # event service
    if taskParamMap.has_key('nEventsPerWorker'):
        taskSpec.eventService = 1
    else:
        taskSpec.eventService = 0
    # goal
    if 'goal' in taskParamMap:
        try:
            taskSpec.goal = int(float(taskParamMap['goal']) * 10)
            if taskSpec.goal >= 1000:
                taskSpec.goal = None
        except:
            pass
    # campaign
    if taskParamMap.has_key('campaign'):
        taskSpec.campaign = taskParamMap['campaign']
    # work queue
    workQueue, tmpStr = workQueueMapper.getQueueWithSelParams(taskSpec.vo,
                                                              taskSpec.prodSourceLabel,
                                                              processingType=taskSpec.processingType,
                                                              workingGroup=taskSpec.workingGroup,
                                                              coreCount=taskSpec.coreCount,
                                                              site=taskSpec.site)
    if workQueue == None:
        errStr = 'workqueue is undefined for vo={0} label={1} '.format(taskSpec.vo, taskSpec.prodSourceLabel)
        errStr += 'processingType={0} workingGroup={1} coreCount={2} '.format(taskSpec.processingType,
                                                                              taskSpec.workingGroup,
                                                                              taskSpec.coreCount)
        raise RuntimeError, errStr
    taskSpec.workQueue_ID = workQueue.queue_id
    self.taskSpec = taskSpec
    # set split rule
    if 'tgtNumEventsPerJob' in taskParamMap:
        # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used
        if not 'nFilesPerJob' in taskParamMap:
            self.setSplitRule(None, taskParamMap['tgtNumEventsPerJob'], JediTaskSpec.splitRuleToken['nEventsPerJob'])
    self.setSplitRule(taskParamMap, 'nFilesPerJob', JediTaskSpec.splitRuleToken['nFilesPerJob'])
    self.setSplitRule(taskParamMap, 'nEventsPerJob', JediTaskSpec.splitRuleToken['nEventsPerJob'])
    self.setSplitRule(taskParamMap, 'nGBPerJob', JediTaskSpec.splitRuleToken['nGBPerJob'])
    self.setSplitRule(taskParamMap, 'nMaxFilesPerJob', JediTaskSpec.splitRuleToken['nMaxFilesPerJob'])
    self.setSplitRule(taskParamMap, 'nEventsPerWorker', JediTaskSpec.splitRuleToken['nEventsPerWorker'])
    self.setSplitRule(taskParamMap, 'useLocalIO', JediTaskSpec.splitRuleToken['useLocalIO'])
    self.setSplitRule(taskParamMap, 'disableAutoRetry', JediTaskSpec.splitRuleToken['disableAutoRetry'])
    self.setSplitRule(taskParamMap, 'nEsConsumers', JediTaskSpec.splitRuleToken['nEsConsumers'])
    self.setSplitRule(taskParamMap, 'waitInput', JediTaskSpec.splitRuleToken['waitInput'])
    self.setSplitRule(taskParamMap, 'addNthFieldToLFN', JediTaskSpec.splitRuleToken['addNthFieldToLFN'])
    self.setSplitRule(taskParamMap, 'scoutSuccessRate', JediTaskSpec.splitRuleToken['scoutSuccessRate'])
    self.setSplitRule(taskParamMap, 't1Weight', JediTaskSpec.splitRuleToken['t1Weight'])
    self.setSplitRule(taskParamMap, 'maxAttemptES', JediTaskSpec.splitRuleToken['maxAttemptES'])
    self.setSplitRule(taskParamMap, 'nSitesPerJob', JediTaskSpec.splitRuleToken['nSitesPerJob'])
    self.setSplitRule(taskParamMap, 'nEventsPerMergeJob', JediTaskSpec.splitRuleToken['nEventsPerMergeJob'])
    self.setSplitRule(taskParamMap, 'nFilesPerMergeJob', JediTaskSpec.splitRuleToken['nFilesPerMergeJob'])
    self.setSplitRule(taskParamMap, 'nGBPerMergeJob', JediTaskSpec.splitRuleToken['nGBPerMergeJob'])
    self.setSplitRule(taskParamMap, 'nMaxFilesPerMergeJob', JediTaskSpec.splitRuleToken['nMaxFilesPerMergeJob'])
    if taskParamMap.has_key('loadXML'):
        self.setSplitRule(None, 3, JediTaskSpec.splitRuleToken['loadXML'])
        self.setSplitRule(None, 4, JediTaskSpec.splitRuleToken['groupBoundaryID'])
    if taskParamMap.has_key('pfnList'):
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['pfnList'])
    if taskParamMap.has_key('noWaitParent'):
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['noWaitParent'])
    if 'respectLB' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['respectLB'])
    if taskParamMap.has_key('reuseSecOnDemand'):
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['reuseSecOnDemand'])
    if 'ddmBackEnd' in taskParamMap:
        self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd'])
    if 'disableReassign' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['disableReassign'])
    if 'allowPartialFinish' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['allowPartialFinish'])
    if 'useExhausted' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useExhausted'])
    if 'useRealNumEvents' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useRealNumEvents'])
    if 'ipConnectivity' in taskParamMap:
        self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity'])
    if 'runUntilClosed' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['runUntilClosed'])
    if 'stayOutputOnSite' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['stayOutputOnSite'])
    if 'useJobCloning' in taskParamMap:
        scValue = EventServiceUtils.getJobCloningValue(taskParamMap['useJobCloning'])
        self.setSplitRule(None, scValue, JediTaskSpec.splitRuleToken['useJobCloning'])
    if 'failWhenGoalUnreached' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['failGoalUnreached'])
    if 'switchEStoNormal' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['switchEStoNormal'])
    # return
    return
def doRefine(self,jediTaskID,taskParamMap):
    try:
        # make logger
        tmpLog = self.tmpLog
        tmpLog.debug('start jediTaskID={0}'.format(jediTaskID))
        # old dataset name
        oldDatasetName = taskParamMap['oldDatasetName']
        # accompany datasets
        if taskParamMap.has_key('oldAccompanyDatasetNames'):
            oldAccDatasetNames = taskParamMap['oldAccompanyDatasetNames']
        else:
            oldAccDatasetNames = None
        # use first file to get task and dataset info
        lostFileName = taskParamMap['lostFiles'][0]
        # get old jediTaskID and datasetIDs
        tmpStat,oldIDs = self.taskBufferIF.getIDsWithFileDataset_JEDI(oldDatasetName,lostFileName,'output')
        if tmpStat != True or oldIDs == None:
            tmpLog.error('failed to get jediTaskID and DatasetID for {0}:{1}'.format(oldDatasetName,
                                                                                     lostFileName))
            return self.SC_FAILED
        # get task
        oldJediTaskID = oldIDs['jediTaskID']
        oldDatasetID = oldIDs['datasetID']
        tmpStat,oldTaskSpec = self.taskBufferIF.getTaskWithID_JEDI(oldJediTaskID,True)
        if tmpStat != True:
            tmpLog.error('failed to get TaskSpec for old jediTaskId={0}'.format(oldJediTaskID))
            return self.SC_FAILED
        # make task spec
        taskSpec = JediTaskSpec()
        taskSpec.copyAttributes(oldTaskSpec)
        # reset attributes
        taskSpec.jediTaskID = jediTaskID
        taskSpec.taskType = taskParamMap['taskType']
        taskSpec.taskPriority = taskParamMap['taskPriority']
        self.taskSpec = taskSpec
        # get datasets
        tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(oldJediTaskID)
        if tmpStat != True:
            tmpLog.error('failed to get datasetSpecs')
            return self.SC_FAILED
        # loop over all datasets
        provenanceID = None
        dummyStreams = []
        outDatasetSpec = None
        datasetNameSpecMap = {}
        for datasetSpec in datasetSpecList:
            # for output datasets
            if not datasetSpec.type in JediDatasetSpec.getInputTypes():
                # collect output with the same provenanceID
                if provenanceID != None and datasetSpec.provenanceID != provenanceID:
                    continue
                # set provenanceID if undefined
                if provenanceID == None and datasetSpec.provenanceID != None:
                    provenanceID = datasetSpec.provenanceID
                # collect dummy streams
                if datasetSpec.type != 'log' and (datasetSpec.datasetID != oldDatasetID and \
                        not self.checkDatasetNameMatching(datasetSpec.datasetName,oldAccDatasetNames)):
                    if not datasetSpec.streamName in dummyStreams:
                        dummyStreams.append(datasetSpec.streamName)
                    continue
            # reset attributes
            datasetSpec.status = 'defined'
            datasetSpec.datasetID = None
            datasetSpec.jediTaskID = jediTaskID
            datasetSpec.nFiles = 0
            datasetSpec.nFilesUsed = 0
            datasetSpec.nFilesToBeUsed = 0
            datasetSpec.nFilesFinished = 0
            datasetSpec.nFilesFailed = 0
            datasetSpec.nFilesOnHold = 0
            # remove nosplit and repeat since even the same file is made for each boundaryID
            datasetSpec.remNoSplit()
            datasetSpec.remRepeat()
            # append to map
            datasetNameSpecMap[datasetSpec.datasetName] = datasetSpec
            # set master and secondary for input
            if datasetSpec.type in JediDatasetSpec.getInputTypes():
                if datasetSpec.isMaster():
                    # master
                    self.inMasterDatasetSpec = datasetSpec
                else:
                    # secondary
                    self.inSecDatasetSpecList.append(datasetSpec)
            elif datasetSpec.type == 'log':
                # set new attributes
                tmpItem = taskParamMap['log']
                datasetSpec.datasetName = tmpItem['dataset']
                if tmpItem.has_key('container'):
                    datasetSpec.containerName = tmpItem['container']
                if tmpItem.has_key('token'):
                    datasetSpec.storageToken = tmpItem['token']
                if tmpItem.has_key('destination'):
                    datasetSpec.destination = tmpItem['destination']
                # extract output filename template and change the value field
                outFileTemplate,tmpItem['value'] = RefinerUtils.extractReplaceOutFileTemplate(tmpItem['value'],
                                                                                              datasetSpec.streamName)
                # make output template
                if outFileTemplate != None:
                    if tmpItem.has_key('offset'):
                        offsetVal = 1 + tmpItem['offset']
                    else:
                        offsetVal = 1
                    outTemplateMap = {'jediTaskID' : self.taskSpec.jediTaskID,
                                      'serialNr' : offsetVal,
                                      'streamName' : datasetSpec.streamName,
                                      'filenameTemplate' : outFileTemplate,
                                      'outtype' : datasetSpec.type,
                                      }
                    self.outputTemplateMap[datasetSpec.outputMapKey()] = [outTemplateMap]
                # append
                self.outDatasetSpecList.append(datasetSpec)
            else:
                # output dataset to make copies later
                outDatasetSpec = datasetSpec
        # replace redundant output streams with dummy files
        for dummyStream in dummyStreams:
            self.taskSpec.jobParamsTemplate = self.taskSpec.jobParamsTemplate.replace('${'+dummyStream+'}',
                                                                                      dummyStream.lower()+'.tmp')
        self.setJobParamsTemplate(self.taskSpec.jobParamsTemplate)
        # loop over all lost files
        datasetIDSpecMap = {}
        for lostFileName in taskParamMap['lostFiles']:
            # get FileID
            tmpStat,tmpIDs = self.taskBufferIF.getIDsWithFileDataset_JEDI(oldDatasetName,lostFileName,'output')
            if tmpStat != True or tmpIDs == None:
                tmpLog.error('failed to get FileID for {0}:{1}'.format(oldDatasetName,
                                                                       lostFileName))
                return self.SC_FAILED
            # get PandaID
            tmpStat,pandaID = self.taskBufferIF.getPandaIDWithFileID_JEDI(tmpIDs['jediTaskID'],
                                                                          tmpIDs['datasetID'],
                                                                          tmpIDs['fileID'])
            if tmpStat != True or pandaID == None:
                tmpLog.error('failed to get PandaID for {0}'.format(str(tmpIDs)))
                return self.SC_FAILED
            # get files
            tmpStat,fileSpecList = self.taskBufferIF.getFilesWithPandaID_JEDI(pandaID)
            if tmpStat != True or fileSpecList == []:
                tmpLog.error('failed to get files for PandaID={0}'.format(pandaID))
                return self.SC_FAILED
            # append
            for fileSpec in fileSpecList:
                # only input types
                if not fileSpec.type in JediDatasetSpec.getInputTypes():
                    continue
                # get original datasetSpec
                if not datasetIDSpecMap.has_key(fileSpec.datasetID):
                    tmpStat,tmpDatasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(fileSpec.jediTaskID,fileSpec.datasetID)
                    if tmpStat != True or tmpDatasetSpec == None:
                        tmpLog.error('failed to get dataset for jediTaskID={0} datasetID={1}'.format(fileSpec.jediTaskID,
                                                                                                     fileSpec.datasetID))
                        return self.SC_FAILED
                    datasetIDSpecMap[fileSpec.datasetID] = tmpDatasetSpec
                origDatasetSpec = datasetIDSpecMap[fileSpec.datasetID]
                if not datasetNameSpecMap.has_key(origDatasetSpec.datasetName):
                    tmpLog.error('datasetName={0} is missing in new datasets'.format(origDatasetSpec.datasetName))
                    return self.SC_FAILED
                # not target or accompany datasets
                if origDatasetSpec.datasetID != oldDatasetID and \
                        not self.checkDatasetNameMatching(origDatasetSpec.datasetName,oldAccDatasetNames):
                    continue
                newDatasetSpec = datasetNameSpecMap[origDatasetSpec.datasetName]
                # set new attributes
                fileSpec.fileID = None
                fileSpec.datasetID = None
                fileSpec.jediTaskID = None
                fileSpec.boundaryID = pandaID
                fileSpec.keepTrack = 1
                fileSpec.attemptNr = 1
                fileSpec.status = 'ready'
                # append
                newDatasetSpec.addFile(fileSpec)
            # make one output dataset per file
            datasetSpec = copy.copy(outDatasetSpec)
            # set new attributes
            tmpItem = taskParamMap['output']
            datasetSpec.datasetName = tmpItem['dataset']
            if tmpItem.has_key('container'):
                datasetSpec.containerName = tmpItem['container']
            if tmpItem.has_key('token'):
                datasetSpec.storageToken = tmpItem['token']
            if tmpItem.has_key('destination'):
                datasetSpec.destination = tmpItem['destination']
            # use PandaID of original job as provenanceID
            datasetSpec.provenanceID = pandaID
            # append
            self.outDatasetSpecList.append(datasetSpec)
            # extract attempt number from original filename
            tmpMatch = re.search('\.(\d+)$',lostFileName)
            if tmpMatch == None:
                offsetVal = 1
            else:
                offsetVal = 1 + int(tmpMatch.group(1))
            # filename without attempt number
            baseFileName = re.sub('\.(\d+)$','',lostFileName)
            # make output template
            outTemplateMap = {'jediTaskID' : self.taskSpec.jediTaskID,
                              'serialNr' : offsetVal,
                              'streamName' : datasetSpec.streamName,
                              'filenameTemplate' : baseFileName + '.${SN:d}',
                              'outtype' : datasetSpec.type,
                              }
            self.outputTemplateMap[datasetSpec.outputMapKey()] = [outTemplateMap]
        # append datasets to task parameters
        for datasetSpec in datasetNameSpecMap.values():
            if datasetSpec.Files == []:
                continue
            fileList = []
            for fileSpec in datasetSpec.Files:
                fileList.append({'lfn':fileSpec.lfn,
                                 'firstEvent':fileSpec.firstEvent,
                                 'startEvent':fileSpec.startEvent,
                                 'endEvent':fileSpec.endEvent,
                                 'keepTrack':fileSpec.keepTrack,
                                 'boundaryID':fileSpec.boundaryID,
                                 })
            taskParamMap = RefinerUtils.appendDataset(taskParamMap,datasetSpec,fileList)
        self.updatedTaskParams = taskParamMap
        # grouping with boundaryID
        self.setSplitRule(None,4,JediTaskSpec.splitRuleToken['groupBoundaryID'])
    except:
        errtype,errvalue = sys.exc_info()[:2]
        tmpLog.error('doRefine failed with {0}:{1}'.format(errtype.__name__,errvalue))
        return self.SC_FAILED
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, ' < jediTaskID={0} >'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill', 'finish', 'reassign']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # loop twice to see immediate result
                    for iLoop in range(2):
                        # get active PandaIDs to be killed
                        if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                            pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                        elif commandStr == 'reassign' and commentStr is not None and 'nokill reassign' in commentStr:
                            pandaIDs = []
                        else:
                            pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID, True)
                        if pandaIDs is None:
                            tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                            tmpStat = Interaction.SC_FAILED
                        # kill jobs or update task
                        if tmpStat == Interaction.SC_SUCCEEDED:
                            if pandaIDs == []:
                                # done since no active jobs
                                tmpMsg = 'completed cleaning jobs'
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpTaskSpec = JediTaskSpec()
                                tmpTaskSpec.jediTaskID = jediTaskID
                                updateTaskStatus = True
                                if commandStr != 'reassign':
                                    # reset oldStatus
                                    # keep oldStatus for task reassignment since it is reset when actually reassigned
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                else:
                                    # extract cloud or site
                                    if commentStr is not None:
                                        tmpItems = commentStr.split(':')
                                        if tmpItems[0] == 'cloud':
                                            tmpTaskSpec.cloud = tmpItems[1]
                                        elif tmpItems[0] == 'nucleus':
                                            tmpTaskSpec.nucleus = tmpItems[1]
                                        else:
                                            tmpTaskSpec.site = tmpItems[1]
                                        tmpMsg = 'set {0}={1}'.format(tmpItems[0], tmpItems[1])
                                        tmpLog.sendMsg(tmpMsg, self.msgType)
                                        tmpLog.info(tmpMsg)
                                        # back to oldStatus if necessary
                                        if tmpItems[2] == 'y':
                                            tmpTaskSpec.status = oldStatus
                                            tmpTaskSpec.forceUpdate('oldStatus')
                                            updateTaskStatus = False
                                if commandStr == 'reassign':
                                    tmpTaskSpec.forceUpdate('errorDialog')
                                if commandStr == 'finish':
                                    # update datasets
                                    tmpLog.info('updating datasets to finish')
                                    tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(jediTaskID, self.pid)
                                    if not tmpStat:
                                        tmpLog.info('wait until datasets are updated to finish')
                                    # ignore failGoalUnreached when manually finished
                                    tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID)
                                    tmpTaskSpec.splitRule = taskSpec.splitRule
                                    tmpTaskSpec.unsetFailGoalUnreached()
                                if updateTaskStatus:
                                    tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                tmpMsg = 'set task_status={0}'.format(tmpTaskSpec.status)
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': jediTaskID},
                                                                           setOldModTime=True)
                                tmpLog.info('done with {0}'.format(str(tmpRet)))
                                break
                            else:
                                # kill only in the first loop
                                if iLoop > 0:
                                    break
                                # wait or kill jobs
                                if commentStr and 'soft finish' in commentStr:
                                    queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                                    tmpMsg = "trying to kill {0} queued jobs for soft finish".format(len(queuedPandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = self.taskBufferIF.killJobs(queuedPandaIDs, commentStr, '52', True)
                                    tmpMsg = "waiting {0} jobs for soft finish".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = True
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                                    break
                                else:
                                    tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpLog.sendMsg(tmpMsg, self.msgType)
                                    if commandStr in ['finish']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '52', True)
                                    elif commandStr in ['reassign']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '51', True)
                                    else:
                                        # normal kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '50', True)
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry', 'incexec']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles', 'fixedSandbox']:
                                try:
                                    del taskParamMap[newKey]
                                except Exception:
                                    pass
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey, newVal in iteritems(newParamMap):
                                if newVal is None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and \
                                            re.search('^-a [^ ]+$', tmpParam['value']) is not None:
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if 'buildSpec' in taskParamMap:
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if 'mergeSpec' in taskParamMap:
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+', '-a {0}'.format(taskParamMap['fixedSandbox']),
                                               taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID, strTaskParams)
                            if tmpRet is not True:
                                tmpLog.error('failed to update task params')
                                continue
                        except Exception as e:
                            tmpLog.error('failed to change task params with {} {}'.format(str(e), traceback.format_exc()))
                            continue
                    # retry child tasks
                    if 'sole ' in commentStr:
                        retryChildTasks = False
                    else:
                        retryChildTasks = True
                    # discard events
                    if 'discard ' in commentStr:
                        discardEvents = True
                    else:
                        discardEvents = False
                    # release un-staged files
                    if 'staged ' in commentStr:
                        releaseUnstaged = True
                    else:
                        releaseUnstaged = False
                    tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID, commandStr,
                                                                             retryChildTasks=retryChildTasks,
                                                                             discardEvents=discardEvents,
                                                                             release_unstaged=releaseUnstaged)
                    if tmpRet is True:
                        tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except Exception as e:
            errStr = '{} failed in runImpl() with {} {} '.format(self.__class__.__name__, str(e), traceback.format_exc())
            logger.error(errStr)
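# Note on the retry/incexec modifiers parsed above: the substrings 'sole ',
# 'discard ' and 'staged ' in commentStr map to retryChildTasks=False,
# discardEvents=True and release_unstaged=True respectively. For example, a
# hypothetical commentStr containing 'sole discard ' would retry only this
# task and discard its events.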
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID,commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger,' < jediTaskID={0} >'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill','finish','reassign']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg,self.msgType)
                    # loop twice to see immediate result
                    for iLoop in range(2):
                        # get active PandaIDs to be killed
                        if commandStr == 'reassign' and commentStr != None and 'soft reassign' in commentStr:
                            pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                        elif commandStr == 'reassign' and commentStr != None and 'nokill reassign' in commentStr:
                            pandaIDs = []
                        else:
                            pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                        if pandaIDs == None:
                            tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                            tmpStat = Interaction.SC_FAILED
                        # kill jobs or update task
                        if tmpStat == Interaction.SC_SUCCEEDED:
                            if pandaIDs == []:
                                # done since no active jobs
                                tmpMsg = 'completed cleaning jobs'
                                tmpLog.sendMsg(tmpMsg,self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpTaskSpec = JediTaskSpec()
                                tmpTaskSpec.jediTaskID = jediTaskID
                                updateTaskStatus = True
                                if commandStr != 'reassign':
                                    # reset oldStatus
                                    # keep oldStatus for task reassignment since it is reset when actually reassigned
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                else:
                                    # extract cloud or site
                                    if commentStr != None:
                                        tmpItems = commentStr.split(':')
                                        if tmpItems[0] == 'cloud':
                                            tmpTaskSpec.cloud = tmpItems[1]
                                        elif tmpItems[0] == 'nucleus':
                                            tmpTaskSpec.nucleus = tmpItems[1]
                                        else:
                                            tmpTaskSpec.site = tmpItems[1]
                                        tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1])
                                        tmpLog.sendMsg(tmpMsg,self.msgType)
                                        tmpLog.info(tmpMsg)
                                        # back to oldStatus if necessary
                                        if tmpItems[2] == 'y':
                                            tmpTaskSpec.status = oldStatus
                                            tmpTaskSpec.forceUpdate('oldStatus')
                                            updateTaskStatus = False
                                if commandStr == 'reassign':
                                    tmpTaskSpec.forceUpdate('errorDialog')
                                if commandStr == 'finish':
                                    # update datasets
                                    tmpLog.info('updating datasets to finish')
                                    tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(jediTaskID, self.pid)
                                    if not tmpStat:
                                        tmpLog.info('wait until datasets are updated to finish')
                                    # ignore failGoalUnreached when manually finished
                                    tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID)
                                    tmpTaskSpec.splitRule = taskSpec.splitRule
                                    tmpTaskSpec.unsetFailGoalUnreached()
                                if updateTaskStatus:
                                    tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                tmpMsg = 'set task_status={0}'.format(tmpTaskSpec.status)
                                tmpLog.sendMsg(tmpMsg,self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID},
                                                                           setOldModTime=True)
                                tmpLog.info('done with {0}'.format(str(tmpRet)))
                                break
                            else:
                                # kill only in the first loop
                                if iLoop > 0:
                                    break
                                # wait or kill jobs
                                if 'soft finish' in commentStr:
                                    queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                                    tmpMsg = "trying to kill {0} queued jobs for soft finish".format(len(queuedPandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = self.taskBufferIF.killJobs(queuedPandaIDs,commentStr,'52',True)
                                    tmpMsg = "waiting {0} jobs for soft finish".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = True
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                                    break
                                else:
                                    tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpLog.sendMsg(tmpMsg,self.msgType)
                                    if commandStr in ['finish']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True)
                                    elif commandStr in ['reassign']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'51',True)
                                    else:
                                        # normal kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry','incexec']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg,self.msgType)
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles','fixedSandbox']:
                                try:
                                    del taskParamMap[newKey]
                                except:
                                    pass
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey,newVal in newParamMap.iteritems():
                                if newVal == None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if taskParamMap.has_key('buildSpec'):
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if taskParamMap.has_key('mergeSpec'):
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),
                                               taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                            if tmpRet != True:
                                tmpLog.error('failed to update task params')
                                continue
                        except:
                            errtype,errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                            continue
                    # retry child tasks
                    if 'sole ' in commentStr:
                        retryChildTasks = False
                    else:
                        retryChildTasks = True
                    # discard events
                    if 'discard ' in commentStr:
                        discardEvents = True
                    else:
                        discardEvents = False
                    tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr,
                                                                            retryChildTasks=retryChildTasks,
                                                                            discardEvents=discardEvents)
                    if tmpRet == True:
                        tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg,self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except:
            errtype,errvalue = sys.exc_info()[:2]
            errStr = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
            errStr += traceback.format_exc()
            logger.error(errStr)
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, splitRule, taskStatus, parent_tid in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                tmpLog.debug('start')
                tmpStat = Interaction.SC_SUCCEEDED
                errStr = ''
                # read task parameters
                try:
                    taskParam = None
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__, errvalue)
                    tmpLog.debug(taskParam)
                    tmpLog.error(errStr)
                    tmpStat = Interaction.SC_FAILED
                    continue
                # get impl
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('getting Impl')
                    try:
                        # get VO and sourceLabel
                        vo = taskParamMap['vo']
                        prodSourceLabel = taskParamMap['prodSourceLabel']
                        taskType = taskParamMap['taskType']
                        tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo, prodSourceLabel, taskType))
                        # get impl
                        impl = self.implFactory.instantiateImpl(vo, prodSourceLabel, taskType,
                            self.taskBufferIF, self.ddmIF)
                        if impl is None:
                            # task refiner is undefined
                            errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo, prodSourceLabel)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # extract common parameters
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('extracting common')
                    try:
                        # initialize impl
                        impl.initializeRefiner(tmpLog)
                        impl.oldTaskStatus = taskStatus
                        # extract common parameters
                        impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                        # set parent tid
                        if parent_tid not in [None, jediTaskID]:
                            impl.taskSpec.parent_tid = parent_tid
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__, errvalue,
                            traceback.format_exc())
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # check attribute length
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('checking attribute length')
                    if not impl.taskSpec.checkAttrLength():
                        tmpLog.error(impl.taskSpec.errorDialog)
                        tmpStat = Interaction.SC_FAILED
                # check parent
                noWaitParent = False
                parentState = None
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if parent_tid not in [None, jediTaskID]:
                        tmpLog.info('check parent task')
                        try:
                            tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                            parentState = tmpStat
                            if tmpStat == 'completed':
                                # parent is done
                                tmpStat = Interaction.SC_SUCCEEDED
                            elif tmpStat == 'running':
                                if not impl.taskSpec.noWaitParent():
                                    # parent is running
                                    errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                    impl.taskSpec.status = taskStatus
                                    impl.taskSpec.setOnHold()
                                    impl.taskSpec.setErrDiag(errStr)
                                    tmpLog.info(errStr)
                                    self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                        oldStatus=[taskStatus], setFrozenTime=False)
                                    continue
                                else:
                                    # not wait for parent
                                    tmpStat = Interaction.SC_SUCCEEDED
                                    noWaitParent = True
                            else:
                                # parent is corrupted
                                tmpStat = Interaction.SC_FAILED
                                tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                impl.taskSpec.setErrDiag(tmpErrStr)
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # refine
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                    try:
                        tmpStat = impl.doRefine(jediTaskID, taskParamMap)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        # wait unknown input if noWaitParent or waitInput
                        if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput())
                                and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                or errtype == Interaction.JEDITemporaryError:
                            if impl.taskSpec.noWaitParent() or parentState == 'running':
                                tmpErrStr = 'pending until parent produces input'
                                setFrozenTime = False
                            elif errtype == Interaction.JEDITemporaryError:
                                tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                setFrozenTime = True
                            else:
                                tmpErrStr = 'pending until input is staged'
                                setFrozenTime = True
                            impl.taskSpec.status = taskStatus
                            impl.taskSpec.setOnHold()
                            impl.taskSpec.setErrDiag(tmpErrStr)
                            tmpLog.info(tmpErrStr)
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                oldStatus=[taskStatus], insertUnknown=impl.unknownDatasetList,
                                setFrozenTime=setFrozenTime)
                            continue
                        else:
                            errStr = 'failed to refine task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # register
                if tmpStat != Interaction.SC_SUCCEEDED:
                    tmpLog.error('failed to refine the task')
                    if impl is None or impl.taskSpec is None:
                        tmpTaskSpec = JediTaskSpec()
                        tmpTaskSpec.jediTaskID = jediTaskID
                    else:
                        tmpTaskSpec = impl.taskSpec
                    tmpTaskSpec.status = 'tobroken'
                    if errStr != '':
                        tmpTaskSpec.setErrDiag(errStr, True)
                    self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': tmpTaskSpec.jediTaskID}, oldStatus=[taskStatus])
                else:
                    tmpLog.info('registering')
                    # fill JEDI tables
                    try:
                        # enable protection against task duplication
                        if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                not impl.taskSpec.checkPreProcessed():
                            uniqueTaskName = True
                        else:
                            uniqueTaskName = False
                        strTaskParams = None
                        if impl.updatedTaskParams is not None:
                            strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                        if taskStatus == 'registered':
                            # unset pre-process flag
                            if impl.taskSpec.checkPreProcessed():
                                impl.taskSpec.setPostPreProcess()
                            # full registration
                            tmpStat, newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID, impl.taskSpec,
                                impl.inMasterDatasetSpec,
                                impl.inSecDatasetSpecList,
                                impl.outDatasetSpecList,
                                impl.outputTemplateMap,
                                impl.jobParamsTemplate,
                                strTaskParams,
                                impl.unmergeMasterDatasetSpec,
                                impl.unmergeDatasetSpecMap,
                                uniqueTaskName,
                                taskStatus)
                            if not tmpStat:
                                tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                tmpLog.error(tmpErrStr)
                                impl.taskSpec.status = newTaskStatus
                                impl.taskSpec.setErrDiag(tmpErrStr, True)
                                self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                    oldStatus=[taskStatus])
                            tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        else:
                            # disable scouts if previous attempt didn't use it
                            if not impl.taskSpec.useScout(splitRule):
                                impl.taskSpec.setUseScout(False)
                            # update task with new params
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                oldStatus=[taskStatus])
                            # appending for incremental execution
                            tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID, impl.inMasterDatasetSpec,
                                impl.inSecDatasetSpecList)
                            if not tmpStat:
                                tmpLog.error('failed to append datasets for incexec')
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(tmpErrStr)
                    else:
                        tmpLog.info('done')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
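# For context, a compact sketch of the control flow the refiner thread implements above:
# decode the stored JSON task parameters, look up an implementation keyed by
# (vo, prodSourceLabel, taskType), and map any failure to a terminal task status. All
# names below are illustrative; json.loads merely stands in for RefinerUtils.decodeJSON.
import json

def refine_one_task(task_param_json, impl_map):
    try:
        task_param_map = json.loads(task_param_json)
    except ValueError:
        return 'tobroken'  # unparsable task parameters cannot be refined
    key = (task_param_map.get('vo'), task_param_map.get('prodSourceLabel'),
           task_param_map.get('taskType'))
    impl = impl_map.get(key)
    if impl is None:
        return 'tobroken'  # no refiner registered for this combination
    # a refiner returning False is treated like a refinement failure
    return 'registered' if impl(task_param_map) else 'tobroken'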
def doRefine(self, jediTaskID, taskParamMap):
    try:
        # make logger
        tmpLog = self.tmpLog
        tmpLog.debug('start jediTaskID={0}'.format(jediTaskID))
        # old dataset name
        oldDatasetName = taskParamMap['oldDatasetName']
        # accompany datasets
        if 'oldAccompanyDatasetNames' in taskParamMap:
            oldAccDatasetNames = taskParamMap['oldAccompanyDatasetNames']
        else:
            oldAccDatasetNames = None
        # use first file to get task and dataset info
        lostFileName = taskParamMap['lostFiles'][0]
        # get old jediTaskID and datasetIDs
        tmpStat, oldIDs = self.taskBufferIF.getIDsWithFileDataset_JEDI(oldDatasetName, lostFileName, 'output')
        if tmpStat is not True or oldIDs is None:
            tmpLog.error('failed to get jediTaskID and DatasetID for {0}:{1}'.format(oldDatasetName, lostFileName))
            return self.SC_FAILED
        # get task
        oldJediTaskID = oldIDs['jediTaskID']
        oldDatasetID = oldIDs['datasetID']
        tmpStat, oldTaskSpec = self.taskBufferIF.getTaskWithID_JEDI(oldJediTaskID, True)
        if tmpStat is not True:
            tmpLog.error('failed to get TaskSpec for old jediTaskId={0}'.format(oldJediTaskID))
            return self.SC_FAILED
        # make task spec
        taskSpec = JediTaskSpec()
        taskSpec.copyAttributes(oldTaskSpec)
        # reset attributes
        taskSpec.jediTaskID = jediTaskID
        taskSpec.taskType = taskParamMap['taskType']
        taskSpec.taskPriority = taskParamMap['taskPriority']
        self.taskSpec = taskSpec
        # get datasets
        tmpStat, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(oldJediTaskID)
        if tmpStat is not True:
            tmpLog.error('failed to get datasetSpecs')
            return self.SC_FAILED
        # loop over all datasets
        provenanceID = None
        dummyStreams = []
        outDatasetSpec = None
        datasetNameSpecMap = {}
        for datasetSpec in datasetSpecList:
            # for output datasets
            if datasetSpec.type not in JediDatasetSpec.getInputTypes():
                # collect output with the same provenanceID
                if provenanceID is not None and datasetSpec.provenanceID != provenanceID:
                    continue
                # set provenanceID if undefined
                if provenanceID is None and datasetSpec.provenanceID is not None:
                    provenanceID = datasetSpec.provenanceID
                # collect dummy streams
                if datasetSpec.type != 'log' and (datasetSpec.datasetID != oldDatasetID and
                        not self.checkDatasetNameMatching(datasetSpec.datasetName, oldAccDatasetNames)):
                    if datasetSpec.streamName not in dummyStreams:
                        dummyStreams.append(datasetSpec.streamName)
                    continue
            # reset attributes
            datasetSpec.status = 'defined'
            datasetSpec.datasetID = None
            datasetSpec.jediTaskID = jediTaskID
            datasetSpec.nFiles = 0
            datasetSpec.nFilesUsed = 0
            datasetSpec.nFilesToBeUsed = 0
            datasetSpec.nFilesFinished = 0
            datasetSpec.nFilesFailed = 0
            datasetSpec.nFilesOnHold = 0
            # remove nosplit and repeat since even the same file is made for each boundaryID
            datasetSpec.remNoSplit()
            datasetSpec.remRepeat()
            # append to map
            datasetNameSpecMap[datasetSpec.datasetName] = datasetSpec
            # set master and secondary for input
            if datasetSpec.type in JediDatasetSpec.getInputTypes():
                if datasetSpec.isMaster():
                    # master
                    self.inMasterDatasetSpec = datasetSpec
                else:
                    # secondary
                    self.inSecDatasetSpecList.append(datasetSpec)
            elif datasetSpec.type == 'log':
                # set new attributes
                tmpItem = taskParamMap['log']
                datasetSpec.datasetName = tmpItem['dataset']
                if 'container' in tmpItem:
                    datasetSpec.containerName = tmpItem['container']
                if 'token' in tmpItem:
                    datasetSpec.storageToken = tmpItem['token']
                if 'destination' in tmpItem:
                    datasetSpec.destination = tmpItem['destination']
                # extract output filename template and change the value field
                outFileTemplate, tmpItem['value'] = RefinerUtils.extractReplaceOutFileTemplate(tmpItem['value'],
                    datasetSpec.streamName)
                # make output template
                if outFileTemplate is not None:
                    if 'offset' in tmpItem:
                        offsetVal = 1 + tmpItem['offset']
                    else:
                        offsetVal = 1
                    outTemplateMap = {'jediTaskID': self.taskSpec.jediTaskID,
                                      'serialNr': offsetVal,
                                      'streamName': datasetSpec.streamName,
                                      'filenameTemplate': outFileTemplate,
                                      'outtype': datasetSpec.type,
                                      }
                    self.outputTemplateMap[datasetSpec.outputMapKey()] = [outTemplateMap]
                # append
                self.outDatasetSpecList.append(datasetSpec)
            else:
                # output dataset to make copies later
                outDatasetSpec = datasetSpec
        # replace redundant output streams with dummy files
        for dummyStream in dummyStreams:
            self.taskSpec.jobParamsTemplate = self.taskSpec.jobParamsTemplate.replace('${' + dummyStream + '}',
                dummyStream.lower() + '.tmp')
        self.setJobParamsTemplate(self.taskSpec.jobParamsTemplate)
        # loop over all lost files
        datasetIDSpecMap = {}
        for lostFileName in taskParamMap['lostFiles']:
            # get FileID
            tmpStat, tmpIDs = self.taskBufferIF.getIDsWithFileDataset_JEDI(oldDatasetName, lostFileName, 'output')
            if tmpStat is not True or tmpIDs is None:
                tmpLog.error('failed to get FileID for {0}:{1}'.format(oldDatasetName, lostFileName))
                return self.SC_FAILED
            # get PandaID
            tmpStat, pandaID = self.taskBufferIF.getPandaIDWithFileID_JEDI(tmpIDs['jediTaskID'],
                tmpIDs['datasetID'], tmpIDs['fileID'])
            if tmpStat is not True or pandaID is None:
                tmpLog.error('failed to get PandaID for {0}'.format(str(tmpIDs)))
                return self.SC_FAILED
            # get files
            tmpStat, fileSpecList = self.taskBufferIF.getFilesWithPandaID_JEDI(pandaID)
            if tmpStat is not True or fileSpecList == []:
                tmpLog.error('failed to get files for PandaID={0}'.format(pandaID))
                return self.SC_FAILED
            # append
            for fileSpec in fileSpecList:
                # only input types
                if fileSpec.type not in JediDatasetSpec.getInputTypes():
                    continue
                # get original datasetSpec
                if fileSpec.datasetID not in datasetIDSpecMap:
                    tmpStat, tmpDatasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(fileSpec.jediTaskID, fileSpec.datasetID)
                    if tmpStat is not True or tmpDatasetSpec is None:
                        tmpLog.error('failed to get dataset for jediTaskID={0} datasetID={1}'.format(fileSpec.jediTaskID,
                            fileSpec.datasetID))
                        return self.SC_FAILED
                    datasetIDSpecMap[fileSpec.datasetID] = tmpDatasetSpec
                origDatasetSpec = datasetIDSpecMap[fileSpec.datasetID]
                if origDatasetSpec.datasetName not in datasetNameSpecMap:
                    tmpLog.error('datasetName={0} is missing in new datasets'.format(origDatasetSpec.datasetName))
                    return self.SC_FAILED
                # not target or accompany datasets
                if origDatasetSpec.datasetID != oldDatasetID and \
                        not self.checkDatasetNameMatching(origDatasetSpec.datasetName, oldAccDatasetNames):
                    continue
                newDatasetSpec = datasetNameSpecMap[origDatasetSpec.datasetName]
                # set new attributes
                fileSpec.fileID = None
                fileSpec.datasetID = None
                fileSpec.jediTaskID = None
                fileSpec.boundaryID = pandaID
                fileSpec.keepTrack = 1
                fileSpec.attemptNr = 1
                fileSpec.status = 'ready'
                # append
                newDatasetSpec.addFile(fileSpec)
            # make one output dataset per file
            datasetSpec = copy.copy(outDatasetSpec)
            # set new attributes
            tmpItem = taskParamMap['output']
            datasetSpec.datasetName = tmpItem['dataset']
            if 'container' in tmpItem:
                datasetSpec.containerName = tmpItem['container']
            if 'token' in tmpItem:
                datasetSpec.storageToken = tmpItem['token']
            if 'destination' in tmpItem:
                datasetSpec.destination = tmpItem['destination']
            # use PandaID of original job as provenanceID
            datasetSpec.provenanceID = pandaID
            # append
            self.outDatasetSpecList.append(datasetSpec)
            # extract attempt number from original filename
            tmpMatch = re.search(r'\.(\d+)$', lostFileName)
            if tmpMatch is None:
                offsetVal = 1
            else:
                offsetVal = 1 + int(tmpMatch.group(1))
            # filename without attempt number
            baseFileName = re.sub(r'\.(\d+)$', '', lostFileName)
            # make output template
            outTemplateMap = {'jediTaskID': self.taskSpec.jediTaskID,
                              'serialNr': offsetVal,
                              'streamName': datasetSpec.streamName,
                              'filenameTemplate': baseFileName + '.${SN:d}',
                              'outtype': datasetSpec.type,
                              }
            self.outputTemplateMap[datasetSpec.outputMapKey()] = [outTemplateMap]
        # append datasets to task parameters
        for datasetSpec in datasetNameSpecMap.values():
            if datasetSpec.Files == []:
                continue
            fileList = []
            for fileSpec in datasetSpec.Files:
                fileList.append({'lfn': fileSpec.lfn,
                                 'firstEvent': fileSpec.firstEvent,
                                 'startEvent': fileSpec.startEvent,
                                 'endEvent': fileSpec.endEvent,
                                 'keepTrack': fileSpec.keepTrack,
                                 'boundaryID': fileSpec.boundaryID,
                                 })
            taskParamMap = RefinerUtils.appendDataset(taskParamMap, datasetSpec, fileList)
            self.updatedTaskParams = taskParamMap
        # grouping with boundaryID
        self.setSplitRule(None, 4, JediTaskSpec.splitRuleToken['groupBoundaryID'])
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('doRefine failed with {0}:{1}'.format(errtype.__name__, errvalue))
        return self.SC_FAILED
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
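# A small, runnable illustration of the attempt-number logic used in doRefine above: a
# trailing ".<digits>" on a lost file name becomes serialNr = attempt + 1, and the base
# name is the file name with that suffix stripped. The sample file names are made up
# for demonstration.
import re

def serial_and_base(lost_file_name):
    tmp_match = re.search(r'\.(\d+)$', lost_file_name)
    offset_val = 1 if tmp_match is None else 1 + int(tmp_match.group(1))
    base_file_name = re.sub(r'\.(\d+)$', '', lost_file_name)
    return offset_val, base_file_name

assert serial_and_base('EVNT.01234._000123.pool.root.2') == (3, 'EVNT.01234._000123.pool.root')
assert serial_and_base('my.log.tgz') == (1, 'my.log.tgz')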
def run(self):
    try:
        # get process lock
        locked = self.taskBufferIF.lockProcess_JEDI(vo=self.vo, prodSourceLabel=self.prodSourceLabel,
            cloud=None, workqueue_id=None, resource_name=None,
            component=self.component, pid=self.pid, timeLimit=10)
        if not locked:
            self.log.debug('component={0} skipped since locked by another'.format(self.component))
            return
        # get parameters for conversion
        self.log.debug('component={0} start'.format(self.component))
        maxTasks = self.taskBufferIF.getConfigValue(self.component, 'JUMBO_MAX_TASKS', 'jedi', self.vo)
        if maxTasks is None:
            maxTasks = 1
        nEventsToDisable = self.taskBufferIF.getConfigValue(self.component, 'JUMBO_MIN_EVENTS_DISABLE', 'jedi', self.vo)
        if nEventsToDisable is None:
            nEventsToDisable = 100000
        nEventsToEnable = self.taskBufferIF.getConfigValue(self.component, 'JUMBO_MIN_EVENTS_ENABLE', 'jedi', self.vo)
        if nEventsToEnable is None:
            nEventsToEnable = nEventsToDisable * 10
        maxEvents = self.taskBufferIF.getConfigValue(self.component, 'JUMBO_MAX_EVENTS', 'jedi', self.vo)
        if maxEvents is None:
            maxEvents = maxTasks * nEventsToEnable // 2
        nJumboPerTask = self.taskBufferIF.getConfigValue(self.component, 'JUMBO_PER_TASK', 'jedi', self.vo)
        if nJumboPerTask is None:
            nJumboPerTask = 1
        nJumboPerSite = self.taskBufferIF.getConfigValue(self.component, 'JUMBO_PER_SITE', 'jedi', self.vo)
        if nJumboPerSite is None:
            nJumboPerSite = 1
        maxPrio = self.taskBufferIF.getConfigValue(self.component, 'JUMBO_MAX_CURR_PRIO', 'jedi', self.vo)
        if maxPrio is None:
            maxPrio = 500
        progressToBoost = self.taskBufferIF.getConfigValue(self.component, 'JUMBO_PROG_TO_BOOST', 'jedi', self.vo)
        if progressToBoost is None:
            progressToBoost = 95
        maxFilesToBoost = self.taskBufferIF.getConfigValue(self.component, 'JUMBO_MAX_FILES_TO_BOOST', 'jedi', self.vo)
        if maxFilesToBoost is None:
            maxFilesToBoost = 500
        prioToBoost = 900
        prioWhenDisabled = self.taskBufferIF.getConfigValue(self.component, 'JUMBO_PRIO_DISABLED', 'jedi', self.vo)
        if prioWhenDisabled is None:
            prioWhenDisabled = 500
        # get current info
        tasksWithJumbo = self.taskBufferIF.getTaskWithJumbo_JEDI(self.vo, self.prodSourceLabel)
        totEvents = 0
        doneEvents = 0
        nTasks = 0
        for jediTaskID, taskData in iteritems(tasksWithJumbo):
            # disable jumbo
            if taskData['useJumbo'] != JediTaskSpec.enum_useJumbo['disabled'] and taskData['site'] is None:
                if taskData['nEvents'] - taskData['nEventsDone'] < nEventsToDisable:
                    # disable
                    self.log.info('component={0} disable jumbo in jediTaskID={1} due to n_events_to_process={2} < {3}'.format(
                        self.component, jediTaskID, taskData['nEvents'] - taskData['nEventsDone'], nEventsToDisable))
                    self.taskBufferIF.enableJumboJobs(jediTaskID, 0, 0)
                else:
                    # wait
                    nTasks += 1
                    totEvents += taskData['nEvents']
                    doneEvents += taskData['nEventsDone']
                    self.log.info('component={0} keep jumbo in jediTaskID={1} due to n_events_to_process={2} > {3}'.format(
                        self.component, jediTaskID, taskData['nEvents'] - taskData['nEventsDone'], nEventsToDisable))
            # increase priority for jumbo disabled
            if taskData['useJumbo'] == JediTaskSpec.enum_useJumbo['disabled'] and \
                    taskData['currentPriority'] < prioWhenDisabled:
                self.taskBufferIF.changeTaskPriorityPanda(jediTaskID, prioWhenDisabled)
                self.log.info('component={0} priority boost to {1} after disabling jumbo in jediTaskID={2}'.format(
                    self.component, prioWhenDisabled, jediTaskID))
            # increase priority when close to completion
            if taskData['nEvents'] > 0 and (taskData['nEvents'] - taskData['nEventsDone']) * 100 // taskData['nEvents'] < progressToBoost \
                    and taskData['currentPriority'] < prioToBoost and (taskData['nFiles'] - taskData['nFilesDone']) < maxFilesToBoost:
                # boost
                tmpStr = 'component={0} priority boost to {5} for jediTaskID={1} due to n_events_done={2} > {3}*{4}% '.format(
                    self.component, jediTaskID, taskData['nEventsDone'], taskData['nEvents'], progressToBoost, prioToBoost)
                tmpStr += 'n_files_remaining={0} < {1}'.format(taskData['nFiles'] - taskData['nFilesDone'], maxFilesToBoost)
                self.log.info(tmpStr)
                self.taskBufferIF.changeTaskPriorityPanda(jediTaskID, prioToBoost)
            # kick pending
            if taskData['taskStatus'] in ['pending', 'running'] and taskData['useJumbo'] in [
                    JediTaskSpec.enum_useJumbo['pending'], JediTaskSpec.enum_useJumbo['running']]:
                nActiveJumbo = 0
                for computingSite, jobStatusMap in iteritems(taskData['jumboJobs']):
                    for jobStatus, nJobs in iteritems(jobStatusMap):
                        if jobStatus in ['defined', 'assigned', 'activated', 'sent', 'starting',
                                         'running', 'transferring', 'holding']:
                            nActiveJumbo += nJobs
                if nActiveJumbo == 0:
                    self.log.info('component={0} kick jumbo in {2} jediTaskID={1}'.format(
                        self.component, jediTaskID, taskData['taskStatus']))
                    self.taskBufferIF.kickPendingTasksWithJumbo_JEDI(jediTaskID)
            # reset input to re-generate co-jumbo
            if taskData['currentPriority'] >= prioToBoost:
                nReset = self.taskBufferIF.resetInputToReGenCoJumbo_JEDI(jediTaskID)
                if nReset is not None and nReset > 0:
                    self.log.info('component={0} reset {1} inputs to regenerate co-jumbo for jediTaskID={2}'.format(
                        self.component, nReset, jediTaskID))
                else:
                    self.log.debug('component={0} tried to reset inputs to regenerate co-jumbo with {1} for jediTaskID={2}'.format(
                        self.component, nReset, jediTaskID))
        self.log.info('component={0} total_events={1} n_events_to_process={2} n_tasks={3} available for jumbo'.format(
            self.component, totEvents, totEvents - doneEvents, nTasks))
        if True:
            # get list of releases and caches available at jumbo job enabled PQs
            jumboRels, jumboCaches = self.taskBufferIF.getRelCacheForJumbo_JEDI()
            # look for tasks to enable jumbo
            if self.dryRun:
                self.log.info('component={0} look for tasks to enable jumbo in dry run mode'.format(self.component))
            else:
                self.log.info('component={0} look for tasks to enable jumbo due to lack of tasks and events to meet max_tasks={1} max_events={2}'.format(
                    self.component, maxTasks, maxEvents))
            tasksToEnableJumbo = self.taskBufferIF.getTaskToEnableJumbo_JEDI(self.vo, self.prodSourceLabel,
                maxPrio, nEventsToEnable)
            nGoodTasks = 0
            self.log.debug('component={0} got {1} tasks to check'.format(self.component, len(tasksToEnableJumbo)))
            # sort by nevents
            nEventsMap = dict()
            for jediTaskID, taskData in iteritems(tasksToEnableJumbo):
                nEventsMap[jediTaskID] = taskData['nEvents']
            sortedList = sorted(list(nEventsMap.items()), key=operator.itemgetter(1))
            sortedList.reverse()
            for jediTaskID, nEvents in sortedList:
                taskData = tasksToEnableJumbo[jediTaskID]
                # get task parameters
                try:
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except Exception:
                    self.log.error('component={0} failed to get task params for jediTaskID={1}'.format(
                        self.component, jediTaskID))
                    continue
                taskSpec = JediTaskSpec()
                taskSpec.splitRule = taskData['splitRule']
                # check if good for jumbo
                if 'esConvertible' not in taskParamMap or taskParamMap['esConvertible'] is False:
                    self.log.info('component={0} skip to enable jumbo for jediTaskID={1} since not ES-convertible'.format(
                        self.component, jediTaskID))
                    continue
                if taskSpec.inFilePosEvtNum():
                    pass
                elif taskSpec.getNumFilesPerJob() == 1:
                    pass
                elif taskSpec.getNumEventsPerJob() is not None and 'nEventsPerInputFile' in taskParamMap \
                        and taskSpec.getNumEventsPerJob() <= taskParamMap['nEventsPerInputFile']:
                    pass
                else:
                    self.log.info('component={0} skip to enable jumbo for jediTaskID={1} since not good for in-file positional event numbers'.format(
                        self.component, jediTaskID))
                    continue
                # check software
                transHome = taskData['transHome']
                cmtConfig = taskData['architecture']
                if re.search(r'^\d+\.\d+\.\d+$', transHome.split('-')[-1]) is not None:
                    transHome = transHome.split('-')[-1]
                    swDict = jumboRels
                else:
                    swDict = jumboCaches
                key = (transHome, cmtConfig)
                if key not in swDict:
                    self.log.info('component={0} skip to enable jumbo for jediTaskID={1} since {2}:{3} is unavailable at jumbo job enabled PQs'.format(
                        self.component, jediTaskID, transHome, cmtConfig))
                    continue
                if not self.dryRun and nTasks < maxTasks and (totEvents - doneEvents) < maxEvents:
                    self.log.info('component={0} enable jumbo in jediTaskID={1} with n_events_to_process={2}'.format(
                        self.component, jediTaskID, taskData['nEvents'] - taskData['nEventsDone']))
                    if taskData['eventService'] == 0:
                        tmpS, tmpO = self.taskBufferIF.enableEventService(taskData['jediTaskID'])
                        if tmpS != 0:
                            self.log.error('component={0} failed to enable ES in jediTaskID={1} with {2}'.format(
                                self.component, jediTaskID, tmpO))
                            continue
                    self.taskBufferIF.enableJumboJobs(taskData['jediTaskID'], nJumboPerTask, nJumboPerSite)
                    nTasks += 1
                    totEvents += taskData['nEvents']
                    doneEvents += taskData['nEventsDone']
                else:
                    nGoodTasks += 1
                    self.log.info('component={0} good to enable jumbo in jediTaskID={1} with n_events_to_process={2}'.format(
                        self.component, jediTaskID, taskData['nEvents'] - taskData['nEventsDone']))
            self.log.info('component={0} there are n_good_tasks={1} tasks good for jumbo'.format(
                self.component, nGoodTasks))
        self.log.debug('component={0} done'.format(self.component))
    except Exception:
        # error
        errtype, errvalue = sys.exc_info()[:2]
        errStr = ": %s %s" % (errtype.__name__, errvalue)
        errStr = errStr.strip()
        errStr += traceback.format_exc()
        self.log.error(errStr)
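# The priority-boost gate above is pure integer arithmetic; a toy check with made-up
# numbers (the defaults progressToBoost=95 and maxFilesToBoost=500 come from the code
# above) shows how the remaining-event percentage is computed with floor division.
nEvents, nEventsDone, progressToBoost = 10000, 9700, 95
remaining_pct = (nEvents - nEventsDone) * 100 // nEvents
print(remaining_pct, remaining_pct < progressToBoost)  # prints: 3 True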
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, splitRule, taskStatus, parent_tid in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                tmpLog.debug('start')
                tmpStat = Interaction.SC_SUCCEEDED
                errStr = ''
                # read task parameters
                try:
                    taskParam = None
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__, errvalue)
                    tmpLog.debug(taskParam)
                    tmpLog.error(errStr)
                    tmpStat = Interaction.SC_FAILED
                    continue
                # get impl
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('getting Impl')
                    try:
                        # get VO and sourceLabel
                        vo = taskParamMap['vo']
                        prodSourceLabel = taskParamMap['prodSourceLabel']
                        taskType = taskParamMap['taskType']
                        tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo, prodSourceLabel, taskType))
                        # get impl
                        impl = self.implFactory.instantiateImpl(vo, prodSourceLabel, taskType,
                            self.taskBufferIF, self.ddmIF)
                        if impl is None:
                            # task refiner is undefined
                            errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo, prodSourceLabel)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # extract common parameters
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('extracting common')
                    try:
                        # initialize impl
                        impl.initializeRefiner(tmpLog)
                        impl.oldTaskStatus = taskStatus
                        # extract common parameters
                        impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                        # set parent tid
                        if parent_tid not in [None, jediTaskID]:
                            impl.taskSpec.parent_tid = parent_tid
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__, errvalue,
                            traceback.format_exc())
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # check attribute length
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('checking attribute length')
                    if not impl.taskSpec.checkAttrLength():
                        tmpLog.error(impl.taskSpec.errorDialog)
                        tmpStat = Interaction.SC_FAILED
                # staging
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if 'toStaging' in taskParamMap and taskStatus != 'staged':
                        errStr = 'wait until staging is done'
                        impl.taskSpec.status = 'staging'
                        impl.taskSpec.oldStatus = taskStatus
                        impl.taskSpec.setErrDiag(errStr)
                        # not to update some task attributes
                        impl.taskSpec.resetRefinedAttrs()
                        tmpLog.info(errStr)
                        self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                            oldStatus=[taskStatus], updateDEFT=False, setFrozenTime=False)
                        continue
                # check parent
                noWaitParent = False
                parentState = None
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if parent_tid not in [None, jediTaskID]:
                        tmpLog.info('check parent task')
                        try:
                            tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                            parentState = tmpStat
                            if tmpStat == 'completed':
                                # parent is done
                                tmpStat = Interaction.SC_SUCCEEDED
                            elif tmpStat == 'running':
                                if not impl.taskSpec.noWaitParent():
                                    # parent is running
                                    errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                    impl.taskSpec.status = taskStatus
                                    impl.taskSpec.setOnHold()
                                    impl.taskSpec.setErrDiag(errStr)
                                    # not to update some task attributes
                                    impl.taskSpec.resetRefinedAttrs()
                                    tmpLog.info(errStr)
                                    self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                        oldStatus=[taskStatus], setFrozenTime=False)
                                    continue
                                else:
                                    # not wait for parent
                                    tmpStat = Interaction.SC_SUCCEEDED
                                    noWaitParent = True
                            else:
                                # parent is corrupted
                                tmpStat = Interaction.SC_FAILED
                                tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                impl.taskSpec.setErrDiag(tmpErrStr)
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # refine
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                    try:
                        tmpStat = impl.doRefine(jediTaskID, taskParamMap)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        # wait unknown input if noWaitParent or waitInput
                        if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput())
                                and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                or errtype == Interaction.JEDITemporaryError:
                            if impl.taskSpec.noWaitParent() or parentState == 'running':
                                tmpErrStr = 'pending until parent produces input'
                                setFrozenTime = False
                            elif errtype == Interaction.JEDITemporaryError:
                                tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                setFrozenTime = True
                            else:
                                tmpErrStr = 'pending until input is staged'
                                setFrozenTime = True
                            impl.taskSpec.status = taskStatus
                            impl.taskSpec.setOnHold()
                            impl.taskSpec.setErrDiag(tmpErrStr)
                            # not to update some task attributes
                            impl.taskSpec.resetRefinedAttrs()
                            tmpLog.info(tmpErrStr)
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                oldStatus=[taskStatus], insertUnknown=impl.unknownDatasetList,
                                setFrozenTime=setFrozenTime)
                            continue
                        else:
                            errStr = 'failed to refine task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # register
                if tmpStat != Interaction.SC_SUCCEEDED:
                    tmpLog.error('failed to refine the task')
                    if impl is None or impl.taskSpec is None:
                        tmpTaskSpec = JediTaskSpec()
                        tmpTaskSpec.jediTaskID = jediTaskID
                    else:
                        tmpTaskSpec = impl.taskSpec
                    tmpTaskSpec.status = 'tobroken'
                    if errStr != '':
                        tmpTaskSpec.setErrDiag(errStr, True)
                    self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': tmpTaskSpec.jediTaskID}, oldStatus=[taskStatus])
                else:
                    tmpLog.info('registering')
                    # fill JEDI tables
                    try:
                        # enable protection against task duplication
                        if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                not impl.taskSpec.checkPreProcessed():
                            uniqueTaskName = True
                        else:
                            uniqueTaskName = False
                        strTaskParams = None
                        if impl.updatedTaskParams is not None:
                            strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                        if taskStatus in ['registered', 'staged']:
                            # unset pre-process flag
                            if impl.taskSpec.checkPreProcessed():
                                impl.taskSpec.setPostPreProcess()
                            # full registration
                            tmpStat, newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID, impl.taskSpec,
                                impl.inMasterDatasetSpec,
                                impl.inSecDatasetSpecList,
                                impl.outDatasetSpecList,
                                impl.outputTemplateMap,
                                impl.jobParamsTemplate,
                                strTaskParams,
                                impl.unmergeMasterDatasetSpec,
                                impl.unmergeDatasetSpecMap,
                                uniqueTaskName,
                                taskStatus)
                            if not tmpStat:
                                tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                tmpLog.error(tmpErrStr)
                                impl.taskSpec.status = newTaskStatus
                                impl.taskSpec.setErrDiag(tmpErrStr, True)
                                self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                    oldStatus=[taskStatus])
                            tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        else:
                            # disable scouts if previous attempt didn't use it
                            if not impl.taskSpec.useScout(splitRule):
                                impl.taskSpec.setUseScout(False)
                            # disallow to reset some attributes
                            for attName in ['ramCount', 'walltime', 'cpuTime', 'startTime']:
                                impl.taskSpec.resetChangedAttr(attName)
                            # update task with new params
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                oldStatus=[taskStatus])
                            # appending for incremental execution
                            tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID, impl.inMasterDatasetSpec,
                                impl.inSecDatasetSpecList)
                            if not tmpStat:
                                tmpLog.error('failed to append datasets for incexec')
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(tmpErrStr)
                    else:
                        tmpLog.info('done')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
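# The updateTask_JEDI calls above pass oldStatus=[taskStatus], i.e. an optimistic
# compare-and-swap on the task status. The sketch below shows the assumed semantics
# with a plain dict standing in for the task table; it is not the real taskBufferIF API.
def guarded_update(task_table, jedi_task_id, new_status, old_status):
    # apply the update only if the task is still in the status this worker observed
    if task_table.get(jedi_task_id) != old_status:
        return False  # another component changed the task meanwhile
    task_table[jedi_task_id] = new_status
    return True

tasks = {123: 'registered'}
assert guarded_update(tasks, 123, 'staging', 'registered') is True
assert guarded_update(tasks, 123, 'tobroken', 'registered') is False  # status moved on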
def extractCommon(self, jediTaskID, taskParamMap, workQueueMapper, splitRule):
    # make task spec
    taskSpec = JediTaskSpec()
    taskSpec.jediTaskID = jediTaskID
    taskSpec.taskName = taskParamMap['taskName']
    taskSpec.userName = taskParamMap['userName']
    taskSpec.vo = taskParamMap['vo']
    taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel']
    taskSpec.taskPriority = taskParamMap['taskPriority']
    taskSpec.currentPriority = taskSpec.taskPriority
    taskSpec.architecture = taskParamMap['architecture']
    taskSpec.transUses = taskParamMap['transUses']
    taskSpec.transHome = taskParamMap['transHome']
    taskSpec.transPath = taskParamMap['transPath']
    taskSpec.processingType = taskParamMap['processingType']
    taskSpec.taskType = taskParamMap['taskType']
    taskSpec.splitRule = splitRule
    taskSpec.startTime = datetime.datetime.utcnow()
    if taskParamMap.has_key('workingGroup'):
        taskSpec.workingGroup = taskParamMap['workingGroup']
    if taskParamMap.has_key('countryGroup'):
        taskSpec.countryGroup = taskParamMap['countryGroup']
    if taskParamMap.has_key('ticketID'):
        taskSpec.ticketID = taskParamMap['ticketID']
    if taskParamMap.has_key('ticketSystemType'):
        taskSpec.ticketSystemType = taskParamMap['ticketSystemType']
    if taskParamMap.has_key('reqID'):
        taskSpec.reqID = taskParamMap['reqID']
    else:
        taskSpec.reqID = jediTaskID
    if taskParamMap.has_key('coreCount'):
        taskSpec.coreCount = taskParamMap['coreCount']
    else:
        taskSpec.coreCount = 1
    if taskParamMap.has_key('walltime'):
        taskSpec.walltime = taskParamMap['walltime']
    else:
        taskSpec.walltime = 0
    if taskParamMap.has_key('walltimeUnit'):
        taskSpec.walltimeUnit = taskParamMap['walltimeUnit']
    if taskParamMap.has_key('outDiskCount'):
        taskSpec.outDiskCount = taskParamMap['outDiskCount']
    else:
        taskSpec.outDiskCount = 0
    if 'outDiskUnit' in taskParamMap:
        taskSpec.outDiskUnit = taskParamMap['outDiskUnit']
    if taskParamMap.has_key('workDiskCount'):
        taskSpec.workDiskCount = taskParamMap['workDiskCount']
    else:
        taskSpec.workDiskCount = 0
    if taskParamMap.has_key('workDiskUnit'):
        taskSpec.workDiskUnit = taskParamMap['workDiskUnit']
    if taskParamMap.has_key('ramCount'):
        taskSpec.ramCount = taskParamMap['ramCount']
    else:
        taskSpec.ramCount = 0
    if taskParamMap.has_key('ramUnit'):
        taskSpec.ramUnit = taskParamMap['ramUnit']
    if taskParamMap.has_key('baseRamCount'):
        taskSpec.baseRamCount = taskParamMap['baseRamCount']
    else:
        taskSpec.baseRamCount = 0
    # HS06 stuff
    if 'cpuTimeUnit' in taskParamMap:
        taskSpec.cpuTimeUnit = taskParamMap['cpuTimeUnit']
    if 'cpuTime' in taskParamMap:
        taskSpec.cpuTime = taskParamMap['cpuTime']
    if 'cpuEfficiency' in taskParamMap:
        taskSpec.cpuEfficiency = taskParamMap['cpuEfficiency']
    else:
        # 90% of cpu efficiency by default
        taskSpec.cpuEfficiency = 90
    if 'baseWalltime' in taskParamMap:
        taskSpec.baseWalltime = taskParamMap['baseWalltime']
    else:
        # 10min of offset by default
        taskSpec.baseWalltime = 10 * 60
    # for merge
    if 'mergeRamCount' in taskParamMap:
        taskSpec.mergeRamCount = taskParamMap['mergeRamCount']
    if 'mergeCoreCount' in taskParamMap:
        taskSpec.mergeCoreCount = taskParamMap['mergeCoreCount']
    # scout
    if not taskParamMap.has_key('skipScout') and not taskSpec.isPostScout():
        taskSpec.setUseScout(True)
    # cloud
    if taskParamMap.has_key('cloud'):
        self.cloudName = taskParamMap['cloud']
        taskSpec.cloud = self.cloudName
    else:
        # set dummy to force update
        taskSpec.cloud = 'dummy'
        taskSpec.cloud = None
    # site
    if taskParamMap.has_key('site'):
        self.siteName = taskParamMap['site']
        taskSpec.site = self.siteName
    else:
        # set dummy to force update
        taskSpec.site = 'dummy'
        taskSpec.site = None
    # nucleus
    if 'nucleus' in taskParamMap:
        taskSpec.nucleus = taskParamMap['nucleus']
    # preset some parameters for job cloning
    if 'useJobCloning' in taskParamMap:
        # set implicit parameters
        if 'nEventsPerWorker' not in taskParamMap:
            taskParamMap['nEventsPerWorker'] = 1
        if 'nSitesPerJob' not in taskParamMap:
            taskParamMap['nSitesPerJob'] = 2
        if 'nEsConsumers' not in taskParamMap:
            taskParamMap['nEsConsumers'] = taskParamMap['nSitesPerJob']
    # event service
    if taskParamMap.has_key('nEventsPerWorker'):
        taskSpec.eventService = 1
    else:
        taskSpec.eventService = 0
    # ttcr: requested time to completion
    if taskParamMap.has_key('ttcrTimestamp'):
        try:
            # get rid of the +00:00 timezone string and parse the timestamp
            taskSpec.ttcRequested = datetime.datetime.strptime(taskParamMap['ttcrTimestamp'].split('+')[0],
                '%Y-%m-%d %H:%M:%S.%f')
        except (IndexError, ValueError):
            pass
    # goal
    if 'goal' in taskParamMap:
        try:
            taskSpec.goal = int(float(taskParamMap['goal']) * 10)
            if taskSpec.goal >= 1000:
                taskSpec.goal = None
        except:
            pass
    # campaign
    if taskParamMap.has_key('campaign'):
        taskSpec.campaign = taskParamMap['campaign']
    # work queue
    workQueue = None
    if 'workQueueName' in taskParamMap:
        # work queue is specified
        workQueue = workQueueMapper.getQueueWithName(taskSpec.vo, taskSpec.prodSourceLabel, taskParamMap['workQueueName'])
    if workQueue is None:
        # get work queue based on task attributes
        workQueue, tmpStr = workQueueMapper.getQueueWithSelParams(taskSpec.vo,
            taskSpec.prodSourceLabel,
            processingType=taskSpec.processingType,
            workingGroup=taskSpec.workingGroup,
            coreCount=taskSpec.coreCount,
            site=taskSpec.site)
    if workQueue is None:
        errStr = 'workqueue is undefined for vo={0} label={1} '.format(taskSpec.vo, taskSpec.prodSourceLabel)
        errStr += 'processingType={0} workingGroup={1} coreCount={2} '.format(taskSpec.processingType,
            taskSpec.workingGroup, taskSpec.coreCount)
        raise RuntimeError(errStr)
    taskSpec.workQueue_ID = workQueue.queue_id
    self.taskSpec = taskSpec
    # set split rule
    if 'tgtNumEventsPerJob' in taskParamMap:
        # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used
        if 'nFilesPerJob' not in taskParamMap:
            self.setSplitRule(None, taskParamMap['tgtNumEventsPerJob'], JediTaskSpec.splitRuleToken['nEventsPerJob'])
    self.setSplitRule(taskParamMap, 'nFilesPerJob', JediTaskSpec.splitRuleToken['nFilesPerJob'])
    self.setSplitRule(taskParamMap, 'nEventsPerJob', JediTaskSpec.splitRuleToken['nEventsPerJob'])
    self.setSplitRule(taskParamMap, 'nGBPerJob', JediTaskSpec.splitRuleToken['nGBPerJob'])
    self.setSplitRule(taskParamMap, 'nMaxFilesPerJob', JediTaskSpec.splitRuleToken['nMaxFilesPerJob'])
    self.setSplitRule(taskParamMap, 'nEventsPerWorker', JediTaskSpec.splitRuleToken['nEventsPerWorker'])
    self.setSplitRule(taskParamMap, 'useLocalIO', JediTaskSpec.splitRuleToken['useLocalIO'])
    self.setSplitRule(taskParamMap, 'disableAutoRetry', JediTaskSpec.splitRuleToken['disableAutoRetry'])
    self.setSplitRule(taskParamMap, 'nEsConsumers', JediTaskSpec.splitRuleToken['nEsConsumers'])
    self.setSplitRule(taskParamMap, 'waitInput', JediTaskSpec.splitRuleToken['waitInput'])
    self.setSplitRule(taskParamMap, 'addNthFieldToLFN', JediTaskSpec.splitRuleToken['addNthFieldToLFN'])
    self.setSplitRule(taskParamMap, 'scoutSuccessRate', JediTaskSpec.splitRuleToken['scoutSuccessRate'])
    self.setSplitRule(taskParamMap, 't1Weight', JediTaskSpec.splitRuleToken['t1Weight'])
    self.setSplitRule(taskParamMap, 'maxAttemptES', JediTaskSpec.splitRuleToken['maxAttemptES'])
    self.setSplitRule(taskParamMap, 'nSitesPerJob', JediTaskSpec.splitRuleToken['nSitesPerJob'])
    self.setSplitRule(taskParamMap, 'nEventsPerMergeJob', JediTaskSpec.splitRuleToken['nEventsPerMergeJob'])
    self.setSplitRule(taskParamMap, 'nFilesPerMergeJob', JediTaskSpec.splitRuleToken['nFilesPerMergeJob'])
    self.setSplitRule(taskParamMap, 'nGBPerMergeJob', JediTaskSpec.splitRuleToken['nGBPerMergeJob'])
    self.setSplitRule(taskParamMap, 'nMaxFilesPerMergeJob', JediTaskSpec.splitRuleToken['nMaxFilesPerMergeJob'])
    if taskParamMap.has_key('loadXML'):
        self.setSplitRule(None, 3, JediTaskSpec.splitRuleToken['loadXML'])
        self.setSplitRule(None, 4, JediTaskSpec.splitRuleToken['groupBoundaryID'])
    if taskParamMap.has_key('pfnList'):
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['pfnList'])
    if taskParamMap.has_key('noWaitParent'):
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['noWaitParent'])
    if 'respectLB' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['respectLB'])
    if taskParamMap.has_key('reuseSecOnDemand'):
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['reuseSecOnDemand'])
    if 'ddmBackEnd' in taskParamMap:
        self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd'])
    if 'disableReassign' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['disableReassign'])
    if 'allowPartialFinish' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['allowPartialFinish'])
    if 'useExhausted' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useExhausted'])
    if 'useRealNumEvents' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useRealNumEvents'])
    if 'ipConnectivity' in taskParamMap:
        self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity'])
    if 'altStageOut' in taskParamMap:
        self.taskSpec.setAltStageOut(taskParamMap['altStageOut'])
    if 'allowInputLAN' in taskParamMap:
        self.taskSpec.setAllowInputLAN(taskParamMap['allowInputLAN'])
    if 'runUntilClosed' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['runUntilClosed'])
    if 'stayOutputOnSite' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['stayOutputOnSite'])
    if 'useJobCloning' in taskParamMap:
        scValue = EventServiceUtils.getJobCloningValue(taskParamMap['useJobCloning'])
        self.setSplitRule(None, scValue, JediTaskSpec.splitRuleToken['useJobCloning'])
    if 'failWhenGoalUnreached' in taskParamMap and taskParamMap['failWhenGoalUnreached'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['failGoalUnreached'])
    if 'switchEStoNormal' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['switchEStoNormal'])
    if 'nEventsPerRange' in taskParamMap:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['dynamicNumEvents'])
    if 'allowInputWAN' in taskParamMap and taskParamMap['allowInputWAN'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['allowInputWAN'])
    if 'putLogToOS' in taskParamMap and taskParamMap['putLogToOS'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['putLogToOS'])
    # return
    return
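# extractCommon compresses many optional task parameters into the single splitRule
# string on the task spec via setSplitRule(token, value). The real token table lives in
# JediTaskSpec.splitRuleToken; the comma-separated "token=value" layout below is an
# assumption for illustration only.
def set_split_rule(split_rule, token, value):
    # drop any existing entry for this token, then append the new one
    items = [i for i in (split_rule or '').split(',') if i and not i.startswith(token + '=')]
    items.append('{0}={1}'.format(token, value))
    return ','.join(items)

rule = set_split_rule(None, 'NF', 10)   # e.g. nFilesPerJob
rule = set_split_rule(rule, 'ES', 1)    # e.g. an event-service flag
print(rule)  # NF=10,ES=1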
def extractCommon(self, jediTaskID, taskParamMap, workQueueMapper, splitRule):
    # make task spec
    taskSpec = JediTaskSpec()
    taskSpec.jediTaskID = jediTaskID
    taskSpec.taskName = taskParamMap['taskName']
    taskSpec.userName = taskParamMap['userName']
    taskSpec.vo = taskParamMap['vo']
    taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel']
    taskSpec.taskPriority = taskParamMap['taskPriority']
    taskSpec.currentPriority = taskSpec.taskPriority
    taskSpec.architecture = taskParamMap['architecture']
    taskSpec.transUses = taskParamMap['transUses']
    taskSpec.transHome = taskParamMap['transHome']
    taskSpec.transPath = taskParamMap['transPath']
    taskSpec.processingType = taskParamMap['processingType']
    taskSpec.taskType = taskParamMap['taskType']
    taskSpec.splitRule = splitRule
    if taskParamMap.has_key('workingGroup'):
        taskSpec.workingGroup = taskParamMap['workingGroup']
    if taskParamMap.has_key('countryGroup'):
        taskSpec.countryGroup = taskParamMap['countryGroup']
    if taskParamMap.has_key('ticketID'):
        taskSpec.ticketID = taskParamMap['ticketID']
    if taskParamMap.has_key('ticketSystemType'):
        taskSpec.ticketSystemType = taskParamMap['ticketSystemType']
    if taskParamMap.has_key('reqID'):
        taskSpec.reqID = taskParamMap['reqID']
    else:
        taskSpec.reqID = jediTaskID
    if taskParamMap.has_key('coreCount'):
        taskSpec.coreCount = taskParamMap['coreCount']
    else:
        taskSpec.coreCount = 1
    if taskParamMap.has_key('walltime'):
        taskSpec.walltime = taskParamMap['walltime']
    else:
        taskSpec.walltime = 0
    if taskParamMap.has_key('outDiskCount'):
        taskSpec.outDiskCount = taskParamMap['outDiskCount']
    else:
        taskSpec.outDiskCount = 0
    if taskParamMap.has_key('workDiskCount'):
        taskSpec.workDiskCount = taskParamMap['workDiskCount']
    else:
        taskSpec.workDiskCount = 0
    if taskParamMap.has_key('ramCount'):
        taskSpec.ramCount = taskParamMap['ramCount']
    else:
        taskSpec.ramCount = 0
    # scout
    if not taskParamMap.has_key('skipScout'):
        taskSpec.setUseScout(True)
    # cloud
    if taskParamMap.has_key('cloud'):
        self.cloudName = taskParamMap['cloud']
        taskSpec.cloud = self.cloudName
    # site
    if taskParamMap.has_key('site'):
        self.siteName = taskParamMap['site']
        taskSpec.site = self.siteName
    # event service
    if taskParamMap.has_key('nEventsPerWorker'):
        taskSpec.eventService = 1
    else:
        taskSpec.eventService = 0
    # work queue
    workQueue, tmpStr = workQueueMapper.getQueueWithSelParams(taskSpec.vo,
        taskSpec.prodSourceLabel,
        processingType=taskSpec.processingType,
        workingGroup=taskSpec.workingGroup,
        coreCount=taskSpec.coreCount)
    if workQueue is None:
        errStr = 'workqueue is undefined for vo={0} label={1} '.format(taskSpec.vo, taskSpec.prodSourceLabel)
        errStr += 'processingType={0} workingGroup={1} coreCount={2} '.format(taskSpec.processingType,
            taskSpec.workingGroup, taskSpec.coreCount)
        raise RuntimeError(errStr)
    taskSpec.workQueue_ID = workQueue.queue_id
    self.taskSpec = taskSpec
    # set split rule
    self.setSplitRule(taskParamMap, 'nFilesPerJob', JediTaskSpec.splitRuleToken['nFilesPerJob'])
    self.setSplitRule(taskParamMap, 'nEventsPerJob', JediTaskSpec.splitRuleToken['nEventsPerJob'])
    self.setSplitRule(taskParamMap, 'nGBPerJob', JediTaskSpec.splitRuleToken['nGBPerJob'])
    self.setSplitRule(taskParamMap, 'nMaxFilesPerJob', JediTaskSpec.splitRuleToken['nMaxFilesPerJob'])
    self.setSplitRule(taskParamMap, 'nEventsPerWorker', JediTaskSpec.splitRuleToken['nEventsPerWorker'])
    self.setSplitRule(taskParamMap, 'useLocalIO', JediTaskSpec.splitRuleToken['useLocalIO'])
    self.setSplitRule(taskParamMap, 'disableAutoRetry', JediTaskSpec.splitRuleToken['disableAutoRetry'])
    self.setSplitRule(taskParamMap, 'nEsConsumers', JediTaskSpec.splitRuleToken['nEsConsumers'])
    if taskParamMap.has_key('loadXML'):
        self.setSplitRule(None, 3, JediTaskSpec.splitRuleToken['loadXML'])
        self.setSplitRule(None, 4, JediTaskSpec.splitRuleToken['groupBoundaryID'])
    if taskParamMap.has_key('pfnList'):
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['pfnList'])
    if taskParamMap.has_key('noWaitParent'):
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['noWaitParent'])
    # return
    return
# check a couple of shares if they are valid leave names
share_name = 'wrong_share'
print("Share {0} is valid: {1}".format(share_name, proxyS.is_valid_share(share_name)))
share_name = 'MC16Pile'
print("Share {0} is valid: {1}".format(share_name, proxyS.is_valid_share(share_name)))
try:
    from pandajedi.jedicore.JediTaskSpec import JediTaskSpec
except ImportError:
    print("Skipped task tests since JEDI module dependency not satisfied")
    sys.exit(0)
# create a fake task with relevant fields and retrieve its share
task_spec = JediTaskSpec()
# Analysis task
task_spec.prodSourceLabel = 'user'
task_spec.campaign = 'dummy_campaign'
task_spec.workingGroup = 'dummy_wg'
task_spec.processingType = 'dummy_type'
print("Share for task is {0} (should be 'Analysis')".format(proxyS.get_share_for_task(task_spec)))
# Production task without any matching leave
task_spec.prodSourceLabel = 'managed'
task_spec.campaign = 'dummy_campaign'
task_spec.workingGroup = 'dummy_wg'
task_spec.processingType = 'dummy_type'
print("Share for task is {0} (should be 'Undefined')".format(
def extractCommon(self,jediTaskID,taskParamMap,workQueueMapper,splitRule): # make task spec taskSpec = JediTaskSpec() taskSpec.jediTaskID = jediTaskID taskSpec.taskName = taskParamMap['taskName'] taskSpec.userName = taskParamMap['userName'] taskSpec.vo = taskParamMap['vo'] taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel'] taskSpec.taskPriority = taskParamMap['taskPriority'] if 'currentPriority' in taskParamMap: taskSpec.currentPriority = taskParamMap['currentPriority'] else: taskSpec.currentPriority = taskSpec.taskPriority taskSpec.architecture = taskParamMap['architecture'] taskSpec.transUses = taskParamMap['transUses'] taskSpec.transHome = taskParamMap['transHome'] taskSpec.transPath = taskParamMap['transPath'] taskSpec.processingType = taskParamMap['processingType'] taskSpec.taskType = taskParamMap['taskType'] taskSpec.splitRule = splitRule taskSpec.startTime = datetime.datetime.utcnow() if taskParamMap.has_key('workingGroup'): taskSpec.workingGroup = taskParamMap['workingGroup'] if taskParamMap.has_key('countryGroup'): taskSpec.countryGroup = taskParamMap['countryGroup'] if taskParamMap.has_key('ticketID'): taskSpec.ticketID = taskParamMap['ticketID'] if taskParamMap.has_key('ticketSystemType'): taskSpec.ticketSystemType = taskParamMap['ticketSystemType'] if taskParamMap.has_key('reqID'): taskSpec.reqID = taskParamMap['reqID'] else: taskSpec.reqID = jediTaskID if taskParamMap.has_key('coreCount'): taskSpec.coreCount = taskParamMap['coreCount'] else: taskSpec.coreCount = 1 if taskParamMap.has_key('walltime'): taskSpec.walltime = taskParamMap['walltime'] else: taskSpec.walltime = 0 if not taskParamMap.has_key('walltimeUnit'): # force to set NULL so that retried tasks get data from scouts again taskSpec.forceUpdate('walltimeUnit') if taskParamMap.has_key('outDiskCount'): taskSpec.outDiskCount = taskParamMap['outDiskCount'] else: taskSpec.outDiskCount = 0 if 'outDiskUnit' in taskParamMap: taskSpec.outDiskUnit = taskParamMap['outDiskUnit'] if taskParamMap.has_key('workDiskCount'): taskSpec.workDiskCount = taskParamMap['workDiskCount'] else: taskSpec.workDiskCount = 0 if taskParamMap.has_key('workDiskUnit'): taskSpec.workDiskUnit = taskParamMap['workDiskUnit'] if taskParamMap.has_key('ramCount'): taskSpec.ramCount = taskParamMap['ramCount'] else: taskSpec.ramCount = 0 if taskParamMap.has_key('ramUnit'): taskSpec.ramUnit = taskParamMap['ramUnit'] if taskParamMap.has_key('baseRamCount'): taskSpec.baseRamCount = taskParamMap['baseRamCount'] else: taskSpec.baseRamCount = 0 # IO if 'ioIntensity' in taskParamMap: taskSpec.ioIntensity = taskParamMap['ioIntensity'] if 'ioIntensityUnit' in taskParamMap: taskSpec.ioIntensityUnit = taskParamMap['ioIntensityUnit'] # HS06 stuff if 'cpuTimeUnit' in taskParamMap: taskSpec.cpuTimeUnit = taskParamMap['cpuTimeUnit'] if 'cpuTime' in taskParamMap: taskSpec.cpuTime = taskParamMap['cpuTime'] if 'cpuEfficiency' in taskParamMap: taskSpec.cpuEfficiency = taskParamMap['cpuEfficiency'] else: # 90% of cpu efficiency by default taskSpec.cpuEfficiency = 90 if 'baseWalltime' in taskParamMap: taskSpec.baseWalltime = taskParamMap['baseWalltime'] else: # 10min of offset by default taskSpec.baseWalltime = 10*60 # for merge if 'mergeRamCount' in taskParamMap: taskSpec.mergeRamCount = taskParamMap['mergeRamCount'] if 'mergeCoreCount' in taskParamMap: taskSpec.mergeCoreCount = taskParamMap['mergeCoreCount'] # scout if not taskParamMap.has_key('skipScout') and not taskSpec.isPostScout(): taskSpec.setUseScout(True) # cloud if taskParamMap.has_key('cloud'): self.cloudName 
= taskParamMap['cloud'] taskSpec.cloud = self.cloudName else: # set dummy to force update taskSpec.cloud = 'dummy' taskSpec.cloud = None # site if taskParamMap.has_key('site'): self.siteName = taskParamMap['site'] taskSpec.site = self.siteName else: # set dummy to force update taskSpec.site = 'dummy' taskSpec.site = None # nucleus if 'nucleus' in taskParamMap: taskSpec.nucleus = taskParamMap['nucleus'] # preset some parameters for job cloning if 'useJobCloning' in taskParamMap: # set implicit parameters if not 'nEventsPerWorker' in taskParamMap: taskParamMap['nEventsPerWorker'] = 1 if not 'nSitesPerJob' in taskParamMap: taskParamMap['nSitesPerJob'] = 2 if not 'nEsConsumers' in taskParamMap: taskParamMap['nEsConsumers'] = taskParamMap['nSitesPerJob'] # minimum granularity if 'minGranularity' in taskParamMap: taskParamMap['nEventsPerRange'] = taskParamMap['minGranularity'] # event service flag if 'useJobCloning' in taskParamMap: taskSpec.eventService = 2 elif taskParamMap.has_key('nEventsPerWorker'): taskSpec.eventService = 1 else: taskSpec.eventService = 0 # OS if 'osInfo' in taskParamMap: taskSpec.termCondition = taskParamMap['osInfo'] # ttcr: requested time to completion if taskParamMap.has_key('ttcrTimestamp'): try: # get rid of the +00:00 timezone string and parse the timestamp taskSpec.ttcRequested = datetime.datetime.strptime(taskParamMap['ttcrTimestamp'].split('+')[0], '%Y-%m-%d %H:%M:%S.%f') except (IndexError, ValueError): pass # goal if 'goal' in taskParamMap: try: taskSpec.goal = int(float(taskParamMap['goal'])*10) if taskSpec.goal > 1000: taskSpec.goal = None except: pass # campaign if taskParamMap.has_key('campaign'): taskSpec.campaign = taskParamMap['campaign'] # request type if 'requestType' in taskParamMap: taskSpec.requestType = taskParamMap['requestType'] self.taskSpec = taskSpec # set split rule if 'tgtNumEventsPerJob' in taskParamMap: # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used if not 'nFilesPerJob' in taskParamMap: self.setSplitRule(None,taskParamMap['tgtNumEventsPerJob'],JediTaskSpec.splitRuleToken['nEventsPerJob']) self.setSplitRule(taskParamMap,'nFilesPerJob', JediTaskSpec.splitRuleToken['nFilesPerJob']) self.setSplitRule(taskParamMap,'nEventsPerJob', JediTaskSpec.splitRuleToken['nEventsPerJob']) self.setSplitRule(taskParamMap,'nGBPerJob', JediTaskSpec.splitRuleToken['nGBPerJob']) self.setSplitRule(taskParamMap,'nMaxFilesPerJob', JediTaskSpec.splitRuleToken['nMaxFilesPerJob']) self.setSplitRule(taskParamMap,'nEventsPerWorker', JediTaskSpec.splitRuleToken['nEventsPerWorker']) self.setSplitRule(taskParamMap,'useLocalIO', JediTaskSpec.splitRuleToken['useLocalIO']) self.setSplitRule(taskParamMap,'disableAutoRetry', JediTaskSpec.splitRuleToken['disableAutoRetry']) self.setSplitRule(taskParamMap,'nEsConsumers', JediTaskSpec.splitRuleToken['nEsConsumers']) self.setSplitRule(taskParamMap,'waitInput', JediTaskSpec.splitRuleToken['waitInput']) self.setSplitRule(taskParamMap,'addNthFieldToLFN', JediTaskSpec.splitRuleToken['addNthFieldToLFN']) self.setSplitRule(taskParamMap,'scoutSuccessRate', JediTaskSpec.splitRuleToken['scoutSuccessRate']) self.setSplitRule(taskParamMap,'t1Weight', JediTaskSpec.splitRuleToken['t1Weight']) self.setSplitRule(taskParamMap,'maxAttemptES', JediTaskSpec.splitRuleToken['maxAttemptES']) self.setSplitRule(taskParamMap,'maxAttemptEsJob', JediTaskSpec.splitRuleToken['maxAttemptEsJob']) self.setSplitRule(taskParamMap,'nSitesPerJob', JediTaskSpec.splitRuleToken['nSitesPerJob']) 
self.setSplitRule(taskParamMap,'nEventsPerMergeJob', JediTaskSpec.splitRuleToken['nEventsPerMergeJob']) self.setSplitRule(taskParamMap,'nFilesPerMergeJob', JediTaskSpec.splitRuleToken['nFilesPerMergeJob']) self.setSplitRule(taskParamMap,'nGBPerMergeJob', JediTaskSpec.splitRuleToken['nGBPerMergeJob']) self.setSplitRule(taskParamMap,'nMaxFilesPerMergeJob', JediTaskSpec.splitRuleToken['nMaxFilesPerMergeJob']) self.setSplitRule(taskParamMap,'maxWalltime', JediTaskSpec.splitRuleToken['maxWalltime']) self.setSplitRule(taskParamMap,'tgtMaxOutputForNG', JediTaskSpec.splitRuleToken['tgtMaxOutputForNG']) if 'nJumboJobs' in taskParamMap: self.setSplitRule(taskParamMap,'nJumboJobs',JediTaskSpec.splitRuleToken['nJumboJobs']) taskSpec.useJumbo = JediTaskSpec.enum_useJumbo['waiting'] if 'maxJumboPerSite' in taskParamMap: self.setSplitRule(taskParamMap,'maxJumboPerSite',JediTaskSpec.splitRuleToken['maxJumboPerSite']) if 'minCpuEfficiency' in taskParamMap: self.setSplitRule(taskParamMap,'minCpuEfficiency',JediTaskSpec.splitRuleToken['minCpuEfficiency']) if taskParamMap.has_key('loadXML'): self.setSplitRule(None,3,JediTaskSpec.splitRuleToken['loadXML']) self.setSplitRule(None,4,JediTaskSpec.splitRuleToken['groupBoundaryID']) if taskParamMap.has_key('pfnList'): self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['pfnList']) if taskParamMap.has_key('noWaitParent') and taskParamMap['noWaitParent'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['noWaitParent']) if 'respectLB' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['respectLB']) if 'orderByLB' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['orderByLB']) if 'respectSplitRule' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['respectSplitRule']) if taskParamMap.has_key('reuseSecOnDemand'): self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['reuseSecOnDemand']) if 'ddmBackEnd' in taskParamMap: self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd']) if 'disableReassign' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['disableReassign']) if 'allowPartialFinish' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['allowPartialFinish']) if 'useExhausted' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['useExhausted']) if 'useRealNumEvents' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['useRealNumEvents']) if 'ipConnectivity' in taskParamMap: self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity']) if 'altStageOut' in taskParamMap: self.taskSpec.setAltStageOut(taskParamMap['altStageOut']) if 'allowInputLAN' in taskParamMap: self.taskSpec.setAllowInputLAN(taskParamMap['allowInputLAN']) if 'runUntilClosed' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['runUntilClosed']) if 'stayOutputOnSite' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['stayOutputOnSite']) if 'useJobCloning' in taskParamMap: scValue = EventServiceUtils.getJobCloningValue(taskParamMap['useJobCloning']) self.setSplitRule(None,scValue,JediTaskSpec.splitRuleToken['useJobCloning']) if 'failWhenGoalUnreached' in taskParamMap and taskParamMap['failWhenGoalUnreached'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['failGoalUnreached']) if 'switchEStoNormal' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['switchEStoNormal']) if 'nEventsPerRange' in taskParamMap: 
        if 'nEventsPerRange' in taskParamMap:
            self.setSplitRule(taskParamMap, 'nEventsPerRange', JediTaskSpec.splitRuleToken['dynamicNumEvents'])
        if 'allowInputWAN' in taskParamMap and taskParamMap['allowInputWAN'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['allowInputWAN'])
        if 'putLogToOS' in taskParamMap and taskParamMap['putLogToOS'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['putLogToOS'])
        if 'mergeEsOnOS' in taskParamMap and taskParamMap['mergeEsOnOS'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['mergeEsOnOS'])
        if 'writeInputToFile' in taskParamMap and taskParamMap['writeInputToFile'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['writeInputToFile'])
        if 'useFileAsSourceLFN' in taskParamMap and taskParamMap['useFileAsSourceLFN'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useFileAsSourceLFN'])
        if 'ignoreMissingInDS' in taskParamMap and taskParamMap['ignoreMissingInDS'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['ignoreMissingInDS'])
        if 'noExecStrCnv' in taskParamMap and taskParamMap['noExecStrCnv'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['noExecStrCnv'])
        if 'inFilePosEvtNum' in taskParamMap and taskParamMap['inFilePosEvtNum'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['inFilePosEvtNum'])
        if self.taskSpec.useEventService() and not taskSpec.useJobCloning():
            if 'registerEsFiles' in taskParamMap and taskParamMap['registerEsFiles'] == True:
                self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['registerEsFiles'])
        if 'disableAutoFinish' in taskParamMap and taskParamMap['disableAutoFinish'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['disableAutoFinish'])
        if 'resurrectConsumers' in taskParamMap and taskParamMap['resurrectConsumers'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['resurrectConsumers'])
        if 'usePrefetcher' in taskParamMap and taskParamMap['usePrefetcher'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['usePrefetcher'])
        if 'notDiscardEvents' in taskParamMap and taskParamMap['notDiscardEvents'] == True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['notDiscardEvents'])
        if 'decAttOnFailedES' in taskParamMap and taskParamMap['decAttOnFailedES'] is True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['decAttOnFailedES'])
        if 'useZipToPin' in taskParamMap and taskParamMap['useZipToPin'] is True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useZipToPin'])
        if 'osMatching' in taskParamMap and taskParamMap['osMatching'] is True:
            self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['osMatching'])
        # work queue
        workQueue = None
        if 'workQueueName' in taskParamMap:
            # work queue is specified
            workQueue = workQueueMapper.getQueueByName(taskSpec.vo, taskSpec.prodSourceLabel,
                                                       taskParamMap['workQueueName'])
        if workQueue is None:
            # get work queue based on task attributes
            workQueue, tmpStr = workQueueMapper.getQueueWithSelParams(taskSpec.vo,
                                                                      taskSpec.prodSourceLabel,
                                                                      prodSourceLabel=taskSpec.prodSourceLabel,
                                                                      processingType=taskSpec.processingType,
                                                                      workingGroup=taskSpec.workingGroup,
                                                                      coreCount=taskSpec.coreCount,
                                                                      site=taskSpec.site,
                                                                      eventService=taskSpec.eventService,
                                                                      splitRule=taskSpec.splitRule,
                                                                      campaign=taskSpec.campaign)
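        # Note on the resolution order above: an explicitly given workQueueName
        # takes precedence; otherwise the mapper selects a queue by matching the
        # task attributes (processingType, workingGroup, coreCount, site,
        # eventService, splitRule, campaign) against its selection parameters.
        # Only when both lookups fail is the task rejected below.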
        if workQueue is None:
            errStr = 'workqueue is undefined for vo={0} label={1} '.format(taskSpec.vo, taskSpec.prodSourceLabel)
            errStr += 'processingType={0} workingGroup={1} coreCount={2} eventService={3} '.format(taskSpec.processingType,
                                                                                                   taskSpec.workingGroup,
                                                                                                   taskSpec.coreCount,
                                                                                                   taskSpec.eventService)
            errStr += 'splitRule={0} campaign={1}'.format(taskSpec.splitRule, taskSpec.campaign)
            raise RuntimeError(errStr)
        self.taskSpec.workQueue_ID = workQueue.queue_id
        # Initialize the global share
        gshare = None
        if 'gshare' in taskParamMap and self.taskBufferIF.is_valid_share(taskParamMap['gshare']):
            # global share is specified explicitly
            gshare = taskParamMap['gshare']
        else:
            # get share based on definition
            gshare = self.taskBufferIF.get_share_for_task(self.taskSpec)
            if gshare is None:
                # should not happen; Undefined is set when no share is found
                gshare = 'Undefined'
                # errStr = 'share is undefined for vo={0} label={1} '.format(taskSpec.vo, taskSpec.prodSourceLabel)
                # errStr += 'workingGroup={0} campaign={1} '.format(taskSpec.workingGroup, taskSpec.campaign)
                # raise RuntimeError(errStr)
        self.taskSpec.gshare = gshare
        # Initialize the resource type
        try:
            self.taskSpec.resource_type = self.taskBufferIF.get_resource_type_task(self.taskSpec)
        except:
            self.taskSpec.resource_type = 'Undefined'
        # return
        return
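        # Worked example (hypothetical values, for illustration): a taskParamMap
        # containing
        #   {'cloud': 'WORLD', 'nFilesPerJob': 10, 'goal': '97.5',
        #    'gshare': 'User Analysis', 'campaign': 'MC23'}
        # would set taskSpec.cloud and taskSpec.campaign directly, add the
        # nFilesPerJob token to the split rule, store taskSpec.goal as 975
        # (97.5% in units of 0.1%), and keep the given share provided that
        # taskBufferIF.is_valid_share accepts it.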