def doPreProRefine(self, taskParamMap):
    # no preprocessing
    if not taskParamMap.has_key('preproSpec'):
        return None, taskParamMap
    # already preprocessed
    if self.taskSpec.checkPreProcessed():
        # get replaced task params
        tmpStat, tmpJsonStr = self.taskBufferIF.getPreprocessMetadata_JEDI(self.taskSpec.jediTaskID)
        try:
            # replace placeholders
            replaceParams = RefinerUtils.decodeJSON(tmpJsonStr)
            self.tmpLog.debug("replace placeholders with " + str(replaceParams))
            for tmpKey, tmpVal in replaceParams.iteritems():
                self.replacePlaceHolders(taskParamMap, tmpKey, tmpVal)
        except:
            errtype, errvalue = sys.exc_info()[:2]
            self.tmpLog.error('{0} failed to get additional task params with {1}:{2}'.format(self.__class__.__name__,
                                                                                              errtype.__name__, errvalue))
            return False, taskParamMap
        # succeeded
        self.updatedTaskParams = taskParamMap
        return None, taskParamMap
    # make dummy dataset to keep track of preprocessing
    datasetSpec = JediDatasetSpec()
    datasetSpec.datasetName = 'panda.pp.in.{0}.{1}'.format(uuid.uuid4(), self.taskSpec.jediTaskID)
    datasetSpec.jediTaskID = self.taskSpec.jediTaskID
    datasetSpec.type = 'pp_input'
    datasetSpec.vo = self.taskSpec.vo
    datasetSpec.nFiles = 1
    datasetSpec.nFilesUsed = 0
    datasetSpec.nFilesToBeUsed = 1
    datasetSpec.nFilesFinished = 0
    datasetSpec.nFilesFailed = 0
    datasetSpec.nFilesOnHold = 0
    datasetSpec.status = 'ready'
    self.inMasterDatasetSpec.append(datasetSpec)
    # make file
    fileSpec = JediFileSpec()
    fileSpec.jediTaskID = datasetSpec.jediTaskID
    fileSpec.type = datasetSpec.type
    fileSpec.status = 'ready'
    fileSpec.lfn = 'pseudo_lfn'
    fileSpec.attemptNr = 0
    fileSpec.maxAttempt = 3
    fileSpec.keepTrack = 1
    datasetSpec.addFile(fileSpec)
    # make log dataset
    logDatasetSpec = JediDatasetSpec()
    logDatasetSpec.datasetName = 'panda.pp.log.{0}.{1}'.format(uuid.uuid4(), self.taskSpec.jediTaskID)
    logDatasetSpec.jediTaskID = self.taskSpec.jediTaskID
    logDatasetSpec.type = 'tmpl_pp_log'
    logDatasetSpec.streamName = 'PP_LOG'
    logDatasetSpec.vo = self.taskSpec.vo
    logDatasetSpec.nFiles = 0
    logDatasetSpec.nFilesUsed = 0
    logDatasetSpec.nFilesToBeUsed = 0
    logDatasetSpec.nFilesFinished = 0
    logDatasetSpec.nFilesFailed = 0
    logDatasetSpec.nFilesOnHold = 0
    logDatasetSpec.status = 'defined'
    self.outDatasetSpecList.append(logDatasetSpec)
    # make output template for log
    outTemplateMap = {'jediTaskID': self.taskSpec.jediTaskID,
                      'serialNr': 1,
                      'streamName': logDatasetSpec.streamName,
                      'filenameTemplate': "{0}._${{SN}}.log.tgz".format(logDatasetSpec.datasetName),
                      'outtype': re.sub('^tmpl_', '', logDatasetSpec.type),
                      }
    self.outputTemplateMap[logDatasetSpec.outputMapKey()] = [outTemplateMap]
    # set split rule to use preprocessing
    self.taskSpec.setPrePro()
    # set task status
    self.taskSpec.status = 'topreprocess'
    # return
    return True, taskParamMap
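# NOTE (editor's addition, hedged): the lines below are a minimal, hypothetical caller
# sketch, not part of the original class, illustrating the return contract that can be
# read off doPreProRefine above: (None, params) when there is nothing to do (no
# 'preproSpec', or the placeholders were already replaced), (True, params) when the
# dummy pp_input/log datasets were prepared and the task status was set to
# 'topreprocess', and (False, params) when restoring the preprocessed parameters
# failed. 'refiner' is an assumed instance name.
#
#     retVal, taskParamMap = refiner.doPreProRefine(taskParamMap)
#     if retVal is True:
#         pass    # preprocessing prepared; task goes to 'topreprocess'
#     elif retVal is False:
#         pass    # failed to restore preprocessed parameters; abort refinement
#     else:
#         pass    # nothing to preprocess; continue with doBasicRefine()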
def doBasicRefine(self, taskParamMap):
    # get input/output/log dataset specs
    nIn = 0
    nOutMap = {}
    if isinstance(taskParamMap['log'], dict):
        itemList = taskParamMap['jobParameters'] + [taskParamMap['log']]
    else:
        itemList = taskParamMap['jobParameters'] + taskParamMap['log']
    # pseudo input
    if taskParamMap.has_key('noInput') and taskParamMap['noInput'] == True:
        tmpItem = {}
        tmpItem['type'] = 'template'
        tmpItem['value'] = ''
        tmpItem['dataset'] = 'pseudo_dataset'
        tmpItem['param_type'] = 'pseudo_input'
        itemList = [tmpItem] + itemList
    # random seed
    if RefinerUtils.useRandomSeed(taskParamMap):
        tmpItem = {}
        tmpItem['type'] = 'template'
        tmpItem['value'] = ''
        tmpItem['dataset'] = 'RNDMSEED'
        tmpItem['param_type'] = 'random_seed'
        itemList.append(tmpItem)
    # loop over all items
    allDsList = []
    for tmpItem in itemList:
        # look for datasets
        if tmpItem['type'] == 'template' and tmpItem.has_key('dataset'):
            # avoid duplication
            if not tmpItem['dataset'] in allDsList:
                allDsList.append(tmpItem['dataset'])
            else:
                continue
            datasetSpec = JediDatasetSpec()
            datasetSpec.datasetName = tmpItem['dataset']
            datasetSpec.jediTaskID = self.taskSpec.jediTaskID
            datasetSpec.type = tmpItem['param_type']
            if tmpItem.has_key('container'):
                datasetSpec.containerName = tmpItem['container']
            if tmpItem.has_key('token'):
                datasetSpec.storageToken = tmpItem['token']
            if tmpItem.has_key('destination'):
                datasetSpec.destination = tmpItem['destination']
            if tmpItem.has_key('attributes'):
                datasetSpec.setDatasetAttribute(tmpItem['attributes'])
            if tmpItem.has_key('ratio'):
                datasetSpec.setDatasetAttribute('ratio={0}'.format(tmpItem['ratio']))
            if tmpItem.has_key('eventRatio'):
                datasetSpec.setEventRatio(tmpItem['eventRatio'])
            if tmpItem.has_key('check'):
                datasetSpec.setDatasetAttribute('cc')
            if tmpItem.has_key('usedup'):
                datasetSpec.setDatasetAttribute('ud')
            if tmpItem.has_key('random'):
                datasetSpec.setDatasetAttribute('rd')
            if tmpItem.has_key('reusable'):
                datasetSpec.setDatasetAttribute('ru')
            if tmpItem.has_key('offset'):
                datasetSpec.setOffset(tmpItem['offset'])
            if tmpItem.has_key('allowNoOutput'):
                datasetSpec.allowNoOutput()
            if tmpItem.has_key('nFilesPerJob'):
                datasetSpec.setNumFilesPerJob(tmpItem['nFilesPerJob'])
            if tmpItem.has_key('num_records'):
                datasetSpec.setNumRecords(tmpItem['num_records'])
            if 'transient' in tmpItem:
                datasetSpec.setTransient(tmpItem['transient'])
            datasetSpec.vo = self.taskSpec.vo
            datasetSpec.nFiles = 0
            datasetSpec.nFilesUsed = 0
            datasetSpec.nFilesFinished = 0
            datasetSpec.nFilesFailed = 0
            datasetSpec.nFilesOnHold = 0
            datasetSpec.nEvents = 0
            datasetSpec.nEventsUsed = 0
            datasetSpec.nEventsToBeUsed = 0
            datasetSpec.status = 'defined'
            if datasetSpec.type in JediDatasetSpec.getInputTypes() + ['random_seed']:
                datasetSpec.streamName = RefinerUtils.extractStreamName(tmpItem['value'])
                if not tmpItem.has_key('expandedList'):
                    tmpItem['expandedList'] = []
                # dataset names could be comma-concatenated
                datasetNameList = datasetSpec.datasetName.split(',')
                # datasets could be added by incexec
                incexecDS = 'dsFor{0}'.format(datasetSpec.streamName)
                # remove /XYZ
                incexecDS = incexecDS.split('/')[0]
                if taskParamMap.has_key(incexecDS):
                    for tmpDatasetName in taskParamMap[incexecDS].split(','):
                        if not tmpDatasetName in datasetNameList:
                            datasetNameList.append(tmpDatasetName)
                # loop over all dataset names
                inDatasetSpecList = []
                for datasetName in datasetNameList:
                    # skip empty
                    if datasetName == '':
                        continue
                    # expand
                    if datasetSpec.isPseudo() or datasetSpec.type in ['random_seed'] or datasetName == 'DBR_LATEST':
                        # pseudo input
                        tmpDatasetNameList = [datasetName]
                    elif tmpItem.has_key('expand') and tmpItem['expand'] == True:
                        # expand dataset container
                        tmpDatasetNameList = self.ddmIF.getInterface(self.taskSpec.vo).expandContainer(datasetName)
                    else:
                        # normal dataset name
                        tmpDatasetNameList = self.ddmIF.getInterface(self.taskSpec.vo).listDatasets(datasetName)
                    for elementDatasetName in tmpDatasetNameList:
                        if nIn > 0 or not elementDatasetName in tmpItem['expandedList']:
                            tmpItem['expandedList'].append(elementDatasetName)
                            inDatasetSpec = copy.copy(datasetSpec)
                            inDatasetSpec.datasetName = elementDatasetName
                            inDatasetSpec.containerName = datasetName
                            inDatasetSpecList.append(inDatasetSpec)
                # empty input
                if inDatasetSpecList == [] and self.oldTaskStatus != 'rerefine':
                    errStr = 'doBasicRefine : unknown input dataset "{0}"'.format(datasetSpec.datasetName)
                    self.taskSpec.setErrDiag(errStr)
                    if not datasetSpec.datasetName in self.unknownDatasetList:
                        self.unknownDatasetList.append(datasetSpec.datasetName)
                    raise JediException.UnknownDatasetError, errStr
                # set master flag
                for inDatasetSpec in inDatasetSpecList:
                    if nIn == 0:
                        # master
                        self.inMasterDatasetSpec.append(inDatasetSpec)
                    else:
                        # secondary
                        self.inSecDatasetSpecList.append(inDatasetSpec)
                nIn += 1
                continue
            if datasetSpec.type in ['output', 'log']:
                if not nOutMap.has_key(datasetSpec.type):
                    nOutMap[datasetSpec.type] = 0
                # make stream name
                datasetSpec.streamName = "{0}{1}".format(datasetSpec.type.upper(), nOutMap[datasetSpec.type])
                nOutMap[datasetSpec.type] += 1
                # set attribute for event service
                if self.taskSpec.useEventService() and taskParamMap.has_key('objectStore') and datasetSpec.type in ['output']:
                    datasetSpec.setObjectStore(taskParamMap['objectStore'])
                # extract output filename template and change the value field
                outFileTemplate, tmpItem['value'] = RefinerUtils.extractReplaceOutFileTemplate(tmpItem['value'],
                                                                                               datasetSpec.streamName)
                # make output template
                if outFileTemplate != None:
                    if tmpItem.has_key('offset'):
                        offsetVal = 1 + tmpItem['offset']
                    else:
                        offsetVal = 1
                    outTemplateMap = {'jediTaskID': self.taskSpec.jediTaskID,
                                      'serialNr': offsetVal,
                                      'streamName': datasetSpec.streamName,
                                      'filenameTemplate': outFileTemplate,
                                      'outtype': datasetSpec.type,
                                      }
                    if self.outputTemplateMap.has_key(datasetSpec.outputMapKey()):
                        # multiple files are associated to the same output datasets
                        self.outputTemplateMap[datasetSpec.outputMapKey()].append(outTemplateMap)
                        # don't insert the same output dataset
                        continue
                    self.outputTemplateMap[datasetSpec.outputMapKey()] = [outTemplateMap]
                # append
                self.outDatasetSpecList.append(datasetSpec)
                # make unmerged dataset
                if taskParamMap.has_key('mergeOutput') and taskParamMap['mergeOutput'] == True:
                    umDatasetSpec = JediDatasetSpec()
                    umDatasetSpec.datasetName = 'panda.um.' + datasetSpec.datasetName
                    umDatasetSpec.jediTaskID = self.taskSpec.jediTaskID
                    umDatasetSpec.storageToken = 'TOMERGE'
                    umDatasetSpec.vo = datasetSpec.vo
                    umDatasetSpec.type = "tmpl_trn_" + datasetSpec.type
                    umDatasetSpec.nFiles = 0
                    umDatasetSpec.nFilesUsed = 0
                    umDatasetSpec.nFilesToBeUsed = 0
                    umDatasetSpec.nFilesFinished = 0
                    umDatasetSpec.nFilesFailed = 0
                    umDatasetSpec.nFilesOnHold = 0
                    umDatasetSpec.status = 'defined'
                    umDatasetSpec.streamName = datasetSpec.streamName
                    if datasetSpec.isAllowedNoOutput():
                        umDatasetSpec.allowNoOutput()
                    # ratio
                    if datasetSpec.getRatioToMaster() > 1:
                        umDatasetSpec.setDatasetAttribute('ratio={0}'.format(datasetSpec.getRatioToMaster()))
                    # make unmerged output template
                    if outFileTemplate != None:
                        umOutTemplateMap = {'jediTaskID': self.taskSpec.jediTaskID,
                                            'serialNr': 1,
                                            'streamName': umDatasetSpec.streamName,
                                            'outtype': datasetSpec.type,
                                            }
                        # append temporary name
                        if taskParamMap.has_key('umNameAtEnd') and taskParamMap['umNameAtEnd'] == True:
                            # append temporary name at the end
                            umOutTemplateMap['filenameTemplate'] = outFileTemplate + '.panda.um'
                        else:
                            umOutTemplateMap['filenameTemplate'] = 'panda.um.' + outFileTemplate
                        if self.outputTemplateMap.has_key(umDatasetSpec.outputMapKey()):
                            # multiple files are associated to the same output datasets
                            self.outputTemplateMap[umDatasetSpec.outputMapKey()].append(umOutTemplateMap)
                            # don't insert the same output dataset
                            continue
                        self.outputTemplateMap[umDatasetSpec.outputMapKey()] = [umOutTemplateMap]
                    # use log as master for merging
                    if datasetSpec.type == 'log':
                        self.unmergeMasterDatasetSpec[datasetSpec.outputMapKey()] = umDatasetSpec
                    else:
                        # append
                        self.unmergeDatasetSpecMap[datasetSpec.outputMapKey()] = umDatasetSpec
    # set attributes for merging
    if taskParamMap.has_key('mergeOutput') and taskParamMap['mergeOutput'] == True:
        self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['mergeOutput'])
    # make job parameters
    rndmSeedOffset = None
    firstEventOffset = None
    jobParameters = ''
    for tmpItem in taskParamMap['jobParameters']:
        if tmpItem.has_key('value'):
            # hidden parameter
            if tmpItem.has_key('hidden') and tmpItem['hidden'] == True:
                continue
            # add tags for ES-only parameters
            esOnly = False
            if 'es_only' in tmpItem and tmpItem['es_only'] == True:
                esOnly = True
            if esOnly:
                jobParameters += '<PANDA_ES_ONLY>'
            jobParameters += '{0}'.format(tmpItem['value'])
            if esOnly:
                jobParameters += '</PANDA_ES_ONLY>'
            # padding
            if tmpItem.has_key('padding') and tmpItem['padding'] == False:
                pass
            else:
                jobParameters += ' '
            # get offset for random seed and first event
            if tmpItem['type'] == 'template' and tmpItem['param_type'] == 'number':
                if '${RNDMSEED}' in tmpItem['value']:
                    if tmpItem.has_key('offset'):
                        rndmSeedOffset = tmpItem['offset']
                    else:
                        rndmSeedOffset = 0
                elif '${FIRSTEVENT}' in tmpItem['value']:
                    if tmpItem.has_key('offset'):
                        firstEventOffset = tmpItem['offset']
    jobParameters = jobParameters[:-1]
    # append parameters for event service merging if necessary
    esmergeParams = self.getParamsForEventServiceMerging(taskParamMap)
    if esmergeParams != None:
        jobParameters += esmergeParams
    self.setJobParamsTemplate(jobParameters)
    # set random seed offset
    if rndmSeedOffset != None:
        self.setSplitRule(None, rndmSeedOffset, JediTaskSpec.splitRuleToken['randomSeed'])
    if firstEventOffset != None:
        self.setSplitRule(None, firstEventOffset, JediTaskSpec.splitRuleToken['firstEvent'])
    # return
    return
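# NOTE (editor's addition, hedged): illustrative sketch of the minimal taskParamMap
# shape consumed by doBasicRefine above. All dataset names and values are made up;
# only keys the method actually reads are shown ('jobParameters' and 'log' are
# required, while flags such as 'noInput', 'mergeOutput', 'umNameAtEnd' and per-item
# keys like 'container', 'token', 'offset', 'nFilesPerJob' are optional).
#
#     taskParamMap = {
#         'jobParameters': [
#             {'type': 'constant', 'value': 'run_something.py'},
#             {'type': 'template', 'param_type': 'input', 'value': '-i ${IN}',
#              'dataset': 'mc.hypothetical.input.dataset'},
#             {'type': 'template', 'param_type': 'output', 'value': '-o ${OUTPUT0}',
#              'dataset': 'user.someone.hypothetical.output.dataset'},
#         ],
#         'log': {'type': 'template', 'param_type': 'log',
#                 'value': 'user.someone.hypothetical.log.tgz',
#                 'dataset': 'user.someone.hypothetical.log.dataset'},
#         'mergeOutput': True,   # optional
#     }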
def doRefine(self, jediTaskID, taskParamMap):
    try:
        # make logger
        tmpLog = self.tmpLog
        tmpLog.debug('start jediTaskID={0}'.format(jediTaskID))
        # old dataset name
        oldDatasetName = taskParamMap['oldDatasetName']
        # accompany datasets
        if taskParamMap.has_key('oldAccompanyDatasetNames'):
            oldAccDatasetNames = taskParamMap['oldAccompanyDatasetNames']
        else:
            oldAccDatasetNames = None
        # use first file to get task and dataset info
        lostFileName = taskParamMap['lostFiles'][0]
        # get old jediTaskID and datasetIDs
        tmpStat, oldIDs = self.taskBufferIF.getIDsWithFileDataset_JEDI(oldDatasetName, lostFileName, 'output')
        if tmpStat != True or oldIDs == None:
            tmpLog.error('failed to get jediTaskID and DatasetID for {0}:{1}'.format(oldDatasetName, lostFileName))
            return self.SC_FAILED
        # get task
        oldJediTaskID = oldIDs['jediTaskID']
        oldDatasetID = oldIDs['datasetID']
        tmpStat, oldTaskSpec = self.taskBufferIF.getTaskWithID_JEDI(oldJediTaskID, True)
        if tmpStat != True:
            tmpLog.error('failed to get TaskSpec for old jediTaskID={0}'.format(oldJediTaskID))
            return self.SC_FAILED
        # make task spec
        taskSpec = JediTaskSpec()
        taskSpec.copyAttributes(oldTaskSpec)
        # reset attributes
        taskSpec.jediTaskID = jediTaskID
        taskSpec.taskType = taskParamMap['taskType']
        taskSpec.taskPriority = taskParamMap['taskPriority']
        self.taskSpec = taskSpec
        # get datasets
        tmpStat, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(oldJediTaskID)
        if tmpStat != True:
            tmpLog.error('failed to get datasetSpecs')
            return self.SC_FAILED
        # loop over all datasets
        provenanceID = None
        dummyStreams = []
        outDatasetSpec = None
        datasetNameSpecMap = {}
        for datasetSpec in datasetSpecList:
            # for output datasets
            if not datasetSpec.type in JediDatasetSpec.getInputTypes():
                # collect output with the same provenanceID
                if provenanceID != None and datasetSpec.provenanceID != provenanceID:
                    continue
                # set provenanceID if undefined
                if provenanceID == None and datasetSpec.provenanceID != None:
                    provenanceID = datasetSpec.provenanceID
                # collect dummy streams
                if datasetSpec.type != 'log' and (datasetSpec.datasetID != oldDatasetID and \
                        not self.checkDatasetNameMatching(datasetSpec.datasetName, oldAccDatasetNames)):
                    if not datasetSpec.streamName in dummyStreams:
                        dummyStreams.append(datasetSpec.streamName)
                    continue
            # reset attributes
            datasetSpec.status = 'defined'
            datasetSpec.datasetID = None
            datasetSpec.jediTaskID = jediTaskID
            datasetSpec.nFiles = 0
            datasetSpec.nFilesUsed = 0
            datasetSpec.nFilesToBeUsed = 0
            datasetSpec.nFilesFinished = 0
            datasetSpec.nFilesFailed = 0
            datasetSpec.nFilesOnHold = 0
            # remove nosplit and repeat since the same file is made for each boundaryID
            datasetSpec.remNoSplit()
            datasetSpec.remRepeat()
            # append to map
            datasetNameSpecMap[datasetSpec.datasetName] = datasetSpec
            # set master and secondary for input
            if datasetSpec.type in JediDatasetSpec.getInputTypes():
                if datasetSpec.isMaster():
                    # master
                    self.inMasterDatasetSpec = datasetSpec
                else:
                    # secondary
                    self.inSecDatasetSpecList.append(datasetSpec)
            elif datasetSpec.type == 'log':
                # set new attributes
                tmpItem = taskParamMap['log']
                datasetSpec.datasetName = tmpItem['dataset']
                if tmpItem.has_key('container'):
                    datasetSpec.containerName = tmpItem['container']
                if tmpItem.has_key('token'):
                    datasetSpec.storageToken = tmpItem['token']
                if tmpItem.has_key('destination'):
                    datasetSpec.destination = tmpItem['destination']
                # extract output filename template and change the value field
                outFileTemplate, tmpItem['value'] = RefinerUtils.extractReplaceOutFileTemplate(tmpItem['value'],
                                                                                               datasetSpec.streamName)
                # make output template
                if outFileTemplate != None:
                    if tmpItem.has_key('offset'):
                        offsetVal = 1 + tmpItem['offset']
                    else:
                        offsetVal = 1
                    outTemplateMap = {'jediTaskID': self.taskSpec.jediTaskID,
                                      'serialNr': offsetVal,
                                      'streamName': datasetSpec.streamName,
                                      'filenameTemplate': outFileTemplate,
                                      'outtype': datasetSpec.type,
                                      }
                    self.outputTemplateMap[datasetSpec.outputMapKey()] = [outTemplateMap]
                # append
                self.outDatasetSpecList.append(datasetSpec)
            else:
                # output dataset to make copies later
                outDatasetSpec = datasetSpec
        # replace redundant output streams with dummy files
        for dummyStream in dummyStreams:
            self.taskSpec.jobParamsTemplate = self.taskSpec.jobParamsTemplate.replace('${' + dummyStream + '}',
                                                                                      dummyStream.lower() + '.tmp')
        self.setJobParamsTemplate(self.taskSpec.jobParamsTemplate)
        # loop over all lost files
        datasetIDSpecMap = {}
        for lostFileName in taskParamMap['lostFiles']:
            # get FileID
            tmpStat, tmpIDs = self.taskBufferIF.getIDsWithFileDataset_JEDI(oldDatasetName, lostFileName, 'output')
            if tmpStat != True or tmpIDs == None:
                tmpLog.error('failed to get FileID for {0}:{1}'.format(oldDatasetName, lostFileName))
                return self.SC_FAILED
            # get PandaID
            tmpStat, pandaID = self.taskBufferIF.getPandaIDWithFileID_JEDI(tmpIDs['jediTaskID'],
                                                                           tmpIDs['datasetID'],
                                                                           tmpIDs['fileID'])
            if tmpStat != True or pandaID == None:
                tmpLog.error('failed to get PandaID for {0}'.format(str(tmpIDs)))
                return self.SC_FAILED
            # get files
            tmpStat, fileSpecList = self.taskBufferIF.getFilesWithPandaID_JEDI(pandaID)
            if tmpStat != True or fileSpecList == []:
                tmpLog.error('failed to get files for PandaID={0}'.format(pandaID))
                return self.SC_FAILED
            # append
            for fileSpec in fileSpecList:
                # only input types
                if not fileSpec.type in JediDatasetSpec.getInputTypes():
                    continue
                # get original datasetSpec
                if not datasetIDSpecMap.has_key(fileSpec.datasetID):
                    tmpStat, tmpDatasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(fileSpec.jediTaskID, fileSpec.datasetID)
                    if tmpStat != True or tmpDatasetSpec == None:
                        tmpLog.error('failed to get dataset for jediTaskID={0} datasetID={1}'.format(fileSpec.jediTaskID,
                                                                                                     fileSpec.datasetID))
                        return self.SC_FAILED
                    datasetIDSpecMap[fileSpec.datasetID] = tmpDatasetSpec
                origDatasetSpec = datasetIDSpecMap[fileSpec.datasetID]
                if not datasetNameSpecMap.has_key(origDatasetSpec.datasetName):
                    tmpLog.error('datasetName={0} is missing in new datasets'.format(origDatasetSpec.datasetName))
                    return self.SC_FAILED
                # not target or accompany datasets
                if origDatasetSpec.datasetID != oldDatasetID and \
                        not self.checkDatasetNameMatching(origDatasetSpec.datasetName, oldAccDatasetNames):
                    continue
                newDatasetSpec = datasetNameSpecMap[origDatasetSpec.datasetName]
                # set new attributes
                fileSpec.fileID = None
                fileSpec.datasetID = None
                fileSpec.jediTaskID = None
                fileSpec.boundaryID = pandaID
                fileSpec.keepTrack = 1
                fileSpec.attemptNr = 1
                fileSpec.status = 'ready'
                # append
                newDatasetSpec.addFile(fileSpec)
            # make one output dataset per file
            datasetSpec = copy.copy(outDatasetSpec)
            # set new attributes
            tmpItem = taskParamMap['output']
            datasetSpec.datasetName = tmpItem['dataset']
            if tmpItem.has_key('container'):
                datasetSpec.containerName = tmpItem['container']
            if tmpItem.has_key('token'):
                datasetSpec.storageToken = tmpItem['token']
            if tmpItem.has_key('destination'):
                datasetSpec.destination = tmpItem['destination']
            # use PandaID of original job as provenanceID
            datasetSpec.provenanceID = pandaID
            # append
            self.outDatasetSpecList.append(datasetSpec)
            # extract attempt number from original filename
            tmpMatch = re.search('\.(\d+)$', lostFileName)
            if tmpMatch == None:
                offsetVal = 1
            else:
                offsetVal = 1 + int(tmpMatch.group(1))
            # filename without attempt number
            baseFileName = re.sub('\.(\d+)$', '', lostFileName)
            # make output template
            outTemplateMap = {'jediTaskID': self.taskSpec.jediTaskID,
                              'serialNr': offsetVal,
                              'streamName': datasetSpec.streamName,
                              'filenameTemplate': baseFileName + '.${SN:d}',
                              'outtype': datasetSpec.type,
                              }
            self.outputTemplateMap[datasetSpec.outputMapKey()] = [outTemplateMap]
        # append datasets to task parameters
        for datasetSpec in datasetNameSpecMap.values():
            if datasetSpec.Files == []:
                continue
            fileList = []
            for fileSpec in datasetSpec.Files:
                fileList.append({'lfn': fileSpec.lfn,
                                 'firstEvent': fileSpec.firstEvent,
                                 'startEvent': fileSpec.startEvent,
                                 'endEvent': fileSpec.endEvent,
                                 'keepTrack': fileSpec.keepTrack,
                                 'boundaryID': fileSpec.boundaryID,
                                 })
            taskParamMap = RefinerUtils.appendDataset(taskParamMap, datasetSpec, fileList)
            self.updatedTaskParams = taskParamMap
        # grouping with boundaryID
        self.setSplitRule(None, 4, JediTaskSpec.splitRuleToken['groupBoundaryID'])
    except:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('doRefine failed with {0}:{1}'.format(errtype.__name__, errvalue))
        return self.SC_FAILED
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
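# NOTE (editor's addition, hedged): illustrative sketch of the task parameters expected
# by the lost-file-recovery doRefine above. Every name and value here is hypothetical;
# only keys the method actually reads are listed ('oldAccompanyDatasetNames' and the
# per-dataset 'container', 'token', 'destination' entries are optional).
#
#     taskParamMap = {
#         'oldDatasetName': 'some.original.output.dataset',
#         'lostFiles': ['some.original.output.dataset._000123.pool.root.2'],
#         'oldAccompanyDatasetNames': ['some.other.affected.dataset'],   # optional
#         'taskType': 'recov',        # hypothetical value
#         'taskPriority': 1000,
#         'log': {'dataset': 'some.recovery.log.dataset',
#                 'value': 'some.recovery.log.dataset.${SN}.log.tgz'},
#         'output': {'dataset': 'some.recovery.output.dataset'},
#     }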