def __init__(self, taskBuffer, jobs, logger, params, defaultMap):
    self.jobs = []
    self.jumboJobs = []
    # separate normal and jumbo jobs
    for tmpJob in jobs:
        if EventServiceUtils.isJumboJob(tmpJob):
            self.jumboJobs.append(tmpJob)
        else:
            self.jobs.append(tmpJob)
    self.taskBuffer = taskBuffer
    self.logger = logger
    # set named parameters
    for tmpKey in params:
        tmpVal = params[tmpKey]
        setattr(self, tmpKey, tmpVal)
    # set defaults
    for tmpKey in defaultMap:
        tmpVal = defaultMap[tmpKey]
        if not hasattr(self, tmpKey):
            setattr(self, tmpKey, tmpVal)
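# --- illustrative sketch, not part of the original module ---
# The constructor above copies named parameters onto the instance and then
# back-fills any attribute still missing from defaultMap. A minimal,
# self-contained reproduction of that pattern with hypothetical keys:
class _ParamHolder(object):
    def __init__(self, params, defaultMap):
        for tmpKey, tmpVal in params.items():
            setattr(self, tmpKey, tmpVal)
        for tmpKey, tmpVal in defaultMap.items():
            if not hasattr(self, tmpKey):
                setattr(self, tmpKey, tmpVal)

# usage: an explicitly passed value wins over the default
_holder = _ParamHolder({'nRetries': 3}, {'nRetries': 1, 'timeout': 600})
assert _holder.nRetries == 3 and _holder.timeout == 600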
def doPostProcess(self, taskSpec, tmpLog):
    # pre-check
    try:
        tmpStat = self.doPreCheck(taskSpec, tmpLog)
        if tmpStat:
            return self.SC_SUCCEEDED
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('doPreCheck failed with {0}:{1}'.format(errtype.__name__, errvalue))
        return self.SC_FATAL
    # get DDM I/F
    ddmIF = self.ddmIF.getInterface(taskSpec.vo)
    # loop over all datasets
    for datasetSpec in taskSpec.datasetSpecList:
        # skip pseudo output datasets
        if datasetSpec.type in ['output'] and datasetSpec.isPseudo():
            continue
        try:
            # remove wrong files
            if datasetSpec.type in ['output']:
                # get successful files
                okFiles = self.taskBufferIF.getSuccessfulFiles_JEDI(datasetSpec.jediTaskID, datasetSpec.datasetID)
                if okFiles is None:
                    tmpLog.warning('failed to get successful files for {0}'.format(datasetSpec.datasetName))
                    return self.SC_FAILED
                # get files in dataset
                ddmFiles = ddmIF.getFilesInDataset(datasetSpec.datasetName, skipDuplicate=False, ignoreUnknown=True)
                tmpLog.debug('datasetID={0}:Name={1} has {2} files in DB, {3} files in DDM'.format(datasetSpec.datasetID,
                                                                                                   datasetSpec.datasetName,
                                                                                                   len(okFiles), len(ddmFiles)))
                # check all files
                toDelete = []
                for tmpGUID, attMap in iteritems(ddmFiles):
                    if attMap['lfn'] not in okFiles:
                        did = {'scope': attMap['scope'], 'name': attMap['lfn']}
                        toDelete.append(did)
                        tmpLog.debug('delete {0} from {1}'.format(attMap['lfn'], datasetSpec.datasetName))
                # delete
                if toDelete != []:
                    ddmIF.deleteFilesFromDataset(datasetSpec.datasetName, toDelete)
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.warning('failed to remove wrong files with {0}:{1}'.format(errtype.__name__, errvalue))
            return self.SC_FAILED
        try:
            # freeze output and log datasets
            if datasetSpec.type in ['output', 'log', 'trn_log']:
                tmpLog.info('freezing datasetID={0}:Name={1}'.format(datasetSpec.datasetID, datasetSpec.datasetName))
                ddmIF.freezeDataset(datasetSpec.datasetName, ignoreUnknown=True)
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.warning('failed to freeze datasets with {0}:{1}'.format(errtype.__name__, errvalue))
            return self.SC_FAILED
        try:
            # delete transient datasets
            if datasetSpec.type in ['trn_output']:
                tmpLog.debug('deleting datasetID={0}:Name={1}'.format(datasetSpec.datasetID, datasetSpec.datasetName))
                retStr = ddmIF.deleteDataset(datasetSpec.datasetName, False, ignoreUnknown=True)
                tmpLog.info(retStr)
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.warning('failed to delete datasets with {0}:{1}'.format(errtype.__name__, errvalue))
    # check duplication
    if self.getFinalTaskStatus(taskSpec) in ['finished', 'done'] and taskSpec.gshare != 'Test':
        nDup = self.taskBufferIF.checkDuplication_JEDI(taskSpec.jediTaskID)
        tmpLog.debug('checked duplication with {0}'.format(nDup))
        if nDup > 0:
            errStr = 'paused since {0} duplication found'.format(nDup)
            taskSpec.oldStatus = self.getFinalTaskStatus(taskSpec)
            taskSpec.status = 'paused'
            taskSpec.setErrDiag(errStr)
            tmpLog.debug(errStr)
    # delete ES datasets
    if taskSpec.registerEsFiles():
        try:
            targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID)
            tmpLog.debug('deleting ES dataset name={0}'.format(targetName))
            retStr = ddmIF.deleteDataset(targetName, False, ignoreUnknown=True)
            tmpLog.debug(retStr)
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.warning('failed to delete ES dataset with {0}:{1}'.format(errtype.__name__, errvalue))
    try:
        AtlasPostProcessorUtils.send_notification(self.taskBufferIF, ddmIF, taskSpec, tmpLog)
    except Exception as e:
        tmpLog.error('failed to talk to external system with {0}'.format(str(e)))
        return self.SC_FAILED
    try:
        self.doBasicPostProcess(taskSpec, tmpLog)
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('doBasicPostProcess failed with {0}:{1}'.format(errtype.__name__, errvalue))
        return self.SC_FATAL
    return self.SC_SUCCEEDED
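# --- illustrative sketch, not part of the original post-processor plugin ---
# doPostProcess above compares the successful LFNs known to JEDI with the
# files actually attached to the output dataset in DDM and deletes the
# leftovers. The set arithmetic behind that "remove wrong files" step, shown
# with plain dictionaries instead of the DDM interface:
def _files_to_delete(ok_lfns, ddm_files):
    """ddm_files: {guid: {'scope': ..., 'lfn': ...}} as returned per dataset."""
    to_delete = []
    for att_map in ddm_files.values():
        if att_map['lfn'] not in ok_lfns:
            to_delete.append({'scope': att_map['scope'], 'name': att_map['lfn']})
    return to_delete

# usage with made-up LFNs
_dids = _files_to_delete({'good.root'},
                         {'guid-1': {'scope': 'mc16', 'lfn': 'good.root'},
                          'guid-2': {'scope': 'mc16', 'lfn': 'stray.root'}})
assert _dids == [{'scope': 'mc16', 'name': 'stray.root'}]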
def extractCommon(self,jediTaskID,taskParamMap,workQueueMapper,splitRule): # make task spec taskSpec = JediTaskSpec() taskSpec.jediTaskID = jediTaskID taskSpec.taskName = taskParamMap['taskName'] taskSpec.userName = taskParamMap['userName'] taskSpec.vo = taskParamMap['vo'] taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel'] taskSpec.taskPriority = taskParamMap['taskPriority'] taskSpec.currentPriority = taskSpec.taskPriority taskSpec.architecture = taskParamMap['architecture'] taskSpec.transUses = taskParamMap['transUses'] taskSpec.transHome = taskParamMap['transHome'] taskSpec.transPath = taskParamMap['transPath'] taskSpec.processingType = taskParamMap['processingType'] taskSpec.taskType = taskParamMap['taskType'] taskSpec.splitRule = splitRule taskSpec.startTime = datetime.datetime.utcnow() if taskParamMap.has_key('workingGroup'): taskSpec.workingGroup = taskParamMap['workingGroup'] if taskParamMap.has_key('countryGroup'): taskSpec.countryGroup = taskParamMap['countryGroup'] if taskParamMap.has_key('ticketID'): taskSpec.ticketID = taskParamMap['ticketID'] if taskParamMap.has_key('ticketSystemType'): taskSpec.ticketSystemType = taskParamMap['ticketSystemType'] if taskParamMap.has_key('reqID'): taskSpec.reqID = taskParamMap['reqID'] else: taskSpec.reqID = jediTaskID if taskParamMap.has_key('coreCount'): taskSpec.coreCount = taskParamMap['coreCount'] else: taskSpec.coreCount = 1 if taskParamMap.has_key('walltime'): taskSpec.walltime = taskParamMap['walltime'] else: taskSpec.walltime = 0 if taskParamMap.has_key('walltimeUnit'): taskSpec.walltimeUnit = taskParamMap['walltimeUnit'] if taskParamMap.has_key('outDiskCount'): taskSpec.outDiskCount = taskParamMap['outDiskCount'] else: taskSpec.outDiskCount = 0 if 'outDiskUnit' in taskParamMap: taskSpec.outDiskUnit = taskParamMap['outDiskUnit'] if taskParamMap.has_key('workDiskCount'): taskSpec.workDiskCount = taskParamMap['workDiskCount'] else: taskSpec.workDiskCount = 0 if taskParamMap.has_key('workDiskUnit'): taskSpec.workDiskUnit = taskParamMap['workDiskUnit'] if taskParamMap.has_key('ramCount'): taskSpec.ramCount = taskParamMap['ramCount'] else: taskSpec.ramCount = 0 if taskParamMap.has_key('ramUnit'): taskSpec.ramUnit = taskParamMap['ramUnit'] if taskParamMap.has_key('baseRamCount'): taskSpec.baseRamCount = taskParamMap['baseRamCount'] else: taskSpec.baseRamCount = 0 # HS06 stuff if 'cpuTimeUnit' in taskParamMap: taskSpec.cpuTimeUnit = taskParamMap['cpuTimeUnit'] if 'cpuTime' in taskParamMap: taskSpec.cpuTime = taskParamMap['cpuTime'] if 'cpuEfficiency' in taskParamMap: taskSpec.cpuEfficiency = taskParamMap['cpuEfficiency'] else: # 90% of cpu efficiency by default taskSpec.cpuEfficiency = 90 if 'baseWalltime' in taskParamMap: taskSpec.baseWalltime = taskParamMap['baseWalltime'] else: # 10min of offset by default taskSpec.baseWalltime = 10*60 # for merge if 'mergeRamCount' in taskParamMap: taskSpec.mergeRamCount = taskParamMap['mergeRamCount'] if 'mergeCoreCount' in taskParamMap: taskSpec.mergeCoreCount = taskParamMap['mergeCoreCount'] # scout if not taskParamMap.has_key('skipScout') and not taskSpec.isPostScout(): taskSpec.setUseScout(True) # cloud if taskParamMap.has_key('cloud'): self.cloudName = taskParamMap['cloud'] taskSpec.cloud = self.cloudName else: # set dummy to force update taskSpec.cloud = 'dummy' taskSpec.cloud = None # site if taskParamMap.has_key('site'): self.siteName = taskParamMap['site'] taskSpec.site = self.siteName else: # set dummy to force update taskSpec.site = 'dummy' taskSpec.site = None # nucleus if 'nucleus' 
in taskParamMap: taskSpec.nucleus = taskParamMap['nucleus'] # preset some parameters for job cloning if 'useJobCloning' in taskParamMap: # set implicit parameters if not 'nEventsPerWorker' in taskParamMap: taskParamMap['nEventsPerWorker'] = 1 if not 'nSitesPerJob' in taskParamMap: taskParamMap['nSitesPerJob'] = 2 if not 'nEsConsumers' in taskParamMap: taskParamMap['nEsConsumers'] = taskParamMap['nSitesPerJob'] # event service if taskParamMap.has_key('nEventsPerWorker'): taskSpec.eventService = 1 else: taskSpec.eventService = 0 # ttcr: requested time to completion if taskParamMap.has_key('ttcrTimestamp'): try: # get rid of the +00:00 timezone string and parse the timestamp taskSpec.ttcRequested = datetime.datetime.strptime(taskParamMap['ttcrTimestamp'].split('+')[0], '%Y-%m-%d %H:%M:%S.%f') except (IndexError, ValueError): pass # goal if 'goal' in taskParamMap: try: taskSpec.goal = int(float(taskParamMap['goal'])*10) if taskSpec.goal >= 1000: taskSpec.goal = None except: pass # campaign if taskParamMap.has_key('campaign'): taskSpec.campaign = taskParamMap['campaign'] # work queue workQueue = None if 'workQueueName' in taskParamMap: # work queue is specified workQueue = workQueueMapper.getQueueWithName(taskSpec.vo,taskSpec.prodSourceLabel,taskParamMap['workQueueName']) if workQueue == None: # get work queue based on task attributes workQueue,tmpStr = workQueueMapper.getQueueWithSelParams(taskSpec.vo, taskSpec.prodSourceLabel, processingType=taskSpec.processingType, workingGroup=taskSpec.workingGroup, coreCount=taskSpec.coreCount, site=taskSpec.site) if workQueue == None: errStr = 'workqueue is undefined for vo={0} labal={1} '.format(taskSpec.vo,taskSpec.prodSourceLabel) errStr += 'processingType={0} workingGroup={1} coreCount={2} '.format(taskSpec.processingType, taskSpec.workingGroup, taskSpec.coreCount) raise RuntimeError,errStr taskSpec.workQueue_ID = workQueue.queue_id self.taskSpec = taskSpec # set split rule if 'tgtNumEventsPerJob' in taskParamMap: # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used if not 'nFilesPerJob' in taskParamMap: self.setSplitRule(None,taskParamMap['tgtNumEventsPerJob'],JediTaskSpec.splitRuleToken['nEventsPerJob']) self.setSplitRule(taskParamMap,'nFilesPerJob', JediTaskSpec.splitRuleToken['nFilesPerJob']) self.setSplitRule(taskParamMap,'nEventsPerJob', JediTaskSpec.splitRuleToken['nEventsPerJob']) self.setSplitRule(taskParamMap,'nGBPerJob', JediTaskSpec.splitRuleToken['nGBPerJob']) self.setSplitRule(taskParamMap,'nMaxFilesPerJob', JediTaskSpec.splitRuleToken['nMaxFilesPerJob']) self.setSplitRule(taskParamMap,'nEventsPerWorker', JediTaskSpec.splitRuleToken['nEventsPerWorker']) self.setSplitRule(taskParamMap,'useLocalIO', JediTaskSpec.splitRuleToken['useLocalIO']) self.setSplitRule(taskParamMap,'disableAutoRetry', JediTaskSpec.splitRuleToken['disableAutoRetry']) self.setSplitRule(taskParamMap,'nEsConsumers', JediTaskSpec.splitRuleToken['nEsConsumers']) self.setSplitRule(taskParamMap,'waitInput', JediTaskSpec.splitRuleToken['waitInput']) self.setSplitRule(taskParamMap,'addNthFieldToLFN', JediTaskSpec.splitRuleToken['addNthFieldToLFN']) self.setSplitRule(taskParamMap,'scoutSuccessRate', JediTaskSpec.splitRuleToken['scoutSuccessRate']) self.setSplitRule(taskParamMap,'t1Weight', JediTaskSpec.splitRuleToken['t1Weight']) self.setSplitRule(taskParamMap,'maxAttemptES', JediTaskSpec.splitRuleToken['maxAttemptES']) self.setSplitRule(taskParamMap,'nSitesPerJob', JediTaskSpec.splitRuleToken['nSitesPerJob']) 
self.setSplitRule(taskParamMap,'nEventsPerMergeJob', JediTaskSpec.splitRuleToken['nEventsPerMergeJob']) self.setSplitRule(taskParamMap,'nFilesPerMergeJob', JediTaskSpec.splitRuleToken['nFilesPerMergeJob']) self.setSplitRule(taskParamMap,'nGBPerMergeJob', JediTaskSpec.splitRuleToken['nGBPerMergeJob']) self.setSplitRule(taskParamMap,'nMaxFilesPerMergeJob', JediTaskSpec.splitRuleToken['nMaxFilesPerMergeJob']) if taskParamMap.has_key('loadXML'): self.setSplitRule(None,3,JediTaskSpec.splitRuleToken['loadXML']) self.setSplitRule(None,4,JediTaskSpec.splitRuleToken['groupBoundaryID']) if taskParamMap.has_key('pfnList'): self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['pfnList']) if taskParamMap.has_key('noWaitParent'): self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['noWaitParent']) if 'respectLB' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['respectLB']) if taskParamMap.has_key('reuseSecOnDemand'): self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['reuseSecOnDemand']) if 'ddmBackEnd' in taskParamMap: self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd']) if 'disableReassign' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['disableReassign']) if 'allowPartialFinish' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['allowPartialFinish']) if 'useExhausted' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['useExhausted']) if 'useRealNumEvents' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['useRealNumEvents']) if 'ipConnectivity' in taskParamMap: self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity']) if 'altStageOut' in taskParamMap: self.taskSpec.setAltStageOut(taskParamMap['altStageOut']) if 'allowInputLAN' in taskParamMap: self.taskSpec.setAllowInputLAN(taskParamMap['allowInputLAN']) if 'runUntilClosed' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['runUntilClosed']) if 'stayOutputOnSite' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['stayOutputOnSite']) if 'useJobCloning' in taskParamMap: scValue = EventServiceUtils.getJobCloningValue(taskParamMap['useJobCloning']) self.setSplitRule(None,scValue,JediTaskSpec.splitRuleToken['useJobCloning']) if 'failWhenGoalUnreached' in taskParamMap and taskParamMap['failWhenGoalUnreached'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['failGoalUnreached']) if 'switchEStoNormal' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['switchEStoNormal']) if 'nEventsPerRange' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['dynamicNumEvents']) if 'allowInputWAN' in taskParamMap and taskParamMap['allowInputWAN'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['allowInputWAN']) if 'putLogToOS' in taskParamMap and taskParamMap['putLogToOS'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['putLogToOS']) # return return
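# --- illustrative sketch, not part of extractCommon ---
# extractCommon maps task parameters onto split-rule tokens via setSplitRule.
# Assuming (purely for illustration) that the rule string is a comma-separated
# list of TOKEN=value pairs, a stand-alone setter could look like this; the
# real JediTaskSpec encoding may differ:
def _set_split_rule(rule_str, token, value):
    rules = dict(item.split('=', 1) for item in rule_str.split(',') if item)
    rules[token] = str(value)
    return ','.join('{0}={1}'.format(k, v) for k, v in sorted(rules.items()))

# usage with hypothetical tokens
_rule = _set_split_rule('', 'NF', 10)        # e.g. nFilesPerJob -> 10
_rule = _set_split_rule(_rule, 'NE', 5000)   # e.g. nEventsPerJob -> 5000
assert _rule == 'NE=5000,NF=10'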
def doSetup(self,taskSpec,datasetToRegister,pandaJobs): # make logger tmpLog = MsgWrapper(logger,"< jediTaskID={0} >".format(taskSpec.jediTaskID)) tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType)) # returns retFatal = self.SC_FATAL retTmpError = self.SC_FAILED retOK = self.SC_SUCCEEDED try: # get DDM I/F ddmIF = self.ddmIF.getInterface(taskSpec.vo) # register datasets if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']: # prod vs anal userSetup = False if taskSpec.prodSourceLabel in ['user']: userSetup = True # collect datasetID to register datasets/containers just in case for tmpPandaJob in pandaJobs: if not tmpPandaJob.produceUnMerge(): for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if tmpFileSpec.datasetID not in datasetToRegister: datasetToRegister.append(tmpFileSpec.datasetID) tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister))) # get site mapper siteMapper = self.taskBufferIF.getSiteMapper() # loop over all datasets avDatasetList = [] cnDatasetMap = {} for datasetID in datasetToRegister: # get output and log datasets tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID)) tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID, datasetID) if not tmpStat: tmpLog.error('failed to get output and log datasets') return retFatal if datasetSpec.isPseudo(): tmpLog.info('skip pseudo dataset') continue # DDM backend ddmBackEnd = taskSpec.getDdmBackEnd() tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) # check if dataset and container are available in DDM for targetName in [datasetSpec.datasetName,datasetSpec.containerName]: if targetName is None: continue if targetName not in avDatasetList: # set lifetime if targetName.startswith('panda'): if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed': lifetime = 365 else: lifetime = 14 else: lifetime = None # check dataset/container in DDM tmpList = ddmIF.listDatasets(targetName) if tmpList == []: # get location location = None locForRule = None if targetName == datasetSpec.datasetName: # dataset if datasetSpec.site in ['',None]: if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: locForRule = datasetSpec.destination elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None: location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken) elif taskSpec.cloud is not None: # use T1 SE tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source'] location = siteMapper.getDdmEndpoint(tmpT1Name, datasetSpec.storageToken, taskSpec.prodSourceLabel, JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType)) else: tmpLog.info('site={0} token={1}'.format(datasetSpec.site, datasetSpec.storageToken)) location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken, taskSpec.prodSourceLabel, JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType)) if locForRule is None: locForRule = location # set metadata if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName: metaData = {} metaData['task_id'] = taskSpec.jediTaskID if taskSpec.campaign not in [None,'']: metaData['campaign'] = taskSpec.campaign if datasetSpec.getTransient() is not None: metaData['transient'] = datasetSpec.getTransient() else: metaData = None # register dataset/container tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName, location, ddmBackEnd, lifetime, 
str(metaData))) tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location, lifetime=lifetime,metaData=metaData) if not tmpStat: tmpLog.error('failed to register {0}'.format(targetName)) return retFatal # procedures for user if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: # register location tmpToRegister = False if userSetup and targetName == datasetSpec.datasetName and datasetSpec.site not in ['',None]: if taskSpec.workingGroup: userName = taskSpec.workingGroup else: userName = taskSpec.userName grouping = None tmpToRegister = True elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: userName = None grouping = 'NONE' tmpToRegister = True if tmpToRegister: activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel) tmpLog.info('registering location={} lifetime={} days activity={} grouping={} ' 'owner={}'.format(locForRule, lifetime, activity, grouping, userName)) tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName, lifetime=lifetime,backEnd=ddmBackEnd, activity=activity,grouping=grouping) if not tmpStat: tmpLog.error('failed to register location {0} for {1}'.format(locForRule, targetName)) return retFatal # double copy if userSetup and datasetSpec.type == 'output': if datasetSpec.destination != datasetSpec.site: tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination, datasetSpec.site)) else: second_copy = True try: if taskSpec.site: panda_site = siteMapper.getSite(taskSpec.site) if panda_site.catchall and 'skip_2nd_copy' in panda_site.catchall: tmpLog.info('skip making double copy as specified in {0} catchall'.format(panda_site)) second_copy = False except Exception: second_copy = True if second_copy: locForDouble = '(type=SCRATCHDISK)\\notforextracopy=True' tmpMsg = 'registering double copy ' tmpMsg += 'location="{0}" lifetime={1}days activity={2} for dataset={3}'.format(locForDouble,lifetime, activity,targetName) tmpLog.info(tmpMsg) tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName, lifetime=lifetime,activity=activity, grouping='NONE',weight='freespace', ignore_availability=False) if not tmpStat: tmpLog.error('failed to register double copylocation {0} for {1}'.format(locForDouble, targetName)) return retFatal avDatasetList.append(targetName) else: tmpLog.info('{0} already registered'.format(targetName)) # check if dataset is in the container if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName: # get list of constituent datasets in the container if datasetSpec.containerName not in cnDatasetMap: cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName) # add dataset if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]: tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName], backEnd=ddmBackEnd) if not tmpStat: tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName, datasetSpec.containerName)) return retFatal cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName) else: tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) # update dataset datasetSpec.status = 'registered' self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID, 
'datasetID':datasetID}) # register ES datasets if taskSpec.registerEsFiles(): targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID) location = None metaData = {} metaData['task_id'] = taskSpec.jediTaskID metaData['hidden'] = True tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName, location, str(metaData))) tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData, resurrect=True) if not tmpStat: tmpLog.error('failed to register ES dataset {0}'.format(targetName)) return retFatal # register rule location = 'type=DATADISK' activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel) grouping = 'NONE' tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location, activity, grouping)) tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity, grouping=grouping) if not tmpStat: tmpLog.error('failed to register location {0} with {2} for {1}'.format(location, targetName, activity)) return retFatal # open datasets if taskSpec.prodSourceLabel in ['managed','test']: # get the list of output/log datasets outDatasetList = [] for tmpPandaJob in pandaJobs: for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if tmpFileSpec.destinationDBlock not in outDatasetList: outDatasetList.append(tmpFileSpec.destinationDBlock) # open datasets for outDataset in outDatasetList: tmpLog.info('open {0}'.format(outDataset)) ddmIF.openDataset(outDataset) # unset lifetime ddmIF.setDatasetMetadata(outDataset,'lifetime',None) # return tmpLog.info('done') return retOK except Exception: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retFatal
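# --- illustrative sketch, not part of the original setupper ---
# doSetup above only registers a dataset when DDM does not know it yet and
# only adds it to its container when it is not already a constituent. The
# idempotency pattern, decoupled from the real ddmIF interface (the two
# callables here are stand-ins for listDatasets / registerNewDataset):
def _ensure_registered(name, list_datasets, register_dataset):
    if not list_datasets(name):
        register_dataset(name)
        return True   # newly registered
    return False      # already known, nothing to do

# usage with dummy callables
_known = {'data.existing'}
_created = _ensure_registered('data.new', lambda n: n in _known, _known.add)
assert _created and 'data.new' in _known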
def extractCommon(self, jediTaskID, taskParamMap, workQueueMapper, splitRule): # make task spec taskSpec = JediTaskSpec() taskSpec.jediTaskID = jediTaskID taskSpec.taskName = taskParamMap['taskName'] taskSpec.userName = taskParamMap['userName'] taskSpec.vo = taskParamMap['vo'] taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel'] taskSpec.taskPriority = taskParamMap['taskPriority'] taskSpec.currentPriority = taskSpec.taskPriority taskSpec.architecture = taskParamMap['architecture'] taskSpec.transUses = taskParamMap['transUses'] taskSpec.transHome = taskParamMap['transHome'] taskSpec.transPath = taskParamMap['transPath'] taskSpec.processingType = taskParamMap['processingType'] taskSpec.taskType = taskParamMap['taskType'] taskSpec.splitRule = splitRule taskSpec.startTime = datetime.datetime.utcnow() if taskParamMap.has_key('workingGroup'): taskSpec.workingGroup = taskParamMap['workingGroup'] if taskParamMap.has_key('countryGroup'): taskSpec.countryGroup = taskParamMap['countryGroup'] if taskParamMap.has_key('ticketID'): taskSpec.ticketID = taskParamMap['ticketID'] if taskParamMap.has_key('ticketSystemType'): taskSpec.ticketSystemType = taskParamMap['ticketSystemType'] if taskParamMap.has_key('reqID'): taskSpec.reqID = taskParamMap['reqID'] else: taskSpec.reqID = jediTaskID if taskParamMap.has_key('coreCount'): taskSpec.coreCount = taskParamMap['coreCount'] else: taskSpec.coreCount = 1 if taskParamMap.has_key('walltime'): taskSpec.walltime = taskParamMap['walltime'] else: taskSpec.walltime = 0 if not taskParamMap.has_key('walltimeUnit'): # force to set NULL so that retried tasks get data from scouts again taskSpec.forceUpdate('walltimeUnit') if taskParamMap.has_key('outDiskCount'): taskSpec.outDiskCount = taskParamMap['outDiskCount'] else: taskSpec.outDiskCount = 0 if 'outDiskUnit' in taskParamMap: taskSpec.outDiskUnit = taskParamMap['outDiskUnit'] if taskParamMap.has_key('workDiskCount'): taskSpec.workDiskCount = taskParamMap['workDiskCount'] else: taskSpec.workDiskCount = 0 if taskParamMap.has_key('workDiskUnit'): taskSpec.workDiskUnit = taskParamMap['workDiskUnit'] if taskParamMap.has_key('ramCount'): taskSpec.ramCount = taskParamMap['ramCount'] else: taskSpec.ramCount = 0 if taskParamMap.has_key('ramUnit'): taskSpec.ramUnit = taskParamMap['ramUnit'] if taskParamMap.has_key('baseRamCount'): taskSpec.baseRamCount = taskParamMap['baseRamCount'] else: taskSpec.baseRamCount = 0 # HS06 stuff if 'cpuTimeUnit' in taskParamMap: taskSpec.cpuTimeUnit = taskParamMap['cpuTimeUnit'] if 'cpuTime' in taskParamMap: taskSpec.cpuTime = taskParamMap['cpuTime'] if 'cpuEfficiency' in taskParamMap: taskSpec.cpuEfficiency = taskParamMap['cpuEfficiency'] else: # 90% of cpu efficiency by default taskSpec.cpuEfficiency = 90 if 'baseWalltime' in taskParamMap: taskSpec.baseWalltime = taskParamMap['baseWalltime'] else: # 10min of offset by default taskSpec.baseWalltime = 10 * 60 # for merge if 'mergeRamCount' in taskParamMap: taskSpec.mergeRamCount = taskParamMap['mergeRamCount'] if 'mergeCoreCount' in taskParamMap: taskSpec.mergeCoreCount = taskParamMap['mergeCoreCount'] # scout if not taskParamMap.has_key( 'skipScout') and not taskSpec.isPostScout(): taskSpec.setUseScout(True) # cloud if taskParamMap.has_key('cloud'): self.cloudName = taskParamMap['cloud'] taskSpec.cloud = self.cloudName else: # set dummy to force update taskSpec.cloud = 'dummy' taskSpec.cloud = None # site if taskParamMap.has_key('site'): self.siteName = taskParamMap['site'] taskSpec.site = self.siteName else: # set dummy to force update 
taskSpec.site = 'dummy' taskSpec.site = None # nucleus if 'nucleus' in taskParamMap: taskSpec.nucleus = taskParamMap['nucleus'] # preset some parameters for job cloning if 'useJobCloning' in taskParamMap: # set implicit parameters if not 'nEventsPerWorker' in taskParamMap: taskParamMap['nEventsPerWorker'] = 1 if not 'nSitesPerJob' in taskParamMap: taskParamMap['nSitesPerJob'] = 2 if not 'nEsConsumers' in taskParamMap: taskParamMap['nEsConsumers'] = taskParamMap['nSitesPerJob'] # event service flag if 'useJobCloning' in taskParamMap: taskSpec.eventService = 2 elif taskParamMap.has_key('nEventsPerWorker'): taskSpec.eventService = 1 else: taskSpec.eventService = 0 # ttcr: requested time to completion if taskParamMap.has_key('ttcrTimestamp'): try: # get rid of the +00:00 timezone string and parse the timestamp taskSpec.ttcRequested = datetime.datetime.strptime( taskParamMap['ttcrTimestamp'].split('+')[0], '%Y-%m-%d %H:%M:%S.%f') except (IndexError, ValueError): pass # goal if 'goal' in taskParamMap: try: taskSpec.goal = int(float(taskParamMap['goal']) * 10) if taskSpec.goal >= 1000: taskSpec.goal = None except: pass # campaign if taskParamMap.has_key('campaign'): taskSpec.campaign = taskParamMap['campaign'] # request type if 'requestType' in taskParamMap: taskSpec.requestType = taskParamMap['requestType'] self.taskSpec = taskSpec # set split rule if 'tgtNumEventsPerJob' in taskParamMap: # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used if not 'nFilesPerJob' in taskParamMap: self.setSplitRule(None, taskParamMap['tgtNumEventsPerJob'], JediTaskSpec.splitRuleToken['nEventsPerJob']) self.setSplitRule(taskParamMap, 'nFilesPerJob', JediTaskSpec.splitRuleToken['nFilesPerJob']) self.setSplitRule(taskParamMap, 'nEventsPerJob', JediTaskSpec.splitRuleToken['nEventsPerJob']) self.setSplitRule(taskParamMap, 'nGBPerJob', JediTaskSpec.splitRuleToken['nGBPerJob']) self.setSplitRule(taskParamMap, 'nMaxFilesPerJob', JediTaskSpec.splitRuleToken['nMaxFilesPerJob']) self.setSplitRule(taskParamMap, 'nEventsPerWorker', JediTaskSpec.splitRuleToken['nEventsPerWorker']) self.setSplitRule(taskParamMap, 'useLocalIO', JediTaskSpec.splitRuleToken['useLocalIO']) self.setSplitRule(taskParamMap, 'disableAutoRetry', JediTaskSpec.splitRuleToken['disableAutoRetry']) self.setSplitRule(taskParamMap, 'nEsConsumers', JediTaskSpec.splitRuleToken['nEsConsumers']) self.setSplitRule(taskParamMap, 'waitInput', JediTaskSpec.splitRuleToken['waitInput']) self.setSplitRule(taskParamMap, 'addNthFieldToLFN', JediTaskSpec.splitRuleToken['addNthFieldToLFN']) self.setSplitRule(taskParamMap, 'scoutSuccessRate', JediTaskSpec.splitRuleToken['scoutSuccessRate']) self.setSplitRule(taskParamMap, 't1Weight', JediTaskSpec.splitRuleToken['t1Weight']) self.setSplitRule(taskParamMap, 'maxAttemptES', JediTaskSpec.splitRuleToken['maxAttemptES']) self.setSplitRule(taskParamMap, 'nSitesPerJob', JediTaskSpec.splitRuleToken['nSitesPerJob']) self.setSplitRule(taskParamMap, 'nJumboJobs', JediTaskSpec.splitRuleToken['nJumboJobs']) self.setSplitRule(taskParamMap, 'nEventsPerMergeJob', JediTaskSpec.splitRuleToken['nEventsPerMergeJob']) self.setSplitRule(taskParamMap, 'nFilesPerMergeJob', JediTaskSpec.splitRuleToken['nFilesPerMergeJob']) self.setSplitRule(taskParamMap, 'nGBPerMergeJob', JediTaskSpec.splitRuleToken['nGBPerMergeJob']) self.setSplitRule(taskParamMap, 'nMaxFilesPerMergeJob', JediTaskSpec.splitRuleToken['nMaxFilesPerMergeJob']) if taskParamMap.has_key('loadXML'): self.setSplitRule(None, 3, JediTaskSpec.splitRuleToken['loadXML']) 
self.setSplitRule(None, 4, JediTaskSpec.splitRuleToken['groupBoundaryID']) if taskParamMap.has_key('pfnList'): self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['pfnList']) if taskParamMap.has_key( 'noWaitParent') and taskParamMap['noWaitParent'] == True: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['noWaitParent']) if 'respectLB' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['respectLB']) if taskParamMap.has_key('reuseSecOnDemand'): self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['reuseSecOnDemand']) if 'ddmBackEnd' in taskParamMap: self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd']) if 'disableReassign' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['disableReassign']) if 'allowPartialFinish' in taskParamMap: self.setSplitRule( None, 1, JediTaskSpec.splitRuleToken['allowPartialFinish']) if 'useExhausted' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useExhausted']) if 'useRealNumEvents' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useRealNumEvents']) if 'ipConnectivity' in taskParamMap: self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity']) if 'altStageOut' in taskParamMap: self.taskSpec.setAltStageOut(taskParamMap['altStageOut']) if 'allowInputLAN' in taskParamMap: self.taskSpec.setAllowInputLAN(taskParamMap['allowInputLAN']) if 'runUntilClosed' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['runUntilClosed']) if 'stayOutputOnSite' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['stayOutputOnSite']) if 'useJobCloning' in taskParamMap: scValue = EventServiceUtils.getJobCloningValue( taskParamMap['useJobCloning']) self.setSplitRule(None, scValue, JediTaskSpec.splitRuleToken['useJobCloning']) if 'failWhenGoalUnreached' in taskParamMap and taskParamMap[ 'failWhenGoalUnreached'] == True: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['failGoalUnreached']) if 'switchEStoNormal' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['switchEStoNormal']) if 'nEventsPerRange' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['dynamicNumEvents']) if 'allowInputWAN' in taskParamMap and taskParamMap[ 'allowInputWAN'] == True: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['allowInputWAN']) if 'putLogToOS' in taskParamMap and taskParamMap['putLogToOS'] == True: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['putLogToOS']) if 'mergeEsOnOS' in taskParamMap and taskParamMap[ 'mergeEsOnOS'] == True: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['mergeEsOnOS']) if 'writeInputToFile' in taskParamMap and taskParamMap[ 'writeInputToFile'] == True: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['writeInputToFile']) if 'useFileAsSourceLFN' in taskParamMap and taskParamMap[ 'useFileAsSourceLFN'] == True: self.setSplitRule( None, 1, JediTaskSpec.splitRuleToken['useFileAsSourceLFN']) if 'ignoreMissingInDS' in taskParamMap and taskParamMap[ 'ignoreMissingInDS'] == True: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['ignoreMissingInDS']) # work queue workQueue = None if 'workQueueName' in taskParamMap: # work queue is specified workQueue = workQueueMapper.getQueueWithName( taskSpec.vo, taskSpec.prodSourceLabel, taskParamMap['workQueueName']) if workQueue is None: # get work queue based on task attributes workQueue, tmpStr = workQueueMapper.getQueueWithSelParams( taskSpec.vo, taskSpec.prodSourceLabel, processingType=taskSpec.processingType, 
workingGroup=taskSpec.workingGroup, coreCount=taskSpec.coreCount, site=taskSpec.site, eventService=taskSpec.eventService, splitRule=taskSpec.splitRule, campaign=taskSpec.campaign) if workQueue is None: errStr = 'workqueue is undefined for vo={0} label={1} '.format( taskSpec.vo, taskSpec.prodSourceLabel) errStr += 'processingType={0} workingGroup={1} coreCount={2} eventService={3} '.format( taskSpec.processingType, taskSpec.workingGroup, taskSpec.coreCount, taskSpec.eventService) errStr += 'splitRule={0} campaign={1}'.format( taskSpec.splitRule, taskSpec.campaign) raise RuntimeError, errStr self.taskSpec.workQueue_ID = workQueue.queue_id # Initialize the global share gshare = None if 'gshare' in taskParamMap and self.taskBufferIF.is_valid_share( taskParamMap['gshare']): # work queue is specified gshare = taskParamMap['gshare'] else: # get share based on definition gshare = self.taskBufferIF.get_share_for_task(self.taskSpec) if gshare is None: gshare = 'No match' # errStr = 'share is undefined for vo={0} label={1} '.format(taskSpec.vo,taskSpec.prodSourceLabel) # errStr += 'workingGroup={0} campaign={1} '.format(taskSpec.workingGroup, taskSpec.campaign) # raise RuntimeError,errStr self.taskSpec.gshare = gshare # return return
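# --- illustrative sketch, not part of extractCommon ---
# The global-share resolution above prefers an explicitly requested gshare,
# falls back to the share derived from task attributes, and finally to the
# 'No match' sentinel. The same precedence as a small helper (is_valid and
# derive are stand-ins for taskBufferIF.is_valid_share / get_share_for_task):
def _resolve_gshare(requested, is_valid, derive):
    if requested and is_valid(requested):
        return requested
    derived = derive()
    return derived if derived is not None else 'No match'

assert _resolve_gshare('Express', lambda s: s == 'Express', lambda: None) == 'Express'
assert _resolve_gshare(None, lambda s: False, lambda: None) == 'No match'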
def parseXML(self): # get LFN and GUID # self.logger.debug('XML filename : %s' % self.xmlFile) # no outputs log_out = [f for f in self.job.Files if f.type in ['log', 'output']] if not log_out: self.logger.debug("has no outputs") self.logger.debug("parseXML end") return 0 # get input files inputLFNs = [] for file in self.job.Files: if file.type == 'input': inputLFNs.append(file.lfn) # parse XML lfns = [] guids = [] fsizes = [] md5sums = [] chksums = [] surls = [] fullLfnMap = {} nEventsMap = {} guidMap = dict() try: # root = xml.dom.minidom.parse(self.xmlFile) root = xml.dom.minidom.parseString(self.data) files = root.getElementsByTagName('File') for file in files: # get GUID guid = str(file.getAttribute('ID')) # get PFN and LFN nodes logical = file.getElementsByTagName('logical')[0] lfnNode = logical.getElementsByTagName('lfn')[0] # convert UTF8 to Raw lfn = str(lfnNode.getAttribute('name')) # get metadata fsize = None md5sum = None adler32 = None surl = None fullLFN = None for meta in file.getElementsByTagName('metadata'): # get fsize name = str(meta.getAttribute('att_name')) if name == 'fsize': fsize = long(meta.getAttribute('att_value')) elif name == 'md5sum': md5sum = str(meta.getAttribute('att_value')) # check if re.search("^[a-fA-F0-9]{32}$", md5sum) is None: md5sum = None elif name == 'adler32': adler32 = str(meta.getAttribute('att_value')) elif name == 'surl': surl = str(meta.getAttribute('att_value')) elif name == 'full_lfn': fullLFN = str(meta.getAttribute('att_value')) # endpoints self.extraInfo['endpoint'][lfn] = [] for epNode in file.getElementsByTagName('endpoint'): self.extraInfo['endpoint'][lfn].append( str(epNode.firstChild.data)) # error check if (lfn not in inputLFNs) and (fsize is None or (md5sum is None and adler32 is None)): if EventServiceUtils.isEventServiceMerge(self.job): continue else: raise RuntimeError('fsize/md5sum/adler32/surl=None') # append lfns.append(lfn) guids.append(guid) fsizes.append(fsize) md5sums.append(md5sum) surls.append(surl) if adler32 is not None: # use adler32 if available chksums.append("ad:%s" % adler32) else: chksums.append("md5:%s" % md5sum) if fullLFN is not None: fullLfnMap[lfn] = fullLFN except Exception: # parse json try: import json # with open(self.xmlFile) as tmpF: jsonDict = json.loads(self.data) for lfn in jsonDict: fileData = jsonDict[lfn] lfn = str(lfn) fsize = None md5sum = None adler32 = None surl = None fullLFN = None guid = str(fileData['guid']) if 'fsize' in fileData: fsize = long(fileData['fsize']) if 'md5sum' in fileData: md5sum = str(fileData['md5sum']) # check if re.search("^[a-fA-F0-9]{32}$", md5sum) is None: md5sum = None if 'adler32' in fileData: adler32 = str(fileData['adler32']) if 'surl' in fileData: surl = str(fileData['surl']) if 'full_lfn' in fileData: fullLFN = str(fileData['full_lfn']) # endpoints self.extraInfo['endpoint'][lfn] = [] if 'endpoint' in fileData: self.extraInfo['endpoint'][lfn] = fileData['endpoint'] # error check if (lfn not in inputLFNs) and (fsize is None or (md5sum is None and adler32 is None)): if EventServiceUtils.isEventServiceMerge(self.job): continue else: raise RuntimeError( 'fsize/md5sum/adler32/surl=None') # append lfns.append(lfn) guids.append(guid) fsizes.append(fsize) md5sums.append(md5sum) surls.append(surl) if adler32 is not None: # use adler32 if available chksums.append("ad:%s" % adler32) else: chksums.append("md5:%s" % md5sum) if fullLFN is not None: fullLfnMap[lfn] = fullLFN except Exception: # check if file exists # if os.path.exists(self.xmlFile): if True: type, value, 
traceBack = sys.exc_info() self.logger.error(": %s %s" % (type, value)) # set failed anyway self.job.jobStatus = 'failed' # XML error happens when pilot got killed due to wall-time limit or failures in wrapper if (self.job.pilotErrorCode in [0,'0','NULL']) and \ (self.job.taskBufferErrorCode not in [pandaserver.taskbuffer.ErrorCode.EC_WorkerDone]) and \ (self.job.transExitCode in [0,'0','NULL']): self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML" return 2 else: # XML was deleted return 1 # parse metadata to get nEvents nEventsFrom = None try: root = xml.dom.minidom.parseString(self.job.metadata) files = root.getElementsByTagName('File') for file in files: # get GUID guid = str(file.getAttribute('ID')) # get PFN and LFN nodes logical = file.getElementsByTagName('logical')[0] lfnNode = logical.getElementsByTagName('lfn')[0] # convert UTF8 to Raw lfn = str(lfnNode.getAttribute('name')) guidMap[lfn] = guid # get metadata nevents = None for meta in file.getElementsByTagName('metadata'): # get fsize name = str(meta.getAttribute('att_name')) if name == 'events': nevents = long(meta.getAttribute('att_value')) nEventsMap[lfn] = nevents break nEventsFrom = "xml" except Exception: pass # parse json try: import json jsonDict = json.loads(self.job.metadata) for jsonFileItem in jsonDict['files']['output']: for jsonSubFileItem in jsonFileItem['subFiles']: lfn = str(jsonSubFileItem['name']) try: nevents = long(jsonSubFileItem['nentries']) nEventsMap[lfn] = nevents except Exception: pass try: guid = str(jsonSubFileItem['file_guid']) guidMap[lfn] = guid except Exception: pass nEventsFrom = "json" except Exception: pass # use nEvents and GUIDs reported by the pilot if no job report if self.job.metadata == 'NULL' and self.jobStatus == 'finished' and self.job.nEvents > 0 \ and self.job.prodSourceLabel in ['managed']: for file in self.job.Files: if file.type == 'output': nEventsMap[file.lfn] = self.job.nEvents for lfn, guid in zip(lfns, guids): guidMap[lfn] = guid nEventsFrom = "pilot" self.logger.debug('nEventsMap=%s' % str(nEventsMap)) self.logger.debug('nEventsFrom=%s' % str(nEventsFrom)) self.logger.debug('guidMap=%s' % str(guidMap)) self.logger.debug('self.job.jobStatus=%s in parseXML' % self.job.jobStatus) self.logger.debug( 'isES=%s isJumbo=%s' % (EventServiceUtils.isEventServiceJob( self.job), EventServiceUtils.isJumboJob(self.job))) # get lumi block number lumiBlockNr = self.job.getLumiBlockNr() # copy files for variable number of outputs tmpStat = self.copyFilesForVariableNumOutputs(lfns) if not tmpStat: self.logger.error( "failed to copy files for variable number of outputs") return 2 # check files fileList = [] for file in self.job.Files: fileList.append(file.lfn) if file.type == 'input': if file.lfn in lfns: if self.job.prodSourceLabel in ['user', 'panda']: # skipped file file.status = 'skipped' elif self.job.prodSourceLabel in [ 'managed', 'test' ] + JobUtils.list_ptest_prod_sources: # failed by pilot file.status = 'failed' elif file.type == 'output' or file.type == 'log': # add only log file for failed jobs if self.jobStatus == 'failed' and file.type != 'log': file.status = 'failed' continue # set failed if it is missing in XML if file.lfn not in lfns: if (self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job)) \ or EventServiceUtils.isJumboJob(self.job): # unset file status for ES jobs pass elif file.isAllowedNoOutput(): # allowed not to be produced file.status = 'nooutput' 
self.logger.debug('set {0} to status={1}'.format( file.lfn, file.status)) else: file.status = 'failed' self.job.jobStatus = 'failed' self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format( file.lfn) self.logger.error(self.job.ddmErrorDiag) continue # look for GUID with LFN try: i = lfns.index(file.lfn) file.GUID = guids[i] file.fsize = fsizes[i] file.md5sum = md5sums[i] file.checksum = chksums[i] surl = surls[i] # status file.status = 'ready' # change to full LFN if file.lfn in fullLfnMap: file.lfn = fullLfnMap[file.lfn] # add SURL to extraInfo self.extraInfo['surl'][file.lfn] = surl # add nevents if file.lfn in nEventsMap: self.extraInfo['nevents'][file.lfn] = nEventsMap[ file.lfn] except Exception: # status file.status = 'failed' type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type, value)) # set lumi block number if lumiBlockNr is not None and file.status != 'failed': self.extraInfo['lbnr'][file.lfn] = lumiBlockNr self.extraInfo['guid'] = guidMap # check consistency between XML and filesTable for lfn in lfns: if lfn not in fileList: self.logger.error("%s is not found in filesTable" % lfn) self.job.jobStatus = 'failed' for tmpFile in self.job.Files: tmpFile.status = 'failed' self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format( lfn) return 2 # return self.logger.debug("parseXML end") return 0
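# --- illustrative sketch, not part of parseXML ---
# The metadata branch above pulls nEvents and GUIDs out of the pilot job
# report. A self-contained version of that JSON walk, using the same keys
# ('files'/'output'/'subFiles', 'name', 'nentries', 'file_guid'):
import json

def _events_and_guids(metadata_str):
    n_events, guids = {}, {}
    for file_item in json.loads(metadata_str)['files']['output']:
        for sub in file_item['subFiles']:
            lfn = str(sub['name'])
            if 'nentries' in sub:
                n_events[lfn] = int(sub['nentries'])
            if 'file_guid' in sub:
                guids[lfn] = str(sub['file_guid'])
    return n_events, guids

# usage with a made-up report
_meta = '{"files": {"output": [{"subFiles": [{"name": "out.root", "nentries": 120, "file_guid": "abc"}]}]}}'
assert _events_and_guids(_meta) == ({'out.root': 120}, {'out.root': 'abc'})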
def run(self): try: self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus, self.attemptNr)) # got lock, get the report report_dict = self.taskBuffer.getJobOutputReport( panda_id=self.jobID, attempt_nr=self.attemptNr) self.data = report_dict.get('data') # query job self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False, fromWaiting=False, forAnal=True)[0] # check if job has finished if self.job is None: self.logger.debug(': job not found in DB') elif self.job.jobStatus in [ 'finished', 'failed', 'unknown', 'merging' ]: self.logger.error(': invalid state -> %s' % self.job.jobStatus) elif self.attemptNr is not None and self.job.attemptNr != self.attemptNr: self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr, self.attemptNr)) # elif self.attemptNr is not None and self.job.jobStatus == 'transferring': # errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus) # self.logger.error(errMsg) elif self.jobStatus == EventServiceUtils.esRegStatus: # instantiate concrete plugin adderPluginClass = self.getPluginClass(self.job.VO, self.job.cloud) adderPlugin = adderPluginClass(self.job, taskBuffer=self.taskBuffer, siteMapper=self.siteMapper, logger=self.logger) # execute self.logger.debug('plugin is ready for ES file registration') adderPlugin.registerEventServiceFiles() else: # check file status in JEDI if not self.job.isCancelled() and self.job.taskBufferErrorCode not in \ [pandaserver.taskbuffer.ErrorCode.EC_PilotRetried]: fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI( self.job) self.logger.debug("check file status in JEDI : {0}".format( fileCheckInJEDI)) if fileCheckInJEDI is None: raise RuntimeError( 'failed to check file status in JEDI') if fileCheckInJEDI is False: # set job status to failed since some file status is wrong in JEDI self.jobStatus = 'failed' self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder errStr = "inconsistent file status between Panda and JEDI. 
" errStr += "failed to avoid duplicated processing caused by synchronization failure" self.job.ddmErrorDiag = errStr self.logger.debug( "set jobStatus={0} since input is inconsistent between Panda and JEDI" .format(self.jobStatus)) elif self.job.jobSubStatus in ['pilot_closed']: # terminated by the pilot self.logger.debug( "going to closed since terminated by the pilot") retClosed = self.taskBuffer.killJobs([self.jobID], 'pilot', '60', True) if retClosed[0] is True: self.logger.debug("end") # remove Catalog self.taskBuffer.deleteJobOutputReport( panda_id=self.jobID, attempt_nr=self.attemptNr) return # check for cloned jobs if EventServiceUtils.isJobCloningJob(self.job): checkJC = self.taskBuffer.checkClonedJob(self.job) if checkJC is None: raise RuntimeError( 'failed to check the cloned job') # failed to lock semaphore if checkJC['lock'] is False: self.jobStatus = 'failed' self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder self.job.ddmErrorDiag = "failed to lock semaphore for job cloning" self.logger.debug( "set jobStatus={0} since did not get semaphore for job cloning" .format(self.jobStatus)) # use failed for cancelled/closed jobs if self.job.isCancelled(): self.jobStatus = 'failed' # reset error codes to skip retrial module self.job.pilotErrorCode = 0 self.job.exeErrorCode = 0 self.job.ddmErrorCode = 0 # keep old status oldJobStatus = self.job.jobStatus # set job status if self.job.jobStatus not in ['transferring']: self.job.jobStatus = self.jobStatus addResult = None adderPlugin = None # parse XML parseResult = self.parseXML() if parseResult < 2: # interaction with DDM try: # instantiate concrete plugin adderPluginClass = self.getPluginClass( self.job.VO, self.job.cloud) adderPlugin = adderPluginClass( self.job, taskBuffer=self.taskBuffer, siteMapper=self.siteMapper, extraInfo=self.extraInfo, logger=self.logger) # execute self.logger.debug('plugin is ready') adderPlugin.execute() addResult = adderPlugin.result self.logger.debug('plugin done with %s' % (addResult.statusCode)) except Exception: errtype, errvalue = sys.exc_info()[:2] self.logger.error( "failed to execute AdderPlugin for VO={0} with {1}:{2}" .format(self.job.VO, errtype, errvalue)) self.logger.error( "failed to execute AdderPlugin for VO={0} with {1}" .format(self.job.VO, traceback.format_exc())) addResult = None self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder self.job.ddmErrorDiag = "AdderPlugin failure" # ignore temporary errors if self.ignoreTmpError and addResult is not None and addResult.isTemporary( ): self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag) self.logger.debug('escape') # unlock job output report self.taskBuffer.unlockJobOutputReport( panda_id=self.jobID, attempt_nr=self.attemptNr, pid=self.pid, lock_offset=self.lock_offset) return # failed if addResult is None or not addResult.isSucceeded(): self.job.jobStatus = 'failed' # set file status for failed jobs or failed transferring jobs self.logger.debug( "status after plugin call :job.jobStatus=%s jobStatus=%s" % (self.job.jobStatus, self.jobStatus)) if self.job.jobStatus == 'failed' or self.jobStatus == 'failed': # First of all: check if job failed and in this case take first actions according to error table source, error_code, error_diag = None, None, None errors = [] if self.job.pilotErrorCode: source = 'pilotErrorCode' error_code = self.job.pilotErrorCode error_diag = self.job.pilotErrorDiag errors.append({ 'source': source, 'error_code': error_code, 'error_diag': error_diag }) if self.job.exeErrorCode: source = 
'exeErrorCode' error_code = self.job.exeErrorCode error_diag = self.job.exeErrorDiag errors.append({ 'source': source, 'error_code': error_code, 'error_diag': error_diag }) if self.job.ddmErrorCode: source = 'ddmErrorCode' error_code = self.job.ddmErrorCode error_diag = self.job.ddmErrorDiag errors.append({ 'source': source, 'error_code': error_code, 'error_diag': error_diag }) if self.job.transExitCode: source = 'transExitCode' error_code = self.job.transExitCode error_diag = '' errors.append({ 'source': source, 'error_code': error_code, 'error_diag': error_diag }) # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag)) if source and error_code: try: self.logger.debug( "AdderGen.run will call apply_retrial_rules") retryModule.apply_retrial_rules( self.taskBuffer, self.job.PandaID, errors, self.job.attemptNr) self.logger.debug("apply_retrial_rules is back") except Exception as e: self.logger.error( "apply_retrial_rules excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc())) self.job.jobStatus = 'failed' for file in self.job.Files: if file.type in ['output', 'log']: if addResult is not None and file.lfn in addResult.mergingFiles: file.status = 'merging' else: file.status = 'failed' else: # reset errors self.job.jobDispatcherErrorCode = 0 self.job.jobDispatcherErrorDiag = 'NULL' # set status if addResult is not None and addResult.mergingFiles != []: # set status for merging: for file in self.job.Files: if file.lfn in addResult.mergingFiles: file.status = 'merging' self.job.jobStatus = 'merging' # propagate transition to prodDB self.job.stateChangeTime = time.strftime( '%Y-%m-%d %H:%M:%S', time.gmtime()) elif addResult is not None and addResult.transferringFiles != []: # set status for transferring for file in self.job.Files: if file.lfn in addResult.transferringFiles: file.status = 'transferring' self.job.jobStatus = 'transferring' self.job.jobSubStatus = None # propagate transition to prodDB self.job.stateChangeTime = time.strftime( '%Y-%m-%d %H:%M:%S', time.gmtime()) else: self.job.jobStatus = 'finished' # endtime if self.job.endTime == 'NULL': self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime()) # output size and # of outputs self.job.nOutputDataFiles = 0 self.job.outputFileBytes = 0 for tmpFile in self.job.Files: if tmpFile.type == 'output': self.job.nOutputDataFiles += 1 try: self.job.outputFileBytes += tmpFile.fsize except Exception: pass # protection maxOutputFileBytes = 99999999999 if self.job.outputFileBytes > maxOutputFileBytes: self.job.outputFileBytes = maxOutputFileBytes # set cancelled state if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed': self.job.jobStatus = 'cancelled' # update job if oldJobStatus in ['cancelled', 'closed']: pass else: self.logger.debug("updating DB") retU = self.taskBuffer.updateJobs( [self.job], False, oldJobStatusList=[oldJobStatus], extraInfo=self.extraInfo) self.logger.debug("retU: %s" % retU) # failed if not retU[0]: self.logger.error( 'failed to update DB for pandaid={0}'.format( self.job.PandaID)) # unlock job output report self.taskBuffer.unlockJobOutputReport( panda_id=self.jobID, attempt_nr=self.attemptNr, pid=self.pid, lock_offset=self.lock_offset) return try: # updateJobs was successful and it failed a job with taskBufferErrorCode self.logger.debug("AdderGen.run will peek the job") job_tmp = self.taskBuffer.peekJobs( [self.job.PandaID], fromDefined=False, fromArchived=True, fromWaiting=False)[0] self.logger.debug( "status {0}, 
taskBufferErrorCode {1}, taskBufferErrorDiag {2}" .format(job_tmp.jobStatus, job_tmp.taskBufferErrorCode, job_tmp.taskBufferErrorDiag)) if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode: source = 'taskBufferErrorCode' error_code = job_tmp.taskBufferErrorCode error_diag = job_tmp.taskBufferErrorDiag errors = [{ 'source': source, 'error_code': error_code, 'error_diag': error_diag }] self.logger.debug( "AdderGen.run 2 will call apply_retrial_rules") retryModule.apply_retrial_rules( self.taskBuffer, job_tmp.PandaID, errors, job_tmp.attemptNr) self.logger.debug("apply_retrial_rules 2 is back") except IndexError: pass except Exception as e: self.logger.error( "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc())) # setup for closer if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()): destDBList = [] guidList = [] for file in self.job.Files: # ignore inputs if file.type == 'input': continue # skip pseudo datasets if file.destinationDBlock in ['', None, 'NULL']: continue # start closer for output/log datasets if file.destinationDBlock not in destDBList: destDBList.append(file.destinationDBlock) # collect GUIDs if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and \ self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \ and file.type == 'output': # extract base LFN since LFN was changed to full LFN for CMS baseLFN = file.lfn.split('/')[-1] guidList.append({ 'lfn': baseLFN, 'guid': file.GUID, 'type': file.type, 'checksum': file.checksum, 'md5sum': file.md5sum, 'fsize': file.fsize, 'scope': file.scope }) if guidList != []: retG = self.taskBuffer.setGUIDs(guidList) if destDBList != []: # start Closer if adderPlugin is not None and hasattr( adderPlugin, 'datasetMap' ) and adderPlugin.datasetMap != {}: cThr = Closer.Closer( self.taskBuffer, destDBList, self.job, datasetMap=adderPlugin.datasetMap) else: cThr = Closer.Closer(self.taskBuffer, destDBList, self.job) self.logger.debug("start Closer") # cThr.start() # cThr.join() cThr.run() del cThr self.logger.debug("end Closer") # run closer for assocaiate parallel jobs if EventServiceUtils.isJobCloningJob(self.job): assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer( self.job.jediTaskID, self.job.PandaID, destDBList) for assJobID in assDBlockMap: assDBlocks = assDBlockMap[assJobID] assJob = self.taskBuffer.peekJobs( [assJobID], fromDefined=False, fromArchived=False, fromWaiting=False, forAnal=True)[0] if self.job is None: self.logger.debug( ': associated job PandaID={0} not found in DB' .format(assJobID)) else: cThr = Closer.Closer( self.taskBuffer, assDBlocks, assJob) self.logger.debug( "start Closer for PandaID={0}".format( assJobID)) # cThr.start() # cThr.join() cThr.run() del cThr self.logger.debug( "end Closer for PandaID={0}".format( assJobID)) self.logger.debug("end") # try: # # remove Catalog # os.remove(self.xmlFile) # except Exception: # pass # remove Catalog self.taskBuffer.deleteJobOutputReport(panda_id=self.jobID, attempt_nr=self.attemptNr) del self.data del report_dict except Exception as e: errStr = ": {} {}".format(str(e), traceback.format_exc()) self.logger.error(errStr) self.logger.error("except") # unlock job output report self.taskBuffer.unlockJobOutputReport(panda_id=self.jobID, attempt_nr=self.attemptNr, pid=self.pid, lock_offset=self.lock_offset)
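# --- illustrative sketch, not part of AdderGen ---
# Before apply_retrial_rules is called above, the job's error fields are
# collected into a list of {source, error_code, error_diag} dicts, skipping
# sources whose code is unset. The same collection logic over a plain
# attribute mapping (a hypothetical stand-in for the JobSpec fields):
def _collect_errors(job_attrs):
    """job_attrs: {'pilotErrorCode': (code, diag), ...}; falsy codes are skipped."""
    errors = []
    for source in ('pilotErrorCode', 'exeErrorCode', 'ddmErrorCode', 'transExitCode'):
        code, diag = job_attrs.get(source, (0, ''))
        if code:
            errors.append({'source': source, 'error_code': code, 'error_diag': diag})
    return errors

assert _collect_errors({'pilotErrorCode': (0, ''), 'ddmErrorCode': (200, 'adder failure')}) == \
    [{'source': 'ddmErrorCode', 'error_code': 200, 'error_diag': 'adder failure'}]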
def run(self): self.lock.acquire() try: for vuid,name,modDate in self.datasets: _logger.debug("Freezer start %s %s" % (modDate,name)) self.proxyLock.acquire() retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID,status FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock ", {':destinationDBlock':name}) self.proxyLock.release() if retF < 0: _logger.error("SQL error") else: allFinished = True onePandaID = None for tmpPandaID,tmpFileStatus in resF: onePandaID = tmpPandaID if not tmpFileStatus in ['ready', 'failed', 'skipped', 'merging', 'finished']: allFinished = False break # check sub datasets in the jobset for event service job if allFinished: self.proxyLock.acquire() tmpJobs = taskBuffer.getFullJobStatus([onePandaID]) self.proxyLock.release() if len(tmpJobs) > 0 and tmpJobs[0] is not None: if EventServiceUtils.isEventServiceMerge(tmpJobs[0]): self.proxyLock.acquire() cThr = Closer(taskBuffer, [], tmpJobs[0]) allFinished = cThr.checkSubDatasetsInJobset() self.proxyLock.release() _logger.debug("closer checked sub datasets in the jobset for %s : %s" % (name, allFinished)) # no files in filesTable if allFinished: _logger.debug("freeze %s " % name) dsExists = True if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \ or name.startswith('hc_test.') or name.startswith('panda.um.'): dsExists = False if name.startswith('panda.um.'): self.proxyLock.acquire() retMer,resMer = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status IN (:statusM,:statusF) ", {':destinationDBlock':name, ':statusM':'merging', ':statusF':'failed'}) self.proxyLock.release() if resMer is not None and len(resMer)>0: mergeID = resMer[0][0] # get merging jobs self.proxyLock.acquire() mergingJobs = taskBuffer.peekJobs([mergeID],fromDefined=False,fromArchived=False,fromWaiting=False) self.proxyLock.release() mergeJob = mergingJobs[0] if mergeJob is not None: tmpDestDBlocks = [] # get destDBlock for tmpFile in mergeJob.Files: if tmpFile.type in ['output','log']: if not tmpFile.destinationDBlock in tmpDestDBlocks: tmpDestDBlocks.append(tmpFile.destinationDBlock) # run _logger.debug("start JEDI closer for %s " % name) self.proxyLock.acquire() cThr = Closer(taskBuffer,tmpDestDBlocks,mergeJob) cThr.start() cThr.join() self.proxyLock.release() _logger.debug("end JEDI closer for %s " % name) continue else: _logger.debug("failed to get merging job for %s " % name) else: _logger.debug("failed to get merging file for %s " % name) status,out = True,'' elif dsExists: # check if dataset exists status,out = rucioAPI.getMetaData(name) if status == True: if out is not None: try: rucioAPI.closeDataset(name) status = True except Exception: errtype,errvalue = sys.exc_info()[:2] out = 'failed to freeze : {0} {1}'.format(errtype,errvalue) status = False else: # dataset not exist status,out = True,'' dsExists = False else: status,out = True,'' if not status: _logger.error('{0} failed to freeze with {1}'.format(name,out)) else: self.proxyLock.acquire() varMap = {} varMap[':vuid'] = vuid varMap[':status'] = 'completed' taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", varMap) self.proxyLock.release() if name.startswith('pandaddm_') or name.startswith('panda.um.') or not dsExists: continue # set tobedeleted to dis setTobeDeletedToDis(name) # count # of files status,out = 
rucioAPI.getNumberOfFiles(name) if status is not True: if status is False: _logger.error(out) else: _logger.debug(out) try: nFile = int(out) _logger.debug(nFile) if nFile == 0: # erase dataset _logger.debug('erase %s' % name) status,out = rucioAPI.eraseDataset(name) _logger.debug('OK with %s' % name) except Exception: pass else: _logger.debug("wait %s " % name) self.proxyLock.acquire() taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid}) self.proxyLock.release() _logger.debug("end %s " % name) except Exception: errStr = traceback.format_exc() _logger.error(errStr) self.pool.remove(self) self.lock.release()
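Minimal sketch of the freeze condition applied above: a destination block is frozen only when every file registered against it is in a terminal state. The status set mirrors the check in the code; the helper itself is illustrative.

TERMINAL = {'ready', 'failed', 'skipped', 'merging', 'finished'}

def all_files_finished(file_statuses):
    # True when the dataset can be frozen, False when the Freezer should wait
    return all(s in TERMINAL for s in file_statuses)

print(all_files_finished(['finished', 'failed']))   # True  -> freeze
print(all_files_finished(['finished', 'running']))  # False -> wait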
def updateJobs(self, jobList, tmpLog): updateJobs = [] failedJobs = [] activateJobs = [] waitingJobs = [] closeJobs = [] # sort out jobs for job in jobList: # failed jobs if job.jobStatus in ['failed', 'cancelled']: failedJobs.append(job) # waiting elif job.jobStatus == 'waiting': waitingJobs.append(job) # no input jobs elif job.dispatchDBlock == 'NULL': activateJobs.append(job) # normal jobs else: # change status job.jobStatus = "assigned" updateJobs.append(job) # trigger merge generation if all events are done newActivateJobs = [] nFinished = 0 for job in activateJobs: if job.notDiscardEvents() and job.allOkEvents( ) and not EventServiceUtils.isEventServiceMerge(job): self.taskBuffer.activateJobs([job]) # change status job.jobStatus = "finished" self.taskBuffer.updateJobs([job], False) nFinished += 1 else: newActivateJobs.append(job) activateJobs = newActivateJobs tmpLog.debug('# of finished jobs in activated : {0}'.format(nFinished)) newUpdateJobs = [] nFinished = 0 for job in updateJobs: if job.notDiscardEvents() and job.allOkEvents( ) and not EventServiceUtils.isEventServiceMerge(job): self.taskBuffer.updateJobs([job], True) # change status job.jobStatus = "finished" self.taskBuffer.updateJobs([job], True) nFinished += 1 else: newUpdateJobs.append(job) updateJobs = newUpdateJobs tmpLog.debug('# of finished jobs in defined : {0}'.format(nFinished)) # update DB tmpLog.debug('# of activated jobs : {0}'.format(len(activateJobs))) self.taskBuffer.activateJobs(activateJobs) tmpLog.debug('# of updated jobs : {0}'.format(len(updateJobs))) self.taskBuffer.updateJobs(updateJobs, True) tmpLog.debug('# of failed jobs : {0}'.format(len(failedJobs))) self.taskBuffer.updateJobs(failedJobs, True) tmpLog.debug('# of waiting jobs : {0}'.format(len(waitingJobs))) self.taskBuffer.keepJobs(waitingJobs) # delete local values del updateJobs del failedJobs del activateJobs del waitingJobs
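Rough sketch of the bucketing above, with plain dicts instead of job specs: failed or cancelled jobs, waiting jobs, jobs without a dispatch block (activated directly), and the rest, whose status is flipped to 'assigned' before the bulk update.

def sort_jobs(jobs):
    failed, waiting, activate, update = [], [], [], []
    for job in jobs:
        if job['jobStatus'] in ('failed', 'cancelled'):
            failed.append(job)
        elif job['jobStatus'] == 'waiting':
            waiting.append(job)
        elif job['dispatchDBlock'] == 'NULL':
            activate.append(job)          # no input to stage, activate directly
        else:
            job['jobStatus'] = 'assigned'  # normal job waiting for dispatch
            update.append(job)
    return failed, waiting, activate, update

print(sort_jobs([{'jobStatus': 'defined', 'dispatchDBlock': 'NULL'},
                 {'jobStatus': 'waiting', 'dispatchDBlock': 'mc.ds_dis1'}]))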
def doSetup(self,taskSpec,datasetToRegister,pandaJobs): # make logger tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID)) tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType)) # returns retFatal = self.SC_FATAL retTmpError = self.SC_FAILED retOK = self.SC_SUCCEEDED try: # get DDM I/F ddmIF = self.ddmIF.getInterface(taskSpec.vo) # register datasets if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']: # prod vs anal userSetup = False if taskSpec.prodSourceLabel in ['user']: userSetup = True # collect datasetID to register datasets/containers just in case for tmpPandaJob in pandaJobs: if not tmpPandaJob.produceUnMerge(): for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if not tmpFileSpec.datasetID in datasetToRegister: datasetToRegister.append(tmpFileSpec.datasetID) tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister))) # get site mapper siteMapper = self.taskBufferIF.getSiteMapper() # loop over all datasets avDatasetList = [] cnDatasetMap = {} for datasetID in datasetToRegister: # get output and log datasets tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID)) tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID, datasetID) if not tmpStat: tmpLog.error('failed to get output and log datasets') return retFatal if datasetSpec.isPseudo(): tmpLog.info('skip pseudo dataset') continue # DDM backend ddmBackEnd = taskSpec.getDdmBackEnd() tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) # check if dataset and container are available in DDM for targetName in [datasetSpec.datasetName,datasetSpec.containerName]: if targetName == None: continue if not targetName in avDatasetList: # set lifetime if targetName.startswith('panda'): if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed': lifetime = 365 else: lifetime = 14 else: lifetime = None # check dataset/container in DDM tmpList = ddmIF.listDatasets(targetName) if tmpList == []: # get location location = None locForRule = None if targetName == datasetSpec.datasetName: # dataset if datasetSpec.site in ['',None]: if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None: locForRule = datasetSpec.destination elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) != None: location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken) elif taskSpec.cloud != None: # use T1 SE tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source'] location = siteMapper.getDdmEndpoint(tmpT1Name,datasetSpec.storageToken) else: tmpLog.info('site={0} token='.format(datasetSpec.site,datasetSpec.storageToken)) location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken) if locForRule == None: locForRule = location # set metadata if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName: metaData = {} metaData['task_id'] = taskSpec.jediTaskID if not taskSpec.campaign in [None,'']: metaData['campaign'] = taskSpec.campaign if datasetSpec.getTransient() != None: metaData['transient'] = datasetSpec.getTransient() else: metaData = None # register dataset/container tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName, location, ddmBackEnd, lifetime, str(metaData))) tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location, lifetime=lifetime,metaData=metaData) if not tmpStat: tmpLog.error('failed to register 
{0}'.format(targetName)) return retFatal # procedures for user if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None: # register location tmpToRegister = False if userSetup and targetName == datasetSpec.datasetName and not datasetSpec.site in ['',None]: userName = taskSpec.userName grouping = None tmpToRegister = True elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None: userName = None grouping = 'NONE' tmpToRegister = True if tmpToRegister: activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel) tmpLog.info('registering location={0} lifetime={1}days activity={2} grouping={3}'.format(locForRule,lifetime, activity,grouping)) tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName, lifetime=lifetime,backEnd=ddmBackEnd, activity=activity,grouping=grouping) if not tmpStat: tmpLog.error('failed to register location {0} for {1}'.format(locForRule, targetName)) return retFatal # double copy if userSetup and datasetSpec.type == 'output': if datasetSpec.destination != datasetSpec.site: tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination, datasetSpec.site)) else: locForDouble = '(type=SCRATCHDISK)\\notforextracopy=1' tmpMsg = 'registering double copy ' tmpMsg += 'location="{0}" lifetime={1}days activity={2} for dataset={3}'.format(locForDouble,lifetime, activity,targetName) tmpLog.info(tmpMsg) tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName, lifetime=lifetime,activity=activity, grouping='NONE',weight='freespace', ignore_availability=False) if not tmpStat: tmpLog.error('failed to register double copylocation {0} for {1}'.format(locForDouble, targetName)) return retFatal avDatasetList.append(targetName) else: tmpLog.info('{0} already registered'.format(targetName)) # check if dataset is in the container if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName: # get list of constituent datasets in the container if not cnDatasetMap.has_key(datasetSpec.containerName): cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName) # add dataset if not datasetSpec.datasetName in cnDatasetMap[datasetSpec.containerName]: tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName], backEnd=ddmBackEnd) if not tmpStat: tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName, datasetSpec.containerName)) return retFatal cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName) else: tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) # update dataset datasetSpec.status = 'registered' self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID, 'datasetID':datasetID}) # register ES datasets if taskSpec.registerEsFiles(): targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID) location = None metaData = {} metaData['task_id'] = taskSpec.jediTaskID metaData['hidden'] = True tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName, location, str(metaData))) tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData, resurrect=True) if not tmpStat: tmpLog.error('failed to register ES dataset {0}'.format(targetName)) return retFatal # register rule location = 'type=DATADISK' 
activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel) grouping = 'NONE' tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location, activity, grouping)) tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity, grouping=grouping) if not tmpStat: tmpLog.error('failed to register location {0} with {2} for {1}'.format(location, targetName, activity)) return retFatal # open datasets if taskSpec.prodSourceLabel in ['managed','test']: # get the list of output/log datasets outDatasetList = [] for tmpPandaJob in pandaJobs: for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if not tmpFileSpec.destinationDBlock in outDatasetList: outDatasetList.append(tmpFileSpec.destinationDBlock) # open datasets for outDataset in outDatasetList: tmpLog.info('open {0}'.format(outDataset)) ddmIF.openDataset(outDataset) # unset lifetime ddmIF.setDatasetMetadata(outDataset,'lifetime',None) # return tmpLog.info('done') return retOK except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retFatal
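A small sketch of the lifetime rule applied before dataset registration above: panda-internal names get a finite lifetime, longer for managed trn_log datasets, and everything else is registered without one. The function and its argument names are illustrative only.

def dataset_lifetime(name, ds_type, prod_source_label):
    if name.startswith('panda'):
        if ds_type == 'trn_log' and prod_source_label == 'managed':
            return 365   # days
        return 14        # days
    return None          # no lifetime for regular datasets

print(dataset_lifetime('panda.um.user.x.log', 'trn_log', 'managed'))  # 365
print(dataset_lifetime('mc16_13TeV.something', 'output', 'managed'))  # None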
def run(self): try: while True: _logger.debug('%s start' % self.pandaID) # query job job = self.taskBuffer.peekJobs([self.pandaID], fromDefined=False, fromArchived=False, fromWaiting=False)[0] _logger.debug('%s in %s' % (self.pandaID, job.jobStatus)) # check job status if job is None: _logger.debug('%s escape : not found' % self.pandaID) return if job.jobStatus not in [ 'running', 'sent', 'starting', 'holding', 'stagein', 'stageout' ]: if job.jobStatus == 'transferring' and ( job.prodSourceLabel in ['user', 'panda'] or job.jobSubStatus not in [None, 'NULL', '']): pass else: _logger.debug('%s escape : %s' % (self.pandaID, job.jobStatus)) return # time limit timeLimit = datetime.datetime.utcnow() - datetime.timedelta( minutes=self.sleepTime) if job.modificationTime < timeLimit or ( job.endTime != 'NULL' and job.endTime < timeLimit): _logger.debug( '%s %s lastmod:%s endtime:%s' % (job.PandaID, job.jobStatus, str( job.modificationTime), str(job.endTime))) destDBList = [] if job.jobStatus == 'sent': # sent job didn't receive reply from pilot within 30 min job.jobDispatcherErrorCode = ErrorCode.EC_SendError job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min" elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL': # lost heartbeat if job.jobDispatcherErrorDiag == 'NULL': if job.endTime == 'NULL': # normal lost heartbeat job.jobDispatcherErrorCode = ErrorCode.EC_Watcher job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str( job.modificationTime) else: if job.jobStatus == 'holding': job.jobDispatcherErrorCode = ErrorCode.EC_Holding elif job.jobStatus == 'transferring': job.jobDispatcherErrorCode = ErrorCode.EC_Transferring else: job.jobDispatcherErrorCode = ErrorCode.EC_Timeout job.jobDispatcherErrorDiag = 'timeout in {0} : last heartbeat at {1}'.format( job.jobStatus, str(job.endTime)) # get worker workerSpecs = self.taskBuffer.getWorkersForJob( job.PandaID) if len(workerSpecs) > 0: workerSpec = workerSpecs[0] if workerSpec.status in [ 'finished', 'failed', 'cancelled', 'missed' ]: job.supErrorCode = SupErrors.error_codes[ 'WORKER_ALREADY_DONE'] job.supErrorDiag = 'worker already {0} at {1} with {2}'.format( workerSpec.status, str(workerSpec.endTime), workerSpec.diagMessage) job.supErrorDiag = JobSpec.truncateStringAttr( 'supErrorDiag', job.supErrorDiag) else: # job recovery failed job.jobDispatcherErrorCode = ErrorCode.EC_Recovery job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % ( self.sleepTime / 60) # set job status job.jobStatus = 'failed' # set endTime for lost heartbeat if job.endTime == 'NULL': # normal lost heartbeat job.endTime = job.modificationTime # set files status for file in job.Files: if file.type == 'output' or file.type == 'log': file.status = 'failed' if file.destinationDBlock not in destDBList: destDBList.append(file.destinationDBlock) # event service if EventServiceUtils.isEventServiceJob( job ) and not EventServiceUtils.isJobCloningJob(job): eventStat = self.taskBuffer.getEventStat( job.jediTaskID, job.PandaID) # set sub status when no sucessful events if EventServiceUtils.ST_finished not in eventStat: job.jobSubStatus = 'es_heartbeat' # update job self.taskBuffer.updateJobs([job], False) # start closer if job.jobStatus == 'failed': source = 'jobDispatcherErrorCode' error_code = job.jobDispatcherErrorCode error_diag = job.jobDispatcherErrorDiag try: _logger.debug( "Watcher will call apply_retrial_rules") retryModule.apply_retrial_rules( self.taskBuffer, job.PandaID, source, error_code, error_diag, job.attemptNr) 
_logger.debug("apply_retrial_rules is back") except Exception as e: _logger.debug( "apply_retrial_rules excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc())) # updateJobs was successful and it failed a job with taskBufferErrorCode try: _logger.debug("Watcher.run will peek the job") job_tmp = self.taskBuffer.peekJobs( [job.PandaID], fromDefined=False, fromArchived=True, fromWaiting=False)[0] if job_tmp.taskBufferErrorCode: source = 'taskBufferErrorCode' error_code = job_tmp.taskBufferErrorCode error_diag = job_tmp.taskBufferErrorDiag _logger.debug( "Watcher.run 2 will call apply_retrial_rules" ) retryModule.apply_retrial_rules( self.taskBuffer, job_tmp.PandaID, source, error_code, error_diag, job_tmp.attemptNr) _logger.debug("apply_retrial_rules 2 is back") except IndexError: pass except Exception as e: self.logger.error( "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc())) cThr = Closer(self.taskBuffer, destDBList, job) cThr.start() cThr.join() _logger.debug('%s end' % job.PandaID) return # single action if self.single: return # sleep time.sleep(60 * self.sleepTime) except Exception: type, value, traceBack = sys.exc_info() _logger.error("run() : %s %s" % (type, value)) return
def run(self): try: _logger.debug('%s Start %s' % (self.pandaID,self.job.jobStatus)) flagComplete = True topUserDsList = [] usingMerger = False disableNotifier = False firstIndvDS = True finalStatusDS = [] for destinationDBlock in self.destinationDBlocks: dsList = [] _logger.debug('%s start %s' % (self.pandaID,destinationDBlock)) # ignore tid datasets if re.search('_tid[\d_]+$',destinationDBlock): _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock)) continue # ignore HC datasets if re.search('^hc_test\.',destinationDBlock) is not None or re.search('^user\.gangarbt\.',destinationDBlock) is not None: if re.search('_sub\d+$',destinationDBlock) is None and re.search('\.lib$',destinationDBlock) is None: _logger.debug('%s skip HC %s' % (self.pandaID,destinationDBlock)) continue # query dataset if destinationDBlock in self.datasetMap: dataset = self.datasetMap[destinationDBlock] else: dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock}) if dataset is None: _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock)) flagComplete = False continue # skip tobedeleted/tobeclosed if dataset.status in ['cleanup','tobeclosed','completed','deleted']: _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status)) continue dsList.append(dataset) # sort dsList.sort() # count number of completed files notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock, 'status':'unknown'}) if notFinish < 0: _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish)) flagComplete = False continue # check if completed _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish)) if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']: # close non-DQ2 destinationDBlock immediately finalStatus = 'closed' elif self.job.lockedby == 'jedi' and self.isTopLevelDS(destinationDBlock): # set it closed in order not to trigger DDM cleanup. 
It will be closed by JEDI finalStatus = 'closed' elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \ and self.job.processingType != 'usermerge': # merge output files if firstIndvDS: # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS finalStatus = 'tobemerged' firstIndvDS = False else: finalStatus = 'tobeclosed' # set merging to top dataset usingMerger = True # disable Notifier disableNotifier = True elif self.job.produceUnMerge(): finalStatus = 'doing' else: # set status to 'tobeclosed' to trigger DQ2 closing finalStatus = 'tobeclosed' if notFinish == 0 and EventServiceUtils.isEventServiceMerge(self.job): allInJobsetFinished = self.checkSubDatasetsInJobset() else: allInJobsetFinished = True if notFinish == 0 and allInJobsetFinished: _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock)) # set status dataset.status = finalStatus # update dataset in DB retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ", criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'}) if len(retT) > 0 and retT[0]==1: finalStatusDS += dsList # close user datasets if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \ and (dataset.name.startswith('user') or dataset.name.startswith('group')): # get top-level user dataset topUserDsName = re.sub('_sub\d+$','',dataset.name) # update if it is the first attempt if topUserDsName != dataset.name and not topUserDsName in topUserDsList and self.job.lockedby != 'jedi': topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName}) if topUserDs is not None: # check status if topUserDs.status in ['completed','cleanup','tobeclosed','deleted', 'tobemerged','merging']: _logger.debug('%s skip %s due to status=%s' % (self.pandaID,topUserDsName,topUserDs.status)) else: # set status if self.job.processingType.startswith('gangarobot') or \ self.job.processingType.startswith('hammercloud'): # not trigger freezing for HC datasets so that files can be appended topUserDs.status = 'completed' elif not usingMerger: topUserDs.status = finalStatus else: topUserDs.status = 'merging' # append to avoid repetition topUserDsList.append(topUserDsName) # update DB retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus", criteriaMap={':crStatus':topUserDs.status}) if len(retTopT) > 0 and retTopT[0]==1: _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName)) else: _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName)) # get parent dataset for merge job if self.job.processingType == 'usermerge': tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters) if tmpMatch is None: _logger.error('%s failed to extract parentDS' % self.pandaID) else: unmergedDsName = tmpMatch.group(1) # update if it is the first attempt if not unmergedDsName in topUserDsList: unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName}) if unmergedDs is None: _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName)) else: # check status if unmergedDs.status in ['completed','cleanup','tobeclosed']: _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status)) else: # set status unmergedDs.status = finalStatus # append to avoid repetition topUserDsList.append(unmergedDsName) # update DB retTopT = 
self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus", criteriaMap={':crStatus':unmergedDs.status}) if len(retTopT) > 0 and retTopT[0]==1: _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName)) else: _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName)) # start Activator if re.search('_sub\d+$',dataset.name) is None: if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']: # don't trigger Activator for merge jobs pass else: if self.job.jobStatus == 'finished': aThr = Activator(self.taskBuffer,dataset) aThr.start() aThr.join() else: # unset flag since another thread already updated #flagComplete = False pass else: # update dataset in DB self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ", criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'}) # unset flag flagComplete = False # end _logger.debug('%s end %s' % (self.pandaID,destinationDBlock)) # special actions for vo if flagComplete: closerPluginClass = panda_config.getPlugin('closer_plugins',self.job.VO) if closerPluginClass is None and self.job.VO == 'atlas': # use ATLAS plugin for ATLAS from pandaserver.dataservice.CloserAtlasPlugin import CloserAtlasPlugin closerPluginClass = CloserAtlasPlugin if closerPluginClass is not None: closerPlugin = closerPluginClass(self.job,finalStatusDS,_logger) closerPlugin.execute() # change pending jobs to failed finalizedFlag = True if flagComplete and self.job.prodSourceLabel=='user': _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID)) finalizedFlag = self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID) _logger.debug('%s finalized with %s' % (self.pandaID,finalizedFlag)) # update unmerged datasets in JEDI to trigger merging if flagComplete and self.job.produceUnMerge() and finalStatusDS != []: if finalizedFlag: tmpStat = self.taskBuffer.updateUnmergedDatasets(self.job,finalStatusDS) _logger.debug('%s updated unmerged datasets with %s' % (self.pandaID,tmpStat)) # start notifier _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete)) if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \ (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')) and \ self.job.lockedby != 'jedi': # don't send email for merge jobs if (not disableNotifier) and not self.job.processingType in ['merge','unmerge']: useNotifier = True summaryInfo = {} # check all jobDefIDs in jobsetID if not self.job.jobsetID in [0,None,'NULL']: useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID, self.job.prodUserName) _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier)) if useNotifier: _logger.debug('%s start Notifier' % self.pandaID) nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo) nThr.run() _logger.debug('%s end Notifier' % self.pandaID) _logger.debug('%s End' % self.pandaID) except Exception: errType,errValue = sys.exc_info()[:2] _logger.error("%s %s" % (errType,errValue))
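Condensed sketch of the final-status decision for a destination block in the Closer logic above; the boolean arguments stand in for the job attributes the real method inspects (destinationSE, lockedby, jobParameters, produceUnMerge).

def final_status(local_user_output, jedi_top_level, user_merge_output,
                 first_individual_ds, produce_unmerged):
    if local_user_output or jedi_top_level:
        return 'closed'        # closed immediately / left to JEDI
    if user_merge_output:
        return 'tobemerged' if first_individual_ds else 'tobeclosed'
    if produce_unmerged:
        return 'doing'
    return 'tobeclosed'        # normal case: trigger closing in DDM

print(final_status(False, False, True, True, False))    # tobemerged
print(final_status(False, False, False, False, False))  # tobeclosed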
def appendJob(self, job, siteMapperCache=None): # event service merge if EventServiceUtils.isEventServiceMerge(job): isEventServiceMerge = True else: isEventServiceMerge = False # PandaID self.data['PandaID'] = job.PandaID # prodSourceLabel self.data['prodSourceLabel'] = job.prodSourceLabel # swRelease self.data['swRelease'] = job.AtlasRelease # homepackage self.data['homepackage'] = job.homepackage # transformation self.data['transformation'] = job.transformation # job name self.data['jobName'] = job.jobName # job definition ID self.data['jobDefinitionID'] = job.jobDefinitionID # cloud self.data['cloud'] = job.cloud # files strIFiles = '' strOFiles = '' strDispatch = '' strDisToken = '' strDisTokenForOutput = '' strDestination = '' strRealDataset = '' strRealDatasetIn = '' strProdDBlock = '' strDestToken = '' strProdToken = '' strProdTokenForOutput = '' strGUID = '' strFSize = '' strCheckSum = '' strFileDestinationSE = '' strScopeIn = '' strScopeOut = '' strScopeLog = '' logFile = '' logGUID = '' ddmEndPointIn = [] ddmEndPointOut = [] noOutput = [] siteSpec = None inDsLfnMap = {} inLFNset = set() if siteMapperCache is not None: siteMapper = siteMapperCache.getObj() siteSpec = siteMapper.getSite(job.computingSite) # resolve destSE try: job.destinationSE = siteMapper.resolveNucleus( job.destinationSE) for tmpFile in job.Files: tmpFile.destinationSE = siteMapper.resolveNucleus( tmpFile.destinationSE) except Exception: pass siteMapperCache.releaseObj() for file in job.Files: if file.type == 'input': if EventServiceUtils.isJumboJob(job) and file.lfn in inLFNset: pass else: inLFNset.add(file.lfn) if strIFiles != '': strIFiles += ',' strIFiles += file.lfn if strDispatch != '': strDispatch += ',' strDispatch += file.dispatchDBlock if strDisToken != '': strDisToken += ',' strDisToken += file.dispatchDBlockToken strProdDBlock += '%s,' % file.prodDBlock if not isEventServiceMerge: strProdToken += '%s,' % file.prodDBlockToken else: strProdToken += '%s,' % job.metadata[1][file.lfn] if strGUID != '': strGUID += ',' strGUID += file.GUID strRealDatasetIn += '%s,' % file.dataset strFSize += '%s,' % file.fsize if file.checksum not in ['', 'NULL', None]: strCheckSum += '%s,' % file.checksum else: strCheckSum += '%s,' % file.md5sum strScopeIn += '%s,' % file.scope ddmEndPointIn.append( self.getDdmEndpoint(siteSpec, file.dispatchDBlockToken, 'input', job.prodSourceLabel, job.job_label)) if file.dataset not in inDsLfnMap: inDsLfnMap[file.dataset] = [] inDsLfnMap[file.dataset].append(file.lfn) if file.type == 'output' or file.type == 'log': if strOFiles != '': strOFiles += ',' strOFiles += file.lfn if strDestination != '': strDestination += ',' strDestination += file.destinationDBlock if strRealDataset != '': strRealDataset += ',' strRealDataset += file.dataset strFileDestinationSE += '%s,' % file.destinationSE if file.type == 'log': logFile = file.lfn logGUID = file.GUID strScopeLog = file.scope else: strScopeOut += '%s,' % file.scope if strDestToken != '': strDestToken += ',' strDestToken += re.sub( '^ddd:', 'dst:', file.destinationDBlockToken.split(',')[0]) strDisTokenForOutput += '%s,' % file.dispatchDBlockToken strProdTokenForOutput += '%s,' % file.prodDBlockToken ddmEndPointOut.append( self.getDdmEndpoint( siteSpec, file.destinationDBlockToken.split(',')[0], 'output', job.prodSourceLabel, job.job_label)) if file.isAllowedNoOutput(): noOutput.append(file.lfn) # inFiles self.data['inFiles'] = strIFiles # dispatch DBlock self.data['dispatchDblock'] = strDispatch # dispatch DBlock space token 
self.data['dispatchDBlockToken'] = strDisToken # dispatch DBlock space token for output self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1] # outFiles self.data['outFiles'] = strOFiles # destination DBlock self.data['destinationDblock'] = strDestination # destination DBlock space token self.data['destinationDBlockToken'] = strDestToken # prod DBlocks self.data['prodDBlocks'] = strProdDBlock[:-1] # prod DBlock space token self.data['prodDBlockToken'] = strProdToken[:-1] # real output datasets self.data['realDatasets'] = strRealDataset # real output datasets self.data['realDatasetsIn'] = strRealDatasetIn[:-1] # file's destinationSE self.data['fileDestinationSE'] = strFileDestinationSE[:-1] # log filename self.data['logFile'] = logFile # log GUID self.data['logGUID'] = logGUID # jobPars self.data['jobPars'], ppSteps = job.extractMultiStepExec() if ppSteps is not None: self.data.update(ppSteps) if job.to_encode_job_params(): self.data['jobPars'] = base64.b64encode( self.data['jobPars'].encode()).decode() # attempt number self.data['attemptNr'] = job.attemptNr # GUIDs self.data['GUID'] = strGUID # checksum self.data['checksum'] = strCheckSum[:-1] # fsize self.data['fsize'] = strFSize[:-1] # scope self.data['scopeIn'] = strScopeIn[:-1] self.data['scopeOut'] = strScopeOut[:-1] self.data['scopeLog'] = strScopeLog # DDM endpoints try: self.data['ddmEndPointIn'] = ','.join(ddmEndPointIn) except TypeError: self.data['ddmEndPointIn'] = '' try: self.data['ddmEndPointOut'] = ','.join(ddmEndPointOut) except TypeError: self.data['ddmEndPointOut'] = '' # destinationSE self.data['destinationSE'] = job.destinationSE # user ID self.data['prodUserID'] = job.prodUserID # CPU count self.data['maxCpuCount'] = job.maxCpuCount # RAM count self.data['minRamCount'] = job.minRamCount # disk count self.data['maxDiskCount'] = job.maxDiskCount # cmtconfig if ppSteps is None: self.data['cmtConfig'] = job.cmtConfig else: self.data['cmtConfig'] = '' # processingType self.data['processingType'] = job.processingType # transferType self.data['transferType'] = job.transferType # sourceSite self.data['sourceSite'] = job.sourceSite # current priority self.data['currentPriority'] = job.currentPriority # taskID if job.lockedby == 'jedi': self.data['taskID'] = job.jediTaskID else: self.data['taskID'] = job.taskID # core count if job.coreCount in ['NULL', None]: self.data['coreCount'] = 1 else: self.data['coreCount'] = job.coreCount # jobsetID self.data['jobsetID'] = job.jobsetID # nucleus self.data['nucleus'] = job.nucleus # walltime self.data['maxWalltime'] = job.maxWalltime # looping check if job.is_no_looping_check(): self.data['loopingCheck'] = False # debug mode if job.specialHandling is not None and 'debug' in job.specialHandling: self.data['debug'] = 'True' # event service or job cloning if EventServiceUtils.isJobCloningJob(job): self.data['cloneJob'] = EventServiceUtils.getJobCloningType(job) elif EventServiceUtils.isEventServiceJob( job) or EventServiceUtils.isJumboJob(job): self.data['eventService'] = 'True' # prod DBlock space token for pre-merging output self.data['prodDBlockTokenForOutput'] = strProdTokenForOutput[:-1] # event service merge if isEventServiceMerge: self.data['eventServiceMerge'] = 'True' # write to file for ES merge writeToFileStr = '' try: for outputName in job.metadata[0]: inputList = job.metadata[0][outputName] writeToFileStr += 'inputFor_{0}:'.format(outputName) for tmpInput in inputList: writeToFileStr += '{0},'.format(tmpInput) writeToFileStr = writeToFileStr[:-1] writeToFileStr += 
'^' writeToFileStr = writeToFileStr[:-1] except Exception: pass self.data['writeToFile'] = writeToFileStr elif job.writeInputToFile(): try: # write input to file writeToFileStr = '' for inDS in inDsLfnMap: inputList = inDsLfnMap[inDS] inDS = re.sub('/$', '', inDS) inDS = inDS.split(':')[-1] writeToFileStr += 'tmpin_{0}:'.format(inDS) writeToFileStr += ','.join(inputList) writeToFileStr += '^' writeToFileStr = writeToFileStr[:-1] self.data['writeToFile'] = writeToFileStr except Exception: pass # replace placeholder if EventServiceUtils.isJumboJob(job) or EventServiceUtils.isCoJumboJob( job): try: for inDS in inDsLfnMap: inputList = inDsLfnMap[inDS] inDS = re.sub('/$', '', inDS) inDS = inDS.split(':')[-1] srcStr = 'tmpin__cnt_{0}'.format(inDS) dstStr = ','.join(inputList) self.data['jobPars'] = self.data['jobPars'].replace( srcStr, dstStr) except Exception: pass # no output if noOutput != []: self.data['allowNoOutput'] = ','.join(noOutput) # alternative stage-out if job.getAltStgOut() is not None: self.data['altStageOut'] = job.getAltStgOut() # log to OS if job.putLogToOS(): self.data['putLogToOS'] = 'True' # suppress execute string conversion if job.noExecStrCnv(): self.data['noExecStrCnv'] = 'True' # in-file positional event number if job.inFilePosEvtNum(): self.data['inFilePosEvtNum'] = 'True' # use prefetcher if job.usePrefetcher(): self.data['usePrefetcher'] = 'True' # image name if job.container_name not in ['NULL', None]: self.data['container_name'] = job.container_name # IO self.data['ioIntensity'] = job.get_task_attribute('ioIntensity') self.data['ioIntensityUnit'] = job.get_task_attribute( 'ioIntensityUnit') # HPO if job.is_hpo_workflow(): self.data['isHPO'] = 'True' # VP if siteSpec is not None: scope_input, scope_output = DataServiceUtils.select_scope( siteSpec, job.prodSourceLabel, job.job_label) if siteSpec.use_vp(scope_input): self.data['useVP'] = 'True'
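Sketch of the writeToFile string built above for jobs that read their input list from a file: one 'tmpin_<dataset>:' section per input dataset, LFNs comma-separated, sections joined with '^'. The dataset and file names are made up.

import re

def build_write_to_file(in_ds_lfn_map):
    sections = []
    for in_ds, lfns in in_ds_lfn_map.items():
        ds = re.sub('/$', '', in_ds).split(':')[-1]    # strip scope prefix and trailing slash
        sections.append('tmpin_{0}:{1}'.format(ds, ','.join(lfns)))
    return '^'.join(sections)

print(build_write_to_file({'mc16:mc16.ds1/': ['a.root', 'b.root']}))
# -> tmpin_mc16.ds1:a.root,b.root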
def extractCommon(self, jediTaskID, taskParamMap, workQueueMapper, splitRule): # make task spec taskSpec = JediTaskSpec() taskSpec.jediTaskID = jediTaskID taskSpec.taskName = taskParamMap['taskName'] taskSpec.userName = taskParamMap['userName'] taskSpec.vo = taskParamMap['vo'] taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel'] taskSpec.taskPriority = taskParamMap['taskPriority'] taskSpec.currentPriority = taskSpec.taskPriority taskSpec.architecture = taskParamMap['architecture'] taskSpec.transUses = taskParamMap['transUses'] taskSpec.transHome = taskParamMap['transHome'] taskSpec.transPath = taskParamMap['transPath'] taskSpec.processingType = taskParamMap['processingType'] taskSpec.taskType = taskParamMap['taskType'] taskSpec.splitRule = splitRule taskSpec.startTime = datetime.datetime.utcnow() if taskParamMap.has_key('workingGroup'): taskSpec.workingGroup = taskParamMap['workingGroup'] if taskParamMap.has_key('countryGroup'): taskSpec.countryGroup = taskParamMap['countryGroup'] if taskParamMap.has_key('ticketID'): taskSpec.ticketID = taskParamMap['ticketID'] if taskParamMap.has_key('ticketSystemType'): taskSpec.ticketSystemType = taskParamMap['ticketSystemType'] if taskParamMap.has_key('reqID'): taskSpec.reqID = taskParamMap['reqID'] else: taskSpec.reqID = jediTaskID if taskParamMap.has_key('coreCount'): taskSpec.coreCount = taskParamMap['coreCount'] else: taskSpec.coreCount = 1 if taskParamMap.has_key('walltime'): taskSpec.walltime = taskParamMap['walltime'] else: taskSpec.walltime = 0 if taskParamMap.has_key('walltimeUnit'): taskSpec.walltimeUnit = taskParamMap['walltimeUnit'] if taskParamMap.has_key('outDiskCount'): taskSpec.outDiskCount = taskParamMap['outDiskCount'] else: taskSpec.outDiskCount = 0 if 'outDiskUnit' in taskParamMap: taskSpec.outDiskUnit = taskParamMap['outDiskUnit'] if taskParamMap.has_key('workDiskCount'): taskSpec.workDiskCount = taskParamMap['workDiskCount'] else: taskSpec.workDiskCount = 0 if taskParamMap.has_key('workDiskUnit'): taskSpec.workDiskUnit = taskParamMap['workDiskUnit'] if taskParamMap.has_key('ramCount'): taskSpec.ramCount = taskParamMap['ramCount'] else: taskSpec.ramCount = 0 # HS06 stuff if 'cpuTimeUnit' in taskParamMap: taskSpec.cpuTimeUnit = taskParamMap['cpuTimeUnit'] if 'cpuTime' in taskParamMap: taskSpec.cpuTime = taskParamMap['cpuTime'] if 'cpuEfficiency' in taskParamMap: taskSpec.cpuEfficiency = taskParamMap['cpuEfficiency'] else: # 90% of cpu efficiency by default taskSpec.cpuEfficiency = 90 if 'baseWalltime' in taskParamMap: taskSpec.baseWalltime = taskParamMap['baseWalltime'] else: # 10min of offset by default taskSpec.baseWalltime = 10 * 60 # for merge if 'mergeRamCount' in taskParamMap: taskSpec.mergeRamCount = taskParamMap['mergeRamCount'] if 'mergeCoreCount' in taskParamMap: taskSpec.mergeCoreCount = taskParamMap['mergeCoreCount'] # scout if not taskParamMap.has_key( 'skipScout') and not taskSpec.isPostScout(): taskSpec.setUseScout(True) # cloud if taskParamMap.has_key('cloud'): self.cloudName = taskParamMap['cloud'] taskSpec.cloud = self.cloudName else: # set dummy to force update taskSpec.cloud = 'dummy' taskSpec.cloud = None # site if taskParamMap.has_key('site'): self.siteName = taskParamMap['site'] taskSpec.site = self.siteName else: # set dummy to force update taskSpec.site = 'dummy' taskSpec.site = None # event service if taskParamMap.has_key('nEventsPerWorker'): taskSpec.eventService = 1 else: taskSpec.eventService = 0 # goal if 'goal' in taskParamMap: try: taskSpec.goal = int(float(taskParamMap['goal']) * 10) if 
taskSpec.goal >= 1000: taskSpec.goal = None except: pass # campaign if taskParamMap.has_key('campaign'): taskSpec.campaign = taskParamMap['campaign'] # work queue workQueue, tmpStr = workQueueMapper.getQueueWithSelParams( taskSpec.vo, taskSpec.prodSourceLabel, processingType=taskSpec.processingType, workingGroup=taskSpec.workingGroup, coreCount=taskSpec.coreCount, site=taskSpec.site) if workQueue == None: errStr = 'workqueue is undefined for vo={0} labal={1} '.format( taskSpec.vo, taskSpec.prodSourceLabel) errStr += 'processingType={0} workingGroup={1} coreCount={2} '.format( taskSpec.processingType, taskSpec.workingGroup, taskSpec.coreCount) raise RuntimeError, errStr taskSpec.workQueue_ID = workQueue.queue_id self.taskSpec = taskSpec # set split rule if 'tgtNumEventsPerJob' in taskParamMap: # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used if not 'nFilesPerJob' in taskParamMap: self.setSplitRule(None, taskParamMap['tgtNumEventsPerJob'], JediTaskSpec.splitRuleToken['nEventsPerJob']) self.setSplitRule(taskParamMap, 'nFilesPerJob', JediTaskSpec.splitRuleToken['nFilesPerJob']) self.setSplitRule(taskParamMap, 'nEventsPerJob', JediTaskSpec.splitRuleToken['nEventsPerJob']) self.setSplitRule(taskParamMap, 'nGBPerJob', JediTaskSpec.splitRuleToken['nGBPerJob']) self.setSplitRule(taskParamMap, 'nMaxFilesPerJob', JediTaskSpec.splitRuleToken['nMaxFilesPerJob']) self.setSplitRule(taskParamMap, 'nEventsPerWorker', JediTaskSpec.splitRuleToken['nEventsPerWorker']) self.setSplitRule(taskParamMap, 'useLocalIO', JediTaskSpec.splitRuleToken['useLocalIO']) self.setSplitRule(taskParamMap, 'disableAutoRetry', JediTaskSpec.splitRuleToken['disableAutoRetry']) self.setSplitRule(taskParamMap, 'nEsConsumers', JediTaskSpec.splitRuleToken['nEsConsumers']) self.setSplitRule(taskParamMap, 'waitInput', JediTaskSpec.splitRuleToken['waitInput']) self.setSplitRule(taskParamMap, 'addNthFieldToLFN', JediTaskSpec.splitRuleToken['addNthFieldToLFN']) self.setSplitRule(taskParamMap, 'scoutSuccessRate', JediTaskSpec.splitRuleToken['scoutSuccessRate']) self.setSplitRule(taskParamMap, 't1Weight', JediTaskSpec.splitRuleToken['t1Weight']) self.setSplitRule(taskParamMap, 'maxAttemptES', JediTaskSpec.splitRuleToken['maxAttemptES']) self.setSplitRule(taskParamMap, 'nSitesPerJob', JediTaskSpec.splitRuleToken['nSitesPerJob']) self.setSplitRule(taskParamMap, 'nEventsPerMergeJob', JediTaskSpec.splitRuleToken['nEventsPerMergeJob']) self.setSplitRule(taskParamMap, 'nFilesPerMergeJob', JediTaskSpec.splitRuleToken['nFilesPerMergeJob']) self.setSplitRule(taskParamMap, 'nGBPerMergeJob', JediTaskSpec.splitRuleToken['nGBPerMergeJob']) self.setSplitRule(taskParamMap, 'nMaxFilesPerMergeJob', JediTaskSpec.splitRuleToken['nMaxFilesPerMergeJob']) if taskParamMap.has_key('loadXML'): self.setSplitRule(None, 3, JediTaskSpec.splitRuleToken['loadXML']) self.setSplitRule(None, 4, JediTaskSpec.splitRuleToken['groupBoundaryID']) if taskParamMap.has_key('pfnList'): self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['pfnList']) if taskParamMap.has_key('noWaitParent'): self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['noWaitParent']) if 'respectLB' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['respectLB']) if taskParamMap.has_key('reuseSecOnDemand'): self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['reuseSecOnDemand']) if 'ddmBackEnd' in taskParamMap: self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd']) if 'disableReassign' in taskParamMap: self.setSplitRule(None, 1, 
JediTaskSpec.splitRuleToken['disableReassign']) if 'allowPartialFinish' in taskParamMap: self.setSplitRule( None, 1, JediTaskSpec.splitRuleToken['allowPartialFinish']) if 'useExhausted' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useExhausted']) if 'useRealNumEvents' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useRealNumEvents']) if 'ipConnectivity' in taskParamMap: self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity']) if 'runUntilClosed' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['runUntilClosed']) if 'stayOutputOnSite' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['stayOutputOnSite']) if 'useJobCloning' in taskParamMap: scValue = EventServiceUtils.getJobCloningValue( taskParamMap['useJobCloning']) self.setSplitRule(None, scValue, JediTaskSpec.splitRuleToken['useJobCloning']) if 'failWhenGoalUnreached' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['failGoalUnreached']) if 'switchEStoNormal' in taskParamMap: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['switchEStoNormal']) # return return
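The setSplitRule calls above pack many per-task knobs into a single splitRule string. Below is a minimal sketch of such an encoder, assuming a simple 'TOKEN=value' comma-separated layout; the actual JediTaskSpec encoding and token names may differ.

def set_split_rule(split_rule, token, value):
    # parse existing rules, overwrite or add the token, re-serialize
    rules = dict(item.split('=', 1) for item in split_rule.split(',') if item)
    rules[token] = str(value)
    return ','.join('{0}={1}'.format(k, v) for k, v in sorted(rules.items()))

rule = ''
rule = set_split_rule(rule, 'NEPJ', 1000)   # e.g. nEventsPerJob
rule = set_split_rule(rule, 'NFPJ', 5)      # e.g. nFilesPerJob
print(rule)  # NEPJ=1000,NFPJ=5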
def extractCommon(self,jediTaskID,taskParamMap,workQueueMapper,splitRule): # make task spec taskSpec = JediTaskSpec() taskSpec.jediTaskID = jediTaskID taskSpec.taskName = taskParamMap['taskName'] taskSpec.userName = taskParamMap['userName'] taskSpec.vo = taskParamMap['vo'] taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel'] taskSpec.taskPriority = taskParamMap['taskPriority'] if 'currentPriority' in taskParamMap: taskSpec.currentPriority = taskParamMap['currentPriority'] else: taskSpec.currentPriority = taskSpec.taskPriority taskSpec.architecture = taskParamMap['architecture'] taskSpec.transUses = taskParamMap['transUses'] taskSpec.transHome = taskParamMap['transHome'] taskSpec.transPath = taskParamMap['transPath'] taskSpec.processingType = taskParamMap['processingType'] taskSpec.taskType = taskParamMap['taskType'] taskSpec.splitRule = splitRule taskSpec.startTime = datetime.datetime.utcnow() if taskParamMap.has_key('workingGroup'): taskSpec.workingGroup = taskParamMap['workingGroup'] if taskParamMap.has_key('countryGroup'): taskSpec.countryGroup = taskParamMap['countryGroup'] if taskParamMap.has_key('ticketID'): taskSpec.ticketID = taskParamMap['ticketID'] if taskParamMap.has_key('ticketSystemType'): taskSpec.ticketSystemType = taskParamMap['ticketSystemType'] if taskParamMap.has_key('reqID'): taskSpec.reqID = taskParamMap['reqID'] else: taskSpec.reqID = jediTaskID if taskParamMap.has_key('coreCount'): taskSpec.coreCount = taskParamMap['coreCount'] else: taskSpec.coreCount = 1 if taskParamMap.has_key('walltime'): taskSpec.walltime = taskParamMap['walltime'] else: taskSpec.walltime = 0 if not taskParamMap.has_key('walltimeUnit'): # force to set NULL so that retried tasks get data from scouts again taskSpec.forceUpdate('walltimeUnit') if taskParamMap.has_key('outDiskCount'): taskSpec.outDiskCount = taskParamMap['outDiskCount'] else: taskSpec.outDiskCount = 0 if 'outDiskUnit' in taskParamMap: taskSpec.outDiskUnit = taskParamMap['outDiskUnit'] if taskParamMap.has_key('workDiskCount'): taskSpec.workDiskCount = taskParamMap['workDiskCount'] else: taskSpec.workDiskCount = 0 if taskParamMap.has_key('workDiskUnit'): taskSpec.workDiskUnit = taskParamMap['workDiskUnit'] if taskParamMap.has_key('ramCount'): taskSpec.ramCount = taskParamMap['ramCount'] else: taskSpec.ramCount = 0 if taskParamMap.has_key('ramUnit'): taskSpec.ramUnit = taskParamMap['ramUnit'] if taskParamMap.has_key('baseRamCount'): taskSpec.baseRamCount = taskParamMap['baseRamCount'] else: taskSpec.baseRamCount = 0 # IO if 'ioIntensity' in taskParamMap: taskSpec.ioIntensity = taskParamMap['ioIntensity'] if 'ioIntensityUnit' in taskParamMap: taskSpec.ioIntensityUnit = taskParamMap['ioIntensityUnit'] # HS06 stuff if 'cpuTimeUnit' in taskParamMap: taskSpec.cpuTimeUnit = taskParamMap['cpuTimeUnit'] if 'cpuTime' in taskParamMap: taskSpec.cpuTime = taskParamMap['cpuTime'] if 'cpuEfficiency' in taskParamMap: taskSpec.cpuEfficiency = taskParamMap['cpuEfficiency'] else: # 90% of cpu efficiency by default taskSpec.cpuEfficiency = 90 if 'baseWalltime' in taskParamMap: taskSpec.baseWalltime = taskParamMap['baseWalltime'] else: # 10min of offset by default taskSpec.baseWalltime = 10*60 # for merge if 'mergeRamCount' in taskParamMap: taskSpec.mergeRamCount = taskParamMap['mergeRamCount'] if 'mergeCoreCount' in taskParamMap: taskSpec.mergeCoreCount = taskParamMap['mergeCoreCount'] # scout if not taskParamMap.has_key('skipScout') and not taskSpec.isPostScout(): taskSpec.setUseScout(True) # cloud if taskParamMap.has_key('cloud'): self.cloudName 
= taskParamMap['cloud'] taskSpec.cloud = self.cloudName else: # set dummy to force update taskSpec.cloud = 'dummy' taskSpec.cloud = None # site if taskParamMap.has_key('site'): self.siteName = taskParamMap['site'] taskSpec.site = self.siteName else: # set dummy to force update taskSpec.site = 'dummy' taskSpec.site = None # nucleus if 'nucleus' in taskParamMap: taskSpec.nucleus = taskParamMap['nucleus'] # preset some parameters for job cloning if 'useJobCloning' in taskParamMap: # set implicit parameters if not 'nEventsPerWorker' in taskParamMap: taskParamMap['nEventsPerWorker'] = 1 if not 'nSitesPerJob' in taskParamMap: taskParamMap['nSitesPerJob'] = 2 if not 'nEsConsumers' in taskParamMap: taskParamMap['nEsConsumers'] = taskParamMap['nSitesPerJob'] # minimum granularity if 'minGranularity' in taskParamMap: taskParamMap['nEventsPerRange'] = taskParamMap['minGranularity'] # event service flag if 'useJobCloning' in taskParamMap: taskSpec.eventService = 2 elif taskParamMap.has_key('nEventsPerWorker'): taskSpec.eventService = 1 else: taskSpec.eventService = 0 # OS if 'osInfo' in taskParamMap: taskSpec.termCondition = taskParamMap['osInfo'] # ttcr: requested time to completion if taskParamMap.has_key('ttcrTimestamp'): try: # get rid of the +00:00 timezone string and parse the timestamp taskSpec.ttcRequested = datetime.datetime.strptime(taskParamMap['ttcrTimestamp'].split('+')[0], '%Y-%m-%d %H:%M:%S.%f') except (IndexError, ValueError): pass # goal if 'goal' in taskParamMap: try: taskSpec.goal = int(float(taskParamMap['goal'])*10) if taskSpec.goal > 1000: taskSpec.goal = None except: pass # campaign if taskParamMap.has_key('campaign'): taskSpec.campaign = taskParamMap['campaign'] # request type if 'requestType' in taskParamMap: taskSpec.requestType = taskParamMap['requestType'] self.taskSpec = taskSpec # set split rule if 'tgtNumEventsPerJob' in taskParamMap: # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used if not 'nFilesPerJob' in taskParamMap: self.setSplitRule(None,taskParamMap['tgtNumEventsPerJob'],JediTaskSpec.splitRuleToken['nEventsPerJob']) self.setSplitRule(taskParamMap,'nFilesPerJob', JediTaskSpec.splitRuleToken['nFilesPerJob']) self.setSplitRule(taskParamMap,'nEventsPerJob', JediTaskSpec.splitRuleToken['nEventsPerJob']) self.setSplitRule(taskParamMap,'nGBPerJob', JediTaskSpec.splitRuleToken['nGBPerJob']) self.setSplitRule(taskParamMap,'nMaxFilesPerJob', JediTaskSpec.splitRuleToken['nMaxFilesPerJob']) self.setSplitRule(taskParamMap,'nEventsPerWorker', JediTaskSpec.splitRuleToken['nEventsPerWorker']) self.setSplitRule(taskParamMap,'useLocalIO', JediTaskSpec.splitRuleToken['useLocalIO']) self.setSplitRule(taskParamMap,'disableAutoRetry', JediTaskSpec.splitRuleToken['disableAutoRetry']) self.setSplitRule(taskParamMap,'nEsConsumers', JediTaskSpec.splitRuleToken['nEsConsumers']) self.setSplitRule(taskParamMap,'waitInput', JediTaskSpec.splitRuleToken['waitInput']) self.setSplitRule(taskParamMap,'addNthFieldToLFN', JediTaskSpec.splitRuleToken['addNthFieldToLFN']) self.setSplitRule(taskParamMap,'scoutSuccessRate', JediTaskSpec.splitRuleToken['scoutSuccessRate']) self.setSplitRule(taskParamMap,'t1Weight', JediTaskSpec.splitRuleToken['t1Weight']) self.setSplitRule(taskParamMap,'maxAttemptES', JediTaskSpec.splitRuleToken['maxAttemptES']) self.setSplitRule(taskParamMap,'maxAttemptEsJob', JediTaskSpec.splitRuleToken['maxAttemptEsJob']) self.setSplitRule(taskParamMap,'nSitesPerJob', JediTaskSpec.splitRuleToken['nSitesPerJob']) 
self.setSplitRule(taskParamMap,'nEventsPerMergeJob', JediTaskSpec.splitRuleToken['nEventsPerMergeJob']) self.setSplitRule(taskParamMap,'nFilesPerMergeJob', JediTaskSpec.splitRuleToken['nFilesPerMergeJob']) self.setSplitRule(taskParamMap,'nGBPerMergeJob', JediTaskSpec.splitRuleToken['nGBPerMergeJob']) self.setSplitRule(taskParamMap,'nMaxFilesPerMergeJob', JediTaskSpec.splitRuleToken['nMaxFilesPerMergeJob']) self.setSplitRule(taskParamMap,'maxWalltime', JediTaskSpec.splitRuleToken['maxWalltime']) self.setSplitRule(taskParamMap,'tgtMaxOutputForNG', JediTaskSpec.splitRuleToken['tgtMaxOutputForNG']) if 'nJumboJobs' in taskParamMap: self.setSplitRule(taskParamMap,'nJumboJobs',JediTaskSpec.splitRuleToken['nJumboJobs']) taskSpec.useJumbo = JediTaskSpec.enum_useJumbo['waiting'] if 'maxJumboPerSite' in taskParamMap: self.setSplitRule(taskParamMap,'maxJumboPerSite',JediTaskSpec.splitRuleToken['maxJumboPerSite']) if 'minCpuEfficiency' in taskParamMap: self.setSplitRule(taskParamMap,'minCpuEfficiency',JediTaskSpec.splitRuleToken['minCpuEfficiency']) if taskParamMap.has_key('loadXML'): self.setSplitRule(None,3,JediTaskSpec.splitRuleToken['loadXML']) self.setSplitRule(None,4,JediTaskSpec.splitRuleToken['groupBoundaryID']) if taskParamMap.has_key('pfnList'): self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['pfnList']) if taskParamMap.has_key('noWaitParent') and taskParamMap['noWaitParent'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['noWaitParent']) if 'respectLB' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['respectLB']) if 'orderByLB' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['orderByLB']) if 'respectSplitRule' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['respectSplitRule']) if taskParamMap.has_key('reuseSecOnDemand'): self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['reuseSecOnDemand']) if 'ddmBackEnd' in taskParamMap: self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd']) if 'disableReassign' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['disableReassign']) if 'allowPartialFinish' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['allowPartialFinish']) if 'useExhausted' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['useExhausted']) if 'useRealNumEvents' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['useRealNumEvents']) if 'ipConnectivity' in taskParamMap: self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity']) if 'altStageOut' in taskParamMap: self.taskSpec.setAltStageOut(taskParamMap['altStageOut']) if 'allowInputLAN' in taskParamMap: self.taskSpec.setAllowInputLAN(taskParamMap['allowInputLAN']) if 'runUntilClosed' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['runUntilClosed']) if 'stayOutputOnSite' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['stayOutputOnSite']) if 'useJobCloning' in taskParamMap: scValue = EventServiceUtils.getJobCloningValue(taskParamMap['useJobCloning']) self.setSplitRule(None,scValue,JediTaskSpec.splitRuleToken['useJobCloning']) if 'failWhenGoalUnreached' in taskParamMap and taskParamMap['failWhenGoalUnreached'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['failGoalUnreached']) if 'switchEStoNormal' in taskParamMap: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['switchEStoNormal']) if 'nEventsPerRange' in taskParamMap: 
self.setSplitRule(taskParamMap,'nEventsPerRange',JediTaskSpec.splitRuleToken['dynamicNumEvents']) if 'allowInputWAN' in taskParamMap and taskParamMap['allowInputWAN'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['allowInputWAN']) if 'putLogToOS' in taskParamMap and taskParamMap['putLogToOS'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['putLogToOS']) if 'mergeEsOnOS' in taskParamMap and taskParamMap['mergeEsOnOS'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['mergeEsOnOS']) if 'writeInputToFile' in taskParamMap and taskParamMap['writeInputToFile'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['writeInputToFile']) if 'useFileAsSourceLFN' in taskParamMap and taskParamMap['useFileAsSourceLFN'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['useFileAsSourceLFN']) if 'ignoreMissingInDS' in taskParamMap and taskParamMap['ignoreMissingInDS'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['ignoreMissingInDS']) if 'noExecStrCnv' in taskParamMap and taskParamMap['noExecStrCnv'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['noExecStrCnv']) if 'inFilePosEvtNum' in taskParamMap and taskParamMap['inFilePosEvtNum'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['inFilePosEvtNum']) if self.taskSpec.useEventService() and not taskSpec.useJobCloning(): if 'registerEsFiles' in taskParamMap and taskParamMap['registerEsFiles'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['registerEsFiles']) if 'disableAutoFinish' in taskParamMap and taskParamMap['disableAutoFinish'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['disableAutoFinish']) if 'resurrectConsumers' in taskParamMap and taskParamMap['resurrectConsumers'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['resurrectConsumers']) if 'usePrefetcher' in taskParamMap and taskParamMap['usePrefetcher'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['usePrefetcher']) if 'notDiscardEvents' in taskParamMap and taskParamMap['notDiscardEvents'] == True: self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['notDiscardEvents']) if 'decAttOnFailedES' in taskParamMap and taskParamMap['decAttOnFailedES'] is True: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['decAttOnFailedES']) if 'useZipToPin' in taskParamMap and taskParamMap['useZipToPin'] is True: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useZipToPin']) if 'osMatching' in taskParamMap and taskParamMap['osMatching'] is True: self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['osMatching']) # work queue workQueue = None if 'workQueueName' in taskParamMap: # work queue is specified workQueue = workQueueMapper.getQueueByName(taskSpec.vo, taskSpec.prodSourceLabel, taskParamMap['workQueueName']) if workQueue is None: # get work queue based on task attributes workQueue,tmpStr = workQueueMapper.getQueueWithSelParams(taskSpec.vo, taskSpec.prodSourceLabel, prodSourceLabel=taskSpec.prodSourceLabel, processingType=taskSpec.processingType, workingGroup=taskSpec.workingGroup, coreCount=taskSpec.coreCount, site=taskSpec.site, eventService=taskSpec.eventService, splitRule=taskSpec.splitRule, campaign=taskSpec.campaign) if workQueue is None: errStr = 'workqueue is undefined for vo={0} label={1} '.format(taskSpec.vo,taskSpec.prodSourceLabel) errStr += 'processingType={0} workingGroup={1} coreCount={2} eventService={3} '.format(taskSpec.processingType, taskSpec.workingGroup, taskSpec.coreCount, taskSpec.eventService) errStr += 
'splitRule={0} campaign={1}'.format(taskSpec.splitRule,taskSpec.campaign) raise RuntimeError,errStr self.taskSpec.workQueue_ID = workQueue.queue_id # Initialize the global share gshare = None if 'gshare' in taskParamMap and self.taskBufferIF.is_valid_share(taskParamMap['gshare']): # work queue is specified gshare = taskParamMap['gshare'] else: # get share based on definition gshare = self.taskBufferIF.get_share_for_task(self.taskSpec) if gshare is None: gshare = 'Undefined' # Should not happen. Undefined is set when no share is found # errStr = 'share is undefined for vo={0} label={1} '.format(taskSpec.vo,taskSpec.prodSourceLabel) # errStr += 'workingGroup={0} campaign={1} '.format(taskSpec.workingGroup, taskSpec.campaign) # raise RuntimeError,errStr self.taskSpec.gshare = gshare # Initialize the resource type try: self.taskSpec.resource_type = self.taskBufferIF.get_resource_type_task(self.taskSpec) except: self.taskSpec.resource_type = 'Undefined' # return return
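Sketch of the global-share fallback above: take the share from the task parameters if it is valid, otherwise derive one from the task, and fall back to 'Undefined' when nothing matches. The two callables are stand-ins for the taskBuffer interface methods.

def resolve_gshare(task_params, is_valid_share, share_for_task):
    if 'gshare' in task_params and is_valid_share(task_params['gshare']):
        return task_params['gshare']
    return share_for_task() or 'Undefined'   # 'Undefined' when no share is found

print(resolve_gshare({'gshare': 'Express'}, lambda s: s == 'Express', lambda: None))  # Express
print(resolve_gshare({}, lambda s: False, lambda: None))                              # Undefined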
def doPostProcess(self,taskSpec,tmpLog): # pre-check try: tmpStat = self.doPreCheck(taskSpec,tmpLog) if tmpStat: return self.SC_SUCCEEDED except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('doPreCheck failed with {0}:{1}'.format(errtype.__name__,errvalue)) return self.SC_FATAL # get DDM I/F ddmIF = self.ddmIF.getInterface(taskSpec.vo) # loop over all datasets for datasetSpec in taskSpec.datasetSpecList: # skip pseudo output datasets if datasetSpec.type in ['output'] and datasetSpec.isPseudo(): continue try: # remove wrong files if datasetSpec.type in ['output']: # get successful files okFiles = self.taskBufferIF.getSuccessfulFiles_JEDI(datasetSpec.jediTaskID,datasetSpec.datasetID) if okFiles == None: tmpLog.warning('failed to get successful files for {0}'.format(datasetSpec.datasetName)) return self.SC_FAILED # get files in dataset ddmFiles = ddmIF.getFilesInDataset(datasetSpec.datasetName,skipDuplicate=False,ignoreUnknown=True) tmpLog.debug('datasetID={0}:Name={1} has {2} files in DB, {3} files in DDM'.format(datasetSpec.datasetID, datasetSpec.datasetName, len(okFiles),len(ddmFiles))) # check all files toDelete = [] for tmpGUID,attMap in ddmFiles.iteritems(): if attMap['lfn'] not in okFiles: did = {'scope':attMap['scope'], 'name':attMap['lfn']} toDelete.append(did) tmpLog.debug('delete {0} from {1}'.format(attMap['lfn'],datasetSpec.datasetName)) # delete if toDelete != []: ddmIF.deleteFilesFromDataset(datasetSpec.datasetName,toDelete) except: errtype,errvalue = sys.exc_info()[:2] tmpLog.warning('failed to remove wrong files with {0}:{1}'.format(errtype.__name__,errvalue)) return self.SC_FAILED try: # freeze output and log datasets if datasetSpec.type in ['output','log','trn_log']: tmpLog.info('freezing datasetID={0}:Name={1}'.format(datasetSpec.datasetID,datasetSpec.datasetName)) ddmIF.freezeDataset(datasetSpec.datasetName,ignoreUnknown=True) except: errtype,errvalue = sys.exc_info()[:2] tmpLog.warning('failed to freeze datasets with {0}:{1}'.format(errtype.__name__,errvalue)) return self.SC_FAILED try: # delete transient datasets if datasetSpec.type in ['trn_output']: tmpLog.debug('deleting datasetID={0}:Name={1}'.format(datasetSpec.datasetID,datasetSpec.datasetName)) retStr = ddmIF.deleteDataset(datasetSpec.datasetName,False,ignoreUnknown=True) tmpLog.info(retStr) except: errtype,errvalue = sys.exc_info()[:2] tmpLog.warning('failed to delete datasets with {0}:{1}'.format(errtype.__name__,errvalue)) # check duplication if self.getFinalTaskStatus(taskSpec) in ['finished','done']: nDup = self.taskBufferIF.checkDuplication_JEDI(taskSpec.jediTaskID) tmpLog.debug('checked duplication with {0}'.format(nDup)) if nDup > 0: errStr = 'paused since {0} duplication found'.format(nDup) taskSpec.oldStatus = self.getFinalTaskStatus(taskSpec) taskSpec.status = 'paused' taskSpec.setErrDiag(errStr) tmpLog.debug(errStr) # delete ES datasets if taskSpec.registerEsFiles(): try: targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID) tmpLog.debug('deleting ES dataset name={0}'.format(targetName)) retStr = ddmIF.deleteDataset(targetName,False,ignoreUnknown=True) tmpLog.debug(retStr) except: errtype,errvalue = sys.exc_info()[:2] tmpLog.warning('failed to delete ES dataset with {0}:{1}'.format(errtype.__name__,errvalue)) try: self.doBasicPostProcess(taskSpec,tmpLog) except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('doBasicPostProcess failed with {0}:{1}'.format(errtype.__name__,errvalue)) return self.SC_FATAL return self.SC_SUCCEEDED
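Sketch of the wrong-file cleanup in the post-processing above: anything listed in DDM whose LFN is not among the successful files becomes a DID to delete from the dataset. Plain dicts replace the DDM client objects.

def files_to_delete(ddm_files, ok_lfns):
    to_delete = []
    for guid, att in ddm_files.items():
        if att['lfn'] not in ok_lfns:
            to_delete.append({'scope': att['scope'], 'name': att['lfn']})
    return to_delete

ddm = {'guid1': {'scope': 'mc16', 'lfn': 'good.root'},
       'guid2': {'scope': 'mc16', 'lfn': 'orphan.root'}}
print(files_to_delete(ddm, {'good.root'}))  # only orphan.root is scheduled for deletion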