def doRefine(self,jediTaskID,taskParamMap):
    """Refine task parameters: run the basic refinement, then apply
    DBR attributes, output filename attempt numbers, and dataset
    destinations.

    :param jediTaskID: task ID (used only by the caller's contract; not read here)
    :param taskParamMap: raw task parameter map passed to doBasicRefine
    :return: self.SC_SUCCEEDED on success; re-raises the original exception on failure
    """
    # make logger
    tmpLog = self.tmpLog
    tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
    try:
        self.doBasicRefine(taskParamMap)
        # set nosplit+repeat for DBR
        for datasetSpec in self.inSecDatasetSpecList:
            if DataServiceUtils.isDBR(datasetSpec.datasetName):
                datasetSpec.attributes = 'repeat,nosplit'
        # append attempt number to output filename templates unless one is
        # already present or the file is a pseudo merge output (.panda.um)
        for tmpKey,tmpOutTemplateMapList in self.outputTemplateMap.items():
            for tmpOutTemplateMap in tmpOutTemplateMapList:
                outFileTemplate = tmpOutTemplateMap['filenameTemplate']
                # raw string for the regexp to avoid invalid escape sequences
                if re.search(r'\.\d+$',outFileTemplate) is None and not outFileTemplate.endswith('.panda.um'):
                    tmpOutTemplateMap['filenameTemplate'] = outFileTemplate + '.1'
        # set destination if necessary
        for datasetSpec in self.outDatasetSpecList:
            storageToken = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
            if storageToken is not None:
                tmpSiteList = self.ddmIF.getInterface(self.taskSpec.vo).getSitesWithEndPoint(storageToken,self.siteMapper,'production')
                if tmpSiteList == []:
                    raise RuntimeError('cannot find online siteID associated to {0}'.format(storageToken))
                # first matching site wins
                datasetSpec.destination = tmpSiteList[0]
        # set to register datasets
        #self.taskSpec.setToRegisterDatasets()
    except Exception:
        errtype,errvalue = sys.exc_info()[:2]
        tmpLog.error('doBasicRefine failed with {0}:{1}'.format(errtype.__name__,errvalue))
        # re-raise with the original traceback preserved
        raise
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def doRefine(self,jediTaskID,taskParamMap):
    """Refine task parameters with optional preprocessing: resolve the
    latest DBR, set destinations, split rules, template datasets, the
    jobsetID, and site limitations.

    :param jediTaskID: task ID (caller's contract; not read directly here)
    :param taskParamMap: raw task parameter map; may be replaced by doPreProRefine
    :return: self.SC_SUCCEEDED or self.SC_FAILED; re-raises on unexpected errors
    """
    # make logger
    tmpLog = self.tmpLog
    tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
    try:
        # preprocessing
        tmpStat,taskParamMap = self.doPreProRefine(taskParamMap)
        # tri-state: True=done, False=failed, anything else=continue with normal refine
        if tmpStat == True:
            tmpLog.debug('done for preprocessing')
            return self.SC_SUCCEEDED
        if tmpStat == False:
            # failed
            tmpLog.error('doPreProRefine failed')
            return self.SC_FAILED
        # normal refine
        self.doBasicRefine(taskParamMap)
        # set nosplit+repeat for DBR
        for datasetSpec in self.inSecDatasetSpecList:
            # get the latest version of DBR
            if datasetSpec.datasetName == 'DBR_LATEST':
                tmpLog.debug('resolving real name for {0}'.format(datasetSpec.datasetName))
                datasetSpec.datasetName = self.ddmIF.getInterface(self.taskSpec.vo).getLatestDBRelease(useResultCache=3600)
                datasetSpec.containerName = datasetSpec.datasetName
            # set attributes to DBR
            if DataServiceUtils.isDBR(datasetSpec.datasetName):
                datasetSpec.attributes = 'repeat,nosplit'
        # destination
        if 'destination' in taskParamMap:
            for datasetSpec in self.outDatasetSpecList:
                datasetSpec.destination = taskParamMap['destination']
        # use build
        if 'buildSpec' in taskParamMap:
            self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['useBuild'])
        # use template dataset
        self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['instantiateTmpl'])
        self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['instantiateTmplSite'])
        for datasetSpec in self.outDatasetSpecList:
            datasetSpec.type = "tmpl_{0}".format(datasetSpec.type)
        # get jobsetID
        tmpStat,tmpJobID = self.taskBufferIF.getUserJobsetID_JEDI(self.taskSpec.userName)
        if not tmpStat:
            tmpLog.error('failed to get jobsetID')
            return self.SC_FAILED
        self.taskSpec.reqID = tmpJobID
        # site limitation
        if 'excludedSite' in taskParamMap and 'includedSite' in taskParamMap:
            self.taskSpec.setLimitedSites('incexc')
        elif 'excludedSite' in taskParamMap:
            self.taskSpec.setLimitedSites('exc')
        elif 'includedSite' in taskParamMap:
            self.taskSpec.setLimitedSites('inc')
    except Exception:
        errtype,errvalue = sys.exc_info()[:2]
        errStr = 'doRefine failed with {0}:{1}'.format(errtype.__name__,errvalue)
        tmpLog.error(errStr)
        self.taskSpec.setErrDiag(errStr,None)
        # re-raise with the original traceback preserved
        raise
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def doCheck(self,taskSpecList):
    """Check task assignment with the PanDA server and build the map of
    assigned clouds/nuclei per task.

    :param taskSpecList: list of task specs to check
    :return: (status, retMap) where retMap maps jediTaskID to either the
             assigned core name or, for WORLD-cloud tasks, a dict with
             per-dataset destination tokens and the nucleus
    """
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doCheck')
    # return values for failure
    retFatal = self.SC_FATAL,{}
    retTmpError = self.SC_FAILED,{}
    # get list of jediTaskIDs
    taskIdList = []
    taskSpecMap = {}
    for taskSpec in taskSpecList:
        taskIdList.append(taskSpec.jediTaskID)
        taskSpecMap[taskSpec.jediTaskID] = taskSpec
    # check with panda
    tmpLog.debug('check with panda')
    tmpPandaStatus,cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
    if tmpPandaStatus != 0:
        tmpLog.error('failed to see clouds')
        return retTmpError
    # make return map
    retMap = {}
    for tmpTaskID,tmpCoreName in cloudsInPanda.items():
        tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID,tmpCoreName))
        # skip tasks which are not yet assigned
        if tmpCoreName not in ['NULL','',None]:
            taskSpec = taskSpecMap[tmpTaskID]
            if taskSpec.useWorldCloud():
                # get destinations for WORLD cloud
                ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                # get site
                siteSpec = self.siteMapper.getSite(tmpCoreName)
                # get nucleus
                nucleus = siteSpec.pandasite
                # get output/log datasets
                tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID,['output','log'])
                # get destinations
                retMap[tmpTaskID] = {'datasets':[],'nucleus':nucleus}
                for datasetSpec in tmpDatasetSpecs:
                    # skip distributed datasets
                    if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                        continue
                    # get token
                    token = ddmIF.convertTokenToEndpoint(siteSpec.ddm,datasetSpec.storageToken)
                    # use default endpoint
                    if token is None:
                        token = siteSpec.ddm
                    # add original token as a suffix
                    if datasetSpec.storageToken not in ['',None]:
                        token += '/{0}'.format(datasetSpec.storageToken)
                    retMap[tmpTaskID]['datasets'].append({'datasetID':datasetSpec.datasetID,
                                                          'token':'dst:{0}'.format(token),
                                                          'destination':tmpCoreName})
            else:
                # non-WORLD cloud: just report the assigned core name
                retMap[tmpTaskID] = tmpCoreName
    tmpLog.debug('ret {0}'.format(str(retMap)))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED,retMap
def getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs):
    """Build the dict used to set a nucleus as destination for a task's
    output/log datasets.

    :param nucleusSpec: nucleus spec providing name and associated endpoints
    :param tmpDatasetSpecs: output/log dataset specs of the task
    :return: dict with 'nucleus' (name) and 'datasets' (per-dataset
             datasetID, 'dst:' token, and 'nucleus:' destination)
    """
    # get destinations
    retMap = {'datasets':[],'nucleus':nucleusSpec.name}
    for datasetSpec in tmpDatasetSpecs:
        # skip distributed datasets
        if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
            continue
        # get token; getAssoicatedEndpoint is the (misspelled) project API name
        token = nucleusSpec.getAssoicatedEndpoint(datasetSpec.storageToken)['ddm_endpoint_name']
        # add original token (last path component only)
        if datasetSpec.storageToken not in ['',None]:
            token += '/{0}'.format(datasetSpec.storageToken.split('/')[-1])
        retMap['datasets'].append({'datasetID':datasetSpec.datasetID,
                                   'token':'dst:{0}'.format(token),
                                   'destination':'nucleus:{0}'.format(nucleusSpec.name)})
    return retMap
def runImpl(self):
    """Worker-thread main loop: pull (taskSpec, inputChunk) pairs from the
    input queue and assign a nucleus (destination) to each task by
    filtering candidate nuclei on status, endpoints, data locality and
    job-running ability, then picking one by weighted random selection.

    NOTE(review): SOURCE was collapsed onto single physical lines; the
    block structure (indentation) below is reconstructed and should be
    confirmed against the original file.
    """
    # cutoff for disk in TB
    diskThreshold = 5 * 1024
    # dataset type to ignore file availability check
    datasetTypeToSkipCheck = ['log']
    # input-size/number thresholds below which locality is not enforced
    thrInputSize = 1024*1024*1024
    thrInputNum = 100
    thrInputSizeFrac = 0.1
    thrInputNumFrac = 0.1
    # RW (remaining work) value at/above which the weight is de-rated
    cutOffRW = 50
    # penalty factor applied when some input is only on TAPE
    negWeightTape = 0.001
    # main
    lastJediTaskID = None
    siteMapper = self.taskBufferIF.getSiteMapper()
    while True:
        try:
            taskInputList = self.inputList.get(1)
            # no more datasets
            if len(taskInputList) == 0:
                self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__, self.numTasks))
                return
            # loop over all tasks
            for taskSpec,inputChunk in taskInputList:
                lastJediTaskID = taskSpec.jediTaskID
                # make logger
                tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='{0}'.format(taskSpec.jediTaskID))
                tmpLog.debug('start')
                # get nuclei
                nucleusList = siteMapper.nuclei
                if taskSpec.nucleus in nucleusList:
                    # nucleus pre-assigned on the task; skip all filtering
                    candidateNucleus = taskSpec.nucleus
                else:
                    tmpLog.debug('got {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # check status
                    newNucleusList = {}
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if not tmpNucleusSpec.state in ['ACTIVE']:
                            tmpLog.debug(' skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus, tmpNucleusSpec.state))
                        else:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed status check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # check endpoint: each output/log dataset must have an
                    # associated endpoint with enough space at the nucleus
                    newNucleusList = {}
                    tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID, ['output','log'])
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        toSkip = False
                        for tmpDatasetSpec in tmpDatasetSpecList:
                            # ignore distributed datasets
                            if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                continue
                            # get endpoint with the pattern
                            tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                            if tmpEP == None:
                                tmpLog.debug(' skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus, tmpDatasetSpec.storageToken))
                                toSkip = True
                                break
                            # check state
                            """
                            if not tmpEP['state'] in ['ACTIVE']:
                                tmpLog.debug(' skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus, tmpEP['ddm_endpoint_name'], tmpEP['state']))
                                toSkip = True
                                break
                            """
                            # check space
                            # NOTE(review): units of space_free/space_expired vs diskThreshold
                            # (declared "in TB" above) are not visible here -- confirm
                            tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                            if tmpSpaceSize < diskThreshold:
                                # NOTE(review): message formats tmpEP['state'] where the
                                # endpoint name was presumably intended -- confirm
                                tmpLog.debug(' skip nucleus={0} since disk shortage ({1}<{2}) at endpoint {3} criteria=-space'.format(tmpNucleus, tmpSpaceSize, diskThreshold, tmpEP['state']))
                                toSkip = True
                                break
                        if not toSkip:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed endpoint check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # data locality: accumulate per-nucleus availability over
                    # all real input datasets
                    toSkip = False
                    availableData = {}
                    for datasetSpec in inputChunk.getDatasets():
                        # only for real datasets
                        if datasetSpec.isPseudo():
                            continue
                        # ignore DBR
                        if DataServiceUtils.isDBR(datasetSpec.datasetName):
                            continue
                        # skip locality check
                        if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                            continue
                        # get nuclei where data is available
                        tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF, datasetSpec.datasetName, nucleusList.keys())
                        if tmpSt != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            toSkip = True
                            break
                        # sum per-nucleus counters over datasets
                        for tmpNucleus,tmpVals in tmpRet.iteritems():
                            if not tmpNucleus in availableData:
                                availableData[tmpNucleus] = tmpVals
                            else:
                                availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                    if toSkip:
                        continue
                    if availableData != {}:
                        newNucleusList = {}
                        # skip if no data
                        for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                            if availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                    availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                tmpLog.debug(' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus, availableData[tmpNucleus]['ava_size_any'], availableData[tmpNucleus]['tot_size'], thrInputSizeFrac))
                            elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                    availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                tmpLog.debug(' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus, availableData[tmpNucleus]['ava_num_any'], availableData[tmpNucleus]['tot_num'], thrInputNumFrac))
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        nucleusList = newNucleusList
                        tmpLog.debug('{0} candidates passed data check'.format(len(nucleusList)))
                        if nucleusList == {}:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # ability to execute jobs
                    newNucleusList = {}
                    # get all panda sites
                    tmpSiteList = []
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        tmpSiteList += tmpNucleusSpec.allPandaSites
                    tmpSiteList = list(set(tmpSiteList))
                    tmpLog.debug('===== start for job check')
                    jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                    tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True, tmpSiteList,tmpLog)
                    tmpLog.debug('===== done for job check')
                    if tmpSt != Interaction.SC_SUCCEEDED:
                        tmpLog.debug('failed to get sites where jobs can run. Use any nuclei where input is available')
                        # use any nuclei where input is available if no sites can run jobs
                        tmpRet = tmpSiteList
                    okNuclei = set()
                    for tmpSite in tmpRet:
                        siteSpec = siteMapper.getSite(tmpSite)
                        okNuclei.add(siteSpec.pandasite)
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleus in okNuclei:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                        else:
                            tmpLog.debug(' skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed job check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # RW
                    taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                    ######################################
                    # weight
                    self.prioRW.acquire()
                    nucleusRW = self.prioRW[taskSpec.currentPriority]
                    self.prioRW.release()
                    totalWeight = 0
                    nucleusweights = []
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if not tmpNucleus in nucleusRW:
                            nucleusRW[tmpNucleus] = 0
                        # wStr records the weight formula for the log message
                        wStr = '1'
                        # with RW
                        if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                            weight = 1 / float(nucleusRW[tmpNucleus])
                            wStr += '/({0}=RW)'.format(nucleusRW[tmpNucleus])
                        else:
                            weight = 1
                            wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                        # with data
                        if availableData != {}:
                            weight *= float(availableData[tmpNucleus]['ava_size_any'])
                            weight /= float(availableData[tmpNucleus]['tot_size'])
                            wStr += '*({0}=available input size on DISK/TAPE)'.format(availableData[tmpNucleus]['ava_size_any'])
                            wStr += '/({0}=total input size)'.format(availableData[tmpNucleus]['tot_size'])
                            # negative weight for tape
                            if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                weight *= negWeightTape
                                wStr += '*({0}=weight for TAPE)'.format(negWeightTape)
                        tmpLog.debug(' use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                        totalWeight += weight
                        nucleusweights.append((tmpNucleus,weight))
                    tmpLog.debug('final {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # final selection: weighted random pick
                    tgtWeight = random.uniform(0,totalWeight)
                    candidateNucleus = None
                    for tmpNucleus,weight in nucleusweights:
                        tgtWeight -= weight
                        if tgtWeight <= 0:
                            candidateNucleus = tmpNucleus
                            break
                    if candidateNucleus == None:
                        # fall back to the last candidate (rounding residue)
                        candidateNucleus = nucleusweights[-1][0]
                ######################################
                # update
                nucleusSpec = nucleusList[candidateNucleus]
                # get output/log datasets
                tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID, ['output','log'])
                # get destinations
                retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                tmpLog.info(' set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                # update RW table
                # NOTE(review): taskRW is computed only on the non-pre-assigned
                # branch above; a task arriving with a pre-assigned nucleus
                # would hit a NameError here -- confirm against the full file
                self.prioRW.acquire()
                for prio,rwMap in self.prioRW.iteritems():
                    if prio > taskSpec.currentPriority:
                        continue
                    if candidateNucleus in rwMap:
                        rwMap[candidateNucleus] += taskRW
                    else:
                        rwMap[candidateNucleus] = taskRW
                self.prioRW.release()
        except:
            errtype,errvalue = sys.exc_info()[:2]
            errMsg = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
            errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
            errMsg += traceback.format_exc()
            logger.error(errMsg)
def getSitesWithData(siteMapper,ddmIF,datasetName,storageToken=None):
    """Collect, per cloud, the T1 endpoints and T2 sites holding replicas
    of a dataset (or container).

    :param siteMapper: site mapper providing cloud specs
    :param ddmIF: DDM interface used for replica/site lookups
    :param datasetName: dataset name; a trailing '/' denotes a container
    :param storageToken: optional space token to filter T1 endpoints
    :return: (Interaction.SC_SUCCEEDED, retMap) on success where retMap maps
             cloud name to {'t1': {endpoint: {'tape','state'}}, 't2': [sites]};
             (exception_type, error_string) on lookup failure
    """
    # get num of constituents (1 for a plain dataset)
    try:
        if not datasetName.endswith('/'):
            totalNumDatasets = 1
        else:
            tmpDsMap = ddmIF.listDatasetsInContainer(datasetName)
            totalNumDatasets = len(tmpDsMap)
    except Exception:
        errtype,errvalue = sys.exc_info()[:2]
        return errtype,'ddmIF.listDatasetsInContainer failed with %s' % errvalue
    # get replicas
    try:
        replicaMap = {}
        replicaMap[datasetName] = ddmIF.listDatasetReplicas(datasetName)
    except Exception:
        errtype,errvalue = sys.exc_info()[:2]
        return errtype,'ddmIF.listDatasetReplicas failed with %s' % errvalue
    # loop over all clouds
    retMap = {}
    for tmpCloudName in siteMapper.cloudSpec.keys():
        retMap[tmpCloudName] = {'t1':{},'t2':[]}
        # get T1 DDM endpoints
        tmpCloudSpec = siteMapper.getCloud(tmpCloudName)
        # FIXME until CERN-PROD_TZERO is added to cloudconfig.tier1SE
        if tmpCloudName == 'CERN':
            if 'CERN-PROD_TZERO' not in tmpCloudSpec['tier1SE']:
                tmpCloudSpec['tier1SE'].append('CERN-PROD_TZERO')
        for tmpSePat in tmpCloudSpec['tier1SE']:
            # translate glob-style pattern to a regexp anchored at both ends
            if '*' in tmpSePat:
                tmpSePat = tmpSePat.replace('*','.*')
            tmpSePat = '^' + tmpSePat + '$'
            for tmpSE in replicaMap[datasetName].keys():
                # check name with regexp pattern
                if re.search(tmpSePat,tmpSE) is None:
                    continue
                # check space token
                if storageToken not in ['',None,'NULL']:
                    seStr = ddmIF.getSiteProperty(tmpSE,'se')
                    try:
                        if seStr.split(':')[1] != storageToken:
                            continue
                    except Exception:
                        # best-effort: malformed 'se' string is not a filter
                        pass
                # check archived metadata
                # FIXME
                # check tape attribute; skip the endpoint when the lookup fails
                try:
                    tmpOnTape = ddmIF.getSiteProperty(tmpSE,'is_tape')
                except Exception:
                    continue
                # check completeness against the latest statistics entry
                tmpStatistics = replicaMap[datasetName][tmpSE][-1]
                if tmpStatistics['found'] is None:
                    tmpDatasetStatus = 'unknown'
                elif tmpStatistics['total'] == tmpStatistics['found'] and tmpStatistics['total'] >= totalNumDatasets:
                    tmpDatasetStatus = 'complete'
                else:
                    tmpDatasetStatus = 'incomplete'
                # append
                retMap[tmpCloudName]['t1'][tmpSE] = {'tape':tmpOnTape,'state':tmpDatasetStatus}
        # get T2 list
        tmpSiteList = DataServiceUtils.getSitesWithDataset(datasetName,siteMapper,replicaMap,
                                                           tmpCloudName,useHomeCloud=True,
                                                           useOnlineSite=True,includeT1=False)
        # append
        retMap[tmpCloudName]['t2'] = tmpSiteList
        # remove if empty
        if len(retMap[tmpCloudName]['t1']) == 0 and len(retMap[tmpCloudName]['t2']) == 0:
            del retMap[tmpCloudName]
    # return
    return Interaction.SC_SUCCEEDED,retMap
def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap): # make logger tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID), monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID, datetime.datetime.utcnow().isoformat('/'))) tmpLog.debug('start') # return for failure retFatal = self.SC_FATAL,inputChunk retTmpError = self.SC_FAILED,inputChunk # get sites in the cloud sitePreAssigned = False siteListPreAssigned = False if not taskSpec.site in ['',None]: if ',' in taskSpec.site: # site list siteListPreAssigned = True scanSiteList = taskSpec.site.split(',') else: # site sitePreAssigned = True scanSiteList = [taskSpec.site] tmpLog.debug('site={0} is pre-assigned criteria=+preassign'.format(taskSpec.site)) elif inputChunk.getPreassignedSite() != None: siteListPreAssigned = True scanSiteList = DataServiceUtils.getSitesShareDDM(self.siteMapper,inputChunk.getPreassignedSite()) scanSiteList.append(inputChunk.getPreassignedSite()) tmpMsg = 'use site={0} since they share DDM endpoints with orinal_site={1} which is pre-assigned in masterDS '.format(str(scanSiteList), inputChunk.getPreassignedSite()) tmpMsg += 'criteria=+premerge' tmpLog.debug(tmpMsg) else: scanSiteList = self.siteMapper.getCloud(cloudName)['sites'] tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList))) # get job statistics tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel) if not tmpSt: tmpLog.error('failed to get job statistics') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # T1 if not taskSpec.useWorldCloud(): t1Sites = [self.siteMapper.getCloud(cloudName)['source']] # hospital sites if self.hospitalQueueMap.has_key(cloudName): t1Sites += self.hospitalQueueMap[cloudName] else: # get destination for WORLD cloud t1Sites = [] tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,datasetTypes=['log']) 
for datasetSpec in datasetSpecList: if not datasetSpec.destination in t1Sites: t1Sites.append(datasetSpec.destination) # sites sharing SE with T1 sitesShareSeT1 = DataServiceUtils.getSitesShareDDM(self.siteMapper,t1Sites[0]) # all T1 allT1Sites = self.getAllT1Sites() # core count if inputChunk.isMerging and taskSpec.mergeCoreCount != None: taskCoreCount = taskSpec.mergeCoreCount else: taskCoreCount = taskSpec.coreCount # MP if taskCoreCount != None and taskCoreCount > 1: # use MCORE only useMP = 'only' elif taskCoreCount == 0: # use MCORE and normal useMP = 'any' else: # not use MCORE useMP = 'unuse' # get workQueue workQueue = self.taskBufferIF.getWorkQueueMap().getQueueWithID(taskSpec.workQueue_ID) ###################################### # selection for status if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check site status skipFlag = False if tmpSiteSpec.status != 'online': skipFlag = True if not skipFlag: newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for reprocessing if taskSpec.processingType == 'reprocessing': newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check schedconfig.validatedreleases if tmpSiteSpec.validatedreleases == ['True']: newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to validatedreleases <> True criteria=-validated' % tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList))) if scanSiteList == []: 
tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for high priorities t1WeightForHighPrio = 1 if (taskSpec.currentPriority >= 900 or inputChunk.useScout()) \ and not sitePreAssigned and not siteListPreAssigned: t1WeightForHighPrio = 100 newScanSiteList = [] for tmpSiteName in scanSiteList: if tmpSiteName in t1Sites+sitesShareSeT1+allT1Sites: newScanSiteList.append(tmpSiteName) else: tmpMsg = ' skip site={0} due to highPrio/scouts which needs to run at T1 or sites associated with {1} T1 SE '.format(tmpSiteName, cloudName) tmpMsg += 'criteria=-scoutprio' tmpLog.debug(tmpMsg) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for highPrio/scouts'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection to avoid slow or inactive sites if (taskSpec.currentPriority >= 800 or inputChunk.useScout() or \ inputChunk.isMerging or taskSpec.mergeOutput()) \ and not sitePreAssigned: # get inactive sites inactiveTimeLimit = 2 inactiveSites = self.taskBufferIF.getInactiveSites_JEDI('production',inactiveTimeLimit) newScanSiteList = [] tmpMsgList = [] for tmpSiteName in scanSiteList: nToGetAll = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'starting') if tmpSiteName in ['BNL_CLOUD','BNL_CLOUD_MCORE','ATLAS_OPP_OSG']: tmpMsg = ' skip site={0} since high prio/scouts/merge needs to avoid slow sites '.format(tmpSiteName) tmpMsg += 'criteria=-slow' tmpMsgList.append(tmpMsg) elif tmpSiteName in inactiveSites and nToGetAll > 0: tmpMsg = ' skip site={0} since high prio/scouts/merge needs to avoid inactive sites (laststart is older than {1}h) '.format(tmpSiteName, inactiveTimeLimit) tmpMsg += 
'criteria=-inactive' tmpMsgList.append(tmpMsg) else: newScanSiteList.append(tmpSiteName) if newScanSiteList != []: scanSiteList = newScanSiteList for tmpMsg in tmpMsgList: tmpLog.debug(tmpMsg) tmpLog.debug('{0} candidates passed for slowness/inactive check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for data availability if not sitePreAssigned and not siteListPreAssigned: for datasetSpec in inputChunk.getDatasets(): datasetName = datasetSpec.datasetName # ignore DBR if DataServiceUtils.isDBR(datasetName): continue if not self.dataSiteMap.has_key(datasetName): # get the list of sites where data is available tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName)) tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper, self.ddmIF,datasetName, datasetSpec.storageToken) if tmpSt == self.SC_FAILED: tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError if tmpSt == self.SC_FATAL: tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retFatal # append self.dataSiteMap[datasetName] = tmpRet tmpLog.debug('map of data availability : {0}'.format(str(tmpRet))) """ # check if T1 has the data if self.dataSiteMap[datasetName].has_key(cloudName): cloudHasData = True else: cloudHasData = False t1hasData = False if cloudHasData: for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems(): if tmpSeVal['state'] == 'complete': t1hasData = True break # T1 has incomplete data while no data at T2 if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] 
== []: # use incomplete data at T1 anyway t1hasData = True # data is missing at T1 if not t1hasData: tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName)) # make subscription to T1 # FIXME pass # use T2 until data is complete at T1 newScanSiteList = [] for tmpSiteName in scanSiteList: if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']: newScanSiteList.append(tmpSiteName) else: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) if tmpSiteSpec.cloud != cloudName: tmpLog.debug(' skip %s due to foreign T2' % tmpSiteName) else: tmpLog.debug(' skip %s due to missing data at T2' % tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError """ ###################################### # selection for fairshare if not (workQueue.queue_type in ['managed'] and workQueue.queue_name in ['test','validation']): newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,inputChunk.isMerging,tmpLog): tmpLog.debug(' skip site={0} due to zero share criteria=-zeroshare'.format(tmpSiteName)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for I/O intensive tasks # FIXME pass ###################################### # selection for MP if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = 
self.siteMapper.getSite(tmpSiteName) # check at the site if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \ (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]): newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to core mismatch site:%s <> task:%s criteria=-cpucore' % \ (tmpSiteName,tmpSiteSpec.coreCount,taskCoreCount)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for release if taskSpec.transHome != None: if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None: # only cache is checked for normal tasks siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList, caches=taskSpec.transHome, cmtConfig=taskSpec.architecture) else: # nightlies siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList, releases='CVMFS') # releases='nightlies', # cmtConfig=taskSpec.architecture) newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # release check is disabled or release is available if tmpSiteSpec.releases == ['ANY'] or \ tmpSiteName in ['CERN-RELEASE']: newScanSiteList.append(tmpSiteName) elif tmpSiteName in siteListWithSW: newScanSiteList.append(tmpSiteName) else: # release is unavailable tmpLog.debug(' skip site=%s due to missing cache=%s:%s criteria=-cache' % \ (tmpSiteName,taskSpec.transHome,taskSpec.architecture)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList), taskSpec.transHome, taskSpec.architecture)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError 
###################################### # selection for memory minRamCount = max(taskSpec.ramCount, inputChunk.ramCount) if not minRamCount in [0,None]: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory: tmpMsg = ' skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(tmpSiteName, tmpSiteSpec.maxmemory, minRamCount) tmpMsg += 'criteria=-lowmemory' tmpLog.debug(tmpMsg) continue if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory: tmpMsg = ' skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(tmpSiteName, tmpSiteSpec.minmemory, minRamCount) tmpMsg += 'criteria=-highmemory' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed memory check {1}({2})'.format(len(scanSiteList), minRamCount,taskSpec.ramUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for scratch disk if taskSpec.outputScaleWithEvents(): minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(getNumEvents=True) else: minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(effectiveSize=True) minDiskCount = minDiskCount + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize() minDiskCount = minDiskCount / 1024 / 1024 newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir: tmpMsg = ' skip site={0} due to small scratch disk {1} less than {2} '.format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount) tmpMsg += 'criteria=-disk' tmpLog.debug(tmpMsg) 
continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed scratch disk check minDiskCount>{1}MB'.format(len(scanSiteList), minDiskCount)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for available space in SE newScanSiteList = [] for tmpSiteName in scanSiteList: # don't check for T1 if tmpSiteName in t1Sites: pass else: # check at the site tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # the number of jobs which will produce outputs nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'throttled') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running') # the size of input files which will be copied to the site movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName) if movingInputSize == None: tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # free space - inputs - outputs(250MB*nJobs) must be >= 200GB outSizePerJob = 0.250 diskThreshold = 200 tmpSiteSpaceMap = self.ddmIF.getRseUsage(tmpSiteSpec.ddm) if tmpSiteSpaceMap != {}: tmpSiteFreeSpace = tmpSiteSpaceMap['free'] tmpSpaceSize = tmpSiteFreeSpace - movingInputSize - nRemJobs * outSizePerJob if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold: tmpLog.debug(' skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteFreeSpace, movingInputSize,outSizePerJob, nRemJobs,diskThreshold)) continue # check if blacklisted if self.ddmIF.isBlackListedEP(tmpSiteSpec.ddm): tmpLog.debug(' skip site={0} since endpoint={1} is blacklisted in DDM 
criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for walltime if not taskSpec.useHS06(): tmpMaxAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True) minWalltime = taskSpec.walltime * tmpMaxAtomSize strMinWalltime = 'walltime*inputSize={0}*{1}'.format(taskSpec.walltime,tmpMaxAtomSize) else: tmpMaxAtomSize = inputChunk.getMaxAtomSize(getNumEvents=True) minWalltime = taskSpec.cpuTime * tmpMaxAtomSize strMinWalltime = 'cpuTime*nEventsPerJob={0}*{1}'.format(taskSpec.cpuTime,tmpMaxAtomSize) if minWalltime != None or inputChunk.useScout(): newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) siteMaxTime = tmpSiteSpec.maxtime origSiteMaxTime = siteMaxTime # sending scouts merge or wallime-undefined jobs to only sites where walltime is more than 1 day if inputChunk.useScout() or inputChunk.isMerging or \ (taskSpec.walltime in [0,None] and taskSpec.walltimeUnit in ['',None] and taskSpec.cpuTimeUnit in ['',None]): minTimeForZeroWalltime = 24*60*60 if siteMaxTime != 0 and siteMaxTime < minTimeForZeroWalltime: tmpMsg = ' skip site={0} due to site walltime {1} (site upper limit) insufficient '.format(tmpSiteName, siteMaxTime) if inputChunk.useScout(): tmpMsg += 'for scouts ({0} at least) '.format(minTimeForZeroWalltime) tmpMsg += 'criteria=-scoutwalltime' else: tmpMsg += 'for zero walltime ({0} at least) '.format(minTimeForZeroWalltime) tmpMsg += 'criteria=-zerowalltime' tmpLog.debug(tmpMsg) continue # check max walltime at the site tmpSiteStr = '{0}'.format(siteMaxTime) if taskSpec.useHS06(): oldSiteMaxTime = siteMaxTime siteMaxTime -= taskSpec.baseWalltime 
tmpSiteStr = '({0}-{1})'.format(oldSiteMaxTime,taskSpec.baseWalltime) if not siteMaxTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]: siteMaxTime *= tmpSiteSpec.coreCount tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount) if taskSpec.useHS06(): if not siteMaxTime in [None,0] and not tmpSiteSpec.corepower in [None,0]: siteMaxTime *= tmpSiteSpec.corepower tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower) siteMaxTime *= float(taskSpec.cpuEfficiency) / 100.0 siteMaxTime = long(siteMaxTime) tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency) if origSiteMaxTime != 0 and minWalltime > siteMaxTime: tmpMsg = ' skip site={0} due to short site walltime {1} (site upper limit) less than {2} '.format(tmpSiteName, tmpSiteStr, strMinWalltime) tmpMsg += 'criteria=-shortwalltime' tmpLog.debug(tmpMsg) continue # check min walltime at the site siteMinTime = tmpSiteSpec.mintime origSiteMinTime = siteMinTime tmpSiteStr = '{0}'.format(siteMinTime) if taskSpec.useHS06(): oldSiteMinTime = siteMinTime siteMinTime -= taskSpec.baseWalltime tmpSiteStr = '({0}-{1})'.format(oldSiteMinTime,taskSpec.baseWalltime) if not siteMinTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]: siteMinTime *= tmpSiteSpec.coreCount tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount) if taskSpec.useHS06(): if not siteMinTime in [None,0] and not tmpSiteSpec.corepower in [None,0]: siteMinTime *= tmpSiteSpec.corepower tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower) siteMinTime *= float(taskSpec.cpuEfficiency) / 100.0 siteMinTime = long(siteMinTime) tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency) if origSiteMinTime != 0 and minWalltime < siteMinTime: tmpMsg = ' skip site {0} due to short job walltime {1} (site lower limit) greater than {2} '.format(tmpSiteName, tmpSiteStr, strMinWalltime) tmpMsg += 'criteria=-longwalltime' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList if not taskSpec.useHS06(): tmpLog.debug('{0} candidates passed walltime 
check {1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit)) else: tmpLog.debug('{0} candidates passed walltime check {1}({2}*nEventsPerJob)'.format(len(scanSiteList),strMinWalltime,taskSpec.cpuTimeUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for network connectivity if not sitePreAssigned: ipConnectivity = taskSpec.getIpConnectivity() if ipConnectivity != None: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.wnconnectivity == 'full': pass elif tmpSiteSpec.wnconnectivity == 'http' and ipConnectivity == 'http': pass else: tmpMsg = ' skip site={0} due to insufficient connectivity (site={1}) for task={2} '.format(tmpSiteName, tmpSiteSpec.wnconnectivity, ipConnectivity) tmpMsg += 'criteria=-network' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed network check ({1})'.format(len(scanSiteList), ipConnectivity)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for event service if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # event service if taskSpec.useEventService(): if tmpSiteSpec.getJobSeed() == 'std': tmpMsg = ' skip site={0} since EventService is not allowed '.format(tmpSiteName) tmpMsg += 'criteria=-es' tmpLog.debug(tmpMsg) continue else: if tmpSiteSpec.getJobSeed() == 'es': tmpMsg = ' skip site={0} since only EventService is allowed '.format(tmpSiteName) tmpMsg += 'criteria=-nones' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = 
newScanSiteList tmpLog.debug('{0} candidates passed EventService check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for transferring newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # limit def_maxTransferring = 2000 if tmpSiteSpec.transferringlimit == 0: # use default value maxTransferring = def_maxTransferring else: maxTransferring = tmpSiteSpec.transferringlimit # check at the site nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName) nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName) if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']: tmpLog.debug(' skip site=%s due to too many transferring=%s greater than max(%s,2x%s) criteria=-transferring' % \ (tmpSiteName,nTraJobs,def_maxTransferring,nRunJobs)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for T1 weight t1Weight = taskSpec.getT1Weight() if t1Weight == 0: # use T1 weight in cloudconfig t1Weight = self.siteMapper.getCloud(cloudName)['weight'] if t1Weight < 0: newScanSiteList = [] for tmpSiteName in scanSiteList: if not tmpSiteName in t1Sites: tmpLog.debug(' skip site={0} due to negative T1 weight criteria=-t1weight'.format(tmpSiteName)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList t1Weight = 1 t1Weight = max(t1Weight,t1WeightForHighPrio) tmpLog.debug('T1 weight {0}'.format(t1Weight)) tmpLog.debug('{0} 
candidates passed T1 weight check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for nPilot nPilotMap = {} if not sitePreAssigned: nWNmap = self.taskBufferIF.getCurrentSiteData() newScanSiteList = [] for tmpSiteName in scanSiteList: # check at the site nPilot = 0 if nWNmap.has_key(tmpSiteName): nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob'] if nPilot == 0 and not 'test' in taskSpec.prodSourceLabel: tmpLog.debug(' skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName) continue newScanSiteList.append(tmpSiteName) nPilotMap[tmpSiteName] = nPilot scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # get available files normalizeFactors = {} availableFileMap = {} for datasetSpec in inputChunk.getDatasets(): try: # mapping between sites and storage endpoints siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper, ignoreCC=True) # disable file lookup for merge jobs or secondary datasets checkCompleteness = True useCompleteOnly = False if inputChunk.isMerging: checkCompleteness = False if not datasetSpec.isMaster(): useCompleteOnly = True # get available files per site/endpoint tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec, siteStorageEP, self.siteMapper, ngGroup=[1], checkCompleteness=checkCompleteness, storageToken=datasetSpec.storageToken, useCompleteOnly=useCompleteOnly) if tmpAvFileMap == None: raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed' availableFileMap[datasetSpec.datasetName] = tmpAvFileMap except: errtype,errvalue = 
sys.exc_info()[:2] tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # loop over all sites to get the size of available files for tmpSiteName in scanSiteList: if not normalizeFactors.has_key(tmpSiteName): normalizeFactors[tmpSiteName] = 0 # get the total size of available files if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName): availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName] for tmpFileSpec in \ availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']: normalizeFactors[tmpSiteName] += tmpFileSpec.fsize # get max total size tmpTotalSizes = normalizeFactors.values() tmpTotalSizes.sort() if tmpTotalSizes != []: totalSize = tmpTotalSizes.pop() else: totalSize = 0 ###################################### # calculate weight tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo, taskSpec.prodSourceLabel) if not tmpSt: tmpLog.error('failed to get job statistics with priority') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError tmpLog.debug('calculate weight and check cap for {0} candidates'.format(len(scanSiteList))) weightMapPrimary = {} weightMapSecondary = {} newScanSiteList = [] for tmpSiteName in scanSiteList: nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',None,taskSpec.workQueue_ID) nDefined = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'definied',None,taskSpec.workQueue_ID) + self.getLiveCount(tmpSiteName) nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',None,taskSpec.workQueue_ID) nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,taskSpec.workQueue_ID) + \ AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,taskSpec.workQueue_ID) nStarting = 
AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting',None,taskSpec.workQueue_ID) if tmpSiteName in nPilotMap: nPilot = nPilotMap[tmpSiteName] else: nPilot = 0 manyAssigned = float(nAssigned + 1) / float(nActivated + 1) manyAssigned = min(2.0,manyAssigned) manyAssigned = max(1.0,manyAssigned) weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + nDefined + 1) / manyAssigned weightStr = 'nRun={0} nAct={1} nAss={2} nStart={3} nDef={4} totalSize={5} manyAss={6} nPilot={7} '.format(nRunning,nActivated,nAssigned, nStarting,nDefined, totalSize,manyAssigned, nPilot) # normalize weights by taking data availability into account if totalSize != 0: weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize) weightStr += 'availableSize={0} '.format(normalizeFactors[tmpSiteName]) # T1 weight if tmpSiteName in t1Sites+sitesShareSeT1: weight *= t1Weight weightStr += 't1W={0} '.format(t1Weight) # make candidate siteCandidateSpec = SiteCandidate(tmpSiteName) # set weight and params siteCandidateSpec.weight = weight siteCandidateSpec.nRunningJobs = nRunning siteCandidateSpec.nQueuedJobs = nActivated + nAssigned + nStarting siteCandidateSpec.nAssignedJobs = nAssigned # set available files for tmpDatasetName,availableFiles in availableFileMap.iteritems(): if availableFiles.has_key(tmpSiteName): siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['localdisk'] siteCandidateSpec.localTapeFiles += availableFiles[tmpSiteName]['localtape'] siteCandidateSpec.cacheFiles += availableFiles[tmpSiteName]['cache'] siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote'] # check if site is locked for WORLD lockedByBrokerage = False if taskSpec.useWorldCloud(): lockedByBrokerage = self.checkSiteLock(taskSpec.vo,taskSpec.prodSourceLabel, tmpSiteName,taskSpec.workQueue_ID) # check cap with nRunning cutOffValue = 20 cutOffFactor = 2 nRunningCap = max(cutOffValue,cutOffFactor*nRunning) nRunningCap = max(nRunningCap,nPilot) 
okMsg = ' use site={0} with weight={1} {2} criteria=+use'.format(tmpSiteName,weight,weightStr) okAsPrimay = False if lockedByBrokerage: ngMsg = ' skip site={0} due to locked by another brokerage '.format(tmpSiteName) ngMsg += 'criteria=-lock' elif (nDefined+nActivated+nAssigned+nStarting) > nRunningCap: ngMsg = ' skip site={0} due to nDefined+nActivated+nAssigned+nStarting={1} '.format(tmpSiteName, nDefined+nActivated+nAssigned+nStarting) ngMsg += 'greater than max({0},{1}*nRunning={1}*{2},nPilot={3}) '.format(cutOffValue, cutOffFactor, nRunning, nPilot) ngMsg += 'criteria=-cap' else: ngMsg = ' skip site={0} due to low weight '.format(tmpSiteName) ngMsg += 'criteria=-loweigh' okAsPrimay = True # use primay if cap/lock check is passed if okAsPrimay: weightMap = weightMapPrimary else: weightMap = weightMapSecondary # add weight if not weight in weightMap: weightMap[weight] = [] weightMap[weight].append((siteCandidateSpec,okMsg,ngMsg)) # use second candidates if no primary candidates passed cap/lock check if weightMapPrimary == {}: tmpLog.debug('use second candidates since no sites pass cap/lock check') weightMap = weightMapSecondary # use hightest 3 weights weightRank = 3 else: weightMap = weightMapPrimary # use all weights weightRank = None # dump NG message for tmpWeight in weightMapSecondary.keys(): for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMapSecondary[tmpWeight]: tmpLog.debug(tmpNgMsg) # max candidates for WORLD if taskSpec.useWorldCloud(): maxSiteCandidates = 10 else: maxSiteCandidates = None newScanSiteList = [] weightList = weightMap.keys() weightList.sort() weightList.reverse() for weightIdx,tmpWeight in enumerate(weightList): for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMap[tmpWeight]: if (weightRank == None or weightIdx < weightRank) and \ (maxSiteCandidates == None or len(newScanSiteList) < maxSiteCandidates): # use site tmpLog.debug(tmpOkMsg) newScanSiteList.append(siteCandidateSpec.siteName) inputChunk.addSiteCandidate(siteCandidateSpec) 
else: # dump NG message tmpLog.debug(tmpNgMsg) scanSiteList = newScanSiteList # final check if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # lock sites for WORLD if taskSpec.useWorldCloud(): for tmpSiteName in scanSiteList: self.lockSite(taskSpec.vo,taskSpec.prodSourceLabel,tmpSiteName,taskSpec.workQueue_ID) tmpLog.debug('final {0} candidates'.format(len(scanSiteList))) # return self.sendLogMessage(tmpLog) tmpLog.debug('done') return self.SC_SUCCEEDED,inputChunk
def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
    """Register output/log datasets and containers in DDM for a task.

    Ensures that every output/log dataset produced by the generated jobs
    (plus any IDs already in datasetToRegister) exists in DDM, optionally
    registers location rules for user tasks or distributed destinations,
    adds datasets to their containers, marks them 'registered' in the JEDI
    DB, and opens datasets for production tasks.

    :param taskSpec: task specification (provides jediTaskID, vo,
                     prodSourceLabel, cloud, campaign, userName, ...)
    :param datasetToRegister: list of dataset IDs to register; mutated in
                              place — IDs collected from pandaJobs are
                              appended to it
    :param pandaJobs: list of generated PanDA job specs whose output/log
                      files determine which datasets need registration
    :return: self.SC_SUCCEEDED on success, self.SC_FATAL on any failure
             (registration errors and unexpected exceptions alike)
    """
    # make logger
    tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
    tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
    # return codes
    retFatal = self.SC_FATAL
    retTmpError = self.SC_FAILED
    retOK = self.SC_SUCCEEDED
    try:
        # get DDM I/F for the task's VO
        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
        # register datasets; user tasks always go through this branch even
        # when datasetToRegister is initially empty
        if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
            # prod vs anal
            userSetup = False
            if taskSpec.prodSourceLabel in ['user']:
                userSetup = True
            # collect datasetIDs from the jobs' output/log files so that
            # datasets/containers are registered just in case
            for tmpPandaJob in pandaJobs:
                if not tmpPandaJob.produceUnMerge():
                    for tmpFileSpec in tmpPandaJob.Files:
                        if tmpFileSpec.type in ['output','log']:
                            if not tmpFileSpec.datasetID in datasetToRegister:
                                datasetToRegister.append(tmpFileSpec.datasetID)
            tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
            # get site mapper
            siteMapper = self.taskBufferIF.getSiteMapper()
            # loop over all datasets
            avDatasetList = []   # names already confirmed/registered in this call
            cnDatasetMap = {}    # container name -> list of constituent dataset names
            for datasetID in datasetToRegister:
                # get output and log datasets
                tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                              datasetID)
                if not tmpStat:
                    tmpLog.error('failed to get output and log datasets')
                    return retFatal
                # DDM backend
                ddmBackEnd = taskSpec.getDdmBackEnd()
                tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                # check if dataset and container are available in DDM
                for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                    if targetName == None:
                        continue
                    if not targetName in avDatasetList:
                        # set lifetime: 14 days for panda-internal names, else unlimited
                        if targetName.startswith('panda'):
                            lifetime = 14
                        else:
                            lifetime = None
                        # check dataset/container in DDM
                        tmpList = ddmIF.listDatasets(targetName)
                        if tmpList == []:
                            # not yet in DDM; resolve location
                            location = None
                            locForRule = None
                            if targetName == datasetSpec.datasetName:
                                # dataset (not the container)
                                if datasetSpec.site in ['',None]:
                                    if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                        # distributed destination: rule goes to the dataset's destination
                                        locForRule = datasetSpec.destination
                                    elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) != None:
                                        location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                    elif taskSpec.cloud != None:
                                        # use T1 SE of the task's cloud
                                        tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                        location = siteMapper.getDdmEndpoint(tmpT1Name,datasetSpec.storageToken)
                                else:
                                    location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken)
                            if locForRule == None:
                                locForRule = location
                            # set metadata (only on the dataset itself, for production tasks)
                            if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                                metaData = {}
                                metaData['task_id'] = taskSpec.jediTaskID
                                if not taskSpec.campaign in [None,'']:
                                    metaData['campaign'] = taskSpec.campaign
                                if datasetSpec.getTransient() != None:
                                    metaData['transient'] = datasetSpec.getTransient()
                            else:
                                metaData = None
                            # register dataset/container
                            tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                                     location,
                                                                                                                     ddmBackEnd,
                                                                                                                     lifetime,
                                                                                                                     str(metaData)))
                            tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                               lifetime=lifetime,metaData=metaData)
                            if not tmpStat:
                                tmpLog.error('failed to register {0}'.format(targetName))
                                return retFatal
                            # procedures for user tasks or distributed destinations:
                            # register a location (replication rule)
                            if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                # register location
                                tmpToRegister = False
                                if userSetup and targetName == datasetSpec.datasetName and not datasetSpec.site in ['',None]:
                                    # user dataset pinned to a site: rule owned by the user
                                    userName = taskSpec.userName
                                    grouping = None
                                    tmpToRegister = True
                                elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                    # distributed destination: ownerless rule without grouping
                                    userName = None
                                    grouping = 'NONE'
                                    tmpToRegister = True
                                if tmpToRegister:
                                    activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                    tmpLog.info('registring location={0} lifetime={1}days activity={2} grouping={3}'.format(locForRule,lifetime,
                                                                                                                            activity,grouping))
                                    tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                            lifetime=lifetime,backEnd=ddmBackEnd,
                                                                            activity=activity,grouping=grouping)
                                    if not tmpStat:
                                        tmpLog.error('failed to register location {0} with {2} for {1}'.format(locForRule,
                                                                                                               targetName,
                                                                                                               ddmBackEnd))
                                        return retFatal
                            avDatasetList.append(targetName)
                        else:
                            tmpLog.info('{0} already registered'.format(targetName))
                # check if dataset is in the container
                if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                    # get list of constituent datasets in the container (cached per container)
                    if not cnDatasetMap.has_key(datasetSpec.containerName):
                        cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                    # add dataset
                    if not datasetSpec.datasetName in cnDatasetMap[datasetSpec.containerName]:
                        tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                        tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                               backEnd=ddmBackEnd)
                        if not tmpStat:
                            tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                           datasetSpec.containerName))
                            return retFatal
                        # keep the local cache in sync with what was just added
                        cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                    else:
                        tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                # update dataset status in the JEDI DB
                datasetSpec.status = 'registered'
                self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                  'datasetID':datasetID})
        # open datasets for production tasks
        if taskSpec.prodSourceLabel in ['managed','test']:
            # get the list of output/log datasets
            outDatasetList = []
            for tmpPandaJob in pandaJobs:
                for tmpFileSpec in tmpPandaJob.Files:
                    if tmpFileSpec.type in ['output','log']:
                        if not tmpFileSpec.destinationDBlock in outDatasetList:
                            outDatasetList.append(tmpFileSpec.destinationDBlock)
            # open datasets
            for outDataset in outDatasetList:
                tmpLog.info('open {0}'.format(outDataset))
                ddmIF.openDataset(outDataset)
                # unset lifetime while the dataset is open
                ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
        # return
        tmpLog.info('done')
        return retOK
    except:
        # NOTE(review): broad except — any unexpected error is mapped to a fatal return
        errtype,errvalue = sys.exc_info()[:2]
        tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retFatal
def doFinalProcedure(self,taskSpec,tmpLog):
    """Post-completion dataset housekeeping in DDM for a finished task.

    Depending on the task's (old) status this:
      * done/finished (or paused coming from those): sets a 14-day lifetime
        on transient output/log datasets; for merge tasks also propagates a
        lifetime (14 or 28 days) to the parent task's transient output
        datasets of matching types
      * done (or paused coming from done): best-effort deletion of empty
        output datasets
      * failed/broken/aborted: sets a 30-day lifetime on log datasets

    :param taskSpec: task specification with datasetSpecList populated
    :param tmpLog: logger wrapper used for progress/debug messages
    :return: self.SC_SUCCEEDED (always)
    """
    tmpLog.info('final procedure for status={0} processingType={1}'.format(taskSpec.status,
                                                                           taskSpec.processingType))
    if taskSpec.status in ['done','finished'] or \
            (taskSpec.status == 'paused' and taskSpec.oldStatus in ['done','finished']):
        # lifetimes in seconds: normal 14 days, extended 28 days
        trnLifeTime = 14*24*60*60
        trnLifeTimeLong = 28*24*60*60
        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
        # set lifetime to transient datasets
        metaData = {'lifetime':trnLifeTime}
        # dataset types seen on the task's input/output datasets
        datasetTypeListI = set()
        datasetTypeListO = set()
        for datasetSpec in taskSpec.datasetSpecList:
            if datasetSpec.type in ['log','output']:
                if datasetSpec.getTransient() == True:
                    tmpLog.debug('set metadata={0} to datasetID={1}:Name={2}'.format(str(metaData),
                                                                                     datasetSpec.datasetID,
                                                                                     datasetSpec.datasetName))
                    for metadataName,metadaValue in metaData.iteritems():
                        ddmIF.setDatasetMetadata(datasetSpec.datasetName,metadataName,metadaValue)
            # collect dataset types
            datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
            if not datasetType in ['',None]:
                if datasetSpec.type == 'input':
                    datasetTypeListI.add(datasetType)
                elif datasetSpec.type == 'output':
                    datasetTypeListO.add(datasetType)
        # set lifetime to parent transient datasets
        if taskSpec.processingType in ['merge']:
            # get parent task (skip self-parented or orphan tasks)
            if not taskSpec.parent_tid in [None,taskSpec.jediTaskID]:
                # get parent
                tmpStat,parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(taskSpec.parent_tid,None,False)
                if tmpStat and parentTaskSpec != None:
                    # set lifetime to parent datasets if they are transient
                    for datasetSpec in parentTaskSpec.datasetSpecList:
                        if datasetSpec.type in ['output']:
                            # only types that appear both as input and output of this merge task
                            datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
                            if not datasetType in datasetTypeListI or not datasetType in datasetTypeListO:
                                continue
                            # use longer lifetime for finished AOD merge with success rate < 90%
                            # (NOTE(review): compares completeness against 900 — presumably
                            # permille; confirm with getTaskCompleteness)
                            if taskSpec.status == 'finished' and datasetType == 'AOD' \
                                    and self.getTaskCompleteness(taskSpec)[-1] < 900:
                                metaData = {'lifetime':trnLifeTimeLong}
                            else:
                                metaData = {'lifetime':trnLifeTime}
                            tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                            if tmpMetadata['transient'] == True:
                                tmpLog.debug('set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'.format(str(metaData),
                                                                                                                       taskSpec.parent_tid,
                                                                                                                       datasetSpec.datasetID,
                                                                                                                       datasetSpec.datasetName))
                                for metadataName,metadaValue in metaData.iteritems():
                                    ddmIF.setDatasetMetadata(datasetSpec.datasetName,metadataName,metadaValue)
    # delete empty datasets
    if taskSpec.status == 'done' or (taskSpec.status == 'paused' and taskSpec.oldStatus == 'done'):
        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
        # loop over all datasets; deletion is best-effort, failures only warn
        for datasetSpec in taskSpec.datasetSpecList:
            try:
                if datasetSpec.type == 'output' and datasetSpec.nFilesFinished == 0:
                    tmpStat = ddmIF.deleteDataset(datasetSpec.datasetName,True,True)
                    tmpLog.debug('delete empty prod dataset {0} with {1}'.format(datasetSpec.datasetName,tmpStat))
            except:
                errtype,errvalue = sys.exc_info()[:2]
                tmpLog.warning('failed to delete empty dataset with {0}:{1}'.format(errtype.__name__,errvalue))
    # set lifetime to failed datasets
    if taskSpec.status in ['failed','broken','aborted']:
        # 30 days for logs of failed tasks
        trnLifeTime = 30*24*60*60
        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
        # only log datasets
        metaData = {'lifetime':trnLifeTime}
        for datasetSpec in taskSpec.datasetSpecList:
            if datasetSpec.type in ['log']:
                tmpLog.debug('set metadata={0} to failed datasetID={1}:Name={2}'.format(str(metaData),
                                                                                        datasetSpec.datasetID,
                                                                                        datasetSpec.datasetName))
                for metadataName,metadaValue in metaData.iteritems():
                    ddmIF.setDatasetMetadata(datasetSpec.datasetName,metadataName,metadaValue)
    return self.SC_SUCCEEDED
def appendJob(self, job, siteMapperCache=None):
    """Serialize a job spec into the flat self.data dict sent to the pilot.

    Flattens per-file attributes (LFNs, datasets, tokens, GUIDs, checksums,
    DDM endpoints, ...) into comma-separated strings and copies scalar job
    attributes into self.data. Several accumulators are built as
    'val,val,...,': those are stripped of the trailing comma with [:-1];
    the ones joined with explicit 'if non-empty add comma' logic are not.

    :param job: job specification object (PanDA JobSpec-like; attribute names
                are taken as-is from the spec)
    :param siteMapperCache: optional cached site mapper; when given, it is used
                to resolve the nucleus for destinationSE and to look up the
                siteSpec used for DDM-endpoint and VP decisions
    :return: None; all results are stored into self.data
    """
    # event service merge
    if EventServiceUtils.isEventServiceMerge(job):
        isEventServiceMerge = True
    else:
        isEventServiceMerge = False
    # PandaID
    self.data['PandaID'] = job.PandaID
    # prodSourceLabel
    self.data['prodSourceLabel'] = job.prodSourceLabel
    # swRelease
    self.data['swRelease'] = job.AtlasRelease
    # homepackage
    self.data['homepackage'] = job.homepackage
    # transformation
    self.data['transformation'] = job.transformation
    # job name
    self.data['jobName'] = job.jobName
    # job definition ID
    self.data['jobDefinitionID'] = job.jobDefinitionID
    # cloud
    self.data['cloud'] = job.cloud
    # files: accumulators for the comma-separated per-file fields
    strIFiles = ''
    strOFiles = ''
    strDispatch = ''
    strDisToken = ''
    strDisTokenForOutput = ''
    strDestination = ''
    strRealDataset = ''
    strRealDatasetIn = ''
    strProdDBlock = ''
    strDestToken = ''
    strProdToken = ''
    strProdTokenForOutput = ''
    strGUID = ''
    strFSize = ''
    strCheckSum = ''
    strFileDestinationSE = ''
    strScopeIn = ''
    strScopeOut = ''
    strScopeLog = ''
    logFile = ''
    logGUID = ''
    ddmEndPointIn = []
    ddmEndPointOut = []
    noOutput = []
    siteSpec = None
    inDsLfnMap = {}
    inLFNset = set()
    if siteMapperCache is not None:
        siteMapper = siteMapperCache.getObj()
        siteSpec = siteMapper.getSite(job.computingSite)
        # resolve destSE
        try:
            job.destinationSE = siteMapper.resolveNucleus(job.destinationSE)
            for tmpFile in job.Files:
                tmpFile.destinationSE = siteMapper.resolveNucleus(tmpFile.destinationSE)
        except Exception:
            # best-effort resolution; keep original destinationSE on failure
            pass
        siteMapperCache.releaseObj()
    for file in job.Files:
        if file.type == 'input':
            # for jumbo jobs, skip duplicate input LFNs
            if EventServiceUtils.isJumboJob(job) and file.lfn in inLFNset:
                pass
            else:
                inLFNset.add(file.lfn)
                if strIFiles != '':
                    strIFiles += ','
                strIFiles += file.lfn
                if strDispatch != '':
                    strDispatch += ','
                strDispatch += file.dispatchDBlock
                if strDisToken != '':
                    strDisToken += ','
                strDisToken += file.dispatchDBlockToken
                strProdDBlock += '%s,' % file.prodDBlock
                if not isEventServiceMerge:
                    strProdToken += '%s,' % file.prodDBlockToken
                else:
                    # for ES merge, job.metadata[1] maps LFN -> token
                    strProdToken += '%s,' % job.metadata[1][file.lfn]
                if strGUID != '':
                    strGUID += ','
                strGUID += file.GUID
                strRealDatasetIn += '%s,' % file.dataset
                strFSize += '%s,' % file.fsize
                # prefer checksum; fall back to md5sum when checksum is unset
                if file.checksum not in ['', 'NULL', None]:
                    strCheckSum += '%s,' % file.checksum
                else:
                    strCheckSum += '%s,' % file.md5sum
                strScopeIn += '%s,' % file.scope
                ddmEndPointIn.append(self.getDdmEndpoint(siteSpec, file.dispatchDBlockToken, 'input',
                                                         job.prodSourceLabel, job.job_label))
                if file.dataset not in inDsLfnMap:
                    inDsLfnMap[file.dataset] = []
                inDsLfnMap[file.dataset].append(file.lfn)
        if file.type == 'output' or file.type == 'log':
            if strOFiles != '':
                strOFiles += ','
            strOFiles += file.lfn
            if strDestination != '':
                strDestination += ','
            strDestination += file.destinationDBlock
            if strRealDataset != '':
                strRealDataset += ','
            strRealDataset += file.dataset
            strFileDestinationSE += '%s,' % file.destinationSE
            if file.type == 'log':
                # there is assumed to be a single log file; last one wins
                logFile = file.lfn
                logGUID = file.GUID
                strScopeLog = file.scope
            else:
                strScopeOut += '%s,' % file.scope
            if strDestToken != '':
                strDestToken += ','
            # only the first token is used; 'ddd:' prefix is rewritten to 'dst:'
            strDestToken += re.sub('^ddd:', 'dst:', file.destinationDBlockToken.split(',')[0])
            strDisTokenForOutput += '%s,' % file.dispatchDBlockToken
            strProdTokenForOutput += '%s,' % file.prodDBlockToken
            ddmEndPointOut.append(self.getDdmEndpoint(siteSpec, file.destinationDBlockToken.split(',')[0], 'output',
                                                      job.prodSourceLabel, job.job_label))
            if file.isAllowedNoOutput():
                noOutput.append(file.lfn)
    # inFiles
    self.data['inFiles'] = strIFiles
    # dispatch DBlock
    self.data['dispatchDblock'] = strDispatch
    # dispatch DBlock space token
    self.data['dispatchDBlockToken'] = strDisToken
    # dispatch DBlock space token for output
    self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1]
    # outFiles
    self.data['outFiles'] = strOFiles
    # destination DBlock
    self.data['destinationDblock'] = strDestination
    # destination DBlock space token
    self.data['destinationDBlockToken'] = strDestToken
    # prod DBlocks
    self.data['prodDBlocks'] = strProdDBlock[:-1]
    # prod DBlock space token
    self.data['prodDBlockToken'] = strProdToken[:-1]
    # real output datasets
    self.data['realDatasets'] = strRealDataset
    # real input datasets
    self.data['realDatasetsIn'] = strRealDatasetIn[:-1]
    # file's destinationSE
    self.data['fileDestinationSE'] = strFileDestinationSE[:-1]
    # log filename
    self.data['logFile'] = logFile
    # log GUID
    self.data['logGUID'] = logGUID
    # jobPars; ppSteps carries extra key/values for multi-step execution
    self.data['jobPars'], ppSteps = job.extractMultiStepExec()
    if ppSteps is not None:
        self.data.update(ppSteps)
    if job.to_encode_job_params():
        self.data['jobPars'] = base64.b64encode(self.data['jobPars'].encode()).decode()
    # attempt number
    self.data['attemptNr'] = job.attemptNr
    # GUIDs
    self.data['GUID'] = strGUID
    # checksum
    self.data['checksum'] = strCheckSum[:-1]
    # fsize
    self.data['fsize'] = strFSize[:-1]
    # scope
    self.data['scopeIn'] = strScopeIn[:-1]
    self.data['scopeOut'] = strScopeOut[:-1]
    self.data['scopeLog'] = strScopeLog
    # DDM endpoints; TypeError is raised by join when a None slipped into the list
    try:
        self.data['ddmEndPointIn'] = ','.join(ddmEndPointIn)
    except TypeError:
        self.data['ddmEndPointIn'] = ''
    try:
        self.data['ddmEndPointOut'] = ','.join(ddmEndPointOut)
    except TypeError:
        self.data['ddmEndPointOut'] = ''
    # destinationSE
    self.data['destinationSE'] = job.destinationSE
    # user ID
    self.data['prodUserID'] = job.prodUserID
    # CPU count
    self.data['maxCpuCount'] = job.maxCpuCount
    # RAM count
    self.data['minRamCount'] = job.minRamCount
    # disk count
    self.data['maxDiskCount'] = job.maxDiskCount
    # cmtconfig is suppressed for multi-step execution
    if ppSteps is None:
        self.data['cmtConfig'] = job.cmtConfig
    else:
        self.data['cmtConfig'] = ''
    # processingType
    self.data['processingType'] = job.processingType
    # transferType
    self.data['transferType'] = job.transferType
    # sourceSite
    self.data['sourceSite'] = job.sourceSite
    # current priority
    self.data['currentPriority'] = job.currentPriority
    # taskID: JEDI-managed jobs report jediTaskID instead of taskID
    if job.lockedby == 'jedi':
        self.data['taskID'] = job.jediTaskID
    else:
        self.data['taskID'] = job.taskID
    # core count defaults to 1 when unset
    if job.coreCount in ['NULL', None]:
        self.data['coreCount'] = 1
    else:
        self.data['coreCount'] = job.coreCount
    # jobsetID
    self.data['jobsetID'] = job.jobsetID
    # nucleus
    self.data['nucleus'] = job.nucleus
    # walltime
    self.data['maxWalltime'] = job.maxWalltime
    # looping check
    if job.is_no_looping_check():
        self.data['loopingCheck'] = False
    # debug mode
    if job.specialHandling is not None and 'debug' in job.specialHandling:
        self.data['debug'] = 'True'
    # event service or job cloning
    if EventServiceUtils.isJobCloningJob(job):
        self.data['cloneJob'] = EventServiceUtils.getJobCloningType(job)
    elif EventServiceUtils.isEventServiceJob(job) or EventServiceUtils.isJumboJob(job):
        self.data['eventService'] = 'True'
        # prod DBlock space token for pre-merging output
        self.data['prodDBlockTokenForOutput'] = strProdTokenForOutput[:-1]
    # event service merge
    if isEventServiceMerge:
        self.data['eventServiceMerge'] = 'True'
        # write to file for ES merge: encode job.metadata[0]
        # (outputName -> input list) as 'inputFor_<out>:in1,in2^...'
        writeToFileStr = ''
        try:
            for outputName in job.metadata[0]:
                inputList = job.metadata[0][outputName]
                writeToFileStr += 'inputFor_{0}:'.format(outputName)
                for tmpInput in inputList:
                    writeToFileStr += '{0},'.format(tmpInput)
                writeToFileStr = writeToFileStr[:-1]
                writeToFileStr += '^'
            writeToFileStr = writeToFileStr[:-1]
        except Exception:
            pass
        self.data['writeToFile'] = writeToFileStr
    elif job.writeInputToFile():
        try:
            # write input to file: 'tmpin_<dataset>:lfn1,lfn2^...'
            writeToFileStr = ''
            for inDS in inDsLfnMap:
                inputList = inDsLfnMap[inDS]
                # strip trailing '/' and scope prefix from the dataset name
                inDS = re.sub('/$', '', inDS)
                inDS = inDS.split(':')[-1]
                writeToFileStr += 'tmpin_{0}:'.format(inDS)
                writeToFileStr += ','.join(inputList)
                writeToFileStr += '^'
            writeToFileStr = writeToFileStr[:-1]
            self.data['writeToFile'] = writeToFileStr
        except Exception:
            pass
    # replace placeholder in jobPars with the actual input list for (co-)jumbo jobs
    if EventServiceUtils.isJumboJob(job) or EventServiceUtils.isCoJumboJob(job):
        try:
            for inDS in inDsLfnMap:
                inputList = inDsLfnMap[inDS]
                inDS = re.sub('/$', '', inDS)
                inDS = inDS.split(':')[-1]
                srcStr = 'tmpin__cnt_{0}'.format(inDS)
                dstStr = ','.join(inputList)
                self.data['jobPars'] = self.data['jobPars'].replace(srcStr, dstStr)
        except Exception:
            pass
    # no output
    if noOutput != []:
        self.data['allowNoOutput'] = ','.join(noOutput)
    # alternative stage-out
    if job.getAltStgOut() is not None:
        self.data['altStageOut'] = job.getAltStgOut()
    # log to OS
    if job.putLogToOS():
        self.data['putLogToOS'] = 'True'
    # suppress execute string conversion
    if job.noExecStrCnv():
        self.data['noExecStrCnv'] = 'True'
    # in-file positional event number
    if job.inFilePosEvtNum():
        self.data['inFilePosEvtNum'] = 'True'
    # use prefetcher
    if job.usePrefetcher():
        self.data['usePrefetcher'] = 'True'
    # image name
    if job.container_name not in ['NULL', None]:
        self.data['container_name'] = job.container_name
    # IO intensity attributes from the owning task
    self.data['ioIntensity'] = job.get_task_attribute('ioIntensity')
    self.data['ioIntensityUnit'] = job.get_task_attribute('ioIntensityUnit')
    # HPO
    if job.is_hpo_workflow():
        self.data['isHPO'] = 'True'
    # VP (virtual placement) flag, based on the input scope of the site
    if siteSpec is not None:
        scope_input, scope_output = DataServiceUtils.select_scope(siteSpec, job.prodSourceLabel, job.job_label)
        if siteSpec.use_vp(scope_input):
            self.data['useVP'] = 'True'
def getAvailableFiles(self,datasetSpec,siteEndPointMap,siteMapper,ngGroup=[],checkLFC=False):
    """Find where the files of a dataset are available, per site and storage type.

    Resolves the DDM endpoints associated with each site (expanding wildcards
    against TiersOfATLAS and filtering out no-good endpoint patterns), checks
    dataset replica completeness per endpoint, optionally scans LFC catalogs
    for SURLs of individual files, and classifies each file as
    localdisk/localtape/cache/remote per site.

    NOTE(review): ngGroup uses a mutable default argument ([]); it is only
    read here, so no state leaks, but callers should not rely on mutating it.

    :param datasetSpec: dataset specification with .datasetName, .Files, .isManyTime()
    :param siteEndPointMap: dict siteName -> list of endpoint name patterns ('*' wildcard allowed)
    :param siteMapper: site mapper used to resolve siteSpec objects
    :param ngGroup: list of group IDs selecting predefined no-good endpoint pattern sets
    :param checkLFC: when True, force LFC scans and skip the complete-replica shortcut
    :return: (SC_SUCCEEDED, {siteName: {'localdisk':[FileSpec], 'localtape':[...],
             'cache':[...], 'remote':[...]}}) or (SC_FAILED, error string)
    """
    # make logger
    methodName = 'getAvailableFiles'
    methodName += ' <datasetID={0}>'.format(datasetSpec.datasetID)
    tmpLog = MsgWrapper(logger,methodName)
    tmpLog.info('start datasetName={0}'.format(datasetSpec.datasetName))
    try:
        # list of NG endpoints (regexp fragments matched against endpoint names)
        ngEndPoints = []
        if 1 in ngGroup:
            ngEndPoints += ['_SCRATCHDISK$','_LOCALGROUPDISK$','_LOCALGROUPTAPE$','_USERDISK$',
                            '_DAQ$','_TMPDISK$','_TZERO$','_GRIDFTP$','MOCKTEST$']
        if 2 in ngGroup:
            ngEndPoints += ['_LOCALGROUPTAPE$',
                            '_DAQ$','_TMPDISK$','_TZERO$','_GRIDFTP$','MOCKTEST$']
        # get all associated endpoints
        siteAllEndPointsMap = {}
        for siteName,endPointPattList in siteEndPointMap.iteritems():
            # get all endpoints matching with patterns
            allEndPointList = []
            for endPointPatt in endPointPattList:
                if '*' in endPointPatt:
                    # wildcard: translate glob '*' to regexp '.*' and match
                    endPointPatt = endPointPatt.replace('*','.*')
                    for endPointToA in TiersOfATLAS.getAllDestinationSites():
                        if re.search('^'+endPointPatt+'$',endPointToA) != None:
                            if not endPointToA in allEndPointList:
                                allEndPointList.append(endPointToA)
                else:
                    # normal endpoint
                    if endPointPatt in TiersOfATLAS.getAllDestinationSites() and \
                            not endPointPatt in allEndPointList:
                        allEndPointList.append(endPointPatt)
            # get associated endpoints
            siteAllEndPointsMap[siteName] = []
            for endPoint in allEndPointList:
                # append unless no-good or already present
                if not self.checkNGEndPoint(endPoint,ngEndPoints) and \
                        not endPoint in siteAllEndPointsMap[siteName]:
                    siteAllEndPointsMap[siteName].append(endPoint)
                else:
                    # already checked
                    continue
                # get alternate name and pull in endpoints sharing it via GOC
                altName = TiersOfATLAS.getSiteProperty(endPoint,'alternateName')
                if altName != None and altName != ['']:
                    for assEndPoint in TiersOfATLAS.resolveGOC({altName[0]:None})[altName[0]]:
                        if not assEndPoint in siteAllEndPointsMap[siteName] and \
                                not self.checkNGEndPoint(assEndPoint,ngEndPoints):
                            siteAllEndPointsMap[siteName].append(assEndPoint)
        # get replica map
        tmpStat,tmpOut = self.listDatasetReplicas(datasetSpec.datasetName)
        if tmpStat != self.SC_SUCCEEDED:
            tmpLog.error('faild to get dataset replicas with {0}'.format(tmpOut))
            raise tmpStat,tmpOut
        datasetReplicaMap = tmpOut
        # collect SE, LFC hosts, storage path, storage type
        lfcSeMap = {}
        storagePathMap = {}
        completeReplicaMap = {}
        siteHasCompleteReplica = False
        for siteName,allEndPointList in siteAllEndPointsMap.iteritems():
            tmpLfcSeMap = {}
            tmpStoragePathMap = {}
            tmpSiteSpec = siteMapper.getSite(siteName)
            for tmpEndPoint in allEndPointList:
                # storage type
                if TiersOfATLAS.isTapeSite(tmpEndPoint):
                    storageType = 'localtape'
                else:
                    storageType = 'localdisk'
                # no scan when site has complete replicas
                if datasetReplicaMap.has_key(tmpEndPoint) and datasetReplicaMap[tmpEndPoint][-1]['found'] != None \
                        and datasetReplicaMap[tmpEndPoint][-1]['total'] == datasetReplicaMap[tmpEndPoint][-1]['found']:
                    completeReplicaMap[tmpEndPoint] = storageType
                    siteHasCompleteReplica = True
                # no LFC scan for many-time datasets
                if datasetSpec.isManyTime():
                    continue
                # get LFC
                lfc = TiersOfATLAS.getLocalCatalog(tmpEndPoint)
                # add map
                if not tmpLfcSeMap.has_key(lfc):
                    tmpLfcSeMap[lfc] = []
                # get SE hostname from the srm string
                seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
                tmpMatch = re.search('://([^:/]+):*\d*/',seStr)
                if tmpMatch != None:
                    se = tmpMatch.group(1)
                    if not se in tmpLfcSeMap[lfc]:
                        tmpLfcSeMap[lfc].append(se)
                else:
                    tmpLog.error('faild to extract SE from %s for %s:%s' % \
                                 (seStr,siteName,tmpEndPoint))
                # get SE + path
                seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
                tmpMatch = re.search('(srm://.+)$',seStr)
                if tmpMatch == None:
                    tmpLog.error('faild to extract SE+PATH from %s for %s:%s' % \
                                 (seStr,siteName,tmpEndPoint))
                    continue
                # add full path to storage map
                tmpSePath = tmpMatch.group(1)
                tmpStoragePathMap[tmpSePath] = {'siteName':siteName,'storageType':storageType}
                # add compact path (without the ':port/srm/...?SFN=' web-service part)
                tmpSePath = re.sub('(:\d+)*/srm/[^\?]+\?SFN=','',tmpSePath)
                tmpStoragePathMap[tmpSePath] = {'siteName':siteName,'storageType':storageType}
            # add to map to trigger LFC scan if complete replica is missing at the site
            if DataServiceUtils.isCachedFile(datasetSpec.datasetName,tmpSiteSpec):
                pass
            elif not siteHasCompleteReplica or checkLFC:
                for tmpKey,tmpVal in tmpLfcSeMap.iteritems():
                    if not lfcSeMap.has_key(tmpKey):
                        lfcSeMap[tmpKey] = []
                    lfcSeMap[tmpKey] += tmpVal
                for tmpKey,tmpVal in tmpStoragePathMap.iteritems():
                    storagePathMap[tmpKey] = tmpVal
        # collect GUIDs and LFNs
        fileMap = {}
        lfnMap = {}
        lfnFileSepcMap = {}
        scopeMap = {}
        for tmpFile in datasetSpec.Files:
            fileMap[tmpFile.GUID] = tmpFile.lfn
            lfnMap[tmpFile.lfn] = tmpFile
            lfnFileSepcMap[tmpFile.lfn] = tmpFile
            scopeMap[tmpFile.lfn] = tmpFile.scope
        # get SURLs
        surlMap = {}
        for lfcHost,seList in lfcSeMap.iteritems():
            tmpLog.debug('lookup in LFC:{0} for {1}'.format(lfcHost,str(seList)))
            tmpStat,tmpRetMap = self.getSURLsFromLFC(fileMap,lfcHost,seList,scopes=scopeMap)
            tmpLog.debug(str(tmpStat))
            if tmpStat != self.SC_SUCCEEDED:
                raise RuntimeError,tmpRetMap
            for lfn,surls in tmpRetMap.iteritems():
                if not surlMap.has_key(lfn):
                    surlMap[lfn] = surls
                else:
                    surlMap[lfn] += surls
        # make return
        returnMap = {}
        for siteName,allEndPointList in siteAllEndPointsMap.iteritems():
            # set default return values
            if not returnMap.has_key(siteName):
                returnMap[siteName] = {'localdisk':[],'localtape':[],'cache':[],'remote':[]}
            # loop over all files
            tmpSiteSpec = siteMapper.getSite(siteName)
            # check if the file is cached
            if DataServiceUtils.isCachedFile(datasetSpec.datasetName,tmpSiteSpec):
                for tmpFileSpec in datasetSpec.Files:
                    # add to cached file list
                    returnMap[siteName]['cache'].append(tmpFileSpec)
            # complete replicas: whole file list is available at those endpoints
            if not checkLFC:
                for tmpEndPoint in allEndPointList:
                    if completeReplicaMap.has_key(tmpEndPoint):
                        storageType = completeReplicaMap[tmpEndPoint]
                        returnMap[siteName][storageType] += datasetSpec.Files
        # loop over all available LFNs found via LFC
        avaLFNs = surlMap.keys()
        avaLFNs.sort()
        for tmpLFN in avaLFNs:
            tmpFileSpec = lfnFileSepcMap[tmpLFN]
            # loop over all SURLs
            for tmpSURL in surlMap[tmpLFN]:
                for tmpSePath in storagePathMap.keys():
                    # check SURL against known storage path prefixes
                    if tmpSURL.startswith(tmpSePath):
                        # add
                        siteName = storagePathMap[tmpSePath]['siteName']
                        storageType = storagePathMap[tmpSePath]['storageType']
                        if not tmpFileSpec in returnMap[siteName][storageType]:
                            returnMap[siteName][storageType].append(tmpFileSpec)
                        break
        # dump a compact per-site summary of file counts
        dumpStr = ''
        for siteName,storageTypeFile in returnMap.iteritems():
            dumpStr += '{0}:('.format(siteName)
            for storageType,fileList in storageTypeFile.iteritems():
                dumpStr += '{0}:{1},'.format(storageType,len(fileList))
            dumpStr = dumpStr[:-1]
            dumpStr += ') '
        dumpStr= dumpStr[:-1]
        tmpLog.debug(dumpStr)
        # return
        tmpLog.info('done')
        return self.SC_SUCCEEDED,returnMap
    except:
        errtype,errvalue = sys.exc_info()[:2]
        errMsg = 'failed with {0} {1}'.format(errtype.__name__,errvalue)
        tmpLog.error(errMsg)
        return self.SC_FAILED,'{0}.{1} {2}'.format(self.__class__.__name__,methodName,errMsg)
if False: #outFileName is None: outFileName.append(tmpFile.lfn) if tmpFile.type in ['output', 'log']: fileList = [] if False: # tmpFile.type == 'output':# and iOut > 0: for i in range(8): newFile = copy.copy(tmpFile) newFile.lfn += '._00{0}'.format(i) fileList.append(newFile) #continue else: fileList.append(tmpFile) iOut += 1 for file in fileList: file.GUID = str(uuid.uuid4()) if DataServiceUtils.getDistributedDestination( file.destinationDBlockToken) is not None: tmpSrcDDM = DataServiceUtils.getDistributedDestination( file.destinationDBlockToken) elif job.computingSite == file.destinationSE and \ file.destinationDBlockToken in siteSpec.setokens_output[scope_output]: tmpSrcDDM = siteSpec.setokens_output[scope_output][ file.destinationDBlockToken] elif file.lfn in outFileName: tmpSrcDDM = DataServiceUtils.getDestinationSE( file.destinationDBlockToken) if tmpSrcDDM is None: tmpSrcSite = siteMapper.getSite(file.destinationSE) tmp_scope_input, tmp_scope_output = select_scope( siteSpec, job.prodSourceLabel) tmpSrcDDM = tmpSrcSite.ddm_output[tmp_scope_output] else:
def execute(self):
    """Register the job's output/log files in Rucio datasets, with retries.

    Collects attributes of non-local output/log files grouped by destination
    dataset, then calls rucioAPI.registerFilesInDataset up to 3 times.
    Known Rucio client exceptions (and certain DB-constraint error strings)
    are treated as fatal and stop the retries immediately; other errors are
    retried with a 10-second sleep. The outcome is recorded on self.result
    (succeeded / temporary / fatal) and, on final failure, on
    self.job.ddmErrorCode / ddmErrorDiag.

    :return: 1 when registration finally failed, otherwise None
    """
    try:
        # loop over all files
        fileMap = {}
        for fileSpec in self.job.Files:
            # ignore inputs
            if fileSpec.type not in ['output', 'log']:
                continue
            # ignore local
            if fileSpec.destinationSE == 'local':
                continue
            # collect file attributes; fsize may be unset/non-numeric
            try:
                fsize = int(fileSpec.fsize)
            except Exception:
                fsize = None
            # set GUID if empty
            if not fileSpec.GUID:
                fileSpec.GUID = str(uuid.uuid4())
            fileAttrs = {'guid': fileSpec.GUID,
                         'lfn': fileSpec.lfn,
                         'size': fsize,
                         'checksum': fileSpec.checksum,
                         'ds': fileSpec.destinationDBlock}
            if self.extraInfo:
                if 'surl' in self.extraInfo and fileSpec.lfn in self.extraInfo['surl']:
                    fileAttrs['surl'] = self.extraInfo['surl'][fileSpec.lfn]
                if 'nevents' in self.extraInfo and fileSpec.lfn in self.extraInfo['nevents']:
                    fileAttrs['events'] = self.extraInfo['nevents'][fileSpec.lfn]
            fileMap.setdefault(fileSpec.destinationDBlock, [])
            fileMap[fileSpec.destinationDBlock].append(fileAttrs)
        # register files
        if fileMap:
            # all files are attached at the site's default output RSE
            dstRSE = self.siteMapper.getSite(self.job.computingSite).ddm_output['default']
            destIdMap = {dstRSE: fileMap}
            nTry = 3
            for iTry in range(nTry):
                isFatal = False
                isFailed = False
                regStart = datetime.datetime.utcnow()
                try:
                    self.logger.debug('{} {}'.format('registerFilesInDatasets', str(destIdMap)))
                    out = rucioAPI.registerFilesInDataset(destIdMap, {})
                except (DataIdentifierNotFound,
                        FileConsistencyMismatch,
                        UnsupportedOperation,
                        InvalidPath,
                        InvalidObject,
                        RSENotFound,
                        RSEProtocolNotSupported,
                        InvalidRSEExpression,
                        RSEFileNameNotSupported,
                        KeyError) as e:
                    # fatal errors: retrying cannot succeed
                    out = 'failed with {}\n {}'.format(str(e), traceback.format_exc())
                    isFatal = True
                    isFailed = True
                except Exception as e:
                    # unknown errors: fatal only for known DB-constraint violations
                    isFailed = True
                    out = 'failed with unknown error: {}\n {}'.format(str(e), traceback.format_exc())
                    if 'value too large for column' in out or \
                            'unique constraint (ATLAS_RUCIO.DIDS_GUID_IDX) violate' in out or \
                            'unique constraint (ATLAS_RUCIO.DIDS_PK) violated' in out or \
                            'unique constraint (ATLAS_RUCIO.ARCH_CONTENTS_PK) violated' in out:
                        isFatal = True
                    else:
                        isFatal = False
                regTime = datetime.datetime.utcnow() - regStart
                self.logger.debug('took %s.%03d sec' % (regTime.seconds, regTime.microseconds / 1000))
                # failed
                if isFailed or isFatal:
                    self.logger.error('%s' % out)
                    # give up on the last attempt or on a fatal error
                    if (iTry + 1) == nTry or isFatal:
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        # extract important error string
                        extractedErrStr = DataServiceUtils.extractImportantError(out)
                        errMsg = "Could not add files to DDM: "
                        if extractedErrStr == '':
                            self.job.ddmErrorDiag = errMsg + out.split('\n')[-1]
                        else:
                            self.job.ddmErrorDiag = errMsg + extractedErrStr
                        if isFatal:
                            self.result.setFatal()
                        else:
                            self.result.setTemporary()
                        return 1
                    self.logger.error("Try:%s" % iTry)
                    # sleep before retrying
                    time.sleep(10)
                else:
                    self.logger.debug('%s' % str(out))
                    break
        # done
        self.result.setSucceeded()
        self.logger.debug("end plugin")
    except Exception as e:
        # unexpected failure: report as temporary so the caller can retry later
        errStr = 'failed to execute with {}\n'.format(str(e))
        errStr += traceback.format_exc()
        self.logger.error(errStr)
        self.result.setTemporary()
    # return
    return
def doRefine(self, jediTaskID, taskParamMap):
    """Refine the task parameters into dataset/task spec settings.

    Runs the basic refinement, then applies production-specific tweaks:
    DBR datasets get 'repeat,nosplit', input consistency checks are enabled
    for tasks with a distinct parent, output filename templates gain an
    attempt-number suffix, throttling is enabled unless 'noThrottle' is set,
    dataset registration is requested, parent output/log datasets of merge
    tasks are marked transient in DDM when their data type matches both an
    input and an output type of this task, and the first-contents-feed flag
    is set for input prestaging.

    :param jediTaskID: task identifier (unused directly here; specs come from self)
    :param taskParamMap: task parameter dictionary
    :return: self.SC_SUCCEEDED on success
    :raises JediException.UnknownDatasetError: propagated from basic refinement
    :raises Exception: any other refinement failure, after logging
    """
    # make logger
    tmpLog = self.tmpLog
    tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
    try:
        # basic refine
        self.doBasicRefine(taskParamMap)
        # set nosplit+repeat for DBR
        for datasetSpec in self.inSecDatasetSpecList:
            if DataServiceUtils.isDBR(datasetSpec.datasetName):
                datasetSpec.attributes = 'repeat,nosplit'
        # enable consistency check
        if self.taskSpec.parent_tid not in [None, self.taskSpec.jediTaskID]:
            for datasetSpec in self.inMasterDatasetSpec:
                if datasetSpec.isMaster() and datasetSpec.type == 'input':
                    datasetSpec.enableCheckConsistency()
        # append attempt number unless the template already ends in '.<digits>'
        # or is an unmerged ('.panda.um') output
        for tmpKey, tmpOutTemplateMapList in iteritems(self.outputTemplateMap):
            for tmpOutTemplateMap in tmpOutTemplateMapList:
                outFileTemplate = tmpOutTemplateMap['filenameTemplate']
                if re.search('\.\d+$', outFileTemplate) is None and not outFileTemplate.endswith('.panda.um'):
                    tmpOutTemplateMap['filenameTemplate'] = outFileTemplate + '.1'
        # extract input datatype
        datasetTypeListIn = []
        for datasetSpec in self.inMasterDatasetSpec + self.inSecDatasetSpecList:
            datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
            if datasetType not in ['', None]:
                datasetTypeListIn.append(datasetType)
        # extract output datatype
        datasetTypeList = []
        for datasetSpec in self.outDatasetSpecList:
            datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
            if datasetType not in ['', None]:
                datasetTypeList.append(datasetType)
        # set numThrottled to use the task throttling mechanism
        if 'noThrottle' not in taskParamMap:
            self.taskSpec.numThrottled = 0
        # set to register datasets
        self.taskSpec.setToRegisterDatasets()
        # set transient to parent datasets
        if self.taskSpec.processingType in ['merge'] and self.taskSpec.parent_tid not in [None, self.taskSpec.jediTaskID]:
            # get parent
            tmpStat, parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(self.taskSpec.parent_tid, None, False)
            if tmpStat and parentTaskSpec is not None:
                # set transient to parent datasets
                metaData = {'transient': True}
                for datasetSpec in parentTaskSpec.datasetSpecList:
                    if datasetSpec.type in ['log', 'output']:
                        # only types this merge task both reads and writes
                        datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
                        if datasetType not in ['', None] and datasetType in datasetTypeList and datasetType in datasetTypeListIn:
                            tmpLog.info('set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'.format(str(metaData),
                                                                                                                 self.taskSpec.parent_tid,
                                                                                                                 datasetSpec.datasetID,
                                                                                                                 datasetSpec.datasetName))
                            for metadataName, metadaValue in iteritems(metaData):
                                self.ddmIF.getInterface(self.taskSpec.vo).setDatasetMetadata(datasetSpec.datasetName,
                                                                                             metadataName, metadaValue)
        # input prestaging
        if self.taskSpec.inputPreStaging():
            # set first contents feed flag
            self.taskSpec.set_first_contents_feed(True)
    except JediException.UnknownDatasetError as e:
        tmpLog.debug('in doRefine. {0}'.format(str(e)))
        raise e
    except Exception as e:
        tmpLog.error('doRefine failed with {0} {1}'.format(str(e), traceback.format_exc()))
        raise e
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def doRefine(self,jediTaskID,taskParamMap):
    """Refine the task parameters into dataset/task spec settings (legacy variant).

    Runs the basic refinement, then: sets 'repeat,nosplit' on DBR secondary
    datasets, enables consistency checks when the task has a distinct parent,
    appends an attempt-number suffix to output filename templates, resolves a
    destination site for outputs whose storage token maps to an endpoint,
    enables throttling unless 'noThrottle' is given, requests dataset
    registration, and marks matching parent output/log datasets of merge
    tasks as transient in DDM.

    :param jediTaskID: task identifier (specs are taken from self)
    :param taskParamMap: task parameter dictionary
    :return: self.SC_SUCCEEDED on success
    :raises: re-raises the original exception type after logging
    """
    # make logger
    tmpLog = self.tmpLog
    tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
    try:
        self.doBasicRefine(taskParamMap)
        # set nosplit+repeat for DBR
        for datasetSpec in self.inSecDatasetSpecList:
            if DataServiceUtils.isDBR(datasetSpec.datasetName):
                datasetSpec.attributes = 'repeat,nosplit'
        # enable consistency check
        if not self.taskSpec.parent_tid in [None,self.taskSpec.jediTaskID]:
            for datasetSpec in self.inMasterDatasetSpec:
                if datasetSpec.isMaster() and datasetSpec.type == 'input':
                    datasetSpec.enableCheckConsistency()
        # append attempt number unless the template already ends in '.<digits>'
        # or is an unmerged ('.panda.um') output
        for tmpKey,tmpOutTemplateMapList in self.outputTemplateMap.iteritems():
            for tmpOutTemplateMap in tmpOutTemplateMapList:
                outFileTemplate = tmpOutTemplateMap['filenameTemplate']
                if re.search('\.\d+$',outFileTemplate) == None and not outFileTemplate.endswith('.panda.um'):
                    tmpOutTemplateMap['filenameTemplate'] = outFileTemplate + '.1'
        # extract datatype and set destination if necessary
        datasetTypeList = []
        for datasetSpec in self.outDatasetSpecList:
            datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
            if datasetType != '':
                datasetTypeList.append(datasetType)
            storageToken = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
            if storageToken != None:
                # pin the dataset destination to the first production site
                # serving the requested storage endpoint
                tmpSiteList = self.ddmIF.getInterface(self.taskSpec.vo).getSitesWithEndPoint(storageToken,self.siteMapper,'production')
                if tmpSiteList == []:
                    raise RuntimeError,'cannot find online siteID associated to {0}'.format(storageToken)
                datasetSpec.destination = tmpSiteList[0]
        # set numThrottled to use the task throttling mechanism
        if not 'noThrottle' in taskParamMap:
            self.taskSpec.numThrottled = 0
        # set to register datasets
        self.taskSpec.setToRegisterDatasets()
        # set transient to parent datasets
        if self.taskSpec.processingType in ['merge'] and not self.taskSpec.parent_tid in [None,self.taskSpec.jediTaskID]:
            # get parent
            tmpStat,parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(self.taskSpec.parent_tid,None,False)
            if tmpStat and parentTaskSpec != None:
                # set transient to parent datasets
                metaData = {'transient':True}
                for datasetSpec in parentTaskSpec.datasetSpecList:
                    if datasetSpec.type in ['log','output']:
                        # only types this merge task also produces
                        datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
                        if datasetType != '' and datasetType in datasetTypeList:
                            tmpLog.info('set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'.format(str(metaData),
                                                                                                                 self.taskSpec.parent_tid,
                                                                                                                 datasetSpec.datasetID,
                                                                                                                 datasetSpec.datasetName))
                            for metadataName,metadaValue in metaData.iteritems():
                                self.ddmIF.getInterface(self.taskSpec.vo).setDatasetMetadata(datasetSpec.datasetName,
                                                                                             metadataName,metadaValue)
    except:
        errtype,errvalue = sys.exc_info()[:2]
        tmpLog.error('doBasicRefine failed with {0}:{1}'.format(errtype.__name__,errvalue))
        raise errtype,errvalue
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def doCheck(self, taskSpecList):
    """Check cloud/site assignment of tasks against the PanDA server.

    Queries PandaClient.seeCloudTask for all given tasks. For tasks assigned
    a non-empty value: WORLD-cloud tasks get a destination map (nucleus plus
    per-dataset DDM token/destination entries for output/log datasets),
    while other tasks simply map to the returned cloud name.

    :param taskSpecList: list of task specifications to check
    :return: (SC_SUCCEEDED, {jediTaskID: destination-dict-or-cloudName}),
             or (SC_FAILED, {}) when the PanDA query fails
    """
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doCheck')
    # return for failure
    retFatal = self.SC_FATAL, {}
    retTmpError = self.SC_FAILED, {}
    # get list of jediTaskIDs
    taskIdList = []
    taskSpecMap = {}
    for taskSpec in taskSpecList:
        taskIdList.append(taskSpec.jediTaskID)
        taskSpecMap[taskSpec.jediTaskID] = taskSpec
    # check with panda
    tmpLog.debug('check with panda')
    tmpPandaStatus, cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
    if tmpPandaStatus != 0:
        tmpLog.error('failed to see clouds')
        return retTmpError
    # make return map
    retMap = {}
    for tmpTaskID, tmpCoreName in cloudsInPanda.iteritems():
        tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID, tmpCoreName))
        if not tmpCoreName in ['NULL', '', None]:
            taskSpec = taskSpecMap[tmpTaskID]
            if taskSpec.useWorldCloud():
                # get destinations for WORLD cloud
                ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                # get site
                siteSpec = self.siteMapper.getSite(tmpCoreName)
                # get nucleus
                nucleus = siteSpec.pandasite
                # get output/log datasets
                tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID, ['output', 'log'])
                # get destinations
                retMap[tmpTaskID] = {'datasets': [], 'nucleus': nucleus}
                for datasetSpec in tmpDatasetSpecs:
                    # skip distributed datasets
                    if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                        continue
                    # get token
                    token = ddmIF.convertTokenToEndpoint(siteSpec.ddm_output, datasetSpec.storageToken)
                    # use default endpoint
                    if token == None:
                        token = siteSpec.ddm_output
                    # add original token
                    if not datasetSpec.storageToken in ['', None]:
                        token += '/{0}'.format(datasetSpec.storageToken)
                    retMap[tmpTaskID]['datasets'].append({'datasetID': datasetSpec.datasetID,
                                                          'token': 'dst:{0}'.format(token),
                                                          'destination': tmpCoreName})
            else:
                retMap[tmpTaskID] = tmpCoreName
    tmpLog.debug('ret {0}'.format(str(retMap)))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, retMap
def main(backGround=False):
    """Run the dataset-callback listener daemon.

    Installs signal handlers with an overall alarm timeout, forks (the parent
    only waits on the child), and in the child initializes the task buffer and
    site mapper, then maintains STOMP connections to all resolved ActiveMQ
    brokers, (re)subscribing a DatasetCallbackListener in an endless loop.

    :param backGround: accepted for interface compatibility; not used in this body
    """
    _logger.debug('starting ...')
    # register signal handler
    signal.signal(signal.SIGINT, catch_sig)
    signal.signal(signal.SIGHUP, catch_sig)
    signal.signal(signal.SIGTERM, catch_sig)
    signal.signal(signal.SIGALRM, catch_sig)
    signal.alarm(overallTimeout)
    # forking
    pid = os.fork()
    if pid != 0:
        # watch child process
        os.wait()
        time.sleep(1)
    else:
        # main loop
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        # check certificate
        certName = '%s/pandasv1_usercert.pem' % panda_config.certdir
        keyName = '%s/pandasv1_userkey.pem' % panda_config.certdir
        _logger.debug('checking certificate {0}'.format(certName))
        certOK, certMsg = DataServiceUtils.checkCertificate(certName)
        if not certOK:
            # log only; the daemon still tries to run with the bad certificate
            _logger.error('bad certificate : {0}'.format(certMsg))
        # initialize cx_Oracle using dummy connection
        from pandaserver.taskbuffer.Initializer import initializer
        initializer.init()
        # instantiate TB
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
        # instantiate sitemapper
        siteMapper = SiteMapper(taskBuffer)
        # ActiveMQ params
        queue = '/queue/Consumer.panda.rucio.events'
        ssl_opts = {'use_ssl': True,
                    'ssl_version': ssl.PROTOCOL_TLSv1,
                    'ssl_cert_file': certName,
                    'ssl_key_file': keyName}
        # resolve multiple brokers behind the alias
        brokerList = socket.gethostbyname_ex('atlas-mb.cern.ch')[-1]
        # set listener
        connList = []
        for tmpBroker in brokerList:
            try:
                clientid = 'PANDA-' + socket.getfqdn() + '-' + tmpBroker
                subscription_id = 'panda-server-consumer'
                _logger.debug('setting listener %s to broker %s' % (clientid, tmpBroker))
                conn = stomp.Connection(host_and_ports=[(tmpBroker, 61023)], **ssl_opts)
                connList.append(conn)
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                _logger.error("failed to connect to %s : %s %s" % (tmpBroker, errtype, errvalue))
                catch_sig(None, None)
        while True:
            for conn in connList:
                try:
                    if not conn.is_connected():
                        conn.set_listener('DatasetCallbackListener',
                                          DatasetCallbackListener(conn, taskBuffer, siteMapper,
                                                                  subscription_id))
                        conn.start()
                        # NOTE(review): clientid here is whatever value the broker
                        # setup loop left last, so every connection reconnects with
                        # the LAST broker's client-id -- looks unintended; confirm
                        # whether a per-connection client-id map was meant
                        conn.connect(headers={'client-id': clientid})
                        conn.subscribe(destination=queue, id=subscription_id, ack='auto')
                        _logger.debug('listener %s is up and running' % clientid)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    # NOTE(review): tmpBroker is also the last loop value, so this
                    # message may name the wrong broker
                    _logger.error("failed to set listener on %s : %s %s" % (tmpBroker, errtype, errvalue))
                    catch_sig(None, None)
            time.sleep(5)
def getAnalSitesWithData(siteList,siteMapper,ddmIF,datasetName):
    """Find which analysis sites hold replicas of a dataset, with completeness.

    Lists dataset replicas once via the DDM interface, then for each candidate
    site checks its DDM endpoints (deduplicated by endpoint-name prefix)
    against the replica map, recording tape flag and replica state
    ('complete'/'incomplete'/'unknown'). Cached datasets (e.g. DB releases)
    are reported as complete disk replicas without a replica lookup; an
    'unknown' state triggers a best-effort consistency-check request.

    :param siteList: list of candidate site names
    :param siteMapper: site mapper to resolve site specs
    :param ddmIF: DDM interface (listDatasetReplicas, getSiteProperty, ...)
    :param datasetName: dataset to look up
    :return: (Interaction.SC_SUCCEEDED, {siteName: {endpoint: {'tape': bool,
             'state': str}}}), or on DDM failure (exception_type, error message)
    """
    # get replicas
    try:
        replicaMap= {}
        replicaMap[datasetName] = ddmIF.listDatasetReplicas(datasetName)
    except:
        errtype,errvalue = sys.exc_info()[:2]
        return errtype,'ddmIF.listDatasetReplicas failed with %s' % errvalue
    # loop over all clouds
    retMap = {}
    for tmpSiteName in siteList:
        tmpSiteSpec = siteMapper.getSite(tmpSiteName)
        # loop over all DDM endpoints
        checkedEndPoints = []
        for tmpDDM in [tmpSiteSpec.ddm] + tmpSiteSpec.setokens.values():
            # skip empty
            if tmpDDM == '':
                continue
            # get prefix (endpoint name up to and including the last '_')
            tmpPrefix = re.sub('_[^_]+$','_',tmpDDM)
            # already checked
            if tmpPrefix in checkedEndPoints:
                continue
            # DBR
            if DataServiceUtils.isCachedFile(datasetName,tmpSiteSpec):
                # no replica check since it is cached
                if not retMap.has_key(tmpSiteName):
                    retMap[tmpSiteName] = {}
                retMap[tmpSiteName][tmpDDM] = {'tape':False,'state':'complete'}
                checkedEndPoints.append(tmpPrefix)
                continue
            checkedEndPoints.append(tmpPrefix)
            tmpSePat = '^' + tmpPrefix
            for tmpSE in replicaMap[datasetName].keys():
                # check name with regexp pattern
                if re.search(tmpSePat,tmpSE) == None:
                    continue
                # check archived metadata
                # FIXME
                pass
                # check tape attribute
                try:
                    tmpOnTape = ddmIF.getSiteProperty(tmpSE,'tape')
                except:
                    errtype,errvalue = sys.exc_info()[:2]
                    return errtype,'ddmIF.getSiteProperty for %s:tape failed with %s' % (tmpSE,errvalue)
                # check completeness using the latest replica statistics entry
                tmpStatistics = replicaMap[datasetName][tmpSE][-1]
                if tmpStatistics['found'] == None:
                    tmpDatasetStatus = 'unknown'
                    # refresh request (best-effort)
                    try:
                        ddmIF.checkDatasetConsistency(tmpSE,datasetName)
                    except:
                        pass
                elif tmpStatistics['total'] == tmpStatistics['found']:
                    tmpDatasetStatus = 'complete'
                else:
                    tmpDatasetStatus = 'incomplete'
                # append
                if not retMap.has_key(tmpSiteName):
                    retMap[tmpSiteName] = {}
                retMap[tmpSiteName][tmpSE] = {'tape':tmpOnTape,'state':tmpDatasetStatus}
    # return
    return Interaction.SC_SUCCEEDED,retMap
def doSetup(self, taskSpec, datasetToRegister, pandaJobs):
    """Register output/log datasets and containers in DDM for a task.

    Registers each dataset (and its container) that is not yet known to
    DDM, optionally registers a replication rule/location for user tasks
    and distributed destinations, adds datasets to their containers, marks
    them 'registered' in the JEDI DB, and finally opens the output datasets
    of production tasks.

    :param taskSpec: task specification (provides vo, prodSourceLabel,
                     jediTaskID, cloud, campaign, userName, DDM backend)
    :param datasetToRegister: list of dataset IDs to register; may be
                              extended in place for user tasks
    :param pandaJobs: generated PanDA jobs, scanned for output/log files
    :return: self.SC_SUCCEEDED on success, self.SC_FATAL on any
             registration failure or unexpected exception
    """
    # make logger
    tmpLog = MsgWrapper(logger, "<jediTaskID={0}>".format(taskSpec.jediTaskID))
    tmpLog.info('start label={0} taskType={1}'.format(
        taskSpec.prodSourceLabel, taskSpec.taskType))
    # returns
    retFatal = self.SC_FATAL
    retTmpError = self.SC_FAILED
    retOK = self.SC_SUCCEEDED
    try:
        # get DDM I/F
        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
        # register datasets
        if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
            # prod vs anal
            userSetup = False
            if taskSpec.prodSourceLabel in ['user']:
                userSetup = True
                # collect datasetID to register datasets/containers just in case
                for tmpPandaJob in pandaJobs:
                    if not tmpPandaJob.produceUnMerge():
                        for tmpFileSpec in tmpPandaJob.Files:
                            if tmpFileSpec.type in ['output', 'log']:
                                if not tmpFileSpec.datasetID in datasetToRegister:
                                    datasetToRegister.append(
                                        tmpFileSpec.datasetID)
            tmpLog.info('datasetToRegister={0}'.format(
                str(datasetToRegister)))
            # get site mapper
            siteMapper = self.taskBufferIF.getSiteMapper()
            # loop over all datasets
            avDatasetList = []   # names already registered in this call
            cnDatasetMap = {}    # container name -> constituent datasets (cache)
            for datasetID in datasetToRegister:
                # get output and log datasets
                tmpLog.info(
                    'getting datasetSpec with datasetID={0}'.format(
                        datasetID))
                tmpStat, datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(
                    taskSpec.jediTaskID, datasetID)
                if not tmpStat:
                    tmpLog.error('failed to get output and log datasets')
                    return retFatal
                # DDM backend
                ddmBackEnd = taskSpec.getDdmBackEnd()
                tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                # check if dataset and container are available in DDM
                for targetName in [
                        datasetSpec.datasetName, datasetSpec.containerName
                ]:
                    if targetName == None:
                        continue
                    if not targetName in avDatasetList:
                        # set lifetime
                        # panda* datasets are short-lived internal products
                        if targetName.startswith('panda'):
                            lifetime = 14
                        else:
                            lifetime = None
                        # check dataset/container in DDM
                        tmpList = ddmIF.listDatasets(targetName)
                        if tmpList == []:
                            # get location
                            location = None
                            locForRule = None
                            if targetName == datasetSpec.datasetName:
                                # dataset
                                if datasetSpec.site in ['', None]:
                                    if DataServiceUtils.getDistributedDestination(
                                            datasetSpec.storageToken
                                    ) != None:
                                        locForRule = datasetSpec.destination
                                    elif DataServiceUtils.getDestinationSE(
                                            datasetSpec.storageToken
                                    ) != None:
                                        location = DataServiceUtils.getDestinationSE(
                                            datasetSpec.storageToken)
                                    elif taskSpec.cloud != None:
                                        # use T1 SE
                                        tmpT1Name = siteMapper.getCloud(
                                            taskSpec.cloud)['source']
                                        location = siteMapper.getDdmEndpoint(
                                            tmpT1Name, datasetSpec.storageToken)
                                else:
                                    location = siteMapper.getDdmEndpoint(
                                        datasetSpec.site, datasetSpec.storageToken)
                            if locForRule == None:
                                locForRule = location
                            # set metadata
                            # task_id/campaign/transient metadata only for
                            # production datasets, never for containers
                            if taskSpec.prodSourceLabel in [
                                    'managed', 'test'
                            ] and targetName == datasetSpec.datasetName:
                                metaData = {}
                                metaData['task_id'] = taskSpec.jediTaskID
                                if not taskSpec.campaign in [None, '']:
                                    metaData[
                                        'campaign'] = taskSpec.campaign
                                if datasetSpec.getTransient() != None:
                                    metaData[
                                        'transient'] = datasetSpec.getTransient(
                                        )
                            else:
                                metaData = None
                            # register dataset/container
                            tmpLog.info(
                                'registering {0} with location={1} backend={2} lifetime={3} meta={4}'
                                .format(targetName, location, ddmBackEnd,
                                        lifetime, str(metaData)))
                            tmpStat = ddmIF.registerNewDataset(
                                targetName,
                                backEnd=ddmBackEnd,
                                location=location,
                                lifetime=lifetime,
                                metaData=metaData)
                            if not tmpStat:
                                tmpLog.error(
                                    'failed to register {0}'.format(
                                        targetName))
                                return retFatal
                            # procedures for user
                            if userSetup or DataServiceUtils.getDistributedDestination(
                                    datasetSpec.storageToken) != None:
                                # register location
                                tmpToRegister = False
                                if userSetup and targetName == datasetSpec.datasetName and not datasetSpec.site in [
                                        '', None
                                ]:
                                    userName = taskSpec.userName
                                    grouping = None
                                    tmpToRegister = True
                                elif DataServiceUtils.getDistributedDestination(
                                        datasetSpec.storageToken) != None:
                                    # distributed destination: rule owned by
                                    # the system, no grouping
                                    userName = None
                                    grouping = 'NONE'
                                    tmpToRegister = True
                                if tmpToRegister:
                                    activity = DataServiceUtils.getActivityForOut(
                                        taskSpec.prodSourceLabel)
                                    tmpLog.info(
                                        'registring location={0} lifetime={1}days activity={2} grouping={3}'
                                        .format(locForRule, lifetime,
                                                activity, grouping))
                                    tmpStat = ddmIF.registerDatasetLocation(
                                        targetName,
                                        locForRule,
                                        owner=userName,
                                        lifetime=lifetime,
                                        backEnd=ddmBackEnd,
                                        activity=activity,
                                        grouping=grouping)
                                    if not tmpStat:
                                        tmpLog.error(
                                            'failed to register location {0} with {2} for {1}'
                                            .format(
                                                locForRule, targetName,
                                                ddmBackEnd))
                                        return retFatal
                            avDatasetList.append(targetName)
                        else:
                            tmpLog.info('{0} already registered'.format(
                                targetName))
                # check if dataset is in the container
                if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                    # get list of constituent datasets in the container
                    if not cnDatasetMap.has_key(datasetSpec.containerName):
                        cnDatasetMap[
                            datasetSpec.
                            containerName] = ddmIF.listDatasetsInContainer(
                                datasetSpec.containerName)
                    # add dataset
                    if not datasetSpec.datasetName in cnDatasetMap[
                            datasetSpec.containerName]:
                        tmpLog.info('adding {0} to {1}'.format(
                            datasetSpec.datasetName,
                            datasetSpec.containerName))
                        tmpStat = ddmIF.addDatasetsToContainer(
                            datasetSpec.containerName,
                            [datasetSpec.datasetName],
                            backEnd=ddmBackEnd)
                        if not tmpStat:
                            tmpLog.error('failed to add {0} to {1}'.format(
                                datasetSpec.datasetName,
                                datasetSpec.containerName))
                            return retFatal
                        # keep the cache in sync with DDM
                        cnDatasetMap[datasetSpec.containerName].append(
                            datasetSpec.datasetName)
                    else:
                        tmpLog.info('{0} already in {1}'.format(
                            datasetSpec.datasetName,
                            datasetSpec.containerName))
                # update dataset
                datasetSpec.status = 'registered'
                self.taskBufferIF.updateDataset_JEDI(
                    datasetSpec, {
                        'jediTaskID': taskSpec.jediTaskID,
                        'datasetID': datasetID
                    })
        # open datasets
        if taskSpec.prodSourceLabel in ['managed', 'test']:
            # get the list of output/log datasets
            outDatasetList = []
            for tmpPandaJob in pandaJobs:
                for tmpFileSpec in tmpPandaJob.Files:
                    if tmpFileSpec.type in ['output', 'log']:
                        if not tmpFileSpec.destinationDBlock in outDatasetList:
                            outDatasetList.append(
                                tmpFileSpec.destinationDBlock)
            # open datasets
            for outDataset in outDatasetList:
                tmpLog.info('open {0}'.format(outDataset))
                ddmIF.openDataset(outDataset)
                # unset lifetime
                ddmIF.setDatasetMetadata(outDataset, 'lifetime', None)
        # return
        tmpLog.info('done')
        return retOK
    except:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('doSetup failed with {0}:{1}'.format(
            errtype.__name__, errvalue))
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retFatal
def getSitesWithData(siteMapper,ddmIF,datasetName,storageToken=None):
    """Return, per cloud, the T1 endpoints and T2 sites holding a dataset.

    For each cloud the T1 endpoints from cloudconfig.tier1SE are matched
    against the replica map (optionally filtered by space token) and
    labelled with their tape attribute and completeness; T2 sites come
    from DataServiceUtils.getSitesWithDataset.

    :param siteMapper: site mapper providing cloudSpec/getCloud
    :param ddmIF: DDM interface (listDatasetsInContainer,
                  listDatasetReplicas, getSiteProperty,
                  checkDatasetConsistency)
    :param datasetName: dataset or container (trailing '/') name
    :param storageToken: optional space token to restrict T1 endpoints
    :return: (Interaction.SC_SUCCEEDED,
              {cloud: {'t1': {SE: {'tape': bool, 'state': str}},
                       't2': [siteName, ...]}}) on success, or
             (exceptionType, errorString) on DDM failure — callers must
             check the first element.
    """
    # get num of files
    # a container counts its constituent datasets so that completeness can
    # be judged against the whole container; a plain dataset counts as 1
    try:
        if not datasetName.endswith('/'):
            totalNumDatasets = 1
        else:
            tmpDsMap = ddmIF.listDatasetsInContainer(datasetName)
            totalNumDatasets = len(tmpDsMap)
    except:
        errtype,errvalue = sys.exc_info()[:2]
        # bug fix: the message previously blamed 'ddmIF.ddmIF.getFilesInDataset',
        # a call this block never makes
        return errtype,'ddmIF.listDatasetsInContainer failed with %s' % errvalue
    # get replicas
    try:
        replicaMap= {}
        replicaMap[datasetName] = ddmIF.listDatasetReplicas(datasetName)
    except:
        errtype,errvalue = sys.exc_info()[:2]
        return errtype,'ddmIF.listDatasetReplicas failed with %s' % errvalue
    # loop over all clouds
    retMap = {}
    for tmpCloudName in siteMapper.cloudSpec.keys():
        retMap[tmpCloudName] = {'t1':{},'t2':[]}
        # get T1 DDM endpoints
        tmpCloudSpec = siteMapper.getCloud(tmpCloudName)
        # FIXME until CERN-PROD_TZERO is added to cloudconfig.tier1SE
        if tmpCloudName == 'CERN':
            if not 'CERN-PROD_TZERO' in tmpCloudSpec['tier1SE']:
                tmpCloudSpec['tier1SE'].append('CERN-PROD_TZERO')
        for tmpSePat in tmpCloudSpec['tier1SE']:
            # tier1SE entries may contain '*' wildcards; translate to regexp
            if '*' in tmpSePat:
                tmpSePat = tmpSePat.replace('*','.*')
            tmpSePat = '^' + tmpSePat +'$'
            for tmpSE in replicaMap[datasetName].keys():
                # check name with regexp pattern
                if re.search(tmpSePat,tmpSE) == None:
                    continue
                # check space token
                if not storageToken in ['',None,'NULL']:
                    seStr = ddmIF.getSiteProperty(tmpSE,'srm')
                    # best-effort: malformed srm strings are ignored
                    try:
                        if seStr.split(':')[1] != storageToken:
                            continue
                    except:
                        pass
                # check archived metadata
                # FIXME
                pass
                # check tape attribute
                try:
                    tmpOnTape = ddmIF.getSiteProperty(tmpSE,'tape')
                except:
                    errtype,errvalue = sys.exc_info()[:2]
                    return errtype,'ddmIF.getSiteProperty for %s:tape failed with %s' % (tmpSE,errvalue)
                # check completeness
                # NOTE(review): assumes the last entry of the per-SE replica
                # list holds current statistics — confirm with the DDM I/F
                tmpStatistics = replicaMap[datasetName][tmpSE][-1]
                if tmpStatistics['found'] == None:
                    tmpDatasetStatus = 'unknown'
                    # refresh request
                    try:
                        ddmIF.checkDatasetConsistency(tmpSE,datasetName)
                    except:
                        pass
                elif tmpStatistics['total'] == tmpStatistics['found'] and tmpStatistics['total'] >= totalNumDatasets:
                    tmpDatasetStatus = 'complete'
                else:
                    tmpDatasetStatus = 'incomplete'
                # append
                retMap[tmpCloudName]['t1'][tmpSE] = {'tape':tmpOnTape,'state':tmpDatasetStatus}
        # get T2 list
        tmpSiteList = DataServiceUtils.getSitesWithDataset(datasetName,siteMapper,replicaMap,
                                                           tmpCloudName,useHomeCloud=True,
                                                           useOnlineSite=True,includeT1=False)
        # append
        retMap[tmpCloudName]['t2'] = tmpSiteList
        # remove if empty
        if len(retMap[tmpCloudName]['t1']) == 0 and len(retMap[tmpCloudName]['t2']) == 0:
            del retMap[tmpCloudName]
    # return
    return Interaction.SC_SUCCEEDED,retMap
def doFinalProcedure(self, taskSpec, tmpLog):
    """Apply DDM lifetime/cleanup policies when a task reaches a final state.

    Depending on taskSpec.status (or oldStatus for paused tasks):
    - done/finished: set a 14-day lifetime on transient output/log
      datasets; for merge tasks, set a 60-day lifetime on matching
      transient datasets of the parent task
    - done: delete empty output datasets
    - failed/broken/aborted: set a 30-day lifetime on log datasets

    :param taskSpec: task specification with datasetSpecList attached
    :param tmpLog: logger wrapper
    :return: self.SC_SUCCEEDED always
    """
    tmpLog.info('final procedure for status={0} processingType={1}'.format(
        taskSpec.status, taskSpec.processingType))
    if taskSpec.status in ['done','finished'] or \
            (taskSpec.status == 'paused' and taskSpec.oldStatus in ['done','finished']):
        # lifetimes in seconds
        trnLifeTime = 14 * 24 * 60 * 60
        trnLifeTimeLong = 28 * 24 * 60 * 60
        trnLifeTimeMerge = 60 * 24 * 60 * 60
        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
        # set lifetime to transient datasets
        metaData = {'lifetime': trnLifeTime}
        datasetTypeListI = set()   # data types seen on input datasets
        datasetTypeListO = set()   # data types seen on output datasets
        for datasetSpec in taskSpec.datasetSpecList:
            if datasetSpec.type in ['log', 'output']:
                if datasetSpec.getTransient() == True:
                    tmpLog.debug(
                        'set metadata={0} to datasetID={1}:Name={2}'.
                        format(str(metaData), datasetSpec.datasetID,
                               datasetSpec.datasetName))
                    for metadataName, metadaValue in metaData.iteritems():
                        ddmIF.setDatasetMetadata(datasetSpec.datasetName,
                                                 metadataName, metadaValue)
            # collect dataset types
            datasetType = DataServiceUtils.getDatasetType(
                datasetSpec.datasetName)
            if not datasetType in ['', None]:
                if datasetSpec.type == 'input':
                    datasetTypeListI.add(datasetType)
                elif datasetSpec.type == 'output':
                    datasetTypeListO.add(datasetType)
        # set lifetime to parent transient datasets
        if taskSpec.processingType in ['merge'] and \
                (taskSpec.status == 'done' or \
                     (taskSpec.status == 'paused' and taskSpec.oldStatus == 'done')):
            # get parent task
            if not taskSpec.parent_tid in [None, taskSpec.jediTaskID]:
                # get parent
                tmpStat, parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(
                    taskSpec.parent_tid, None, False)
                if tmpStat and parentTaskSpec != None:
                    # set lifetime to parent datasets if they are transient
                    for datasetSpec in parentTaskSpec.datasetSpecList:
                        if datasetSpec.type in ['output']:
                            # check dataset type
                            # skip parent datasets whose data type was not
                            # both consumed and produced by this merge task
                            datasetType = DataServiceUtils.getDatasetType(
                                datasetSpec.datasetName)
                            if not datasetType in datasetTypeListI or not datasetType in datasetTypeListO:
                                continue
                            metaData = {'lifetime': trnLifeTimeMerge}
                            tmpMetadata = ddmIF.getDatasetMetaData(
                                datasetSpec.datasetName)
                            if tmpMetadata['transient'] == True:
                                tmpLog.debug(
                                    'set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'
                                    .format(str(metaData),
                                            taskSpec.parent_tid,
                                            datasetSpec.datasetID,
                                            datasetSpec.datasetName))
                                for metadataName, metadaValue in metaData.iteritems(
                                ):
                                    ddmIF.setDatasetMetadata(
                                        datasetSpec.datasetName,
                                        metadataName, metadaValue)
    # delete empty datasets
    if taskSpec.status == 'done' or (taskSpec.status == 'paused' and taskSpec.oldStatus == 'done'):
        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
        # loop over all datasets
        for datasetSpec in taskSpec.datasetSpecList:
            # best-effort: deletion failures are logged, not fatal
            try:
                if datasetSpec.type == 'output' and datasetSpec.nFilesFinished == 0:
                    tmpStat = ddmIF.deleteDataset(datasetSpec.datasetName, True, True)
                    tmpLog.debug(
                        'delete empty prod dataset {0} with {1}'.format(
                            datasetSpec.datasetName, tmpStat))
            except:
                errtype, errvalue = sys.exc_info()[:2]
                tmpLog.warning(
                    'failed to delete empty dataset with {0}:{1}'.format(
                        errtype.__name__, errvalue))
    # set lifetime to failed datasets
    if taskSpec.status in ['failed', 'broken', 'aborted']:
        trnLifeTime = 30 * 24 * 60 * 60
        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
        # only log datasets
        metaData = {'lifetime': trnLifeTime}
        for datasetSpec in taskSpec.datasetSpecList:
            if datasetSpec.type in ['log']:
                tmpLog.debug(
                    'set metadata={0} to failed datasetID={1}:Name={2}'.
                    format(str(metaData), datasetSpec.datasetID,
                           datasetSpec.datasetName))
                for metadataName, metadaValue in metaData.iteritems():
                    ddmIF.setDatasetMetadata(datasetSpec.datasetName,
                                             metadataName, metadaValue)
    return self.SC_SUCCEEDED
def doRefine(self, jediTaskID, taskParamMap): # make logger tmpLog = self.tmpLog tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType)) try: # preprocessing tmpStat, taskParamMap = self.doPreProRefine(taskParamMap) if tmpStat == True: tmpLog.debug('done for preprocessing') return self.SC_SUCCEEDED if tmpStat == False: # failed tmpLog.error('doPreProRefine failed') return self.SC_FAILED # normal refine self.doBasicRefine(taskParamMap) # set nosplit+repeat for DBR for datasetSpec in self.inSecDatasetSpecList: # get the latest version of DBR if datasetSpec.datasetName == 'DBR_LATEST': tmpLog.debug('resolving real name for {0}'.format( datasetSpec.datasetName)) datasetSpec.datasetName = self.ddmIF.getInterface( self.taskSpec.vo).getLatestDBRelease( useResultCache=3600) datasetSpec.containerName = datasetSpec.datasetName # set attributes to DBR if DataServiceUtils.isDBR(datasetSpec.datasetName): datasetSpec.attributes = 'repeat,nosplit' # check invalid characters for datasetSpec in self.outDatasetSpecList: if not DataServiceUtils.checkInvalidCharacters( datasetSpec.datasetName): errStr = "invalid characters in {0}".format( datasetSpec.datasetName) tmpLog.error(errStr) self.taskSpec.setErrDiag(errStr, None) return self.SC_FATAL # destination if taskParamMap.has_key('destination'): for datasetSpec in self.outDatasetSpecList: datasetSpec.destination = taskParamMap['destination'] # use build if taskParamMap.has_key('buildSpec'): self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['useBuild']) # use template dataset self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken['instantiateTmpl']) self.setSplitRule( None, 1, JediTaskSpec.splitRuleToken['instantiateTmplSite']) for datasetSpec in self.outDatasetSpecList: datasetSpec.type = "tmpl_{0}".format(datasetSpec.type) # get jobsetID tmpStat, tmpJobID = self.taskBufferIF.getUserJobsetID_JEDI( self.taskSpec.userName) if not tmpStat: tmpLog.error('failed to get jobsetID failed') return self.SC_FAILED 
self.taskSpec.reqID = tmpJobID # site limitation if 'excludedSite' in taskParamMap and 'includedSite' in taskParamMap: self.taskSpec.setLimitedSites('incexc') elif 'excludedSite' in taskParamMap: self.taskSpec.setLimitedSites('exc') elif 'includedSite' in taskParamMap: self.taskSpec.setLimitedSites('inc') except: errtype, errvalue = sys.exc_info()[:2] errStr = 'doRefine failed with {0}:{1}'.format( errtype.__name__, errvalue) tmpLog.error(errStr) self.taskSpec.setErrDiag(errStr, None) raise errtype, errvalue tmpLog.debug('done') return self.SC_SUCCEEDED
def doRefine(self,jediTaskID,taskParamMap):
    """Refine task parameters for production tasks.

    Performs the basic refine, marks DBRs as repeat/nosplit, enables
    consistency checking on master input datasets when a parent task
    exists, appends an attempt number to output filename templates,
    enables throttling and dataset registration, and — for merge tasks —
    sets transient=True on parent datasets whose data type matches both
    this task's inputs and outputs.

    :param jediTaskID: the JEDI task ID (unused directly; state is on self)
    :param taskParamMap: task parameter dictionary
    :return: self.SC_SUCCEEDED on success
    :raises: re-raises any exception from the refine steps after logging
    """
    # make logger
    tmpLog = self.tmpLog
    tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
    try:
        # basic refine
        self.doBasicRefine(taskParamMap)
        # set nosplit+repeat for DBR
        for datasetSpec in self.inSecDatasetSpecList:
            if DataServiceUtils.isDBR(datasetSpec.datasetName):
                datasetSpec.attributes = 'repeat,nosplit'
        # enable consistency check
        # only when the task has a real parent (parent_tid set and different)
        if not self.taskSpec.parent_tid in [None,self.taskSpec.jediTaskID]:
            for datasetSpec in self.inMasterDatasetSpec:
                if datasetSpec.isMaster() and datasetSpec.type == 'input':
                    datasetSpec.enableCheckConsistency()
        # append attempt number
        # add '.1' unless the template already ends with a serial number
        # or the pre-merge suffix '.panda.um'
        for tmpKey,tmpOutTemplateMapList in self.outputTemplateMap.iteritems():
            for tmpOutTemplateMap in tmpOutTemplateMapList:
                outFileTemplate = tmpOutTemplateMap['filenameTemplate']
                if re.search('\.\d+$',outFileTemplate) == None and not outFileTemplate.endswith('.panda.um'):
                    tmpOutTemplateMap['filenameTemplate'] = outFileTemplate + '.1'
        # extract input datatype
        datasetTypeListIn = []
        for datasetSpec in self.inMasterDatasetSpec+self.inSecDatasetSpecList:
            datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
            if not datasetType in ['',None]:
                datasetTypeListIn.append(datasetType)
        # extract datatype and set destination if nessesary
        datasetTypeList = []
        for datasetSpec in self.outDatasetSpecList:
            datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
            if not datasetType in ['',None]:
                datasetTypeList.append(datasetType)
        # set numThrottled to use the task throttling mechanism
        if not 'noThrottle' in taskParamMap:
            self.taskSpec.numThrottled = 0
        # set to register datasets
        self.taskSpec.setToRegisterDatasets()
        # set transient to parent datasets
        if self.taskSpec.processingType in ['merge'] and not self.taskSpec.parent_tid in [None,self.taskSpec.jediTaskID]:
            # get parent
            tmpStat,parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(self.taskSpec.parent_tid,None,False)
            if tmpStat and parentTaskSpec != None:
                # set transient to parent datasets
                metaData = {'transient':True}
                for datasetSpec in parentTaskSpec.datasetSpecList:
                    if datasetSpec.type in ['log','output']:
                        datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
                        # only parent datasets whose type this merge task
                        # both reads and writes become transient
                        if not datasetType in ['',None] and datasetType in datasetTypeList and datasetType in datasetTypeListIn:
                            tmpLog.info('set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'.format(str(metaData),
                                                                                                                  self.taskSpec.parent_tid,
                                                                                                                  datasetSpec.datasetID,
                                                                                                                  datasetSpec.datasetName))
                            for metadataName,metadaValue in metaData.iteritems():
                                self.ddmIF.getInterface(self.taskSpec.vo).setDatasetMetadata(datasetSpec.datasetName,
                                                                                             metadataName,metadaValue)
    except:
        # NOTE(review): the message names doBasicRefine but the try block
        # covers the whole refine — kept as-is here
        errtype,errvalue = sys.exc_info()[:2]
        tmpLog.error('doBasicRefine failed with {0}:{1}'.format(errtype.__name__,errvalue))
        raise errtype,errvalue
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def doActionForReassgin(self, gTmpLog):
    """Reassign tasks flagged for reassignment to a new cloud/nucleus.

    For each task from getTasksToReassign_JEDI: update cloud/nucleus
    bookkeeping (CloudTasks for legacy clouds, nucleus assignment for
    world-cloud tasks, or re-trigger brokerage when no nucleus is set),
    subscribe all output/log datasets to the target T1/nucleus endpoint,
    and on success restore the task to an active status.

    :param gTmpLog: global logger for messages not tied to one task
    :return: None; progress and failures are reported via logging
    """
    # get DDM I/F
    ddmIF = self.ddmIF.getInterface(self.vo)
    # get site mapper
    siteMapper = self.taskBufferIF.getSiteMapper()
    # get tasks to get reassigned
    taskList = self.taskBufferIF.getTasksToReassign_JEDI(
        self.vo, self.prodSourceLabel)
    gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
    for taskSpec in taskList:
        # bug fix: log prefix was missing the closing '>'
        tmpLog = MsgWrapper(logger,
                            '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
        tmpLog.debug('start to reassign')
        # DDM backend
        ddmBackEnd = taskSpec.getDdmBackEnd()
        # get datasets
        tmpStat, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
            taskSpec.jediTaskID, ['output', 'log'])
        if tmpStat != True:
            tmpLog.error('failed to get datasets')
            continue
        # update DB
        if not taskSpec.useWorldCloud():
            # update cloudtasks
            tmpStat = self.taskBufferIF.setCloudTaskByUser(
                'jedi', taskSpec.jediTaskID, taskSpec.cloud, 'assigned',
                True)
            if tmpStat != 'SUCCEEDED':
                tmpLog.error('failed to update CloudTasks')
                continue
            # check cloud
            if not siteMapper.checkCloud(taskSpec.cloud):
                tmpLog.error("cloud={0} doesn't exist".format(
                    taskSpec.cloud))
                continue
        else:
            # re-run task brokerage
            if taskSpec.nucleus in [None, '']:
                taskSpec.status = 'assigning'
                taskSpec.oldStatus = None
                taskSpec.setToRegisterDatasets()
                self.taskBufferIF.updateTask_JEDI(
                    taskSpec, {'jediTaskID': taskSpec.jediTaskID},
                    setOldModTime=True)
                tmpLog.debug(
                    'set task_status={0} to trigger task brokerage again'.
                    format(taskSpec.status))
                continue
            # get nucleus
            nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
            if nucleusSpec == None:
                tmpLog.error("nucleus={0} doesn't exist".format(
                    taskSpec.nucleus))
                continue
            # set nucleus
            retMap = {
                taskSpec.jediTaskID:
                AtlasBrokerUtils.getDictToSetNucleus(
                    nucleusSpec, datasetSpecList)
            }
            tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
        # get T1/nucleus
        if not taskSpec.useWorldCloud():
            t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
        else:
            t1SiteName = nucleusSpec.getOnePandaSite()
        t1Site = siteMapper.getSite(t1SiteName)
        # loop over all datasets
        isOK = True
        for datasetSpec in datasetSpecList:
            tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
            # distributed datasets keep their existing rules
            if DataServiceUtils.getDistributedDestination(
                    datasetSpec.storageToken) != None:
                tmpLog.debug('skip {0} is distributed'.format(
                    datasetSpec.datasetName))
                continue
            # get location
            location = siteMapper.getDdmEndpoint(t1Site.sitename,
                                                 datasetSpec.storageToken)
            # make subscription
            try:
                tmpLog.debug(
                    'registering subscription to {0} with backend={1}'.
                    format(location, ddmBackEnd))
                tmpStat = ddmIF.registerDatasetSubscription(
                    datasetSpec.datasetName,
                    location,
                    'Production Output',
                    asynchronous=True)
                if tmpStat != True:
                    tmpLog.error("failed to make subscription")
                    isOK = False
                    break
            except:
                errtype, errvalue = sys.exc_info()[:2]
                tmpLog.warning(
                    'failed to make subscription with {0}:{1}'.format(
                        errtype.__name__, errvalue))
                isOK = False
                break
        # succeeded
        if isOK:
            # activate task
            if taskSpec.oldStatus in ['assigning', 'exhausted', None]:
                taskSpec.status = 'ready'
            else:
                taskSpec.status = taskSpec.oldStatus
            taskSpec.oldStatus = None
            self.taskBufferIF.updateTask_JEDI(
                taskSpec, {'jediTaskID': taskSpec.jediTaskID},
                setOldModTime=True)
            tmpLog.debug('finished to reassign')
def doRefine(self, jediTaskID, taskParamMap):
    """Refine task parameters, resolving destination storage endpoints.

    Performs the basic refine, marks DBRs as repeat/nosplit, enables
    consistency checking on master inputs when a parent exists, appends an
    attempt number to output templates, resolves a destination site for
    output datasets whose storage token maps to a destination SE, enables
    throttling and dataset registration, and — for merge tasks — sets
    transient=True on matching parent datasets.

    :param jediTaskID: the JEDI task ID (unused directly; state is on self)
    :param taskParamMap: task parameter dictionary
    :return: self.SC_SUCCEEDED on success
    :raises: re-raises any exception from the refine steps after logging
             (including RuntimeError when no site serves the destination SE)
    """
    # make logger
    tmpLog = self.tmpLog
    tmpLog.debug("start taskType={0}".format(self.taskSpec.taskType))
    try:
        self.doBasicRefine(taskParamMap)
        # set nosplit+repeat for DBR
        for datasetSpec in self.inSecDatasetSpecList:
            if DataServiceUtils.isDBR(datasetSpec.datasetName):
                datasetSpec.attributes = "repeat,nosplit"
        # enable consistency check
        # only when the task has a real parent (parent_tid set and different)
        if not self.taskSpec.parent_tid in [None, self.taskSpec.jediTaskID]:
            for datasetSpec in self.inMasterDatasetSpec:
                if datasetSpec.isMaster() and datasetSpec.type == "input":
                    datasetSpec.enableCheckConsistency()
        # append attempt number
        # add '.1' unless the template already ends with a serial number
        # or the pre-merge suffix '.panda.um'
        for tmpKey, tmpOutTemplateMapList in self.outputTemplateMap.iteritems():
            for tmpOutTemplateMap in tmpOutTemplateMapList:
                outFileTemplate = tmpOutTemplateMap["filenameTemplate"]
                if re.search("\.\d+$", outFileTemplate) == None and not outFileTemplate.endswith(".panda.um"):
                    tmpOutTemplateMap["filenameTemplate"] = outFileTemplate + ".1"
        # extract datatype and set destination if nessesary
        datasetTypeList = []
        for datasetSpec in self.outDatasetSpecList:
            datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
            # NOTE(review): unlike the sibling doRefine, only '' is filtered
            # here (not None) — confirm getDatasetType cannot return None
            if datasetType != "":
                datasetTypeList.append(datasetType)
            storageToken = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
            if storageToken != None:
                tmpSiteList = self.ddmIF.getInterface(self.taskSpec.vo).getSitesWithEndPoint(
                    storageToken, self.siteMapper, "production"
                )
                if tmpSiteList == []:
                    raise RuntimeError, "cannot find online siteID associated to {0}".format(storageToken)
                datasetSpec.destination = tmpSiteList[0]
        # set numThrottled to use the task throttling mechanism
        if not "noThrottle" in taskParamMap:
            self.taskSpec.numThrottled = 0
        # set to register datasets
        self.taskSpec.setToRegisterDatasets()
        # set transient to parent datasets
        if self.taskSpec.processingType in ["merge"] and not self.taskSpec.parent_tid in [
            None,
            self.taskSpec.jediTaskID,
        ]:
            # get parent
            tmpStat, parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(
                self.taskSpec.parent_tid, None, False
            )
            if tmpStat and parentTaskSpec != None:
                # set transient to parent datasets
                metaData = {"transient": True}
                for datasetSpec in parentTaskSpec.datasetSpecList:
                    if datasetSpec.type in ["log", "output"]:
                        datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
                        # only parent datasets whose type this task also
                        # produces become transient
                        if datasetType != "" and datasetType in datasetTypeList:
                            tmpLog.info(
                                "set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}".format(
                                    str(metaData),
                                    self.taskSpec.parent_tid,
                                    datasetSpec.datasetID,
                                    datasetSpec.datasetName,
                                )
                            )
                            for metadataName, metadaValue in metaData.iteritems():
                                self.ddmIF.getInterface(self.taskSpec.vo).setDatasetMetadata(
                                    datasetSpec.datasetName, metadataName, metadaValue
                                )
    except:
        # NOTE(review): the message names doBasicRefine but the try block
        # covers the whole refine — kept as-is here
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error("doBasicRefine failed with {0}:{1}".format(errtype.__name__, errvalue))
        raise errtype, errvalue
    tmpLog.debug("done")
    return self.SC_SUCCEEDED
def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap): # make logger tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID)) tmpLog.debug('start') # return for failure retFatal = self.SC_FATAL,inputChunk retTmpError = self.SC_FAILED,inputChunk # get sites in the cloud sitePreAssigned = False if not taskSpec.site in ['',None]: sitePreAssigned = True scanSiteList = [taskSpec.site] tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site)) elif inputChunk.getPreassignedSite() != None: sitePreAssigned = True scanSiteList = [inputChunk.getPreassignedSite()] tmpLog.debug('site={0} is pre-assigned in masterDS'.format(inputChunk.getPreassignedSite())) else: scanSiteList = self.siteMapper.getCloud(cloudName)['sites'] tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList))) # get job statistics tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel) if not tmpSt: tmpLog.error('failed to get job statistics') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError # T1 t1Sites = [self.siteMapper.getCloud(cloudName)['source']] # hospital sites if self.hospitalQueueMap.has_key(cloudName): t1Sites += self.hospitalQueueMap[cloudName] # MP if taskSpec.coreCount != None and taskSpec.coreCount > 1: # use MCORE only useMP = 'only' elif taskSpec.coreCount == 0: # use MCORE and normal useMP = 'any' else: # not use MCORE useMP = 'unuse' ###################################### # selection for status if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check site status skipFlag = False if tmpSiteSpec.status != 'online': skipFlag = True if not skipFlag: newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList))) if 
scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for reprocessing if taskSpec.processingType == 'reprocessing': newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check schedconfig.validatedreleases if tmpSiteSpec.validatedreleases == ['True']: newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip %s due to validatedreleases != True' % tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for high priorities if (taskSpec.currentPriority >= 950 or inputChunk.useScout()) and useMP != 'only' and not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: if tmpSiteName in t1Sites: newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip {0} due to highPrio/scouts which needs to run at {1} T1'.format(tmpSiteName, cloudName)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for highPrio/scouts'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for data availability if not sitePreAssigned: for datasetSpec in inputChunk.getDatasets(): datasetName = datasetSpec.datasetName # ignore DBR if DataServiceUtils.isDBR(datasetName): continue if not self.dataSiteMap.has_key(datasetName): # get the list of sites where data is available tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName)) tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper, self.ddmIF,datasetName, datasetSpec.storageToken) if tmpSt == 
self.SC_FAILED: tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError if tmpSt == self.SC_FATAL: tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retFatal # append self.dataSiteMap[datasetName] = tmpRet tmpLog.debug('map of data availability : {0}'.format(str(tmpRet))) # check if T1 has the data if self.dataSiteMap[datasetName].has_key(cloudName): cloudHasData = True else: cloudHasData = False t1hasData = False if cloudHasData: for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems(): if tmpSeVal['state'] == 'complete': t1hasData = True break # T1 has incomplete data while no data at T2 if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] == []: # use incomplete data at T1 anyway t1hasData = True # data is missing at T1 if not t1hasData: tmpLog.debug('{0} is unavailable at T1. 
scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName)) # make subscription to T1 # FIXME pass # use T2 until data is complete at T1 newScanSiteList = [] for tmpSiteName in scanSiteList: if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']: newScanSiteList.append(tmpSiteName) else: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) if tmpSiteSpec.cloud != cloudName: tmpLog.debug(' skip %s due to foreign T2' % tmpSiteName) else: tmpLog.debug(' skip %s due to missing data at T2' % tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for fairshare newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,tmpLog): tmpLog.debug(' skip {0} due to zero share'.format(tmpSiteName)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for I/O intensive tasks # FIXME pass ###################################### # selection for MP if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \ (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]): newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip %s due to core mismatch site:%s != task:%s' % \ 
(tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for release if taskSpec.transHome != None: if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None: # only cache is checked for normal tasks siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList, caches=taskSpec.transHome, cmtConfig=taskSpec.architecture) else: # nightlies siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList, releases='CVMFS') # releases='nightlies', # cmtConfig=taskSpec.architecture) newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # release check is disabled or release is available if tmpSiteSpec.releases == ['ANY'] or \ tmpSiteSpec.cloud in ['ND'] or \ tmpSiteName in ['CERN-RELEASE']: newScanSiteList.append(tmpSiteName) elif tmpSiteName in siteListWithSW: newScanSiteList.append(tmpSiteName) else: # release is unavailable tmpLog.debug(' skip %s due to missing rel/cache %s:%s' % \ (tmpSiteName,taskSpec.transHome,taskSpec.architecture)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList), taskSpec.transHome, taskSpec.architecture)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for memory minRamCount = taskSpec.ramCount if not minRamCount in [0,None]: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory: tmpLog.debug(' skip {0} due to site RAM 
shortage={1}(site upper limit) < {2}'.format(tmpSiteName, tmpSiteSpec.maxmemory, minRamCount)) continue if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory: tmpLog.debug(' skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName, tmpSiteSpec.minmemory, minRamCount)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList), minRamCount,taskSpec.ramUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for scratch disk minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(effectiveSize=True) \ + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize() minDiskCount = minDiskCount / 1024 / 1024 newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir: tmpLog.debug(' skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for available space in SE newScanSiteList = [] for tmpSiteName in scanSiteList: # don't check for T1 if tmpSiteName in t1Sites: pass else: # check at the site tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # the number of jobs which will produce outputs nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \ 
AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running') # the size of input files which will be copied to the site movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName) if movingInputSize == None: tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError # free space - inputs - outputs(250MB*nJobs) must be >= 200GB outSizePerJob = 0.250 diskThreshold = 200 tmpSpaceSize = tmpSiteSpec.space - movingInputSize - nRemJobs * outSizePerJob if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold: tmpLog.debug(' skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteSpec.space, movingInputSize,outSizePerJob, nRemJobs,diskThreshold)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for walltime minWalltime = taskSpec.walltime * inputChunk.getMaxAtomSize(effectiveSize=True) if not minWalltime in [0,None]: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime: tmpLog.debug(' skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime)) continue if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime: tmpLog.debug(' skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName, tmpSiteSpec.mintime, minWalltime)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed walltime check ={1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit)) if 
scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for transferring newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # limit def_maxTransferring = 2000 if tmpSiteSpec.transferringlimit == 0: # use default value maxTransferring = def_maxTransferring else: maxTransferring = tmpSiteSpec.transferringlimit # check at the site nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName) nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName) if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']: tmpLog.debug(' skip %s due to too many transferring %s > max(%s,2x%s)' % \ (tmpSiteName,nTraJobs,def_maxTransferring,nRunJobs)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for T1 weight t1Weight = taskSpec.getT1Weight() if t1Weight == 0: # use T1 weight in cloudconfig t1Weight = self.siteMapper.getCloud(cloudName)['weight'] tmpLog.debug('T1 weight {0}'.format(t1Weight)) if t1Weight < 0: newScanSiteList = [] for tmpSiteName in scanSiteList: if not tmpSiteName in t1Sites: tmpLog.debug(' skip {0} due to negative T1 weight'.format(tmpSiteName)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed T1 weight check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for nPilot if not sitePreAssigned: nWNmap = 
self.taskBufferIF.getCurrentSiteData() newScanSiteList = [] for tmpSiteName in scanSiteList: # check at the site nPilot = 0 if nWNmap.has_key(tmpSiteName): nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob'] if nPilot == 0 and not 'test' in taskSpec.prodSourceLabel: tmpLog.debug(' skip %s due to no pilot' % tmpSiteName) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # get available files normalizeFactors = {} availableFileMap = {} for datasetSpec in inputChunk.getDatasets(): try: # mapping between sites and storage endpoints siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper) # disable file lookup for merge jobs if inputChunk.isMerging or not datasetSpec.isMaster(): checkCompleteness = False else: checkCompleteness = True # get available files per site/endpoint tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec, siteStorageEP, self.siteMapper, ngGroup=[1], checkCompleteness=checkCompleteness, storageToken=datasetSpec.storageToken) if tmpAvFileMap == None: raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed' availableFileMap[datasetSpec.datasetName] = tmpAvFileMap except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError # loop over all sites to get the size of available files for tmpSiteName in scanSiteList: if not normalizeFactors.has_key(tmpSiteName): normalizeFactors[tmpSiteName] = 0 # get the total size of available files if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName): availableFiles = 
availableFileMap[datasetSpec.datasetName][tmpSiteName] for tmpFileSpec in \ availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']: normalizeFactors[tmpSiteName] += tmpFileSpec.fsize # get max total size tmpTotalSizes = normalizeFactors.values() tmpTotalSizes.sort() if tmpTotalSizes != []: totalSize = tmpTotalSizes.pop() else: totalSize = 0 ###################################### # calculate weight tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo, taskSpec.prodSourceLabel) if not tmpSt: tmpLog.error('failed to get job statistics with priority') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError tmpLog.debug('final {0} candidates'.format(len(scanSiteList))) weightMap = {} for tmpSiteName in scanSiteList: nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',cloudName,taskSpec.workQueue_ID) nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',cloudName,taskSpec.workQueue_ID) nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',cloudName,taskSpec.workQueue_ID) weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1) # normalize weights by taking data availability into account if totalSize != 0: weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize) # make candidate siteCandidateSpec = SiteCandidate(tmpSiteName) # T1 weight if tmpSiteName in t1Sites: weight *= t1Weight # set weight siteCandidateSpec.weight = weight # set available files for tmpDatasetName,availableFiles in availableFileMap.iteritems(): if availableFiles.has_key(tmpSiteName): siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['localdisk'] siteCandidateSpec.localTapeFiles += availableFiles[tmpSiteName]['localtape'] siteCandidateSpec.cacheFiles += availableFiles[tmpSiteName]['cache'] siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote'] # append 
inputChunk.addSiteCandidate(siteCandidateSpec) tmpLog.debug(' use {0} with weight={1}'.format(tmpSiteName,weight)) # return tmpLog.debug('done') return self.SC_SUCCEEDED,inputChunk
def getAnalSitesWithData(siteList,siteMapper,ddmIF,datasetName):
    """Map candidate analysis sites to DDM endpoints which hold the dataset.

    For every site in siteList, each associated DDM endpoint (the site's main
    endpoint plus all space tokens) is checked against the dataset's replica
    locations. Endpoints sharing the same name prefix are checked only once.

    :param siteList: list of site names to scan
    :param siteMapper: site mapper used to resolve site specs
    :param ddmIF: DDM interface providing listDatasetReplicas/getSiteProperty
    :param datasetName: name of the dataset to locate
    :return: (Interaction.SC_SUCCEEDED,
              {siteName: {endpoint: {'tape': bool, 'state': str}}}) on success,
             or (exceptionType, errorString) when the replica lookup fails.
             'state' is one of 'complete', 'incomplete', 'unknown'.
    """
    # one replica lookup up front; report any failure back to the caller
    try:
        replicaMap = {datasetName: ddmIF.listDatasetReplicas(datasetName)}
    except:
        errtype,errvalue = sys.exc_info()[:2]
        return errtype,'ddmIF.listDatasetReplicas failed with %s' % errvalue
    retMap = {}
    for siteName in siteList:
        siteSpec = siteMapper.getSite(siteName)
        # prefixes already handled for this site, to avoid duplicate scans
        donePrefixes = []
        for endPointName in [siteSpec.ddm] + siteSpec.setokens.values():
            # ignore unset endpoints
            if endPointName == '':
                continue
            # collapse the endpoint name to its prefix by dropping the last token
            prefix = re.sub('_[^_]+$','_',endPointName)
            if prefix in donePrefixes:
                continue
            # cached files (e.g. DBR) are treated as complete disk replicas
            # without querying DDM
            if DataServiceUtils.isCachedFile(datasetName,siteSpec):
                retMap.setdefault(siteName,{})[endPointName] = {'tape':False,'state':'complete'}
                donePrefixes.append(prefix)
                continue
            donePrefixes.append(prefix)
            # match storage elements whose name starts with the prefix
            sePattern = '^' + prefix
            for seName in replicaMap[datasetName].keys():
                if re.search(sePattern,seName) == None:
                    continue
                # skip staging areas
                if re.search('STAGING$',seName) != None:
                    continue
                # check archived metadata
                # FIXME
                pass
                # tape attribute; endpoints without the property are skipped
                try:
                    onTape = ddmIF.getSiteProperty(seName,'is_tape')
                except:
                    continue
                # completeness from the latest replica statistics entry
                stats = replicaMap[datasetName][seName][-1]
                if stats['found'] == None:
                    # file count unknown; a refresh request could be made here
                    state = 'unknown'
                elif stats['total'] == stats['found']:
                    state = 'complete'
                else:
                    state = 'incomplete'
                retMap.setdefault(siteName,{})[seName] = {'tape':onTape,'state':state}
    return Interaction.SC_SUCCEEDED,retMap
def run(self):
    """Check output-file transfer status for the held jobs in self.ids.

    Jobs whose output replicas are all registered are marked ready and passed
    to Finisher; jobs past a priority-dependent transfer timeout are failed;
    the rest are left waiting. Uses module-level taskBuffer, siteMapper,
    rucioAPI and _logger; self.lock/self.proxyLock guard thread/DB access.
    """
    self.lock.acquire()
    try:
        # get jobs from DB
        ids = self.ids
        self.proxyLock.acquire()
        jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
        self.proxyLock.release()
        upJobs = []   # jobs to write back to the DB
        finJobs = []  # jobs ready to be finished
        for job in jobs:
            if job is None or job.jobStatus == 'unknown':
                continue
            seList = ['dummy']
            tmpNucleus = siteMapper.getNucleus(job.nucleus)
            # get SEs
            if job.prodSourceLabel == 'user' and job.destinationSE not in siteMapper.siteSpecList:
                # using --destSE for analysis job to transfer output
                seList = [job.destinationSE]
            elif tmpNucleus is not None:
                seList = list(tmpNucleus.allDdmEndPoints)
            elif siteMapper.checkCloud(job.cloud):
                # normal production jobs
                if DataServiceUtils.checkJobDestinationSE(job) is None:
                    tmpDstID = siteMapper.getCloud(job.cloud)['dest']
                else:
                    tmpDstID = job.destinationSE
                tmpDstSite = siteMapper.getSite(tmpDstID)
                scope_input, scope_output = select_scope(tmpDstSite, job.prodSourceLabel)
                seList = tmpDstSite.ddm_endpoints_output[scope_output].getLocalEndPoints()
            # get LFN list
            lfns = []
            guids = []
            scopes = []
            nTokens = 0  # number of expected destination tokens over all files
            for file in job.Files:
                # only output files are checked
                if file.type == 'output' or file.type == 'log':
                    if file.status == 'nooutput':
                        continue
                    # distributed destinations are not checked here
                    if DataServiceUtils.getDistributedDestination(file.destinationDBlockToken) is not None:
                        continue
                    lfns.append(file.lfn)
                    guids.append(file.GUID)
                    scopes.append(file.scope)
                    nTokens += len(file.destinationDBlockToken.split(','))
            # get files in LRC
            _logger.debug("%s Cloud:%s" % (job.PandaID,job.cloud))
            tmpStat,okFiles = rucioAPI.listFileReplicas(scopes,lfns,seList)
            if not tmpStat:
                # best-effort: treat lookup failure as "no replicas found yet"
                _logger.error("%s failed to get file replicas" % job.PandaID)
                okFiles = {}
            # count files
            nOkTokens = 0
            for okLFN in okFiles:
                okSEs = okFiles[okLFN]
                nOkTokens += len(okSEs)
            # check all files are ready
            _logger.debug("%s nToken:%s nOkToken:%s" % (job.PandaID,nTokens,nOkTokens))
            if nTokens <= nOkTokens:
                # every expected replica is present -> finish the job
                _logger.debug("%s Finisher : Finish" % job.PandaID)
                for file in job.Files:
                    if file.type == 'output' or file.type == 'log':
                        if file.status != 'nooutput':
                            file.status = 'ready'
                # append to run Finisher
                finJobs.append(job)
            else:
                endTime = job.endTime
                if endTime == 'NULL':
                    endTime = job.startTime
                # priority-dependent timeout (days), taken from cloud config
                tmpCloudSpec = siteMapper.getCloud(job.cloud)
                if job.currentPriority >= 800 and (job.prodSourceLabel not in ['user']):
                    if 'transtimehi' in tmpCloudSpec:
                        timeOutValue = tmpCloudSpec['transtimehi']
                    else:
                        timeOutValue = 1
                else:
                    if 'transtimelo' in tmpCloudSpec:
                        timeOutValue = tmpCloudSpec['transtimelo']
                    else:
                        timeOutValue = 2
                # protection: never less than one day
                if timeOutValue < 1:
                    timeOutValue = 1
                timeOut = self.timeNow - datetime.timedelta(days=timeOutValue)
                _logger.debug("%s Priority:%s Limit:%s End:%s" % (job.PandaID,job.currentPriority,str(timeOut),str(endTime)))
                if endTime < timeOut:
                    # timeout: fail the job and record the missing LFNs
                    _logger.debug("%s Finisher : Kill" % job.PandaID)
                    strMiss = ''
                    for lfn in lfns:
                        if lfn not in okFiles:
                            strMiss += ' %s' % lfn
                    job.jobStatus = 'failed'
                    job.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_Transfer
                    job.taskBufferErrorDiag = 'transfer timeout for '+strMiss
                    guidMap = {}
                    for file in job.Files:
                        # set file status
                        if file.status == 'transferring' or file.type in ['log','output']:
                            file.status = 'failed'
                        # collect GUIDs to delete files from _tid datasets
                        # NOTE(review): guidMap is only built here; presumably
                        # consumed elsewhere/later — confirm before removing
                        if file.type == 'output' or file.type == 'log':
                            if file.destinationDBlock not in guidMap:
                                guidMap[file.destinationDBlock] = []
                            guidMap[file.destinationDBlock].append(file.GUID)
                else:
                    # wait: not yet timed out, log what is still missing
                    _logger.debug("%s Finisher : Wait" % job.PandaID)
                    for lfn in lfns:
                        if lfn not in okFiles:
                            _logger.debug("%s -> %s" % (job.PandaID,lfn))
            upJobs.append(job)
        # update
        _logger.debug("updating ...")
        self.proxyLock.acquire()
        taskBuffer.updateJobs(upJobs,False)
        self.proxyLock.release()
        # run Finisher sequentially for each completed job
        for job in finJobs:
            fThr = Finisher(taskBuffer,None,job)
            fThr.start()
            fThr.join()
        _logger.debug("done")
        time.sleep(1)
    except Exception:
        errtype,errvalue = sys.exc_info()[:2]
        errStr = "FinisherThr failed with %s %s" % (errtype,errvalue)
        errStr += traceback.format_exc()
        _logger.error(errStr)
    # remove this worker from the pool and release the lock even on failure
    # (the broad except above guarantees we reach this point)
    self.pool.remove(self)
    self.lock.release()
# print '------------------- ddm -------------------' # print 'ddm_input: {0}, ddm_output: {1}'.format(tmp_site_spec.ddm_input, tmp_site_spec.ddm_output) # print '------------------- setokens values -------------------' # print 'setokens_input: {0}, setokens_output: {1}'.format(tmp_site_spec.setokens_input.values(), # tmp_site_spec.setokens_output.values()) # print '------------------- setokens -------------------' # print 'setokens_input: {0}, setokens_output: {1}'.format(tmp_site_spec.setokens_input, # tmp_site_spec.setokens_output) from pandaserver.dataservice import DataServiceUtils sites = site_mapper.getCloud('WORLD')['sites'] sites.sort() for tmp_site_name in sites: print 'tmp_site_name: {0}'.format(tmp_site_name) tmp_site_spec = site_mapper.getSite(tmp_site_name) #print 'tmp_site_spec.ddm_input: {0}'.format(tmp_site_spec.ddm_input) #print 'tmp_site_spec.setokens_input: {0}'.format(tmp_site_spec.setokens_input.values()) #print 'combination: {0}'.format([tmp_site_spec.ddm_input] + tmp_site_spec.setokens_input.values()) for tmp_ddm_endpoint in [tmp_site_spec.ddm_input ] + tmp_site_spec.setokens_input.values(): try: tmp_prefix = DataServiceUtils.getDQ2Prefix(tmp_ddm_endpoint) print 'prefix: {0}'.format(tmp_prefix) except TypeError: print 'excepted!' print '-------------------'
def runImpl(self):
    """Worker loop of the WORLD-cloud production task broker.

    Pulls (taskSpec, inputChunk) pairs from self.inputList and selects a
    nucleus for each task by successively filtering the candidate nuclei
    (status, transfer backlog, endpoint space, ability to run jobs, data
    locality), then performing a weighted random choice. The chosen nucleus
    is written back via the task buffer and the shared RW accounting
    (self.prioRW) is updated. Returns when the input queue is exhausted.
    """
    # cutoff for disk in TB
    diskThreshold = self.taskBufferIF.getConfigValue(self.msgType, 'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 'jedi', 'atlas')
    if diskThreshold is None:
        diskThreshold = 100 * 1024
    # dataset type to ignore file availability check
    datasetTypeToSkipCheck = ['log']
    # thresholds for data availability check (size in GB, converted to bytes)
    thrInputSize = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
    if thrInputSize is None:
        thrInputSize = 1
    thrInputSize *= 1024*1024*1024
    thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas')
    if thrInputNum is None:
        thrInputNum = 100
    # fractions are configured in percent and converted to ratios
    thrInputSizeFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
    if thrInputSizeFrac is None:
        thrInputSizeFrac = 10
    thrInputSizeFrac = float(thrInputSizeFrac) / 100
    thrInputNumFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
    if thrInputNumFrac is None:
        thrInputNumFrac = 10
    thrInputNumFrac = float(thrInputNumFrac) / 100
    # RW below this cutoff does not reduce the weight
    cutOffRW = 50
    # penalty factor when input is only available on tape
    negWeightTape = 0.001
    # main
    lastJediTaskID = None
    siteMapper = self.taskBufferIF.getSiteMapper()
    while True:
        try:
            taskInputList = self.inputList.get(1)
            # no more datasets
            if len(taskInputList) == 0:
                self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__, self.numTasks))
                return
            # loop over all tasks
            for taskSpec,inputChunk in taskInputList:
                lastJediTaskID = taskSpec.jediTaskID
                # make logger
                tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                tmpLog.debug('start')
                tmpLog.info('thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'.format(thrInputSize,
                                                                                                              thrInputNum,
                                                                                                              thrInputSizeFrac,
                                                                                                              thrInputNumFrac))
                # RW
                taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                # get nuclei; if the task already has a valid nucleus, keep it
                nucleusList = siteMapper.nuclei
                if taskSpec.nucleus in nucleusList:
                    candidateNucleus = taskSpec.nucleus
                else:
                    tmpLog.info('got {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # check status
                    newNucleusList = {}
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if not tmpNucleusSpec.state in ['ACTIVE']:
                            tmpLog.info(' skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                     tmpNucleusSpec.state))
                        else:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed status check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # check status of transfer backlog
                    t1Weight = taskSpec.getT1Weight()
                    if t1Weight < 0:
                        tmpLog.info('skip transfer backlog check due to negative T1Weight')
                    else:
                        newNucleusList = {}
                        backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei()
                        for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                            if tmpNucleus in backlogged_nuclei:
                                tmpLog.info(' skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'.
                                            format(tmpNucleus))
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        nucleusList = newNucleusList
                        tmpLog.info('{0} candidates passed transfer backlog check'.format(len(nucleusList)))
                        if nucleusList == {}:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # check endpoint
                    fractionFreeSpace = {}
                    newNucleusList = {}
                    tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                 ['output','log'])
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        toSkip = False
                        for tmpDatasetSpec in tmpDatasetSpecList:
                            # ignore distributed datasets
                            if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                continue
                            # get endpoint with the pattern
                            tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                            if tmpEP == None:
                                tmpLog.info(' skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                  tmpDatasetSpec.storageToken))
                                toSkip = True
                                break
                            # check state
                            """
                            if not tmpEP['state'] in ['ACTIVE']:
                                tmpLog.info(' skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                      tmpEP['ddm_endpoint_name'],
                                                                                                                      tmpEP['state']))
                                toSkip = True
                                break
                            """
                            # check space
                            tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                            tmpSpaceToUse = 0
                            if tmpNucleus in self.fullRW:
                                # 0.25GB per cpuTime/corePower/day
                                tmpSpaceToUse = long(self.fullRW[tmpNucleus]/10/24/3600*0.25)
                            if tmpSpaceSize-tmpSpaceToUse < diskThreshold:
                                tmpLog.info(' skip nucleus={0} since disk shortage (free {1} - reserved {2} < thr {3}) at endpoint {4} criteria=-space'.format(tmpNucleus,
                                                                                                                                                              tmpSpaceSize,
                                                                                                                                                              tmpSpaceToUse,
                                                                                                                                                              diskThreshold,
                                                                                                                                                              tmpEP['ddm_endpoint_name']))
                                toSkip = True
                                break
                            # keep fraction of free space (track the worst endpoint per nucleus)
                            if not tmpNucleus in fractionFreeSpace:
                                fractionFreeSpace[tmpNucleus] = {'total':0,'free':0}
                            try:
                                tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                    float(fractionFreeSpace[tmpNucleus]['total'])
                            except:
                                tmpOld = None
                            try:
                                tmpNew = float(tmpSpaceSize-tmpSpaceToUse)/float(tmpEP['space_total'])
                            except:
                                tmpNew = None
                            if tmpNew != None and (tmpOld == None or tmpNew < tmpOld):
                                fractionFreeSpace[tmpNucleus] = {'total':tmpEP['space_total'],
                                                                 'free':tmpSpaceSize-tmpSpaceToUse}
                        if not toSkip:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed endpoint check {1} TB'.format(len(nucleusList),diskThreshold/1024))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # ability to execute jobs
                    newNucleusList = {}
                    # get all panda sites
                    tmpSiteList = []
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        tmpSiteList += tmpNucleusSpec.allPandaSites
                    tmpSiteList = list(set(tmpSiteList))
                    tmpLog.debug('===== start for job check')
                    # delegate to the job broker to see which sites could run jobs
                    jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                    tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                         tmpSiteList,tmpLog)
                    tmpLog.debug('===== done for job check')
                    if tmpSt != Interaction.SC_SUCCEEDED:
                        tmpLog.error('no sites can run jobs')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    okNuclei = set()
                    for tmpSite in tmpRet:
                        siteSpec = siteMapper.getSite(tmpSite)
                        okNuclei.add(siteSpec.pandasite)
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleus in okNuclei:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                        else:
                            tmpLog.info(' skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed job check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # data locality
                    toSkip = False
                    availableData = {}
                    for datasetSpec in inputChunk.getDatasets():
                        # only for real datasets
                        if datasetSpec.isPseudo():
                            continue
                        # ignore DBR
                        if DataServiceUtils.isDBR(datasetSpec.datasetName):
                            continue
                        # skip locality check
                        if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                            continue
                        # use deep scan for primary dataset
                        if datasetSpec.isMaster():
                            deepScan = True
                        else:
                            deepScan = False
                        # get nuclei where data is available
                        tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                          datasetSpec.datasetName,
                                                                          nucleusList.keys(),
                                                                          deepScan)
                        if tmpSt != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            toSkip = True
                            break
                        # sum availability over all input datasets, per nucleus
                        for tmpNucleus,tmpVals in tmpRet.iteritems():
                            if not tmpNucleus in availableData:
                                availableData[tmpNucleus] = tmpVals
                            else:
                                availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                    if toSkip:
                        continue
                    if availableData != {}:
                        newNucleusList = {}
                        # skip if no data
                        skipMsgList = []
                        for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                            if len(nucleusList) == 1:
                                tmpLog.info(' disable data locality check for nucleus={0} since no other candidate'.format(tmpNucleus))
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                            elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                    availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                tmpMsg = ' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                  availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                  availableData[tmpNucleus]['tot_size'],
                                                                                                                                  thrInputSizeFrac)
                                skipMsgList.append(tmpMsg)
                            elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                    availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                tmpMsg = ' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                    availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                    availableData[tmpNucleus]['tot_num'],
                                                                                                                                    thrInputNumFrac)
                                skipMsgList.append(tmpMsg)
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        # the skips only apply when at least one nucleus survives
                        if len(newNucleusList) > 0:
                            nucleusList = newNucleusList
                            for tmpMsg in skipMsgList:
                                tmpLog.info(tmpMsg)
                        else:
                            tmpLog.info(' disable data locality check since no nucleus has input data')
                        tmpLog.info('{0} candidates passed data check'.format(len(nucleusList)))
                        if nucleusList == {}:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # weight
                    self.prioRW.acquire()
                    nucleusRW = self.prioRW[taskSpec.currentPriority]
                    self.prioRW.release()
                    totalWeight = 0
                    nucleusweights = []
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if not tmpNucleus in nucleusRW:
                            nucleusRW[tmpNucleus] = 0
                        # wStr accumulates a human-readable breakdown of the weight
                        wStr = '1'
                        # with RW
                        if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                            weight = 1 / float(nucleusRW[tmpNucleus])
                            wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus])
                        else:
                            weight = 1
                            wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                        # with data
                        if availableData != {}:
                            if availableData[tmpNucleus]['tot_size'] > 0:
                                weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                weight /= float(availableData[tmpNucleus]['tot_size'])
                                wStr += '* ( available_input_size_DISKTAPE={0} )'.format(availableData[tmpNucleus]['ava_size_any'])
                                wStr += '/ ( total_input_size={0} )'.format(availableData[tmpNucleus]['tot_size'])
                            # negative weight for tape
                            if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                weight *= negWeightTape
                                wStr += '*( weight_TAPE={0} )'.format(negWeightTape)
                        # fraction of free space
                        if tmpNucleus in fractionFreeSpace:
                            try:
                                tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                    float(fractionFreeSpace[tmpNucleus]['total'])
                                weight *= tmpFrac
                                wStr += '*( free_space={0} )/( total_space={1} )'.format(fractionFreeSpace[tmpNucleus]['free'],
                                                                                        fractionFreeSpace[tmpNucleus]['total'])
                            except:
                                pass
                        tmpLog.info(' use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                        totalWeight += weight
                        nucleusweights.append((tmpNucleus,weight))
                    tmpLog.info('final {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # final selection: weighted random choice
                    tgtWeight = random.uniform(0,totalWeight)
                    candidateNucleus = None
                    for tmpNucleus,weight in nucleusweights:
                        tgtWeight -= weight
                        if tgtWeight <= 0:
                            candidateNucleus = tmpNucleus
                            break
                    if candidateNucleus == None:
                        # numerical fallback: pick the last candidate
                        candidateNucleus = nucleusweights[-1][0]
                ######################################
                # update
                nucleusSpec = nucleusList[candidateNucleus]
                # get output/log datasets
                tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                          ['output','log'])
                # get destinations
                retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                tmpLog.info(' set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                self.sendLogMessage(tmpLog)
                if tmpRet:
                    tmpMsg = 'set task.status=ready'
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg,self.msgType)
                # update RW table: charge this task's RW to the chosen nucleus
                # for all priority levels at or below the task's priority
                self.prioRW.acquire()
                for prio,rwMap in self.prioRW.iteritems():
                    if prio > taskSpec.currentPriority:
                        continue
                    if candidateNucleus in rwMap:
                        rwMap[candidateNucleus] += taskRW
                    else:
                        rwMap[candidateNucleus] = taskRW
                self.prioRW.release()
        except:
            errtype,errvalue = sys.exc_info()[:2]
            errMsg = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
            errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
            errMsg += traceback.format_exc()
            logger.error(errMsg)
def doActionForReassgin(self,gTmpLog):
    """Reassign tasks to a new cloud/nucleus and re-subscribe their output.

    For every task returned by getTasksToReassign_JEDI: update the cloud/nucleus
    assignment in the DB, register DDM subscriptions for the task's output/log
    datasets at the destination endpoint, and finally restore the task status so
    it resumes processing. Failures for one task are logged and the loop moves on
    to the next task; nothing is raised to the caller.

    NOTE(review): the method name misspells "Reassign"; it is kept as-is since
    callers reference it by this name.

    :param gTmpLog: global logger used for messages not tied to one task
    :return: None
    """
    # get DDM I/F for this VO
    ddmIF = self.ddmIF.getInterface(self.vo)
    # get site mapper
    siteMapper = self.taskBufferIF.getSiteMapper()
    # get tasks to get reassigned
    taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)
    gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
    for taskSpec in taskList:
        # per-task logger tagged with the jediTaskID
        tmpLog = MsgWrapper(logger,'<jediTaskID={0}'.format(taskSpec.jediTaskID))
        tmpLog.debug('start to reassign')
        # DDM backend
        ddmBackEnd = taskSpec.getDdmBackEnd()
        # get output/log datasets of the task
        tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,['output','log'])
        if tmpStat != True:
            tmpLog.error('failed to get datasets')
            continue
        # update DB
        if not taskSpec.useWorldCloud():
            # non-world cloud: record the new assignment in the cloudtasks table
            tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,taskSpec.cloud,'assigned',True)
            if tmpStat != 'SUCCEEDED':
                tmpLog.error('failed to update CloudTasks')
                continue
            # check cloud
            if not siteMapper.checkCloud(taskSpec.cloud):
                tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                continue
        else:
            # world cloud: if no nucleus was chosen yet, push the task back to
            # 'assigning' so task brokerage runs again, and skip this task
            if taskSpec.nucleus in [None,'']:
                taskSpec.status = 'assigning'
                taskSpec.oldStatus = None
                taskSpec.setToRegisterDatasets()
                self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                                  setOldModTime=True)
                tmpLog.debug('set task_status={0} to trigger task brokerage again'.format(taskSpec.status))
                continue
            # get nucleus
            nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
            if nucleusSpec == None:
                tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                continue
            # set nucleus on the task's datasets
            retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,datasetSpecList)}
            tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
        # get T1/nucleus site where the output should land
        if not taskSpec.useWorldCloud():
            t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
        else:
            t1SiteName = nucleusSpec.getOnePandaSite()
        t1Site = siteMapper.getSite(t1SiteName)
        # loop over all datasets
        isOK = True
        for datasetSpec in datasetSpecList:
            tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
            # distributed datasets don't need a single-destination subscription
            if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                continue
            # get destination endpoint for this storage token
            location = siteMapper.getDdmEndpoint(t1Site.sitename,datasetSpec.storageToken)
            # make subscription; any failure aborts the remaining datasets of
            # this task so the status is not restored below
            try:
                tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,
                                                                                       ddmBackEnd))
                tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location,
                                                            'Production Output',asynchronous=True)
                if tmpStat != True:
                    tmpLog.error("failed to make subscription")
                    isOK = False
                    break
            except:
                errtype,errvalue = sys.exc_info()[:2]
                tmpLog.warning('failed to make subscription with {0}:{1}'.format(errtype.__name__,errvalue))
                isOK = False
                break
        # all subscriptions succeeded
        if isOK:
            # activate task: resume the pre-reassignment status, falling back
            # to 'ready' when there is no meaningful status to restore
            if taskSpec.oldStatus in ['assigning','exhausted',None]:
                taskSpec.status = 'ready'
            else:
                taskSpec.status = taskSpec.oldStatus
            taskSpec.oldStatus = None
            self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                              setOldModTime=True)
        tmpLog.debug('finished to reassign')
def doRefine(self,jediTaskID,taskParamMap):
    """Refine the task parameter map and dataset specs for this task.

    Optionally injects Event Service preInclude/preExec fragments into the job
    parameters, runs the common doBasicRefine, then applies production-specific
    tweaks: DBR dataset attributes, consistency checks for child tasks, output
    filename attempt numbers, throttling, dataset registration, and marking the
    parent task's matching datasets as transient for merge tasks.

    :param jediTaskID: task ID (unused in this body; tmpLog/taskSpec come from self)
    :param taskParamMap: task parameter dictionary; mutated in place
    :return: self.SC_SUCCEEDED on success
    :raises: re-raises (Python 2 style) whatever the refinement steps raise,
             after logging the error
    """
    # make logger
    tmpLog = self.tmpLog
    tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
    try:
        # add Event Service parameters when requested in the task params
        if 'addEsParams' in taskParamMap and taskParamMap['addEsParams'] == True:
            preInclude = False
            preExec = False
            # splice ES-only fragments into existing preInclude/preExec values;
            # the <PANDA_ES_ONLY> markers are interpreted downstream and must
            # be kept verbatim
            for tmpItem in taskParamMap['jobParameters']:
                if 'value' in tmpItem:
                    if 'preInclude' in tmpItem['value']:
                        tmpStr = '<PANDA_ES_ONLY>,AthenaMP/AthenaMP_EventService.py</PANDA_ES_ONLY>'
                        tmpItem['value'] = self.insertString('preInclude',tmpStr,tmpItem['value'])
                        preInclude = True
                    if 'preExec' in tmpItem['value']:
                        tmpStr = '<PANDA_ES_ONLY>;'
                        tmpStr += 'import os;pilot_tmp=type(str(),(),{})();'
                        tmpStr += 'pilot_tmp.__dict__.update(**os.environ);'
                        tmpStr += 'from AthenaMP.AthenaMPFlags import jobproperties as jps;'
                        tmpStr += 'jps.AthenaMPFlags.EventRangeChannel=pilot_tmp.PILOT_EVENTRANGECHANNEL'
                        tmpStr += '</PANDA_ES_ONLY>'
                        tmpItem['value'] = self.insertString('preExec',tmpStr,tmpItem['value'])
                        preExec = True
            # add standalone preInclude/preExec constants if none were found above
            if not preInclude:
                tmpStr = '<PANDA_ES_ONLY>preInclude="AthenaMP/AthenaMP_EventService.py"</PANDA_ES_ONLY>'
                taskParamMap['jobParameters'].append({'type':'constant',
                                                      'value':tmpStr})
            if not preExec:
                tmpStr = '<PANDA_ES_ONLY>preExec="'
                tmpStr += 'import os;pilot_tmp=type(str(),(),{})();'
                tmpStr += 'pilot_tmp.__dict__.update(**os.environ);'
                tmpStr += 'from AthenaMP.AthenaMPFlags import jobproperties as jps;'
                tmpStr += 'jps.AthenaMPFlags.EventRangeChannel=pilot_tmp.PILOT_EVENTRANGECHANNEL'
                tmpStr += '"</PANDA_ES_ONLY>'
                taskParamMap['jobParameters'].append({'type':'constant',
                                                      'value':tmpStr})
        # basic refine (common to all refiners)
        self.doBasicRefine(taskParamMap)
        # set nosplit+repeat for DB Release datasets
        for datasetSpec in self.inSecDatasetSpecList:
            if DataServiceUtils.isDBR(datasetSpec.datasetName):
                datasetSpec.attributes = 'repeat,nosplit'
        # enable consistency check on master input when this task has a parent
        if not self.taskSpec.parent_tid in [None,self.taskSpec.jediTaskID]:
            for datasetSpec in self.inMasterDatasetSpec:
                if datasetSpec.isMaster() and datasetSpec.type == 'input':
                    datasetSpec.enableCheckConsistency()
        # append attempt number '.1' to output filename templates that don't
        # already end with one (pre-merged *.panda.um files are excluded)
        for tmpKey,tmpOutTemplateMapList in self.outputTemplateMap.iteritems():
            for tmpOutTemplateMap in tmpOutTemplateMapList:
                outFileTemplate = tmpOutTemplateMap['filenameTemplate']
                if re.search('\.\d+$',outFileTemplate) == None and not outFileTemplate.endswith('.panda.um'):
                    tmpOutTemplateMap['filenameTemplate'] = outFileTemplate + '.1'
        # extract input datatypes
        datasetTypeListIn = []
        for datasetSpec in self.inMasterDatasetSpec+self.inSecDatasetSpecList:
            datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
            if not datasetType in ['',None]:
                datasetTypeListIn.append(datasetType)
        # extract output datatypes
        datasetTypeList = []
        for datasetSpec in self.outDatasetSpecList:
            datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
            if not datasetType in ['',None]:
                datasetTypeList.append(datasetType)
        # set numThrottled to use the task throttling mechanism unless disabled
        if not 'noThrottle' in taskParamMap:
            self.taskSpec.numThrottled = 0
        # set to register datasets
        self.taskSpec.setToRegisterDatasets()
        # for merge tasks with a real parent: mark the parent's output/log
        # datasets of matching datatype as transient (they are superseded by
        # this task's merged output)
        if self.taskSpec.processingType in ['merge'] and not self.taskSpec.parent_tid in [None,self.taskSpec.jediTaskID]:
            # get parent task with its datasets
            tmpStat,parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(self.taskSpec.parent_tid,None,False)
            if tmpStat and parentTaskSpec != None:
                # set transient to parent datasets
                metaData = {'transient':True}
                for datasetSpec in parentTaskSpec.datasetSpecList:
                    if datasetSpec.type in ['log','output']:
                        datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
                        # only datatypes this task both consumes and produces
                        if not datasetType in ['',None] and datasetType in datasetTypeList and datasetType in datasetTypeListIn:
                            tmpLog.info('set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'.format(str(metaData),
                                                                                                                  self.taskSpec.parent_tid,
                                                                                                                  datasetSpec.datasetID,
                                                                                                                  datasetSpec.datasetName))
                            for metadataName,metadaValue in metaData.iteritems():
                                self.ddmIF.getInterface(self.taskSpec.vo).setDatasetMetadata(datasetSpec.datasetName,
                                                                                             metadataName,metadaValue)
    except:
        # log and re-raise (Python 2 raise form) so the caller sees the failure
        errtype,errvalue = sys.exc_info()[:2]
        tmpLog.error('doBasicRefine failed with {0}:{1}'.format(errtype.__name__,errvalue))
        raise errtype,errvalue
    tmpLog.debug('done')
    return self.SC_SUCCEEDED