def runImpl(self):
    """Assign a nucleus to every task handed out by ``self.inputList``.

    Batches of ``(taskSpec, inputChunk)`` pairs are fetched with
    ``self.inputList.get(1)`` until an empty batch is returned, at which
    point the method logs a termination message and returns ``None``.

    For each task:

    * if ``taskSpec.nucleus`` is already one of ``siteMapper.nuclei`` it is
      used as is;
    * otherwise the candidate nuclei are filtered by status, output/log
      endpoint availability and free space, input data locality and the
      ability to run jobs (via ``AtlasProdJobBroker.doBrokerage``), then one
      candidate is drawn at random with a weight built from the RW already
      assigned to the nucleus and the fraction of input available there.

    The chosen nucleus is stored through
    ``self.taskBufferIF.setCloudToTasks_JEDI`` and the task RW is added to
    ``self.prioRW`` for every priority not above the task's priority.

    Any exception raised while handling a batch is logged through the
    module-level ``logger`` together with the last task ID, and the loop
    moves on to the next batch.
    """
    # cutoff for disk in TB
    diskThreshold = 5 * 1024
    # dataset type to ignore file availability check
    datasetTypeToSkipCheck = ['log']
    # data-locality thresholds: a nucleus is dropped when the input is larger
    # than thrInputSize / thrInputNum and less than the given fraction of it
    # is available there
    thrInputSize = 1024*1024*1024
    thrInputNum = 100
    thrInputSizeFrac = 0.1
    thrInputNumFrac = 0.1
    # RW below this value does not reduce the weight of a nucleus
    cutOffRW = 50
    # weight factor applied when part of the input is not on disk
    negWeightTape = 0.001
    # main
    lastJediTaskID = None
    siteMapper = self.taskBufferIF.getSiteMapper()
    while True:
        try:
            taskInputList = self.inputList.get(1)
            # no more datasets
            if len(taskInputList) == 0:
                self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(
                    self.__class__.__name__, self.numTasks))
                return
            # loop over all tasks
            for taskSpec, inputChunk in taskInputList:
                lastJediTaskID = taskSpec.jediTaskID
                # make logger
                tmpLog = MsgWrapper(self.logger,
                                    '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                                    monToken='{0}'.format(taskSpec.jediTaskID))
                tmpLog.debug('start')
                # get nuclei
                nucleusList = siteMapper.nuclei
                if taskSpec.nucleus in nucleusList:
                    candidateNucleus = taskSpec.nucleus
                else:
                    tmpLog.debug('got {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # check status
                    newNucleusList = {}
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleusSpec.state not in ['ACTIVE']:
                            tmpLog.debug(' skip nucleus={0} due to status={1} criteria=-status'.format(
                                tmpNucleus, tmpNucleusSpec.state))
                        else:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed status check'.format(len(nucleusList)))
                    if not nucleusList:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # check endpoint
                    newNucleusList = {}
                    tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                        taskSpec.jediTaskID, ['output', 'log'])
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        toSkip = False
                        for tmpDatasetSpec in tmpDatasetSpecList:
                            # ignore distributed datasets
                            if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) is not None:
                                continue
                            # get endpoint with the pattern
                            tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                            if tmpEP is None:
                                tmpLog.debug(' skip nucleus={0} since no endpoint with {1} criteria=-match'.format(
                                    tmpNucleus, tmpDatasetSpec.storageToken))
                                toSkip = True
                                break
                            # the endpoint state is deliberately not checked here
                            # check space
                            tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                            if tmpSpaceSize < diskThreshold:
                                # report the endpoint name in the "at endpoint" slot
                                # NOTE(review): assumes endpoint dicts carry
                                # 'ddm_endpoint_name' -- confirm
                                tmpLog.debug(' skip nucleus={0} since disk shortage ({1}<{2}) at endpoint {3} criteria=-space'.format(
                                    tmpNucleus, tmpSpaceSize, diskThreshold, tmpEP['ddm_endpoint_name']))
                                toSkip = True
                                break
                        if not toSkip:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed endpoint check'.format(len(nucleusList)))
                    if not nucleusList:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # data locality
                    toSkip = False
                    availableData = {}
                    for datasetSpec in inputChunk.getDatasets():
                        # only for real datasets
                        if datasetSpec.isPseudo():
                            continue
                        # ignore DBR
                        if DataServiceUtils.isDBR(datasetSpec.datasetName):
                            continue
                        # skip locality check
                        if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                            continue
                        # get nuclei where data is available
                        tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper, self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys())
                        if tmpSt != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            toSkip = True
                            break
                        # sum the per-nucleus figures over all datasets
                        for tmpNucleus, tmpVals in tmpRet.iteritems():
                            if tmpNucleus not in availableData:
                                availableData[tmpNucleus] = tmpVals
                            else:
                                availableData[tmpNucleus] = dict(
                                    (k, v + tmpVals[k])
                                    for (k, v) in availableData[tmpNucleus].iteritems())
                    if toSkip:
                        continue
                    if availableData:
                        newNucleusList = {}
                        # skip if no data
                        for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                            tmpData = availableData[tmpNucleus]
                            if tmpData['tot_size'] > thrInputSize and \
                                    tmpData['ava_size_any'] < tmpData['tot_size'] * thrInputSizeFrac:
                                tmpLog.debug(' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(
                                    tmpNucleus, tmpData['ava_size_any'], tmpData['tot_size'], thrInputSizeFrac))
                            elif tmpData['tot_num'] > thrInputNum and \
                                    tmpData['ava_num_any'] < tmpData['tot_num'] * thrInputNumFrac:
                                tmpLog.debug(' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(
                                    tmpNucleus, tmpData['ava_num_any'], tmpData['tot_num'], thrInputNumFrac))
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        nucleusList = newNucleusList
                        tmpLog.debug('{0} candidates passed data check'.format(len(nucleusList)))
                        if not nucleusList:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # ability to execute jobs
                    newNucleusList = {}
                    # get all panda sites
                    tmpSiteList = []
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        tmpSiteList += tmpNucleusSpec.allPandaSites
                    tmpSiteList = list(set(tmpSiteList))
                    tmpLog.debug('===== start for job check')
                    jobBroker = AtlasProdJobBroker(self.ddmIF, self.taskBufferIF)
                    tmpSt, tmpRet = jobBroker.doBrokerage(taskSpec, taskSpec.cloud, inputChunk, None, True,
                                                          tmpSiteList, tmpLog)
                    tmpLog.debug('===== done for job check')
                    if tmpSt != Interaction.SC_SUCCEEDED:
                        tmpLog.debug('failed to get sites where jobs can run. Use any nuclei where input is available')
                        # use any nuclei where input is available if no sites can run jobs
                        tmpRet = tmpSiteList
                    okNuclei = set()
                    for tmpSite in tmpRet:
                        siteSpec = siteMapper.getSite(tmpSite)
                        okNuclei.add(siteSpec.pandasite)
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleus in okNuclei:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                        else:
                            tmpLog.debug(' skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed job check'.format(len(nucleusList)))
                    if not nucleusList:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # weight
                    # release the lock even if the priority is missing from the table
                    self.prioRW.acquire()
                    try:
                        nucleusRW = self.prioRW[taskSpec.currentPriority]
                    finally:
                        self.prioRW.release()
                    # NOTE(review): nucleusRW is the shared per-priority map and is
                    # filled with zero entries below without holding the lock
                    totalWeight = 0
                    nucleusweights = []
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleus not in nucleusRW:
                            nucleusRW[tmpNucleus] = 0
                        tmpRW = nucleusRW[tmpNucleus]
                        wStr = '1'
                        # with RW
                        if tmpRW >= cutOffRW:
                            weight = 1 / float(tmpRW)
                            wStr += '/({0}=RW)'.format(tmpRW)
                        else:
                            weight = 1
                            wStr += '/(1 : RW={0}<{1})'.format(tmpRW, cutOffRW)
                        # with data
                        if availableData:
                            tmpData = availableData[tmpNucleus]
                            weight *= float(tmpData['ava_size_any'])
                            weight /= float(tmpData['tot_size'])
                            wStr += '*({0}=available input size on DISK/TAPE)'.format(tmpData['ava_size_any'])
                            wStr += '/({0}=total input size)'.format(tmpData['tot_size'])
                            # negative weight for tape
                            if tmpData['ava_size_any'] > tmpData['ava_size_disk']:
                                weight *= negWeightTape
                                wStr += '*({0}=weight for TAPE)'.format(negWeightTape)
                        tmpLog.debug(' use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus, weight, wStr))
                        totalWeight += weight
                        nucleusweights.append((tmpNucleus, weight))
                    tmpLog.debug('final {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # final selection: weighted random draw
                    tgtWeight = random.uniform(0, totalWeight)
                    candidateNucleus = None
                    for tmpNucleus, weight in nucleusweights:
                        tgtWeight -= weight
                        if tgtWeight <= 0:
                            candidateNucleus = tmpNucleus
                            break
                    if candidateNucleus is None:
                        # fall back to the last candidate if the draw selected none
                        candidateNucleus = nucleusweights[-1][0]
                ######################################
                # RW
                # computed here so that it is defined for pre-assigned nuclei too;
                # otherwise the RW table update below would hit an undefined name
                # or reuse the value left over from the previous task
                taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                ######################################
                # update
                nucleusSpec = nucleusList[candidateNucleus]
                # get output/log datasets
                tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                    taskSpec.jediTaskID, ['output', 'log'])
                # get destinations
                retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec, tmpDatasetSpecs)}
                tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                tmpLog.info(' set nucleus={0} with {1} criteria=+set'.format(candidateNucleus, tmpRet))
                # update RW table for all priorities not above the task's priority
                self.prioRW.acquire()
                try:
                    for prio, rwMap in self.prioRW.iteritems():
                        if prio > taskSpec.currentPriority:
                            continue
                        if candidateNucleus in rwMap:
                            rwMap[candidateNucleus] += taskRW
                        else:
                            rwMap[candidateNucleus] = taskRW
                finally:
                    self.prioRW.release()
        except Exception:
            # top-level boundary of the worker loop: log and go on with the next batch
            errtype, errvalue = sys.exc_info()[:2]
            errMsg = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__, errtype.__name__, errvalue)
            errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
            errMsg += traceback.format_exc()
            logger.error(errMsg)
def findMissingFiles(self, jediTaskID, cloudName):
    """Flag input files of a task that have no replica in the given cloud.

    For every master input dataset of ``jediTaskID`` the files available at
    the cloud's source site and its T2 sites are collected, and the IDs of
    dataset files whose LFN is not among them are passed to
    ``self.taskBufferIF.setMissingFiles_JEDI``.

    Args:
        jediTaskID: ID of the task whose input datasets are checked.
        cloudName: cloud in which the input data must be available.

    Returns:
        ``self.SC_SUCCEEDED`` when all master datasets were processed,
        ``self.SC_FAILED`` as soon as any lookup or update fails or a
        dataset is not available in ``cloudName`` at all.
    """
    tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(jediTaskID))
    tmpLog.debug('start findMissingFiles')
    # return for failure
    retError = self.SC_FAILED
    # get datasets
    tmpSt, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
        jediTaskID, ['input'], True)
    if not tmpSt:
        tmpLog.error('failed to get the list of datasets')
        return retError
    # loop over all datasets
    for datasetSpec in datasetSpecList:
        # check only master dataset
        if not datasetSpec.isMaster():
            continue
        tmpLog.debug('checking {0}'.format(datasetSpec.datasetName))
        # get ddmIF
        ddmIF = self.ddmIF.getInterface(datasetSpec.vo)
        if ddmIF is None:
            tmpLog.error('failed to get DDM I/F for vo={0}'.format(
                datasetSpec.vo))
            return retError
        # get the list of sites where data is available
        tmpSt, tmpRet = AtlasBrokerUtils.getSitesWithData(
            self.siteMapper, ddmIF, datasetSpec.datasetName)
        if tmpSt != self.SC_SUCCEEDED:
            tmpLog.error(
                'failed to get the list of sites where {0} is available, since {1}'
                .format(datasetSpec.datasetName, tmpRet))
            return retError
        dataSiteMap = tmpRet
        # data is unavailable in cloud
        if cloudName not in dataSiteMap:
            tmpLog.error('{0} is unavailable in cloud={1} map={2}'.format(
                datasetSpec.datasetName, cloudName, str(dataSiteMap)))
            return retError
        # mapping between sites and storage endpoints
        checkedSites = [self.siteMapper.getCloud(cloudName)['source']
                        ] + dataSiteMap[cloudName]['t2']
        siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(
            checkedSites, self.siteMapper)
        # get available files per site/endpoint
        tmpAvFileMap = ddmIF.getAvailableFiles(datasetSpec,
                                               siteStorageEP,
                                               self.siteMapper,
                                               ngGroup=[1],
                                               checkLFC=True)
        if tmpAvFileMap is None:
            tmpLog.error(
                'failed to get available file list for {0}'.format(
                    datasetSpec.datasetName))
            return retError
        # collect the LFNs available at any site/storage type once, so that
        # each dataset file needs a single set lookup instead of a full rescan
        availableLFNs = set()
        for tmpSiteName, availableFilesMap in tmpAvFileMap.iteritems():
            for tmpStorageType, availableFiles in availableFilesMap.iteritems():
                for availableFile in availableFiles:
                    availableLFNs.add(availableFile.lfn)
        # check availability
        missingFiles = []
        for fileSpec in datasetSpec.Files:
            if fileSpec.lfn not in availableLFNs:
                # missing
                missingFiles.append(fileSpec.fileID)
                tmpLog.debug('{0} missing'.format(fileSpec.lfn))
        # update contents
        if missingFiles:
            tmpSt = self.taskBufferIF.setMissingFiles_JEDI(
                jediTaskID, datasetSpec.datasetID, missingFiles)
            if not tmpSt:
                tmpLog.error('failed to set missing files in {0}'.format(
                    datasetSpec.datasetName))
                return retError
    tmpLog.debug('done findMissingFiles')
    return self.SC_SUCCEEDED
def findMissingFiles(self,jediTaskID,cloudName):
    """Record which input files of a task are not available in a cloud.

    Only master input datasets are examined. For each one, the files
    reported by ``ddmIF.getAvailableFiles`` for the cloud's source site plus
    its T2 sites are compared by LFN with ``datasetSpec.Files``; the file
    IDs without a match are handed to
    ``self.taskBufferIF.setMissingFiles_JEDI``.

    Args:
        jediTaskID: ID of the task whose input datasets are checked.
        cloudName: cloud in which the input data must be available.

    Returns:
        ``self.SC_SUCCEEDED`` on completion, ``self.SC_FAILED`` on the first
        failed lookup/update or when a dataset has no entry for
        ``cloudName``.
    """
    tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(jediTaskID))
    tmpLog.debug('start findMissingFiles')
    # return for failure
    retError = self.SC_FAILED
    # get datasets
    tmpSt,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(jediTaskID,['input'],True)
    if not tmpSt:
        tmpLog.error('failed to get the list of datasets')
        return retError
    # loop over all datasets
    for datasetSpec in datasetSpecList:
        # check only master dataset
        if not datasetSpec.isMaster():
            continue
        tmpLog.debug('checking {0}'.format(datasetSpec.datasetName))
        # get ddmIF
        ddmIF = self.ddmIF.getInterface(datasetSpec.vo)
        if ddmIF is None:
            tmpLog.error('failed to get DDM I/F for vo={0}'.format(datasetSpec.vo))
            return retError
        # get the list of sites where data is available
        tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,ddmIF,
                                                         datasetSpec.datasetName)
        if tmpSt != self.SC_SUCCEEDED:
            tmpLog.error('failed to get the list of sites where {0} is available, since {1}'.format(datasetSpec.datasetName,
                                                                                                     tmpRet))
            return retError
        dataSiteMap = tmpRet
        # data is unavailable in cloud
        if cloudName not in dataSiteMap:
            tmpLog.error('{0} is unavailable in cloud={1} map={2}'.format(datasetSpec.datasetName,cloudName,str(dataSiteMap)))
            return retError
        # mapping between sites and storage endpoints
        checkedSites = [self.siteMapper.getCloud(cloudName)['source']]+dataSiteMap[cloudName]['t2']
        siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(checkedSites,self.siteMapper)
        # get available files per site/endpoint
        tmpAvFileMap = ddmIF.getAvailableFiles(datasetSpec,
                                               siteStorageEP,
                                               self.siteMapper,
                                               ngGroup=[1],
                                               checkLFC=True)
        if tmpAvFileMap is None:
            tmpLog.error('failed to get available file list for {0}'.format(datasetSpec.datasetName))
            return retError
        # gather every available LFN up front; membership tests are then a
        # set lookup rather than a scan over all sites for each file
        availableLFNs = set()
        for tmpSiteName,availableFilesMap in tmpAvFileMap.iteritems():
            for tmpStorageType,availableFiles in availableFilesMap.iteritems():
                for availableFile in availableFiles:
                    availableLFNs.add(availableFile.lfn)
        # check availability
        missingFiles = []
        for fileSpec in datasetSpec.Files:
            if fileSpec.lfn not in availableLFNs:
                # missing
                missingFiles.append(fileSpec.fileID)
                tmpLog.debug('{0} missing'.format(fileSpec.lfn))
        # update contents
        if missingFiles:
            tmpSt = self.taskBufferIF.setMissingFiles_JEDI(jediTaskID,datasetSpec.datasetID,missingFiles)
            if not tmpSt:
                tmpLog.error('failed to set missing files in {0}'.format(datasetSpec.datasetName))
                return retError
    tmpLog.debug('done findMissingFiles')
    return self.SC_SUCCEEDED
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap): # make logger tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID), monToken='<jediTaskID={0} {1}>'.format( taskSpec.jediTaskID, datetime.datetime.utcnow().isoformat('/'))) tmpLog.debug('start') # return for failure retFatal = self.SC_FATAL, inputChunk retTmpError = self.SC_FAILED, inputChunk # get primary site candidates sitePreAssigned = False excludeList = [] includeList = None scanSiteList = [] # get list of site access siteAccessList = self.taskBufferIF.listSiteAccess( None, taskSpec.userName) siteAccessMap = {} for tmpSiteName, tmpAccess in siteAccessList: siteAccessMap[tmpSiteName] = tmpAccess # site limitation if taskSpec.useLimitedSites(): if 'excludedSite' in taskParamMap: excludeList = taskParamMap['excludedSite'] # str to list for task retry try: if type(excludeList) != types.ListType: excludeList = excludeList.split(',') except: pass if 'includedSite' in taskParamMap: includeList = taskParamMap['includedSite'] # str to list for task retry if includeList == '': includeList = None try: if type(includeList) != types.ListType: includeList = includeList.split(',') except: pass # loop over all sites for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems(): if tmpSiteSpec.type == 'analysis': scanSiteList.append(siteName) # preassigned if not taskSpec.site in ['', None]: # site is pre-assigned tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site)) sitePreAssigned = True if not taskSpec.site in scanSiteList: scanSiteList.append(taskSpec.site) tmpLog.debug('initial {0} candidates'.format(len(scanSiteList))) # allowed remote access protocol allowedRemoteProtocol = 'fax' # MP if taskSpec.coreCount != None and taskSpec.coreCount > 1: # use MCORE only useMP = 'only' elif taskSpec.coreCount == 0: # use MCORE and normal useMP = 'any' else: # not use MCORE useMP = 'unuse' ###################################### # selection for status newScanSiteList = [] for 
tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check site status skipFlag = False if tmpSiteSpec.status in ['offline']: skipFlag = True elif tmpSiteSpec.status in ['brokeroff', 'test']: if not sitePreAssigned: skipFlag = True elif tmpSiteName != taskSpec.site: skipFlag = True if not skipFlag: newScanSiteList.append(tmpSiteName) else: tmpLog.debug( ' skip site=%s due to status=%s criteria=-status' % (tmpSiteName, tmpSiteSpec.status)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed site status check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for MP if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \ (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]): newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \ (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for useMP={1}'.format( len(scanSiteList), useMP)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for release if taskSpec.transHome != None: if taskSpec.transHome.startswith('ROOT'): # hack until x86_64-slc6-gcc47-opt is published in installedsw if taskSpec.architecture == 'x86_64-slc6-gcc47-opt': tmpCmtConfig = 'x86_64-slc6-gcc46-opt' else: tmpCmtConfig = taskSpec.architecture siteListWithSW = self.taskBufferIF.checkSitesWithRelease( 
scanSiteList, cmtConfig=tmpCmtConfig, onlyCmtConfig=True) elif 'AthAnalysis' in taskSpec.transHome or re.search( 'Ath[a-zA-Z]+Base', taskSpec.transHome) != None: # AthAnalysis siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, cmtConfig=taskSpec.architecture, onlyCmtConfig=True) else: # remove AnalysisTransforms- transHome = re.sub('^[^-]+-*', '', taskSpec.transHome) transHome = re.sub('_', '-', transHome) if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \ re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None : # cache is checked siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, caches=transHome, cmtConfig=taskSpec.architecture) elif transHome == '' and taskSpec.transUses != None: # remove Atlas- transUses = taskSpec.transUses.split('-')[-1] # release is checked siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, releases=transUses, cmtConfig=taskSpec.architecture) else: # nightlies siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, releases='CVMFS') newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # release check is disabled or release is available if tmpSiteSpec.releases == ['ANY']: newScanSiteList.append(tmpSiteName) elif tmpSiteName in siteListWithSW: newScanSiteList.append(tmpSiteName) else: # release is unavailable tmpLog.debug(' skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \ (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format( len(scanSiteList), taskSpec.transUses, taskSpec.transHome, taskSpec.architecture)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError 
###################################### # selection for memory minRamCount = inputChunk.getMaxRamCount() minRamCount = JediCoreUtils.compensateRamCount(minRamCount) if not minRamCount in [0, None]: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # site max memory requirement if not tmpSiteSpec.maxrss in [0, None]: site_maxmemory = tmpSiteSpec.maxrss else: site_maxmemory = tmpSiteSpec.maxmemory if not site_maxmemory in [ 0, None ] and minRamCount != 0 and minRamCount > site_maxmemory: tmpLog.debug( ' skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory' .format(tmpSiteName, site_maxmemory, minRamCount)) continue # site min memory requirement if not tmpSiteSpec.minrss in [0, None]: site_minmemory = tmpSiteSpec.minrss else: site_minmemory = tmpSiteSpec.minmemory if not site_minmemory in [ 0, None ] and minRamCount != 0 and minRamCount < site_minmemory: tmpLog.debug( ' skip site={0} due to job RAM shortage. 
site_minmemory={1} > job_minramcount={2} criteria=-highmemory' .format(tmpSiteName, site_minmemory, minRamCount)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format( len(scanSiteList), minRamCount, taskSpec.ramUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for scratch disk tmpMaxAtomSize = inputChunk.getMaxAtomSize() tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True) tmpOutDiskSize = taskSpec.getOutDiskSize() tmpWorkDiskSize = taskSpec.getWorkDiskSize() minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize minDiskCountS = minDiskCountS / 1024 / 1024 # size for direct IO sites if taskSpec.useLocalIO(): minDiskCountR = minDiskCountS else: minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize minDiskCountR = minDiskCountR / 1024 / 1024 tmpLog.debug( 'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}' .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize, tmpWorkDiskSize)) tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format( minDiskCountS, minDiskCountR)) newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxwdir != 0: if tmpSiteSpec.isDirectIO(): minDiskCount = minDiskCountR else: minDiskCount = minDiskCountS if minDiskCount > tmpSiteSpec.maxwdir: tmpLog.debug( ' skip site={0} due to small scratch disk={1} < {2} criteria=-disk' .format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed scratch disk check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') 
taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for available space in SE newScanSiteList = [] for tmpSiteName in scanSiteList: # check endpoint tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint( tmpSiteSpec.ddm) if tmpEndPoint is not None: # free space must be >= 200GB diskThreshold = 200 tmpSpaceSize = 0 if tmpEndPoint['space_expired'] is not None: tmpSpaceSize += tmpEndPoint['space_expired'] if tmpEndPoint['space_free'] is not None: tmpSpaceSize += tmpEndPoint['space_free'] if tmpSpaceSize < diskThreshold: tmpLog.debug( ' skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk' .format(tmpSiteName, tmpSpaceSize, diskThreshold)) continue # check if blacklisted if tmpEndPoint['blacklisted'] == 'Y': tmpLog.debug( ' skip site={0} since {1} is blacklisted in DDM criteria=-blacklist' .format(tmpSiteName, tmpSiteSpec.ddm)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed SE space check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for walltime minWalltime = taskSpec.walltime if not minWalltime in [0, None] and minWalltime > 0: minWalltime *= tmpEffAtomSize newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime: tmpLog.debug( ' skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime' .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime)) continue if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime: tmpLog.debug( ' skip 
site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime' .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format( len(scanSiteList), minWalltime, taskSpec.walltimeUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for nPilot nWNmap = self.taskBufferIF.getCurrentSiteData() newScanSiteList = [] for tmpSiteName in scanSiteList: # check at the site nPilot = 0 if nWNmap.has_key(tmpSiteName): nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][ 'updateJob'] if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']: tmpLog.debug( ' skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName) if not self.testMode: continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed pilot activity check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # check inclusion and exclusion newScanSiteList = [] sitesForANY = [] for tmpSiteName in scanSiteList: autoSite = False # check exclusion if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList): tmpLog.debug( ' skip site={0} excluded criteria=-excluded'.format( tmpSiteName)) continue # check inclusion if includeList != None and not AtlasBrokerUtils.isMatched( tmpSiteName, includeList): if 'AUTO' in includeList: autoSite = True else: tmpLog.debug( ' skip site={0} not included criteria=-notincluded'. 
format(tmpSiteName)) continue tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # limited access if tmpSiteSpec.accesscontrol == 'grouplist': if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \ siteAccessMap[tmpSiteSpec.sitename] != 'approved': tmpLog.debug( ' skip site={0} limited access criteria=-limitedaccess' .format(tmpSiteName)) continue # check cloud if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]: tmpLog.debug( ' skip site={0} cloud mismatch criteria=-cloudmismatch'. format(tmpSiteName)) continue if autoSite: sitesForANY.append(tmpSiteName) else: newScanSiteList.append(tmpSiteName) # use AUTO sites if no sites are included if newScanSiteList == []: newScanSiteList = sitesForANY else: for tmpSiteName in sitesForANY: tmpLog.debug( ' skip site={0} not included criteria=-notincluded'. format(tmpSiteName)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for data availability hasDDS = False dataWeight = {} remoteSourceList = {} if inputChunk.getDatasets() != []: oldScanSiteList = copy.copy(scanSiteList) for datasetSpec in inputChunk.getDatasets(): datasetName = datasetSpec.datasetName if not self.dataSiteMap.has_key(datasetName): # get the list of sites where data is available tmpLog.debug( 'getting the list of sites where {0} is available'. 
format(datasetName)) tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData( scanSiteList, self.siteMapper, self.ddmIF, datasetName) if tmpSt in [ Interaction.JEDITemporaryError, Interaction.JEDITimeoutError ]: tmpLog.error( 'temporary failed to get the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError if tmpSt == Interaction.JEDIFatalError: tmpLog.error( 'fatal error when getting the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal # append self.dataSiteMap[datasetName] = tmpRet if datasetName.startswith('ddo'): tmpLog.debug(' {0} sites'.format(len(tmpRet))) else: tmpLog.debug(' {0} sites : {1}'.format( len(tmpRet), str(tmpRet))) # check if distributed if tmpRet != {}: isDistributed = True for tmpMap in tmpRet.values(): for tmpVal in tmpMap.values(): if tmpVal['state'] == 'complete': isDistributed = False break if not isDistributed: break if isDistributed: # check if really distributed isDistributed = self.ddmIF.isDistributedDataset( datasetName) if isDistributed: hasDDS = True datasetSpec.setDistributed() tmpLog.debug(' {0} is distributed'.format( datasetName)) # check if the data is available at somewhere if self.dataSiteMap[datasetName] == {}: tmpLog.error( '{0} is unavailable at any site'.format(datasetName)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal # get the list of sites where data is available scanSiteList = None scanSiteListOnDisk = None normFactor = 0 for datasetName, tmpDataSite in self.dataSiteMap.iteritems(): normFactor += 1 # get sites where replica is available tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk( tmpDataSite, includeTape=True) tmpDiskSiteList = 
AtlasBrokerUtils.getAnalSitesWithDataDisk( tmpDataSite, includeTape=False) # get sites which can remotely access source sites if inputChunk.isMerging: # disable remote access for merging tmpSatelliteSites = {} elif (not sitePreAssigned) or ( sitePreAssigned and not taskSpec.site in tmpSiteList): tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites( tmpDiskSiteList, self.taskBufferIF, self.siteMapper, nSites=50, protocol=allowedRemoteProtocol) else: tmpSatelliteSites = {} # make weight map for local for tmpSiteName in tmpSiteList: if not dataWeight.has_key(tmpSiteName): dataWeight[tmpSiteName] = 0 # give more weight to disk if tmpSiteName in tmpDiskSiteList: dataWeight[tmpSiteName] += 1 else: dataWeight[tmpSiteName] += 0.001 # make weight map for remote for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems( ): # skip since local data is available if tmpSiteName in tmpSiteList: continue tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # negative weight for remote access wRemote = 50.0 if not tmpSiteSpec.wansinklimit in [0, None]: wRemote /= float(tmpSiteSpec.wansinklimit) # sum weight if not dataWeight.has_key(tmpSiteName): dataWeight[tmpSiteName] = float( tmpWeightSrcMap['weight']) / wRemote else: dataWeight[tmpSiteName] += float( tmpWeightSrcMap['weight']) / wRemote # make remote source list if not remoteSourceList.has_key(tmpSiteName): remoteSourceList[tmpSiteName] = {} remoteSourceList[tmpSiteName][ datasetName] = tmpWeightSrcMap['source'] # first list if scanSiteList == None: scanSiteList = [] for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys(): if not tmpSiteName in oldScanSiteList: continue if not tmpSiteName in scanSiteList: scanSiteList.append(tmpSiteName) scanSiteListOnDisk = set() for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys( ): if not tmpSiteName in oldScanSiteList: continue scanSiteListOnDisk.add(tmpSiteName) continue # pickup sites which have all data newScanList = [] for tmpSiteName in tmpSiteList + 
tmpSatelliteSites.keys(): if tmpSiteName in scanSiteList and not tmpSiteName in newScanList: newScanList.append(tmpSiteName) scanSiteList = newScanList tmpLog.debug('{0} is available at {1} sites'.format( datasetName, len(scanSiteList))) # pickup sites which have all data on DISK newScanListOnDisk = set() for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys(): if tmpSiteName in scanSiteListOnDisk: newScanListOnDisk.add(tmpSiteName) scanSiteListOnDisk = newScanListOnDisk tmpLog.debug('{0} is available at {1} sites on DISK'.format( datasetName, len(scanSiteListOnDisk))) # check for preassigned if sitePreAssigned and not taskSpec.site in scanSiteList: scanSiteList = [] tmpLog.debug( 'data is unavailable locally or remotely at preassigned site {0}' .format(taskSpec.site)) elif len(scanSiteListOnDisk) > 0: # use only disk sites scanSiteList = list(scanSiteListOnDisk) tmpLog.debug('{0} candidates have input data'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal ###################################### # sites already used by task tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI( taskSpec.jediTaskID) if not tmpSt: tmpLog.error('failed to get sites which already used by task') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # calculate weight fqans = taskSpec.makeFQANs() """ tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans, taskSpec.workingGroup,True) currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight) currentPriority -= 500 tmpLog.debug('currentPriority={0}'.format(currentPriority)) """ tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI( taskSpec.vo, 
taskSpec.prodSourceLabel) if not tmpSt: tmpLog.error('failed to get job statistics with priority') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # check for preassigned if sitePreAssigned and not taskSpec.site in scanSiteList: tmpLog.debug("preassigned site {0} did not pass all tests".format( taskSpec.site)) tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal ###################################### # final procedure tmpLog.debug('final {0} candidates'.format(len(scanSiteList))) weightMap = {} candidateSpecList = [] timeWindowForFC = 6 preSiteCandidateSpec = None failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI( taskSpec.jediTaskID, timeWindowForFC) problematicSites = set() for tmpSiteName in scanSiteList: # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'running', None, None) nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'defined', None, None) nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \ AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None) nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'starting', None, None) nFailed = 0 nClosed = 0 nFinished = 0 if tmpSiteName in failureCounts: if 'failed' in failureCounts[tmpSiteName]: nFailed = failureCounts[tmpSiteName]['failed'] if 'closed' in failureCounts[tmpSiteName]: nClosed = failureCounts[tmpSiteName]['closed'] if 'finished' in failureCounts[tmpSiteName]: nFinished = failureCounts[tmpSiteName]['finished'] # problematic sites if nFailed + nClosed > 2 * nFinished: problematicSites.add(tmpSiteName) # calculate weight weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + 1) nThrottled = 
0 if remoteSourceList.has_key(tmpSiteName): nThrottled = AtlasBrokerUtils.getNumJobs( jobStatPrioMap, tmpSiteName, 'throttled', None, None) weight /= float(nThrottled + 1) # noramize weights by taking data availability into account tmpDataWeight = 1 if dataWeight.has_key(tmpSiteName): weight = weight * dataWeight[tmpSiteName] tmpDataWeight = dataWeight[tmpSiteName] # make candidate siteCandidateSpec = SiteCandidate(tmpSiteName) # preassigned if sitePreAssigned and tmpSiteName == taskSpec.site: preSiteCandidateSpec = siteCandidateSpec # set weight siteCandidateSpec.weight = weight tmpStr = ' site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format( tmpSiteName, nRunning, nAssigned, nActivated, nStarting) tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format( nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight) tmpLog.debug(tmpStr) # append if tmpSiteName in sitesUsedByTask: candidateSpecList.append(siteCandidateSpec) else: if not weightMap.has_key(weight): weightMap[weight] = [] weightMap[weight].append(siteCandidateSpec) # sort candidates by weights weightList = weightMap.keys() weightList.sort() weightList.reverse() for weightVal in weightList: sitesWithWeight = weightMap[weightVal] random.shuffle(sitesWithWeight) candidateSpecList += sitesWithWeight # limit the number of sites. 
use all sites for distributed datasets if not hasDDS: maxNumSites = 10 # remove problematic sites candidateSpecList = AtlasBrokerUtils.skipProblematicSites( candidateSpecList, problematicSites, sitesUsedByTask, preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog) # append preassigned if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList: candidateSpecList.append(preSiteCandidateSpec) # collect site names scanSiteList = [] for siteCandidateSpec in candidateSpecList: scanSiteList.append(siteCandidateSpec.siteName) # get list of available files availableFileMap = {} for datasetSpec in inputChunk.getDatasets(): try: # get list of site to be scanned fileScanSiteList = [] for tmpSiteName in scanSiteList: fileScanSiteList.append(tmpSiteName) if remoteSourceList.has_key( tmpSiteName ) and remoteSourceList[tmpSiteName].has_key( datasetSpec.datasetName): for tmpRemoteSite in remoteSourceList[tmpSiteName][ datasetSpec.datasetName]: if not tmpRemoteSite in fileScanSiteList: fileScanSiteList.append(tmpRemoteSite) # mapping between sites and storage endpoints siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap( fileScanSiteList, self.siteMapper) # disable file lookup for merge jobs if inputChunk.isMerging: checkCompleteness = False else: checkCompleteness = True # get available files per site/endpoint tmpAvFileMap = self.ddmIF.getAvailableFiles( datasetSpec, siteStorageEP, self.siteMapper, ngGroup=[2], checkCompleteness=checkCompleteness) if tmpAvFileMap == None: raise Interaction.JEDITemporaryError, 'ddmIF.getAvailableFiles failed' availableFileMap[datasetSpec.datasetName] = tmpAvFileMap except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('failed to get available files with %s %s' % (errtype.__name__, errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # append candidates newScanSiteList = [] for siteCandidateSpec in 
candidateSpecList: tmpSiteName = siteCandidateSpec.siteName # preassigned if sitePreAssigned and tmpSiteName != taskSpec.site: tmpLog.debug( ' skip site={0} non pre-assigned site criteria=-nonpreassigned' .format(tmpSiteName)) continue # set available files if inputChunk.getDatasets() == []: isAvailable = True else: isAvailable = False for tmpDatasetName, availableFiles in availableFileMap.iteritems(): tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName) # check remote files if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[ tmpSiteName].has_key(tmpDatasetName): for tmpRemoteSite in remoteSourceList[tmpSiteName][ tmpDatasetName]: if availableFiles.has_key(tmpRemoteSite) and \ len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']): # use only remote disk files siteCandidateSpec.remoteFiles += availableFiles[ tmpRemoteSite]['localdisk'] # set remote site and access protocol siteCandidateSpec.remoteProtocol = allowedRemoteProtocol siteCandidateSpec.remoteSource = tmpRemoteSite isAvailable = True break # local files if availableFiles.has_key(tmpSiteName): if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \ len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \ len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \ (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0): siteCandidateSpec.localDiskFiles += availableFiles[ tmpSiteName]['localdisk'] # add cached files to local list since cached files go to pending when reassigned siteCandidateSpec.localDiskFiles += availableFiles[ tmpSiteName]['cache'] siteCandidateSpec.localTapeFiles += availableFiles[ tmpSiteName]['localtape'] siteCandidateSpec.cacheFiles += availableFiles[ tmpSiteName]['cache'] siteCandidateSpec.remoteFiles += availableFiles[ tmpSiteName]['remote'] siteCandidateSpec.addAvailableFiles( availableFiles[tmpSiteName]['all']) isAvailable = True else: tmpMsg = '{0} 
is incompete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}' tmpLog.debug( tmpMsg.format( tmpDatasetName, tmpSiteName, len(tmpDatasetSpec.Files), len(availableFiles[tmpSiteName]['localdisk']), len(availableFiles[tmpSiteName]['cache']), len(availableFiles[tmpSiteName]['localtape']), )) if not isAvailable: break # append if not isAvailable: tmpLog.debug( ' skip site={0} file unavailable criteria=-fileunavailable' .format(siteCandidateSpec.siteName)) continue inputChunk.addSiteCandidate(siteCandidateSpec) newScanSiteList.append(siteCandidateSpec.siteName) tmpLog.debug( ' use site={0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5} criteria=+use' .format( siteCandidateSpec.siteName, siteCandidateSpec.weight, len(siteCandidateSpec.localDiskFiles), len(siteCandidateSpec.localTapeFiles), len(siteCandidateSpec.cacheFiles), len(siteCandidateSpec.remoteFiles), )) scanSiteList = newScanSiteList if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # send info to logger self.sendLogMessage(tmpLog) # return tmpLog.debug('done') return self.SC_SUCCEEDED, inputChunk
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
    """Select candidate analysis sites for a task and attach them to inputChunk.

    The candidate list starts from every site of type 'analysis' (plus the
    pre-assigned site, if any) and is narrowed step by step: data
    availability, site status, software release, memory, scratch disk,
    storage space, walltime, pilot activity (log only) and
    inclusion/exclusion/access/cloud rules.  Surviving sites are weighted
    by job statistics and data locality, truncated to a maximum number of
    sites, and finally checked for file availability before being added to
    inputChunk via inputChunk.addSiteCandidate().

    Parameters:
        taskSpec     -- task specification (site, transHome, ramCount, ...).
        cloudName    -- accepted for interface compatibility; not used here.
        inputChunk   -- input chunk that supplies the datasets and receives
                        the selected site candidates.
        taskParamMap -- task parameter dictionary; 'excludedSite' and
                        'includedSite' are read when taskSpec.useLimitedSites()
                        is true.

    Returns:
        A tuple (status, inputChunk) where status is self.SC_SUCCEEDED,
        self.SC_FAILED (temporary error) or self.SC_FATAL.  On failure the
        uploaded log is recorded with taskSpec.setErrDiag().

    Side effects:
        Caches dataset replica lookups in self.dataSiteMap.
    """
    # make logger
    tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    tmpLog.debug('start')
    # return for failure
    retFatal = self.SC_FATAL, inputChunk
    retTmpError = self.SC_FAILED, inputChunk
    # get primary site candidates
    sitePreAssigned = False
    excludeList = []
    includeList = None
    scanSiteList = []
    # get list of site access
    siteAccessList = self.taskBufferIF.listSiteAccess(None, taskSpec.userName)
    siteAccessMap = {}
    for tmpSiteName, tmpAccess in siteAccessList:
        siteAccessMap[tmpSiteName] = tmpAccess
    # site limitation
    if taskSpec.useLimitedSites():
        if 'excludedSite' in taskParamMap:
            excludeList = taskParamMap['excludedSite']
        if 'includedSite' in taskParamMap:
            includeList = taskParamMap['includedSite']
    # loop over all sites
    for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
        if tmpSiteSpec.type == 'analysis':
            scanSiteList.append(siteName)
    # preassigned
    if not taskSpec.site in ['', None]:
        # site is pre-assigned
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
        sitePreAssigned = True
        if not taskSpec.site in scanSiteList:
            scanSiteList.append(taskSpec.site)
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
    # allowed remote access protocol
    allowedRemoteProtocol = 'fax'
    ######################################
    # selection for data availability
    dataWeight = {}
    remoteSourceList = {}
    if inputChunk.getDatasets() != []:
        for datasetSpec in inputChunk.getDatasets():
            datasetName = datasetSpec.datasetName
            if not self.dataSiteMap.has_key(datasetName):
                # get the list of sites where data is available
                tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName))
                tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData(scanSiteList,
                                                                      self.siteMapper,
                                                                      self.ddmIF, datasetName)
                if tmpSt == self.SC_FAILED:
                    tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                    taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                    return retTmpError
                if tmpSt == self.SC_FATAL:
                    tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                    taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                    return retFatal
                # append
                self.dataSiteMap[datasetName] = tmpRet
                if datasetName.startswith('ddo'):
                    tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                else:
                    tmpLog.debug(' {0} sites : {1}'.format(len(tmpRet), str(tmpRet)))
            # check if the data is available at somewhere
            if self.dataSiteMap[datasetName] == {}:
                tmpLog.error('{0} is unavaiable at any site'.format(datasetName))
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                return retFatal
            # check if the data is available on disk
            if AtlasBrokerUtils.getAnalSitesWithDataDisk(self.dataSiteMap[datasetName]) == []:
                tmpLog.error('{0} is avaiable only on tape'.format(datasetName))
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                return retFatal
        # get the list of sites where data is available
        # (from here on the candidate list is rebuilt from replica locations)
        scanSiteList = None
        for datasetName, tmpDataSite in self.dataSiteMap.iteritems():
            # get sites where disk replica is available
            tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite)
            # get sites which can remotely access source sites
            if (not sitePreAssigned) or (sitePreAssigned and not taskSpec.site in tmpSiteList):
                tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(tmpSiteList, self.taskBufferIF,
                                                                       self.siteMapper, nSites=50,
                                                                       protocol=allowedRemoteProtocol)
            else:
                tmpSatelliteSites = {}
            # make weight map for local
            for tmpSiteName in tmpSiteList:
                if tmpSiteName not in dataWeight:
                    dataWeight[tmpSiteName] = 1
                else:
                    dataWeight[tmpSiteName] += 1
            # make weight map for remote
            for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                # skip since local data is available
                if tmpSiteName in tmpSiteList:
                    continue
                # sum weight
                if tmpSiteName not in dataWeight:
                    dataWeight[tmpSiteName] = tmpWeightSrcMap['weight']
                else:
                    dataWeight[tmpSiteName] += tmpWeightSrcMap['weight']
                # make remote source list
                remoteSourceList.setdefault(tmpSiteName, {})[datasetName] = tmpWeightSrcMap['source']
            # first list
            if scanSiteList is None:
                scanSiteList = []
                for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                    if not tmpSiteName in scanSiteList:
                        scanSiteList.append(tmpSiteName)
                continue
            # pickup sites which have all data
            newScanList = []
            for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                    newScanList.append(tmpSiteName)
            scanSiteList = newScanList
            tmpLog.debug('{0} is available at {1} sites'.format(datasetName, len(scanSiteList)))
        tmpLog.debug('{0} candidates have input data'.format(len(scanSiteList)))
        # check for preassigned
        if sitePreAssigned and not taskSpec.site in scanSiteList:
            scanSiteList = []
            tmpLog.debug('data is unavailable locally or remotely at preassigned site {0}'.format(taskSpec.site))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retFatal
    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check site status
        skipFlag = False
        if tmpSiteSpec.status in ['offline']:
            skipFlag = True
        elif tmpSiteSpec.status in ['brokeroff', 'test']:
            # brokeroff/test sites are usable only when explicitly pre-assigned
            if not sitePreAssigned:
                skipFlag = True
            elif tmpSiteName != taskSpec.site:
                skipFlag = True
        if not skipFlag:
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug(' skip %s due to status=%s' % (tmpSiteName, tmpSiteSpec.status))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for release
    if not taskSpec.transHome in [None, 'AnalysisTransforms']:
        if taskSpec.transHome.startswith('ROOT'):
            # hack until x86_64-slc6-gcc47-opt is published in installedsw
            if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
            else:
                tmpCmtConfig = taskSpec.architecture
            # go through the task buffer interface like every other lookup
            # in this method (a bare 'taskBuffer' name is not defined here)
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                     cmtConfig=tmpCmtConfig,
                                                                     onlyCmtConfig=True)
        else:
            # remove AnalysisTransforms-
            transHome = re.sub('^[^-]+-*', '', taskSpec.transHome)
            transHome = re.sub('_', '-', transHome)
            # NOTE(review): the 'release is checked' branch is only reached
            # when the nightly pattern matches AND transHome is empty after
            # stripping -- confirm that this ordering is intended.
            if re.search(r'rel_\d+(\n|$)', taskSpec.transHome) is None:
                # cache is checked
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         caches=transHome,
                                                                         cmtConfig=taskSpec.architecture)
            elif transHome == '':
                # release is checked
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         releases=taskSpec.transUses,
                                                                         cmtConfig=taskSpec.architecture)
            else:
                # nightlies
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         releases='CVMFS')
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # release check is disabled or release is available
            if tmpSiteSpec.releases == ['ANY'] or \
                    tmpSiteSpec.cloud in ['ND']:
                newScanSiteList.append(tmpSiteName)
            elif tmpSiteName in siteListWithSW:
                newScanSiteList.append(tmpSiteName)
            else:
                # release is unavailable
                tmpLog.debug(' skip %s due to missing rel/cache %s:%s' %
                             (tmpSiteName, taskSpec.transHome, taskSpec.architecture))
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for SW {1}:{2}'.format(len(scanSiteList),
                                                                    taskSpec.transHome,
                                                                    taskSpec.architecture))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retTmpError
    ######################################
    # selection for memory
    minRamCount = taskSpec.ramCount
    if not minRamCount in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site (a limit of 0 means "no limit")
            if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                tmpLog.debug(' skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.maxmemory,
                                                                                                      minRamCount))
                continue
            if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                tmpLog.debug(' skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                     tmpSiteSpec.minmemory,
                                                                                                     minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount, taskSpec.ramUnit))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retTmpError
    ######################################
    # selection for scratch disk
    minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
    minDiskCountS = minDiskCountS / 1024 / 1024
    # size for direct IO sites
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
        minDiskCountR = minDiskCountR / 1024 / 1024
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site
        if tmpSiteSpec.maxwdir != 0:
            if tmpSiteSpec.isDirectIO():
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug(' skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                     tmpSiteSpec.maxwdir,
                                                                                     minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for available space in SE
    # free space must be >= 200GB
    diskThreshold = 200
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        tmpSpaceSize = tmpSiteSpec.space
        if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
            tmpLog.debug(' skip {0} due to disk shortage in SE = {1} < {2}GB'.format(tmpSiteName, tmpSiteSpec.space,
                                                                                      diskThreshold))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if not minWalltime in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(' skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.maxtime,
                                                                                                        minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(' skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.mintime,
                                                                                                       minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retTmpError
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        nPilot = 0
        if nWNmap.has_key(tmpSiteName):
            nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
            # only logged: the actual skip is disabled, so the site is kept
            tmpLog.debug(' skip %s due to no pilot' % tmpSiteName)
            #continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # check inclusion and exclusion
    newScanSiteList = []
    sitesForANY = []
    for tmpSiteName in scanSiteList:
        autoSite = False
        # check exclusion
        if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList):
            tmpLog.debug(' skip {0} excluded'.format(tmpSiteName))
            continue
        # check inclusion
        if includeList is not None and not AtlasBrokerUtils.isMatched(tmpSiteName, includeList):
            if 'AUTO' in includeList:
                # kept as a fallback in case no explicitly included site survives
                autoSite = True
            else:
                tmpLog.debug(' skip {0} not included'.format(tmpSiteName))
                continue
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # limited access
        if tmpSiteSpec.accesscontrol == 'grouplist':
            if tmpSiteSpec.sitename not in siteAccessMap or \
                    siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                tmpLog.debug(' skip {0} limited access'.format(tmpSiteName))
                continue
        # check cloud
        if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]:
            tmpLog.debug(' skip {0} cloud missmatch'.format(tmpSiteName))
            continue
        if autoSite:
            sitesForANY.append(tmpSiteName)
        else:
            newScanSiteList.append(tmpSiteName)
    # use AUTO sites if no sites are included
    if newScanSiteList == []:
        newScanSiteList = sitesForANY
    else:
        for tmpSiteName in sitesForANY:
            tmpLog.debug(' skip {0} not included'.format(tmpSiteName))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # sites already used by task
    tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
    if not tmpSt:
        tmpLog.error('failed to get sites which already used by task')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # calculate weight
    fqans = taskSpec.makeFQANs()
    tmpDm1, tmpDm2, tmpPriorityOffset, tmpSerNum, tmpWeight = self.taskBufferIF.getPrioParameters([], taskSpec.userName, fqans,
                                                                                                  taskSpec.workingGroup, True)
    currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset, tmpSerNum, tmpWeight)
    tmpLog.debug('currentPriority={0}'.format(currentPriority))
    tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 currentPriority)
    if not tmpSt:
        tmpLog.error('failed to get job statistics with priority')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    # check for preassigned
    if sitePreAssigned and not taskSpec.site in scanSiteList:
        tmpLog.debug("preassigned site {0} didn't pass all tests".format(taskSpec.site))
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retFatal
    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    weightMap = {}
    candidateSpecList = []
    preSiteCandidateSpec = None
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'running', None, None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'defined', None, None)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'activated', None, None)
        weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
        if tmpSiteName in remoteSourceList:
            # penalize remote-access candidates by their throttled jobs;
            # divide (not overwrite) so the load-based weight is preserved
            nThrottled = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'throttled', None, None)
            weight /= float(nThrottled + 1)
        # normalize weights by taking data availability into account
        if tmpSiteName in dataWeight:
            weight = weight * dataWeight[tmpSiteName]
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        # preassigned
        if sitePreAssigned and tmpSiteName == taskSpec.site:
            preSiteCandidateSpec = siteCandidateSpec
        # set weight
        siteCandidateSpec.weight = weight
        # append: sites already used by the task are always kept,
        # the others compete by weight below
        if tmpSiteName in sitesUsedByTask:
            candidateSpecList.append(siteCandidateSpec)
        else:
            weightMap.setdefault(weight, []).append(siteCandidateSpec)
    # limit the number of sites
    maxNumSites = 5
    for weightVal in sorted(weightMap.keys(), reverse=True):
        if len(candidateSpecList) >= maxNumSites:
            break
        sitesWithWeight = weightMap[weightVal]
        # break ties between equally weighted sites randomly
        random.shuffle(sitesWithWeight)
        candidateSpecList += sitesWithWeight[:(maxNumSites - len(candidateSpecList))]
    # append preassigned
    if sitePreAssigned and preSiteCandidateSpec is not None and not preSiteCandidateSpec in candidateSpecList:
        candidateSpecList.append(preSiteCandidateSpec)
    # collect site names
    scanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        scanSiteList.append(siteCandidateSpec.siteName)
    # get list of available files
    availableFileMap = {}
    for datasetSpec in inputChunk.getDatasets():
        try:
            # get list of site to be scanned
            fileScanSiteList = []
            for tmpSiteName in scanSiteList:
                fileScanSiteList.append(tmpSiteName)
                if tmpSiteName in remoteSourceList and datasetSpec.datasetName in remoteSourceList[tmpSiteName]:
                    for tmpRemoteSite in remoteSourceList[tmpSiteName][datasetSpec.datasetName]:
                        if not tmpRemoteSite in fileScanSiteList:
                            fileScanSiteList.append(tmpRemoteSite)
            # mapping between sites and storage endpoints
            siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(fileScanSiteList, self.siteMapper)
            # get available files per site/endpoint
            tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                        siteStorageEP,
                                                        self.siteMapper,
                                                        ngGroup=[2])
            if tmpAvFileMap is None:
                raise Interaction.JEDITemporaryError('ddmIF.getAvailableFiles failed')
            availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
        except:
            # deliberate catch-all: any lookup failure is reported as a
            # temporary brokerage error instead of crashing the broker
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed to get available files with %s %s' % (errtype.__name__, errvalue))
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retTmpError
    # append candidates
    newScanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        tmpSiteName = siteCandidateSpec.siteName
        # preassigned
        if sitePreAssigned and tmpSiteName != taskSpec.site:
            tmpLog.debug(' skip {0} non pre-assigned site'.format(tmpSiteName))
            continue
        # set available files
        if inputChunk.getDatasets() == []:
            isAvailable = True
        else:
            isAvailable = False
        # NOTE(review): isAvailable is not reset per dataset, so once one
        # dataset is found the remaining ones cannot reject the site --
        # confirm this is intended.
        for tmpDatasetName, availableFiles in availableFileMap.iteritems():
            tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
            # check remote files
            if tmpSiteName in remoteSourceList and tmpDatasetName in remoteSourceList[tmpSiteName]:
                for tmpRemoteSite in remoteSourceList[tmpSiteName][tmpDatasetName]:
                    # compare file counts; the length of the replica list
                    # must be used, not the list object itself
                    if availableFiles.has_key(tmpRemoteSite) and \
                            len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                        # use only remote disk files
                        siteCandidateSpec.remoteFiles += availableFiles[tmpRemoteSite]['localdisk']
                        # set remote site and access protocol
                        siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                        siteCandidateSpec.remoteSource = tmpRemoteSite
                        isAvailable = True
                        break
            # local files
            if availableFiles.has_key(tmpSiteName):
                if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                        len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']):
                    siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['localdisk']
                    # add cached files to local list since cached files go to pending when reassigned
                    siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['cache']
                    siteCandidateSpec.localTapeFiles += availableFiles[tmpSiteName]['localtape']
                    siteCandidateSpec.cacheFiles += availableFiles[tmpSiteName]['cache']
                    siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
                    isAvailable = True
                else:
                    tmpLog.debug('{0} is incompete at {1} : nFiles={2} nLocal={3} nCached={4}'.format(tmpDatasetName,
                                                                                                       tmpSiteName,
                                                                                                       len(tmpDatasetSpec.Files),
                                                                                                       len(availableFiles[tmpSiteName]['localdisk']),
                                                                                                       len(availableFiles[tmpSiteName]['cache'])))
            if not isAvailable:
                break
        # append
        if not isAvailable:
            tmpLog.debug(' skip {0} file unavailable'.format(siteCandidateSpec.siteName))
            continue
        inputChunk.addSiteCandidate(siteCandidateSpec)
        newScanSiteList.append(siteCandidateSpec.siteName)
        tmpLog.debug(' use {0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5}'.format(siteCandidateSpec.siteName,
                                                                                                             siteCandidateSpec.weight,
                                                                                                             len(siteCandidateSpec.localDiskFiles),
                                                                                                             len(siteCandidateSpec.localTapeFiles),
                                                                                                             len(siteCandidateSpec.cacheFiles),
                                                                                                             len(siteCandidateSpec.remoteFiles),
                                                                                                             ))
    scanSiteList = newScanSiteList
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, inputChunk
def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
    """Select candidate sites for a task's input chunk.

    Starting from every site whose type is 'analysis' (plus the task's
    pre-assigned site, if any), the candidate list is narrowed by a fixed
    sequence of filters: site status, core count, software release, memory,
    scratch disk, storage-endpoint free space and blacklist, walltime, pilot
    activity, include/exclude/access/cloud constraints, and input data
    availability. Surviving sites are weighted by job statistics and data
    locality, and those that hold the input files are registered on
    inputChunk through inputChunk.addSiteCandidate().

    Parameters:
        taskSpec: task specification; read for the site, software, resource
            and user attributes referenced below. On failure its error
            diagnostics are set via setErrDiag(). Datasets found to be
            distributed are flagged through datasetSpec.setDistributed().
        cloudName: not referenced anywhere in this method.
        inputChunk: the input chunk to broker; site candidates are added to
            it in place.
        taskParamMap: task parameter mapping; only the 'excludedSite' and
            'includedSite' keys are read, and only when
            taskSpec.useLimitedSites() is true.

    Returns:
        A 2-tuple (status, inputChunk) where status is self.SC_SUCCEEDED,
        self.SC_FAILED (the retTmpError paths) or self.SC_FATAL (the
        retFatal paths).

    NOTE(review): the block nesting here was re-derived from the statement
    order; the spots where more than one nesting is plausible are marked
    with NOTE(review) comments and should be confirmed.
    """
    # make logger
    tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                        monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID,
                                                               datetime.datetime.utcnow().isoformat('/')))
    tmpLog.debug('start')
    # return for failure
    retFatal = self.SC_FATAL,inputChunk
    retTmpError = self.SC_FAILED,inputChunk
    # get primary site candidates
    sitePreAssigned = False
    excludeList = []
    includeList = None
    scanSiteList = []
    # get list of site access
    siteAccessList = self.taskBufferIF.listSiteAccess(None,taskSpec.userName)
    siteAccessMap = {}
    for tmpSiteName,tmpAccess in siteAccessList:
        siteAccessMap[tmpSiteName] = tmpAccess
    # site limitation
    if taskSpec.useLimitedSites():
        if 'excludedSite' in taskParamMap:
            excludeList = taskParamMap['excludedSite']
            # str to list for task retry
            try:
                if type(excludeList) != types.ListType:
                    excludeList = excludeList.split(',')
            except:
                # best effort: a value without split() is left unchanged
                pass
        if 'includedSite' in taskParamMap:
            includeList = taskParamMap['includedSite']
            # str to list for task retry
            if includeList == '':
                includeList = None
            try:
                if type(includeList) != types.ListType:
                    includeList = includeList.split(',')
            except:
                # also reached when includeList is None (None has no split());
                # the bare except keeps it as None, i.e. "no inclusion filter"
                pass
    # loop over all sites
    for siteName,tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
        if tmpSiteSpec.type == 'analysis':
            scanSiteList.append(siteName)
    # preassigned
    if not taskSpec.site in ['',None]:
        # site is pre-assigned
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
        sitePreAssigned = True
        # the pre-assigned site is considered even if it is not of type 'analysis'
        if not taskSpec.site in scanSiteList:
            scanSiteList.append(taskSpec.site)
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
    # allowed remote access protocol
    allowedRemoteProtocol = 'fax'
    # MP
    if taskSpec.coreCount != None and taskSpec.coreCount > 1:
        # use MCORE only
        useMP = 'only'
    elif taskSpec.coreCount == 0:
        # use MCORE and normal
        useMP = 'any'
    else:
        # not use MCORE
        useMP = 'unuse'
    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check site status
        skipFlag = False
        if tmpSiteSpec.status in ['offline']:
            skipFlag = True
        elif tmpSiteSpec.status in ['brokeroff','test']:
            # brokeroff/test sites are kept only when they are the pre-assigned site
            if not sitePreAssigned:
                skipFlag = True
            elif tmpSiteName != taskSpec.site:
                skipFlag = True
        if not skipFlag:
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug(' skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # selection for MP
    # skipped entirely for pre-assigned tasks
    if not sitePreAssigned:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                    (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                newScanSiteList.append(tmpSiteName)
            else:
                tmpLog.debug(' skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                             (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
    ######################################
    # selection for release
    if taskSpec.transHome != None:
        if taskSpec.transHome.startswith('ROOT'):
            # hack until x86_64-slc6-gcc47-opt is published in installedsw
            if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
            else:
                tmpCmtConfig = taskSpec.architecture
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                     cmtConfig=tmpCmtConfig,
                                                                     onlyCmtConfig=True)
        elif 'AthAnalysis' in taskSpec.transHome or re.search('Ath[a-zA-Z]+Base',taskSpec.transHome) != None:
            # AthAnalysis
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                     cmtConfig=taskSpec.architecture,
                                                                     onlyCmtConfig=True)
        else:
            # remove AnalysisTransforms-
            transHome = re.sub('^[^-]+-*','',taskSpec.transHome)
            transHome = re.sub('_','-',transHome)
            if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \
                    re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None :
                # cache is checked
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         caches=transHome,
                                                                         cmtConfig=taskSpec.architecture)
            elif transHome == '' and taskSpec.transUses != None:
                # remove Atlas-
                transUses = taskSpec.transUses.split('-')[-1]
                # release is checked
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         releases=transUses,
                                                                         cmtConfig=taskSpec.architecture)
            else:
                # nightlies
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         releases='CVMFS')
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # release check is disabled or release is available
            if tmpSiteSpec.releases == ['ANY']:
                newScanSiteList.append(tmpSiteName)
            elif tmpSiteName in siteListWithSW:
                newScanSiteList.append(tmpSiteName)
            else:
                # release is unavailable
                tmpLog.debug(' skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                             (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format(len(scanSiteList),
                                                                       taskSpec.transUses,
                                                                       taskSpec.transHome,
                                                                       taskSpec.architecture))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
    ######################################
    # selection for memory
    minRamCount = inputChunk.getMaxRamCount()
    minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
    if not minRamCount in [0,None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # site max memory requirement
            # maxrss takes precedence over maxmemory when it is set
            if not tmpSiteSpec.maxrss in [0,None]:
                site_maxmemory = tmpSiteSpec.maxrss
            else:
                site_maxmemory = tmpSiteSpec.maxmemory
            if not site_maxmemory in [0,None] and minRamCount != 0 and minRamCount > site_maxmemory:
                tmpLog.debug(' skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'.format(tmpSiteName,
                                                                                                                                          site_maxmemory,
                                                                                                                                          minRamCount))
                continue
            # site min memory requirement
            # minrss takes precedence over minmemory when it is set
            if not tmpSiteSpec.minrss in [0,None]:
                site_minmemory = tmpSiteSpec.minrss
            else:
                site_minmemory = tmpSiteSpec.minmemory
            if not site_minmemory in [0,None] and minRamCount != 0 and minRamCount < site_minmemory:
                tmpLog.debug(' skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'.format(tmpSiteName,
                                                                                                                                          site_minmemory,
                                                                                                                                          minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                         minRamCount,taskSpec.ramUnit))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
    ######################################
    # selection for scratch disk
    tmpMaxAtomSize = inputChunk.getMaxAtomSize()
    tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
    tmpOutDiskSize = taskSpec.getOutDiskSize()
    tmpWorkDiskSize = taskSpec.getWorkDiskSize()
    # scratch requirement = scaled output + work area + the input atom itself
    minDiskCountS = tmpOutDiskSize*tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
    # presumably bytes -> MB to match maxwdir; units are not visible here - TODO confirm
    minDiskCountS = minDiskCountS / 1024 / 1024
    # size for direct IO sites
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        # same as the scratch requirement but without the input atom size
        minDiskCountR = tmpOutDiskSize*tmpEffAtomSize + tmpWorkDiskSize
        minDiskCountR = minDiskCountR / 1024 / 1024
    tmpLog.debug('maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'.format(tmpMaxAtomSize,
                                                                                                tmpEffAtomSize,
                                                                                                tmpOutDiskSize,
                                                                                                tmpWorkDiskSize))
    tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format(minDiskCountS,
                                                                         minDiskCountR))
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site
        # maxwdir == 0 disables the check for the site
        if tmpSiteSpec.maxwdir != 0:
            if tmpSiteSpec.isDirectIO():
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug(' skip site={0} due to small scratch disk={1} < {2} criteria=-disk'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.maxwdir,
                                                                                                      minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # selection for available space in SE
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check endpoint
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint(tmpSiteSpec.ddm)
        # sites without a resolvable endpoint pass this check unconditionally
        if tmpEndPoint is not None:
            # free space must be >= 200GB
            diskThreshold = 200
            # expired space counts as usable; None values are treated as zero
            tmpSpaceSize = 0
            if tmpEndPoint['space_expired'] is not None:
                tmpSpaceSize += tmpEndPoint['space_expired']
            if tmpEndPoint['space_free'] is not None:
                tmpSpaceSize += tmpEndPoint['space_free']
            if tmpSpaceSize < diskThreshold:
                tmpLog.debug(' skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'.format(tmpSiteName,tmpSpaceSize,
                                                                                                         diskThreshold))
                continue
            # check if blacklisted
            if tmpEndPoint['blacklisted'] == 'Y':
                tmpLog.debug(' skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if not minWalltime in [0,None] and minWalltime > 0:
        # scale the task walltime by the effective atom size before comparing with site limits
        minWalltime *= tmpEffAtomSize
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            # a site limit of 0 disables the corresponding bound
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(' skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'.format(tmpSiteName,
                                                                                                                                  tmpSiteSpec.maxtime,
                                                                                                                                  minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(' skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'.format(tmpSiteName,
                                                                                                                                tmpSiteSpec.mintime,
                                                                                                                                minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        nPilot = 0
        if nWNmap.has_key(tmpSiteName):
            nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
            tmpLog.debug(' skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName)
            # in test mode the skip is only logged and the site is kept
            if not self.testMode:
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # check inclusion and exclusion
    newScanSiteList = []
    # sites not matched by includeList that are kept as a fallback because 'AUTO' is in it
    sitesForANY = []
    for tmpSiteName in scanSiteList:
        autoSite = False
        # check exclusion
        if AtlasBrokerUtils.isMatched(tmpSiteName,excludeList):
            tmpLog.debug(' skip site={0} excluded criteria=-excluded'.format(tmpSiteName))
            continue
        # check inclusion
        if includeList != None and not AtlasBrokerUtils.isMatched(tmpSiteName,includeList):
            if 'AUTO' in includeList:
                autoSite = True
            else:
                tmpLog.debug(' skip site={0} not included criteria=-notincluded'.format(tmpSiteName))
                continue
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # limited access
        if tmpSiteSpec.accesscontrol == 'grouplist':
            if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                    siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                tmpLog.debug(' skip site={0} limited access criteria=-limitedaccess'.format(tmpSiteName))
                continue
        # check cloud
        if not taskSpec.cloud in [None,'','any',tmpSiteSpec.cloud]:
            tmpLog.debug(' skip site={0} cloud mismatch criteria=-cloudmismatch'.format(tmpSiteName))
            continue
        if autoSite:
            sitesForANY.append(tmpSiteName)
        else:
            newScanSiteList.append(tmpSiteName)
    # use AUTO sites if no sites are included
    if newScanSiteList == []:
        newScanSiteList = sitesForANY
    else:
        for tmpSiteName in sitesForANY:
            tmpLog.debug(' skip site={0} not included criteria=-notincluded'.format(tmpSiteName))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # selection for data availability
    hasDDS = False
    # site name -> accumulated data-locality weight
    dataWeight = {}
    # site name -> {dataset name -> remote source sites}
    remoteSourceList = {}
    if inputChunk.getDatasets() != []:
        oldScanSiteList = copy.copy(scanSiteList)
        for datasetSpec in inputChunk.getDatasets():
            datasetName = datasetSpec.datasetName
            # self.dataSiteMap caches the lookup result per dataset name
            if not self.dataSiteMap.has_key(datasetName):
                # get the list of sites where data is available
                tmpLog.debug('getting the list of sites where {0} is available'.format(datasetName))
                tmpSt,tmpRet = AtlasBrokerUtils.getAnalSitesWithData(scanSiteList,
                                                                     self.siteMapper,
                                                                     self.ddmIF,datasetName)
                if tmpSt in [Interaction.JEDITemporaryError,Interaction.JEDITimeoutError]:
                    tmpLog.error('temporary failed to get the list of sites where data is available, since %s' % tmpRet)
                    taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                    # send info to logger
                    self.sendLogMessage(tmpLog)
                    return retTmpError
                if tmpSt == Interaction.JEDIFatalError:
                    tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                    taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                    # send info to logger
                    self.sendLogMessage(tmpLog)
                    return retFatal
                # append
                self.dataSiteMap[datasetName] = tmpRet
                if datasetName.startswith('ddo'):
                    tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                else:
                    tmpLog.debug(' {0} sites : {1}'.format(len(tmpRet),str(tmpRet)))
                    # NOTE(review): the distributed check is placed under this
                    # else branch (non-'ddo' datasets only); confirm the nesting
                    # check if distributed
                    if tmpRet != {}:
                        # a single complete replica anywhere means not distributed
                        isDistributed = True
                        for tmpMap in tmpRet.values():
                            for tmpVal in tmpMap.values():
                                if tmpVal['state'] == 'complete':
                                    isDistributed = False
                                    break
                            if not isDistributed:
                                break
                        if isDistributed:
                            # check if really distributed
                            isDistributed = self.ddmIF.isDistributedDataset(datasetName)
                            if isDistributed:
                                hasDDS = True
                                datasetSpec.setDistributed()
                                tmpLog.debug(' {0} is distributed'.format(datasetName))
            # check if the data is available at somewhere
            if self.dataSiteMap[datasetName] == {}:
                tmpLog.error('{0} is unavailable at any site'.format(datasetName))
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retFatal
        # get the list of sites where data is available
        scanSiteList = None
        scanSiteListOnDisk = None
        # counted per dataset but never read afterwards in this method
        normFactor = 0
        # NOTE(review): this walks every entry of self.dataSiteMap, not only the
        # datasets of this inputChunk; confirm the map cannot hold other datasets
        for datasetName,tmpDataSite in self.dataSiteMap.iteritems():
            normFactor += 1
            # get sites where replica is available
            tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite,includeTape=True)
            tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite,includeTape=False)
            # get sites which can remotely access source sites
            if inputChunk.isMerging:
                # disable remote access for merging
                tmpSatelliteSites = {}
            elif (not sitePreAssigned) or (sitePreAssigned and not taskSpec.site in tmpSiteList):
                tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(tmpDiskSiteList,self.taskBufferIF,
                                                                       self.siteMapper,nSites=50,
                                                                       protocol=allowedRemoteProtocol)
            else:
                tmpSatelliteSites = {}
            # make weight map for local
            for tmpSiteName in tmpSiteList:
                if not dataWeight.has_key(tmpSiteName):
                    dataWeight[tmpSiteName] = 0
                # give more weight to disk
                if tmpSiteName in tmpDiskSiteList:
                    dataWeight[tmpSiteName] += 1
                else:
                    dataWeight[tmpSiteName] += 0.001
            # make weight map for remote
            for tmpSiteName,tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                # skip since local data is available
                if tmpSiteName in tmpSiteList:
                    continue
                tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                # negative weight for remote access
                # the source weight is divided by wRemote (50, or 50/wansinklimit when set)
                wRemote = 50.0
                if not tmpSiteSpec.wansinklimit in [0,None]:
                    wRemote /= float(tmpSiteSpec.wansinklimit)
                # sum weight
                if not dataWeight.has_key(tmpSiteName):
                    dataWeight[tmpSiteName] = float(tmpWeightSrcMap['weight'])/wRemote
                else:
                    dataWeight[tmpSiteName] += float(tmpWeightSrcMap['weight'])/wRemote
                # make remote source list
                if not remoteSourceList.has_key(tmpSiteName):
                    remoteSourceList[tmpSiteName] = {}
                remoteSourceList[tmpSiteName][datasetName] = tmpWeightSrcMap['source']
            # first list
            # the first dataset seeds both lists, restricted to sites that passed the earlier filters
            if scanSiteList == None:
                scanSiteList = []
                for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                    if not tmpSiteName in oldScanSiteList:
                        continue
                    if not tmpSiteName in scanSiteList:
                        scanSiteList.append(tmpSiteName)
                scanSiteListOnDisk = set()
                for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                    if not tmpSiteName in oldScanSiteList:
                        continue
                    scanSiteListOnDisk.add(tmpSiteName)
                continue
            # pickup sites which have all data
            # later datasets intersect with the running lists
            newScanList = []
            for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                    newScanList.append(tmpSiteName)
            scanSiteList = newScanList
            tmpLog.debug('{0} is available at {1} sites'.format(datasetName,len(scanSiteList)))
            # pickup sites which have all data on DISK
            newScanListOnDisk = set()
            for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                if tmpSiteName in scanSiteListOnDisk:
                    newScanListOnDisk.add(tmpSiteName)
            scanSiteListOnDisk = newScanListOnDisk
            tmpLog.debug('{0} is available at {1} sites on DISK'.format(datasetName,len(scanSiteListOnDisk)))
        # check for preassigned
        if sitePreAssigned and not taskSpec.site in scanSiteList:
            scanSiteList = []
            tmpLog.debug('data is unavailable locally or remotely at preassigned site {0}'.format(taskSpec.site))
        elif len(scanSiteListOnDisk) > 0:
            # use only disk sites
            scanSiteList = list(scanSiteListOnDisk)
        tmpLog.debug('{0} candidates have input data'.format(len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retFatal
    ######################################
    # sites already used by task
    tmpSt,sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
    if not tmpSt:
        tmpLog.error('failed to get sites which already used by task')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # calculate weight
    # fqans is referenced only by the disabled block below
    fqans = taskSpec.makeFQANs()
    """
    tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                            taskSpec.workingGroup,True)
    currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
    currentPriority -= 500
    tmpLog.debug('currentPriority={0}'.format(currentPriority))
    """
    tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                taskSpec.prodSourceLabel)
    if not tmpSt:
        tmpLog.error('failed to get job statistics with priority')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    # check for preassigned
    if sitePreAssigned and not taskSpec.site in scanSiteList:
        tmpLog.debug("preassigned site {0} did not pass all tests".format(taskSpec.site))
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retFatal
    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    # weight value -> candidates with that weight (sites not yet used by the task)
    weightMap = {}
    candidateSpecList = []
    # time window passed to the failure-count query; unit is not visible here - TODO confirm
    timeWindowForFC = 6
    preSiteCandidateSpec = None
    failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(taskSpec.jediTaskID,timeWindowForFC)
    problematicSites = set()
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running', None,None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined', None,None)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                     AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
        nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting', None,None)
        nFailed = 0
        nClosed = 0
        nFinished = 0
        if tmpSiteName in failureCounts:
            if 'failed' in failureCounts[tmpSiteName]:
                nFailed = failureCounts[tmpSiteName]['failed']
            if 'closed' in failureCounts[tmpSiteName]:
                nClosed = failureCounts[tmpSiteName]['closed']
            if 'finished' in failureCounts[tmpSiteName]:
                nFinished = failureCounts[tmpSiteName]['finished']
        # problematic sites
        # flagged when failed+closed jobs exceed twice the finished jobs
        if nFailed+nClosed > 2*nFinished:
            problematicSites.add(tmpSiteName)
        # calculate weight
        # the +1 terms keep the ratio defined when a site has no jobs
        weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + 1)
        nThrottled = 0
        # sites that read remotely are additionally penalized by their throttled jobs
        if remoteSourceList.has_key(tmpSiteName):
            nThrottled = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
            weight /= float(nThrottled + 1)
        # normalize weights by taking data availability into account
        tmpDataWeight = 1
        if dataWeight.has_key(tmpSiteName):
            weight = weight * dataWeight[tmpSiteName]
            tmpDataWeight = dataWeight[tmpSiteName]
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        # preassigned
        if sitePreAssigned and tmpSiteName == taskSpec.site:
            preSiteCandidateSpec = siteCandidateSpec
        # set weight
        siteCandidateSpec.weight = weight
        tmpStr = ' site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(tmpSiteName,
                                                                           nRunning,
                                                                           nAssigned,
                                                                           nActivated,
                                                                           nStarting)
        tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(nFailed,
                                                                                         nClosed,
                                                                                         nFinished,
                                                                                         nThrottled,
                                                                                         tmpDataWeight,
                                                                                         weight)
        tmpLog.debug(tmpStr)
        # append
        # sites already used by the task go first, regardless of weight
        if tmpSiteName in sitesUsedByTask:
            candidateSpecList.append(siteCandidateSpec)
        else:
            if not weightMap.has_key(weight):
                weightMap[weight] = []
            weightMap[weight].append(siteCandidateSpec)
    # sort candidates by weights
    # descending weight; ties are broken randomly
    weightList = weightMap.keys()
    weightList.sort()
    weightList.reverse()
    for weightVal in weightList:
        sitesWithWeight = weightMap[weightVal]
        random.shuffle(sitesWithWeight)
        candidateSpecList += sitesWithWeight
    # limit the number of sites. use all sites for distributed datasets
    # NOTE(review): maxNumSites is bound only when hasDDS is false, so the
    # skipProblematicSites call is kept inside this branch; confirm the nesting
    if not hasDDS:
        maxNumSites = 10
        # remove problematic sites
        candidateSpecList = AtlasBrokerUtils.skipProblematicSites(candidateSpecList,
                                                                  problematicSites,
                                                                  sitesUsedByTask,
                                                                  preSiteCandidateSpec,
                                                                  maxNumSites,
                                                                  timeWindowForFC,
                                                                  tmpLog)
    # append preassigned
    if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList:
        candidateSpecList.append(preSiteCandidateSpec)
    # collect site names
    scanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        scanSiteList.append(siteCandidateSpec.siteName)
    # get list of available files
    # dataset name -> result of ddmIF.getAvailableFiles for that dataset
    availableFileMap = {}
    for datasetSpec in inputChunk.getDatasets():
        try:
            # get list of site to be scanned
            # candidate sites plus the remote sources they would read this dataset from
            fileScanSiteList = []
            for tmpSiteName in scanSiteList:
                fileScanSiteList.append(tmpSiteName)
                if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(datasetSpec.datasetName):
                    for tmpRemoteSite in remoteSourceList[tmpSiteName][datasetSpec.datasetName]:
                        if not tmpRemoteSite in fileScanSiteList:
                            fileScanSiteList.append(tmpRemoteSite)
            # mapping between sites and storage endpoints
            siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(fileScanSiteList,self.siteMapper)
            # disable file lookup for merge jobs
            if inputChunk.isMerging:
                checkCompleteness = False
            else:
                checkCompleteness = True
            # get available files per site/endpoint
            tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                        siteStorageEP,
                                                        self.siteMapper,
                                                        ngGroup=[2],
                                                        checkCompleteness=checkCompleteness)
            if tmpAvFileMap == None:
                raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
            availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
        except:
            # any failure during the lookup is reported as a temporary error
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
    # append candidates
    newScanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        tmpSiteName = siteCandidateSpec.siteName
        # preassigned
        if sitePreAssigned and tmpSiteName != taskSpec.site:
            tmpLog.debug(' skip site={0} non pre-assigned site criteria=-nonpreassigned'.format(tmpSiteName))
            continue
        # set available files
        if inputChunk.getDatasets() == []:
            isAvailable = True
        else:
            isAvailable = False
        # NOTE(review): isAvailable is not reset per dataset, so once one dataset
        # is found at the site a later unavailable dataset no longer triggers the
        # break below; confirm whether this is intended
        for tmpDatasetName,availableFiles in availableFileMap.iteritems():
            tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
            # check remote files
            if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(tmpDatasetName):
                for tmpRemoteSite in remoteSourceList[tmpSiteName][tmpDatasetName]:
                    # the first remote source with at least as many disk files as the dataset is used
                    if availableFiles.has_key(tmpRemoteSite) and \
                            len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                        # use only remote disk files
                        siteCandidateSpec.remoteFiles += availableFiles[tmpRemoteSite]['localdisk']
                        # set remote site and access protocol
                        siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                        siteCandidateSpec.remoteSource = tmpRemoteSite
                        isAvailable = True
                        break
            # local files
            if availableFiles.has_key(tmpSiteName):
                # accepted when disk, cache or tape alone covers the file count,
                # or when a distributed dataset has at least one file at the site
                if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                        len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                        len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                        (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                    siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['localdisk']
                    # add cached files to local list since cached files go to pending when reassigned
                    siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['cache']
                    siteCandidateSpec.localTapeFiles += availableFiles[tmpSiteName]['localtape']
                    siteCandidateSpec.cacheFiles += availableFiles[tmpSiteName]['cache']
                    siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
                    siteCandidateSpec.addAvailableFiles(availableFiles[tmpSiteName]['all'])
                    isAvailable = True
                else:
                    tmpMsg = '{0} is incompete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                    tmpLog.debug(tmpMsg.format(tmpDatasetName,
                                               tmpSiteName,
                                               len(tmpDatasetSpec.Files),
                                               len(availableFiles[tmpSiteName]['localdisk']),
                                               len(availableFiles[tmpSiteName]['cache']),
                                               len(availableFiles[tmpSiteName]['localtape']),
                                               ))
            if not isAvailable:
                break
        # append
        if not isAvailable:
            tmpLog.debug(' skip site={0} file unavailable criteria=-fileunavailable'.format(siteCandidateSpec.siteName))
            continue
        inputChunk.addSiteCandidate(siteCandidateSpec)
        newScanSiteList.append(siteCandidateSpec.siteName)
        tmpLog.debug(' use site={0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5} criteria=+use'.format(siteCandidateSpec.siteName,
                                                                                                                             siteCandidateSpec.weight,
                                                                                                                             len(siteCandidateSpec.localDiskFiles),
                                                                                                                             len(siteCandidateSpec.localTapeFiles),
                                                                                                                             len(siteCandidateSpec.cacheFiles),
                                                                                                                             len(siteCandidateSpec.remoteFiles),
                                                                                                                             ))
    scanSiteList = newScanSiteList
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    # send info to logger
    self.sendLogMessage(tmpLog)
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED,inputChunk
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
    """Pick candidate sites for a task and register them on ``inputChunk``.

    The initial site list is the site pre-assigned on the task, else the
    site pre-assigned on the input chunk, else every site of ``cloudName``.
    That list is narrowed by site status, memory, scratch disk, SE space
    and walltime.  Each surviving site gets the weight
    ``(nRunning+1) / (nActivated+nAssigned+1) / (nAssigned+1)``.  Sites
    already used by the task are always kept; the list is then topped up
    to five entries with the highest-weight remaining sites, sites of
    equal weight being taken in random order.

    :param taskSpec: task being brokered; its ``site``, ``ramCount``,
        ``walltime``, ``prodSourceLabel``, ``vo``, ``currentPriority`` and
        disk-size getters are read, and ``setErrDiag`` is called on failure.
    :param cloudName: cloud whose sites are scanned when nothing is
        pre-assigned.
    :param inputChunk: input chunk; receives the chosen candidates through
        ``addSiteCandidate``.
    :param taskParamMap: not used by this implementation.
    :return: ``(self.SC_SUCCEEDED, inputChunk)`` when at least one candidate
        was registered, otherwise ``(self.SC_FAILED, inputChunk)``.
    """
    # make logger
    tmpLog = MsgWrapper(logger,
                        '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    tmpLog.debug('start')
    # return for failure
    retTmpError = self.SC_FAILED, inputChunk

    def failWith(errMsg):
        # log the error, attach the uploaded log to the task and hand back
        # the temporary-failure return value
        tmpLog.error(errMsg)
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError

    # get sites in the cloud
    if taskSpec.site not in ['', None]:
        scanSiteList = [taskSpec.site]
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
    else:
        masterSite = inputChunk.getPreassignedSite()
        if masterSite is not None:
            scanSiteList = [masterSite]
            tmpLog.debug(
                'site={0} is pre-assigned in masterDS'.format(masterSite))
        else:
            scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
            tmpLog.debug('cloud=%s has %s candidates' %
                         (cloudName, len(scanSiteList)))
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))

    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # only online sites are kept
        if tmpSiteSpec.status == 'online':
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug(' skip %s due to status=%s' %
                         (tmpSiteName, tmpSiteSpec.status))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return failWith('no candidates')

    ######################################
    # selection for memory
    minRamCount = max(taskSpec.ramCount, inputChunk.ramCount)
    if minRamCount not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # a site limit of 0 means the limit is not checked
            if tmpSiteSpec.maxmemory != 0 and \
                    minRamCount > tmpSiteSpec.maxmemory:
                tmpLog.debug(
                    ' skip {0} due to site RAM shortage={1}(site upper limit) < {2}'
                    .format(tmpSiteName, tmpSiteSpec.maxmemory,
                            minRamCount))
                continue
            if tmpSiteSpec.minmemory != 0 and \
                    minRamCount < tmpSiteSpec.minmemory:
                tmpLog.debug(
                    ' skip {0} due to job RAM shortage={1}(site lower limit) > {2}'
                    .format(tmpSiteName, tmpSiteSpec.minmemory,
                            minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
            len(scanSiteList), minRamCount, taskSpec.ramUnit))
        if not scanSiteList:
            return failWith('no candidates')

    ######################################
    # selection for scratch disk
    # NOTE(review): the two divisions by 1024 presumably convert bytes to
    # MB to match maxwdir - confirm the units
    minDiskCountS = taskSpec.getOutDiskSize() + \
        taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
    minDiskCountS = minDiskCountS / 1024 / 1024
    # size for direct IO sites
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        # the input size is left out of the requirement
        minDiskCountR = taskSpec.getOutDiskSize() + \
            taskSpec.getWorkDiskSize()
        minDiskCountR = minDiskCountR / 1024 / 1024
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # maxwdir == 0 disables the check at the site
        if tmpSiteSpec.maxwdir != 0:
            if tmpSiteSpec.isDirectIO():
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug(
                    ' skip {0} due to small scratch disk={1} < {2}'.format(
                        tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return failWith('no candidates')

    ######################################
    # selection for available space in SE
    # free space must be >= 200GB
    diskThreshold = 200
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # space == 0 disables the check at the site
        if tmpSiteSpec.space != 0 and tmpSiteSpec.space < diskThreshold:
            tmpLog.debug(
                ' skip {0} due to disk shortage in SE = {1} < {2}GB'.format(
                    tmpSiteName, tmpSiteSpec.space, diskThreshold))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return failWith('no candidates')

    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if minWalltime not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # a site limit of 0 means the limit is not checked
            if tmpSiteSpec.maxtime != 0 and \
                    minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(
                    ' skip {0} due to short site walltime={1}(site upper limit) < {2}'
                    .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and \
                    minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(
                    ' skip {0} due to short job walltime={1}(site lower limit) > {2}'
                    .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
            len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
        if not scanSiteList:
            return failWith('no candidates')

    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        nPilot = 0
        if tmpSiteName in nWNmap:
            nPilot = nWNmap[tmpSiteName]['getJob'] + \
                nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
            # NOTE: the site is only reported here, not dropped; the
            # "continue" that used to follow this message is disabled
            tmpLog.debug(' skip %s due to no pilot' % tmpSiteName)
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return failWith('no candidates')

    ######################################
    # sites already used by task
    tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
        taskSpec.jediTaskID)
    if not tmpSt:
        return failWith('failed to get sites which already used by task')

    ######################################
    # calculate weight
    tmpSt, jobStatPrioMap = \
        self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
            taskSpec.vo, taskSpec.prodSourceLabel,
            taskSpec.currentPriority)
    if not tmpSt:
        return failWith('failed to get job statistics with priority')

    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    weightMap = {}
    candidateSpecList = []
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status.
        # Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                               'running', None, None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'defined', None, None)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'activated',
                                                 None, None)
        weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) \
            / float(nAssigned + 1)
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        siteCandidateSpec.weight = weight
        if tmpSiteName in sitesUsedByTask:
            # sites already used by the task bypass the weight ranking
            candidateSpecList.append(siteCandidateSpec)
        else:
            weightMap.setdefault(weight, []).append(siteCandidateSpec)

    # limit the number of sites
    maxNumSites = 5
    # highest weight first; sorted() works on both Python 2 and 3
    for weightVal in sorted(weightMap, reverse=True):
        if len(candidateSpecList) >= maxNumSites:
            break
        sitesWithWeight = weightMap[weightVal]
        # sites with the same weight are taken in random order
        random.shuffle(sitesWithWeight)
        candidateSpecList += \
            sitesWithWeight[:(maxNumSites - len(candidateSpecList))]

    # append candidates
    for siteCandidateSpec in candidateSpecList:
        inputChunk.addSiteCandidate(siteCandidateSpec)
        tmpLog.debug(' use {0} with weight={1}'.format(
            siteCandidateSpec.siteName, siteCandidateSpec.weight))
    if not candidateSpecList:
        return failWith('no candidates')

    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, inputChunk
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
    """Run site brokerage for one task and fill ``inputChunk`` with candidates.

    Steps, in order:

    1. Build the initial site list: the task's pre-assigned site, else the
       input chunk's pre-assigned site, else all sites of ``cloudName``.
    2. Drop sites that are not ``online``, that violate the memory or
       walltime limits, whose scratch disk is too small, or whose SE space
       is below 200.
    3. Report (but do not drop) sites without pilot activity.
    4. Weight each remaining site with
       ``(nRunning+1) / (nActivated+nAssigned+1) / (nAssigned+1)``, keep
       every site already used by the task, and fill up to five candidates
       with the best-weighted other sites.

    :param taskSpec: task being brokered; ``setErrDiag`` is called on it
        whenever the method fails.
    :param cloudName: cloud used to look up sites when no site is
        pre-assigned.
    :param inputChunk: input chunk that receives the selected candidates via
        ``addSiteCandidate``.
    :param taskParamMap: accepted for interface compatibility; not read here.
    :return: tuple ``(self.SC_SUCCEEDED, inputChunk)`` on success or
        ``(self.SC_FAILED, inputChunk)`` on any failure.
    """
    # make logger
    tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    tmpLog.debug('start')
    # return for failure
    retTmpError = self.SC_FAILED, inputChunk
    # get sites in the cloud
    chunkSite = None
    if taskSpec.site in ['', None]:
        chunkSite = inputChunk.getPreassignedSite()
    if taskSpec.site not in ['', None]:
        scanSiteList = [taskSpec.site]
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
    elif chunkSite is not None:
        scanSiteList = [chunkSite]
        tmpLog.debug('site={0} is pre-assigned in masterDS'.format(chunkSite))
    else:
        scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
        tmpLog.debug('cloud=%s has %s candidates' % (cloudName, len(scanSiteList)))
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check site status
        if tmpSiteSpec.status != 'online':
            tmpLog.debug(' skip %s due to status=%s' % (tmpSiteName, tmpSiteSpec.status))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
    if not scanSiteList:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for memory
    minRamCount = max(taskSpec.ramCount, inputChunk.ramCount)
    if minRamCount not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site; a limit of 0 is treated as "no limit"
            if tmpSiteSpec.maxmemory != 0 and minRamCount > tmpSiteSpec.maxmemory:
                tmpLog.debug(' skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                   tmpSiteSpec.maxmemory,
                                                                                                   minRamCount))
                continue
            if tmpSiteSpec.minmemory != 0 and minRamCount < tmpSiteSpec.minmemory:
                tmpLog.debug(' skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                  tmpSiteSpec.minmemory,
                                                                                                  minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                         minRamCount, taskSpec.ramUnit))
        if not scanSiteList:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retTmpError
    ######################################
    # selection for scratch disk
    # NOTE(review): "/ 1024 / 1024" presumably scales bytes to MB - confirm
    # against the units of maxwdir
    minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
    minDiskCountS = minDiskCountS / 1024 / 1024
    # size for direct IO sites
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        # input size is not counted in this case
        minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
        minDiskCountR = minDiskCountR / 1024 / 1024
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site; maxwdir == 0 disables the check
        if tmpSiteSpec.maxwdir != 0:
            if tmpSiteSpec.isDirectIO():
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug(' skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                   tmpSiteSpec.maxwdir,
                                                                                   minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
    if not scanSiteList:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for available space in SE
    # free space must be >= 200GB
    diskThreshold = 200
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site; space == 0 disables the check
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        tmpSpaceSize = tmpSiteSpec.space
        if tmpSpaceSize != 0 and tmpSpaceSize < diskThreshold:
            tmpLog.debug(' skip {0} due to disk shortage in SE = {1} < {2}GB'.format(tmpSiteName, tmpSiteSpec.space,
                                                                                    diskThreshold))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
    if not scanSiteList:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if minWalltime not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site; a limit of 0 is treated as "no limit"
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(' skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.maxtime,
                                                                                                      minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(' skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                     tmpSiteSpec.mintime,
                                                                                                     minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList), minWalltime,
                                                                           taskSpec.walltimeUnit))
        if not scanSiteList:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retTmpError
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        nPilot = 0
        if tmpSiteName in nWNmap:
            nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
            # informational only: the "continue" that would drop the site
            # is intentionally disabled, so the site is still kept below
            tmpLog.debug(' skip %s due to no pilot' % tmpSiteName)
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
    if not scanSiteList:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # sites already used by task
    tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
    if not tmpSt:
        tmpLog.error('failed to get sites which already used by task')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # calculate weight
    tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 taskSpec.currentPriority)
    if not tmpSt:
        tmpLog.error('failed to get job statistics with priority')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    weightMap = {}
    candidateSpecList = []
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status.
        # Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'running', None, None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'defined', None, None)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'activated', None, None)
        weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        # set weight
        siteCandidateSpec.weight = weight
        # append
        if tmpSiteName in sitesUsedByTask:
            # already-used sites are kept regardless of their weight
            candidateSpecList.append(siteCandidateSpec)
        else:
            if weight not in weightMap:
                weightMap[weight] = []
            weightMap[weight].append(siteCandidateSpec)
    # limit the number of sites
    maxNumSites = 5
    # descending weights; sorted() avoids the Python-2-only keys().sort()
    weightList = sorted(weightMap, reverse=True)
    for weightVal in weightList:
        if len(candidateSpecList) >= maxNumSites:
            break
        sitesWithWeight = weightMap[weightVal]
        # equal-weight sites are picked in random order
        random.shuffle(sitesWithWeight)
        candidateSpecList += sitesWithWeight[:(maxNumSites - len(candidateSpecList))]
    # append candidates
    newScanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        inputChunk.addSiteCandidate(siteCandidateSpec)
        newScanSiteList.append(siteCandidateSpec.siteName)
        tmpLog.debug(' use {0} with weight={1}'.format(siteCandidateSpec.siteName, siteCandidateSpec.weight))
    scanSiteList = newScanSiteList
    if not scanSiteList:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, inputChunk
def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap): # make logger tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID), monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID, datetime.datetime.utcnow().isoformat('/'))) tmpLog.debug('start') # return for failure retFatal = self.SC_FATAL,inputChunk retTmpError = self.SC_FAILED,inputChunk # get sites in the cloud sitePreAssigned = False siteListPreAssigned = False if not taskSpec.site in ['',None]: if ',' in taskSpec.site: # site list siteListPreAssigned = True scanSiteList = taskSpec.site.split(',') else: # site sitePreAssigned = True scanSiteList = [taskSpec.site] tmpLog.debug('site={0} is pre-assigned criteria=+preassign'.format(taskSpec.site)) elif inputChunk.getPreassignedSite() != None: siteListPreAssigned = True scanSiteList = DataServiceUtils.getSitesShareDDM(self.siteMapper,inputChunk.getPreassignedSite()) scanSiteList.append(inputChunk.getPreassignedSite()) tmpMsg = 'use site={0} since they share DDM endpoints with orinal_site={1} which is pre-assigned in masterDS '.format(str(scanSiteList), inputChunk.getPreassignedSite()) tmpMsg += 'criteria=+premerge' tmpLog.debug(tmpMsg) else: scanSiteList = self.siteMapper.getCloud(cloudName)['sites'] tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList))) # get job statistics tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel) if not tmpSt: tmpLog.error('failed to get job statistics') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # T1 if not taskSpec.useWorldCloud(): t1Sites = [self.siteMapper.getCloud(cloudName)['source']] # hospital sites if self.hospitalQueueMap.has_key(cloudName): t1Sites += self.hospitalQueueMap[cloudName] else: # get destination for WORLD cloud t1Sites = [] tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,datasetTypes=['log']) 
for datasetSpec in datasetSpecList: if not datasetSpec.destination in t1Sites: t1Sites.append(datasetSpec.destination) # sites sharing SE with T1 sitesShareSeT1 = DataServiceUtils.getSitesShareDDM(self.siteMapper,t1Sites[0]) # all T1 allT1Sites = self.getAllT1Sites() # core count if inputChunk.isMerging and taskSpec.mergeCoreCount != None: taskCoreCount = taskSpec.mergeCoreCount else: taskCoreCount = taskSpec.coreCount # MP if taskCoreCount != None and taskCoreCount > 1: # use MCORE only useMP = 'only' elif taskCoreCount == 0: # use MCORE and normal useMP = 'any' else: # not use MCORE useMP = 'unuse' # get workQueue workQueue = self.taskBufferIF.getWorkQueueMap().getQueueWithID(taskSpec.workQueue_ID) ###################################### # selection for status if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check site status skipFlag = False if tmpSiteSpec.status != 'online': skipFlag = True if not skipFlag: newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for reprocessing if taskSpec.processingType == 'reprocessing': newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check schedconfig.validatedreleases if tmpSiteSpec.validatedreleases == ['True']: newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to validatedreleases <> True criteria=-validated' % tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList))) if scanSiteList == []: 
tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for high priorities t1WeightForHighPrio = 1 if (taskSpec.currentPriority >= 900 or inputChunk.useScout()) \ and not sitePreAssigned and not siteListPreAssigned: t1WeightForHighPrio = 100 newScanSiteList = [] for tmpSiteName in scanSiteList: if tmpSiteName in t1Sites+sitesShareSeT1+allT1Sites: newScanSiteList.append(tmpSiteName) else: tmpMsg = ' skip site={0} due to highPrio/scouts which needs to run at T1 or sites associated with {1} T1 SE '.format(tmpSiteName, cloudName) tmpMsg += 'criteria=-scoutprio' tmpLog.debug(tmpMsg) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for highPrio/scouts'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection to avoid slow or inactive sites if (taskSpec.currentPriority >= 800 or inputChunk.useScout() or \ inputChunk.isMerging or taskSpec.mergeOutput()) \ and not sitePreAssigned: # get inactive sites inactiveTimeLimit = 2 inactiveSites = self.taskBufferIF.getInactiveSites_JEDI('production',inactiveTimeLimit) newScanSiteList = [] tmpMsgList = [] for tmpSiteName in scanSiteList: nToGetAll = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'starting') if tmpSiteName in ['BNL_CLOUD','BNL_CLOUD_MCORE','ATLAS_OPP_OSG']: tmpMsg = ' skip site={0} since high prio/scouts/merge needs to avoid slow sites '.format(tmpSiteName) tmpMsg += 'criteria=-slow' tmpMsgList.append(tmpMsg) elif tmpSiteName in inactiveSites and nToGetAll > 0: tmpMsg = ' skip site={0} since high prio/scouts/merge needs to avoid inactive sites (laststart is older than {1}h) '.format(tmpSiteName, inactiveTimeLimit) tmpMsg += 
'criteria=-inactive' tmpMsgList.append(tmpMsg) else: newScanSiteList.append(tmpSiteName) if newScanSiteList != []: scanSiteList = newScanSiteList for tmpMsg in tmpMsgList: tmpLog.debug(tmpMsg) tmpLog.debug('{0} candidates passed for slowness/inactive check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for data availability if not sitePreAssigned and not siteListPreAssigned: for datasetSpec in inputChunk.getDatasets(): datasetName = datasetSpec.datasetName # ignore DBR if DataServiceUtils.isDBR(datasetName): continue if not self.dataSiteMap.has_key(datasetName): # get the list of sites where data is available tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName)) tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper, self.ddmIF,datasetName, datasetSpec.storageToken) if tmpSt == self.SC_FAILED: tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError if tmpSt == self.SC_FATAL: tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retFatal # append self.dataSiteMap[datasetName] = tmpRet tmpLog.debug('map of data availability : {0}'.format(str(tmpRet))) """ # check if T1 has the data if self.dataSiteMap[datasetName].has_key(cloudName): cloudHasData = True else: cloudHasData = False t1hasData = False if cloudHasData: for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems(): if tmpSeVal['state'] == 'complete': t1hasData = True break # T1 has incomplete data while no data at T2 if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] 
== []: # use incomplete data at T1 anyway t1hasData = True # data is missing at T1 if not t1hasData: tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName)) # make subscription to T1 # FIXME pass # use T2 until data is complete at T1 newScanSiteList = [] for tmpSiteName in scanSiteList: if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']: newScanSiteList.append(tmpSiteName) else: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) if tmpSiteSpec.cloud != cloudName: tmpLog.debug(' skip %s due to foreign T2' % tmpSiteName) else: tmpLog.debug(' skip %s due to missing data at T2' % tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError """ ###################################### # selection for fairshare if not (workQueue.queue_type in ['managed'] and workQueue.queue_name in ['test','validation']): newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,inputChunk.isMerging,tmpLog): tmpLog.debug(' skip site={0} due to zero share criteria=-zeroshare'.format(tmpSiteName)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for I/O intensive tasks # FIXME pass ###################################### # selection for MP if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = 
self.siteMapper.getSite(tmpSiteName) # check at the site if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \ (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]): newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to core mismatch site:%s <> task:%s criteria=-cpucore' % \ (tmpSiteName,tmpSiteSpec.coreCount,taskCoreCount)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for release if taskSpec.transHome != None: if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None: # only cache is checked for normal tasks siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList, caches=taskSpec.transHome, cmtConfig=taskSpec.architecture) else: # nightlies siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList, releases='CVMFS') # releases='nightlies', # cmtConfig=taskSpec.architecture) newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # release check is disabled or release is available if tmpSiteSpec.releases == ['ANY'] or \ tmpSiteName in ['CERN-RELEASE']: newScanSiteList.append(tmpSiteName) elif tmpSiteName in siteListWithSW: newScanSiteList.append(tmpSiteName) else: # release is unavailable tmpLog.debug(' skip site=%s due to missing cache=%s:%s criteria=-cache' % \ (tmpSiteName,taskSpec.transHome,taskSpec.architecture)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList), taskSpec.transHome, taskSpec.architecture)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError 
###################################### # selection for memory minRamCount = max(taskSpec.ramCount, inputChunk.ramCount) if not minRamCount in [0,None]: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory: tmpMsg = ' skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(tmpSiteName, tmpSiteSpec.maxmemory, minRamCount) tmpMsg += 'criteria=-lowmemory' tmpLog.debug(tmpMsg) continue if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory: tmpMsg = ' skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(tmpSiteName, tmpSiteSpec.minmemory, minRamCount) tmpMsg += 'criteria=-highmemory' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed memory check {1}({2})'.format(len(scanSiteList), minRamCount,taskSpec.ramUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for scratch disk if taskSpec.outputScaleWithEvents(): minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(getNumEvents=True) else: minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(effectiveSize=True) minDiskCount = minDiskCount + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize() minDiskCount = minDiskCount / 1024 / 1024 newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir: tmpMsg = ' skip site={0} due to small scratch disk {1} less than {2} '.format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount) tmpMsg += 'criteria=-disk' tmpLog.debug(tmpMsg) 
continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed scratch disk check minDiskCount>{1}MB'.format(len(scanSiteList), minDiskCount)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for available space in SE newScanSiteList = [] for tmpSiteName in scanSiteList: # don't check for T1 if tmpSiteName in t1Sites: pass else: # check at the site tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # the number of jobs which will produce outputs nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'throttled') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running') # the size of input files which will be copied to the site movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName) if movingInputSize == None: tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # free space - inputs - outputs(250MB*nJobs) must be >= 200GB outSizePerJob = 0.250 diskThreshold = 200 tmpSiteSpaceMap = self.ddmIF.getRseUsage(tmpSiteSpec.ddm) if tmpSiteSpaceMap != {}: tmpSiteFreeSpace = tmpSiteSpaceMap['free'] tmpSpaceSize = tmpSiteFreeSpace - movingInputSize - nRemJobs * outSizePerJob if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold: tmpLog.debug(' skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteFreeSpace, movingInputSize,outSizePerJob, nRemJobs,diskThreshold)) continue # check if blacklisted if self.ddmIF.isBlackListedEP(tmpSiteSpec.ddm): tmpLog.debug(' skip site={0} since endpoint={1} is blacklisted in DDM 
criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for walltime if not taskSpec.useHS06(): tmpMaxAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True) minWalltime = taskSpec.walltime * tmpMaxAtomSize strMinWalltime = 'walltime*inputSize={0}*{1}'.format(taskSpec.walltime,tmpMaxAtomSize) else: tmpMaxAtomSize = inputChunk.getMaxAtomSize(getNumEvents=True) minWalltime = taskSpec.cpuTime * tmpMaxAtomSize strMinWalltime = 'cpuTime*nEventsPerJob={0}*{1}'.format(taskSpec.cpuTime,tmpMaxAtomSize) if minWalltime != None or inputChunk.useScout(): newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) siteMaxTime = tmpSiteSpec.maxtime origSiteMaxTime = siteMaxTime # sending scouts merge or wallime-undefined jobs to only sites where walltime is more than 1 day if inputChunk.useScout() or inputChunk.isMerging or \ (taskSpec.walltime in [0,None] and taskSpec.walltimeUnit in ['',None] and taskSpec.cpuTimeUnit in ['',None]): minTimeForZeroWalltime = 24*60*60 if siteMaxTime != 0 and siteMaxTime < minTimeForZeroWalltime: tmpMsg = ' skip site={0} due to site walltime {1} (site upper limit) insufficient '.format(tmpSiteName, siteMaxTime) if inputChunk.useScout(): tmpMsg += 'for scouts ({0} at least) '.format(minTimeForZeroWalltime) tmpMsg += 'criteria=-scoutwalltime' else: tmpMsg += 'for zero walltime ({0} at least) '.format(minTimeForZeroWalltime) tmpMsg += 'criteria=-zerowalltime' tmpLog.debug(tmpMsg) continue # check max walltime at the site tmpSiteStr = '{0}'.format(siteMaxTime) if taskSpec.useHS06(): oldSiteMaxTime = siteMaxTime siteMaxTime -= taskSpec.baseWalltime 
tmpSiteStr = '({0}-{1})'.format(oldSiteMaxTime,taskSpec.baseWalltime) if not siteMaxTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]: siteMaxTime *= tmpSiteSpec.coreCount tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount) if taskSpec.useHS06(): if not siteMaxTime in [None,0] and not tmpSiteSpec.corepower in [None,0]: siteMaxTime *= tmpSiteSpec.corepower tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower) siteMaxTime *= float(taskSpec.cpuEfficiency) / 100.0 siteMaxTime = long(siteMaxTime) tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency) if origSiteMaxTime != 0 and minWalltime > siteMaxTime: tmpMsg = ' skip site={0} due to short site walltime {1} (site upper limit) less than {2} '.format(tmpSiteName, tmpSiteStr, strMinWalltime) tmpMsg += 'criteria=-shortwalltime' tmpLog.debug(tmpMsg) continue # check min walltime at the site siteMinTime = tmpSiteSpec.mintime origSiteMinTime = siteMinTime tmpSiteStr = '{0}'.format(siteMinTime) if taskSpec.useHS06(): oldSiteMinTime = siteMinTime siteMinTime -= taskSpec.baseWalltime tmpSiteStr = '({0}-{1})'.format(oldSiteMinTime,taskSpec.baseWalltime) if not siteMinTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]: siteMinTime *= tmpSiteSpec.coreCount tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount) if taskSpec.useHS06(): if not siteMinTime in [None,0] and not tmpSiteSpec.corepower in [None,0]: siteMinTime *= tmpSiteSpec.corepower tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower) siteMinTime *= float(taskSpec.cpuEfficiency) / 100.0 siteMinTime = long(siteMinTime) tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency) if origSiteMinTime != 0 and minWalltime < siteMinTime: tmpMsg = ' skip site {0} due to short job walltime {1} (site lower limit) greater than {2} '.format(tmpSiteName, tmpSiteStr, strMinWalltime) tmpMsg += 'criteria=-longwalltime' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList if not taskSpec.useHS06(): tmpLog.debug('{0} candidates passed walltime 
check {1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit)) else: tmpLog.debug('{0} candidates passed walltime check {1}({2}*nEventsPerJob)'.format(len(scanSiteList),strMinWalltime,taskSpec.cpuTimeUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for network connectivity if not sitePreAssigned: ipConnectivity = taskSpec.getIpConnectivity() if ipConnectivity != None: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.wnconnectivity == 'full': pass elif tmpSiteSpec.wnconnectivity == 'http' and ipConnectivity == 'http': pass else: tmpMsg = ' skip site={0} due to insufficient connectivity (site={1}) for task={2} '.format(tmpSiteName, tmpSiteSpec.wnconnectivity, ipConnectivity) tmpMsg += 'criteria=-network' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed network check ({1})'.format(len(scanSiteList), ipConnectivity)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for event service if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # event service if taskSpec.useEventService(): if tmpSiteSpec.getJobSeed() == 'std': tmpMsg = ' skip site={0} since EventService is not allowed '.format(tmpSiteName) tmpMsg += 'criteria=-es' tmpLog.debug(tmpMsg) continue else: if tmpSiteSpec.getJobSeed() == 'es': tmpMsg = ' skip site={0} since only EventService is allowed '.format(tmpSiteName) tmpMsg += 'criteria=-nones' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = 
newScanSiteList tmpLog.debug('{0} candidates passed EventService check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for transferring newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # limit def_maxTransferring = 2000 if tmpSiteSpec.transferringlimit == 0: # use default value maxTransferring = def_maxTransferring else: maxTransferring = tmpSiteSpec.transferringlimit # check at the site nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName) nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName) if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']: tmpLog.debug(' skip site=%s due to too many transferring=%s greater than max(%s,2x%s) criteria=-transferring' % \ (tmpSiteName,nTraJobs,def_maxTransferring,nRunJobs)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for T1 weight t1Weight = taskSpec.getT1Weight() if t1Weight == 0: # use T1 weight in cloudconfig t1Weight = self.siteMapper.getCloud(cloudName)['weight'] if t1Weight < 0: newScanSiteList = [] for tmpSiteName in scanSiteList: if not tmpSiteName in t1Sites: tmpLog.debug(' skip site={0} due to negative T1 weight criteria=-t1weight'.format(tmpSiteName)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList t1Weight = 1 t1Weight = max(t1Weight,t1WeightForHighPrio) tmpLog.debug('T1 weight {0}'.format(t1Weight)) tmpLog.debug('{0} 
candidates passed T1 weight check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for nPilot nPilotMap = {} if not sitePreAssigned: nWNmap = self.taskBufferIF.getCurrentSiteData() newScanSiteList = [] for tmpSiteName in scanSiteList: # check at the site nPilot = 0 if nWNmap.has_key(tmpSiteName): nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob'] if nPilot == 0 and not 'test' in taskSpec.prodSourceLabel: tmpLog.debug(' skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName) continue newScanSiteList.append(tmpSiteName) nPilotMap[tmpSiteName] = nPilot scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # get available files normalizeFactors = {} availableFileMap = {} for datasetSpec in inputChunk.getDatasets(): try: # mapping between sites and storage endpoints siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper, ignoreCC=True) # disable file lookup for merge jobs or secondary datasets checkCompleteness = True useCompleteOnly = False if inputChunk.isMerging: checkCompleteness = False if not datasetSpec.isMaster(): useCompleteOnly = True # get available files per site/endpoint tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec, siteStorageEP, self.siteMapper, ngGroup=[1], checkCompleteness=checkCompleteness, storageToken=datasetSpec.storageToken, useCompleteOnly=useCompleteOnly) if tmpAvFileMap == None: raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed' availableFileMap[datasetSpec.datasetName] = tmpAvFileMap except: errtype,errvalue = 
sys.exc_info()[:2] tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # loop over all sites to get the size of available files for tmpSiteName in scanSiteList: if not normalizeFactors.has_key(tmpSiteName): normalizeFactors[tmpSiteName] = 0 # get the total size of available files if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName): availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName] for tmpFileSpec in \ availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']: normalizeFactors[tmpSiteName] += tmpFileSpec.fsize # get max total size tmpTotalSizes = normalizeFactors.values() tmpTotalSizes.sort() if tmpTotalSizes != []: totalSize = tmpTotalSizes.pop() else: totalSize = 0 ###################################### # calculate weight tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo, taskSpec.prodSourceLabel) if not tmpSt: tmpLog.error('failed to get job statistics with priority') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError tmpLog.debug('calculate weight and check cap for {0} candidates'.format(len(scanSiteList))) weightMapPrimary = {} weightMapSecondary = {} newScanSiteList = [] for tmpSiteName in scanSiteList: nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',None,taskSpec.workQueue_ID) nDefined = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'definied',None,taskSpec.workQueue_ID) + self.getLiveCount(tmpSiteName) nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',None,taskSpec.workQueue_ID) nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,taskSpec.workQueue_ID) + \ AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,taskSpec.workQueue_ID) nStarting = 
AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting',None,taskSpec.workQueue_ID) if tmpSiteName in nPilotMap: nPilot = nPilotMap[tmpSiteName] else: nPilot = 0 manyAssigned = float(nAssigned + 1) / float(nActivated + 1) manyAssigned = min(2.0,manyAssigned) manyAssigned = max(1.0,manyAssigned) weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + nDefined + 1) / manyAssigned weightStr = 'nRun={0} nAct={1} nAss={2} nStart={3} nDef={4} totalSize={5} manyAss={6} nPilot={7} '.format(nRunning,nActivated,nAssigned, nStarting,nDefined, totalSize,manyAssigned, nPilot) # normalize weights by taking data availability into account if totalSize != 0: weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize) weightStr += 'availableSize={0} '.format(normalizeFactors[tmpSiteName]) # T1 weight if tmpSiteName in t1Sites+sitesShareSeT1: weight *= t1Weight weightStr += 't1W={0} '.format(t1Weight) # make candidate siteCandidateSpec = SiteCandidate(tmpSiteName) # set weight and params siteCandidateSpec.weight = weight siteCandidateSpec.nRunningJobs = nRunning siteCandidateSpec.nQueuedJobs = nActivated + nAssigned + nStarting siteCandidateSpec.nAssignedJobs = nAssigned # set available files for tmpDatasetName,availableFiles in availableFileMap.iteritems(): if availableFiles.has_key(tmpSiteName): siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['localdisk'] siteCandidateSpec.localTapeFiles += availableFiles[tmpSiteName]['localtape'] siteCandidateSpec.cacheFiles += availableFiles[tmpSiteName]['cache'] siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote'] # check if site is locked for WORLD lockedByBrokerage = False if taskSpec.useWorldCloud(): lockedByBrokerage = self.checkSiteLock(taskSpec.vo,taskSpec.prodSourceLabel, tmpSiteName,taskSpec.workQueue_ID) # check cap with nRunning cutOffValue = 20 cutOffFactor = 2 nRunningCap = max(cutOffValue,cutOffFactor*nRunning) nRunningCap = max(nRunningCap,nPilot) 
okMsg = ' use site={0} with weight={1} {2} criteria=+use'.format(tmpSiteName,weight,weightStr) okAsPrimay = False if lockedByBrokerage: ngMsg = ' skip site={0} due to locked by another brokerage '.format(tmpSiteName) ngMsg += 'criteria=-lock' elif (nDefined+nActivated+nAssigned+nStarting) > nRunningCap: ngMsg = ' skip site={0} due to nDefined+nActivated+nAssigned+nStarting={1} '.format(tmpSiteName, nDefined+nActivated+nAssigned+nStarting) ngMsg += 'greater than max({0},{1}*nRunning={1}*{2},nPilot={3}) '.format(cutOffValue, cutOffFactor, nRunning, nPilot) ngMsg += 'criteria=-cap' else: ngMsg = ' skip site={0} due to low weight '.format(tmpSiteName) ngMsg += 'criteria=-loweigh' okAsPrimay = True # use primay if cap/lock check is passed if okAsPrimay: weightMap = weightMapPrimary else: weightMap = weightMapSecondary # add weight if not weight in weightMap: weightMap[weight] = [] weightMap[weight].append((siteCandidateSpec,okMsg,ngMsg)) # use second candidates if no primary candidates passed cap/lock check if weightMapPrimary == {}: tmpLog.debug('use second candidates since no sites pass cap/lock check') weightMap = weightMapSecondary # use hightest 3 weights weightRank = 3 else: weightMap = weightMapPrimary # use all weights weightRank = None # dump NG message for tmpWeight in weightMapSecondary.keys(): for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMapSecondary[tmpWeight]: tmpLog.debug(tmpNgMsg) # max candidates for WORLD if taskSpec.useWorldCloud(): maxSiteCandidates = 10 else: maxSiteCandidates = None newScanSiteList = [] weightList = weightMap.keys() weightList.sort() weightList.reverse() for weightIdx,tmpWeight in enumerate(weightList): for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMap[tmpWeight]: if (weightRank == None or weightIdx < weightRank) and \ (maxSiteCandidates == None or len(newScanSiteList) < maxSiteCandidates): # use site tmpLog.debug(tmpOkMsg) newScanSiteList.append(siteCandidateSpec.siteName) inputChunk.addSiteCandidate(siteCandidateSpec) 
else: # dump NG message tmpLog.debug(tmpNgMsg) scanSiteList = newScanSiteList # final check if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # lock sites for WORLD if taskSpec.useWorldCloud(): for tmpSiteName in scanSiteList: self.lockSite(taskSpec.vo,taskSpec.prodSourceLabel,tmpSiteName,taskSpec.workQueue_ID) tmpLog.debug('final {0} candidates'.format(len(scanSiteList))) # return self.sendLogMessage(tmpLog) tmpLog.debug('done') return self.SC_SUCCEEDED,inputChunk
def __init__(self,ddmIF,taskBufferIF):
    """Set up the broker on top of JobBrokerBase.

    :param ddmIF: DDM interface, forwarded unchanged to JobBrokerBase
    :param taskBufferIF: task buffer interface, forwarded unchanged to JobBrokerBase
    """
    # base initialisation comes first
    # NOTE(review): self.siteMapper is presumably set up by JobBrokerBase.__init__ -- confirm
    JobBrokerBase.__init__(self,ddmIF,taskBufferIF)
    # per-dataset cache of data locations, starts empty
    self.dataSiteMap = dict()
    # hospital queues looked up once from the site mapper
    hospitalQueues = AtlasBrokerUtils.getHospitalQueues(self.siteMapper)
    self.hospitalQueueMap = hospitalQueues
def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
    """Choose candidate sites for a task and register them on the input chunk.

    The starting list is the pre-assigned site (from the task or from the
    input chunk) or, failing that, all sites of ``cloudName``.  The list is
    then narrowed by successive checks: site status, reprocessing validation,
    high priority, data availability, zero share, multi-core matching,
    release availability, memory, scratch disk, SE space, walltime,
    transferring jobs and pilot activity.  Each surviving site gets a weight
    computed from job statistics and from the size of locally available
    input files, and is added through ``inputChunk.addSiteCandidate``.

    Data locations are cached per dataset name in ``self.dataSiteMap``.

    :param taskSpec: task specification being brokered
    :param cloudName: name of the cloud whose sites are scanned
    :param inputChunk: input chunk providing the datasets; receives the site candidates
    :param taskParamMap: not referenced by this method
    :return: tuple ``(status, inputChunk)`` where status is ``self.SC_SUCCEEDED``,
             ``self.SC_FAILED`` when a lookup fails or no candidate is left, or
             ``self.SC_FATAL`` when the data-location lookup reports ``self.SC_FATAL``
    """
    # make logger
    tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    tmpLog.debug('start')
    # return for failure
    retFatal = self.SC_FATAL,inputChunk
    retTmpError = self.SC_FAILED,inputChunk

    def giveUp(errMsg='no candidates',retVal=retTmpError):
        # common failure path: log the error, pass the result of uploadLog
        # to the task as error diagnostics, and hand back the return tuple
        tmpLog.error(errMsg)
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retVal

    # get sites in the cloud
    if taskSpec.site not in ['',None]:
        scanSiteList = [taskSpec.site]
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
    elif inputChunk.getPreassignedSite() is not None:
        scanSiteList = [inputChunk.getPreassignedSite()]
        tmpLog.debug('site={0} is pre-assigned in masterDS'.format(inputChunk.getPreassignedSite()))
    else:
        scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
        tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
    # get job statistics
    tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel)
    if not tmpSt:
        return giveUp('failed to get job statistics')
    # T1
    t1Sites = [self.siteMapper.getCloud(cloudName)['source']]
    # hospital sites
    if cloudName in self.hospitalQueueMap:
        t1Sites += self.hospitalQueueMap[cloudName]
    # MP
    useMP = taskSpec.coreCount is not None and taskSpec.coreCount > 1
    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check site status
        if tmpSiteSpec.status == 'online':
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug(' skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp()
    ######################################
    # selection for reprocessing
    if taskSpec.processingType == 'reprocessing':
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check schedconfig.validatedreleases
            if tmpSiteSpec.validatedreleases == ['True']:
                newScanSiteList.append(tmpSiteName)
            else:
                tmpLog.debug(' skip %s due to validatedreleases != True' % tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList)))
        if not scanSiteList:
            return giveUp()
    ######################################
    # selection for high priorities
    if taskSpec.currentPriority >= 950 and not useMP:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            if tmpSiteName in t1Sites:
                newScanSiteList.append(tmpSiteName)
            else:
                tmpLog.debug(' skip %s due to high prio which needs to run at T1' % tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for high prio'.format(len(scanSiteList)))
        if not scanSiteList:
            return giveUp()
    ######################################
    # selection for data availability
    for datasetSpec in inputChunk.getDatasets():
        datasetName = datasetSpec.datasetName
        if datasetName not in self.dataSiteMap:
            # get the list of sites where data is available
            tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName))
            tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,
                                                             self.ddmIF,datasetName)
            if tmpSt == self.SC_FAILED:
                return giveUp('failed to get the list of sites where data is available, since %s' % tmpRet)
            if tmpSt == self.SC_FATAL:
                return giveUp('fatal error when getting the list of sites where data is available, since %s' % tmpRet,
                              retFatal)
            # append
            self.dataSiteMap[datasetName] = tmpRet
            tmpLog.debug('map of data availability : {0}'.format(str(tmpRet)))
        # check if T1 has the data
        cloudHasData = cloudName in self.dataSiteMap[datasetName]
        t1hasData = False
        if cloudHasData:
            cloudData = self.dataSiteMap[datasetName][cloudName]
            t1hasData = any(tmpSeVal['state'] == 'complete' for tmpSeVal in cloudData['t1'].values())
            # T1 has incomplete data while no data at T2
            if not t1hasData and cloudData['t2'] == []:
                # use incomplete data at T1 anyway
                t1hasData = True
        # data is missing at T1
        if not t1hasData:
            tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName))
            # make subscription to T1
            # FIXME
            # use T2 until data is complete at T1
            newScanSiteList = []
            for tmpSiteName in scanSiteList:
                if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']:
                    newScanSiteList.append(tmpSiteName)
                else:
                    tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                    if tmpSiteSpec.cloud != cloudName:
                        tmpLog.debug(' skip %s due to foreign T2' % tmpSiteName)
                    else:
                        tmpLog.debug(' skip %s due to missing data at T2' % tmpSiteName)
            scanSiteList = newScanSiteList
            tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName))
            if not scanSiteList:
                return giveUp()
    ######################################
    # selection for fairshare
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site
        if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,tmpLog):
            tmpLog.debug(' skip {0} due to zero share'.format(tmpSiteName))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp()
    ######################################
    # selection for I/O intensive tasks
    # FIXME
    ######################################
    # selection for MP
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site
        # explicit None test so that a site without coreCount is never compared with an int
        siteIsMP = tmpSiteSpec.coreCount is not None and tmpSiteSpec.coreCount > 1
        if (useMP and siteIsMP) or \
                (not useMP and tmpSiteSpec.coreCount in [0,1,None]):
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug(' skip %s due to core mismatch site:%s != task:%s' % \
                         (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
    if not scanSiteList:
        return giveUp()
    ######################################
    # selection for release
    if taskSpec.transHome is not None:
        if re.search(r'rel_\d+(\n|$)',taskSpec.transHome) is None:
            # only cache is checked for normal tasks
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                     caches=taskSpec.transHome,
                                                                     cmtConfig=taskSpec.architecture)
        else:
            # nightlies
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                     releases='nightlies',
                                                                     cmtConfig=taskSpec.architecture)
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # release check is disabled or release is available
            if tmpSiteSpec.releases == ['ANY'] or \
                    tmpSiteSpec.cloud in ['ND'] or \
                    tmpSiteName in ['CERN-RELEASE']:
                newScanSiteList.append(tmpSiteName)
            elif tmpSiteName in siteListWithSW:
                newScanSiteList.append(tmpSiteName)
            else:
                # release is unavailable
                tmpLog.debug(' skip %s due to missing rel/cache %s:%s' % \
                             (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList),
                                                                              taskSpec.transHome,
                                                                              taskSpec.architecture))
        if not scanSiteList:
            return giveUp()
    ######################################
    # selection for memory
    minRamCount = taskSpec.ramCount
    if minRamCount not in [0,None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site (a limit of 0 means the site has no limit)
            if tmpSiteSpec.maxmemory != 0 and minRamCount > tmpSiteSpec.maxmemory:
                tmpLog.debug(' skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.maxmemory,
                                                                                                      minRamCount))
                continue
            if tmpSiteSpec.minmemory != 0 and minRamCount < tmpSiteSpec.minmemory:
                tmpLog.debug(' skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                     tmpSiteSpec.minmemory,
                                                                                                     minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                         minRamCount,taskSpec.ramUnit))
        if not scanSiteList:
            return giveUp()
    ######################################
    # selection for scratch disk
    minDiskCount = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
    minDiskCount = minDiskCount / 1024 / 1024
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site
        if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir:
            tmpLog.debug(' skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                tmpSiteSpec.maxwdir,
                                                                                minDiskCount))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp()
    ######################################
    # selection for available space in SE
    # free space - inputs - outputs(250MB*nJobs) must be >= 200GB
    outSizePerJob = 0.250
    diskThreshold = 200
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # don't check for T1
        if tmpSiteName not in t1Sites:
            # check at the site
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # the number of jobs which will produce outputs
            nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \
                       AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                       AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running')
            # the size of input files which will be copied to the site
            movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName)
            if movingInputSize is None:
                return giveUp('failed to get the size of input file moving to {0}'.format(tmpSiteName))
            tmpSpaceSize = tmpSiteSpec.space - movingInputSize - nRemJobs * outSizePerJob
            if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
                tmpLog.debug(' skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteSpec.space,
                                                                                                   movingInputSize,outSizePerJob,
                                                                                                   nRemJobs,diskThreshold))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp()
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if minWalltime not in [0,None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(' skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.maxtime,
                                                                                                        minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(' skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.mintime,
                                                                                                       minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
        if not scanSiteList:
            return giveUp()
    ######################################
    # selection for transferring
    def_maxTransferring = 2000
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # limit
        if tmpSiteSpec.transferringlimit == 0:
            # use default value
            maxTransferring = def_maxTransferring
        else:
            maxTransferring = tmpSiteSpec.transferringlimit
        # check at the site
        nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName)
        nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName)
        if max(maxTransferring,2*nRunJobs) < nTraJobs and tmpSiteSpec.cloud not in ['ND']:
            # report the limit that was actually compared, not the default one
            tmpLog.debug(' skip %s due to too many transferring %s > max(%s,2x%s)' % \
                         (tmpSiteName,nTraJobs,maxTransferring,nRunJobs))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp()
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        nPilot = 0
        if tmpSiteName in nWNmap:
            nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
            tmpLog.debug(' skip %s due to no pilot' % tmpSiteName)
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp()
    ######################################
    # get available files
    totalSize = 0
    normalizeFactors = {}
    availableFileMap = {}
    for datasetSpec in inputChunk.getDatasets():
        try:
            # mapping between sites and storage endpoints
            siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper)
            # get available files per site/endpoint
            tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                        siteStorageEP,
                                                        self.siteMapper,
                                                        ngGroup=[1])
            if tmpAvFileMap is None:
                raise Interaction.JEDITemporaryError('ddmIF.getAvailableFiles failed')
            availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
        except Exception as errObj:
            # any lookup failure is treated as a temporary error;
            # SystemExit and KeyboardInterrupt are deliberately left alone
            return giveUp('failed to get available files with %s %s' % (type(errObj).__name__,errObj))
        # get total size
        totalSize += datasetSpec.getSize()
        # loop over all sites to get the size of available files
        for tmpSiteName in scanSiteList:
            normalizeFactors.setdefault(tmpSiteName,0)
            # get the total size of available files
            if tmpSiteName in availableFileMap[datasetSpec.datasetName]:
                availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName]
                for tmpFileSpec in \
                        availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']:
                    normalizeFactors[tmpSiteName] += tmpFileSpec.fsize
    ######################################
    # calculate weight
    tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                taskSpec.prodSourceLabel,
                                                                                taskSpec.currentPriority)
    if not tmpSt:
        return giveUp('failed to get job statistics with priority')
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    for tmpSiteName in scanSiteList:
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',cloudName,taskSpec.workQueue_ID)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',cloudName,taskSpec.workQueue_ID)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',cloudName,taskSpec.workQueue_ID)
        weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
        # normalize weights by taking data availability into account
        if totalSize != 0:
            weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize)
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        # set weight
        siteCandidateSpec.weight = weight
        # set available files
        for availableFiles in availableFileMap.values():
            if tmpSiteName in availableFiles:
                siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['localdisk']
                siteCandidateSpec.localTapeFiles += availableFiles[tmpSiteName]['localtape']
                siteCandidateSpec.cacheFiles += availableFiles[tmpSiteName]['cache']
                siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
        # append
        inputChunk.addSiteCandidate(siteCandidateSpec)
        tmpLog.debug(' use {0} with weight={1}'.format(tmpSiteName,weight))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED,inputChunk
def runImpl(self):
    """Assign a nucleus to each task taken from ``self.inputList``.

    Chunks of ``(taskSpec, inputChunk)`` pairs are read from
    ``self.inputList`` until an empty chunk is returned, at which point
    the method returns ``None``.  A task whose ``nucleus`` is already one
    of ``siteMapper.nuclei`` keeps it.  Otherwise the candidate nuclei are
    filtered step by step (status, transfer backlog, output/log endpoint
    and free space, ability to run jobs, input data locality), the
    survivors are weighted, and one is drawn at random in proportion to
    its weight.  The chosen nucleus is stored through
    ``taskBufferIF.setCloudToTasks_JEDI`` and the task RW is added to the
    shared ``self.prioRW`` table.

    Any exception raised while handling a chunk is logged together with
    the last task ID and the loop moves on to the next chunk.
    """
    # cutoff for free disk space; it is logged below as diskThreshold/1024 TB,
    # so the value is presumably expressed in GB -- TODO confirm the units
    diskThreshold = self.taskBufferIF.getConfigValue(self.msgType,
                                                     'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name),
                                                     'jedi', 'atlas')
    if diskThreshold is None:
        diskThreshold = 100 * 1024
    # dataset type to ignore file availability check
    datasetTypeToSkipCheck = ['log']
    # thresholds for data availability check
    thrInputSize = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
    if thrInputSize is None:
        thrInputSize = 1
    thrInputSize *= 1024*1024*1024
    thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas')
    if thrInputNum is None:
        thrInputNum = 100
    # the two fractions are configured in percent and converted to ratios
    thrInputSizeFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
    if thrInputSizeFrac is None:
        thrInputSizeFrac = 10
    thrInputSizeFrac = float(thrInputSizeFrac) / 100
    thrInputNumFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
    if thrInputNumFrac is None:
        thrInputNumFrac = 10
    thrInputNumFrac = float(thrInputNumFrac) / 100
    # RW below this value does not reduce the weight of a nucleus
    cutOffRW = 50
    # factor applied to the weight when part of the input is not on disk
    negWeightTape = 0.001
    # main
    lastJediTaskID = None
    siteMapper = self.taskBufferIF.getSiteMapper()
    while True:
        try:
            taskInputList = self.inputList.get(1)
            # no more datasets
            if len(taskInputList) == 0:
                self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                              self.numTasks))
                return
            # loop over all tasks
            for taskSpec,inputChunk in taskInputList:
                lastJediTaskID = taskSpec.jediTaskID
                # make logger
                tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                tmpLog.debug('start')
                tmpLog.info('thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'.format(thrInputSize,
                                                                                                                thrInputNum,
                                                                                                                thrInputSizeFrac,
                                                                                                                thrInputNumFrac))
                # RW
                taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                # get nuclei
                nucleusList = siteMapper.nuclei
                if taskSpec.nucleus in nucleusList:
                    # the task already has a known nucleus; no brokerage needed
                    candidateNucleus = taskSpec.nucleus
                else:
                    tmpLog.info('got {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # check status
                    newNucleusList = {}
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleusSpec.state not in ['ACTIVE']:
                            tmpLog.info(' skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                      tmpNucleusSpec.state))
                        else:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed status check'.format(len(nucleusList)))
                    if not nucleusList:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # check status of transfer backlog
                    t1Weight = taskSpec.getT1Weight()
                    if t1Weight < 0:
                        tmpLog.info('skip transfer backlog check due to negative T1Weight')
                    else:
                        newNucleusList = {}
                        backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei()
                        for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                            if tmpNucleus in backlogged_nuclei:
                                tmpLog.info(' skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'.
                                            format(tmpNucleus))
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        nucleusList = newNucleusList
                        tmpLog.info('{0} candidates passed transfer backlog check'.format(len(nucleusList)))
                        if not nucleusList:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # check endpoint
                    # per nucleus: the endpoint with the smallest free/total ratio seen so far
                    fractionFreeSpace = {}
                    newNucleusList = {}
                    tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                  ['output','log'])
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        toSkip = False
                        for tmpDatasetSpec in tmpDatasetSpecList:
                            # ignore distributed datasets
                            if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) is not None:
                                continue
                            # get endpoint with the pattern
                            tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                            if tmpEP is None:
                                tmpLog.info(' skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                  tmpDatasetSpec.storageToken))
                                toSkip = True
                                break
                            # check state (disabled in the original code, kept for reference)
                            # if not tmpEP['state'] in ['ACTIVE']:
                            #     tmpLog.info(' skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                            #                                                                                           tmpEP['ddm_endpoint_name'],
                            #                                                                                           tmpEP['state']))
                            #     toSkip = True
                            #     break
                            # check space
                            tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                            tmpSpaceToUse = 0
                            if tmpNucleus in self.fullRW:
                                # 0.25GB per cpuTime/corePower/day
                                tmpSpaceToUse = long(self.fullRW[tmpNucleus]/10/24/3600*0.25)
                            if tmpSpaceSize-tmpSpaceToUse < diskThreshold:
                                tmpLog.info(' skip nucleus={0} since disk shortage (free {1} - reserved {2} < thr {3}) at endpoint {4} criteria=-space'.format(tmpNucleus,
                                                                                                                                                               tmpSpaceSize,
                                                                                                                                                               tmpSpaceToUse,
                                                                                                                                                               diskThreshold,
                                                                                                                                                               tmpEP['ddm_endpoint_name']))
                                toSkip = True
                                break
                            # keep fraction of free space
                            if tmpNucleus not in fractionFreeSpace:
                                fractionFreeSpace[tmpNucleus] = {'total':0,'free':0}
                            # a ratio that cannot be computed (e.g. total is 0) is treated as unknown
                            try:
                                tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                    float(fractionFreeSpace[tmpNucleus]['total'])
                            except Exception:
                                tmpOld = None
                            try:
                                tmpNew = float(tmpSpaceSize-tmpSpaceToUse)/float(tmpEP['space_total'])
                            except Exception:
                                tmpNew = None
                            if tmpNew is not None and (tmpOld is None or tmpNew < tmpOld):
                                fractionFreeSpace[tmpNucleus] = {'total':tmpEP['space_total'],
                                                                 'free':tmpSpaceSize-tmpSpaceToUse}
                        if not toSkip:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed endpoint check {1} TB'.format(len(nucleusList),diskThreshold/1024))
                    if not nucleusList:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # ability to execute jobs
                    newNucleusList = {}
                    # get all panda sites
                    tmpSiteList = []
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        tmpSiteList += tmpNucleusSpec.allPandaSites
                    tmpSiteList = list(set(tmpSiteList))
                    tmpLog.debug('===== start for job check')
                    jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                    tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                         tmpSiteList,tmpLog)
                    tmpLog.debug('===== done for job check')
                    if tmpSt != Interaction.SC_SUCCEEDED:
                        tmpLog.error('no sites can run jobs')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    # keep only nuclei that own at least one site returned by the job broker
                    okNuclei = set()
                    for tmpSite in tmpRet:
                        siteSpec = siteMapper.getSite(tmpSite)
                        okNuclei.add(siteSpec.pandasite)
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleus in okNuclei:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                        else:
                            tmpLog.info(' skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed job check'.format(len(nucleusList)))
                    if not nucleusList:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # data locality
                    toSkip = False
                    availableData = {}
                    for datasetSpec in inputChunk.getDatasets():
                        # only for real datasets
                        if datasetSpec.isPseudo():
                            continue
                        # ignore DBR
                        if DataServiceUtils.isDBR(datasetSpec.datasetName):
                            continue
                        # skip locality check
                        if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                            continue
                        # use deep scan for primary dataset
                        deepScan = bool(datasetSpec.isMaster())
                        # get nuclei where data is available
                        tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                          datasetSpec.datasetName,
                                                                          nucleusList.keys(),
                                                                          deepScan)
                        if tmpSt != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            toSkip = True
                            break
                        # sum
                        for tmpNucleus,tmpVals in tmpRet.iteritems():
                            if tmpNucleus not in availableData:
                                availableData[tmpNucleus] = tmpVals
                            else:
                                availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                    if toSkip:
                        continue
                    if availableData:
                        newNucleusList = {}
                        # skip if no data
                        skipMsgList = []
                        for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                            if len(nucleusList) == 1:
                                tmpLog.info(' disable data locality check for nucleus={0} since no other candidate'.format(tmpNucleus))
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                            elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                    availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                tmpMsg = ' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                   availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                   availableData[tmpNucleus]['tot_size'],
                                                                                                                                   thrInputSizeFrac)
                                skipMsgList.append(tmpMsg)
                            elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                    availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                tmpMsg = ' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                     availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                     availableData[tmpNucleus]['tot_num'],
                                                                                                                                     thrInputNumFrac)
                                skipMsgList.append(tmpMsg)
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        if len(newNucleusList) > 0:
                            nucleusList = newNucleusList
                            for tmpMsg in skipMsgList:
                                tmpLog.info(tmpMsg)
                        else:
                            # every nucleus would be rejected, so the previous list is kept
                            tmpLog.info(' disable data locality check since no nucleus has input data')
                        tmpLog.info('{0} candidates passed data check'.format(len(nucleusList)))
                        if not nucleusList:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # weight
                    # release the lock even when the lookup fails; otherwise the exception
                    # is swallowed by the outer handler while the lock stays held
                    self.prioRW.acquire()
                    try:
                        nucleusRW = self.prioRW[taskSpec.currentPriority]
                    finally:
                        self.prioRW.release()
                    totalWeight = 0
                    nucleusweights = []
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleus not in nucleusRW:
                            nucleusRW[tmpNucleus] = 0
                        # wStr records how the weight was derived, for the log only
                        wStr = '1'
                        # with RW
                        if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                            weight = 1 / float(nucleusRW[tmpNucleus])
                            wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus])
                        else:
                            weight = 1
                            wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                        # with data
                        if availableData:
                            if availableData[tmpNucleus]['tot_size'] > 0:
                                weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                weight /= float(availableData[tmpNucleus]['tot_size'])
                                wStr += '* ( available_input_size_DISKTAPE={0} )'.format(availableData[tmpNucleus]['ava_size_any'])
                                wStr += '/ ( total_input_size={0} )'.format(availableData[tmpNucleus]['tot_size'])
                                # negative weight for tape
                                if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                    weight *= negWeightTape
                                    wStr += '*( weight_TAPE={0} )'.format(negWeightTape)
                            # fraction of free space
                            if tmpNucleus in fractionFreeSpace:
                                try:
                                    tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                        float(fractionFreeSpace[tmpNucleus]['total'])
                                    weight *= tmpFrac
                                    wStr += '*( free_space={0} )/( total_space={1} )'.format(fractionFreeSpace[tmpNucleus]['free'],
                                                                                             fractionFreeSpace[tmpNucleus]['total'])
                                except Exception:
                                    # best effort: leave the weight unchanged when the
                                    # ratio cannot be computed (e.g. total is 0)
                                    pass
                        tmpLog.info(' use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                        totalWeight += weight
                        nucleusweights.append((tmpNucleus,weight))
                    tmpLog.info('final {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # final selection
                    # draw a point in [0, totalWeight] and pick the nucleus whose
                    # cumulative weight first reaches it
                    tgtWeight = random.uniform(0,totalWeight)
                    candidateNucleus = None
                    for tmpNucleus,weight in nucleusweights:
                        tgtWeight -= weight
                        if tgtWeight <= 0:
                            candidateNucleus = tmpNucleus
                            break
                    if candidateNucleus is None:
                        # fall back to the last candidate when nothing was picked above
                        candidateNucleus = nucleusweights[-1][0]
                ######################################
                # update
                nucleusSpec = nucleusList[candidateNucleus]
                # get output/log datasets
                tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                           ['output','log'])
                # get destinations
                retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                tmpLog.info(' set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                self.sendLogMessage(tmpLog)
                if tmpRet:
                    tmpMsg = 'set task.status=ready'
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg,self.msgType)
                # update RW table
                # add the task RW to every priority level not above the task priority;
                # the lock is released in finally so a failure here cannot leave it held
                self.prioRW.acquire()
                try:
                    for prio,rwMap in self.prioRW.iteritems():
                        if prio > taskSpec.currentPriority:
                            continue
                        if candidateNucleus in rwMap:
                            rwMap[candidateNucleus] += taskRW
                        else:
                            rwMap[candidateNucleus] = taskRW
                finally:
                    self.prioRW.release()
        except:
            # boundary of the worker loop: log the failure with the last task ID
            # and keep processing the remaining chunks
            errtype,errvalue = sys.exc_info()[:2]
            errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
            errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
            errMsg += traceback.format_exc()
            logger.error(errMsg)