def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
    """Choose candidate analysis sites for a task and attach them to inputChunk.

    Starting from every site of type 'analysis' known to self.siteMapper
    (plus the pre-assigned site, if any), the candidate list is narrowed by a
    sequence of filters: site status, core count, software release, memory,
    scratch disk, storage-element free space, walltime, pilot activity,
    include/exclude lists, access control, cloud and input-data availability.
    The survivors are weighted by job statistics and data locality, and every
    site that can serve the input files is registered through
    inputChunk.addSiteCandidate().

    Parameters:
        taskSpec: task specification; read for site, coreCount, transHome,
            transUses, architecture, walltime, cloud, prodSourceLabel, etc.
            Its error diagnostics are set via setErrDiag() on failure.
        cloudName: not referenced in this method.
        inputChunk: input chunk; queried for datasets and size limits and
            updated with the selected site candidates.
        taskParamMap: task parameter mapping; 'excludedSite' and
            'includedSite' are read when taskSpec.useLimitedSites() is true.

    Returns:
        A (status, inputChunk) tuple where status is self.SC_SUCCEEDED,
        self.SC_FAILED (temporary error) or self.SC_FATAL.

    Side effects visible in this method: self.dataSiteMap is filled with the
    replica lookup result per dataset name, and datasetSpec.setDistributed()
    is called for datasets found to be distributed.
    """
    # make logger
    tmpLog = MsgWrapper(logger,
                        '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                        monToken='<jediTaskID={0} {1}>'.format(
                            taskSpec.jediTaskID,
                            datetime.datetime.utcnow().isoformat('/')))
    tmpLog.debug('start')
    # return for failure
    retFatal = self.SC_FATAL, inputChunk
    retTmpError = self.SC_FAILED, inputChunk

    def abort(retVal, errMsg='no candidates'):
        # common failure path: log the error, upload the log as the task
        # error diagnostics, send info to logger and hand back the result
        tmpLog.error(errMsg)
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        self.sendLogMessage(tmpLog)
        return retVal

    def toList(value):
        # str to list for task retry; values without split() (e.g. None)
        # are deliberately left untouched
        if isinstance(value, list):
            return value
        try:
            return value.split(',')
        except AttributeError:
            return value

    # get primary site candidates
    sitePreAssigned = False
    excludeList = []
    includeList = None
    scanSiteList = []
    # get list of site access
    siteAccessList = self.taskBufferIF.listSiteAccess(None, taskSpec.userName)
    siteAccessMap = {}
    for tmpSiteName, tmpAccess in siteAccessList:
        siteAccessMap[tmpSiteName] = tmpAccess
    # site limitation
    if taskSpec.useLimitedSites():
        if 'excludedSite' in taskParamMap:
            excludeList = toList(taskParamMap['excludedSite'])
        if 'includedSite' in taskParamMap:
            includeList = taskParamMap['includedSite']
            # an empty string means no inclusion constraint
            if includeList == '':
                includeList = None
            includeList = toList(includeList)
    # loop over all sites
    for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.items():
        if tmpSiteSpec.type == 'analysis':
            scanSiteList.append(siteName)
    # preassigned
    if taskSpec.site not in ['', None]:
        # site is pre-assigned
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
        sitePreAssigned = True
        if taskSpec.site not in scanSiteList:
            scanSiteList.append(taskSpec.site)
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
    # allowed remote access protocol
    allowedRemoteProtocol = 'fax'
    # MP
    if taskSpec.coreCount is not None and taskSpec.coreCount > 1:
        # use MCORE only
        useMP = 'only'
    elif taskSpec.coreCount == 0:
        # use MCORE and normal
        useMP = 'any'
    else:
        # not use MCORE
        useMP = 'unuse'
    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check site status
        skipFlag = False
        if tmpSiteSpec.status in ['offline']:
            skipFlag = True
        elif tmpSiteSpec.status in ['brokeroff', 'test']:
            # brokeroff/test sites are usable only when explicitly pre-assigned
            if not sitePreAssigned:
                skipFlag = True
            elif tmpSiteName != taskSpec.site:
                skipFlag = True
        if not skipFlag:
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug(' skip site=%s due to status=%s criteria=-status' %
                         (tmpSiteName, tmpSiteSpec.status))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return abort(retTmpError)
    ######################################
    # selection for MP
    if not sitePreAssigned:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            if useMP == 'any' or \
                    (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                    (useMP == 'unuse' and tmpSiteSpec.coreCount in [0, 1, None]):
                newScanSiteList.append(tmpSiteName)
            else:
                tmpLog.debug(' skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' %
                             (tmpSiteName, tmpSiteSpec.coreCount, taskSpec.coreCount))
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for useMP={1}'.format(
            len(scanSiteList), useMP))
        if not scanSiteList:
            return abort(retTmpError)
    ######################################
    # selection for release
    if taskSpec.transHome is not None:
        if taskSpec.transHome.startswith('ROOT'):
            # hack until x86_64-slc6-gcc47-opt is published in installedsw
            if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
            else:
                tmpCmtConfig = taskSpec.architecture
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                scanSiteList, cmtConfig=tmpCmtConfig, onlyCmtConfig=True)
        elif 'AthAnalysis' in taskSpec.transHome or \
                re.search('Ath[a-zA-Z]+Base', taskSpec.transHome) is not None:
            # AthAnalysis
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                scanSiteList, cmtConfig=taskSpec.architecture,
                onlyCmtConfig=True)
        else:
            # remove AnalysisTransforms-
            transHome = re.sub('^[^-]+-*', '', taskSpec.transHome)
            transHome = re.sub('_', '-', transHome)
            if re.search(r'rel_\d+(\n|$)', taskSpec.transHome) is None and \
                    taskSpec.transHome != 'AnalysisTransforms' and \
                    re.search(r'\d{4}-\d{2}-\d{2}T\d{4}$', taskSpec.transHome) is None:
                # cache is checked
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                    scanSiteList, caches=transHome,
                    cmtConfig=taskSpec.architecture)
            elif transHome == '' and taskSpec.transUses is not None:
                # remove Atlas-
                transUses = taskSpec.transUses.split('-')[-1]
                # release is checked
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                    scanSiteList, releases=transUses,
                    cmtConfig=taskSpec.architecture)
            else:
                # nightlies
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                    scanSiteList, releases='CVMFS')
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # release check is disabled or release is available
            if tmpSiteSpec.releases == ['ANY']:
                newScanSiteList.append(tmpSiteName)
            elif tmpSiteName in siteListWithSW:
                newScanSiteList.append(tmpSiteName)
            else:
                # release is unavailable
                tmpLog.debug(' skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' %
                             (tmpSiteName, taskSpec.transUses,
                              taskSpec.transHome, taskSpec.architecture))
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format(
            len(scanSiteList), taskSpec.transUses, taskSpec.transHome,
            taskSpec.architecture))
        if not scanSiteList:
            return abort(retTmpError)
    ######################################
    # selection for memory
    minRamCount = inputChunk.getMaxRamCount()
    minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
    if minRamCount not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # site max memory requirement (maxrss wins over maxmemory when set)
            if tmpSiteSpec.maxrss not in [0, None]:
                site_maxmemory = tmpSiteSpec.maxrss
            else:
                site_maxmemory = tmpSiteSpec.maxmemory
            if site_maxmemory not in [0, None] and minRamCount != 0 and \
                    minRamCount > site_maxmemory:
                tmpLog.debug(
                    ' skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'
                    .format(tmpSiteName, site_maxmemory, minRamCount))
                continue
            # site min memory requirement (minrss wins over minmemory when set)
            if tmpSiteSpec.minrss not in [0, None]:
                site_minmemory = tmpSiteSpec.minrss
            else:
                site_minmemory = tmpSiteSpec.minmemory
            if site_minmemory not in [0, None] and minRamCount != 0 and \
                    minRamCount < site_minmemory:
                tmpLog.debug(
                    ' skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'
                    .format(tmpSiteName, site_minmemory, minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
            len(scanSiteList), minRamCount, taskSpec.ramUnit))
        if not scanSiteList:
            return abort(retTmpError)
    ######################################
    # selection for scratch disk
    tmpMaxAtomSize = inputChunk.getMaxAtomSize()
    tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
    tmpOutDiskSize = taskSpec.getOutDiskSize()
    tmpWorkDiskSize = taskSpec.getWorkDiskSize()
    minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
    minDiskCountS = minDiskCountS / 1024 / 1024
    # size for direct IO sites
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        # input is not copied to scratch, so tmpMaxAtomSize is left out
        minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize
        minDiskCountR = minDiskCountR / 1024 / 1024
    tmpLog.debug(
        'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'
        .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize,
                tmpWorkDiskSize))
    tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format(
        minDiskCountS, minDiskCountR))
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site; maxwdir == 0 disables the check
        if tmpSiteSpec.maxwdir != 0:
            if tmpSiteSpec.isDirectIO():
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug(
                    ' skip site={0} due to small scratch disk={1} < {2} criteria=-disk'
                    .format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return abort(retTmpError)
    ######################################
    # selection for available space in SE
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check endpoint
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint(tmpSiteSpec.ddm)
        if tmpEndPoint is not None:
            # free space must be >= 200GB
            diskThreshold = 200
            tmpSpaceSize = 0
            if tmpEndPoint['space_expired'] is not None:
                tmpSpaceSize += tmpEndPoint['space_expired']
            if tmpEndPoint['space_free'] is not None:
                tmpSpaceSize += tmpEndPoint['space_free']
            if tmpSpaceSize < diskThreshold:
                tmpLog.debug(
                    ' skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'
                    .format(tmpSiteName, tmpSpaceSize, diskThreshold))
                continue
            # check if blacklisted
            if tmpEndPoint['blacklisted'] == 'Y':
                tmpLog.debug(
                    ' skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'
                    .format(tmpSiteName, tmpSiteSpec.ddm))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return abort(retTmpError)
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if minWalltime not in [0, None] and minWalltime > 0:
        minWalltime *= tmpEffAtomSize
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site; a limit of 0 disables the respective check
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(
                    ' skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'
                    .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(
                    ' skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'
                    .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
            len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
        if not scanSiteList:
            return abort(retTmpError)
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        nPilot = 0
        if tmpSiteName in nWNmap:
            nPilot = nWNmap[tmpSiteName]['getJob'] + \
                nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
            tmpLog.debug(' skip site=%s due to no pilot criteria=-nopilot' %
                         tmpSiteName)
            # in test mode the site is logged as skipped but still kept
            if not self.testMode:
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return abort(retTmpError)
    ######################################
    # check inclusion and exclusion
    newScanSiteList = []
    sitesForANY = []
    for tmpSiteName in scanSiteList:
        autoSite = False
        # check exclusion
        if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList):
            tmpLog.debug(' skip site={0} excluded criteria=-excluded'.format(
                tmpSiteName))
            continue
        # check inclusion
        if includeList is not None and \
                not AtlasBrokerUtils.isMatched(tmpSiteName, includeList):
            if 'AUTO' in includeList:
                # unlisted site is kept as a fallback candidate
                autoSite = True
            else:
                tmpLog.debug(
                    ' skip site={0} not included criteria=-notincluded'.format(
                        tmpSiteName))
                continue
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # limited access
        if tmpSiteSpec.accesscontrol == 'grouplist':
            if siteAccessMap.get(tmpSiteSpec.sitename) != 'approved':
                tmpLog.debug(
                    ' skip site={0} limited access criteria=-limitedaccess'
                    .format(tmpSiteName))
                continue
        # check cloud
        if taskSpec.cloud not in [None, '', 'any', tmpSiteSpec.cloud]:
            tmpLog.debug(
                ' skip site={0} cloud mismatch criteria=-cloudmismatch'.format(
                    tmpSiteName))
            continue
        if autoSite:
            sitesForANY.append(tmpSiteName)
        else:
            newScanSiteList.append(tmpSiteName)
    # use AUTO sites if no sites are included
    if newScanSiteList == []:
        newScanSiteList = sitesForANY
    else:
        for tmpSiteName in sitesForANY:
            tmpLog.debug(
                ' skip site={0} not included criteria=-notincluded'.format(
                    tmpSiteName))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(
        len(scanSiteList)))
    if not scanSiteList:
        return abort(retTmpError)
    ######################################
    # selection for data availability
    hasDDS = False
    dataWeight = {}
    remoteSourceList = {}
    if inputChunk.getDatasets() != []:
        oldScanSiteList = copy.copy(scanSiteList)
        for datasetSpec in inputChunk.getDatasets():
            datasetName = datasetSpec.datasetName
            if datasetName not in self.dataSiteMap:
                # get the list of sites where data is available
                tmpLog.debug(
                    'getting the list of sites where {0} is available'.format(
                        datasetName))
                tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData(
                    scanSiteList, self.siteMapper, self.ddmIF, datasetName)
                if tmpSt in [Interaction.JEDITemporaryError,
                             Interaction.JEDITimeoutError]:
                    return abort(
                        retTmpError,
                        'temporary failed to get the list of sites where data is available, since %s'
                        % tmpRet)
                if tmpSt == Interaction.JEDIFatalError:
                    return abort(
                        retFatal,
                        'fatal error when getting the list of sites where data is available, since %s'
                        % tmpRet)
                # append
                self.dataSiteMap[datasetName] = tmpRet
                if datasetName.startswith('ddo'):
                    tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                else:
                    tmpLog.debug(' {0} sites : {1}'.format(
                        len(tmpRet), str(tmpRet)))
                    # NOTE(review): the source was flattened; this check is
                    # assumed to apply to non-'ddo' datasets only - confirm
                    # check if distributed
                    if tmpRet != {}:
                        # distributed unless some replica is complete
                        isDistributed = True
                        for tmpMap in tmpRet.values():
                            for tmpVal in tmpMap.values():
                                if tmpVal['state'] == 'complete':
                                    isDistributed = False
                                    break
                            if not isDistributed:
                                break
                        if isDistributed:
                            # check if really distributed
                            isDistributed = self.ddmIF.isDistributedDataset(
                                datasetName)
                            if isDistributed:
                                hasDDS = True
                                datasetSpec.setDistributed()
                                tmpLog.debug(' {0} is distributed'.format(
                                    datasetName))
            # check if the data is available at somewhere
            if self.dataSiteMap[datasetName] == {}:
                return abort(
                    retFatal,
                    '{0} is unavailable at any site'.format(datasetName))
        # get the list of sites where data is available
        scanSiteList = None
        scanSiteListOnDisk = None
        # NOTE(review): iterates every entry of self.dataSiteMap, not only the
        # datasets of this inputChunk - confirm that is intended
        for datasetName, tmpDataSite in self.dataSiteMap.items():
            # get sites where replica is available
            tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                tmpDataSite, includeTape=True)
            tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                tmpDataSite, includeTape=False)
            # get sites which can remotely access source sites;
            # remote access is disabled for merging and when the
            # pre-assigned site already holds the data
            if not inputChunk.isMerging and \
                    (not sitePreAssigned or taskSpec.site not in tmpSiteList):
                tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(
                    tmpDiskSiteList, self.taskBufferIF, self.siteMapper,
                    nSites=50, protocol=allowedRemoteProtocol)
            else:
                tmpSatelliteSites = {}
            tmpSatelliteNames = list(tmpSatelliteSites.keys())
            # make weight map for local
            for tmpSiteName in tmpSiteList:
                # give more weight to disk
                if tmpSiteName in tmpDiskSiteList:
                    tmpLocalWeight = 1
                else:
                    tmpLocalWeight = 0.001
                dataWeight[tmpSiteName] = \
                    dataWeight.get(tmpSiteName, 0) + tmpLocalWeight
            # make weight map for remote
            for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.items():
                # skip since local data is available
                if tmpSiteName in tmpSiteList:
                    continue
                tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                # negative weight for remote access
                wRemote = 50.0
                if tmpSiteSpec.wansinklimit not in [0, None]:
                    wRemote /= float(tmpSiteSpec.wansinklimit)
                # sum weight
                dataWeight[tmpSiteName] = dataWeight.get(tmpSiteName, 0) + \
                    float(tmpWeightSrcMap['weight']) / wRemote
                # make remote source list
                remoteSourceList.setdefault(tmpSiteName, {})[datasetName] = \
                    tmpWeightSrcMap['source']
            # first list
            if scanSiteList is None:
                scanSiteList = []
                for tmpSiteName in tmpSiteList + tmpSatelliteNames:
                    if tmpSiteName not in oldScanSiteList:
                        continue
                    if tmpSiteName not in scanSiteList:
                        scanSiteList.append(tmpSiteName)
                scanSiteListOnDisk = set()
                for tmpSiteName in tmpDiskSiteList + tmpSatelliteNames:
                    if tmpSiteName not in oldScanSiteList:
                        continue
                    scanSiteListOnDisk.add(tmpSiteName)
                continue
            # pickup sites which have all data
            newScanList = []
            for tmpSiteName in tmpSiteList + tmpSatelliteNames:
                if tmpSiteName in scanSiteList and \
                        tmpSiteName not in newScanList:
                    newScanList.append(tmpSiteName)
            scanSiteList = newScanList
            tmpLog.debug('{0} is available at {1} sites'.format(
                datasetName, len(scanSiteList)))
            # pickup sites which have all data on DISK
            newScanListOnDisk = set()
            for tmpSiteName in tmpDiskSiteList + tmpSatelliteNames:
                if tmpSiteName in scanSiteListOnDisk:
                    newScanListOnDisk.add(tmpSiteName)
            scanSiteListOnDisk = newScanListOnDisk
            tmpLog.debug('{0} is available at {1} sites on DISK'.format(
                datasetName, len(scanSiteListOnDisk)))
        # check for preassigned
        if sitePreAssigned and taskSpec.site not in scanSiteList:
            scanSiteList = []
            tmpLog.debug(
                'data is unavailable locally or remotely at preassigned site {0}'
                .format(taskSpec.site))
        elif len(scanSiteListOnDisk) > 0:
            # use only disk sites
            scanSiteList = list(scanSiteListOnDisk)
        tmpLog.debug('{0} candidates have input data'.format(
            len(scanSiteList)))
        if not scanSiteList:
            return abort(retFatal)
    ######################################
    # sites already used by task
    tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
        taskSpec.jediTaskID)
    if not tmpSt:
        return abort(retTmpError,
                     'failed to get sites which already used by task')
    ######################################
    # calculate weight
    # fqans is only referenced by the disabled priority calculation below;
    # the call is kept in case makeFQANs() has side effects
    fqans = taskSpec.makeFQANs()
    # disabled:
    # tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
    #                                                                                           taskSpec.workingGroup,True)
    # currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
    # currentPriority -= 500
    # tmpLog.debug('currentPriority={0}'.format(currentPriority))
    tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
        taskSpec.vo, taskSpec.prodSourceLabel)
    if not tmpSt:
        return abort(retTmpError,
                     'failed to get job statistics with priority')
    # check for preassigned
    if sitePreAssigned and taskSpec.site not in scanSiteList:
        tmpLog.debug("preassigned site {0} did not pass all tests".format(
            taskSpec.site))
        return abort(retFatal)
    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    weightMap = {}
    candidateSpecList = []
    timeWindowForFC = 6
    preSiteCandidateSpec = None
    failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(
        taskSpec.jediTaskID, timeWindowForFC)
    problematicSites = set()
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(
            jobStatPrioMap, tmpSiteName, 'running', None, None)
        nAssigned = AtlasBrokerUtils.getNumJobs(
            jobStatPrioMap, tmpSiteName, 'defined', None, None)
        nActivated = AtlasBrokerUtils.getNumJobs(
            jobStatPrioMap, tmpSiteName, 'activated', None, None) + \
            AtlasBrokerUtils.getNumJobs(
                jobStatPrioMap, tmpSiteName, 'throttled', None, None)
        nStarting = AtlasBrokerUtils.getNumJobs(
            jobStatPrioMap, tmpSiteName, 'starting', None, None)
        nFailed = 0
        nClosed = 0
        nFinished = 0
        if tmpSiteName in failureCounts:
            if 'failed' in failureCounts[tmpSiteName]:
                nFailed = failureCounts[tmpSiteName]['failed']
            if 'closed' in failureCounts[tmpSiteName]:
                nClosed = failureCounts[tmpSiteName]['closed']
            if 'finished' in failureCounts[tmpSiteName]:
                nFinished = failureCounts[tmpSiteName]['finished']
        # problematic sites
        if nFailed + nClosed > 2 * nFinished:
            problematicSites.add(tmpSiteName)
        # calculate weight
        weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + 1)
        nThrottled = 0
        if tmpSiteName in remoteSourceList:
            # sites reading remotely are further penalized by throttled jobs
            nThrottled = AtlasBrokerUtils.getNumJobs(
                jobStatPrioMap, tmpSiteName, 'throttled', None, None)
            weight /= float(nThrottled + 1)
        # normalize weights by taking data availability into account
        tmpDataWeight = 1
        if tmpSiteName in dataWeight:
            weight = weight * dataWeight[tmpSiteName]
            tmpDataWeight = dataWeight[tmpSiteName]
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        # preassigned
        if sitePreAssigned and tmpSiteName == taskSpec.site:
            preSiteCandidateSpec = siteCandidateSpec
        # set weight
        siteCandidateSpec.weight = weight
        tmpStr = ' site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(
            tmpSiteName, nRunning, nAssigned, nActivated, nStarting)
        tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(
            nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight)
        tmpLog.debug(tmpStr)
        # append; sites already used by the task go first, unsorted
        if tmpSiteName in sitesUsedByTask:
            candidateSpecList.append(siteCandidateSpec)
        else:
            weightMap.setdefault(weight, []).append(siteCandidateSpec)
    # sort candidates by weights, highest first; ties are shuffled
    for weightVal in sorted(weightMap, reverse=True):
        sitesWithWeight = weightMap[weightVal]
        random.shuffle(sitesWithWeight)
        candidateSpecList += sitesWithWeight
    # limit the number of sites. use all sites for distributed datasets
    if not hasDDS:
        maxNumSites = 10
        # remove problematic sites; kept under the same guard that defines
        # maxNumSites so the call can never see the name unbound
        candidateSpecList = AtlasBrokerUtils.skipProblematicSites(
            candidateSpecList, problematicSites, sitesUsedByTask,
            preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog)
    # append preassigned
    if sitePreAssigned and preSiteCandidateSpec is not None and \
            preSiteCandidateSpec not in candidateSpecList:
        candidateSpecList.append(preSiteCandidateSpec)
    # collect site names
    scanSiteList = [tmpSpec.siteName for tmpSpec in candidateSpecList]
    # get list of available files
    availableFileMap = {}
    for datasetSpec in inputChunk.getDatasets():
        try:
            # get list of site to be scanned
            fileScanSiteList = []
            for tmpSiteName in scanSiteList:
                fileScanSiteList.append(tmpSiteName)
                tmpRemoteSites = remoteSourceList.get(tmpSiteName, {}).get(
                    datasetSpec.datasetName, [])
                for tmpRemoteSite in tmpRemoteSites:
                    if tmpRemoteSite not in fileScanSiteList:
                        fileScanSiteList.append(tmpRemoteSite)
            # mapping between sites and storage endpoints
            siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(
                fileScanSiteList, self.siteMapper)
            # disable file lookup for merge jobs
            checkCompleteness = not inputChunk.isMerging
            # get available files per site/endpoint
            tmpAvFileMap = self.ddmIF.getAvailableFiles(
                datasetSpec, siteStorageEP, self.siteMapper, ngGroup=[2],
                checkCompleteness=checkCompleteness)
            if tmpAvFileMap is None:
                raise Interaction.JEDITemporaryError(
                    'ddmIF.getAvailableFiles failed')
            availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
        except:
            # deliberate catch-all: any failure of the lookup is reported as
            # a temporary brokerage error rather than propagated
            errtype, errvalue = sys.exc_info()[:2]
            return abort(retTmpError,
                         'failed to get available files with %s %s' %
                         (errtype.__name__, errvalue))
    # append candidates
    newScanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        tmpSiteName = siteCandidateSpec.siteName
        # preassigned
        if sitePreAssigned and tmpSiteName != taskSpec.site:
            tmpLog.debug(
                ' skip site={0} non pre-assigned site criteria=-nonpreassigned'
                .format(tmpSiteName))
            continue
        # set available files
        # NOTE(review): isAvailable is set once per site and not reset per
        # dataset, so a later incomplete dataset does not clear it - confirm
        isAvailable = inputChunk.getDatasets() == []
        for tmpDatasetName, availableFiles in availableFileMap.items():
            tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
            nDatasetFiles = len(tmpDatasetSpec.Files)
            # check remote files
            tmpRemoteSites = remoteSourceList.get(tmpSiteName, {}).get(
                tmpDatasetName, [])
            for tmpRemoteSite in tmpRemoteSites:
                if tmpRemoteSite in availableFiles and \
                        nDatasetFiles <= len(availableFiles[tmpRemoteSite]['localdisk']):
                    # use only remote disk files
                    siteCandidateSpec.remoteFiles += \
                        availableFiles[tmpRemoteSite]['localdisk']
                    # set remote site and access protocol
                    siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                    siteCandidateSpec.remoteSource = tmpRemoteSite
                    isAvailable = True
                    break
            # local files
            if tmpSiteName in availableFiles:
                tmpSiteFiles = availableFiles[tmpSiteName]
                if nDatasetFiles <= len(tmpSiteFiles['localdisk']) or \
                        nDatasetFiles <= len(tmpSiteFiles['cache']) or \
                        nDatasetFiles <= len(tmpSiteFiles['localtape']) or \
                        (tmpDatasetSpec.isDistributed() and
                         len(tmpSiteFiles['all']) > 0):
                    siteCandidateSpec.localDiskFiles += tmpSiteFiles['localdisk']
                    # add cached files to local list since cached files go to pending when reassigned
                    siteCandidateSpec.localDiskFiles += tmpSiteFiles['cache']
                    siteCandidateSpec.localTapeFiles += tmpSiteFiles['localtape']
                    siteCandidateSpec.cacheFiles += tmpSiteFiles['cache']
                    siteCandidateSpec.remoteFiles += tmpSiteFiles['remote']
                    siteCandidateSpec.addAvailableFiles(tmpSiteFiles['all'])
                    isAvailable = True
                else:
                    tmpMsg = '{0} is incompete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                    tmpLog.debug(
                        tmpMsg.format(
                            tmpDatasetName,
                            tmpSiteName,
                            nDatasetFiles,
                            len(tmpSiteFiles['localdisk']),
                            len(tmpSiteFiles['cache']),
                            len(tmpSiteFiles['localtape']),
                        ))
            if not isAvailable:
                break
        # append
        if not isAvailable:
            tmpLog.debug(
                ' skip site={0} file unavailable criteria=-fileunavailable'
                .format(siteCandidateSpec.siteName))
            continue
        inputChunk.addSiteCandidate(siteCandidateSpec)
        newScanSiteList.append(siteCandidateSpec.siteName)
        tmpLog.debug(
            ' use site={0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5} criteria=+use'
            .format(
                siteCandidateSpec.siteName,
                siteCandidateSpec.weight,
                len(siteCandidateSpec.localDiskFiles),
                len(siteCandidateSpec.localTapeFiles),
                len(siteCandidateSpec.cacheFiles),
                len(siteCandidateSpec.remoteFiles),
            ))
    scanSiteList = newScanSiteList
    if not scanSiteList:
        return abort(retTmpError)
    # send info to logger
    self.sendLogMessage(tmpLog)
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, inputChunk
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
    """Choose candidate analysis sites for a task and attach them to inputChunk.

    The candidate list starts from every site whose type is 'analysis' (plus
    the pre-assigned site, if any) and is narrowed by successive selections:
    data availability, site status, software release, memory, scratch disk,
    SE free space, walltime, pilot activity and inclusion/exclusion/cloud.
    The survivors are weighted by job statistics, capped to a small number of
    sites, and finally checked for actual file availability.

    Parameters:
        taskSpec: task specification; read for site, limits, release, etc.
                  setErrDiag() is called on it on every failure path.
        cloudName: not used by this implementation.
        inputChunk: input chunk; getDatasets()/getDatasetWithName() are read
                    and addSiteCandidate() is called for each accepted site.
        taskParamMap: mapping of task parameters; the optional keys
                      'excludedSite' and 'includedSite' are read when
                      taskSpec.useLimitedSites() is true.

    Returns:
        A tuple (status, inputChunk) where status is self.SC_SUCCEEDED,
        self.SC_FAILED (temporary error) or self.SC_FATAL.
    """
    # make logger
    tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    tmpLog.debug('start')
    # return for failure
    retFatal = self.SC_FATAL, inputChunk
    retTmpError = self.SC_FAILED, inputChunk

    def _giveUp(errMsg, retVal):
        # log the error, attach the uploaded log to the task and hand back retVal
        tmpLog.error(errMsg)
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retVal

    # get primary site candidates
    sitePreAssigned = False
    excludeList = []
    includeList = None
    scanSiteList = []
    # get list of site access
    siteAccessList = self.taskBufferIF.listSiteAccess(None, taskSpec.userName)
    siteAccessMap = {}
    for tmpSiteName, tmpAccess in siteAccessList:
        siteAccessMap[tmpSiteName] = tmpAccess
    # site limitation
    if taskSpec.useLimitedSites():
        if 'excludedSite' in taskParamMap:
            excludeList = taskParamMap['excludedSite']
            # str to list for task retry
            if not isinstance(excludeList, list):
                try:
                    excludeList = excludeList.split(',')
                except AttributeError:
                    pass
        if 'includedSite' in taskParamMap:
            includeList = taskParamMap['includedSite']
            # str to list for task retry; an empty string means "no inclusion list"
            if includeList == '':
                includeList = None
            if not isinstance(includeList, list):
                try:
                    includeList = includeList.split(',')
                except AttributeError:
                    pass
    # loop over all sites
    for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
        if tmpSiteSpec.type == 'analysis':
            scanSiteList.append(siteName)
    # preassigned
    if taskSpec.site not in ['', None]:
        # site is pre-assigned
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
        sitePreAssigned = True
        if taskSpec.site not in scanSiteList:
            scanSiteList.append(taskSpec.site)
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
    # allowed remote access protocol
    allowedRemoteProtocol = 'fax'
    ######################################
    # selection for data availability
    dataWeight = {}
    remoteSourceList = {}
    if inputChunk.getDatasets() != []:
        for datasetSpec in inputChunk.getDatasets():
            datasetName = datasetSpec.datasetName
            if datasetName not in self.dataSiteMap:
                # get the list of sites where data is available
                tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName))
                tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData(scanSiteList,
                                                                      self.siteMapper,
                                                                      self.ddmIF, datasetName)
                if tmpSt == self.SC_FAILED:
                    return _giveUp('failed to get the list of sites where data is available, since %s' % tmpRet,
                                   retTmpError)
                if tmpSt == self.SC_FATAL:
                    return _giveUp('fatal error when getting the list of sites where data is available, since %s' % tmpRet,
                                   retFatal)
                # append
                self.dataSiteMap[datasetName] = tmpRet
                if datasetName.startswith('ddo'):
                    tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                else:
                    tmpLog.debug(' {0} sites : {1}'.format(len(tmpRet), str(tmpRet)))
            # check if the data is available at somewhere
            if self.dataSiteMap[datasetName] == {}:
                return _giveUp('{0} is unavaiable at any site'.format(datasetName), retFatal)
            # check if the data is available on disk
            if AtlasBrokerUtils.getAnalSitesWithDataDisk(self.dataSiteMap[datasetName]) == []:
                return _giveUp('{0} is avaiable only on tape'.format(datasetName), retFatal)
        # get the list of sites where data is available
        # NOTE(review): this walks every entry cached in self.dataSiteMap, not only
        # the datasets of this inputChunk -- confirm the cache lifetime makes that safe
        scanSiteList = None
        for datasetName, tmpDataSite in self.dataSiteMap.iteritems():
            # get sites where disk replica is available
            tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite)
            # get sites which can remotely access source sites
            if not sitePreAssigned or taskSpec.site not in tmpSiteList:
                tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(tmpSiteList, self.taskBufferIF,
                                                                       self.siteMapper, nSites=50,
                                                                       protocol=allowedRemoteProtocol)
            else:
                tmpSatelliteSites = {}
            # make weight map for local
            for tmpSiteName in tmpSiteList:
                dataWeight[tmpSiteName] = dataWeight.get(tmpSiteName, 0) + 1
            # make weight map for remote
            for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                # skip since local data is available
                if tmpSiteName in tmpSiteList:
                    continue
                # sum weight
                if tmpSiteName not in dataWeight:
                    dataWeight[tmpSiteName] = tmpWeightSrcMap['weight']
                else:
                    dataWeight[tmpSiteName] += tmpWeightSrcMap['weight']
                # make remote source list
                remoteSourceList.setdefault(tmpSiteName, {})[datasetName] = tmpWeightSrcMap['source']
            sitesWithThisData = tmpSiteList + list(tmpSatelliteSites.keys())
            # first list
            if scanSiteList is None:
                scanSiteList = []
                for tmpSiteName in sitesWithThisData:
                    if tmpSiteName not in scanSiteList:
                        scanSiteList.append(tmpSiteName)
                continue
            # pickup sites which have all data
            newScanList = []
            for tmpSiteName in sitesWithThisData:
                if tmpSiteName in scanSiteList and tmpSiteName not in newScanList:
                    newScanList.append(tmpSiteName)
            scanSiteList = newScanList
            tmpLog.debug('{0} is available at {1} sites'.format(datasetName, len(scanSiteList)))
        tmpLog.debug('{0} candidates have input data'.format(len(scanSiteList)))
        # check for preassigned
        if sitePreAssigned and taskSpec.site not in scanSiteList:
            scanSiteList = []
            tmpLog.debug('data is unavailable locally or remotely at preassigned site {0}'.format(taskSpec.site))
        if scanSiteList == []:
            return _giveUp('no candidates', retFatal)
    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check site status
        skipFlag = False
        if tmpSiteSpec.status in ['offline']:
            skipFlag = True
        elif tmpSiteSpec.status in ['brokeroff', 'test']:
            # brokeroff/test sites are usable only when explicitly pre-assigned
            if not sitePreAssigned:
                skipFlag = True
            elif tmpSiteName != taskSpec.site:
                skipFlag = True
        if not skipFlag:
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug(' skip %s due to status=%s' % (tmpSiteName, tmpSiteSpec.status))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
    if scanSiteList == []:
        return _giveUp('no candidates', retTmpError)
    ######################################
    # selection for release
    if taskSpec.transHome not in [None, 'AnalysisTransforms']:
        if taskSpec.transHome.startswith('ROOT'):
            # hack until x86_64-slc6-gcc47-opt is published in installedsw
            if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
            else:
                tmpCmtConfig = taskSpec.architecture
            # go through self.taskBufferIF like every other release check;
            # the bare name `taskBuffer` is not defined in this method
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                     cmtConfig=tmpCmtConfig,
                                                                     onlyCmtConfig=True)
        else:
            # remove AnalysisTransforms-
            transHome = re.sub('^[^-]+-*', '', taskSpec.transHome)
            transHome = re.sub('_', '-', transHome)
            if re.search(r'rel_\d+(\n|$)', taskSpec.transHome) is None:
                # cache is checked
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         caches=transHome,
                                                                         cmtConfig=taskSpec.architecture)
            elif transHome == '':
                # release is checked
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         releases=taskSpec.transUses,
                                                                         cmtConfig=taskSpec.architecture)
            else:
                # nightlies
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         releases='CVMFS')
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # release check is disabled or release is available
            if tmpSiteSpec.releases == ['ANY'] or tmpSiteSpec.cloud in ['ND']:
                newScanSiteList.append(tmpSiteName)
            elif tmpSiteName in siteListWithSW:
                newScanSiteList.append(tmpSiteName)
            else:
                # release is unavailable
                tmpLog.debug(' skip %s due to missing rel/cache %s:%s' %
                             (tmpSiteName, taskSpec.transHome, taskSpec.architecture))
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for SW {1}:{2}'.format(len(scanSiteList),
                                                                   taskSpec.transHome,
                                                                   taskSpec.architecture))
        if scanSiteList == []:
            return _giveUp('no candidates', retTmpError)
    ######################################
    # selection for memory
    minRamCount = taskSpec.ramCount
    if minRamCount not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site; a site limit of 0 means "no limit"
            if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                tmpLog.debug(' skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                   tmpSiteSpec.maxmemory,
                                                                                                   minRamCount))
                continue
            if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                tmpLog.debug(' skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                  tmpSiteSpec.minmemory,
                                                                                                  minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                        minRamCount, taskSpec.ramUnit))
        if scanSiteList == []:
            return _giveUp('no candidates', retTmpError)
    ######################################
    # selection for scratch disk
    minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
    minDiskCountS = minDiskCountS / 1024 / 1024
    # size for direct IO sites
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
        minDiskCountR = minDiskCountR / 1024 / 1024
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site; maxwdir == 0 disables the check
        if tmpSiteSpec.maxwdir != 0:
            if tmpSiteSpec.isDirectIO():
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug(' skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                  tmpSiteSpec.maxwdir,
                                                                                  minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
    if scanSiteList == []:
        return _giveUp('no candidates', retTmpError)
    ######################################
    # selection for available space in SE
    # free space must be >= 200GB
    diskThreshold = 200
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site; space == 0 disables the check
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        tmpSpaceSize = tmpSiteSpec.space
        if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
            tmpLog.debug(' skip {0} due to disk shortage in SE = {1} < {2}GB'.format(tmpSiteName,
                                                                                   tmpSiteSpec.space,
                                                                                   diskThreshold))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
    if scanSiteList == []:
        return _giveUp('no candidates', retTmpError)
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if minWalltime not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site; a site limit of 0 means "no limit"
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(' skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                     tmpSiteSpec.maxtime,
                                                                                                     minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(' skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                    tmpSiteSpec.mintime,
                                                                                                    minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),
                                                                          minWalltime,
                                                                          taskSpec.walltimeUnit))
        if scanSiteList == []:
            return _giveUp('no candidates', retTmpError)
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        nPilot = 0
        if tmpSiteName in nWNmap:
            nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
            # the skip itself is disabled: the site is only reported, then kept
            tmpLog.debug(' skip %s due to no pilot' % tmpSiteName)
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
    if scanSiteList == []:
        return _giveUp('no candidates', retTmpError)
    ######################################
    # check inclusion and exclusion
    newScanSiteList = []
    sitesForANY = []
    for tmpSiteName in scanSiteList:
        autoSite = False
        # check exclusion
        if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList):
            tmpLog.debug(' skip {0} excluded'.format(tmpSiteName))
            continue
        # check inclusion
        if includeList is not None and not AtlasBrokerUtils.isMatched(tmpSiteName, includeList):
            if 'AUTO' in includeList:
                # not explicitly included, but kept as a fallback candidate
                autoSite = True
            else:
                tmpLog.debug(' skip {0} not included'.format(tmpSiteName))
                continue
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # limited access
        if tmpSiteSpec.accesscontrol == 'grouplist':
            if siteAccessMap.get(tmpSiteSpec.sitename) != 'approved':
                tmpLog.debug(' skip {0} limited access'.format(tmpSiteName))
                continue
        # check cloud
        if taskSpec.cloud not in [None, '', 'any', tmpSiteSpec.cloud]:
            tmpLog.debug(' skip {0} cloud missmatch'.format(tmpSiteName))
            continue
        if autoSite:
            sitesForANY.append(tmpSiteName)
        else:
            newScanSiteList.append(tmpSiteName)
    # use AUTO sites if no sites are included
    if newScanSiteList == []:
        newScanSiteList = sitesForANY
    else:
        for tmpSiteName in sitesForANY:
            tmpLog.debug(' skip {0} not included'.format(tmpSiteName))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(len(scanSiteList)))
    if scanSiteList == []:
        return _giveUp('no candidates', retTmpError)
    ######################################
    # sites already used by task
    tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
    if not tmpSt:
        return _giveUp('failed to get sites which already used by task', retTmpError)
    ######################################
    # calculate weight
    fqans = taskSpec.makeFQANs()
    tmpDm1, tmpDm2, tmpPriorityOffset, tmpSerNum, tmpWeight = self.taskBufferIF.getPrioParameters(
        [], taskSpec.userName, fqans, taskSpec.workingGroup, True)
    currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset, tmpSerNum, tmpWeight)
    tmpLog.debug('currentPriority={0}'.format(currentPriority))
    tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 currentPriority)
    if not tmpSt:
        return _giveUp('failed to get job statistics with priority', retTmpError)
    # check for preassigned
    if sitePreAssigned and taskSpec.site not in scanSiteList:
        tmpLog.debug("preassigned site {0} didn't pass all tests".format(taskSpec.site))
        return _giveUp('no candidates', retFatal)
    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    weightMap = {}
    candidateSpecList = []
    preSiteCandidateSpec = None
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'running', None, None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'defined', None, None)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'activated', None, None)
        weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
        if tmpSiteName in remoteSourceList:
            nThrottled = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'throttled', None, None)
            # NOTE(review): this overwrites the weight computed above, so more throttled
            # jobs give a HIGHER weight; possibly meant `weight /= ...` -- confirm
            weight = float(nThrottled + 1)
        # normalize weights by taking data availability into account
        if tmpSiteName in dataWeight:
            weight = weight * dataWeight[tmpSiteName]
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        # preassigned
        if sitePreAssigned and tmpSiteName == taskSpec.site:
            preSiteCandidateSpec = siteCandidateSpec
        # set weight
        siteCandidateSpec.weight = weight
        # append; sites already used by the task bypass the weight ranking
        if tmpSiteName in sitesUsedByTask:
            candidateSpecList.append(siteCandidateSpec)
        else:
            weightMap.setdefault(weight, []).append(siteCandidateSpec)
    # limit the number of sites, taking the heaviest weights first
    maxNumSites = 5
    for weightVal in sorted(weightMap, reverse=True):
        if len(candidateSpecList) >= maxNumSites:
            break
        sitesWithWeight = weightMap[weightVal]
        # random tie-break among sites sharing the same weight
        random.shuffle(sitesWithWeight)
        candidateSpecList += sitesWithWeight[:(maxNumSites - len(candidateSpecList))]
    # append preassigned
    if sitePreAssigned and preSiteCandidateSpec is not None and \
            preSiteCandidateSpec not in candidateSpecList:
        candidateSpecList.append(preSiteCandidateSpec)
    # collect site names
    scanSiteList = [siteCandidateSpec.siteName for siteCandidateSpec in candidateSpecList]
    # get list of available files
    availableFileMap = {}
    for datasetSpec in inputChunk.getDatasets():
        try:
            # get list of site to be scanned
            fileScanSiteList = []
            for tmpSiteName in scanSiteList:
                fileScanSiteList.append(tmpSiteName)
                if tmpSiteName in remoteSourceList and \
                        datasetSpec.datasetName in remoteSourceList[tmpSiteName]:
                    for tmpRemoteSite in remoteSourceList[tmpSiteName][datasetSpec.datasetName]:
                        if tmpRemoteSite not in fileScanSiteList:
                            fileScanSiteList.append(tmpRemoteSite)
            # mapping between sites and storage endpoints
            siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(fileScanSiteList, self.siteMapper)
            # get available files per site/endpoint
            tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                        siteStorageEP,
                                                        self.siteMapper,
                                                        ngGroup=[2])
            if tmpAvFileMap is None:
                raise Interaction.JEDITemporaryError('ddmIF.getAvailableFiles failed')
            availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            return _giveUp('failed to get available files with %s %s' % (errtype.__name__, errvalue),
                           retTmpError)
    # append candidates
    newScanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        tmpSiteName = siteCandidateSpec.siteName
        # preassigned
        if sitePreAssigned and tmpSiteName != taskSpec.site:
            tmpLog.debug(' skip {0} non pre-assigned site'.format(tmpSiteName))
            continue
        # set available files
        if inputChunk.getDatasets() == []:
            isAvailable = True
        else:
            isAvailable = False
        # NOTE(review): isAvailable is not reset per dataset, so once one dataset is
        # found the remaining ones cannot make the site fail -- confirm this is intended
        for tmpDatasetName, availableFiles in availableFileMap.iteritems():
            tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
            # check remote files
            if tmpSiteName in remoteSourceList and tmpDatasetName in remoteSourceList[tmpSiteName]:
                for tmpRemoteSite in remoteSourceList[tmpSiteName][tmpDatasetName]:
                    # compare file COUNTS; comparing the int against the list itself
                    # is always true in Python 2 and would accept incomplete replicas
                    if tmpRemoteSite in availableFiles and \
                            len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                        # use only remote disk files
                        siteCandidateSpec.remoteFiles += availableFiles[tmpRemoteSite]['localdisk']
                        # set remote site and access protocol
                        siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                        siteCandidateSpec.remoteSource = tmpRemoteSite
                        isAvailable = True
                        break
            # local files
            if tmpSiteName in availableFiles:
                if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                        len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']):
                    siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['localdisk']
                    # add cached files to local list since cached files go to pending when reassigned
                    siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['cache']
                    siteCandidateSpec.localTapeFiles += availableFiles[tmpSiteName]['localtape']
                    siteCandidateSpec.cacheFiles += availableFiles[tmpSiteName]['cache']
                    siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
                    isAvailable = True
                else:
                    tmpLog.debug('{0} is incompete at {1} : nFiles={2} nLocal={3} nCached={4}'.format(
                        tmpDatasetName,
                        tmpSiteName,
                        len(tmpDatasetSpec.Files),
                        len(availableFiles[tmpSiteName]['localdisk']),
                        len(availableFiles[tmpSiteName]['cache'])))
            if not isAvailable:
                break
        # append
        if not isAvailable:
            tmpLog.debug(' skip {0} file unavailable'.format(siteCandidateSpec.siteName))
            continue
        inputChunk.addSiteCandidate(siteCandidateSpec)
        newScanSiteList.append(siteCandidateSpec.siteName)
        tmpLog.debug(' use {0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5}'.format(
            siteCandidateSpec.siteName,
            siteCandidateSpec.weight,
            len(siteCandidateSpec.localDiskFiles),
            len(siteCandidateSpec.localTapeFiles),
            len(siteCandidateSpec.cacheFiles),
            len(siteCandidateSpec.remoteFiles),
        ))
    scanSiteList = newScanSiteList
    if scanSiteList == []:
        return _giveUp('no candidates', retTmpError)
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, inputChunk
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
    """Pick candidate sites for a task and register them on inputChunk.

    Candidates are the pre-assigned site (from taskSpec or from inputChunk)
    or, failing that, the sites listed for cloudName by self.siteMapper.
    They are filtered by status, memory, scratch disk, SE free space and
    walltime, weighted from job statistics, and capped to a few sites.

    Returns a tuple (status, inputChunk); status is self.SC_SUCCEEDED on
    success and self.SC_FAILED when a lookup fails or no site survives.
    taskParamMap is not used by this implementation.
    """
    # make logger
    brokerLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    brokerLog.debug('start')
    # return for failure
    retFatal = self.SC_FATAL, inputChunk
    retTmpError = self.SC_FAILED, inputChunk

    def abortWith(errMsg):
        # log the error, attach the uploaded log to the task, signal a temporary failure
        brokerLog.error(errMsg)
        taskSpec.setErrDiag(brokerLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError

    # get sites in the cloud
    if taskSpec.site not in ('', None):
        siteNames = [taskSpec.site]
        brokerLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
    elif inputChunk.getPreassignedSite() != None:
        siteNames = [inputChunk.getPreassignedSite()]
        brokerLog.debug('site={0} is pre-assigned in masterDS'.format(
            inputChunk.getPreassignedSite()))
    else:
        siteNames = self.siteMapper.getCloud(cloudName)['sites']
        brokerLog.debug('cloud=%s has %s candidates' % (cloudName, len(siteNames)))
    brokerLog.debug('initial {0} candidates'.format(len(siteNames)))

    ######################################
    # selection for status: only online sites survive
    survivors = []
    for siteName in siteNames:
        siteSpec = self.siteMapper.getSite(siteName)
        if siteSpec.status == 'online':
            survivors.append(siteName)
        else:
            brokerLog.debug(' skip %s due to status=%s' % (siteName, siteSpec.status))
    siteNames = survivors
    brokerLog.debug('{0} candidates passed site status check'.format(len(siteNames)))
    if not siteNames:
        return abortWith('no candidates')

    ######################################
    # selection for memory
    minRamCount = max(taskSpec.ramCount, inputChunk.ramCount)
    if minRamCount not in [0, None]:
        survivors = []
        for siteName in siteNames:
            siteSpec = self.siteMapper.getSite(siteName)
            # a site limit of 0 disables the corresponding check
            tooBig = (siteSpec.maxmemory != 0 and minRamCount != 0
                      and minRamCount > siteSpec.maxmemory)
            if tooBig:
                brokerLog.debug(
                    ' skip {0} due to site RAM shortage={1}(site upper limit) < {2}'
                    .format(siteName, siteSpec.maxmemory, minRamCount))
                continue
            tooSmall = (siteSpec.minmemory != 0 and minRamCount != 0
                        and minRamCount < siteSpec.minmemory)
            if tooSmall:
                brokerLog.debug(
                    ' skip {0} due to job RAM shortage={1}(site lower limit) > {2}'
                    .format(siteName, siteSpec.minmemory, minRamCount))
                continue
            survivors.append(siteName)
        siteNames = survivors
        brokerLog.debug('{0} candidates passed memory check ={1}{2}'.format(
            len(siteNames), minRamCount, taskSpec.ramUnit))
        if not siteNames:
            return abortWith('no candidates')

    ######################################
    # selection for scratch disk
    stagedDiskNeed = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() \
        + inputChunk.getMaxAtomSize()
    stagedDiskNeed = stagedDiskNeed / 1024 / 1024
    # size for direct IO sites
    if taskSpec.useLocalIO():
        directDiskNeed = stagedDiskNeed
    else:
        directDiskNeed = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
        directDiskNeed = directDiskNeed / 1024 / 1024
    survivors = []
    for siteName in siteNames:
        siteSpec = self.siteMapper.getSite(siteName)
        # maxwdir == 0 disables the check
        if siteSpec.maxwdir != 0:
            neededDisk = directDiskNeed if siteSpec.isDirectIO() else stagedDiskNeed
            if neededDisk > siteSpec.maxwdir:
                brokerLog.debug(
                    ' skip {0} due to small scratch disk={1} < {2}'.format(
                        siteName, siteSpec.maxwdir, neededDisk))
                continue
        survivors.append(siteName)
    siteNames = survivors
    brokerLog.debug('{0} candidates passed scratch disk check'.format(len(siteNames)))
    if not siteNames:
        return abortWith('no candidates')

    ######################################
    # selection for available space in SE
    # free space must be >= 200GB
    diskThreshold = 200
    survivors = []
    for siteName in siteNames:
        siteSpec = self.siteMapper.getSite(siteName)
        freeSpace = siteSpec.space
        # space == 0 disables the check
        if siteSpec.space != 0 and freeSpace < diskThreshold:
            brokerLog.debug(
                ' skip {0} due to disk shortage in SE = {1} < {2}GB'.format(
                    siteName, siteSpec.space, diskThreshold))
            continue
        survivors.append(siteName)
    siteNames = survivors
    brokerLog.debug('{0} candidates passed SE space check'.format(len(siteNames)))
    if not siteNames:
        return abortWith('no candidates')

    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if minWalltime not in [0, None]:
        survivors = []
        for siteName in siteNames:
            siteSpec = self.siteMapper.getSite(siteName)
            # a site limit of 0 disables the corresponding check
            if siteSpec.maxtime != 0 and minWalltime > siteSpec.maxtime:
                brokerLog.debug(
                    ' skip {0} due to short site walltime={1}(site upper limit) < {2}'
                    .format(siteName, siteSpec.maxtime, minWalltime))
                continue
            if siteSpec.mintime != 0 and minWalltime < siteSpec.mintime:
                brokerLog.debug(
                    ' skip {0} due to short job walltime={1}(site lower limit) > {2}'
                    .format(siteName, siteSpec.mintime, minWalltime))
                continue
            survivors.append(siteName)
        siteNames = survivors
        brokerLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
            len(siteNames), minWalltime, taskSpec.walltimeUnit))
        if not siteNames:
            return abortWith('no candidates')

    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    survivors = []
    for siteName in siteNames:
        nPilot = 0
        if siteName in nWNmap:
            pilotStat = nWNmap[siteName]
            nPilot = pilotStat['getJob'] + pilotStat['updateJob']
        if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
            # the skip is disabled: the site is reported but still kept
            brokerLog.debug(' skip %s due to no pilot' % siteName)
        survivors.append(siteName)
    siteNames = survivors
    brokerLog.debug('{0} candidates passed pilot activity check'.format(len(siteNames)))
    if not siteNames:
        return abortWith('no candidates')

    ######################################
    # sites already used by task
    okFlag, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
        taskSpec.jediTaskID)
    if not okFlag:
        return abortWith('failed to get sites which already used by task')

    ######################################
    # calculate weight
    okFlag, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
        taskSpec.vo, taskSpec.prodSourceLabel, taskSpec.currentPriority)
    if not okFlag:
        return abortWith('failed to get job statistics with priority')

    ######################################
    # final procedure
    brokerLog.debug('final {0} candidates'.format(len(siteNames)))
    candidatesByWeight = {}
    chosenCandidates = []
    for siteName in siteNames:
        # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, siteName,
                                               'running', None, None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, siteName,
                                                'defined', None, None)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, siteName,
                                                 'activated', None, None)
        siteWeight = float(nRunning + 1) / float(nActivated + nAssigned + 1) \
            / float(nAssigned + 1)
        # make candidate and set weight
        candidate = SiteCandidate(siteName)
        candidate.weight = siteWeight
        # sites already used by the task bypass the weight ranking
        if siteName in sitesUsedByTask:
            chosenCandidates.append(candidate)
        else:
            candidatesByWeight.setdefault(siteWeight, []).append(candidate)
    # limit the number of sites, heaviest weights first
    maxNumSites = 5
    for weightVal in sorted(candidatesByWeight, reverse=True):
        if len(chosenCandidates) >= maxNumSites:
            break
        sameWeightSites = candidatesByWeight[weightVal]
        # random tie-break among sites sharing the same weight
        random.shuffle(sameWeightSites)
        chosenCandidates += sameWeightSites[:(maxNumSites - len(chosenCandidates))]
    # append candidates
    siteNames = []
    for candidate in chosenCandidates:
        inputChunk.addSiteCandidate(candidate)
        siteNames.append(candidate.siteName)
        brokerLog.debug(' use {0} with weight={1}'.format(
            candidate.siteName, candidate.weight))
    if not siteNames:
        return abortWith('no candidates')
    # return
    brokerLog.debug('done')
    return self.SC_SUCCEEDED, inputChunk
def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
    """Select candidate analysis sites for a task and attach them to inputChunk.

    The candidate list starts with every site in self.siteMapper.siteSpecList
    whose type is 'analysis' (plus taskSpec.site when one is pre-assigned) and
    is narrowed step by step: site status, core count, software release,
    memory, scratch disk, free space in the storage endpoint, walltime, pilot
    activity, include/exclude lists with access control and cloud, and input
    data availability. The surviving sites are weighted from job statistics
    and data locality, problematic sites are filtered, file availability is
    looked up, and each accepted site is registered through
    inputChunk.addSiteCandidate().

    Parameters:
      taskSpec     -- task specification; read for site, coreCount, transHome,
                      transUses, architecture, walltime, cloud, userName,
                      prodSourceLabel, vo and jediTaskID. On every failure path
                      its error diagnostic is set via setErrDiag().
      cloudName    -- not referenced in this method.
      inputChunk   -- input chunk; queried for datasets, RAM and atom sizes and
                      the merging flag, and receives the site candidates.
      taskParamMap -- mapping; the 'excludedSite' and 'includedSite' keys are
                      read only when taskSpec.useLimitedSites() is true.

    Returns a 2-tuple (status, inputChunk) where status is
      self.SC_SUCCEEDED -- at least one candidate was added,
      self.SC_FAILED    -- a selection step left no candidates or a lookup
                           failed (returned as retTmpError),
      self.SC_FATAL     -- input data is unavailable, the data lookup reported
                           a fatal error, or the pre-assigned site did not pass
                           all tests (returned as retFatal).

    Side effects visible here: self.dataSiteMap is filled per dataset name,
    datasetSpec.setDistributed() may be called, and self.sendLogMessage(tmpLog)
    is called before every return.
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # whitespace-collapsed source -- confirm against the original layout,
    # in particular at the places marked NOTE(review).
    # make logger
    tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                        monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID,
                                                                datetime.datetime.utcnow().isoformat('/')))
    tmpLog.debug('start')
    # return for failure
    retFatal = self.SC_FATAL,inputChunk
    retTmpError = self.SC_FAILED,inputChunk
    # get primary site candidates
    sitePreAssigned = False
    excludeList = []
    includeList = None
    scanSiteList = []
    # get list of site access
    siteAccessList = self.taskBufferIF.listSiteAccess(None,taskSpec.userName)
    siteAccessMap = {}
    for tmpSiteName,tmpAccess in siteAccessList:
        siteAccessMap[tmpSiteName] = tmpAccess
    # site limitation
    if taskSpec.useLimitedSites():
        if 'excludedSite' in taskParamMap:
            excludeList = taskParamMap['excludedSite']
            # str to list for task retry
            # best effort: a value that is neither a list nor splittable is kept as it is
            try:
                if type(excludeList) != types.ListType:
                    excludeList = excludeList.split(',')
            except:
                pass
        if 'includedSite' in taskParamMap:
            includeList = taskParamMap['includedSite']
            # str to list for task retry
            # an empty string means no inclusion list at all
            if includeList == '':
                includeList = None
            # best effort: None (no split method) falls through the except and stays None
            try:
                if type(includeList) != types.ListType:
                    includeList = includeList.split(',')
            except:
                pass
    # loop over all sites
    for siteName,tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
        if tmpSiteSpec.type == 'analysis':
            scanSiteList.append(siteName)
    # preassigned
    if not taskSpec.site in ['',None]:
        # site is pre-assigned
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
        sitePreAssigned = True
        # the pre-assigned site is scanned even if it is not of type 'analysis'
        if not taskSpec.site in scanSiteList:
            scanSiteList.append(taskSpec.site)
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
    # allowed remote access protocol
    allowedRemoteProtocol = 'fax'
    # MP
    if taskSpec.coreCount != None and taskSpec.coreCount > 1:
        # use MCORE only
        useMP = 'only'
    elif taskSpec.coreCount == 0:
        # use MCORE and normal
        useMP = 'any'
    else:
        # not use MCORE
        useMP = 'unuse'
    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check site status
        # offline sites are always skipped; brokeroff/test sites are kept
        # only when they are the pre-assigned site
        skipFlag = False
        if tmpSiteSpec.status in ['offline']:
            skipFlag = True
        elif tmpSiteSpec.status in ['brokeroff','test']:
            if not sitePreAssigned:
                skipFlag = True
            elif tmpSiteName != taskSpec.site:
                skipFlag = True
        if not skipFlag:
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug(' skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # selection for MP
    # the core-count check is bypassed when a site is pre-assigned
    if not sitePreAssigned:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                    (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                newScanSiteList.append(tmpSiteName)
            else:
                tmpLog.debug(' skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                             (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
    ######################################
    # selection for release
    # siteListWithSW is assigned by exactly one of the branches below
    if taskSpec.transHome != None:
        if taskSpec.transHome.startswith('ROOT'):
            # hack until x86_64-slc6-gcc47-opt is published in installedsw
            if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
            else:
                tmpCmtConfig = taskSpec.architecture
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                     cmtConfig=tmpCmtConfig,
                                                                     onlyCmtConfig=True)
        elif 'AthAnalysis' in taskSpec.transHome or re.search('Ath[a-zA-Z]+Base',taskSpec.transHome) != None:
            # AthAnalysis
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                     cmtConfig=taskSpec.architecture,
                                                                     onlyCmtConfig=True)
        else:
            # remove AnalysisTransforms-
            transHome = re.sub('^[^-]+-*','',taskSpec.transHome)
            transHome = re.sub('_','-',transHome)
            if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \
                    re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None :
                # cache is checked
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         caches=transHome,
                                                                         cmtConfig=taskSpec.architecture)
            elif transHome == '' and taskSpec.transUses != None:
                # remove Atlas-
                transUses = taskSpec.transUses.split('-')[-1]
                # release is checked
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         releases=transUses,
                                                                         cmtConfig=taskSpec.architecture)
            else:
                # nightlies
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                         releases='CVMFS')
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # release check is disabled or release is available
            if tmpSiteSpec.releases == ['ANY']:
                newScanSiteList.append(tmpSiteName)
            elif tmpSiteName in siteListWithSW:
                newScanSiteList.append(tmpSiteName)
            else:
                # release is unavailable
                tmpLog.debug(' skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                             (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format(len(scanSiteList),
                                                                       taskSpec.transUses,
                                                                       taskSpec.transHome,
                                                                       taskSpec.architecture))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
    ######################################
    # selection for memory
    minRamCount = inputChunk.getMaxRamCount()
    minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
    if not minRamCount in [0,None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # site max memory requirement
            # maxrss takes precedence over maxmemory when it is set
            if not tmpSiteSpec.maxrss in [0,None]:
                site_maxmemory = tmpSiteSpec.maxrss
            else:
                site_maxmemory = tmpSiteSpec.maxmemory
            if not site_maxmemory in [0,None] and minRamCount != 0 and minRamCount > site_maxmemory:
                tmpLog.debug(' skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'.format(tmpSiteName,
                                                                                                                                              site_maxmemory,
                                                                                                                                              minRamCount))
                continue
            # site min memory requirement
            # minrss takes precedence over minmemory when it is set
            if not tmpSiteSpec.minrss in [0,None]:
                site_minmemory = tmpSiteSpec.minrss
            else:
                site_minmemory = tmpSiteSpec.minmemory
            if not site_minmemory in [0,None] and minRamCount != 0 and minRamCount < site_minmemory:
                tmpLog.debug(' skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'.format(tmpSiteName,
                                                                                                                                              site_minmemory,
                                                                                                                                              minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                         minRamCount,taskSpec.ramUnit))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
    ######################################
    # selection for scratch disk
    tmpMaxAtomSize = inputChunk.getMaxAtomSize()
    tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
    tmpOutDiskSize = taskSpec.getOutDiskSize()
    tmpWorkDiskSize = taskSpec.getWorkDiskSize()
    # scratch requirement = output + work area + the input itself
    minDiskCountS = tmpOutDiskSize*tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
    # presumably converts bytes to MB to match maxwdir -- TODO confirm units
    minDiskCountS = minDiskCountS / 1024 / 1024
    # size for direct IO sites
    # without local IO the input size is left out of the requirement
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        minDiskCountR = tmpOutDiskSize*tmpEffAtomSize + tmpWorkDiskSize
        minDiskCountR = minDiskCountR / 1024 / 1024
    tmpLog.debug('maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'.format(tmpMaxAtomSize,
                                                                                                  tmpEffAtomSize,
                                                                                                  tmpOutDiskSize,
                                                                                                  tmpWorkDiskSize))
    tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format(minDiskCountS,
                                                                         minDiskCountR))
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site
        # maxwdir == 0 disables the scratch disk check for the site
        if tmpSiteSpec.maxwdir != 0:
            if tmpSiteSpec.isDirectIO():
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug(' skip site={0} due to small scratch disk={1} < {2} criteria=-disk'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.maxwdir,
                                                                                                      minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # selection for available space in SE
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check endpoint
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint(tmpSiteSpec.ddm)
        # sites without a known endpoint pass this check unchanged
        if tmpEndPoint is not None:
            # free space must be >= 200GB
            diskThreshold = 200
            # expired space is counted as usable together with free space
            tmpSpaceSize = 0
            if tmpEndPoint['space_expired'] is not None:
                tmpSpaceSize += tmpEndPoint['space_expired']
            if tmpEndPoint['space_free'] is not None:
                tmpSpaceSize += tmpEndPoint['space_free']
            if tmpSpaceSize < diskThreshold:
                tmpLog.debug(' skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'.format(tmpSiteName,tmpSpaceSize,
                                                                                                         diskThreshold))
                continue
            # check if blacklisted
            if tmpEndPoint['blacklisted'] == 'Y':
                tmpLog.debug(' skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    # NOTE(review): the site loop is assumed to run only when a positive
    # walltime is set, mirroring the memory check -- confirm nesting
    if not minWalltime in [0,None] and minWalltime > 0:
        # scale the per-unit walltime by the effective input size
        minWalltime *= tmpEffAtomSize
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            # maxtime/mintime == 0 disables the corresponding limit
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(' skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'.format(tmpSiteName,
                                                                                                                                  tmpSiteSpec.maxtime,
                                                                                                                                  minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(' skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'.format(tmpSiteName,
                                                                                                                                tmpSiteSpec.mintime,
                                                                                                                                minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        # pilot activity = getJob + updateJob counts reported for the site
        nPilot = 0
        if nWNmap.has_key(tmpSiteName):
            nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
            tmpLog.debug(' skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName)
            # in test mode the skip is only logged and the site is kept
            if not self.testMode:
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # check inclusion and exclusion
    newScanSiteList = []
    # sites that are not explicitly included but allowed through 'AUTO'
    sitesForANY = []
    for tmpSiteName in scanSiteList:
        autoSite = False
        # check exclusion
        if AtlasBrokerUtils.isMatched(tmpSiteName,excludeList):
            tmpLog.debug(' skip site={0} excluded criteria=-excluded'.format(tmpSiteName))
            continue
        # check inclusion
        if includeList != None and not AtlasBrokerUtils.isMatched(tmpSiteName,includeList):
            if 'AUTO' in includeList:
                autoSite = True
            else:
                tmpLog.debug(' skip site={0} not included criteria=-notincluded'.format(tmpSiteName))
                continue
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # limited access
        if tmpSiteSpec.accesscontrol == 'grouplist':
            if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                    siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                tmpLog.debug(' skip site={0} limited access criteria=-limitedaccess'.format(tmpSiteName))
                continue
        # check cloud
        if not taskSpec.cloud in [None,'','any',tmpSiteSpec.cloud]:
            tmpLog.debug(' skip site={0} cloud mismatch criteria=-cloudmismatch'.format(tmpSiteName))
            continue
        if autoSite:
            sitesForANY.append(tmpSiteName)
        else:
            newScanSiteList.append(tmpSiteName)
    # use AUTO sites if no sites are included
    if newScanSiteList == []:
        newScanSiteList = sitesForANY
    else:
        for tmpSiteName in sitesForANY:
            tmpLog.debug(' skip site={0} not included criteria=-notincluded'.format(tmpSiteName))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # selection for data availability
    # hasDDS     : set when at least one input dataset is distributed
    # dataWeight : site name -> weight accumulated over datasets
    # remoteSourceList : site name -> {dataset name -> remote source sites}
    hasDDS = False
    dataWeight = {}
    remoteSourceList = {}
    if inputChunk.getDatasets() != []:
        # keep the list which passed all previous checks to intersect with data sites
        oldScanSiteList = copy.copy(scanSiteList)
        for datasetSpec in inputChunk.getDatasets():
            datasetName = datasetSpec.datasetName
            # self.dataSiteMap caches the lookup result per dataset name
            if not self.dataSiteMap.has_key(datasetName):
                # get the list of sites where data is available
                tmpLog.debug('getting the list of sites where {0} is available'.format(datasetName))
                tmpSt,tmpRet = AtlasBrokerUtils.getAnalSitesWithData(scanSiteList,
                                                                     self.siteMapper,
                                                                     self.ddmIF,datasetName)
                if tmpSt in [Interaction.JEDITemporaryError,Interaction.JEDITimeoutError]:
                    tmpLog.error('temporary failed to get the list of sites where data is available, since %s' % tmpRet)
                    taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                    # send info to logger
                    self.sendLogMessage(tmpLog)
                    return retTmpError
                if tmpSt == Interaction.JEDIFatalError:
                    tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                    taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                    # send info to logger
                    self.sendLogMessage(tmpLog)
                    return retFatal
                # append
                self.dataSiteMap[datasetName] = tmpRet
                if datasetName.startswith('ddo'):
                    tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                else:
                    tmpLog.debug(' {0} sites : {1}'.format(len(tmpRet),str(tmpRet)))
                    # NOTE(review): the distributed check is placed under the
                    # non-'ddo' branch -- confirm nesting against the original
                    # check if distributed
                    if tmpRet != {}:
                        # distributed unless some replica is in 'complete' state
                        isDistributed = True
                        for tmpMap in tmpRet.values():
                            for tmpVal in tmpMap.values():
                                if tmpVal['state'] == 'complete':
                                    isDistributed = False
                                    break
                            if not isDistributed:
                                break
                        if isDistributed:
                            # check if really distributed
                            isDistributed = self.ddmIF.isDistributedDataset(datasetName)
                            if isDistributed:
                                hasDDS = True
                                datasetSpec.setDistributed()
                                tmpLog.debug(' {0} is distributed'.format(datasetName))
            # check if the data is available at somewhere
            if self.dataSiteMap[datasetName] == {}:
                tmpLog.error('{0} is unavailable at any site'.format(datasetName))
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retFatal
        # get the list of sites where data is available
        # None marks that the first dataset has not been processed yet
        scanSiteList = None
        scanSiteListOnDisk = None
        normFactor = 0
        for datasetName,tmpDataSite in self.dataSiteMap.iteritems():
            normFactor += 1
            # get sites where replica is available
            tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite,includeTape=True)
            tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite,includeTape=False)
            # get sites which can remotely access source sites
            if inputChunk.isMerging:
                # disable remote access for merging
                tmpSatelliteSites = {}
            elif (not sitePreAssigned) or (sitePreAssigned and not taskSpec.site in tmpSiteList):
                tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(tmpDiskSiteList,self.taskBufferIF,
                                                                       self.siteMapper,nSites=50,
                                                                       protocol=allowedRemoteProtocol)
            else:
                tmpSatelliteSites = {}
            # make weight map for local
            for tmpSiteName in tmpSiteList:
                if not dataWeight.has_key(tmpSiteName):
                    dataWeight[tmpSiteName] = 0
                # give more weight to disk
                if tmpSiteName in tmpDiskSiteList:
                    dataWeight[tmpSiteName] += 1
                else:
                    dataWeight[tmpSiteName] += 0.001
            # make weight map for remote
            for tmpSiteName,tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                # skip since local data is available
                if tmpSiteName in tmpSiteList:
                    continue
                tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                # negative weight for remote access
                # the remote weight is divided by wRemote; a non-zero
                # wansinklimit reduces the divisor
                wRemote = 50.0
                if not tmpSiteSpec.wansinklimit in [0,None]:
                    wRemote /= float(tmpSiteSpec.wansinklimit)
                # sum weight
                if not dataWeight.has_key(tmpSiteName):
                    dataWeight[tmpSiteName] = float(tmpWeightSrcMap['weight'])/wRemote
                else:
                    dataWeight[tmpSiteName] += float(tmpWeightSrcMap['weight'])/wRemote
                # make remote source list
                if not remoteSourceList.has_key(tmpSiteName):
                    remoteSourceList[tmpSiteName] = {}
                remoteSourceList[tmpSiteName][datasetName] = tmpWeightSrcMap['source']
            # first list
            # the first dataset seeds both lists, restricted to sites which
            # passed the previous checks
            if scanSiteList == None:
                scanSiteList = []
                for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                    if not tmpSiteName in oldScanSiteList:
                        continue
                    if not tmpSiteName in scanSiteList:
                        scanSiteList.append(tmpSiteName)
                scanSiteListOnDisk = set()
                for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                    if not tmpSiteName in oldScanSiteList:
                        continue
                    scanSiteListOnDisk.add(tmpSiteName)
                continue
            # pickup sites which have all data
            # later datasets intersect with what is kept so far
            newScanList = []
            for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                    newScanList.append(tmpSiteName)
            scanSiteList = newScanList
            tmpLog.debug('{0} is available at {1} sites'.format(datasetName,len(scanSiteList)))
            # pickup sites which have all data on DISK
            newScanListOnDisk = set()
            for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                if tmpSiteName in scanSiteListOnDisk:
                    newScanListOnDisk.add(tmpSiteName)
            scanSiteListOnDisk = newScanListOnDisk
            tmpLog.debug('{0} is available at {1} sites on DISK'.format(datasetName,len(scanSiteListOnDisk)))
        # check for preassigned
        if sitePreAssigned and not taskSpec.site in scanSiteList:
            scanSiteList = []
            tmpLog.debug('data is unavailable locally or remotely at preassigned site {0}'.format(taskSpec.site))
        elif len(scanSiteListOnDisk) > 0:
            # use only disk sites
            scanSiteList = list(scanSiteListOnDisk)
        tmpLog.debug('{0} candidates have input data'.format(len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retFatal
    ######################################
    # sites already used by task
    tmpSt,sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
    if not tmpSt:
        tmpLog.error('failed to get sites which already used by task')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    ######################################
    # calculate weight
    # fqans is only referenced by the disabled code kept in the string below
    fqans = taskSpec.makeFQANs()
    """
    tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                              taskSpec.workingGroup,True)
    currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
    currentPriority -= 500
    tmpLog.debug('currentPriority={0}'.format(currentPriority))
    """
    tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                taskSpec.prodSourceLabel)
    if not tmpSt:
        tmpLog.error('failed to get job statistics with priority')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    # check for preassigned
    if sitePreAssigned and not taskSpec.site in scanSiteList:
        tmpLog.debug("preassigned site {0} did not pass all tests".format(taskSpec.site))
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retFatal
    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    # weight value -> list of candidates which share that weight
    weightMap = {}
    candidateSpecList = []
    # time window passed to the failure-count lookup -- presumably hours, TODO confirm
    timeWindowForFC = 6
    preSiteCandidateSpec = None
    failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(taskSpec.jediTaskID,timeWindowForFC)
    problematicSites = set()
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running', None,None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined', None,None)
        # throttled jobs are counted together with activated ones
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                     AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
        nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting', None,None)
        nFailed = 0
        nClosed = 0
        nFinished = 0
        if tmpSiteName in failureCounts:
            if 'failed' in failureCounts[tmpSiteName]:
                nFailed = failureCounts[tmpSiteName]['failed']
            if 'closed' in failureCounts[tmpSiteName]:
                nClosed = failureCounts[tmpSiteName]['closed']
            if 'finished' in failureCounts[tmpSiteName]:
                nFinished = failureCounts[tmpSiteName]['finished']
        # problematic sites
        # i.e. failed+closed outnumber finished by more than two to one
        if nFailed+nClosed > 2*nFinished:
            problematicSites.add(tmpSiteName)
        # calculate weight
        # more running jobs raise the weight, more queued jobs lower it;
        # the +1 terms avoid division by zero
        weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + 1)
        nThrottled = 0
        # sites which read input remotely are penalized further by throttled jobs
        if remoteSourceList.has_key(tmpSiteName):
            nThrottled = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
            weight /= float(nThrottled + 1)
        # normalize weights by taking data availability into account
        tmpDataWeight = 1
        if dataWeight.has_key(tmpSiteName):
            weight = weight * dataWeight[tmpSiteName]
            tmpDataWeight = dataWeight[tmpSiteName]
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        # preassigned
        if sitePreAssigned and tmpSiteName == taskSpec.site:
            preSiteCandidateSpec = siteCandidateSpec
        # set weight
        siteCandidateSpec.weight = weight
        tmpStr = ' site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(tmpSiteName,
                                                                          nRunning,
                                                                          nAssigned,
                                                                          nActivated,
                                                                          nStarting)
        tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(nFailed,
                                                                                         nClosed,
                                                                                         nFinished,
                                                                                         nThrottled,
                                                                                         tmpDataWeight,
                                                                                         weight)
        tmpLog.debug(tmpStr)
        # append
        # sites already used by the task go first, ahead of the weight ordering
        if tmpSiteName in sitesUsedByTask:
            candidateSpecList.append(siteCandidateSpec)
        else:
            if not weightMap.has_key(weight):
                weightMap[weight] = []
            weightMap[weight].append(siteCandidateSpec)
    # sort candidates by weights
    # descending order; sites with the same weight are shuffled
    weightList = weightMap.keys()
    weightList.sort()
    weightList.reverse()
    for weightVal in weightList:
        sitesWithWeight = weightMap[weightVal]
        random.shuffle(sitesWithWeight)
        candidateSpecList += sitesWithWeight
    # limit the number of sites. use all sites for distributed datasets
    # NOTE(review): maxNumSites is only assigned when hasDDS is False; confirm
    # that the call below is not reached with it unbound (nesting reconstructed)
    if not hasDDS:
        maxNumSites = 10
    # remove problematic sites
    candidateSpecList = AtlasBrokerUtils.skipProblematicSites(candidateSpecList,
                                                              problematicSites,
                                                              sitesUsedByTask,
                                                              preSiteCandidateSpec,
                                                              maxNumSites,
                                                              timeWindowForFC,
                                                              tmpLog)
    # append preassigned
    if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList:
        candidateSpecList.append(preSiteCandidateSpec)
    # collect site names
    scanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        scanSiteList.append(siteCandidateSpec.siteName)
    # get list of available files
    # dataset name -> result of ddmIF.getAvailableFiles
    availableFileMap = {}
    for datasetSpec in inputChunk.getDatasets():
        try:
            # get list of site to be scanned
            # candidate sites plus the remote sources they would read from
            fileScanSiteList = []
            for tmpSiteName in scanSiteList:
                fileScanSiteList.append(tmpSiteName)
                if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(datasetSpec.datasetName):
                    for tmpRemoteSite in remoteSourceList[tmpSiteName][datasetSpec.datasetName]:
                        if not tmpRemoteSite in fileScanSiteList:
                            fileScanSiteList.append(tmpRemoteSite)
            # mapping between sites and storage endpoints
            siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(fileScanSiteList,self.siteMapper)
            # disable file lookup for merge jobs
            if inputChunk.isMerging:
                checkCompleteness = False
            else:
                checkCompleteness = True
            # get available files per site/endpoint
            tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                        siteStorageEP,
                                                        self.siteMapper,
                                                        ngGroup=[2],
                                                        checkCompleteness=checkCompleteness)
            # a None result is raised so that the handler below reports it
            if tmpAvFileMap == None:
                raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
            availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
        except:
            # any failure in the lookup is reported as a temporary error
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
    # append candidates
    newScanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        tmpSiteName = siteCandidateSpec.siteName
        # preassigned
        if sitePreAssigned and tmpSiteName != taskSpec.site:
            tmpLog.debug(' skip site={0} non pre-assigned site criteria=-nonpreassigned'.format(tmpSiteName))
            continue
        # set available files
        # without input datasets every site is regarded as available
        if inputChunk.getDatasets() == []:
            isAvailable = True
        else:
            isAvailable = False
        for tmpDatasetName,availableFiles in availableFileMap.iteritems():
            tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
            # check remote files
            if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(tmpDatasetName):
                # the first remote source holding all files on disk is used
                for tmpRemoteSite in remoteSourceList[tmpSiteName][tmpDatasetName]:
                    if availableFiles.has_key(tmpRemoteSite) and \
                            len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                        # use only remote disk files
                        siteCandidateSpec.remoteFiles += availableFiles[tmpRemoteSite]['localdisk']
                        # set remote site and access protocol
                        siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                        siteCandidateSpec.remoteSource = tmpRemoteSite
                        isAvailable = True
                        break
            # local files
            if availableFiles.has_key(tmpSiteName):
                # complete on disk, in cache or on tape; for a distributed
                # dataset any available file is enough
                if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                        len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                        len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                        (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                    siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['localdisk']
                    # add cached files to local list since cached files go to pending when reassigned
                    siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['cache']
                    siteCandidateSpec.localTapeFiles += availableFiles[tmpSiteName]['localtape']
                    siteCandidateSpec.cacheFiles += availableFiles[tmpSiteName]['cache']
                    siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
                    siteCandidateSpec.addAvailableFiles(availableFiles[tmpSiteName]['all'])
                    isAvailable = True
                else:
                    tmpMsg = '{0} is incompete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                    tmpLog.debug(tmpMsg.format(tmpDatasetName,
                                               tmpSiteName,
                                               len(tmpDatasetSpec.Files),
                                               len(availableFiles[tmpSiteName]['localdisk']),
                                               len(availableFiles[tmpSiteName]['cache']),
                                               len(availableFiles[tmpSiteName]['localtape']),
                                               ))
            # stop scanning datasets once the flag is still unset after this one
            if not isAvailable:
                break
        # append
        if not isAvailable:
            tmpLog.debug(' skip site={0} file unavailable criteria=-fileunavailable'.format(siteCandidateSpec.siteName))
            continue
        inputChunk.addSiteCandidate(siteCandidateSpec)
        newScanSiteList.append(siteCandidateSpec.siteName)
        tmpLog.debug(' use site={0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5} criteria=+use'.format(siteCandidateSpec.siteName,
                                                                                                                             siteCandidateSpec.weight,
                                                                                                                             len(siteCandidateSpec.localDiskFiles),
                                                                                                                             len(siteCandidateSpec.localTapeFiles),
                                                                                                                             len(siteCandidateSpec.cacheFiles),
                                                                                                                             len(siteCandidateSpec.remoteFiles),
                                                                                                                             ))
    scanSiteList = newScanSiteList
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        # send info to logger
        self.sendLogMessage(tmpLog)
        return retTmpError
    # send info to logger
    self.sendLogMessage(tmpLog)
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED,inputChunk
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
    """Select candidate sites for a task and attach them to the input chunk.

    The initial site list is the task's pre-assigned site, else the site
    pre-assigned in the input chunk, else every site of ``cloudName``.
    The list is then narrowed by successive checks (site status, memory,
    scratch disk, storage space, walltime).  The pilot-activity check is
    log-only.  Surviving sites are weighted by their running/activated/
    defined job counts.  Sites already used by the task are always kept;
    the remaining slots, up to five candidates in total, are filled in
    descending weight order with random tie-breaking.

    Args:
        taskSpec: task specification; read for site, resource requirements
            and identifiers.  Its error diagnostics are set on failure.
        cloudName: name of the cloud whose sites are scanned when no site
            is pre-assigned.
        inputChunk: input chunk; receives the selected site candidates via
            ``addSiteCandidate``.
        taskParamMap: task parameters; not used by this brokerage.

    Returns:
        ``(self.SC_SUCCEEDED, inputChunk)`` when at least one candidate was
        attached, otherwise ``(self.SC_FAILED, inputChunk)``.
    """
    # make logger
    tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    tmpLog.debug('start')
    # return value for (temporary) failures
    retTmpError = self.SC_FAILED, inputChunk

    def giveUp(errMsg):
        # log the reason, attach the uploaded log to the task and report failure
        tmpLog.error(errMsg)
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError

    # get sites in the cloud
    if taskSpec.site not in ['', None]:
        scanSiteList = [taskSpec.site]
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
    else:
        preassignedSite = inputChunk.getPreassignedSite()
        if preassignedSite is not None:
            scanSiteList = [preassignedSite]
            tmpLog.debug('site={0} is pre-assigned in masterDS'.format(preassignedSite))
        else:
            scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
            tmpLog.debug('cloud=%s has %s candidates' % (cloudName, len(scanSiteList)))
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # only online sites are kept
        if tmpSiteSpec.status == 'online':
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug(' skip %s due to status=%s' % (tmpSiteName, tmpSiteSpec.status))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # selection for memory
    minRamCount = max(taskSpec.ramCount, inputChunk.ramCount)
    if minRamCount not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # a site limit of 0 means the limit is not enforced
            if tmpSiteSpec.maxmemory != 0 and minRamCount > tmpSiteSpec.maxmemory:
                tmpLog.debug(' skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                     tmpSiteSpec.maxmemory,
                                                                                                     minRamCount))
                continue
            if tmpSiteSpec.minmemory != 0 and minRamCount < tmpSiteSpec.minmemory:
                tmpLog.debug(' skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                    tmpSiteSpec.minmemory,
                                                                                                    minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                         minRamCount, taskSpec.ramUnit))
        if not scanSiteList:
            return giveUp('no candidates')
    ######################################
    # selection for scratch disk
    # NOTE(review): the two divisions by 1024 presumably convert bytes to MB
    # to match the unit of maxwdir -- confirm
    minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
    minDiskCountS = minDiskCountS / 1024 / 1024
    # size for direct IO sites: the input size is excluded unless the task uses local IO
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
        minDiskCountR = minDiskCountR / 1024 / 1024
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site; maxwdir of 0 means no limit is enforced
        if tmpSiteSpec.maxwdir != 0:
            if tmpSiteSpec.isDirectIO():
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug(' skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                    tmpSiteSpec.maxwdir,
                                                                                    minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # selection for available space in SE
    # free space must be >= 200GB
    diskThreshold = 200
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # space of 0 means the check is not applied to the site
        if tmpSiteSpec.space != 0 and tmpSiteSpec.space < diskThreshold:
            tmpLog.debug(' skip {0} due to disk shortage in SE = {1} < {2}GB'.format(tmpSiteName,
                                                                                     tmpSiteSpec.space,
                                                                                     diskThreshold))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if minWalltime not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # a site limit of 0 means the limit is not enforced
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(' skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.maxtime,
                                                                                                       minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(' skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.mintime,
                                                                                                      minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),
                                                                           minWalltime,
                                                                           taskSpec.walltimeUnit))
        if not scanSiteList:
            return giveUp('no candidates')
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # number of pilot requests seen at the site
        nPilot = 0
        if tmpSiteName in nWNmap:
            nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
            # NOTE: this check is log-only; the site is still kept below
            # even though the message says "skip"
            tmpLog.debug(' skip %s due to no pilot' % tmpSiteName)
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # sites already used by task
    tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
    if not tmpSt:
        return giveUp('failed to get sites which already used by task')
    ######################################
    # calculate weight
    tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 taskSpec.currentPriority)
    if not tmpSt:
        return giveUp('failed to get job statistics with priority')
    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    weightMap = {}
    candidateSpecList = []
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'running', None, None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'defined', None, None)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'activated', None, None)
        # more running jobs raise the weight, queued (activated/defined) jobs lower it
        weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        siteCandidateSpec.weight = weight
        if tmpSiteName in sitesUsedByTask:
            # sites already used by the task are always candidates
            candidateSpecList.append(siteCandidateSpec)
        else:
            weightMap.setdefault(weight, []).append(siteCandidateSpec)
    # limit the number of sites: fill the remaining slots from the highest weight down
    maxNumSites = 5
    for weightVal in sorted(weightMap, reverse=True):
        if len(candidateSpecList) >= maxNumSites:
            break
        sitesWithWeight = weightMap[weightVal]
        # random order among sites with the same weight
        random.shuffle(sitesWithWeight)
        candidateSpecList += sitesWithWeight[:(maxNumSites - len(candidateSpecList))]
    # append candidates
    scanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        inputChunk.addSiteCandidate(siteCandidateSpec)
        scanSiteList.append(siteCandidateSpec.siteName)
        tmpLog.debug(' use {0} with weight={1}'.format(siteCandidateSpec.siteName,
                                                       siteCandidateSpec.weight))
    if not scanSiteList:
        return giveUp('no candidates')
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, inputChunk
def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap): # make logger tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID), monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID, datetime.datetime.utcnow().isoformat('/'))) tmpLog.debug('start') # return for failure retFatal = self.SC_FATAL,inputChunk retTmpError = self.SC_FAILED,inputChunk # get sites in the cloud sitePreAssigned = False siteListPreAssigned = False if not taskSpec.site in ['',None]: if ',' in taskSpec.site: # site list siteListPreAssigned = True scanSiteList = taskSpec.site.split(',') else: # site sitePreAssigned = True scanSiteList = [taskSpec.site] tmpLog.debug('site={0} is pre-assigned criteria=+preassign'.format(taskSpec.site)) elif inputChunk.getPreassignedSite() != None: siteListPreAssigned = True scanSiteList = DataServiceUtils.getSitesShareDDM(self.siteMapper,inputChunk.getPreassignedSite()) scanSiteList.append(inputChunk.getPreassignedSite()) tmpMsg = 'use site={0} since they share DDM endpoints with orinal_site={1} which is pre-assigned in masterDS '.format(str(scanSiteList), inputChunk.getPreassignedSite()) tmpMsg += 'criteria=+premerge' tmpLog.debug(tmpMsg) else: scanSiteList = self.siteMapper.getCloud(cloudName)['sites'] tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList))) # get job statistics tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel) if not tmpSt: tmpLog.error('failed to get job statistics') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # T1 if not taskSpec.useWorldCloud(): t1Sites = [self.siteMapper.getCloud(cloudName)['source']] # hospital sites if self.hospitalQueueMap.has_key(cloudName): t1Sites += self.hospitalQueueMap[cloudName] else: # get destination for WORLD cloud t1Sites = [] tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,datasetTypes=['log']) 
for datasetSpec in datasetSpecList: if not datasetSpec.destination in t1Sites: t1Sites.append(datasetSpec.destination) # sites sharing SE with T1 sitesShareSeT1 = DataServiceUtils.getSitesShareDDM(self.siteMapper,t1Sites[0]) # all T1 allT1Sites = self.getAllT1Sites() # core count if inputChunk.isMerging and taskSpec.mergeCoreCount != None: taskCoreCount = taskSpec.mergeCoreCount else: taskCoreCount = taskSpec.coreCount # MP if taskCoreCount != None and taskCoreCount > 1: # use MCORE only useMP = 'only' elif taskCoreCount == 0: # use MCORE and normal useMP = 'any' else: # not use MCORE useMP = 'unuse' # get workQueue workQueue = self.taskBufferIF.getWorkQueueMap().getQueueWithID(taskSpec.workQueue_ID) ###################################### # selection for status if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check site status skipFlag = False if tmpSiteSpec.status != 'online': skipFlag = True if not skipFlag: newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for reprocessing if taskSpec.processingType == 'reprocessing': newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check schedconfig.validatedreleases if tmpSiteSpec.validatedreleases == ['True']: newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to validatedreleases <> True criteria=-validated' % tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList))) if scanSiteList == []: 
tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for high priorities t1WeightForHighPrio = 1 if (taskSpec.currentPriority >= 900 or inputChunk.useScout()) \ and not sitePreAssigned and not siteListPreAssigned: t1WeightForHighPrio = 100 newScanSiteList = [] for tmpSiteName in scanSiteList: if tmpSiteName in t1Sites+sitesShareSeT1+allT1Sites: newScanSiteList.append(tmpSiteName) else: tmpMsg = ' skip site={0} due to highPrio/scouts which needs to run at T1 or sites associated with {1} T1 SE '.format(tmpSiteName, cloudName) tmpMsg += 'criteria=-scoutprio' tmpLog.debug(tmpMsg) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for highPrio/scouts'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection to avoid slow or inactive sites if (taskSpec.currentPriority >= 800 or inputChunk.useScout() or \ inputChunk.isMerging or taskSpec.mergeOutput()) \ and not sitePreAssigned: # get inactive sites inactiveTimeLimit = 2 inactiveSites = self.taskBufferIF.getInactiveSites_JEDI('production',inactiveTimeLimit) newScanSiteList = [] tmpMsgList = [] for tmpSiteName in scanSiteList: nToGetAll = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'starting') if tmpSiteName in ['BNL_CLOUD','BNL_CLOUD_MCORE','ATLAS_OPP_OSG']: tmpMsg = ' skip site={0} since high prio/scouts/merge needs to avoid slow sites '.format(tmpSiteName) tmpMsg += 'criteria=-slow' tmpMsgList.append(tmpMsg) elif tmpSiteName in inactiveSites and nToGetAll > 0: tmpMsg = ' skip site={0} since high prio/scouts/merge needs to avoid inactive sites (laststart is older than {1}h) '.format(tmpSiteName, inactiveTimeLimit) tmpMsg += 
'criteria=-inactive' tmpMsgList.append(tmpMsg) else: newScanSiteList.append(tmpSiteName) if newScanSiteList != []: scanSiteList = newScanSiteList for tmpMsg in tmpMsgList: tmpLog.debug(tmpMsg) tmpLog.debug('{0} candidates passed for slowness/inactive check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for data availability if not sitePreAssigned and not siteListPreAssigned: for datasetSpec in inputChunk.getDatasets(): datasetName = datasetSpec.datasetName # ignore DBR if DataServiceUtils.isDBR(datasetName): continue if not self.dataSiteMap.has_key(datasetName): # get the list of sites where data is available tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName)) tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper, self.ddmIF,datasetName, datasetSpec.storageToken) if tmpSt == self.SC_FAILED: tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError if tmpSt == self.SC_FATAL: tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retFatal # append self.dataSiteMap[datasetName] = tmpRet tmpLog.debug('map of data availability : {0}'.format(str(tmpRet))) """ # check if T1 has the data if self.dataSiteMap[datasetName].has_key(cloudName): cloudHasData = True else: cloudHasData = False t1hasData = False if cloudHasData: for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems(): if tmpSeVal['state'] == 'complete': t1hasData = True break # T1 has incomplete data while no data at T2 if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] 
== []: # use incomplete data at T1 anyway t1hasData = True # data is missing at T1 if not t1hasData: tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName)) # make subscription to T1 # FIXME pass # use T2 until data is complete at T1 newScanSiteList = [] for tmpSiteName in scanSiteList: if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']: newScanSiteList.append(tmpSiteName) else: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) if tmpSiteSpec.cloud != cloudName: tmpLog.debug(' skip %s due to foreign T2' % tmpSiteName) else: tmpLog.debug(' skip %s due to missing data at T2' % tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError """ ###################################### # selection for fairshare if not (workQueue.queue_type in ['managed'] and workQueue.queue_name in ['test','validation']): newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,inputChunk.isMerging,tmpLog): tmpLog.debug(' skip site={0} due to zero share criteria=-zeroshare'.format(tmpSiteName)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for I/O intensive tasks # FIXME pass ###################################### # selection for MP if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = 
self.siteMapper.getSite(tmpSiteName) # check at the site if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \ (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]): newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to core mismatch site:%s <> task:%s criteria=-cpucore' % \ (tmpSiteName,tmpSiteSpec.coreCount,taskCoreCount)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for release if taskSpec.transHome != None: if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None: # only cache is checked for normal tasks siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList, caches=taskSpec.transHome, cmtConfig=taskSpec.architecture) else: # nightlies siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList, releases='CVMFS') # releases='nightlies', # cmtConfig=taskSpec.architecture) newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # release check is disabled or release is available if tmpSiteSpec.releases == ['ANY'] or \ tmpSiteName in ['CERN-RELEASE']: newScanSiteList.append(tmpSiteName) elif tmpSiteName in siteListWithSW: newScanSiteList.append(tmpSiteName) else: # release is unavailable tmpLog.debug(' skip site=%s due to missing cache=%s:%s criteria=-cache' % \ (tmpSiteName,taskSpec.transHome,taskSpec.architecture)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList), taskSpec.transHome, taskSpec.architecture)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError 
###################################### # selection for memory minRamCount = max(taskSpec.ramCount, inputChunk.ramCount) if not minRamCount in [0,None]: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory: tmpMsg = ' skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(tmpSiteName, tmpSiteSpec.maxmemory, minRamCount) tmpMsg += 'criteria=-lowmemory' tmpLog.debug(tmpMsg) continue if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory: tmpMsg = ' skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(tmpSiteName, tmpSiteSpec.minmemory, minRamCount) tmpMsg += 'criteria=-highmemory' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed memory check {1}({2})'.format(len(scanSiteList), minRamCount,taskSpec.ramUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for scratch disk if taskSpec.outputScaleWithEvents(): minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(getNumEvents=True) else: minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(effectiveSize=True) minDiskCount = minDiskCount + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize() minDiskCount = minDiskCount / 1024 / 1024 newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir: tmpMsg = ' skip site={0} due to small scratch disk {1} less than {2} '.format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount) tmpMsg += 'criteria=-disk' tmpLog.debug(tmpMsg) 
continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed scratch disk check minDiskCount>{1}MB'.format(len(scanSiteList), minDiskCount)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for available space in SE newScanSiteList = [] for tmpSiteName in scanSiteList: # don't check for T1 if tmpSiteName in t1Sites: pass else: # check at the site tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # the number of jobs which will produce outputs nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'throttled') + \ AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running') # the size of input files which will be copied to the site movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName) if movingInputSize == None: tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # free space - inputs - outputs(250MB*nJobs) must be >= 200GB outSizePerJob = 0.250 diskThreshold = 200 tmpSiteSpaceMap = self.ddmIF.getRseUsage(tmpSiteSpec.ddm) if tmpSiteSpaceMap != {}: tmpSiteFreeSpace = tmpSiteSpaceMap['free'] tmpSpaceSize = tmpSiteFreeSpace - movingInputSize - nRemJobs * outSizePerJob if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold: tmpLog.debug(' skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteFreeSpace, movingInputSize,outSizePerJob, nRemJobs,diskThreshold)) continue # check if blacklisted if self.ddmIF.isBlackListedEP(tmpSiteSpec.ddm): tmpLog.debug(' skip site={0} since endpoint={1} is blacklisted in DDM 
criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for walltime if not taskSpec.useHS06(): tmpMaxAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True) minWalltime = taskSpec.walltime * tmpMaxAtomSize strMinWalltime = 'walltime*inputSize={0}*{1}'.format(taskSpec.walltime,tmpMaxAtomSize) else: tmpMaxAtomSize = inputChunk.getMaxAtomSize(getNumEvents=True) minWalltime = taskSpec.cpuTime * tmpMaxAtomSize strMinWalltime = 'cpuTime*nEventsPerJob={0}*{1}'.format(taskSpec.cpuTime,tmpMaxAtomSize) if minWalltime != None or inputChunk.useScout(): newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) siteMaxTime = tmpSiteSpec.maxtime origSiteMaxTime = siteMaxTime # sending scouts merge or wallime-undefined jobs to only sites where walltime is more than 1 day if inputChunk.useScout() or inputChunk.isMerging or \ (taskSpec.walltime in [0,None] and taskSpec.walltimeUnit in ['',None] and taskSpec.cpuTimeUnit in ['',None]): minTimeForZeroWalltime = 24*60*60 if siteMaxTime != 0 and siteMaxTime < minTimeForZeroWalltime: tmpMsg = ' skip site={0} due to site walltime {1} (site upper limit) insufficient '.format(tmpSiteName, siteMaxTime) if inputChunk.useScout(): tmpMsg += 'for scouts ({0} at least) '.format(minTimeForZeroWalltime) tmpMsg += 'criteria=-scoutwalltime' else: tmpMsg += 'for zero walltime ({0} at least) '.format(minTimeForZeroWalltime) tmpMsg += 'criteria=-zerowalltime' tmpLog.debug(tmpMsg) continue # check max walltime at the site tmpSiteStr = '{0}'.format(siteMaxTime) if taskSpec.useHS06(): oldSiteMaxTime = siteMaxTime siteMaxTime -= taskSpec.baseWalltime 
tmpSiteStr = '({0}-{1})'.format(oldSiteMaxTime,taskSpec.baseWalltime) if not siteMaxTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]: siteMaxTime *= tmpSiteSpec.coreCount tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount) if taskSpec.useHS06(): if not siteMaxTime in [None,0] and not tmpSiteSpec.corepower in [None,0]: siteMaxTime *= tmpSiteSpec.corepower tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower) siteMaxTime *= float(taskSpec.cpuEfficiency) / 100.0 siteMaxTime = long(siteMaxTime) tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency) if origSiteMaxTime != 0 and minWalltime > siteMaxTime: tmpMsg = ' skip site={0} due to short site walltime {1} (site upper limit) less than {2} '.format(tmpSiteName, tmpSiteStr, strMinWalltime) tmpMsg += 'criteria=-shortwalltime' tmpLog.debug(tmpMsg) continue # check min walltime at the site siteMinTime = tmpSiteSpec.mintime origSiteMinTime = siteMinTime tmpSiteStr = '{0}'.format(siteMinTime) if taskSpec.useHS06(): oldSiteMinTime = siteMinTime siteMinTime -= taskSpec.baseWalltime tmpSiteStr = '({0}-{1})'.format(oldSiteMinTime,taskSpec.baseWalltime) if not siteMinTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]: siteMinTime *= tmpSiteSpec.coreCount tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount) if taskSpec.useHS06(): if not siteMinTime in [None,0] and not tmpSiteSpec.corepower in [None,0]: siteMinTime *= tmpSiteSpec.corepower tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower) siteMinTime *= float(taskSpec.cpuEfficiency) / 100.0 siteMinTime = long(siteMinTime) tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency) if origSiteMinTime != 0 and minWalltime < siteMinTime: tmpMsg = ' skip site {0} due to short job walltime {1} (site lower limit) greater than {2} '.format(tmpSiteName, tmpSiteStr, strMinWalltime) tmpMsg += 'criteria=-longwalltime' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList if not taskSpec.useHS06(): tmpLog.debug('{0} candidates passed walltime 
check {1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit)) else: tmpLog.debug('{0} candidates passed walltime check {1}({2}*nEventsPerJob)'.format(len(scanSiteList),strMinWalltime,taskSpec.cpuTimeUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for network connectivity if not sitePreAssigned: ipConnectivity = taskSpec.getIpConnectivity() if ipConnectivity != None: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.wnconnectivity == 'full': pass elif tmpSiteSpec.wnconnectivity == 'http' and ipConnectivity == 'http': pass else: tmpMsg = ' skip site={0} due to insufficient connectivity (site={1}) for task={2} '.format(tmpSiteName, tmpSiteSpec.wnconnectivity, ipConnectivity) tmpMsg += 'criteria=-network' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed network check ({1})'.format(len(scanSiteList), ipConnectivity)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for event service if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # event service if taskSpec.useEventService(): if tmpSiteSpec.getJobSeed() == 'std': tmpMsg = ' skip site={0} since EventService is not allowed '.format(tmpSiteName) tmpMsg += 'criteria=-es' tmpLog.debug(tmpMsg) continue else: if tmpSiteSpec.getJobSeed() == 'es': tmpMsg = ' skip site={0} since only EventService is allowed '.format(tmpSiteName) tmpMsg += 'criteria=-nones' tmpLog.debug(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = 
newScanSiteList tmpLog.debug('{0} candidates passed EventService check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for transferring newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # limit def_maxTransferring = 2000 if tmpSiteSpec.transferringlimit == 0: # use default value maxTransferring = def_maxTransferring else: maxTransferring = tmpSiteSpec.transferringlimit # check at the site nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName) nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName) if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']: tmpLog.debug(' skip site=%s due to too many transferring=%s greater than max(%s,2x%s) criteria=-transferring' % \ (tmpSiteName,nTraJobs,def_maxTransferring,nRunJobs)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for T1 weight t1Weight = taskSpec.getT1Weight() if t1Weight == 0: # use T1 weight in cloudconfig t1Weight = self.siteMapper.getCloud(cloudName)['weight'] if t1Weight < 0: newScanSiteList = [] for tmpSiteName in scanSiteList: if not tmpSiteName in t1Sites: tmpLog.debug(' skip site={0} due to negative T1 weight criteria=-t1weight'.format(tmpSiteName)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList t1Weight = 1 t1Weight = max(t1Weight,t1WeightForHighPrio) tmpLog.debug('T1 weight {0}'.format(t1Weight)) tmpLog.debug('{0} 
candidates passed T1 weight check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for nPilot nPilotMap = {} if not sitePreAssigned: nWNmap = self.taskBufferIF.getCurrentSiteData() newScanSiteList = [] for tmpSiteName in scanSiteList: # check at the site nPilot = 0 if nWNmap.has_key(tmpSiteName): nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob'] if nPilot == 0 and not 'test' in taskSpec.prodSourceLabel: tmpLog.debug(' skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName) continue newScanSiteList.append(tmpSiteName) nPilotMap[tmpSiteName] = nPilot scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError ###################################### # get available files normalizeFactors = {} availableFileMap = {} for datasetSpec in inputChunk.getDatasets(): try: # mapping between sites and storage endpoints siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper, ignoreCC=True) # disable file lookup for merge jobs or secondary datasets checkCompleteness = True useCompleteOnly = False if inputChunk.isMerging: checkCompleteness = False if not datasetSpec.isMaster(): useCompleteOnly = True # get available files per site/endpoint tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec, siteStorageEP, self.siteMapper, ngGroup=[1], checkCompleteness=checkCompleteness, storageToken=datasetSpec.storageToken, useCompleteOnly=useCompleteOnly) if tmpAvFileMap == None: raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed' availableFileMap[datasetSpec.datasetName] = tmpAvFileMap except: errtype,errvalue = 
sys.exc_info()[:2] tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # loop over all sites to get the size of available files for tmpSiteName in scanSiteList: if not normalizeFactors.has_key(tmpSiteName): normalizeFactors[tmpSiteName] = 0 # get the total size of available files if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName): availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName] for tmpFileSpec in \ availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']: normalizeFactors[tmpSiteName] += tmpFileSpec.fsize # get max total size tmpTotalSizes = normalizeFactors.values() tmpTotalSizes.sort() if tmpTotalSizes != []: totalSize = tmpTotalSizes.pop() else: totalSize = 0 ###################################### # calculate weight tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo, taskSpec.prodSourceLabel) if not tmpSt: tmpLog.error('failed to get job statistics with priority') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError tmpLog.debug('calculate weight and check cap for {0} candidates'.format(len(scanSiteList))) weightMapPrimary = {} weightMapSecondary = {} newScanSiteList = [] for tmpSiteName in scanSiteList: nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',None,taskSpec.workQueue_ID) nDefined = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'definied',None,taskSpec.workQueue_ID) + self.getLiveCount(tmpSiteName) nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',None,taskSpec.workQueue_ID) nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,taskSpec.workQueue_ID) + \ AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,taskSpec.workQueue_ID) nStarting = 
AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting',None,taskSpec.workQueue_ID) if tmpSiteName in nPilotMap: nPilot = nPilotMap[tmpSiteName] else: nPilot = 0 manyAssigned = float(nAssigned + 1) / float(nActivated + 1) manyAssigned = min(2.0,manyAssigned) manyAssigned = max(1.0,manyAssigned) weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + nDefined + 1) / manyAssigned weightStr = 'nRun={0} nAct={1} nAss={2} nStart={3} nDef={4} totalSize={5} manyAss={6} nPilot={7} '.format(nRunning,nActivated,nAssigned, nStarting,nDefined, totalSize,manyAssigned, nPilot) # normalize weights by taking data availability into account if totalSize != 0: weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize) weightStr += 'availableSize={0} '.format(normalizeFactors[tmpSiteName]) # T1 weight if tmpSiteName in t1Sites+sitesShareSeT1: weight *= t1Weight weightStr += 't1W={0} '.format(t1Weight) # make candidate siteCandidateSpec = SiteCandidate(tmpSiteName) # set weight and params siteCandidateSpec.weight = weight siteCandidateSpec.nRunningJobs = nRunning siteCandidateSpec.nQueuedJobs = nActivated + nAssigned + nStarting siteCandidateSpec.nAssignedJobs = nAssigned # set available files for tmpDatasetName,availableFiles in availableFileMap.iteritems(): if availableFiles.has_key(tmpSiteName): siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['localdisk'] siteCandidateSpec.localTapeFiles += availableFiles[tmpSiteName]['localtape'] siteCandidateSpec.cacheFiles += availableFiles[tmpSiteName]['cache'] siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote'] # check if site is locked for WORLD lockedByBrokerage = False if taskSpec.useWorldCloud(): lockedByBrokerage = self.checkSiteLock(taskSpec.vo,taskSpec.prodSourceLabel, tmpSiteName,taskSpec.workQueue_ID) # check cap with nRunning cutOffValue = 20 cutOffFactor = 2 nRunningCap = max(cutOffValue,cutOffFactor*nRunning) nRunningCap = max(nRunningCap,nPilot) 
okMsg = ' use site={0} with weight={1} {2} criteria=+use'.format(tmpSiteName,weight,weightStr) okAsPrimay = False if lockedByBrokerage: ngMsg = ' skip site={0} due to locked by another brokerage '.format(tmpSiteName) ngMsg += 'criteria=-lock' elif (nDefined+nActivated+nAssigned+nStarting) > nRunningCap: ngMsg = ' skip site={0} due to nDefined+nActivated+nAssigned+nStarting={1} '.format(tmpSiteName, nDefined+nActivated+nAssigned+nStarting) ngMsg += 'greater than max({0},{1}*nRunning={1}*{2},nPilot={3}) '.format(cutOffValue, cutOffFactor, nRunning, nPilot) ngMsg += 'criteria=-cap' else: ngMsg = ' skip site={0} due to low weight '.format(tmpSiteName) ngMsg += 'criteria=-loweigh' okAsPrimay = True # use primay if cap/lock check is passed if okAsPrimay: weightMap = weightMapPrimary else: weightMap = weightMapSecondary # add weight if not weight in weightMap: weightMap[weight] = [] weightMap[weight].append((siteCandidateSpec,okMsg,ngMsg)) # use second candidates if no primary candidates passed cap/lock check if weightMapPrimary == {}: tmpLog.debug('use second candidates since no sites pass cap/lock check') weightMap = weightMapSecondary # use hightest 3 weights weightRank = 3 else: weightMap = weightMapPrimary # use all weights weightRank = None # dump NG message for tmpWeight in weightMapSecondary.keys(): for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMapSecondary[tmpWeight]: tmpLog.debug(tmpNgMsg) # max candidates for WORLD if taskSpec.useWorldCloud(): maxSiteCandidates = 10 else: maxSiteCandidates = None newScanSiteList = [] weightList = weightMap.keys() weightList.sort() weightList.reverse() for weightIdx,tmpWeight in enumerate(weightList): for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMap[tmpWeight]: if (weightRank == None or weightIdx < weightRank) and \ (maxSiteCandidates == None or len(newScanSiteList) < maxSiteCandidates): # use site tmpLog.debug(tmpOkMsg) newScanSiteList.append(siteCandidateSpec.siteName) inputChunk.addSiteCandidate(siteCandidateSpec) 
else: # dump NG message tmpLog.debug(tmpNgMsg) scanSiteList = newScanSiteList # final check if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) return retTmpError # lock sites for WORLD if taskSpec.useWorldCloud(): for tmpSiteName in scanSiteList: self.lockSite(taskSpec.vo,taskSpec.prodSourceLabel,tmpSiteName,taskSpec.workQueue_ID) tmpLog.debug('final {0} candidates'.format(len(scanSiteList))) # return self.sendLogMessage(tmpLog) tmpLog.debug('done') return self.SC_SUCCEEDED,inputChunk
def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
    """Select candidate sites for a task and attach them to inputChunk.

    Starting from the pre-assigned site (task-level or master-dataset
    level) or from all sites of the cloud, the candidate list is narrowed
    by a sequence of filters: site status, reprocessing validation,
    high-priority T1 restriction, data availability, zero fair share,
    core count, software release, memory, scratch disk, free space in
    the storage element, walltime, number of transferring jobs and pilot
    activity. Each surviving site is given a weight and is added to
    inputChunk as a SiteCandidate.

    Parameters:
        taskSpec: task specification; read for site, priority, core
            count, release, resource requirements and work queue, and
            updated with an error diagnostic on failure.
        cloudName: name of the cloud in which the task is brokered.
        inputChunk: chunk of input data; supplies the datasets and
            receives the site candidates.
        taskParamMap: not used by this method.

    Returns:
        (self.SC_SUCCEEDED, inputChunk) on success,
        (self.SC_FAILED, inputChunk) when no candidate is left or a
        lookup failed, and (self.SC_FATAL, inputChunk) when the data
        location lookup reported a fatal error.
    """
    # make logger
    tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    tmpLog.debug('start')
    # return for failure
    retFatal = self.SC_FATAL,inputChunk
    retTmpError = self.SC_FAILED,inputChunk
    noCandidates = 'no candidates'

    def _abort(errMsg,retVal):
        # log the error, attach the uploaded log to the task, hand back the return value
        tmpLog.error(errMsg)
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retVal

    # get sites in the cloud
    if taskSpec.site not in ['',None]:
        scanSiteList = [taskSpec.site]
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
    elif inputChunk.getPreassignedSite() is not None:
        scanSiteList = [inputChunk.getPreassignedSite()]
        tmpLog.debug('site={0} is pre-assigned in masterDS'.format(inputChunk.getPreassignedSite()))
    else:
        scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
        tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
    # get job statistics
    tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel)
    if not tmpSt:
        return _abort('failed to get job statistics',retTmpError)
    # T1
    t1Sites = [self.siteMapper.getCloud(cloudName)['source']]
    # hospital sites
    if cloudName in self.hospitalQueueMap:
        t1Sites += self.hospitalQueueMap[cloudName]
    # MP
    useMP = taskSpec.coreCount is not None and taskSpec.coreCount > 1
    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check site status
        if tmpSiteSpec.status == 'online':
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug(' skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
    if not scanSiteList:
        return _abort(noCandidates,retTmpError)
    ######################################
    # selection for reprocessing
    if taskSpec.processingType == 'reprocessing':
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check schedconfig.validatedreleases
            if tmpSiteSpec.validatedreleases == ['True']:
                newScanSiteList.append(tmpSiteName)
            else:
                tmpLog.debug(' skip %s due to validatedreleases != True' % tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList)))
        if not scanSiteList:
            return _abort(noCandidates,retTmpError)
    ######################################
    # selection for high priorities
    if taskSpec.currentPriority >= 950 and not useMP:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            if tmpSiteName in t1Sites:
                newScanSiteList.append(tmpSiteName)
            else:
                tmpLog.debug(' skip %s due to high prio which needs to run at T1' % tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for high prio'.format(len(scanSiteList)))
        if not scanSiteList:
            return _abort(noCandidates,retTmpError)
    ######################################
    # selection for data availability
    for datasetSpec in inputChunk.getDatasets():
        datasetName = datasetSpec.datasetName
        if datasetName not in self.dataSiteMap:
            # get the list of sites where data is available
            tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName))
            tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,
                                                             self.ddmIF,datasetName)
            if tmpSt == self.SC_FAILED:
                return _abort('failed to get the list of sites where data is available, since %s' % tmpRet,
                              retTmpError)
            if tmpSt == self.SC_FATAL:
                return _abort('fatal error when getting the list of sites where data is available, since %s' % tmpRet,
                              retFatal)
            # append; the map is kept on self so later calls reuse the lookup
            self.dataSiteMap[datasetName] = tmpRet
            tmpLog.debug('map of data availability : {0}'.format(str(tmpRet)))
        # check if T1 has the data
        cloudHasData = cloudName in self.dataSiteMap[datasetName]
        t1hasData = False
        if cloudHasData:
            cloudDataMap = self.dataSiteMap[datasetName][cloudName]
            for tmpSeVal in cloudDataMap['t1'].values():
                if tmpSeVal['state'] == 'complete':
                    t1hasData = True
                    break
            # T1 has incomplete data while no data at T2
            if not t1hasData and cloudDataMap['t2'] == []:
                # use incomplete data at T1 anyway
                t1hasData = True
        # data is missing at T1
        if not t1hasData:
            tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName))
            # make subscription to T1
            # FIXME
            # use T2 until data is complete at T1
            newScanSiteList = []
            for tmpSiteName in scanSiteList:
                if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']:
                    newScanSiteList.append(tmpSiteName)
                else:
                    tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                    if tmpSiteSpec.cloud != cloudName:
                        tmpLog.debug(' skip %s due to foreign T2' % tmpSiteName)
                    else:
                        tmpLog.debug(' skip %s due to missing data at T2' % tmpSiteName)
            scanSiteList = newScanSiteList
            tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName))
            if not scanSiteList:
                return _abort(noCandidates,retTmpError)
    ######################################
    # selection for fairshare
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site
        if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,tmpLog):
            tmpLog.debug(' skip {0} due to zero share'.format(tmpSiteName))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList)))
    if not scanSiteList:
        return _abort(noCandidates,retTmpError)
    ######################################
    # selection for I/O intensive tasks
    # FIXME
    ######################################
    # selection for MP
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site : multi-core tasks go to multi-core sites only and vice versa
        if (useMP and tmpSiteSpec.coreCount > 1) or \
                (not useMP and tmpSiteSpec.coreCount in [0,1,None]):
            newScanSiteList.append(tmpSiteName)
        else:
            tmpLog.debug(' skip %s due to core mismatch site:%s != task:%s' % \
                         (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
    if not scanSiteList:
        return _abort(noCandidates,retTmpError)
    ######################################
    # selection for release
    if taskSpec.transHome is not None:
        if re.search(r'rel_\d+(\n|$)',taskSpec.transHome) is None:
            # only cache is checked for normal tasks
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                     caches=taskSpec.transHome,
                                                                     cmtConfig=taskSpec.architecture)
        else:
            # nightlies
            siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                     releases='nightlies',
                                                                     cmtConfig=taskSpec.architecture)
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # release check is disabled or release is available
            if tmpSiteSpec.releases == ['ANY'] or \
                    tmpSiteSpec.cloud in ['ND'] or \
                    tmpSiteName in ['CERN-RELEASE'] or \
                    tmpSiteName in siteListWithSW:
                newScanSiteList.append(tmpSiteName)
            else:
                # release is unavailable
                tmpLog.debug(' skip %s due to missing rel/cache %s:%s' % \
                             (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList),
                                                                              taskSpec.transHome,
                                                                              taskSpec.architecture))
        if not scanSiteList:
            return _abort(noCandidates,retTmpError)
    ######################################
    # selection for memory
    minRamCount = taskSpec.ramCount
    if minRamCount not in [0,None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site : a limit of 0 means the site sets no limit
            if tmpSiteSpec.maxmemory != 0 and minRamCount > tmpSiteSpec.maxmemory:
                tmpLog.debug(' skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                    tmpSiteSpec.maxmemory,
                                                                                                    minRamCount))
                continue
            if tmpSiteSpec.minmemory != 0 and minRamCount < tmpSiteSpec.minmemory:
                tmpLog.debug(' skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                   tmpSiteSpec.minmemory,
                                                                                                   minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                         minRamCount,taskSpec.ramUnit))
        if not scanSiteList:
            return _abort(noCandidates,retTmpError)
    ######################################
    # selection for scratch disk
    minDiskCount = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
    # presumably converts bytes to MB to match maxwdir -- TODO confirm units
    minDiskCount = minDiskCount / 1024 / 1024
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site
        if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir:
            tmpLog.debug(' skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                tmpSiteSpec.maxwdir,
                                                                                minDiskCount))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
    if not scanSiteList:
        return _abort(noCandidates,retTmpError)
    ######################################
    # selection for available space in SE
    # free space - inputs - outputs(250MB*nJobs) must be >= 200GB
    outSizePerJob = 0.250
    diskThreshold = 200
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # don't check for T1
        if tmpSiteName not in t1Sites:
            # check at the site
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # the number of jobs which will produce outputs
            nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \
                       AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                       AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running')
            # the size of input files which will be copied to the site
            movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName)
            if movingInputSize is None:
                return _abort('failed to get the size of input file moving to {0}'.format(tmpSiteName),
                              retTmpError)
            tmpSpaceSize = tmpSiteSpec.space - movingInputSize - nRemJobs * outSizePerJob
            if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
                tmpLog.debug(' skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteSpec.space,
                                                                                                  movingInputSize,outSizePerJob,
                                                                                                  nRemJobs,diskThreshold))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
    if not scanSiteList:
        return _abort(noCandidates,retTmpError)
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if minWalltime not in [0,None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site : a limit of 0 means the site sets no limit
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(' skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.maxtime,
                                                                                                      minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(' skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                     tmpSiteSpec.mintime,
                                                                                                     minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
        if not scanSiteList:
            return _abort(noCandidates,retTmpError)
    ######################################
    # selection for transferring
    def_maxTransferring = 2000
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # limit
        if tmpSiteSpec.transferringlimit == 0:
            # use default value
            maxTransferring = def_maxTransferring
        else:
            maxTransferring = tmpSiteSpec.transferringlimit
        # check at the site
        nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName)
        nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName)
        if max(maxTransferring,2*nRunJobs) < nTraJobs and tmpSiteSpec.cloud not in ['ND']:
            # report the limit actually used in the comparison, which is the
            # site-specific one when transferringlimit is set
            tmpLog.debug(' skip %s due to too many transferring %s > max(%s,2x%s)' % \
                         (tmpSiteName,nTraJobs,maxTransferring,nRunJobs))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList)))
    if not scanSiteList:
        return _abort(noCandidates,retTmpError)
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        nPilot = 0
        if tmpSiteName in nWNmap:
            nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
            tmpLog.debug(' skip %s due to no pilot' % tmpSiteName)
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
    if not scanSiteList:
        return _abort(noCandidates,retTmpError)
    ######################################
    # get available files
    totalSize = 0
    normalizeFactors = {}
    availableFileMap = {}
    for datasetSpec in inputChunk.getDatasets():
        try:
            # mapping between sites and storage endpoints
            siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper)
            # get available files per site/endpoint
            tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                        siteStorageEP,
                                                        self.siteMapper,
                                                        ngGroup=[1])
            if tmpAvFileMap is None:
                raise Interaction.JEDITemporaryError('ddmIF.getAvailableFiles failed')
            availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
        except:
            # deliberate catch-all : any failure in the file lookup is
            # reported as a temporary error instead of propagating
            errtype,errvalue = sys.exc_info()[:2]
            return _abort('failed to get available files with %s %s' % (errtype.__name__,errvalue),
                          retTmpError)
        # get total size
        totalSize += datasetSpec.getSize()
        # loop over all sites to get the size of available files
        for tmpSiteName in scanSiteList:
            if tmpSiteName not in normalizeFactors:
                normalizeFactors[tmpSiteName] = 0
            # get the total size of available files
            if tmpSiteName in tmpAvFileMap:
                availableFiles = tmpAvFileMap[tmpSiteName]
                for tmpFileSpec in \
                        availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']:
                    normalizeFactors[tmpSiteName] += tmpFileSpec.fsize
    ######################################
    # calculate weight
    tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                taskSpec.prodSourceLabel,
                                                                                taskSpec.currentPriority)
    if not tmpSt:
        return _abort('failed to get job statistics with priority',retTmpError)
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    for tmpSiteName in scanSiteList:
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',cloudName,taskSpec.workQueue_ID)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',cloudName,taskSpec.workQueue_ID)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',cloudName,taskSpec.workQueue_ID)
        # more running jobs raise the weight; queued (activated/assigned) jobs lower it
        weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
        # normalize weights by taking data availability into account
        if totalSize != 0:
            weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize)
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        # set weight
        siteCandidateSpec.weight = weight
        # set available files
        for availableFiles in availableFileMap.values():
            if tmpSiteName in availableFiles:
                siteCandidateSpec.localDiskFiles += availableFiles[tmpSiteName]['localdisk']
                siteCandidateSpec.localTapeFiles += availableFiles[tmpSiteName]['localtape']
                siteCandidateSpec.cacheFiles += availableFiles[tmpSiteName]['cache']
                siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
        # append
        inputChunk.addSiteCandidate(siteCandidateSpec)
        tmpLog.debug(' use {0} with weight={1}'.format(tmpSiteName,weight))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED,inputChunk