Python AtlasBrokerUtils.getNumJobs Examples

Programming Language: Python

Class/Type: AtlasBrokerUtils

Method/Function: getNumJobs

Examples at hotexamples.com: 7

Python AtlasBrokerUtils.getNumJobs - 7 examples found. These are the top rated real world Python examples of AtlasBrokerUtils.getNumJobs extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

getNumJobs(3)

getAnalSitesWithData(1)

getAnalSitesWithDataDisk(1)

getDictToSetNucleus(1)

getHospitalQueues(1)

getNucleiWithData(1)

getSatelliteSites(1)

getSiteInputStorageEndpointMap(1)

Example #1

Show file

 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(
                             taskSpec.jediTaskID,
                             datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # get primary site candidates
     sitePreAssigned = False
     excludeList = []
     includeList = None
     scanSiteList = []
     # get list of site access
     siteAccessList = self.taskBufferIF.listSiteAccess(
         None, taskSpec.userName)
     siteAccessMap = {}
     for tmpSiteName, tmpAccess in siteAccessList:
         siteAccessMap[tmpSiteName] = tmpAccess
     # site limitation
     if taskSpec.useLimitedSites():
         if 'excludedSite' in taskParamMap:
             excludeList = taskParamMap['excludedSite']
             # str to list for task retry
             try:
                 if type(excludeList) != types.ListType:
                     excludeList = excludeList.split(',')
             except:
                 pass
         if 'includedSite' in taskParamMap:
             includeList = taskParamMap['includedSite']
             # str to list for task retry
             if includeList == '':
                 includeList = None
             try:
                 if type(includeList) != types.ListType:
                     includeList = includeList.split(',')
             except:
                 pass
     # loop over all sites
     for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
         if tmpSiteSpec.type == 'analysis':
             scanSiteList.append(siteName)
     # preassigned
     if not taskSpec.site in ['', None]:
         # site is pre-assigned
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         sitePreAssigned = True
         if not taskSpec.site in scanSiteList:
             scanSiteList.append(taskSpec.site)
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     # allowed remote access protocol
     allowedRemoteProtocol = 'fax'
     # MP
     if taskSpec.coreCount != None and taskSpec.coreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskSpec.coreCount == 0:
         # use MCORE and normal
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status in ['offline']:
             skipFlag = True
         elif tmpSiteSpec.status in ['brokeroff', 'test']:
             if not sitePreAssigned:
                 skipFlag = True
             elif tmpSiteName != taskSpec.site:
                 skipFlag = True
         if not skipFlag:
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug(
                 '  skip site=%s due to status=%s criteria=-status' %
                 (tmpSiteName, tmpSiteSpec.status))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                     (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(
             len(scanSiteList), useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if taskSpec.transHome.startswith('ROOT'):
             # hack until x86_64-slc6-gcc47-opt is published in installedsw
             if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                 tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
             else:
                 tmpCmtConfig = taskSpec.architecture
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                 scanSiteList, cmtConfig=tmpCmtConfig, onlyCmtConfig=True)
         elif 'AthAnalysis' in taskSpec.transHome or re.search(
                 'Ath[a-zA-Z]+Base', taskSpec.transHome) != None:
             # AthAnalysis
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                 scanSiteList,
                 cmtConfig=taskSpec.architecture,
                 onlyCmtConfig=True)
         else:
             # remove AnalysisTransforms-
             transHome = re.sub('^[^-]+-*', '', taskSpec.transHome)
             transHome = re.sub('_', '-', transHome)
             if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \
                     re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None :
                 # cache is checked
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList,
                     caches=transHome,
                     cmtConfig=taskSpec.architecture)
             elif transHome == '' and taskSpec.transUses != None:
                 # remove Atlas-
                 transUses = taskSpec.transUses.split('-')[-1]
                 # release is checked
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList,
                     releases=transUses,
                     cmtConfig=taskSpec.architecture)
             else:
                 # nightlies
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList, releases='CVMFS')
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format(
             len(scanSiteList), taskSpec.transUses, taskSpec.transHome,
             taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount = inputChunk.getMaxRamCount()
     minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
     if not minRamCount in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # site max memory requirement
             if not tmpSiteSpec.maxrss in [0, None]:
                 site_maxmemory = tmpSiteSpec.maxrss
             else:
                 site_maxmemory = tmpSiteSpec.maxmemory
             if not site_maxmemory in [
                     0, None
             ] and minRamCount != 0 and minRamCount > site_maxmemory:
                 tmpLog.debug(
                     '  skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'
                     .format(tmpSiteName, site_maxmemory, minRamCount))
                 continue
             # site min memory requirement
             if not tmpSiteSpec.minrss in [0, None]:
                 site_minmemory = tmpSiteSpec.minrss
             else:
                 site_minmemory = tmpSiteSpec.minmemory
             if not site_minmemory in [
                     0, None
             ] and minRamCount != 0 and minRamCount < site_minmemory:
                 tmpLog.debug(
                     '  skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'
                     .format(tmpSiteName, site_minmemory, minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
             len(scanSiteList), minRamCount, taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     tmpMaxAtomSize = inputChunk.getMaxAtomSize()
     tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
     tmpOutDiskSize = taskSpec.getOutDiskSize()
     tmpWorkDiskSize = taskSpec.getWorkDiskSize()
     minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize
         minDiskCountR = minDiskCountR / 1024 / 1024
     tmpLog.debug(
         'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'
         .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize,
                 tmpWorkDiskSize))
     tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format(
         minDiskCountS, minDiskCountR))
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip site={0} due to small scratch disk={1} < {2} criteria=-disk'
                     .format(tmpSiteName, tmpSiteSpec.maxwdir,
                             minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check endpoint
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint(
             tmpSiteSpec.ddm)
         if tmpEndPoint is not None:
             # free space must be >= 200GB
             diskThreshold = 200
             tmpSpaceSize = 0
             if tmpEndPoint['space_expired'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_expired']
             if tmpEndPoint['space_free'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_free']
             if tmpSpaceSize < diskThreshold:
                 tmpLog.debug(
                     '  skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'
                     .format(tmpSiteName, tmpSpaceSize, diskThreshold))
                 continue
             # check if blacklisted
             if tmpEndPoint['blacklisted'] == 'Y':
                 tmpLog.debug(
                     '  skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'
                     .format(tmpSiteName, tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0, None] and minWalltime > 0:
         minWalltime *= tmpEffAtomSize
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                 'updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug(
                 '  skip site=%s due to no pilot criteria=-nopilot' %
                 tmpSiteName)
             if not self.testMode:
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # check inclusion and exclusion
     newScanSiteList = []
     sitesForANY = []
     for tmpSiteName in scanSiteList:
         autoSite = False
         # check exclusion
         if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList):
             tmpLog.debug(
                 '  skip site={0} excluded criteria=-excluded'.format(
                     tmpSiteName))
             continue
         # check inclusion
         if includeList != None and not AtlasBrokerUtils.isMatched(
                 tmpSiteName, includeList):
             if 'AUTO' in includeList:
                 autoSite = True
             else:
                 tmpLog.debug(
                     '  skip site={0} not included criteria=-notincluded'.
                     format(tmpSiteName))
                 continue
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limited access
         if tmpSiteSpec.accesscontrol == 'grouplist':
             if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                     siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                 tmpLog.debug(
                     '  skip site={0} limited access criteria=-limitedaccess'
                     .format(tmpSiteName))
                 continue
         # check cloud
         if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]:
             tmpLog.debug(
                 '  skip site={0} cloud mismatch criteria=-cloudmismatch'.
                 format(tmpSiteName))
             continue
         if autoSite:
             sitesForANY.append(tmpSiteName)
         else:
             newScanSiteList.append(tmpSiteName)
     # use AUTO sites if no sites are included
     if newScanSiteList == []:
         newScanSiteList = sitesForANY
     else:
         for tmpSiteName in sitesForANY:
             tmpLog.debug(
                 '  skip site={0} not included criteria=-notincluded'.
                 format(tmpSiteName))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for data availability
     hasDDS = False
     dataWeight = {}
     remoteSourceList = {}
     if inputChunk.getDatasets() != []:
         oldScanSiteList = copy.copy(scanSiteList)
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug(
                     'getting the list of sites where {0} is available'.
                     format(datasetName))
                 tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData(
                     scanSiteList, self.siteMapper, self.ddmIF, datasetName)
                 if tmpSt in [
                         Interaction.JEDITemporaryError,
                         Interaction.JEDITimeoutError
                 ]:
                     tmpLog.error(
                         'temporary failed to get the list of sites where data is available, since %s'
                         % tmpRet)
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == Interaction.JEDIFatalError:
                     tmpLog.error(
                         'fatal error when getting the list of sites where data is available, since %s'
                         % tmpRet)
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 if datasetName.startswith('ddo'):
                     tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                 else:
                     tmpLog.debug(' {0} sites : {1}'.format(
                         len(tmpRet), str(tmpRet)))
                     # check if distributed
                     if tmpRet != {}:
                         isDistributed = True
                         for tmpMap in tmpRet.values():
                             for tmpVal in tmpMap.values():
                                 if tmpVal['state'] == 'complete':
                                     isDistributed = False
                                     break
                             if not isDistributed:
                                 break
                         if isDistributed:
                             # check if really distributed
                             isDistributed = self.ddmIF.isDistributedDataset(
                                 datasetName)
                             if isDistributed:
                                 hasDDS = True
                                 datasetSpec.setDistributed()
                                 tmpLog.debug(' {0} is distributed'.format(
                                     datasetName))
             # check if the data is available at somewhere
             if self.dataSiteMap[datasetName] == {}:
                 tmpLog.error(
                     '{0} is unavailable at any site'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 # send info to logger
                 self.sendLogMessage(tmpLog)
                 return retFatal
         # get the list of sites where data is available
         scanSiteList = None
         scanSiteListOnDisk = None
         normFactor = 0
         for datasetName, tmpDataSite in self.dataSiteMap.iteritems():
             normFactor += 1
             # get sites where replica is available
             tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                 tmpDataSite, includeTape=True)
             tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                 tmpDataSite, includeTape=False)
             # get sites which can remotely access source sites
             if inputChunk.isMerging:
                 # disable remote access for merging
                 tmpSatelliteSites = {}
             elif (not sitePreAssigned) or (
                     sitePreAssigned and not taskSpec.site in tmpSiteList):
                 tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(
                     tmpDiskSiteList,
                     self.taskBufferIF,
                     self.siteMapper,
                     nSites=50,
                     protocol=allowedRemoteProtocol)
             else:
                 tmpSatelliteSites = {}
             # make weight map for local
             for tmpSiteName in tmpSiteList:
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = 0
                 # give more weight to disk
                 if tmpSiteName in tmpDiskSiteList:
                     dataWeight[tmpSiteName] += 1
                 else:
                     dataWeight[tmpSiteName] += 0.001
             # make weight map for remote
             for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems(
             ):
                 # skip since local data is available
                 if tmpSiteName in tmpSiteList:
                     continue
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # negative weight for remote access
                 wRemote = 50.0
                 if not tmpSiteSpec.wansinklimit in [0, None]:
                     wRemote /= float(tmpSiteSpec.wansinklimit)
                 # sum weight
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = float(
                         tmpWeightSrcMap['weight']) / wRemote
                 else:
                     dataWeight[tmpSiteName] += float(
                         tmpWeightSrcMap['weight']) / wRemote
                 # make remote source list
                 if not remoteSourceList.has_key(tmpSiteName):
                     remoteSourceList[tmpSiteName] = {}
                 remoteSourceList[tmpSiteName][
                     datasetName] = tmpWeightSrcMap['source']
             # first list
             if scanSiteList == None:
                 scanSiteList = []
                 for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     if not tmpSiteName in scanSiteList:
                         scanSiteList.append(tmpSiteName)
                 scanSiteListOnDisk = set()
                 for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys(
                 ):
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     scanSiteListOnDisk.add(tmpSiteName)
                 continue
             # pickup sites which have all data
             newScanList = []
             for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                     newScanList.append(tmpSiteName)
             scanSiteList = newScanList
             tmpLog.debug('{0} is available at {1} sites'.format(
                 datasetName, len(scanSiteList)))
             # pickup sites which have all data on DISK
             newScanListOnDisk = set()
             for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteListOnDisk:
                     newScanListOnDisk.add(tmpSiteName)
             scanSiteListOnDisk = newScanListOnDisk
             tmpLog.debug('{0} is available at {1} sites on DISK'.format(
                 datasetName, len(scanSiteListOnDisk)))
         # check for preassigned
         if sitePreAssigned and not taskSpec.site in scanSiteList:
             scanSiteList = []
             tmpLog.debug(
                 'data is unavailable locally or remotely at preassigned site {0}'
                 .format(taskSpec.site))
         elif len(scanSiteListOnDisk) > 0:
             # use only disk sites
             scanSiteList = list(scanSiteListOnDisk)
         tmpLog.debug('{0} candidates have input data'.format(
             len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retFatal
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # calculate weight
     fqans = taskSpec.makeFQANs()
     """
     tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                               taskSpec.workingGroup,True)
     currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
     currentPriority -= 500
     tmpLog.debug('currentPriority={0}'.format(currentPriority))
     """
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
         taskSpec.vo, taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # check for preassigned
     if sitePreAssigned and not taskSpec.site in scanSiteList:
         tmpLog.debug("preassigned site {0} did not pass all tests".format(
             taskSpec.site))
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retFatal
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     timeWindowForFC = 6
     preSiteCandidateSpec = None
     failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(
         taskSpec.jediTaskID, timeWindowForFC)
     problematicSites = set()
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
         nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'starting',
                                                 None, None)
         nFailed = 0
         nClosed = 0
         nFinished = 0
         if tmpSiteName in failureCounts:
             if 'failed' in failureCounts[tmpSiteName]:
                 nFailed = failureCounts[tmpSiteName]['failed']
             if 'closed' in failureCounts[tmpSiteName]:
                 nClosed = failureCounts[tmpSiteName]['closed']
             if 'finished' in failureCounts[tmpSiteName]:
                 nFinished = failureCounts[tmpSiteName]['finished']
         # problematic sites
         if nFailed + nClosed > 2 * nFinished:
             problematicSites.add(tmpSiteName)
         # calculate weight
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              nStarting + 1)
         nThrottled = 0
         if remoteSourceList.has_key(tmpSiteName):
             nThrottled = AtlasBrokerUtils.getNumJobs(
                 jobStatPrioMap, tmpSiteName, 'throttled', None, None)
             weight /= float(nThrottled + 1)
         # noramize weights by taking data availability into account
         tmpDataWeight = 1
         if dataWeight.has_key(tmpSiteName):
             weight = weight * dataWeight[tmpSiteName]
             tmpDataWeight = dataWeight[tmpSiteName]
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # preassigned
         if sitePreAssigned and tmpSiteName == taskSpec.site:
             preSiteCandidateSpec = siteCandidateSpec
         # set weight
         siteCandidateSpec.weight = weight
         tmpStr = '  site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(
             tmpSiteName, nRunning, nAssigned, nActivated, nStarting)
         tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(
             nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight)
         tmpLog.debug(tmpStr)
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # sort candidates by weights
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight
     # limit the number of sites. use all sites for distributed datasets
     if not hasDDS:
         maxNumSites = 10
         # remove problematic sites
         candidateSpecList = AtlasBrokerUtils.skipProblematicSites(
             candidateSpecList, problematicSites, sitesUsedByTask,
             preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog)
     # append preassigned
     if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList:
         candidateSpecList.append(preSiteCandidateSpec)
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # get list of available files
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             fileScanSiteList = []
             for tmpSiteName in scanSiteList:
                 fileScanSiteList.append(tmpSiteName)
                 if remoteSourceList.has_key(
                         tmpSiteName
                 ) and remoteSourceList[tmpSiteName].has_key(
                         datasetSpec.datasetName):
                     for tmpRemoteSite in remoteSourceList[tmpSiteName][
                             datasetSpec.datasetName]:
                         if not tmpRemoteSite in fileScanSiteList:
                             fileScanSiteList.append(tmpRemoteSite)
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(
                 fileScanSiteList, self.siteMapper)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(
                 datasetSpec,
                 siteStorageEP,
                 self.siteMapper,
                 ngGroup=[2],
                 checkCompleteness=checkCompleteness)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError, 'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' %
                          (errtype.__name__, errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # preassigned
         if sitePreAssigned and tmpSiteName != taskSpec.site:
             tmpLog.debug(
                 '  skip site={0} non pre-assigned site criteria=-nonpreassigned'
                 .format(tmpSiteName))
             continue
         # set available files
         if inputChunk.getDatasets() == []:
             isAvailable = True
         else:
             isAvailable = False
         for tmpDatasetName, availableFiles in availableFileMap.iteritems():
             tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
             # check remote files
             if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[
                     tmpSiteName].has_key(tmpDatasetName):
                 for tmpRemoteSite in remoteSourceList[tmpSiteName][
                         tmpDatasetName]:
                     if availableFiles.has_key(tmpRemoteSite) and \
                             len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                         # use only remote disk files
                         siteCandidateSpec.remoteFiles += availableFiles[
                             tmpRemoteSite]['localdisk']
                         # set remote site and access protocol
                         siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                         siteCandidateSpec.remoteSource = tmpRemoteSite
                         isAvailable = True
                         break
             # local files
             if availableFiles.has_key(tmpSiteName):
                 if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                         (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                     siteCandidateSpec.localDiskFiles += availableFiles[
                         tmpSiteName]['localdisk']
                     # add cached files to local list since cached files go to pending when reassigned
                     siteCandidateSpec.localDiskFiles += availableFiles[
                         tmpSiteName]['cache']
                     siteCandidateSpec.localTapeFiles += availableFiles[
                         tmpSiteName]['localtape']
                     siteCandidateSpec.cacheFiles += availableFiles[
                         tmpSiteName]['cache']
                     siteCandidateSpec.remoteFiles += availableFiles[
                         tmpSiteName]['remote']
                     siteCandidateSpec.addAvailableFiles(
                         availableFiles[tmpSiteName]['all'])
                     isAvailable = True
                 else:
                     tmpMsg = '{0} is incompete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                     tmpLog.debug(
                         tmpMsg.format(
                             tmpDatasetName,
                             tmpSiteName,
                             len(tmpDatasetSpec.Files),
                             len(availableFiles[tmpSiteName]['localdisk']),
                             len(availableFiles[tmpSiteName]['cache']),
                             len(availableFiles[tmpSiteName]['localtape']),
                         ))
             if not isAvailable:
                 break
         # append
         if not isAvailable:
             tmpLog.debug(
                 '  skip site={0} file unavailable criteria=-fileunavailable'
                 .format(siteCandidateSpec.siteName))
             continue
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug(
             '  use site={0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5} criteria=+use'
             .format(
                 siteCandidateSpec.siteName,
                 siteCandidateSpec.weight,
                 len(siteCandidateSpec.localDiskFiles),
                 len(siteCandidateSpec.localTapeFiles),
                 len(siteCandidateSpec.cacheFiles),
                 len(siteCandidateSpec.remoteFiles),
             ))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # send info to logger
     self.sendLogMessage(tmpLog)
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk

Example #2

Show file

 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get primary site candidates 
     sitePreAssigned = False
     excludeList = []
     includeList = None
     scanSiteList = []
     # get list of site access
     siteAccessList = self.taskBufferIF.listSiteAccess(None,taskSpec.userName)
     siteAccessMap = {}
     for tmpSiteName,tmpAccess in siteAccessList:
         siteAccessMap[tmpSiteName] = tmpAccess
     # site limitation
     if taskSpec.useLimitedSites():
         if 'excludedSite' in taskParamMap:
             excludeList = taskParamMap['excludedSite']
         if 'includedSite' in taskParamMap:
             includeList = taskParamMap['includedSite']
     # loop over all sites        
     for siteName,tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
         if tmpSiteSpec.type == 'analysis':
             scanSiteList.append(siteName)
     # preassigned
     if not taskSpec.site in ['',None]:
         # site is pre-assigned
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         sitePreAssigned = True
         if not taskSpec.site in scanSiteList:
             scanSiteList.append(taskSpec.site)
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     # allowed remote access protocol
     allowedRemoteProtocol = 'fax'
     ######################################
     # selection for data availability
     dataWeight = {}
     remoteSourceList = {}
     if inputChunk.getDatasets() != []:    
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getAnalSitesWithData(scanSiteList,
                                                                      self.siteMapper,
                                                                      self.ddmIF,datasetName)
                 if tmpSt == self.SC_FAILED:
                     tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retTmpError
                 if tmpSt == self.SC_FATAL:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 if datasetName.startswith('ddo'):
                     tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                 else:
                     tmpLog.debug(' {0} sites : {1}'.format(len(tmpRet),str(tmpRet)))
             # check if the data is available at somewhere
             if self.dataSiteMap[datasetName] == {}:
                 tmpLog.error('{0} is unavaiable at any site'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retFatal
             # check if the data is available on disk
             if AtlasBrokerUtils.getAnalSitesWithDataDisk(self.dataSiteMap[datasetName]) == []:
                 tmpLog.error('{0} is avaiable only on tape'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retFatal
         # get the list of sites where data is available    
         scanSiteList = None     
         normFactor = 0
         for datasetName,tmpDataSite in self.dataSiteMap.iteritems():
             normFactor += 1
             # get sites where disk replica is available
             tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite)
             # get sites which can remotely access source sites
             if (not sitePreAssigned) or (sitePreAssigned and not taskSpec.site in tmpSiteList):
                 tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(tmpSiteList,self.taskBufferIF,
                                                                        self.siteMapper,nSites=50,
                                                                        protocol=allowedRemoteProtocol)
             else:
                 tmpSatelliteSites = {}
             # make weight map for local
             for tmpSiteName in tmpSiteList:
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = 1
                 else:
                     dataWeight[tmpSiteName] += 1
             # make weight map for remote
             for tmpSiteName,tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                 # skip since local data is available
                 if tmpSiteName in tmpSiteList:
                     continue
                 # sum weight
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = tmpWeightSrcMap['weight']
                 else:
                     dataWeight[tmpSiteName] += tmpWeightSrcMap['weight']
                 # make remote source list
                 if not remoteSourceList.has_key(tmpSiteName):
                     remoteSourceList[tmpSiteName] = {}
                 remoteSourceList[tmpSiteName][datasetName] = tmpWeightSrcMap['source']
             # first list
             if scanSiteList == None:
                 scanSiteList = []
                 for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in scanSiteList:
                         scanSiteList.append(tmpSiteName)
                 continue
             # pickup sites which have all data
             newScanList = []
             for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                     newScanList.append(tmpSiteName)
             scanSiteList = newScanList
             tmpLog.debug('{0} is available at {1} sites'.format(datasetName,len(scanSiteList)))
         tmpLog.debug('{0} candidates have input data'.format(len(scanSiteList)))
         # check for preassigned
         if sitePreAssigned and not taskSpec.site in scanSiteList:
             scanSiteList = []
             tmpLog.debug('data is unavailable locally or remotely at preassigned site {0}'.format(taskSpec.site))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retFatal
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status in ['offline']:
             skipFlag = True
         elif tmpSiteSpec.status in ['brokeroff','test']:
             if not sitePreAssigned:
                 skipFlag = True
             elif tmpSiteName != taskSpec.site:
                 skipFlag = True
         if not skipFlag:    
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for release
     if not taskSpec.transHome in [None,'AnalysisTransforms']:
         if taskSpec.transHome.startswith('ROOT'):
             # hack until x86_64-slc6-gcc47-opt is published in installedsw
             if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                 tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
             else:
                 tmpCmtConfig = taskSpec.architecture
             siteListWithSW = taskBuffer.checkSitesWithRelease(scanSiteList,
                                                               cmtConfig=tmpCmtConfig,
                                                               onlyCmtConfig=True)
         else:    
             # remove AnalysisTransforms-
             transHome = re.sub('^[^-]+-*','',taskSpec.transHome)
             transHome = re.sub('_','-',transHome)
             if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None:
                 # cache is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          caches=transHome,
                                                                          cmtConfig=taskSpec.architecture)
             elif transHome == '':
                 # release is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases=taskSpec.transUses,
                                                                          cmtConfig=taskSpec.architecture)
             else:
                 # nightlies
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases='CVMFS')
                 #                                                         releases='nightlies',
                 #                                                         cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteSpec.cloud in ['ND']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip %s due to missing rel/cache %s:%s' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for SW {1}:{2}'.format(len(scanSiteList),
                                                                    taskSpec.transHome,
                                                                    taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = taskSpec.ramCount
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug('  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.maxmemory,
                                                                                                       minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug('  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
         minDiskCountR = minDiskCountR / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug('  skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                      tmpSiteSpec.maxwdir,
                                                                                      minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
             tmpLog.debug('  skip {0} due to disk shortage in SE = {1} < {2}GB'.format(tmpSiteName,tmpSiteSpec.space,
                                                                                       diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             #continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # check inclusion and exclusion
     newScanSiteList = []
     sitesForANY = []
     for tmpSiteName in scanSiteList:
         autoSite = False
         # check exclusion
         if AtlasBrokerUtils.isMatched(tmpSiteName,excludeList):
             tmpLog.debug('  skip {0} excluded'.format(tmpSiteName))
             continue
         # check inclusion
         if includeList != None and not AtlasBrokerUtils.isMatched(tmpSiteName,includeList):
             if 'AUTO' in includeList:
                 autoSite = True
             else:
                 tmpLog.debug('  skip {0} not included'.format(tmpSiteName))
                 continue
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limited access
         if tmpSiteSpec.accesscontrol == 'grouplist':
             if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                     siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                 tmpLog.debug('  skip {0} limited access'.format(tmpSiteName))
                 continue
         # check cloud
         if not taskSpec.cloud in [None,'','any',tmpSiteSpec.cloud]: 
             tmpLog.debug('  skip {0} cloud missmatch'.format(tmpSiteName))
             continue
         if autoSite:
             sitesForANY.append(tmpSiteName)
         else:
             newScanSiteList.append(tmpSiteName)
     # use AUTO sites if no sites are included
     if newScanSiteList == []:
         newScanSiteList = sitesForANY
     else:
         for tmpSiteName in sitesForANY:
             tmpLog.debug('  skip {0} not included'.format(tmpSiteName))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt,sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # calculate weight
     fqans = taskSpec.makeFQANs()
     tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                               taskSpec.workingGroup,True)
     currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
     tmpLog.debug('currentPriority={0}'.format(currentPriority))
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 currentPriority)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # check for preassigned
     if sitePreAssigned and not taskSpec.site in scanSiteList:
         tmpLog.debug("preassigned site {0} didn't pass all tests".format(taskSpec.site))
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',  None,None)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined',  None,None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
         if remoteSourceList.has_key(tmpSiteName):
             nThrottled = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
             weight = float(nThrottled + 1)
         # noramize weights by taking data availability into account
         if dataWeight.has_key(tmpSiteName):
             weight = weight * dataWeight[tmpSiteName]
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # preassigned
         if sitePreAssigned and tmpSiteName == taskSpec.site:
             preSiteCandidateSpec = siteCandidateSpec
         # set weight
         siteCandidateSpec.weight = weight
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)    
     # limit the number of sites
     maxNumSites = 5
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites-len(candidateSpecList))]
     # append preassigned
     if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList: 
         candidateSpecList.append(preSiteCandidateSpec)
     # collect site names
     scanSiteList = []    
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # get list of available files
     availableFileMap = {}     
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             fileScanSiteList = []
             for tmpSiteName in scanSiteList:
                 fileScanSiteList.append(tmpSiteName)
                 if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(datasetSpec.datasetName):
                     for tmpRemoteSite in remoteSourceList[tmpSiteName][datasetSpec.datasetName]:
                         if not tmpRemoteSite in fileScanSiteList:
                             fileScanSiteList.append(tmpRemoteSite)
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(fileScanSiteList,self.siteMapper)
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[2])
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # preassigned
         if sitePreAssigned and tmpSiteName != taskSpec.site:
             tmpLog.debug('  skip {0} non pre-assigned site'.format(tmpSiteName))
             continue
         # set available files
         if inputChunk.getDatasets() == []: 
             isAvailable = True
         else:
             isAvailable = False
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
             # check remote files
             if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(tmpDatasetName):
                 for tmpRemoteSite in remoteSourceList[tmpSiteName][tmpDatasetName]:
                     if availableFiles.has_key(tmpRemoteSite) and \
                             len(tmpDatasetSpec.Files) <= availableFiles[tmpRemoteSite]['localdisk']:
                         # use only remote disk files
                         siteCandidateSpec.remoteFiles += availableFiles[tmpRemoteSite]['localdisk']
                         # set remote site and access protocol
                         siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                         siteCandidateSpec.remoteSource   = tmpRemoteSite
                         isAvailable = True
                         break
             # local files
             if availableFiles.has_key(tmpSiteName):
                 if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']):
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                     # add cached files to local list since cached files go to pending when reassigned
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                     siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
                     isAvailable = True
                 else:
                     tmpLog.debug('{0} is incompete at {1} : nFiles={2} nLocal={3} nCached={4}'.format(tmpDatasetName,
                                                                                                       tmpSiteName,
                                                                                                       len(tmpDatasetSpec.Files),
                                                                                                       len(availableFiles[tmpSiteName]['localdisk']),
                                                                                                       len(availableFiles[tmpSiteName]['cache'])))
             if not isAvailable:
                 break
         # append
         if not isAvailable:
             tmpLog.debug('  skip {0} file unavailable'.format(siteCandidateSpec.siteName))
             continue
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5}'.format(siteCandidateSpec.siteName,
                                                                                                              siteCandidateSpec.weight,
                                                                                                              len(siteCandidateSpec.localDiskFiles),
                                                                                                              len(siteCandidateSpec.localTapeFiles),
                                                                                                              len(siteCandidateSpec.cacheFiles),
                                                                                                              len(siteCandidateSpec.remoteFiles),
                                                                                                              ))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk

Example #3

Show file

File: GenJobBroker.py Project: tertychnyy/panda-jedi

 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # get sites in the cloud
     if not taskSpec.site in ['', None]:
         scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(
             inputChunk.getPreassignedSite()))
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' %
                      (cloudName, len(scanSiteList)))
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status != 'online':
             skipFlag = True
         if not skipFlag:
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to status=%s' %
                          (tmpSiteName, tmpSiteSpec.status))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for memory
     minRamCount = max(taskSpec.ramCount, inputChunk.ramCount)
     if not minRamCount in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug(
                     '  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxmemory,
                             minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug(
                     '  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.minmemory,
                             minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
             len(scanSiteList), minRamCount, taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize(
     ) + inputChunk.getMaxAtomSize()
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = taskSpec.getOutDiskSize(
         ) + taskSpec.getWorkDiskSize()
         minDiskCountR = minDiskCountR / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip {0} due to small scratch disk={1} < {2}'.
                     format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
             tmpLog.debug(
                 '  skip {0} due to disk shortage in SE = {1} < {2}GB'.
                 format(tmpSiteName, tmpSiteSpec.space, diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip {0} due to short site walltime={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip {0} due to short job walltime={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                 'updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             #continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # calculate weight
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
         taskSpec.vo, taskSpec.prodSourceLabel, taskSpec.currentPriority)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                  tmpSiteName, 'activated',
                                                  None, None)
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              1) / float(nAssigned + 1)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # limit the number of sites
     maxNumSites = 5
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites -
                                                len(candidateSpecList))]
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # append
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {0} with weight={1}'.format(
             siteCandidateSpec.siteName, siteCandidateSpec.weight))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk

Example #4

Show file

File: AtlasAnalJobBroker.py Project: RRCKI/panda-jedi

 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID,
                                                                datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get primary site candidates 
     sitePreAssigned = False
     excludeList = []
     includeList = None
     scanSiteList = []
     # get list of site access
     siteAccessList = self.taskBufferIF.listSiteAccess(None,taskSpec.userName)
     siteAccessMap = {}
     for tmpSiteName,tmpAccess in siteAccessList:
         siteAccessMap[tmpSiteName] = tmpAccess
     # site limitation
     if taskSpec.useLimitedSites():
         if 'excludedSite' in taskParamMap:
             excludeList = taskParamMap['excludedSite']
             # str to list for task retry
             try:
                 if type(excludeList) != types.ListType:
                     excludeList = excludeList.split(',')
             except:
                 pass
         if 'includedSite' in taskParamMap:
             includeList = taskParamMap['includedSite']
             # str to list for task retry
             if includeList == '':
                 includeList = None
             try:
                 if type(includeList) != types.ListType:
                     includeList = includeList.split(',')
             except:
                 pass
     # loop over all sites        
     for siteName,tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
         if tmpSiteSpec.type == 'analysis':
             scanSiteList.append(siteName)
     # preassigned
     if not taskSpec.site in ['',None]:
         # site is pre-assigned
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         sitePreAssigned = True
         if not taskSpec.site in scanSiteList:
             scanSiteList.append(taskSpec.site)
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     # allowed remote access protocol
     allowedRemoteProtocol = 'fax'
     # MP    
     if taskSpec.coreCount != None and taskSpec.coreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskSpec.coreCount == 0:
         # use MCORE and normal 
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status in ['offline']:
             skipFlag = True
         elif tmpSiteSpec.status in ['brokeroff','test']:
             if not sitePreAssigned:
                 skipFlag = True
             elif tmpSiteName != taskSpec.site:
                 skipFlag = True
         if not skipFlag:    
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                     (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                     newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if taskSpec.transHome.startswith('ROOT'):
             # hack until x86_64-slc6-gcc47-opt is published in installedsw
             if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                 tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
             else:
                 tmpCmtConfig = taskSpec.architecture
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      cmtConfig=tmpCmtConfig,
                                                                      onlyCmtConfig=True)
         elif 'AthAnalysis' in taskSpec.transHome or re.search('Ath[a-zA-Z]+Base',taskSpec.transHome) != None:
             # AthAnalysis
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      cmtConfig=taskSpec.architecture,
                                                                      onlyCmtConfig=True)
         else:    
             # remove AnalysisTransforms-
             transHome = re.sub('^[^-]+-*','',taskSpec.transHome)
             transHome = re.sub('_','-',transHome)
             if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \
                     re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None :
                 # cache is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          caches=transHome,
                                                                          cmtConfig=taskSpec.architecture)
             elif transHome == '' and taskSpec.transUses != None:
                 # remove Atlas-
                 transUses = taskSpec.transUses.split('-')[-1]
                 # release is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases=transUses,
                                                                          cmtConfig=taskSpec.architecture)
             else:
                 # nightlies
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases='CVMFS')
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format(len(scanSiteList),
                                                                        taskSpec.transUses,
                                                                        taskSpec.transHome,
                                                                        taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount = inputChunk.getMaxRamCount()
     minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # site max memory requirement
             if not tmpSiteSpec.maxrss in [0,None]:
                 site_maxmemory = tmpSiteSpec.maxrss
             else:
                 site_maxmemory = tmpSiteSpec.maxmemory
             if not site_maxmemory in [0,None] and minRamCount != 0 and minRamCount > site_maxmemory:
                 tmpLog.debug('  skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'.format(tmpSiteName,
                                                                                                       site_maxmemory,
                                                                                                       minRamCount))
                 continue
             # site min memory requirement
             if not tmpSiteSpec.minrss in [0,None]:
                 site_minmemory = tmpSiteSpec.minrss
             else:
                 site_minmemory = tmpSiteSpec.minmemory
             if not site_minmemory in [0,None] and minRamCount != 0 and minRamCount < site_minmemory:
                 tmpLog.debug('  skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'.format(tmpSiteName,
                                                                                                      site_minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     tmpMaxAtomSize  = inputChunk.getMaxAtomSize()
     tmpEffAtomSize  = inputChunk.getMaxAtomSize(effectiveSize=True)
     tmpOutDiskSize  = taskSpec.getOutDiskSize()
     tmpWorkDiskSize = taskSpec.getWorkDiskSize()
     minDiskCountS = tmpOutDiskSize*tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = tmpOutDiskSize*tmpEffAtomSize + tmpWorkDiskSize
         minDiskCountR = minDiskCountR / 1024 / 1024
     tmpLog.debug('maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'.format(tmpMaxAtomSize,
                                                                                                   tmpEffAtomSize,
                                                                                                   tmpOutDiskSize,
                                                                                                   tmpWorkDiskSize))
     tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format(minDiskCountS,
                                                                          minDiskCountR))
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug('  skip site={0} due to small scratch disk={1} < {2} criteria=-disk'.format(tmpSiteName,
                                                                                      tmpSiteSpec.maxwdir,
                                                                                      minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check endpoint
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint(tmpSiteSpec.ddm)
         if tmpEndPoint is not None:
             # free space must be >= 200GB
             diskThreshold = 200
             tmpSpaceSize = 0
             if tmpEndPoint['space_expired'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_expired']
             if tmpEndPoint['space_free'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_free']
             if tmpSpaceSize < diskThreshold:
                 tmpLog.debug('  skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'.format(tmpSiteName,tmpSpaceSize,
                                                                                         diskThreshold))
                 continue
             # check if blacklisted
             if tmpEndPoint['blacklisted'] == 'Y':
                 tmpLog.debug('  skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0,None] and minWalltime > 0:
         minWalltime *= tmpEffAtomSize
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName)
             if not self.testMode:
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # check inclusion and exclusion
     newScanSiteList = []
     sitesForANY = []
     for tmpSiteName in scanSiteList:
         autoSite = False
         # check exclusion
         if AtlasBrokerUtils.isMatched(tmpSiteName,excludeList):
             tmpLog.debug('  skip site={0} excluded criteria=-excluded'.format(tmpSiteName))
             continue
         # check inclusion
         if includeList != None and not AtlasBrokerUtils.isMatched(tmpSiteName,includeList):
             if 'AUTO' in includeList:
                 autoSite = True
             else:
                 tmpLog.debug('  skip site={0} not included criteria=-notincluded'.format(tmpSiteName))
                 continue
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limited access
         if tmpSiteSpec.accesscontrol == 'grouplist':
             if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                     siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                 tmpLog.debug('  skip site={0} limited access criteria=-limitedaccess'.format(tmpSiteName))
                 continue
         # check cloud
         if not taskSpec.cloud in [None,'','any',tmpSiteSpec.cloud]: 
             tmpLog.debug('  skip site={0} cloud mismatch criteria=-cloudmismatch'.format(tmpSiteName))
             continue
         if autoSite:
             sitesForANY.append(tmpSiteName)
         else:
             newScanSiteList.append(tmpSiteName)
     # use AUTO sites if no sites are included
     if newScanSiteList == []:
         newScanSiteList = sitesForANY
     else:
         for tmpSiteName in sitesForANY:
             tmpLog.debug('  skip site={0} not included criteria=-notincluded'.format(tmpSiteName))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for data availability
     hasDDS = False
     dataWeight = {}
     remoteSourceList = {}
     if inputChunk.getDatasets() != []:
         oldScanSiteList = copy.copy(scanSiteList)
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug('getting the list of sites where {0} is available'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getAnalSitesWithData(scanSiteList,
                                                                      self.siteMapper,
                                                                      self.ddmIF,datasetName)
                 if tmpSt in [Interaction.JEDITemporaryError,Interaction.JEDITimeoutError]: 
                     tmpLog.error('temporary failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == Interaction.JEDIFatalError:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 if datasetName.startswith('ddo'):
                     tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                 else:
                     tmpLog.debug(' {0} sites : {1}'.format(len(tmpRet),str(tmpRet)))
                     # check if distributed
                     if tmpRet != {}:
                         isDistributed = True
                         for tmpMap in tmpRet.values():
                             for tmpVal in tmpMap.values():
                                 if tmpVal['state'] == 'complete':
                                     isDistributed = False
                                     break
                             if not isDistributed:
                                 break
                         if isDistributed:
                             # check if really distributed
                             isDistributed = self.ddmIF.isDistributedDataset(datasetName)
                             if isDistributed:
                                 hasDDS = True
                                 datasetSpec.setDistributed()
                                 tmpLog.debug(' {0} is distributed'.format(datasetName))
             # check if the data is available at somewhere
             if self.dataSiteMap[datasetName] == {}:
                 tmpLog.error('{0} is unavailable at any site'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 # send info to logger
                 self.sendLogMessage(tmpLog)
                 return retFatal
         # get the list of sites where data is available    
         scanSiteList = None
         scanSiteListOnDisk = None
         normFactor = 0
         for datasetName,tmpDataSite in self.dataSiteMap.iteritems():
             normFactor += 1
             # get sites where replica is available
             tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite,includeTape=True)
             tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite,includeTape=False)
             # get sites which can remotely access source sites
             if inputChunk.isMerging:
                 # disable remote access for merging
                 tmpSatelliteSites = {}
             elif (not sitePreAssigned) or (sitePreAssigned and not taskSpec.site in tmpSiteList):
                 tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(tmpDiskSiteList,self.taskBufferIF,
                                                                        self.siteMapper,nSites=50,
                                                                        protocol=allowedRemoteProtocol)
             else:
                 tmpSatelliteSites = {}
             # make weight map for local
             for tmpSiteName in tmpSiteList:
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = 0
                 # give more weight to disk
                 if tmpSiteName in tmpDiskSiteList:
                     dataWeight[tmpSiteName] += 1
                 else:
                     dataWeight[tmpSiteName] += 0.001
             # make weight map for remote
             for tmpSiteName,tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                 # skip since local data is available
                 if tmpSiteName in tmpSiteList:
                     continue
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # negative weight for remote access
                 wRemote = 50.0
                 if not tmpSiteSpec.wansinklimit in [0,None]:
                     wRemote /= float(tmpSiteSpec.wansinklimit)
                 # sum weight
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = float(tmpWeightSrcMap['weight'])/wRemote
                 else:
                     dataWeight[tmpSiteName] += float(tmpWeightSrcMap['weight'])/wRemote
                 # make remote source list
                 if not remoteSourceList.has_key(tmpSiteName):
                     remoteSourceList[tmpSiteName] = {}
                 remoteSourceList[tmpSiteName][datasetName] = tmpWeightSrcMap['source']
             # first list
             if scanSiteList == None:
                 scanSiteList = []
                 for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     if not tmpSiteName in scanSiteList:
                         scanSiteList.append(tmpSiteName)
                 scanSiteListOnDisk = set()
                 for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     scanSiteListOnDisk.add(tmpSiteName)
                 continue
             # pickup sites which have all data
             newScanList = []
             for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                     newScanList.append(tmpSiteName)
             scanSiteList = newScanList
             tmpLog.debug('{0} is available at {1} sites'.format(datasetName,len(scanSiteList)))
             # pickup sites which have all data on DISK
             newScanListOnDisk = set()
             for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteListOnDisk:
                     newScanListOnDisk.add(tmpSiteName)
             scanSiteListOnDisk = newScanListOnDisk
             tmpLog.debug('{0} is available at {1} sites on DISK'.format(datasetName,len(scanSiteListOnDisk)))
         # check for preassigned
         if sitePreAssigned and not taskSpec.site in scanSiteList:
             scanSiteList = []
             tmpLog.debug('data is unavailable locally or remotely at preassigned site {0}'.format(taskSpec.site))
         elif len(scanSiteListOnDisk) > 0:
             # use only disk sites
             scanSiteList = list(scanSiteListOnDisk)
         tmpLog.debug('{0} candidates have input data'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retFatal
     ######################################
     # sites already used by task
     tmpSt,sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # calculate weight
     fqans = taskSpec.makeFQANs()
     """
     tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                               taskSpec.workingGroup,True)
     currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
     currentPriority -= 500
     tmpLog.debug('currentPriority={0}'.format(currentPriority))
     """
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # check for preassigned
     if sitePreAssigned and not taskSpec.site in scanSiteList:
         tmpLog.debug("preassigned site {0} did not pass all tests".format(taskSpec.site))
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retFatal
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     timeWindowForFC = 6
     preSiteCandidateSpec = None
     failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(taskSpec.jediTaskID,timeWindowForFC)
     problematicSites = set()
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',  None,None)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined',  None,None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
         nStarting  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting', None,None)
         nFailed    = 0
         nClosed    = 0
         nFinished  = 0
         if tmpSiteName in failureCounts:
             if 'failed' in failureCounts[tmpSiteName]:
                 nFailed = failureCounts[tmpSiteName]['failed']
             if 'closed' in failureCounts[tmpSiteName]:
                 nClosed = failureCounts[tmpSiteName]['closed']
             if 'finished' in failureCounts[tmpSiteName]:
                 nFinished = failureCounts[tmpSiteName]['finished']
         # problematic sites
         if nFailed+nClosed > 2*nFinished:
             problematicSites.add(tmpSiteName)
         # calculate weight
         weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + 1)
         nThrottled = 0
         if remoteSourceList.has_key(tmpSiteName):
             nThrottled = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
             weight /= float(nThrottled + 1)
         # noramize weights by taking data availability into account
         tmpDataWeight = 1
         if dataWeight.has_key(tmpSiteName):
             weight = weight * dataWeight[tmpSiteName]
             tmpDataWeight = dataWeight[tmpSiteName]
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # preassigned
         if sitePreAssigned and tmpSiteName == taskSpec.site:
             preSiteCandidateSpec = siteCandidateSpec
         # set weight
         siteCandidateSpec.weight = weight
         tmpStr  = '  site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(tmpSiteName,
                                                                              nRunning,        
                                                                              nAssigned,       
                                                                              nActivated,      
                                                                              nStarting)
         tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(nFailed,
                                                                                          nClosed,
                                                                                          nFinished,
                                                                                          nThrottled,
                                                                                          tmpDataWeight,
                                                                                          weight)
         tmpLog.debug(tmpStr)
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)    
     # sort candidates by weights
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight
     # limit the number of sites. use all sites for distributed datasets
     if not hasDDS:
         maxNumSites = 10
         # remove problematic sites
         candidateSpecList = AtlasBrokerUtils.skipProblematicSites(candidateSpecList,
                                                                   problematicSites,
                                                                   sitesUsedByTask,
                                                                   preSiteCandidateSpec,
                                                                   maxNumSites,
                                                                   timeWindowForFC,
                                                                   tmpLog)
     # append preassigned
     if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList: 
         candidateSpecList.append(preSiteCandidateSpec)
     # collect site names
     scanSiteList = []    
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # get list of available files
     availableFileMap = {}     
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             fileScanSiteList = []
             for tmpSiteName in scanSiteList:
                 fileScanSiteList.append(tmpSiteName)
                 if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(datasetSpec.datasetName):
                     for tmpRemoteSite in remoteSourceList[tmpSiteName][datasetSpec.datasetName]:
                         if not tmpRemoteSite in fileScanSiteList:
                             fileScanSiteList.append(tmpRemoteSite)
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(fileScanSiteList,self.siteMapper)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[2],
                                                         checkCompleteness=checkCompleteness)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # preassigned
         if sitePreAssigned and tmpSiteName != taskSpec.site:
             tmpLog.debug('  skip site={0} non pre-assigned site criteria=-nonpreassigned'.format(tmpSiteName))
             continue
         # set available files
         if inputChunk.getDatasets() == []: 
             isAvailable = True
         else:
             isAvailable = False
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
             # check remote files
             if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(tmpDatasetName):
                 for tmpRemoteSite in remoteSourceList[tmpSiteName][tmpDatasetName]:
                     if availableFiles.has_key(tmpRemoteSite) and \
                             len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                         # use only remote disk files
                         siteCandidateSpec.remoteFiles += availableFiles[tmpRemoteSite]['localdisk']
                         # set remote site and access protocol
                         siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                         siteCandidateSpec.remoteSource   = tmpRemoteSite
                         isAvailable = True
                         break
             # local files
             if availableFiles.has_key(tmpSiteName):
                 if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                         (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                     # add cached files to local list since cached files go to pending when reassigned
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                     siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
                     siteCandidateSpec.addAvailableFiles(availableFiles[tmpSiteName]['all'])
                     isAvailable = True
                 else:
                     tmpMsg = '{0} is incompete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                     tmpLog.debug(tmpMsg.format(tmpDatasetName,
                                                tmpSiteName,
                                                len(tmpDatasetSpec.Files),
                                                len(availableFiles[tmpSiteName]['localdisk']),
                                                len(availableFiles[tmpSiteName]['cache']),
                                                len(availableFiles[tmpSiteName]['localtape']),
                                                ))
             if not isAvailable:
                 break
         # append
         if not isAvailable:
             tmpLog.debug('  skip site={0} file unavailable criteria=-fileunavailable'.format(siteCandidateSpec.siteName))
             continue
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use site={0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5} criteria=+use'.format(siteCandidateSpec.siteName,
                                                                                                              siteCandidateSpec.weight,
                                                                                                              len(siteCandidateSpec.localDiskFiles),
                                                                                                              len(siteCandidateSpec.localTapeFiles),
                                                                                                              len(siteCandidateSpec.cacheFiles),
                                                                                                              len(siteCandidateSpec.remoteFiles),
                                                                                                              ))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # send info to logger
     self.sendLogMessage(tmpLog)
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk

Example #5

Show file

File: GenJobBroker.py Project: RRCKI/panda-jedi

 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get sites in the cloud
     if not taskSpec.site in ['',None]:
         scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(inputChunk.getPreassignedSite()))
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status != 'online':
             skipFlag = True
         if not skipFlag:
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for memory
     minRamCount  = max(taskSpec.ramCount, inputChunk.ramCount)
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug('  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.maxmemory,
                                                                                                       minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug('  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
         minDiskCountR = minDiskCountR / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug('  skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                      tmpSiteSpec.maxwdir,
                                                                                      minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
             tmpLog.debug('  skip {0} due to disk shortage in SE = {1} < {2}GB'.format(tmpSiteName,tmpSiteSpec.space,
                                                                                       diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             #continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt,sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # calculate weight
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 taskSpec.currentPriority)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',  None,None)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined',  None,None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)    
     # limit the number of sites
     maxNumSites = 5
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites-len(candidateSpecList))]
     # collect site names
     scanSiteList = []    
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # append
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {0} with weight={1}'.format(siteCandidateSpec.siteName,
                                                         siteCandidateSpec.weight))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk

Example #6

Show file

 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID,
                                                                datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get sites in the cloud
     sitePreAssigned = False
     siteListPreAssigned = False
     if not taskSpec.site in ['',None]:
         if ',' in taskSpec.site:
             # site list
             siteListPreAssigned = True
             scanSiteList = taskSpec.site.split(',')
         else:
             # site
             sitePreAssigned = True
             scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned criteria=+preassign'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         siteListPreAssigned = True
         scanSiteList = DataServiceUtils.getSitesShareDDM(self.siteMapper,inputChunk.getPreassignedSite())
         scanSiteList.append(inputChunk.getPreassignedSite())
         tmpMsg = 'use site={0} since they share DDM endpoints with orinal_site={1} which is pre-assigned in masterDS '.format(str(scanSiteList),
                                                                                                                               inputChunk.getPreassignedSite())
         tmpMsg += 'criteria=+premerge'
         tmpLog.debug(tmpMsg)
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
     # get job statistics
     tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     # T1 
     if not taskSpec.useWorldCloud():
         t1Sites = [self.siteMapper.getCloud(cloudName)['source']]
         # hospital sites
         if self.hospitalQueueMap.has_key(cloudName):
             t1Sites += self.hospitalQueueMap[cloudName]
     else:
         # get destination for WORLD cloud
         t1Sites = []
         tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,datasetTypes=['log'])
         for datasetSpec in datasetSpecList:
             if not datasetSpec.destination in t1Sites:
                 t1Sites.append(datasetSpec.destination)
     # sites sharing SE with T1
     sitesShareSeT1 = DataServiceUtils.getSitesShareDDM(self.siteMapper,t1Sites[0])
     # all T1
     allT1Sites = self.getAllT1Sites()
     # core count
     if inputChunk.isMerging and taskSpec.mergeCoreCount != None:
         taskCoreCount = taskSpec.mergeCoreCount
     else:
         taskCoreCount = taskSpec.coreCount
     # MP
     if taskCoreCount != None and taskCoreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskCoreCount == 0:
         # use MCORE and normal 
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     # get workQueue
     workQueue = self.taskBufferIF.getWorkQueueMap().getQueueWithID(taskSpec.workQueue_ID)
     ######################################
     # selection for status
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check site status
             skipFlag = False
             if tmpSiteSpec.status != 'online':
                 skipFlag = True
             if not skipFlag:    
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for reprocessing
     if taskSpec.processingType == 'reprocessing':
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check schedconfig.validatedreleases
             if tmpSiteSpec.validatedreleases == ['True']:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to validatedreleases <> True criteria=-validated' % tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for high priorities
     t1WeightForHighPrio = 1
     if (taskSpec.currentPriority >= 900 or inputChunk.useScout()) \
             and not sitePreAssigned and not siteListPreAssigned:
         t1WeightForHighPrio = 100
         newScanSiteList = []
         for tmpSiteName in scanSiteList:            
             if tmpSiteName in t1Sites+sitesShareSeT1+allT1Sites:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpMsg = '  skip site={0} due to highPrio/scouts which needs to run at T1 or sites associated with {1} T1 SE '.format(tmpSiteName,
                                                                                                                                       cloudName)
                 tmpMsg += 'criteria=-scoutprio'
                 tmpLog.debug(tmpMsg)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for highPrio/scouts'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection to avoid slow or inactive sites
     if (taskSpec.currentPriority >= 800 or inputChunk.useScout() or \
             inputChunk.isMerging or taskSpec.mergeOutput()) \
             and not sitePreAssigned:
         # get inactive sites
         inactiveTimeLimit = 2
         inactiveSites = self.taskBufferIF.getInactiveSites_JEDI('production',inactiveTimeLimit)
         newScanSiteList = []
         tmpMsgList = []
         for tmpSiteName in scanSiteList:
             nToGetAll = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                 AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'starting')
             if tmpSiteName in ['BNL_CLOUD','BNL_CLOUD_MCORE','ATLAS_OPP_OSG']:
                 tmpMsg = '  skip site={0} since high prio/scouts/merge needs to avoid slow sites '.format(tmpSiteName)
                 tmpMsg += 'criteria=-slow'
                 tmpMsgList.append(tmpMsg)
             elif tmpSiteName in inactiveSites and nToGetAll > 0:
                 tmpMsg = '  skip site={0} since high prio/scouts/merge needs to avoid inactive sites (laststart is older than {1}h) '.format(tmpSiteName,
                                                                                                                                              inactiveTimeLimit)
                 tmpMsg += 'criteria=-inactive'
                 tmpMsgList.append(tmpMsg)
             else:
                 newScanSiteList.append(tmpSiteName)
         if newScanSiteList != []:
             scanSiteList = newScanSiteList
             for tmpMsg in tmpMsgList:
                 tmpLog.debug(tmpMsg)
         tmpLog.debug('{0} candidates passed for slowness/inactive check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for data availability
     if not sitePreAssigned and not siteListPreAssigned:
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             # ignore DBR
             if DataServiceUtils.isDBR(datasetName):
                 continue
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,
                                                                  self.ddmIF,datasetName,
                                                                  datasetSpec.storageToken)
                 if tmpSt == self.SC_FAILED:
                     tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == self.SC_FATAL:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 tmpLog.debug('map of data availability : {0}'.format(str(tmpRet)))
             """
             # check if T1 has the data
             if self.dataSiteMap[datasetName].has_key(cloudName):
                 cloudHasData = True
             else:
                 cloudHasData = False
             t1hasData = False
             if cloudHasData:
                 for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems():
                     if tmpSeVal['state'] == 'complete':
                         t1hasData = True
                         break
                 # T1 has incomplete data while no data at T2
                 if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] == []:
                     # use incomplete data at T1 anyway
                     t1hasData = True
             # data is missing at T1         
             if not t1hasData:
                 tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName))
                 # make subscription to T1
                 # FIXME
                 pass
                 # use T2 until data is complete at T1
                 newScanSiteList = []
                 for tmpSiteName in scanSiteList:                    
                     if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']:
                         newScanSiteList.append(tmpSiteName)
                     else:
                         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                         if tmpSiteSpec.cloud != cloudName:
                             tmpLog.debug('  skip %s due to foreign T2' % tmpSiteName)
                         else:
                             tmpLog.debug('  skip %s due to missing data at T2' % tmpSiteName)
                 scanSiteList = newScanSiteList
                 tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName))
                 if scanSiteList == []:
                     tmpLog.error('no candidates')
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retTmpError
             """        
     ######################################
     # selection for fairshare
     if not (workQueue.queue_type in ['managed'] and workQueue.queue_name in ['test','validation']):
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,inputChunk.isMerging,tmpLog):
                 tmpLog.debug('  skip site={0} due to zero share criteria=-zeroshare'.format(tmpSiteName))
                 continue
             newScanSiteList.append(tmpSiteName)                
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for I/O intensive tasks
     # FIXME
     pass
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                     (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                     newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch site:%s <> task:%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskCoreCount))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None:
             # only cache is checked for normal tasks
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      caches=taskSpec.transHome,
                                                                      cmtConfig=taskSpec.architecture)
         else:
             # nightlies
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      releases='CVMFS')
             #                                                         releases='nightlies',
             #                                                         cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteName in ['CERN-RELEASE']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing cache=%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList),
                                                                               taskSpec.transHome,
                                                                               taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = max(taskSpec.ramCount, inputChunk.ramCount)
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpMsg = '  skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(tmpSiteName,
                                                                                                                 tmpSiteSpec.maxmemory,
                                                                                                                 minRamCount)
                 tmpMsg += 'criteria=-lowmemory'
                 tmpLog.debug(tmpMsg)
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpMsg = '  skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(tmpSiteName,
                                                                                                                   tmpSiteSpec.minmemory,
                                                                                                                   minRamCount)
                 tmpMsg += 'criteria=-highmemory'
                 tmpLog.debug(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check {1}({2})'.format(len(scanSiteList),
                                                                           minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     if taskSpec.outputScaleWithEvents():
         minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(getNumEvents=True)
     else:
         minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(effectiveSize=True)
     minDiskCount = minDiskCount + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCount = minDiskCount / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir:
             tmpMsg = '  skip site={0} due to small scratch disk {1} less than {2} '.format(tmpSiteName,
                                                                                            tmpSiteSpec.maxwdir,
                                                                                            minDiskCount)
             tmpMsg += 'criteria=-disk'
             tmpLog.debug(tmpMsg)
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check minDiskCount>{1}MB'.format(len(scanSiteList),
                                                                                       minDiskCount))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # don't check for T1
         if tmpSiteName in t1Sites:
             pass
         else:
             # check at the site
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # the number of jobs which will produce outputs
             nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'throttled') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running')
             # the size of input files which will be copied to the site
             movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName)
             if movingInputSize == None:
                 tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 self.sendLogMessage(tmpLog)
                 return retTmpError
             # free space - inputs - outputs(250MB*nJobs) must be >= 200GB
             outSizePerJob = 0.250
             diskThreshold = 200
             tmpSiteSpaceMap = self.ddmIF.getRseUsage(tmpSiteSpec.ddm)
             if tmpSiteSpaceMap != {}:
                 tmpSiteFreeSpace = tmpSiteSpaceMap['free']
                 tmpSpaceSize = tmpSiteFreeSpace - movingInputSize - nRemJobs * outSizePerJob
                 if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
                     tmpLog.debug('  skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteFreeSpace,
                                                                                                         movingInputSize,outSizePerJob,
                                                                                                         nRemJobs,diskThreshold))
                     continue
             # check if blacklisted
             if self.ddmIF.isBlackListedEP(tmpSiteSpec.ddm):
                 tmpLog.debug('  skip site={0} since endpoint={1} is blacklisted in DDM criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     if not taskSpec.useHS06():
         tmpMaxAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
         minWalltime = taskSpec.walltime * tmpMaxAtomSize
         strMinWalltime = 'walltime*inputSize={0}*{1}'.format(taskSpec.walltime,tmpMaxAtomSize)
     else:
         tmpMaxAtomSize = inputChunk.getMaxAtomSize(getNumEvents=True)
         minWalltime = taskSpec.cpuTime * tmpMaxAtomSize
         strMinWalltime = 'cpuTime*nEventsPerJob={0}*{1}'.format(taskSpec.cpuTime,tmpMaxAtomSize)
     if minWalltime != None or inputChunk.useScout():
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             siteMaxTime = tmpSiteSpec.maxtime
             origSiteMaxTime = siteMaxTime
             # sending scouts merge or wallime-undefined jobs to only sites where walltime is more than 1 day
             if inputChunk.useScout() or inputChunk.isMerging or \
                     (taskSpec.walltime in [0,None] and taskSpec.walltimeUnit in ['',None] and taskSpec.cpuTimeUnit in ['',None]):
                 minTimeForZeroWalltime = 24*60*60
                 if siteMaxTime != 0 and siteMaxTime < minTimeForZeroWalltime:
                     tmpMsg = '  skip site={0} due to site walltime {1} (site upper limit) insufficient '.format(tmpSiteName,
                                                                                                                 siteMaxTime)
                     if inputChunk.useScout():
                         tmpMsg += 'for scouts ({0} at least) '.format(minTimeForZeroWalltime)
                         tmpMsg += 'criteria=-scoutwalltime'
                     else:
                         tmpMsg += 'for zero walltime ({0} at least) '.format(minTimeForZeroWalltime)
                         tmpMsg += 'criteria=-zerowalltime'
                     tmpLog.debug(tmpMsg)
                     continue
             # check max walltime at the site
             tmpSiteStr = '{0}'.format(siteMaxTime)
             if taskSpec.useHS06():
                 oldSiteMaxTime = siteMaxTime
                 siteMaxTime -= taskSpec.baseWalltime
                 tmpSiteStr = '({0}-{1})'.format(oldSiteMaxTime,taskSpec.baseWalltime)
             if not siteMaxTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]:
                 siteMaxTime *= tmpSiteSpec.coreCount
                 tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount)
             if taskSpec.useHS06():
                 if not siteMaxTime in [None,0] and not tmpSiteSpec.corepower in [None,0]:
                     siteMaxTime *= tmpSiteSpec.corepower
                     tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower)
                 siteMaxTime *= float(taskSpec.cpuEfficiency) / 100.0
                 siteMaxTime = long(siteMaxTime)
                 tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency)
             if origSiteMaxTime != 0 and minWalltime > siteMaxTime:
                 tmpMsg = '  skip site={0} due to short site walltime {1} (site upper limit) less than {2} '.format(tmpSiteName,
                                                                                                                    tmpSiteStr,
                                                                                                                    strMinWalltime)
                 tmpMsg += 'criteria=-shortwalltime'
                 tmpLog.debug(tmpMsg)
                 continue
             # check min walltime at the site
             siteMinTime = tmpSiteSpec.mintime
             origSiteMinTime = siteMinTime
             tmpSiteStr = '{0}'.format(siteMinTime)
             if taskSpec.useHS06():
                 oldSiteMinTime = siteMinTime
                 siteMinTime -= taskSpec.baseWalltime
                 tmpSiteStr = '({0}-{1})'.format(oldSiteMinTime,taskSpec.baseWalltime)
             if not siteMinTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]:
                 siteMinTime *= tmpSiteSpec.coreCount
                 tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount)
             if taskSpec.useHS06():
                 if not siteMinTime in [None,0] and not tmpSiteSpec.corepower in [None,0]:
                     siteMinTime *= tmpSiteSpec.corepower
                     tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower)
                 siteMinTime *= float(taskSpec.cpuEfficiency) / 100.0
                 siteMinTime = long(siteMinTime)
                 tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency)
             if origSiteMinTime != 0 and minWalltime < siteMinTime:
                 tmpMsg = '  skip site {0} due to short job walltime {1} (site lower limit) greater than {2} '.format(tmpSiteName,
                                                                                                                      tmpSiteStr,
                                                                                                                      strMinWalltime)
                 tmpMsg += 'criteria=-longwalltime'
                 tmpLog.debug(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         if not taskSpec.useHS06():
             tmpLog.debug('{0} candidates passed walltime check {1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         else:
             tmpLog.debug('{0} candidates passed walltime check {1}({2}*nEventsPerJob)'.format(len(scanSiteList),strMinWalltime,taskSpec.cpuTimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for network connectivity
     if not sitePreAssigned:
         ipConnectivity = taskSpec.getIpConnectivity()
         if ipConnectivity != None:
             newScanSiteList = []
             for tmpSiteName in scanSiteList:
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # check at the site
                 if tmpSiteSpec.wnconnectivity == 'full':
                     pass
                 elif tmpSiteSpec.wnconnectivity == 'http' and ipConnectivity == 'http':
                     pass
                 else:
                     tmpMsg = '  skip site={0} due to insufficient connectivity (site={1}) for task={2} '.format(tmpSiteName,
                                                                                                                 tmpSiteSpec.wnconnectivity,
                                                                                                                 ipConnectivity)
                     tmpMsg += 'criteria=-network'
                     tmpLog.debug(tmpMsg)
                     continue
                 newScanSiteList.append(tmpSiteName)
             scanSiteList = newScanSiteList
             tmpLog.debug('{0} candidates passed network check ({1})'.format(len(scanSiteList),
                                                                             ipConnectivity))
             if scanSiteList == []:
                 tmpLog.error('no candidates')
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 self.sendLogMessage(tmpLog)
                 return retTmpError
     ######################################
     # selection for event service
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # event service
             if taskSpec.useEventService():
                 if tmpSiteSpec.getJobSeed() == 'std':
                     tmpMsg = '  skip site={0} since EventService is not allowed '.format(tmpSiteName)
                     tmpMsg += 'criteria=-es'
                     tmpLog.debug(tmpMsg)
                     continue
             else:
                 if tmpSiteSpec.getJobSeed() == 'es':
                     tmpMsg = '  skip site={0} since only EventService is allowed '.format(tmpSiteName)
                     tmpMsg += 'criteria=-nones'
                     tmpLog.debug(tmpMsg)
                     continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed EventService check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for transferring
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limit
         def_maxTransferring = 2000 
         if tmpSiteSpec.transferringlimit == 0:
             # use default value
             maxTransferring   = def_maxTransferring
         else:
             maxTransferring = tmpSiteSpec.transferringlimit
         # check at the site
         nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName)
         nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName)
         if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']:
             tmpLog.debug('  skip site=%s due to too many transferring=%s greater than max(%s,2x%s) criteria=-transferring' % \
                              (tmpSiteName,nTraJobs,def_maxTransferring,nRunJobs))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for T1 weight
     t1Weight = taskSpec.getT1Weight()
     if t1Weight == 0:
         # use T1 weight in cloudconfig
         t1Weight = self.siteMapper.getCloud(cloudName)['weight']
     if t1Weight < 0:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             if not tmpSiteName in t1Sites:
                 tmpLog.debug('  skip site={0} due to negative T1 weight criteria=-t1weight'.format(tmpSiteName))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         t1Weight = 1
     t1Weight = max(t1Weight,t1WeightForHighPrio)
     tmpLog.debug('T1 weight {0}'.format(t1Weight))
     tmpLog.debug('{0} candidates passed T1 weight check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for nPilot
     nPilotMap = {}
     if not sitePreAssigned:
         nWNmap = self.taskBufferIF.getCurrentSiteData()
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             # check at the site
             nPilot = 0
             if nWNmap.has_key(tmpSiteName):
                 nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
             if nPilot == 0 and not 'test' in taskSpec.prodSourceLabel:
                 tmpLog.debug('  skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName)
                 continue
             newScanSiteList.append(tmpSiteName)
             nPilotMap[tmpSiteName] = nPilot
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # get available files
     normalizeFactors = {}        
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper,
                                                                        ignoreCC=True)
             # disable file lookup for merge jobs or secondary datasets
             checkCompleteness = True
             useCompleteOnly = False
             if inputChunk.isMerging:
                 checkCompleteness = False
             if not datasetSpec.isMaster():
                 useCompleteOnly = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[1],
                                                         checkCompleteness=checkCompleteness,
                                                         storageToken=datasetSpec.storageToken,
                                                         useCompleteOnly=useCompleteOnly)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
         # loop over all sites to get the size of available files
         for tmpSiteName in scanSiteList:
             if not normalizeFactors.has_key(tmpSiteName):
                 normalizeFactors[tmpSiteName] = 0
             # get the total size of available files
             if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName):
                 availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName]
                 for tmpFileSpec in \
                         availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']:
                     normalizeFactors[tmpSiteName] += tmpFileSpec.fsize
     # get max total size
     tmpTotalSizes = normalizeFactors.values()
     tmpTotalSizes.sort()
     if tmpTotalSizes != []:
         totalSize = tmpTotalSizes.pop()
     else:
         totalSize = 0
     ######################################
     # calculate weight
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     tmpLog.debug('calculate weight and check cap for {0} candidates'.format(len(scanSiteList)))
     weightMapPrimary = {}
     weightMapSecondary = {}
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',None,taskSpec.workQueue_ID)
         nDefined   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'definied',None,taskSpec.workQueue_ID) + self.getLiveCount(tmpSiteName)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',None,taskSpec.workQueue_ID)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,taskSpec.workQueue_ID) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,taskSpec.workQueue_ID)
         nStarting  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting',None,taskSpec.workQueue_ID)
         if tmpSiteName in nPilotMap:
             nPilot = nPilotMap[tmpSiteName]
         else:
             nPilot = 0
         manyAssigned = float(nAssigned + 1) / float(nActivated + 1)
         manyAssigned = min(2.0,manyAssigned)
         manyAssigned = max(1.0,manyAssigned)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + nDefined + 1) / manyAssigned
         weightStr = 'nRun={0} nAct={1} nAss={2} nStart={3} nDef={4} totalSize={5} manyAss={6} nPilot={7} '.format(nRunning,nActivated,nAssigned,
                                                                                                                   nStarting,nDefined,
                                                                                                                   totalSize,manyAssigned,
                                                                                                                   nPilot)
         # normalize weights by taking data availability into account
         if totalSize != 0:
             weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize)
             weightStr += 'availableSize={0} '.format(normalizeFactors[tmpSiteName])
         # T1 weight
         if tmpSiteName in t1Sites+sitesShareSeT1:
             weight *= t1Weight
             weightStr += 't1W={0} '.format(t1Weight)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight and params
         siteCandidateSpec.weight = weight
         siteCandidateSpec.nRunningJobs = nRunning
         siteCandidateSpec.nQueuedJobs = nActivated + nAssigned + nStarting
         siteCandidateSpec.nAssignedJobs = nAssigned
         # set available files
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             if availableFiles.has_key(tmpSiteName):
                 siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                 siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                 siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                 siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
         # check if site is locked for WORLD
         lockedByBrokerage = False
         if taskSpec.useWorldCloud():
             lockedByBrokerage = self.checkSiteLock(taskSpec.vo,taskSpec.prodSourceLabel,
                                                    tmpSiteName,taskSpec.workQueue_ID)
         # check cap with nRunning
         cutOffValue = 20
         cutOffFactor = 2 
         nRunningCap = max(cutOffValue,cutOffFactor*nRunning)
         nRunningCap = max(nRunningCap,nPilot)
         okMsg = '  use site={0} with weight={1} {2} criteria=+use'.format(tmpSiteName,weight,weightStr)
         okAsPrimay = False
         if lockedByBrokerage:
             ngMsg = '  skip site={0} due to locked by another brokerage '.format(tmpSiteName)
             ngMsg += 'criteria=-lock'
         elif (nDefined+nActivated+nAssigned+nStarting) > nRunningCap:
             ngMsg = '  skip site={0} due to nDefined+nActivated+nAssigned+nStarting={1} '.format(tmpSiteName,
                                                                                                  nDefined+nActivated+nAssigned+nStarting)
             ngMsg += 'greater than max({0},{1}*nRunning={1}*{2},nPilot={3}) '.format(cutOffValue,
                                                                                      cutOffFactor,                                  
                                                                                      nRunning,                                      
                                                                                      nPilot)
             ngMsg += 'criteria=-cap'
         else:
             ngMsg = '  skip site={0} due to low weight '.format(tmpSiteName)
             ngMsg += 'criteria=-loweigh'
             okAsPrimay = True
         # use primay if cap/lock check is passed
         if okAsPrimay:
             weightMap = weightMapPrimary
         else:
             weightMap = weightMapSecondary
         # add weight
         if not weight in weightMap:
             weightMap[weight] = []
         weightMap[weight].append((siteCandidateSpec,okMsg,ngMsg))
     # use second candidates if no primary candidates passed cap/lock check
     if weightMapPrimary == {}:
         tmpLog.debug('use second candidates since no sites pass cap/lock check')
         weightMap = weightMapSecondary
         # use hightest 3 weights                                                                                                                                                  
         weightRank = 3
     else:
         weightMap = weightMapPrimary
         # use all weights
         weightRank = None
         # dump NG message
         for tmpWeight in weightMapSecondary.keys():
             for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMapSecondary[tmpWeight]:
                 tmpLog.debug(tmpNgMsg)
     # max candidates for WORLD
     if taskSpec.useWorldCloud():
         maxSiteCandidates = 10
     else:
         maxSiteCandidates = None
     newScanSiteList = []
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightIdx,tmpWeight in enumerate(weightList):
         for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMap[tmpWeight]:
             if (weightRank == None or weightIdx < weightRank) and \
                     (maxSiteCandidates == None or len(newScanSiteList) < maxSiteCandidates):
                 # use site
                 tmpLog.debug(tmpOkMsg)
                 newScanSiteList.append(siteCandidateSpec.siteName)
                 inputChunk.addSiteCandidate(siteCandidateSpec)
             else:
                 # dump NG message
                 tmpLog.debug(tmpNgMsg)
     scanSiteList = newScanSiteList
     # final check
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     # lock sites for WORLD
     if taskSpec.useWorldCloud():
         for tmpSiteName in scanSiteList:
             self.lockSite(taskSpec.vo,taskSpec.prodSourceLabel,tmpSiteName,taskSpec.workQueue_ID)
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     # return
     self.sendLogMessage(tmpLog)
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk

Example #7

Show file

 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get sites in the cloud
     if not taskSpec.site in ['',None]:
         scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(inputChunk.getPreassignedSite()))
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
     # get job statistics
     tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # T1 
     t1Sites = [self.siteMapper.getCloud(cloudName)['source']]
     # hospital sites
     if self.hospitalQueueMap.has_key(cloudName):
         t1Sites += self.hospitalQueueMap[cloudName]
     # MP    
     if taskSpec.coreCount != None and taskSpec.coreCount > 1:
         useMP = True
     else:
         useMP = False
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status != 'online':
             skipFlag = True
         if not skipFlag:    
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for reprocessing
     if taskSpec.processingType == 'reprocessing':
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check schedconfig.validatedreleases
             if tmpSiteSpec.validatedreleases == ['True']:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip %s due to validatedreleases != True' % tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for high priorities
     if taskSpec.currentPriority >= 950 and not useMP:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:            
             if tmpSiteName in t1Sites:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip %s due to high prio which needs to run at T1' % tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for high prio'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for data availability
     for datasetSpec in inputChunk.getDatasets():
         datasetName = datasetSpec.datasetName
         if not self.dataSiteMap.has_key(datasetName):
             # get the list of sites where data is available
             tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName))
             tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,
                                                              self.ddmIF,datasetName)
             if tmpSt == self.SC_FAILED:
                 tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
             if tmpSt == self.SC_FATAL:
                 tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retFatal
             # append
             self.dataSiteMap[datasetName] = tmpRet
             tmpLog.debug('map of data availability : {0}'.format(str(tmpRet)))
         # check if T1 has the data
         if self.dataSiteMap[datasetName].has_key(cloudName):
             cloudHasData = True
         else:
             cloudHasData = False
         t1hasData = False
         if cloudHasData:
             for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems():
                 if tmpSeVal['state'] == 'complete':
                     t1hasData = True
                     break
             # T1 has incomplete data while no data at T2
             if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] == []:
                 # use incomplete data at T1 anyway
                 t1hasData = True
         # data is missing at T1         
         if not t1hasData:
             tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName))
             # make subscription to T1
             # FIXME
             pass
             # use T2 until data is complete at T1
             newScanSiteList = []
             for tmpSiteName in scanSiteList:                    
                 if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']:
                     newScanSiteList.append(tmpSiteName)
                 else:
                     tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                     if tmpSiteSpec.cloud != cloudName:
                         tmpLog.debug('  skip %s due to foreign T2' % tmpSiteName)
                     else:
                         tmpLog.debug('  skip %s due to missing data at T2' % tmpSiteName)
             scanSiteList = newScanSiteList
             tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName))
             if scanSiteList == []:
                 tmpLog.error('no candidates')
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
     ######################################
     # selection for fairshare
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,tmpLog):
             tmpLog.debug('  skip {0} due to zero share'.format(tmpSiteName))
             continue
         newScanSiteList.append(tmpSiteName)                
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for I/O intensive tasks
     # FIXME
     pass
     ######################################
     # selection for MP
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if (useMP and tmpSiteSpec.coreCount > 1) or \
            (not useMP and tmpSiteSpec.coreCount in [0,1,None]):
                 newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to core mismatch site:%s != task:%s' % \
                          (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None:
             # only cache is checked for normal tasks
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      caches=taskSpec.transHome,
                                                                      cmtConfig=taskSpec.architecture)
         else:
             # nightlies
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      releases='nightlies',
                                                                      cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteSpec.cloud in ['ND'] or \
                tmpSiteName in ['CERN-RELEASE']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip %s due to missing rel/cache %s:%s' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList),
                                                                               taskSpec.transHome,
                                                                               taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = taskSpec.ramCount
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug('  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.maxmemory,
                                                                                                       minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug('  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCount = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCount = minDiskCount / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir:
             tmpLog.debug('  skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                  tmpSiteSpec.maxwdir,
                                                                                  minDiskCount))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # don't check for T1
         if tmpSiteName in t1Sites:
             pass
         else:
             # check at the site
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # the number of jobs which will produce outputs
             nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running')
             # the size of input files which will be copied to the site
             movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName)
             if movingInputSize == None:
                 tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
             # free space - inputs - outputs(250MB*nJobs) must be >= 200GB
             outSizePerJob = 0.250
             diskThreshold = 200
             tmpSpaceSize = tmpSiteSpec.space - movingInputSize - nRemJobs * outSizePerJob
             if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
                 tmpLog.debug('  skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteSpec.space,
                                                                                                     movingInputSize,outSizePerJob,
                                                                                                     nRemJobs,diskThreshold))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for transferring
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limit
         def_maxTransferring = 2000 
         if tmpSiteSpec.transferringlimit == 0:
             # use default value
             maxTransferring   = def_maxTransferring
         else:
             maxTransferring = tmpSiteSpec.transferringlimit
         # check at the site
         nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName)
         nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName)
         if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']:
             tmpLog.debug('  skip %s due to too many transferring %s > max(%s,2x%s)' % \
                          (tmpSiteName,nTraJobs,def_maxTransferring,nRunJobs))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # get available files
     totalSize = 0
     normalizeFactors = {}        
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper)
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[1])
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
         # get total size
         totalSize += datasetSpec.getSize()
         # loop over all sites to get the size of available files
         for tmpSiteName in scanSiteList:
             if not normalizeFactors.has_key(tmpSiteName):
                 normalizeFactors[tmpSiteName] = 0
             # get the total size of available files
             if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName):
                 availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName]
                 for tmpFileSpec in \
                         availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']:
                     normalizeFactors[tmpSiteName] += tmpFileSpec.fsize
     ######################################
     # calculate weight
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 taskSpec.currentPriority)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     for tmpSiteName in scanSiteList:
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',cloudName,taskSpec.workQueue_ID)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',cloudName,taskSpec.workQueue_ID)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',cloudName,taskSpec.workQueue_ID)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
         # normalize weights by taking data availability into account
         if totalSize != 0:
             weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # set available files
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             if availableFiles.has_key(tmpSiteName):
                 siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                 siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                 siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                 siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
         # append        
         inputChunk.addSiteCandidate(siteCandidateSpec)
         tmpLog.debug('  use {0} with weight={1}'.format(tmpSiteName,weight))
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk