Ejemplo n.º 1
0
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID,
                                                                datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get sites in the cloud
     sitePreAssigned = False
     siteListPreAssigned = False
     if not taskSpec.site in ['',None]:
         if ',' in taskSpec.site:
             # site list
             siteListPreAssigned = True
             scanSiteList = taskSpec.site.split(',')
         else:
             # site
             sitePreAssigned = True
             scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned criteria=+preassign'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         siteListPreAssigned = True
         scanSiteList = DataServiceUtils.getSitesShareDDM(self.siteMapper,inputChunk.getPreassignedSite())
         scanSiteList.append(inputChunk.getPreassignedSite())
         tmpMsg = 'use site={0} since they share DDM endpoints with orinal_site={1} which is pre-assigned in masterDS '.format(str(scanSiteList),
                                                                                                                               inputChunk.getPreassignedSite())
         tmpMsg += 'criteria=+premerge'
         tmpLog.debug(tmpMsg)
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
     # get job statistics
     tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     # T1 
     if not taskSpec.useWorldCloud():
         t1Sites = [self.siteMapper.getCloud(cloudName)['source']]
         # hospital sites
         if self.hospitalQueueMap.has_key(cloudName):
             t1Sites += self.hospitalQueueMap[cloudName]
     else:
         # get destination for WORLD cloud
         t1Sites = []
         tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,datasetTypes=['log'])
         for datasetSpec in datasetSpecList:
             if not datasetSpec.destination in t1Sites:
                 t1Sites.append(datasetSpec.destination)
     # sites sharing SE with T1
     sitesShareSeT1 = DataServiceUtils.getSitesShareDDM(self.siteMapper,t1Sites[0])
     # all T1
     allT1Sites = self.getAllT1Sites()
     # core count
     if inputChunk.isMerging and taskSpec.mergeCoreCount != None:
         taskCoreCount = taskSpec.mergeCoreCount
     else:
         taskCoreCount = taskSpec.coreCount
     # MP
     if taskCoreCount != None and taskCoreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskCoreCount == 0:
         # use MCORE and normal 
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     # get workQueue
     workQueue = self.taskBufferIF.getWorkQueueMap().getQueueWithID(taskSpec.workQueue_ID)
     ######################################
     # selection for status
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check site status
             skipFlag = False
             if tmpSiteSpec.status != 'online':
                 skipFlag = True
             if not skipFlag:    
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for reprocessing
     if taskSpec.processingType == 'reprocessing':
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check schedconfig.validatedreleases
             if tmpSiteSpec.validatedreleases == ['True']:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to validatedreleases <> True criteria=-validated' % tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for high priorities
     t1WeightForHighPrio = 1
     if (taskSpec.currentPriority >= 900 or inputChunk.useScout()) \
             and not sitePreAssigned and not siteListPreAssigned:
         t1WeightForHighPrio = 100
         newScanSiteList = []
         for tmpSiteName in scanSiteList:            
             if tmpSiteName in t1Sites+sitesShareSeT1+allT1Sites:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpMsg = '  skip site={0} due to highPrio/scouts which needs to run at T1 or sites associated with {1} T1 SE '.format(tmpSiteName,
                                                                                                                                       cloudName)
                 tmpMsg += 'criteria=-scoutprio'
                 tmpLog.debug(tmpMsg)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for highPrio/scouts'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection to avoid slow or inactive sites
     if (taskSpec.currentPriority >= 800 or inputChunk.useScout() or \
             inputChunk.isMerging or taskSpec.mergeOutput()) \
             and not sitePreAssigned:
         # get inactive sites
         inactiveTimeLimit = 2
         inactiveSites = self.taskBufferIF.getInactiveSites_JEDI('production',inactiveTimeLimit)
         newScanSiteList = []
         tmpMsgList = []
         for tmpSiteName in scanSiteList:
             nToGetAll = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                 AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'starting')
             if tmpSiteName in ['BNL_CLOUD','BNL_CLOUD_MCORE','ATLAS_OPP_OSG']:
                 tmpMsg = '  skip site={0} since high prio/scouts/merge needs to avoid slow sites '.format(tmpSiteName)
                 tmpMsg += 'criteria=-slow'
                 tmpMsgList.append(tmpMsg)
             elif tmpSiteName in inactiveSites and nToGetAll > 0:
                 tmpMsg = '  skip site={0} since high prio/scouts/merge needs to avoid inactive sites (laststart is older than {1}h) '.format(tmpSiteName,
                                                                                                                                              inactiveTimeLimit)
                 tmpMsg += 'criteria=-inactive'
                 tmpMsgList.append(tmpMsg)
             else:
                 newScanSiteList.append(tmpSiteName)
         if newScanSiteList != []:
             scanSiteList = newScanSiteList
             for tmpMsg in tmpMsgList:
                 tmpLog.debug(tmpMsg)
         tmpLog.debug('{0} candidates passed for slowness/inactive check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for data availability
     if not sitePreAssigned and not siteListPreAssigned:
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             # ignore DBR
             if DataServiceUtils.isDBR(datasetName):
                 continue
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,
                                                                  self.ddmIF,datasetName,
                                                                  datasetSpec.storageToken)
                 if tmpSt == self.SC_FAILED:
                     tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == self.SC_FATAL:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 tmpLog.debug('map of data availability : {0}'.format(str(tmpRet)))
             """
             # check if T1 has the data
             if self.dataSiteMap[datasetName].has_key(cloudName):
                 cloudHasData = True
             else:
                 cloudHasData = False
             t1hasData = False
             if cloudHasData:
                 for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems():
                     if tmpSeVal['state'] == 'complete':
                         t1hasData = True
                         break
                 # T1 has incomplete data while no data at T2
                 if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] == []:
                     # use incomplete data at T1 anyway
                     t1hasData = True
             # data is missing at T1         
             if not t1hasData:
                 tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName))
                 # make subscription to T1
                 # FIXME
                 pass
                 # use T2 until data is complete at T1
                 newScanSiteList = []
                 for tmpSiteName in scanSiteList:                    
                     if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']:
                         newScanSiteList.append(tmpSiteName)
                     else:
                         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                         if tmpSiteSpec.cloud != cloudName:
                             tmpLog.debug('  skip %s due to foreign T2' % tmpSiteName)
                         else:
                             tmpLog.debug('  skip %s due to missing data at T2' % tmpSiteName)
                 scanSiteList = newScanSiteList
                 tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName))
                 if scanSiteList == []:
                     tmpLog.error('no candidates')
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retTmpError
             """        
     ######################################
     # selection for fairshare
     if not (workQueue.queue_type in ['managed'] and workQueue.queue_name in ['test','validation']):
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,inputChunk.isMerging,tmpLog):
                 tmpLog.debug('  skip site={0} due to zero share criteria=-zeroshare'.format(tmpSiteName))
                 continue
             newScanSiteList.append(tmpSiteName)                
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for I/O intensive tasks
     # FIXME
     pass
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                     (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                     newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch site:%s <> task:%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskCoreCount))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None:
             # only cache is checked for normal tasks
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      caches=taskSpec.transHome,
                                                                      cmtConfig=taskSpec.architecture)
         else:
             # nightlies
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      releases='CVMFS')
             #                                                         releases='nightlies',
             #                                                         cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteName in ['CERN-RELEASE']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing cache=%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList),
                                                                               taskSpec.transHome,
                                                                               taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = max(taskSpec.ramCount, inputChunk.ramCount)
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpMsg = '  skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(tmpSiteName,
                                                                                                                 tmpSiteSpec.maxmemory,
                                                                                                                 minRamCount)
                 tmpMsg += 'criteria=-lowmemory'
                 tmpLog.debug(tmpMsg)
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpMsg = '  skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(tmpSiteName,
                                                                                                                   tmpSiteSpec.minmemory,
                                                                                                                   minRamCount)
                 tmpMsg += 'criteria=-highmemory'
                 tmpLog.debug(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check {1}({2})'.format(len(scanSiteList),
                                                                           minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     if taskSpec.outputScaleWithEvents():
         minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(getNumEvents=True)
     else:
         minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(effectiveSize=True)
     minDiskCount = minDiskCount + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCount = minDiskCount / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir:
             tmpMsg = '  skip site={0} due to small scratch disk {1} less than {2} '.format(tmpSiteName,
                                                                                            tmpSiteSpec.maxwdir,
                                                                                            minDiskCount)
             tmpMsg += 'criteria=-disk'
             tmpLog.debug(tmpMsg)
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check minDiskCount>{1}MB'.format(len(scanSiteList),
                                                                                       minDiskCount))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # don't check for T1
         if tmpSiteName in t1Sites:
             pass
         else:
             # check at the site
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # the number of jobs which will produce outputs
             nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'throttled') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running')
             # the size of input files which will be copied to the site
             movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName)
             if movingInputSize == None:
                 tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 self.sendLogMessage(tmpLog)
                 return retTmpError
             # free space - inputs - outputs(250MB*nJobs) must be >= 200GB
             outSizePerJob = 0.250
             diskThreshold = 200
             tmpSiteSpaceMap = self.ddmIF.getRseUsage(tmpSiteSpec.ddm)
             if tmpSiteSpaceMap != {}:
                 tmpSiteFreeSpace = tmpSiteSpaceMap['free']
                 tmpSpaceSize = tmpSiteFreeSpace - movingInputSize - nRemJobs * outSizePerJob
                 if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
                     tmpLog.debug('  skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteFreeSpace,
                                                                                                         movingInputSize,outSizePerJob,
                                                                                                         nRemJobs,diskThreshold))
                     continue
             # check if blacklisted
             if self.ddmIF.isBlackListedEP(tmpSiteSpec.ddm):
                 tmpLog.debug('  skip site={0} since endpoint={1} is blacklisted in DDM criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     if not taskSpec.useHS06():
         tmpMaxAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
         minWalltime = taskSpec.walltime * tmpMaxAtomSize
         strMinWalltime = 'walltime*inputSize={0}*{1}'.format(taskSpec.walltime,tmpMaxAtomSize)
     else:
         tmpMaxAtomSize = inputChunk.getMaxAtomSize(getNumEvents=True)
         minWalltime = taskSpec.cpuTime * tmpMaxAtomSize
         strMinWalltime = 'cpuTime*nEventsPerJob={0}*{1}'.format(taskSpec.cpuTime,tmpMaxAtomSize)
     if minWalltime != None or inputChunk.useScout():
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             siteMaxTime = tmpSiteSpec.maxtime
             origSiteMaxTime = siteMaxTime
             # send scout, merge, or walltime-undefined jobs only to sites whose walltime limit is at least 1 day
             if inputChunk.useScout() or inputChunk.isMerging or \
                     (taskSpec.walltime in [0,None] and taskSpec.walltimeUnit in ['',None] and taskSpec.cpuTimeUnit in ['',None]):
                 minTimeForZeroWalltime = 24*60*60
                 if siteMaxTime != 0 and siteMaxTime < minTimeForZeroWalltime:
                     tmpMsg = '  skip site={0} due to site walltime {1} (site upper limit) insufficient '.format(tmpSiteName,
                                                                                                                 siteMaxTime)
                     if inputChunk.useScout():
                         tmpMsg += 'for scouts ({0} at least) '.format(minTimeForZeroWalltime)
                         tmpMsg += 'criteria=-scoutwalltime'
                     else:
                         tmpMsg += 'for zero walltime ({0} at least) '.format(minTimeForZeroWalltime)
                         tmpMsg += 'criteria=-zerowalltime'
                     tmpLog.debug(tmpMsg)
                     continue
             # check max walltime at the site
             tmpSiteStr = '{0}'.format(siteMaxTime)
             if taskSpec.useHS06():
                 oldSiteMaxTime = siteMaxTime
                 siteMaxTime -= taskSpec.baseWalltime
                 tmpSiteStr = '({0}-{1})'.format(oldSiteMaxTime,taskSpec.baseWalltime)
             if not siteMaxTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]:
                 siteMaxTime *= tmpSiteSpec.coreCount
                 tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount)
             if taskSpec.useHS06():
                 if not siteMaxTime in [None,0] and not tmpSiteSpec.corepower in [None,0]:
                     siteMaxTime *= tmpSiteSpec.corepower
                     tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower)
                 siteMaxTime *= float(taskSpec.cpuEfficiency) / 100.0
                 siteMaxTime = long(siteMaxTime)
                 tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency)
             if origSiteMaxTime != 0 and minWalltime > siteMaxTime:
                 tmpMsg = '  skip site={0} due to short site walltime {1} (site upper limit) less than {2} '.format(tmpSiteName,
                                                                                                                    tmpSiteStr,
                                                                                                                    strMinWalltime)
                 tmpMsg += 'criteria=-shortwalltime'
                 tmpLog.debug(tmpMsg)
                 continue
             # check min walltime at the site
             siteMinTime = tmpSiteSpec.mintime
             origSiteMinTime = siteMinTime
             tmpSiteStr = '{0}'.format(siteMinTime)
             if taskSpec.useHS06():
                 oldSiteMinTime = siteMinTime
                 siteMinTime -= taskSpec.baseWalltime
                 tmpSiteStr = '({0}-{1})'.format(oldSiteMinTime,taskSpec.baseWalltime)
             if not siteMinTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]:
                 siteMinTime *= tmpSiteSpec.coreCount
                 tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount)
             if taskSpec.useHS06():
                 if not siteMinTime in [None,0] and not tmpSiteSpec.corepower in [None,0]:
                     siteMinTime *= tmpSiteSpec.corepower
                     tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower)
                 siteMinTime *= float(taskSpec.cpuEfficiency) / 100.0
                 siteMinTime = long(siteMinTime)
                 tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency)
             if origSiteMinTime != 0 and minWalltime < siteMinTime:
                 tmpMsg = '  skip site {0} due to short job walltime {1} (site lower limit) greater than {2} '.format(tmpSiteName,
                                                                                                                      tmpSiteStr,
                                                                                                                      strMinWalltime)
                 tmpMsg += 'criteria=-longwalltime'
                 tmpLog.debug(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         if not taskSpec.useHS06():
             tmpLog.debug('{0} candidates passed walltime check {1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         else:
             tmpLog.debug('{0} candidates passed walltime check {1}({2}*nEventsPerJob)'.format(len(scanSiteList),strMinWalltime,taskSpec.cpuTimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for network connectivity
     if not sitePreAssigned:
         ipConnectivity = taskSpec.getIpConnectivity()
         if ipConnectivity != None:
             newScanSiteList = []
             for tmpSiteName in scanSiteList:
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # check at the site
                 if tmpSiteSpec.wnconnectivity == 'full':
                     pass
                 elif tmpSiteSpec.wnconnectivity == 'http' and ipConnectivity == 'http':
                     pass
                 else:
                     tmpMsg = '  skip site={0} due to insufficient connectivity (site={1}) for task={2} '.format(tmpSiteName,
                                                                                                                 tmpSiteSpec.wnconnectivity,
                                                                                                                 ipConnectivity)
                     tmpMsg += 'criteria=-network'
                     tmpLog.debug(tmpMsg)
                     continue
                 newScanSiteList.append(tmpSiteName)
             scanSiteList = newScanSiteList
             tmpLog.debug('{0} candidates passed network check ({1})'.format(len(scanSiteList),
                                                                             ipConnectivity))
             if scanSiteList == []:
                 tmpLog.error('no candidates')
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 self.sendLogMessage(tmpLog)
                 return retTmpError
     ######################################
     # selection for event service
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # event service
             if taskSpec.useEventService():
                 if tmpSiteSpec.getJobSeed() == 'std':
                     tmpMsg = '  skip site={0} since EventService is not allowed '.format(tmpSiteName)
                     tmpMsg += 'criteria=-es'
                     tmpLog.debug(tmpMsg)
                     continue
             else:
                 if tmpSiteSpec.getJobSeed() == 'es':
                     tmpMsg = '  skip site={0} since only EventService is allowed '.format(tmpSiteName)
                     tmpMsg += 'criteria=-nones'
                     tmpLog.debug(tmpMsg)
                     continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed EventService check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for transferring
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limit
         def_maxTransferring = 2000 
         if tmpSiteSpec.transferringlimit == 0:
             # use default value
             maxTransferring   = def_maxTransferring
         else:
             maxTransferring = tmpSiteSpec.transferringlimit
         # check at the site
         nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName)
         nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName)
         if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']:
             tmpLog.debug('  skip site=%s due to too many transferring=%s greater than max(%s,2x%s) criteria=-transferring' % \
                              (tmpSiteName,nTraJobs,def_maxTransferring,nRunJobs))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for T1 weight
     t1Weight = taskSpec.getT1Weight()
     if t1Weight == 0:
         # use T1 weight in cloudconfig
         t1Weight = self.siteMapper.getCloud(cloudName)['weight']
     if t1Weight < 0:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             if not tmpSiteName in t1Sites:
                 tmpLog.debug('  skip site={0} due to negative T1 weight criteria=-t1weight'.format(tmpSiteName))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         t1Weight = 1
     t1Weight = max(t1Weight,t1WeightForHighPrio)
     tmpLog.debug('T1 weight {0}'.format(t1Weight))
     tmpLog.debug('{0} candidates passed T1 weight check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for nPilot
     nPilotMap = {}
     if not sitePreAssigned:
         nWNmap = self.taskBufferIF.getCurrentSiteData()
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             # check at the site
             nPilot = 0
             if nWNmap.has_key(tmpSiteName):
                 nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
             if nPilot == 0 and not 'test' in taskSpec.prodSourceLabel:
                 tmpLog.debug('  skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName)
                 continue
             newScanSiteList.append(tmpSiteName)
             nPilotMap[tmpSiteName] = nPilot
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # get available files
     normalizeFactors = {}        
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper,
                                                                        ignoreCC=True)
             # disable file lookup for merge jobs or secondary datasets
             checkCompleteness = True
             useCompleteOnly = False
             if inputChunk.isMerging:
                 checkCompleteness = False
             if not datasetSpec.isMaster():
                 useCompleteOnly = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[1],
                                                         checkCompleteness=checkCompleteness,
                                                         storageToken=datasetSpec.storageToken,
                                                         useCompleteOnly=useCompleteOnly)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
         # loop over all sites to get the size of available files
         for tmpSiteName in scanSiteList:
             if not normalizeFactors.has_key(tmpSiteName):
                 normalizeFactors[tmpSiteName] = 0
             # get the total size of available files
             if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName):
                 availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName]
                 for tmpFileSpec in \
                         availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']:
                     normalizeFactors[tmpSiteName] += tmpFileSpec.fsize
     # get max total size
     tmpTotalSizes = normalizeFactors.values()
     tmpTotalSizes.sort()
     if tmpTotalSizes != []:
         totalSize = tmpTotalSizes.pop()
     else:
         totalSize = 0
     ######################################
     # calculate weight
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     tmpLog.debug('calculate weight and check cap for {0} candidates'.format(len(scanSiteList)))
     weightMapPrimary = {}
     weightMapSecondary = {}
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',None,taskSpec.workQueue_ID)
         nDefined   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'definied',None,taskSpec.workQueue_ID) + self.getLiveCount(tmpSiteName)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',None,taskSpec.workQueue_ID)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,taskSpec.workQueue_ID) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,taskSpec.workQueue_ID)
         nStarting  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting',None,taskSpec.workQueue_ID)
         if tmpSiteName in nPilotMap:
             nPilot = nPilotMap[tmpSiteName]
         else:
             nPilot = 0
         manyAssigned = float(nAssigned + 1) / float(nActivated + 1)
         manyAssigned = min(2.0,manyAssigned)
         manyAssigned = max(1.0,manyAssigned)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + nDefined + 1) / manyAssigned
         weightStr = 'nRun={0} nAct={1} nAss={2} nStart={3} nDef={4} totalSize={5} manyAss={6} nPilot={7} '.format(nRunning,nActivated,nAssigned,
                                                                                                                   nStarting,nDefined,
                                                                                                                   totalSize,manyAssigned,
                                                                                                                   nPilot)
         # normalize weights by taking data availability into account
         if totalSize != 0:
             weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize)
             weightStr += 'availableSize={0} '.format(normalizeFactors[tmpSiteName])
         # T1 weight
         if tmpSiteName in t1Sites+sitesShareSeT1:
             weight *= t1Weight
             weightStr += 't1W={0} '.format(t1Weight)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight and params
         siteCandidateSpec.weight = weight
         siteCandidateSpec.nRunningJobs = nRunning
         siteCandidateSpec.nQueuedJobs = nActivated + nAssigned + nStarting
         siteCandidateSpec.nAssignedJobs = nAssigned
         # set available files
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             if availableFiles.has_key(tmpSiteName):
                 siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                 siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                 siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                 siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
         # check if site is locked for WORLD
         lockedByBrokerage = False
         if taskSpec.useWorldCloud():
             lockedByBrokerage = self.checkSiteLock(taskSpec.vo,taskSpec.prodSourceLabel,
                                                    tmpSiteName,taskSpec.workQueue_ID)
         # check cap with nRunning
         cutOffValue = 20
         cutOffFactor = 2 
         nRunningCap = max(cutOffValue,cutOffFactor*nRunning)
         nRunningCap = max(nRunningCap,nPilot)
         okMsg = '  use site={0} with weight={1} {2} criteria=+use'.format(tmpSiteName,weight,weightStr)
         okAsPrimay = False
         if lockedByBrokerage:
             ngMsg = '  skip site={0} due to locked by another brokerage '.format(tmpSiteName)
             ngMsg += 'criteria=-lock'
         elif (nDefined+nActivated+nAssigned+nStarting) > nRunningCap:
             ngMsg = '  skip site={0} due to nDefined+nActivated+nAssigned+nStarting={1} '.format(tmpSiteName,
                                                                                                  nDefined+nActivated+nAssigned+nStarting)
             ngMsg += 'greater than max({0},{1}*nRunning={1}*{2},nPilot={3}) '.format(cutOffValue,
                                                                                      cutOffFactor,                                  
                                                                                      nRunning,                                      
                                                                                      nPilot)
             ngMsg += 'criteria=-cap'
         else:
             ngMsg = '  skip site={0} due to low weight '.format(tmpSiteName)
             ngMsg += 'criteria=-loweigh'
             okAsPrimay = True
         # use primary if cap/lock check is passed
         if okAsPrimay:
             weightMap = weightMapPrimary
         else:
             weightMap = weightMapSecondary
         # add weight
         if not weight in weightMap:
             weightMap[weight] = []
         weightMap[weight].append((siteCandidateSpec,okMsg,ngMsg))
     # use second candidates if no primary candidates passed cap/lock check
     if weightMapPrimary == {}:
         tmpLog.debug('use second candidates since no sites pass cap/lock check')
         weightMap = weightMapSecondary
         # use highest 3 weights
         weightRank = 3
     else:
         weightMap = weightMapPrimary
         # use all weights
         weightRank = None
         # dump NG message
         for tmpWeight in weightMapSecondary.keys():
             for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMapSecondary[tmpWeight]:
                 tmpLog.debug(tmpNgMsg)
     # max candidates for WORLD
     if taskSpec.useWorldCloud():
         maxSiteCandidates = 10
     else:
         maxSiteCandidates = None
     newScanSiteList = []
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightIdx,tmpWeight in enumerate(weightList):
         for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMap[tmpWeight]:
             if (weightRank == None or weightIdx < weightRank) and \
                     (maxSiteCandidates == None or len(newScanSiteList) < maxSiteCandidates):
                 # use site
                 tmpLog.debug(tmpOkMsg)
                 newScanSiteList.append(siteCandidateSpec.siteName)
                 inputChunk.addSiteCandidate(siteCandidateSpec)
             else:
                 # dump NG message
                 tmpLog.debug(tmpNgMsg)
     scanSiteList = newScanSiteList
     # final check
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     # lock sites for WORLD
     if taskSpec.useWorldCloud():
         for tmpSiteName in scanSiteList:
             self.lockSite(taskSpec.vo,taskSpec.prodSourceLabel,tmpSiteName,taskSpec.workQueue_ID)
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     # return
     self.sendLogMessage(tmpLog)
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk