break # check if jobs with the jobID have run recently varMap = {} varMap[':jediTaskID'] = jediTaskID varMap[':computingSite'] = computingSite varMap[':prodUserName'] = prodUserName varMap[':jobDefinitionID'] = jobDefinitionID varMap[':modificationTime'] = recentRuntimeLimit varMap[':jobStatus1'] = 'starting' _logger.debug(" rebro:%s/%s:ID=%s:%s jediTaskID=%s site=%s" % (iComb,nComb,jobDefinitionID, prodUserName,jediTaskID, computingSite)) iComb += 1 hasRecentJobs = False # check site if not siteMapper.checkSite(computingSite): _logger.debug(" -> skip unknown site=%s" % computingSite) continue # check site status tmpSiteStatus = siteMapper.getSite(computingSite).status if not tmpSiteStatus in ['offline','test']: # use normal time limit for nornal site status if maxModificationTime > normalTimeLimit: _logger.debug(" -> skip wait for normal timelimit=%s<maxModTime=%s" % (normalTimeLimit,maxModificationTime)) continue for tableName in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']: retU,resU = taskBuffer.querySQLS(sql % tableName, varMap) if resU == None: # database error raise RuntimeError,"failed to check modTime" if resU != []:
totalFlowFromSource = {} # loop over all sources to get total flows tmpLog.debug(" >>> checking limits") for sinkSite, sinkMap in wanMX.iteritems(): totalFlowToSink = 0 # loop over all sinks for sourceSite, sourceMap in sinkMap.iteritems(): # get total flows totalFlowToSink += sourceMap['flow'] if not totalFlowFromSource.has_key(sourceSite): totalFlowFromSource[sourceSite] = 0 totalFlowFromSource[sourceSite] += sourceMap['flow'] # check limit for sink tmpSiteSpec = siteMapper.getSite(sinkSite) if siteMapper.checkSite( sinkSite ) and tmpSiteSpec.wansinklimit * 1024 * 1024 * 1024 > totalFlowToSink: throttleForSink[sinkSite] = False tmpLog.debug( " release Sink {0} : {1}bps (total) < {2}Gbps (limit)".format( sinkSite, totalFlowToSink, tmpSiteSpec.wansinklimit)) else: throttleForSink[sinkSite] = True tmpLog.debug( " throttle Sink {0} : {1}bps (total) > {2}Gbps (limit)".format( sinkSite, totalFlowToSink, tmpSiteSpec.wansinklimit)) # check limit for source for sourceSite, totalFlow in totalFlowFromSource.iteritems(): tmpSiteSpec = siteMapper.getSite(sourceSite) if siteMapper.checkSite( sourceSite
_logger.debug("terminate since close to log-rotate time") break # check if jobs with the jobID have run recently varMap = {} varMap[':jediTaskID'] = jediTaskID varMap[':computingSite'] = computingSite varMap[':prodUserName'] = prodUserName varMap[':jobDefinitionID'] = jobDefinitionID varMap[':modificationTime'] = recentRuntimeLimit _logger.debug(" rebro:%s/%s:ID=%s:%s jediTaskID=%s site=%s" % (iComb,nComb,jobDefinitionID, prodUserName,jediTaskID, computingSite)) iComb += 1 hasRecentJobs = False # check site if not siteMapper.checkSite(computingSite): _logger.debug(" -> skip unknown site=%s" % computingSite) continue # check site status tmpSiteStatus = siteMapper.getSite(computingSite).status if not tmpSiteStatus in ['offline','test']: # use normal time limit for nornal site status if maxModificationTime > normalTimeLimit: _logger.debug(" -> skip wait for normal timelimit=%s<maxModTime=%s" % (normalTimeLimit,maxModificationTime)) continue for tableName in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']: retU,resU = taskBuffer.querySQLS(sql % tableName, varMap) if resU == None: # database error raise RuntimeError,"failed to check modTime" if resU != []:
def run(self):
    """Try to move the job identified by self.rPandaID to another analysis site.

    The method works in stages and gives up (returning None) as soon as one
    of them shows that rebrokerage is not applicable:

    1. Load the job via self.taskBuffer and check eligibility: output
       container, group jobs, processingType, jobsetID, metadata,
       --disableRebrokerage / --site / --libDS / --workingGroup options,
       self.checkRev() and shared libDS usage.
    2. Resolve self.excludedSite and self.cloud from the job metadata when
       they have not been set already.
    3. Collect the replica locations of the input datasets and keep the
       storage sites that hold a complete copy of every input dataset.
    4. Select up to 20 online ANALY_ sites matching those storage sites,
       the cloud, the exclusion list and the input size limit.
    5. Run brokerage on a dummy JobSpec and, unless self.simulation is set
       or the chosen site maps to the original storage, call
       self.prepareJob() and self.runSetUpper().

    Side effects visible here: self.job, self.excludedSite, self.cloud and
    self.brokerageInfo are updated.

    Any exception raised along the way is caught and logged; nothing is
    propagated to the caller.
    """
    try:
        # get job
        tmpJobs = self.taskBuffer.getFullJobStatus([self.rPandaID])
        if tmpJobs == [] or tmpJobs[0] is None:
            _logger.debug("cannot find job for PandaID=%s" % self.rPandaID)
            return
        self.job = tmpJobs[0]
        _logger.debug("%s start %s:%s:%s" % (self.token,self.job.jobDefinitionID,
                                             self.job.prodUserName,self.job.computingSite))
        # using output container
        if not self.job.destinationDBlock.endswith('/'):
            _logger.debug("%s ouput dataset container is required" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # FIXME : don't touch group jobs for now
        if self.job.destinationDBlock.startswith('group') and (not self.userRequest):
            _logger.debug("%s skip group jobs" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check processingType
        typesForRebro = ['pathena','prun','ganga','ganga-rbtest']
        if self.job.processingType not in typesForRebro:
            _logger.debug("%s skip processingType=%s not in %s" % \
                          (self.token,self.job.processingType,str(typesForRebro)))
            _logger.debug("%s end" % self.token)
            return
        # check jobsetID
        if self.job.jobsetID in [0,'NULL',None]:
            _logger.debug("%s jobsetID is undefined" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check metadata
        if self.job.metadata in [None,'NULL']:
            _logger.debug("%s metadata is unavailable" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check --disableRebrokerage
        # (ignored in simulation mode, with forceOpt, or on user request)
        match = re.search("--disableRebrokerage",self.job.metadata)
        if match is not None and (not self.simulation) and (not self.forceOpt) \
               and (not self.userRequest):
            _logger.debug("%s diabled rebrokerage" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check --site
        # (ignored in simulation mode, with forceOpt, or on user request)
        match = re.search("--site",self.job.metadata)
        if match is not None and (not self.simulation) and (not self.forceOpt) \
               and (not self.userRequest):
            _logger.debug("%s --site is used" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check --libDS
        match = re.search("--libDS",self.job.metadata)
        if match is not None:
            _logger.debug("%s --libDS is used" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check --workingGroup since it is site-specific
        match = re.search("--workingGroup",self.job.metadata)
        if match is not None:
            _logger.debug("%s workingGroup is specified" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # avoid too many rebrokerage
        if not self.checkRev():
            _logger.debug("%s avoid too many rebrokerage" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check if multiple JobIDs use the same libDS
        if self.bPandaID is not None and self.buildStatus not in ['finished','failed']:
            if self.minPandaIDlibDS is None or self.maxPandaIDlibDS is None:
                _logger.debug("%s max/min PandaIDs are unavailable for the libDS" % self.token)
                _logger.debug("%s end" % self.token)
                return
            tmpPandaIDsForLibDS = self.taskBuffer.getFullJobStatus([self.minPandaIDlibDS,
                                                                    self.maxPandaIDlibDS])
            if len(tmpPandaIDsForLibDS) != 2 or tmpPandaIDsForLibDS[0] is None \
                   or tmpPandaIDsForLibDS[1] is None:
                _logger.debug("%s failed to get max/min PandaIDs for the libDS" % self.token)
                _logger.debug("%s end" % self.token)
                return
            # the first and last jobs using the libDS must share one JobID
            if tmpPandaIDsForLibDS[0].jobDefinitionID != tmpPandaIDsForLibDS[1].jobDefinitionID:
                _logger.debug("%s multiple JobIDs use the libDS %s:%s %s:%s" % \
                              (self.token,tmpPandaIDsForLibDS[0].jobDefinitionID,
                               self.minPandaIDlibDS,tmpPandaIDsForLibDS[1].jobDefinitionID,
                               self.maxPandaIDlibDS))
                _logger.debug("%s end" % self.token)
                return
        # check excludedSite
        if self.excludedSite is None:
            self.excludedSite = []
            match = re.search("--excludedSite( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata)
            if match is not None:
                self.excludedSite = match.group(3).split(',')
        # remove ALL empty entries: each entry is used below as a regex and an
        # empty pattern matches every site name, which would exclude everything
        # (list.remove('') would only drop the first empty entry)
        self.excludedSite = [tmpExcSite for tmpExcSite in self.excludedSite if tmpExcSite != '']
        _logger.debug("%s excludedSite=%s" % (self.token,str(self.excludedSite)))
        # check cloud
        if self.cloud is None:
            match = re.search("--cloud( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata)
            if match is not None:
                self.cloud = match.group(3)
        _logger.debug("%s cloud=%s" % (self.token,self.cloud))
        # get inDS/LFNs
        status,tmpMapInDS,maxFileSize = self.taskBuffer.getInDatasetsForReBrokerage(self.jobID,
                                                                                    self.userName)
        if not status:
            # failed
            _logger.error("%s failed to get inDS/LFN from DB" % self.token)
            return
        status,inputDS = self.getListDatasetsUsedByJob(tmpMapInDS)
        if not status:
            # failed
            _logger.error("%s failed" % self.token)
            return
        # get replicas
        replicaMap = {}
        unknownSites = {}
        for tmpDS in inputDS:
            if tmpDS.endswith('/'):
                # container
                status,tmpRepMaps = self.getListDatasetReplicasInContainer(tmpDS)
            else:
                # normal dataset
                status,tmpRepMap = self.getListDatasetReplicas(tmpDS)
                tmpRepMaps = {tmpDS:tmpRepMap}
            if not status:
                # failed
                _logger.debug("%s failed" % self.token)
                return
            # make map per site
            for tmpDS,tmpRepMap in tmpRepMaps.iteritems():
                for tmpSite,tmpStat in tmpRepMap.iteritems():
                    # ignore special sites
                    if tmpSite in ['CERN-PROD_TZERO','CERN-PROD_DAQ','CERN-PROD_TMPDISK']:
                        continue
                    # ignore tape sites
                    if tmpSite.endswith('TAPE'):
                        continue
                    # keep sites with unknown replica info so that it can be refreshed later
                    if tmpStat[-1]['found'] is None:
                        unknownSites.setdefault(tmpDS,[]).append(tmpSite)
                    # ignore ToBeDeleted
                    if tmpStat[-1]['archived'] in ['ToBeDeleted',]:
                        continue
                    # aggregate all EOS endpoints
                    if tmpSite.startswith('CERN-PROD_EOS'):
                        tmpSite = 'CERN-PROD_EOS'
                    # aggregate all EOS TMP endpoints
                    if tmpSite.startswith('CERN-PROD_TMP'):
                        tmpSite = 'CERN-PROD_TMP'
                    # strip a trailing _xyzDISK token
                    tmpSite = re.sub('_[^_-]+DISK$','',tmpSite)
                    # strip a trailing _PERF-XYZ token
                    tmpSite = re.sub('_PERF-[^_-]+$','',tmpSite)
                    # strip a trailing _PHYS-XYZ token
                    tmpSite = re.sub('_PHYS-[^_-]+$','',tmpSite)
                    # patch for BNLPANDA
                    if tmpSite in ['BNLPANDA']:
                        tmpSite = 'BNL-OSG2'
                    # add to map
                    if tmpSite not in replicaMap:
                        replicaMap[tmpSite] = {}
                    replicaMap[tmpSite][tmpDS] = tmpStat[-1]
        _logger.debug("%s replica map -> %s" % (self.token,str(replicaMap)))
        # refresh replica info if needed
        self.refreshReplicaInfo(unknownSites)
        # instantiate SiteMapper
        siteMapper = SiteMapper(self.taskBuffer)
        # get original DDM
        origSiteDDM = self.getAggName(siteMapper.getSite(self.job.computingSite).ddm)
        # check all datasets
        maxDQ2Sites = []
        if inputDS != []:
            # loop over all sites
            for tmpSite,tmpDsVal in replicaMap.iteritems():
                # a site qualifies only when it has a complete copy of every input dataset
                appendFlag = True
                for tmpOrigDS in inputDS:
                    # check completeness
                    if tmpOrigDS in tmpDsVal and tmpDsVal[tmpOrigDS]['found'] is not None and \
                           tmpDsVal[tmpOrigDS]['total'] == tmpDsVal[tmpOrigDS]['found']:
                        pass
                    else:
                        appendFlag = False
                # append
                if appendFlag:
                    if tmpSite not in maxDQ2Sites:
                        maxDQ2Sites.append(tmpSite)
        _logger.debug("%s candidate DQ2s -> %s" % (self.token,str(maxDQ2Sites)))
        if inputDS != [] and maxDQ2Sites == []:
            _logger.debug("%s no DQ2 candidate" % self.token)
        else:
            maxPandaSites = []
            # original maxinputsize
            origMaxInputSize = siteMapper.getSite(self.job.computingSite).maxinputsize
            # look for Panda siteIDs
            for tmpSiteID,tmpSiteSpec in siteMapper.siteSpecList.iteritems():
                # use ANALY_ only
                if not tmpSiteID.startswith('ANALY_'):
                    continue
                # remove test and local
                if re.search('_test',tmpSiteID,re.I) is not None:
                    continue
                if re.search('_local',tmpSiteID,re.I) is not None:
                    continue
                # avoid same site
                if self.avoidSameSite and self.getAggName(tmpSiteSpec.ddm) == origSiteDDM:
                    continue
                # check DQ2 ID
                if self.cloud in [None,tmpSiteSpec.cloud] \
                       and (self.getAggName(tmpSiteSpec.ddm) in maxDQ2Sites or inputDS == []):
                    # excluded sites (each entry is applied as a regex to the siteID)
                    excludedFlag = False
                    for tmpExcSite in self.excludedSite:
                        if re.search(tmpExcSite,tmpSiteID) is not None:
                            excludedFlag = True
                            break
                    if excludedFlag:
                        _logger.debug("%s skip %s since excluded" % (self.token,tmpSiteID))
                        continue
                    # use online only
                    if tmpSiteSpec.status != 'online':
                        _logger.debug("%s skip %s status=%s" % (self.token,tmpSiteID,
                                                                 tmpSiteSpec.status))
                        continue
                    # check maxinputsize: use the original site's limit as the
                    # reference when the max input file size is unknown
                    tmpMaxInputSize = siteMapper.getSite(tmpSiteID).maxinputsize
                    if maxFileSize is None:
                        sizeToCheck = origMaxInputSize
                    else:
                        sizeToCheck = maxFileSize
                    if sizeToCheck > tmpMaxInputSize:
                        _logger.debug("%s skip %s due to maxinputsize" % (self.token,tmpSiteID))
                        continue
                    # append
                    if tmpSiteID not in maxPandaSites:
                        maxPandaSites.append(tmpSiteID)
            # choose at most 20 sites randomly to avoid too many lookup
            random.shuffle(maxPandaSites)
            maxPandaSites = maxPandaSites[:20]
            _logger.debug("%s candidate PandaSites -> %s" % (self.token,str(maxPandaSites)))
            # no Panda siteIDs
            if maxPandaSites == []:
                _logger.debug("%s no Panda site candidate" % self.token)
            else:
                # set AtlasRelease and cmtConfig to dummy job
                tmpJobForBrokerage = JobSpec()
                if self.job.AtlasRelease in ['NULL',None]:
                    tmpJobForBrokerage.AtlasRelease = ''
                else:
                    tmpJobForBrokerage.AtlasRelease = self.job.AtlasRelease
                # use nightlies
                matchNight = re.search('^AnalysisTransforms-.*_(rel_\d+)$',self.job.homepackage)
                if matchNight is not None:
                    tmpJobForBrokerage.AtlasRelease += ':%s' % matchNight.group(1)
                # use cache
                else:
                    matchCache = re.search('^AnalysisTransforms-([^/]+)',self.job.homepackage)
                    if matchCache is not None:
                        tmpJobForBrokerage.AtlasRelease = matchCache.group(1).replace('_','-')
                if self.job.cmtConfig not in ['NULL',None]:
                    tmpJobForBrokerage.cmtConfig = self.job.cmtConfig
                # memory size
                if self.job.minRamCount not in ['NULL',None,0]:
                    tmpJobForBrokerage.minRamCount = self.job.minRamCount
                # CPU count
                if self.job.maxCpuCount not in ['NULL',None,0]:
                    tmpJobForBrokerage.maxCpuCount = self.job.maxCpuCount
                # run brokerage
                brokerage.broker.schedule([tmpJobForBrokerage],self.taskBuffer,siteMapper,
                                          forAnalysis=True,setScanSiteList=maxPandaSites,
                                          trustIS=True,reportLog=True)
                newSiteID = tmpJobForBrokerage.computingSite
                self.brokerageInfo += tmpJobForBrokerage.brokerageErrorDiag
                _logger.debug("%s runBrokerage - > %s" % (self.token,newSiteID))
                # unknown site
                if not siteMapper.checkSite(newSiteID):
                    _logger.error("%s unknown site" % self.token)
                    _logger.debug("%s failed" % self.token)
                    return
                # get new site spec
                newSiteSpec = siteMapper.getSite(newSiteID)
                # avoid repetition
                if self.getAggName(newSiteSpec.ddm) == origSiteDDM:
                    _logger.debug("%s assigned to the same site %s " % (self.token,newSiteID))
                    _logger.debug("%s end" % self.token)
                    return
                # simulation mode: stop before anything is actually changed
                if self.simulation:
                    _logger.debug("%s end simulation" % self.token)
                    return
                # prepare jobs
                status = self.prepareJob(newSiteID,newSiteSpec)
                if status:
                    # run SetUpper
                    statusSetUp = self.runSetUpper()
                    if not statusSetUp:
                        _logger.debug("%s runSetUpper failed" % self.token)
                    else:
                        _logger.debug("%s successfully assigned to %s" % (self.token,newSiteID))
        _logger.debug("%s end" % self.token)
    except:
        # top-level boundary of run(): log whatever went wrong instead of propagating it
        errType,errValue,errTraceBack = sys.exc_info()
        _logger.error("%s run() : %s %s" % (self.token,errType,errValue))
throttleForSource = {} totalFlowFromSource = {} # loop over all sources to get total flows tmpLog.debug(" >>> checking limits") for sinkSite,sinkMap in wanMX.iteritems(): totalFlowToSink = 0 # loop over all sinks for sourceSite,sourceMap in sinkMap.iteritems(): # get total flows totalFlowToSink += sourceMap['flow'] if not totalFlowFromSource.has_key(sourceSite): totalFlowFromSource[sourceSite] = 0 totalFlowFromSource[sourceSite] += sourceMap['flow'] # check limit for sink tmpSiteSpec = siteMapper.getSite(sinkSite) if siteMapper.checkSite(sinkSite) and tmpSiteSpec.wansinklimit*1024*1024*1024 > totalFlowToSink: throttleForSink[sinkSite] = False tmpLog.debug(" release Sink {0} : {1}bps (total) < {2}Gbps (limit)".format(sinkSite,totalFlowToSink, tmpSiteSpec.wansinklimit)) else: throttleForSink[sinkSite] = True tmpLog.debug(" throttle Sink {0} : {1}bps (total) > {2}Gbps (limit)".format(sinkSite,totalFlowToSink, tmpSiteSpec.wansinklimit)) # check limit for source for sourceSite,totalFlow in totalFlowFromSource.iteritems(): tmpSiteSpec = siteMapper.getSite(sourceSite) if siteMapper.checkSite(sourceSite) and tmpSiteSpec.wansourcelimit*1024*1024*1024 > totalFlow: throttleForSource[sourceSite] = False tmpLog.debug(" release Src {0} : {1}bps (total) < {2}Gbps (limit)".format(sourceSite,totalFlow, tmpSiteSpec.wansourcelimit)) else: