Example #1
 def run(self):
     # start
     try:
         byCallback = False
         if self.job == None:
             byCallback = True
             _logger.debug("start: %s" % self.dataset.name)
             _logger.debug("callback from %s" % self.site)
             # FIXME when callback from BNLPANDA disappeared
             if self.site == 'BNLPANDA':
                 self.site = 'BNL-OSG2_ATLASMCDISK'
             # instantiate site mapper
             siteMapper = SiteMapper(self.taskBuffer)
             # get computingSite/destinationSE
             computingSite, destinationSE = self.taskBuffer.getDestSE(
                 self.dataset.name)
             if destinationSE == None:
                 # try to get computingSite/destinationSE from ARCH to delete sub
                 # even if no active jobs left
                 computingSite, destinationSE = self.taskBuffer.getDestSE(
                     self.dataset.name, True)
                 if destinationSE == None:
                     _logger.error("cannot get source/destination for %s" %
                                   self.dataset.name)
                     _logger.debug("end: %s" % self.dataset.name)
                     return
             _logger.debug("src: %s" % computingSite)
             _logger.debug("dst: %s" % destinationSE)
             # get corresponding token
             tmpSrcSiteSpec = siteMapper.getSite(computingSite)
             tmpDstSiteSpec = siteMapper.getSite(destinationSE)
             _logger.debug(tmpDstSiteSpec.setokens)
             destToken = None
             for tmpToken, tmpDdmId in tmpDstSiteSpec.setokens.iteritems():
                 if self.site == tmpDdmId:
                     destToken = tmpToken
                     break
             _logger.debug("use Token=%s" % destToken)
             # get required tokens
             reqTokens = self.taskBuffer.getDestTokens(self.dataset.name)
             if reqTokens == None:
                 _logger.error("cannot get required token for %s" %
                               self.dataset.name)
                 _logger.debug("end: %s" % self.dataset.name)
                 return
             _logger.debug("req Token=%s" % reqTokens)
             # make bitmap for the token
             bitMap = 1
             if len(reqTokens.split(',')) > 1:
                 for tmpReqToken in reqTokens.split(','):
                     if tmpReqToken == destToken:
                         break
                     # shift one bit
                     bitMap <<= 1
             # completed bitmap
             compBitMap = (1 << len(reqTokens.split(','))) - 1
             # ignore the lowest bit for T1, file on DISK is already there
             if tmpSrcSiteSpec.ddm == tmpDstSiteSpec.ddm:
                 compBitMap = compBitMap & 0xFFFE
             # update bitmap in DB
             updatedBitMap = self.taskBuffer.updateTransferStatus(
                 self.dataset.name, bitMap)
             _logger.debug(
                 "transfer status:%s - comp:%s - bit:%s" %
                 (hex(updatedBitMap), hex(compBitMap), hex(bitMap)))
             # update output files
             if (updatedBitMap & compBitMap) == compBitMap:
                 ids = self.taskBuffer.updateOutFilesReturnPandaIDs(
                     self.dataset.name)
                 # set flag for T2 cleanup
                 self.dataset.status = 'cleanup'
                 self.taskBuffer.updateDatasets([self.dataset])
             else:
                 _logger.debug("end: %s" % self.dataset.name)
                 return
         else:
             _logger.debug("start: %s" % self.job.PandaID)
             # update input files
             ids = [self.job.PandaID]
         _logger.debug("IDs: %s" % ids)
         if len(ids) != 0:
             # get job
             if self.job == None:
                 jobs = self.taskBuffer.peekJobs(ids,
                                                 fromDefined=False,
                                                 fromArchived=False,
                                                 fromWaiting=False)
             else:
                 jobs = [self.job]
             # loop over all jobs
             for job in jobs:
                 if job == None:
                     continue
                 _logger.debug("Job: %s" % job.PandaID)
                 if job.jobStatus == 'transferring':
                     jobReady = True
                     failedFiles = []
                     noOutFiles = []
                     # check file status
                     for file in job.Files:
                         if file.type == 'output' or file.type == 'log':
                             if file.status == 'failed':
                                 failedFiles.append(file.lfn)
                             elif file.status == 'nooutput':
                                 noOutFiles.append(file.lfn)
                             elif file.status != 'ready':
                                 _logger.debug(
                                     "Job: %s file:%s %s != ready" %
                                     (job.PandaID, file.lfn, file.status))
                                 jobReady = False
                                 break
                     # finish job
                     if jobReady:
                         if byCallback:
                             _logger.debug("Job: %s all files ready" %
                                           job.PandaID)
                         else:
                             _logger.debug(
                                 "Job: %s all files checked with catalog" %
                                 job.PandaID)
                         # create XML
                         try:
                             import xml.dom.minidom
                             dom = xml.dom.minidom.getDOMImplementation()
                             doc = dom.createDocument(None, 'xml', None)
                             topNode = doc.createElement("POOLFILECATALOG")
                             for file in job.Files:
                                 if file.type in ['output', 'log']:
                                     # skip failed or no-output files
                                     if file.lfn in failedFiles + noOutFiles:
                                         continue
                                     # File
                                     fileNode = doc.createElement("File")
                                     fileNode.setAttribute("ID", file.GUID)
                                     # LFN
                                     logNode = doc.createElement("logical")
                                     lfnNode = doc.createElement("lfn")
                                     lfnNode.setAttribute('name', file.lfn)
                                     # metadata
                                     fsizeNode = doc.createElement(
                                         "metadata")
                                     fsizeNode.setAttribute(
                                         "att_name", "fsize")
                                     fsizeNode.setAttribute(
                                         "att_value", str(file.fsize))
                                     # checksum
                                     if file.checksum.startswith('ad:'):
                                         # adler32
                                         chksumNode = doc.createElement(
                                             "metadata")
                                         chksumNode.setAttribute(
                                             "att_name", "adler32")
                                         chksumNode.setAttribute(
                                             "att_value",
                                             re.sub('^ad:', '',
                                                    file.checksum))
                                     else:
                                         # md5sum
                                         chksumNode = doc.createElement(
                                             "metadata")
                                         chksumNode.setAttribute(
                                             "att_name", "md5sum")
                                         chksumNode.setAttribute(
                                             "att_value",
                                             re.sub('^md5:', '',
                                                    file.checksum))
                                     # append nodes
                                     logNode.appendChild(lfnNode)
                                     fileNode.appendChild(logNode)
                                     fileNode.appendChild(fsizeNode)
                                     fileNode.appendChild(chksumNode)
                                     topNode.appendChild(fileNode)
                             # status in file name
                             if failedFiles == []:
                                 statusFileName = 'finished'
                             else:
                                 statusFileName = 'failed'
                             # write to file
                             xmlFile = '%s/%s_%s_%s' % (
                                 panda_config.logdir, job.PandaID,
                                 statusFileName,
                                 commands.getoutput('uuidgen'))
                             oXML = open(xmlFile, "w")
                             oXML.write(topNode.toxml())
                             oXML.close()
                         except:
                             type, value, traceBack = sys.exc_info()
                             _logger.error("%s : %s %s" %
                                           (job.PandaID, type, value))
                 _logger.debug("Job: %s status: %s" %
                               (job.PandaID, job.jobStatus))
         # end
         if self.job == None:
             _logger.debug("end: %s" % self.dataset.name)
         else:
             _logger.debug("end: %s" % self.job.PandaID)
     except:
         type, value, traceBack = sys.exc_info()
         _logger.error("run() : %s %s" % (type, value))
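A minimal sketch of the token-bitmap bookkeeping performed in the method above, using hypothetical token names and an assumed intermediate DB value; this is an illustration, not code from the example.

reqTokens = 'ATLASDATADISK,ATLASMCDISK,ATLASUSERDISK'  # hypothetical getDestTokens() result
destToken = 'ATLASMCDISK'                               # hypothetical token for this callback

# one bit per required token; the set bit marks the position of destToken
bitMap = 1
for tmpReqToken in reqTokens.split(','):
    if tmpReqToken == destToken:
        break
    bitMap <<= 1                                        # -> 0b010 for the second token

# bitmap with every required token completed
compBitMap = (1 << len(reqTokens.split(','))) - 1       # -> 0b111

# the DB accumulates the bit of each callback; output files are only updated
# once every required bit has been set
updatedBitMap = 0b011                                   # assumed value after two callbacks
allDone = (updatedBitMap & compBitMap) == compBitMap    # False until the third callback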
xml = """<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!-- ATLAS file meta-data catalog -->
<!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">
<POOLFILECATALOG>
"""
try:
    att = sys.argv[2]
except:
    att = job.attemptNr

if job.computingSite in ['',None,'NULL']:
    print 'computingSite is not yet defined'
    sys.exit(0)

siteSpec = siteMapper.getSite(job.computingSite)

for file in job.Files:
    if file.type in ['output','log']:
        file.GUID = commands.getoutput('uuidgen')
        if job.computingSite == file.destinationSE and \
                siteSpec.setokens.has_key(file.destinationDBlockToken):
            tmpSrcDDM = siteSpec.setokens[file.destinationDBlockToken]
        else:
            tmpSrcDDM = siteMapper.getSite(job.computingSite).ddm
        srm = TiersOfATLAS.getSiteProperty(tmpSrcDDM,'srm')
        srm = re.sub('^token:[^:]+:','',srm)
        xml += """
  <File ID="%s">
    <logical>
      <lfn name="%s"/>
Example #3
 throttleForSink = {}
 throttleForSource = {}
 totalFlowFromSource = {}
 # loop over all sinks to get total flows
 tmpLog.debug(" >>> checking limits")
 for sinkSite, sinkMap in wanMX.iteritems():
     totalFlowToSink = 0
     # loop over all sources
     for sourceSite, sourceMap in sinkMap.iteritems():
         # get total flows
         totalFlowToSink += sourceMap['flow']
         if not totalFlowFromSource.has_key(sourceSite):
             totalFlowFromSource[sourceSite] = 0
         totalFlowFromSource[sourceSite] += sourceMap['flow']
     # check limit for sink
     tmpSiteSpec = siteMapper.getSite(sinkSite)
     if siteMapper.checkSite(
             sinkSite
     ) and tmpSiteSpec.wansinklimit * 1024 * 1024 * 1024 > totalFlowToSink:
         throttleForSink[sinkSite] = False
         tmpLog.debug(
             " release Sink {0} : {1}bps (total) < {2}Gbps (limit)".format(
                 sinkSite, totalFlowToSink, tmpSiteSpec.wansinklimit))
     else:
         throttleForSink[sinkSite] = True
         tmpLog.debug(
             " throttle Sink {0} : {1}bps (total) > {2}Gbps (limit)".format(
                 sinkSite, totalFlowToSink, tmpSiteSpec.wansinklimit))
 # check limit for source
 for sourceSite, totalFlow in totalFlowFromSource.iteritems():
     tmpSiteSpec = siteMapper.getSite(sourceSite)
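A minimal sketch (hypothetical numbers) of the limit check used above: the per-site wansinklimit/wansourcelimit values are configured in Gbps while the aggregated flows are in bps, hence the 1024*1024*1024 scaling before the comparison.

wansinklimit = 2                          # hypothetical site limit in Gbps
totalFlowToSink = 1.5 * 1024 ** 3         # hypothetical aggregated flow in bps

if wansinklimit * 1024 * 1024 * 1024 > totalFlowToSink:
    throttle = False                      # release: flow is below the limit
else:
    throttle = True                       # throttle: flow has reached the limit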
Example #4
 varMap[':computingSite']    = computingSite
 varMap[':prodUserName']     = prodUserName
 varMap[':jobDefinitionID']  = jobDefinitionID
 varMap[':modificationTime'] = recentRuntimeLimit
 varMap[':jobStatus1']       = 'starting'
 _logger.debug(" rebro:%s/%s:ID=%s:%s jediTaskID=%s site=%s" % (iComb,nComb,jobDefinitionID,
                                                                prodUserName,jediTaskID,
                                                                computingSite))
 iComb += 1
 hasRecentJobs = False
 # check site
 if not siteMapper.checkSite(computingSite):
     _logger.debug("    -> skip unknown site=%s" % computingSite)
     continue
 # check site status            
 tmpSiteStatus = siteMapper.getSite(computingSite).status
 if not tmpSiteStatus in ['offline','test']:
     # use normal time limit for normal site status
     if maxModificationTime > normalTimeLimit:
         _logger.debug("    -> skip wait for normal timelimit=%s<maxModTime=%s" % (normalTimeLimit,maxModificationTime))
         continue
     for tableName in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']: 
         retU,resU = taskBuffer.querySQLS(sql % tableName, varMap)
         if resU == None:
             # database error
             raise RuntimeError,"failed to check modTime"
         if resU != []:
             # found recent jobs
             hasRecentJobs = True
             _logger.debug("    -> skip %s ran recently at %s" % (resU[0][0],resU[0][1]))
             break
 varMap[':jediTaskID']       = jediTaskID
 varMap[':computingSite']    = computingSite
 varMap[':prodUserName']     = prodUserName
 varMap[':jobDefinitionID']  = jobDefinitionID
 varMap[':modificationTime'] = recentRuntimeLimit
 _logger.debug(" rebro:%s/%s:ID=%s:%s jediTaskID=%s site=%s" % (iComb,nComb,jobDefinitionID,
                                                                prodUserName,jediTaskID,
                                                                computingSite))
 iComb += 1
 hasRecentJobs = False
 # check site
 if not siteMapper.checkSite(computingSite):
     _logger.debug("    -> skip unknown site=%s" % computingSite)
     continue
 # check site status            
 tmpSiteStatus = siteMapper.getSite(computingSite).status
 if not tmpSiteStatus in ['offline','test']:
     # use normal time limit for normal site status
     if maxModificationTime > normalTimeLimit:
         _logger.debug("    -> skip wait for normal timelimit=%s<maxModTime=%s" % (normalTimeLimit,maxModificationTime))
         continue
     for tableName in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']: 
         retU,resU = taskBuffer.querySQLS(sql % tableName, varMap)
         if resU == None:
             # database error
             raise RuntimeError,"failed to check modTime"
         if resU != []:
             # found recent jobs
             hasRecentJobs = True
             _logger.debug("    -> skip %s ran recently at %s" % (resU[0][0],resU[0][1]))
             break
 def run(self):
     try:
         # get job
         tmpJobs = self.taskBuffer.getFullJobStatus([self.rPandaID])
         if tmpJobs == [] or tmpJobs[0] == None:
             _logger.debug("cannot find job for PandaID=%s" % self.rPandaID)
             return
         self.job = tmpJobs[0]
         _logger.debug("%s start %s:%s:%s" % (self.token,self.job.jobDefinitionID,self.job.prodUserName,self.job.computingSite))
         # using output container
         if not self.job.destinationDBlock.endswith('/'):
             _logger.debug("%s output dataset container is required" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # FIXME: don't touch group jobs for now
         if self.job.destinationDBlock.startswith('group') and (not self.userRequest):
             _logger.debug("%s skip group jobs" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check processingType
         typesForRebro = ['pathena','prun','ganga','ganga-rbtest']
         if not self.job.processingType in typesForRebro:
             _logger.debug("%s skip processingType=%s not in %s" % \
                           (self.token,self.job.processingType,str(typesForRebro)))
             _logger.debug("%s end" % self.token)
             return
         # check jobsetID
         if self.job.jobsetID in [0,'NULL',None]:
             _logger.debug("%s jobsetID is undefined" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check metadata 
         if self.job.metadata in [None,'NULL']:
             _logger.debug("%s metadata is unavailable" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check --disableRebrokerage
         match = re.search("--disableRebrokerage",self.job.metadata)
         if match != None and (not self.simulation) and (not self.forceOpt) \
                and (not self.userRequest):
             _logger.debug("%s disabled rebrokerage" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check --site
         match = re.search("--site",self.job.metadata)
         if match != None and (not self.simulation) and (not self.forceOpt) \
                and (not self.userRequest):
             _logger.debug("%s --site is used" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check --libDS
         match = re.search("--libDS",self.job.metadata)
         if match != None:
             _logger.debug("%s --libDS is used" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check --workingGroup since it is site-specific 
         match = re.search("--workingGroup",self.job.metadata)
         if match != None:
             _logger.debug("%s workingGroup is specified" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # avoid too many rebrokerage
         if not self.checkRev():
             _logger.debug("%s avoid too many rebrokerage" % self.token)
             _logger.debug("%s end" % self.token)
             return
         # check if multiple JobIDs use the same libDS
         if self.bPandaID != None and self.buildStatus not in ['finished','failed']:
             if self.minPandaIDlibDS == None or self.maxPandaIDlibDS == None:
                 _logger.debug("%s max/min PandaIDs are unavailable for the libDS" % self.token)
                 _logger.debug("%s end" % self.token)
                 return
             tmpPandaIDsForLibDS = self.taskBuffer.getFullJobStatus([self.minPandaIDlibDS,self.maxPandaIDlibDS])
             if len(tmpPandaIDsForLibDS) != 2 or tmpPandaIDsForLibDS[0] == None or tmpPandaIDsForLibDS[1] == None:
                 _logger.debug("%s failed to get max/min PandaIDs for the libDS" % self.token)
                 _logger.debug("%s end" % self.token)
                 return
             # check
             if tmpPandaIDsForLibDS[0].jobDefinitionID != tmpPandaIDsForLibDS[1].jobDefinitionID:
                 _logger.debug("%s multiple JobIDs use the libDS %s:%s %s:%s" % (self.token,tmpPandaIDsForLibDS[0].jobDefinitionID,
                                                                                 self.minPandaIDlibDS,tmpPandaIDsForLibDS[1].jobDefinitionID,
                                                                                 self.maxPandaIDlibDS))
                 _logger.debug("%s end" % self.token)
                 return
         # check excludedSite
         if self.excludedSite == None:
             self.excludedSite = []
             match = re.search("--excludedSite( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata)
             if match != None:
                 self.excludedSite = match.group(3).split(',')
         # remove empty
         try:
             self.excludedSite.remove('')
         except:
             pass
         _logger.debug("%s excludedSite=%s" % (self.token,str(self.excludedSite)))
         # check cloud
         if self.cloud == None:
             match = re.search("--cloud( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata)
             if match != None:
                 self.cloud = match.group(3)
         _logger.debug("%s cloud=%s" % (self.token,self.cloud))
         # get inDS/LFNs
         status,tmpMapInDS,maxFileSize = self.taskBuffer.getInDatasetsForReBrokerage(self.jobID,self.userName)
         if not status:
             # failed
             _logger.error("%s failed to get inDS/LFN from DB" % self.token)
             return
         status,inputDS = self.getListDatasetsUsedByJob(tmpMapInDS)
         if not status:
             # failed
             _logger.error("%s failed" % self.token)
             return 
         # get replicas
         replicaMap = {}
         unknownSites = {} 
         for tmpDS in inputDS:
             if tmpDS.endswith('/'):
                 # container
                 status,tmpRepMaps = self.getListDatasetReplicasInContainer(tmpDS)
             else:
                 # normal dataset
                 status,tmpRepMap = self.getListDatasetReplicas(tmpDS)
                 tmpRepMaps = {tmpDS:tmpRepMap}
             if not status:
                 # failed
                 _logger.debug("%s failed" % self.token)
                 return 
             # make map per site
             for tmpDS,tmpRepMap in tmpRepMaps.iteritems():
                 for tmpSite,tmpStat in tmpRepMap.iteritems():
                     # ignore special sites
                     if tmpSite in ['CERN-PROD_TZERO','CERN-PROD_DAQ','CERN-PROD_TMPDISK']:
                         continue
                     # ignore tape sites
                     if tmpSite.endswith('TAPE'):
                         continue
                     # keep sites with unknown replica info 
                     if tmpStat[-1]['found'] == None:
                         if not unknownSites.has_key(tmpDS):
                             unknownSites[tmpDS] = []
                         unknownSites[tmpDS].append(tmpSite)
                     # ignore ToBeDeleted
                     if tmpStat[-1]['archived'] in ['ToBeDeleted',]:
                         continue
                     # change EOS
                     if tmpSite.startswith('CERN-PROD_EOS'):
                         tmpSite = 'CERN-PROD_EOS'
                     # change EOS TMP
                     if tmpSite.startswith('CERN-PROD_TMP'):
                         tmpSite = 'CERN-PROD_TMP'
                     # change DISK to SCRATCHDISK
                     tmpSite = re.sub('_[^_-]+DISK$','',tmpSite)
                     # change PERF-XYZ to SCRATCHDISK
                     tmpSite = re.sub('_PERF-[^_-]+$','',tmpSite)
                     # change PHYS-XYZ to SCRATCHDISK
                     tmpSite = re.sub('_PHYS-[^_-]+$','',tmpSite)
                     # patch for BNLPANDA
                     if tmpSite in ['BNLPANDA']:
                         tmpSite = 'BNL-OSG2'
                     # add to map    
                     if not replicaMap.has_key(tmpSite):
                         replicaMap[tmpSite] = {}
                     replicaMap[tmpSite][tmpDS] = tmpStat[-1]
         _logger.debug("%s replica map -> %s" % (self.token,str(replicaMap)))
         # refresh replica info if needed
         self.refreshReplicaInfo(unknownSites)
         # instantiate SiteMapper
         siteMapper = SiteMapper(self.taskBuffer)
         # get original DDM
         origSiteDDM = self.getAggName(siteMapper.getSite(self.job.computingSite).ddm)
         # check all datasets
         maxDQ2Sites = []
         if inputDS != []:
             # loop over all sites
             for tmpSite,tmpDsVal in replicaMap.iteritems():
                 # loop over all datasets
                 appendFlag = True
                 for tmpOrigDS in inputDS:
                     # check completeness
                     if tmpDsVal.has_key(tmpOrigDS) and tmpDsVal[tmpOrigDS]['found'] != None and \
                            tmpDsVal[tmpOrigDS]['total'] == tmpDsVal[tmpOrigDS]['found']:
                         pass
                     else:
                         appendFlag = False
                 # append
                 if appendFlag:
                     if not tmpSite in maxDQ2Sites:
                         maxDQ2Sites.append(tmpSite)
         _logger.debug("%s candidate DQ2s -> %s" % (self.token,str(maxDQ2Sites)))
         if inputDS != [] and maxDQ2Sites == []:
             _logger.debug("%s no DQ2 candidate" % self.token)
         else:
             maxPandaSites = []
             # original maxinputsize
             origMaxInputSize = siteMapper.getSite(self.job.computingSite).maxinputsize
             # look for Panda siteIDs
             for tmpSiteID,tmpSiteSpec in siteMapper.siteSpecList.iteritems():
                 # use ANALY_ only
                 if not tmpSiteID.startswith('ANALY_'):
                     continue
                 # remove test and local
                 if re.search('_test',tmpSiteID,re.I) != None:
                     continue
                 if re.search('_local',tmpSiteID,re.I) != None:
                     continue
                 # avoid same site
                 if self.avoidSameSite and self.getAggName(tmpSiteSpec.ddm) == origSiteDDM:
                     continue
                 # check DQ2 ID
                 if self.cloud in [None,tmpSiteSpec.cloud] \
                        and (self.getAggName(tmpSiteSpec.ddm) in maxDQ2Sites or inputDS == []):
                     # excluded sites
                     excludedFlag = False
                     for tmpExcSite in self.excludedSite:
                         if re.search(tmpExcSite,tmpSiteID) != None:
                             excludedFlag = True
                             break
                     if excludedFlag:
                         _logger.debug("%s skip %s since excluded" % (self.token,tmpSiteID))
                         continue
                     # use online only
                     if tmpSiteSpec.status != 'online':
                         _logger.debug("%s skip %s status=%s" % (self.token,tmpSiteID,tmpSiteSpec.status))
                         continue
                     # check maxinputsize
                     if (maxFileSize == None and origMaxInputSize > siteMapper.getSite(tmpSiteID).maxinputsize) or \
                            maxFileSize > siteMapper.getSite(tmpSiteID).maxinputsize:
                         _logger.debug("%s skip %s due to maxinputsize" % (self.token,tmpSiteID))
                         continue
                     # append
                     if not tmpSiteID in maxPandaSites:
                         maxPandaSites.append(tmpSiteID)
             # choose at most 20 sites randomly to avoid too many lookups
             random.shuffle(maxPandaSites)
             maxPandaSites = maxPandaSites[:20]
             _logger.debug("%s candidate PandaSites -> %s" % (self.token,str(maxPandaSites)))
             # no Panda siteIDs            
             if maxPandaSites == []:            
                 _logger.debug("%s no Panda site candidate" % self.token)
             else:
                 # set AtlasRelease and cmtConfig to dummy job
                 tmpJobForBrokerage = JobSpec()
                 if self.job.AtlasRelease in ['NULL',None]:
                     tmpJobForBrokerage.AtlasRelease = ''
                 else:
                     tmpJobForBrokerage.AtlasRelease = self.job.AtlasRelease
                 # use nightlies
                 matchNight = re.search('^AnalysisTransforms-.*_(rel_\d+)$',self.job.homepackage)
                 if matchNight != None:
                     tmpJobForBrokerage.AtlasRelease += ':%s' % matchNight.group(1)
                 # use cache
                 else:
                     matchCache = re.search('^AnalysisTransforms-([^/]+)',self.job.homepackage)
                     if matchCache != None:
                         tmpJobForBrokerage.AtlasRelease = matchCache.group(1).replace('_','-')
                 if not self.job.cmtConfig in ['NULL',None]:    
                     tmpJobForBrokerage.cmtConfig = self.job.cmtConfig
                 # memory size
                 if not self.job.minRamCount in ['NULL',None,0]:
                     tmpJobForBrokerage.minRamCount = self.job.minRamCount
                 # CPU count
                 if not self.job.maxCpuCount in ['NULL',None,0]:
                     tmpJobForBrokerage.maxCpuCount = self.job.maxCpuCount
                 # run brokerage
                 brokerage.broker.schedule([tmpJobForBrokerage],self.taskBuffer,siteMapper,forAnalysis=True,
                                           setScanSiteList=maxPandaSites,trustIS=True,reportLog=True)
                 newSiteID = tmpJobForBrokerage.computingSite
                 self.brokerageInfo += tmpJobForBrokerage.brokerageErrorDiag
                 _logger.debug("%s runBrokerage -> %s" % (self.token,newSiteID))
                 # unknown site
                 if not siteMapper.checkSite(newSiteID):
                     _logger.error("%s unknown site" % self.token)
                     _logger.debug("%s failed" % self.token)
                     return 
                 # get new site spec
                 newSiteSpec = siteMapper.getSite(newSiteID)
                 # avoid repetition
                 if self.getAggName(newSiteSpec.ddm) == origSiteDDM:
                     _logger.debug("%s assigned to the same site %s " % (self.token,newSiteID))
                     _logger.debug("%s end" % self.token)                        
                     return
                 # simulation mode
                 if self.simulation:
                     _logger.debug("%s end simulation" % self.token)                        
                     return
                 # prepare jobs
                 status = self.prepareJob(newSiteID,newSiteSpec)
                 if status:
                     # run SetUpper
                     statusSetUp = self.runSetUpper()
                     if not statusSetUp:
                         _logger.debug("%s runSetUpper failed" % self.token)
                     else:
                         _logger.debug("%s successfully assigned to %s" % (self.token,newSiteID))
         _logger.debug("%s end" % self.token)
     except:
         errType,errValue,errTraceBack = sys.exc_info()
         _logger.error("%s run() : %s %s" % (self.token,errType,errValue))
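A minimal illustration (with a hypothetical metadata string, not taken from the example) of how the method above extracts user options such as --excludedSite and --cloud from job.metadata; the regular expressions are copied from the snippet.

import re

metadata = "--inDS user.someone.dataset/ --excludedSite=SITE_A,SITE_B --cloud DE"  # hypothetical

excludedSite = []
match = re.search("--excludedSite( +|=)\s*(\'|\")*([^ \"\';$]+)", metadata)
if match is not None:
    excludedSite = match.group(3).split(',')   # ['SITE_A', 'SITE_B']

cloud = None
match = re.search("--cloud( +|=)\s*(\'|\")*([^ \"\';$]+)", metadata)
if match is not None:
    cloud = match.group(3)                     # 'DE'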
	throttleForSink     = {}
	throttleForSource   = {}
	totalFlowFromSource = {}
	# loop over all sinks to get total flows
	tmpLog.debug(" >>> checking limits")
	for sinkSite,sinkMap in wanMX.iteritems():
		totalFlowToSink = 0 
		# loop over all sources
		for sourceSite,sourceMap in sinkMap.iteritems():
			# get total flows
			totalFlowToSink += sourceMap['flow']
			if not totalFlowFromSource.has_key(sourceSite):
				totalFlowFromSource[sourceSite] = 0
			totalFlowFromSource[sourceSite] += sourceMap['flow']
		# check limit for sink
		tmpSiteSpec = siteMapper.getSite(sinkSite)
		if siteMapper.checkSite(sinkSite) and tmpSiteSpec.wansinklimit*1024*1024*1024 > totalFlowToSink:
			throttleForSink[sinkSite] = False
			tmpLog.debug(" release Sink {0} : {1}bps (total) < {2}Gbps (limit)".format(sinkSite,totalFlowToSink,
												    tmpSiteSpec.wansinklimit))
		else:
			throttleForSink[sinkSite] = True
			tmpLog.debug(" throttle Sink {0} : {1}bps (total) > {2}Gbps (limit)".format(sinkSite,totalFlowToSink,
												     tmpSiteSpec.wansinklimit))
	# check limit for source
	for sourceSite,totalFlow in totalFlowFromSource.iteritems():
		tmpSiteSpec = siteMapper.getSite(sourceSite)
		if siteMapper.checkSite(sourceSite) and tmpSiteSpec.wansourcelimit*1024*1024*1024 > totalFlow:
			throttleForSource[sourceSite] = False
			tmpLog.debug(" release Src {0} : {1}bps (total) < {2}Gbps (limit)".format(sourceSite,totalFlow,
												   tmpSiteSpec.wansourcelimit))
Example #8
 def run(self):
     # start
     try:
         byCallback = False
         if self.job == None:
             byCallback = True
             _logger.debug("start: %s" % self.dataset.name)
             _logger.debug("callback from %s" % self.site)
             # FIXME when callback from BNLPANDA disappeared
             if self.site == 'BNLPANDA':
                 self.site = 'BNL-OSG2_ATLASMCDISK'
             # instantiate site mapper
             siteMapper = SiteMapper(self.taskBuffer)
             # get computingSite/destinationSE
             computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name)
             if destinationSE == None:
                 # try to get computingSite/destinationSE from ARCH to delete sub
                 # even if no active jobs left 
                 computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name,True)
                 if destinationSE == None:
                     _logger.error("cannot get source/destination for %s" % self.dataset.name)
                     _logger.debug("end: %s" % self.dataset.name)                
                     return
             _logger.debug("src: %s" % computingSite)
             _logger.debug("dst: %s" % destinationSE)
             # get corresponding token
             tmpSrcSiteSpec = siteMapper.getSite(computingSite)
             tmpDstSiteSpec = siteMapper.getSite(destinationSE)
             _logger.debug(tmpDstSiteSpec.setokens_output)
             destToken = None
             for tmpToken,tmpDdmId in tmpDstSiteSpec.setokens_output.iteritems():
                 if self.site == tmpDdmId:
                     destToken = tmpToken
                     break
             _logger.debug("use Token=%s" % destToken)
             # get required tokens
             reqTokens = self.taskBuffer.getDestTokens(self.dataset.name)
             if reqTokens == None:
                 _logger.error("cannot get required token for %s" % self.dataset.name)
                 _logger.debug("end: %s" % self.dataset.name)                
                 return
             _logger.debug("req Token=%s" % reqTokens)
             # make bitmap for the token
             bitMap = 1
             if len(reqTokens.split(','))>1:
                 for tmpReqToken in reqTokens.split(','):
                     if tmpReqToken == destToken:
                         break
                     # shift one bit
                     bitMap <<= 1
             # completed bitmap
             compBitMap = (1 << len(reqTokens.split(',')))-1
             # ignore the lowest bit for T1, file on DISK is already there
             if tmpSrcSiteSpec.ddm_output == tmpDstSiteSpec.ddm_output:
                 compBitMap = compBitMap & 0xFFFE
             # update bitmap in DB
             updatedBitMap = self.taskBuffer.updateTransferStatus(self.dataset.name,bitMap)
             _logger.debug("transfer status:%s - comp:%s - bit:%s" % (hex(updatedBitMap),hex(compBitMap),hex(bitMap)))
             # update output files
             if (updatedBitMap & compBitMap) == compBitMap:
                 ids = self.taskBuffer.updateOutFilesReturnPandaIDs(self.dataset.name)
                 # set flag for T2 cleanup
                 self.dataset.status = 'cleanup'
                 self.taskBuffer.updateDatasets([self.dataset])
             else:
                 _logger.debug("end: %s" % self.dataset.name)
                 return
         else:
             _logger.debug("start: %s" % self.job.PandaID)
             # update input files
             ids = [self.job.PandaID]
         _logger.debug("IDs: %s" % ids)
         if len(ids) != 0:
             # get job
             if self.job == None:
                 jobs = self.taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
             else:
                 jobs = [self.job]
             # loop over all jobs
             for job in jobs:
                 if job == None:
                     continue
                 _logger.debug("Job: %s" % job.PandaID)
                 if job.jobStatus == 'transferring':
                     jobReady = True
                     failedFiles = []
                     noOutFiles = []
                     # check file status
                     for file in job.Files:
                         if file.type == 'output' or file.type == 'log':
                             if file.status == 'failed':
                                 failedFiles.append(file.lfn)
                             elif file.status == 'nooutput':
                                 noOutFiles.append(file.lfn)
                             elif file.status != 'ready':
                                 _logger.debug("Job: %s file:%s %s != ready" % (job.PandaID,file.lfn,file.status))
                                 jobReady = False
                                 break
                     # finish job
                     if jobReady:
                         if byCallback:
                             _logger.debug("Job: %s all files ready" % job.PandaID)
                         else:
                             _logger.debug("Job: %s all files checked with catalog" % job.PandaID)
                         # create XML
                         try:
                             import xml.dom.minidom
                             dom = xml.dom.minidom.getDOMImplementation()
                             doc = dom.createDocument(None,'xml',None)
                             topNode = doc.createElement("POOLFILECATALOG")
                             for file in job.Files:
                                 if file.type in ['output','log']:
                                     # skip failed or no-output files
                                     if file.lfn in failedFiles+noOutFiles:
                                         continue
                                     # File
                                     fileNode = doc.createElement("File")
                                     fileNode.setAttribute("ID",file.GUID)
                                     # LFN
                                     logNode = doc.createElement("logical")
                                     lfnNode = doc.createElement("lfn")
                                     lfnNode.setAttribute('name',file.lfn)
                                     # metadata
                                     fsizeNode    = doc.createElement("metadata")
                                     fsizeNode.setAttribute("att_name","fsize")
                                     fsizeNode.setAttribute("att_value",str(file.fsize))
                                     # checksum
                                     if file.checksum.startswith('ad:'):
                                         # adler32
                                         chksumNode    = doc.createElement("metadata")
                                         chksumNode.setAttribute("att_name","adler32")
                                         chksumNode.setAttribute("att_value",re.sub('^ad:','',file.checksum))
                                     else:
                                         # md5sum
                                         chksumNode    = doc.createElement("metadata")
                                         chksumNode.setAttribute("att_name","md5sum")
                                         chksumNode.setAttribute("att_value",re.sub('^md5:','',file.checksum))
                                     # append nodes
                                     logNode.appendChild(lfnNode)
                                     fileNode.appendChild(logNode)
                                     fileNode.appendChild(fsizeNode)
                                     fileNode.appendChild(chksumNode)
                                     topNode.appendChild(fileNode)
                             # status in file name
                             if failedFiles == []:
                                 statusFileName = 'finished'
                             else:
                                 statusFileName = 'failed'
                             # write to file
                             xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,job.PandaID,statusFileName,commands.getoutput('uuidgen'))
                             oXML = open(xmlFile,"w")
                             oXML.write(topNode.toxml())
                             oXML.close()
                         except:
                             type, value, traceBack = sys.exc_info()
                             _logger.error("Job: %s %s %s" % (job.PandaID,type,value))
                 _logger.debug("Job: %s status: %s" % (job.PandaID,job.jobStatus))                
         # end
         if self.job == None:        
             _logger.debug("end: %s" % self.dataset.name)
         else:
             _logger.debug("end: %s" % self.job.PandaID)
     except:
         type, value, traceBack = sys.exc_info()
         _logger.error("run() : %s %s" % (type,value))
Example #9
xml = """<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!-- ATLAS file meta-data catalog -->
<!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">
<POOLFILECATALOG>
"""
try:
    att = sys.argv[2]
except:
    att = job.attemptNr

if job.computingSite in ['', None, 'NULL']:
    print 'computingSite is not yet defined'
    sys.exit(0)

siteSpec = siteMapper.getSite(job.computingSite)

for file in job.Files:
    if file.type in ['output', 'log']:
        file.GUID = commands.getoutput('uuidgen')
        if job.computingSite == file.destinationSE and \
                siteSpec.setokens_output.has_key(file.destinationDBlockToken):
            tmpSrcDDM = siteSpec.setokens_output[file.destinationDBlockToken]
        else:
            tmpSrcDDM = siteMapper.getSite(job.computingSite).ddm_output
        srm = TiersOfATLAS.getSiteProperty(tmpSrcDDM, 'srm')
        srm = re.sub('^token:[^:]+:', '', srm)
        xml += """
  <File ID="%s">
    <logical>
      <lfn name="%s"/>
Example #10
         if startTime < timeLimit:
             siteJobsMap[siteName]['running'].append(pandaID)
 # sql to get number of high priority jobs
 sqlHiJobs = "SELECT count(*) FROM {0}.jobsActive4 ".format(
     panda_config.schemaPANDA)
 sqlHiJobs += "WHERE prodSourceLabel=:label AND jobStatus IN (:jobStat1,:jobStat2) "
 sqlHiJobs += "AND currentPriority>=:prio AND computingSite=:site AND eventService IS NULL "
 sqlHiJobs += "AND startTime<:timeLimit "
 # sql to kill job
 sqlKill = "UPDATE {0}.jobsActive4 ".format(panda_config.schemaPANDA)
 sqlKill += "SET commandToPilot=:com,supErrorCode=:code,supErrorDiag=:diag "
 sqlKill += "WHERE PandaID=:pandaID AND jobStatus=:jobStatus "
 # check all sites
 for siteName, jobsMap in siteJobsMap.iteritems():
     # check jobseed
     siteSpec = siteMapper.getSite(siteName)
     # skip ES-only sites
     if siteSpec.getJobSeed() == 'es':
         continue
     # get number of high priority jobs
     varMap = {}
     varMap[':label'] = 'managed'
     varMap[':jobStat1'] = 'activated'
     varMap[':jobStat2'] = 'starting'
     varMap[':prio'] = 800
     varMap[':site'] = siteName  # bind the site used by computingSite=:site in sqlHiJobs
     varMap[':timeLimit'] = timeLimit
     status, res = taskBuffer.querySQLS(sqlHiJobs, varMap)
     if res != None:
         nJobs = res[0][0]
         nJobsToKill = nJobs - len(siteJobsMap[siteName]['killing'])
         tmpLog.debug(
Example #11
class Closer:
    # constructor
    def __init__(self,taskBuffer,destinationDBlocks,job,pandaDDM=False,datasetMap={}):
        self.taskBuffer = taskBuffer
        self.destinationDBlocks = destinationDBlocks
        self.job = job
        self.pandaID = job.PandaID
        self.pandaDDM = pandaDDM
        self.siteMapper = None
        self.datasetMap = datasetMap
        
    # to keep backward compatibility
    def start(self):
        self.run()
    def join(self):
        pass

    # main
    def run(self):
        try:
            _logger.debug('%s Start %s' % (self.pandaID,self.job.jobStatus))
            flagComplete    = True
            ddmJobs         = []
            topUserDsList   = []
            usingMerger     = False        
            disableNotifier = False
            firstIndvDS     = True
            finalStatusDS   = []
            for destinationDBlock in self.destinationDBlocks:
                dsList = []
                _logger.debug('%s start %s' % (self.pandaID,destinationDBlock))
                # ignore tid datasets
                if re.search('_tid[\d_]+$',destinationDBlock):
                    _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock))                
                    continue
                # ignore HC datasets
                if re.search('^hc_test\.',destinationDBlock) != None or re.search('^user\.gangarbt\.',destinationDBlock) != None:
                    if re.search('_sub\d+$',destinationDBlock) == None and re.search('\.lib$',destinationDBlock) == None:
                        _logger.debug('%s skip HC %s' % (self.pandaID,destinationDBlock))                
                        continue
                # query dataset
                if self.datasetMap.has_key(destinationDBlock):
                    dataset = self.datasetMap[destinationDBlock]
                else:
                    dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock})
                if dataset == None:
                    _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock))
                    flagComplete = False
                    continue
                # skip tobedeleted/tobeclosed 
                if dataset.status in ['cleanup','tobeclosed','completed']:
                    _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status))
                    continue
                dsList.append(dataset)
                # sort
                dsList.sort()
                # count number of completed files
                notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock,
                                                               'status':'unknown'})
                if notFinish < 0:
                    _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish))
                    flagComplete = False                
                    continue
                # check if completed
                _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish))
                if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']:
                    # close non-DQ2 destinationDBlock immediately
                    finalStatus = 'closed'
                elif self.job.lockedby == 'jedi' and self.isTopLevelDS(destinationDBlock):
                    # set it closed in order not to trigger DDM cleanup. It will be closed by JEDI
                    finalStatus = 'closed'
                elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \
                         and self.job.processingType != 'usermerge':
                    # merge output files
                    if firstIndvDS:
                        # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS
                        finalStatus = 'tobemerged'
                        firstIndvDS = False
                    else:
                        finalStatus = 'tobeclosed'
                    # set merging to top dataset
                    usingMerger = True
                    # disable Notifier
                    disableNotifier = True
                elif self.job.produceUnMerge():
                    finalStatus = 'doing'
                else:
                    # set status to 'tobeclosed' to trigger DQ2 closing
                    finalStatus = 'tobeclosed'
                if notFinish==0: 
                    _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock))
                    # set status
                    dataset.status = finalStatus
                    # update dataset in DB
                    retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                          criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                    if len(retT) > 0 and retT[0]==1:
                        finalStatusDS += dsList
                        # close user datasets
                        if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \
                               and (dataset.name.startswith('user') or dataset.name.startswith('group')):
                            # get top-level user dataset 
                            topUserDsName = re.sub('_sub\d+$','',dataset.name)
                            # update if it is the first attempt
                            if topUserDsName != dataset.name and not topUserDsName in topUserDsList and self.job.lockedby != 'jedi':
                                topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName})
                                if topUserDs != None:
                                    # check status
                                    if topUserDs.status in ['completed','cleanup','tobeclosed',
                                                            'tobemerged','merging']:
                                        _logger.debug('%s skip %s due to status=%s' % (self.pandaID,topUserDsName,topUserDs.status))
                                    else:
                                        # set status
                                        if self.job.processingType.startswith('gangarobot') or \
                                               self.job.processingType.startswith('hammercloud'):
                                            # not trigger freezing for HC datasets so that files can be appended
                                            topUserDs.status = 'completed'
                                        elif not usingMerger:
                                            topUserDs.status = finalStatus
                                        else:
                                            topUserDs.status = 'merging'
                                        # append to avoid repetition
                                        topUserDsList.append(topUserDsName)
                                        # update DB
                                        retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus",
                                                                                 criteriaMap={':crStatus':topUserDs.status})
                                        if len(retTopT) > 0 and retTopT[0]==1:
                                            _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName))
                                        else:
                                            _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName))
                            # get parent dataset for merge job
                            if self.job.processingType == 'usermerge':
                                tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters)
                                if tmpMatch == None:
                                    _logger.error('%s failed to extract parentDS' % self.pandaID)
                                else:
                                    unmergedDsName = tmpMatch.group(1)
                                    # update if it is the first attempt
                                    if not unmergedDsName in topUserDsList:
                                        unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName})
                                        if unmergedDs == None:
                                            _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName))
                                        else:
                                            # check status
                                            if unmergedDs.status in ['completed','cleanup','tobeclosed']:
                                                _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status))
                                            else:
                                                # set status
                                                unmergedDs.status = finalStatus
                                                # append to avoid repetition
                                                topUserDsList.append(unmergedDsName)
                                                # update DB
                                                retTopT = self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus",
                                                                                         criteriaMap={':crStatus':unmergedDs.status})
                                                if len(retTopT) > 0 and retTopT[0]==1:
                                                    _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName))
                                                else:
                                                    _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName))
                        if self.pandaDDM and self.job.prodSourceLabel=='managed':
                            # instantiate SiteMapper
                            if self.siteMapper == None:
                                self.siteMapper = SiteMapper(self.taskBuffer)
                            # get file list for PandaDDM
                            retList = self.taskBuffer.queryFilesWithMap({'destinationDBlock':destinationDBlock})
                            lfnsStr = ''
                            guidStr = ''
                            for tmpFile in retList:
                                if tmpFile.type in ['log','output']:
                                    lfnsStr += '%s,' % tmpFile.lfn
                                    guidStr += '%s,' % tmpFile.GUID
                            if lfnsStr != '':
                                guidStr = guidStr[:-1]
                                lfnsStr = lfnsStr[:-1]
                                # create a DDM job
                                ddmjob = JobSpec()
                                ddmjob.jobDefinitionID   = int(time.time()) % 10000
                                ddmjob.jobName           = "%s" % commands.getoutput('uuidgen')
                                ddmjob.transformation    = 'http://pandaserver.cern.ch:25080/trf/mover/run_dq2_cr'
                                ddmjob.destinationDBlock = 'testpanda.%s' % ddmjob.jobName
                                ddmjob.computingSite     = "BNL_ATLAS_DDM"
                                ddmjob.destinationSE     = ddmjob.computingSite
                                ddmjob.currentPriority   = 200000
                                ddmjob.prodSourceLabel   = 'ddm'
                                ddmjob.transferType      = 'sub'
                                # append log file
                                fileOL = FileSpec()
                                fileOL.lfn = "%s.job.log.tgz" % ddmjob.jobName
                                fileOL.destinationDBlock = ddmjob.destinationDBlock
                                fileOL.destinationSE     = ddmjob.destinationSE
                                fileOL.dataset           = ddmjob.destinationDBlock
                                fileOL.type = 'log'
                                ddmjob.addFile(fileOL)
                                # make arguments
                                dstDQ2ID = 'BNLPANDA'
                                srcDQ2ID = self.siteMapper.getSite(self.job.computingSite).ddm
                                callBackURL = 'https://%s:%s/server/panda/datasetCompleted?vuid=%s&site=%s' % \
                                              (panda_config.pserverhost,panda_config.pserverport,
                                               dataset.vuid,dstDQ2ID)
                                _logger.debug(callBackURL)
                                # set src/dest
                                ddmjob.sourceSite      = srcDQ2ID
                                ddmjob.destinationSite = dstDQ2ID
                                # if src==dst, send callback without ddm job
                                if dstDQ2ID == srcDQ2ID:
                                    comout = commands.getoutput('curl -k %s' % callBackURL)
                                    _logger.debug(comout)
                                else:
                                    # run dq2_cr
                                    callBackURL = urllib.quote(callBackURL)
                                    # get destination dir
                                    destDir = brokerage.broker_util._getDefaultStorage(self.siteMapper.getSite(self.job.computingSite).dq2url)
                                    argStr = "-s %s -r %s --guids %s --lfns %s --callBack %s -d %s/%s %s" % \
                                             (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,callBackURL,destDir,
                                              destinationDBlock,destinationDBlock)
                                    # set job parameters
                                    ddmjob.jobParameters = argStr
                                    _logger.debug('%s pdq2_cr %s' % (self.pandaID,ddmjob.jobParameters))
                                    ddmJobs.append(ddmjob)
                        # start Activator
                        if re.search('_sub\d+$',dataset.name) == None:
                            if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']:
                                # don't trigger Activator for merge jobs
                                pass
                            else:
                                if self.job.jobStatus == 'finished':
                                    aThr = Activator(self.taskBuffer,dataset)
                                    aThr.start()
                                    aThr.join()
                    else:
                        # another thread already updated the dataset, so keep flagComplete unchanged
                        pass
                else:
                    # update dataset in DB
                    self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                   criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                    # unset flag
                    flagComplete = False
                # end
                _logger.debug('%s end %s' % (self.pandaID,destinationDBlock))
            # start DDM jobs
            if ddmJobs != []:
                self.taskBuffer.storeJobs(ddmJobs,self.job.prodUserID,joinThr=True)
            # change pending jobs to failed
            finalizedFlag = True
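            # finalizedFlag stays True unless finalizePendingJobs below reports a failure for user jobs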
            if flagComplete and self.job.prodSourceLabel=='user':
                _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID))
                finalizedFlag = self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID,waitLock=True)
                _logger.debug('%s finalized with %s' % (self.pandaID,finalizedFlag))
            # update unmerged datasets in JEDI to trigger merging
            if flagComplete and self.job.produceUnMerge() and finalStatusDS != []:
                if finalizedFlag:
                    self.taskBuffer.updateUnmergedDatasets(self.job,finalStatusDS)
            # start notifier
            _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete))
            if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \
               (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')) and \
               self.job.lockedby != 'jedi':
                # don't send email for merge jobs
                if (not disableNotifier) and self.job.processingType not in ['merge','unmerge']:
                    useNotifier = True
                    summaryInfo = {}
                    # check all jobDefIDs in jobsetID
                    if self.job.jobsetID not in [0,None,'NULL']:
                        useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID,
                                                                                                self.job.prodUserName)
                        _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier))
                    if useNotifier:
                        _logger.debug('%s start Notifier' % self.pandaID)
                        nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo)
                        nThr.run()
                        _logger.debug('%s end Notifier' % self.pandaID)
            _logger.debug('%s End' % self.pandaID)
        except:
            errType,errValue = sys.exc_info()[:2]
            _logger.error("%s %s" % (errType,errValue))
            


    # check if top dataset
    def isTopLevelDS(self,datasetName):
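        # strip a trailing _subNNNN suffix; if nothing changes, the name is already a top-level dataset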
        topDS = re.sub('_sub\d+$','',datasetName)
        if topDS == datasetName:
            return True
        return False
# exec
status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10000)
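# res is None when the query failed; otherwise it is a list of (computingSite,) rows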
if res == None:
    _logger.debug("total %s " % res)
else:
    # release high prio jobs
    sql  = "UPDATE ATLAS_PANDA.jobsActive4 SET jobStatus=:newStatus "
    sql += "WHERE jobStatus=:oldStatus AND prodSourceLabel IN (:p1) AND lockedBy=:lockedBy "
    sql += "AND currentPriority>=:prioCutoff AND computingSite=:computingSite "
    # loop over computing sites
    for computingSite, in res:
        # get site spec
        if not siteMapper.checkSite(computingSite):
            continue
        siteSpec = siteMapper.getSite(computingSite)
        # check if resource fair share is used
        if siteSpec.useResourceFairShare():
            varMap = {}
            varMap[':newStatus'] = 'activated'
            varMap[':oldStatus'] = 'throttled'
            varMap[':p1'] = 'managed'
            varMap[':lockedBy'] = 'jedi'
            varMap[':prioCutoff'] = prioCutoff
            varMap[':computingSite'] = computingSite
            status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10000)


# get statistics
sql  = "SELECT COUNT(*),jobStatus,computingSite,cloud FROM ATLAS_PANDA.jobsActive4 "
sql += "WHERE jobStatus IN (:s1,:s2,:s3) AND prodSourceLabel IN (:p1) AND lockedBy=:lockedBy "
Example #13
0
         # kill only old jobs
         if startTime < timeLimit:
             siteJobsMap[siteName]['running'].append(pandaID)
 # sql to get number of high priority jobs
 sqlHiJobs  = "SELECT count(*) FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
 sqlHiJobs += "WHERE prodSourceLabel=:label AND jobStatus IN (:jobStat1,:jobStat2) "
 sqlHiJobs += "AND currentPriority>=:prio AND computingSite=:site AND eventService IS NULL "
 sqlHiJobs += "AND startTime<:timeLimit "
 # sql to kill job
 sqlKill  = "UPDATE {0}.jobsActive4 ".format(panda_config.schemaPANDA)
 sqlKill += "SET commandToPilot=:com,supErrorCode=:code,supErrorDiag=:diag "
 sqlKill += "WHERE PandaID=:pandaID AND jobStatus=:jobStatus "
 # check all sites
 for siteName,jobsMap in siteJobsMap.iteritems():
     # check jobseed
     siteSpec = siteMapper.getSite(siteName)
     # skip ES-only sites
     if siteSpec.getJobSeed() == 'es':
         continue
     # get number of high priority jobs
     varMap = {}
     varMap[':label'] = 'managed'
     varMap[':jobStat1'] = 'activated'
     varMap[':jobStat2'] = 'starting'
     varMap[':prio'] = 800
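     # jobs with currentPriority >= 800 are counted as high priority here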
     varMap[':timeLimit'] = timeLimit
     varMap[':site'] = siteName
     status,res = taskBuffer.querySQLS(sqlHiJobs,varMap)
     if res != None:
         nJobs = res[0][0]
         nJobsToKill = nJobs-len(siteJobsMap[siteName]['killing'])
         tmpLog.debug("site={0} nHighPrioJobs={1} nRunningES={2} nKillingES={3} nESToKill={4}".format(siteName,nJobs,