def __init__(self, taskBuffer, jobID, jobStatus, xmlFile, ignoreTmpError=True, siteMapper=None):
    self.job = None
    self.jobID = jobID
    self.jobStatus = jobStatus
    self.taskBuffer = taskBuffer
    self.ignoreTmpError = ignoreTmpError
    self.lockXML = None
    self.siteMapper = siteMapper
    self.attemptNr = None
    self.xmlFile = xmlFile
    self.datasetMap = {}
    self.extraInfo = {'surl': {}, 'nevents': {}, 'lbnr': {}, 'endpoint': {}}
    # extract attemptNr
    try:
        tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
        if re.search('^\d+$', tmpAttemptNr) != None:
            self.attemptNr = int(tmpAttemptNr)
    except:
        pass
    # logger
    self.logger = LogWrapper(_logger, str(self.jobID))
def __init__(self, taskBuffer, siteMapper, evpFileName, ignoreError):
    self.taskBuffer = taskBuffer
    self.siteMapper = siteMapper
    self.ignoreError = ignoreError
    self.evpFileName = evpFileName
    self.token = datetime.datetime.utcnow().isoformat(' ')
    # logger
    self.logger = LogWrapper(_logger, self.token)
    self.pd2p = DynDataDistributer.DynDataDistributer([], self.taskBuffer, self.siteMapper,
                                                      token=' ', logger=self.logger)
    self.userDatasetName = ''
    self.creationTime = ''
    self.params = ''
    self.lockedBy = ''
    self.evpFile = None
    self.userTaskName = ''
    # message buffer
    self.msgBuffer = []
    self.lineLimit = 100
    # JEDI
    self.jediTaskID = None
def make_logger(tmp_log, token=None, method_name=None, hook=None):
    # get the method name of the caller
    if method_name is None:
        tmpStr = inspect.stack()[1][3]
    else:
        tmpStr = method_name
    if token is not None:
        tmpStr += ' <{0}>'.format(token)
    else:
        tmpStr += ' :'
    newLog = LogWrapper(tmp_log, tmpStr, seeMem=with_memory_profile, hook=hook)
    return newLog
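# A minimal usage sketch for make_logger (hedged: the _logger object, the PandaID token and the
# process_job function below are illustrative placeholders, not taken from a specific module).
def process_job(panda_id):
    # the logging prefix becomes "process_job <PandaID=...>"
    tmp_log = make_logger(_logger, token='PandaID={0}'.format(panda_id), method_name='process_job')
    tmp_log.debug('start')
    tmp_log.debug('done')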
class CloserAtlasPlugin:
    # constructor
    def __init__(self, job, datasets, log):
        self.jobSpec = job
        self.datasets = datasets
        self.tmpLog = LogWrapper(log, "{0} CloserAtlasPlugin".format(self.jobSpec.PandaID))

    # execute
    def execute(self):
        try:
            # only for production
            if not self.jobSpec.prodSourceLabel in ['managed', 'test']:
                return True
            # only for urgent or high prio
            if not self.jobSpec.processingType in ['urgent'] and self.jobSpec.currentPriority <= 1000:
                return True
            # close datasets
            for datasetSpec in self.datasets:
                if re.search('_sub\d+$', datasetSpec.name) == None:
                    continue
                if datasetSpec.status != 'tobeclosed':
                    continue
                try:
                    self.tmpLog.debug('immediate close {0}'.format(datasetSpec.name))
                    rucioAPI.closeDataset(datasetSpec.name)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    self.tmpLog.warning('failed to close : {0} {1}'.format(errtype, errvalue))
        except:
            errtype, errvalue = sys.exc_info()[:2]
            self.tmpLog.warning('failed to execute : {0} {1}'.format(errtype, errvalue))
        return True
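# A hedged usage sketch for CloserAtlasPlugin: jobSpec, datasetSpecs and _logger stand in for the
# job spec, the dataset spec list and the logger that the Closer would normally pass in.
plugin = CloserAtlasPlugin(jobSpec, datasetSpecs, _logger)
plugin.execute()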
def __init__(self, taskBuffer, jobID, jobStatus, xmlFile, ignoreTmpError=True, siteMapper=None):
    self.job = None
    self.jobID = jobID
    self.jobStatus = jobStatus
    self.taskBuffer = taskBuffer
    self.ignoreTmpError = ignoreTmpError
    self.lockXML = None
    self.siteMapper = siteMapper
    self.attemptNr = None
    self.xmlFile = xmlFile
    self.datasetMap = {}
    self.extraInfo = {'surl': {}, 'nevents': {}, 'lbnr': {}, 'endpoint': {}, 'guid': {}}
    # extract attemptNr
    try:
        tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
        if re.search('^\d+$', tmpAttemptNr) != None:
            self.attemptNr = int(tmpAttemptNr)
    except:
        pass
    # logger
    self.logger = LogWrapper(_logger, str(self.jobID))
class AdderGen: # constructor def __init__(self, taskBuffer, jobID, jobStatus, xmlFile, ignoreTmpError=True, siteMapper=None): self.job = None self.jobID = jobID self.jobStatus = jobStatus self.taskBuffer = taskBuffer self.ignoreTmpError = ignoreTmpError self.lockXML = None self.siteMapper = siteMapper self.attemptNr = None self.xmlFile = xmlFile self.datasetMap = {} self.extraInfo = { 'surl': {}, 'nevents': {}, 'lbnr': {}, 'endpoint': {} } # exstract attemptNr try: tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1] if re.search('^\d+$', tmpAttemptNr) != None: self.attemptNr = int(tmpAttemptNr) except: pass # logger self.logger = LogWrapper(_logger, str(self.jobID)) # dump file report def dumpFileReport(self, fileCatalog, attemptNr): self.logger.debug("dump file report") # dump Catalog into file if attemptNr == None: xmlFile = '%s/%s_%s_%s' % (panda_config.logdir, self.jobID, self.jobStatus, str(uuid.uuid4())) else: xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir, self.jobID, self.jobStatus, str( uuid.uuid4()), attemptNr) file = open(xmlFile, 'w') file.write(fileCatalog) file.close() # get plugin class def getPluginClass(self, tmpVO): # instantiate concrete plugin adderPluginClass = panda_config.getPlugin('adder_plugins', tmpVO) if adderPluginClass == None: # use ATLAS plugin by default from AdderAtlasPlugin import AdderAtlasPlugin adderPluginClass = AdderAtlasPlugin self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__)) return adderPluginClass # main def run(self): try: self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus, self.attemptNr)) # lock XML self.lockXML = open(self.xmlFile) try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) except: self.logger.debug("cannot get lock : %s" % self.xmlFile) self.lockXML.close() # remove XML just in case for the final attempt if not self.ignoreTmpError: try: # remove Catalog os.remove(self.xmlFile) except: pass return # check if file exists if not os.path.exists(self.xmlFile): self.logger.debug("not exist : %s" % self.xmlFile) try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: pass return # query job self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False, fromWaiting=False, forAnal=True)[0] # check if job has finished if self.job == None: self.logger.debug(': job not found in DB') elif self.job.jobStatus in [ 'finished', 'failed', 'unknown', 'merging' ]: self.logger.error(': invalid state -> %s' % self.job.jobStatus) elif self.attemptNr != None and self.job.attemptNr != self.attemptNr: self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr, self.attemptNr)) elif self.attemptNr is not None and self.job.jobStatus == 'transferring': errMsg = 'XML with attemptNr for {0}'.format( self.job.jobStatus) self.logger.error(errMsg) # FIXME raise RuntimeError, errMsg elif self.jobStatus == EventServiceUtils.esRegStatus: # instantiate concrete plugin adderPluginClass = self.getPluginClass(self.job.VO) adderPlugin = adderPluginClass(self.job, taskBuffer=self.taskBuffer, siteMapper=self.siteMapper, logger=self.logger) # execute self.logger.debug('plugin is ready for ES file registration') adderPlugin.registerEventServiceFiles() else: # check file status in JEDI if not self.job.isCancelled( ) and not self.job.taskBufferErrorCode in [ taskbuffer.ErrorCode.EC_PilotRetried ]: fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI( self.job) self.logger.debug("check file status in JEDI : {0}".format( fileCheckInJEDI)) if fileCheckInJEDI == 
None: raise RuntimeError, 'failed to check file status in JEDI' if fileCheckInJEDI == False: # set job status to failed since some file status is wrong in JEDI self.jobStatus = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder errStr = "inconsistent file status between Panda and JEDI. " errStr += "failed to avoid duplicated processing caused by synchronization failure" self.job.ddmErrorDiag = errStr self.logger.debug( "set jobStatus={0} since input is inconsistent between Panda and JEDI" .format(self.jobStatus)) elif self.job.jobSubStatus in ['pilot_closed']: # terminated by the pilot self.logger.debug( "going to closed since terminated by the pilot") retClosed = self.taskBuffer.killJobs([self.jobID], 'pilot', '60', True) if retClosed[0] == True: self.logger.debug("end") try: # remove Catalog os.remove(self.xmlFile) except: pass # unlock XML if self.lockXML != None: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() return # check for cloned jobs if EventServiceUtils.isJobCloningJob(self.job): checkJC = self.taskBuffer.checkClonedJob(self.job) if checkJC == None: raise RuntimeError, 'failed to check the cloned job' # failed to lock semaphore if checkJC['lock'] == False: self.jobStatus = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "failed to lock semaphore for job cloning" self.logger.debug( "set jobStatus={0} since did not get semaphore for job cloning" .format(self.jobStatus)) # use failed for cancelled/closed jobs if self.job.isCancelled(): self.jobStatus = 'failed' # reset error codes to skip retrial module self.job.pilotErrorCode = 0 self.job.exeErrorCode = 0 self.job.ddmErrorCode = 0 # keep old status oldJobStatus = self.job.jobStatus # set job status if not self.job.jobStatus in ['transferring']: self.job.jobStatus = self.jobStatus addResult = None adderPlugin = None # parse XML parseResult = self.parseXML() if parseResult < 2: # intraction with DDM try: # instantiate concrete plugin adderPluginClass = self.getPluginClass(self.job.VO) adderPlugin = adderPluginClass( self.job, taskBuffer=self.taskBuffer, siteMapper=self.siteMapper, extraInfo=self.extraInfo, logger=self.logger) # execute self.logger.debug('plugin is ready') adderPlugin.execute() addResult = adderPlugin.result self.logger.debug('plugin done with %s' % (addResult.statusCode)) except: errtype, errvalue = sys.exc_info()[:2] self.logger.error( "failed to execute AdderPlugin for VO={0} with {1}:{2}" .format(self.job.VO, errtype, errvalue)) addResult = None self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "AdderPlugin failure" # ignore temporary errors if self.ignoreTmpError and addResult != None and addResult.isTemporary( ): self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag) self.logger.debug('escape') # unlock XML try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() self.logger.debug(": %s %s" % (type, value)) self.logger.debug("cannot unlock XML") return # failed if addResult == None or not addResult.isSucceeded(): self.job.jobStatus = 'failed' # set file status for failed jobs or failed transferring jobs self.logger.debug( "status after plugin call :job.jobStatus=%s jobStatus=%s" % (self.job.jobStatus, self.jobStatus)) if self.job.jobStatus == 'failed' or self.jobStatus == 'failed': # First of all: check if job failed and in this case take first actions according to error table source, error_code, error_diag = None, None, None if self.job.pilotErrorCode: source = 
'pilotErrorCode' error_code = self.job.pilotErrorCode error_diag = self.job.pilotErrorDiag elif self.job.exeErrorCode: source = 'exeErrorCode' error_code = self.job.exeErrorCode error_diag = self.job.exeErrorDiag elif self.job.ddmErrorCode: source = 'ddmErrorCode' error_code = self.job.ddmErrorCode error_diag = self.job.ddmErrorDiag elif self.job.transExitCode: source = 'transExitCode' error_code = self.job.transExitCode error_diag = '' # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag)) if source and error_code: try: self.logger.debug( "AdderGen.run will call apply_retrial_rules") retryModule.apply_retrial_rules( self.taskBuffer, self.job.PandaID, source, error_code, error_diag, self.job.attemptNr) self.logger.debug("apply_retrial_rules is back") except Exception as e: self.logger.error( "apply_retrial_rules excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc())) self.job.jobStatus = 'failed' for file in self.job.Files: if file.type in ['output', 'log']: if addResult != None and file.lfn in addResult.mergingFiles: file.status = 'merging' else: file.status = 'failed' else: # reset errors self.job.jobDispatcherErrorCode = 0 self.job.jobDispatcherErrorDiag = 'NULL' # set status if addResult != None and addResult.mergingFiles != []: # set status for merging: for file in self.job.Files: if file.lfn in addResult.mergingFiles: file.status = 'merging' self.job.jobStatus = 'merging' # propagate transition to prodDB self.job.stateChangeTime = time.strftime( '%Y-%m-%d %H:%M:%S', time.gmtime()) elif addResult != None and addResult.transferringFiles != []: # set status for transferring for file in self.job.Files: if file.lfn in addResult.transferringFiles: file.status = 'transferring' self.job.jobStatus = 'transferring' # propagate transition to prodDB self.job.stateChangeTime = time.strftime( '%Y-%m-%d %H:%M:%S', time.gmtime()) else: self.job.jobStatus = 'finished' # endtime if self.job.endTime == 'NULL': self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime()) # output size and # of outputs self.job.nOutputDataFiles = 0 self.job.outputFileBytes = 0 for tmpFile in self.job.Files: if tmpFile.type == 'output': self.job.nOutputDataFiles += 1 try: self.job.outputFileBytes += tmpFile.fsize except: pass # protection maxOutputFileBytes = 99999999999 if self.job.outputFileBytes > maxOutputFileBytes: self.job.outputFileBytes = maxOutputFileBytes # set cancelled state if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed': self.job.jobStatus = 'cancelled' # update job if oldJobStatus in ['cancelled', 'closed']: pass else: self.logger.debug("updating DB") retU = self.taskBuffer.updateJobs( [self.job], False, oldJobStatusList=[oldJobStatus], extraInfo=self.extraInfo) self.logger.debug("retU: %s" % retU) # failed if not retU[0]: self.logger.error( 'failed to update DB for pandaid={0}'.format( self.job.PandaID)) # unlock XML try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() self.logger.debug(": %s %s" % (type, value)) self.logger.debug("cannot unlock XML") return try: # updateJobs was successful and it failed a job with taskBufferErrorCode self.logger.debug("AdderGen.run will peek the job") job_tmp = self.taskBuffer.peekJobs( [self.job.PandaID], fromDefined=False, fromArchived=True, fromWaiting=False)[0] self.logger.debug( "status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}" .format(job_tmp.jobStatus, 
job_tmp.taskBufferErrorCode, job_tmp.taskBufferErrorDiag)) if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode: source = 'taskBufferErrorCode' error_code = job_tmp.taskBufferErrorCode error_diag = job_tmp.taskBufferErrorDiag self.logger.debug( "AdderGen.run 2 will call apply_retrial_rules") retryModule.apply_retrial_rules( self.taskBuffer, job_tmp.PandaID, source, error_code, error_diag, job_tmp.attemptNr) self.logger.debug("apply_retrial_rules 2 is back") except IndexError: pass except Exception as e: self.logger.error( "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc())) # setup for closer if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()): destDBList = [] guidList = [] for file in self.job.Files: # ignore inputs if file.type == 'input': continue # skip pseudo datasets if file.destinationDBlock in ['', None, 'NULL']: continue # start closer for output/log datasets if not file.destinationDBlock in destDBList: destDBList.append(file.destinationDBlock) # collect GUIDs if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \ self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \ and file.type == 'output': # extract base LFN since LFN was changed to full LFN for CMS baseLFN = file.lfn.split('/')[-1] guidList.append({ 'lfn': baseLFN, 'guid': file.GUID, 'type': file.type, 'checksum': file.checksum, 'md5sum': file.md5sum, 'fsize': file.fsize, 'scope': file.scope }) if guidList != []: retG = self.taskBuffer.setGUIDs(guidList) if destDBList != []: # start Closer if adderPlugin != None and hasattr( adderPlugin, 'datasetMap' ) and adderPlugin.datasetMap != {}: cThr = Closer.Closer( self.taskBuffer, destDBList, self.job, datasetMap=adderPlugin.datasetMap) else: cThr = Closer.Closer(self.taskBuffer, destDBList, self.job) self.logger.debug("start Closer") cThr.start() cThr.join() self.logger.debug("end Closer") # run closer for assocaiate parallel jobs if EventServiceUtils.isJobCloningJob(self.job): assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer( self.job.jediTaskID, self.job.PandaID, destDBList) for assJobID, assDBlocks in assDBlockMap.iteritems( ): assJob = self.taskBuffer.peekJobs( [assJobID], fromDefined=False, fromArchived=False, fromWaiting=False, forAnal=True)[0] if self.job == None: self.logger.debug( ': associated job PandaID={0} not found in DB' .format(assJobID)) else: cThr = Closer.Closer( self.taskBuffer, assDBlocks, assJob) self.logger.debug( "start Closer for PandaID={0}".format( assJobID)) cThr.start() cThr.join() self.logger.debug( "end Closer for PandaID={0}".format( assJobID)) self.logger.debug("end") try: # remove Catalog os.remove(self.xmlFile) except: pass # unlock XML if self.lockXML != None: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() errStr = ": %s %s " % (type, value) errStr += traceback.format_exc() self.logger.error(errStr) self.logger.error("except") # unlock XML just in case try: if self.lockXML != None: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) except: type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type, value)) self.logger.error("cannot unlock XML") # parse XML # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service def parseXML(self): # get LFN and GUID self.logger.debug('XML filename : %s' % self.xmlFile) # no outputs if self.job.Files == []: 
self.logger.debug("has no outputs") self.logger.debug("parseXML end") return 0 # get input files inputLFNs = [] for file in self.job.Files: if file.type == 'input': inputLFNs.append(file.lfn) # parse XML lfns = [] guids = [] fsizes = [] md5sums = [] chksums = [] surls = [] fullLfnMap = {} nEventsMap = {} try: root = xml.dom.minidom.parse(self.xmlFile) files = root.getElementsByTagName('File') for file in files: # get GUID guid = str(file.getAttribute('ID')) # get PFN and LFN nodes logical = file.getElementsByTagName('logical')[0] lfnNode = logical.getElementsByTagName('lfn')[0] # convert UTF8 to Raw lfn = str(lfnNode.getAttribute('name')) # get metadata fsize = None md5sum = None adler32 = None surl = None fullLFN = None for meta in file.getElementsByTagName('metadata'): # get fsize name = str(meta.getAttribute('att_name')) if name == 'fsize': fsize = long(meta.getAttribute('att_value')) elif name == 'md5sum': md5sum = str(meta.getAttribute('att_value')) # check if re.search("^[a-fA-F0-9]{32}$", md5sum) == None: md5sum = None elif name == 'adler32': adler32 = str(meta.getAttribute('att_value')) elif name == 'surl': surl = str(meta.getAttribute('att_value')) elif name == 'full_lfn': fullLFN = str(meta.getAttribute('att_value')) # endpoints self.extraInfo['endpoint'][lfn] = [] for epNode in file.getElementsByTagName('endpoint'): self.extraInfo['endpoint'][lfn].append( str(epNode.firstChild.data)) # error check if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)): if EventServiceUtils.isEventServiceMerge(self.job): continue else: raise RuntimeError, 'fsize/md5sum/adler32/surl=None' # append lfns.append(lfn) guids.append(guid) fsizes.append(fsize) md5sums.append(md5sum) surls.append(surl) if adler32 != None: # use adler32 if available chksums.append("ad:%s" % adler32) else: chksums.append("md5:%s" % md5sum) if fullLFN != None: fullLfnMap[lfn] = fullLFN except: # parse json try: import json with open(self.xmlFile) as tmpF: jsonDict = json.load(tmpF) for lfn, fileData in jsonDict.iteritems(): lfn = str(lfn) fsize = None md5sum = None adler32 = None surl = None fullLFN = None guid = str(fileData['guid']) if 'fsize' in fileData: fsize = long(fileData['fsize']) if 'md5sum' in fileData: md5sum = str(fileData['md5sum']) # check if re.search("^[a-fA-F0-9]{32}$", md5sum) == None: md5sum = None if 'adler32' in fileData: adler32 = str(fileData['adler32']) if 'surl' in fileData: surl = str(fileData['surl']) if 'full_lfn' in fileData: fullLFN = str(fileData['full_lfn']) # endpoints self.extraInfo['endpoint'][lfn] = [] if 'endpoint' in fileData: self.extraInfo['endpoint'][lfn] = fileData[ 'endpoint'] # error check if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)): if EventServiceUtils.isEventServiceMerge(self.job): continue else: raise RuntimeError, 'fsize/md5sum/adler32/surl=None' # append lfns.append(lfn) guids.append(guid) fsizes.append(fsize) md5sums.append(md5sum) surls.append(surl) if adler32 != None: # use adler32 if available chksums.append("ad:%s" % adler32) else: chksums.append("md5:%s" % md5sum) if fullLFN != None: fullLfnMap[lfn] = fullLFN except: # check if file exists if os.path.exists(self.xmlFile): type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type, value)) # set failed anyway self.job.jobStatus = 'failed' # XML error happens when pilot got killed due to wall-time limit or failures in wrapper if (self.job.pilotErrorCode in [0,'0','NULL']) and \ (self.job.transExitCode in [0,'0','NULL']): 
self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML" return 2 else: # XML was deleted return 1 # parse metadata to get nEvents try: root = xml.dom.minidom.parseString(self.job.metadata) files = root.getElementsByTagName('File') for file in files: # get GUID guid = str(file.getAttribute('ID')) # get PFN and LFN nodes logical = file.getElementsByTagName('logical')[0] lfnNode = logical.getElementsByTagName('lfn')[0] # convert UTF8 to Raw lfn = str(lfnNode.getAttribute('name')) # get metadata nevents = None for meta in file.getElementsByTagName('metadata'): # get fsize name = str(meta.getAttribute('att_name')) if name == 'events': nevents = long(meta.getAttribute('att_value')) nEventsMap[lfn] = nevents break except: pass self.logger.debug('nEventsMap=%s' % str(nEventsMap)) # parse json try: import json jsonDict = json.loads(self.job.metadata) for jsonFileItem in jsonDict['files']['output']: for jsonSubFileItem in jsonFileItem['subFiles']: lfn = str(jsonSubFileItem['name']) try: nevents = long(jsonSubFileItem['nentries']) nEventsMap[lfn] = nevents except: pass except: pass self.logger.debug('nEventsMapJson=%s' % str(nEventsMap)) # get lumi block number lumiBlockNr = self.job.getLumiBlockNr() # copy files for variable number of outputs tmpStat = self.copyFilesForVariableNumOutputs(lfns) if not tmpStat: self.logger.error( "failed to copy files for variable number of outputs") return 2 # check files fileList = [] for file in self.job.Files: fileList.append(file.lfn) if file.type == 'input': if file.lfn in lfns: if self.job.prodSourceLabel in ['user', 'panda']: # skipped file file.status = 'skipped' elif self.job.prodSourceLabel in [ 'managed', 'test', 'rc_test', 'ptest' ]: # failed by pilot file.status = 'failed' elif file.type == 'output' or file.type == 'log': # add only log file for failed jobs if self.jobStatus == 'failed' and file.type != 'log': file.status = 'failed' continue # set failed if it is missing in XML if not file.lfn in lfns: if self.job.jobStatus == 'finished' and \ (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)): # unset file status for ES jobs pass elif file.isAllowedNoOutput(): # allowed not to be produced file.status = 'nooutput' self.logger.debug('set {0} to status={1}'.format( file.lfn, file.status)) else: file.status = 'failed' self.job.jobStatus = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format( file.lfn) self.logger.error(self.job.ddmErrorDiag) continue # look for GUID with LFN try: i = lfns.index(file.lfn) file.GUID = guids[i] file.fsize = fsizes[i] file.md5sum = md5sums[i] file.checksum = chksums[i] surl = surls[i] # status file.status = 'ready' # change to full LFN if fullLfnMap.has_key(file.lfn): file.lfn = fullLfnMap[file.lfn] # add SURL to extraInfo self.extraInfo['surl'][file.lfn] = surl # add nevents if nEventsMap.has_key(file.lfn): self.extraInfo['nevents'][file.lfn] = nEventsMap[ file.lfn] except: # status file.status = 'failed' type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type, value)) # set lumi block number if lumiBlockNr != None and file.status != 'failed': self.extraInfo['lbnr'][file.lfn] = lumiBlockNr # check consistency between XML and filesTable for lfn in lfns: if not lfn in fileList: self.logger.error("%s is not found in filesTable" % lfn) self.job.jobStatus = 'failed' for tmpFile in self.job.Files: tmpFile.status = 'failed' 
self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format( lfn) return 2 # return self.logger.debug("parseXML end") return 0 # copy files for variable number of outputs def copyFilesForVariableNumOutputs(self, lfns): # get original output files origOutputs = {} updateOrig = {} for tmpFile in self.job.Files: if tmpFile.type in ['output', 'log']: origOutputs[tmpFile.lfn] = tmpFile if tmpFile.lfn in lfns: # keep original updateOrig[tmpFile.lfn] = False else: # overwrite original updateOrig[tmpFile.lfn] = True # look for unkown files addedNewFiles = False for newLFN in lfns: if not newLFN in origOutputs: # look for corresponding original output for origLFN in origOutputs.keys(): tmpPatt = '^{0}\.*_\d+$'.format(origLFN) if re.search(tmpPatt, newLFN) != None: # copy file record tmpStat = self.taskBuffer.copyFileRecord( newLFN, origOutputs[origLFN], updateOrig[origLFN]) if not tmpStat: return False addedNewFiles = True # disable further overwriting updateOrig[origLFN] = False break # refresh job info if addedNewFiles: self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False, fromWaiting=False, forAnal=True)[0] # return return True
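# A hedged driver sketch for AdderGen: taskBuffer, jobID, jobStatus, xmlFile and siteMapper are
# placeholders for the values the Adder main loop would normally supply when it picks up an XML
# file report.
adder = AdderGen(taskBuffer, jobID, jobStatus, xmlFile, ignoreTmpError=True, siteMapper=siteMapper)
adder.run()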
def _getPFNFromLFC(lfns, dq2url, guids, storageName, scopeList=[], tmpLog=None):
    if tmpLog == None:
        tmpLog = LogWrapper(_log, logPrefix)
    tmpLog.debug('_getPFNFromLFC %s %s / %s LFNs:%s %s' % (dq2url, str(storageName),
                                                           len(lfns), str(lfns[:3]), str(scopeList[:3])))
    outStr = ''
    # check parameters
    if guids == [] or storageName == [] or (len(lfns) != len(guids)):
        tmpLog.debug('_getPFNFromLFC done with empty list')
        return outStr
    # check scopeList
    if not scopeList in [None, []] and len(lfns) != len(scopeList):
        tmpLog.warning('_getPFNFromLFC wrong scopeList %s %s %s %s' % (dq2url, str(storageName),
                                                                       str(lfns), str(scopeList)))
        tmpLog.error('_getPFNFromLFC failed')
        return outStr
    # loop over all LFNs
    iLFN = 0
    nLFN = 1000
    strFiles = ''
    outStr = ''
    for iLFN in range(len(lfns)):
        if scopeList != []:
            strFiles += '%s %s %s\n' % (lfns[iLFN], guids[iLFN], scopeList[iLFN])
        else:
            strFiles += '%s %s\n' % (lfns[iLFN], guids[iLFN])
        # bulk operation
        if (iLFN+1) % nLFN == 0 or (iLFN+1) >= len(lfns):
            # write to file
            inFileName = '%s/lfcin.%s' % (panda_config.logdir, commands.getoutput('uuidgen'))
            ifile = open(inFileName, 'w')
            ifile.write(strFiles)
            ifile.close()
            # construct command
            strStorage = ''
            for storage in storageName:
                strStorage += '%s,' % storage
            strStorage = strStorage[:-1]
            com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd, panda_config.home_dir_cwd)
            com += 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; '
            com += 'source %s; %s/python -Wignore %s/LFCclient.py -f %s -l %s -s %s' % \
                   (panda_config.glite_source, panda_config.native_python32, panda_config.lfcClient_dir,
                    inFileName, dq2url, strStorage)
            tmpLog.debug(com)
            # execute
            status, output = commands.getstatusoutput(com)
            tmpLog.debug(status)
            if status == 0:
                outStr += output
            else:
                tmpLog.error("_getPFNFromLFC : %s %s %s" % (dq2url, status, output))
                # send message to logger
                try:
                    # make message
                    message = 'LFC access : %s %s %s' % (dq2url, status, output)
                    # get logger
                    _pandaLogger = PandaLogger()
                    _pandaLogger.lock()
                    _pandaLogger.setParams({'Type': 'broker_util'})
                    logger = _pandaLogger.getHttpLogger(panda_config.loggername)
                    # add message
                    logger.error(message)
                    # release HTTP handler
                    _pandaLogger.release()
                except:
                    pass
                tmpLog.error('_getPFNFromLFC failed')
                return status
            # reset
            strFiles = ''
    tmpLog.debug('_getPFNFromLFC done')
    # return
    return outStr
import sys
import datetime

import taskbuffer.ErrorCode
import pandalogger.PandaLogger
from taskbuffer import EventServiceUtils
from pandalogger.PandaLogger import PandaLogger
from brokerage.SiteMapper import SiteMapper
from pandautils import PandaUtils
from pandalogger.LogWrapper import LogWrapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('add')
tmpLog = LogWrapper(_logger, None)

tmpLog.debug("===================== start =====================")

# overall timeout value
overallTimeout = 20

# grace period
try:
    gracePeriod = int(sys.argv[1])
except:
    gracePeriod = 3

# current minute
currentMinute = datetime.datetime.utcnow().minute
import re
import sys
import datetime
import traceback

from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
from brokerage.SiteMapper import SiteMapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('prioryMassage')
tmpLog = LogWrapper(_logger)
tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# get usage breakdown
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsArchived4']:
    varMap = {}
class EventPicker: # constructor def __init__(self,taskBuffer,siteMapper,evpFileName,ignoreError): self.taskBuffer = taskBuffer self.siteMapper = siteMapper self.ignoreError = ignoreError self.evpFileName = evpFileName self.token = datetime.datetime.utcnow().isoformat(' ') # logger self.logger = LogWrapper(_logger,self.token) self.pd2p = DynDataDistributer.DynDataDistributer([],self.taskBuffer,self.siteMapper, token=' ',logger=self.logger) self.userDatasetName = '' self.creationTime = '' self.params = '' self.lockedBy = '' self.evpFile = None self.userTaskName = '' # message buffer self.msgBuffer = [] self.lineLimit = 100 # JEDI self.jediTaskID = None # main def run(self): try: self.putLog('start %s' % self.evpFileName) # lock evp file self.evpFile = open(self.evpFileName) try: fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_EX|fcntl.LOCK_NB) except: # relase self.putLog("cannot lock %s" % self.evpFileName) self.evpFile.close() return True # options runEvtList = [] eventPickDataType = '' eventPickStreamName = '' eventPickDS = [] eventPickAmiTag = '' eventPickNumSites = 1 inputFileList = [] tagDsList = [] tagQuery = '' tagStreamRef = '' skipDaTRI = False runEvtGuidMap = {} ei_api = '' # read evp file for tmpLine in self.evpFile: tmpMatch = re.search('^([^=]+)=(.+)$',tmpLine) # check format if tmpMatch == None: continue tmpItems = tmpMatch.groups() if tmpItems[0] == 'runEvent': # get run and event number tmpRunEvt = tmpItems[1].split(',') if len(tmpRunEvt) == 2: runEvtList.append(tmpRunEvt) elif tmpItems[0] == 'eventPickDataType': # data type eventPickDataType = tmpItems[1] elif tmpItems[0] == 'eventPickStreamName': # stream name eventPickStreamName = tmpItems[1] elif tmpItems[0] == 'eventPickDS': # dataset pattern eventPickDS = tmpItems[1].split(',') elif tmpItems[0] == 'eventPickAmiTag': # AMI tag eventPickAmiTag = tmpItems[1] elif tmpItems[0] == 'eventPickNumSites': # the number of sites where datasets are distributed try: eventPickNumSites = int(tmpItems[1]) except: pass elif tmpItems[0] == 'userName': # user name self.userDN = tmpItems[1] self.putLog("user=%s" % self.userDN) elif tmpItems[0] == 'userTaskName': # user task name self.userTaskName = tmpItems[1] elif tmpItems[0] == 'userDatasetName': # user dataset name self.userDatasetName = tmpItems[1] elif tmpItems[0] == 'lockedBy': # client name self.lockedBy = tmpItems[1] elif tmpItems[0] == 'creationTime': # creation time self.creationTime = tmpItems[1] elif tmpItems[0] == 'params': # parameters self.params = tmpItems[1] elif tmpItems[0] == 'ei_api': # ei api parameter for MC ei_api = tmpItems[1] elif tmpItems[0] == 'inputFileList': # input file list inputFileList = tmpItems[1].split(',') try: inputFileList.remove('') except: pass elif tmpItems[0] == 'tagDS': # TAG dataset tagDsList = tmpItems[1].split(',') elif tmpItems[0] == 'tagQuery': # query for TAG tagQuery = tmpItems[1] elif tmpItems[0] == 'tagStreamRef': # StreamRef for TAG tagStreamRef = tmpItems[1] if not tagStreamRef.endswith('_ref'): tagStreamRef += '_ref' elif tmpItems[0] == 'runEvtGuidMap': # GUIDs try: exec "runEvtGuidMap="+tmpItems[1] except: pass # extract task name if self.userTaskName == '' and self.params != '': try: tmpMatch = re.search('--outDS(=| ) *([^ ]+)',self.params) if tmpMatch != None: self.userTaskName = tmpMatch.group(2) if not self.userTaskName.endswith('/'): self.userTaskName += '/' except: pass # suppress DaTRI if self.params != '': if '--eventPickSkipDaTRI' in self.params: skipDaTRI = True # get compact user name compactDN = 
self.taskBuffer.cleanUserID(self.userDN) # get jediTaskID self.jediTaskID = self.taskBuffer.getTaskIDwithTaskNameJEDI(compactDN,self.userTaskName) # convert if tagDsList == [] or tagQuery == '': # convert run/event list to dataset/file list tmpRet,locationMap,allFiles = self.pd2p.convertEvtRunToDatasets(runEvtList, eventPickDataType, eventPickStreamName, eventPickDS, eventPickAmiTag, self.userDN, runEvtGuidMap, ei_api ) if not tmpRet: if 'isFatal' in locationMap and locationMap['isFatal'] == True: self.ignoreError = False self.endWithError('Failed to convert the run/event list to a dataset/file list') return False else: # get parent dataset/files with TAG tmpRet,locationMap,allFiles = self.pd2p.getTagParentInfoUsingTagQuery(tagDsList,tagQuery,tagStreamRef) if not tmpRet: self.endWithError('Failed to get parent dataset/file list with TAG') return False # use only files in the list if inputFileList != []: tmpAllFiles = [] for tmpFile in allFiles: if tmpFile['lfn'] in inputFileList: tmpAllFiles.append(tmpFile) allFiles = tmpAllFiles # remove redundant CN from DN tmpDN = self.userDN tmpDN = re.sub('/CN=limited proxy','',tmpDN) tmpDN = re.sub('(/CN=proxy)+$','',tmpDN) # make dataset container tmpRet = self.pd2p.registerDatasetContainerWithDatasets(self.userDatasetName,allFiles, locationMap, nSites=eventPickNumSites, owner=tmpDN) if not tmpRet: self.endWithError('Failed to make a dataset container %s' % self.userDatasetName) return False # skip DaTRI if skipDaTRI: # successfully terminated self.putLog("skip DaTRI") # update task self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID) else: # get candidates tmpRet,candidateMaps = self.pd2p.getCandidates(self.userDatasetName,checkUsedFile=False, useHidden=True) if not tmpRet: self.endWithError('Failed to find candidate for destination') return False # collect all candidates allCandidates = [] for tmpDS,tmpDsVal in candidateMaps.iteritems(): for tmpCloud,tmpCloudVal in tmpDsVal.iteritems(): for tmpSiteName in tmpCloudVal[0]: if not tmpSiteName in allCandidates: allCandidates.append(tmpSiteName) if allCandidates == []: self.endWithError('No candidate for destination') return False # get list of dataset (container) names if eventPickNumSites > 1: # decompose container to transfer datasets separately tmpRet,tmpOut = self.pd2p.getListDatasetReplicasInContainer(self.userDatasetName) if not tmpRet: self.endWithError('Failed to get the size of %s' % self.userDatasetName) return False userDatasetNameList = tmpOut.keys() else: # transfer container at once userDatasetNameList = [self.userDatasetName] # loop over all datasets sitesUsed = [] for tmpUserDatasetName in userDatasetNameList: # get size of dataset container tmpRet,totalInputSize = rucioAPI.getDatasetSize(tmpUserDatasetName) if not tmpRet: self.endWithError('Failed to get the size of %s' % tmpUserDatasetName) return False # run brokerage tmpJob = JobSpec() tmpJob.AtlasRelease = '' self.putLog("run brokerage for %s" % tmpDS) brokerage.broker.schedule([tmpJob],self.taskBuffer,self.siteMapper,True,allCandidates, True,datasetSize=totalInputSize) if tmpJob.computingSite.startswith('ERROR'): self.endWithError('brokerage failed with %s' % tmpJob.computingSite) return False self.putLog("site -> %s" % tmpJob.computingSite) # send transfer request try: tmpDN = rucioAPI.parse_dn(tmpDN) tmpStatus,userInfo = rucioAPI.finger(tmpDN) if not tmpStatus: raise RuntimeError,'user info not found for {0} with {1}'.format(tmpDN,userInfo) tmpDN = userInfo['nickname'] tmpDQ2ID = 
self.siteMapper.getSite(tmpJob.computingSite).ddm_input tmpMsg = "%s ds=%s site=%s id=%s" % ('registerDatasetLocation for DaTRI ', tmpUserDatasetName, tmpDQ2ID, tmpDN) self.putLog(tmpMsg) rucioAPI.registerDatasetLocation(tmpDS,[tmpDQ2ID],lifetime=14,owner=tmpDN, activity="User Subscriptions") self.putLog('OK') except: errType,errValue = sys.exc_info()[:2] tmpStr = 'Failed to send transfer request : %s %s' % (errType,errValue) tmpStr.strip() tmpStr += traceback.format_exc() self.endWithError(tmpStr) return False # list of sites already used sitesUsed.append(tmpJob.computingSite) self.putLog("used %s sites" % len(sitesUsed)) # set candidates if len(sitesUsed) >= eventPickNumSites: # reset candidates to limit the number of sites allCandidates = sitesUsed sitesUsed = [] else: # remove site allCandidates.remove(tmpJob.computingSite) # send email notification for success tmpMsg = 'A transfer request was successfully sent to Rucio.\n' tmpMsg += 'Your task will get started once transfer is completed.' self.sendEmail(True,tmpMsg) try: # unlock and delete evp file fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN) self.evpFile.close() os.remove(self.evpFileName) except: pass # successfully terminated self.putLog("end %s" % self.evpFileName) return True except: errType,errValue = sys.exc_info()[:2] self.endWithError('Got exception %s:%s %s' % (errType,errValue,traceback.format_exc())) return False # end with error def endWithError(self,message): self.putLog(message,'error') # unlock evp file try: fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN) self.evpFile.close() if not self.ignoreError: # remove evp file os.remove(self.evpFileName) # send email notification self.sendEmail(False,message) except: pass # upload log if self.jediTaskID != None: outLog = self.uploadLog() self.taskBuffer.updateTaskErrorDialogJEDI(self.jediTaskID,'event picking failed. '+outLog) # update task if not self.ignoreError: self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID,'tobroken') self.putLog(outLog) self.putLog('end %s' % self.evpFileName) # put log def putLog(self,msg,type='debug'): tmpMsg = msg if type == 'error': self.logger.error(tmpMsg) else: self.logger.debug(tmpMsg) # send email notification def sendEmail(self,isSucceeded,message): # mail address toAdder = Notifier(self.taskBuffer,None,[]).getEmail(self.userDN) if toAdder == '': self.putLog('cannot find email address for %s' % self.userDN,'error') return # subject mailSubject = "PANDA notification for Event-Picking Request" # message mailBody = "Hello,\n\nHere is your request status for event picking\n\n" if isSucceeded: mailBody += "Status : Passed to Rucio\n" else: mailBody += "Status : Failed\n" mailBody += "Created : %s\n" % self.creationTime mailBody += "Ended : %s\n" % datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') mailBody += "Dataset : %s\n" % self.userDatasetName mailBody += "\n" mailBody += "Parameters : %s %s\n" % (self.lockedBy,self.params) mailBody += "\n" mailBody += "%s\n" % message # send retVal = MailUtils().send(toAdder,mailSubject,mailBody) # return return # upload log def uploadLog(self): if self.jediTaskID == None: return 'cannot find jediTaskID' strMsg = self.logger.dumpToString() s,o = Client.uploadLog(strMsg,self.jediTaskID) if s != 0: return "failed to upload log with {0}.".format(s) if o.startswith('http'): return '<a href="{0}">log</a>'.format(o) return o
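# A hedged driver sketch for EventPicker: taskBuffer, siteMapper and the evp file path are
# placeholders for what the event-picking daemon would normally pass in; run() returns True on
# success and False on failure, as in the class above.
picker = EventPicker(taskBuffer, siteMapper, '/path/to/request.evp', ignoreError=True)
if not picker.run():
    _logger.error('EventPicker failed for /path/to/request.evp')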
def getFilesFromLRC(files, url, guids=[], storageName=[], terminateWhenFailed=False, getPFN=False,
                    scopeList=[]):
    tmpLog = LogWrapper(_log, None)
    tmpLog.debug('getFilesFromLRC "%s" %s' % (url, str(storageName)))
    # get PFC
    outSTR = ''
    if url.startswith('mysql://'):
        # from MySQL
        outSTR = _getPFNFromMySQL(files, url)
        # get PFN
        if getPFN:
            outPFN = {}
            # FIXME
            tmpLog.debug('RetPFN:%s ' % str(outPFN))
            return outPFN
    elif url.startswith('http://'):
        # from HTTP I/F
        outSTR = _getPoolFileCatalog(files, url)
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['', None]:
                    root = xml.dom.minidom.parseString(outSTR)
                    fileNodes = root.getElementsByTagName('File')
                    for file in fileNodes:
                        # get PFN and LFN nodes
                        physical = file.getElementsByTagName('physical')[0]
                        pfnNode = physical.getElementsByTagName('pfn')[0]
                        logical = file.getElementsByTagName('logical')[0]
                        lfnNode = logical.getElementsByTagName('lfn')[0]
                        # convert UTF8 to Raw
                        pfn = str(pfnNode.getAttribute('name'))
                        lfn = str(lfnNode.getAttribute('name'))
                        # assign
                        if not outPFN.has_key(lfn):
                            outPFN[lfn] = []
                        outPFN[lfn].append(pfn)
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse XML - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s ' % str(outPFN))
            return outPFN
    elif url.startswith('lfc://') or url.startswith('rucio://'):
        # from LFC
        timeStart = datetime.datetime.utcnow()
        outSTR = _getPFNFromLFC(files, url, guids, storageName, scopeList=scopeList, tmpLog=tmpLog)
        regTime = datetime.datetime.utcnow() - timeStart
        tmpLog.debug('file lookup for %s LFNs from %s took %s.%03d sec' % (len(files), url, regTime.seconds,
                                                                           regTime.microseconds/1000))
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['', None]:
                    tmpItems = outSTR.split('LFCRet :')
                    tmpItems.remove('')
                    # loop over all returns
                    for tmpItem in tmpItems:
                        exec "tmpLFNmap = %s" % tmpItem
                        for tmpLFN, tmpPFN in tmpLFNmap.iteritems():
                            outPFN[tmpLFN] = tmpPFN
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse LFC ret - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s files' % len(outPFN))
            return outPFN
    # check return
    if not isinstance(outSTR, types.StringType):
        if terminateWhenFailed:
            return None
        # set empty string
        outSTR = ''
    # collect OK Files
    okFiles = []
    for file in files:
        if re.search(file, outSTR) != None:
            okFiles.append(file)
    tmpLog.debug('Ret:%s / %s files' % (str(okFiles[:3]), len(okFiles)))
    return okFiles
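# A hedged usage sketch for getFilesFromLRC: the LFN, GUID, catalog URL and storage name below are
# illustrative placeholders, not real site configuration values.
outPFN = getFilesFromLRC(['EVNT.000001.pool.root.1'],
                         'rucio://example-catalog-host/',
                         guids=['A1B2C3D4-E5F6-0000-0000-000000000000'],
                         storageName=['EXAMPLE_DATADISK'],
                         getPFN=True)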
def __init__(self, job, datasets, log):
    self.jobSpec = job
    self.datasets = datasets
    self.tmpLog = LogWrapper(log, "{0} CloserAtlasPlugin".format(self.jobSpec.PandaID))
def run(self):
    try:
        # make a message instance
        tmpLog = LogWrapper(_logger, None)
        # run main procedure in the same process
        if not self.forkRun:
            tmpLog.debug('main start')
            tmpLog.debug('firstSubmission={0}'.format(self.firstSubmission))
            # group jobs per VO
            voJobsMap = {}
            ddmFreeJobs = []
            tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
            for tmpJob in self.jobs:
                # set VO=local for DDM free
                if tmpJob.destinationSE == 'local':
                    tmpVO = 'local'
                else:
                    tmpVO = tmpJob.VO
                # make map
                if not voJobsMap.has_key(tmpVO):
                    voJobsMap[tmpVO] = []
                voJobsMap[tmpVO].append(tmpJob)
            # loop over all VOs
            for tmpVO, tmpJobList in voJobsMap.iteritems():
                tmpLog.debug('vo={0} has {1} jobs'.format(tmpVO, len(tmpJobList)))
                # get plugin
                setupperPluginClass = panda_config.getPlugin('setupper_plugins', tmpVO)
                if setupperPluginClass == None:
                    # use ATLAS plug-in by default
                    from SetupperAtlasPlugin import SetupperAtlasPlugin
                    setupperPluginClass = SetupperAtlasPlugin
                tmpLog.debug('plugin name -> {0}'.format(setupperPluginClass.__name__))
                try:
                    # make plugin
                    setupperPlugin = setupperPluginClass(self.taskBuffer, self.jobs, tmpLog,
                                                         resubmit=self.resubmit,
                                                         pandaDDM=self.pandaDDM,
                                                         ddmAttempt=self.ddmAttempt,
                                                         onlyTA=self.onlyTA,
                                                         firstSubmission=self.firstSubmission)
                    # run plugin
                    tmpLog.debug('run plugin')
                    setupperPlugin.run()
                    # go forward if not TA
                    if not self.onlyTA:
                        # update jobs
                        tmpLog.debug('update jobs')
                        self.updateJobs(setupperPlugin.jobs + setupperPlugin.jumboJobs, tmpLog)
                        # execute post process
                        tmpLog.debug('post execute plugin')
                        setupperPlugin.postRun()
                    tmpLog.debug('done plugin')
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('plugin failed with {0}:{1}'.format(errtype, errvalue))
            tmpLog.debug('main end')
        else:
            tmpLog.debug('fork start')
            # write jobs to file
            import os
            import cPickle as pickle
            outFileName = '%s/set.%s_%s' % (panda_config.logdir, self.jobs[0].PandaID,
                                            commands.getoutput('uuidgen'))
            outFile = open(outFileName, 'w')
            pickle.dump(self.jobs, outFile)
            outFile.close()
            # run main procedure in another process because python doesn't release memory
            com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,
                                                                panda_config.home_dir_cwd)
            com += 'source %s; ' % panda_config.glite_source
            com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                   (panda_config.pandaCommon_dir, panda_config.pandaPython_dir, panda_config.native_python,
                    panda_config.pandaPython_dir, outFileName)
            if self.onlyTA:
                com += " -t"
            if not self.firstSubmission:
                com += " -f"
            tmpLog.debug(com)
            # execute
            status, output = self.taskBuffer.processLimiter.getstatusoutput(com)
            tmpLog.debug("return from main process: %s %s" % (status, output))
            tmpLog.debug('fork end')
    except:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('master failed with {0}:{1}'.format(errtype, errvalue))
def getGUIDsFromEventIndex(self, runEventList, streamName, amiTags, dataType):
    comment = ' /* DBProxy.getGUIDsFromEventIndex */'
    methodName = comment.split(' ')[-2].split('.')[-1]
    tmpLog = LogWrapper(_logger, methodName + " <streamName={0} amiTags={1} dataType={2}>".format(
        streamName, amiTags, dataType))
    try:
        # change to list
        if not amiTags in [None, '']:
            amiTags = amiTags.replace('*', '.*').split(',')
        tmpLog.debug("start for {0} events".format(len(runEventList)))
        # check data type
        if not dataType in ['RAW', 'ESD', 'AOD']:
            return False, 'dataType={0} is unsupported'.format(dataType)
        # sql to insert runs and events
        sqlRE = "INSERT INTO {0}.TMP_RUN_EVENT_PAIRS (runNumber,eventNumber) ".format(panda_config.schemaEI)
        sqlRE += "VALUES (:runNumber,:eventNumber) "
        varMaps = []
        for runNumber, eventNumber in runEventList:
            varMap = {}
            varMap[':runNumber'] = runNumber
            varMap[':eventNumber'] = eventNumber
            varMaps.append(varMap)
        # begin transaction
        self.conn.begin()
        self.cur.arraysize = 100000
        # insert runs and events
        self.cur.executemany(sqlRE + comment, varMaps)
        # read GUIDs
        varMap = {}
        if amiTags in [None, '']:
            sqlRG = "SELECT runNumber,eventNumber,guid_{0} ".format(dataType)
            sqlRG += "FROM {0}.V_PANDA_EVPICK_NOAMITAG_MANY ".format(panda_config.schemaEI)
        else:
            sqlRG = "SELECT runNumber,eventNumber,guid_{0},amiTag ".format(dataType)
            sqlRG += "FROM {0}.V_PANDA_EVPICK_AMITAG_MANY ".format(panda_config.schemaEI)
        if not streamName in [None, '']:
            sqlRG += "WHERE streamName=:streamName "
            varMap[':streamName'] = streamName
        self.cur.execute(sqlRG + comment, varMap)
        resRG = self.cur.fetchall()
        # commit
        if not self._commit():
            raise RuntimeError, 'Commit error'
        retValue = {}
        keyAmiIdxMap = {}
        for tmpItem in resRG:
            if amiTags in [None, '']:
                runNumber, eventNumber, guid = tmpItem
                # dummy
                idxTag = 0
            else:
                runNumber, eventNumber, guid, amiTag = tmpItem
                # get the index number for the AMI tag in the list
                idxTag = self.getIndexAmiTag(amiTags, amiTag)
                # didn't match
                if idxTag == None:
                    continue
            tmpKey = (runNumber, eventNumber)
            # use AMI tags in order of preference
            if tmpKey in keyAmiIdxMap and keyAmiIdxMap[tmpKey] < idxTag:
                continue
            keyAmiIdxMap[tmpKey] = idxTag
            retValue[tmpKey] = [guid]
        tmpLog.debug("found {0} events".format(len(retValue)))
        return True, retValue
    except:
        # roll back
        self._rollback()
        # error
        self.dumpErrorMessage(_logger, methodName)
        return False, None
import sys
import datetime
import traceback

from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
from brokerage.SiteMapper import SiteMapper
from taskbuffer import ErrorCode

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('esPreemption')
tmpLog = LogWrapper(_logger)
tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# time limit
timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=15)

# get low priority ES jobs per site
sqlEsJobs = "SELECT PandaID,computingSite,commandToPilot,startTime "
sqlEsJobs += "FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
class AdderGen: # constructor def __init__(self,taskBuffer,jobID,jobStatus,xmlFile,ignoreTmpError=True,siteMapper=None): self.job = None self.jobID = jobID self.jobStatus = jobStatus self.taskBuffer = taskBuffer self.ignoreTmpError = ignoreTmpError self.lockXML = None self.siteMapper = siteMapper self.attemptNr = None self.xmlFile = xmlFile self.datasetMap = {} self.extraInfo = {'surl':{},'nevents':{},'lbnr':{}} # exstract attemptNr try: tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1] if re.search('^\d+$',tmpAttemptNr) != None: self.attemptNr = int(tmpAttemptNr) except: pass # logger self.logger = LogWrapper(_logger,self.jobID) # main def run(self): try: self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr)) # lock XML self.lockXML = open(self.xmlFile) try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB) except: self.logger.debug("cannot get lock : %s" % self.xmlFile) self.lockXML.close() # remove XML just in case for the final attempt if not self.ignoreTmpError: try: # remove Catalog os.remove(self.xmlFile) except: pass return # check if file exists if not os.path.exists(self.xmlFile): self.logger.debug("not exist : %s" % self.xmlFile) try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: pass return # query job self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False, fromArchived=False, fromWaiting=False, forAnal=True)[0] # check if job has finished if self.job == None: self.logger.debug(': job not found in DB') elif self.job.jobStatus in ['finished','failed','unknown','cancelled','merging']: self.logger.error(': invalid state -> %s' % self.job.jobStatus) elif self.attemptNr != None and self.job.attemptNr != self.attemptNr: self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr)) else: # check file status in JEDI fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job) self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI)) if fileCheckInJEDI == None: raise RuntimeError,'failed to check file status in JEDI' if fileCheckInJEDI == False: # set job status to failed since some file status is wrong in JEDI self.jobStatus = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "wrong file status in JEDI" self.logger.debug("set jobStatus={0} since input are already cancelled in JEDI".format(self.jobStatus)) # keep old status oldJobStatus = self.job.jobStatus # set job status if not self.job.jobStatus in ['transferring']: self.job.jobStatus = self.jobStatus addResult = None adderPlugin = None # parse XML parseResult = self.parseXML() if parseResult < 2: # intraction with DDM try: # set VO=local for DDM free if self.job.destinationSE == 'local': tmpVO = 'local' else: tmpVO = self.job.VO # instantiate concrete plugin adderPluginClass = panda_config.getPlugin('adder_plugins',tmpVO) if adderPluginClass == None: # use ATLAS plugin by default from AdderAtlasPlugin import AdderAtlasPlugin adderPluginClass = AdderAtlasPlugin self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__)) adderPlugin = adderPluginClass(self.job, taskBuffer=self.taskBuffer, siteMapper=self.siteMapper, extraInfo=self.extraInfo, logger=self.logger) # execute self.logger.debug('plugin is ready') adderPlugin.execute() addResult = adderPlugin.result self.logger.debug('plugin done with %s' % (addResult.statusCode)) except: errtype,errvalue = sys.exc_info()[:2] self.logger.error("failed to execute AdderPlugin for VO={0} with 
{1}:{2}".format(tmpVO, errtype, errvalue)) addResult = None self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "AdderPlugin failure" # ignore temporary errors if self.ignoreTmpError and addResult != None and addResult.isTemporary(): self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag) self.logger.debug('escape') # unlock XML try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() self.logger.debug(": %s %s" % (type,value)) self.logger.debug("cannot unlock XML") return # failed if addResult == None or not addResult.isSucceeded(): self.job.jobStatus = 'failed' # set file status for failed jobs or failed transferring jobs if self.job.jobStatus == 'failed' or self.jobStatus == 'failed': self.job.jobStatus = 'failed' for file in self.job.Files: if file.type in ['output','log']: if addResult != None and file.lfn in addResult.mergingFiles: file.status = 'merging' else: file.status = 'failed' else: # reset errors self.job.jobDispatcherErrorCode = 0 self.job.jobDispatcherErrorDiag = 'NULL' # set status if addResult != None and addResult.mergingFiles != []: # set status for merging: for file in self.job.Files: if file.lfn in addResult.mergingFiles: file.status = 'merging' self.job.jobStatus = 'merging' # propagate transition to prodDB self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) elif addResult != None and addResult.transferringFiles != []: # set status for transferring for file in self.job.Files: if file.lfn in addResult.transferringFiles: file.status = 'transferring' self.job.jobStatus = 'transferring' # propagate transition to prodDB self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) else: self.job.jobStatus = 'finished' # endtime if self.job.endTime=='NULL': self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) # output size and # of outputs self.job.nOutputDataFiles = 0 self.job.outputFileBytes = 0 for tmpFile in self.job.Files: if tmpFile.type == 'output': self.job.nOutputDataFiles += 1 try: self.job.outputFileBytes += tmpFile.fsize except: pass # protection maxOutputFileBytes = 99999999999 if self.job.outputFileBytes > maxOutputFileBytes: self.job.outputFileBytes = maxOutputFileBytes # set cancelled state if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed': self.job.jobStatus = 'cancelled' # update job self.logger.debug("updating DB") retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus], extraInfo=self.extraInfo) self.logger.debug("retU: %s" % retU) # failed if not retU[0]: self.logger.error('failed to update DB') # unlock XML try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() self.logger.debug(": %s %s" % (type,value)) self.logger.debug("cannot unlock XML") return # setup for closer if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.jobStatus == 'cancelled'): destDBList = [] guidList = [] for file in self.job.Files: # ignore inputs if file.type == 'input': continue # skip pseudo datasets if file.destinationDBlock in ['',None,'NULL']: continue # start closer for output/log datasets if not file.destinationDBlock in destDBList: destDBList.append(file.destinationDBlock) # collect GUIDs if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \ self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \ and file.type == 
'output': # extract base LFN since LFN was changed to full LFN for CMS baseLFN = file.lfn.split('/')[-1] guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type, 'checksum':file.checksum,'md5sum':file.md5sum, 'fsize':file.fsize,'scope':file.scope}) if guidList != []: retG = self.taskBuffer.setGUIDs(guidList) if destDBList != []: # start Closer if adderPlugin != None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}: cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap) else: cThr = Closer.Closer(self.taskBuffer,destDBList,self.job) self.logger.debug("start Closer") cThr.start() cThr.join() self.logger.debug("end Closer") self.logger.debug("end") try: # remove Catalog os.remove(self.xmlFile) except: pass # unlock XML if self.lockXML != None: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() self.logger.debug(": %s %s" % (type,value)) self.logger.debug("except") # unlock XML just in case try: if self.lockXML != None: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) except: type, value, traceBack = sys.exc_info() self.logger.debug(": %s %s" % (type,value)) self.logger.debug("cannot unlock XML") # parse XML # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service def parseXML(self): # get LFN and GUID self.logger.debug('XML filename : %s' % self.xmlFile) # no outputs if self.job.Files == []: self.logger.debug("has no outputs") self.logger.debug("parseXML end") return 0 # get input files inputLFNs = [] for file in self.job.Files: if file.type == 'input': inputLFNs.append(file.lfn) # parse XML lfns = [] guids = [] fsizes = [] md5sums = [] chksums = [] surls = [] fullLfnMap = {} nEventsMap = {} try: root = xml.dom.minidom.parse(self.xmlFile) files = root.getElementsByTagName('File') for file in files: # get GUID guid = str(file.getAttribute('ID')) # get PFN and LFN nodes logical = file.getElementsByTagName('logical')[0] lfnNode = logical.getElementsByTagName('lfn')[0] # convert UTF8 to Raw lfn = str(lfnNode.getAttribute('name')) # get metadata fsize = None md5sum = None adler32 = None surl = None fullLFN = None for meta in file.getElementsByTagName('metadata'): # get fsize name = str(meta.getAttribute('att_name')) if name == 'fsize': fsize = long(meta.getAttribute('att_value')) elif name == 'md5sum': md5sum = str(meta.getAttribute('att_value')) # check if re.search("^[a-fA-F0-9]{32}$",md5sum) == None: md5sum = None elif name == 'adler32': adler32 = str(meta.getAttribute('att_value')) elif name == 'surl': surl = str(meta.getAttribute('att_value')) elif name == 'full_lfn': fullLFN = str(meta.getAttribute('att_value')) # error check if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)): if EventServiceUtils.isEventServiceMerge(self.job): continue else: raise RuntimeError, 'fsize/md5sum/adler32/surl=None' # append lfns.append(lfn) guids.append(guid) fsizes.append(fsize) md5sums.append(md5sum) surls.append(surl) if adler32 != None: # use adler32 if available chksums.append("ad:%s" % adler32) else: chksums.append("md5:%s" % md5sum) if fullLFN != None: fullLfnMap[lfn] = fullLFN except: # check if file exists if os.path.exists(self.xmlFile): type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type,value)) # set failed anyway self.job.jobStatus = 'failed' # XML error happens when pilot got killed due to wall-time limit or failures in wrapper if (self.job.pilotErrorCode in [0,'0','NULL']) and \ 
(self.job.transExitCode in [0,'0','NULL']): self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML" return 2 else: # XML was deleted return 1 # parse metadata to get nEvents try: root = xml.dom.minidom.parseString(self.job.metadata) files = root.getElementsByTagName('File') for file in files: # get GUID guid = str(file.getAttribute('ID')) # get PFN and LFN nodes logical = file.getElementsByTagName('logical')[0] lfnNode = logical.getElementsByTagName('lfn')[0] # convert UTF8 to Raw lfn = str(lfnNode.getAttribute('name')) # get metadata nevents = None for meta in file.getElementsByTagName('metadata'): # get fsize name = str(meta.getAttribute('att_name')) if name == 'events': nevents = long(meta.getAttribute('att_value')) nEventsMap[lfn] = nevents break except: pass self.logger.debug('nEventsMap=%s' % str(nEventsMap)) # get lumi block number lumiBlockNr = self.job.getLumiBlockNr() # check files fileList = [] for file in self.job.Files: fileList.append(file.lfn) if file.type == 'input': if file.lfn in lfns: if self.job.prodSourceLabel in ['user','panda']: # skipped file file.status = 'skipped' elif self.job.prodSourceLabel in ['managed','test','rc_test','ptest']: # failed by pilot file.status = 'failed' elif file.type == 'output' or file.type == 'log': # add only log file for failed jobs if self.jobStatus == 'failed' and file.type != 'log': file.status = 'failed' continue # set failed if it is missing in XML if not file.lfn in lfns: if self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job): # unset file status for ES jobs pass else: file.status = 'failed' continue # look for GUID with LFN try: i = lfns.index(file.lfn) file.GUID = guids[i] file.fsize = fsizes[i] file.md5sum = md5sums[i] file.checksum = chksums[i] surl = surls[i] # status file.status = 'ready' # change to full LFN if fullLfnMap.has_key(file.lfn): file.lfn = fullLfnMap[file.lfn] # add SURL to extraInfo self.extraInfo['surl'][file.lfn] = surl # add nevents if nEventsMap.has_key(file.lfn): self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn] except: # status file.status = 'failed' type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type,value)) # set lumi block number if lumiBlockNr != None and file.status != 'failed': self.extraInfo['lbnr'][file.lfn] = lumiBlockNr # check consistency between XML and filesTable for lfn in lfns: if not lfn in fileList: self.logger.error("%s is not found in filesTable" % lfn) self.job.jobStatus = 'failed' for tmpFile in self.job.Files: tmpFile.status = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "pilot XML is inconsistent with filesTable" return 2 # return self.logger.debug("parseXML end") return 0
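A minimal, self-contained sketch of the XML walk that parseXML above performs on the pilot file report, assuming a PoolFileCatalog-style document with File/ID, logical/lfn and metadata nodes; the sample XML, the helper name parse_file_report and its return shape are illustrative only, not the production schema or API.

# Hedged sketch: stand-alone illustration of the parsing loop in parseXML above.
# The sample document and helper name are assumptions for illustration only.
import re
import xml.dom.minidom

SAMPLE_XML = """<POOLFILECATALOG>
  <File ID="9a6e7a3c-0000-4000-8000-000000000001">
    <logical><lfn name="EVNT.pool.root.1"/></logical>
    <metadata att_name="fsize" att_value="123456"/>
    <metadata att_name="adler32" att_value="0a1b2c3d"/>
    <metadata att_name="surl" att_value="srm://example.org/path/EVNT.pool.root.1"/>
  </File>
</POOLFILECATALOG>"""

def parse_file_report(xml_string):
    # returns one dict per File node with lfn/guid/fsize/checksum/surl,
    # mirroring the per-file lists built in parseXML above
    out = []
    root = xml.dom.minidom.parseString(xml_string)
    for fNode in root.getElementsByTagName('File'):
        guid = str(fNode.getAttribute('ID'))
        lfn = str(fNode.getElementsByTagName('logical')[0]
                       .getElementsByTagName('lfn')[0].getAttribute('name'))
        fsize = md5sum = adler32 = surl = None
        for meta in fNode.getElementsByTagName('metadata'):
            name = str(meta.getAttribute('att_name'))
            value = str(meta.getAttribute('att_value'))
            if name == 'fsize':
                fsize = int(value)
            elif name == 'md5sum' and re.search("^[a-fA-F0-9]{32}$", value):
                md5sum = value
            elif name == 'adler32':
                adler32 = value
            elif name == 'surl':
                surl = value
        # prefer adler32 when available, as done above
        checksum = "ad:%s" % adler32 if adler32 else "md5:%s" % md5sum
        out.append({'lfn': lfn, 'guid': guid, 'fsize': fsize,
                    'checksum': checksum, 'surl': surl})
    return out

print(parse_file_report(SAMPLE_XML))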
import datetime
import commands

from taskbuffer.TaskBuffer import taskBuffer
from taskbuffer.WorkerSpec import WorkerSpec
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('harvesterCtl')
tmpLog = LogWrapper(_logger, None)

tmpLog.debug("===================== start =====================")

# overall timeout value
overallTimeout = 20

# grace period
try:
    gracePeriod = int(sys.argv[1])
except:
    gracePeriod = 3

# kill old process
try:
    # time limit
def _getPFNFromLFC(lfns, dq2url, guids, storageName, scopeList=[], tmpLog=None): if tmpLog == None: tmpLog = LogWrapper(_log, logPrefix) tmpLog.debug('_getPFNFromLFC %s %s / %s LFNs:%s %s' % (dq2url, str(storageName), len(lfns), str( lfns[:3]), str(scopeList[:3]))) outStr = '' # check paramter if guids == [] or storageName == [] or (len(lfns) != len(guids)): tmpLog.debug('_getPFNFromLFC done with empty list') return outStr # check scopeList if not scopeList in [None, []] and len(lfns) != len(scopeList): tmpLog.warning('_getPFNFromLFC wrong scopeList %s %s %s %s' % (dq2url, str(storageName), str(lfns), str(scopeList))) tmpLog.error('_getPFNFromLFC failed') return outStr # loop over all LFNs iLFN = 0 nLFN = 1000 strFiles = '' outStr = '' for iLFN in range(len(lfns)): if scopeList != []: strFiles += '%s %s %s\n' % (lfns[iLFN], guids[iLFN], scopeList[iLFN]) else: strFiles += '%s %s\n' % (lfns[iLFN], guids[iLFN]) # bulk operation if (iLFN + 1) % nLFN == 0 or (iLFN + 1) >= len(lfns): # write to file inFileName = '%s/lfcin.%s' % (panda_config.logdir, commands.getoutput('uuidgen')) ifile = open(inFileName, 'w') ifile.write(strFiles) ifile.close() # construct commands strStorage = '' for storage in storageName: strStorage += '%s,' % storage strStorage = strStorage[:-1] com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % ( panda_config.home_dir_cwd, panda_config.home_dir_cwd) com += 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; ' com+= 'source %s; %s/python -Wignore %s/LFCclient.py -f %s -l %s -s %s' % \ (panda_config.glite_source,panda_config.native_python32,panda_config.lfcClient_dir, inFileName,dq2url,strStorage) tmpLog.debug(com) # exeute status, output = commands.getstatusoutput(com) tmpLog.debug(status) if status == 0: outStr += output else: tmpLog.error("_getPFNFromLFC : %s %s %s" % (dq2url, status, output)) # send message to logger try: # make message message = 'LFC access : %s %s %s' % (dq2url, status, output) # get logger _pandaLogger = PandaLogger() _pandaLogger.lock() _pandaLogger.setParams({'Type': 'broker_util'}) logger = _pandaLogger.getHttpLogger( panda_config.loggername) # add message logger.error(message) # release HTTP handler _pandaLogger.release() except: pass tmpLog.error('_getPFNFromLFC failed') return status # reset strFiles = '' tmpLog.debug('_getPFNFromLFC done') # return return outStr
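A hedged sketch of the bulk-input preparation that _getPFNFromLFC above performs before invoking LFCclient.py: LFN/GUID (and optionally scope) lines are accumulated and flushed in batches of 1000. The helper name and the in-memory chunk list are illustrative; the real code writes each chunk to a temporary file instead.

# Hedged sketch of the batching logic in _getPFNFromLFC above.
# Chunk size and line format follow the code above; the helper name is an assumption.
def build_lfc_input_chunks(lfns, guids, scopes=None, chunk_size=1000):
    chunks = []
    buf = ''
    for i in range(len(lfns)):
        if scopes:
            buf += '%s %s %s\n' % (lfns[i], guids[i], scopes[i])
        else:
            buf += '%s %s\n' % (lfns[i], guids[i])
        # flush a chunk every chunk_size entries or at the end of the list
        if (i + 1) % chunk_size == 0 or (i + 1) >= len(lfns):
            chunks.append(buf)
            buf = ''
    return chunks

# e.g. 2500 files -> 3 chunks of 1000/1000/500 lines
chunks = build_lfc_input_chunks(['lfn%d' % i for i in range(2500)],
                                ['guid%d' % i for i in range(2500)])
print([len(c.splitlines()) for c in chunks])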
import re

from config import panda_config
from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
import panda_proxy_cache

# logger
_logger = PandaLogger().getLogger('panda_activeusers_query')
tmpLog = LogWrapper(_logger)

if __name__ == '__main__':
    tmpLog.debug("================= start ==================")
    # instantiate TB
    taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()
    # roles
    if hasattr(panda_config, 'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = ['atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot']
    # get users
    sql = 'select distinct DN FROM ATLAS_PANDAMETA.users WHERE GRIDPREF LIKE :patt'
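A small hedged sketch of the role handling shown just above: the proxy-cache roles come from panda_config when configured, otherwise a default ATLAS list is used. The fallback list is taken verbatim from the code above; the stand-in config object is an assumption for illustration.

# Hedged sketch of the role selection above; DummyConfig is illustrative only.
class DummyConfig(object):
    proxy_cache_roles = 'atlas,atlas:/atlas/Role=pilot'

def get_proxy_cache_roles(cfg):
    if hasattr(cfg, 'proxy_cache_roles'):
        return cfg.proxy_cache_roles.split(',')
    return ['atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot']

print(get_proxy_cache_roles(DummyConfig()))   # configured roles
print(get_proxy_cache_roles(object()))        # default roles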
def updateJob(req, jobId, state, token=None, transExitCode=None, pilotErrorCode=None, pilotErrorDiag=None, timestamp=None, timeout=60, xml='', node=None, workdir=None, cpuConsumptionTime=None, cpuConsumptionUnit=None, remainingSpace=None, schedulerID=None, pilotID=None, siteName=None, messageLevel=None, pilotLog='', metaData='', cpuConversionFactor=None, exeErrorCode=None, exeErrorDiag=None, pilotTiming=None, computingElement=None, startTime=None, endTime=None, nEvents=None, nInputFiles=None, batchID=None, attemptNr=None, jobMetrics=None, stdout='', jobSubStatus=None, coreCount=None, maxRSS=None, maxVMEM=None, maxSWAP=None, maxPSS=None, avgRSS=None, avgVMEM=None, avgSWAP=None, avgPSS=None, totRCHAR=None, totWCHAR=None, totRBYTES=None, totWBYTES=None, rateRCHAR=None, rateWCHAR=None, rateRBYTES=None, rateWBYTES=None): tmpLog = LogWrapper( _logger, 'updateJob PandaID={0} PID={1}'.format(jobId, os.getpid())) tmpLog.debug('start') # get DN realDN = _getDN(req) # get FQANs fqans = _getFQAN(req) # check production role prodManager = _checkRole(fqans, realDN, jobDispatcher, site=siteName, hostname=req.get_remote_host()) # check token validToken = _checkToken(token, jobDispatcher) # accept json acceptJson = req.acceptJson() _logger.debug( "updateJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,attemptNr:%s,jobSubStatus:%s,core:%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s,maxRSS=%s,maxVMEM=%s,maxSWAP=%s,maxPSS=%s,avgRSS=%s,avgVMEM=%s,avgSWAP=%s,avgPSS=%s,totRCHAR=%s,totWCHAR=%s,totRBYTES=%s,totWBYTES=%s,rateRCHAR=%s,rateWCHAR=%s,rateRBYTES=%s,rateWBYTES=%s\n==XML==\n%s\n==LOG==\n%s\n==Meta==\n%s\n==Metrics==\n%s\n==stdout==\n%s)" % (jobId, state, transExitCode, pilotErrorCode, pilotErrorDiag, node, workdir, cpuConsumptionTime, cpuConsumptionUnit, remainingSpace, schedulerID, pilotID, siteName, messageLevel, nEvents, nInputFiles, cpuConversionFactor, exeErrorCode, exeErrorDiag, pilotTiming, computingElement, startTime, endTime, batchID, attemptNr, jobSubStatus, coreCount, realDN, prodManager, token, validToken, str(fqans), maxRSS, maxVMEM, maxSWAP, maxPSS, avgRSS, avgVMEM, avgSWAP, avgPSS, totRCHAR, totWCHAR, totRBYTES, totWBYTES, rateRCHAR, rateWCHAR, rateRBYTES, rateWBYTES, xml, pilotLog[:1024], metaData[:1024], jobMetrics, stdout)) _pilotReqLogger.debug('method=updateJob,site=%s,node=%s,type=None' % (siteName, node)) # invalid role if not prodManager: _logger.warning("updateJob(%s) : invalid role" % jobId) if acceptJson: tmpMsg = 'no production/pilot role in VOMS FQANs or non pilot owner' else: tmpMsg = None return Protocol.Response(Protocol.SC_Role, tmpMsg).encode(acceptJson) # invalid token if not validToken: _logger.warning("updateJob(%s) : invalid token" % jobId) return Protocol.Response(Protocol.SC_Invalid).encode(acceptJson) # aborting message if jobId == 'NULL': return Protocol.Response(Protocol.SC_Success).encode(acceptJson) # check status if not state in [ 'running', 'failed', 'finished', 'holding', 'starting', 'transferring' ]: _logger.warning("invalid state=%s for updateJob" % state) return Protocol.Response(Protocol.SC_Success).encode(acceptJson) # create parameter map param = {} if cpuConsumptionTime != None: param['cpuConsumptionTime'] = cpuConsumptionTime if cpuConsumptionUnit != None: param['cpuConsumptionUnit'] = cpuConsumptionUnit if node != None: param['modificationHost'] = node[:128] if transExitCode != None: param['transExitCode'] = transExitCode if pilotErrorCode != None: param['pilotErrorCode'] = pilotErrorCode if pilotErrorDiag != None: 
param['pilotErrorDiag'] = pilotErrorDiag[:500] if jobMetrics != None: param['jobMetrics'] = jobMetrics[:500] if schedulerID != None: param['schedulerID'] = schedulerID if pilotID != None: param['pilotID'] = pilotID[:200] if batchID != None: param['batchID'] = batchID[:80] if exeErrorCode != None: param['exeErrorCode'] = exeErrorCode if exeErrorDiag != None: param['exeErrorDiag'] = exeErrorDiag[:500] if cpuConversionFactor != None: param['cpuConversion'] = cpuConversionFactor if pilotTiming != None: param['pilotTiming'] = pilotTiming if computingElement != None: param['computingElement'] = computingElement if nEvents != None: param['nEvents'] = nEvents if nInputFiles != None: param['nInputFiles'] = nInputFiles if not jobSubStatus in [None, '']: param['jobSubStatus'] = jobSubStatus if not coreCount in [None, '']: param['actualCoreCount'] = coreCount if maxRSS != None: param['maxRSS'] = maxRSS if maxVMEM != None: param['maxVMEM'] = maxVMEM if maxSWAP != None: param['maxSWAP'] = maxSWAP if maxPSS != None: param['maxPSS'] = maxPSS if avgRSS != None: param['avgRSS'] = avgRSS if avgVMEM != None: param['avgVMEM'] = avgVMEM if avgSWAP != None: param['avgSWAP'] = avgSWAP if avgPSS != None: param['avgPSS'] = avgPSS if totRCHAR is not None: totRCHAR = int(totRCHAR) / 1024 # convert to kByte totRCHAR = min(10**10 - 1, totRCHAR) # limit to 10 digit param['totRCHAR'] = totRCHAR if totWCHAR is not None: totWCHAR = int(totWCHAR) / 1024 # convert to kByte totWCHAR = min(10**10 - 1, totWCHAR) # limit to 10 digit param['totWCHAR'] = totWCHAR if totRBYTES is not None: totRBYTES = int(totRBYTES) / 1024 # convert to kByte totRBYTES = min(10**10 - 1, totRBYTES) # limit to 10 digit param['totRBYTES'] = totRBYTES if totWBYTES is not None: totWBYTES = int(totWBYTES) / 1024 # convert to kByte totWBYTES = min(10**10 - 1, totWBYTES) # limit to 10 digit param['totWBYTES'] = totWBYTES if rateRCHAR is not None: rateRCHAR = min(10**10 - 1, int(rateRCHAR)) # limit to 10 digit param['rateRCHAR'] = rateRCHAR if rateWCHAR is not None: rateWCHAR = min(10**10 - 1, int(rateWCHAR)) # limit to 10 digit param['rateWCHAR'] = rateWCHAR if rateRBYTES is not None: rateRBYTES = min(10**10 - 1, int(rateRBYTES)) # limit to 10 digit param['rateRBYTES'] = rateRBYTES if rateWBYTES is not None: rateWBYTES = min(10**10 - 1, int(rateWBYTES)) # limit to 10 digit param['rateWBYTES'] = rateWBYTES if startTime != None: try: param['startTime'] = datetime.datetime( *time.strptime(startTime, '%Y-%m-%d %H:%M:%S')[:6]) except: pass if endTime != None: try: param['endTime'] = datetime.datetime( *time.strptime(endTime, '%Y-%m-%d %H:%M:%S')[:6]) except: pass if attemptNr != None: try: attemptNr = int(attemptNr) except: attemptNr = None if stdout != '': stdout = stdout[:2048] # invoke JD tmpLog.debug('executing') return jobDispatcher.updateJob(int(jobId), state, int(timeout), xml, siteName, param, metaData, pilotLog, attemptNr, stdout, acceptJson)
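A hedged sketch of the I/O-counter normalisation applied above to totRCHAR/totWCHAR/totRBYTES/totWBYTES (convert to kByte, clamp to 10 decimal digits) and to the rate counters (clamp only). The helper name is an assumption, not part of the dispatcher API.

# Hedged sketch of the counter handling in updateJob above.
def normalize_io_counter(value_bytes, to_kilobytes=True):
    value = int(value_bytes)
    if to_kilobytes:
        value = value // 1024          # convert to kByte
    return min(10**10 - 1, value)      # limit to 10 digits

print(normalize_io_counter('2048'))        # -> 2
print(normalize_io_counter(10**15))        # clamped to 9999999999
print(normalize_io_counter(5000, False))   # rate values: clamp only -> 5000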
def application(environ, start_response): # get method name methodName = '' if environ.has_key('SCRIPT_NAME'): methodName = environ['SCRIPT_NAME'].split('/')[-1] tmpLog = LogWrapper(_logger, "PID={0} {1}".format(os.getpid(), methodName)) tmpLog.debug("start") regStart = datetime.datetime.utcnow() retType = None # check method name if not methodName in allowedMethods: tmpLog.error("is forbidden") exeRes = "False : %s is forbidden" % methodName else: # get method object tmpMethod = None try: exec "tmpMethod = %s" % methodName except: pass # object not found if tmpMethod == None: tmpLog.error("is undefined") exeRes = "False" else: try: # get params tmpPars = cgi.FieldStorage(environ['wsgi.input'], environ=environ, keep_blank_values=1) # convert to map params = {} for tmpKey in tmpPars.keys(): if tmpPars[tmpKey].file != None and tmpPars[ tmpKey].filename != None: # file params[tmpKey] = tmpPars[tmpKey] else: # string params[tmpKey] = tmpPars.getfirst(tmpKey) if panda_config.entryVerbose: tmpLog.debug("with %s" % str(params.keys())) # dummy request object dummyReq = DummyReq(environ, tmpLog) # exec exeRes = apply(tmpMethod, [dummyReq], params) # extract return type if type(exeRes) == types.DictType: retType = exeRes['type'] exeRes = exeRes['content'] # convert bool to string if exeRes in [True, False]: exeRes = str(exeRes) except Exception as e: tmpLog.error("execution failure : {0}".format(str(e))) errStr = "" for tmpKey, tmpVal in environ.iteritems(): errStr += "%s : %s\n" % (tmpKey, str(tmpVal)) tmpLog.error(errStr) # return internal server error start_response('500 INTERNAL SERVER ERROR', [('Content-Type', 'text/plain')]) return [str(e)] if panda_config.entryVerbose: tmpLog.debug("done") regTime = datetime.datetime.utcnow() - regStart tmpLog.info( "exec_time=%s.%03d sec, return len=%s B" % (regTime.seconds, regTime.microseconds / 1000, len(str(exeRes)))) # return if exeRes == taskbuffer.ErrorCode.EC_NotFound: start_response('404 Not Found', [('Content-Type', 'text/plain')]) return ['not found'] elif isinstance(exeRes, taskbuffer.ErrorCode.EC_Redirect): start_response('302 Redirect', [('Location', exeRes.url)]) return ['redirect'] else: if retType == 'json': start_response('200 OK', [('Content-Type', 'application/json')]) else: start_response('200 OK', [('Content-Type', 'text/plain')]) return [exeRes]
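A minimal hedged sketch of the dispatch guard used by the WSGI entry point above: the last path component selects a handler only when it appears in the allow-list, otherwise the request is rejected. The handler registry, method names and return strings here are illustrative, not the PanDA server's actual tables.

# Hedged sketch of the allowed-method dispatch in application() above.
def ping(req):
    return "pong"

allowedMethods = ['ping']
methodMap = {'ping': ping}

def dispatch(script_name, req=None):
    methodName = script_name.split('/')[-1]
    if methodName not in allowedMethods:
        return "False : %s is forbidden" % methodName
    return methodMap[methodName](req)

print(dispatch('/server/panda/ping'))      # -> pong
print(dispatch('/server/panda/shutdown'))  # -> forbidden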
class AdderGen: # constructor def __init__(self,taskBuffer,jobID,jobStatus,xmlFile,ignoreTmpError=True,siteMapper=None): self.job = None self.jobID = jobID self.jobStatus = jobStatus self.taskBuffer = taskBuffer self.ignoreTmpError = ignoreTmpError self.lockXML = None self.siteMapper = siteMapper self.attemptNr = None self.xmlFile = xmlFile self.datasetMap = {} self.extraInfo = {'surl':{},'nevents':{},'lbnr':{},'endpoint':{}, 'guid':{}} # exstract attemptNr try: tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1] if re.search('^\d+$',tmpAttemptNr) != None: self.attemptNr = int(tmpAttemptNr) except: pass # logger self.logger = LogWrapper(_logger,str(self.jobID)) # dump file report def dumpFileReport(self,fileCatalog,attemptNr): self.logger.debug("dump file report") # dump Catalog into file if attemptNr == None: xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus, str(uuid.uuid4())) else: xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus, str(uuid.uuid4()),attemptNr) file = open(xmlFile,'w') file.write(fileCatalog) file.close() # get plugin class def getPluginClass(self, tmpVO): # instantiate concrete plugin adderPluginClass = panda_config.getPlugin('adder_plugins',tmpVO) if adderPluginClass == None: # use ATLAS plugin by default from AdderAtlasPlugin import AdderAtlasPlugin adderPluginClass = AdderAtlasPlugin self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__)) return adderPluginClass # main def run(self): try: self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr)) # lock XML self.lockXML = open(self.xmlFile) try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB) except: self.logger.debug("cannot get lock : %s" % self.xmlFile) self.lockXML.close() # remove XML just in case for the final attempt if not self.ignoreTmpError: try: # remove Catalog os.remove(self.xmlFile) except: pass return # check if file exists if not os.path.exists(self.xmlFile): self.logger.debug("not exist : %s" % self.xmlFile) try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: pass return # query job self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False, fromWaiting=False, forAnal=True)[0] # check if job has finished if self.job == None: self.logger.debug(': job not found in DB') elif self.job.jobStatus in ['finished','failed','unknown','merging']: self.logger.error(': invalid state -> %s' % self.job.jobStatus) elif self.attemptNr != None and self.job.attemptNr != self.attemptNr: self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr)) elif self.attemptNr is not None and self.job.jobStatus == 'transferring': errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus) self.logger.error(errMsg) # FIXME raise RuntimeError, errMsg elif self.jobStatus == EventServiceUtils.esRegStatus: # instantiate concrete plugin adderPluginClass = self.getPluginClass(self.job.VO) adderPlugin = adderPluginClass(self.job, taskBuffer=self.taskBuffer, siteMapper=self.siteMapper, logger=self.logger) # execute self.logger.debug('plugin is ready for ES file registration') adderPlugin.registerEventServiceFiles() else: # check file status in JEDI if not self.job.isCancelled() and not self.job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_PilotRetried]: fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job) self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI)) if fileCheckInJEDI == None: raise RuntimeError,'failed to 
check file status in JEDI' if fileCheckInJEDI == False: # set job status to failed since some file status is wrong in JEDI self.jobStatus = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder errStr = "inconsistent file status between Panda and JEDI. " errStr += "failed to avoid duplicated processing caused by synchronization failure" self.job.ddmErrorDiag = errStr self.logger.debug("set jobStatus={0} since input is inconsistent between Panda and JEDI".format(self.jobStatus)) elif self.job.jobSubStatus in ['pilot_closed']: # terminated by the pilot self.logger.debug("going to closed since terminated by the pilot") retClosed = self.taskBuffer.killJobs([self.jobID],'pilot','60',True) if retClosed[0] == True: self.logger.debug("end") try: # remove Catalog os.remove(self.xmlFile) except: pass # unlock XML if self.lockXML != None: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() return # check for cloned jobs if EventServiceUtils.isJobCloningJob(self.job): checkJC = self.taskBuffer.checkClonedJob(self.job) if checkJC == None: raise RuntimeError,'failed to check the cloned job' # failed to lock semaphore if checkJC['lock'] == False: self.jobStatus = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "failed to lock semaphore for job cloning" self.logger.debug("set jobStatus={0} since did not get semaphore for job cloning".format(self.jobStatus)) # use failed for cancelled/closed jobs if self.job.isCancelled(): self.jobStatus = 'failed' # reset error codes to skip retrial module self.job.pilotErrorCode = 0 self.job.exeErrorCode = 0 self.job.ddmErrorCode = 0 # keep old status oldJobStatus = self.job.jobStatus # set job status if not self.job.jobStatus in ['transferring']: self.job.jobStatus = self.jobStatus addResult = None adderPlugin = None # parse XML parseResult = self.parseXML() if parseResult < 2: # intraction with DDM try: # instantiate concrete plugin adderPluginClass = self.getPluginClass(self.job.VO) adderPlugin = adderPluginClass(self.job, taskBuffer=self.taskBuffer, siteMapper=self.siteMapper, extraInfo=self.extraInfo, logger=self.logger) # execute self.logger.debug('plugin is ready') adderPlugin.execute() addResult = adderPlugin.result self.logger.debug('plugin done with %s' % (addResult.statusCode)) except: errtype,errvalue = sys.exc_info()[:2] self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(self.job.VO, errtype, errvalue)) addResult = None self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "AdderPlugin failure" # ignore temporary errors if self.ignoreTmpError and addResult != None and addResult.isTemporary(): self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag) self.logger.debug('escape') # unlock XML try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() self.logger.debug(": %s %s" % (type,value)) self.logger.debug("cannot unlock XML") return # failed if addResult == None or not addResult.isSucceeded(): self.job.jobStatus = 'failed' # set file status for failed jobs or failed transferring jobs self.logger.debug("status after plugin call :job.jobStatus=%s jobStatus=%s" % (self.job.jobStatus, self.jobStatus)) if self.job.jobStatus == 'failed' or self.jobStatus == 'failed': # First of all: check if job failed and in this case take first actions according to error table source, error_code, error_diag = None, None, None if self.job.pilotErrorCode: source = 'pilotErrorCode' error_code = self.job.pilotErrorCode error_diag 
= self.job.pilotErrorDiag elif self.job.exeErrorCode: source = 'exeErrorCode' error_code = self.job.exeErrorCode error_diag = self.job.exeErrorDiag elif self.job.ddmErrorCode: source = 'ddmErrorCode' error_code = self.job.ddmErrorCode error_diag = self.job.ddmErrorDiag elif self.job.transExitCode: source = 'transExitCode' error_code = self.job.transExitCode error_diag = '' # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag)) if source and error_code: try: self.logger.debug("AdderGen.run will call apply_retrial_rules") retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID, source, error_code, error_diag, self.job.attemptNr) self.logger.debug("apply_retrial_rules is back") except Exception as e: self.logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s"%(e, traceback.format_exc())) self.job.jobStatus = 'failed' for file in self.job.Files: if file.type in ['output','log']: if addResult != None and file.lfn in addResult.mergingFiles: file.status = 'merging' else: file.status = 'failed' else: # reset errors self.job.jobDispatcherErrorCode = 0 self.job.jobDispatcherErrorDiag = 'NULL' # set status if addResult != None and addResult.mergingFiles != []: # set status for merging: for file in self.job.Files: if file.lfn in addResult.mergingFiles: file.status = 'merging' self.job.jobStatus = 'merging' # propagate transition to prodDB self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) elif addResult != None and addResult.transferringFiles != []: # set status for transferring for file in self.job.Files: if file.lfn in addResult.transferringFiles: file.status = 'transferring' self.job.jobStatus = 'transferring' self.job.jobSubStatus = None # propagate transition to prodDB self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) else: self.job.jobStatus = 'finished' # endtime if self.job.endTime=='NULL': self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) # output size and # of outputs self.job.nOutputDataFiles = 0 self.job.outputFileBytes = 0 for tmpFile in self.job.Files: if tmpFile.type == 'output': self.job.nOutputDataFiles += 1 try: self.job.outputFileBytes += tmpFile.fsize except: pass # protection maxOutputFileBytes = 99999999999 if self.job.outputFileBytes > maxOutputFileBytes: self.job.outputFileBytes = maxOutputFileBytes # set cancelled state if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed': self.job.jobStatus = 'cancelled' # update job if oldJobStatus in ['cancelled','closed']: pass else: self.logger.debug("updating DB") retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus], extraInfo=self.extraInfo) self.logger.debug("retU: %s" % retU) # failed if not retU[0]: self.logger.error('failed to update DB for pandaid={0}'.format(self.job.PandaID)) # unlock XML try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() self.logger.debug(": %s %s" % (type,value)) self.logger.debug("cannot unlock XML") return try: # updateJobs was successful and it failed a job with taskBufferErrorCode self.logger.debug("AdderGen.run will peek the job") job_tmp = self.taskBuffer.peekJobs([self.job.PandaID], fromDefined=False, fromArchived=True, fromWaiting=False)[0] self.logger.debug("status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}".format(job_tmp.jobStatus, job_tmp.taskBufferErrorCode, job_tmp.taskBufferErrorDiag)) if 
job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode: source = 'taskBufferErrorCode' error_code = job_tmp.taskBufferErrorCode error_diag = job_tmp.taskBufferErrorDiag self.logger.debug("AdderGen.run 2 will call apply_retrial_rules") retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source, error_code, error_diag, job_tmp.attemptNr) self.logger.debug("apply_retrial_rules 2 is back") except IndexError: pass except Exception as e: self.logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc())) # setup for closer if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()): destDBList = [] guidList = [] for file in self.job.Files: # ignore inputs if file.type == 'input': continue # skip pseudo datasets if file.destinationDBlock in ['',None,'NULL']: continue # start closer for output/log datasets if not file.destinationDBlock in destDBList: destDBList.append(file.destinationDBlock) # collect GUIDs if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and \ self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \ and file.type == 'output': # extract base LFN since LFN was changed to full LFN for CMS baseLFN = file.lfn.split('/')[-1] guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type, 'checksum':file.checksum,'md5sum':file.md5sum, 'fsize':file.fsize,'scope':file.scope}) if guidList != []: retG = self.taskBuffer.setGUIDs(guidList) if destDBList != []: # start Closer if adderPlugin != None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}: cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap) else: cThr = Closer.Closer(self.taskBuffer,destDBList,self.job) self.logger.debug("start Closer") cThr.start() cThr.join() self.logger.debug("end Closer") # run closer for assocaiate parallel jobs if EventServiceUtils.isJobCloningJob(self.job): assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(self.job.jediTaskID,self.job.PandaID, destDBList) for assJobID,assDBlocks in assDBlockMap.iteritems(): assJob = self.taskBuffer.peekJobs([assJobID],fromDefined=False, fromArchived=False, fromWaiting=False, forAnal=True)[0] if self.job == None: self.logger.debug(': associated job PandaID={0} not found in DB'.format(assJobID)) else: cThr = Closer.Closer(self.taskBuffer,assDBlocks,assJob) self.logger.debug("start Closer for PandaID={0}".format(assJobID)) cThr.start() cThr.join() self.logger.debug("end Closer for PandaID={0}".format(assJobID)) self.logger.debug("end") try: # remove Catalog os.remove(self.xmlFile) except: pass # unlock XML if self.lockXML != None: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() errStr = ": %s %s " % (type,value) errStr += traceback.format_exc() self.logger.error(errStr) self.logger.error("except") # unlock XML just in case try: if self.lockXML != None: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) except: type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type,value)) self.logger.error("cannot unlock XML") # parse XML # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service def parseXML(self): # get LFN and GUID self.logger.debug('XML filename : %s' % self.xmlFile) # no outputs if self.job.Files == []: self.logger.debug("has no outputs") self.logger.debug("parseXML end") return 0 # get input files 
inputLFNs = [] for file in self.job.Files: if file.type == 'input': inputLFNs.append(file.lfn) # parse XML lfns = [] guids = [] fsizes = [] md5sums = [] chksums = [] surls = [] fullLfnMap = {} nEventsMap = {} guidMap = dict() try: root = xml.dom.minidom.parse(self.xmlFile) files = root.getElementsByTagName('File') for file in files: # get GUID guid = str(file.getAttribute('ID')) # get PFN and LFN nodes logical = file.getElementsByTagName('logical')[0] lfnNode = logical.getElementsByTagName('lfn')[0] # convert UTF8 to Raw lfn = str(lfnNode.getAttribute('name')) # get metadata fsize = None md5sum = None adler32 = None surl = None fullLFN = None for meta in file.getElementsByTagName('metadata'): # get fsize name = str(meta.getAttribute('att_name')) if name == 'fsize': fsize = long(meta.getAttribute('att_value')) elif name == 'md5sum': md5sum = str(meta.getAttribute('att_value')) # check if re.search("^[a-fA-F0-9]{32}$",md5sum) == None: md5sum = None elif name == 'adler32': adler32 = str(meta.getAttribute('att_value')) elif name == 'surl': surl = str(meta.getAttribute('att_value')) elif name == 'full_lfn': fullLFN = str(meta.getAttribute('att_value')) # endpoints self.extraInfo['endpoint'][lfn] = [] for epNode in file.getElementsByTagName('endpoint'): self.extraInfo['endpoint'][lfn].append(str(epNode.firstChild.data)) # error check if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)): if EventServiceUtils.isEventServiceMerge(self.job): continue else: raise RuntimeError, 'fsize/md5sum/adler32/surl=None' # append lfns.append(lfn) guids.append(guid) fsizes.append(fsize) md5sums.append(md5sum) surls.append(surl) if adler32 != None: # use adler32 if available chksums.append("ad:%s" % adler32) else: chksums.append("md5:%s" % md5sum) if fullLFN != None: fullLfnMap[lfn] = fullLFN except: # parse json try: import json with open(self.xmlFile) as tmpF: jsonDict = json.load(tmpF) for lfn, fileData in jsonDict.iteritems(): lfn = str(lfn) fsize = None md5sum = None adler32 = None surl = None fullLFN = None guid = str(fileData['guid']) if 'fsize' in fileData: fsize = long(fileData['fsize']) if 'md5sum' in fileData: md5sum = str(fileData['md5sum']) # check if re.search("^[a-fA-F0-9]{32}$",md5sum) == None: md5sum = None if 'adler32' in fileData: adler32 = str(fileData['adler32']) if 'surl' in fileData: surl = str(fileData['surl']) if 'full_lfn' in fileData: fullLFN = str(fileData['full_lfn']) # endpoints self.extraInfo['endpoint'][lfn] = [] if 'endpoint' in fileData: self.extraInfo['endpoint'][lfn] = fileData['endpoint'] # error check if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)): if EventServiceUtils.isEventServiceMerge(self.job): continue else: raise RuntimeError, 'fsize/md5sum/adler32/surl=None' # append lfns.append(lfn) guids.append(guid) fsizes.append(fsize) md5sums.append(md5sum) surls.append(surl) if adler32 != None: # use adler32 if available chksums.append("ad:%s" % adler32) else: chksums.append("md5:%s" % md5sum) if fullLFN != None: fullLfnMap[lfn] = fullLFN except: # check if file exists if os.path.exists(self.xmlFile): type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type,value)) # set failed anyway self.job.jobStatus = 'failed' # XML error happens when pilot got killed due to wall-time limit or failures in wrapper if (self.job.pilotErrorCode in [0,'0','NULL']) and \ (self.job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_WorkerDone]) and \ (self.job.transExitCode in [0,'0','NULL']): 
self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML" return 2 else: # XML was deleted return 1 # parse metadata to get nEvents try: root = xml.dom.minidom.parseString(self.job.metadata) files = root.getElementsByTagName('File') for file in files: # get GUID guid = str(file.getAttribute('ID')) # get PFN and LFN nodes logical = file.getElementsByTagName('logical')[0] lfnNode = logical.getElementsByTagName('lfn')[0] # convert UTF8 to Raw lfn = str(lfnNode.getAttribute('name')) guidMap[lfn] = guid # get metadata nevents = None for meta in file.getElementsByTagName('metadata'): # get fsize name = str(meta.getAttribute('att_name')) if name == 'events': nevents = long(meta.getAttribute('att_value')) nEventsMap[lfn] = nevents break except: pass # parse json try: import json jsonDict = json.loads(self.job.metadata) for jsonFileItem in jsonDict['files']['output']: for jsonSubFileItem in jsonFileItem['subFiles']: lfn = str(jsonSubFileItem['name']) try: nevents = long(jsonSubFileItem['nentries']) nEventsMap[lfn] = nevents except: pass try: guid = str(jsonSubFileItem['file_guid']) guidMap[lfn] = guid except: pass except: pass self.logger.debug('nEventsMap=%s' % str(nEventsMap)) self.logger.debug('guidMap=%s' % str(guidMap)) # get lumi block number lumiBlockNr = self.job.getLumiBlockNr() # copy files for variable number of outputs tmpStat = self.copyFilesForVariableNumOutputs(lfns) if not tmpStat: self.logger.error("failed to copy files for variable number of outputs") return 2 # check files fileList = [] for file in self.job.Files: fileList.append(file.lfn) if file.type == 'input': if file.lfn in lfns: if self.job.prodSourceLabel in ['user','panda']: # skipped file file.status = 'skipped' elif self.job.prodSourceLabel in ['managed','test'] + JobUtils.list_ptest_prod_sources: # failed by pilot file.status = 'failed' elif file.type == 'output' or file.type == 'log': # add only log file for failed jobs if self.jobStatus == 'failed' and file.type != 'log': file.status = 'failed' continue # set failed if it is missing in XML if not file.lfn in lfns: if self.job.jobStatus == 'finished' and \ (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)): # unset file status for ES jobs pass elif file.isAllowedNoOutput(): # allowed not to be produced file.status = 'nooutput' self.logger.debug('set {0} to status={1}'.format(file.lfn,file.status)) else: file.status = 'failed' self.job.jobStatus = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(file.lfn) self.logger.error(self.job.ddmErrorDiag) continue # look for GUID with LFN try: i = lfns.index(file.lfn) file.GUID = guids[i] file.fsize = fsizes[i] file.md5sum = md5sums[i] file.checksum = chksums[i] surl = surls[i] # status file.status = 'ready' # change to full LFN if fullLfnMap.has_key(file.lfn): file.lfn = fullLfnMap[file.lfn] # add SURL to extraInfo self.extraInfo['surl'][file.lfn] = surl # add nevents if nEventsMap.has_key(file.lfn): self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn] except: # status file.status = 'failed' type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type,value)) # set lumi block number if lumiBlockNr != None and file.status != 'failed': self.extraInfo['lbnr'][file.lfn] = lumiBlockNr self.extraInfo['guid'] = guidMap # check consistency between XML and filesTable for lfn in lfns: if not lfn in fileList: self.logger.error("%s is not 
found in filesTable" % lfn) self.job.jobStatus = 'failed' for tmpFile in self.job.Files: tmpFile.status = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(lfn) return 2 # return self.logger.debug("parseXML end") return 0 # copy files for variable number of outputs def copyFilesForVariableNumOutputs(self,lfns): # get original output files origOutputs = {} updateOrig = {} for tmpFile in self.job.Files: if tmpFile.type in ['output','log']: origOutputs[tmpFile.lfn] = tmpFile if tmpFile.lfn in lfns: # keep original updateOrig[tmpFile.lfn] = False else: # overwrite original updateOrig[tmpFile.lfn] = True # look for unkown files addedNewFiles = False for newLFN in lfns: if not newLFN in origOutputs: # look for corresponding original output for origLFN in origOutputs.keys(): tmpPatt = '^{0}\.*_\d+$'.format(origLFN) if re.search(tmpPatt,newLFN) != None: # copy file record tmpStat = self.taskBuffer.copyFileRecord(newLFN,origOutputs[origLFN],updateOrig[origLFN]) if not tmpStat: return False addedNewFiles = True # disable further overwriting updateOrig[origLFN] = False break # refresh job info if addedNewFiles: self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False, fromWaiting=False, forAnal=True)[0] # return return True
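A hedged sketch of the LFN pattern check used by copyFilesForVariableNumOutputs above: a newly reported LFN is associated with an original output when it is the original name followed by an underscore and a number. The sample LFNs are invented; as in the code above, dots in the original name are not regex-escaped.

# Hedged sketch of the variable-number-of-outputs matching above.
import re

def find_original_output(new_lfn, original_lfns):
    for orig in original_lfns:
        if re.search(r'^{0}\.*_\d+$'.format(orig), new_lfn) is not None:
            return orig
    return None

origs = ['user.alice.mytask.AOD.pool.root']
print(find_original_output('user.alice.mytask.AOD.pool.root._001', origs))    # matched
print(find_original_output('user.alice.othertask.AOD.pool.root._001', origs)) # None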
def run(self): try: # make a message instance tmpLog = LogWrapper(_logger,None) # run main procedure in the same process if not self.forkRun: tmpLog.debug('main start') tmpLog.debug('firstSubmission={0}'.format(self.firstSubmission)) # group jobs per VO voJobsMap = {} ddmFreeJobs = [] tmpLog.debug('{0} jobs in total'.format(len(self.jobs))) for tmpJob in self.jobs: # set VO=local for DDM free if tmpJob.destinationSE == 'local': tmpVO = 'local' else: tmpVO = tmpJob.VO # make map if not voJobsMap.has_key(tmpVO): voJobsMap[tmpVO] = [] voJobsMap[tmpVO].append(tmpJob) # loop over all VOs for tmpVO,tmpJobList in voJobsMap.iteritems(): tmpLog.debug('vo={0} has {1} jobs'.format(tmpVO,len(tmpJobList))) # get plugin setupperPluginClass = panda_config.getPlugin('setupper_plugins',tmpVO) if setupperPluginClass == None: # use ATLAS plug-in by default from SetupperAtlasPlugin import SetupperAtlasPlugin setupperPluginClass = SetupperAtlasPlugin tmpLog.debug('plugin name -> {0}'.format(setupperPluginClass.__name__)) try: # make plugin setupperPlugin = setupperPluginClass(self.taskBuffer,self.jobs,tmpLog, resubmit=self.resubmit, pandaDDM=self.pandaDDM, ddmAttempt=self.ddmAttempt, onlyTA=self.onlyTA, firstSubmission=self.firstSubmission) # run plugin tmpLog.debug('run plugin') setupperPlugin.run() # go forward if not TA if not self.onlyTA: # update jobs tmpLog.debug('update jobs') self.updateJobs(setupperPlugin.jobs+setupperPlugin.jumboJobs,tmpLog) # execute post process tmpLog.debug('post execute plugin') setupperPlugin.postRun() tmpLog.debug('done plugin') except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('plugin failed with {0}:{1}'.format(errtype, errvalue)) tmpLog.debug('main end') else: tmpLog.debug('fork start') # write jobs to file import os import cPickle as pickle outFileName = '%s/set.%s_%s' % (panda_config.logdir,self.jobs[0].PandaID,commands.getoutput('uuidgen')) outFile = open(outFileName,'w') pickle.dump(self.jobs,outFile) outFile.close() # run main procedure in another process because python doesn't release memory com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd) com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \ (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python, panda_config.pandaPython_dir,outFileName) if self.onlyTA: com += " -t" if not self.firstSubmission: com += " -f" tmpLog.debug(com) # exeute status,output = self.taskBuffer.processLimiter.getstatusoutput(com) tmpLog.debug("return from main process: %s %s" % (status,output)) tmpLog.debug('fork end') except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('master failed with {0}:{1}'.format(errtype,errvalue))
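A hedged sketch of the per-VO grouping done at the start of the non-forking branch above, where jobs with destinationSE equal to 'local' are grouped under the pseudo-VO 'local' before a setupper plugin is chosen per VO. The tiny job stand-in class is an assumption for illustration.

# Hedged sketch of the voJobsMap construction above; FakeJob is illustrative only.
class FakeJob(object):
    def __init__(self, vo, destinationSE=''):
        self.VO = vo
        self.destinationSE = destinationSE

def group_jobs_per_vo(jobs):
    voJobsMap = {}
    for job in jobs:
        vo = 'local' if job.destinationSE == 'local' else job.VO
        voJobsMap.setdefault(vo, []).append(job)
    return voJobsMap

jobs = [FakeJob('atlas'), FakeJob('atlas'), FakeJob('lsst'), FakeJob('atlas', 'local')]
grouped = group_jobs_per_vo(jobs)
print({vo: len(lst) for vo, lst in grouped.items()})  # {'atlas': 2, 'lsst': 1, 'local': 1}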
def run(self):
    # get logger
    tmpLog = LogWrapper(_logger, '<vuid={0} site={1} name={2}>'.format(self.vuid,
                                                                       self.site,
                                                                       self.dataset))
    # query dataset
    tmpLog.debug("start")
    if self.vuid != None:
        dataset = self.taskBuffer.queryDatasetWithMap({'vuid': self.vuid})
    else:
        dataset = self.taskBuffer.queryDatasetWithMap({'name': self.dataset})
    if dataset == None:
        tmpLog.error("Not found")
        tmpLog.debug("end")
        return
    tmpLog.debug("type:%s name:%s" % (dataset.type, dataset.name))
    if dataset.type == 'dispatch':
        # activate jobs in jobsDefined
        Activator(self.taskBuffer, dataset).start()
    if dataset.type == 'output':
        if dataset.name != None and re.search('^panda\..*_zip$', dataset.name) != None:
            # start unmerge jobs
            Activator(self.taskBuffer, dataset, enforce=True).start()
        else:
            # finish transferring jobs
            Finisher(self.taskBuffer, dataset, site=self.site).start()
    tmpLog.debug("end")
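A hedged sketch of the branch logic in run() above, expressed as a pure function for illustration: dispatch datasets trigger activation, output datasets matching the panda.*_zip convention trigger unmerge activation, and other output datasets are finished.

# Hedged sketch of the callback decision in run() above; names are illustrative.
import re

def callback_action(dataset_type, dataset_name):
    if dataset_type == 'dispatch':
        return 'activate'                 # activate jobs in jobsDefined
    if dataset_type == 'output':
        if dataset_name and re.search(r'^panda\..*_zip$', dataset_name):
            return 'activate_unmerge'     # start unmerge jobs
        return 'finish'                   # finish transferring jobs
    return 'ignore'

print(callback_action('output', 'panda.destDB.abc_zip'))   # activate_unmerge
print(callback_action('output', 'mc16.12345.AOD'))         # finish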
def updateJob(req, jobId, state, token=None, transExitCode=None, pilotErrorCode=None, pilotErrorDiag=None, timestamp=None, timeout=60, xml='', node=None, workdir=None, cpuConsumptionTime=None, cpuConsumptionUnit=None, remainingSpace=None, schedulerID=None, pilotID=None, siteName=None, messageLevel=None, pilotLog='', metaData='', cpuConversionFactor=None, exeErrorCode=None, exeErrorDiag=None, pilotTiming=None, computingElement=None, startTime=None, endTime=None, nEvents=None, nInputFiles=None, batchID=None, attemptNr=None, jobMetrics=None, stdout='', jobSubStatus=None, coreCount=None, maxRSS=None, maxVMEM=None, maxSWAP=None, maxPSS=None, avgRSS=None, avgVMEM=None, avgSWAP=None, avgPSS=None): tmpLog = LogWrapper( _logger, 'updateJob PandaID={0} PID={1}'.format(jobId, os.getpid())) tmpLog.debug('start') # get DN realDN = _getDN(req) # get FQANs fqans = _getFQAN(req) # check production role prodManager = _checkRole(fqans, realDN, jobDispatcher, site=siteName, hostname=req.get_remote_host()) # check token validToken = _checkToken(token, jobDispatcher) # accept json acceptJson = req.acceptJson() _logger.debug( "updateJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,attemptNr:%s,jobSubStatus:%s,core:%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s,maxRSS=%s,maxVMEM=%s,maxSWAP=%s,maxPSS=%s,avgRSS=%s,avgVMEM=%s,avgSWAP=%s,avgPSS=%s\n==XML==\n%s\n==LOG==\n%s\n==Meta==\n%s\n==Metrics==\n%s\n==stdout==\n%s)" % (jobId, state, transExitCode, pilotErrorCode, pilotErrorDiag, node, workdir, cpuConsumptionTime, cpuConsumptionUnit, remainingSpace, schedulerID, pilotID, siteName, messageLevel, nEvents, nInputFiles, cpuConversionFactor, exeErrorCode, exeErrorDiag, pilotTiming, computingElement, startTime, endTime, batchID, attemptNr, jobSubStatus, coreCount, realDN, prodManager, token, validToken, str(fqans), maxRSS, maxVMEM, maxSWAP, maxPSS, avgRSS, avgVMEM, avgSWAP, avgPSS, xml, pilotLog, metaData, jobMetrics, stdout)) _pilotReqLogger.info('method=updateJob,site=%s,node=%s,type=None' % (siteName, node)) # invalid role if not prodManager: _logger.warning("updateJob(%s) : invalid role" % jobId) return Protocol.Response(Protocol.SC_Role).encode(acceptJson) # invalid token if not validToken: _logger.warning("updateJob(%s) : invalid token" % jobId) return Protocol.Response(Protocol.SC_Invalid).encode(acceptJson) # aborting message if jobId == 'NULL': return Protocol.Response(Protocol.SC_Success).encode(acceptJson) # check status if not state in [ 'running', 'failed', 'finished', 'holding', 'starting', 'transferring' ]: _logger.warning("invalid state=%s for updateJob" % state) return Protocol.Response(Protocol.SC_Success).encode(acceptJson) # pilot log tmpLog.debug('sending log') if pilotLog != '': try: # make message message = pilotLog # get logger _pandaLogger = PandaLogger() _pandaLogger.lock() _pandaLogger.setParams({'Type': 'pilotLog', 'PandaID': int(jobId)}) logger = _pandaLogger.getHttpLogger(panda_config.loggername) # add message logger.info(message) except: tmpLog.debug('failed to send log') finally: tmpLog.debug('release lock') try: # release HTTP handler _pandaLogger.release() except: pass tmpLog.debug('done log') # create parameter map param = {} if cpuConsumptionTime != None: param['cpuConsumptionTime'] = cpuConsumptionTime if cpuConsumptionUnit != None: param['cpuConsumptionUnit'] = cpuConsumptionUnit if node != None: param['modificationHost'] = node[:128] if transExitCode != None: param['transExitCode'] = transExitCode if pilotErrorCode != None: param['pilotErrorCode'] = 
pilotErrorCode if pilotErrorDiag != None: param['pilotErrorDiag'] = pilotErrorDiag[:500] if jobMetrics != None: param['jobMetrics'] = jobMetrics[:500] if schedulerID != None: param['schedulerID'] = schedulerID if pilotID != None: param['pilotID'] = pilotID[:200] if batchID != None: param['batchID'] = batchID[:80] if exeErrorCode != None: param['exeErrorCode'] = exeErrorCode if exeErrorDiag != None: param['exeErrorDiag'] = exeErrorDiag[:500] if cpuConversionFactor != None: param['cpuConversion'] = cpuConversionFactor if pilotTiming != None: param['pilotTiming'] = pilotTiming if computingElement != None: param['computingElement'] = computingElement if nEvents != None: param['nEvents'] = nEvents if nInputFiles != None: param['nInputFiles'] = nInputFiles if not jobSubStatus in [None, '']: param['jobSubStatus'] = jobSubStatus if not coreCount in [None, '']: param['actualCoreCount'] = coreCount if maxRSS != None: param['maxRSS'] = maxRSS if maxVMEM != None: param['maxVMEM'] = maxVMEM if maxSWAP != None: param['maxSWAP'] = maxSWAP if maxPSS != None: param['maxPSS'] = maxPSS if avgRSS != None: param['avgRSS'] = avgRSS if avgVMEM != None: param['avgVMEM'] = avgVMEM if avgSWAP != None: param['avgSWAP'] = avgSWAP if avgPSS != None: param['avgPSS'] = avgPSS if startTime != None: try: param['startTime'] = datetime.datetime( *time.strptime(startTime, '%Y-%m-%d %H:%M:%S')[:6]) except: pass if endTime != None: try: param['endTime'] = datetime.datetime( *time.strptime(endTime, '%Y-%m-%d %H:%M:%S')[:6]) except: pass if attemptNr != None: try: attemptNr = int(attemptNr) except: attemptNr = None if stdout != '': stdout = stdout[:2048] # invoke JD tmpLog.debug('executing') return jobDispatcher.updateJob(int(jobId), state, int(timeout), xml, siteName, param, metaData, attemptNr, stdout, acceptJson)
class EventPicker: # constructor def __init__(self, taskBuffer, siteMapper, evpFileName, ignoreError): self.taskBuffer = taskBuffer self.siteMapper = siteMapper self.ignoreError = ignoreError self.evpFileName = evpFileName self.token = datetime.datetime.utcnow().isoformat(' ') # logger self.logger = LogWrapper(_logger, self.token) self.pd2p = DynDataDistributer.DynDataDistributer([], self.taskBuffer, self.siteMapper, token=' ', logger=self.logger) self.userDatasetName = '' self.creationTime = '' self.params = '' self.lockedBy = '' self.evpFile = None self.userTaskName = '' # message buffer self.msgBuffer = [] self.lineLimit = 100 # JEDI self.jediTaskID = None # main def run(self): try: self.putLog('start %s' % self.evpFileName) # lock evp file self.evpFile = open(self.evpFileName) try: fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) except: # relase self.putLog("cannot lock %s" % self.evpFileName) self.evpFile.close() return True # options runEvtList = [] eventPickDataType = '' eventPickStreamName = '' eventPickDS = [] eventPickAmiTag = '' eventPickNumSites = 1 inputFileList = [] tagDsList = [] tagQuery = '' tagStreamRef = '' skipDaTRI = False runEvtGuidMap = {} # read evp file for tmpLine in self.evpFile: tmpMatch = re.search('^([^=]+)=(.+)$', tmpLine) # check format if tmpMatch == None: continue tmpItems = tmpMatch.groups() if tmpItems[0] == 'runEvent': # get run and event number tmpRunEvt = tmpItems[1].split(',') if len(tmpRunEvt) == 2: runEvtList.append(tmpRunEvt) elif tmpItems[0] == 'eventPickDataType': # data type eventPickDataType = tmpItems[1] elif tmpItems[0] == 'eventPickStreamName': # stream name eventPickStreamName = tmpItems[1] elif tmpItems[0] == 'eventPickDS': # dataset pattern eventPickDS = tmpItems[1].split(',') elif tmpItems[0] == 'eventPickAmiTag': # AMI tag eventPickAmiTag = tmpItems[1] elif tmpItems[0] == 'eventPickNumSites': # the number of sites where datasets are distributed try: eventPickNumSites = int(tmpItems[1]) except: pass elif tmpItems[0] == 'userName': # user name self.userDN = tmpItems[1] self.putLog("user=%s" % self.userDN) elif tmpItems[0] == 'userTaskName': # user task name self.userTaskName = tmpItems[1] elif tmpItems[0] == 'userDatasetName': # user dataset name self.userDatasetName = tmpItems[1] elif tmpItems[0] == 'lockedBy': # client name self.lockedBy = tmpItems[1] elif tmpItems[0] == 'creationTime': # creation time self.creationTime = tmpItems[1] elif tmpItems[0] == 'params': # parameters self.params = tmpItems[1] elif tmpItems[0] == 'inputFileList': # input file list inputFileList = tmpItems[1].split(',') try: inputFileList.remove('') except: pass elif tmpItems[0] == 'tagDS': # TAG dataset tagDsList = tmpItems[1].split(',') elif tmpItems[0] == 'tagQuery': # query for TAG tagQuery = tmpItems[1] elif tmpItems[0] == 'tagStreamRef': # StreamRef for TAG tagStreamRef = tmpItems[1] if not tagStreamRef.endswith('_ref'): tagStreamRef += '_ref' elif tmpItems[0] == 'runEvtGuidMap': # GUIDs try: exec "runEvtGuidMap=" + tmpItems[1] except: pass # extract task name if self.userTaskName == '' and self.params != '': try: tmpMatch = re.search('--outDS(=| ) *([^ ]+)', self.params) if tmpMatch != None: self.userTaskName = tmpMatch.group(2) if not self.userTaskName.endswith('/'): self.userTaskName += '/' except: pass # suppress DaTRI if self.params != '': if '--eventPickSkipDaTRI' in self.params: skipDaTRI = True # get compact user name compactDN = self.taskBuffer.cleanUserID(self.userDN) # get jediTaskID self.jediTaskID = 
self.taskBuffer.getTaskIDwithTaskNameJEDI( compactDN, self.userTaskName) # convert if tagDsList == [] or tagQuery == '': # convert run/event list to dataset/file list tmpRet, locationMap, allFiles = self.pd2p.convertEvtRunToDatasets( runEvtList, eventPickDataType, eventPickStreamName, eventPickDS, eventPickAmiTag, self.userDN, runEvtGuidMap) if not tmpRet: if 'isFatal' in locationMap and locationMap[ 'isFatal'] == True: self.ignoreError = False self.endWithError( 'Failed to convert the run/event list to a dataset/file list' ) return False else: # get parent dataset/files with TAG tmpRet, locationMap, allFiles = self.pd2p.getTagParentInfoUsingTagQuery( tagDsList, tagQuery, tagStreamRef) if not tmpRet: self.endWithError( 'Failed to get parent dataset/file list with TAG') return False # use only files in the list if inputFileList != []: tmpAllFiles = [] for tmpFile in allFiles: if tmpFile['lfn'] in inputFileList: tmpAllFiles.append(tmpFile) allFiles = tmpAllFiles # remove redundant CN from DN tmpDN = self.userDN tmpDN = re.sub('/CN=limited proxy', '', tmpDN) tmpDN = re.sub('(/CN=proxy)+$', '', tmpDN) # make dataset container tmpRet = self.pd2p.registerDatasetContainerWithDatasets( self.userDatasetName, allFiles, locationMap, nSites=eventPickNumSites, owner=tmpDN) if not tmpRet: self.endWithError('Failed to make a dataset container %s' % self.userDatasetName) return False # skip DaTRI if skipDaTRI: # successfully terminated self.putLog("skip DaTRI") # update task self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID) else: # get candidates tmpRet, candidateMaps = self.pd2p.getCandidates( self.userDatasetName, checkUsedFile=False, useHidden=True) if not tmpRet: self.endWithError( 'Failed to find candidate for destination') return False # collect all candidates allCandidates = [] for tmpDS, tmpDsVal in candidateMaps.iteritems(): for tmpCloud, tmpCloudVal in tmpDsVal.iteritems(): for tmpSiteName in tmpCloudVal[0]: if not tmpSiteName in allCandidates: allCandidates.append(tmpSiteName) if allCandidates == []: self.endWithError('No candidate for destination') return False # get list of dataset (container) names if eventPickNumSites > 1: # decompose container to transfer datasets separately tmpRet, tmpOut = self.pd2p.getListDatasetReplicasInContainer( self.userDatasetName) if not tmpRet: self.endWithError('Failed to get the size of %s' % self.userDatasetName) return False userDatasetNameList = tmpOut.keys() else: # transfer container at once userDatasetNameList = [self.userDatasetName] # loop over all datasets sitesUsed = [] for tmpUserDatasetName in userDatasetNameList: # get size of dataset container tmpRet, totalInputSize = rucioAPI.getDatasetSize( tmpUserDatasetName) if not tmpRet: self.endWithError('Failed to get the size of %s' % tmpUserDatasetName) return False # run brokerage tmpJob = JobSpec() tmpJob.AtlasRelease = '' self.putLog("run brokerage for %s" % tmpDS) brokerage.broker.schedule([tmpJob], self.taskBuffer, self.siteMapper, True, allCandidates, True, datasetSize=totalInputSize) if tmpJob.computingSite.startswith('ERROR'): self.endWithError('brokerage failed with %s' % tmpJob.computingSite) return False self.putLog("site -> %s" % tmpJob.computingSite) # send transfer request try: tmpDN = rucioAPI.parse_dn(tmpDN) tmpStatus, userInfo = rucioAPI.finger(tmpDN) if not tmpStatus: raise RuntimeError, 'user info not found for {0} with {1}'.format( tmpDN, userInfo) tmpDN = userInfo['nickname'] tmpDQ2ID = self.siteMapper.getSite( tmpJob.computingSite).ddm tmpMsg = "%s ds=%s site=%s id=%s" % ( 
'registerDatasetLocation for DaTRI ', tmpUserDatasetName, tmpDQ2ID, tmpDN) self.putLog(tmpMsg) rucioAPI.registerDatasetLocation( tmpDS, [tmpDQ2ID], lifetime=14, owner=tmpDN, activity="User Subscriptions") self.putLog('OK') except: errType, errValue = sys.exc_info()[:2] tmpStr = 'Failed to send transfer request : %s %s' % ( errType, errValue) tmpStr.strip() tmpStr += traceback.format_exc() self.endWithError(tmpStr) return False # list of sites already used sitesUsed.append(tmpJob.computingSite) self.putLog("used %s sites" % len(sitesUsed)) # set candidates if len(sitesUsed) >= eventPickNumSites: # reset candidates to limit the number of sites allCandidates = sitesUsed sitesUsed = [] else: # remove site allCandidates.remove(tmpJob.computingSite) # send email notification for success tmpMsg = 'A transfer request was successfully sent to Rucio.\n' tmpMsg += 'Your task will get started once transfer is completed.' self.sendEmail(True, tmpMsg) try: # unlock and delete evp file fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN) self.evpFile.close() os.remove(self.evpFileName) except: pass # successfully terminated self.putLog("end %s" % self.evpFileName) return True except: errType, errValue = sys.exc_info()[:2] self.endWithError('Got exception %s:%s %s' % (errType, errValue, traceback.format_exc())) return False # end with error def endWithError(self, message): self.putLog(message, 'error') # unlock evp file try: fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN) self.evpFile.close() if not self.ignoreError: # remove evp file os.remove(self.evpFileName) # send email notification self.sendEmail(False, message) except: pass # upload log if self.jediTaskID != None: outLog = self.uploadLog() self.taskBuffer.updateTaskErrorDialogJEDI( self.jediTaskID, 'event picking failed. ' + outLog) # update task if not self.ignoreError: self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID, 'tobroken') self.putLog(outLog) self.putLog('end %s' % self.evpFileName) # put log def putLog(self, msg, type='debug'): tmpMsg = msg if type == 'error': self.logger.error(tmpMsg) else: self.logger.debug(tmpMsg) # send email notification def sendEmail(self, isSucceeded, message): # mail address toAdder = Notifier(self.taskBuffer, None, []).getEmail(self.userDN) if toAdder == '': self.putLog('cannot find email address for %s' % self.userDN, 'error') return # subject mailSubject = "PANDA notification for Event-Picking Request" # message mailBody = "Hello,\n\nHere is your request status for event picking\n\n" if isSucceeded: mailBody += "Status : Passed to Rucio\n" else: mailBody += "Status : Failed\n" mailBody += "Created : %s\n" % self.creationTime mailBody += "Ended : %s\n" % datetime.datetime.utcnow().strftime( '%Y-%m-%d %H:%M:%S') mailBody += "Dataset : %s\n" % self.userDatasetName mailBody += "\n" mailBody += "Parameters : %s %s\n" % (self.lockedBy, self.params) mailBody += "\n" mailBody += "%s\n" % message # send retVal = MailUtils().send(toAdder, mailSubject, mailBody) # return return # upload log def uploadLog(self): if self.jediTaskID == None: return 'cannot find jediTaskID' strMsg = self.logger.dumpToString() s, o = Client.uploadLog(strMsg, self.jediTaskID) if s != 0: return "failed to upload log with {0}.".format(s) if o.startswith('http'): return '<a href="{0}">log</a>'.format(o) return o
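A hedged sketch of the evp request-file parsing at the start of EventPicker.run above: each line is a key=value pair, runEvent lines accumulate (run, event) pairs, and comma-separated keys are split into lists. The sample content and the options dict layout are invented for illustration.

# Hedged sketch of the key=value parsing loop in EventPicker.run above.
import re

def parse_evp_lines(lines):
    options = {'runEvtList': []}
    for line in lines:
        m = re.search('^([^=]+)=(.+)$', line)
        if m is None:
            continue
        key, value = m.groups()
        if key == 'runEvent':
            runEvt = value.split(',')
            if len(runEvt) == 2:
                options['runEvtList'].append(runEvt)
        elif key == 'eventPickDS':
            options[key] = value.split(',')
        else:
            options[key] = value
    return options

sample = ['runEvent=358031,1234567', 'eventPickDataType=AOD',
          'eventPickDS=data18_13TeV.%.AOD%', 'userName=Some User']
print(parse_evp_lines(sample))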
def uploadLog(req, file):
    if not Protocol.isSecure(req):
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    tmpLog = LogWrapper(_logger, 'uploadLog <{0}>'.format(file.filename))
    tmpLog.debug("start {0}".format(req.subprocess_env['SSL_CLIENT_S_DN']))
    # size check
    sizeLimit = 100 * 1024 * 1024
    # get file size
    contentLength = 0
    try:
        contentLength = long(req.headers_in["content-length"])
    except:
        if req.headers_in.has_key("content-length"):
            tmpLog.error("cannot get CL : %s" % req.headers_in["content-length"])
        else:
            tmpLog.error("no CL")
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "failed to upload log due to size limit"
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    jediLogDir = '/jedilog'
    retStr = ''
    try:
        fileBaseName = file.filename.split('/')[-1]
        fileFullPath = '{0}{1}/{2}'.format(panda_config.cache_dir, jediLogDir, fileBaseName)
        # delete old file
        if os.path.exists(fileFullPath):
            os.remove(fileFullPath)
        # write
        fo = open(fileFullPath, 'wb')
        fileContent = file.file.read()
        fo.write(fileContent)
        fo.close()
        tmpLog.debug("written to {0}".format(fileFullPath))
        retStr = 'http://{0}/cache{1}/{2}'.format(getServerHTTP(None), jediLogDir, fileBaseName)
    except:
        errtype, errvalue = sys.exc_info()[:2]
        errStr = "failed to write log with {0}:{1}".format(errtype.__name__, errvalue)
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    tmpLog.debug("end")
    return retStr

def getFilesFromLRC(files, url, guids=[], storageName=[], terminateWhenFailed=False,
                    getPFN=False, scopeList=[]):
    tmpLog = LogWrapper(_log, None)
    tmpLog.debug('getFilesFromLRC "%s" %s' % (url, str(storageName)))
    # get PFC
    outSTR = ''
    if url.startswith('mysql://'):
        # from MySQL
        outSTR = _getPFNFromMySQL(files, url)
        # get PFN
        if getPFN:
            outPFN = {}
            # FIXME
            tmpLog.debug('RetPFN:%s ' % str(outPFN))
            return outPFN
    elif url.startswith('http://'):
        # from HTTP I/F
        outSTR = _getPoolFileCatalog(files, url)
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['', None]:
                    root = xml.dom.minidom.parseString(outSTR)
                    fileNodes = root.getElementsByTagName('File')
                    for file in fileNodes:
                        # get PFN and LFN nodes
                        physical = file.getElementsByTagName('physical')[0]
                        pfnNode = physical.getElementsByTagName('pfn')[0]
                        logical = file.getElementsByTagName('logical')[0]
                        lfnNode = logical.getElementsByTagName('lfn')[0]
                        # convert UTF8 to Raw
                        pfn = str(pfnNode.getAttribute('name'))
                        lfn = str(lfnNode.getAttribute('name'))
                        # assign
                        if not outPFN.has_key(lfn):
                            outPFN[lfn] = []
                        outPFN[lfn].append(pfn)
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse XML - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s ' % str(outPFN))
            return outPFN
    elif url.startswith('lfc://') or url.startswith('rucio://'):
        # from LFC
        timeStart = datetime.datetime.utcnow()
        outSTR = _getPFNFromLFC(files, url, guids, storageName, scopeList=scopeList, tmpLog=tmpLog)
        regTime = datetime.datetime.utcnow() - timeStart
        tmpLog.debug('file lookup for %s LFNs from %s took %s.%03d sec' %
                     (len(files), url, regTime.seconds, regTime.microseconds / 1000))
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['', None]:
                    tmpItems = outSTR.split('LFCRet :')
                    tmpItems.remove('')
                    # loop over all returns
                    for tmpItem in tmpItems:
                        exec "tmpLFNmap = %s" % tmpItem
                        for tmpLFN, tmpPFN in tmpLFNmap.iteritems():
                            outPFN[tmpLFN] = tmpPFN
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse LFC ret - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s files' % len(outPFN))
            return outPFN
    # check return
    if not isinstance(outSTR, types.StringType):
        if terminateWhenFailed:
            return None
        # set empty string
        outSTR = ''
    # collect OK Files
    okFiles = []
    for file in files:
        if re.search(file, outSTR) != None:
            okFiles.append(file)
    tmpLog.debug('Ret:%s / %s files' % (str(okFiles[:3]), len(okFiles)))
    return okFiles

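# The exec-based parsing in getFilesFromLRC evaluates whatever comes back from the
# LFC lookup. A minimal alternative sketch (an assumption, not the code path actually
# used above): each 'LFCRet :' chunk is expected to be a Python dict literal mapping
# an LFN to its list of PFNs, so ast.literal_eval can parse it without executing
# arbitrary code.
import ast

def parseLFCReturn(outSTR):
    # collect LFN -> PFN list mappings from all 'LFCRet :' chunks
    outPFN = {}
    for tmpItem in outSTR.split('LFCRet :'):
        tmpItem = tmpItem.strip()
        if tmpItem == '':
            continue
        tmpLFNmap = ast.literal_eval(tmpItem)
        for tmpLFN, tmpPFN in tmpLFNmap.items():
            outPFN[tmpLFN] = tmpPFN
    return outPFN

# example: parseLFCReturn("LFCRet : {'lfn1': ['srm://site/path/lfn1']}")
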
    def getGUIDsFromEventIndex(self, runEventList, streamName, amiTags, dataType):
        comment = ' /* DBProxy.getGUIDsFromEventIndex */'
        methodName = comment.split(' ')[-2].split('.')[-1]
        tmpLog = LogWrapper(_logger, methodName + " <streamName={0} amiTags={1} dataType={2}>".format(streamName,
                                                                                                      amiTags,
                                                                                                      dataType))
        try:
            # change to list
            if not amiTags in [None, '']:
                amiTags = amiTags.replace('*', '.*').split(',')
            tmpLog.debug("start for {0} events".format(len(runEventList)))
            # check data type
            if not dataType in ['RAW', 'ESD', 'AOD']:
                return False, 'dataType={0} is unsupported'.format(dataType)
            # sql to insert runs and events
            sqlRE = "INSERT INTO {0}.TMP_RUN_EVENT_PAIRS (runNumber,eventNumber) ".format(panda_config.schemaEI)
            sqlRE += "VALUES (:runNumber,:eventNumber) "
            varMaps = []
            for runNumber, eventNumber in runEventList:
                varMap = {}
                varMap[':runNumber'] = runNumber
                varMap[':eventNumber'] = eventNumber
                varMaps.append(varMap)
            # begin transaction
            self.conn.begin()
            self.cur.arraysize = 100000
            # insert runs and events
            self.cur.executemany(sqlRE + comment, varMaps)
            # read GUIDs
            varMap = {}
            if amiTags in [None, '']:
                sqlRG = "SELECT runNumber,eventNumber,guid_{0} ".format(dataType)
                sqlRG += "FROM {0}.V_PANDA_EVPICK_NOAMITAG_MANY ".format(panda_config.schemaEI)
            else:
                sqlRG = "SELECT runNumber,eventNumber,guid_{0},amiTag ".format(dataType)
                sqlRG += "FROM {0}.V_PANDA_EVPICK_AMITAG_MANY ".format(panda_config.schemaEI)
            if not streamName in [None, '']:
                sqlRG += "WHERE streamName=:streamName "
                varMap[':streamName'] = streamName
            self.cur.execute(sqlRG + comment, varMap)
            resRG = self.cur.fetchall()
            # commit
            if not self._commit():
                raise RuntimeError('Commit error')
            retValue = {}
            keyAmiIdxMap = {}
            for tmpItem in resRG:
                if amiTags in [None, '']:
                    runNumber, eventNumber, guid = tmpItem
                    # dummy
                    idxTag = 0
                else:
                    runNumber, eventNumber, guid, amiTag = tmpItem
                    # get index number for the AMI tag in the list
                    idxTag = self.getIndexAmiTag(amiTags, amiTag)
                    # didn't match
                    if idxTag == None:
                        continue
                tmpKey = (runNumber, eventNumber)
                # use AMI tag in a preference order
                if tmpKey in keyAmiIdxMap and keyAmiIdxMap[tmpKey] < idxTag:
                    continue
                keyAmiIdxMap[tmpKey] = idxTag
                retValue[tmpKey] = [guid]
            tmpLog.debug("found {0} events".format(len(retValue)))
            return True, retValue
        except:
            # roll back
            self._rollback()
            # error
            self.dumpErrorMessage(_logger, methodName)
            return False, None

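# getIndexAmiTag is referenced above via self.getIndexAmiTag but is not part of this
# excerpt. A minimal standalone sketch of what it presumably does: return the position
# of the first AMI-tag pattern (already converted to regexes via '*' -> '.*' above)
# that matches the given tag, or None when nothing matches. A lower index means a more
# preferred tag, matching how keyAmiIdxMap is used above.
import re

def getIndexAmiTag(amiTagPatterns, amiTag):
    for tmpIdx, tmpPattern in enumerate(amiTagPatterns):
        if re.search('^' + tmpPattern + '$', amiTag) is not None:
            return tmpIdx
    return None

# example: getIndexAmiTag(['f620.*', 'r.*'], 'r1234') returns 1
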
import re
from config import panda_config
from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
import panda_proxy_cache

# logger
_logger = PandaLogger().getLogger('panda_activeusers_query')
tmpLog = LogWrapper(_logger)

if __name__ == '__main__':
    tmpLog.debug("================= start ==================")
    # instantiate TB
    taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()
    # roles
    if hasattr(panda_config, 'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = ['atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot'
import sys
import datetime
import traceback
from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
from brokerage.SiteMapper import SiteMapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('prioryMassage')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# get usage breakdown
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsArchived4']:
import sys
import datetime
import multiprocessing
from taskbuffer.TaskBuffer import taskBuffer
import pandalogger.PandaLogger
from pandalogger.PandaLogger import PandaLogger
from brokerage.SiteMapper import SiteMapper
from pandautils import PandaUtils
from pandalogger.LogWrapper import LogWrapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('add')
tmpLog = LogWrapper(_logger, None)

tmpLog.debug("===================== start =====================")

# overall timeout value
overallTimeout = 20
# grace period
try:
    gracePeriod = int(sys.argv[1])
except:
    gracePeriod = 3
# current minute
currentMinute = datetime.datetime.utcnow().minute
import sys
import datetime
import traceback
from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
from brokerage.SiteMapper import SiteMapper
from taskbuffer import ErrorCode

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('esPreemption')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# time limit
timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=15)

# get low priority ES jobs per site
sqlEsJobs = "SELECT PandaID,computingSite,commandToPilot,startTime "