def __init__(self, taskBuffer, jobs, logger, params, defaultMap):
    self.jobs = []
    self.jumboJobs = []
    # separate normal and jumbo jobs
    for tmpJob in jobs:
        if EventServiceUtils.isJumboJob(tmpJob):
            self.jumboJobs.append(tmpJob)
        else:
            self.jobs.append(tmpJob)
    self.taskBuffer = taskBuffer
    self.logger = logger
    # set named parameters
    for tmpKey in params:
        tmpVal = params[tmpKey]
        setattr(self, tmpKey, tmpVal)
    # set defaults
    for tmpKey in defaultMap:
        tmpVal = defaultMap[tmpKey]
        if not hasattr(self, tmpKey):
            setattr(self, tmpKey, tmpVal)
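
# Note: parseXML() below accepts the pilot's file catalog in either of two
# layouts. The shapes sketched here are inferred from the parsing code that
# follows; the concrete values are illustrative only.
#
# XML (POOL file catalog style):
#   <File ID="some-guid">
#     <logical><lfn name="some.output.lfn"/></logical>
#     <metadata att_name="fsize"   att_value="123"/>
#     <metadata att_name="adler32" att_value="01234567"/>
#     <metadata att_name="surl"    att_value="srm://..."/>
#     <endpoint>SOME_ENDPOINT</endpoint>
#   </File>
#
# JSON fallback (one entry per LFN):
#   {"some.output.lfn": {"guid": "some-guid", "fsize": 123,
#                        "adler32": "01234567", "surl": "srm://...",
#                        "endpoint": ["SOME_ENDPOINT"]}}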
def parseXML(self):
    # get LFN and GUID
    # self.logger.debug('XML filename : %s' % self.xmlFile)
    # no outputs
    log_out = [f for f in self.job.Files if f.type in ['log', 'output']]
    if not log_out:
        self.logger.debug("has no outputs")
        self.logger.debug("parseXML end")
        return 0
    # get input files
    inputLFNs = []
    for file in self.job.Files:
        if file.type == 'input':
            inputLFNs.append(file.lfn)
    # parse XML
    lfns = []
    guids = []
    fsizes = []
    md5sums = []
    chksums = []
    surls = []
    fullLfnMap = {}
    nEventsMap = {}
    guidMap = dict()
    try:
        # root = xml.dom.minidom.parse(self.xmlFile)
        root = xml.dom.minidom.parseString(self.data)
        files = root.getElementsByTagName('File')
        for file in files:
            # get GUID
            guid = str(file.getAttribute('ID'))
            # get PFN and LFN nodes
            logical = file.getElementsByTagName('logical')[0]
            lfnNode = logical.getElementsByTagName('lfn')[0]
            # convert UTF8 to Raw
            lfn = str(lfnNode.getAttribute('name'))
            # get metadata
            fsize = None
            md5sum = None
            adler32 = None
            surl = None
            fullLFN = None
            for meta in file.getElementsByTagName('metadata'):
                # get attribute name
                name = str(meta.getAttribute('att_name'))
                if name == 'fsize':
                    fsize = int(meta.getAttribute('att_value'))
                elif name == 'md5sum':
                    md5sum = str(meta.getAttribute('att_value'))
                    # check
                    if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                        md5sum = None
                elif name == 'adler32':
                    adler32 = str(meta.getAttribute('att_value'))
                elif name == 'surl':
                    surl = str(meta.getAttribute('att_value'))
                elif name == 'full_lfn':
                    fullLFN = str(meta.getAttribute('att_value'))
            # endpoints
            self.extraInfo['endpoint'][lfn] = []
            for epNode in file.getElementsByTagName('endpoint'):
                self.extraInfo['endpoint'][lfn].append(str(epNode.firstChild.data))
            # error check
            if (lfn not in inputLFNs) and (fsize is None or (md5sum is None and adler32 is None)):
                if EventServiceUtils.isEventServiceMerge(self.job):
                    continue
                else:
                    raise RuntimeError('fsize/md5sum/adler32/surl=None')
            # append
            lfns.append(lfn)
            guids.append(guid)
            fsizes.append(fsize)
            md5sums.append(md5sum)
            surls.append(surl)
            if adler32 is not None:
                # use adler32 if available
                chksums.append("ad:%s" % adler32)
            else:
                chksums.append("md5:%s" % md5sum)
            if fullLFN is not None:
                fullLfnMap[lfn] = fullLFN
    except Exception:
        # parse json
        try:
            import json
            # with open(self.xmlFile) as tmpF:
            jsonDict = json.loads(self.data)
            for lfn in jsonDict:
                fileData = jsonDict[lfn]
                lfn = str(lfn)
                fsize = None
                md5sum = None
                adler32 = None
                surl = None
                fullLFN = None
                guid = str(fileData['guid'])
                if 'fsize' in fileData:
                    fsize = int(fileData['fsize'])
                if 'md5sum' in fileData:
                    md5sum = str(fileData['md5sum'])
                    # check
                    if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                        md5sum = None
                if 'adler32' in fileData:
                    adler32 = str(fileData['adler32'])
                if 'surl' in fileData:
                    surl = str(fileData['surl'])
                if 'full_lfn' in fileData:
                    fullLFN = str(fileData['full_lfn'])
                # endpoints
                self.extraInfo['endpoint'][lfn] = []
                if 'endpoint' in fileData:
                    self.extraInfo['endpoint'][lfn] = fileData['endpoint']
                # error check
                if (lfn not in inputLFNs) and (fsize is None or (md5sum is None and adler32 is None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError('fsize/md5sum/adler32/surl=None')
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 is not None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN is not None:
                    fullLfnMap[lfn] = fullLFN
        except Exception:
            # check if file exists
            # if os.path.exists(self.xmlFile):
            if True:
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type, value))
                # set failed anyway
                self.job.jobStatus = 'failed'
                # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                if (self.job.pilotErrorCode in [0, '0', 'NULL']) and \
                        (self.job.taskBufferErrorCode not in [pandaserver.taskbuffer.ErrorCode.EC_WorkerDone]) and \
                        (self.job.transExitCode in [0, '0', 'NULL']):
                    self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                return 2
            else:
                # XML was deleted
                return 1
    # parse metadata to get nEvents
    nEventsFrom = None
    try:
        root = xml.dom.minidom.parseString(self.job.metadata)
        files = root.getElementsByTagName('File')
        for file in files:
            # get GUID
            guid = str(file.getAttribute('ID'))
            # get PFN and LFN nodes
            logical = file.getElementsByTagName('logical')[0]
            lfnNode = logical.getElementsByTagName('lfn')[0]
            # convert UTF8 to Raw
            lfn = str(lfnNode.getAttribute('name'))
            guidMap[lfn] = guid
            # get metadata
            nevents = None
            for meta in file.getElementsByTagName('metadata'):
                # get attribute name
                name = str(meta.getAttribute('att_name'))
                if name == 'events':
                    nevents = int(meta.getAttribute('att_value'))
                    nEventsMap[lfn] = nevents
                    break
        nEventsFrom = "xml"
    except Exception:
        pass
    # parse json
    try:
        import json
        jsonDict = json.loads(self.job.metadata)
        for jsonFileItem in jsonDict['files']['output']:
            for jsonSubFileItem in jsonFileItem['subFiles']:
                lfn = str(jsonSubFileItem['name'])
                try:
                    nevents = int(jsonSubFileItem['nentries'])
                    nEventsMap[lfn] = nevents
                except Exception:
                    pass
                try:
                    guid = str(jsonSubFileItem['file_guid'])
                    guidMap[lfn] = guid
                except Exception:
                    pass
        nEventsFrom = "json"
    except Exception:
        pass
    # use nEvents and GUIDs reported by the pilot if no job report
    if self.job.metadata == 'NULL' and self.jobStatus == 'finished' and self.job.nEvents > 0 \
            and self.job.prodSourceLabel in ['managed']:
        for file in self.job.Files:
            if file.type == 'output':
                nEventsMap[file.lfn] = self.job.nEvents
        for lfn, guid in zip(lfns, guids):
            guidMap[lfn] = guid
        nEventsFrom = "pilot"
    self.logger.debug('nEventsMap=%s' % str(nEventsMap))
    self.logger.debug('nEventsFrom=%s' % str(nEventsFrom))
    self.logger.debug('guidMap=%s' % str(guidMap))
    self.logger.debug('self.job.jobStatus=%s in parseXML' % self.job.jobStatus)
    self.logger.debug('isES=%s isJumbo=%s' % (EventServiceUtils.isEventServiceJob(self.job),
                                              EventServiceUtils.isJumboJob(self.job)))
    # get lumi block number
    lumiBlockNr = self.job.getLumiBlockNr()
    # copy files for variable number of outputs
    tmpStat = self.copyFilesForVariableNumOutputs(lfns)
    if not tmpStat:
        self.logger.error("failed to copy files for variable number of outputs")
        return 2
    # check files
    fileList = []
    for file in self.job.Files:
        fileList.append(file.lfn)
        if file.type == 'input':
            if file.lfn in lfns:
                if self.job.prodSourceLabel in ['user', 'panda']:
                    # skipped file
                    file.status = 'skipped'
                elif self.job.prodSourceLabel in ['managed', 'test'] + JobUtils.list_ptest_prod_sources:
                    # failed by pilot
                    file.status = 'failed'
        elif file.type == 'output' or file.type == 'log':
            # add only log file for failed jobs
            if self.jobStatus == 'failed' and file.type != 'log':
                file.status = 'failed'
                continue
            # set failed if it is missing in XML
            if file.lfn not in lfns:
                if (self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job)) \
                        or EventServiceUtils.isJumboJob(self.job):
                    # unset file status for ES jobs
                    pass
                elif file.isAllowedNoOutput():
                    # allowed not to be produced
                    file.status = 'nooutput'
                    self.logger.debug('set {0} to status={1}'.format(file.lfn, file.status))
                else:
                    file.status = 'failed'
                    self.job.jobStatus = 'failed'
                    self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(file.lfn)
                    self.logger.error(self.job.ddmErrorDiag)
                continue
            # look for GUID with LFN
            try:
                i = lfns.index(file.lfn)
                file.GUID = guids[i]
                file.fsize = fsizes[i]
                file.md5sum = md5sums[i]
                file.checksum = chksums[i]
                surl = surls[i]
                # status
                file.status = 'ready'
                # change to full LFN
                if file.lfn in fullLfnMap:
                    file.lfn = fullLfnMap[file.lfn]
                # add SURL to extraInfo
                self.extraInfo['surl'][file.lfn] = surl
                # add nevents
                if file.lfn in nEventsMap:
                    self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
            except Exception:
                # status
                file.status = 'failed'
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type, value))
            # set lumi block number
            if lumiBlockNr is not None and file.status != 'failed':
                self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
    self.extraInfo['guid'] = guidMap
    # check consistency between XML and filesTable
    for lfn in lfns:
        if lfn not in fileList:
            self.logger.error("%s is not found in filesTable" % lfn)
            self.job.jobStatus = 'failed'
            for tmpFile in self.job.Files:
                tmpFile.status = 'failed'
            self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
            self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(lfn)
            return 2
    # return
    self.logger.debug("parseXML end")
    return 0
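
# Note: a summary inferred from parseXML() above, for readers of the callers.
# Return codes:
#   0 - catalog parsed (or the job has no log/output files to check)
#   1 - the pilot catalog was deleted (dead branch while the file-exists
#       check stays commented out)
#   2 - fatal: unparsable catalog, failed copyFilesForVariableNumOutputs,
#       or an LFN reported by the pilot that is not in filesTable
# Side effects: file status fields are updated on self.job.Files and
# self.extraInfo is filled with 'surl', 'nevents', 'lbnr', 'guid' and
# 'endpoint' maps keyed by LFN.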
def appendJob(self, job, siteMapperCache=None):
    # event service merge
    if EventServiceUtils.isEventServiceMerge(job):
        isEventServiceMerge = True
    else:
        isEventServiceMerge = False
    # PandaID
    self.data['PandaID'] = job.PandaID
    # prodSourceLabel
    self.data['prodSourceLabel'] = job.prodSourceLabel
    # swRelease
    self.data['swRelease'] = job.AtlasRelease
    # homepackage
    self.data['homepackage'] = job.homepackage
    # transformation
    self.data['transformation'] = job.transformation
    # job name
    self.data['jobName'] = job.jobName
    # job definition ID
    self.data['jobDefinitionID'] = job.jobDefinitionID
    # cloud
    self.data['cloud'] = job.cloud
    # files
    strIFiles = ''
    strOFiles = ''
    strDispatch = ''
    strDisToken = ''
    strDisTokenForOutput = ''
    strDestination = ''
    strRealDataset = ''
    strRealDatasetIn = ''
    strProdDBlock = ''
    strDestToken = ''
    strProdToken = ''
    strProdTokenForOutput = ''
    strGUID = ''
    strFSize = ''
    strCheckSum = ''
    strFileDestinationSE = ''
    strScopeIn = ''
    strScopeOut = ''
    strScopeLog = ''
    logFile = ''
    logGUID = ''
    ddmEndPointIn = []
    ddmEndPointOut = []
    noOutput = []
    siteSpec = None
    inDsLfnMap = {}
    inLFNset = set()
    if siteMapperCache is not None:
        siteMapper = siteMapperCache.getObj()
        siteSpec = siteMapper.getSite(job.computingSite)
        # resolve destSE
        try:
            job.destinationSE = siteMapper.resolveNucleus(job.destinationSE)
            for tmpFile in job.Files:
                tmpFile.destinationSE = siteMapper.resolveNucleus(tmpFile.destinationSE)
        except Exception:
            pass
        siteMapperCache.releaseObj()
    for file in job.Files:
        if file.type == 'input':
            if EventServiceUtils.isJumboJob(job) and file.lfn in inLFNset:
                pass
            else:
                inLFNset.add(file.lfn)
                if strIFiles != '':
                    strIFiles += ','
                strIFiles += file.lfn
                if strDispatch != '':
                    strDispatch += ','
                strDispatch += file.dispatchDBlock
                if strDisToken != '':
                    strDisToken += ','
                strDisToken += file.dispatchDBlockToken
                strProdDBlock += '%s,' % file.prodDBlock
                if not isEventServiceMerge:
                    strProdToken += '%s,' % file.prodDBlockToken
                else:
                    strProdToken += '%s,' % job.metadata[1][file.lfn]
                if strGUID != '':
                    strGUID += ','
                strGUID += file.GUID
                strRealDatasetIn += '%s,' % file.dataset
                strFSize += '%s,' % file.fsize
                if file.checksum not in ['', 'NULL', None]:
                    strCheckSum += '%s,' % file.checksum
                else:
                    strCheckSum += '%s,' % file.md5sum
                strScopeIn += '%s,' % file.scope
                ddmEndPointIn.append(
                    self.getDdmEndpoint(siteSpec, file.dispatchDBlockToken, 'input',
                                        job.prodSourceLabel, job.job_label))
                if file.dataset not in inDsLfnMap:
                    inDsLfnMap[file.dataset] = []
                inDsLfnMap[file.dataset].append(file.lfn)
        if file.type == 'output' or file.type == 'log':
            if strOFiles != '':
                strOFiles += ','
            strOFiles += file.lfn
            if strDestination != '':
                strDestination += ','
            strDestination += file.destinationDBlock
            if strRealDataset != '':
                strRealDataset += ','
            strRealDataset += file.dataset
            strFileDestinationSE += '%s,' % file.destinationSE
            if file.type == 'log':
                logFile = file.lfn
                logGUID = file.GUID
                strScopeLog = file.scope
            else:
                strScopeOut += '%s,' % file.scope
            if strDestToken != '':
                strDestToken += ','
            strDestToken += re.sub('^ddd:', 'dst:', file.destinationDBlockToken.split(',')[0])
            strDisTokenForOutput += '%s,' % file.dispatchDBlockToken
            strProdTokenForOutput += '%s,' % file.prodDBlockToken
            ddmEndPointOut.append(
                self.getDdmEndpoint(siteSpec, file.destinationDBlockToken.split(',')[0], 'output',
                                    job.prodSourceLabel, job.job_label))
            if file.isAllowedNoOutput():
                noOutput.append(file.lfn)
    # inFiles
    self.data['inFiles'] = strIFiles
    # dispatch DBlock
    self.data['dispatchDblock'] = strDispatch
    # dispatch DBlock space token
    self.data['dispatchDBlockToken'] = strDisToken
    # dispatch DBlock space token for output
    self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1]
    # outFiles
    self.data['outFiles'] = strOFiles
    # destination DBlock
    self.data['destinationDblock'] = strDestination
    # destination DBlock space token
    self.data['destinationDBlockToken'] = strDestToken
    # prod DBlocks
    self.data['prodDBlocks'] = strProdDBlock[:-1]
    # prod DBlock space token
    self.data['prodDBlockToken'] = strProdToken[:-1]
    # real output datasets
    self.data['realDatasets'] = strRealDataset
    # real input datasets
    self.data['realDatasetsIn'] = strRealDatasetIn[:-1]
    # file's destinationSE
    self.data['fileDestinationSE'] = strFileDestinationSE[:-1]
    # log filename
    self.data['logFile'] = logFile
    # log GUID
    self.data['logGUID'] = logGUID
    # jobPars
    self.data['jobPars'], ppSteps = job.extractMultiStepExec()
    if ppSteps is not None:
        self.data.update(ppSteps)
    if job.to_encode_job_params():
        self.data['jobPars'] = base64.b64encode(self.data['jobPars'].encode()).decode()
    # attempt number
    self.data['attemptNr'] = job.attemptNr
    # GUIDs
    self.data['GUID'] = strGUID
    # checksum
    self.data['checksum'] = strCheckSum[:-1]
    # fsize
    self.data['fsize'] = strFSize[:-1]
    # scope
    self.data['scopeIn'] = strScopeIn[:-1]
    self.data['scopeOut'] = strScopeOut[:-1]
    self.data['scopeLog'] = strScopeLog
    # DDM endpoints
    try:
        self.data['ddmEndPointIn'] = ','.join(ddmEndPointIn)
    except TypeError:
        self.data['ddmEndPointIn'] = ''
    try:
        self.data['ddmEndPointOut'] = ','.join(ddmEndPointOut)
    except TypeError:
        self.data['ddmEndPointOut'] = ''
    # destinationSE
    self.data['destinationSE'] = job.destinationSE
    # user ID
    self.data['prodUserID'] = job.prodUserID
    # CPU count
    self.data['maxCpuCount'] = job.maxCpuCount
    # RAM count
    self.data['minRamCount'] = job.minRamCount
    # disk count
    self.data['maxDiskCount'] = job.maxDiskCount
    # cmtconfig
    if ppSteps is None:
        self.data['cmtConfig'] = job.cmtConfig
    else:
        self.data['cmtConfig'] = ''
    # processingType
    self.data['processingType'] = job.processingType
    # transferType
    self.data['transferType'] = job.transferType
    # sourceSite
    self.data['sourceSite'] = job.sourceSite
    # current priority
    self.data['currentPriority'] = job.currentPriority
    # taskID
    if job.lockedby == 'jedi':
        self.data['taskID'] = job.jediTaskID
    else:
        self.data['taskID'] = job.taskID
    # core count
    if job.coreCount in ['NULL', None]:
        self.data['coreCount'] = 1
    else:
        self.data['coreCount'] = job.coreCount
    # jobsetID
    self.data['jobsetID'] = job.jobsetID
    # nucleus
    self.data['nucleus'] = job.nucleus
    # walltime
    self.data['maxWalltime'] = job.maxWalltime
    # looping check
    if job.is_no_looping_check():
        self.data['loopingCheck'] = False
    # debug mode
    if job.specialHandling is not None and 'debug' in job.specialHandling:
        self.data['debug'] = 'True'
    # event service or job cloning
    if EventServiceUtils.isJobCloningJob(job):
        self.data['cloneJob'] = EventServiceUtils.getJobCloningType(job)
    elif EventServiceUtils.isEventServiceJob(job) or EventServiceUtils.isJumboJob(job):
        self.data['eventService'] = 'True'
        # prod DBlock space token for pre-merging output
        self.data['prodDBlockTokenForOutput'] = strProdTokenForOutput[:-1]
    # event service merge
    if isEventServiceMerge:
        self.data['eventServiceMerge'] = 'True'
        # write to file for ES merge
        writeToFileStr = ''
        try:
            for outputName in job.metadata[0]:
                inputList = job.metadata[0][outputName]
                writeToFileStr += 'inputFor_{0}:'.format(outputName)
                for tmpInput in inputList:
                    writeToFileStr += '{0},'.format(tmpInput)
                writeToFileStr = writeToFileStr[:-1]
                writeToFileStr += '^'
            writeToFileStr = writeToFileStr[:-1]
        except Exception:
            pass
        self.data['writeToFile'] = writeToFileStr
    elif job.writeInputToFile():
        try:
            # write input to file
            writeToFileStr = ''
            for inDS in inDsLfnMap:
                inputList = inDsLfnMap[inDS]
                inDS = re.sub('/$', '', inDS)
                inDS = inDS.split(':')[-1]
                writeToFileStr += 'tmpin_{0}:'.format(inDS)
                writeToFileStr += ','.join(inputList)
                writeToFileStr += '^'
            writeToFileStr = writeToFileStr[:-1]
            self.data['writeToFile'] = writeToFileStr
        except Exception:
            pass
    # replace placeholder
    if EventServiceUtils.isJumboJob(job) or EventServiceUtils.isCoJumboJob(job):
        try:
            for inDS in inDsLfnMap:
                inputList = inDsLfnMap[inDS]
                inDS = re.sub('/$', '', inDS)
                inDS = inDS.split(':')[-1]
                srcStr = 'tmpin__cnt_{0}'.format(inDS)
                dstStr = ','.join(inputList)
                self.data['jobPars'] = self.data['jobPars'].replace(srcStr, dstStr)
        except Exception:
            pass
    # no output
    if noOutput != []:
        self.data['allowNoOutput'] = ','.join(noOutput)
    # alternative stage-out
    if job.getAltStgOut() is not None:
        self.data['altStageOut'] = job.getAltStgOut()
    # log to OS
    if job.putLogToOS():
        self.data['putLogToOS'] = 'True'
    # suppress execute string conversion
    if job.noExecStrCnv():
        self.data['noExecStrCnv'] = 'True'
    # in-file positional event number
    if job.inFilePosEvtNum():
        self.data['inFilePosEvtNum'] = 'True'
    # use prefetcher
    if job.usePrefetcher():
        self.data['usePrefetcher'] = 'True'
    # image name
    if job.container_name not in ['NULL', None]:
        self.data['container_name'] = job.container_name
    # IO
    self.data['ioIntensity'] = job.get_task_attribute('ioIntensity')
    self.data['ioIntensityUnit'] = job.get_task_attribute('ioIntensityUnit')
    # HPO
    if job.is_hpo_workflow():
        self.data['isHPO'] = 'True'
    # VP
    if siteSpec is not None:
        scope_input, scope_output = DataServiceUtils.select_scope(siteSpec, job.prodSourceLabel, job.job_label)
        if siteSpec.use_vp(scope_input):
            self.data['useVP'] = 'True'
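
# Note: the per-file fields built in appendJob() (inFiles, GUID, fsize,
# checksum, scopeIn, ddmEndPointIn, and their output-side counterparts) are
# parallel comma-separated strings in the order the files were visited.
# A consumer could recover per-file records roughly like this (illustrative
# sketch only, not part of this module):
#
#   lfns   = data['inFiles'].split(',')
#   fsizes = data['fsize'].split(',')
#   guids  = data['GUID'].split(',')
#   for lfn, fsize, guid in zip(lfns, fsizes, guids):
#       ...  # one input file per tuple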