def getOutFilesGuids(outFiles, workdir, experiment, TURL=False):
    """ Get the outFilesGuids from the PFC """

    ec = 0
    pilotErrorDiag = ""
    outFilesGuids = []

    # Get the experiment object and the GUID source filename
    thisExperiment = getExperiment(experiment)
    filename = thisExperiment.getGUIDSourceFilename()

    # If a source file should not be used (i.e. empty filename string), then generate the GUIDs here
    if filename == "":
        tolog("Pilot will generate GUIDs for the output files")
        for i in range(0, len(outFiles)):
            guid = getGUID()
            if guid == "":
                guid = "- GUID generation failed -"
            outFilesGuids.append(guid)
        return ec, pilotErrorDiag, outFilesGuids
    else:
        tolog("Pilot will get GUIDs for the output files from source: %s" % (filename))
        pfcFile = os.path.join(workdir, filename)  # "%s/PoolFileCatalog.xml" % (workdir)

    # The PFC used for Event Service will be TURL based, use the corresponding file
    if TURL:
        pfcFile = pfcFile.replace(".xml", "TURL.xml")

    # Initialization: make sure the guid list has the same length as the file list
    for i in range(0, len(outFiles)):
        outFilesGuids.append(None)

    # Make sure the PFC exists
    if os.path.isfile(pfcFile):
        from xml.dom import minidom
        xmldoc = minidom.parse(pfcFile)
        fileList = xmldoc.getElementsByTagName("File")
        for thisfile in fileList:
            gpfn = str(thisfile.getElementsByTagName("pfn")[0].getAttribute("name"))
            guid = str(thisfile.getAttribute("ID"))
            for i in range(0, len(outFiles)):
                if outFiles[i] == gpfn:
                    outFilesGuids[i] = guid
    else:
        pilotErrorDiag = "PFC file does not exist: %s" % (pfcFile)
        tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
        error = PilotErrors()
        ec = error.ERR_MISSINGPFC

    return ec, pilotErrorDiag, outFilesGuids
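# --- Illustration (not part of the pilot source) -------------------------------
# A minimal sketch of the PoolFileCatalog layout that getOutFilesGuids() expects:
# each <File> element carries the GUID in its ID attribute and the physical file
# name in the name attribute of a nested <pfn> element. The catalog content,
# file name and GUID below are made-up sample values.
from xml.dom import minidom

samplePFC = """<POOLFILECATALOG>
  <File ID="A1B2C3D4-0000-1111-2222-333344445555">
    <physical>
      <pfn filetype="ROOT_All" name="AOD.sample.pool.root.1"/>
    </physical>
    <logical/>
  </File>
</POOLFILECATALOG>"""

xmldoc = minidom.parseString(samplePFC)
for thisfile in xmldoc.getElementsByTagName("File"):
    pfn = str(thisfile.getElementsByTagName("pfn")[0].getAttribute("name"))
    guid = str(thisfile.getAttribute("ID"))
    print("%s -> %s" % (pfn, guid))  # prints: AOD.sample.pool.root.1 -> A1B2C3D4-...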
def __init__(self):
    self.jobId = '0'  # PanDA job id
    self.homePackage = None  # package name
    self.trf = None  # trf name
    self.inFiles = None  # list of input files
    self.dispatchDblock = None  #
    self.prodDBlockToken = []  # used to send file info to the pilot (whether input files should be directly accessed or not)
    self.prodDBlockTokenForOutput = []  # used for object store info
    self.prodDBlocks = []  # contains the correct container or dataset name for the traces
    self.dispatchDBlockToken = []  # used to send space tokens to the pilot (for input files)
    self.dispatchDBlockTokenForOut = None  # used for chirp file destination, including server name
    self.destinationDBlockToken = []  # used to send space tokens to the pilot (for output files)
    self.outFiles = []  # list of output files
    self.destinationDblock = []  # datasets for output files
    self.inFilesGuids = []  # list of input file guids
    self.outFilesGuids = []  # these guids are usually unknown until the job is done
    self.logFile = None  #
    self.tarFileGuid = pUtil.getGUID()  # guid for the tarball of the job workdir
    self.logDblock = None  #
    self.jobPars = None  # job parameters defining the execution of the job
    self.exeErrorCode = 0  # payload error code
    self.exeErrorDiag = ""  # payload error diagnostic, potentially more detailed error text than std error
    self.pilotErrorDiag = None  # detailed error diag
    self.release = None  # software release string
    self.result = ["Unknown", 0, 0]  # [job state, transExitCode, pilotErrorCode]
    self.action = ""  # place holder for "tobekilled" command from dispatcher
    self.workdir = None  # workdir for this job, usually under site.workdir
    self.siteworkdir = None  # workdir for the pilot (site.workdir)
    self.logMsgFiles = []  # list of log files that need to be reported back to the panda server at the end of a job
    self.newDirNM = ""  #
    self.datadir = ""  # path to recovery datadir
    self.finalstate = ""  # final job state (either "failed" or "finished")
    self.attemptNr = -1  # attempt number for this job
    self.output_latereg = "None"  # control variable for late registration by the job recovery algorithm
    self.output_fields = None  # - " -
    self.log_latereg = "None"  # - " -
    self.log_field = None  # - " -
    self.destinationSE = ""  #
    self.fileDestinationSE = ""  # SE info for CMS
    self.payload = "payload"  # payload name, e.g. "athena"
    self.stdout = "payload_stdout.txt"  # payload stdout filename, default "%s_stdout.txt" % (self.payload)
    self.stderr = "payload_stderr.txt"  # payload stderr filename, default "%s_stderr.txt" % (self.payload)
    self.spsetup = None  # special setup string for xrdcp systems
    self.prodUserID = ""  # user id
    self.cpuConsumptionTime = 0  # time spent during payload execution
    self.cpuConsumptionUnit = None  #
    self.cpuConversionFactor = 0  #
    self.maxCpuCount = 0  # defines what is a looping job (seconds)
    self.maxDiskCount = 21  # max input file size [GB] (server default 0)
    self.processingType = "NULL"  # alternatively 'reprocessing', used to increase max input file size
    self.prodSourceLabel = ""  # job label, e.g. 'user', 'test', 'rc_test', 'ddm', 'software', 'ptest'
    self.nEvents = 0  # number of processed events (read)
    self.nEventsW = 0  # number of processed events (written)
    self.realDatasetsIn = None  # dataset name(s) for input file(s)
    self.cmtconfig = None  # CMTCONFIG value from the task definition
    self.jobState = None  # current job state (for definition, see JobRecovery class)
    self.fileStateDictionary = None  # dictionary for current file states (for definition, see JobRecovery class)
    self.outputFilesXML = "OutputFiles.xml"  # XML metadata related to output files for NG / CERNVM
    self.transferType = None  # brokerage may decide to have input files transferred with remote I/O (set to 'direct' in that case)
    self.jobDefinitionID = None  # job definition id forwarded to the DQ2 tracing server
    self.cloud = ""  # the cloud the job belongs to
    self.credname = 'None'  #
    self.myproxy = 'None'  #
    self.taskID = ""  # the task that this job belongs to
    self.isPilotResubmissionRequired = False  # pilot-controlled resubmission
    self.filesizeIn = []  # input file sizes from the dispatcher
    self.checksumIn = []  # input file checksums from the dispatcher
    self.debug = ""  # debug = True will trigger the pilot to send a stdout tail on job update
    self.currentState = ""  # basically the same as result[0] but includes states like "stagein", "stageout"
    self.vmPeakMax = 0  # maximum value of vmPeak
    self.vmPeakMean = 0  # average value of vmPeak
    self.RSSMean = 0  # average value of RSS
    self.JEM = "NO"  # JEM usage (YES/NO), default: NO
    self.filesWithoutFAX = 0  # number of files normally staged in (only reported to jobMetrics in FAX mode)
    self.filesWithFAX = 0  # number of files staged in by FAX (only reported to jobMetrics in FAX mode)
    self.filesNormalStageOut = 0  # number of files normally staged out (only reported to jobMetrics in alt stage-out mode)
    self.filesAltStageOut = 0  # number of files staged out to an alternative SE (only reported to jobMetrics in alt stage-out mode)
    self.bytesWithoutFAX = 0  # total size of files transferred without FAX (only reported to jobMetrics in FAX mode)
    self.bytesWithFAX = 0  # total size of files transferred with FAX (only reported to jobMetrics in FAX mode)
    self.scopeIn = []  # Rucio scope for input files
    self.scopeOut = []  # Rucio scope for output files
    self.scopeLog = []  # Rucio scope for the log file
    self.experiment = "undefined"  # which experiment this job belongs to
    self.coreCount = None  # number of cores as requested by the task
    self.pgrp = 0  # process group (RunJob* subprocess)
    self.sourceSite = ""  # keep track of the original source site of the job (useful for overflow jobs to get to the proper FAX redirector)
    self.ddmEndPointIn = []  #
    self.ddmEndPointOut = []  #
    # self.ddmEndPointOutAlt = []  #
    self.ddmEndPointLog = []  #
    self.cloneJob = ""  # is the job cloned? Allowed values: 'runonce', 'storeonce'

    # event service objects
    self.eventService = False  # True for event service jobs
    self.eventServiceMerge = False  # True for event service merge jobs
    self.eventRanges = None  # event ranges dictionary
    self.jobsetID = None  # event range job set ID
    # self.eventRangeID = None  # set for event service jobs
    # self.startEvent = None  # set for event service jobs
    # self.lastEvent = None  # set for event service jobs
    # self.lfn = None  # LFNs of input files to be read by the Event Server (NOT by the pilot)
    # self.guid = None  # GUIDs of input files to be read by the Event Server (NOT by the pilot)
    # self.attemptNr = ""  # (defined above)

    # job mode, for example HPC_normal, HPC_backfill
    self.mode = None
    self.hpcStatus = None
    self.refreshNow = False
    self.HPCJobId = None

    # walltime counting for the various steps
    self.timeSetup = 0
    self.timeGetJob = 0
    self.timeStageIn = 0
    self.timeExe = 0
    self.timeStageOut = 0
    self.timeCleanUp = 0

    # job start/end time
    self.startTime = ""
    self.endTime = ""
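# --- Illustration (assumption, not pilot source) --------------------------------
# How the result triplet and the per-step walltime counters defined above are
# meant to be used. The class name "Job" and the surrounding flow are assumed
# here purely for demonstration.
import time

job = Job()                                # hypothetical instantiation of the class defined above
t0 = time.time()
# ... stage-in would run here ...
job.timeStageIn = int(time.time() - t0)    # wall time spent in the stage-in step (seconds)

job.result = ["finished", 0, 0]            # [job state, transExitCode, pilotErrorCode]
job.currentState = job.result[0]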
def setJobDef(self, data):
    """ Set values for a job object from a dictionary <data>, which usually comes from cgi messages from the panda server """

    self.jobId = str(data.get('PandaID', '0'))
    self.taskID = data.get('taskID', '')
    self.outputFilesXML = "OutputFiles-%s.xml" % (self.jobId)
    self.homePackage = data.get('homepackage', '')
    self.trf = data.get('transformation', '')

    try:
        self.jobDefinitionID = int(data.get('jobDefinitionID', ''))
    except:
        self.jobDefinitionID = ''
    try:
        self.cloud = data.get('cloud', '')
    except:
        self.cloud = ''

    # get the input files
    inFiles = data.get('inFiles', '')
    self.inFiles = inFiles.split(",")
    realDatasetsIn = data.get('realDatasetsIn', '')
    self.realDatasetsIn = realDatasetsIn.split(",")
    filesizeIn = data.get('fsize', '')
    self.filesizeIn = filesizeIn.split(",")
    checksumIn = data.get('checksum', '')
    self.checksumIn = checksumIn.split(",")
    dispatchDblock = data.get('dispatchDblock', '')
    self.dispatchDblock = dispatchDblock.split(",")
    prodDBlocks = data.get('prodDBlocks', '')
    self.prodDBlocks = prodDBlocks.split(",")
    prodDBlockToken = data.get('prodDBlockToken', '')
    self.prodDBlockToken = prodDBlockToken.split(",")
    prodDBlockTokenForOutput = data.get('prodDBlockTokenForOutput', '')
    self.prodDBlockTokenForOutput = prodDBlockTokenForOutput.split(",")
    dispatchDBlockToken = data.get('dispatchDBlockToken', '')
    self.dispatchDBlockToken = dispatchDBlockToken.split(",")
    dispatchDBlockTokenForOut = data.get('dispatchDBlockTokenForOut', '')
    self.dispatchDBlockTokenForOut = dispatchDBlockTokenForOut.split(",")
    destinationDBlockToken = data.get('destinationDBlockToken', '')
    self.destinationDBlockToken = destinationDBlockToken.split(",")
    self.ddmEndPointIn = data.get('ddmEndPointIn', '').split(',') if data.get('ddmEndPointIn') else []
    self.ddmEndPointOut = data.get('ddmEndPointOut', '').split(',') if data.get('ddmEndPointOut') else []
    self.cloneJob = data.get('cloneJob', '')

    logFile = data.get('logFile', '')
    self.logFile = logFile
    tarFileGuid = data.get('logGUID', pUtil.getGUID())
    self.tarFileGuid = tarFileGuid
    self.prodUserID = data.get('prodUserID', '')
    self.credname = data.get('credname', 'None')
    self.myproxy = data.get('myproxy', 'None')
    outFiles = data.get('outFiles', '')
    self.attemptNr = int(data.get('attemptNr', -1))

    if data.has_key('GUID'):
        self.inFilesGuids = data['GUID'].split(",")
    else:
        self.inFilesGuids = []

    if data.has_key('processingType'):
        self.processingType = str(data['processingType'])
        # self.processingType = 'nightlies'
    else:
        # use default
        pass

    # Event Service variables
    if data.has_key('eventService'):
        if data.get('eventService', '').lower() == "true":
            self.eventService = True
        else:
            self.eventService = False
        pUtil.tolog("eventService = %s" % str(self.eventService))
    else:
        pUtil.tolog("Normal job (not an eventService job)")
    if data.has_key('eventRanges'):
        self.eventRanges = data.get('eventRanges', None)
    if data.has_key('jobsetID'):
        self.jobsetID = data.get('jobsetID', None)
        pUtil.tolog("jobsetID=%s" % (self.jobsetID))
    if not self.eventService and self.processingType == "evtest":
        pUtil.tolog("Turning on Event Service for processing type = %s" % (self.processingType))
        self.eventService = True

    # Event Service Merge variables
    if data.has_key('eventServiceMerge'):
        if data.get('eventServiceMerge', '').lower() == "true":
            self.eventServiceMerge = True
        else:
            self.eventServiceMerge = False
        pUtil.tolog("eventServiceMerge = %s" % str(self.eventServiceMerge))

    # Event Service merge job
    if self.workdir and data.has_key('eventServiceMerge') and data['eventServiceMerge'].lower() == "true":
        if data.has_key('writeToFile'):
            writeToFile = data['writeToFile']
            esFileDictionary, orderedFnameList = pUtil.createESFileDictionary(writeToFile)
            pUtil.tolog("esFileDictionary=%s" % (esFileDictionary))
            pUtil.tolog("orderedFnameList=%s" % (orderedFnameList))
            if esFileDictionary != {}:
                ec, fnames = pUtil.writeToInputFile(self.workdir, esFileDictionary, orderedFnameList)
                if ec == 0:
                    data['jobPars'] = pUtil.updateJobPars(data['jobPars'], fnames)

    # HPC job status
    if data.has_key('mode'):
        self.mode = data.get("mode", None)
    if data.has_key('hpcStatus'):
        self.hpcStatus = data.get('hpcStatus', None)

    # self.eventRangeID = data.get('eventRangeID', None)
    # self.startEvent = data.get('startEvent', None)
    # self.lastEvent = data.get('lastEvent', None)
    # pUtil.tolog("eventRangeID = %s" % str(self.eventRangeID))
    # pUtil.tolog("startEvent = %s" % str(self.startEvent))
    # pUtil.tolog("lastEvent = %s" % str(self.lastEvent))
    # if data.has_key('lfn'):
    #     self.lfn = data['lfn'].split(",")
    # else:
    #     self.lfn = []
    # if data.has_key('guid'):
    #     self.guid = data['guid'].split(",")
    # else:
    #     self.guid = []

    # Rucio scopes
    if data.has_key('scopeIn'):
        self.scopeIn = data['scopeIn'].split(",")
    else:
        self.scopeIn = []
    if data.has_key('scopeOut'):
        self.scopeOut = data['scopeOut'].split(",")
    else:
        self.scopeOut = []
    if data.has_key('scopeLog'):
        self.scopeLog = data['scopeLog'].split(",")
    else:
        self.scopeLog = []

    self.maxCpuCount = int(data.get('maxCpuCount', 0))
    self.transferType = data.get('transferType', '')
    #PN self.transferType = 'direct'

    if data.has_key('maxDiskCount'):
        _tmp = int(data['maxDiskCount'])
        if _tmp != 0 and _tmp != self.maxDiskCount:
            self.maxDiskCount = _tmp
    else:
        # use default
        pass

    if data.has_key('cmtConfig'):
        self.cmtconfig = str(data['cmtConfig'])
    else:
        # use default
        pass

    if data.has_key('coreCount'):
        self.coreCount = str(data['coreCount'])
    else:
        # use default
        pass

    # Overwrite the coreCount value with ATHENA_PROC_NUMBER if it is set
    if os.environ.has_key('ATHENA_PROC_NUMBER'):
        try:
            self.coreCount = int(os.environ['ATHENA_PROC_NUMBER'])
        except Exception, e:
            pUtil.tolog("ATHENA_PROC_NUMBER is not properly set: %s (will use existing job.coreCount value)" % (e))
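# --- Illustration (assumption, not pilot source) --------------------------------
# Minimal usage sketch for setJobDef(): populate a job object from a
# dispatcher-style dictionary. The class name "Job" and all field values below
# are hypothetical and chosen only to show the expected dictionary keys.
if __name__ == "__main__":
    data = {
        'PandaID': '1234567890',
        'homepackage': 'AtlasProduction/20.1.5.10',
        'transformation': 'Reco_tf.py',
        'inFiles': 'EVNT.A.pool.root,EVNT.B.pool.root',
        'outFiles': 'AOD.pool.root,log.tgz',
        'logFile': 'log.tgz',
        'logGUID': 'DEADBEEF-0000-1111-2222-333344445555',
        'attemptNr': '2',
        'eventService': 'False',
        'maxCpuCount': '172800',
    }
    job = Job()  # assumes the __init__()/setJobDef() shown above belong to a class named Job
    job.setJobDef(data)
    print("jobId=%s attemptNr=%d inFiles=%s" % (job.jobId, job.attemptNr, job.inFiles))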