def stepStart(self, step):
    """
    _stepStart_

    Assure that the monitor is pointing at the right step.

    Wraps the step in a WMStepHelper, records its name, clears the cached
    step space and sets ``self.disableStep`` according to whether the step's
    type is listed in ``self.watchStepTypes``.

    :param step: the step about to run; passed to WMStepHelper and getStepName
    :return: None
    """
    self.stepHelper = WMStepHelper(step)
    self.currentStepName = getStepName(step)
    self.currentStepSpace = None

    # Look the type up once; it is needed for both the test and the log line.
    stepType = self.stepHelper.stepType()
    if stepType not in self.watchStepTypes:
        # Not a step type this monitor watches: switch monitoring off for it.
        self.disableStep = True
        logging.debug("PerformanceMonitor ignoring step of type %s", stepType)
        return

    logging.debug("Beginning PeformanceMonitor step Initialization")
    self.disableStep = False
#!/usr/bin/env python """ _WMRuntimeMonitor_ This is the base class for monitors """ import threading import os from WMCore.WMException import WMException from WMCore.WMSpec.Steps.Executor import getStepSpace from WMCore.WMSpec.WMStep import WMStepHelper getStepName = lambda step: WMStepHelper(step).name() class WMRuntimeMonitorException(WMException): """ _StepFactortyException_ It's like an exception class that does nothing """ pass class WMRuntimeMonitor: def __init__(self): self.currentStep = None self.currentStepName = None
class ExecuteMaster:
    """
    _ExecuteMaster_

    Traverse the given task and invoke the execute framework
    If an emulator is provided, then invoke the appropriate emulator
    instead of the executor
    """

    def __init__(self):
        pass

    def __call__(self, task, wmbsJob):
        """
        _operator(task)_

        Load and run executors for all steps in Task, if an emulator is
        available for that step, use it instead.

        The watchdog monitor is set up and told the job has started before
        any step runs. Steps are then visited in ``nodeIterator()`` order.
        When ``doExecution`` returns a value other than None, that value is
        treated as the name of the step to skip ahead to: every step whose
        name differs is passed over until the named step is reached.

        :param task: task object providing ``steps().nodeIterator()``
        :param wmbsJob: job object handed to the monitors and to doExecution
        :raises WMException: re-raised unchanged if monitor start-up raises it
        :raises WMExecutionFailure: for any other error during monitor start-up
        """
        # NOTE(review): currentThread is referenced here, not called, so the
        # attribute below is read from the function object itself. Confirm
        # that is where watchdogMonitor is attached before changing it.
        myThread = threading.currentThread
        try:
            myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
            myThread.watchdogMonitor.notifyJobStart(task)
        except WMException:
            self.toTaskDirectory()
            raise
        except Exception as ex:
            msg = "Encountered unhandled exception while starting monitors:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()
            # NOTE(review): WMExecutionFailure is built from the message
            # alone here - confirm the constructor accepts a single argument.
            raise WMExecutionFailure(msg)

        skipToStep = None
        for step in task.steps().nodeIterator():
            try:
                helper = WMStepHelper(step)
                stepType = helper.stepType()
                stepName = helper.name()
                if skipToStep and skipToStep != stepName:
                    # Then we continue until we get to the required step
                    continue
                skipToStep = None  # Reset this when we get to the right step
                executor = StepFactory.getStepExecutor(stepType)
                result = self.doExecution(executor, step, wmbsJob)
                if result is not None:
                    skipToStep = result
            except WMException:
                # A framework-level failure stops the remaining steps.
                self.toTaskDirectory()
                break
            except Exception as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                msg += str(traceback.format_exc()) + "\n"
                self.toTaskDirectory()
                logging.error(msg)
                break
""" from __future__ import absolute_import import json import logging import os import subprocess import sys from Utils.FileTools import getFullPath from Utils.Utilities import zipEncodeStr from WMCore.FwkJobReport.Report import Report from WMCore.WMSpec.Steps.StepFactory import getStepEmulator from WMCore.WMSpec.WMStep import WMStepHelper getStepName = lambda step: WMStepHelper(step).name() getStepErrorDestination = lambda step: WMStepHelper(step).getErrorDestinationStep() def getStepSpace(stepName): """ _getStepSpace_ Util to get the runtime step space. This imports dynamic runtime libraries so be careful how you use it """ modName = "WMTaskSpace" if modName in sys.modules.keys(): taskspace = sys.modules[modName]
def execute(self, emulator=None):
    """
    _execute_

    Run the CMSSW step.

    Unless an emulator is supplied, this bootstraps the SCRAM project and
    runtime, runs the configured pre-scripts (plain and through scram),
    writes the wrapper script, invokes it with /bin/bash, parses the job
    report XML and then annotates the report with task/workload metadata
    and analysis-file entries.

    :param emulator: optional object with ``emulate(step, job)``; when given,
        its result is returned and nothing else is executed
    :return: the emulator's result when an emulator is used, otherwise None
    :raises WMExecutionFailure: 50513 for scram or pre-script failures, the
        wrapper's return code when it is non-zero, and 50115 when the job
        report cannot be parsed
    """
    stepModule = "WMTaskSpace.%s" % self.stepName
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    # write the wrapper script to a temporary location
    # I don't pass it directly through os.system because I don't
    # trust that there won't be shell-escape shenanigans with
    # arbitrary input files
    scramSetup = self.step.application.setup.softwareEnvironment
    scramCommand = self.step.application.setup.scramCommand
    scramProject = self.step.application.setup.scramProject
    scramArch = self.step.application.setup.scramArch
    cmsswVersion = self.step.application.setup.cmsswVersion
    jobReportXML = self.step.output.jobReport
    cmsswCommand = self.step.application.command.executable
    cmsswConfig = self.step.application.command.configuration
    cmsswArguments = self.step.application.command.arguments
    userTarball = ','.join(self.step.user.inputSandboxes)
    userFiles = ','.join(self.step.user.userFiles)
    logging.info('User files are %s', userFiles)
    logging.info('User sandboxes are %s', userTarball)

    scramArch = getSingleScramArch(scramArch)

    multicoreSettings = self.step.application.multicore
    try:
        logging.info("CMSSW configured for %s cores and %s event streams",
                     multicoreSettings.numberOfCores,
                     multicoreSettings.eventStreams)
    except AttributeError:
        logging.info(
            "No value set for multicore numberOfCores or eventStreams")

    logging.info("Executing CMSSW step")

    #
    # set any global environment variables
    #
    try:
        os.environ['FRONTIER_ID'] = 'wmagent_%s' % (
            self.report.data.workload)
    except Exception as ex:
        logging.error('Have critical error in setting FRONTIER_ID: %s',
                      str(ex))
        logging.error(
            'Continuing, as this is not a critical function yet.')

    #
    # scram bootstrap
    #
    scram = Scram(
        command=scramCommand,
        version=cmsswVersion,
        initialise=self.step.application.setup.softwareEnvironment,
        directory=self.step.builder.workingDir,
        architecture=scramArch,
    )

    logging.info("Runing SCRAM")
    try:
        projectOutcome = scram.project()
    except Exception as ex:
        msg = "Exception raised while running scram.\n"
        msg += str(ex)
        logging.critical("Error running SCRAM")
        logging.critical(msg)
        raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    if projectOutcome > 0:
        msg = scram.diagnostic()
        logging.critical("Error running SCRAM")
        logging.critical(msg)
        raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    runtimeOutcome = scram.runtime()
    if runtimeOutcome > 0:
        msg = scram.diagnostic()
        logging.critical("Error running SCRAM")
        logging.critical(msg)
        raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    #
    # pre scripts
    #
    logging.info("Running PRE scripts")
    for script in self.step.runtime.preScripts:
        # TODO: Exception handling and error handling & logging
        scriptProcess = subprocess.Popen(
            ["/bin/bash"],
            shell=True,
            cwd=self.step.builder.workingDir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.PIPE,
        )
        # BADPYTHON
        # NOTE(review): on Python 3 these pipes are binary unless text mode
        # is requested, so writing a str would raise TypeError - confirm the
        # target interpreter before relying on this path.
        scriptProcess.stdin.write(
            "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH\n")
        invokeCommand = "%s -m WMCore.WMRuntime.ScriptInvoke %s %s \n" % (
            sys.executable, stepModule, script)
        logging.info(" Invoking command: %s", invokeCommand)
        scriptProcess.stdin.write(invokeCommand)
        stdout, stderr = scriptProcess.communicate()
        retCode = scriptProcess.returncode
        if retCode > 0:
            msg = "Error running command\n%s\n" % invokeCommand
            msg += "%s\n %s\n %s\n" % (retCode, stdout, stderr)
            logging.critical("Error running command")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "PreScriptFailure", msg)

    #
    # pre scripts with scram
    #
    logging.info("RUNNING SCRAM SCRIPTS")
    for script in self.step.runtime.scramPreScripts:
        # invoke scripts with scram()
        runtimeDir = getattr(self.step.runtime, 'scramPreDir', None)
        invokeCommand = self.step.runtime.invokeCommand if hasattr(self.step.runtime, 'invokeCommand') else \
            "%s -m WMCore.WMRuntime.ScriptInvoke %s" % (sys.executable, stepModule)
        invokeCommand += " %s \n" % script
        retCode = scram(invokeCommand, runtimeDir=runtimeDir)
        if retCode > 0:
            msg = "Error running command\n%s\n" % invokeCommand
            msg += "%s\n " % retCode
            msg += scram.diagnostic()
            logging.critical(msg)
            raise WMExecutionFailure(50513, "PreScriptScramFailure", msg)

    configPath = "%s/%s-main.sh" % (self.step.builder.workingDir,
                                    self.stepName)
    # Context manager so the wrapper script is flushed and closed even if
    # the write fails part-way.
    with open(configPath, 'w') as handle:
        handle.write(CONFIG_BLOB)

    # spawn this new process
    # the script looks for:
    # <SCRAM_COMMAND> <SCRAM_PROJECT> <CMSSW_VERSION> <JOB_REPORT> <EXECUTABLE> <CONFIG>
    args = [
        '/bin/bash', configPath, scramSetup, scramArch, scramCommand,
        scramProject, cmsswVersion, jobReportXML, cmsswCommand,
        cmsswConfig, userTarball, userFiles, cmsswArguments
    ]
    logging.info("Executing CMSSW. args: %s", args)

    # possibly needed environment overrides for CMSSW call go here
    envOverride = {}
    # work around problem with GSI authentication plugin and EOS at CERN
    if socket.getfqdn().endswith("cern.ch"):
        envOverride['XRD_LOADBALANCERTTL'] = "86400"
    # some libraries linked with CMSSW need HOME in the environment
    if 'HOME' not in os.environ:
        envOverride['HOME'] = os.environ.get('PWD', "/")

    os.environ.update(envOverride)

    # open the output files; the with-blocks close both handles even when
    # the subprocess call or the chirp updates raise
    with open(self.step.output.stdout, 'w') as stdoutHandle:
        with open(self.step.output.stderr, 'w') as stderrHandle:
            returncode = subprocess.call(args,
                                         stdout=stdoutHandle,
                                         stderr=stderrHandle)
            self.setCondorChirpAttrDelayed('Chirp_WMCore_cmsRun_ExitCode',
                                           returncode)
            self.setCondorChirpAttrDelayed(
                'Chirp_WMCore_%s_ExitCode' % self.stepName, returncode)

    self.step.execution.exitStatus = returncode
    argsDump = {'arguments': args}

    if returncode != 0:
        msg = "Error running cmsRun\n%s\n" % argsDump
        msg += "Return code: %s\n" % returncode
        logging.critical(msg)
        raise WMExecutionFailure(returncode, "CmsRunFailure", msg)

    try:
        self.report.parse(jobReportXML, stepName=self.stepName)
    except Exception as ex:
        # Catch it if something goes wrong
        raise WMExecutionFailure(50115, "BadJobReportXML", str(ex))

    stepHelper = WMStepHelper(self.step)
    typeHelper = stepHelper.getTypeHelper()

    acquisitionEra = self.task.getAcquisitionEra()
    processingVer = self.task.getProcessingVersion()
    processingStr = self.task.getProcessingString()
    validStatus = self.workload.getValidStatus()
    inputPath = self.task.getInputDatasetPath()
    globalTag = typeHelper.getGlobalTag()
    prepID = self.task.getPrepID()
    campaign = self.workload.getCampaign()
    cacheUrl, cacheDB, configID = stepHelper.getConfigInfo()

    self.report.setValidStatus(validStatus=validStatus)
    self.report.setGlobalTag(globalTag=globalTag)
    self.report.setCampaign(campaign)
    self.report.setPrepID(prepID)
    self.report.setInputDataset(inputPath=inputPath)
    self.report.setAcquisitionProcessing(acquisitionEra=acquisitionEra,
                                         processingVer=processingVer,
                                         processingStr=processingStr)
    self.report.setConfigURL(configURL="%s;;%s;;%s" % (cacheUrl,
                                                       cacheDB,
                                                       configID))

    # Attach info to files
    self.report.addInfoToOutputFilesForStep(stepName=self.stepName,
                                            step=self.step)

    self.report.checkForOutputFiles(stepName=self.stepName)
    self.report.checkForAdlerChecksum(stepName=self.stepName)
    self.report.checkForRunLumiInformation(stepName=self.stepName)

    # The comparison against True is kept as written: only a value equal to
    # True preserves the output, anything else drops it.
    if self.step.output.keep != True:
        self.report.killOutput()
    else:
        # Check that we only keep the desired output
        for module in stepHelper.getIgnoredOutputModules():
            self.report.deleteOutputModuleForStep(stepName=self.stepName,
                                                  moduleName=module)

    # Add stageout LFN to existing TFileService files
    reportAnalysisFiles = self.report.getAnalysisFilesFromStep(
        self.stepName)
    for reportAnalysisFile in reportAnalysisFiles:
        newLFN = analysisFileLFN(reportAnalysisFile.fileName,
                                 self.step.user.lfnBase, self.job)
        addAttributesToFile(reportAnalysisFile,
                            pfn=reportAnalysisFile.fileName,
                            lfn=newLFN,
                            validate=False)

    # Add analysis file entries for additional files listed in workflow
    for fileName in stepHelper.listAnalysisFiles():
        analysisFile = stepHelper.getAnalysisFile(fileName)
        # Only files that actually exist on disk are added to the report.
        if os.path.isfile(analysisFile.fileName):
            newLFN = analysisFileLFN(analysisFile.fileName,
                                     analysisFile.lfnBase, self.job)
            self.report.addAnalysisFile(analysisFile.fileName,
                                        lfn=newLFN,
                                        Source='UserDefined',
                                        pfn=os.path.join(
                                            os.getcwd(),
                                            analysisFile.fileName),
                                        validate=False)
def steps(self):
    """get WMStep structure

    :return: a WMStepHelper wrapping the attribute of ``self.data.steps``
        named by ``topStepName``, or None when ``topStepName`` is None
    """
    topStepName = self.data.steps.topStepName
    if topStepName is None:
        return None
    # getattr falls back to None, so a missing attribute yields a helper
    # wrapping None rather than an AttributeError.
    step = getattr(self.data.steps, topStepName, None)
    return WMStepHelper(step)
raise WMExecutionFailure(spawnedChild.returncode, "CmsRunFailure", msg) try: self.report.parse(jobReportXML, stepName=self.stepName) except Exception, ex: # Catch it if something goes wrong raise WMExecutionFailure(50115, "BadJobReportXML", str(ex)) # # If multicore is enabled, merged the output files and reports # if multicoreEnabled: self.multicoreMerge(scram, applicationStart) stepHelper = WMStepHelper(self.step) typeHelper = stepHelper.getTypeHelper() acquisitionEra = self.task.getAcquisitionEra() processingVer = self.task.getProcessingVersion() processingStr = self.task.getProcessingString() validStatus = self.workload.getValidStatus() inputPath = self.task.getInputDatasetPath() globalTag = typeHelper.getGlobalTag() prepID = self.workload.getPrepID() cacheUrl, cacheDB, configID = stepHelper.getConfigInfo() self.report.setValidStatus(validStatus=validStatus) self.report.setGlobalTag(globalTag=globalTag) self.report.setPrepID(prepID) self.report.setInputDataset(inputPath=inputPath)
def __call__(self, task, wmbsJob):
    """
    _operator(task)_

    Run the executor for every step in the task, bracketed by the watchdog
    monitor's job-start and job-end notifications.

    Once a step's execution returns a truthy result, every later step gets
    the "previousCmsRunFailure" override set before it is executed. An
    exception raised while running a step is logged and stops the loop; the
    job-end notification is still attempted afterwards.

    :param task: task object providing ``steps().nodeIterator()``
    :param wmbsJob: job object handed to the monitors and to doExecution
    :raises WMException: re-raised unchanged if monitor start-up raises it
    :raises WMExecutionFailure: for any other error during monitor start-up
    """
    # NOTE(review): currentThread is referenced, not called, so the
    # watchdogMonitor attribute is read from the function object itself.
    runningThread = threading.currentThread

    try:
        runningThread.watchdogMonitor.setupMonitors(task, wmbsJob)
        runningThread.watchdogMonitor.notifyJobStart(task)
    except WMException:
        self.toTaskDirectory()
        raise
    except Exception as ex:
        errorText = "".join([
            "Encountered unhandled exception while starting monitors:\n",
            str(ex), '\n',
            str(traceback.format_exc()), '\n',
        ])
        logging.error(errorText)
        self.toTaskDirectory()
        # NOTE(review): WMExecutionFailure is built from the message alone
        # here - confirm the constructor accepts a single argument.
        raise WMExecutionFailure(errorText)

    upstreamFailed = False
    for step in task.steps().nodeIterator():
        try:
            stepHelper = WMStepHelper(step)
            stepType = stepHelper.stepType()
            stepName = stepHelper.name()

            # for chained steps, flag that an earlier step already failed
            # so the executor does not run it as if nothing happened
            if upstreamFailed:
                stepHelper.addOverride("previousCmsRunFailure", True)

            stepExecutor = StepFactory.getStepExecutor(stepType)
            outcome = self.doExecution(stepExecutor, step, wmbsJob)
            logging.info("StepName: %s, StepType: %s, with result: %r",
                         stepName, stepType, outcome)

            # outcome is either None or the step exit code
            if outcome:
                upstreamFailed = True
        except WMException as ex:
            errorText = "".join([
                "Encountered error while running ExecuteMaster:\n",
                str(ex), "\n",
            ])
            logging.error(errorText)
            self.toTaskDirectory()
            break
        except Exception as ex:
            errorText = "".join([
                "Encountered error while running ExecuteMaster:\n",
                str(ex), "\n",
                str(traceback.format_exc()), "\n",
            ])
            self.toTaskDirectory()
            logging.error(errorText)
            break

    try:
        runningThread.watchdogMonitor.notifyJobEnd(task)
    except WMException:
        self.toTaskDirectory()
    except Exception as ex:
        errorText = "".join([
            "Encountered unhandled exception while ending the job:\n",
            str(ex), '\n',
            str(traceback.format_exc()), '\n',
        ])
        logging.error(errorText)
        self.toTaskDirectory()