def inputFileHandler(targets):
    """
    _inputFileHandler_

    Coroutine that creates an input file entry in the report and
    dispatches nested sections down the pipeline.

    Every value sent in must be a ``(report, node)`` pair.  The text of the
    node's first "ModuleLabel" child is passed to ``report.addInputFile()``.
    "Runs" and "Branches" children are sent, together with the new file
    reference, to the coroutine stored under the same key in ``targets``.
    The text of every other child is collected by name and the collected
    values are handed to ``Report.addAttributesToFile()``.

    Raises IndexError when the node has no "ModuleLabel" child and
    KeyError when one of the expected attributes is missing.
    """
    forwarded = ("Runs", "Branches")
    while True:
        report, node = (yield)
        labelNodes = [child for child in node.children
                      if child.name == "ModuleLabel"]
        fileRef = report.addInputFile(labelNodes[0].text)

        attrs = {}
        for child in node.children:
            if child.name in forwarded:
                # Nested sections are handled by their own sink.
                targets[child.name].send((fileRef, child))
            else:
                attrs[child.name] = child.text

        Report.addAttributesToFile(
            fileRef,
            lfn=attrs["LFN"],
            pfn=attrs["PFN"],
            catalog=attrs["Catalog"],
            module_label=attrs["ModuleLabel"],
            guid=attrs["GUID"],
            input_type=attrs["InputType"],
            input_source_class=attrs["InputSourceClass"],
            events=int(attrs["EventsRead"]))
def inputFileHandler(targets):
    """
    _inputFileHandler_

    Coroutine to create input files in the report and dispatch sub data
    down the pipeline.

    Every value sent in must be a ``(report, node)`` pair.  The text of the
    node's first "ModuleLabel" child is used to register an input source
    (``report.addInputSource()``) and then an input file
    (``report.addInputFile()``).  "Runs" and "Branches" children are sent,
    together with the file reference, to ``targets['Runs']`` and
    ``targets['Branches']``.  The text of every other child is collected by
    name and handed to ``Report.addAttributesToFile()``.

    Raises IndexError when the node has no "ModuleLabel" child and
    KeyError when one of the expected attributes is missing.
    """
    while True:
        report, node = (yield)
        moduleNode = [x for x in node.children if x.name == "ModuleLabel"][0]
        moduleName = moduleNode.text

        # Called for its effect on the report only; the return value was
        # never used.
        report.addInputSource(moduleName)
        fileRef = report.addInputFile(moduleName)

        fileAttrs = {}
        for subnode in node.children:
            if subnode.name == "Runs":
                targets['Runs'].send((fileRef, subnode))
            elif subnode.name == "Branches":
                targets['Branches'].send((fileRef, subnode))
            else:
                fileAttrs[subnode.name] = subnode.text

        Report.addAttributesToFile(fileRef,
                                   lfn=fileAttrs["LFN"],
                                   pfn=fileAttrs["PFN"],
                                   catalog=fileAttrs["Catalog"],
                                   module_label=fileAttrs["ModuleLabel"],
                                   guid=fileAttrs["GUID"],
                                   input_type=fileAttrs["InputType"],
                                   input_source_class=fileAttrs["InputSourceClass"],
                                   events=int(fileAttrs["EventsRead"]))
def runHandler():
    """
    _runHandler_

    Sink to add run information to a file.  Given the following XML:
      <Runs>
      <Run ID="122023">
        <LumiSection ID="215"/>
        <LumiSection ID="216"/>
      </Run>
      <Run ID="122024">
        <LumiSection ID="1"/>
        <LumiSection ID="2"/>
      </Run>
      </Runs>

    Create a WMCore.DataStructs.Run object for each run and call the
    addRunInfoToFile() function to add the run information to the file
    section.

    Every value sent in must be a ``(fileSection, node)`` pair.  Children
    that are not named "Run", and "Run" children without an "ID"
    attribute, are skipped.
    """
    while True:
        fileSection, node = (yield)
        for subnode in node.children:
            if subnode.name != "Run":
                continue
            runId = subnode.attrs.get("ID", None)
            if runId is None:
                continue

            # The "in" operator replaces dict.has_key(), which does not
            # exist on Python 3.
            lumis = [int(lumi.attrs["ID"])
                     for lumi in subnode.children
                     if "ID" in lumi.attrs]

            runInfo = Run(runNumber=runId)
            runInfo.lumis.extend(lumis)
            Report.addRunInfoToFile(fileSection, runInfo)
def runHandler():
    """
    _runHandler_

    Sink to add run information to a file.  Given the following XML:
      <Runs>
      <Run ID="122023">
        <LumiSection ID="215"/>
        <LumiSection ID="216"/>
      </Run>
      <Run ID="122024">
        <LumiSection ID="1"/>
        <LumiSection ID="2"/>
      </Run>
      </Runs>

    Create a WMCore.DataStructs.Run object for each run and call the
    addRunInfoToFile() function to add the run information to the file
    section.

    Values sent in are ``(fileSection, node)`` pairs.  Only children named
    "Run" that carry an "ID" attribute produce a Run object; lumi children
    without an "ID" attribute are ignored.
    """
    while True:
        fileSection, node = (yield)
        for subnode in node.children:
            if subnode.name == "Run":
                runId = subnode.attrs.get("ID", None)
                if runId is None:
                    continue

                # Membership test instead of the Python-2-only has_key().
                lumis = [int(lumi.attrs['ID']) for lumi in subnode.children
                         if "ID" in lumi.attrs]

                runInfo = Run(runNumber=runId)
                runInfo.lumis.extend(lumis)

                Report.addRunInfoToFile(fileSection, runInfo)
def fileHandler(targets):
    """
    _fileHandler_

    Coroutine that creates an output file entry in the report and hands
    nested sections to the appropriate dispatchers.

    Every value sent in must be a ``(report, node)`` pair.  The text of the
    node's first "ModuleLabel" child is passed to
    ``report.addOutputFile()``.  "Inputs", "Runs" and "Branches" children
    are sent, together with the new file reference, to the coroutine stored
    under the same key in ``targets``.  The text of every other child is
    collected by name and handed to ``Report.addAttributesToFile()``.

    Raises IndexError when the node has no "ModuleLabel" child and
    KeyError when one of the expected attributes is missing.
    """
    forwarded = ("Inputs", "Runs", "Branches")
    while True:
        report, node = (yield)
        labelNodes = [child for child in node.children
                      if child.name == "ModuleLabel"]
        fileRef = report.addOutputFile(labelNodes[0].text)

        attrs = {}
        for child in node.children:
            if child.name in forwarded:
                # Nested sections are handled by their own sink.
                targets[child.name].send((fileRef, child))
            else:
                attrs[child.name] = child.text

        Report.addAttributesToFile(
            fileRef,
            lfn=attrs["LFN"],
            pfn=attrs["PFN"],
            catalog=attrs["Catalog"],
            module_label=attrs["ModuleLabel"],
            guid=attrs["GUID"],
            output_module_class=attrs["OutputModuleClass"],
            events=int(attrs["TotalEvents"]),
            branch_hash=attrs["BranchHash"])
def addInputFilesToReport(self, report):
    """
    _addInputFilesToReport_

    Register a "PoolSource" input source on the report, then add every
    entry of ``self.job["input_files"]`` to it (lfn, size and events) and
    attach the entry's "runs" value through ``Report.addRunInfoToFile()``.
    """
    sourceName = "PoolSource"
    report.addInputSource(sourceName)

    for jobFile in self.job["input_files"]:
        section = report.addInputFile(sourceName,
                                      lfn=jobFile["lfn"],
                                      size=jobFile["size"],
                                      events=jobFile["events"])
        Report.addRunInfoToFile(section, jobFile["runs"])

    return
def createInitialReport(job, task, logLocation):
    """
    _createInitialReport_

    Write an initial job report holding the base job and site information.

    The report is saved one directory above the current working directory
    under the name ``logLocation``.  If the site-local configuration cannot
    be loaded the error is logged and nothing is written.  ``task`` is not
    used by this function.
    """
    try:
        siteCfg = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume that this was done on purpose (e.g. in tests).
        logging.error("Couldn't find SiteConfig")
        return

    report = Report.Report()
    info = report.data
    info.WMAgentJobID = job.get('id', None)
    info.WMAgentJobName = job.get('name', None)
    info.pnn = siteCfg.localStageOut.get('phedex-node', 'Unknown')
    info.siteName = getattr(siteCfg, 'siteName', 'Unknown')
    info.hostName = socket.gethostname()
    info.ceName = getSyncCE()
    info.completed = False
    report.setTaskName(taskName=job.get('task', 'TaskNotFound'))

    # The master report lives at the top level so that it is still
    # returned when the job fails early.
    report.save(os.path.join(os.getcwd(), '../', logLocation))
    return
def createInitialReport(job, reportName):
    """
    _createInitialReport_

    Write an initial job report holding the base job, site and
    machine/job feature information.

    The report is saved one directory above the current working directory
    under the name ``reportName``.  If the site-local configuration cannot
    be loaded the error is logged and nothing is written.
    """
    try:
        siteCfg = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume that this was done on purpose (e.g. in tests).
        logging.error("Couldn't find SiteConfig")
        return

    report = Report.Report()
    info = report.data
    info.WMAgentJobID = job.get('id', None)
    info.WMAgentJobName = job.get('name', None)
    info.pnn = siteCfg.localStageOut.get('phedex-node', 'Unknown')
    info.siteName = getattr(siteCfg, 'siteName', 'Unknown')
    info.hostName = socket.gethostname()
    info.ceName = getSyncCE()

    # TODO: need to check what format it returns and what features need
    # to be extracted.  Currently read:
    #   $MACHINEFEATURES/hs06: HS06 score of the host
    #   $MACHINEFEATURES/total_cpu: number of configured job slots
    #   $JOBFEATURES/hs06_job: HS06 score available to your job
    #   $JOBFEATURES/allocated_cpu: number of allocated slots
    machineDir = os.environ.get('MACHINEFEATURES')
    info.machineFeatures = {}
    if machineDir:
        info.machineFeatures['hs06'] = readFloatFromFile("%s/hs06" % machineDir)
        info.machineFeatures['total_cpu'] = readFloatFromFile("%s/total_cpu" % machineDir)

    jobDir = os.environ.get('JOBFEATURES')
    info.jobFeatures = {}
    if jobDir:
        info.jobFeatures['hs06_job'] = readFloatFromFile("%s/hs06_job" % jobDir)
        info.jobFeatures['allocated_cpu'] = readFloatFromFile("%s/allocated_cpu" % jobDir)

    info.completed = False
    report.setTaskName(taskName=job.get('task', 'TaskNotFound'))

    # The master report lives at the top level so that it is still
    # returned when the job fails early.
    report.save(os.path.join(os.getcwd(), '../', reportName))
    return
def addOutputFilesToReport(self, report):
    """
    _addOutputFilesToReport_

    Add one output file to every output module in the step, using the
    size and event count returned by ``self.determineOutputSize()``.

    A placeholder file "ReportEmuTestFile.txt" is created in the current
    working directory when it does not exist yet; it is used as the PFN of
    every emulated output file.  The run information of every job input
    file is attached to each output file.
    """
    (outputSize, outputEvents) = self.determineOutputSize()

    if not os.path.exists('ReportEmuTestFile.txt'):
        # Context manager so the handle is closed even if the write fails.
        with open('ReportEmuTestFile.txt', 'w') as handle:
            handle.write('A Shubbery')

    for outputModuleName in self.step.listOutputModules():
        outputModuleSection = self.step.getOutputModule(outputModuleName)
        outputModuleSection.fixedLFN = False
        outputModuleSection.disableGUID = False

        outputLFN = "%s/%s.root" % (outputModuleSection.lfnBase, str(makeUUID()))
        outputFile = File(lfn=outputLFN, size=outputSize,
                          events=outputEvents, merged=False)
        outputFile.setLocation(self.job["location"])
        outputFile['pfn'] = "ReportEmuTestFile.txt"
        outputFile['guid'] = "ThisIsGUID"
        outputFile["checksums"] = {"adler32": "1234", "cksum": "5678"}
        outputFile["dataset"] = {
            "primaryDataset": outputModuleSection.primaryDataset,
            "processedDataset": outputModuleSection.processedDataset,
            "dataTier": outputModuleSection.dataTier,
            "applicationName": "cmsRun",
            "applicationVersion": self.step.getCMSSWVersion(),
        }
        outputFile["module_label"] = outputModuleName

        outputFileSection = report.addOutputFile(outputModuleName, outputFile)
        for inputFile in self.job["input_files"]:
            Report.addRunInfoToFile(outputFileSection, inputFile["runs"])

    return
def __call__(self):
    """
    Build and return a report for this step.

    The report is named after ``self.step.name()``, tagged with the job's
    "id" and "task" values (workload is left as None) and then filled in
    by addInputFilesToReport() and addOutputFilesToReport().
    """
    stepReport = Report.Report(self.step.name())

    stepReport.id = self.job["id"]
    stepReport.task = self.job["task"]
    stepReport.workload = None

    self.addInputFilesToReport(stepReport)
    self.addOutputFilesToReport(stepReport)

    return stepReport
def runHandler():
    """
    _runHandler_

    Sink that attaches run information to a file section.  Expected XML:
      <Runs>
      <Run ID="122023">
        <LumiSection NEvents="100" ID="215"/>
        <LumiSection NEvents="100" ID="216"/>
      </Run>
      <Run ID="122024">
        <LumiSection ID="1"/>
        <LumiSection ID="2"/>
      </Run>
      </Runs>

    For each run a WMCore.DataStructs.Run object is created, extended with
    ``(lumiNumber, nEvents)`` pairs and passed to addRunInfoToFile().
    ``nEvents`` is None when the "NEvents" attribute is absent or is not a
    valid integer.  Runs and lumis without an "ID" attribute are skipped.
    """
    while True:
        fileSection, node = (yield)
        for runNode in node.children:
            if runNode.name != "Run":
                continue
            runId = runNode.attrs.get("ID", None)
            if runId is None:
                continue

            lumiPairs = []
            for lumiNode in runNode.children:
                if "ID" not in lumiNode.attrs:
                    continue
                lumiNumber = int(lumiNode.attrs['ID'])
                eventCount = lumiNode.attrs.get("NEvents", None)
                if eventCount is not None:
                    try:
                        eventCount = int(eventCount)
                    except ValueError:
                        # Unparseable event counts are recorded as unknown.
                        eventCount = None
                lumiPairs.append((lumiNumber, eventCount))

            runInfo = Run(runNumber=runId)
            runInfo.extendLumis(lumiPairs)
            Report.addRunInfoToFile(fileSection, runInfo)
def inputAssocHandler():
    """
    _inputAssocHandler_

    Sink to handle output:input association information.  Given the
    following XML:
      <Input>
        <LFN>/path/to/some/lfn.root</LFN>
        <PFN>/some/pfn/info/path/to/some/lfn.root</PFN>
      </Input>

    Extract the LFN and PFN and call the addInputToFile() function to
    associate input to output in the FWJR.

    Every value sent in must be a ``(fileSection, node)`` pair; each child
    of ``node`` is treated as one <Input> element.  Raises KeyError when an
    input element has no "LFN" or "PFN" child.
    """
    while True:
        fileSection, node = (yield)
        for inputnode in node.children:
            # Plain loop: the previous list comprehension was used only
            # for its side effects and built a throwaway list of None.
            data = {}
            for subnode in inputnode.children:
                data[subnode.name] = subnode.text
            Report.addInputToFile(fileSection, data["LFN"], data["PFN"])
def completeTask(self, jobLocation, logLocation):
    """
    _completeTask_

    Merge the per-step reports of this task into a single final report.

    If a report already exists at ``<jobLocation>/../../<logLocation>`` it
    is loaded first and the steps are added to it.  For every step name
    returned by ``self.listAllStepNames()`` the file
    ``<jobLocation>/<step>/Report.pkl`` is read; a missing file is recorded
    as a failed step with exit code 99999.  The final report is marked
    completed and persisted to ``logLocation``.
    """
    import WMCore.FwkJobReport.Report as Report

    finalReport = Report.Report()

    # The master report was left way up at the top of the job area.
    masterPath = os.path.join(jobLocation, '../../', logLocation)
    if os.path.exists(masterPath):
        finalReport.load(masterPath)

    for stepName in self.listAllStepNames():
        stepReportPath = os.path.join(jobLocation, stepName, "Report.pkl")

        if not os.path.isfile(stepReportPath):
            # A missing step report should raise an alarm bell.
            # TODO: Change error code
            finalReport.addStep(reportname=stepName, status=1)
            finalReport.addError(
                stepName=stepName,
                exitCode=99999,
                errorType="ReportManipulatingError",
                errorDetails="Could not find report file for step %s!" % stepName)
            continue

        stepReport = Report.Report()
        stepReport.unpersist(stepReportPath, stepName)
        finalReport.setStep(stepName, stepReport.retrieveStep(stepName))

    finalReport.data.completed = True
    finalReport.persist(logLocation)

    return
def inputAssocHandler():
    """
    _inputAssocHandler_

    Sink for output:input association information.  Expected XML:
      <Input>
        <LFN>/path/to/some/lfn.root</LFN>
        <PFN>/some/pfn/info/path/to/some/lfn.root</PFN>
      </Input>

    For each child of the node that is sent in, the child elements are
    collected by name and the "LFN" and "PFN" values are passed to
    addInputToFile() to associate the input with the output file in the
    FWJR.  Raises KeyError when either of the two is missing.
    """
    while True:
        fileSection, node = (yield)
        for inputNode in node.children:
            fields = dict((child.name, child.text)
                          for child in inputNode.children)
            Report.addInputToFile(fileSection, fields["LFN"], fields['PFN'])
def fileHandler(targets):
    """
    _fileHandler_

    Coroutine to create files and handle sub data in the appropriate
    dispatchers.

    Every value sent in must be a ``(report, node)`` pair.  The text of the
    node's first "ModuleLabel" child is used to register an output module
    (``report.addOutputModule()``) and then an output file
    (``report.addOutputFile()``).  "Inputs", "Runs" and "Branches" children
    are sent, together with the file reference, to the matching entry of
    ``targets``.  The text of every other child is collected by name and
    handed to ``Report.addAttributesToFile()``.

    Raises IndexError when the node has no "ModuleLabel" child and
    KeyError when one of the expected attributes is missing.
    """
    while True:
        report, node = (yield)
        moduleNode = [x for x in node.children if x.name == "ModuleLabel"][0]
        moduleName = moduleNode.text

        # Called for its effect on the report only; the return value was
        # never used.
        report.addOutputModule(moduleName)
        fileRef = report.addOutputFile(moduleName)

        fileAttrs = {}
        for subnode in node.children:
            if subnode.name == "Inputs":
                targets["Inputs"].send((fileRef, subnode))
            elif subnode.name == "Runs":
                targets["Runs"].send((fileRef, subnode))
            elif subnode.name == "Branches":
                targets["Branches"].send((fileRef, subnode))
            else:
                fileAttrs[subnode.name] = subnode.text

        # NOTE(review): the keyword below is spelled "ouput_module_class"
        # (sic).  It is kept unchanged because the stored attribute name
        # may be read elsewhere -- confirm before renaming.
        Report.addAttributesToFile(
            fileRef,
            lfn=fileAttrs["LFN"],
            pfn=fileAttrs["PFN"],
            catalog=fileAttrs["Catalog"],
            module_label=fileAttrs["ModuleLabel"],
            guid=fileAttrs["GUID"],
            ouput_module_class=fileAttrs["OutputModuleClass"],
            events=int(fileAttrs["TotalEvents"]),
            branch_hash=fileAttrs["BranchHash"],
        )
def addOutputFilesToReport(self, report):
    """
    _addOutputFilesToReport_

    Add output files to every output module in the step.  The size and
    number of events of each file come from ``self.determineOutputSize()``.

    A placeholder file "ReportEmuTestFile.txt" is created in the current
    working directory if it is not there yet and is used as the PFN of
    every emulated output file.  Run information from every job input file
    is attached to each output file.
    """
    (outputSize, outputEvents) = self.determineOutputSize()

    if not os.path.exists('ReportEmuTestFile.txt'):
        # "with" guarantees the handle is closed even when write() raises.
        with open('ReportEmuTestFile.txt', 'w') as placeholder:
            placeholder.write('A Shubbery')

    for outputModuleName in self.step.listOutputModules():
        outputModuleSection = self.step.getOutputModule(outputModuleName)
        outputModuleSection.fixedLFN = False
        outputModuleSection.disableGUID = False

        outputLFN = "%s/%s.root" % (outputModuleSection.lfnBase,
                                    str(makeUUID()))
        outputFile = File(lfn=outputLFN, size=outputSize,
                          events=outputEvents, merged=False)
        outputFile.setLocation(self.job["location"])
        outputFile['pfn'] = "ReportEmuTestFile.txt"
        outputFile['guid'] = "ThisIsGUID"
        outputFile["checksums"] = {"adler32": "1234", "cksum": "5678"}
        outputFile["dataset"] = {"primaryDataset": outputModuleSection.primaryDataset,
                                 "processedDataset": outputModuleSection.processedDataset,
                                 "dataTier": outputModuleSection.dataTier,
                                 "applicationName": "cmsRun",
                                 "applicationVersion": self.step.getCMSSWVersion()}
        outputFile["module_label"] = outputModuleName

        outputFileSection = report.addOutputFile(outputModuleName, outputFile)
        for inputFile in self.job["input_files"]:
            Report.addRunInfoToFile(outputFileSection, inputFile["runs"])

    return
def createErrorReport(exitCode, errorType, errorDetails=None, logLocation="Report.0.pkl"):
    """
    _createErrorReport_

    Write a report describing a failure that happened inside the Bootstrap.

    The error is recorded under a dummy step called 'CRITICAL' and the
    report is saved one directory above the current working directory
    under the name ``logLocation``.  If the site-local configuration cannot
    be loaded the problem is logged and nothing is written.
    """
    try:
        siteCfg = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume that this was done on purpose (e.g. in tests).
        logging.error("Couldn't find SiteConfig")
        return

    stageOut = siteCfg.localStageOut
    report = Report.Report()
    info = report.data
    info.seName = stageOut.get('se-name', socket.gethostname())
    info.pnn = stageOut.get('phedex-node', 'Unknown')
    info.siteName = getattr(siteCfg, 'siteName', 'Unknown')
    info.hostName = socket.gethostname()
    info.ceName = getSyncCE()
    info.completed = False

    report.addError(stepName='CRITICAL',
                    exitCode=exitCode,
                    errorType=errorType,
                    errorDetails=errorDetails)

    report.save(os.path.join(os.getcwd(), '../', logLocation))
    return
def periodicUpdate(self):
    """
    Run on the defined intervals.

    Samples the running step with the ps command built from
    ``self.monitorBase`` and the smaps command built from
    ``self.pssMemoryCommand``.  When the PSS value reaches ``self.maxPSS``
    or the wallclock limits are exceeded, a "PerformanceError" step is
    written to the global job report (first attempt only) and the step
    process is signalled: SIGUSR2 on the first soft kill, SIGTERM on a
    hard timeout or on any retry.

    Returns None in every case.
    """
    exitCodes = {'PSS': 50660, 'Wallclock time': 50664, '': 99999}
    shouldKill = False
    forceTerm = False
    killReason = ''

    if self.disableStep:
        # No CPU monitoring on this step.
        return
    if self.currentStepName is None:
        # We're between steps.
        return
    if self.currentStepSpace is None:
        self.currentStepSpace = getStepSpace(self.stepHelper.name())

    stepPID = getStepPID(self.currentStepSpace, self.currentStepName)
    if stepPID is None:
        # Without a step PID there is nothing to monitor.
        return

    # ps gathers the RSS, %CPU and %MEM statistics.
    psCommand = self.monitorBase % (stepPID, stepPID)
    stdout, _stderr, _retcode = subprocessAlgos.runCommand(psCommand)
    psFields = stdout.split()
    if len(psFields) <= 6:
        logging.error("".join(["Error when grabbing output from process ps\n",
                               "output = %s\n" % psFields,
                               "command = %s\n" % psCommand]))
        return

    # PSS memory statistics come from /proc/<pid>/smaps.
    smapsCommand = self.pssMemoryCommand % stepPID
    stdout, _stderr, _retcode = subprocessAlgos.runCommand(smapsCommand)
    smapsFields = stdout.split()
    if len(smapsFields) != 1:
        logging.error("".join(["Error when grabbing output from smaps\n",
                               "output = %s\n" % smapsFields,
                               "command = %s\n" % smapsCommand]))
        return

    # The smaps value is integer-divided by 1000 before it is compared
    # against maxPSS.
    pss = int(smapsFields[0]) // 1000
    logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s",
                 smapsFields[0], psFields[2], psFields[3], psFields[4])

    details = 'Error in CMSSW step %s\n' % self.currentStepName
    details += 'Number of Cores: %s\n' % self.numOfCores

    if self.maxPSS is not None and pss >= self.maxPSS:
        details += "Job has exceeded maxPSS: %s MB\n" % self.maxPSS
        details += "Job has PSS: %s MB\n" % pss
        shouldKill = True
        killReason = 'PSS'
    elif self.hardTimeout is not None and self.softTimeout is not None:
        now = time.time()
        if (now - self.startTime) > self.softTimeout:
            shouldKill = True
            killReason = 'Wallclock time'
            details += "Job has been running for more than: %s\n" % str(self.softTimeout)
            details += "Job has been running for: %s\n" % str(now - self.startTime)
        if (now - self.startTime) > self.hardTimeout:
            forceTerm = True
            # NOTE(review): this branch tests the hard timeout but the text
            # says "soft" -- confirm whether that is intended.
            details += "Job exceeded soft timeout"

    if not shouldKill:
        # The job is behaving well, nothing to do.
        return

    # Persist the performance error only once.
    if not self.killRetry:
        logging.error(details)
        report = Report.Report()
        # Find the global report.
        globalReportPath = os.path.join(self.currentStepSpace.location,
                                        '../../../',
                                        os.path.basename(self.logPath))
        try:
            if os.path.isfile(globalReportPath):
                logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                report.load(globalReportPath)
            # A dedicated step cannot be overridden by an exiting CMSSW.
            if not report.retrieveStep(step="PerformanceError"):
                report.addStep(reportname="PerformanceError")
            report.addError(stepName="PerformanceError",
                            exitCode=exitCodes[killReason],
                            errorType="PerformanceKill",
                            errorDetails=details)
            report.save(globalReportPath)
        except Exception as ex:
            # The report could not be written; kill the step anyway.
            failure = "Exception while writing out jobReport.\n"
            failure += "Aborting job anyway: unlikely you'll get any error report.\n"
            failure += "Error: %s" % str(ex)
            logging.exception(failure)

    try:
        if forceTerm or self.killRetry:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
        else:
            logging.error("Attempting to kill step using SIGUSR2")
            os.kill(stepPID, signal.SIGUSR2)
    except Exception:
        logging.error("Attempting to kill step using SIGTERM")
        os.kill(stepPID, signal.SIGTERM)
    finally:
        self.killRetry = True

    return
outputModules = ["outputModule1", "outputModule2", "outputModule3", "outputModule4", "outputModule5", "outputModule6", "outputModule7", "outputModule8", "outputModule9", "outputModule10"] runInfo = Run(1) runInfo.lumis.extend([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]) totalReports = 25 inputFilesPerReport = 50 inputFileCounter = 0 for i in range(totalReports): loadTestReport = Report.Report("cmsRun1") loadTestReport.addInputSource("PoolSource") for j in range(inputFilesPerReport): inputFile = loadTestReport.addInputFile("PoolSource", lfn = "input%i" % inputFileCounter, events = 600000, size = 600000) inputFileCounter += 1 Report.addRunInfoToFile(inputFile, runInfo) for outputModule in outputModules: loadTestReport.addOutputModule(outputModule) datasetInfo = {"applicationName": "cmsRun", "applicationVersion": "CMSSW_3_3_5_patch3", "primaryDataset": outputModule, "dataTier": "RAW", "processedDataset": "LoadTest10"} fileAttrs = {"lfn": makeUUID(), "location": "cmssrm.fnal.gov",
def periodicUpdate(self):
    """
    Run on the defined intervals.

    Samples the running step with the ps command built from
    ``self.monitorBase``.  When the RSS or VSize value reaches
    ``self.maxRSS`` / ``self.maxVSize``, or the wallclock limits are
    exceeded, a "PerformanceError" step (exit code 99900) is written to the
    global job report and the step process is signalled: SIGUSR2 normally,
    SIGTERM once the hard timeout is exceeded or if the first signal
    cannot be delivered.

    Returns None in every case.
    """
    killProc = False
    killHard = False

    if self.disableStep:
        # Then we aren't doing CPU monitoring on this step
        return

    if self.currentStepName is None:
        # We're between steps
        return

    if self.currentStepSpace is None:
        # Then build the step space
        self.currentStepSpace = getStepSpace(self.stepHelper.name())

    stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

    if stepPID is None:
        # Then we have no step PID, we can do nothing
        return

    # Now we run the monitor command and collate the data
    cmd = self.monitorBase % (stepPID, stepPID)
    stdout, _stderr, _retcode = subprocessAlgos.runCommand(cmd)

    output = stdout.split()
    if not len(output) > 7:
        # Then something went wrong in getting the ps data
        msg = "Error when grabbing output from process ps\n"
        msg += "output = %s\n" % output
        # Report the command that was actually executed.
        msg += "command = %s\n" % cmd
        logging.error(msg)
        return

    rss = float(output[2])
    vsize = float(output[3])
    logging.info("Retrieved following performance figures:")
    logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                 output[2], output[3], output[4], output[5])

    msg = 'Error in CMSSW step %s\n' % self.currentStepName
    if self.maxRSS is not None and rss >= self.maxRSS:
        msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
        msg += "Job has RSS: %s\n" % rss
        killProc = True
    if self.maxVSize is not None and vsize >= self.maxVSize:
        msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
        msg += "Job has VSize: %s\n" % vsize
        killProc = True

    # Let's check the running time
    currentTime = time.time()

    if self.hardTimeout is not None and self.softTimeout is not None:
        if (currentTime - self.startTime) > self.softTimeout:
            killProc = True
            msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
            msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
        if (currentTime - self.startTime) > self.hardTimeout:
            killHard = True
            # NOTE(review): this branch tests the hard timeout but the text
            # says "soft" -- confirm whether that is intended.
            msg += "Job exceeded soft timeout"

    if killProc:
        logging.error(msg)
        report = Report.Report()
        # Find the global report
        logPath = os.path.join(self.currentStepSpace.location,
                               '../../../',
                               os.path.basename(self.logPath))
        try:
            if os.path.isfile(logPath):
                # We should be able to find existant job report.
                # If not, we're in trouble
                logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                report.load(logPath)
            # Create a new step that won't be overridden by an exiting CMSSW
            if not report.retrieveStep(step="PerformanceError"):
                report.addStep(reportname="PerformanceError")
            report.addError(stepName="PerformanceError", exitCode=99900,
                            errorType="PerformanceKill", errorDetails=msg)
            report.save(logPath)
        except Exception as ex:
            # Basically, we can't write a log report and we're hosed
            # Kill anyway, and hope the logging file gets written out
            msg2 = "Exception while writing out jobReport.\n"
            msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
            msg2 += str(ex)
            msg2 += str(traceback.format_exc()) + '\n'
            logging.error(msg2)

        try:
            if not killHard:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            # Fall back to SIGTERM if the first signal could not be sent.
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
39, 40]) totalReports = 25 inputFilesPerReport = 50 inputFileCounter = 0 for i in range(totalReports): loadTestReport = Report.Report("cmsRun1") loadTestReport.addInputSource("PoolSource") for j in range(inputFilesPerReport): inputFile = loadTestReport.addInputFile("PoolSource", lfn = "input%i" % inputFileCounter, events = 600000, size = 600000) inputFileCounter += 1 Report.addRunInfoToFile(inputFile, runInfo) for outputModule in outputModules: loadTestReport.addOutputModule(outputModule) datasetInfo = {"applicationName": "cmsRun", "applicationVersion": "CMSSW_3_3_5_patch3", "primaryDataset": outputModule, "dataTier": "RAW", "processedDataset": "LoadTest10"} fileAttrs = {"lfn": makeUUID(), "location": "cmssrm.fnal.gov", "checksums": {"adler32": "ff810ec3", "cksum": "2212831827"}, "events": random.randrange(500, 5000, 50), "merged": True, "size": random.randrange(1000, 2000, 100000000), "module_label": outputModule, "dataset": datasetInfo} outputFile = loadTestReport.addOutputFile(outputModule, fileAttrs) Report.addRunInfoToFile(outputFile, runInfo)
def periodicUpdate(self):
    """
    Run on the defined intervals.

    Samples the running step with the ps command built from
    ``self.monitorBase``.  When the RSS or VSize value reaches
    ``self.maxRSS`` / ``self.maxVSize``, or the wallclock limits are
    exceeded, a "PerformanceError" step is written to the global job
    report with a reason-specific exit code (RSS 50660, VSZ 50661,
    wallclock 50664) and the step process is signalled: SIGUSR2 normally,
    SIGTERM once the hard timeout is exceeded or if the first signal
    cannot be delivered.

    Returns None in every case.
    """
    killProc = False
    killHard = False
    reason = ''
    errorCodeLookup = {
        'RSS': 50660,
        'VSZ': 50661,
        'Wallclock time': 50664,
        '': 99999,
    }

    if self.disableStep:
        # Then we aren't doing CPU monitoring on this step
        return

    if self.currentStepName is None:
        # We're between steps
        return

    if self.currentStepSpace is None:
        # Then build the step space
        self.currentStepSpace = getStepSpace(self.stepHelper.name())

    stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

    if stepPID is None:
        # Then we have no step PID, we can do nothing
        return

    # Now we run the monitor command and collate the data
    cmd = self.monitorBase % (stepPID, stepPID)
    stdout, _stderr, _retcode = subprocessAlgos.runCommand(cmd)

    output = stdout.split()
    if not len(output) > 7:
        # Then something went wrong in getting the ps data
        msg = "Error when grabbing output from process ps\n"
        msg += "output = %s\n" % output
        # Report the command that was actually executed.
        msg += "command = %s\n" % cmd
        logging.error(msg)
        return

    # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
    if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
        # then workload value is still in KiB (old way)
        rss = int(output[2])
        vsize = int(output[3])
    else:
        # Scale the ps values down by 1000 so they compare against limits
        # expressed in the larger unit.
        rss = int(output[2]) // 1000
        vsize = int(output[3]) // 1000
    logging.info("Retrieved following performance figures:")
    logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                 output[2], output[3], output[4], output[5])

    msg = 'Error in CMSSW step %s\n' % self.currentStepName
    msg += 'Number of Cores: %s\n' % self.numOfCores

    if self.maxRSS is not None and rss >= self.maxRSS:
        msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
        msg += "Job has RSS: %s\n" % rss
        killProc = True
        reason = 'RSS'
    elif self.maxVSize is not None and vsize >= self.maxVSize:
        msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
        msg += "Job has VSize: %s\n" % vsize
        killProc = True
        reason = 'VSZ'
    elif self.hardTimeout is not None and self.softTimeout is not None:
        currentTime = time.time()
        if (currentTime - self.startTime) > self.softTimeout:
            killProc = True
            reason = 'Wallclock time'
            msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
            msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
        if (currentTime - self.startTime) > self.hardTimeout:
            killHard = True
            # NOTE(review): this branch tests the hard timeout but the text
            # says "soft" -- confirm whether that is intended.
            msg += "Job exceeded soft timeout"

    if killProc:
        logging.error(msg)
        report = Report.Report()
        # Find the global report
        logPath = os.path.join(self.currentStepSpace.location,
                               '../../../',
                               os.path.basename(self.logPath))
        try:
            if os.path.isfile(logPath):
                # We should be able to find existant job report.
                # If not, we're in trouble
                logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                report.load(logPath)
            # Create a new step that won't be overridden by an exiting CMSSW
            if not report.retrieveStep(step="PerformanceError"):
                report.addStep(reportname="PerformanceError")
            report.addError(stepName="PerformanceError",
                            exitCode=errorCodeLookup[reason],
                            errorType="PerformanceKill",
                            errorDetails=msg)
            report.save(logPath)
        except Exception as ex:
            # Basically, we can't write a log report and we're hosed
            # Kill anyway, and hope the logging file gets written out
            msg2 = "Exception while writing out jobReport.\n"
            msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
            msg2 += str(ex)
            msg2 += str(traceback.format_exc()) + '\n'
            logging.error(msg2)

        try:
            if not killHard:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            # Fall back to SIGTERM if the first signal could not be sent.
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)

    return