def createInitialReport(job, task, logLocation):
    """
    _createInitialReport_

    Build the first job report, carrying the basic job and site
    information, and save it one directory above the current working
    directory under the name given by logLocation.

    If the site-local configuration cannot be loaded, an error is logged
    and no report is written. The task argument is not used here.
    """
    try:
        siteConfig = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume the missing site config was done on purpose
        # TODO: handle the testing use case more gracefully
        logging.error("Couldn't find SiteConfig")
        return

    initialReport = Report.Report()
    reportData = initialReport.data

    # Job identification
    reportData.WMAgentJobID = job.get('id', None)
    reportData.WMAgentJobName = job.get('name', None)

    # Where the job is running
    reportData.pnn = siteConfig.localStageOut.get('phedex-node', 'Unknown')
    reportData.siteName = getattr(siteConfig, 'siteName', 'Unknown')
    reportData.hostName = socket.gethostname()
    reportData.ceName = getSyncCE()

    reportData.completed = False
    initialReport.setTaskName(taskName=job.get('task', 'TaskNotFound'))

    # Not ideal, but the master report has to live way up at the top
    # so that it is still returned if the job fails early
    initialReport.save(os.path.join(os.getcwd(), '../', logLocation))
def createInitialReport(job, reportName):
    """
    _createInitialReport_

    Build the first job report, carrying the basic job, site and
    machine/job feature information, and save it one directory above the
    current working directory under the name given by reportName.

    If the site-local configuration cannot be loaded, an error is logged
    and no report is written.
    """
    try:
        siteConfig = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume the missing site config was done on purpose
        # TODO: handle the testing use case more gracefully
        logging.error("Couldn't find SiteConfig")
        return

    initialReport = Report.Report()
    reportData = initialReport.data

    # Job identification
    reportData.WMAgentJobID = job.get('id', None)
    reportData.WMAgentJobName = job.get('name', None)

    # Where the job is running
    reportData.pnn = siteConfig.localStageOut.get('phedex-node', 'Unknown')
    reportData.siteName = getattr(siteConfig, 'siteName', 'Unknown')
    reportData.hostName = socket.gethostname()
    reportData.ceName = getSyncCE()

    # TODO: need to check what format it returns and which features
    # need to be extracted. Currently read:
    #   $MACHINEFEATURES/hs06: HS06 score of the host
    #   $MACHINEFEATURES/total_cpu: number of configured job slots
    #   $JOBFEATURES/hs06_job: HS06 score available to your job
    #   $JOBFEATURES/allocated_cpu: number of allocated slots
    #                               (=8 in case of a multicore job)
    machineFeaturesDir = os.environ.get('MACHINEFEATURES')
    reportData.machineFeatures = {}
    if machineFeaturesDir:
        hs06Path = "%s/hs06" % machineFeaturesDir
        reportData.machineFeatures['hs06'] = readFloatFromFile(hs06Path)
        totalCpuPath = "%s/total_cpu" % machineFeaturesDir
        reportData.machineFeatures['total_cpu'] = readFloatFromFile(totalCpuPath)

    jobFeaturesDir = os.environ.get('JOBFEATURES')
    reportData.jobFeatures = {}
    if jobFeaturesDir:
        hs06JobPath = "%s/hs06_job" % jobFeaturesDir
        reportData.jobFeatures['hs06_job'] = readFloatFromFile(hs06JobPath)
        allocatedCpuPath = "%s/allocated_cpu" % jobFeaturesDir
        reportData.jobFeatures['allocated_cpu'] = readFloatFromFile(allocatedCpuPath)

    reportData.completed = False
    initialReport.setTaskName(taskName=job.get('task', 'TaskNotFound'))

    # Not ideal, but the master report has to live way up at the top
    # so that it is still returned if the job fails early
    initialReport.save(os.path.join(os.getcwd(), '../', reportName))
def __call__(self):
    """
    Build and return a report for the current step.

    The report is named after self.step, tagged with the id and task
    taken from self.job, and then filled in by addInputFilesToReport
    and addOutputFilesToReport.
    """
    stepReport = Report.Report(self.step.name())

    # Tag the report with the job it belongs to
    stepReport.id = self.job["id"]
    stepReport.task = self.job["task"]
    stepReport.workload = None

    # Inputs first, then outputs
    self.addInputFilesToReport(stepReport)
    self.addOutputFilesToReport(stepReport)
    return stepReport
def completeTask(self, jobLocation, logLocation):
    """
    _completeTask_

    Merge the reports of every step of the task into a single report.

    The master report found two directories above jobLocation (if any)
    is used as the starting point. Each step's "Report.pkl" is then
    added to it; a step without a report file is recorded as a failed
    step with a ReportManipulatingError. The merged report is marked
    completed and persisted to logLocation.
    """
    import WMCore.FwkJobReport.Report as Report

    mergedReport = Report.Report()

    # The master report was left somewhere way up at the top
    masterPath = os.path.join(jobLocation, '../../', logLocation)
    if os.path.exists(masterPath):
        # A report already exists: load it and append our steps to it
        mergedReport.load(masterPath)

    for stepName in self.listAllStepNames():
        stepReportPath = os.path.join(jobLocation, stepName, "Report.pkl")

        if not os.path.isfile(stepReportPath):
            # Missing report: this should raise an alarm bell
            # TODO: Change error code
            mergedReport.addStep(reportname=stepName, status=1)
            mergedReport.addError(
                stepName=stepName,
                exitCode=99999,
                errorType="ReportManipulatingError",
                errorDetails="Could not find report file for step %s!" % stepName)
            continue

        loadedReport = Report.Report()
        loadedReport.unpersist(stepReportPath, stepName)
        mergedReport.setStep(stepName, loadedReport.retrieveStep(stepName))

    mergedReport.data.completed = True
    mergedReport.persist(logLocation)
def createErrorReport(exitCode, errorType, errorDetails=None,
                      logLocation="Report.0.pkl"):
    """
    _createErrorReport_

    Write a report for a failure that happened inside the Bootstrap.

    The error is attached to a dummy step called 'CRITICAL' and the
    report is saved one directory above the current working directory
    under the name given by logLocation.

    If the site-local configuration cannot be loaded, an error is logged
    and no report is written.
    """
    try:
        siteConfig = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume the missing site config was done on purpose
        # TODO: handle the case of just running tests more gracefully
        logging.error("Couldn't find SiteConfig")
        return

    errorReport = Report.Report()
    reportData = errorReport.data

    # Where the job is running
    stageOut = siteConfig.localStageOut
    reportData.seName = stageOut.get('se-name', socket.gethostname())
    reportData.pnn = siteConfig.localStageOut.get('phedex-node', 'Unknown')
    reportData.siteName = getattr(siteConfig, 'siteName', 'Unknown')
    reportData.hostName = socket.gethostname()
    reportData.ceName = getSyncCE()
    reportData.completed = False

    errorReport.addError(stepName='CRITICAL',
                         exitCode=exitCode,
                         errorType=errorType,
                         errorDetails=errorDetails)

    errorReport.save(os.path.join(os.getcwd(), '../', logLocation))
def periodicUpdate(self):
    """
    Run on the defined intervals.

    Samples the current step's process with the ps and smaps commands.
    If the PSS limit or the wallclock soft timeout is exceeded, a
    "PerformanceError" step is written to the job report (only on the
    first kill attempt) and the process is signalled: SIGUSR2 on the
    first attempt, SIGTERM once the hard timeout has passed, on any
    retry, or if sending the first signal fails.
    """
    exitCodeByReason = {'PSS': 50660, 'Wallclock time': 50664, '': 99999}
    mustKill = False
    hardKill = False
    killReason = ''

    if self.disableStep:
        # Monitoring is switched off for this step
        return
    if self.currentStepName is None:
        # We're between steps
        return
    if self.currentStepSpace is None:
        # Build the step space on first use
        self.currentStepSpace = getStepSpace(self.stepHelper.name())

    pid = getStepPID(self.currentStepSpace, self.currentStepName)
    if pid is None:
        # Without a step PID there is nothing we can do
        return

    # Gather RSS, %CPU and %MEM statistics from ps
    psCommand = self.monitorBase % (pid, pid)
    psStdout, _psStderr, _psRetcode = subprocessAlgos.runCommand(psCommand)
    psFields = psStdout.split()
    if len(psFields) <= 6:
        # Something went wrong in getting the ps data
        psFailure = ("Error when grabbing output from process ps\n"
                     "output = %s\n"
                     "command = %s\n" % (psFields, psCommand))
        logging.error(psFailure)
        return

    # Gather PSS memory statistics from /proc/<pid>/smaps
    smapsCommand = self.pssMemoryCommand % pid
    smapsStdout, _smapsStderr, _smapsRetcode = subprocessAlgos.runCommand(smapsCommand)
    smapsFields = smapsStdout.split()
    if len(smapsFields) != 1:
        # Something went wrong in getting the smaps data
        smapsFailure = ("Error when grabbing output from smaps\n"
                        "output = %s\n"
                        "command = %s\n" % (smapsFields, smapsCommand))
        logging.error(smapsFailure)
        return

    # smaps reports kiloBytes; scale down to megaBytes
    pss = int(smapsFields[0]) // 1000
    logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s",
                 smapsFields[0], psFields[2], psFields[3], psFields[4])

    details = 'Error in CMSSW step %s\n' % self.currentStepName
    details += 'Number of Cores: %s\n' % self.numOfCores

    if self.maxPSS is not None and pss >= self.maxPSS:
        details += "Job has exceeded maxPSS: %s MB\n" % self.maxPSS
        details += "Job has PSS: %s MB\n" % pss
        mustKill = True
        killReason = 'PSS'
    elif self.hardTimeout is not None and self.softTimeout is not None:
        now = time.time()
        if (now - self.startTime) > self.softTimeout:
            mustKill = True
            killReason = 'Wallclock time'
            details += "Job has been running for more than: %s\n" % str(self.softTimeout)
            details += "Job has been running for: %s\n" % str(now - self.startTime)
        if (now - self.startTime) > self.hardTimeout:
            hardKill = True
            details += "Job exceeded soft timeout"

    if not mustKill:
        # The job is behaving well, there is nothing to do
        return

    # Make sure the performance error is persisted only once
    if not self.killRetry:
        logging.error(details)
        jobReport = Report.Report()
        # Find the global report
        globalReportPath = os.path.join(self.currentStepSpace.location,
                                        '../../../',
                                        os.path.basename(self.logPath))
        try:
            if os.path.isfile(globalReportPath):
                # We should be able to find an existing job report.
                # If not, we're in trouble
                logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                jobReport.load(globalReportPath)
            # Use a dedicated step that an exiting CMSSW won't override
            if not jobReport.retrieveStep(step="PerformanceError"):
                jobReport.addStep(reportname="PerformanceError")
            jobReport.addError(stepName="PerformanceError",
                               exitCode=exitCodeByReason[killReason],
                               errorType="PerformanceKill",
                               errorDetails=details)
            jobReport.save(globalReportPath)
        except Exception as ex:
            # We can't write the job report; kill anyway and hope the
            # logging file gets written out
            writeFailure = "Exception while writing out jobReport.\n"
            writeFailure += "Aborting job anyway: unlikely you'll get any error report.\n"
            writeFailure += "Error: %s" % str(ex)
            logging.exception(writeFailure)

    try:
        if hardKill or self.killRetry:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(pid, signal.SIGTERM)
        else:
            logging.error("Attempting to kill step using SIGUSR2")
            os.kill(pid, signal.SIGUSR2)
    except Exception:
        logging.error("Attempting to kill step using SIGTERM")
        os.kill(pid, signal.SIGTERM)
    finally:
        # Any later call goes straight to SIGTERM and skips the report
        self.killRetry = True
def periodicUpdate(self):
    """
    Run on the defined intervals.

    Samples the current step's process with the ps-based monitor command
    and compares the RSS and VSize figures, plus the elapsed wallclock
    time, against the configured limits. When any limit is exceeded, a
    "PerformanceError" step carrying the details is written to the job
    report and the process is signalled: SIGUSR2 normally, SIGTERM once
    the hard timeout has passed or if sending the first signal fails.

    Returns None in every case.
    """
    killProc = False
    killHard = False

    if self.disableStep:
        # Then we aren't doing CPU monitoring
        # on this step
        return

    if self.currentStepName is None:
        # We're between steps
        return

    if self.currentStepSpace is None:
        # Then build the step space
        self.currentStepSpace = getStepSpace(self.stepHelper.name())

    stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

    if stepPID is None:
        # Then we have no step PID, we can do nothing
        return

    # Now we run the monitor command and collate the data
    cmd = self.monitorBase % (stepPID, stepPID)
    stdout, _stderr, _retcode = subprocessAlgos.runCommand(cmd)

    output = stdout.split()
    if not len(output) > 7:
        # Then something went wrong in getting the ps data
        msg = "Error when grabbing output from process ps\n"
        msg += "output = %s\n" % output
        # Report the command that was actually executed
        msg += "command = %s\n" % cmd
        logging.error(msg)
        return

    rss = float(output[2])
    vsize = float(output[3])
    logging.info("Retrieved following performance figures:")
    logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                 output[2], output[3], output[4], output[5])

    msg = 'Error in CMSSW step %s\n' % self.currentStepName
    if self.maxRSS is not None and rss >= self.maxRSS:
        msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
        msg += "Job has RSS: %s\n" % rss
        killProc = True
    if self.maxVSize is not None and vsize >= self.maxVSize:
        msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
        msg += "Job has VSize: %s\n" % vsize
        killProc = True

    # Let's check the running time
    currentTime = time.time()

    if self.hardTimeout is not None and self.softTimeout is not None:
        if (currentTime - self.startTime) > self.softTimeout:
            killProc = True
            msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
            msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
        if (currentTime - self.startTime) > self.hardTimeout:
            killHard = True
            msg += "Job exceeded soft timeout"

    if killProc:
        logging.error(msg)
        report = Report.Report()
        # Find the global report
        logPath = os.path.join(self.currentStepSpace.location,
                               '../../../',
                               os.path.basename(self.logPath))
        try:
            if os.path.isfile(logPath):
                # We should be able to find existant job report.
                # If not, we're in trouble
                logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                report.load(logPath)
            # Create a new step that won't be overridden by an exiting CMSSW
            if not report.retrieveStep(step="PerformanceError"):
                report.addStep(reportname="PerformanceError")
            report.addError(stepName="PerformanceError", exitCode=99900,
                            errorType="PerformanceKill", errorDetails=msg)
            report.save(logPath)
        except Exception as ex:
            # Basically, we can't write a log report and we're hosed
            # Kill anyway, and hope the logging file gets written out
            msg2 = "Exception while writing out jobReport.\n"
            msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
            msg2 += str(ex)
            msg2 += str(traceback.format_exc()) + '\n'
            logging.error(msg2)

        try:
            if not killHard:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            # Sending the first signal failed: fall back to SIGTERM
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
outputModules = ["outputModule1", "outputModule2", "outputModule3", "outputModule4", "outputModule5", "outputModule6", "outputModule7", "outputModule8", "outputModule9", "outputModule10"] runInfo = Run(1) runInfo.lumis.extend([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]) totalReports = 25 inputFilesPerReport = 50 inputFileCounter = 0 for i in range(totalReports): loadTestReport = Report.Report("cmsRun1") loadTestReport.addInputSource("PoolSource") for j in range(inputFilesPerReport): inputFile = loadTestReport.addInputFile("PoolSource", lfn = "input%i" % inputFileCounter, events = 600000, size = 600000) inputFileCounter += 1 Report.addRunInfoToFile(inputFile, runInfo) for outputModule in outputModules: loadTestReport.addOutputModule(outputModule) datasetInfo = {"applicationName": "cmsRun", "applicationVersion": "CMSSW_3_3_5_patch3", "primaryDataset": outputModule, "dataTier": "RAW", "processedDataset": "LoadTest10"} fileAttrs = {"lfn": makeUUID(), "location": "cmssrm.fnal.gov",
def periodicUpdate(self):
    """
    Run on the defined intervals.

    Samples the current step's process with the ps-based monitor command
    and checks, in this order, the RSS limit, the VSize limit and the
    wallclock timeouts; only the first exceeded limit is reported. When a
    limit is exceeded, a "PerformanceError" step carrying the details and
    the matching exit code is written to the job report and the process
    is signalled: SIGUSR2 normally, SIGTERM once the hard timeout has
    passed or if sending the first signal fails.

    Returns None in every case.
    """
    killProc = False
    killHard = False
    reason = ''
    # Exit code recorded in the job report for each kill reason
    errorCodeLookup = {
        'RSS': 50660,
        'VSZ': 50661,
        'Wallclock time': 50664,
        '': 99999
    }

    if self.disableStep:
        # Then we aren't doing CPU monitoring
        # on this step
        return

    if self.currentStepName is None:
        # We're between steps
        return

    if self.currentStepSpace is None:
        # Then build the step space
        self.currentStepSpace = getStepSpace(self.stepHelper.name())

    stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

    if stepPID is None:
        # Then we have no step PID, we can do nothing
        return

    # Now we run the monitor command and collate the data
    cmd = self.monitorBase % (stepPID, stepPID)
    stdout, _stderr, _retcode = subprocessAlgos.runCommand(cmd)

    output = stdout.split()
    if not len(output) > 7:
        # Then something went wrong in getting the ps data
        msg = "Error when grabbing output from process ps\n"
        msg += "output = %s\n" % output
        # Report the command that was actually executed
        msg += "command = %s\n" % cmd
        logging.error(msg)
        return

    # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
    if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
        # then workload value is still in KiB (old way)
        rss = int(output[2])
        vsize = int(output[3])
    else:
        # ps returns data in kiloBytes, let's make it megaBytes
        rss = int(output[2]) // 1000
        vsize = int(output[3]) // 1000

    logging.info("Retrieved following performance figures:")
    logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                 output[2], output[3], output[4], output[5])

    msg = 'Error in CMSSW step %s\n' % self.currentStepName
    msg += 'Number of Cores: %s\n' % self.numOfCores

    if self.maxRSS is not None and rss >= self.maxRSS:
        msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
        msg += "Job has RSS: %s\n" % rss
        killProc = True
        reason = 'RSS'
    elif self.maxVSize is not None and vsize >= self.maxVSize:
        msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
        msg += "Job has VSize: %s\n" % vsize
        killProc = True
        reason = 'VSZ'
    elif self.hardTimeout is not None and self.softTimeout is not None:
        currentTime = time.time()
        if (currentTime - self.startTime) > self.softTimeout:
            killProc = True
            reason = 'Wallclock time'
            msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
            msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
        if (currentTime - self.startTime) > self.hardTimeout:
            killHard = True
            msg += "Job exceeded soft timeout"

    if killProc:
        logging.error(msg)
        report = Report.Report()
        # Find the global report
        logPath = os.path.join(self.currentStepSpace.location,
                               '../../../',
                               os.path.basename(self.logPath))
        try:
            if os.path.isfile(logPath):
                # We should be able to find existant job report.
                # If not, we're in trouble
                logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                report.load(logPath)
            # Create a new step that won't be overridden by an exiting CMSSW
            if not report.retrieveStep(step="PerformanceError"):
                report.addStep(reportname="PerformanceError")
            report.addError(stepName="PerformanceError",
                            exitCode=errorCodeLookup[reason],
                            errorType="PerformanceKill",
                            errorDetails=msg)
            report.save(logPath)
        except Exception as ex:
            # Basically, we can't write a log report and we're hosed
            # Kill anyway, and hope the logging file gets written out
            msg2 = "Exception while writing out jobReport.\n"
            msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
            msg2 += str(ex)
            msg2 += str(traceback.format_exc()) + '\n'
            logging.error(msg2)

        try:
            if not killHard:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            # Sending the first signal failed: fall back to SIGTERM
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)

    return