Example #1
def createInitialReport(job, task, logLocation):
    """
    _createInitialReport_

    Create an initial job report with the base
    information in it.
    """
    try:
        siteCfg = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume that we did this on purpose
        msg = "Couldn't find SiteConfig"
        logging.error(msg)
        # TODO: handle the missing SiteConfig case more gracefully when running tests
        return

    report = Report.Report()

    report.data.WMAgentJobID = job.get('id', None)
    report.data.WMAgentJobName = job.get('name', None)
    report.data.pnn = siteCfg.localStageOut.get('phedex-node', 'Unknown')
    report.data.siteName = getattr(siteCfg, 'siteName', 'Unknown')
    report.data.hostName = socket.gethostname()
    report.data.ceName = getSyncCE()
    report.data.completed = False
    report.setTaskName(taskName=job.get('task', 'TaskNotFound'))

    # Not so fond of this, but we have to put the master
    # report way up at the top so it's returned if the
    # job fails early
    reportPath = os.path.join(os.getcwd(), '../', logLocation)
    report.save(reportPath)

    return
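
For context, a hedged sketch of how this helper might be called from a job bootstrap. The job dictionary keys mirror the get() calls above; the call site itself is an assumption, and note that the task argument is never used by this variant.

job = {'id': 12345, 'name': 'ProdJob-12345', 'task': '/Workflow/Task1'}
createInitialReport(job=job, task=None, logLocation='Report.0.pkl')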
Example #2
def createInitialReport(job, reportName):
    """
    _createInitialReport_

    Create an initial job report with the base
    information in it.
    """
    try:
        siteCfg = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume that we did this on purpose
        msg = "Couldn't find SiteConfig"
        logging.error(msg)
        # TODO: handle the missing SiteConfig case more gracefully when running tests
        return

    report = Report.Report()

    report.data.WMAgentJobID = job.get('id', None)
    report.data.WMAgentJobName = job.get('name', None)
    report.data.pnn = siteCfg.localStageOut.get('phedex-node', 'Unknown')
    report.data.siteName = getattr(siteCfg, 'siteName', 'Unknown')
    report.data.hostName = socket.gethostname()
    report.data.ceName = getSyncCE()

    # TODO: check what format these files use and which values we need to
    # extract. Currently:
    # $MACHINEFEATURES/hs06: HS06 score of the host
    # $MACHINEFEATURES/total_cpu: number of configured job slots
    # $JOBFEATURES/hs06_job: HS06 score available to this job
    # $JOBFEATURES/allocated_cpu: number of allocated slots (e.g. 8 for a multicore job)

    machineFeaturesFile = os.environ.get('MACHINEFEATURES')
    report.data.machineFeatures = {}
    if machineFeaturesFile:
        report.data.machineFeatures['hs06'] = readFloatFromFile(
            "%s/hs06" % machineFeaturesFile)
        report.data.machineFeatures['total_cpu'] = readFloatFromFile(
            "%s/total_cpu" % machineFeaturesFile)

    jobFeaturesFile = os.environ.get('JOBFEATURES')
    report.data.jobFeatures = {}
    if jobFeaturesFile:
        report.data.jobFeatures['hs06_job'] = readFloatFromFile(
            "%s/hs06_job" % jobFeaturesFile)
        report.data.jobFeatures['allocated_cpu'] = readFloatFromFile(
            "%s/allocated_cpu" % jobFeaturesFile)

    report.data.completed = False
    report.setTaskName(taskName=job.get('task', 'TaskNotFound'))

    # Not so fond of this, but we have to put the master
    # report way up at the top so it's returned if the
    # job fails early
    reportPath = os.path.join(os.getcwd(), '../', reportName)
    report.save(reportPath)

    return
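
readFloatFromFile is not defined in these snippets. A minimal sketch of what such a helper could look like, assuming it should tolerate missing or malformed feature files (this is not the actual WMCore implementation):

def readFloatFromFile(filePath):
    # Return the file's contents as a float, or None if the file is
    # missing or does not parse; machine/job feature files are optional.
    try:
        with open(filePath) as fileObj:
            return float(fileObj.read().strip())
    except (IOError, OSError, ValueError):
        return None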
Example #3
    def __call__(self):
        report = Report.Report(self.step.name())

        report.id = self.job["id"]
        report.task = self.job["task"]
        report.workload = None

        self.addInputFilesToReport(report)
        self.addOutputFilesToReport(report)
        return report
Example #4
    def completeTask(self, jobLocation, logLocation):
        """
        _completeTask_

        Combine all the logs from all the steps in the task to a single log

        If necessary, output to Dashboard
        """
        import WMCore.FwkJobReport.Report as Report

        finalReport = Report.Report()
        # We left the master report somewhere way up at the top
        testPath = os.path.join(jobLocation, '../../', logLocation)
        if os.path.exists(testPath):
            # If a report already exists, we load it and
            # append our steps to it
            finalReport.load(testPath)
        taskSteps = self.listAllStepNames()
        for taskStep in taskSteps:
            reportPath = os.path.join(jobLocation, taskStep, "Report.pkl")
            if os.path.isfile(reportPath):
                stepReport = Report.Report()
                stepReport.unpersist(reportPath, taskStep)
                finalReport.setStep(taskStep,
                                    stepReport.retrieveStep(taskStep))
            else:
                # Then we have a missing report
                # This should raise an alarm bell, as per Steve's request
                # TODO: Change error code
                finalReport.addStep(reportname=taskStep, status=1)
                finalReport.addError(
                    stepName=taskStep,
                    exitCode=99999,
                    errorType="ReportManipulatingError",
                    errorDetails="Could not find report file for step %s!" %
                    taskStep)

        finalReport.data.completed = True
        finalReport.persist(logLocation)

        return
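
For reference, a minimal round trip using only the Report calls that appear in these examples: persist a step report, read it back with unpersist, and graft the step onto a master report. The step name "cmsRun1" and the file name are illustrative.

import WMCore.FwkJobReport.Report as Report

stepReport = Report.Report("cmsRun1")
stepReport.persist("Report.pkl")           # pickle the step report to disk

masterReport = Report.Report()
loaded = Report.Report()
loaded.unpersist("Report.pkl", "cmsRun1")  # load it back for that one step
masterReport.setStep("cmsRun1", loaded.retrieveStep("cmsRun1"))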
Example #5
def createErrorReport(exitCode,
                      errorType,
                      errorDetails=None,
                      logLocation="Report.0.pkl"):
    """
    _createErrorReport_

    Create a report if something fails inside the Bootstrap
    This creates a dummy step called 'CRITICAL' and
    sticks the error in there.
    """

    try:
        siteCfg = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume that we did this on purpose
        msg = "Couldn't find SiteConfig"
        logging.error(msg)
        # TODO: handle the missing SiteConfig case more gracefully when running tests
        return
    report = Report.Report()

    report.data.seName = siteCfg.localStageOut.get('se-name',
                                                   socket.gethostname())
    report.data.pnn = siteCfg.localStageOut.get('phedex-node', 'Unknown')
    report.data.siteName = getattr(siteCfg, 'siteName', 'Unknown')
    report.data.hostName = socket.gethostname()
    report.data.ceName = getSyncCE()
    report.data.completed = False

    report.addError(stepName='CRITICAL',
                    exitCode=exitCode,
                    errorType=errorType,
                    errorDetails=errorDetails)

    reportPath = os.path.join(os.getcwd(), '../', logLocation)
    report.save(reportPath)

    return
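
A hedged sketch of the intended call site: the Bootstrap wraps its setup in a try/except and falls back to createErrorReport so the agent still receives a pickled report. setupJob is hypothetical, and 99999 is simply the generic exit code these examples use elsewhere.

try:
    job = setupJob()  # hypothetical bootstrap step that may fail
except Exception as ex:
    createErrorReport(exitCode=99999,
                      errorType="BootstrapError",
                      errorDetails=str(ex))
    raise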
Example #6
    def periodicUpdate(self):
        """
        Run on the defined intervals.

        """
        killProc = False
        killHard = False
        reason = ''
        errorCodeLookup = {'PSS': 50660,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the ps monitor command and collate the data
        # Gathers RSS, %CPU and %MEM statistics from ps
        ps_cmd = self.monitorBase % (stepPID, stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(ps_cmd)

        ps_output = stdout.split()
        if len(ps_output) <= 6:
            # Then something went wrong in getting the ps data
            msg = "Error grabbing output from the ps command\n"
            msg += "output = %s\n" % ps_output
            msg += "command = %s\n" % ps_cmd
            logging.error(msg)
            return

        # run the command to gather PSS memory statistics from /proc/<pid>/smaps
        smaps_cmd = self.pssMemoryCommand % (stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(smaps_cmd)

        smaps_output = stdout.split()
        if len(smaps_output) != 1:
            # Then something went wrong in getting the smaps data
            msg = "Error grabbing output from smaps\n"
            msg += "output = %s\n" % smaps_output
            msg += "command = %s\n" % smaps_cmd
            logging.error(msg)
            return

        # smaps reports sizes in KiB; scale down to approximate MB
        pss = int(smaps_output[0]) // 1000

        logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s", smaps_output[0], ps_output[2], ps_output[3], ps_output[4])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxPSS is not None and pss >= self.maxPSS:
            msg += "Job has exceeded maxPSS: %s MB\n" % self.maxPSS
            msg += "Job has PSS: %s MB\n" % pss
            killProc = True
            reason = 'PSS'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if not killProc:
            # then job is behaving well, there is nothing to do
            return

        # make sure we persist the performance error only once
        if not self.killRetry:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find an existing job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existing job report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += "Error: %s" % str(ex)
                logging.exception(msg2)

        try:
            if not killHard and not self.killRetry:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
        finally:
            self.killRetry = True

        return
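
monitorBase and pssMemoryCommand are defined elsewhere in the class. Plausible shapes for them, consistent with the parsing above (two PID slots for ps, and a single PSS total in KiB summed from smaps), are sketched below; these are assumptions, not the actual WMCore definitions.

# Assumed templates, not the real WMCore attributes:
monitorBase = "ps -p %s -o pid=,ppid=,rss=,pcpu=,pmem=,cmd= -ww | grep %s"
pssMemoryCommand = "awk '/^Pss/ {pss += $2}; END {print pss}' /proc/%s/smaps"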
Example #7
    def periodicUpdate(self):
        """
        Run on the defined intervals.

        """
        killProc = False
        killHard = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if len(output) <= 7:
            # Then something went wrong in getting the ps data
            msg = "Error grabbing output from the ps command\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % cmd
            logging.error(msg)
            return
        rss = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s;  VSize: %s; PCPU: %s; PMEM: %s" %
                     (output[2], output[3], output[4], output[5]))

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True

        #Let's check the running time
        currentTime = time.time()

        if self.hardTimeout is not None and self.softTimeout is not None:
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                msg += "Job has been running for more than: %s\n" % str(
                    self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime -
                                                              self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location, '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find an existing job report.
                    # If not, we're in trouble
                    logging.debug(
                        "Found pre-existing job report in PerformanceMonitor termination."
                    )
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=99900,
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
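
Both variants first send SIGUSR2, which assumes the monitored step traps it for a graceful shutdown before the SIGTERM fallback. A minimal, purely illustrative handler on the step side might look like:

import signal
import sys

def gracefulShutdown(signum, frame):
    # Flush state and write out a final job report here, so the
    # watchdog's soft kill still leaves usable bookkeeping behind.
    sys.exit(70)  # illustrative exit code, not from the source

signal.signal(signal.SIGUSR2, gracefulShutdown)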
Example #8
outputModules = ["outputModule1", "outputModule2", "outputModule3",
                 "outputModule4", "outputModule5", "outputModule6",
                 "outputModule7", "outputModule8", "outputModule9",
                 "outputModule10"]

runInfo = Run(1)
runInfo.lumis.extend([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                      25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
                      39, 40])

totalReports = 25
inputFilesPerReport = 50

inputFileCounter = 0
for i in range(totalReports):
    loadTestReport = Report.Report("cmsRun1")
    loadTestReport.addInputSource("PoolSource")

    for j in range(inputFilesPerReport):
        inputFile = loadTestReport.addInputFile("PoolSource", lfn="input%i" % inputFileCounter,
                                                events=600000, size=600000)
        inputFileCounter += 1

    Report.addRunInfoToFile(inputFile, runInfo)

    for outputModule in outputModules:
        loadTestReport.addOutputModule(outputModule)
        datasetInfo = {"applicationName": "cmsRun", "applicationVersion": "CMSSW_3_3_5_patch3",
                       "primaryDataset": outputModule, "dataTier": "RAW",
                       "processedDataset": "LoadTest10"}
        fileAttrs = {"lfn": makeUUID(), "location": "cmssrm.fnal.gov",
Example #9
    def periodicUpdate(self):
        """
        Run on the defined intervals.

        """
        killProc = False
        killHard = False
        reason = ''
        errorCodeLookup = {
            'RSS': 50660,
            'VSZ': 50661,
            'Wallclock time': 50664,
            '': 99999
        }

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if len(output) <= 7:
            # Then something went wrong in getting the ps data
            msg = "Error grabbing output from the ps command\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % cmd
            logging.error(msg)
            return
        # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
        if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
            # then the workload value is still in KiB (the old convention)
            rss = int(output[2])
            vsize = int(output[3])
        else:
            # ps reports sizes in KiB; scale down to approximate MB
            rss = int(output[2]) // 1000
            vsize = int(output[3]) // 1000
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s;  VSize: %s; PCPU: %s; PMEM: %s", output[2],
                     output[3], output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
            reason = 'RSS'
        elif self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True
            reason = 'VSZ'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(
                    self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime -
                                                              self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location, '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find an existing job report.
                    # If not, we're in trouble
                    logging.debug(
                        "Found pre-existing job report in PerformanceMonitor termination."
                    )
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return
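
The KiB-versus-MB bookkeeping above reduces to a single conversion; a small helper that makes the deliberately loose scaling explicit, as an illustration rather than WMCore code:

def kibToApproxMB(kib):
    # ps and /proc/<pid>/smaps report sizes in KiB while the thresholds
    # are in MB; dividing by 1000 is close enough for a watchdog, and
    # understates the true MB value by only about 2.3%.
    return int(kib) // 1000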