Example #1
0
def inputFileHandler(targets):
    """
    _inputFileHandler_

    coroutine to create input files in the report and dispatch
    sub data down the pipeline

    """
    while True:
        report, node = (yield)
        moduleName = None
        moduleNode = [x for x in node.children if x.name == "ModuleLabel"][0]
        moduleName = moduleNode.text

        fileRef = report.addInputFile(moduleName)
        fileAttrs = {}
        for subnode in node.children:
            if subnode.name == "Runs":
                targets['Runs'].send((fileRef, subnode))
            elif subnode.name == "Branches":
                targets['Branches'].send((fileRef, subnode))
            else:
                fileAttrs[subnode.name] = subnode.text

        Report.addAttributesToFile(fileRef, lfn=fileAttrs["LFN"],
                                   pfn=fileAttrs["PFN"], catalog=fileAttrs["Catalog"],
                                   module_label=fileAttrs["ModuleLabel"],
                                   guid=fileAttrs["GUID"], input_type=fileAttrs["InputType"],
                                   input_source_class=fileAttrs["InputSourceClass"],
                                   events=int(fileAttrs["EventsRead"]))
Example #2
0
def inputFileHandler(targets):
    """
    _inputFileHandler_

    coroutine to create input files in the report and dispatch
    sub data down the pipeline

    """
    while True:
        report, node = (yield)
        moduleName = None
        moduleNode = [ x for x in node.children if x.name == "ModuleLabel"][0]
        moduleName = moduleNode.text

        moduleRef = report.addInputSource(moduleName)
        fileRef = report.addInputFile(moduleName)
        fileAttrs = {}
        for subnode in node.children:
            if subnode.name == "Runs":
                targets['Runs'].send( (fileRef, subnode) )
            elif subnode.name == "Branches":
                targets['Branches'].send( (fileRef, subnode) )
            else:
                fileAttrs[subnode.name] = subnode.text

        Report.addAttributesToFile(fileRef, lfn = fileAttrs["LFN"],
                                   pfn = fileAttrs["PFN"], catalog = fileAttrs["Catalog"],
                                   module_label = fileAttrs["ModuleLabel"],
                                   guid = fileAttrs["GUID"], input_type = fileAttrs["InputType"],
                                   input_source_class = fileAttrs["InputSourceClass"],
                                   events = int(fileAttrs["EventsRead"]))

        [fileRef]
Example #3
0
def runHandler():
    """
    _runHandler_

    Sink to add run information to a file.  Given the following XML:
      <Runs>
      <Run ID="122023">
        <LumiSection ID="215"/>
        <LumiSection ID="216"/>
      </Run>
      <Run ID="122024">
        <LumiSection ID="1"/>
        <LumiSection ID="2"/>
      </Run>    
      </Runs>

    Create a WMCore.DataStructs.Run object for each run and call the
    addRunInfoToFile() function to add the run information to the file
    section.
    """
    while True:
        fileSection, node = (yield)
        for subnode in node.children:
            if subnode.name == "Run":
                runId = subnode.attrs.get("ID", None)
                if runId == None:
                    continue

                lumis = [int(lumi.attrs["ID"]) for lumi in subnode.children if lumi.attrs.has_key("ID")]

                runInfo = Run(runNumber=runId)
                runInfo.lumis.extend(lumis)

                Report.addRunInfoToFile(fileSection, runInfo)
Example #4
0
def runHandler():
    """
    _runHandler_

    Sink to add run information to a file.  Given the following XML:
      <Runs>
      <Run ID="122023">
        <LumiSection ID="215"/>
        <LumiSection ID="216"/>
      </Run>
      <Run ID="122024">
        <LumiSection ID="1"/>
        <LumiSection ID="2"/>
      </Run>
      </Runs>

    Create a WMCore.DataStructs.Run object for each run and call the
    addRunInfoToFile() function to add the run information to the file
    section.
    """
    while True:
        fileSection, node = (yield)
        for subnode in node.children:
            if subnode.name == "Run":
                runId = subnode.attrs.get("ID", None)
                if runId == None: continue

                lumis = [ int(lumi.attrs['ID'])
                          for lumi in subnode.children
                          if lumi.attrs.has_key("ID")]

                runInfo = Run(runNumber = runId)
                runInfo.lumis.extend(lumis)

                Report.addRunInfoToFile(fileSection, runInfo)
Example #5
0
def fileHandler(targets):
    """
    _fileHandler_

    coroutine to create files and handle sub data in the appropriate
    dispatchers

    """
    while True:
        report, node = (yield)
        moduleName = None
        moduleNode = [x for x in node.children if x.name == "ModuleLabel"][0]
        moduleName = moduleNode.text

        fileRef = report.addOutputFile(moduleName)
        fileAttrs = {}
        for subnode in node.children:
            if subnode.name == "Inputs":
                targets['Inputs'].send((fileRef, subnode))
            elif subnode.name == "Runs":
                targets['Runs'].send((fileRef, subnode))
            elif subnode.name == "Branches":
                targets['Branches'].send((fileRef, subnode))
            else:
                fileAttrs[subnode.name] = subnode.text

        Report.addAttributesToFile(fileRef, lfn=fileAttrs["LFN"],
                                   pfn=fileAttrs["PFN"], catalog=fileAttrs["Catalog"],
                                   module_label=fileAttrs["ModuleLabel"],
                                   guid=fileAttrs["GUID"],
                                   output_module_class=fileAttrs["OutputModuleClass"],
                                   events=int(fileAttrs["TotalEvents"]),
                                   branch_hash=fileAttrs["BranchHash"])
Example #6
0
    def addInputFilesToReport(self, report):
        """
        _addInputFilesToReport_

        Pull all of the input files out of the job and add them to the report.
        """
        report.addInputSource("PoolSource")

        for inputFile in self.job["input_files"]:
            inputFileSection = report.addInputFile("PoolSource", lfn=inputFile["lfn"],
                                                   size=inputFile["size"],
                                                   events=inputFile["events"])
            Report.addRunInfoToFile(inputFileSection, inputFile["runs"])

        return
Example #7
0
def createInitialReport(job, task, logLocation):
    """
    _createInitialReport_

    Create an initial job report with the base
    information in it.
    """
    try:
        siteCfg = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume that we did this on purpose
        msg = "Couldn't find SiteConfig"
        logging.error(msg)
        #TODO: Make less goatballs for testing purposes
        return

    report = Report.Report()

    report.data.WMAgentJobID = job.get('id', None)
    report.data.WMAgentJobName = job.get('name', None)
    report.data.pnn = siteCfg.localStageOut.get('phedex-node', 'Unknown')
    report.data.siteName = getattr(siteCfg, 'siteName', 'Unknown')
    report.data.hostName = socket.gethostname()
    report.data.ceName = getSyncCE()
    report.data.completed = False
    report.setTaskName(taskName=job.get('task', 'TaskNotFound'))

    # Not so fond of this, but we have to put the master
    # report way up at the top so it's returned if the
    # job fails early
    reportPath = os.path.join(os.getcwd(), '../', logLocation)
    report.save(reportPath)

    return
Example #8
0
    def addInputFilesToReport(self, report):
        """
        _addInputFilesToReport_

        Pull all of the input files out of the job and add them to the report.
        """
        report.addInputSource("PoolSource")

        for inputFile in self.job["input_files"]:
            inputFileSection = report.addInputFile("PoolSource",
                                                   lfn=inputFile["lfn"],
                                                   size=inputFile["size"],
                                                   events=inputFile["events"])
            Report.addRunInfoToFile(inputFileSection, inputFile["runs"])

        return
Example #9
0
def createInitialReport(job, reportName):
    """
    _createInitialReport_

    Create an initial job report with the base
    information in it.
    """
    try:
        siteCfg = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume that we did this on purpose
        msg = "Couldn't find SiteConfig"
        logging.error(msg)
        # TODO: Make less goatballs for testing purposes
        return

    report = Report.Report()

    report.data.WMAgentJobID = job.get('id', None)
    report.data.WMAgentJobName = job.get('name', None)
    report.data.pnn = siteCfg.localStageOut.get('phedex-node', 'Unknown')
    report.data.siteName = getattr(siteCfg, 'siteName', 'Unknown')
    report.data.hostName = socket.gethostname()
    report.data.ceName = getSyncCE()

    # TODO: need to check what format it returns and what features need to extract.
    # currently
    # $MACHINEFEATURES/hs06: HS06 score of the host
    # $MACHINEFEATURES/total_cpu: number of configured job slots
    # $JOBFEATURES/hs06_job: HS06 score available to your job
    # $JOBFEATURES/allocated_cpu: number of allocated slots (=8 in case of a multicore job

    machineFeaturesFile = os.environ.get('MACHINEFEATURES')
    report.data.machineFeatures = {}
    if machineFeaturesFile:
        report.data.machineFeatures['hs06'] = readFloatFromFile(
            "%s/hs06" % machineFeaturesFile)
        report.data.machineFeatures['total_cpu'] = readFloatFromFile(
            "%s/total_cpu" % machineFeaturesFile)

    jobFeaturesFile = os.environ.get('JOBFEATURES')
    report.data.jobFeatures = {}
    if jobFeaturesFile:
        report.data.jobFeatures['hs06_job'] = readFloatFromFile(
            "%s/hs06_job" % jobFeaturesFile)
        report.data.jobFeatures['allocated_cpu'] = readFloatFromFile(
            "%s/allocated_cpu" % jobFeaturesFile)

    report.data.completed = False
    report.setTaskName(taskName=job.get('task', 'TaskNotFound'))

    # Not so fond of this, but we have to put the master
    # report way up at the top so it's returned if the
    # job fails early
    reportPath = os.path.join(os.getcwd(), '../', reportName)
    report.save(reportPath)

    return
Example #10
0
    def addOutputFilesToReport(self, report):
        """
        _addOutputFilesToReport_

        Add output files to every output module in the step.  Scale the size
        and number of events in the output files appropriately.
        """
        (outputSize, outputEvents) = self.determineOutputSize()

        if not os.path.exists('ReportEmuTestFile.txt'):
            f = open('ReportEmuTestFile.txt', 'w')
            f.write('A Shubbery')
            f.close()

        for outputModuleName in self.step.listOutputModules():
            outputModuleSection = self.step.getOutputModule(outputModuleName)
            outputModuleSection.fixedLFN = False
            outputModuleSection.disableGUID = False

            outputLFN = "%s/%s.root" % (outputModuleSection.lfnBase,
                                        str(makeUUID()))
            outputFile = File(lfn=outputLFN,
                              size=outputSize,
                              events=outputEvents,
                              merged=False)
            outputFile.setLocation(self.job["location"])
            outputFile['pfn'] = "ReportEmuTestFile.txt"
            outputFile['guid'] = "ThisIsGUID"
            outputFile["checksums"] = {"adler32": "1234", "cksum": "5678"}
            outputFile["dataset"] = {
                "primaryDataset": outputModuleSection.primaryDataset,
                "processedDataset": outputModuleSection.processedDataset,
                "dataTier": outputModuleSection.dataTier,
                "applicationName": "cmsRun",
                "applicationVersion": self.step.getCMSSWVersion()
            }
            outputFile["module_label"] = outputModuleName

            outputFileSection = report.addOutputFile(outputModuleName,
                                                     outputFile)
            for inputFile in self.job["input_files"]:
                Report.addRunInfoToFile(outputFileSection, inputFile["runs"])

        return
Example #11
0
    def __call__(self):
        report = Report.Report(self.step.name())

        report.id = self.job["id"]
        report.task = self.job["task"]
        report.workload = None

        self.addInputFilesToReport(report)
        self.addOutputFilesToReport(report)
        return report
Example #12
0
def runHandler():
    """
    _runHandler_

    Sink to add run information to a file.  Given the following XML:
      <Runs>
      <Run ID="122023">
        <LumiSection NEvents="100" ID="215"/>
        <LumiSection NEvents="100" ID="216"/>
      </Run>
      <Run ID="122024">
        <LumiSection ID="1"/>
        <LumiSection ID="2"/>
      </Run>
      </Runs>

    Create a WMCore.DataStructs.Run object for each run and call the
    addRunInfoToFile() function to add the run information to the file
    section.
    """
    while True:
        fileSection, node = (yield)
        for subnode in node.children:
            if subnode.name == "Run":
                runId = subnode.attrs.get("ID", None)
                if runId is None:
                    continue

                lumis = []
                for lumi in subnode.children:
                    if "ID" in lumi.attrs:
                        lumiNumber = int(lumi.attrs['ID'])
                        nEvents = lumi.attrs.get("NEvents", None)
                        if nEvents is not None:
                            try:
                                nEvents = int(nEvents)
                            except ValueError:
                                nEvents = None
                        lumis.append((lumiNumber, nEvents))
                runInfo = Run(runNumber=runId)
                runInfo.extendLumis(lumis)

                Report.addRunInfoToFile(fileSection, runInfo)
Example #13
0
def runHandler():
    """
    _runHandler_

    Sink to add run information to a file.  Given the following XML:
      <Runs>
      <Run ID="122023">
        <LumiSection NEvents="100" ID="215"/>
        <LumiSection NEvents="100" ID="216"/>
      </Run>
      <Run ID="122024">
        <LumiSection ID="1"/>
        <LumiSection ID="2"/>
      </Run>
      </Runs>

    Create a WMCore.DataStructs.Run object for each run and call the
    addRunInfoToFile() function to add the run information to the file
    section.
    """
    while True:
        fileSection, node = (yield)
        for subnode in node.children:
            if subnode.name == "Run":
                runId = subnode.attrs.get("ID", None)
                if runId is None:
                    continue

                lumis = []
                for lumi in subnode.children:
                    if "ID" in lumi.attrs:
                        lumiNumber = int(lumi.attrs['ID'])
                        nEvents = lumi.attrs.get("NEvents", None)
                        if nEvents is not None:
                            try:
                                nEvents = int(nEvents)
                            except ValueError:
                                nEvents = None
                        lumis.append((lumiNumber, nEvents))
                runInfo = Run(runNumber=runId)
                runInfo.extendLumis(lumis)

                Report.addRunInfoToFile(fileSection, runInfo)
Example #14
0
def inputAssocHandler():
    """
    _inputAssocHandler_

    Sink to handle output:input association information.  Given the following
    XML:
      <Input>
        <LFN>/path/to/some/lfn.root</LFN>
        <PFN>/some/pfn/info/path/to/some/lfn.root</PFN>
      </Input>  

    Extract the LFN and call the addInputToFile() function to associate input to
    output in the FWJR.
    """
    while True:
        fileSection, node = (yield)
        for inputnode in node.children:
            data = {}
            [data.__setitem__(subnode.name, subnode.text) for subnode in inputnode.children]
            Report.addInputToFile(fileSection, data["LFN"], data["PFN"])
Example #15
0
    def completeTask(self, jobLocation, logLocation):
        """
        _completeTask_

        Combine all the logs from all the steps in the task to a single log

        If necessary, output to Dashboard
        """
        import WMCore.FwkJobReport.Report as Report

        finalReport = Report.Report()
        # We left the master report somewhere way up at the top
        testPath = os.path.join(jobLocation, '../../', logLocation)
        if os.path.exists(testPath):
            # If a report already exists, we load it and
            # append our steps to it
            finalReport.load(testPath)
        taskSteps = self.listAllStepNames()
        for taskStep in taskSteps:
            reportPath = os.path.join(jobLocation, taskStep, "Report.pkl")
            if os.path.isfile(reportPath):
                stepReport = Report.Report()
                stepReport.unpersist(reportPath, taskStep)
                finalReport.setStep(taskStep,
                                    stepReport.retrieveStep(taskStep))
            else:
                # Then we have a missing report
                # This should raise an alarm bell, as per Steve's request
                # TODO: Change error code
                finalReport.addStep(reportname=taskStep, status=1)
                finalReport.addError(
                    stepName=taskStep,
                    exitCode=99999,
                    errorType="ReportManipulatingError",
                    errorDetails="Could not find report file for step %s!" %
                    taskStep)

        finalReport.data.completed = True
        finalReport.persist(logLocation)

        return
Example #16
0
def inputAssocHandler():
    """
    _inputAssocHandler_

    Sink to handle output:input association information.  Given the following
    XML:
      <Input>
        <LFN>/path/to/some/lfn.root</LFN>
        <PFN>/some/pfn/info/path/to/some/lfn.root</PFN>
      </Input>

    Extract the LFN and call the addInputToFile() function to associate input to
    output in the FWJR.
    """
    while True:
        fileSection, node = (yield)
        for inputnode in node.children:
            data = {}
            for subnode in inputnode.children:
                data.__setitem__(subnode.name, subnode.text)
            Report.addInputToFile(fileSection, data["LFN"], data['PFN'])
Example #17
0
def fileHandler(targets):
    """
    _fileHandler_

    coroutine to create files and handle sub data in the appropriate
    dispatchers

    """
    while True:
        report, node = (yield)
        moduleName = None
        moduleNode = [x for x in node.children if x.name == "ModuleLabel"][0]
        moduleName = moduleNode.text

        moduleRef = report.addOutputModule(moduleName)
        fileRef = report.addOutputFile(moduleName)
        fileAttrs = {}
        for subnode in node.children:
            if subnode.name == "Inputs":
                targets["Inputs"].send((fileRef, subnode))
            elif subnode.name == "Runs":
                targets["Runs"].send((fileRef, subnode))
            elif subnode.name == "Branches":
                targets["Branches"].send((fileRef, subnode))
            else:
                fileAttrs[subnode.name] = subnode.text

        Report.addAttributesToFile(
            fileRef,
            lfn=fileAttrs["LFN"],
            pfn=fileAttrs["PFN"],
            catalog=fileAttrs["Catalog"],
            module_label=fileAttrs["ModuleLabel"],
            guid=fileAttrs["GUID"],
            ouput_module_class=fileAttrs["OutputModuleClass"],
            events=int(fileAttrs["TotalEvents"]),
            branch_hash=fileAttrs["BranchHash"],
        )

        [fileRef]
Example #18
0
    def addOutputFilesToReport(self, report):
        """
        _addOutputFilesToReport_

        Add output files to every output module in the step.  Scale the size
        and number of events in the output files appropriately.
        """
        (outputSize, outputEvents) = self.determineOutputSize()

        if not os.path.exists('ReportEmuTestFile.txt'):
            f = open('ReportEmuTestFile.txt', 'w')
            f.write('A Shubbery')
            f.close()

        for outputModuleName in self.step.listOutputModules():
            outputModuleSection = self.step.getOutputModule(outputModuleName)
            outputModuleSection.fixedLFN    = False
            outputModuleSection.disableGUID = False

            outputLFN = "%s/%s.root" % (outputModuleSection.lfnBase,
                                        str(makeUUID()))
            outputFile = File(lfn = outputLFN, size = outputSize, events = outputEvents,
                              merged = False)
            outputFile.setLocation(self.job["location"])
            outputFile['pfn'] = "ReportEmuTestFile.txt"
            outputFile['guid'] = "ThisIsGUID"
            outputFile["checksums"] = {"adler32": "1234", "cksum": "5678"}
            outputFile["dataset"] = {"primaryDataset": outputModuleSection.primaryDataset,
                                     "processedDataset": outputModuleSection.processedDataset,
                                     "dataTier": outputModuleSection.dataTier,
                                     "applicationName": "cmsRun",
                                     "applicationVersion": self.step.getCMSSWVersion()}
            outputFile["module_label"] = outputModuleName

            outputFileSection = report.addOutputFile(outputModuleName, outputFile)
            for inputFile in self.job["input_files"]:
                Report.addRunInfoToFile(outputFileSection, inputFile["runs"])

        return
Example #19
0
def createErrorReport(exitCode,
                      errorType,
                      errorDetails=None,
                      logLocation="Report.0.pkl"):
    """
    _createErrorReport_

    Create a report if something fails inside the Bootstrap
    This creates a dummy step called 'CRITICAL' and
    sticks the error in there.
    """

    try:
        siteCfg = loadSiteLocalConfig()
    except SiteConfigError:
        # For now, assume that we did this on purpose
        msg = "Couldn't find SiteConfig"
        logging.error(msg)
        #TODO: Make this not suck goatballs when you are just running tests
        return
    report = Report.Report()

    report.data.seName = siteCfg.localStageOut.get('se-name',
                                                   socket.gethostname())
    report.data.pnn = siteCfg.localStageOut.get('phedex-node', 'Unknown')
    report.data.siteName = getattr(siteCfg, 'siteName', 'Unknown')
    report.data.hostName = socket.gethostname()
    report.data.ceName = getSyncCE()
    report.data.completed = False

    report.addError(stepName='CRITICAL',
                    exitCode=exitCode,
                    errorType=errorType,
                    errorDetails=errorDetails)

    reportPath = os.path.join(os.getcwd(), '../', logLocation)
    report.save(reportPath)

    return
Example #20
0
    def periodicUpdate(self):
        """
        Run on the defined intervals.

        """
        killProc = False
        killHard = False
        reason = ''
        errorCodeLookup = {'PSS': 50660,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the ps monitor command and collate the data
        # Gathers RSS, %CPU and %MEM statistics from ps
        ps_cmd = self.monitorBase % (stepPID, stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(ps_cmd)

        ps_output = stdout.split()
        if not len(ps_output) > 6:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % ps_output
            msg += "command = %s\n" % ps_cmd
            logging.error(msg)
            return

        # run the command to gather PSS memory statistics from /proc/<pid>/smaps
        smaps_cmd = self.pssMemoryCommand % (stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(smaps_cmd)

        smaps_output = stdout.split()
        if not len(smaps_output) == 1:
            # Then something went wrong in getting the smaps data
            msg = "Error when grabbing output from smaps\n"
            msg += "output = %s\n" % smaps_output
            msg += "command = %s\n" % smaps_cmd
            logging.error(msg)
            return

        # smaps also returns data in kiloBytes, let's make it megaBytes
        # I'm also confused with these megabytes and mebibytes...
        pss = int(smaps_output[0]) // 1000

        logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s", smaps_output[0], ps_output[2], ps_output[3], ps_output[4])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxPSS is not None and pss >= self.maxPSS:
            msg += "Job has exceeded maxPSS: %s MB\n" % self.maxPSS
            msg += "Job has PSS: %s MB\n" % pss
            killProc = True
            reason = 'PSS'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if not killProc:
            # then job is behaving well, there is nothing to do
            return

        # make sure we persist the performance error only once
        if not self.killRetry:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += "Error: %s" % str(ex)
                logging.exception(msg2)

        try:
            if not killHard and not self.killRetry:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
        finally:
            self.killRetry = True

        return
Example #21
0
outputModules = ["outputModule1", "outputModule2", "outputModule3",
                 "outputModule4", "outputModule5", "outputModule6",
                 "outputModule7", "outputModule8", "outputModule9",
                 "outputModule10"]

runInfo = Run(1)
runInfo.lumis.extend([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                      25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
                      39, 40])

totalReports = 25
inputFilesPerReport = 50

inputFileCounter = 0
for i in range(totalReports):
    loadTestReport = Report.Report("cmsRun1")
    loadTestReport.addInputSource("PoolSource")

    for j in range(inputFilesPerReport):
        inputFile = loadTestReport.addInputFile("PoolSource", lfn = "input%i" % inputFileCounter,
                                                events = 600000, size = 600000)
        inputFileCounter += 1

    Report.addRunInfoToFile(inputFile, runInfo)

    for outputModule in outputModules:
        loadTestReport.addOutputModule(outputModule)
        datasetInfo = {"applicationName": "cmsRun", "applicationVersion": "CMSSW_3_3_5_patch3",
                       "primaryDataset": outputModule, "dataTier": "RAW",
                       "processedDataset": "LoadTest10"}
        fileAttrs = {"lfn": makeUUID(), "location": "cmssrm.fnal.gov",
Example #22
0
    def periodicUpdate(self):
        """
        Run on the defined intervals.

        """
        killProc = False
        killHard = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName == None:
            # We're between steps
            return

        if self.currentStepSpace == None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID == None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % self.monitorCommand
            logging.error(msg)
            return
        rss = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s;  VSize: %s; PCPU: %s; PMEM: %s" %
                     (output[2], output[3], output[4], output[5]))

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS != None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize != None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True

        #Let's check the running time
        currentTime = time.time()

        if self.hardTimeout != None and self.softTimeout != None:
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                msg += "Job has been running for more than: %s\n" % str(
                    self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime -
                                                              self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location, '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug(
                        "Found pre-existant error report in PerformanceMonitor termination."
                    )
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=99900,
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception, ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
                     39, 40])

totalReports = 25
inputFilesPerReport = 50

inputFileCounter = 0
for i in range(totalReports):
    loadTestReport = Report.Report("cmsRun1")
    loadTestReport.addInputSource("PoolSource")

    for j in range(inputFilesPerReport):
        inputFile = loadTestReport.addInputFile("PoolSource", lfn = "input%i" % inputFileCounter,
                                                events = 600000, size = 600000)
        inputFileCounter += 1

    Report.addRunInfoToFile(inputFile, runInfo)

    for outputModule in outputModules:
        loadTestReport.addOutputModule(outputModule)
        datasetInfo = {"applicationName": "cmsRun", "applicationVersion": "CMSSW_3_3_5_patch3",
                       "primaryDataset": outputModule, "dataTier": "RAW",
                       "processedDataset": "LoadTest10"}
        fileAttrs = {"lfn": makeUUID(), "location": "cmssrm.fnal.gov",
                     "checksums": {"adler32": "ff810ec3", "cksum": "2212831827"},
                     "events": random.randrange(500, 5000, 50),
                     "merged": True,
                     "size": random.randrange(1000, 2000, 100000000),
                     "module_label": outputModule, "dataset": datasetInfo}

        outputFile = loadTestReport.addOutputFile(outputModule, fileAttrs)
        Report.addRunInfoToFile(outputFile, runInfo)
Example #24
0
    def periodicUpdate(self):
        """
        Run on the defined intervals.

        """
        killProc = False
        killHard = False
        reason = ''
        errorCodeLookup = {
            'RSS': 50660,
            'VSZ': 50661,
            'Wallclock time': 50664,
            '': 99999
        }

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % self.monitorCommand
            logging.error(msg)
            return
        # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
        if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
            # then workload value is still in KiB (old way)
            rss = int(output[2])
            vsize = int(output[3])
        else:
            # ps returns data in kiloBytes, let's make it megaBytes
            # I'm so confused with these megabytes and mebibytes...
            rss = int(output[2]) // 1000  # convert it to MiB
            vsize = int(output[3]) // 1000  # convert it to MiB
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s;  VSize: %s; PCPU: %s; PMEM: %s", output[2],
                     output[3], output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
            reason = 'RSS'
        elif self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True
            reason = 'VSZ'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(
                    self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime -
                                                              self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location, '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug(
                        "Found pre-existant error report in PerformanceMonitor termination."
                    )
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return