Example #1
    def makeThreadsStreamsTweak(self):
        """
        _makeThreadsStreamsTweak_

        Tweak threads and streams parameters
        """
        origCores = int(
            getattr(self.step.data.application.multicore, 'numberOfCores', 1))
        eventStreams = int(
            getattr(self.step.data.application.multicore, 'eventStreams', 0))
        resources = {'cores': origCores}
        resizeResources(resources)
        numCores = resources['cores']
        if numCores != origCores:
            self.logger.info(
                "Resizing a job with nStreams != nCores. Setting nStreams = nCores. This may end badly."
            )
            eventStreams = 0

        tweak = PSetTweak()
        tweak.addParameter("process.options", "customTypeCms.untracked.PSet()")
        self.applyPsetTweak(tweak, skipIfSet=True)
        self.tweak.addParameter(
            "process.options.numberOfThreads",
            "customTypeCms.untracked.uint32(%s)" % numCores)
        self.tweak.addParameter(
            "process.options.numberOfStreams",
            "customTypeCms.untracked.uint32(%s)" % eventStreams)

        return
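The scaling work above is delegated to resizeResources, which, per the comments in the later examples, parses the HTCondor runtime environment and adjusts the requested resources to the slot actually granted. Below is a minimal sketch of that idea, assuming (hypothetically) that the machine ad path comes from the _CONDOR_MACHINE_AD environment variable and that the Cpus and Memory attribute names match what HTCondor writes; the real WMCore implementation may differ:

import os
import re

def resize_resources_sketch(resources):
    # Hypothetical simplification of resizeResources: read the HTCondor
    # machine ad and overwrite the requested cores/memory with the slot's.
    ad_path = os.environ.get('_CONDOR_MACHINE_AD')
    if not ad_path or not os.path.exists(ad_path):
        return resources  # not running under HTCondor; leave values untouched
    with open(ad_path) as fd:
        ad = fd.read()
    cpus = re.search(r'^Cpus\s*=\s*(\d+)', ad, re.MULTILINE)
    memory = re.search(r'^Memory\s*=\s*(\d+)', ad, re.MULTILINE)
    if cpus:
        resources['cores'] = int(cpus.group(1))
    if memory and 'memory' in resources:
        resources['memory'] = int(memory.group(1))
    return resources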
Example #2
    def setupMonitors(self, task, wmbsJob):
        logging.info("In Watchdog.setupMonitors")
        if not hasattr(task.data, 'watchdog'):
            msg = "Could not find watchdog in spec"
            logging.error(msg)
            # I don't think this is necessarily fatal
            return
        if not hasattr(task.data.watchdog, 'monitors'):
            msg = "Watchdog has no monitors"
            logging.error(msg)
            # Probably not fatal either
            return
        if hasattr(task.data.watchdog, 'interval'):
            # Set the interval off the config
            self.setInterval(task.data.watchdog.interval)
        for monitor in task.data.watchdog.monitors:
            msg = "Initializing monitor %s" % monitor
            logging.info(msg)
            mon = self.loadMonitor(monitor)
            args = {}
            if hasattr(task.data.watchdog, monitor):
                # This should be a config section
                monitorArgs = getattr(task.data.watchdog, monitor)
                args = monitorArgs.dictionary_()
            if monitor == 'PerformanceMonitor' and args:
                # Apply tweaks to PerformanceMonitor only.
                # Scale resources according to the HTCondor runtime environment.
                origCores = 1
                for stepName in task.listAllStepNames():
                    sh = task.getStepHelper(stepName)
                    origCores = max(origCores, sh.getNumberOfCores())
                resources = {'cores': origCores}
                origMaxPSS = args.get('maxPSS', args.get('maxRSS'))
                if origMaxPSS:
                    resources['memory'] = origMaxPSS
                # Actually parses the HTCondor runtime
                resizeResources(resources)
                # We decided to only touch Watchdog settings if the number of cores changed.
                # (even if this means the watchdog memory is wrong for a slot this size).
                changedCores = origCores != resources['cores']
                # If we did base maxPSS off the memory in the HTCondor slot, subtract a bit
                # off the top so watchdog triggers before HTCondor does.
                # Add the new number of cores to the args such that PerformanceMonitor can see it
                args['cores'] = resources['cores']
                if changedCores:
                    if origMaxPSS:
                        args['maxPSS'] = resources['memory'] - 50

                logging.info("Watchdog modified: %s. Final settings:",
                             changedCores)
                for k, v in viewitems(args):
                    logging.info("  %s: %r", k, v)
            # Actually initialize the monitor variables
            mon.initMonitor(task=task,
                            job=wmbsJob,
                            logPath=self.logPath,
                            args=args)
            self._Monitors.append(mon)

        return
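To make the headroom arithmetic concrete, here are illustrative numbers (not from the source): a job configured for 4 cores with a maxPSS of 8000 MB that lands on an 8-core, 16000 MB HTCondor slot:

# Illustrative values only.
args = {'maxPSS': 8000}
origCores = 4
resources = {'cores': 8, 'memory': 16000}       # as returned by resizeResources()
changedCores = origCores != resources['cores']  # True
args['cores'] = resources['cores']              # 8
if changedCores:
    args['maxPSS'] = resources['memory'] - 50   # 15950 MB; fires before HTCondor kills the job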
Example #3
    def setupMonitors(self, task, wmbsJob):
        logging.info("In Watchdog.setupMonitors")
        if not hasattr(task.data, 'watchdog'):
            msg = "Could not find watchdog in spec"
            logging.error(msg)
            # I don't think this is necessarily fatal
            return
        if not hasattr(task.data.watchdog, 'monitors'):
            msg = "Watchdog has no monitors"
            logging.error(msg)
            # Probably not fatal either
            return
        if hasattr(task.data.watchdog, 'interval'):
            # Set the interval off the config
            self.setInterval(task.data.watchdog.interval)
        for monitor in task.data.watchdog.monitors:
            msg = "Initializing monitor %s" % monitor
            logging.info(msg)
            mon = self.loadMonitor(monitor)
            args = {}
            if hasattr(task.data.watchdog, monitor):
                # This should be a config section
                monitorArgs = getattr(task.data.watchdog, monitor)
                args = monitorArgs.dictionary_()
            if monitor == 'PerformanceMonitor' and args:
                # Apply tweaks to PerformanceMonitor only.
                # Scale resources according to the HTCondor runtime environment.
                origCores = 1
                for stepName in task.listAllStepNames():
                    sh = task.getStepHelper(stepName)
                    origCores = max(origCores, sh.getNumberOfCores())
                resources = {'cores': origCores}
                origMaxPSS = args.get('maxPSS', args.get('maxRSS'))
                if origMaxPSS:
                    resources['memory'] = origMaxPSS
                # Actually parses the HTCondor runtime
                resizeResources(resources)
                # We decided to only touch Watchdog settings if the number of cores changed.
                # (even if this means the watchdog memory is wrong for a slot this size).
                changedCores = origCores != resources['cores']
                # If we did base maxPSS off the memory in the HTCondor slot, subtract a bit
                # off the top so watchdog triggers before HTCondor does.
                # Add the new number of cores to the args such that DashboardInterface can see it
                args['cores'] = resources['cores']
                if changedCores:
                    if origMaxPSS:
                        args['maxPSS'] = resources['memory'] - 50

                logging.info("Watchdog modified: %s. Final settings:", changedCores)
                for k, v in args.items():
                    logging.info("  %s: %r", k, v)
            # Actually initialize the monitor variables
            mon.initMonitor(task=task, job=wmbsJob,
                            logPath=self.logPath, args=args)
            self._Monitors.append(mon)

        return
Example #4
    def __call__(self):
        """
        _call_

        Examine the step configuration and construct a PSet from that.

        """
        self.logger.info("Executing SetupCMSSWPSet...")
        self.jobBag = self.job.getBaggage()

        scenario = getattr(self.step.data.application.configuration,
                           "scenario", None)
        if scenario is not None and scenario != "":
            self.logger.info("Setting up job scenario/process")
            funcName = getattr(self.step.data.application.configuration,
                               "function", None)
            if getattr(self.step.data.application.configuration,
                       "pickledarguments", None) is not None:
                funcArgs = pickle.loads(
                    self.step.data.application.configuration.pickledarguments)
            else:
                funcArgs = {}
            try:
                self.createProcess(scenario, funcName, funcArgs)
            except Exception as ex:
                self.logger.exception(
                    "Error creating process for Config/DataProcessing:")
                raise ex

            if funcName == "repack":
                self.handleRepackSettings()

            if funcName in ["merge", "alcaHarvesting"]:
                self.handleSingleCoreOverride()

            if socket.getfqdn().endswith("cern.ch"):
                self.handleSpecialCERNMergeSettings(funcName)

        else:
            try:
                self.loadPSet()
            except Exception as ex:
                self.logger.exception("Error loading PSet:")
                raise ex

        # Check process.source exists
        if getattr(self.process, "source", None) is None:
            msg = "Error in CMSSW PSet: process is missing attribute 'source'"
            msg += " or process.source is defined with None value."
            self.logger.error(msg)
            raise RuntimeError(msg)

        self.handleCondorStatusService()

        self.fixupProcess()

        # In case of CRAB3, the number of threads in the PSet should not be overridden
        if not self.crabPSet:
            try:
                origCores = int(
                    getattr(self.step.data.application.multicore,
                            'numberOfCores', 1))
                eventStreams = int(
                    getattr(self.step.data.application.multicore,
                            'eventStreams', 0))
                resources = {'cores': origCores}
                resizeResources(resources)
                numCores = resources['cores']
                if numCores != origCores:
                    self.logger.info(
                        "Resizing a job with nStreams != nCores. Setting nStreams = nCores. This may end badly."
                    )
                    eventStreams = 0
                options = getattr(self.process, "options", None)
                if options is None:
                    self.process.options = cms.untracked.PSet()
                    options = getattr(self.process, "options")
                options.numberOfThreads = cms.untracked.uint32(numCores)
                options.numberOfStreams = cms.untracked.uint32(eventStreams)
            except AttributeError as ex:
                self.logger.error("Failed to override numberOfThreads: %s",
                                  str(ex))

        psetTweak = getattr(self.step.data.application.command, "psetTweak",
                            None)
        if psetTweak is not None:
            self.applyPSetTweak(psetTweak, self.fixupDict)

        # Apply task level tweaks
        taskTweak = makeTaskTweak(self.step.data)
        applyTweak(self.process, taskTweak, self.fixupDict)

        # Check if chained processing is enabled
        # If not - apply the per job tweaks
        # If so - create an override TFC (like done in PA) and then modify the PSet accordingly
        if hasattr(self.step.data.input, "chainedProcessing"
                   ) and self.step.data.input.chainedProcessing:
            self.handleChainedProcessing()
        else:
            # Apply per job PSet Tweaks
            jobTweak = makeJobTweak(self.job)
            applyTweak(self.process, jobTweak, self.fixupDict)

        # check for pileup settings presence, pileup support implementation
        # and if enabled, process pileup configuration / settings
        if hasattr(self.step.data, "pileup"):
            self.handlePileup()

        # Apply per output module PSet Tweaks
        cmsswStep = self.step.getTypeHelper()
        for om in cmsswStep.listOutputModules():
            mod = cmsswStep.getOutputModule(om)
            outTweak = makeOutputTweak(mod, self.job)
            applyTweak(self.process, outTweak, self.fixupDict)

        # revlimiter for testing
        if getattr(self.step.data.application.command, "oneEventMode", False):
            self.process.maxEvents.input = 1

        # check for random seeds and the method of seeding which is in the job baggage
        self.handleSeeding()

        # make sure default parametersets for perf reports are installed
        self.handlePerformanceSettings()

        # check for event numbers in the producers
        self.handleProducersNumberOfEvents()

        # fixup the dqmFileSaver
        self.handleDQMFileSaver()

        # tweak for jobs reading LHE articles from CERN
        self.handleLHEInput()

        # tweak jobs for enforceGUIDInFileName
        self.handleEnforceGUIDInFileName()

        # Check if we accept skipping bad files
        if hasattr(self.step.data.application.configuration, "skipBadFiles"):
            self.process.source.skipBadFiles = \
                cms.untracked.bool(self.step.data.application.configuration.skipBadFiles)

        # Apply events per lumi section if available
        if hasattr(self.step.data.application.configuration, "eventsPerLumi"):
            self.process.source.numberEventsInLuminosityBlock = \
                cms.untracked.uint32(self.step.data.application.configuration.eventsPerLumi)

        # limit run time if desired
        if hasattr(self.step.data.application.configuration,
                   "maxSecondsUntilRampdown"):
            self.process.maxSecondsUntilRampdown = cms.untracked.PSet(
                input=cms.untracked.int32(
                    self.step.data.application.configuration.
                    maxSecondsUntilRampdown))

        # accept an overridden TFC from the step
        if hasattr(self.step.data.application, 'overrideCatalog'):
            self.logger.info("Found a TFC override: %s",
                             self.step.data.application.overrideCatalog)
            self.process.source.overrideCatalog = \
                cms.untracked.string(self.step.data.application.overrideCatalog)

        configFile = self.step.data.application.command.configuration
        configPickle = getattr(self.step.data.application.command,
                               "configurationPickle", "PSet.pkl")
        workingDir = self.stepSpace.location
        try:
            with open("%s/%s" % (workingDir, configPickle), 'wb') as pHandle:
                pickle.dump(self.process, pHandle)

            with open("%s/%s" % (workingDir, configFile), 'w') as handle:
                handle.write("import FWCore.ParameterSet.Config as cms\n")
                handle.write("import pickle\n")
                handle.write("with open('%s', 'rb') as handle:\n" %
                             configPickle)
                handle.write("    process = pickle.load(handle)\n")
        except Exception as ex:
            self.logger.exception("Error writing out PSet:")
            raise ex
        self.logger.info("CMSSW PSet setup completed!")

        return 0
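The four handle.write() calls at the end emit a thin loader script next to the pickle. With the default configurationPickle name, the file written to configFile contains exactly:

import FWCore.ParameterSet.Config as cms
import pickle
with open('PSet.pkl', 'rb') as handle:
    process = pickle.load(handle)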
Example #5
    def __call__(self):
        """
        _call_

        Examine the step configuration and construct a PSet from that.

        """
        self.process = None

        scenario = getattr(self.step.data.application.configuration, "scenario", None)
        if scenario is not None and scenario != "":
            funcName = getattr(self.step.data.application.configuration, "function", None)
            if getattr(self.step.data.application.configuration, "pickledarguments", None) is not None:
                funcArgs = pickle.loads(self.step.data.application.configuration.pickledarguments)
            else:
                funcArgs = {}
            try:
                self.createProcess(scenario, funcName, funcArgs)
            except Exception as ex:
                logging.exception("Error creating process for Config/DataProcessing:")
                raise ex

            if funcName == "repack":
                self.handleRepackSettings()

            if funcName in ["merge", "alcaHarvesting"]:
                self.handleSingleCoreOverride()

            if socket.getfqdn().endswith("cern.ch"):
                self.handleSpecialCERNMergeSettings(funcName)

        else:
            try:
                self.loadPSet()
            except Exception as ex:
                logging.exception("Error loading PSet:")
                raise ex

        # Check process.source exists
        if getattr(self.process, "source", None) is None:
            msg = "Error in CMSSW PSet: process is missing attribute 'source'"
            msg += " or process.source is defined with None value."
            logging.error(msg)
            raise RuntimeError(msg)

        self.handleCondorStatusService()

        self.fixupProcess()

        # In case of CRAB3, the number of threads in the PSet should not be overridden
        if not self.crabPSet:
            try:
                origCores = int(getattr(self.step.data.application.multicore, 'numberOfCores', 1))
                eventStreams = int(getattr(self.step.data.application.multicore, 'eventStreams', 0))
                resources = {'cores': origCores}
                resizeResources(resources)
                numCores = resources['cores']
                if numCores != origCores:
                    logging.info(
                        "Resizing a job with nStreams != nCores. Setting nStreams = nCores. This may end badly.")
                    eventStreams = 0
                options = getattr(self.process, "options", None)
                if options is None:
                    self.process.options = cms.untracked.PSet()
                    options = getattr(self.process, "options")
                options.numberOfThreads = cms.untracked.uint32(numCores)
                options.numberOfStreams = cms.untracked.uint32(eventStreams)
            except AttributeError as ex:
                logging.error("Failed to override numberOfThreads: %s", str(ex))

        psetTweak = getattr(self.step.data.application.command, "psetTweak", None)
        if psetTweak is not None:
            self.applyPSetTweak(psetTweak, self.fixupDict)

        # Apply task level tweaks
        taskTweak = makeTaskTweak(self.step.data)
        applyTweak(self.process, taskTweak, self.fixupDict)

        # Check if chained processing is enabled
        # If not - apply the per job tweaks
        # If so - create an override TFC (like done in PA) and then modify the PSet accordingly
        if hasattr(self.step.data.input, "chainedProcessing") and self.step.data.input.chainedProcessing:
            self.handleChainedProcessing()
        else:
            # Apply per job PSet Tweaks
            jobTweak = makeJobTweak(self.job)
            applyTweak(self.process, jobTweak, self.fixupDict)

        # check for pileup settings presence, pileup support implementation
        # and if enabled, process pileup configuration / settings
        if hasattr(self.step.data, "pileup"):
            self.handlePileup()

        # Apply per output module PSet Tweaks
        cmsswStep = self.step.getTypeHelper()
        for om in cmsswStep.listOutputModules():
            mod = cmsswStep.getOutputModule(om)
            outTweak = makeOutputTweak(mod, self.job)
            applyTweak(self.process, outTweak, self.fixupDict)

        # revlimiter for testing
        if getattr(self.step.data.application.command, "oneEventMode", False):
            self.process.maxEvents.input = 1

        # check for random seeds and the method of seeding which is in the job baggage
        self.handleSeeding()

        # make sure default parametersets for perf reports are installed
        self.handlePerformanceSettings()

        # check for event numbers in the producers
        self.handleProducersNumberOfEvents()

        # fixup the dqmFileSaver
        self.handleDQMFileSaver()

        # Check if we accept skipping bad files
        if hasattr(self.step.data.application.configuration, "skipBadFiles"):
            self.process.source.skipBadFiles = \
                cms.untracked.bool(self.step.data.application.configuration.skipBadFiles)

        # Apply events per lumi section if available
        if hasattr(self.step.data.application.configuration, "eventsPerLumi"):
            self.process.source.numberEventsInLuminosityBlock = \
                cms.untracked.uint32(self.step.data.application.configuration.eventsPerLumi)

        # limit run time if desired
        if hasattr(self.step.data.application.configuration, "maxSecondsUntilRampdown"):
            self.process.maxSecondsUntilRampdown = cms.untracked.PSet(
                input=cms.untracked.int32(self.step.data.application.configuration.maxSecondsUntilRampdown))

        # accept an overridden TFC from the step
        if hasattr(self.step.data.application, 'overrideCatalog'):
            logging.info("Found a TFC override: %s", self.step.data.application.overrideCatalog)
            self.process.source.overrideCatalog = \
                cms.untracked.string(self.step.data.application.overrideCatalog)

        configFile = self.step.data.application.command.configuration
        configPickle = getattr(self.step.data.application.command, "configurationPickle", "PSet.pkl")
        workingDir = self.stepSpace.location
        try:
            with open("%s/%s" % (workingDir, configPickle), 'wb') as pHandle:
                pickle.dump(self.process, pHandle)

            with open("%s/%s" % (workingDir, configFile), 'w') as handle:
                handle.write("import FWCore.ParameterSet.Config as cms\n")
                handle.write("import pickle\n")
                handle.write("with open('%s', 'rb') as handle:\n" % configPickle)
                handle.write("    process = pickle.load(handle)\n")
        except Exception as ex:
            logging.exception("Error writing out PSet:")
            raise ex

        return 0
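Both __call__ variants apply the thread override the same way: ensure process.options exists, then set two untracked uint32 parameters. Stripped of the step plumbing, the pattern reduces to the sketch below, which assumes a CMSSW environment where FWCore.ParameterSet.Config is importable:

import FWCore.ParameterSet.Config as cms

def set_threads_streams(process, num_cores, event_streams):
    # Create process.options if the PSet does not define it, then pin
    # the thread and stream counts (0 streams means one stream per thread).
    if getattr(process, "options", None) is None:
        process.options = cms.untracked.PSet()
    process.options.numberOfThreads = cms.untracked.uint32(num_cores)
    process.options.numberOfStreams = cms.untracked.uint32(event_streams)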
Example #6
    def setupMonitors(self, task, wmbsJob):
        logging.info("In Watchdog.setupMonitors")
        if not hasattr(task.data, 'watchdog'):
            msg = "Could not find watchdog in spec"
            logging.error(msg)
            # I don't think this is necessarily fatal
            return
        if not hasattr(task.data.watchdog, 'monitors'):
            msg = "Watchdog has no monitors"
            logging.error(msg)
            # Probably not fatal either
            return
        if hasattr(task.data.watchdog, 'interval'):
            # Set the interval off the config
            self.setInterval(task.data.watchdog.interval)
        for monitor in task.data.watchdog.monitors:
            msg = "Initializing monitor %s" % monitor
            logging.info(msg)
            mon = self.loadMonitor(monitor)
            args = {}
            if hasattr(task.data.watchdog, monitor):
                # This should be a config section
                monitorArgs = getattr(task.data.watchdog, monitor)
                args = monitorArgs.dictionary_()
            if monitor == 'PerformanceMonitor' and args:
                # Apply tweaks to PerformanceMonitor only.
                # Scale resources according to the HTCondor runtime environment.
                origCores = 1
                for stepName in task.listAllStepNames():
                    sh = task.getStepHelper(stepName)
                    origCores = max(origCores, sh.getNumberOfCores())
                resources = {'cores': origCores}
                origMaxRSS = args.get('maxRSS')
                if origMaxRSS:
                    origMaxRSS = int(origMaxRSS / 1024.)  # HTCondor expects MB; we get KB.
                    resources['memory'] = origMaxRSS
                # Actually parses the HTCondor runtime
                resizeResources(resources)
                # We decided to only touch Watchdog settings if the number of cores changed.
                # (even if this means the watchdog memory is wrong for a slot this size).
                changedCores = origCores != resources['cores']
                # HTCondor doesn't explicitly scale VSize; it's also not clear what
                # resources this manages (as we already watch the memory use) or how
                # it should relate to other resources (such as memory or cores used).
                # Hence, we simply remove it if we change anything about the memory.
                # If we did base maxRSS off the memory in the HTCondor slot, subtract a bit
                # off the top so watchdog triggers before HTCondor does.
                # Add the new number of cores to the args such that DashboardInterface can see it
                args['cores'] = resources['cores']
                if changedCores:
                    if origMaxRSS:
                        args.pop('maxVSize', None)
                        args['maxRSS'] = 1024 * (resources['memory'] - 50)  # Convert back to KB

                logging.info("Watchdog modified: %s. Final settings:",
                             changedCores)
                for k, v in args.items():
                    logging.info("  %s: %r", k, v)
            # Actually initialize the monitor variables
            mon.initMonitor(task=task,
                            job=wmbsJob,
                            logPath=self.logPath,
                            args=args)
            self._Monitors.append(mon)

        return
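Example #6 adds a unit conversion missing from Examples #2 and #3: maxRSS arrives in KB, but resizeResources and HTCondor work in MB, so the value is divided by 1024 going in and multiplied by 1024 coming out, with the 50 MB headroom applied in between. With illustrative numbers:

orig_max_rss_kb = 8192000                         # 8000 MB expressed in KB
resources_memory = int(orig_max_rss_kb / 1024.)   # 8000 MB handed to resizeResources
resized_mb = 16000                                # say the HTCondor slot grants 16000 MB
new_max_rss_kb = 1024 * (resized_mb - 50)         # 16332800 KB, back in watchdog units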
Example #7
    def setupMonitors(self, task, wmbsJob):
        logging.info("In Watchdog.setupMonitors")
        if not hasattr(task.data, 'watchdog'):
            msg = "Could not find watchdog in spec"
            logging.error(msg)
            # I don't think this is necessarily fatal
            return
        if not hasattr(task.data.watchdog, 'monitors'):
            msg = "Watchdog has no monitors"
            logging.error(msg)
            # Probably not fatal either
            return
        if hasattr(task.data.watchdog, 'interval'):
            # Set the interval off the config
            self.setInterval(task.data.watchdog.interval)
        for monitor in task.data.watchdog.monitors:
            msg = "Initializing monitor %s" % monitor
            logging.info(msg)
            mon = self.loadMonitor(monitor)
            args = {}
            if hasattr(task.data.watchdog, monitor):
                # This should be a config section
                monitorArgs = getattr(task.data.watchdog, monitor)
                args = monitorArgs.dictionary_()
            if monitor == 'PerformanceMonitor' and args:
                # Apply tweaks to PerformanceMonitor only.
                # Scale resources according to the HTCondor runtime environment.
                origCores = 1
                for stepName in task.listAllStepNames():
                    sh = task.getStepHelper(stepName)
                    origCores = max(origCores, sh.getNumberOfCores())
                resources = {'cores': origCores}
                origMaxRSS = args.get('maxRSS')
                ### TODO: keep only the else clause after ~HG1805
                if origMaxRSS and origMaxRSS > 100 * 1000:  # in case MaxRSS is in KB
                    origMaxRSS = int(origMaxRSS / 1024.)  # HTCondor expects MB; we get KB.
                    resources['memory'] = origMaxRSS
                elif origMaxRSS:
                    resources['memory'] = origMaxRSS  # then it's already in MB
                # Actually parses the HTCondor runtime
                resizeResources(resources)
                # We decided to only touch Watchdog settings if the number of cores changed.
                # (even if this means the watchdog memory is wrong for a slot this size).
                changedCores = origCores != resources['cores']
                # HTCondor doesn't explicitly scale VSize; it's also not clear what
                # resources this manages (as we already watch the memory use) or how
                # it should relate to other resources (such as memory or cores used).
                # Hence, we simply remove it if we change anything about the memory.
                # If we did base maxRSS off the memory in the HTCondor slot, subtract a bit
                # off the top so watchdog triggers before HTCondor does.
                # Add the new number of cores to the args such that DashboardInterface can see it
                args['cores'] = resources['cores']
                if changedCores:
                    if origMaxRSS:
                        args.pop('maxVSize', None)
                        args['maxRSS'] = resources['memory'] - 50

                logging.info("Watchdog modified: %s. Final settings:", changedCores)
                for k, v in args.items():
                    logging.info("  %s: %r", k, v)
            # Actually initialize the monitor variables
            mon.initMonitor(task=task, job=wmbsJob,
                            logPath=self.logPath, args=args)
            self._Monitors.append(mon)

        return
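The transitional branch flagged by the TODO in Example #7 guesses the unit from the magnitude: values above 100 * 1000 are assumed to be KB and converted, smaller values are taken as MB already. A small hypothetical helper expressing the same heuristic:

def max_rss_to_mb(value):
    # Mirror the transitional heuristic from Example #7: a maxRSS above
    # 100 * 1000 is assumed to be in KB and is converted to MB.
    if value and value > 100 * 1000:
        return int(value / 1024.)  # KB -> MB
    return value                   # already in MB (or falsy)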