Ejemplo n.º 1
0
    def testAlertsMessagingBasic(self):
        config = getConfig("/tmp")
        self.assertTrue(hasattr(config, "Alert"))
        # initialization
        # sender: instance of Alert messages Sender
        # preAlert: pre-defined values for Alert instances generated from this class
        self.config = config # needed in setUpAlertsMessaging
        preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                         compName = "testBasic")
        sendAlert = alertAPI.getSendAlert(sender = sender,
                                          preAlert = preAlert)

        # set up a temporary alert message receiver
        handler, receiver = utils.setUpReceiver(config.Alert.address,
                                                config.Alert.controlAddr)
        # test sending alert
        msg = "this is my message Basic"
        sendAlert(100, msg = msg)

        # wait for the alert to arrive
        while len(handler.queue) == 0:
            time.sleep(0.3)
            print "%s waiting for alert to arrive ..." % inspect.stack()[0][3]

        self.assertEqual(len(handler.queue), 1)
        alert = handler.queue[0]
        self.assertEqual(alert["Component"], "testBasic")
        self.assertEqual(alert["Level"], 100)
        self.assertEqual(alert["Source"], self.__class__.__name__)
        self.assertEqual(alert["Details"]["msg"], msg)

        sender.unregister()
        receiver.shutdown()
Ejemplo n.º 2
0
    def testAlertsMessagingBasic(self):
        config = getConfig("/tmp")
        self.assertTrue(hasattr(config, "Alert"))
        # initialization
        # sender: instance of Alert messages Sender
        # preAlert: pre-defined values for Alert instances generated from this class
        self.config = config  # needed in setUpAlertsMessaging
        preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                         compName="testBasic")
        sendAlert = alertAPI.getSendAlert(sender=sender, preAlert=preAlert)

        # set up a temporary alert message receiver
        handler, receiver = utils.setUpReceiver(config.Alert.address,
                                                config.Alert.controlAddr)
        # test sending alert
        msg = "this is my message Basic"
        sendAlert(100, msg=msg)

        # wait for the alert to arrive
        while len(handler.queue) == 0:
            time.sleep(0.3)
            print "%s waiting for alert to arrive ..." % inspect.stack()[0][3]

        self.assertEqual(len(handler.queue), 1)
        alert = handler.queue[0]
        self.assertEqual(alert["Component"], "testBasic")
        self.assertEqual(alert["Level"], 100)
        self.assertEqual(alert["Source"], self.__class__.__name__)
        self.assertEqual(alert["Details"]["msg"], msg)

        sender.unregister()
        receiver.shutdown()
Ejemplo n.º 3
0
    def testAgentConfigurationRetrieving(self):
        """
        Test that getting some agent details (config values from config.Agent
        section) will be correctly propagated into Alert instances.
        Alert instance is obtained via API.getPredefinedAlert factory.

        """
        d = dict(Additional = "detail")
        # instantiate just plain Alert, no configuration to take
        # into account at this point
        a = Alert(**d)
        self.assertEqual(a["HostName"], None)
        self.assertEqual(a["Contact"], None)
        self.assertEqual(a["TeamName"], None)
        self.assertEqual(a["AgentName"], None)
        self.assertEqual(a["Additional"], "detail")
        # instantiate via factory which reads configuration instance
        config = Configuration()
        config.section_("Agent")
        config.Agent.hostName = "some1"
        config.Agent.contact = "some2"
        config.Agent.teamName = "some3"
        config.Agent.agentName = "some4"
        a = alertAPI.getPredefinedAlert(**d)
        self.assertEqual(a["HostName"], "some1")
        self.assertEqual(a["Contact"], "some2")
        self.assertEqual(a["TeamName"], "some3")
        self.assertEqual(a["AgentName"], "some4")
        self.assertEqual(a["Additional"], "detail")
Ejemplo n.º 4
0
 def __init__(self, config, generator):
     threading.Thread.__init__(self)
     # it's particular Poller config only
     self.config = config
     # reference to AlertGenerator instance
     self.generator = generator
     # store levels (critical, soft) for critical, soft thresholds correspondence
     # these values are defined in the AlertProcessor config
     # self.levels and self.thresholds has to have the same corresponding order
     # and critical has to be first - if this threshold is caught, no point
     # testing soft one
     # this belongs to the AlertGenerator and is in fact dependent on AlertProcessor
     # by referencing these two values - not sure if to tolerate such dependecy or
     # configure these two values independently in AlertGenerator itself (surely a
     # possible mismatch would make a bit of chaos)
     self.levels = [self.generator.config.AlertProcessor.critical.level,
                    self.generator.config.AlertProcessor.soft.level]
     
     # critical, soft threshold values
     self.thresholds = [self.config.critical, self.config.soft]
     
     # pre-generated alert values, but before sending always new instance is created
     # these values are used to update the newly created instance
     dictAlert = dict(Type = "WMAgent",
                      Workload = "n/a",
                      Component = self.generator.__class__.__name__,
                      Source = "<to_overwrite>")
     self.preAlert = alertAPI.getPredefinedAlert(**dictAlert)
     # flag controlling run of the Thread
     self._stopFlag = False
     # thread own sleep time
     self._threadSleepTime = 0.2 # seconds
Ejemplo n.º 5
0
    def __init__(self, config, generator):
        threading.Thread.__init__(self)
        # it's particular Poller config only
        self.config = config
        # reference to AlertGenerator instance
        self.generator = generator
        # store levels (critical, soft) for critical, soft thresholds correspondence
        # these values are defined in the AlertProcessor config
        # self.levels and self.thresholds has to have the same corresponding order
        # and critical has to be first - if this threshold is caught, no point
        # testing soft one
        # this belongs to the AlertGenerator and is in fact dependent on AlertProcessor
        # by referencing these two values - not sure if to tolerate such dependecy or
        # configure these two values independently in AlertGenerator itself (surely a
        # possible mismatch would make a bit of chaos)
        self.levels = [
            self.generator.config.AlertProcessor.critical.level,
            self.generator.config.AlertProcessor.soft.level
        ]

        # critical, soft threshold values
        self.thresholds = [self.config.critical, self.config.soft]

        # pre-generated alert values, but before sending always new instance is created
        # these values are used to update the newly created instance
        dictAlert = dict(Type="WMAgent",
                         Workload="n/a",
                         Component=self.generator.__class__.__name__,
                         Source="<to_overwrite>")
        self.preAlert = alertAPI.getPredefinedAlert(**dictAlert)
        # flag controlling run of the Thread
        self._stopFlag = False
        # thread own sleep time
        self._threadSleepTime = 0.2  # seconds
Ejemplo n.º 6
0
    def testAgentConfigurationRetrieving(self):
        """
        Test that getting some agent details (config values from config.Agent
        section) will be correctly propagated into Alert instances.
        Alert instance is obtained via API.getPredefinedAlert factory.

        """
        d = dict(Additional="detail")
        # instantiate just plain Alert, no configuration to take
        # into account at this point
        a = Alert(**d)
        self.assertEqual(a["HostName"], None)
        self.assertEqual(a["Contact"], None)
        self.assertEqual(a["TeamName"], None)
        self.assertEqual(a["AgentName"], None)
        self.assertEqual(a["Additional"], "detail")
        # instantiate via factory which reads configuration instance
        config = Configuration()
        config.section_("Agent")
        config.Agent.hostName = "some1"
        config.Agent.contact = "some2"
        config.Agent.teamName = "some3"
        config.Agent.agentName = "some4"
        a = alertAPI.getPredefinedAlert(**d)
        self.assertEqual(a["HostName"], "some1")
        self.assertEqual(a["Contact"], "some2")
        self.assertEqual(a["TeamName"], "some3")
        self.assertEqual(a["AgentName"], "some4")
        self.assertEqual(a["Additional"], "detail")
Ejemplo n.º 7
0
    def initAlerts(self, compName=None):
        """
        _initAlerts_

        Setup the alerts for the rest of the system.

        sender: instance of the Alert messages Sender
        sendAlert: the code what sends the actual Alerts
            (documented in WMCore/Alerts/APIgetSendAlert)

        note:
            Tests are done in the API_t belonging to Alerts fw.
            This particular method is called from a number of
                components and some have particular tests on alerts sending.
        """
        if not compName:
            compName = self.__class__.__name__
        preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                         compName=compName)
        sendAlert = alertAPI.getSendAlert(sender=sender, preAlert=preAlert)
        self.sender = sender
        self.sendAlert = sendAlert
Ejemplo n.º 8
0
    def initAlerts(self, compName = None):
        """
        _initAlerts_

        Setup the alerts for the rest of the system.

        sender: instance of the Alert messages Sender
        sendAlert: the code what sends the actual Alerts
            (documented in WMCore/Alerts/APIgetSendAlert)
            
        note:
            Tests are done in the API_t belonging to Alerts fw.
            This particular method is called from a number of
                components and some have particular tests on alerts sending.
        """
        if not compName:
            compName = self.__class__.__name__
        preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                         compName = compName)
        sendAlert = alertAPI.getSendAlert(sender = sender,
                                          preAlert = preAlert)
        self.sender = sender
        self.sendAlert = sendAlert
Ejemplo n.º 9
0
 def __init__(self, config, generator):
     # it's particular Poller config only
     self.config = config
     # reference to AlertGenerator instance
     self.generator = generator
     # store levels (critical, soft) for critical, soft thresholds correspondence
     # these values are defined in the AlertProcessor config
     # self.levels and self.thresholds has to have the same corresponding order
     # and critical has to be first - if this threshold is caught, no point testing soft one
     self.levels = [self.generator.config.AlertProcessor.critical.level,
                    self.generator.config.AlertProcessor.soft.level]
     # critical, soft threshold values
     self.thresholds = [self.config.critical, self.config.soft]
     
     # pre-generated alert values, but before sending always new instance is created
     # these values are used to update the newly created instance
     dictAlert = dict(Type = "WMAgent",
                      Workload = "n/a",
                      Component = self.generator.__class__.__name__,
                      Source = "<to_overwrite>")
     self.preAlert = alertAPI.getPredefinedAlert(**dictAlert) 
Ejemplo n.º 10
0
    def __init__(self, logger = None, dbi = None, **params):

        WorkQueueBase.__init__(self, logger, dbi)
        self.parent_queue = None
        self.params = params

        # config argument (within params) shall be reference to
        # Configuration instance (will later be checked for presence of "Alert")
        self.config = params.get("Config", None)
        self.params.setdefault('CouchUrl', os.environ.get('COUCHURL'))
        if not self.params.get('CouchUrl'):
            raise RuntimeError, 'CouchUrl config value mandatory'
        self.params.setdefault('DbName', 'workqueue')
        self.params.setdefault('InboxDbName', self.params['DbName'] + '_inbox')
        self.params.setdefault('ParentQueueCouchUrl', None) # We get work from here

        self.backend = WorkQueueBackend(self.params['CouchUrl'], self.params['DbName'],
                                        self.params['InboxDbName'],
                                        self.params['ParentQueueCouchUrl'], self.params.get('QueueURL'),
                                        logger = self.logger)
        if self.params.get('ParentQueueCouchUrl'):
            self.parent_queue = WorkQueueBackend(self.params['ParentQueueCouchUrl'].rsplit('/', 1)[0],
                                                 self.params['ParentQueueCouchUrl'].rsplit('/', 1)[1])

        self.params.setdefault("GlobalDBS",
                               "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet")
        self.params.setdefault('QueueDepth', 2) # when less than this locally
        self.params.setdefault('LocationRefreshInterval', 600)
        self.params.setdefault('FullLocationRefreshInterval', 7200)
        self.params.setdefault('TrackLocationOrSubscription', 'subscription')
        self.params.setdefault('ReleaseIncompleteBlocks', False)
        self.params.setdefault('ReleaseRequireSubscribed', True)
        self.params.setdefault('PhEDExEndpoint', None)
        self.params.setdefault('PopulateFilesets', True)
        self.params.setdefault('LocalQueueFlag', True)

        self.params.setdefault('JobDumpConfig', None)
        self.params.setdefault('BossAirConfig', None)

        self.params['QueueURL'] = self.backend.queueUrl # url this queue is visible on
                                    # backend took previous QueueURL and sanitized it
        self.params.setdefault('WMBSUrl', None) # this will only be set on local Queue
        self.params.setdefault('Teams', [''])
        self.params.setdefault('DrainMode', False)
        if self.params.get('CacheDir'):
            try:
                os.makedirs(self.params['CacheDir'])
            except OSError:
                pass
        elif self.params.get('PopulateFilesets'):
            raise RuntimeError, 'CacheDir mandatory for local queue'

        self.params.setdefault('SplittingMapping', {})
        self.params['SplittingMapping'].setdefault('DatasetBlock',
                                                   {'name': 'Block',
                                                    'args': {}}
                                                  )
        self.params['SplittingMapping'].setdefault('MonteCarlo',
                                                   {'name': 'MonteCarlo',
                                                    'args':{}}
                                                   )
        self.params['SplittingMapping'].setdefault('Dataset',
                                                   {'name': 'Dataset',
                                                    'args': {}}
                                                  )
        self.params['SplittingMapping'].setdefault('Block',
                                                   {'name': 'Block',
                                                    'args': {}}
                                                  )
        self.params['SplittingMapping'].setdefault('ResubmitBlock',
                                                   {'name': 'ResubmitBlock',
                                                    'args': {}}
                                                  )
        
        self.params.setdefault('EndPolicySettings', {})

        assert(self.params['TrackLocationOrSubscription'] in ('subscription',
                                                              'location'))
        # Can only release blocks on location
        if self.params['TrackLocationOrSubscription'] == 'location':
            if self.params['SplittingMapping']['DatasetBlock']['name'] != 'Block':
                raise RuntimeError, 'Only blocks can be released on location'

        if self.params.get('PhEDEx'):
            self.phedexService = self.params['PhEDEx']
        else:
            phedexArgs = {}
            if self.params.get('PhEDExEndpoint'):
                phedexArgs['endpoint'] = self.params['PhEDExEndpoint']
            self.phedexService = PhEDEx(phedexArgs)

        if self.params.get('SiteDB'):
            self.SiteDB = self.params['SiteDB']
        else:
            self.SiteDB = SiteDB()

        if type(self.params['Teams']) in types.StringTypes:
            self.params['Teams'] = [x.strip() for x in \
                                    self.params['Teams'].split(',')]

        self.dataLocationMapper = WorkQueueDataLocationMapper(self.logger, self.backend,
                                                              phedex = self.phedexService,
                                                              sitedb = self.SiteDB,
                                                              locationFrom = self.params['TrackLocationOrSubscription'],
                                                              incompleteBlocks = self.params['ReleaseIncompleteBlocks'],
                                                              requireBlocksSubscribed = not self.params['ReleaseIncompleteBlocks'],
                                                              fullRefreshInterval = self.params['FullLocationRefreshInterval'],
                                                              updateIntervalCoarseness = self.params['LocationRefreshInterval'])

        # initialize alerts sending client (self.sendAlert() method)
        # usage: self.sendAlert(levelNum, msg = msg) ; level - integer 1 .. 10
        #    1 - 4 - lower levels ; 5 - 10 higher levels
        preAlert, self.alertSender = \
            alertAPI.setUpAlertsMessaging(self, compName = "WorkQueueManager")
        self.sendAlert = alertAPI.getSendAlert(sender = self.alertSender,
                                               preAlert = preAlert)

        self.logger.debug("WorkQueue created successfully")
Ejemplo n.º 11
0
    def submit(self, jobs, info):
        """
        _submit_


        Submit jobs for one subscription
        """

        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.submitDir  = self.config.JobSubmitter.submitDir
        timeout         = getattr(self.config.JobSubmitter, 'getTimeout', 400)

        successfulJobs = []
        failedJobs     = []
        jdlFiles       = []

        if len(jobs) == 0:
            # Then was have nothing to do
            return successfulJobs, failedJobs

        if len(self.pool) == 0:
            # Starting things up
            # This is obviously a submit API
            logging.info("Starting up CondorPlugin worker pool")
            self.input    = multiprocessing.Queue()
            self.result   = multiprocessing.Queue()
            for x in range(self.nProcess):
                p = multiprocessing.Process(target = submitWorker,
                                            args = (self.input, self.result, timeout))
                p.start()
                self.pool.append(p)

        if not os.path.exists(self.submitDir):
            os.makedirs(self.submitDir)


        # Now assume that what we get is the following; a mostly
        # unordered list of jobs with random sandboxes.
        # We intend to sort them by sandbox.

        submitDict = {}
        nSubmits   = 0
        for job in jobs:
            sandbox = job['sandbox']
            if not sandbox in submitDict.keys():
                submitDict[sandbox] = []
            submitDict[sandbox].append(job)


        # Now submit the bastards
        queueError = False
        for sandbox in submitDict.keys():
            jobList = submitDict.get(sandbox, [])
            idList = [x['jobid'] for x in jobList]
            if queueError:
                # If the queue has failed, then we must not process
                # any more jobs this cycle.
                continue
            while len(jobList) > 0:
                jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker]
                jobList   = jobList[self.config.JobSubmitter.jobsPerWorker:]
                idList    = [x['id'] for x in jobsReady]
                jdlList = self.makeSubmit(jobList = jobsReady)
                if not jdlList or jdlList == []:
                    # Then we got nothing
                    logging.error("No JDL file made!")
                    return {'NoResult': [0]}
                jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
                handle = open(jdlFile, 'w')
                handle.writelines(jdlList)
                handle.close()
                jdlFiles.append(jdlFile)

                # Now submit them
                logging.info("About to submit %i jobs" %(len(jobsReady)))
                if self.glexecPath:
                    command = 'CS=`which condor_submit`; '
                    if self.glexecWrapScript:
                        command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                    command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                    command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                    command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                    command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                    if self.glexecUnwrapScript:
                        command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
                    else:
                        command += '%s $CS %s' % (self.glexecPath, jdlFile)
                else:
                    command = "condor_submit %s" % jdlFile

                try:
                    self.input.put({'command': command, 'idList': idList})
                except AssertionError as ex:
                    msg =  "Critical error: input pipeline probably closed.\n"
                    msg += str(ex)
                    msg += "Error Procedure: Something critical has happened in the worker process\n"
                    msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                    msg += "Then refresh the worker pool\n"
                    logging.error(msg)
                    queueError = True
                    break
                nSubmits += 1

        # Now we should have sent all jobs to be submitted
        # Going to do the rest of it now
        for n in range(nSubmits):
            try:
                res = self.result.get(block = True, timeout = timeout)
            except Queue.Empty:
                # If the queue was empty go to the next submit
                # Those jobs have vanished
                logging.error("Queue.Empty error received!")
                logging.error("This could indicate a critical condor error!")
                logging.error("However, no information of any use was obtained due to process failure.")
                logging.error("Either process failed, or process timed out after %s seconds." % timeout)
                queueError = True
                continue
            except AssertionError as ex:
                msg =  "Found Assertion error while retrieving output from worker process.\n"
                msg += str(ex)
                msg += "This indicates something critical happened to a worker process"
                msg += "We will recover what jobs we know were submitted, and resubmit the rest"
                msg += "Refreshing worker pool at end of loop"
                logging.error(msg)
                queueError = True
                continue

            try:
                output   = res['stdout']
                error    = res['stderr']
                idList   = res['idList']
                exitCode = res['exitCode']
            except KeyError as ex:
                msg =  "Error in finding key from result pipe\n"
                msg += "Something has gone crticially wrong in the worker\n"
                try:
                    msg += "Result: %s\n" % str(res)
                except:
                    pass
                msg += str(ex)
                logging.error(msg)
                queueError = True
                continue

            if not exitCode == 0:
                logging.error("Condor returned non-zero.  Printing out command stderr")
                logging.error(error)
                errorCheck, errorMsg = parseError(error = error)
                logging.error("Processing failed jobs and proceeding to the next jobs.")
                logging.error("Do not restart component.")
            else:
                errorCheck = None

            if errorCheck:
                self.errorCount += 1
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            job['fwjr'] = condorErrorReport
                            failedJobs.append(job)
                            break
            else:
                if self.errorCount > 0:
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break

            # If we get a lot of errors in a row it's probably time to
            # report this to the operators.
            if self.errorCount > self.errorThreshold:
                try:
                    msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                    logging.error(msg)
                    logging.error("Reporting to Alert system and continuing to process jobs")
                    from WMCore.Alerts import API as alertAPI
                    preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                                     compName = "BossAirCondorPlugin")
                    sendAlert = alertAPI.getSendAlert(sender = sender,
                                                      preAlert = preAlert)
                    sendAlert(6, msg = msg)
                    sender.unregister()
                    self.errorCount = 0
                except:
                    # There's nothing we can really do here
                    pass

        # Remove JDL files unless commanded otherwise
        if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True):
            for f in jdlFiles:
                os.remove(f)

        # When we're finished, clean up the queue workers in order
        # to free up memory (in the midst of the process, the forked
        # memory space shouldn't be touched, so it should still be
        # shared, but after this point any action by the Submitter will
        # result in memory duplication).
        logging.info("Purging worker pool to clean up memory")
        self.close()


        # We must return a list of jobs successfully submitted,
        # and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in CondorPlugin")
        return successfulJobs, failedJobs
Ejemplo n.º 12
0
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break

            # If we get a lot of errors in a row it's probably time to
            # report this to the operators.
            if self.errorCount > self.errorThreshold:
                try:
                    msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                    logging.error(msg)
                    logging.error("Reporting to Alert system and continuing to process jobs")
                    from WMCore.Alerts import API as alertAPI
                    preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                                     compName = "BossAirCondorPlugin")
                    sendAlert = alertAPI.getSendAlert(sender = sender,
                                                      preAlert = preAlert)
                    sendAlert(6, msg = msg)
                    sender.unregister()
                    self.errorCount = 0
                except:
                    # There's nothing we can really do here
                    pass

        # Remove JDL files unless commanded otherwise
        if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True):
            for f in jdlFiles:
                os.remove(f)

        # When we're finished, clean up the queue workers in order
Ejemplo n.º 13
0
class WorkQueue(WorkQueueBase):
    """
    _WorkQueue_

    WorkQueue object - interface to WorkQueue functionality.
    """
    def __init__(self, logger=None, dbi=None, **params):

        WorkQueueBase.__init__(self, logger, dbi)
        self.parent_queue = None
        self.params = params

        # config argument (within params) shall be reference to
        # Configuration instance (will later be checked for presence of "Alert")
        self.config = params.get("Config", None)
        self.params.setdefault('CouchUrl', os.environ.get('COUCHURL'))
        if not self.params.get('CouchUrl'):
            raise RuntimeError, 'CouchUrl config value mandatory'
        self.params.setdefault('DbName', 'workqueue')
        self.params.setdefault('InboxDbName', self.params['DbName'] + '_inbox')
        self.params.setdefault('ParentQueueCouchUrl',
                               None)  # We get work from here

        self.backend = WorkQueueBackend(self.params['CouchUrl'],
                                        self.params['DbName'],
                                        self.params['InboxDbName'],
                                        self.params['ParentQueueCouchUrl'],
                                        self.params.get('QueueURL'),
                                        logger=self.logger)
        if self.params.get('ParentQueueCouchUrl'):
            try:
                self.parent_queue = WorkQueueBackend(
                    self.params['ParentQueueCouchUrl'].rsplit('/', 1)[0],
                    self.params['ParentQueueCouchUrl'].rsplit('/', 1)[1])
            except IndexError, ex:
                # Probable cause: Someone didn't put the global WorkQueue name in
                # the ParentCouchUrl
                msg = "Parsing failure for ParentQueueCouchUrl - probably missing dbname in input\n"
                msg += "Exception: %s\n" % str(ex)
                msg += str("ParentQueueCouchUrl: %s\n" %
                           self.params['ParentQueueCouchUrl'])
                self.logger.error(msg)
                raise WorkQueueError(msg)
            self.params['ParentQueueCouchUrl'] = self.parent_queue.queueUrl

        self.params.setdefault(
            "GlobalDBS",
            "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet")
        self.params.setdefault('QueueDepth',
                               0.5)  # when less than this locally
        self.params.setdefault('LocationRefreshInterval', 600)
        self.params.setdefault('FullLocationRefreshInterval', 7200)
        self.params.setdefault('TrackLocationOrSubscription', 'subscription')
        self.params.setdefault('ReleaseIncompleteBlocks', False)
        self.params.setdefault('ReleaseRequireSubscribed', True)
        self.params.setdefault('PhEDExEndpoint', None)
        self.params.setdefault('PopulateFilesets', True)
        self.params.setdefault('LocalQueueFlag', True)
        self.params.setdefault('QueueRetryTime', 86400)
        self.params.setdefault('stuckElementAlertTime', 86400)
        self.params.setdefault('reqmgrCompleteGraceTime', 604800)
        self.params.setdefault('cancelGraceTime', 604800)

        self.params.setdefault('JobDumpConfig', None)
        self.params.setdefault('BossAirConfig', None)

        self.params[
            'QueueURL'] = self.backend.queueUrl  # url this queue is visible on
        # backend took previous QueueURL and sanitized it
        self.params.setdefault('WMBSUrl',
                               None)  # this will only be set on local Queue
        if self.params.get('WMBSUrl'):
            self.params['WMBSUrl'] = Lexicon.sanitizeURL(
                self.params['WMBSUrl'])['url']
        self.params.setdefault('Teams', [])
        self.params.setdefault('DrainMode', False)
        if self.params.get('CacheDir'):
            try:
                os.makedirs(self.params['CacheDir'])
            except OSError:
                pass
        elif self.params.get('PopulateFilesets'):
            raise RuntimeError, 'CacheDir mandatory for local queue'

        self.params.setdefault('SplittingMapping', {})
        self.params['SplittingMapping'].setdefault('DatasetBlock', {
            'name': 'Block',
            'args': {}
        })
        self.params['SplittingMapping'].setdefault('MonteCarlo', {
            'name': 'MonteCarlo',
            'args': {}
        })
        self.params['SplittingMapping'].setdefault('Dataset', {
            'name': 'Dataset',
            'args': {}
        })
        self.params['SplittingMapping'].setdefault('Block', {
            'name': 'Block',
            'args': {}
        })
        self.params['SplittingMapping'].setdefault('ResubmitBlock', {
            'name': 'ResubmitBlock',
            'args': {}
        })

        self.params.setdefault('EndPolicySettings', {})

        assert (self.params['TrackLocationOrSubscription']
                in ('subscription', 'location'))
        # Can only release blocks on location
        if self.params['TrackLocationOrSubscription'] == 'location':
            if self.params['SplittingMapping']['DatasetBlock'][
                    'name'] != 'Block':
                raise RuntimeError, 'Only blocks can be released on location'

        if self.params.get('PhEDEx'):
            self.phedexService = self.params['PhEDEx']
        else:
            phedexArgs = {}
            if self.params.get('PhEDExEndpoint'):
                phedexArgs['endpoint'] = self.params['PhEDExEndpoint']
            self.phedexService = PhEDEx(phedexArgs)

        if self.params.get('SiteDB'):
            self.SiteDB = self.params['SiteDB']
        else:
            self.SiteDB = SiteDB()

        if type(self.params['Teams']) in types.StringTypes:
            self.params['Teams'] = [x.strip() for x in \
                                    self.params['Teams'].split(',')]

        self.dataLocationMapper = WorkQueueDataLocationMapper(
            self.logger,
            self.backend,
            phedex=self.phedexService,
            sitedb=self.SiteDB,
            locationFrom=self.params['TrackLocationOrSubscription'],
            incompleteBlocks=self.params['ReleaseIncompleteBlocks'],
            requireBlocksSubscribed=not self.params['ReleaseIncompleteBlocks'],
            fullRefreshInterval=self.params['FullLocationRefreshInterval'],
            updateIntervalCoarseness=self.params['LocationRefreshInterval'])

        # initialize alerts sending client (self.sendAlert() method)
        # usage: self.sendAlert(levelNum, msg = msg) ; level - integer 1 .. 10
        #    1 - 4 - lower levels ; 5 - 10 higher levels
        preAlert, self.alertSender = \
            alertAPI.setUpAlertsMessaging(self, compName = "WorkQueueManager")
        self.sendAlert = alertAPI.getSendAlert(sender=self.alertSender,
                                               preAlert=preAlert)

        self.logger.debug("WorkQueue created successfully")
Ejemplo n.º 14
0

import sys
import time

from WMCore.Alerts import API as alertAPI
from WMCore.Alerts.Alert import Alert
from WMCore.Alerts.ZMQ.Sender import Sender

machine = "maxatest.cern.ch"

target = "tcp://%s:6557" % machine
targetController = "tcp://%s:6559" % machine
if len(sys.argv) > 2:
    target = sys.argv[1]
    targetController = sys.argv[2]

dictAlert = dict(Type="AlertTestClient", Workload="n/a", Component=__name__, Source=__name__)
preAlert = alertAPI.getPredefinedAlert(**dictAlert)
sender = Sender(target, targetController, "AlertTestClient")
print ("created Sender client for alerts target: %s  controller: %s" % (target, targetController))


sender.register()
a = Alert(**preAlert)
a["Timestamp"] = time.time()
a["Level"] = 6
print "sending alert:\n'%s'" % a
sender(a)
sender.unregister()
Ejemplo n.º 15
0
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break

            # If we get a lot of errors in a row it's probably time to
            # report this to the operators.
            if self.errorCount > self.errorThreshold:
                try:
                    msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                    logging.error(msg)
                    logging.error("Reporting to Alert system and continuing to process jobs")
                    from WMCore.Alerts import API as alertAPI
                    preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                                     compName = "BossAirPyCondorPlugin")
                    sendAlert = alertAPI.getSendAlert(sender = sender,
                                                      preAlert = preAlert)
                    sendAlert(6, msg = msg)
                    sender.unregister()
                    self.errorCount = 0
                except:
                    # There's nothing we can really do here
                    pass

        # Remove JDL files unless commanded otherwise
        if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True):
            for f in jdlFiles:
                os.remove(f)

        # When we're finished, clean up the queue workers in order
Ejemplo n.º 16
0
import time

from WMCore.Alerts import API as alertAPI
from WMCore.Alerts.Alert import Alert
from WMCore.Alerts.ZMQ.Sender import Sender

machine = "maxatest.cern.ch"

target = "tcp://%s:6557" % machine
targetController = "tcp://%s:6559" % machine
if len(sys.argv) > 2:
    target = sys.argv[1]
    targetController = sys.argv[2]

dictAlert = dict(Type="AlertTestClient",
                 Workload="n/a",
                 Component=__name__,
                 Source=__name__)
preAlert = alertAPI.getPredefinedAlert(**dictAlert)
sender = Sender(target, targetController, "AlertTestClient")
print("created Sender client for alerts target: %s  controller: %s" %
      (target, targetController))

sender.register()
a = Alert(**preAlert)
a["Timestamp"] = time.time()
a["Level"] = 6
print "sending alert:\n'%s'" % a
sender(a)
sender.unregister()
Ejemplo n.º 17
0
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """

        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.submitDir  = self.config.JobSubmitter.submitDir
        timeout         = getattr(self.config.JobSubmitter, 'getTimeout', 400)

        successfulJobs = []
        failedJobs     = []
        jdlFiles       = []

        if len(jobs) == 0:
            # Then was have nothing to do
            return successfulJobs, failedJobs

        if len(self.pool) == 0:
            # Starting things up
            # This is obviously a submit API
            logging.info("Starting up CondorPlugin worker pool")
            self.input    = multiprocessing.Queue()
            self.result   = multiprocessing.Queue()
            for x in range(self.nProcess):
                p = multiprocessing.Process(target = submitWorker,
                                            args = (self.input, self.result, timeout))
                p.start()
                self.pool.append(p)

        if not os.path.exists(self.submitDir):
            os.makedirs(self.submitDir)


        # Now assume that what we get is the following; a mostly
        # unordered list of jobs with random sandboxes.
        # We intend to sort them by sandbox.

        submitDict = {}
        nSubmits   = 0
        for job in jobs:
            sandbox = job['sandbox']
            if not sandbox in submitDict.keys():
                submitDict[sandbox] = []
            submitDict[sandbox].append(job)


        # Now submit the bastards
        queueError = False
        for sandbox in submitDict.keys():
            jobList = submitDict.get(sandbox, [])
            idList = [x['jobid'] for x in jobList]
            if queueError:
                # If the queue has failed, then we must not process
                # any more jobs this cycle.
                continue
            while len(jobList) > 0:
                jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker]
                jobList   = jobList[self.config.JobSubmitter.jobsPerWorker:]
                idList    = [x['id'] for x in jobsReady]
                jdlList = self.makeSubmit(jobList = jobsReady)
                if not jdlList or jdlList == []:
                    # Then we got nothing
                    logging.error("No JDL file made!")
                    return {'NoResult': [0]}
                jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
                handle = open(jdlFile, 'w')
                handle.writelines(jdlList)
                handle.close()
                jdlFiles.append(jdlFile)

                # Now submit them
                logging.info("About to submit %i jobs" %(len(jobsReady)))
                if self.glexecPath:
                    command = 'CS=`which condor_submit`; '
                    if self.glexecWrapScript:
                        command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                    command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                    command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                    command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                    command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                    if self.glexecUnwrapScript:
                        command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
                    else:
                        command += '%s $CS %s' % (self.glexecPath, jdlFile)
                else:
                    command = "condor_submit %s" % jdlFile

                try:
                    self.input.put({'command': command, 'idList': idList})
                except AssertionError as ex:
                    msg =  "Critical error: input pipeline probably closed.\n"
                    msg += str(ex)
                    msg += "Error Procedure: Something critical has happened in the worker process\n"
                    msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                    msg += "Then refresh the worker pool\n"
                    logging.error(msg)
                    queueError = True
                    break
                nSubmits += 1

        # Now we should have sent all jobs to be submitted
        # Going to do the rest of it now
        for n in range(nSubmits):
            try:
                res = self.result.get(block = True, timeout = timeout)
            except Queue.Empty:
                # If the queue was empty go to the next submit
                # Those jobs have vanished
                logging.error("Queue.Empty error received!")
                logging.error("This could indicate a critical condor error!")
                logging.error("However, no information of any use was obtained due to process failure.")
                logging.error("Either process failed, or process timed out after %s seconds." % timeout)
                queueError = True
                continue
            except AssertionError as ex:
                msg =  "Found Assertion error while retrieving output from worker process.\n"
                msg += str(ex)
                msg += "This indicates something critical happened to a worker process"
                msg += "We will recover what jobs we know were submitted, and resubmit the rest"
                msg += "Refreshing worker pool at end of loop"
                logging.error(msg)
                queueError = True
                continue

            try:
                output   = res['stdout']
                error    = res['stderr']
                idList   = res['idList']
                exitCode = res['exitCode']
            except KeyError as ex:
                msg =  "Error in finding key from result pipe\n"
                msg += "Something has gone critically wrong in the worker\n"
                try:
                    msg += "Result: %s\n" % str(res)
                except:
                    pass
                msg += str(ex)
                logging.error(msg)
                queueError = True
                continue

            if not exitCode == 0:
                logging.error("Condor returned non-zero.  Printing out command stderr")
                logging.error(error)
                errorCheck, errorMsg = parseError(error = error)
                logging.error("Processing failed jobs and proceeding to the next jobs.")
                logging.error("Do not restart component.")
            else:
                errorCheck = None

            if errorCheck:
                self.errorCount += 1
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            job['fwjr'] = condorErrorReport
                            failedJobs.append(job)
                            break
            else:
                if self.errorCount > 0:
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break

            # If we get a lot of errors in a row it's probably time to
            # report this to the operators.
            if self.errorCount > self.errorThreshold:
                try:
                    msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                    logging.error(msg)
                    logging.error("Reporting to Alert system and continuing to process jobs")
                    from WMCore.Alerts import API as alertAPI
                    preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                                     compName = "BossAirCondorPlugin")
                    sendAlert = alertAPI.getSendAlert(sender = sender,
                                                      preAlert = preAlert)
                    sendAlert(6, msg = msg)
                    sender.unregister()
                    self.errorCount = 0
                except:
                    # There's nothing we can really do here
                    pass

        # Remove JDL files unless commanded otherwise
        if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True):
            for f in jdlFiles:
                os.remove(f)

        # When we're finished, clean up the queue workers in order
        # to free up memory (in the midst of the process, the forked
        # memory space shouldn't be touched, so it should still be
        # shared, but after this point any action by the Submitter will
        # result in memory duplication).
        logging.info("Purging worker pool to clean up memory")
        self.close()


        # We must return a list of jobs successfully submitted,
        # and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in CondorPlugin")
        return successfulJobs, failedJobs