Example #1
0
 def __init__(self, conf):
     """Initialise the runner from a configuration mapping.

     Creates the logger, green thread pool, Redis and Mongo handles, reads
     the flood/flap/alert thresholds, loads the enabled notification
     plugins and finally builds the statsd emitter.

     :param conf: mapping read with ``conf.get(option, default)``
     """
     def int_option(option, default):
         # Numeric settings are always coerced through int().
         return int(conf.get(option, default))

     self.conf = conf
     self.name = 'stalker-runner-%d' % os.getpid()

     # Logging backend: syslog unless ``log_type`` says otherwise.
     backend = conf.get('log_type', 'syslog')
     logfile = conf.get('log_file', '/var/log/stalker/stalker-runner.log')
     if backend != 'syslog':
         self.logger = get_logger(self.name, log_path=logfile)
     else:
         self.logger = get_syslogger(conf, self.name)

     self.pool = eventlet.GreenPool()
     self.check_key = conf.get('check_key', 'canhazstatus')

     # Redis connection settings and client.
     self.wq = conf.get('worker_id', 'worker1')
     self.rc = redis.Redis(conf.get('redis_host', '127.0.0.1'),
                           int_option('redis_port', '6379'),
                           password=conf.get('redis_password', ''),
                           unix_socket_path=conf.get('redis_socket', None))

     # Mongo client, database and the collections used by the runner.
     self.debug = False
     self.c = MongoClient(host=conf.get('mongo_host', '127.0.0.1'),
                          port=int_option('mongo_port', '27017'))
     self.db = self.c[conf.get('db_name', 'stalkerweb')]
     self.checks = self.db['checks']
     self.state_log = self.db['state_log']
     self.notifications = self.db['notifications']

     # Flood / flap / alert tuning knobs.
     self.host_window = int_option('host_flood_window', '60')
     self.host_threshold = int_option('host_flood_threshold', '5')
     self.flood_window = int_option('dc_flood_window', '120')
     self.flood_threshold = int_option('dc_flood_threshold', '100')
     self.flap_window = int_option('flap_window', '1200')
     self.flap_threshold = int_option('flap_threshold', '5')
     self.alert_threshold = int_option('alert_threshold', '3')
     self.urlopen_timeout = int_option('urlopen_timeout', '15')

     # Plugins need self.logger and self.rc, so they are loaded last.
     self.notify_plugins = {}
     self._load_notification_plugins(conf)
     self.statsd = StatsdEvent(conf, self.logger, 'stalker_runner.')
Example #2
0
 def __init__(self, conf):
     """Initialise the manager from a configuration mapping.

     Creates the file logger, the Redis and Mongo handles, reads the scan
     settings and prepares the statsd emitter and the metrics dict.

     :param conf: mapping read with ``conf.get(option, default)``
     """
     def int_option(option, default):
         # Numeric settings are always coerced through int().
         return int(conf.get(option, default))

     self.conf = conf
     self.logger = get_logger(
         'stalker_manager',
         log_path=conf.get('log_path',
                           '/var/log/stalker/stalker-manager.log'))

     # Redis connection settings and client.
     self.wq = conf.get('qname', 'worker1')
     self.rc = redis.Redis(conf.get('redis_host', '127.0.0.1'),
                           int_option('redis_port', '6379'),
                           password=conf.get('redis_password', ''),
                           unix_socket_path=conf.get('redis_socket', None))

     # Mongo client, database and the collections used by the manager.
     self.c = MongoClient(host=conf.get('mongo_host', '127.0.0.1'),
                          port=int_option('mongo_port', '27017'))
     self.db = self.c[conf.get('db_name', 'stalkerweb')]
     self.checks = self.db['checks']
     self.notifications = self.db['notifications']

     # Scan loop settings.
     self.scan_interval = int_option('scan_interval', '5')
     self.pause_file = conf.get('pause_file', '/tmp/.sm-pause')
     self.shuffle_on_start = True

     self.statsd = StatsdEvent(conf, self.logger, 'stalker_manager.')
     # Every gauge starts at zero.
     self.metrics = dict.fromkeys(
         ('checks', 'pending', 'suspended', 'failing', 'flapping', 'qsize'),
         0)
Example #3
0
class StalkerRunner(object):
    """Worker that pops queued checks off Redis, runs them and records results.

    Each check is executed over HTTPS (port 5050) against the target host,
    the outcome is written to the Mongo ``checks`` collection, state changes
    are appended to ``state_log`` and failures/clears are forwarded to the
    enabled notification plugins.
    """

    def __init__(self, conf):
        """Set up logging, Redis/Mongo handles, thresholds and plugins.

        :param conf: mapping read with ``conf.get(option, default)``; numeric
                     options are coerced with ``int()``
        """
        self.conf = conf
        self.name = 'stalker-runner-%d' % os.getpid()
        log_type = conf.get('log_type', 'syslog')
        log_file = conf.get('log_file', '/var/log/stalker/stalker-runner.log')
        if log_type == 'syslog':
            self.logger = get_syslogger(conf, self.name)
        else:
            self.logger = get_logger(self.name, log_path=log_file)
        self.pool = eventlet.GreenPool()
        self.check_key = conf.get('check_key', 'canhazstatus')
        redis_host = conf.get('redis_host', '127.0.0.1')
        redis_port = int(conf.get('redis_port', '6379'))
        redis_pass = conf.get('redis_password', '')
        redis_usock = conf.get('redis_socket', None)
        # name of the Redis list this runner pops work from
        self.wq = conf.get('worker_id', 'worker1')
        self.rc = redis.Redis(redis_host, redis_port, password=redis_pass,
                              unix_socket_path=redis_usock)
        mongo_host = conf.get('mongo_host', '127.0.0.1')
        mongo_port = int(conf.get('mongo_port', '27017'))
        db_name = conf.get('db_name', 'stalkerweb')
        self.c = MongoClient(host=mongo_host, port=mongo_port)
        self.debug = False
        self.db = self.c[db_name]
        self.checks = self.db['checks']
        self.state_log = self.db['state_log']
        self.notifications = self.db['notifications']
        self.host_window = int(conf.get('host_flood_window', '60'))
        self.host_threshold = int(conf.get('host_flood_threshold', '5'))
        self.flood_window = int(conf.get('dc_flood_window', '120'))
        self.flood_threshold = int(conf.get('dc_flood_threshold', '100'))
        self.flap_window = int(conf.get('flap_window', '1200'))
        self.flap_threshold = int(conf.get('flap_threshold', '5'))
        self.alert_threshold = int(conf.get('alert_threshold', '3'))
        self.urlopen_timeout = int(conf.get('urlopen_timeout', '15'))
        # plugins need self.logger and self.rc, so load them after both exist
        self.notify_plugins = {}
        self._load_notification_plugins(conf)
        self.statsd = StatsdEvent(conf, self.logger, 'stalker_runner.')

    def _load_notification_plugins(self, conf):
        """Load any enabled notification plugins into ``self.notify_plugins``.

        A plugin is enabled when its ``<name>_enable`` option, lower-cased,
        is in ``TRUE_VALUES``. Plugin classes are imported lazily so a
        disabled plugin is never imported.
        """
        if conf.get('mailgun_enable', 'n').lower() in TRUE_VALUES:
            from stalker_notifications import Mailgun
            mailgun = Mailgun(
                conf=conf, logger=self.logger, redis_client=self.rc)
            self.notify_plugins['mailgun'] = mailgun
        if conf.get('pagerduty_enable', 'n').lower() in TRUE_VALUES:
            from stalker_notifications import PagerDuty
            pagerduty = PagerDuty(conf=conf, logger=self.logger,
                                  redis_client=self.rc)
            self.notify_plugins['pagerduty'] = pagerduty
        if conf.get('smtplib_enable', 'n').lower() in TRUE_VALUES:
            from stalker_notifications import EmailNotify
            email_notify = EmailNotify(conf=conf, logger=self.logger,
                                       redis_client=self.rc)
            self.notify_plugins['email_notify'] = email_notify

    def _get_checks(self, max_count=100, max_time=1, timeout=1):
        """Gather some checks off the Redis queue and batch them up.

        Blocks until at least one check is available, then keeps collecting
        until ``max_count`` items were gathered, ``max_time`` has elapsed or
        the queue yields nothing within ``timeout``.

        :returns: list of raw ``blpop`` results
        """
        checks = []
        expire_time = time() + max_time
        while len(checks) < max_count:
            if checks and time() > expire_time:
                # exceeded max_time: return what we've got so far
                break
            item = self.rc.blpop(self.wq, timeout=timeout)
            eventlet.sleep()
            if item:
                checks.append(item)
                self.logger.debug("grabbed check")
            elif checks:
                # queue drained; hand over the partial batch
                break
            # otherwise: still nothing at all, keep waiting
        return checks

    def _exec_check(self, url):
        """Actually execute a check on the remote host.

        :param url: full URL of the check endpoint
        :returns: the response body decoded with ``loads``
        :raises Exception: when the response body is empty; errors raised by
                           ``urllib2`` or ``loads`` propagate unchanged
        """
        req = urllib2.Request(url, headers={'X-CHECK-KEY': self.check_key})
        response = urllib2.urlopen(req, timeout=self.urlopen_timeout)
        try:
            content = response.read()
        finally:
            # always release the connection, even if read() fails
            response.close()
        if not content:
            raise Exception("No content")
        return loads(content)

    def _check_outcome(self, check):
        """Run ``check`` remotely and normalise the reply.

        Any failure (connection error, empty body, undecodable or malformed
        reply) is reported as status ``2`` with the error text as output, and
        counted as ``checks.error``.

        :returns: ``(status, output)`` where ``output`` is the check's
                  ``out`` and ``err`` concatenated
        """
        check_name = check['check']
        try:
            result = self._exec_check('https://%s:5050/%s' % (check['ip'],
                                                              check_name))
            try:
                outcome = result[check_name]
                return outcome['status'], outcome['out'] + outcome['err']
            except (KeyError, TypeError):
                raise ValueError('Malformed result for check %s' % check_name)
        except Exception as err:
            self.statsd.counter('checks.error')
            return 2, str(err)

    def _flap_incr(self, flapid):
        """Increment the flap counter ``flapid`` and refresh its expiry."""
        pipe = self.rc.pipeline()
        pipe.multi()
        pipe.incr(flapid)
        pipe.expire(flapid, self.flap_window)
        pipe.execute()

    def _log_state_change(self, check):
        """Log that a state change occurred in the state_log collection.

        Write errors are logged and swallowed (best effort).
        """
        try:
            self.state_log.insert({'hostname': check['hostname'],
                                   'check': check['check'],
                                   'cid': check['_id'],
                                   'status': check['status'],
                                   'last': check['last'],
                                   'out': check['out']})
        except Exception:
            self.logger.exception('Error writing to state_log')

    def host_ncount(self, hostname):
        """Get a count of how many outstanding notifications a host has."""
        return self.notifications.find({'hostname': hostname}).count()

    def host_flood(self, hostname):
        """Check if a host is flooding.

        :returns: True when the host has more than ``host_threshold``
                  notifications newer than ``time() - host_window``
        """
        count = self.notifications.find(
            {"ts": {"$gt": time() - self.host_window},
             "hostname": hostname}).count()
        if count > self.host_threshold:
            self.logger.info(
                'Host flood detected. Suppressing alerts for %s' % hostname)
            return True
        return False

    def global_flood(self):
        """Check if we're experiencing a global alert flood.

        :returns: True when more than ``flood_threshold`` notifications are
                  newer than ``time() - flood_window``
        """
        count = self.notifications.find(
            {"ts": {"$gt": time() - self.flood_window}}).count()
        if count > self.flood_threshold:
            self.logger.info('Global alert flood detected. Suppressing alerts')
            return True
        return False

    def flapping(self, flapid):
        """Check if a check is flapping.

        :returns: True when the Redis counter ``flapid`` (0 if missing) has
                  reached ``flap_threshold``
        """
        flap_count = int(self.rc.get(flapid) or 0)
        self.logger.debug('%s %d' % (flapid, flap_count))
        return flap_count >= self.flap_threshold

    def _notify_plugins(self, event, check, error_msg):
        """Call ``plugin.<event>(check)`` on every loaded plugin.

        A failing plugin is logged with ``error_msg`` and does not stop the
        remaining plugins from being notified.
        """
        for plugin in self.notify_plugins.values():
            try:
                getattr(plugin, event)(check)
            except Exception:
                self.logger.exception(error_msg)

    def _emit_fail(self, check):
        """Emit a failure event via the notification plugins."""
        self.logger.info('alert %s' % check)
        self._notify_plugins('fail', check, 'Error emitting failure')

    def _emit_clear(self, check):
        """Emit a clear event via the notification plugins."""
        self.logger.info('cleared %s' % check)
        self._notify_plugins('clear', check, 'Error emitting clear')

    def check_failed(self, check):
        """Perform failure notifications if required.

        A notification entry is recorded once per hostname/check pair; the
        plugins are only called when neither a host flood nor a global flood
        is in progress.
        """
        if self.notifications.find_one({'hostname': check['hostname'],
                                        'check': check['check']}):
            self.logger.debug('Notification entry already exists!')
            return
        n = {'cid': check['_id'], 'hostname': check['hostname'],
             'check': check['check'], 'ts': time(), 'cleared': False}
        try:
            self.notifications.insert(n)
        except Exception:
            self.logger.exception('Error updating notifications table!')
            return
        if not self.host_flood(check['hostname']) and not self.global_flood():
            self._emit_fail(check)

    def check_cleared(self, check):
        """Perform clear notifications if required.

        The clear event is emitted even if removing the notification entry
        fails (the removal error is logged).
        """
        if not self.notifications.find_one({'hostname': check['hostname'],
                                            'check': check['check']}):
            self.logger.debug('No notification entry to clear')
            return
        try:
            self.notifications.remove({'cid': check['_id']})
        except Exception:
            self.logger.exception('Error removing notifications entry.')
        self._emit_clear(check)

    def emit_host_flood_alert(self, hostname):
        """Emit a host level flood alert via the notification plugins."""
        check = {'hostname': hostname, 'check': 'host_alert_flood',
                 'out': 'Host level alert flood detected!'}
        self._notify_plugins('fail', check, 'Error emitting failure')

    def emit_flood_alert(self):
        """Emit a flood notification event via the notification plugins."""
        check = {'hostname': 'alertflood', 'check': 'dc_alert_flood',
                 'out': 'DC wide alert flood detected!'}
        self._notify_plugins('fail', check, 'Error emitting failure')

    def state_has_changed(self, check, previous_status):
        """Determine if a state has changed, and update state log accordingly.

        :returns: True when ``check['status']`` differs from
                  ``previous_status``
        """
        state_changed = check['status'] != previous_status
        if state_changed:
            self.logger.debug('%s:%s state changed.' % (check['hostname'],
                                                       check['check']))
            self._log_state_change(check)
            self.statsd.counter('state_change')
        else:
            self.logger.debug('%s:%s state unchanged.' % (check['hostname'],
                                                          check['check']))
        return state_changed

    def state_change(self, check, previous_status):
        """Handle check result state changes.

        :param check: the updated check document
        :param previous_status: status the check had before this run
        """
        state_changed = self.state_has_changed(check, previous_status)
        if check['status'] is True and state_changed is True:
            self.check_cleared(check)
        elif check['status'] is False:
            # we don't check if state_changed to allow for alert escalations
            # at a later date. In the mean time this means check_failed gets
            # called everytime a check is run and fails.
            self.logger.info('%s:%s failure # %d' % (check['hostname'],
                                                     check['check'],
                                                     check['fail_count']))
            if check['flapping']:
                self.logger.info('%s:%s is flapping - skipping fail/clear event' %
                                (check['hostname'], check['check']))
                #emit_flap notification
            elif check['fail_count'] >= self.alert_threshold:
                self.check_failed(check)

    def run_check(self, payload):
        """Run a check and process its result.

        :param payload: ``blpop`` result; ``payload[1]`` is the serialised
                        check document
        :returns: True when the check document was updated, False otherwise
        """
        check = loads(payload[1])
        check_name = check['check']
        flapid = "flap:%s:%s" % (check['hostname'], check_name)
        previous_status = check['status']
        # never raises for remote/format problems, so the check document is
        # always updated below and cannot get stuck in the pending state
        status, output = self._check_outcome(check)
        query = {'_id': ObjectId(check['_id'])}
        if status == 0:
            if previous_status is False:
                self._flap_incr(flapid)
            now = time()
            update = {"$set": {'pending': False, 'status': True,
                               'flapping': self.flapping(flapid),
                               'next': now + check['interval'],
                               'last': now,
                               'out': output,
                               'fail_count': 0}}
            self.statsd.counter('checks.passed')
        else:  # check is failing
            if previous_status is True:
                self._flap_incr(flapid)
            # continue to work with old schema that has no follow_up
            follow_up = check.get('follow_up', check['interval'])
            now = time()
            update = {"$set": {'pending': False, 'status': False,
                               'flapping': self.flapping(flapid),
                               'next': now + follow_up,
                               'last': now,
                               'out': output},
                      "$inc": {'fail_count': 1}}
            self.statsd.counter('checks.failed')
        try:
            response = self.checks.find_and_modify(query=query, update=update,
                                                   new=True)
        except Exception:
            response = None
            self.logger.exception('Error on check find_and_modify:')
        if response:
            self.state_change(response, previous_status)
            return True
        return False

    def start(self):
        """Main loop: fetch batches of checks and run them on the pool."""
        while 1:
            self.logger.debug("Checking queue for work")
            checks = self._get_checks()
            if checks:
                count = len(checks)
                self.logger.debug("Got %d checks" % count)
                self.statsd.counter('queue.get', count)
                try:
                    check_result = list(self.pool.imap(self.run_check,
                                                       checks))
                    self.logger.debug(check_result)
                except Exception:
                    self.logger.exception('Error running checks')
            else:
                self.logger.debug('No checks, sleeping')
            eventlet.sleep()
Example #4
0
class StalkerManager(object):
    """Scheduler that finds due checks in Mongo and queues them on Redis.

    Due checks are flagged ``pending`` in the ``checks`` collection and
    pushed onto the Redis list named by ``self.wq``.
    """

    def __init__(self, conf):
        """Set up logging, Redis/Mongo handles and scan settings.

        :param conf: mapping read with ``conf.get(option, default)``; numeric
                     options are coerced with ``int()``
        """
        self.conf = conf
        log_file = conf.get('log_path', '/var/log/stalker/stalker-manager.log')
        self.logger = get_logger('stalker_manager', log_path=log_file)
        redis_host = conf.get('redis_host', '127.0.0.1')
        redis_port = int(conf.get('redis_port', '6379'))
        redis_pass = conf.get('redis_password', '')
        redis_usock = conf.get('redis_socket', None)
        # name of the Redis list checks are queued on
        self.wq = conf.get('qname', 'worker1')
        self.rc = redis.Redis(redis_host, redis_port, password=redis_pass,
                              unix_socket_path=redis_usock)
        mongo_host = conf.get('mongo_host', '127.0.0.1')
        mongo_port = int(conf.get('mongo_port', '27017'))
        db_name = conf.get('db_name', 'stalkerweb')
        self.c = MongoClient(host=mongo_host, port=mongo_port)
        self.db = self.c[db_name]
        self.checks = self.db['checks']
        self.notifications = self.db['notifications']
        self.scan_interval = int(conf.get('scan_interval', '5'))
        self.pause_file = conf.get('pause_file', '/tmp/.sm-pause')
        self.shuffle_on_start = True
        self.statsd = StatsdEvent(conf, self.logger, 'stalker_manager.')
        self.metrics = {'checks': 0, 'pending': 0, 'suspended': 0,
                        'failing': 0, 'flapping': 0, 'qsize': 0}

    def _collect_metrics(self):
        """Refresh ``self.metrics`` and publish it to the log, Redis, statsd."""
        self.metrics['checks'] = self.checks.count()
        self.metrics['pending'] = self.checks.find({'pending': True}).count()
        self.metrics['suspended'] = self.checks.find({'suspended':
                                                     True}).count()
        self.metrics['failing'] = self.checks.find({'status': False}).count()
        self.metrics['flapping'] = self.checks.find({'flapping': True}).count()
        self.metrics['qsize'] = self.queue_len()
        self.logger.info("stats: %s" % self.metrics)
        self.rc.mset(self.metrics)
        self.statsd.batch_gauge(self.metrics, prefix='stalker.')

    def startup_shuffle(self):
        """Spread overdue checks over the next 1-600 seconds.

        Reshuffles all checks that are due right now and schedules them for
        a random future time, i.e. for when the stalker-manager was offline
        for an extended period. Does nothing unless ``shuffle_on_start`` is
        set.
        """
        if not self.shuffle_on_start:
            return
        count = 0
        for i in self.checks.find({'next': {"$lt": time()}}):
            self.checks.update({'_id': i['_id']},
                               {"$set": {"next": time() + randint(1, 600)}})
            count += 1
        self.logger.info('Reshuffled %d checks on startup.' % count)

    def pause_if_asked(self):
        """Check if pause file exists and sleep until it's removed if so."""
        if exists(self.pause_file):
            self.logger.info('Pausing')
            while exists(self.pause_file):
                sleep(1)
            self.logger.info('Pause removed')

    def queue_len(self, q=None):
        """Return # of items in queue.

        :param q: queue name; defaults to the configured ``self.wq``
        """
        return self.rc.llen(self.wq if q is None else q)

    def queue_check(self, i):
        """Queue up a check for the stalker_runners on ``self.wq``."""
        # if we had multiple stalker_runners we could roundrobin q's
        self.rc.rpush(self.wq, dumps(i))

    def sanitize(self, flush_queued=True):
        """Clear checks marked pending that are not actually in progress.

        Covers e.g. redis dying or services being kill -9'd.

        :param flush_queued: also delete the Redis work queue
        :raises Exception: when the update reports an error
        """
        pending = [x['_id'] for x in self.checks.find(
            {'pending': True}, fields={'_id': True})]
        self.logger.warning('Found %d pending items' % len(pending))
        if flush_queued:
            self.rc.delete(self.wq)
        q = self.checks.update(
            {'pending': True}, {'$set': {'pending': False}}, multi=True)
        if q['err']:
            raise Exception('Error clearing pendings')

    def scan_checks(self):
        """Queue every check that is due.

        Scans the checks db for checks that need to run, marks them as
        pending and drops them on the queue for the runner. If marking or
        queueing fails the pending flag is reset so the check is retried on
        a later scan instead of staying pending without being queued.
        """
        self.pause_if_asked()
        qcount = 0
        for check in self.checks.find({'next': {"$lt": time()},
                                       'pending': False,
                                       'suspended': False}):
            by_id = {'_id': check['_id']}
            try:
                u = self.checks.update(by_id, {"$set": {'pending': True}})
                if u['updatedExisting']:
                    self.queue_check(check)
                    qcount += 1
            except Exception:
                self.logger.exception('Error queueing check')
                try:
                    self.checks.update(by_id, {"$set": {'pending': False}})
                except Exception:
                    self.logger.exception('Error resetting pending flag')
        if qcount > 0:
            self.logger.info('Queued %d checks' % qcount)
            self.statsd.counter('queue.put', qcount)
        self._collect_metrics()

    def start(self):
        """Main loop: sanitize, shuffle, then scan every ``scan_interval``."""
        self.logger.info('starting up')
        self.sanitize()
        self.startup_shuffle()
        while 1:
            try:
                self.scan_checks()
            except Exception:
                self.logger.exception('Error scanning checks')
            # sleep even after a failed scan so a persistent error
            # (e.g. database down) does not turn into a busy loop
            sleep(self.scan_interval)