def __init__(self, conf):
    """Build a runner from ``conf``.

    ``conf`` only needs a dict-style ``get(key, default)``. The constructor
    sets up the logger, a green thread pool, the Redis and MongoDB clients,
    the alerting thresholds, the notification plugins and the statsd
    emitter, in that order.
    """
    self.conf = conf
    self.name = 'stalker-runner-%d' % os.getpid()

    # Logging goes to syslog unless another log_type is configured.
    log_kind = conf.get('log_type', 'syslog')
    log_path = conf.get('log_file', '/var/log/stalker/stalker-runner.log')
    if log_kind != 'syslog':
        self.logger = get_logger(self.name, log_path=log_path)
    else:
        self.logger = get_syslogger(conf, self.name)

    self.pool = eventlet.GreenPool()
    # Value sent as the X-CHECK-KEY header when a check is executed.
    self.check_key = conf.get('check_key', 'canhazstatus')

    # Redis connection and the name of the list checks are popped from.
    r_host = conf.get('redis_host', '127.0.0.1')
    r_port = int(conf.get('redis_port', '6379'))
    r_password = conf.get('redis_password', '')
    r_socket = conf.get('redis_socket', None)
    self.wq = conf.get('worker_id', 'worker1')
    self.rc = redis.Redis(r_host, r_port, password=r_password,
                          unix_socket_path=r_socket)

    # MongoDB connection and the collections used by the runner.
    m_host = conf.get('mongo_host', '127.0.0.1')
    m_port = int(conf.get('mongo_port', '27017'))
    database = conf.get('db_name', 'stalkerweb')
    self.c = MongoClient(host=m_host, port=m_port)
    self.debug = False
    self.db = self.c[database]
    for collection in ('checks', 'state_log', 'notifications'):
        setattr(self, collection, self.db[collection])

    # Integer settings: (attribute, config key, default).
    int_settings = (
        ('host_window', 'host_flood_window', '60'),
        ('host_threshold', 'host_flood_threshold', '5'),
        ('flood_window', 'dc_flood_window', '120'),
        ('flood_threshold', 'dc_flood_threshold', '100'),
        ('flap_window', 'flap_window', '1200'),
        ('flap_threshold', 'flap_threshold', '5'),
        ('alert_threshold', 'alert_threshold', '3'),
        ('urlopen_timeout', 'urlopen_timeout', '15'),
    )
    for attr, key, default in int_settings:
        setattr(self, attr, int(conf.get(key, default)))

    self.notify_plugins = {}
    self._load_notification_plugins(conf)
    self.statsd = StatsdEvent(conf, self.logger, 'stalker_runner.')
def __init__(self, conf):
    """Build a manager from ``conf``.

    ``conf`` only needs a dict-style ``get(key, default)``. The constructor
    sets up the file logger, the Redis and MongoDB clients, the scan
    settings, the statsd emitter and a zeroed metrics dict.
    """
    self.conf = conf
    log_path = conf.get('log_path', '/var/log/stalker/stalker-manager.log')
    self.logger = get_logger('stalker_manager', log_path=log_path)

    # Redis connection and the configured queue name.
    r_host = conf.get('redis_host', '127.0.0.1')
    r_port = int(conf.get('redis_port', '6379'))
    r_password = conf.get('redis_password', '')
    r_socket = conf.get('redis_socket', None)
    self.wq = conf.get('qname', 'worker1')
    self.rc = redis.Redis(r_host, r_port, password=r_password,
                          unix_socket_path=r_socket)

    # MongoDB connection and the collections used by the manager.
    m_host = conf.get('mongo_host', '127.0.0.1')
    m_port = int(conf.get('mongo_port', '27017'))
    database = conf.get('db_name', 'stalkerweb')
    self.c = MongoClient(host=m_host, port=m_port)
    self.db = self.c[database]
    for collection in ('checks', 'notifications'):
        setattr(self, collection, self.db[collection])

    self.scan_interval = int(conf.get('scan_interval', '5'))
    self.pause_file = conf.get('pause_file', '/tmp/.sm-pause')
    self.shuffle_on_start = True
    self.statsd = StatsdEvent(conf, self.logger, 'stalker_manager.')
    # One slot per gauge, all starting at zero.
    gauge_names = ('checks', 'pending', 'suspended', 'failing',
                   'flapping', 'qsize')
    self.metrics = dict((gauge, 0) for gauge in gauge_names)
class StalkerRunner(object):
    """Worker that executes queued checks and raises/clears alerts.

    Checks are popped from a Redis list, executed with an HTTPS request to
    port 5050 of the target, written back to the MongoDB ``checks``
    collection, and state changes are forwarded to the enabled
    notification plugins.
    """

    def __init__(self, conf):
        """Build a runner from ``conf`` (needs dict-style ``get``)."""
        self.conf = conf
        self.name = 'stalker-runner-%d' % os.getpid()

        # Logging goes to syslog unless another log_type is configured.
        log_type = conf.get('log_type', 'syslog')
        log_file = conf.get('log_file',
                            '/var/log/stalker/stalker-runner.log')
        if log_type == 'syslog':
            self.logger = get_syslogger(conf, self.name)
        else:
            self.logger = get_logger(self.name, log_path=log_file)

        self.pool = eventlet.GreenPool()
        # Value sent as the X-CHECK-KEY header when a check is executed.
        self.check_key = conf.get('check_key', 'canhazstatus')

        redis_host = conf.get('redis_host', '127.0.0.1')
        redis_port = int(conf.get('redis_port', '6379'))
        redis_pass = conf.get('redis_password', '')
        redis_usock = conf.get('redis_socket', None)
        # Name of the Redis list this runner pops checks from.
        self.wq = conf.get('worker_id', 'worker1')
        self.rc = redis.Redis(redis_host, redis_port, password=redis_pass,
                              unix_socket_path=redis_usock)

        mongo_host = conf.get('mongo_host', '127.0.0.1')
        mongo_port = int(conf.get('mongo_port', '27017'))
        db_name = conf.get('db_name', 'stalkerweb')
        self.c = MongoClient(host=mongo_host, port=mongo_port)
        self.debug = False
        self.db = self.c[db_name]
        self.checks = self.db['checks']
        self.state_log = self.db['state_log']
        self.notifications = self.db['notifications']

        # Windows are compared against time() deltas / used as Redis
        # expiry, thresholds are plain counts.
        self.host_window = int(conf.get('host_flood_window', '60'))
        self.host_threshold = int(conf.get('host_flood_threshold', '5'))
        self.flood_window = int(conf.get('dc_flood_window', '120'))
        self.flood_threshold = int(conf.get('dc_flood_threshold', '100'))
        self.flap_window = int(conf.get('flap_window', '1200'))
        self.flap_threshold = int(conf.get('flap_threshold', '5'))
        self.alert_threshold = int(conf.get('alert_threshold', '3'))
        self.urlopen_timeout = int(conf.get('urlopen_timeout', '15'))

        self.notify_plugins = {}
        self._load_notification_plugins(conf)
        self.statsd = StatsdEvent(conf, self.logger, 'stalker_runner.')

    def _load_notification_plugins(self, conf):
        """Load any enabled notification plugins.

        Plugin classes are imported lazily so that a disabled plugin's
        module-level requirements are never touched.
        """
        if conf.get('mailgun_enable', 'n').lower() in TRUE_VALUES:
            from stalker_notifications import Mailgun
            self.notify_plugins['mailgun'] = Mailgun(
                conf=conf, logger=self.logger, redis_client=self.rc)
        if conf.get('pagerduty_enable', 'n').lower() in TRUE_VALUES:
            from stalker_notifications import PagerDuty
            self.notify_plugins['pagerduty'] = PagerDuty(
                conf=conf, logger=self.logger, redis_client=self.rc)
        if conf.get('smtplib_enable', 'n').lower() in TRUE_VALUES:
            from stalker_notifications import EmailNotify
            self.notify_plugins['email_notify'] = EmailNotify(
                conf=conf, logger=self.logger, redis_client=self.rc)

    def _get_checks(self, max_count=100, max_time=1, timeout=1):
        """Gather some checks off the Redis queue and batch them up.

        Blocks until at least one check is available, then returns as soon
        as ``max_count`` items are collected, ``max_time`` seconds have
        passed, or a pop times out.

        :param max_count: largest batch to return
        :param max_time: seconds after which a non-empty batch is returned
        :param timeout: timeout handed to each ``blpop`` call
        :returns: list of raw ``blpop`` results
        """
        checks = []
        expire_time = time() + max_time
        while len(checks) != max_count:
            if checks and time() > expire_time:
                # max_time exceeded: hand back what we have so far
                return checks
            item = self.rc.blpop(self.wq, timeout=timeout)
            eventlet.sleep()
            if item:
                checks.append(item)
                self.logger.debug("grabbed check")
            elif checks:
                # queue ran dry, flush the partial batch
                return checks
            # otherwise: still nothing at all, keep waiting
        return checks

    def _exec_check(self, url):
        """Actually execute a check on the remote host.

        :param url: full URL of the check endpoint
        :returns: the decoded response body
        :raises Exception: on an empty body; urllib2/decoder errors
                           propagate unchanged
        """
        req = urllib2.Request(url, headers={'X-CHECK-KEY': self.check_key})
        response = urllib2.urlopen(req, timeout=self.urlopen_timeout)
        try:
            content = response.read()
        finally:
            # always release the connection, even if read() fails
            response.close()
        if not content:
            raise Exception("No content")
        return loads(content)

    @staticmethod
    def _parse_result(result, check_name):
        """Extract one check's outcome from a decoded agent response.

        :param result: decoded response, expected to map ``check_name``
                       to a dict carrying at least ``status``
        :param check_name: name of the check that was executed
        :returns: dict with ``status``, ``out`` and ``err`` (the latter two
                  default to an empty string when missing or None)
        :raises ValueError: if the response does not have that shape
        """
        try:
            entry = result[check_name]
            status = entry['status']
            out = entry.get('out') or ''
            err = entry.get('err') or ''
        except (KeyError, IndexError, TypeError, AttributeError):
            raise ValueError('Malformed result for check %s' % check_name)
        return {'status': status, 'out': out, 'err': err}

    def _flap_incr(self, flapid):
        """Increment the flap counter for a check and refresh its expiry."""
        pipe = self.rc.pipeline()
        pipe.multi()
        pipe.incr(flapid)
        pipe.expire(flapid, self.flap_window)
        pipe.execute()

    def _log_state_change(self, check):
        """Log that a state change occurred in the state_log table."""
        try:
            self.state_log.insert({'hostname': check['hostname'],
                                   'check': check['check'],
                                   'cid': check['_id'],
                                   'status': check['status'],
                                   'last': check['last'],
                                   'out': check['out']})
        except Exception:
            # best effort: a failed audit write must not stop the check
            self.logger.exception('Error writing to state_log')

    def host_ncount(self, hostname):
        """Get a count of how many outstanding notifications a host has."""
        return self.notifications.find({'hostname': hostname}).count()

    def host_flood(self, hostname):
        """Return True if ``hostname`` has more than ``host_threshold``
        notifications newer than ``host_window`` seconds."""
        count = self.notifications.find(
            {"ts": {"$gt": time() - self.host_window},
             "hostname": hostname}).count()
        flooding = count > self.host_threshold
        if flooding:
            self.logger.info('Host flood detected. Suppressing alerts for %s'
                             % hostname)
        return flooding

    def global_flood(self):
        """Return True if more than ``flood_threshold`` notifications are
        newer than ``flood_window`` seconds, across all hosts."""
        count = self.notifications.find(
            {"ts": {"$gt": time() - self.flood_window}}).count()
        flooding = count > self.flood_threshold
        if flooding:
            self.logger.info('Global alert flood detected. Suppressing alerts')
        return flooding

    def flapping(self, flapid):
        """Return True if the flap counter has reached ``flap_threshold``.

        A missing counter is treated as zero.
        """
        flap_count = int(self.rc.get(flapid) or 0)
        self.logger.debug('%s %d' % (flapid, flap_count))
        return flap_count >= self.flap_threshold

    def _dispatch(self, action, check, error_msg):
        """Call ``action`` ('fail' or 'clear') on every loaded plugin.

        A plugin that raises is logged with ``error_msg`` and does not
        prevent the remaining plugins from being called.
        """
        for plugin in self.notify_plugins.values():
            try:
                getattr(plugin, action)(check)
            except Exception:
                self.logger.exception(error_msg)

    def _emit_fail(self, check):
        """Emit a failure event via the notification plugins."""
        self.logger.info('alert %s' % check)
        self._dispatch('fail', check, 'Error emitting failure')

    def _emit_clear(self, check):
        """Emit a clear event via the notification plugins."""
        self.logger.info('cleared %s' % check)
        self._dispatch('clear', check, 'Error emitting clear')

    def check_failed(self, check):
        """Perform failure notifications if required.

        A notification row is recorded the first time a host/check pair
        fails; plugins are only called when neither the host nor the
        global flood limit is exceeded.
        """
        existing = self.notifications.find_one(
            {'hostname': check['hostname'], 'check': check['check']})
        if existing:
            self.logger.debug('Notification entry already exists!')
            return
        entry = {'cid': check['_id'], 'hostname': check['hostname'],
                 'check': check['check'], 'ts': time(), 'cleared': False}
        try:
            self.notifications.insert(entry)
        except Exception:
            self.logger.exception('Error updating notifications table!')
            return
        if not self.host_flood(check['hostname']) and \
                not self.global_flood():
            self._emit_fail(check)

    def check_cleared(self, check):
        """Perform clear notifications if required.

        The clear event is emitted even when removing the notification
        row fails; the failure is only logged.
        """
        existing = self.notifications.find_one(
            {'hostname': check['hostname'], 'check': check['check']})
        if not existing:
            self.logger.debug('No notification entry to clear')
            return
        try:
            self.notifications.remove({'cid': check['_id']})
        except Exception:
            self.logger.exception('Error removing notifications entry.')
        self._emit_clear(check)

    def emit_host_flood_alert(self, hostname):
        """Emit a host level flood alert via the notification plugins."""
        check = {'hostname': hostname, 'check': 'host_alert_flood',
                 'out': 'Host level alert flood detected!'}
        self._dispatch('fail', check, 'Error emitting failure')

    def emit_flood_alert(self):
        """Emit a flood notification event via the notification plugins."""
        check = {'hostname': 'alertflood', 'check': 'dc_alert_flood',
                 'out': 'DC wide alert flood detected!'}
        self._dispatch('fail', check, 'Error emitting failure')

    def state_has_changed(self, check, previous_status):
        """Determine if a state has changed, and update the state log
        accordingly.

        :returns: True if ``check['status']`` differs from
                  ``previous_status``
        """
        state_changed = check['status'] != previous_status
        if state_changed:
            self.logger.debug('%s:%s state changed.' % (check['hostname'],
                                                        check['check']))
            self._log_state_change(check)
            self.statsd.counter('state_change')
        else:
            self.logger.debug('%s:%s state unchanged.' % (check['hostname'],
                                                          check['check']))
        return state_changed

    def state_change(self, check, previous_status):
        """Handle check result state changes."""
        state_changed = self.state_has_changed(check, previous_status)
        if check['status'] is True and state_changed is True:
            self.check_cleared(check)
        elif check['status'] is False:
            # we don't check if state_changed to allow for alert escalations
            # at a later date. In the mean time this means check_failed gets
            # called everytime a check is run and fails.
            self.logger.info('%s:%s failure # %d' % (check['hostname'],
                                                     check['check'],
                                                     check['fail_count']))
            if check['flapping']:
                self.logger.info(
                    '%s:%s is flapping - skipping fail/clear event' %
                    (check['hostname'], check['check']))
                # emit_flap notification
            elif check['fail_count'] >= self.alert_threshold:
                self.check_failed(check)

    def run_check(self, payload):
        """Run a check and process its result.

        :param payload: a ``blpop`` result; element 1 is the encoded check
        :returns: True if the check document was updated, False otherwise
        """
        check = loads(payload[1])
        check_name = check['check']
        flapid = "flap:%s:%s" % (check['hostname'], check_name)
        previous_status = check['status']
        try:
            raw = self._exec_check('https://%s:5050/%s' % (check['ip'],
                                                           check_name))
            # Parsing happens inside the try so a malformed response is
            # recorded as a failed check instead of raising out of the
            # green pool and leaving the check pending.
            result = self._parse_result(raw, check_name)
        except Exception as err:
            result = {'status': 2, 'out': '', 'err': str(err)}
            self.statsd.counter('checks.error')

        query = {'_id': ObjectId(check['_id'])}
        output = result['out'] + result['err']
        if result['status'] == 0:
            if previous_status is False:
                self._flap_incr(flapid)
            update = {"$set": {'pending': False, 'status': True,
                               'flapping': self.flapping(flapid),
                               'next': time() + check['interval'],
                               'last': time(),
                               'out': output,
                               'fail_count': 0}}
            self.statsd.counter('checks.passed')
        else:
            # check is failing
            if previous_status is True:
                self._flap_incr(flapid)
            if 'follow_up' not in check:
                # continue to work with old schema
                check['follow_up'] = check['interval']
            update = {"$set": {'pending': False, 'status': False,
                               'flapping': self.flapping(flapid),
                               'next': time() + check['follow_up'],
                               'last': time(),
                               'out': output},
                      "$inc": {'fail_count': 1}}
            self.statsd.counter('checks.failed')
        try:
            response = self.checks.find_and_modify(query=query,
                                                   update=update, new=True)
        except Exception:
            response = None
            self.logger.exception('Error on check find_and_modify:')
        if not response:
            return False
        self.state_change(response, previous_status)
        return True

    def start(self):
        """Main loop: fetch batches of checks and run them in the pool."""
        while 1:
            self.logger.debug("Checking queue for work")
            checks = self._get_checks()
            if checks:
                count = len(checks)
                self.logger.debug("Got %d checks" % count)
                self.statsd.counter('queue.get', count)
                try:
                    check_result = list(self.pool.imap(self.run_check,
                                                       checks))
                    self.logger.debug(check_result)
                except Exception:
                    self.logger.exception('Error running checks')
            else:
                self.logger.debug('No checks, sleeping')
            # zero-length sleep: cooperative yield to other green threads
            eventlet.sleep()
class StalkerManager(object):
    """Scheduler that finds due checks in MongoDB and queues them in Redis.

    Each scan marks due checks as pending, pushes them onto the Redis work
    queue and publishes a set of gauges to Redis and statsd.
    """

    def __init__(self, conf):
        """Build a manager from ``conf`` (needs dict-style ``get``)."""
        self.conf = conf
        log_file = conf.get('log_path',
                            '/var/log/stalker/stalker-manager.log')
        self.logger = get_logger('stalker_manager', log_path=log_file)

        redis_host = conf.get('redis_host', '127.0.0.1')
        redis_port = int(conf.get('redis_port', '6379'))
        redis_pass = conf.get('redis_password', '')
        redis_usock = conf.get('redis_socket', None)
        # NOTE(review): the configured queue name is stored here, but
        # queue_len/queue_check/sanitize below use the literal 'worker1'.
        # Confirm whether 'qname' is meant to be honoured.
        self.wq = conf.get('qname', 'worker1')
        self.rc = redis.Redis(redis_host, redis_port, password=redis_pass,
                              unix_socket_path=redis_usock)

        mongo_host = conf.get('mongo_host', '127.0.0.1')
        mongo_port = int(conf.get('mongo_port', '27017'))
        db_name = conf.get('db_name', 'stalkerweb')
        self.c = MongoClient(host=mongo_host, port=mongo_port)
        self.db = self.c[db_name]
        self.checks = self.db['checks']
        self.notifications = self.db['notifications']

        self.scan_interval = int(conf.get('scan_interval', '5'))
        self.pause_file = conf.get('pause_file', '/tmp/.sm-pause')
        self.shuffle_on_start = True
        self.statsd = StatsdEvent(conf, self.logger, 'stalker_manager.')
        self.metrics = {'checks': 0, 'pending': 0, 'suspended': 0,
                        'failing': 0, 'flapping': 0, 'qsize': 0}

    def _collect_metrics(self):
        """Refresh ``self.metrics`` and publish it to the log, Redis and
        statsd."""
        self.metrics['checks'] = self.checks.count()
        self.metrics['pending'] = self.checks.find(
            {'pending': True}).count()
        self.metrics['suspended'] = self.checks.find(
            {'suspended': True}).count()
        self.metrics['failing'] = self.checks.find(
            {'status': False}).count()
        self.metrics['flapping'] = self.checks.find(
            {'flapping': True}).count()
        self.metrics['qsize'] = self.queue_len()
        self.logger.info("stats: %s" % self.metrics)
        self.rc.mset(self.metrics)
        self.statsd.batch_gauge(self.metrics, prefix='stalker.')

    def startup_shuffle(self):
        """Spread overdue checks over the next 1-600 seconds.

        Reshuffles all checks that need to be done right now and schedules
        them for a future time, i.e. if the stalker-manager was offline
        for an extended period of time. Does nothing when
        ``shuffle_on_start`` is false.
        """
        if not self.shuffle_on_start:
            return
        count = 0
        for overdue in self.checks.find({'next': {"$lt": time()}}):
            self.checks.update({'_id': overdue['_id']},
                               {"$set": {"next": time() + randint(1, 600)}})
            count += 1
        self.logger.info('Reshuffled %d checks on startup.' % count)

    def pause_if_asked(self):
        """Check if pause file exists and sleep until it is removed."""
        if not exists(self.pause_file):
            return
        self.logger.info('Pausing')
        while exists(self.pause_file):
            sleep(1)
        self.logger.info('Pause removed')

    def queue_len(self, q='worker1'):
        """Return # of items in queue ``q``."""
        return self.rc.llen(q)

    def queue_check(self, i):
        """Queue up a check for the stalker_runners."""
        # if we had multiple stalker_runners we could roundrobin q's
        self.rc.rpush('worker1', dumps(i))

    def sanitize(self, flush_queued=True):
        """Reset checks marked pending but not actually in progress.

        i.e. redis died, or services were kill -9'd.

        :param flush_queued: also delete the Redis work queue first
        :raises Exception: if the bulk update reports an error
        """
        pending = [x['_id'] for x in self.checks.find(
            {'pending': True}, fields={'_id': True})]
        self.logger.warning('Found %d pending items' % len(pending))
        if flush_queued:
            self.rc.delete('worker1')
        # the reset itself is the same whether or not the queue was flushed
        outcome = self.checks.update({'pending': True},
                                     {'$set': {'pending': False}},
                                     multi=True)
        if outcome['err']:
            raise Exception('Error clearing pendings')

    def scan_checks(self):
        """Scan the checks db for checks that need to run, mark them as
        pending and then drop'em on the q for the runner."""
        self.pause_if_asked()
        qcount = 0
        due = self.checks.find({'next': {"$lt": time()},
                                'pending': False,
                                'suspended': False})
        for check in due:
            try:
                marked = self.checks.update({'_id': check['_id']},
                                            {"$set": {'pending': True}})
                if marked['updatedExisting']:
                    self.queue_check(check)
                    qcount += 1
            except Exception as err:
                # NOTE(review): the recovery path sets pending to True
                # again without queueing; the check then stays pending
                # until sanitize() runs. Confirm that False was not
                # intended here.
                try:
                    self.checks.update({'_id': check['_id']},
                                       {"$set": {'pending': True}})
                except Exception as err2:
                    self.logger.error(err2)
                self.logger.error(err)
        if qcount > 0:
            self.logger.info('Queued %d checks' % qcount)
            self.statsd.counter('queue.put', qcount)
        # NOTE(review): metrics are refreshed on every scan, not only when
        # something was queued - confirm against the intended nesting.
        self._collect_metrics()

    def start(self):
        """Sanitize, reshuffle, then scan forever.

        A failing scan is logged with its traceback and the loop still
        waits ``scan_interval`` seconds before retrying, so a persistent
        error cannot turn into a busy loop.
        """
        self.logger.info('starting up')
        self.sanitize()
        self.startup_shuffle()
        while 1:
            try:
                self.scan_checks()
            except Exception:
                self.logger.exception('Error scanning checks')
            sleep(self.scan_interval)