def run(self):

    self.running = True

    # Connect to message queue
    self.mq = Messaging()
    self.mq.connect(callback=LoggerMessage(self.mq))
    self.mq.subscribe(destination=CONF.outbound_queue)

    while not self.shuttingdown:
        try:
            LOG.debug('Waiting for log messages...')
            time.sleep(CONF.loop_every)

            LOG.debug('Send heartbeat...')
            heartbeat = Heartbeat(version=Version)
            self.mq.send(heartbeat)

        except (KeyboardInterrupt, SystemExit):
            self.shuttingdown = True

    LOG.info('Shutdown request received...')
    self.running = False

    LOG.info('Disconnecting from message broker...')
    self.mq.disconnect()
def run(self):

    self.running = True

    # Connect to message queue
    self.mq = Messaging()
    self.mq.connect(callback=GangliaMessage(self.mq))

    while not self.shuttingdown:
        try:
            rules = init_rules()  # re-read rule config each time
            self.metric_check(rules)

            LOG.debug('Send heartbeat...')
            heartbeat = Heartbeat(version=Version)
            self.mq.send(heartbeat)

            LOG.debug('Waiting for next check run...')
            time.sleep(CONF.loop_every)

        except (KeyboardInterrupt, SystemExit):
            self.shuttingdown = True

    LOG.info('Shutdown request received...')
    self.running = False

    LOG.info('Disconnecting from message broker...')
    self.mq.disconnect()
def run(self):

    self.running = True

    # Create internal queue
    self.queue = Queue.Queue()

    # Connect to message queue
    self.mq = Messaging()
    self.mq.connect(callback=UrlmonMessage(self.mq))

    self.dedup = DeDup()
    self.carbon = Carbon()  # graphite metrics

    # Initialise alert rules
    urls = init_urls()

    # Start worker threads
    LOG.debug('Starting %s worker threads...', CONF.server_threads)
    for i in range(CONF.server_threads):
        w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon)
        try:
            w.start()
        except Exception as e:
            LOG.error('Worker thread #%s did not start: %s', i, e)
            continue
        LOG.info('Started worker thread: %s', w.getName())
class AlertaDaemon(Daemon):

    alerta_opts = {
        'forward_duplicate': 'no',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(AlertaDaemon.alerta_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        self.queue = Queue.Queue()  # Create internal queue
        self.db = Mongo()           # mongo database
        self.carbon = Carbon()      # carbon metrics
        self.statsd = StatsD()      # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=ServerMessage(self.mq, self.queue, self.statsd))
        self.mq.subscribe()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.statsd)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version, timeout=CONF.loop_every)
                self.mq.send(heartbeat)

                time.sleep(CONF.loop_every)

                LOG.info('Alert processing queue length is %d', self.queue.qsize())
                self.carbon.metric_send('alerta.alerts.queueLength', self.queue.qsize())
                self.db.update_queue_metric(self.queue.qsize())

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(CONF.server_threads):
            self.queue.put(None)
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
def run(self):

    self.running = True

    self.queue = Queue.Queue()  # Create internal queue
    self.db = Mongo()           # mongo database
    self.carbon = Carbon()      # carbon metrics
    self.statsd = StatsD()      # graphite metrics

    # Connect to message queue
    self.mq = Messaging()
    self.mq.connect(callback=ServerMessage(self.mq, self.queue, self.statsd))
    self.mq.subscribe()

    # Start worker threads
    LOG.debug('Starting %s worker threads...', CONF.server_threads)
    for i in range(CONF.server_threads):
        w = WorkerThread(self.mq, self.queue, self.statsd)
        try:
            w.start()
        except Exception as e:
            LOG.error('Worker thread #%s did not start: %s', i, e)
            continue
        LOG.info('Started worker thread: %s', w.getName())
def run(self):

    self.running = True

    # Connect to message queue
    self.mq = Messaging()
    self.mq.connect(callback=DynectMessage(self.mq))

    while not self.shuttingdown:
        try:
            self.queryDynect()

            if self.updating:
                self.alertDynect()
                self.last_info = self.info

            LOG.debug('Send heartbeat...')
            heartbeat = Heartbeat(version=Version)
            self.mq.send(heartbeat)

            LOG.debug('Waiting for next check run...')
            time.sleep(CONF.loop_every)

        except (KeyboardInterrupt, SystemExit):
            self.shuttingdown = True

    self.running = False
def run(self):

    self.running = True

    # Start token bucket thread
    self.tokens = LeakyBucket(tokens=20, rate=30)
    self.tokens.start()

    self.onhold = dict()

    # Connect to message queue
    self.mq = Messaging()
    self.mq.connect(callback=MailerMessage(self.mq, self.onhold, self.tokens))
    self.mq.subscribe(destination=CONF.outbound_topic)

    while not self.shuttingdown:
        try:
            LOG.debug('Send email messages...')
            for alertid in self.onhold.keys():
                try:
                    (mailAlert, hold_time) = self.onhold[alertid]
                except KeyError:
                    continue

                if time.time() > hold_time:
                    if not self.tokens.get_token():
                        LOG.warning('%s : No tokens left, rate limiting this alert', alertid)
                        continue

                    email = Mailer(mailAlert)
                    mail_to = CONF.mail_list.split(',')

                    if 'mailto' in mailAlert.tags:
                        mail_to.append(mailAlert.tags['mailto'])
                    email.send(mail_to=mail_to)

                    try:
                        del self.onhold[alertid]
                    except KeyError:
                        continue

            time.sleep(CONF.loop_every)

            LOG.debug('Send heartbeat...')
            heartbeat = Heartbeat(version=Version)
            self.mq.send(heartbeat)

        except (KeyboardInterrupt, SystemExit):
            self.shuttingdown = True

    LOG.info('Shutdown request received...')
    self.running = False

    self.tokens.shutdown()

    LOG.info('Disconnecting from message broker...')
    self.mq.disconnect()
class UrlmonDaemon(Daemon):

    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=UrlmonMessage(self.mq))

        self.dedup = DeDup()
        self.statsd = StatsD()  # graphite metrics

        # Initialise alert rules
        urls = init_urls()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.statsd)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                for url in urls:
                    self.queue.put(url)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.info('URL check queue length is %d', self.queue.qsize())

                time.sleep(CONF.loop_every)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(CONF.server_threads):
            self.queue.put(None)
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
class PagerDutyDaemon(Daemon):

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=PagerDutyMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_topic)  # TODO(nsatterl): use dedicated queue?

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for PagerDuty messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
class LoggerDaemon(Daemon):
    """
    Index alerts in ElasticSearch using Logstash format so that the
    logstash GUI and/or Kibana can be used as front-ends.
    """

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=LoggerMessage())
        self.mq.subscribe(destination=CONF.outbound_queue)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for log messages...')
                time.sleep(30)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat()
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
class MailerDaemon(Daemon):

    def run(self):

        self.running = True

        # Start token bucket thread
        self.tokens = LeakyBucket(tokens=20, rate=30)
        self.tokens.start()

        self.onhold = dict()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=MailerMessage(self.mq, self.onhold, self.tokens))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                LOG.debug('Send email messages...')
                for alertid in self.onhold.keys():
                    try:
                        (mailAlert, hold_time) = self.onhold[alertid]
                    except KeyError:
                        continue

                    if time.time() > hold_time:
                        if not self.tokens.get_token():
                            LOG.warning('%s : No tokens left, rate limiting this alert', alertid)
                            continue

                        email = Mailer(mailAlert)
                        mail_to = CONF.mail_list.split(',')

                        for tag in mailAlert.tags:
                            if tag.startswith('email'):
                                mail_to.append(tag.split(':')[1])
                        email.send(mail_to=mail_to)

                        try:
                            del self.onhold[alertid]
                        except KeyError:
                            continue

                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        self.tokens.shutdown()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
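# Note: MailerMessage (not shown in this section) is assumed to park each
# incoming alert in the shared 'onhold' dict as alertid -> (alert, hold_time),
# so that alerts which clear again within the hold period are never mailed.
# A minimal sketch of that producer side, under that assumption (the helper
# name and hold_seconds default are illustrative):

def hold_alert(onhold, alert, hold_seconds=30):
    # Park the alert; the MailerDaemon main loop mails it once
    # time.time() has passed the recorded hold_time.
    onhold[alert.get_id()] = (alert, time.time() + hold_seconds)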
class AlertaDaemon(Daemon):

    def run(self):

        self.running = True

        self.queue = Queue.Queue()  # Create internal queue
        self.statsd = StatsD()      # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=ServerMessage(self.mq, self.queue, self.statsd))
        self.mq.subscribe()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.statsd)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version, timeout=CONF.loop_every)
                self.mq.send(heartbeat)

                LOG.debug('Internal queue size is %s messages', self.queue.qsize())

                time.sleep(CONF.loop_every)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(CONF.server_threads):
            self.queue.put(None)
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
def run(self):

    self.running = True

    # Initialise alert config
    init_config()
    config_mod_time = os.path.getmtime(CONF.yaml_config)  # remember config mtime

    # Start token bucket thread
    _TokenThread = TokenTopUp()
    _TokenThread.start()

    # Start notify thread
    _NotifyThread = ReleaseThread()
    _NotifyThread.start()

    # Connect to message queue
    self.mq = Messaging()
    self.mq.connect(callback=NotifyMessage(self.mq))
    self.mq.subscribe(destination=CONF.outbound_topic)

    while not self.shuttingdown:
        try:
            # Read (or re-read) config as necessary
            if os.path.getmtime(CONF.yaml_config) != config_mod_time:
                init_config()
                config_mod_time = os.path.getmtime(CONF.yaml_config)

            LOG.debug('Waiting for email messages...')
            time.sleep(CONF.loop_every)

            LOG.debug('Send heartbeat...')
            heartbeat = Heartbeat(version=Version)
            self.mq.send(heartbeat)

        except (KeyboardInterrupt, SystemExit):
            self.shuttingdown = True
            _TokenThread.shutdown()
            _NotifyThread.shutdown()

    LOG.info('Shutdown request received...')
    self.running = False

    LOG.info('Disconnecting from message broker...')
    self.mq.disconnect()
class CloudWatchDaemon(Daemon):

    cloudwatch_opts = {
        'cloudwatch_sqs_region': 'eu-west-1',
        'cloudwatch_sqs_queue': 'cloudwatch-to-alerta',
        'cloudwatch_access_key': '022QF06E7MXBSAMPLE',
        'cloudwatch_secret_key': ''
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(CloudWatchDaemon.cloudwatch_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        self.statsd = StatsD()  # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=CloudWatchMessage(self.mq))

        self.dedup = DeDup(by_value=True)

        LOG.info('Connecting to SQS queue %s', CONF.cloudwatch_sqs_queue)
        try:
            sqs = boto.sqs.connect_to_region(
                CONF.cloudwatch_sqs_region,
                aws_access_key_id=CONF.cloudwatch_access_key,
                aws_secret_access_key=CONF.cloudwatch_secret_key
            )
        except boto.exception.SQSError as e:
            LOG.error('SQS API call failed: %s', e)
            sys.exit(1)

        try:
            q = sqs.create_queue(CONF.cloudwatch_sqs_queue)
            q.set_message_class(RawMessage)
        except boto.exception.SQSError as e:
            LOG.error('SQS queue error: %s', e)
            sys.exit(1)
def run(self):

    self.running = True

    # Connect to message queue
    self.mq = Messaging()
    self.mq.connect()

    # Initialise alert rules
    init_urls()
    url_mod_time = os.path.getmtime(URLFILE)

    # Start worker threads
    for i in range(NUM_THREADS):
        w = WorkerThread(queue)
        w.start()
        LOG.info('Starting thread: %s', w.getName())

    while not self.shuttingdown:
        try:
            # Read (or re-read) urls as necessary
            if os.path.getmtime(URLFILE) != url_mod_time:
                init_urls()
                url_mod_time = os.path.getmtime(URLFILE)

            for url in urls:
                queue.put(('url', url))
            queue.put(('timestamp', time.time()))

            LOG.debug('Send heartbeat...')
            heartbeat = Heartbeat()
            self.mq.send(heartbeat)

            time.sleep(_check_rate)

            urlmon_qsize = queue.qsize()
            LOG.info('URL check queue length is %d', urlmon_qsize)
            if GMETRIC_SEND:
                gmetric_cmd = "%s --name urlmon_qsize --value %d --type uint16 --units \" \" --slope both --group urlmon %s" % (
                    GMETRIC_CMD, urlmon_qsize, GMETRIC_OPTIONS)
                LOG.debug("%s", gmetric_cmd)
                os.system("%s" % gmetric_cmd)

        except (KeyboardInterrupt, SystemExit):
            self.shuttingdown = True

    LOG.info('Shutdown request received...')
    self.running = False

    for i in range(NUM_THREADS):
        queue.put(('stop', None))
    w.join()

    LOG.info('Disconnecting from message broker...')
    self.mq.disconnect()
def run(self):

    self.running = True

    self.statsd = StatsD()  # graphite metrics

    # Connect to message queue
    self.mq = Messaging()
    self.mq.connect(callback=CloudWatchMessage(self.mq))

    self.dedup = DeDup(by_value=True)

    LOG.info('Connecting to SQS queue %s', CONF.cloudwatch_sqs_queue)
    try:
        sqs = boto.sqs.connect_to_region(
            CONF.cloudwatch_sqs_region,
            aws_access_key_id=CONF.cloudwatch_access_key,
            aws_secret_access_key=CONF.cloudwatch_secret_key)
    except boto.exception.SQSError as e:
        LOG.error('SQS API call failed: %s', e)
        sys.exit(1)
def run(self):

    data = sys.stdin.read()
    LOG.info('snmptrapd -> %s', data)

    snmptrapAlert = self.parse_snmptrap(data)

    mq = Messaging()
    mq.connect()
    mq.send(snmptrapAlert)
    mq.disconnect()
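# Note: this handler is designed to be wired into net-snmp's snmptrapd, which
# pipes the details of each received trap to the configured program on stdin
# and runs it once per trap. A minimal snmptrapd.conf sketch (the install
# path is an assumption, not part of the original source):
#
#     # /etc/snmp/snmptrapd.conf
#     traphandle default /usr/local/bin/alert-snmptrap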
class NotifyDaemon(Daemon):

    def run(self):

        self.running = True

        # Initialise alert config
        init_config()
        config_mod_time = os.path.getmtime(CONF.yaml_config)  # remember config mtime

        # Start token bucket thread
        _TokenThread = TokenTopUp()
        _TokenThread.start()

        # Start notify thread
        _NotifyThread = ReleaseThread()
        _NotifyThread.start()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=NotifyMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                # Read (or re-read) config as necessary
                if os.path.getmtime(CONF.yaml_config) != config_mod_time:
                    init_config()
                    config_mod_time = os.path.getmtime(CONF.yaml_config)

                LOG.debug('Waiting for email messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True
                _TokenThread.shutdown()
                _NotifyThread.shutdown()

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
class AlertaDaemon(Daemon):

    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=ServerMessage(self.queue))
        self.mq.subscribe()

        # Start worker threads
        LOG.debug('Starting %s alert handler threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started alert handler thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                time.sleep(0.1)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True
                for i in range(CONF.server_threads):
                    self.queue.put(None)

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
class LoggerDaemon(Daemon):
    """
    Index alerts in ElasticSearch using Logstash format so that the
    logstash GUI and/or Kibana can be used as front-ends.
    """

    logger_opts = {
        'es_host': 'localhost',
        'es_port': 9200,
        'es_index': 'alerta-%Y.%m.%d',  # NB. Kibana config must match this index
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(LoggerDaemon.logger_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=LoggerMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_queue)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for log messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
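# Note: the 'es_index' option above is a strftime-style pattern, which yields
# a new index each day in the convention Logstash and Kibana expect. A minimal
# sketch of how such a pattern expands, assuming standard time.strftime
# semantics (the helper name is illustrative, not part of LoggerMessage):

import time

def current_es_index(pattern='alerta-%Y.%m.%d'):
    # e.g. 'alerta-2013.01.05' on 5 January 2013
    return time.strftime(pattern, time.gmtime())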
class PagerDutyDaemon(Daemon):

    pagerduty_opts = {
        'pagerduty_endpoint': 'https://events.pagerduty.com/generic/2010-04-15/create_event.json',
        'pagerduty_api_key': '',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(PagerDutyDaemon.pagerduty_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=PagerDutyMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_topic)  # TODO(nsatterl): use dedicated queue?

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for PagerDuty messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
def run(self):

    self.running = True

    # Create internal queue
    self.queue = Queue.Queue()

    # Connect to message queue
    self.mq = Messaging()
    self.mq.connect(callback=ServerMessage(self.queue))
    self.mq.subscribe()

    # Start worker threads
    LOG.debug('Starting %s alert handler threads...', CONF.server_threads)
    for i in range(CONF.server_threads):
        w = WorkerThread(self.mq, self.queue)
        try:
            w.start()
        except Exception as e:
            LOG.error('Worker thread #%s did not start: %s', i, e)
            continue
        LOG.info('Started alert handler thread: %s', w.getName())
def main(self):

    if CONF.heartbeat:
        msg = Heartbeat(
            origin=CONF.origin,
            version=__version__,
        )
    else:
        msg = Alert(
            resource=CONF.resource,
            event=CONF.event,
            correlate=CONF.correlate,
            group=CONF.group,
            value=CONF.value,
            severity=CONF.severity,
            environment=CONF.environment,
            service=CONF.service,
            text=CONF.text,
            event_type='exceptionAlert',  # TODO(nsatterl): make this configurable?
            tags=CONF.tags,
            origin=CONF.origin,
            threshold_info='n/a',  # TODO(nsatterl): make this configurable?
            timeout=CONF.timeout,
        )

    if CONF.dry_run:
        print msg
    else:
        LOG.debug('Message => %s', repr(msg))

        mq = Messaging()
        mq.connect()
        mq.send(msg)
        mq.disconnect()

    return msg.get_id()
class DynectDaemon(Daemon):

    dynect_opts = {
        'dynect_customer': '',
        'dynect_username': '',
        'dynect_password': '',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(DynectDaemon.dynect_opts)

        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last_info = {}
        self.updating = False
        self.dedup = DeDup(threshold=10)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=DynectMessage(self.mq))

        while not self.shuttingdown:
            try:
                self.queryDynect()

                if self.updating:
                    self.alertDynect()
                    self.last_info = self.info

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        self.running = False

    def alertDynect(self):

        for resource in self.info:

            if resource not in self.last_info:
                continue

            if resource.startswith('gslb-'):

                # gslb status = ok | unk | trouble | failover
                text = 'GSLB status is %s.' % self.info[resource]['status']

                if self.info[resource]['status'] == 'ok':
                    event = 'GslbOK'
                    severity = severity_code.NORMAL
                else:
                    event = 'GslbNotOK'
                    severity = severity_code.CRITICAL
                correlate = ['GslbOK', 'GslbNotOK']

            elif resource.startswith('pool-'):

                # pool status = up | unk | down
                # pool serve_mode = obey | always | remove | no
                # pool weight (1-15)

                if 'down' in self.info[resource]['status']:
                    event = 'PoolDown'
                    severity = severity_code.MAJOR
                    text = 'Pool is down'
                elif 'obey' not in self.info[resource]['status']:
                    event = 'PoolServe'
                    severity = severity_code.MAJOR
                    text = 'Pool with an incorrect serve mode'
                elif self.check_weight(self.info[resource]['gslb'], resource) is False:
                    event = 'PoolWeightError'
                    severity = severity_code.MINOR
                    text = 'Pool with an incorrect weight'
                else:
                    event = 'PoolUp'
                    severity = severity_code.NORMAL
                    text = 'Pool status is normal'
                correlate = ['PoolUp', 'PoolDown', 'PoolServe', 'PoolWeightError']

            else:
                LOG.warning('Unknown resource type: %s', resource)
                continue

            # Defaults
            group = 'GSLB'
            value = self.info[resource]['status']
            environment = ['PROD']
            service = ['Network']
            tags = dict()
            timeout = None
            threshold_info = None
            summary = None
            raw_data = self.info[resource]['rawData']

            dynectAlert = Alert(
                resource=resource,
                event=event,
                correlate=correlate,
                group=group,
                value=value,
                severity=severity,
                environment=environment,
                service=service,
                text=text,
                event_type='serviceAlert',
                tags=tags,
                timeout=timeout,
                threshold_info=threshold_info,
                summary=summary,
                raw_data=raw_data,
            )

            suppress = dynectAlert.transform_alert()
            if suppress:
                LOG.info('Suppressing %s alert', dynectAlert.event)
                LOG.debug('%s', dynectAlert)
                continue

            if self.dedup.is_send(dynectAlert):
                self.mq.send(dynectAlert)

    def check_weight(self, parent, resource):

        weight = self.info[resource]['status'].split(':')[2]
        for pool in [resource for resource in self.info
                     if resource.startswith('pool') and self.info[resource]['gslb'] == parent]:
            if self.info[pool]['status'].split(':')[1] == 'no':
                LOG.warning('Skipping %s because not serving for pool %s', pool, self.info[pool]['status'])
                continue

            LOG.debug('pool %s weight %s <=> %s', pool, self.info[pool]['status'].split(':')[2], weight)
            if self.info[pool]['status'].split(':')[2] != weight:
                return False
        return True

    def queryDynect(self):

        LOG.info('Query DynECT to get the state of GSLBs')

        try:
            rest_iface = DynectRest()
            if CONF.debug and CONF.use_stderr:
                rest_iface.verbose = True

            # login
            credentials = {
                'customer_name': CONF.dynect_customer,
                'user_name': CONF.dynect_username,
                'password': CONF.dynect_password,
            }
            LOG.debug('credentials = %s', credentials)
            response = rest_iface.execute('/Session/', 'POST', credentials)

            if response['status'] != 'success':
                LOG.error('Failed to create API session: %s', response['msgs'][0]['INFO'])
                self.updating = False
                return

            # Discover all the Zones in DynECT
            response = rest_iface.execute('/Zone/', 'GET')
            LOG.debug('/Zone/ => %s', json.dumps(response, indent=4))
            zone_resources = response['data']

            # Discover all the LoadBalancers
            for resource in zone_resources:
                zone = resource.split('/')[3]  # eg. /REST/Zone/guardiannews.com/
                response = rest_iface.execute('/LoadBalance/' + zone + '/', 'GET')
                LOG.debug('/LoadBalance/%s/ => %s', zone, json.dumps(response, indent=4))
                gslb = response['data']

                # Discover LoadBalancer pool information.
                for lb in gslb:
                    fqdn = lb.split('/')[4]  # eg. /REST/LoadBalance/guardiannews.com/id.guardiannews.com/
                    response = rest_iface.execute('/LoadBalance/' + zone + '/' + fqdn + '/', 'GET')
                    LOG.debug('/LoadBalance/%s/%s/ => %s', zone, fqdn, json.dumps(response, indent=4))
                    status = response['data']['status']
                    monitor = response['data']['monitor']
                    self.info['gslb-' + fqdn] = {'status': status, 'gslb': fqdn, 'rawData': monitor}

                    for pool in response['data']['pool']:
                        name = '%s-%s' % (fqdn, pool['label'].replace(' ', '-'))
                        status = '%s:%s:%s' % (pool['status'], pool['serve_mode'], pool['weight'])
                        self.info['pool-' + name] = {'status': status, 'gslb': fqdn, 'rawData': pool}

            LOG.info('Finished object discovery query.')
            LOG.debug('GSLBs and Pools: %s', json.dumps(self.info, indent=4))

            # logout
            rest_iface.execute('/Session/', 'DELETE')

        except Exception as e:
            LOG.error('Failed to discover GSLBs: %s', e)
            self.updating = False
            return  # don't let the fall-through below mark the data as fresh

        self.updating = True
class SolarWindsDaemon(Daemon):

    solarwinds_opts = {
        'solarwinds_host': 'localhost',
        'solarwinds_username': '******',
        'solarwinds_password': '',
        'solarwinds_group': 'websys',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(SolarWindsDaemon.solarwinds_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        while True:
            try:
                swis = SwisClient(username=CONF.solarwinds_username, password=CONF.solarwinds_password)
            except Exception as e:
                LOG.error('SolarWinds SWIS Client error: %s', e)
                time.sleep(30)
            else:
                break
        LOG.info('Polling for SolarWinds events on %s' % CONF.solarwinds_host)

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=SolarWindsMessage(self.mq))

        self.dedup = DeDup(by_value=True)

        while not self.shuttingdown:
            try:
                LOG.debug('Polling SolarWinds...')
                send_heartbeat = True

                # network, interface and volume events
                try:
                    events = swis.get_npm_events()
                except IOError:
                    events = []
                    send_heartbeat = False

                solarwindsAlerts = self.parse_events(events)
                for solarwindsAlert in solarwindsAlerts:
                    if self.dedup.is_send(solarwindsAlert):
                        self.mq.send(solarwindsAlert)

                # Cisco UCS events
                try:
                    events = swis.get_ucs_events()
                except IOError:
                    events = []
                    send_heartbeat = False

                solarwindsAlerts = self.parse_events(events)
                for solarwindsAlert in solarwindsAlerts:
                    if self.dedup.is_send(solarwindsAlert):
                        self.mq.send(solarwindsAlert)

                if send_heartbeat:
                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)
                else:
                    LOG.error('SolarWinds failure. Skipping heartbeat.')

                time.sleep(CONF.loop_every)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
class PingerDaemon(Daemon):

    pinger_opts = {
        'ping_file': '/etc/alerta/alert-pinger.targets',
        'ping_max_timeout': 15,  # seconds
        'ping_max_retries': 2,
        'ping_slow_warning': 5,    # ms
        'ping_slow_critical': 10,  # ms
        'server_threads': 20,
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(PingerDaemon.pinger_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=PingerMessage(self.mq))

        self.dedup = DeDup()
        self.carbon = Carbon()  # graphite metrics

        # Initialise ping targets
        ping_list = init_targets()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                for p in ping_list:
                    if 'targets' in p and p['targets']:
                        for target in p['targets']:
                            environment = p['environment']
                            service = p['service']
                            retries = p.get('retries', CONF.ping_max_retries)
                            self.queue.put((environment, service, target, retries, time.time()))

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                time.sleep(CONF.loop_every)

                LOG.info('Ping queue length is %d', self.queue.qsize())
                self.carbon.metric_send('alert.pinger.queueLength', self.queue.qsize())

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(CONF.server_threads):
            self.queue.put(None)
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
class UrlmonDaemon(Daemon):

    urlmon_opts = {
        'urlmon_file': '/etc/alerta/alert-urlmon.targets',
        'urlmon_max_timeout': 15,      # seconds
        'urlmon_slow_warning': 2000,   # ms
        'urlmon_slow_critical': 5000,  # ms
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(UrlmonDaemon.urlmon_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=UrlmonMessage(self.mq))

        self.dedup = DeDup()
        self.carbon = Carbon()  # graphite metrics

        # Initialise alert rules
        urls = init_urls()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                for url in urls:
                    self.queue.put((url, time.time()))

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                time.sleep(CONF.loop_every)

                LOG.info('URL check queue length is %d', self.queue.qsize())
                self.carbon.metric_send('alert.urlmon.queueLength', self.queue.qsize())

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(CONF.server_threads):
            self.queue.put(None)
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
class GangliaDaemon(Daemon):

    def __init__(self, prog):

        Daemon.__init__(self, prog)

        self.dedup = DeDup(by_value=True)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=GangliaMessage(self.mq))

        while not self.shuttingdown:
            try:
                rules = init_rules()  # re-read rule config each time
                self.metric_check(rules)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()

    def metric_check(self, rules):

        for rule in rules:

            # Check rule is valid
            if len(rule['thresholdInfo']) != len(rule['text']):
                LOG.warning('Skipping invalid rule %s - MUST define alert text for each threshold.', rule['event'])
                continue

            # Get list of metrics required to evaluate each rule
            params = dict()
            if 'filter' in rule and rule['filter'] is not None:
                params[rule['filter']] = 1

            for s in (' '.join(rule['text']), ' '.join(rule['thresholdInfo']), rule['value']):
                matches = re.findall('\$([a-z0-9A-Z_]+)', s)
                for m in matches:
                    if m != 'now':
                        params['metric=' + m] = 1
            metric_filter = '&'.join(params.keys())
            LOG.debug('Metric filter = %s', metric_filter)

            # Get metric data for each rule
            response = GangliaDaemon.get_metrics(metric_filter)
            LOG.debug('Ganglia API response: %s', response)

            # Make non-metric substitutions in value, thresholdInfo and text
            now = int(time.time())
            rule['value'] = re.sub('\$now', str(now), rule['value'])
            idx = 0
            for threshold in rule['thresholdInfo']:
                rule['thresholdInfo'][idx] = re.sub('\$now', str(now), threshold)
                idx += 1
            idx = 0
            for text in rule['text']:
                rule['text'][idx] = re.sub('\$now', time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(now)), text)
                idx += 1

            metric = dict()
            for m in response:

                # Make metric-based substitutions in resource eg. per instance, host or cluster
                resource = re.sub('\$instance', m.get('instance', '__NA__'), rule['resource'])
                resource = re.sub('\$host', m.get('host', '__NA__'), resource)
                resource = re.sub('\$cluster', m.get('cluster', '__NA__'), resource)

                if '__NA__' in resource:
                    LOG.debug('Metric %s does not match resource rule %s', m['id'], rule['resource'])
                    continue

                LOG.debug('Metric %s matches rule %s => %s', m['id'], rule['resource'], resource)

                # Don't generate cluster alerts from host-based metrics
                if 'host' in m and '$host' not in rule['resource']:
                    LOG.debug('Skipping host-based metric for cluster-based rule')
                    continue

                # Build up info for alert if metric value triggers threshold
                if resource not in metric:
                    metric[resource] = dict()

                if 'thresholdInfo' not in metric[resource]:
                    metric[resource]['thresholdInfo'] = list(rule['thresholdInfo'])
                    LOG.debug('Set thresholdInfo to %s', metric[resource]['thresholdInfo'])

                if 'text' not in metric[resource]:
                    metric[resource]['text'] = list(rule['text'])
                    LOG.debug('Set text to %s', metric[resource]['text'])

                if m['metric'] in rule['value']:

                    # Determine service and environment from rule if given
                    if 'environment' in rule:
                        metric[resource]['environment'] = [rule['environment']]
                    else:
                        metric[resource]['environment'] = [m['environment']]
                    LOG.debug('Set environment for alert to %s', metric[resource]['environment'])

                    if 'service' in rule:
                        metric[resource]['service'] = [rule['service']]
                    else:
                        metric[resource]['service'] = [m['service']]
                    LOG.debug('Set service for alert to %s', metric[resource]['service'])

                    # Use raw metric value, or sum or average if aggregated metric
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])  # raw value
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])  # aggregated sum value if "<metric>.sum"
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))  # average of aggregate value
                        except ZeroDivisionError:
                            v = 0.0

                    LOG.debug('Value for %s on %s is %s', m['id'], resource, v)

                    # If no value assign rule value
                    if 'value' not in metric[resource]:
                        metric[resource]['value'] = rule['value']
                    metric[resource]['value'] = re.sub('\$%s(\.sum)?' % m['metric'], str(v), metric[resource]['value'])
                    metric[resource]['units'] = m['units']

                # Assign tags
                metric[resource]['tags'] = list()
                metric[resource]['tags'].extend(rule['tags'])
                metric[resource]['tags'].append('cluster:%s' % m['cluster'])
                if 'tags' in m and m['tags'] is not None:
                    metric[resource]['tags'].extend(m['tags'])

                # Assign graph URL
                if 'graphUrl' not in metric[resource]:
                    metric[resource]['graphUrls'] = list()
                if 'graphUrl' in m:
                    metric[resource]['graphUrls'].append(m['graphUrl'])

                for g in rule['graphs']:
                    if '$host' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['graphUrls'].append(
                            '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                            '/graph.php?c=%s&h=%s&m=%s&r=1day&v=0&z=default' % (m['cluster'], m['host'], g))
                    if '$cluster' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['graphUrls'].append(
                            '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                            '/graph.php?c=%s&m=%s&r=1day&v=0&z=default' % (m['cluster'], g))

                metric[resource]['moreInfo'] = ''
                if '$host' in rule['resource'] and 'graphUrl' in m:
                    metric[resource]['moreInfo'] = '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s&h=%s' % (m['cluster'], m['host'])
                if '$cluster' in rule['resource'] and 'graphUrl' in m:
                    metric[resource]['moreInfo'] = '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s' % m['cluster']

                # Substitutions for threshold info
                if m['metric'] in ''.join(rule['thresholdInfo']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['thresholdInfo']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    idx = 0
                    for threshold in metric[resource]['thresholdInfo']:
                        metric[resource]['thresholdInfo'][idx] = re.sub('\$%s(\.sum)?' % m['metric'], str(v), threshold)
                        idx += 1

                # Substitutions for text
                if m['metric'] in ''.join(rule['text']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['text']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    if m['type'] == 'timestamp' or m['units'] == 'timestamp':
                        v = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(float(v)))

                    LOG.debug('Metric resource text %s', metric)

                    idx = 0
                    for text in metric[resource]['text']:
                        metric[resource]['text'][idx] = re.sub('\$%s(\.sum)?' % m['metric'], str(v), text)
                        idx += 1

                LOG.debug('end of metric loop')

            for resource in metric:

                LOG.debug('Calculate final value for resource %s', resource)
                index = 0

                try:
                    calculated_value = eval(metric[resource]['value'])
                except KeyError:
                    LOG.warning('Could not calculate %s value for %s because %s is not being reported',
                                rule['event'], resource, rule['value'])
                    continue
                except (SyntaxError, NameError):
                    LOG.error('Could not calculate %s value for %s => eval(%s)',
                              rule['event'], resource, metric[resource]['value'])
                    continue
                except ZeroDivisionError:
                    LOG.debug('Could not calculate %s value for %s => eval(%s) (division by zero). Setting to 0 instead.',
                              rule['event'], resource, metric[resource]['value'])
                    calculated_value = 0
                except Exception:
                    LOG.error('Could not calculate %s value for %s => eval(%s) (threw unknown exception)',
                              rule['event'], resource, metric[resource]['value'])
                    continue

                LOG.debug('Calculated value for resource %s => %s', resource, calculated_value)

                # Compare final value with each threshold
                for ti in metric[resource]['thresholdInfo']:
                    severity, op, threshold = ti.split(':')
                    rule_eval = '%s %s %s' % (GangliaDaemon.quote(calculated_value), op, threshold)
                    try:
                        result = eval(rule_eval)
                    except SyntaxError:
                        LOG.error('Could not evaluate %s threshold for %s => eval(%s)', rule['event'], resource, rule_eval)
                        result = False

                    if result:
                        event = rule['event']
                        group = rule['group']
                        value = "%s%s" % (calculated_value, GangliaDaemon.format_units(metric[resource]['units']))
                        environment = metric[resource]['environment']
                        service = metric[resource]['service']
                        text = metric[resource]['text'][index]
                        tags = metric[resource]['tags']
                        threshold_info = ','.join(rule['thresholdInfo'])
                        more_info = metric[resource]['moreInfo']
                        graph_urls = metric[resource]['graphUrls']

                        gangliaAlert = Alert(
                            resource=resource,
                            event=event,
                            group=group,
                            value=value,
                            severity=severity,
                            environment=environment,
                            service=service,
                            text=text,
                            event_type='gangliaAlert',
                            tags=tags,
                            threshold_info=threshold_info,
                            more_info=more_info,
                            graph_urls=graph_urls,
                            raw_data='',  # TODO(nsatterl): put raw metric values used to do calculation here
                        )

                        if self.dedup.is_send(gangliaAlert):
                            self.mq.send(gangliaAlert)
                        break  # First match wins
                index += 1

    @staticmethod
    def get_metrics(filter):

        url = "http://%s:%s/ganglia/api/v1/metrics?%s" % (CONF.ganglia_host, CONF.ganglia_port, filter)
        LOG.info('Metric request %s', url)

        try:
            r = urllib2.urlopen(url, None, 15)
        except urllib2.URLError as e:
            LOG.error('Could not retrieve metric data from %s - %s', url, e)
            return dict()

        if r.getcode() is None:
            LOG.error('Error during connection or data transfer (timeout=%d)', 15)
            return dict()

        response = json.loads(r.read())['response']
        if response['status'] == 'error':
            LOG.error('No metrics retrieved - %s', response['message'])
            return dict()

        LOG.info('Retrieved %s matching metrics in %ss', response['total'], response['time'])

        return response['metrics']
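# Note: each thresholdInfo entry above is a 'severity:operator:threshold'
# triple, evaluated against the calculated metric value with first match
# winning. A worked sketch of that evaluation (the rule and metric values
# are illustrative):

severity, op, threshold = 'MAJOR:>:90'.split(':')
rule_eval = '%s %s %s' % (95.0, op, threshold)  # -> '95.0 > 90'
result = eval(rule_eval)                        # -> True, so raise a MAJOR alert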
class PingerDaemon(Daemon):

    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=PingerMessage(self.mq))

        self.dedup = DeDup()
        self.carbon = Carbon()  # graphite metrics
        self.statsd = StatsD()  # graphite metrics

        # Initialise ping targets
        ping_list = init_targets()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon, self.statsd)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                for p in ping_list:
                    if 'targets' in p and p['targets']:
                        for target in p['targets']:
                            environment = p['environment']
                            service = p['service']
                            retries = p.get('retries', CONF.ping_max_retries)
                            self.queue.put((environment, service, target, retries))

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.info('Ping queue length is %d', self.queue.qsize())

                time.sleep(CONF.loop_every)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(CONF.server_threads):
            self.queue.put(None)
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
class GangliaDaemon(Daemon): def __init__(self, prog): Daemon.__init__(self, prog) self.dedup = DeDup(by_value=True) def run(self): self.running = True # Connect to message queue self.mq = Messaging() self.mq.connect(callback=GangliaMessage(self.mq)) while not self.shuttingdown: try: rules = init_rules() # re-read rule config each time self.metric_check(rules) LOG.debug('Send heartbeat...') heartbeat = Heartbeat(version=Version) self.mq.send(heartbeat) LOG.debug('Waiting for next check run...') time.sleep(CONF.loop_every) except (KeyboardInterrupt, SystemExit): self.shuttingdown = True LOG.info('Shutdown request received...') self.running = False LOG.info('Disconnecting from message broker...') self.mq.disconnect() def metric_check(self, rules): for rule in rules: # Check rule is valid if len(rule['thresholdInfo']) != len(rule['text']): LOG.warning('Skipping invalid rule %s - MUST define alert text for each threshold.', rule['event']) continue # Get list of metrics required to evaluate each rule params = dict() if 'filter' in rule and rule['filter'] is not None: params[rule['filter']] = 1 for s in (' '.join(rule['text']), ' '.join(rule['thresholdInfo']), rule['value']): matches = re.findall('\$([a-z0-9A-Z_]+)', s) for m in matches: if m != 'now': params['metric=' + m] = 1 metric_filter = '&'.join(params.keys()) LOG.debug('Metric filter = %s', metric_filter) # Get metric data for each rule response = GangliaDaemon.get_metrics(metric_filter) LOG.debug('Ganglia API response: %s', response) # Make non-metric substitutions in value, thresholdInfo and text now = int(time.time()) rule['value'] = re.sub('\$now', str(now), rule['value']) idx = 0 for threshold in rule['thresholdInfo']: rule['thresholdInfo'][idx] = re.sub('\$now', str(now), threshold) idx += 1 idx = 0 for text in rule['text']: rule['text'][idx] = re.sub('\$now', time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(now)), text) idx += 1 metric = dict() for m in response: # Make metric-based substitutions in resource eg. 
per instance, host or cluster
                resource = re.sub(r'\$instance', m.get('instance', '__NA__'), rule['resource'])
                resource = re.sub(r'\$host', m.get('host', '__NA__'), resource)
                resource = re.sub(r'\$cluster', m.get('cluster', '__NA__'), resource)

                if '__NA__' in resource:
                    LOG.debug('Metric %s does not match resource rule %s', m['id'], rule['resource'])
                    continue

                LOG.debug('Metric %s matches rule %s => %s', m['id'], rule['resource'], resource)

                # Don't generate cluster alerts from host-based metrics
                if 'host' in m and '$host' not in rule['resource']:
                    LOG.debug('Skipping host-based metric for cluster-based rule')
                    continue

                # Build up info for alert if metric value triggers threshold
                if resource not in metric:
                    metric[resource] = dict()

                if 'thresholdInfo' not in metric[resource]:
                    metric[resource]['thresholdInfo'] = list(rule['thresholdInfo'])
                    LOG.debug('Set thresholdInfo to %s', metric[resource]['thresholdInfo'])

                if 'text' not in metric[resource]:
                    metric[resource]['text'] = list(rule['text'])
                    LOG.debug('Set text to %s', metric[resource]['text'])

                if m['metric'] in rule['value']:
                    # Determine service and environment from rule if given
                    if 'environment' in rule:
                        metric[resource]['environment'] = [rule['environment']]
                    else:
                        metric[resource]['environment'] = [m['environment']]
                    LOG.debug('Set environment for alert to %s', metric[resource]['environment'])

                    if 'service' in rule:
                        metric[resource]['service'] = [rule['service']]
                    else:
                        metric[resource]['service'] = [m['service']]
                    LOG.debug('Set service for alert to %s', metric[resource]['service'])

                    # Use raw metric value, or sum or average if aggregated metric
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])  # raw value
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])  # aggregated sum value if "<metric>.sum"
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))  # average of aggregate value
                        except ZeroDivisionError:
                            v = 0.0
                    LOG.debug('Value for %s on %s is %s', m['id'], resource, v)

                    # If no value yet, start from the rule's value expression
                    if 'value' not in metric[resource]:
                        metric[resource]['value'] = rule['value']
                    metric[resource]['value'] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v), metric[resource]['value'])
                    metric[resource]['units'] = m['units']

                    # Assign tags
                    metric[resource]['tags'] = list()
                    metric[resource]['tags'].extend(rule['tags'])
                    metric[resource]['tags'].append('cluster:%s' % m['cluster'])
                    if 'tags' in m and m['tags'] is not None:
                        metric[resource]['tags'].extend(m['tags'])

                    # Assign graph URLs
                    if 'graphUrls' not in metric[resource]:
                        metric[resource]['graphUrls'] = list()
                    if 'graphUrl' in m:
                        metric[resource]['graphUrls'].append(m['graphUrl'])

                    for g in rule['graphs']:
                        if '$host' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append('/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                                '/graph.php?c=%s&h=%s&m=%s&r=1day&v=0&z=default' % (m['cluster'], m['host'], g))
                        if '$cluster' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append('/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                                '/graph.php?c=%s&m=%s&r=1day&v=0&z=default' % (m['cluster'], g))

                    metric[resource]['moreInfo'] = ''
                    if '$host' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s&h=%s' % (m['cluster'], m['host'])
                    if '$cluster' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s' % m['cluster']

                # Substitutions for threshold info
                if m['metric'] in ''.join(rule['thresholdInfo']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['thresholdInfo']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0
                    idx = 0
                    for threshold in metric[resource]['thresholdInfo']:
                        metric[resource]['thresholdInfo'][idx] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v), threshold)
                        idx += 1

                # Substitutions for text
                if m['metric'] in ''.join(rule['text']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['text']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0
                    if m['type'] == 'timestamp' or m['units'] == 'timestamp':
                        v = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(float(v)))
                    LOG.debug('Metric resource text %s', metric)
                    idx = 0
                    for text in metric[resource]['text']:
                        metric[resource]['text'][idx] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v), text)
                        idx += 1

                LOG.debug('end of metric loop')

            for resource in metric:
                LOG.debug('Calculate final value for resource %s', resource)
                index = 0
                try:
                    calculated_value = eval(metric[resource]['value'])
                except KeyError:
                    LOG.warning('Could not calculate %s value for %s because %s is not being reported',
                                rule['event'], resource, rule['value'])
                    continue
                except (SyntaxError, NameError):
                    LOG.error('Could not calculate %s value for %s => eval(%s)',
                              rule['event'], resource, metric[resource]['value'])
                    continue
                except ZeroDivisionError:
                    LOG.debug('Could not calculate %s value for %s => eval(%s) (division by zero). Setting to 0 instead.',
                              rule['event'], resource, metric[resource]['value'])
                    calculated_value = 0
                except Exception:
                    LOG.error('Could not calculate %s value for %s => eval(%s) (threw unknown exception)',
                              rule['event'], resource, metric[resource]['value'])
                    continue

                LOG.debug('Calculated value for resource %s => %s', resource, calculated_value)

                # Compare final value with each threshold
                for ti in metric[resource]['thresholdInfo']:
                    severity, op, threshold = ti.split(':')
                    rule_eval = '%s %s %s' % (GangliaDaemon.quote(calculated_value), op, threshold)
                    try:
                        result = eval(rule_eval)
                    except SyntaxError:
                        LOG.error('Could not evaluate %s threshold for %s => eval(%s)', rule['event'], resource, rule_eval)
                        result = False

                    if result:
                        event = rule['event']
                        group = rule['group']
                        value = "%s%s" % (calculated_value, GangliaDaemon.format_units(metric[resource]['units']))
                        environment = metric[resource]['environment']
                        service = metric[resource]['service']
                        text = metric[resource]['text'][index]
                        tags = metric[resource]['tags']
                        threshold_info = ','.join(rule['thresholdInfo'])
                        more_info = metric[resource]['moreInfo']
                        graph_urls = metric[resource]['graphUrls']

                        gangliaAlert = Alert(
                            resource=resource,
                            event=event,
                            group=group,
                            value=value,
                            severity=severity,
                            environment=environment,
                            service=service,
                            text=text,
                            event_type='gangliaAlert',
                            tags=tags,
                            threshold_info=threshold_info,
                            more_info=more_info,
                            graph_urls=graph_urls,
                            raw_data='',  # TODO(nsatterl): put raw metric values used to do calculation here
                        )

                        if self.dedup.is_send(gangliaAlert):
                            self.mq.send(gangliaAlert)
                        break  # First match wins
                    index += 1

    @staticmethod
    def get_metrics(filter):
        url = "http://%s:%s/ganglia/api/v1/metrics?%s" % (CONF.ganglia_host, CONF.ganglia_port, filter)
        LOG.info('Metric request %s', url)

        try:
            r = urllib2.urlopen(url, None, 15)
        except urllib2.URLError, e:
            LOG.error('Could not retrieve metric data from %s - %s', url, e)
            return dict()

        if r.getcode() is None:
            LOG.error('Error during connection or data transfer (timeout=%d)', 15)
            return dict()

        response = json.loads(r.read())['response']
        if response['status'] == 'error':
            LOG.error('No metrics retrieved - %s', response['message'])
            return dict()

        LOG.info('Retrieved %s matching metrics in %ss', response['total'], response['time'])
        return response['metrics']
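# For illustration only: a hypothetical sketch of a single alert rule as
# returned by init_rules(), inferred from the fields the check above
# references (resource, value, thresholdInfo, text, tags, graphs, event,
# group, and optional environment/service). Metric names and thresholds
# here are invented, not taken from the project's rule config.
example_rule = {
    'event': 'FsUtil',
    'group': 'OS',
    'resource': '$host:/var',            # $instance/$host/$cluster are substituted per metric
    'value': '$fs_util',                 # or e.g. '$bytes_in.sum' for an aggregated sum
    'thresholdInfo': [
        'MAJOR:>:95',                    # severity:operator:threshold -- first match wins
        'WARNING:>:85',
    ],
    'text': [                            # indexed in step with thresholdInfo
        'Filesystem utilisation is very high: $fs_util%',
        'Filesystem utilisation is high: $fs_util%',
    ],
    'tags': ['os:linux'],
    'graphs': ['fs_util'],               # metric names used to build Ganglia graph URLs
    'environment': 'PROD',               # optional; defaults to the metric's environment
    'service': 'Platform',               # optional; defaults to the metric's service
}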
import sys

from flask import Flask

from alerta.common import config
from alerta.common import log as logging
from alerta.common.mq import Messaging
from alerta.server.database import Mongo

Version = '2.1.0'

LOG = logging.getLogger(__name__)
CONF = config.CONF

config.parse_args(version=Version)
logging.setup('alerta')

app = Flask(__name__)
app.config.from_object(__name__)

db = Mongo()

mq = Messaging()
mq.connect()

# Imported last so the view modules can import `app` without a circular
# import; importing them registers their routes on the Flask app.
import views
import management.views
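# A minimal way to serve this WSGI app with the Flask development server.
# The `alerta.api` module path below is a guess for illustration; in
# production a WSGI container (e.g. mod_wsgi or gunicorn) would import
# `app` instead:
#
#     from alerta.api import app
#     app.run(host='0.0.0.0', port=8080)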
class IrcbotDaemon(Daemon):

    ircbot_opts = {
        'irc_host': 'localhost',
        'irc_port': 6667,
        'irc_channel': '#alerts',
        'irc_user': '******',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(IrcbotDaemon.ircbot_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # An IRC client may send 1 message every 2 seconds
        # See section 5.8 in http://datatracker.ietf.org/doc/rfc2813/
        tokens = LeakyBucket(tokens=20, rate=2)
        tokens.start()

        # Connect to IRC server
        try:
            irc = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            irc.connect((CONF.irc_host, CONF.irc_port))
            time.sleep(1)
            irc.send('NICK %s\r\n' % CONF.irc_user)
            time.sleep(1)
            irc.send('USER %s 8 * : %s\r\n' % (CONF.irc_user, CONF.irc_user))
            LOG.debug('USER -> %s', irc.recv(4096))
            time.sleep(1)
            irc.send('JOIN %s\r\n' % CONF.irc_channel)
            LOG.debug('JOIN -> %s', irc.recv(4096))
        except Exception, e:
            LOG.error('IRC connection error: %s', e)
            sys.exit(1)

        LOG.info('Joined IRC channel %s on %s as USER %s', CONF.irc_channel, CONF.irc_host, CONF.irc_user)

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=IrcbotMessage(self.mq, irc, tokens))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for IRC messages...')
                ip, op, rdy = select.select([irc], [], [], CONF.loop_every)
                if ip:
                    for i in ip:
                        if i == irc:
                            data = irc.recv(4096).rstrip('\r\n')
                            if len(data) > 0:
                                if 'ERROR' in data:
                                    LOG.error('%s. Exiting...', data)
                                    sys.exit(1)
                                else:
                                    LOG.debug('%s', data)
                            else:
                                LOG.warning('IRC server sent no data')
                            if 'PING' in data:
                                LOG.info('IRC PING received -> PONG ' + data.split()[1])
                                irc.send('PONG ' + data.split()[1] + '\r\n')
                            elif 'ack' in data.lower():
                                LOG.info('Request to ACK %s by %s', data.split()[4], data.split()[0])
                                ack_alert(data.split()[4])
                            elif 'delete' in data.lower():
                                LOG.info('Request to DELETE %s by %s', data.split()[4], data.split()[0])
                                delete_alert(data.split()[4])
                            elif '!alerta quit' in data:
                                irc.send('QUIT\r\n')
                            else:
                                LOG.warning('IRC: %s', data)
                        else:
                            i.recv()
                else:
                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False
        tokens.shutdown()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
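# For illustration only: a minimal sketch of a LeakyBucket-style rate
# limiter consistent with the calls above (constructor, start(),
# shutdown()). The get_token() method and the internals are assumptions,
# not the project's actual LeakyBucket class.
import threading
import time


class LeakyBucketSketch(threading.Thread):

    def __init__(self, tokens=20, rate=2):
        threading.Thread.__init__(self)
        self.daemon = True
        self.capacity = tokens          # maximum burst size
        self.rate = rate                # seconds per replenished token
        self.tokens = tokens
        self.lock = threading.Lock()
        self.shuttingdown = False

    def run(self):
        # Top the bucket up by one token every `rate` seconds
        while not self.shuttingdown:
            time.sleep(self.rate)
            with self.lock:
                if self.tokens < self.capacity:
                    self.tokens += 1

    def get_token(self):
        # Block until a token is available, then consume it
        while True:
            with self.lock:
                if self.tokens > 0:
                    self.tokens -= 1
                    return
            time.sleep(0.1)

    def shutdown(self):
        self.shuttingdown = True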
import sys

from flask import Flask

from alerta.common import config
from alerta.common import log as logging
from alerta.common.mq import Messaging
from alerta.server.database import Mongo

Version = '2.0.1'

LOG = logging.getLogger(__name__)
CONF = config.CONF

config.parse_args(sys.argv[1:], version=Version)
logging.setup('alerta')

app = Flask(__name__)
app.config.from_object(__name__)

db = Mongo()

mq = Messaging()
mq.connect()

import views
import management.views
class AwsDaemon(Daemon):

    aws_opts = {
        'fog_file': '/etc/fog/alerta.conf',
        'ec2_regions': ['eu-west-1', 'us-east-1'],
        'http_proxy': None,
        'https_proxy': None,
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(AwsDaemon.aws_opts)

        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last = {}
        self.lookup = {}
        self.dedup = DeDup()

    def run(self):

        self.running = True

        # Read in FOG config file
        try:
            self.fog = yaml.load(open(CONF.fog_file).read())
        except IOError, e:
            LOG.error('Could not read AWS credentials file %s: %s', CONF.fog_file, e)
            sys.exit(1)

        if not self.fog:
            LOG.error('No AWS credentials found in FOG file %s. Exiting...', CONF.fog_file)
            sys.exit(1)

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=AwsMessage(self.mq))

        if CONF.http_proxy:
            os.environ['http_proxy'] = CONF.http_proxy
        if CONF.https_proxy:
            os.environ['https_proxy'] = CONF.https_proxy

        while not self.shuttingdown:
            try:
                self.ec2_status_check()

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
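# For illustration only: the fog_file above is expected to hold AWS
# credentials as YAML. The layout below is an assumption based on the
# Ruby Fog library's ~/.fog convention; the exact key names are not
# confirmed anywhere in this code, and the values are placeholders:
#
#     default:
#       aws_access_key_id: AKIA................
#       aws_secret_access_key: ........................................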