Example #1
    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=LoggerMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_queue)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for log messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #2
    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=GangliaMessage(self.mq))

        while not self.shuttingdown:
            try:
                rules = init_rules()  # re-read rule config each time
                self.metric_check(rules)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #3
    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=UrlmonMessage(self.mq))

        self.dedup = DeDup()

        self.carbon = Carbon()  # graphite metrics

        # Initialise URL targets
        urls = init_urls()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())
Example #4
class AlertaDaemon(Daemon):

    alerta_opts = {
        'forward_duplicate': 'no',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(AlertaDaemon.alerta_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        self.queue = Queue.Queue()  # Create internal queue
        self.db = Mongo()       # mongo database
        self.carbon = Carbon()  # carbon metrics
        self.statsd = StatsD()  # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=ServerMessage(self.mq, self.queue, self.statsd))
        self.mq.subscribe()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.statsd)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version, timeout=CONF.loop_every)
                self.mq.send(heartbeat)

                time.sleep(CONF.loop_every)
                LOG.info('Alert processing queue length is %d', self.queue.qsize())
                self.carbon.metric_send('alerta.alerts.queueLength', self.queue.qsize())
                self.db.update_queue_metric(self.queue.qsize())

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(CONF.server_threads):
            self.queue.put(None)
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
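
These examples shut worker threads down by enqueuing one None per thread as a poison pill; note that the trailing w.join() only waits on the last-started worker, since w still refers to it after the loop. The WorkerThread internals are never shown in these examples, so the following is only a minimal sketch of the consuming side, under that poison-pill assumption and in the same Python 2 style:

import threading


class WorkerThread(threading.Thread):
    """Hypothetical sketch -- the real WorkerThread is not shown here."""

    def __init__(self, mq, queue, statsd=None):
        threading.Thread.__init__(self)
        self.mq = mq
        self.queue = queue
        self.statsd = statsd

    def run(self):
        while True:
            item = self.queue.get()
            if item is None:  # poison pill from the shutdown loop
                break
            # ... process the queued alert here, e.g. store and forward it ...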
Example #5
    def run(self):

        self.running = True

        self.queue = Queue.Queue()  # Create internal queue
        self.db = Mongo()  # mongo database
        self.carbon = Carbon()  # carbon metrics
        self.statsd = StatsD()  # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(
            callback=ServerMessage(self.mq, self.queue, self.statsd))
        self.mq.subscribe()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.statsd)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())
Example #6
    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=DynectMessage(self.mq))

        while not self.shuttingdown:
            try:
                self.queryDynect()

                if self.updating:
                    self.alertDynect()
                    self.last_info = self.info

                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        self.running = False
Example #7
    def run(self):

        self.running = True

        # Start token bucket thread
        self.tokens = LeakyBucket(tokens=20, rate=30)
        self.tokens.start()

        self.onhold = dict()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(
            callback=MailerMessage(self.mq, self.onhold, self.tokens))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                LOG.debug('Send email messages...')
                for alertid in self.onhold.keys():
                    try:
                        (mailAlert, hold_time) = self.onhold[alertid]
                    except KeyError:
                        continue

                    if time.time() > hold_time:
                        if not self.tokens.get_token():
                            LOG.warning(
                                '%s : No tokens left, rate limiting this alert',
                                alertid)
                            continue

                        email = Mailer(mailAlert)
                        mail_to = CONF.mail_list.split(',')

                        if 'mailto' in mailAlert.tags:
                            mail_to.append(mailAlert.tags['mailto'])
                        email.send(mail_to=mail_to)
                        try:
                            del self.onhold[alertid]
                        except KeyError:
                            continue

                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False
        self.tokens.shutdown()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
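
LeakyBucket(tokens=20, rate=30) rate-limits outgoing mail, but its implementation is not part of these examples. Below is a minimal token-bucket sketch exposing the same interface (start/get_token/shutdown); treating rate as the top-up interval in seconds is an assumption:

import threading
import time


class LeakyBucket(threading.Thread):
    """Hypothetical sketch -- the real LeakyBucket is not shown here."""

    def __init__(self, tokens=20, rate=30):
        threading.Thread.__init__(self)
        self.daemon = True
        self.capacity = tokens
        self.tokens = tokens
        self.rate = rate  # assumed: seconds between token top-ups
        self.lock = threading.Lock()
        self.shuttingdown = False

    def run(self):
        while not self.shuttingdown:
            time.sleep(self.rate)
            with self.lock:
                self.tokens = min(self.capacity, self.tokens + 1)

    def get_token(self):
        with self.lock:
            if self.tokens > 0:
                self.tokens -= 1
                return True
            return False

    def shutdown(self):
        self.shuttingdown = True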
Example #8
class UrlmonDaemon(Daemon):

    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=UrlmonMessage(self.mq))

        self.dedup = DeDup()

        self.statsd = StatsD()  # graphite metrics

        # Initialise URL targets
        urls = init_urls()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.statsd)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                for url in urls:
                    self.queue.put(url)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.info('URL check queue length is %d', self.queue.qsize())

                time.sleep(CONF.loop_every)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(CONF.server_threads):
            self.queue.put(None)
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #9
class PagerDutyDaemon(Daemon):

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=PagerDutyMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_topic)   # TODO(nsatterl): use dedicated queue?

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for PagerDuty messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #10
class LoggerDaemon(Daemon):
    """
    Index alerts in ElasticSearch using Logstash format so that the Logstash GUI and/or Kibana can be used as front-ends.
    """
    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=LoggerMessage())
        self.mq.subscribe(destination=CONF.outbound_queue)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for log messages...')
                time.sleep(30)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat()
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #11
class MailerDaemon(Daemon):

    def run(self):

        self.running = True

        # Start token bucket thread
        self.tokens = LeakyBucket(tokens=20, rate=30)
        self.tokens.start()

        self.onhold = dict()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=MailerMessage(self.mq, self.onhold, self.tokens))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                LOG.debug('Send email messages...')
                for alertid in self.onhold.keys():
                    try:
                        (mailAlert, hold_time) = self.onhold[alertid]
                    except KeyError:
                        continue

                    if time.time() > hold_time:
                        if not self.tokens.get_token():
                            LOG.warning('%s : No tokens left, rate limiting this alert', alertid)
                            continue

                        email = Mailer(mailAlert)
                        mail_to = CONF.mail_list.split(',')

                        for tag in mailAlert.tags:
                            if tag.startswith('email'):
                                mail_to.append(tag.split(':')[1])
                        email.send(mail_to=mail_to)
                        try:
                            del self.onhold[alertid]
                        except KeyError:
                            continue

                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False
        self.tokens.shutdown()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #12
    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=LoggerMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_queue)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for log messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #13
    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=DynectMessage(self.mq))

        while not self.shuttingdown:
            try:
                self.queryDynect()

                if self.updating:
                    self.alertDynect()
                    self.last_info = self.info

                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        self.running = False
Example #14
    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=UrlmonMessage(self.mq))

        self.dedup = DeDup()

        self.carbon = Carbon()  # graphite metrics

        # Initialise URL targets
        urls = init_urls()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())
Example #15
    def run(self):
        
        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=GangliaMessage(self.mq))

        while not self.shuttingdown:
            try:
                rules = init_rules()  # re-read rule config each time
                self.metric_check(rules)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #16
class AlertaDaemon(Daemon):

    def run(self):

        self.running = True

        self.queue = Queue.Queue()  # Create internal queue
        self.statsd = StatsD()  # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=ServerMessage(self.mq, self.queue, self.statsd))
        self.mq.subscribe()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.statsd)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version, timeout=CONF.loop_every)
                self.mq.send(heartbeat)

                LOG.debug('Internal queue size is %s messages', self.queue.qsize())

                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(CONF.server_threads):
            self.queue.put(None)
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #17
    def run(self):

        self.running = True

        # Initialise alert config
        init_config()
        config_mod_time = os.path.getmtime(CONF.yaml_config)  # so the loop can detect changes

        # Start token bucket thread
        _TokenThread = TokenTopUp()
        _TokenThread.start()

        # Start notify thread
        _NotifyThread = ReleaseThread()
        _NotifyThread.start()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=NotifyMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                # Read (or re-read) config as necessary
                if os.path.getmtime(CONF.yaml_config) != config_mod_time:
                    init_config()
                    config_mod_time = os.path.getmtime(CONF.yaml_config)

                LOG.debug('Waiting for email messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        _TokenThread.shutdown()
        _NotifyThread.shutdown()

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #18
class CloudWatchDaemon(Daemon):

    cloudwatch_opts = {
        'cloudwatch_sqs_region': 'eu-west-1',
        'cloudwatch_sqs_queue': 'cloudwatch-to-alerta',
        'cloudwatch_access_key': '022QF06E7MXBSAMPLE',
        'cloudwatch_secret_key': ''
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(CloudWatchDaemon.cloudwatch_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        self.statsd = StatsD()  # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=CloudWatchMessage(self.mq))

        self.dedup = DeDup(by_value=True)

        LOG.info('Connecting to SQS queue %s', CONF.cloudwatch_sqs_queue)
        try:
            sqs = boto.sqs.connect_to_region(
                CONF.cloudwatch_sqs_region,
                aws_access_key_id=CONF.cloudwatch_access_key,
                aws_secret_access_key=CONF.cloudwatch_secret_key
            )
        except boto.exception.SQSError, e:
            LOG.error('SQS API call failed: %s', e)
            sys.exit(1)

        try:
            q = sqs.create_queue(CONF.cloudwatch_sqs_queue)
            q.set_message_class(RawMessage)
        except boto.exception.SQSError, e:
            LOG.error('SQS queue error: %s', e)
            sys.exit(1)
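
        # The example is truncated at this point. What follows is a hedged
        # sketch of how the polling loop might continue, using only classic
        # boto 2.x SQS calls (get_messages, get_body, delete_message); how a
        # message body becomes an alert is an assumption -- the real handler
        # is not shown.
        while not self.shuttingdown:
            try:
                for m in q.get_messages(10):
                    body = m.get_body()
                    LOG.debug('Raw SQS message: %s', body)
                    # ... parse body into an Alert, dedup and mq.send() it ...
                    q.delete_message(m)  # delete only after handling

                LOG.debug('Send heartbeat...')
                self.mq.send(Heartbeat(version=Version))

                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True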
Example #19
    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect()

        # Initialise URL targets
        init_urls()
        url_mod_time = os.path.getmtime(URLFILE)

        # Start worker threads
        for i in range(NUM_THREADS):
            w = WorkerThread(queue)
            w.start()
            LOG.info('Starting thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                # Read (or re-read) urls as necessary
                if os.path.getmtime(URLFILE) != url_mod_time:
                    init_urls()
                    url_mod_time = os.path.getmtime(URLFILE)

                for url in urls:
                    queue.put(('url', url))
                queue.put(('timestamp', time.time()))

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat()
                self.mq.send(heartbeat)

                time.sleep(_check_rate)

                urlmon_qsize = queue.qsize()
                LOG.info('URL check queue length is %d', urlmon_qsize)

                if GMETRIC_SEND:
                    gmetric_cmd = "%s --name urlmon_qsize --value %d --type uint16 --units \" \" --slope both --group urlmon %s" % (
                        GMETRIC_CMD, urlmon_qsize, GMETRIC_OPTIONS)
                    LOG.debug("%s", gmetric_cmd)
                    os.system("%s" % gmetric_cmd)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(NUM_THREADS):
            queue.put(('stop', None))
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #20
class CloudWatchDaemon(Daemon):

    cloudwatch_opts = {
        'cloudwatch_sqs_region': 'eu-west-1',
        'cloudwatch_sqs_queue': 'cloudwatch-to-alerta',
        'cloudwatch_access_key': '022QF06E7MXBSAMPLE',
        'cloudwatch_secret_key': ''
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(CloudWatchDaemon.cloudwatch_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        self.statsd = StatsD()  # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=CloudWatchMessage(self.mq))

        self.dedup = DeDup(by_value=True)

        LOG.info('Connecting to SQS queue %s', CONF.cloudwatch_sqs_queue)
        try:
            sqs = boto.sqs.connect_to_region(
                CONF.cloudwatch_sqs_region,
                aws_access_key_id=CONF.cloudwatch_access_key,
                aws_secret_access_key=CONF.cloudwatch_secret_key)
        except boto.exception.SQSError, e:
            LOG.error('SQS API call failed: %s', e)
            sys.exit(1)

        try:
            q = sqs.create_queue(CONF.cloudwatch_sqs_queue)
            q.set_message_class(RawMessage)
        except boto.exception.SQSError, e:
            LOG.error('SQS queue error: %s', e)
            sys.exit(1)
Example #21
    def run(self):

        self.running = True

        self.statsd = StatsD()  # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=CloudWatchMessage(self.mq))

        self.dedup = DeDup(by_value=True)

        LOG.info('Connecting to SQS queue %s', CONF.cloudwatch_sqs_queue)
        try:
            sqs = boto.sqs.connect_to_region(
                CONF.cloudwatch_sqs_region,
                aws_access_key_id=CONF.cloudwatch_access_key,
                aws_secret_access_key=CONF.cloudwatch_secret_key)
        except boto.exception.SQSError, e:
            LOG.error('SQS API call failed: %s', e)
            sys.exit(1)
Example #22
    def run(self):

        data = sys.stdin.read()
        LOG.info('snmptrapd -> %s', data)

        snmptrapAlert = self.parse_snmptrap(data)

        mq = Messaging()
        mq.connect()
        mq.send(snmptrapAlert)
        mq.disconnect()
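
Example #22 reads the raw trap text from stdin, which is how net-snmp's snmptrapd hands traps to an external program. Wiring it up takes a single traphandle directive in snmptrapd.conf; the handler path below is a placeholder for wherever the script is actually installed:

# /etc/snmp/snmptrapd.conf
traphandle default /usr/local/bin/alert-snmptrap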
Example #23
class NotifyDaemon(Daemon):

    def run(self):

        self.running = True

        # Initialise alert config
        init_config()
        config_mod_time = os.path.getmtime(CONF.yaml_config)  # so the loop can detect changes

        # Start token bucket thread
        _TokenThread = TokenTopUp()
        _TokenThread.start()

        # Start notify thread
        _NotifyThread = ReleaseThread()
        _NotifyThread.start()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=NotifyMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                # Read (or re-read) config as necessary
                if os.path.getmtime(CONF.yaml_config) != config_mod_time:
                    init_config()
                    config_mod_time = os.path.getmtime(CONF.yaml_config)

                LOG.debug('Waiting for email messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        _TokenThread.shutdown()
        _NotifyThread.shutdown()

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #24
class AlertaDaemon(Daemon):

    def run(self):
        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=ServerMessage(self.queue))
        self.mq.subscribe()

        # Start worker threads
        LOG.debug('Starting %s alert handler threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started alert handler thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                time.sleep(0.1)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True
                for i in range(CONF.server_threads):
                    self.queue.put(None)

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #25
class LoggerDaemon(Daemon):
    """
    Index alerts in ElasticSearch using Logstash format so that the Logstash GUI and/or Kibana can be used as front-ends.
    """

    logger_opts = {
        'es_host': 'localhost',
        'es_port': 9200,
        'es_index': 'alerta-%Y.%m.%d',  # NB. Kibana config must match this index
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(LoggerDaemon.logger_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=LoggerMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_queue)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for log messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #26
class LoggerDaemon(Daemon):
    """
    Index alerts in ElasticSearch using Logstash format so that the Logstash GUI and/or Kibana can be used as front-ends.
    """

    logger_opts = {
        'es_host': 'localhost',
        'es_port': 9200,
        'es_index': 'alerta-%Y.%m.%d',  # NB. Kibana config must match this index
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(LoggerDaemon.logger_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=LoggerMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_queue)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for log messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
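
The es_index option embeds strftime codes, so alerts are presumably indexed into a per-day index (which is why the comment insists the Kibana config must match). Assuming the daemon expands it with time.strftime, the name works out like this:

import time
print time.strftime('alerta-%Y.%m.%d')  # e.g. alerta-2013.07.29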
Example #27
class PagerDutyDaemon(Daemon):

    pagerduty_opts = {
        'pagerduty_endpoint': 'https://events.pagerduty.com/generic/2010-04-15/create_event.json',
        'pagerduty_api_key': '',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(PagerDutyDaemon.pagerduty_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=PagerDutyMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_topic)   # TODO(nsatterl): use dedicated queue?

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for PagerDuty messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #28
    def run(self):
        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=ServerMessage(self.queue))
        self.mq.subscribe()

        # Start worker threads
        LOG.debug('Starting %s alert handler threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started alert handler thread: %s', w.getName())
Example #29
    def run(self):

        self.running = True

        self.statsd = StatsD()  # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=CloudWatchMessage(self.mq))

        self.dedup = DeDup(by_value=True)

        LOG.info('Connecting to SQS queue %s', CONF.cloudwatch_sqs_queue)
        try:
            sqs = boto.sqs.connect_to_region(
                CONF.cloudwatch_sqs_region,
                aws_access_key_id=CONF.cloudwatch_access_key,
                aws_secret_access_key=CONF.cloudwatch_secret_key
            )
        except boto.exception.SQSError, e:
            LOG.error('SQS API call failed: %s', e)
            sys.exit(1)
Example #30
    def run(self):

        self.running = True

        self.queue = Queue.Queue()  # Create internal queue
        self.db = Mongo()       # mongo database
        self.carbon = Carbon()  # carbon metrics
        self.statsd = StatsD()  # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=ServerMessage(self.mq, self.queue, self.statsd))
        self.mq.subscribe()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.statsd)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())
Example #31
    def main(self):

        if CONF.heartbeat:
            msg = Heartbeat(
                origin=CONF.origin,
                version=__version__,
            )
        else:
            msg = Alert(
                resource=CONF.resource,
                event=CONF.event,
                correlate=CONF.correlate,
                group=CONF.group,
                value=CONF.value,
                severity=CONF.severity,
                environment=CONF.environment,
                service=CONF.service,
                text=CONF.text,
                event_type='exceptionAlert',  # TODO(nsatterl): make this configurable?
                tags=CONF.tags,
                origin=CONF.origin,
                threshold_info='n/a',  # TODO(nsatterl): make this configurable?
                timeout=CONF.timeout,
            )

        if CONF.dry_run:
            print msg
        else:
            LOG.debug('Message => %s', repr(msg))

            mq = Messaging()
            mq.connect()
            mq.send(msg)
            mq.disconnect()

        return msg.get_id()
Example #32
class DynectDaemon(Daemon):

    dynect_opts = {
        'dynect_customer': '',
        'dynect_username': '',
        'dynect_password': '',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(DynectDaemon.dynect_opts)

        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last_info = {}
        self.updating = False
        self.dedup = DeDup(threshold=10)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=DynectMessage(self.mq))

        while not self.shuttingdown:
            try:
                self.queryDynect()

                if self.updating:
                    self.alertDynect()
                    self.last_info = self.info

                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        self.running = False

    def alertDynect(self):

        for resource in self.info:

            if resource not in self.last_info:
                continue

            if resource.startswith('gslb-'):

                # gslb status       = ok | unk | trouble | failover

                text = 'GSLB status is %s.' % self.info[resource]['status']

                if self.info[resource]['status'] == 'ok':
                    event = 'GslbOK'
                    severity = severity_code.NORMAL
                else:
                    event = 'GslbNotOK'
                    severity = severity_code.CRITICAL
                correlate = ['GslbOK', 'GslbNotOK']

            elif resource.startswith('pool-'):

                # pool status       = up | unk | down
                # pool serve_mode   = obey | always | remove | no
                # pool weight       (1-15)

                if 'down' in self.info[resource]['status']:
                    event = 'PoolDown'
                    severity = severity_code.MAJOR
                    text = 'Pool is down'
                elif 'obey' not in self.info[resource]['status']:
                    event = 'PoolServe'
                    severity = severity_code.MAJOR
                    text = 'Pool with an incorrect serve mode'
                elif self.check_weight(self.info[resource]['gslb'],
                                       resource) is False:
                    event = 'PoolWeightError'
                    severity = severity_code.MINOR
                    text = 'Pool with an incorrect weight'
                else:
                    event = 'PoolUp'
                    severity = severity_code.NORMAL
                    text = 'Pool status is normal'
                correlate = [
                    'PoolUp', 'PoolDown', 'PoolServe', 'PoolWeightError'
                ]

            else:
                LOG.warning('Unknown resource type: %s', resource)
                continue

            # Defaults
            group = 'GSLB'
            value = self.info[resource]['status']
            environment = ['PROD']
            service = ['Network']
            tags = dict()
            timeout = None
            threshold_info = None
            summary = None
            raw_data = self.info[resource]['rawData']

            dynectAlert = Alert(
                resource=resource,
                event=event,
                correlate=correlate,
                group=group,
                value=value,
                severity=severity,
                environment=environment,
                service=service,
                text=text,
                event_type='serviceAlert',
                tags=tags,
                timeout=timeout,
                threshold_info=threshold_info,
                summary=summary,
                raw_data=raw_data,
            )

            suppress = dynectAlert.transform_alert()
            if suppress:
                LOG.info('Suppressing %s alert', dynectAlert.event)
                LOG.debug('%s', dynectAlert)
                continue

            if self.dedup.is_send(dynectAlert):
                self.mq.send(dynectAlert)

    def check_weight(self, parent, resource):

        weight = self.info[resource]['status'].split(':')[2]
        for pool in [
                resource for resource in self.info
                if resource.startswith('pool')
                and self.info[resource]['gslb'] == parent
        ]:
            if self.info[pool]['status'].split(':')[1] == 'no':
                LOG.warning('Skipping %s because not serving for pool %s',
                            pool, self.info[pool]['status'])
                continue

            LOG.debug('pool %s weight %s <=> %s', pool,
                      self.info[pool]['status'].split(':')[2], weight)
            if self.info[pool]['status'].split(':')[2] != weight:
                return False
        return True

    def queryDynect(self):

        LOG.info('Query DynECT to get the state of GSLBs')
        try:
            rest_iface = DynectRest()
            if CONF.debug and CONF.use_stderr:
                rest_iface.verbose = True

            # login
            credentials = {
                'customer_name': CONF.dynect_customer,
                'user_name': CONF.dynect_username,
                'password': CONF.dynect_password,
            }
            LOG.debug('credentials = %s', credentials)
            response = rest_iface.execute('/Session/', 'POST', credentials)

            if response['status'] != 'success':
                LOG.error('Failed to create API session: %s',
                          response['msgs'][0]['INFO'])
                self.updating = False
                return

            # Discover all the Zones in DynECT
            response = rest_iface.execute('/Zone/', 'GET')
            LOG.debug('/Zone/ => %s', json.dumps(response, indent=4))
            zone_resources = response['data']

            # Discover all the LoadBalancers
            for resource in zone_resources:
                zone = resource.split('/')[3]  # eg. /REST/Zone/guardiannews.com/
                response = rest_iface.execute('/LoadBalance/' + zone + '/',
                                              'GET')
                LOG.debug('/LoadBalance/%s/ => %s', zone,
                          json.dumps(response, indent=4))
                gslb = response['data']

                # Discover LoadBalancer pool information.
                for lb in gslb:
                    fqdn = lb.split('/')[4]  # eg. /REST/LoadBalance/guardiannews.com/id.guardiannews.com/
                    response = rest_iface.execute(
                        '/LoadBalance/' + zone + '/' + fqdn + '/', 'GET')
                    LOG.debug('/LoadBalance/%s/%s/ => %s', zone, fqdn,
                              json.dumps(response, indent=4))
                    status = response['data']['status']
                    monitor = response['data']['monitor']
                    self.info['gslb-' + fqdn] = {
                        'status': status,
                        'gslb': fqdn,
                        'rawData': monitor
                    }

                    for pool in response['data']['pool']:
                        name = '%s-%s' % (fqdn, pool['label'].replace(' ', '-'))
                        status = '%s:%s:%s' % (
                            pool['status'], pool['serve_mode'], pool['weight'])
                        self.info['pool-' + name] = {
                            'status': status,
                            'gslb': fqdn,
                            'rawData': pool
                        }

            LOG.info('Finished object discovery query.')
            LOG.debug('GSLBs and Pools: %s', json.dumps(self.info, indent=4))

            # logout
            rest_iface.execute('/Session/', 'DELETE')

        except Exception, e:
            LOG.error('Failed to discover GSLBs: %s', e)
            self.updating = False
            return  # don't fall through and mark the data as fresh

        self.updating = True
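
DeDup appears throughout these examples (DeDup(), DeDup(threshold=10), DeDup(by_value=True)) but is never defined in them. The sketch below is only a guess at its semantics: suppress consecutive repeats of the same alert, letting one through every threshold repeats, with by_value=True adding the alert value to the dedup key:

class DeDup(object):
    """Hypothetical sketch -- the real DeDup is not shown here."""

    def __init__(self, threshold=10, by_value=False):
        self.threshold = threshold
        self.by_value = by_value
        self.count = {}  # dedup key -> consecutive occurrence count

    def is_send(self, alert):
        # Assumed attribute names, matching the kwargs Alert() is built with
        key = (alert.resource, alert.event)
        if self.by_value:
            key += (alert.value,)
        n = self.count.get(key, 0)
        self.count[key] = n + 1
        return n % self.threshold == 0  # send the first and every threshold-th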
Example #33
class DynectDaemon(Daemon):

    dynect_opts = {
        'dynect_customer': '',
        'dynect_username': '',
        'dynect_password': '',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(DynectDaemon.dynect_opts)

        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last_info = {}
        self.updating = False
        self.dedup = DeDup(threshold=10)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=DynectMessage(self.mq))

        while not self.shuttingdown:
            try:
                self.queryDynect()

                if self.updating:
                    self.alertDynect()
                    self.last_info = self.info

                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        self.running = False

    def alertDynect(self):

        for resource in self.info:

            if resource not in self.last_info:
                continue

            if resource.startswith('gslb-'):

                # gslb status       = ok | unk | trouble | failover

                text = 'GSLB status is %s.' % self.info[resource]['status']

                if self.info[resource]['status'] == 'ok':
                    event = 'GslbOK'
                    severity = severity_code.NORMAL
                else:
                    event = 'GslbNotOK'
                    severity = severity_code.CRITICAL
                correlate = ['GslbOK', 'GslbNotOK']

            elif resource.startswith('pool-'):

                # pool status       = up | unk | down
                # pool serve_mode   = obey | always | remove | no
                # pool weight       (1-15)

                if 'down' in self.info[resource]['status']:
                    event = 'PoolDown'
                    severity = severity_code.MAJOR
                    text = 'Pool is down'
                elif 'obey' not in self.info[resource]['status']:
                    event = 'PoolServe'
                    severity = severity_code.MAJOR
                    text = 'Pool with an incorrect serve mode'
                elif self.check_weight(self.info[resource]['gslb'], resource) is False:
                    event = 'PoolWeightError'
                    severity = severity_code.MINOR
                    text = 'Pool with an incorrect weight'
                else:
                    event = 'PoolUp'
                    severity = severity_code.NORMAL
                    text = 'Pool status is normal'
                correlate = ['PoolUp', 'PoolDown', 'PoolServe', 'PoolWeightError']

            else:
                LOG.warning('Unknown resource type: %s', resource)
                continue

            # Defaults
            group = 'GSLB'
            value = self.info[resource]['status']
            environment = ['PROD']
            service = ['Network']
            tags = list()
            timeout = None
            threshold_info = None
            summary = None
            raw_data = self.info[resource]['rawData']

            dynectAlert = Alert(
                resource=resource,
                event=event,
                correlate=correlate,
                group=group,
                value=value,
                severity=severity,
                environment=environment,
                service=service,
                text=text,
                event_type='serviceAlert',
                tags=tags,
                timeout=timeout,
                threshold_info=threshold_info,
                summary=summary,
                raw_data=raw_data,
            )

            suppress = dynectAlert.transform_alert()
            if suppress:
                LOG.info('Suppressing %s alert', dynectAlert.event)
                LOG.debug('%s', dynectAlert)
                continue

            if self.dedup.is_send(dynectAlert):
                self.mq.send(dynectAlert)

    def check_weight(self, parent, resource):
        
        weight = self.info[resource]['status'].split(':')[2]
        for pool in [resource for resource in self.info if resource.startswith('pool') and self.info[resource]['gslb'] == parent]:
            if self.info[pool]['status'].split(':')[1] == 'no':
                LOG.warning('Skipping %s because not serving for pool %s', pool, self.info[pool]['status'])
                continue

            LOG.debug('pool %s weight %s <=> %s', pool, self.info[pool]['status'].split(':')[2], weight)
            if self.info[pool]['status'].split(':')[2] != weight:
                return False
        return True

    def queryDynect(self):

        LOG.info('Query DynECT to get the state of GSLBs')
        try:
            rest_iface = DynectRest()
            if CONF.debug and CONF.use_stderr:
                rest_iface.verbose = True

            # login
            credentials = {
                'customer_name': CONF.dynect_customer,
                'user_name': CONF.dynect_username,
                'password': CONF.dynect_password,
            }
            LOG.debug('credentials = %s', credentials)
            response = rest_iface.execute('/Session/', 'POST', credentials)

            if response['status'] != 'success':
                LOG.error('Failed to create API session: %s', response['msgs'][0]['INFO'])
                self.updating = False
                return

            # Discover all the Zones in DynECT
            response = rest_iface.execute('/Zone/', 'GET')
            LOG.debug('/Zone/ => %s', json.dumps(response, indent=4))
            zone_resources = response['data']

            # Discover all the LoadBalancers
            for resource in zone_resources:
                zone = resource.split('/')[3]  # eg. /REST/Zone/guardiannews.com/
                response = rest_iface.execute('/LoadBalance/' + zone + '/', 'GET')
                LOG.debug('/LoadBalance/%s/ => %s', zone, json.dumps(response, indent=4))
                gslb = response['data']

                # Discover LoadBalancer pool information.
                for lb in gslb:
                    fqdn = lb.split('/')[4]  # eg. /REST/LoadBalance/guardiannews.com/id.guardiannews.com/
                    response = rest_iface.execute('/LoadBalance/' + zone + '/' + fqdn + '/', 'GET')
                    LOG.debug('/LoadBalance/%s/%s/ => %s', zone, fqdn, json.dumps(response, indent=4))
                    status = response['data']['status']
                    monitor = response['data']['monitor']
                    self.info['gslb-' + fqdn] = {'status': status, 'gslb': fqdn, 'rawData': monitor}

                    for pool in response['data']['pool']:
                        name = '%s-%s' % (fqdn, pool['label'].replace(' ', '-'))
                        status = '%s:%s:%s' % (pool['status'], pool['serve_mode'], pool['weight'])
                        self.info['pool-' + name] = {'status': status, 'gslb': fqdn, 'rawData': pool}

            LOG.info('Finished object discovery query.')
            LOG.debug('GSLBs and Pools: %s', json.dumps(self.info, indent=4))

            # logout
            rest_iface.execute('/Session/', 'DELETE')

        except Exception, e:
            LOG.error('Failed to discover GSLBs: %s', e)
            self.updating = False
            return  # don't fall through and mark the data as fresh

        self.updating = True
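
check_weight() leans on the packed status string that queryDynect builds for each pool, '<status>:<serve_mode>:<weight>'. A short worked example of the indexing it uses:

status = 'up:obey:10'                   # as built by queryDynect
assert status.split(':')[1] == 'obey'   # serve mode; 'no' pools are skipped
assert status.split(':')[2] == '10'     # weight, compared across sibling pools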
Example #34
class SolarWindsDaemon(Daemon):

    solarwinds_opts = {
        'solarwinds_host': 'localhost',
        'solarwinds_username': '******',
        'solarwinds_password': '',
        'solarwinds_group': 'websys',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(SolarWindsDaemon.solarwinds_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        while True:
            try:
                swis = SwisClient(username=CONF.solarwinds_username,
                                  password=CONF.solarwinds_password)
            except Exception, e:
                LOG.error('SolarWinds SWIS Client error: %s', e)
                time.sleep(30)
            else:
                break
        LOG.info('Polling for SolarWinds events on %s', CONF.solarwinds_host)

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=SolarWindsMessage(self.mq))

        self.dedup = DeDup(by_value=True)

        while not self.shuttingdown:
            try:
                LOG.debug('Polling SolarWinds...')
                send_heartbeat = True

                # network, interface and volume events
                try:
                    events = swis.get_npm_events()
                except IOError:
                    events = []
                    send_heartbeat = False

                solarwindsAlerts = self.parse_events(events)
                for solarwindsAlert in solarwindsAlerts:
                    if self.dedup.is_send(solarwindsAlert):
                        self.mq.send(solarwindsAlert)

                # Cisco UCS events
                try:
                    events = swis.get_ucs_events()
                except IOError:
                    events = []
                    send_heartbeat = False

                solarwindsAlerts = self.parse_events(events)
                for solarwindsAlert in solarwindsAlerts:
                    if self.dedup.is_send(solarwindsAlert):
                        self.mq.send(solarwindsAlert)

                if send_heartbeat:
                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)
                else:
                    LOG.error('SolarWinds failure. Skipping heartbeat.')

                time.sleep(CONF.loop_every)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #35
class AlertaDaemon(Daemon):

    alerta_opts = {
        'forward_duplicate': 'no',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(AlertaDaemon.alerta_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        self.queue = Queue.Queue()  # Create internal queue
        self.db = Mongo()  # mongo database
        self.carbon = Carbon()  # carbon metrics
        self.statsd = StatsD()  # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(
            callback=ServerMessage(self.mq, self.queue, self.statsd))
        self.mq.subscribe()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.statsd)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version, timeout=CONF.loop_every)
                self.mq.send(heartbeat)

                time.sleep(CONF.loop_every)
                LOG.info('Alert processing queue length is %d',
                         self.queue.qsize())
                self.carbon.metric_send('alerta.alerts.queueLength',
                                        self.queue.qsize())
                self.db.update_queue_metric(self.queue.qsize())

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(CONF.server_threads):
            self.queue.put(None)
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #36
class PingerDaemon(Daemon):

    pinger_opts = {
        "ping_file": "/etc/alerta/alert-pinger.targets",
        "ping_max_timeout": 15,  # seconds
        "ping_max_retries": 2,
        "ping_slow_warning": 5,  # ms
        "ping_slow_critical": 10,  # ms
        "server_threads": 20,
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(PingerDaemon.pinger_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=PingerMessage(self.mq))

        self.dedup = DeDup()

        self.carbon = Carbon()  # graphite metrics

        # Initialise ping targets
        ping_list = init_targets()

        # Start worker threads, keeping the ones that started so that the
        # shutdown path below can join all of them
        LOG.debug("Starting %s worker threads...", CONF.server_threads)
        self.workers = []
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon)
            try:
                w.start()
            except Exception as e:
                LOG.error("Worker thread #%s did not start: %s", i, e)
                continue
            self.workers.append(w)
            LOG.info("Started worker thread: %s", w.getName())

        while not self.shuttingdown:
            try:
                for p in ping_list:
                    if "targets" in p and p["targets"]:
                        for target in p["targets"]:
                            environment = p["environment"]
                            service = p["service"]
                            retries = p.get("retries", CONF.ping_max_retries)
                            self.queue.put((environment, service, target, retries, time.time()))

                LOG.debug("Send heartbeat...")
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                time.sleep(CONF.loop_every)
                LOG.info("Ping queue length is %d", self.queue.qsize())
                self.carbon.metric_send("alert.pinger.queueLength", self.queue.qsize())

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info("Shutdown request received...")
        self.running = False

        # Queue one sentinel per worker, then wait for every worker to exit
        for i in range(CONF.server_threads):
            self.queue.put(None)
        for w in self.workers:
            w.join()

        LOG.info("Disconnecting from message broker...")
        self.mq.disconnect()
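init_targets() and the format of CONF.ping_file are not shown. Judging by the keys the loop reads (environment, service, targets and an optional retries), a YAML list along the following lines would satisfy it; this layout is inferred, not the documented format.

import yaml

# Hypothetical contents of /etc/alerta/alert-pinger.targets; inferred from
# the keys the daemon reads, not taken from alerta documentation.
SAMPLE = """
- environment: PROD
  service: [ Network ]
  targets: [ gw1.example.com, gw2.example.com ]
  retries: 3
"""

def init_targets(text=SAMPLE):
    """Sketch of init_targets(): parse the targets file into the list of
    dicts the ping loop above iterates over."""
    return yaml.safe_load(text) or []

for p in init_targets():
    for target in p.get('targets', []):
        print('%s/%s -> ping %s (retries=%s)'
              % (p['environment'], p['service'], target, p.get('retries', 2)))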
Example #37
class UrlmonDaemon(Daemon):

    urlmon_opts = {
        'urlmon_file': '/etc/alerta/alert-urlmon.targets',
        'urlmon_max_timeout': 15,  # seconds
        'urlmon_slow_warning': 2000,  # ms
        'urlmon_slow_critical': 5000,  # ms
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(UrlmonDaemon.urlmon_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=UrlmonMessage(self.mq))

        self.dedup = DeDup()

        self.carbon = Carbon()  # graphite metrics

        # Initialise URL check targets
        urls = init_urls()

        # Start worker threads, keeping the ones that started so that the
        # shutdown path below can join all of them
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        self.workers = []
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            self.workers.append(w)
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                for url in urls:
                    self.queue.put((url, time.time()))

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                time.sleep(CONF.loop_every)
                LOG.info('URL check queue length is %d', self.queue.qsize())
                self.carbon.metric_send('alert.urlmon.queueLength',
                                        self.queue.qsize())

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        # Queue one sentinel per worker, then wait for every worker to exit
        for i in range(CONF.server_threads):
            self.queue.put(None)
        for w in self.workers:
            w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
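The actual URL check happens inside WorkerThread, which is not shown. Below is a sketch of what one check might look like against the urlmon_* thresholds registered above; the severity names and classification rules are assumptions.

import time
import urllib2  # urllib.request on Python 3

def check_url(url, max_timeout=15, slow_warning=2000, slow_critical=5000):
    """Sketch of one urlmon check: fetch the URL, time it in milliseconds
    and classify the result against the urlmon_* thresholds above."""
    start = time.time()
    try:
        rsp = urllib2.urlopen(url, None, max_timeout)
        code = rsp.getcode()
    except urllib2.HTTPError as e:
        code = e.code                     # 4xx/5xx responses still complete
    except urllib2.URLError as e:
        return 'CRITICAL', 'unreachable: %s' % e
    elapsed_ms = int((time.time() - start) * 1000)
    if code >= 500:
        return 'CRITICAL', 'HTTP %d' % code
    if code >= 400:
        return 'WARNING', 'HTTP %d' % code
    if elapsed_ms > slow_critical:
        return 'CRITICAL', 'slow response: %dms' % elapsed_ms
    if elapsed_ms > slow_warning:
        return 'WARNING', 'slow response: %dms' % elapsed_ms
    return 'OK', 'HTTP %d in %dms' % (code, elapsed_ms)

if __name__ == '__main__':
    print(check_url('http://example.com'))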
Example #38
class PingerDaemon(Daemon):

    pinger_opts = {
        'ping_file': '/etc/alerta/alert-pinger.targets',
        'ping_max_timeout': 15,  # seconds
        'ping_max_retries': 2,
        'ping_slow_warning': 5,  # ms
        'ping_slow_critical': 10,  # ms
        'server_threads': 20,
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(PingerDaemon.pinger_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=PingerMessage(self.mq))

        self.dedup = DeDup()

        self.carbon = Carbon()  # graphite metrics

        # Initialise ping targets
        ping_list = init_targets()

        # Start worker threads, keeping the ones that started so that the
        # shutdown path below can join all of them
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        self.workers = []
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            self.workers.append(w)
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                for p in ping_list:
                    if 'targets' in p and p['targets']:
                        for target in p['targets']:
                            environment = p['environment']
                            service = p['service']
                            retries = p.get('retries', CONF.ping_max_retries)
                            self.queue.put(
                                (environment, service, target, retries,
                                 time.time()))

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                time.sleep(CONF.loop_every)
                LOG.info('Ping queue length is %d', self.queue.qsize())
                self.carbon.metric_send('alert.pinger.queueLength',
                                        self.queue.qsize())

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        # Queue one sentinel per worker, then wait for every worker to exit
        for i in range(CONF.server_threads):
            self.queue.put(None)
        for w in self.workers:
            w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #39
class GangliaDaemon(Daemon):
    def __init__(self, prog):

        Daemon.__init__(self, prog)

        self.dedup = DeDup(by_value=True)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=GangliaMessage(self.mq))

        while not self.shuttingdown:
            try:
                rules = init_rules()  # re-read rule config each time
                self.metric_check(rules)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()

    def metric_check(self, rules):

        for rule in rules:
            # Check rule is valid
            if len(rule['thresholdInfo']) != len(rule['text']):
                LOG.warning(
                    'Skipping invalid rule %s - MUST define alert text for each threshold.',
                    rule['event'])
                continue

            # Get list of metrics required to evaluate each rule
            params = dict()
            if 'filter' in rule and rule['filter'] is not None:
                params[rule['filter']] = 1

            for s in (' '.join(rule['text']), ' '.join(rule['thresholdInfo']),
                      rule['value']):
                matches = re.findall(r'\$([a-z0-9A-Z_]+)', s)
                for m in matches:
                    if m != 'now':
                        params['metric=' + m] = 1
            metric_filter = '&'.join(params.keys())
            LOG.debug('Metric filter = %s', metric_filter)

            # Get metric data for each rule
            response = GangliaDaemon.get_metrics(metric_filter)
            LOG.debug('Ganglia API response: %s', response)

            # Make non-metric substitutions in value, thresholdInfo and text
            now = int(time.time())
            rule['value'] = re.sub(r'\$now', str(now), rule['value'])
            for idx, threshold in enumerate(rule['thresholdInfo']):
                rule['thresholdInfo'][idx] = re.sub(r'\$now', str(now),
                                                    threshold)
            for idx, text in enumerate(rule['text']):
                rule['text'][idx] = re.sub(
                    r'\$now',
                    time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(now)),
                    text)

            metric = dict()
            for m in response:

                # Make metric-based substitutions in resource eg. per instance, host or cluster
                resource = re.sub(r'\$instance', m.get('instance', '__NA__'),
                                  rule['resource'])
                resource = re.sub(r'\$host', m.get('host', '__NA__'), resource)
                resource = re.sub(r'\$cluster', m.get('cluster', '__NA__'),
                                  resource)

                if '__NA__' in resource:
                    LOG.debug('Metric %s does not match resource rule %s',
                              m['id'], rule['resource'])
                    continue

                LOG.debug('Metric %s matches rule %s => %s', m['id'],
                          rule['resource'], resource)

                # Don't generate cluster alerts from host-based metrics
                if 'host' in m and '$host' not in rule['resource']:
                    LOG.debug(
                        'Skipping host-based metric for cluster-based rule')
                    continue

                # Build up info for alert if metric value triggers threshold
                if resource not in metric:
                    metric[resource] = dict()
                if 'thresholdInfo' not in metric[resource]:
                    metric[resource]['thresholdInfo'] = list(
                        rule['thresholdInfo'])
                    LOG.debug('Set thresholdInfo to %s',
                              metric[resource]['thresholdInfo'])
                if 'text' not in metric[resource]:
                    metric[resource]['text'] = list(rule['text'])
                    LOG.debug('Set text to %s', metric[resource]['text'])

                if m['metric'] in rule['value']:
                    # Determine service and environment from rule if given
                    if 'environment' in rule:
                        metric[resource]['environment'] = [rule['environment']]
                    else:
                        metric[resource]['environment'] = [m['environment']]
                    LOG.debug('Set environment for alert to %s',
                              metric[resource]['environment'])
                    if 'service' in rule:
                        metric[resource]['service'] = [rule['service']]
                    else:
                        metric[resource]['service'] = [m['service']]
                    LOG.debug('Set service for alert to %s',
                              metric[resource]['service'])

                    # Use raw metric value, or sum or average if aggregated metric
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])  # raw value
                    elif rule['value'].endswith('.sum'):
                        # aggregated sum value if "<metric>.sum"
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            # average of aggregated values
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0
                    LOG.debug('Value for %s on %s is %s', m['id'], resource, v)

                    # If no value yet, start from the rule's value expression
                    if 'value' not in metric[resource]:
                        metric[resource]['value'] = rule['value']
                    metric[resource]['value'] = re.sub(
                        r'\$%s(\.sum)?' % m['metric'], str(v),
                        metric[resource]['value'])
                    metric[resource]['units'] = m['units']

                    # Assign tags
                    metric[resource]['tags'] = list()
                    metric[resource]['tags'].extend(rule['tags'])
                    metric[resource]['tags'].append('cluster:%s' %
                                                    m['cluster'])
                    if 'tags' in m and m['tags'] is not None:
                        metric[resource]['tags'].extend(m['tags'])

                    # Assign graph URL
                    if 'graphUrls' not in metric[resource]:
                        metric[resource]['graphUrls'] = list()
                    if 'graphUrl' in m:
                        metric[resource]['graphUrls'].append(m['graphUrl'])

                    for g in rule['graphs']:
                        if '$host' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append(
                                '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                                '/graph.php?c=%s&h=%s&m=%s&r=1day&v=0&z=default'
                                % (m['cluster'], m['host'], g))
                        if '$cluster' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append(
                                '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                                '/graph.php?c=%s&m=%s&r=1day&v=0&z=default' %
                                (m['cluster'], g))

                    metric[resource]['moreInfo'] = ''
                    if '$host' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(
                            m['graphUrl'].rsplit('/', 2)
                            [0:2]) + '/?c=%s&h=%s' % (m['cluster'], m['host'])
                    if '$cluster' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(
                            m['graphUrl'].rsplit(
                                '/', 2)[0:2]) + '/?c=%s' % m['cluster']

                # Substitutions for threshold info
                if m['metric'] in ''.join(rule['thresholdInfo']):
                    LOG.debug('Text to be substituted: %s',
                              ''.join(rule['thresholdInfo']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    for idx, threshold in enumerate(
                            metric[resource]['thresholdInfo']):
                        metric[resource]['thresholdInfo'][idx] = re.sub(
                            r'\$%s(\.sum)?' % m['metric'], str(v), threshold)

                # Substitutions for text
                if m['metric'] in ''.join(rule['text']):
                    LOG.debug('Text to be substituted: %s',
                              ''.join(rule['text']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    if m['type'] == 'timestamp' or m['units'] == 'timestamp':
                        v = time.strftime('%Y/%m/%d %H:%M:%S',
                                          time.localtime(float(v)))

                    LOG.debug('Metric resource text %s', metric)
                    for idx, text in enumerate(metric[resource]['text']):
                        metric[resource]['text'][idx] = re.sub(
                            r'\$%s(\.sum)?' % m['metric'], str(v), text)
                LOG.debug('end of metric loop')

            for resource in metric:
                LOG.debug('Calculate final value for resource %s', resource)
                index = 0
                try:
                    calculated_value = eval(metric[resource]['value'])
                except KeyError:
                    LOG.warning(
                        'Could not calculate %s value for %s because %s is not being reported',
                        rule['event'], resource, rule['value'])
                    continue
                except (SyntaxError, NameError):
                    LOG.error(
                        'Could not calculate %s value for %s => eval(%s)',
                        rule['event'], resource, metric[resource]['value'])
                    continue
                except ZeroDivisionError:
                    LOG.debug(
                        'Could not calculate %s value for %s => eval(%s) (division by zero).  Setting to 0 instead.',
                        rule['event'], resource, metric[resource]['value'])
                    calculated_value = 0
                except Exception:
                    LOG.error(
                        'Could not calculate %s value for %s => eval(%s) (threw unknown exception)',
                        rule['event'], resource, metric[resource]['value'])
                    continue

                LOG.debug('Calculated value for resource %s => %s', resource,
                          calculated_value)

                # Compare final value with each threshold
                for ti in metric[resource]['thresholdInfo']:
                    severity, op, threshold = ti.split(':')
                    rule_eval = '%s %s %s' % (
                        GangliaDaemon.quote(calculated_value), op, threshold)
                    try:
                        result = eval(rule_eval)
                    except SyntaxError:
                        LOG.error(
                            'Could not evaluate %s threshold for %s => eval(%s)',
                            rule['event'], resource, rule_eval)
                        result = False

                    if result:

                        event = rule['event']
                        group = rule['group']
                        value = "%s%s" % (calculated_value,
                                          GangliaDaemon.format_units(
                                              metric[resource]['units']))
                        environment = metric[resource]['environment']
                        service = metric[resource]['service']
                        text = metric[resource]['text'][index]
                        tags = metric[resource]['tags']
                        threshold_info = ','.join(rule['thresholdInfo'])
                        more_info = metric[resource]['moreInfo']
                        graph_urls = metric[resource]['graphUrls']

                        gangliaAlert = Alert(
                            resource=resource,
                            event=event,
                            group=group,
                            value=value,
                            severity=severity,
                            environment=environment,
                            service=service,
                            text=text,
                            event_type='gangliaAlert',
                            tags=tags,
                            threshold_info=threshold_info,
                            more_info=more_info,
                            graph_urls=graph_urls,
                            raw_data='',  # TODO(nsatterl): put raw metric values used to do calculation here
                        )

                        if self.dedup.is_send(gangliaAlert):
                            self.mq.send(gangliaAlert)

                        break  # First match wins
                    index += 1

    @staticmethod
    def get_metrics(metric_filter):
        url = "http://%s:%s/ganglia/api/v1/metrics?%s" % (
            CONF.ganglia_host, CONF.ganglia_port, metric_filter)
        LOG.info('Metric request %s', url)

        try:
            r = urllib2.urlopen(url, None, 15)
        except urllib2.URLError as e:
            LOG.error('Could not retrieve metric data from %s - %s', url, e)
            return dict()

        if r.getcode() is None:
            LOG.error('Error during connection or data transfer (timeout=%d)',
                      15)
            return dict()

        response = json.loads(r.read())['response']
        if response['status'] == 'error':
            LOG.error('No metrics retrieved - %s', response['message'])
            return dict()

        LOG.info('Retrieved %s matching metrics in %ss', response['total'],
                 response['time'])

        return response['metrics']
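Note that metric_check() builds strings and passes them to eval(), both for the substituted value expression and for each severity:op:threshold comparison, so the rule file can execute arbitrary code. For the threshold comparison, an operator lookup table avoids eval() entirely; the following is an alternative sketch, not what the daemon above does.

import operator

# Map the comparison tokens a rule may use onto functions, instead of
# building a Python expression string and eval()ing it.
OPS = {
    '>': operator.gt, '>=': operator.ge,
    '<': operator.lt, '<=': operator.le,
    '==': operator.eq, '!=': operator.ne,
}

def threshold_matches(calculated_value, threshold_info):
    """threshold_info is a 'severity:op:threshold' string, as in the rule
    files above; returns (severity, matched)."""
    severity, op, threshold = threshold_info.split(':')
    try:
        matched = OPS[op](float(calculated_value), float(threshold))
    except (KeyError, ValueError):
        return severity, False  # unknown operator or non-numeric operand
    return severity, matched

print(threshold_matches(97.5, 'major:>:95'))  # -> ('major', True)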
Example #40
class UrlmonDaemon(Daemon):

    urlmon_opts = {
        'urlmon_file': '/etc/alerta/alert-urlmon.targets',
        'urlmon_max_timeout': 15,  # seconds
        'urlmon_slow_warning': 2000,   # ms
        'urlmon_slow_critical': 5000,  # ms
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(UrlmonDaemon.urlmon_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=UrlmonMessage(self.mq))

        self.dedup = DeDup()

        self.carbon = Carbon()  # graphite metrics

        # Initialise URL check targets
        urls = init_urls()

        # Start worker threads, keeping the ones that started so that the
        # shutdown path below can join all of them
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        self.workers = []
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            self.workers.append(w)
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                for url in urls:
                    self.queue.put((url, time.time()))

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                time.sleep(CONF.loop_every)
                LOG.info('URL check queue length is %d', self.queue.qsize())
                self.carbon.metric_send('alert.urlmon.queueLength', self.queue.qsize())

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        # Queue one sentinel per worker, then wait for every worker to exit
        for i in range(CONF.server_threads):
            self.queue.put(None)
        for w in self.workers:
            w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #41
class PingerDaemon(Daemon):

    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=PingerMessage(self.mq))

        self.dedup = DeDup()

        self.carbon = Carbon()  # graphite metrics
        self.statsd = StatsD()  # graphite metrics

        # Initialise ping targets
        ping_list = init_targets()

        # Start worker threads, keeping the ones that started so that the
        # shutdown path below can join all of them
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        self.workers = []
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon, self.statsd)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            self.workers.append(w)
            LOG.info('Started worker thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                for p in ping_list:
                    if 'targets' in p and p['targets']:
                        for target in p['targets']:
                            environment = p['environment']
                            service = p['service']
                            retries = p.get('retries', CONF.ping_max_retries)
                            self.queue.put((environment, service, target, retries))

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.info('Ping queue length is %d', self.queue.qsize())
                time.sleep(CONF.loop_every)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        # Queue one sentinel per worker, then wait for every worker to exit
        for i in range(CONF.server_threads):
            self.queue.put(None)
        for w in self.workers:
            w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #42
class GangliaDaemon(Daemon):

    def __init__(self, prog):

        Daemon.__init__(self, prog)

        self.dedup = DeDup(by_value=True)

    def run(self):
        
        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=GangliaMessage(self.mq))

        while not self.shuttingdown:
            try:
                rules = init_rules()  # re-read rule config each time
                self.metric_check(rules)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()

    def metric_check(self, rules):

        for rule in rules:
            # Check rule is valid
            if len(rule['thresholdInfo']) != len(rule['text']):
                LOG.warning('Skipping invalid rule %s - MUST define alert text for each threshold.', rule['event'])
                continue

            # Get list of metrics required to evaluate each rule
            params = dict()
            if 'filter' in rule and rule['filter'] is not None:
                params[rule['filter']] = 1

            for s in (' '.join(rule['text']), ' '.join(rule['thresholdInfo']), rule['value']):
                matches = re.findall(r'\$([a-z0-9A-Z_]+)', s)
                for m in matches:
                    if m != 'now':
                        params['metric=' + m] = 1
            metric_filter = '&'.join(params.keys())
            LOG.debug('Metric filter = %s', metric_filter)

            # Get metric data for each rule
            response = GangliaDaemon.get_metrics(metric_filter)
            LOG.debug('Ganglia API response: %s', response)

            # Make non-metric substitutions in value, thresholdInfo and text
            now = int(time.time())
            rule['value'] = re.sub(r'\$now', str(now), rule['value'])
            for idx, threshold in enumerate(rule['thresholdInfo']):
                rule['thresholdInfo'][idx] = re.sub(r'\$now', str(now), threshold)
            for idx, text in enumerate(rule['text']):
                rule['text'][idx] = re.sub(r'\$now', time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(now)), text)

            metric = dict()
            for m in response:

                # Make metric-based substitutions in resource eg. per instance, host or cluster
                resource = re.sub(r'\$instance', m.get('instance', '__NA__'), rule['resource'])
                resource = re.sub(r'\$host', m.get('host', '__NA__'), resource)
                resource = re.sub(r'\$cluster', m.get('cluster', '__NA__'), resource)

                if '__NA__' in resource:
                    LOG.debug('Metric %s does not match resource rule %s', m['id'], rule['resource'])
                    continue

                LOG.debug('Metric %s matches rule %s => %s', m['id'], rule['resource'], resource)

                # Don't generate cluster alerts from host-based metrics
                if 'host' in m and '$host' not in rule['resource']:
                    LOG.debug('Skipping host-based metric for cluster-based rule')
                    continue

                # Build up info for alert if metric value triggers threshold
                if resource not in metric:
                    metric[resource] = dict()
                if 'thresholdInfo' not in metric[resource]:
                    metric[resource]['thresholdInfo'] = list(rule['thresholdInfo'])
                    LOG.debug('Set thresholdInfo to %s', metric[resource]['thresholdInfo'])
                if 'text' not in metric[resource]:
                    metric[resource]['text'] = list(rule['text'])
                    LOG.debug('Set text to %s', metric[resource]['text'])

                if m['metric'] in rule['value']:
                    # Determine service and environment from rule if given
                    if 'environment' in rule:
                        metric[resource]['environment'] = [rule['environment']]
                    else:
                        metric[resource]['environment'] = [m['environment']]
                    LOG.debug('Set environment for alert to %s', metric[resource]['environment'])
                    if 'service' in rule:
                        metric[resource]['service'] = [rule['service']]
                    else:
                        metric[resource]['service'] = [m['service']]
                    LOG.debug('Set service for alert to %s', metric[resource]['service'])

                    # Use raw metric value, or sum or average if aggregated metric
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])  # raw value
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])  # aggregated sum value if "<metric>.sum"
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))  # average of aggregate value
                        except ZeroDivisionError:
                            v = 0.0
                    LOG.debug('Value for %s on %s is %s', m['id'], resource, v)

                    # If no value yet, start from the rule's value expression
                    if 'value' not in metric[resource]:
                        metric[resource]['value'] = rule['value']
                    metric[resource]['value'] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v),
                                                       metric[resource]['value'])
                    metric[resource]['units'] = m['units']

                    # Assign tags
                    metric[resource]['tags'] = list()
                    metric[resource]['tags'].extend(rule['tags'])
                    metric[resource]['tags'].append('cluster:%s' % m['cluster'])
                    if 'tags' in m and m['tags'] is not None:
                        metric[resource]['tags'].extend(m['tags'])

                    # Assign graph URL
                    if 'graphUrls' not in metric[resource]:
                        metric[resource]['graphUrls'] = list()
                    if 'graphUrl' in m:
                        metric[resource]['graphUrls'].append(m['graphUrl'])

                    for g in rule['graphs']:
                        if '$host' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append('/'.join(m['graphUrl'].rsplit('/', 2)[0:2])
                                                                 + '/graph.php?c=%s&h=%s&m=%s&r=1day&v=0&z=default'
                                                                 % (m['cluster'], m['host'], g))
                        if '$cluster' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append('/'.join(m['graphUrl'].rsplit('/', 2)[0:2])
                                                                 + '/graph.php?c=%s&m=%s&r=1day&v=0&z=default'
                                                                 % (m['cluster'], g))

                    metric[resource]['moreInfo'] = ''
                    if '$host' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(
                            m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s&h=%s' % (m['cluster'], m['host'])
                    if '$cluster' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s' % m['cluster']

                # Substitutions for threshold info
                if m['metric'] in ''.join(rule['thresholdInfo']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['thresholdInfo']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    for idx, threshold in enumerate(metric[resource]['thresholdInfo']):
                        metric[resource]['thresholdInfo'][idx] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v),
                                                                        threshold)

                # Substitutions for text
                if m['metric'] in ''.join(rule['text']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['text']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    if m['type'] == 'timestamp' or m['units'] == 'timestamp':
                        v = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(float(v)))

                    LOG.debug('Metric resource text %s', metric)
                    for idx, text in enumerate(metric[resource]['text']):
                        metric[resource]['text'][idx] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v), text)
                LOG.debug('end of metric loop')

            for resource in metric:
                LOG.debug('Calculate final value for resource %s', resource)
                index = 0
                try:
                    calculated_value = eval(metric[resource]['value'])
                except KeyError:
                    LOG.warning('Could not calculate %s value for %s because %s is not being reported',
                                rule['event'], resource, rule['value'])
                    continue
                except (SyntaxError, NameError):
                    LOG.error('Could not calculate %s value for %s => eval(%s)', rule['event'], resource,
                              metric[resource]['value'])
                    continue
                except ZeroDivisionError:
                    LOG.debug(
                        'Could not calculate %s value for %s => eval(%s) (division by zero).  Setting to 0 instead.',
                        rule['event'], resource, metric[resource]['value'])
                    calculated_value = 0
                except Exception:
                    LOG.error('Could not calculate %s value for %s => eval(%s) (threw unknown exception)',
                              rule['event'], resource, metric[resource]['value'])
                    continue

                LOG.debug('Calculated value for resource %s => %s', resource, calculated_value)

                # Compare final value with each threshold
                for ti in metric[resource]['thresholdInfo']:
                    severity, op, threshold = ti.split(':')
                    rule_eval = '%s %s %s' % (GangliaDaemon.quote(calculated_value), op, threshold)
                    try:
                        result = eval(rule_eval)
                    except SyntaxError:
                        LOG.error('Could not evaluate %s threshold for %s => eval(%s)', rule['event'],
                                  resource, rule_eval)
                        result = False

                    if result:

                        event = rule['event']
                        group = rule['group']
                        value = "%s%s" % (calculated_value, GangliaDaemon.format_units(metric[resource]['units']))
                        environment = metric[resource]['environment']
                        service = metric[resource]['service']
                        text = metric[resource]['text'][index]
                        tags = metric[resource]['tags']
                        threshold_info = ','.join(rule['thresholdInfo'])
                        more_info = metric[resource]['moreInfo']
                        graph_urls = metric[resource]['graphUrls']

                        gangliaAlert = Alert(
                            resource=resource,
                            event=event,
                            group=group,
                            value=value,
                            severity=severity,
                            environment=environment,
                            service=service,
                            text=text,
                            event_type='gangliaAlert',
                            tags=tags,
                            threshold_info=threshold_info,
                            more_info=more_info,
                            graph_urls=graph_urls,
                            raw_data='',  # TODO(nsatterl): put raw metric values used to do calculation here
                        )

                        if self.dedup.is_send(gangliaAlert):
                            self.mq.send(gangliaAlert)

                        break  # First match wins
                    index += 1

    @staticmethod
    def get_metrics(metric_filter):
        url = "http://%s:%s/ganglia/api/v1/metrics?%s" % (CONF.ganglia_host, CONF.ganglia_port, metric_filter)
        LOG.info('Metric request %s', url)

        try:
            r = urllib2.urlopen(url, None, 15)
        except urllib2.URLError as e:
            LOG.error('Could not retrieve metric data from %s - %s', url, e)
            return dict()

        if r.getcode() is None:
            LOG.error('Error during connection or data transfer (timeout=%d)', 15)
            return dict()

        response = json.loads(r.read())['response']
        if response['status'] == 'error':
            LOG.error('No metrics retrieved - %s', response['message'])
            return dict()

        LOG.info('Retrieved %s matching metrics in %ss', response['total'], response['time'])

        return response['metrics']
Example #43
import sys

from flask import Flask

from alerta.common import config
from alerta.common import log as logging
from alerta.common.mq import Messaging
from alerta.server.database import Mongo

Version = '2.1.0'

LOG = logging.getLogger(__name__)
CONF = config.CONF

config.parse_args(version=Version)
logging.setup('alerta')

app = Flask(__name__)
app.config.from_object(__name__)
db = Mongo()

mq = Messaging()
mq.connect()

import views
import management.views
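The two trailing imports are deliberate: views and management.views import this module back to reach app and register their routes on it, so they can only be imported after app exists. A stripped-down sketch of the pattern, shown as two files in one listing (module and route names are hypothetical):

# app.py -- the application object must exist before the views are imported
from flask import Flask

app = Flask(__name__)

import views  # noqa: E402  (deliberately placed after 'app' is created)

# views.py -- imports 'app' back from app.py and registers routes on it
from app import app

@app.route('/alerts')
def list_alerts():
    return 'OK'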

Example #44
class IrcbotDaemon(Daemon):

    ircbot_opts = {
        'irc_host': 'localhost',
        'irc_port': 6667,
        'irc_channel': '#alerts',
        'irc_user': '******',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(IrcbotDaemon.ircbot_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # An IRC client may send 1 message every 2 seconds
        # See section 5.8 in http://datatracker.ietf.org/doc/rfc2813/
        tokens = LeakyBucket(tokens=20, rate=2)
        tokens.start()

        # Connect to IRC server
        try:
            irc = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            irc.connect((CONF.irc_host, CONF.irc_port))
            time.sleep(1)
            irc.send('NICK %s\r\n' % CONF.irc_user)
            time.sleep(1)
            irc.send('USER %s 8 * : %s\r\n' % (CONF.irc_user, CONF.irc_user))
            LOG.debug('USER -> %s', irc.recv(4096))
            time.sleep(1)
            irc.send('JOIN %s\r\n' % CONF.irc_channel)
            LOG.debug('JOIN ->  %s', irc.recv(4096))
        except Exception as e:
            LOG.error('IRC connection error: %s', e)
            sys.exit(1)

        LOG.info('Joined IRC channel %s on %s as USER %s', CONF.irc_channel, CONF.irc_host, CONF.irc_user)

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=IrcbotMessage(self.mq, irc, tokens))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for IRC messages...')
                ip, op, rdy = select.select([irc], [], [], CONF.loop_every)
                if ip:
                    for i in ip:
                        if i == irc:
                            data = irc.recv(4096).rstrip('\r\n')
                            if len(data) > 0:
                                if 'ERROR' in data:
                                    LOG.error('%s. Exiting...', data)
                                    sys.exit(1)
                                else:
                                    LOG.debug('%s', data)
                            else:
                                LOG.warning('IRC server sent no data')
                            if 'PING' in data:
                                LOG.info('IRC PING received -> PONG ' + data.split()[1])
                                irc.send('PONG ' + data.split()[1] + '\r\n')
                            elif 'ack' in data.lower():
                                LOG.info('Request to ACK %s by %s', data.split()[4], data.split()[0])
                                ack_alert(data.split()[4])
                            elif 'delete' in data.lower():
                                LOG.info('Request to DELETE %s by %s', data.split()[4], data.split()[0])
                                delete_alert(data.split()[4])
                            elif '!alerta quit' in data:
                                irc.send('QUIT\r\n')
                            else:
                                LOG.warning('IRC: %s', data)
                        else:
                            i.recv()
                else:
                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False
        tokens.shutdown()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
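The IRC writes are throttled through LeakyBucket(tokens=20, rate=2), matching the one-message-per-two-seconds guidance cited from RFC 2813 section 5.8. The class itself is not shown; a minimal token bucket with the same start()/shutdown() surface might look like the sketch below, where the get_token() name is an assumption about how the message callback consumes tokens.

import threading
import time

class LeakyBucket(threading.Thread):
    """Sketch of a token bucket: get_token() blocks until a token is
    available; one token is refilled every 'rate' seconds."""

    def __init__(self, tokens=20, rate=2):
        threading.Thread.__init__(self)
        self.daemon = True
        self.rate = rate
        self._tokens = threading.BoundedSemaphore(tokens)
        self._shutdown = threading.Event()

    def run(self):
        while not self._shutdown.is_set():
            time.sleep(self.rate)
            try:
                self._tokens.release()  # refill one token per interval
            except ValueError:
                pass                    # bucket already full

    def get_token(self):
        self._tokens.acquire()          # blocks while the bucket is empty

    def shutdown(self):
        self._shutdown.set()

Presumably the IrcbotMessage callback calls get_token() before each irc.send(), so bursts drain the bucket and sustained traffic settles at one message per rate seconds.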
Example #45
import sys

from flask import Flask

from alerta.common import config
from alerta.common import log as logging
from alerta.common.mq import Messaging
from alerta.server.database import Mongo

Version = '2.0.1'

LOG = logging.getLogger(__name__)
CONF = config.CONF

config.parse_args(sys.argv[1:], version=Version)
logging.setup('alerta')

app = Flask(__name__)
app.config.from_object(__name__)
db = Mongo()

mq = Messaging()
mq.connect()

import views
import management.views

Example #46
class AwsDaemon(Daemon):

    aws_opts = {
        'fog_file': '/etc/fog/alerta.conf',
        'ec2_regions': ['eu-west-1', 'us-east-1'],
        'http_proxy': None,
        'https_proxy': None,
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(AwsDaemon.aws_opts)

        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last = {}
        self.lookup = {}
        self.dedup = DeDup()

    def run(self):

        self.running = True

        # Read in FOG config file
        try:
            with open(CONF.fog_file) as f:
                self.fog = yaml.safe_load(f)  # safe_load: the file is plain data
        except IOError as e:
            LOG.error('Could not read AWS credentials file %s: %s',
                      CONF.fog_file, e)
            sys.exit(1)

        if not self.fog:
            LOG.error('No AWS credentials found in FOG file %s. Exiting...',
                      CONF.fog_file)
            sys.exit(1)

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=AwsMessage(self.mq))

        if CONF.http_proxy:
            os.environ['http_proxy'] = CONF.http_proxy
        if CONF.https_proxy:
            os.environ['https_proxy'] = CONF.https_proxy

        while not self.shuttingdown:
            try:
                self.ec2_status_check()

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
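AwsDaemon expects Ruby fog-style credentials in CONF.fog_file and only checks that the parsed YAML is non-empty. In fog's own convention the file maps account names to AWS key pairs (fog's native files often prefix keys with Ruby-symbol colons; the plain-key layout below is an assumption), so a placeholder file could be parsed like this.

import yaml

# Placeholder credentials in a fog-style layout; account name and keys are
# illustrative only and the exact key spelling is an assumption.
FOG_SAMPLE = """
production:
  aws_access_key_id: AKIAXXXXXXXXXXXXXXXX
  aws_secret_access_key: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
"""

fog = yaml.safe_load(FOG_SAMPLE)
for account, creds in fog.items():
    print('account %s uses access key %s...'
          % (account, creds['aws_access_key_id'][:4]))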