Example 1
class LoggerDaemon(Daemon):
    """
    Index alerts in ElasticSearch using Logstash format so that logstash GUI and/or Kibana can be used as front-ends
    """
    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=LoggerMessage())
        self.mq.subscribe(destination=CONF.outbound_queue)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for log messages...')
                time.sleep(30)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat()
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
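
All of these daemons share the same run() skeleton: set a running flag, connect to the broker, loop until a shutdown flag is set, send a periodic Heartbeat, then disconnect cleanly. A minimal, self-contained sketch of just that loop, with a hypothetical MiniDaemon class standing in for the alerta Daemon/Messaging classes, might look like this:

import logging
import signal
import time

logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)


class MiniDaemon(object):
    """Hypothetical stand-in for the Daemon subclasses above; no broker involved."""

    def __init__(self, loop_every=30):
        self.loop_every = loop_every
        self.shuttingdown = False
        self.running = False

    def run(self):
        self.running = True
        # SIGTERM handling complements the KeyboardInterrupt/SystemExit catch below
        signal.signal(signal.SIGTERM, lambda signum, frame: setattr(self, 'shuttingdown', True))

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for messages...')
                time.sleep(self.loop_every)
                LOG.debug('Send heartbeat...')  # a real daemon would send a Heartbeat here
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False


if __name__ == '__main__':
    MiniDaemon(loop_every=5).run()
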
Example 2
File: daemon.py Project: ob3/alerta
class PagerDutyDaemon(Daemon):

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=PagerDutyMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_topic)   # TODO(nsatterl): use dedicated queue?

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for PagerDuty messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example 3
class MailerDaemon(Daemon):
    def run(self):

        self.running = True

        # Start token bucket thread
        self.tokens = LeakyBucket(tokens=20, rate=30)
        self.tokens.start()

        self.onhold = dict()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(
            callback=MailerMessage(self.mq, self.onhold, self.tokens))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                LOG.debug('Send email messages...')
                for alertid in self.onhold.keys():
                    try:
                        (mailAlert, hold_time) = self.onhold[alertid]
                    except KeyError:
                        continue

                    if time.time() > hold_time:
                        if not self.tokens.get_token():
                            LOG.warning(
                                '%s : No tokens left, rate limiting this alert',
                                alertid)
                            continue

                        email = Mailer(mailAlert)
                        mail_to = CONF.mail_list.split(',')

                        if 'mailto' in mailAlert.tags:
                            mail_to.append(mailAlert.tags['mailto'])
                        email.send(mail_to=mail_to)
                        try:
                            del self.onhold[alertid]
                        except KeyError:
                            continue

                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False
        self.tokens.shutdown()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
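
The rate limiting here depends on a LeakyBucket thread whose implementation is not shown. A rough token-bucket sketch with the same get_token()/shutdown() interface, under the assumption that the semantics are "start with `tokens` tokens and top one up every `rate` seconds", could be:

import threading


class TokenBucket(threading.Thread):
    """Minimal token-bucket thread, assumed to behave like the LeakyBucket used above."""

    def __init__(self, tokens=20, rate=30):
        super(TokenBucket, self).__init__()
        self.daemon = True
        self._lock = threading.Lock()
        self._tokens = tokens
        self._max_tokens = tokens
        self._rate = rate
        self._shutdown = threading.Event()

    def run(self):
        # Refill one token every `rate` seconds until shutdown() is called
        while not self._shutdown.wait(self._rate):
            with self._lock:
                if self._tokens < self._max_tokens:
                    self._tokens += 1

    def get_token(self):
        # Returns True and consumes a token if one is available, else False
        with self._lock:
            if self._tokens > 0:
                self._tokens -= 1
                return True
            return False

    def shutdown(self):
        self._shutdown.set()
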
Example 4
class MailerDaemon(Daemon):

    def run(self):

        self.running = True

        # Start token bucket thread
        self.tokens = LeakyBucket(tokens=20, rate=30)
        self.tokens.start()

        self.onhold = dict()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=MailerMessage(self.mq, self.onhold, self.tokens))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                LOG.debug('Send email messages...')
                for alertid in self.onhold.keys():
                    try:
                        (mailAlert, hold_time) = self.onhold[alertid]
                    except KeyError:
                        continue

                    if time.time() > hold_time:
                        if not self.tokens.get_token():
                            LOG.warning('%s : No tokens left, rate limiting this alert', alertid)
                            continue

                        email = Mailer(mailAlert)
                        mail_to = CONF.mail_list.split(',')

                        for tag in mailAlert.tags:
                            if tag.startswith('email'):
                                mail_to.append(tag.split(':')[1])
                        email.send(mail_to=mail_to)
                        try:
                            del self.onhold[alertid]
                        except KeyError:
                            continue

                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False
        self.tokens.shutdown()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
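
The only difference between this example and the previous one is how extra recipients are pulled from the alert tags: one treats tags as a dict with a 'mailto' key, the other as a list of 'email:<address>' strings. A small helper that accepts both shapes (an illustrative function, not part of alerta) could look like:

def extra_recipients(tags):
    """Extract extra e-mail recipients from alert tags.

    Supports both tag shapes seen above: a dict with a 'mailto' key,
    or a list of 'key:value' strings such as 'email:ops@example.com'.
    Illustrative helper only.
    """
    recipients = []
    if isinstance(tags, dict):
        if 'mailto' in tags:
            recipients.append(tags['mailto'])
    else:
        for tag in tags or []:
            if tag.startswith('email:'):
                recipients.append(tag.split(':', 1)[1])
    return recipients


# Usage mirroring the loops above:
# mail_to = CONF.mail_list.split(',') + extra_recipients(mailAlert.tags)
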
Example 5
class UrlmonDaemon(Daemon):
    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect()

        # Initialise alert rules
        init_urls()
        url_mod_time = os.path.getmtime(URLFILE)

        # Start worker threads
        for i in range(NUM_THREADS):
            w = WorkerThread(queue)
            w.start()
            LOG.info('Starting thread: %s', w.getName())

        while not self.shuttingdown:
            try:
                # Read (or re-read) urls as necessary
                if os.path.getmtime(URLFILE) != url_mod_time:
                    init_urls()
                    url_mod_time = os.path.getmtime(URLFILE)

                for url in urls:
                    queue.put(('url', url))
                queue.put(('timestamp', time.time()))

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat()
                self.mq.send(heartbeat)

                time.sleep(_check_rate)

                urlmon_qsize = queue.qsize()
                LOG.info('URL check queue length is %d', urlmon_qsize)

                if GMETRIC_SEND:
                    gmetric_cmd = "%s --name urlmon_qsize --value %d --type uint16 --units \" \" --slope both --group urlmon %s" % (
                        GMETRIC_CMD, urlmon_qsize, GMETRIC_OPTIONS)
                    LOG.debug("%s", gmetric_cmd)
                    os.system("%s" % gmetric_cmd)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        for i in range(NUM_THREADS):
            queue.put(('stop', None))
        w.join()

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
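
The URL checks are fanned out to WorkerThread instances over a shared queue, with ('stop', None) tuples used as shutdown sentinels; note that the loop above only joins the last worker it started. A self-contained sketch of that queue protocol (not the real urlmon worker), which keeps the workers in a list and joins each of them on shutdown:

import threading

try:
    import Queue as queue_module  # Python 2, matching the examples here
except ImportError:
    import queue as queue_module  # Python 3


class WorkerThread(threading.Thread):
    """Illustrative worker for the ('url', url) / ('stop', None) protocol above."""

    def __init__(self, work_queue):
        super(WorkerThread, self).__init__()
        self.queue = work_queue

    def run(self):
        while True:
            kind, payload = self.queue.get()
            if kind == 'stop':
                break
            if kind == 'url':
                pass  # check the URL here and raise an alert if needed


work_queue = queue_module.Queue()
workers = [WorkerThread(work_queue) for _ in range(4)]
for w in workers:
    w.start()

# Shutdown: one sentinel per worker, then join them all
for _ in workers:
    work_queue.put(('stop', None))
for w in workers:
    w.join()
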
Example 6
    def run(self):

        data = sys.stdin.read()
        LOG.info('snmptrapd -> %s', data)

        snmptrapAlert = self.parse_snmptrap(data)

        mq = Messaging()
        mq.connect()
        mq.send(snmptrapAlert)
        mq.disconnect()
Example 7
class NotifyDaemon(Daemon):

    def run(self):

        self.running = True

        # Initialise alert config
        init_config()
        config_mod_time = os.path.getmtime(CONF.yaml_config)

        # Start token bucket thread
        _TokenThread = TokenTopUp()
        _TokenThread.start()

        # Start notify thread
        _NotifyThread = ReleaseThread()
        _NotifyThread.start()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=NotifyMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                # Read (or re-read) config as necessary
                if os.path.getmtime(CONF.yaml_config) != config_mod_time:
                    init_config()
                    config_mod_time = os.path.getmtime(CONF.yaml_config)

                LOG.debug('Waiting for email messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        _TokenThread.shutdown()
        _NotifyThread.shutdown()

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example 8
class NotifyDaemon(Daemon):
    def run(self):

        self.running = True

        # Initialise alert config
        init_config()
        config_mod_time = os.path.getmtime(CONF.yaml_config)

        # Start token bucket thread
        _TokenThread = TokenTopUp()
        _TokenThread.start()

        # Start notify thread
        _NotifyThread = ReleaseThread()
        _NotifyThread.start()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=NotifyMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_topic)

        while not self.shuttingdown:
            try:
                # Read (or re-read) config as necessary
                if os.path.getmtime(CONF.yaml_config) != config_mod_time:
                    init_config()
                    config_mod_time = os.path.getmtime(CONF.yaml_config)

                LOG.debug('Waiting for email messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        _TokenThread.shutdown()
        _NotifyThread.shutdown()

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
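
Both NotifyDaemon variants re-read their YAML config whenever the file's mtime changes (which is why config_mod_time must be set before the first comparison). That check can be factored into a tiny helper; a sketch, assuming init_config() is cheap to call repeatedly:

import os


class ConfigWatcher(object):
    """Reload a config file whenever its mtime changes (illustrative helper)."""

    def __init__(self, path, loader):
        self.path = path
        self.loader = loader
        self.mtime = None

    def refresh(self):
        mtime = os.path.getmtime(self.path)
        if mtime != self.mtime:
            self.loader()
            self.mtime = mtime
            return True
        return False


# In the loop above:  watcher = ConfigWatcher(CONF.yaml_config, init_config)
#                     watcher.refresh()
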
Example 9
class LoggerDaemon(Daemon):
    """
    Index alerts in ElasticSearch using Logstash format so that logstash GUI and/or Kibana can be used as front-ends
    """

    logger_opts = {
        'es_host': 'localhost',
        'es_port': 9200,
        'es_index': 'alerta-%Y.%m.%d',  # NB. Kibana config must match this index
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(LoggerDaemon.logger_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=LoggerMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_queue)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for log messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example 10
class LoggerDaemon(Daemon):
    """
    Index alerts in ElasticSearch using Logstash format so that logstash GUI and/or Kibana can be used as front-ends
    """

    logger_opts = {
        'es_host': 'localhost',
        'es_port': 9200,
        'es_index': 'alerta-%Y.%m.%d',  # NB. Kibana config must match this index
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(LoggerDaemon.logger_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=LoggerMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_queue)

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for log messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
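
The es_index option is a strftime pattern, so alerts are indexed into one Elasticsearch index per day, which is why the comment warns that the Kibana configuration must match. Rendering such an index name is a one-liner (purely illustrative):

import time

es_index = 'alerta-%Y.%m.%d'
index_name = time.strftime(es_index, time.gmtime())  # e.g. 'alerta-2013.06.15' for 15 June 2013
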
Example 11
class PagerDutyDaemon(Daemon):

    pagerduty_opts = {
        'pagerduty_endpoint': 'https://events.pagerduty.com/generic/2010-04-15/create_event.json',
        'pagerduty_api_key': '',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(PagerDutyDaemon.pagerduty_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=PagerDutyMessage(self.mq))
        self.mq.subscribe(destination=CONF.outbound_topic)   # TODO(nsatterl): use dedicated queue?

        while not self.shuttingdown:
            try:
                LOG.debug('Waiting for PagerDuty messages...')
                time.sleep(CONF.loop_every)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example 12
    def main(self):

        if CONF.heartbeat:
            msg = Heartbeat(
                origin=CONF.origin,
                version=__version__,
            )
        else:
            msg = Alert(
                resource=CONF.resource,
                event=CONF.event,
                correlate=CONF.correlate,
                group=CONF.group,
                value=CONF.value,
                severity=CONF.severity,
                environment=CONF.environment,
                service=CONF.service,
                text=CONF.text,
                event_type='exceptionAlert',  # TODO(nsatterl): make this configurable?
                tags=CONF.tags,
                origin=CONF.origin,
                threshold_info='n/a',   # TODO(nsatterl): make this configurable?
                timeout=CONF.timeout,
            )

        if CONF.dry_run:
            print msg
        else:
            LOG.debug('Message => %s', repr(msg))

            mq = Messaging()
            mq.connect()
            mq.send(msg)
            mq.disconnect()

        return msg.get_id()
Example 13
class GangliaDaemon(Daemon):
    def __init__(self, prog):

        Daemon.__init__(self, prog)

        self.dedup = DeDup(by_value=True)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=GangliaMessage(self.mq))

        while not self.shuttingdown:
            try:
                rules = init_rules()  # re-read rule config each time
                self.metric_check(rules)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()

    def metric_check(self, rules):

        for rule in rules:
            # Check rule is valid
            if len(rule['thresholdInfo']) != len(rule['text']):
                LOG.warning(
                    'Skipping invalid rule %s - MUST define alert text for each threshold.',
                    rule['event'])
                continue

            # Get list of metrics required to evaluate each rule
            params = dict()
            if 'filter' in rule and rule['filter'] is not None:
                params[rule['filter']] = 1

            for s in (' '.join(rule['text']), ' '.join(rule['thresholdInfo']),
                      rule['value']):
                matches = re.findall('\$([a-z0-9A-Z_]+)', s)
                for m in matches:
                    if m != 'now':
                        params['metric=' + m] = 1
            metric_filter = '&'.join(params.keys())
            LOG.debug('Metric filter = %s', metric_filter)

            # Get metric data for each rule
            response = GangliaDaemon.get_metrics(metric_filter)
            LOG.debug('Ganglia API response: %s', response)

            # Make non-metric substitutions in value, thresholdInfo and text
            now = int(time.time())
            rule['value'] = re.sub('\$now', str(now), rule['value'])
            idx = 0
            for threshold in rule['thresholdInfo']:
                rule['thresholdInfo'][idx] = re.sub('\$now', str(now),
                                                    threshold)
                idx += 1
            idx = 0
            for text in rule['text']:
                rule['text'][idx] = re.sub(
                    '\$now',
                    time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(now)),
                    text)
                idx += 1

            metric = dict()
            for m in response:

                # Make metric-based substitutions in resource eg. per instance, host or cluster
                resource = re.sub('\$instance', m.get('instance', '__NA__'),
                                  rule['resource'])
                resource = re.sub('\$host', m.get('host', '__NA__'), resource)
                resource = re.sub('\$cluster', m.get('cluster', '__NA__'),
                                  resource)

                if '__NA__' in resource:
                    LOG.debug("Metric %s doesn't match resource rule %s",
                              m['id'], rule['resource'])
                    continue

                LOG.debug('Metric %s matches rule %s => %s', m['id'],
                          rule['resource'], resource)

                # Don't generate cluster alerts from host-based metrics
                if 'host' in m and not '$host' in rule['resource']:
                    LOG.debug(
                        'Skipping host-based metric for cluster-based rule')
                    continue

                # Build up info for alert if metric value triggers threshold
                if resource not in metric:
                    metric[resource] = dict()
                if 'thresholdInfo' not in metric[resource]:
                    metric[resource]['thresholdInfo'] = list(
                        rule['thresholdInfo'])
                    LOG.debug('Set thresholdInfo to %s',
                              metric[resource]['thresholdInfo'])
                if 'text' not in metric[resource]:
                    metric[resource]['text'] = list(rule['text'])
                    LOG.debug('Set text to %s', metric[resource]['text'])

                if m['metric'] in rule['value']:
                    # Determine service and environment from rule if given
                    if 'environment' in rule:
                        metric[resource]['environment'] = [rule['environment']]
                    else:
                        metric[resource]['environment'] = [m['environment']]
                    LOG.debug('Set environment for alert to %s',
                              metric[resource]['environment'])
                    if 'service' in rule:
                        metric[resource]['service'] = [rule['service']]
                    else:
                        metric[resource]['service'] = [m['service']]
                    LOG.debug('Set service for alert to %s',
                              metric[resource]['service'])

                    # Use raw metric value, or sum or average if aggregated metric
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])  # raw value
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])  # aggregated sum value if "<metric>.sum"
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))  # average of aggregate value
                        except ZeroDivisionError:
                            v = 0.0
                    LOG.debug('Value for %s on %s is %s', m['id'], resource, v)

                    # If no value assign rule value
                    if 'value' not in metric[resource]:
                        metric[resource]['value'] = rule['value']
                    metric[resource]['value'] = re.sub(
                        '\$%s(\.sum)?' % m['metric'], str(v),
                        metric[resource]['value'])
                    metric[resource]['units'] = m['units']

                    # Assign tags
                    metric[resource]['tags'] = list()
                    metric[resource]['tags'].extend(rule['tags'])
                    metric[resource]['tags'].append('cluster:%s' %
                                                    m['cluster'])
                    if 'tags' in m and m['tags'] is not None:
                        metric[resource]['tags'].extend(m['tags'])

                    # Assign graph URL
                    if 'graphUrl' not in metric[resource]:
                        metric[resource]['graphUrls'] = list()
                    if 'graphUrl' in m:
                        metric[resource]['graphUrls'].append(m['graphUrl'])

                    for g in rule['graphs']:
                        if '$host' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append(
                                '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                                '/graph.php?c=%s&h=%s&m=%s&r=1day&v=0&z=default'
                                % (m['cluster'], m['host'], g))
                        if '$cluster' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append(
                                '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                                '/graph.php?c=%s&m=%s&r=1day&v=0&z=default' %
                                (m['cluster'], g))

                    metric[resource]['moreInfo'] = ''
                    if '$host' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(
                            m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s&h=%s' % (m['cluster'], m['host'])
                    if '$cluster' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(
                            m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s' % m['cluster']

                # Substitutions for threshold info
                if m['metric'] in ''.join(rule['thresholdInfo']):
                    LOG.debug('Text to be substituted: %s',
                              ''.join(rule['thresholdInfo']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    idx = 0
                    for threshold in metric[resource]['thresholdInfo']:
                        metric[resource]['thresholdInfo'][idx] = re.sub(
                            '\$%s(\.sum)?' % m['metric'], str(v), threshold)
                        idx += 1

                # Substitutions for text
                if m['metric'] in ''.join(rule['text']):
                    LOG.debug('Text to be substituted: %s',
                              ''.join(rule['text']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    if m['type'] == 'timestamp' or m['units'] == 'timestamp':
                        v = time.strftime('%Y/%m/%d %H:%M:%S',
                                          time.localtime(float(v)))

                    LOG.debug('Metric resource text %s', metric)
                    idx = 0
                    for text in metric[resource]['text']:
                        metric[resource]['text'][idx] = re.sub(
                            '\$%s(\.sum)?' % m['metric'], str(v), text)
                        idx += 1
                LOG.debug('end of metric loop')

            for resource in metric:
                LOG.debug('Calculate final value for resource %s', resource)
                index = 0
                try:
                    calculated_value = eval(metric[resource]['value'])
                except KeyError:
                    LOG.warning(
                        'Could not calculate %s value for %s because %s is not being reported',
                        rule['event'], resource, rule['value'])
                    continue
                except (SyntaxError, NameError):
                    LOG.error(
                        'Could not calculate %s value for %s => eval(%s)',
                        rule['event'], resource, metric[resource]['value'])
                    continue
                except ZeroDivisionError:
                    LOG.debug(
                        'Could not calculate %s value for %s => eval(%s) (division by zero).  Setting to 0 instead.',
                        rule['event'], resource, metric[resource]['value'])
                    calculated_value = 0
                except Exception:
                    LOG.error(
                        'Could not calculate %s value for %s => eval(%s) (threw unknown exception)',
                        rule['event'], resource, metric[resource]['value'])
                    continue

                LOG.debug('Calculated value for resource %s => %s', resource,
                          calculated_value)

                # Compare final value with each threshold
                for ti in metric[resource]['thresholdInfo']:
                    severity, op, threshold = ti.split(':')
                    rule_eval = '%s %s %s' % (
                        GangliaDaemon.quote(calculated_value), op, threshold)
                    try:
                        result = eval(rule_eval)
                    except SyntaxError:
                        LOG.error(
                            'Could not evaluate %s threshold for %s => eval(%s)',
                            rule['event'], resource, rule_eval)
                        result = False

                    if result:

                        event = rule['event']
                        group = rule['group']
                        value = "%s%s" % (calculated_value,
                                          GangliaDaemon.format_units(
                                              metric[resource]['units']))
                        environment = metric[resource]['environment']
                        service = metric[resource]['service']
                        text = metric[resource]['text'][index]
                        tags = metric[resource]['tags']
                        threshold_info = ','.join(rule['thresholdInfo'])
                        more_info = metric[resource]['moreInfo']
                        graph_urls = metric[resource]['graphUrls']

                        gangliaAlert = Alert(
                            resource=resource,
                            event=event,
                            group=group,
                            value=value,
                            severity=severity,
                            environment=environment,
                            service=service,
                            text=text,
                            event_type='gangliaAlert',
                            tags=tags,
                            threshold_info=threshold_info,
                            more_info=more_info,
                            graph_urls=graph_urls,
                            raw_data='',  # TODO(nsatterl): put raw metric values used to do calculation here
                        )

                        if self.dedup.is_send(gangliaAlert):
                            self.mq.send(gangliaAlert)

                        break  # First match wins
                    index += 1

    @staticmethod
    def get_metrics(filter):
        url = "http://%s:%s/ganglia/api/v1/metrics?%s" % (
            CONF.ganglia_host, CONF.ganglia_port, filter)
        LOG.info('Metric request %s', url)

        try:
            r = urllib2.urlopen(url, None, 15)
        except urllib2.URLError, e:
            LOG.error('Could not retrieve metric data from %s - %s', url, e)
            return dict()

        if r.getcode() is None:
            LOG.error('Error during connection or data transfer (timeout=%d)',
                      15)
            return dict()

        response = json.loads(r.read())['response']
        if response['status'] == 'error':
            LOG.error('No metrics retrieved - %s', response['message'])
            return dict()

        LOG.info('Retrieved %s matching metrics in %ss', response['total'],
                 response['time'])

        return response['metrics']
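
The threshold check in this example builds a string such as '12.0 > 10' and hands it to eval(), as does the value calculation. For the severity:op:threshold comparison specifically, here is a sketch of an eval-free equivalent using the operator module (illustrative only, not how alerta does it):

import operator

OPS = {
    '>': operator.gt,
    '>=': operator.ge,
    '<': operator.lt,
    '<=': operator.le,
    '==': operator.eq,
    '!=': operator.ne,
}


def check_threshold(calculated_value, threshold_info):
    """Evaluate one 'severity:op:threshold' entry without eval().

    Returns (matched, severity). Illustrative helper; the examples above use eval().
    """
    severity, op, threshold = threshold_info.split(':')
    try:
        matched = OPS[op](float(calculated_value), float(threshold))
    except (KeyError, ValueError):
        return False, severity
    return matched, severity


# e.g. check_threshold(12.0, 'major:>:10') -> (True, 'major')
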
Example 14
class GangliaDaemon(Daemon):

    def __init__(self, prog):

        Daemon.__init__(self, prog)

        self.dedup = DeDup(by_value=True)

    def run(self):
        
        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=GangliaMessage(self.mq))

        while not self.shuttingdown:
            try:
                rules = init_rules()  # re-read rule config each time
                self.metric_check(rules)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()

    def metric_check(self, rules):

        for rule in rules:
            # Check rule is valid
            if len(rule['thresholdInfo']) != len(rule['text']):
                LOG.warning('Skipping invalid rule %s - MUST define alert text for each threshold.', rule['event'])
                continue

            # Get list of metrics required to evaluate each rule
            params = dict()
            if 'filter' in rule and rule['filter'] is not None:
                params[rule['filter']] = 1

            for s in (' '.join(rule['text']), ' '.join(rule['thresholdInfo']), rule['value']):
                matches = re.findall('\$([a-z0-9A-Z_]+)', s)
                for m in matches:
                    if m != 'now':
                        params['metric=' + m] = 1
            metric_filter = '&'.join(params.keys())
            LOG.debug('Metric filter = %s', metric_filter)

            # Get metric data for each rule
            response = GangliaDaemon.get_metrics(metric_filter)
            LOG.debug('Ganglia API response: %s', response)

            # Make non-metric substitutions in value, thresholdInfo and text
            now = int(time.time())
            rule['value'] = re.sub('\$now', str(now), rule['value'])
            idx = 0
            for threshold in rule['thresholdInfo']:
                rule['thresholdInfo'][idx] = re.sub('\$now', str(now), threshold)
                idx += 1
            idx = 0
            for text in rule['text']:
                rule['text'][idx] = re.sub('\$now', time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(now)), text)
                idx += 1

            metric = dict()
            for m in response:

                # Make metric-based substitutions in resource eg. per instance, host or cluster
                resource = re.sub('\$instance', m.get('instance', '__NA__'), rule['resource'])
                resource = re.sub('\$host', m.get('host', '__NA__'), resource)
                resource = re.sub('\$cluster', m.get('cluster', '__NA__'), resource)

                if '__NA__' in resource:
                    LOG.debug("Metric %s doesn't match resource rule %s", m['id'], rule['resource'])
                    continue

                LOG.debug('Metric %s matches rule %s => %s', m['id'], rule['resource'], resource)

                # Don't generate cluster alerts from host-based metrics
                if 'host' in m and not '$host' in rule['resource']:
                    LOG.debug('Skipping host-based metric for cluster-based rule')
                    continue

                # Build up info for alert if metric value triggers threshold
                if resource not in metric:
                    metric[resource] = dict()
                if 'thresholdInfo' not in metric[resource]:
                    metric[resource]['thresholdInfo'] = list(rule['thresholdInfo'])
                    LOG.debug('Set thresholdInfo to %s', metric[resource]['thresholdInfo'])
                if 'text' not in metric[resource]:
                    metric[resource]['text'] = list(rule['text'])
                    LOG.debug('Set text to %s', metric[resource]['text'])

                if m['metric'] in rule['value']:
                    # Determine service and environment from rule if given
                    if 'environment' in rule:
                        metric[resource]['environment'] = [rule['environment']]
                    else:
                        metric[resource]['environment'] = [m['environment']]
                    LOG.debug('Set environment for alert to %s', metric[resource]['environment'])
                    if 'service' in rule:
                        metric[resource]['service'] = [rule['service']]
                    else:
                        metric[resource]['service'] = [m['service']]
                    LOG.debug('Set service for alert to %s', metric[resource]['service'])

                    # Use raw metric value, or sum or average if aggregated metric
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])  # raw value
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])  # aggregated sum value if "<metric>.sum"
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))  # average of aggregate value
                        except ZeroDivisionError:
                            v = 0.0
                    LOG.debug('Value for %s on %s is %s', m['id'], resource, v)

                    # If no value assign rule value
                    if 'value' not in metric[resource]:
                        metric[resource]['value'] = rule['value']
                    metric[resource]['value'] = re.sub('\$%s(\.sum)?' % m['metric'], str(v),
                                                       metric[resource]['value'])
                    metric[resource]['units'] = m['units']

                    # Assign tags
                    metric[resource]['tags'] = list()
                    metric[resource]['tags'].extend(rule['tags'])
                    metric[resource]['tags'].append('cluster:%s' % m['cluster'])
                    if 'tags' in m and m['tags'] is not None:
                        metric[resource]['tags'].extend(m['tags'])

                    # Assign graph URL
                    if 'graphUrl' not in metric[resource]:
                        metric[resource]['graphUrls'] = list()
                    if 'graphUrl' in m:
                        metric[resource]['graphUrls'].append(m['graphUrl'])

                    for g in rule['graphs']:
                        if '$host' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append('/'.join(m['graphUrl'].rsplit('/', 2)[0:2])
                                                                + '/graph.php?c=%s&h=%s&m=%s&r=1day&v=0&z=default'
                                                                % (m['cluster'], m['host'], g))
                        if '$cluster' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append('/'.join(m['graphUrl'].rsplit('/', 2)[0:2])
                                                                + '/graph.php?c=%s&m=%s&r=1day&v=0&z=default'
                                                                % (m['cluster'], g))

                    metric[resource]['moreInfo'] = ''
                    if '$host' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(
                            m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s&h=%s' % (m['cluster'], m['host'])
                    if '$cluster' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s' % m['cluster']

                # Substitutions for threshold info
                if m['metric'] in ''.join(rule['thresholdInfo']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['thresholdInfo']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    idx = 0
                    for threshold in metric[resource]['thresholdInfo']:
                        metric[resource]['thresholdInfo'][idx] = re.sub('\$%s(\.sum)?' % m['metric'], str(v),
                                                                        threshold)
                        idx += 1

                # Substitutions for text
                if m['metric'] in ''.join(rule['text']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['text']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    if m['type'] == 'timestamp' or m['units'] == 'timestamp':
                        v = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(float(v)))

                    LOG.debug('Metric resource text %s', metric)
                    idx = 0
                    for text in metric[resource]['text']:
                        metric[resource]['text'][idx] = re.sub('\$%s(\.sum)?' % m['metric'], str(v), text)
                        idx += 1
                LOG.debug('end of metric loop')

            for resource in metric:
                LOG.debug('Calculate final value for resource %s', resource)
                index = 0
                try:
                    calculated_value = eval(metric[resource]['value'])
                except KeyError:
                    LOG.warning('Could not calculate %s value for %s because %s is not being reported',
                                rule['event'], resource, rule['value'])
                    continue
                except (SyntaxError, NameError):
                    LOG.error('Could not calculate %s value for %s => eval(%s)', rule['event'], resource,
                              metric[resource]['value'])
                    continue
                except ZeroDivisionError:
                    LOG.debug(
                        'Could not calculate %s value for %s => eval(%s) (division by zero).  Setting to 0 instead.',
                        rule['event'], resource, metric[resource]['value'])
                    calculated_value = 0
                except Exception:
                    LOG.error('Could not calculate %s value for %s => eval(%s) (threw unknown exception)',
                              rule['event'], resource, metric[resource]['value'])
                    continue

                LOG.debug('Calculated value for resource %s => %s', resource, calculated_value)

                # Compare final value with each threshold
                for ti in metric[resource]['thresholdInfo']:
                    severity, op, threshold = ti.split(':')
                    rule_eval = '%s %s %s' % (GangliaDaemon.quote(calculated_value), op, threshold)
                    try:
                        result = eval(rule_eval)
                    except SyntaxError:
                        LOG.error('Could not evaluate %s threshold for %s => eval(%s)', rule['event'],
                                  resource, rule_eval)
                        result = False

                    if result:

                        event = rule['event']
                        group = rule['group']
                        value = "%s%s" % (calculated_value, GangliaDaemon.format_units(metric[resource]['units']))
                        environment = metric[resource]['environment']
                        service = metric[resource]['service']
                        text = metric[resource]['text'][index]
                        tags = metric[resource]['tags']
                        threshold_info = ','.join(rule['thresholdInfo'])
                        more_info = metric[resource]['moreInfo']
                        graph_urls = metric[resource]['graphUrls']

                        gangliaAlert = Alert(
                            resource=resource,
                            event=event,
                            group=group,
                            value=value,
                            severity=severity,
                            environment=environment,
                            service=service,
                            text=text,
                            event_type='gangliaAlert',
                            tags=tags,
                            threshold_info=threshold_info,
                            more_info=more_info,
                            graph_urls=graph_urls,
                            raw_data='',  # TODO(nsatterl): put raw metric values used to do calculation here
                        )

                        if self.dedup.is_send(gangliaAlert):
                            self.mq.send(gangliaAlert)

                        break  # First match wins
                    index += 1

    @staticmethod
    def get_metrics(filter):
        url = "http://%s:%s/ganglia/api/v1/metrics?%s" % (CONF.ganglia_host, CONF.ganglia_port,  filter)
        LOG.info('Metric request %s', url)

        try:
            r = urllib2.urlopen(url, None, 15)
        except urllib2.URLError, e:
            LOG.error('Could not retrieve metric data from %s - %s', url, e)
            return dict()

        if r.getcode() is None:
            LOG.error('Error during connection or data transfer (timeout=%d)', 15)
            return dict()

        response = json.loads(r.read())['response']
        if response['status'] == 'error':
            LOG.error('No metrics retrieved - %s', response['message'])
            return dict()

        LOG.info('Retrieved %s matching metrics in %ss', response['total'], response['time'])

        return response['metrics']
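
get_metrics() above uses urllib2 and the Python 2 'except ..., e' syntax. A rough Python 3 sketch of the same request using the third-party requests library, assuming the same JSON envelope ({'response': {'status': ..., 'metrics': [...]}}), might look like:

import logging

import requests  # third-party; assumed available

LOG = logging.getLogger(__name__)


def get_metrics(metric_filter, host='localhost', port=8080):
    """Python 3 sketch of GangliaDaemon.get_metrics() using requests.

    Host, port and the JSON envelope are assumptions based on the code above.
    """
    url = 'http://%s:%s/ganglia/api/v1/metrics?%s' % (host, port, metric_filter)
    LOG.info('Metric request %s', url)

    try:
        r = requests.get(url, timeout=15)
        r.raise_for_status()
        response = r.json()['response']
    except (requests.RequestException, ValueError, KeyError) as e:
        LOG.error('Could not retrieve metric data from %s - %s', url, e)
        return []

    if response.get('status') == 'error':
        LOG.error('No metrics retrieved - %s', response.get('message'))
        return []

    LOG.info('Retrieved %s matching metrics in %ss', response.get('total'), response.get('time'))
    return response.get('metrics', [])
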