Example #1
0
    def __init__(self, prog, **kwargs):

        config.register_opts(DynectDaemon.dynect_opts)

        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last_info = {}
        self.updating = False
        self.dedup = DeDup(threshold=10)
Example #2
0
    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=UrlmonMessage(self.mq))

        self.dedup = DeDup()

        self.carbon = Carbon()  # graphite metrics

        # Initialiase alert rules
        urls = init_urls()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.mq, self.queue, self.dedup, self.carbon)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())
Example #3
0
    def run(self):

        self.running = True

        # Create internal queue
        self.queue = Queue.Queue()

        self.api = ApiClient()

        self.dedup = DeDup()

        self.carbon = Carbon()  # graphite metrics

        # Initialiase ping targets
        ping_list = init_targets()

        # Start worker threads
        LOG.debug('Starting %s worker threads...', CONF.server_threads)
        for i in range(CONF.server_threads):
            w = WorkerThread(self.api, self.queue, self.dedup, self.carbon)
            try:
                w.start()
            except Exception, e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            LOG.info('Started worker thread: %s', w.getName())
Example #4
0
    def __init__(self, prog, **kwargs):

        config.register_opts(DynectDaemon.dynect_opts)

        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last_info = {}
        self.updating = False
        self.dedup = DeDup(threshold=10)
Example #5
0
    def __init__(self, prog, **kwargs):

        config.register_opts(AwsDaemon.aws_opts)

        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last = {}
        self.lookup = {}
        self.dedup = DeDup()
Example #6
0
    def run(self):

        self.running = True

        self.statsd = StatsD()  # graphite metrics

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=CloudWatchMessage(self.mq))

        self.dedup = DeDup(by_value=True)

        LOG.info('Connecting to SQS queue %s', CONF.cloudwatch_sqs_queue)
        try:
            sqs = boto.sqs.connect_to_region(
                CONF.cloudwatch_sqs_region,
                aws_access_key_id=CONF.cloudwatch_access_key,
                aws_secret_access_key=CONF.cloudwatch_secret_key)
        except boto.exception.SQSError, e:
            LOG.error('SQS API call failed: %s', e)
            sys.exit(1)
Example #7
0
class DynectDaemon(Daemon):

    dynect_opts = {
        'dynect_customer': '',
        'dynect_username': '',
        'dynect_password': '',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(DynectDaemon.dynect_opts)

        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last_info = {}
        self.updating = False
        self.dedup = DeDup(threshold=10)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=DynectMessage(self.mq))

        while not self.shuttingdown:
            try:
                self.queryDynect()

                if self.updating:
                    self.alertDynect()
                    self.last_info = self.info

                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        self.running = False

    def alertDynect(self):

        for resource in self.info:

            if resource not in self.last_info:
                continue

            if resource.startswith('gslb-'):

                # gslb status       = ok | unk | trouble | failover

                text = 'GSLB status is %s.' % self.info[resource]['status']

                if self.info[resource]['status'] == 'ok':
                    event = 'GslbOK'
                    severity = severity_code.NORMAL
                else:
                    event = 'GslbNotOK'
                    severity = severity_code.CRITICAL
                correlate = ['GslbOK', 'GslbNotOK']

            elif resource.startswith('pool-'):

                # pool status       = up | unk | down
                # pool serve_mode   = obey | always | remove | no
                # pool weight	(1-15)

                if 'down' in self.info[resource]['status']:
                    event = 'PoolDown'
                    severity = severity_code.MAJOR
                    text = 'Pool is down'
                elif 'obey' not in self.info[resource]['status']:
                    event = 'PoolServe'
                    severity = severity_code.MAJOR
                    text = 'Pool with an incorrect serve mode'
                elif self.check_weight(self.info[resource]['gslb'], resource) is False:
                    event = 'PoolWeightError'
                    severity = severity_code.MINOR
                    text = 'Pool with an incorrect weight'
                else:
                    event = 'PoolUp'
                    severity = severity_code.NORMAL
                    text = 'Pool status is normal'
                correlate = ['PoolUp', 'PoolDown', 'PoolServe', 'PoolWeightError']

            else:
                LOG.warning('Unknown resource type: %s', resource)
                continue

            # Defaults
            group = 'GSLB'
            value = self.info[resource]['status']
            environment = ['PROD']
            service = ['Network']
            tags = list()
            timeout = None
            threshold_info = None
            summary = None
            raw_data = self.info[resource]['rawData']

            dynectAlert = Alert(
                resource=resource,
                event=event,
                correlate=correlate,
                group=group,
                value=value,
                severity=severity,
                environment=environment,
                service=service,
                text=text,
                event_type='serviceAlert',
                tags=tags,
                timeout=timeout,
                threshold_info=threshold_info,
                summary=summary,
                raw_data=raw_data,
            )

            suppress = dynectAlert.transform_alert()
            if suppress:
                LOG.info('Suppressing %s alert', dynectAlert.event)
                LOG.debug('%s', dynectAlert)
                continue

            if self.dedup.is_send(dynectAlert):
                self.mq.send(dynectAlert)

    def check_weight(self, parent, resource):
        
        weight = self.info[resource]['status'].split(':')[2]
        for pool in [resource for resource in self.info if resource.startswith('pool') and self.info[resource]['gslb'] == parent]:
            if self.info[pool]['status'].split(':')[1] == 'no':
                LOG.warning('Skipping %s because not serving for pool %s', pool, self.info[pool]['status'])
                continue

            LOG.debug('pool %s weight %s <=> %s', pool, self.info[pool]['status'].split(':')[2], weight)
            if self.info[pool]['status'].split(':')[2] != weight:
                return False
        return True

    def queryDynect(self):

        LOG.info('Query DynECT to get the state of GSLBs')
        try:
            rest_iface = DynectRest()
            if CONF.debug and CONF.use_stderr:
                rest_iface.verbose = True

            # login
            credentials = {
                'customer_name': CONF.dynect_customer,
                'user_name': CONF.dynect_username,
                'password': CONF.dynect_password,
            }
            LOG.debug('credentials = %s', credentials)
            response = rest_iface.execute('/Session/', 'POST', credentials)

            if response['status'] != 'success':
                LOG.error('Failed to create API session: %s', response['msgs'][0]['INFO'])
                self.updating = False
                return

            # Discover all the Zones in DynECT
            response = rest_iface.execute('/Zone/', 'GET')
            LOG.debug('/Zone/ => %s', json.dumps(response, indent=4))
            zone_resources = response['data']

            # Discover all the LoadBalancers
            for resource in zone_resources:
                zone = resource.split('/')[3]  # eg. /REST/Zone/guardiannews.com/
                response = rest_iface.execute('/LoadBalance/' + zone + '/', 'GET')
                LOG.debug('/LoadBalance/%s/ => %s', zone, json.dumps(response, indent=4))
                gslb = response['data']

                # Discover LoadBalancer pool information.
                for lb in gslb:
                    fqdn = lb.split('/')[4]  # eg. /REST/LoadBalance/guardiannews.com/id.guardiannews.com/
                    response = rest_iface.execute('/LoadBalance/' + zone + '/' + fqdn + '/', 'GET')
                    LOG.debug('/LoadBalance/%s/%s/ => %s', zone, fqdn, json.dumps(response, indent=4))
                    status = response['data']['status']
                    monitor = response['data']['monitor']
                    self.info['gslb-' + fqdn] = {'status': status, 'gslb': fqdn, 'rawData': monitor}

                    for pool in response['data']['pool']:
                        name = '%s-%s' % (fqdn, pool['label'].replace(' ', '-'))
                        status = '%s:%s:%s' % (pool['status'], pool['serve_mode'], pool['weight'])
                        self.info['pool-' + name] = {'status': status, 'gslb': fqdn, 'rawData': pool}

            LOG.info('Finished object discovery query.')
            LOG.debug('GSLBs and Pools: %s', json.dumps(self.info, indent=4))

            # logout
            rest_iface.execute('/Session/', 'DELETE')

        except Exception, e:
            LOG.error('Failed to discover GSLBs: %s', e)
            self.updating = False

        self.updating = True
Example #8
0
    def __init__(self, prog):

        Daemon.__init__(self, prog)

        self.dedup = DeDup(by_value=True)
Example #9
0
class GangliaDaemon(Daemon):
    def __init__(self, prog):

        Daemon.__init__(self, prog)

        self.dedup = DeDup(by_value=True)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=GangliaMessage(self.mq))

        while not self.shuttingdown:
            try:
                rules = init_rules()  # re-read rule config each time
                self.metric_check(rules)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()

    def metric_check(self, rules):

        for rule in rules:
            # Check rule is valid
            if len(rule['thresholdInfo']) != len(rule['text']):
                LOG.warning(
                    'Skipping invalid rule %s - MUST define alert text for each threshold.',
                    rule['event'])
                continue

            # Get list of metrics required to evaluate each rule
            params = dict()
            if 'filter' in rule and rule['filter'] is not None:
                params[rule['filter']] = 1

            for s in (' '.join(rule['text']), ' '.join(rule['thresholdInfo']),
                      rule['value']):
                matches = re.findall('\$([a-z0-9A-Z_]+)', s)
                for m in matches:
                    if m != 'now':
                        params['metric=' + m] = 1
            metric_filter = '&'.join(params.keys())
            LOG.debug('Metric filter = %s', metric_filter)

            # Get metric data for each rule
            response = GangliaDaemon.get_metrics(metric_filter)
            LOG.debug('Ganglia API response: %s', response)

            # Make non-metric substitutions in value, thresholdInfo and text
            now = int(time.time())
            rule['value'] = re.sub('\$now', str(now), rule['value'])
            idx = 0
            for threshold in rule['thresholdInfo']:
                rule['thresholdInfo'][idx] = re.sub('\$now', str(now),
                                                    threshold)
                idx += 1
            idx = 0
            for text in rule['text']:
                rule['text'][idx] = re.sub(
                    '\$now',
                    time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(now)),
                    text)
                idx += 1

            metric = dict()
            for m in response:

                # Make metric-based substitutions in resource eg. per instance, host or cluster
                resource = re.sub('\$instance', m.get('instance', '__NA__'),
                                  rule['resource'])
                resource = re.sub('\$host', m.get('host', '__NA__'), resource)
                resource = re.sub('\$cluster', m.get('cluster', '__NA__'),
                                  resource)

                if '__NA__' in resource:
                    LOG.debug('Metric %s doesnt match resource rule %s',
                              m['id'], rule['resource'])
                    continue

                LOG.debug('Metric %s matches rule %s => %s', m['id'],
                          rule['resource'], resource)

                # Don't generate cluster alerts from host-based metrics
                if 'host' in m and not '$host' in rule['resource']:
                    LOG.debug(
                        'Skipping host-based metric for cluster-based rule')
                    continue

                # Build up info for alert if metric value triggers threshold
                if resource not in metric:
                    metric[resource] = dict()
                if 'thresholdInfo' not in metric[resource]:
                    metric[resource]['thresholdInfo'] = list(
                        rule['thresholdInfo'])
                    LOG.debug('Set thresholdInfo to %s',
                              metric[resource]['thresholdInfo'])
                if 'text' not in metric[resource]:
                    metric[resource]['text'] = list(rule['text'])
                    LOG.debug('Set text to %s', metric[resource]['text'])

                if m['metric'] in rule['value']:
                    # Determine service and environment from rule if given
                    if 'environment' in rule:
                        metric[resource]['environment'] = [rule['environment']]
                    else:
                        metric[resource]['environment'] = [m['environment']]
                    LOG.debug('Set environment for alert to %s',
                              metric[resource]['environment'])
                    if 'service' in rule:
                        metric[resource]['service'] = [rule['service']]
                    else:
                        metric[resource]['service'] = [m['service']]
                    LOG.debug('Set service for alert to %s',
                              metric[resource]['service'])

                    # Use raw metric value, or sum or average if aggregated metric
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])  # raw value
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(
                            m['sum'])  # aggregated sum value if "<metric>.sum"
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num'])
                                          )  # average of aggregate value
                        except ZeroDivisionError:
                            v = 0.0
                    LOG.debug('Value for %s on %s is %s', m['id'], resource, v)

                    # If no value assign rule value
                    if 'value' not in metric[resource]:
                        metric[resource]['value'] = rule['value']
                    metric[resource]['value'] = re.sub(
                        '\$%s(\.sum)?' % m['metric'], str(v),
                        metric[resource]['value'])
                    metric[resource]['units'] = m['units']

                    # Assign tags
                    metric[resource]['tags'] = list()
                    metric[resource]['tags'].extend(rule['tags'])
                    metric[resource]['tags'].append('cluster:%s' %
                                                    m['cluster'])
                    if 'tags' in m and m['tags'] is not None:
                        metric[resource]['tags'].extend(m['tags'])

                    # Assign graph URL
                    if 'graphUrl' not in metric[resource]:
                        metric[resource]['graphUrls'] = list()
                    if 'graphUrl' in m:
                        metric[resource]['graphUrls'].append(m['graphUrl'])

                    for g in rule['graphs']:
                        if '$host' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append(
                                '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                                '/graph.php?c=%s&h=%s&m=%s&r=1day&v=0&z=default'
                                % (m['cluster'], m['host'], g))
                        if '$cluster' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append(
                                '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                                '/graph.php?c=%s&m=%s&r=1day&v=0&z=default' %
                                (m['cluster'], g))

                    metric[resource]['moreInfo'] = ''
                    if '$host' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(
                            m['graphUrl'].rsplit('/', 2)
                            [0:2]) + '/?c=%s&h=%s' % (m['cluster'], m['host'])
                    if '$cluster' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(
                            m['graphUrl'].rsplit(
                                '/', 2)[0:2]) + '/?c=%s' % m['cluster']

                # Substitutions for threshold info
                if m['metric'] in ''.join(rule['thresholdInfo']):
                    LOG.debug('Text to be substituted: %s',
                              ''.join(rule['thresholdInfo']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    idx = 0
                    for threshold in metric[resource]['thresholdInfo']:
                        metric[resource]['thresholdInfo'][idx] = re.sub(
                            '\$%s(\.sum)?' % m['metric'], str(v), threshold)
                        idx += 1

                # Substitutions for text
                if m['metric'] in ''.join(rule['text']):
                    LOG.debug('Text to be substituted: %s',
                              ''.join(rule['text']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    if m['type'] == 'timestamp' or m['units'] == 'timestamp':
                        v = time.strftime('%Y/%m/%d %H:%M:%S',
                                          time.localtime(float(v)))

                    LOG.debug('Metric resource text %s', metric)
                    idx = 0
                    for text in metric[resource]['text']:
                        metric[resource]['text'][idx] = re.sub(
                            '\$%s(\.sum)?' % m['metric'], str(v), text)
                        idx += 1
                LOG.debug('end of metric loop')

            for resource in metric:
                LOG.debug('Calculate final value for resource %s', resource)
                index = 0
                try:
                    calculated_value = eval(metric[resource]['value'])
                except KeyError:
                    LOG.warning(
                        'Could not calculate %s value for %s because %s is not being reported',
                        rule['event'], resource, rule['value'])
                    continue
                except (SyntaxError, NameError):
                    LOG.error(
                        'Could not calculate %s value for %s => eval(%s)',
                        rule['event'], resource, metric[resource]['value'])
                    continue
                except ZeroDivisionError:
                    LOG.debug(
                        'Could not calculate %s value for %s => eval(%s) (division by zero).  Setting to 0 instead.',
                        rule['event'], resource, metric[resource]['value'])
                    calculated_value = 0
                except Exception:
                    LOG.error(
                        'Could not calculate %s value for %s => eval(%s) (threw unknown exception)',
                        rule['event'], resource, metric[resource]['value'])
                    continue

                LOG.debug('Calculated value for resource %s => %s', resource,
                          calculated_value)

                # Compare final value with each threshold
                for ti in metric[resource]['thresholdInfo']:
                    severity, op, threshold = ti.split(':')
                    rule_eval = '%s %s %s' % (
                        GangliaDaemon.quote(calculated_value), op, threshold)
                    try:
                        result = eval(rule_eval)
                    except SyntaxError:
                        LOG.error(
                            'Could not evaluate %s threshold for %s => eval(%s)',
                            rule['event'], resource, rule_eval)
                        result = False

                    if result:

                        event = rule['event']
                        group = rule['group']
                        value = "%s%s" % (calculated_value,
                                          GangliaDaemon.format_units(
                                              metric[resource]['units']))
                        environment = metric[resource]['environment']
                        service = metric[resource]['service']
                        text = metric[resource]['text'][index]
                        tags = metric[resource]['tags']
                        threshold_info = ','.join(rule['thresholdInfo'])
                        more_info = metric[resource]['moreInfo']
                        graph_urls = metric[resource]['graphUrls']

                        gangliaAlert = Alert(
                            resource=resource,
                            event=event,
                            group=group,
                            value=value,
                            severity=severity,
                            environment=environment,
                            service=service,
                            text=text,
                            event_type='gangliaAlert',
                            tags=tags,
                            threshold_info=threshold_info,
                            more_info=more_info,
                            graph_urls=graph_urls,
                            raw_data=
                            '',  # TODO(nsatterl): put raw metric values used to do calculation here
                        )

                        if self.dedup.is_send(gangliaAlert):
                            self.mq.send(gangliaAlert)

                        break  # First match wins
                    index += 1

    @staticmethod
    def get_metrics(filter):
        url = "http://%s:%s/ganglia/api/v1/metrics?%s" % (
            CONF.ganglia_host, CONF.ganglia_port, filter)
        LOG.info('Metric request %s', url)

        try:
            r = urllib2.urlopen(url, None, 15)
        except urllib2.URLError, e:
            LOG.error('Could not retrieve metric data from %s - %s', url, e)
            return dict()

        if r.getcode() is None:
            LOG.error('Error during connection or data transfer (timeout=%d)',
                      15)
            return dict()

        response = json.loads(r.read())['response']
        if response['status'] == 'error':
            LOG.error('No metrics retreived - %s', response['message'])
            return dict()

        LOG.info('Retreived %s matching metrics in %ss', response['total'],
                 response['time'])

        return response['metrics']
Example #10
0
    def __init__(self, prog):

        Daemon.__init__(self, prog)

        self.dedup = DeDup(by_value=True)
Example #11
0
class GangliaDaemon(Daemon):

    def __init__(self, prog):

        Daemon.__init__(self, prog)

        self.dedup = DeDup(by_value=True)

    def run(self):
        
        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=GangliaMessage(self.mq))

        while not self.shuttingdown:
            try:
                rules = init_rules()  # re-read rule config each time
                self.metric_check(rules)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()

    def metric_check(self, rules):

        for rule in rules:
            # Check rule is valid
            if len(rule['thresholdInfo']) != len(rule['text']):
                LOG.warning('Skipping invalid rule %s - MUST define alert text for each threshold.', rule['event'])
                continue

            # Get list of metrics required to evaluate each rule
            params = dict()
            if 'filter' in rule and rule['filter'] is not None:
                params[rule['filter']] = 1

            for s in (' '.join(rule['text']), ' '.join(rule['thresholdInfo']), rule['value']):
                matches = re.findall('\$([a-z0-9A-Z_]+)', s)
                for m in matches:
                    if m != 'now':
                        params['metric=' + m] = 1
            metric_filter = '&'.join(params.keys())
            LOG.debug('Metric filter = %s', metric_filter)

            # Get metric data for each rule
            response = GangliaDaemon.get_metrics(metric_filter)
            LOG.debug('Ganglia API response: %s', response)

            # Make non-metric substitutions in value, thresholdInfo and text
            now = int(time.time())
            rule['value'] = re.sub('\$now', str(now), rule['value'])
            idx = 0
            for threshold in rule['thresholdInfo']:
                rule['thresholdInfo'][idx] = re.sub('\$now', str(now), threshold)
                idx += 1
            idx = 0
            for text in rule['text']:
                rule['text'][idx] = re.sub('\$now', time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(now)), text)
                idx += 1

            metric = dict()
            for m in response:

                # Make metric-based substitutions in resource eg. per instance, host or cluster
                resource = re.sub('\$instance', m.get('instance', '__NA__'), rule['resource'])
                resource = re.sub('\$host', m.get('host', '__NA__'), resource)
                resource = re.sub('\$cluster', m.get('cluster', '__NA__'), resource)

                if '__NA__' in resource:
                    LOG.debug('Metric %s doesnt match resource rule %s', m['id'], rule['resource'])
                    continue

                LOG.debug('Metric %s matches rule %s => %s', m['id'], rule['resource'], resource)

                # Don't generate cluster alerts from host-based metrics
                if 'host' in m and not '$host' in rule['resource']:
                    LOG.debug('Skipping host-based metric for cluster-based rule')
                    continue

                # Build up info for alert if metric value triggers threshold
                if resource not in metric:
                    metric[resource] = dict()
                if 'thresholdInfo' not in metric[resource]:
                    metric[resource]['thresholdInfo'] = list(rule['thresholdInfo'])
                    LOG.debug('Set thresholdInfo to %s', metric[resource]['thresholdInfo'])
                if 'text' not in metric[resource]:
                    metric[resource]['text'] = list(rule['text'])
                    LOG.debug('Set text to %s', metric[resource]['text'])

                if m['metric'] in rule['value']:
                    # Determine service and environment from rule if given
                    if 'environment' in rule:
                        metric[resource]['environment'] = [rule['environment']]
                    else:
                        metric[resource]['environment'] = [m['environment']]
                    LOG.debug('Set environment for alert to %s', metric[resource]['environment'])
                    if 'service' in rule:
                        metric[resource]['service'] = [rule['service']]
                    else:
                        metric[resource]['service'] = [m['service']]
                    LOG.debug('Set service for alert to %s', metric[resource]['service'])

                    # Use raw metric value, or sum or average if aggregated metric
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])  # raw value
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])  # aggregated sum value if "<metric>.sum"
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))  # average of aggregate value
                        except ZeroDivisionError:
                            v = 0.0
                    LOG.debug('Value for %s on %s is %s', m['id'], resource, v)

                    # If no value assign rule value
                    if 'value' not in metric[resource]:
                        metric[resource]['value'] = rule['value']
                    metric[resource]['value'] = re.sub('\$%s(\.sum)?' % m['metric'], str(v),
                                                       metric[resource]['value'])
                    metric[resource]['units'] = m['units']

                    # Assign tags
                    metric[resource]['tags'] = list()
                    metric[resource]['tags'].extend(rule['tags'])
                    metric[resource]['tags'].append('cluster:%s' % m['cluster'])
                    if 'tags' in m and m['tags'] is not None:
                        metric[resource]['tags'].extend(m['tags'])

                    # Assign graph URL
                    if 'graphUrl' not in metric[resource]:
                        metric[resource]['graphUrls'] = list()
                    if 'graphUrl' in m:
                        metric[resource]['graphUrls'].append(m['graphUrl'])

                    for g in rule['graphs']:
                        if '$host' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append('/'.join(m['graphUrl'].rsplit('/', 2)[0:2])
                                                                + '/graph.php?c=%s&h=%s&m=%s&r=1day&v=0&z=default'
                                                                % (m['cluster'], m['host'], g))
                        if '$cluster' in rule['resource'] and 'graphUrl' in m:
                            metric[resource]['graphUrls'].append('/'.join(m['graphUrl'].rsplit('/', 2)[0:2])
                                                                + '/graph.php?c=%s&m=%s&r=1day&v=0&z=default'
                                                                % (m['cluster'], g))

                    metric[resource]['moreInfo'] = ''
                    if '$host' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(
                            m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s&h=%s' % (m['cluster'], m['host'])
                    if '$cluster' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['moreInfo'] = '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) + '/?c=%s' % m['cluster']

                # Substitutions for threshold info
                if m['metric'] in ''.join(rule['thresholdInfo']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['thresholdInfo']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    idx = 0
                    for threshold in metric[resource]['thresholdInfo']:
                        metric[resource]['thresholdInfo'][idx] = re.sub('\$%s(\.sum)?' % m['metric'], str(v),
                                                                        threshold)
                        idx += 1

                # Substitutions for text
                if m['metric'] in ''.join(rule['text']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['text']))
                    if 'value' in m:
                        v = GangliaDaemon.quote(m['value'])
                    elif rule['value'].endswith('.sum'):
                        v = GangliaDaemon.quote(m['sum'])
                    else:
                        try:
                            v = "%.1f" % (float(m['sum']) / float(m['num']))
                        except ZeroDivisionError:
                            v = 0.0

                    if m['type'] == 'timestamp' or m['units'] == 'timestamp':
                        v = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(float(v)))

                    LOG.debug('Metric resource text %s', metric)
                    idx = 0
                    for text in metric[resource]['text']:
                        metric[resource]['text'][idx] = re.sub('\$%s(\.sum)?' % m['metric'], str(v), text)
                        idx += 1
                LOG.debug('end of metric loop')

            for resource in metric:
                LOG.debug('Calculate final value for resource %s', resource)
                index = 0
                try:
                    calculated_value = eval(metric[resource]['value'])
                except KeyError:
                    LOG.warning('Could not calculate %s value for %s because %s is not being reported',
                                rule['event'], resource, rule['value'])
                    continue
                except (SyntaxError, NameError):
                    LOG.error('Could not calculate %s value for %s => eval(%s)', rule['event'], resource,
                              metric[resource]['value'])
                    continue
                except ZeroDivisionError:
                    LOG.debug(
                        'Could not calculate %s value for %s => eval(%s) (division by zero).  Setting to 0 instead.',
                        rule['event'], resource, metric[resource]['value'])
                    calculated_value = 0
                except Exception:
                    LOG.error('Could not calculate %s value for %s => eval(%s) (threw unknown exception)',
                              rule['event'], resource, metric[resource]['value'])
                    continue

                LOG.debug('Calculated value for resource %s => %s', resource, calculated_value)

                # Compare final value with each threshold
                for ti in metric[resource]['thresholdInfo']:
                    severity, op, threshold = ti.split(':')
                    rule_eval = '%s %s %s' % (GangliaDaemon.quote(calculated_value), op, threshold)
                    try:
                        result = eval(rule_eval)
                    except SyntaxError:
                        LOG.error('Could not evaluate %s threshold for %s => eval(%s)', rule['event'],
                                  resource, rule_eval)
                        result = False

                    if result:

                        event = rule['event']
                        group = rule['group']
                        value = "%s%s" % (calculated_value, GangliaDaemon.format_units(metric[resource]['units']))
                        environment = metric[resource]['environment']
                        service = metric[resource]['service']
                        text = metric[resource]['text'][index]
                        tags = metric[resource]['tags']
                        threshold_info = ','.join(rule['thresholdInfo'])
                        more_info = metric[resource]['moreInfo']
                        graph_urls = metric[resource]['graphUrls']

                        gangliaAlert = Alert(
                            resource=resource,
                            event=event,
                            group=group,
                            value=value,
                            severity=severity,
                            environment=environment,
                            service=service,
                            text=text,
                            event_type='gangliaAlert',
                            tags=tags,
                            threshold_info=threshold_info,
                            more_info=more_info,
                            graph_urls=graph_urls,
                            raw_data='',  # TODO(nsatterl): put raw metric values used to do calculation here
                        )

                        if self.dedup.is_send(gangliaAlert):
                            self.mq.send(gangliaAlert)

                        break  # First match wins
                    index += 1

    @staticmethod
    def get_metrics(filter):
        url = "http://%s:%s/ganglia/api/v1/metrics?%s" % (CONF.ganglia_host, CONF.ganglia_port,  filter)
        LOG.info('Metric request %s', url)

        try:
            r = urllib2.urlopen(url, None, 15)
        except urllib2.URLError, e:
            LOG.error('Could not retrieve metric data from %s - %s', url, e)
            return dict()

        if r.getcode() is None:
            LOG.error('Error during connection or data transfer (timeout=%d)', 15)
            return dict()

        response = json.loads(r.read())['response']
        if response['status'] == 'error':
            LOG.error('No metrics retreived - %s', response['message'])
            return dict()

        LOG.info('Retreived %s matching metrics in %ss', response['total'], response['time'])

        return response['metrics']
Example #12
0
class SolarWindsDaemon(Daemon):

    solarwinds_opts = {
        'solarwinds_host': 'localhost',
        'solarwinds_username': '******',
        'solarwinds_password': '',
        'solarwinds_group': 'websys',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(SolarWindsDaemon.solarwinds_opts)

        Daemon.__init__(self, prog, kwargs)

    def run(self):

        self.running = True

        while True:
            try:
                swis = SwisClient(username=CONF.solarwinds_username,
                                  password=CONF.solarwinds_password)
            except Exception, e:
                LOG.error('SolarWinds SWIS Client error: %s', e)
                time.sleep(30)
            else:
                break
        LOG.info('Polling for SolarWinds events on %s' % CONF.solarwinds_host)

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=SolarWindsMessage(self.mq))

        self.dedup = DeDup(by_value=True)

        while not self.shuttingdown:
            try:
                LOG.debug('Polling SolarWinds...')
                send_heartbeat = True

                # network, interface and volume events
                try:
                    events = swis.get_npm_events()
                except IOError:
                    events = []
                    send_heartbeat = False

                solarwindsAlerts = self.parse_events(events)
                for solarwindsAlert in solarwindsAlerts:
                    if self.dedup.is_send(solarwindsAlert):
                        self.mq.send(solarwindsAlert)

                # Cisco UCS events
                try:
                    events = swis.get_ucs_events()
                except IOError:
                    events = []
                    send_heartbeat = False

                solarwindsAlerts = self.parse_events(events)
                for solarwindsAlert in solarwindsAlerts:
                    if self.dedup.is_send(solarwindsAlert):
                        self.mq.send(solarwindsAlert)

                if send_heartbeat:
                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)
                else:
                    LOG.error('SolarWinds failure. Skipping heartbeat.')

                time.sleep(CONF.loop_every)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
Example #13
0
class DynectDaemon(Daemon):

    dynect_opts = {
        'dynect_customer': '',
        'dynect_username': '',
        'dynect_password': '',
    }

    def __init__(self, prog, **kwargs):

        config.register_opts(DynectDaemon.dynect_opts)

        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last_info = {}
        self.updating = False
        self.dedup = DeDup(threshold=10)

    def run(self):

        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=DynectMessage(self.mq))

        while not self.shuttingdown:
            try:
                self.queryDynect()

                if self.updating:
                    self.alertDynect()
                    self.last_info = self.info

                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        self.running = False

    def alertDynect(self):

        for resource in self.info:

            if resource not in self.last_info:
                continue

            if resource.startswith('gslb-'):

                # gslb status       = ok | unk | trouble | failover

                text = 'GSLB status is %s.' % self.info[resource]['status']

                if self.info[resource]['status'] == 'ok':
                    event = 'GslbOK'
                    severity = severity_code.NORMAL
                else:
                    event = 'GslbNotOK'
                    severity = severity_code.CRITICAL
                correlate = ['GslbOK', 'GslbNotOK']

            elif resource.startswith('pool-'):

                # pool status       = up | unk | down
                # pool serve_mode   = obey | always | remove | no
                # pool weight	(1-15)

                if 'down' in self.info[resource]['status']:
                    event = 'PoolDown'
                    severity = severity_code.MAJOR
                    text = 'Pool is down'
                elif 'obey' not in self.info[resource]['status']:
                    event = 'PoolServe'
                    severity = severity_code.MAJOR
                    text = 'Pool with an incorrect serve mode'
                elif self.check_weight(self.info[resource]['gslb'],
                                       resource) is False:
                    event = 'PoolWeightError'
                    severity = severity_code.MINOR
                    text = 'Pool with an incorrect weight'
                else:
                    event = 'PoolUp'
                    severity = severity_code.NORMAL
                    text = 'Pool status is normal'
                correlate = [
                    'PoolUp', 'PoolDown', 'PoolServe', 'PoolWeightError'
                ]

            else:
                LOG.warning('Unknown resource type: %s', resource)
                continue

            # Defaults
            group = 'GSLB'
            value = self.info[resource]['status']
            environment = ['PROD']
            service = ['Network']
            tags = dict()
            timeout = None
            threshold_info = None
            summary = None
            raw_data = self.info[resource]['rawData']

            dynectAlert = Alert(
                resource=resource,
                event=event,
                correlate=correlate,
                group=group,
                value=value,
                severity=severity,
                environment=environment,
                service=service,
                text=text,
                event_type='serviceAlert',
                tags=tags,
                timeout=timeout,
                threshold_info=threshold_info,
                summary=summary,
                raw_data=raw_data,
            )

            suppress = dynectAlert.transform_alert()
            if suppress:
                LOG.info('Suppressing %s alert', dynectAlert.event)
                LOG.debug('%s', dynectAlert)
                continue

            if self.dedup.is_send(dynectAlert):
                self.mq.send(dynectAlert)

    def check_weight(self, parent, resource):

        weight = self.info[resource]['status'].split(':')[2]
        for pool in [
                resource for resource in self.info
                if resource.startswith('pool')
                and self.info[resource]['gslb'] == parent
        ]:
            if self.info[pool]['status'].split(':')[1] == 'no':
                LOG.warning('Skipping %s because not serving for pool %s',
                            pool, self.info[pool]['status'])
                continue

            LOG.debug('pool %s weight %s <=> %s', pool,
                      self.info[pool]['status'].split(':')[2], weight)
            if self.info[pool]['status'].split(':')[2] != weight:
                return False
        return True

    def queryDynect(self):

        LOG.info('Query DynECT to get the state of GSLBs')
        try:
            rest_iface = DynectRest()
            if CONF.debug and CONF.use_stderr:
                rest_iface.verbose = True

            # login
            credentials = {
                'customer_name': CONF.dynect_customer,
                'user_name': CONF.dynect_username,
                'password': CONF.dynect_password,
            }
            LOG.debug('credentials = %s', credentials)
            response = rest_iface.execute('/Session/', 'POST', credentials)

            if response['status'] != 'success':
                LOG.error('Failed to create API session: %s',
                          response['msgs'][0]['INFO'])
                self.updating = False
                return

            # Discover all the Zones in DynECT
            response = rest_iface.execute('/Zone/', 'GET')
            LOG.debug('/Zone/ => %s', json.dumps(response, indent=4))
            zone_resources = response['data']

            # Discover all the LoadBalancers
            for resource in zone_resources:
                zone = resource.split('/')[
                    3]  # eg. /REST/Zone/guardiannews.com/
                response = rest_iface.execute('/LoadBalance/' + zone + '/',
                                              'GET')
                LOG.debug('/LoadBalance/%s/ => %s', zone,
                          json.dumps(response, indent=4))
                gslb = response['data']

                # Discover LoadBalancer pool information.
                for lb in gslb:
                    fqdn = lb.split(
                        '/'
                    )[4]  # eg. /REST/LoadBalance/guardiannews.com/id.guardiannews.com/
                    response = rest_iface.execute(
                        '/LoadBalance/' + zone + '/' + fqdn + '/', 'GET')
                    LOG.debug('/LoadBalance/%s/%s/ => %s', zone, fqdn,
                              json.dumps(response, indent=4))
                    status = response['data']['status']
                    monitor = response['data']['monitor']
                    self.info['gslb-' + fqdn] = {
                        'status': status,
                        'gslb': fqdn,
                        'rawData': monitor
                    }

                    for pool in response['data']['pool']:
                        name = '%s-%s' % (fqdn, pool['label'].replace(
                            ' ', '-'))
                        status = '%s:%s:%s' % (
                            pool['status'], pool['serve_mode'], pool['weight'])
                        self.info['pool-' + name] = {
                            'status': status,
                            'gslb': fqdn,
                            'rawData': pool
                        }

            LOG.info('Finished object discovery query.')
            LOG.debug('GSLBs and Pools: %s', json.dumps(self.info, indent=4))

            # logout
            rest_iface.execute('/Session/', 'DELETE')

        except Exception, e:
            LOG.error('Failed to discover GSLBs: %s', e)
            self.updating = False

        self.updating = True