def __init__(self, prog, **kwargs):
    """Register the DynECT options and start with empty polling state.

    The collected keyword arguments are handed to ``Daemon.__init__`` as a
    single positional dict.
    """
    config.register_opts(DynectDaemon.dynect_opts)
    Daemon.__init__(self, prog, kwargs)

    # Nothing has been polled yet.
    self.updating = False
    self.info, self.last_info = {}, {}

    self.dedup = DeDup(threshold=10)
def run(self):
    """Connect to the message queue and start the worker threads.

    One ``WorkerThread`` is started per ``CONF.server_threads``; a worker
    that fails to start is logged and skipped.
    """
    self.running = True

    # Internal queue handed to every worker
    self.queue = Queue.Queue()

    # Message queue connection
    self.mq = Messaging()
    self.mq.connect(callback=UrlmonMessage(self.mq))

    self.dedup = DeDup()
    self.carbon = Carbon()  # graphite metrics

    # Initialise alert rules (not used further in the visible part of run)
    urls = init_urls()

    # Start worker threads
    LOG.debug('Starting %s worker threads...', CONF.server_threads)
    for thread_no in range(CONF.server_threads):
        worker = WorkerThread(self.mq, self.queue, self.dedup, self.carbon)
        try:
            worker.start()
        except Exception as e:
            LOG.error('Worker thread #%s did not start: %s', thread_no, e)
        else:
            LOG.info('Started worker thread: %s', worker.getName())
def run(self):
    """Create the API client and start the worker threads.

    One ``WorkerThread`` is started per ``CONF.server_threads``; a worker
    that fails to start is logged and skipped.
    """
    self.running = True

    # Internal queue handed to every worker
    self.queue = Queue.Queue()

    self.api = ApiClient()

    self.dedup = DeDup()
    self.carbon = Carbon()  # graphite metrics

    # Initialise ping targets (not used further in the visible part of run)
    ping_list = init_targets()

    # Start worker threads
    LOG.debug('Starting %s worker threads...', CONF.server_threads)
    for thread_no in range(CONF.server_threads):
        worker = WorkerThread(self.api, self.queue, self.dedup, self.carbon)
        try:
            worker.start()
        except Exception as e:
            LOG.error('Worker thread #%s did not start: %s', thread_no, e)
        else:
            LOG.info('Started worker thread: %s', worker.getName())
def __init__(self, prog, **kwargs):
    """Register the AWS options and start with empty state dictionaries.

    The collected keyword arguments are handed to ``Daemon.__init__`` as a
    single positional dict.
    """
    config.register_opts(AwsDaemon.aws_opts)
    Daemon.__init__(self, prog, kwargs)

    # All three state dictionaries start out empty.
    self.info, self.last, self.lookup = {}, {}, {}

    self.dedup = DeDup()
def run(self):
    """Connect to the message queue and open the CloudWatch SQS connection.

    Exits the process with status 1 if the SQS API call raises
    ``boto.exception.SQSError``.
    """
    self.running = True

    self.statsd = StatsD()  # graphite metrics

    # Message queue connection
    self.mq = Messaging()
    self.mq.connect(callback=CloudWatchMessage(self.mq))

    self.dedup = DeDup(by_value=True)

    LOG.info('Connecting to SQS queue %s', CONF.cloudwatch_sqs_queue)
    try:
        sqs = boto.sqs.connect_to_region(
            CONF.cloudwatch_sqs_region,
            aws_access_key_id=CONF.cloudwatch_access_key,
            aws_secret_access_key=CONF.cloudwatch_secret_key,
        )
    except boto.exception.SQSError as e:
        LOG.error('SQS API call failed: %s', e)
        sys.exit(1)
class DynectDaemon(Daemon):
    """Daemon that polls DynECT for GSLB and pool state and sends alerts.

    ``self.info`` maps ``'gslb-<fqdn>'`` and ``'pool-<fqdn>-<label>'`` keys to
    dicts with ``status``, ``gslb`` and ``rawData`` entries, as filled in by
    ``queryDynect``.
    """

    dynect_opts = {
        'dynect_customer': '',
        'dynect_username': '',
        'dynect_password': '',
    }

    def __init__(self, prog, **kwargs):
        """Register the DynECT options and start with empty polling state."""
        config.register_opts(DynectDaemon.dynect_opts)
        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last_info = {}
        self.updating = False
        self.dedup = DeDup(threshold=10)

    def run(self):
        """Poll DynECT every ``CONF.loop_every`` seconds until shut down.

        Alerts and the heartbeat are only sent after a poll that left
        ``self.updating`` set.
        """
        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=DynectMessage(self.mq))

        while not self.shuttingdown:
            try:
                self.queryDynect()

                if self.updating:
                    self.alertDynect()
                    # Keep a snapshot rather than an alias: otherwise every
                    # resource discovered by the next poll would already
                    # appear in last_info and the "seen before" check in
                    # alertDynect would never skip anything.
                    self.last_info = dict(self.info)

                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        self.running = False

    def alertDynect(self):
        """Build and send one alert per known GSLB and pool resource.

        Resources that were not present after the previous poll are skipped,
        as are resources whose key has neither the ``gslb-`` nor the
        ``pool-`` prefix and alerts suppressed by ``transform_alert``.
        """
        for resource in self.info:
            if resource not in self.last_info:
                continue

            details = self.info[resource]

            if resource.startswith('gslb-'):
                # gslb status = ok | unk | trouble | failover
                text = 'GSLB status is %s.' % details['status']
                if details['status'] == 'ok':
                    event = 'GslbOK'
                    severity = severity_code.NORMAL
                else:
                    event = 'GslbNotOK'
                    severity = severity_code.CRITICAL
                correlate = ['GslbOK', 'GslbNotOK']

            elif resource.startswith('pool-'):
                # pool status = up | unk | down
                # pool serve_mode = obey | always | remove | no
                # pool weight (1-15)
                if 'down' in details['status']:
                    event = 'PoolDown'
                    severity = severity_code.MAJOR
                    text = 'Pool is down'
                elif 'obey' not in details['status']:
                    event = 'PoolServe'
                    severity = severity_code.MAJOR
                    text = 'Pool with an incorrect serve mode'
                elif self.check_weight(details['gslb'], resource) is False:
                    event = 'PoolWeightError'
                    severity = severity_code.MINOR
                    text = 'Pool with an incorrect weight'
                else:
                    event = 'PoolUp'
                    severity = severity_code.NORMAL
                    text = 'Pool status is normal'
                correlate = ['PoolUp', 'PoolDown', 'PoolServe', 'PoolWeightError']

            else:
                LOG.warning('Unknown resource type: %s', resource)
                continue

            # Defaults
            group = 'GSLB'
            value = details['status']
            environment = ['PROD']
            service = ['Network']
            tags = list()
            timeout = None
            threshold_info = None
            summary = None
            raw_data = details['rawData']

            dynectAlert = Alert(
                resource=resource,
                event=event,
                correlate=correlate,
                group=group,
                value=value,
                severity=severity,
                environment=environment,
                service=service,
                text=text,
                event_type='serviceAlert',
                tags=tags,
                timeout=timeout,
                threshold_info=threshold_info,
                summary=summary,
                raw_data=raw_data,
            )

            suppress = dynectAlert.transform_alert()
            if suppress:
                LOG.info('Suppressing %s alert', dynectAlert.event)
                LOG.debug('%s', dynectAlert)
                continue

            if self.dedup.is_send(dynectAlert):
                self.mq.send(dynectAlert)

    def check_weight(self, parent, resource):
        """Return False if a serving pool of ``parent`` has a different weight.

        The weight is the third ``:``-separated field of a pool's status
        string. Pools whose serve mode (second field) is ``no`` are ignored.
        """
        weight = self.info[resource]['status'].split(':')[2]

        # A distinct loop name keeps the ``resource`` parameter from being
        # rebound by the comprehension (Python 2 leaks comprehension names).
        siblings = [name for name in self.info
                    if name.startswith('pool') and self.info[name]['gslb'] == parent]

        for pool in siblings:
            if self.info[pool]['status'].split(':')[1] == 'no':
                LOG.warning('Skipping %s because not serving for pool %s', pool, self.info[pool]['status'])
                continue

            LOG.debug('pool %s weight %s <=> %s', pool, self.info[pool]['status'].split(':')[2], weight)
            if self.info[pool]['status'].split(':')[2] != weight:
                return False
        return True

    def queryDynect(self):
        """Refresh ``self.info`` from the DynECT REST API.

        Sets ``self.updating`` to True only when the whole discovery
        (login, zones, load balancers, pools, logout) completed; a failed
        login or any exception leaves it False.
        """
        LOG.info('Query DynECT to get the state of GSLBs')
        try:
            rest_iface = DynectRest()
            if CONF.debug and CONF.use_stderr:
                rest_iface.verbose = True

            # login
            credentials = {
                'customer_name': CONF.dynect_customer,
                'user_name': CONF.dynect_username,
                'password': CONF.dynect_password,
            }
            # Never write the real password to the log.
            LOG.debug('credentials = %s', dict(credentials, password='********'))
            response = rest_iface.execute('/Session/', 'POST', credentials)

            if response['status'] != 'success':
                LOG.error('Failed to create API session: %s', response['msgs'][0]['INFO'])
                self.updating = False
                return

            # Discover all the Zones in DynECT
            response = rest_iface.execute('/Zone/', 'GET')
            LOG.debug('/Zone/ => %s', json.dumps(response, indent=4))
            zone_resources = response['data']

            # Discover all the LoadBalancers
            for resource in zone_resources:
                zone = resource.split('/')[3]  # eg. /REST/Zone/guardiannews.com/
                response = rest_iface.execute('/LoadBalance/' + zone + '/', 'GET')
                LOG.debug('/LoadBalance/%s/ => %s', zone, json.dumps(response, indent=4))
                gslb = response['data']

                # Discover LoadBalancer pool information.
                for lb in gslb:
                    # eg. /REST/LoadBalance/guardiannews.com/id.guardiannews.com/
                    fqdn = lb.split('/')[4]
                    response = rest_iface.execute('/LoadBalance/' + zone + '/' + fqdn + '/', 'GET')
                    LOG.debug('/LoadBalance/%s/%s/ => %s', zone, fqdn, json.dumps(response, indent=4))
                    status = response['data']['status']
                    monitor = response['data']['monitor']
                    self.info['gslb-' + fqdn] = {'status': status, 'gslb': fqdn, 'rawData': monitor}

                    for pool in response['data']['pool']:
                        name = '%s-%s' % (fqdn, pool['label'].replace(' ', '-'))
                        status = '%s:%s:%s' % (pool['status'], pool['serve_mode'], pool['weight'])
                        self.info['pool-' + name] = {'status': status, 'gslb': fqdn, 'rawData': pool}

            LOG.info('Finished object discovery query.')
            LOG.debug('GSLBs and Pools: %s', json.dumps(self.info, indent=4))

            # logout
            rest_iface.execute('/Session/', 'DELETE')

        except Exception as e:
            LOG.error('Failed to discover GSLBs: %s', e)
            self.updating = False
        else:
            # Reached only when discovery ran to completion: the early
            # return on a failed login and the except branch both skip it.
            self.updating = True
def __init__(self, prog):
    """Initialise the base daemon and a ``DeDup`` created with by_value=True."""
    Daemon.__init__(self, prog)

    self.dedup = DeDup(by_value=True)
class GangliaDaemon(Daemon):
    """Daemon that evaluates alert rules against metrics from the Ganglia API.

    NOTE(review): ``GangliaDaemon.quote`` and ``GangliaDaemon.format_units``
    are called below but are not defined in the visible part of this class -
    confirm they exist elsewhere.
    """

    def __init__(self, prog):
        """Initialise the base daemon and a ``DeDup`` created with by_value=True."""
        Daemon.__init__(self, prog)

        self.dedup = DeDup(by_value=True)

    def run(self):
        """Check all rules every ``CONF.loop_every`` seconds until shut down."""
        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=GangliaMessage(self.mq))

        while not self.shuttingdown:
            try:
                rules = init_rules()  # re-read rule config each time
                self.metric_check(rules)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()

    @staticmethod
    def _metric_value(m, rule):
        """Return the substitution value for metric ``m`` under ``rule``.

        Uses the raw value when present, the aggregated sum when the rule
        value ends in ``.sum``, otherwise the average formatted with one
        decimal place (``0.0`` when the sample count is zero).
        """
        if 'value' in m:
            return GangliaDaemon.quote(m['value'])  # raw value
        if rule['value'].endswith('.sum'):
            return GangliaDaemon.quote(m['sum'])  # aggregated sum value if "<metric>.sum"
        try:
            return "%.1f" % (float(m['sum']) / float(m['num']))  # average of aggregate value
        except ZeroDivisionError:
            return 0.0

    def metric_check(self, rules):
        """Evaluate every rule and send an alert for the first matching threshold.

        For each rule the required metrics are fetched, ``$now`` and
        ``$<metric>`` placeholders are substituted into the rule's value,
        thresholdInfo and text, and the resulting expressions are evaluated
        per resource. The ``$now`` substitutions are written back into the
        ``rule`` dicts.
        """
        for rule in rules:
            # Check rule is valid
            if len(rule['thresholdInfo']) != len(rule['text']):
                LOG.warning('Skipping invalid rule %s - MUST define alert text for each threshold.', rule['event'])
                continue

            # Get list of metrics required to evaluate each rule
            params = dict()
            if rule.get('filter') is not None:
                params[rule['filter']] = 1

            for s in (' '.join(rule['text']), ' '.join(rule['thresholdInfo']), rule['value']):
                for name in re.findall(r'\$([a-z0-9A-Z_]+)', s):
                    if name != 'now':
                        params['metric=' + name] = 1
            metric_filter = '&'.join(params.keys())
            LOG.debug('Metric filter = %s', metric_filter)

            # Get metric data for each rule
            response = GangliaDaemon.get_metrics(metric_filter)
            LOG.debug('Ganglia API response: %s', response)

            # Make non-metric substitutions in value, thresholdInfo and text
            now = int(time.time())
            now_epoch = str(now)
            rule['value'] = re.sub(r'\$now', now_epoch, rule['value'])
            for idx, threshold in enumerate(rule['thresholdInfo']):
                rule['thresholdInfo'][idx] = re.sub(r'\$now', now_epoch, threshold)
            now_text = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(now))
            for idx, text in enumerate(rule['text']):
                rule['text'][idx] = re.sub(r'\$now', now_text, text)

            metric = dict()
            for m in response:

                # Make metric-based substitutions in resource eg. per instance, host or cluster
                resource = rule['resource']
                for pattern, key in ((r'\$instance', 'instance'), (r'\$host', 'host'), (r'\$cluster', 'cluster')):
                    resource = re.sub(pattern, m.get(key, '__NA__'), resource)

                if '__NA__' in resource:
                    LOG.debug('Metric %s doesnt match resource rule %s', m['id'], rule['resource'])
                    continue

                LOG.debug('Metric %s matches rule %s => %s', m['id'], rule['resource'], resource)

                # Don't generate cluster alerts from host-based metrics
                if 'host' in m and '$host' not in rule['resource']:
                    LOG.debug('Skipping host-based metric for cluster-based rule')
                    continue

                # Build up info for alert if metric value triggers threshold
                entry = metric.setdefault(resource, dict())
                if 'thresholdInfo' not in entry:
                    entry['thresholdInfo'] = list(rule['thresholdInfo'])
                    LOG.debug('Set thresholdInfo to %s', entry['thresholdInfo'])
                if 'text' not in entry:
                    entry['text'] = list(rule['text'])
                    LOG.debug('Set text to %s', entry['text'])

                if m['metric'] in rule['value']:

                    # Determine service and environment from rule if given
                    if 'environment' in rule:
                        entry['environment'] = [rule['environment']]
                    else:
                        entry['environment'] = [m['environment']]
                    LOG.debug('Set environment for alert to %s', entry['environment'])
                    if 'service' in rule:
                        entry['service'] = [rule['service']]
                    else:
                        entry['service'] = [m['service']]
                    LOG.debug('Set service for alert to %s', entry['service'])

                    # Use raw metric value, or sum or average if aggregated metric
                    v = GangliaDaemon._metric_value(m, rule)
                    LOG.debug('Value for %s on %s is %s', m['id'], resource, v)

                    # If no value assign rule value
                    if 'value' not in entry:
                        entry['value'] = rule['value']
                    entry['value'] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v), entry['value'])
                    entry['units'] = m['units']

                    # Assign tags
                    entry['tags'] = list()
                    entry['tags'].extend(rule['tags'])
                    entry['tags'].append('cluster:%s' % m['cluster'])
                    if 'tags' in m and m['tags'] is not None:
                        entry['tags'].extend(m['tags'])

                    # Assign graph URL
                    # NOTE(review): the test looks at 'graphUrl' but the key
                    # written is 'graphUrls', so the list is reset for every
                    # metric - confirm whether that is intended.
                    if 'graphUrl' not in entry:
                        entry['graphUrls'] = list()

                    has_graph = 'graphUrl' in m
                    by_host = '$host' in rule['resource'] and has_graph
                    by_cluster = '$cluster' in rule['resource'] and has_graph
                    if has_graph:
                        entry['graphUrls'].append(m['graphUrl'])
                        base_url = '/'.join(m['graphUrl'].rsplit('/', 2)[0:2])

                    for g in rule['graphs']:
                        if by_host:
                            entry['graphUrls'].append(
                                base_url + '/graph.php?c=%s&h=%s&m=%s&r=1day&v=0&z=default'
                                % (m['cluster'], m['host'], g))
                        if by_cluster:
                            entry['graphUrls'].append(
                                base_url + '/graph.php?c=%s&m=%s&r=1day&v=0&z=default'
                                % (m['cluster'], g))

                    entry['moreInfo'] = ''
                    if by_host:
                        entry['moreInfo'] = base_url + '/?c=%s&h=%s' % (m['cluster'], m['host'])
                    if by_cluster:
                        entry['moreInfo'] = base_url + '/?c=%s' % m['cluster']

                # Substitutions for threshold info
                if m['metric'] in ''.join(rule['thresholdInfo']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['thresholdInfo']))
                    v = GangliaDaemon._metric_value(m, rule)

                    for idx, threshold in enumerate(entry['thresholdInfo']):
                        entry['thresholdInfo'][idx] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v), threshold)

                # Substitutions for text
                if m['metric'] in ''.join(rule['text']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['text']))
                    v = GangliaDaemon._metric_value(m, rule)

                    if m['type'] == 'timestamp' or m['units'] == 'timestamp':
                        v = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(float(v)))

                    LOG.debug('Metric resource text %s', metric)
                    for idx, text in enumerate(entry['text']):
                        entry['text'][idx] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v), text)
                LOG.debug('end of metric loop')

            for resource in metric:
                entry = metric[resource]
                LOG.debug('Calculate final value for resource %s', resource)
                # NOTE(review): eval() runs text built from the rule config and
                # metric values - confirm both sources are trusted.
                try:
                    calculated_value = eval(entry['value'])
                except KeyError:
                    LOG.warning('Could not calculate %s value for %s because %s is not being reported',
                                rule['event'], resource, rule['value'])
                    continue
                except (SyntaxError, NameError):
                    LOG.error('Could not calculate %s value for %s => eval(%s)',
                              rule['event'], resource, entry['value'])
                    continue
                except ZeroDivisionError:
                    LOG.debug('Could not calculate %s value for %s => eval(%s) (division by zero). Setting to 0 instead.',
                              rule['event'], resource, entry['value'])
                    calculated_value = 0
                except Exception:
                    LOG.error('Could not calculate %s value for %s => eval(%s) (threw unknown exception)',
                              rule['event'], resource, entry['value'])
                    continue

                LOG.debug('Calculated value for resource %s => %s', resource, calculated_value)

                # Compare final value with each threshold
                for index, ti in enumerate(entry['thresholdInfo']):
                    severity, op, threshold = ti.split(':')
                    rule_eval = '%s %s %s' % (GangliaDaemon.quote(calculated_value), op, threshold)
                    try:
                        result = eval(rule_eval)
                    except SyntaxError:
                        LOG.error('Could not evaluate %s threshold for %s => eval(%s)',
                                  rule['event'], resource, rule_eval)
                        result = False

                    if not result:
                        continue

                    gangliaAlert = Alert(
                        resource=resource,
                        event=rule['event'],
                        group=rule['group'],
                        value="%s%s" % (calculated_value, GangliaDaemon.format_units(entry['units'])),
                        severity=severity,
                        environment=entry['environment'],
                        service=entry['service'],
                        text=entry['text'][index],
                        event_type='gangliaAlert',
                        tags=entry['tags'],
                        threshold_info=','.join(rule['thresholdInfo']),
                        more_info=entry['moreInfo'],
                        graph_urls=entry['graphUrls'],
                        raw_data='',  # TODO(nsatterl): put raw metric values used to do calculation here
                    )

                    if self.dedup.is_send(gangliaAlert):
                        self.mq.send(gangliaAlert)

                    break  # First match wins

    @staticmethod
    def get_metrics(filter):
        """Fetch the metrics matching the ``filter`` query string.

        Returns the ``metrics`` entry of the API response, or an empty dict
        when the request fails or the API reports an error.
        """
        timeout = 15
        url = "http://%s:%s/ganglia/api/v1/metrics?%s" % (CONF.ganglia_host, CONF.ganglia_port, filter)
        LOG.info('Metric request %s', url)

        try:
            r = urllib2.urlopen(url, None, timeout)
        except urllib2.URLError as e:
            LOG.error('Could not retrieve metric data from %s - %s', url, e)
            return dict()

        if r.getcode() is None:
            LOG.error('Error during connection or data transfer (timeout=%d)', timeout)
            return dict()

        response = json.loads(r.read())['response']
        if response['status'] == 'error':
            LOG.error('No metrics retreived - %s', response['message'])
            return dict()

        LOG.info('Retreived %s matching metrics in %ss', response['total'], response['time'])
        return response['metrics']
class GangliaDaemon(Daemon):
    """Daemon that evaluates alert rules against metrics from the Ganglia API.

    NOTE(review): ``GangliaDaemon.quote`` and ``GangliaDaemon.format_units``
    are called below but are not defined in the visible part of this class -
    confirm they exist elsewhere.
    """

    def __init__(self, prog):
        """Initialise the base daemon and a ``DeDup`` created with by_value=True."""
        Daemon.__init__(self, prog)

        self.dedup = DeDup(by_value=True)

    def run(self):
        """Check all rules every ``CONF.loop_every`` seconds until shut down."""
        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=GangliaMessage(self.mq))

        while not self.shuttingdown:
            try:
                rules = init_rules()  # re-read rule config each time
                self.metric_check(rules)

                LOG.debug('Send heartbeat...')
                heartbeat = Heartbeat(version=Version)
                self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()

    @staticmethod
    def _metric_value(m, rule):
        """Return the substitution value for metric ``m`` under ``rule``.

        Uses the raw value when present, the aggregated sum when the rule
        value ends in ``.sum``, otherwise the average formatted with one
        decimal place (``0.0`` when the sample count is zero).
        """
        if 'value' in m:
            return GangliaDaemon.quote(m['value'])  # raw value
        if rule['value'].endswith('.sum'):
            return GangliaDaemon.quote(m['sum'])  # aggregated sum value if "<metric>.sum"
        try:
            return "%.1f" % (float(m['sum']) / float(m['num']))  # average of aggregate value
        except ZeroDivisionError:
            return 0.0

    def metric_check(self, rules):
        """Evaluate every rule and send an alert for the first matching threshold.

        For each rule the required metrics are fetched, ``$now`` and
        ``$<metric>`` placeholders are substituted into the rule's value,
        thresholdInfo and text, and the resulting expressions are evaluated
        per resource. The ``$now`` substitutions are written back into the
        ``rule`` dicts.
        """
        for rule in rules:
            # Check rule is valid
            if len(rule['thresholdInfo']) != len(rule['text']):
                LOG.warning('Skipping invalid rule %s - MUST define alert text for each threshold.', rule['event'])
                continue

            # Get list of metrics required to evaluate each rule
            params = dict()
            if rule.get('filter') is not None:
                params[rule['filter']] = 1

            for s in (' '.join(rule['text']), ' '.join(rule['thresholdInfo']), rule['value']):
                for name in re.findall(r'\$([a-z0-9A-Z_]+)', s):
                    if name != 'now':
                        params['metric=' + name] = 1
            metric_filter = '&'.join(params.keys())
            LOG.debug('Metric filter = %s', metric_filter)

            # Get metric data for each rule
            response = GangliaDaemon.get_metrics(metric_filter)
            LOG.debug('Ganglia API response: %s', response)

            # Make non-metric substitutions in value, thresholdInfo and text
            now = int(time.time())
            now_epoch = str(now)
            rule['value'] = re.sub(r'\$now', now_epoch, rule['value'])
            for idx, threshold in enumerate(rule['thresholdInfo']):
                rule['thresholdInfo'][idx] = re.sub(r'\$now', now_epoch, threshold)
            now_text = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(now))
            for idx, text in enumerate(rule['text']):
                rule['text'][idx] = re.sub(r'\$now', now_text, text)

            metric = dict()
            for m in response:

                # Make metric-based substitutions in resource eg. per instance, host or cluster
                resource = rule['resource']
                for pattern, key in ((r'\$instance', 'instance'), (r'\$host', 'host'), (r'\$cluster', 'cluster')):
                    resource = re.sub(pattern, m.get(key, '__NA__'), resource)

                if '__NA__' in resource:
                    LOG.debug('Metric %s doesnt match resource rule %s', m['id'], rule['resource'])
                    continue

                LOG.debug('Metric %s matches rule %s => %s', m['id'], rule['resource'], resource)

                # Don't generate cluster alerts from host-based metrics
                if 'host' in m and '$host' not in rule['resource']:
                    LOG.debug('Skipping host-based metric for cluster-based rule')
                    continue

                # Build up info for alert if metric value triggers threshold
                entry = metric.setdefault(resource, dict())
                if 'thresholdInfo' not in entry:
                    entry['thresholdInfo'] = list(rule['thresholdInfo'])
                    LOG.debug('Set thresholdInfo to %s', entry['thresholdInfo'])
                if 'text' not in entry:
                    entry['text'] = list(rule['text'])
                    LOG.debug('Set text to %s', entry['text'])

                if m['metric'] in rule['value']:

                    # Determine service and environment from rule if given
                    if 'environment' in rule:
                        entry['environment'] = [rule['environment']]
                    else:
                        entry['environment'] = [m['environment']]
                    LOG.debug('Set environment for alert to %s', entry['environment'])
                    if 'service' in rule:
                        entry['service'] = [rule['service']]
                    else:
                        entry['service'] = [m['service']]
                    LOG.debug('Set service for alert to %s', entry['service'])

                    # Use raw metric value, or sum or average if aggregated metric
                    v = GangliaDaemon._metric_value(m, rule)
                    LOG.debug('Value for %s on %s is %s', m['id'], resource, v)

                    # If no value assign rule value
                    if 'value' not in entry:
                        entry['value'] = rule['value']
                    entry['value'] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v), entry['value'])
                    entry['units'] = m['units']

                    # Assign tags
                    entry['tags'] = list()
                    entry['tags'].extend(rule['tags'])
                    entry['tags'].append('cluster:%s' % m['cluster'])
                    if 'tags' in m and m['tags'] is not None:
                        entry['tags'].extend(m['tags'])

                    # Assign graph URL
                    # NOTE(review): the test looks at 'graphUrl' but the key
                    # written is 'graphUrls', so the list is reset for every
                    # metric - confirm whether that is intended.
                    if 'graphUrl' not in entry:
                        entry['graphUrls'] = list()

                    has_graph = 'graphUrl' in m
                    by_host = '$host' in rule['resource'] and has_graph
                    by_cluster = '$cluster' in rule['resource'] and has_graph
                    if has_graph:
                        entry['graphUrls'].append(m['graphUrl'])
                        base_url = '/'.join(m['graphUrl'].rsplit('/', 2)[0:2])

                    for g in rule['graphs']:
                        if by_host:
                            entry['graphUrls'].append(
                                base_url + '/graph.php?c=%s&h=%s&m=%s&r=1day&v=0&z=default'
                                % (m['cluster'], m['host'], g))
                        if by_cluster:
                            entry['graphUrls'].append(
                                base_url + '/graph.php?c=%s&m=%s&r=1day&v=0&z=default'
                                % (m['cluster'], g))

                    entry['moreInfo'] = ''
                    if by_host:
                        entry['moreInfo'] = base_url + '/?c=%s&h=%s' % (m['cluster'], m['host'])
                    if by_cluster:
                        entry['moreInfo'] = base_url + '/?c=%s' % m['cluster']

                # Substitutions for threshold info
                if m['metric'] in ''.join(rule['thresholdInfo']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['thresholdInfo']))
                    v = GangliaDaemon._metric_value(m, rule)

                    for idx, threshold in enumerate(entry['thresholdInfo']):
                        entry['thresholdInfo'][idx] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v), threshold)

                # Substitutions for text
                if m['metric'] in ''.join(rule['text']):
                    LOG.debug('Text to be substituted: %s', ''.join(rule['text']))
                    v = GangliaDaemon._metric_value(m, rule)

                    if m['type'] == 'timestamp' or m['units'] == 'timestamp':
                        v = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(float(v)))

                    LOG.debug('Metric resource text %s', metric)
                    for idx, text in enumerate(entry['text']):
                        entry['text'][idx] = re.sub(r'\$%s(\.sum)?' % m['metric'], str(v), text)
                LOG.debug('end of metric loop')

            for resource in metric:
                entry = metric[resource]
                LOG.debug('Calculate final value for resource %s', resource)
                # NOTE(review): eval() runs text built from the rule config and
                # metric values - confirm both sources are trusted.
                try:
                    calculated_value = eval(entry['value'])
                except KeyError:
                    LOG.warning('Could not calculate %s value for %s because %s is not being reported',
                                rule['event'], resource, rule['value'])
                    continue
                except (SyntaxError, NameError):
                    LOG.error('Could not calculate %s value for %s => eval(%s)',
                              rule['event'], resource, entry['value'])
                    continue
                except ZeroDivisionError:
                    LOG.debug('Could not calculate %s value for %s => eval(%s) (division by zero). Setting to 0 instead.',
                              rule['event'], resource, entry['value'])
                    calculated_value = 0
                except Exception:
                    LOG.error('Could not calculate %s value for %s => eval(%s) (threw unknown exception)',
                              rule['event'], resource, entry['value'])
                    continue

                LOG.debug('Calculated value for resource %s => %s', resource, calculated_value)

                # Compare final value with each threshold
                for index, ti in enumerate(entry['thresholdInfo']):
                    severity, op, threshold = ti.split(':')
                    rule_eval = '%s %s %s' % (GangliaDaemon.quote(calculated_value), op, threshold)
                    try:
                        result = eval(rule_eval)
                    except SyntaxError:
                        LOG.error('Could not evaluate %s threshold for %s => eval(%s)',
                                  rule['event'], resource, rule_eval)
                        result = False

                    if not result:
                        continue

                    gangliaAlert = Alert(
                        resource=resource,
                        event=rule['event'],
                        group=rule['group'],
                        value="%s%s" % (calculated_value, GangliaDaemon.format_units(entry['units'])),
                        severity=severity,
                        environment=entry['environment'],
                        service=entry['service'],
                        text=entry['text'][index],
                        event_type='gangliaAlert',
                        tags=entry['tags'],
                        threshold_info=','.join(rule['thresholdInfo']),
                        more_info=entry['moreInfo'],
                        graph_urls=entry['graphUrls'],
                        raw_data='',  # TODO(nsatterl): put raw metric values used to do calculation here
                    )

                    if self.dedup.is_send(gangliaAlert):
                        self.mq.send(gangliaAlert)

                    break  # First match wins

    @staticmethod
    def get_metrics(filter):
        """Fetch the metrics matching the ``filter`` query string.

        Returns the ``metrics`` entry of the API response, or an empty dict
        when the request fails or the API reports an error.
        """
        timeout = 15
        url = "http://%s:%s/ganglia/api/v1/metrics?%s" % (CONF.ganglia_host, CONF.ganglia_port, filter)
        LOG.info('Metric request %s', url)

        try:
            r = urllib2.urlopen(url, None, timeout)
        except urllib2.URLError as e:
            LOG.error('Could not retrieve metric data from %s - %s', url, e)
            return dict()

        if r.getcode() is None:
            LOG.error('Error during connection or data transfer (timeout=%d)', timeout)
            return dict()

        response = json.loads(r.read())['response']
        if response['status'] == 'error':
            LOG.error('No metrics retreived - %s', response['message'])
            return dict()

        LOG.info('Retreived %s matching metrics in %ss', response['total'], response['time'])
        return response['metrics']
class SolarWindsDaemon(Daemon):
    """Daemon that polls SolarWinds for events and forwards them as alerts.

    NOTE(review): ``self.parse_events`` is called in ``run`` but is not
    defined in the visible part of this class - confirm it exists elsewhere.
    """

    solarwinds_opts = {
        'solarwinds_host': 'localhost',
        'solarwinds_username': '******',
        'solarwinds_password': '',
        'solarwinds_group': 'websys',
    }

    def __init__(self, prog, **kwargs):
        """Register the SolarWinds options and initialise the base daemon."""
        config.register_opts(SolarWindsDaemon.solarwinds_opts)
        Daemon.__init__(self, prog, kwargs)

    def run(self):
        """Poll SolarWinds every ``CONF.loop_every`` seconds until shut down.

        Creating the SWIS client is retried every 30 seconds until it
        succeeds. The heartbeat is skipped for any cycle in which fetching
        events raised ``IOError``.
        """
        self.running = True

        # Keep trying until a SWIS client can be created.
        swis = None
        while swis is None:
            try:
                swis = SwisClient(username=CONF.solarwinds_username,
                                  password=CONF.solarwinds_password)
            except Exception as e:
                LOG.error('SolarWinds SWIS Client error: %s', e)
                time.sleep(30)
        LOG.info('Polling for SolarWinds events on %s', CONF.solarwinds_host)

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=SolarWindsMessage(self.mq))

        self.dedup = DeDup(by_value=True)

        while not self.shuttingdown:
            try:
                LOG.debug('Polling SolarWinds...')
                send_heartbeat = True

                # First the network, interface and volume events,
                # then the Cisco UCS events.
                for fetch_name in ('get_npm_events', 'get_ucs_events'):
                    try:
                        events = getattr(swis, fetch_name)()
                    except IOError:
                        events = []
                        send_heartbeat = False

                    for solarwindsAlert in self.parse_events(events):
                        if self.dedup.is_send(solarwindsAlert):
                            self.mq.send(solarwindsAlert)

                if send_heartbeat:
                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)
                else:
                    LOG.error('SolarWinds failure. Skipping heartbeat.')

                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        LOG.info('Disconnecting from message broker...')
        self.mq.disconnect()
class DynectDaemon(Daemon):
    """Daemon that polls DynECT for GSLB and pool state and sends alerts.

    ``self.info`` maps ``'gslb-<fqdn>'`` and ``'pool-<fqdn>-<label>'`` keys to
    dicts with ``status``, ``gslb`` and ``rawData`` entries, as filled in by
    ``queryDynect``.
    """

    dynect_opts = {
        'dynect_customer': '',
        'dynect_username': '',
        'dynect_password': '',
    }

    def __init__(self, prog, **kwargs):
        """Register the DynECT options and start with empty polling state."""
        config.register_opts(DynectDaemon.dynect_opts)
        Daemon.__init__(self, prog, kwargs)

        self.info = {}
        self.last_info = {}
        self.updating = False
        self.dedup = DeDup(threshold=10)

    def run(self):
        """Poll DynECT every ``CONF.loop_every`` seconds until shut down.

        Alerts and the heartbeat are only sent after a poll that left
        ``self.updating`` set.
        """
        self.running = True

        # Connect to message queue
        self.mq = Messaging()
        self.mq.connect(callback=DynectMessage(self.mq))

        while not self.shuttingdown:
            try:
                self.queryDynect()

                if self.updating:
                    self.alertDynect()
                    # Keep a snapshot rather than an alias: otherwise every
                    # resource discovered by the next poll would already
                    # appear in last_info and the "seen before" check in
                    # alertDynect would never skip anything.
                    self.last_info = dict(self.info)

                    LOG.debug('Send heartbeat...')
                    heartbeat = Heartbeat(version=Version)
                    self.mq.send(heartbeat)

                LOG.debug('Waiting for next check run...')
                time.sleep(CONF.loop_every)
            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        self.running = False

    def alertDynect(self):
        """Build and send one alert per known GSLB and pool resource.

        Resources that were not present after the previous poll are skipped,
        as are resources whose key has neither the ``gslb-`` nor the
        ``pool-`` prefix and alerts suppressed by ``transform_alert``.
        """
        for resource in self.info:
            if resource not in self.last_info:
                continue

            details = self.info[resource]

            if resource.startswith('gslb-'):
                # gslb status = ok | unk | trouble | failover
                text = 'GSLB status is %s.' % details['status']
                if details['status'] == 'ok':
                    event = 'GslbOK'
                    severity = severity_code.NORMAL
                else:
                    event = 'GslbNotOK'
                    severity = severity_code.CRITICAL
                correlate = ['GslbOK', 'GslbNotOK']

            elif resource.startswith('pool-'):
                # pool status = up | unk | down
                # pool serve_mode = obey | always | remove | no
                # pool weight (1-15)
                if 'down' in details['status']:
                    event = 'PoolDown'
                    severity = severity_code.MAJOR
                    text = 'Pool is down'
                elif 'obey' not in details['status']:
                    event = 'PoolServe'
                    severity = severity_code.MAJOR
                    text = 'Pool with an incorrect serve mode'
                elif self.check_weight(details['gslb'], resource) is False:
                    event = 'PoolWeightError'
                    severity = severity_code.MINOR
                    text = 'Pool with an incorrect weight'
                else:
                    event = 'PoolUp'
                    severity = severity_code.NORMAL
                    text = 'Pool status is normal'
                correlate = ['PoolUp', 'PoolDown', 'PoolServe', 'PoolWeightError']

            else:
                LOG.warning('Unknown resource type: %s', resource)
                continue

            # Defaults
            group = 'GSLB'
            value = details['status']
            environment = ['PROD']
            service = ['Network']
            tags = dict()
            timeout = None
            threshold_info = None
            summary = None
            raw_data = details['rawData']

            dynectAlert = Alert(
                resource=resource,
                event=event,
                correlate=correlate,
                group=group,
                value=value,
                severity=severity,
                environment=environment,
                service=service,
                text=text,
                event_type='serviceAlert',
                tags=tags,
                timeout=timeout,
                threshold_info=threshold_info,
                summary=summary,
                raw_data=raw_data,
            )

            suppress = dynectAlert.transform_alert()
            if suppress:
                LOG.info('Suppressing %s alert', dynectAlert.event)
                LOG.debug('%s', dynectAlert)
                continue

            if self.dedup.is_send(dynectAlert):
                self.mq.send(dynectAlert)

    def check_weight(self, parent, resource):
        """Return False if a serving pool of ``parent`` has a different weight.

        The weight is the third ``:``-separated field of a pool's status
        string. Pools whose serve mode (second field) is ``no`` are ignored.
        """
        weight = self.info[resource]['status'].split(':')[2]

        # A distinct loop name keeps the ``resource`` parameter from being
        # rebound by the comprehension (Python 2 leaks comprehension names).
        siblings = [name for name in self.info
                    if name.startswith('pool') and self.info[name]['gslb'] == parent]

        for pool in siblings:
            if self.info[pool]['status'].split(':')[1] == 'no':
                LOG.warning('Skipping %s because not serving for pool %s', pool, self.info[pool]['status'])
                continue

            LOG.debug('pool %s weight %s <=> %s', pool, self.info[pool]['status'].split(':')[2], weight)
            if self.info[pool]['status'].split(':')[2] != weight:
                return False
        return True

    def queryDynect(self):
        """Refresh ``self.info`` from the DynECT REST API.

        Sets ``self.updating`` to True only when the whole discovery
        (login, zones, load balancers, pools, logout) completed; a failed
        login or any exception leaves it False.
        """
        LOG.info('Query DynECT to get the state of GSLBs')
        try:
            rest_iface = DynectRest()
            if CONF.debug and CONF.use_stderr:
                rest_iface.verbose = True

            # login
            credentials = {
                'customer_name': CONF.dynect_customer,
                'user_name': CONF.dynect_username,
                'password': CONF.dynect_password,
            }
            # Never write the real password to the log.
            LOG.debug('credentials = %s', dict(credentials, password='********'))
            response = rest_iface.execute('/Session/', 'POST', credentials)

            if response['status'] != 'success':
                LOG.error('Failed to create API session: %s', response['msgs'][0]['INFO'])
                self.updating = False
                return

            # Discover all the Zones in DynECT
            response = rest_iface.execute('/Zone/', 'GET')
            LOG.debug('/Zone/ => %s', json.dumps(response, indent=4))
            zone_resources = response['data']

            # Discover all the LoadBalancers
            for resource in zone_resources:
                zone = resource.split('/')[3]  # eg. /REST/Zone/guardiannews.com/
                response = rest_iface.execute('/LoadBalance/' + zone + '/', 'GET')
                LOG.debug('/LoadBalance/%s/ => %s', zone, json.dumps(response, indent=4))
                gslb = response['data']

                # Discover LoadBalancer pool information.
                for lb in gslb:
                    # eg. /REST/LoadBalance/guardiannews.com/id.guardiannews.com/
                    fqdn = lb.split('/')[4]
                    response = rest_iface.execute('/LoadBalance/' + zone + '/' + fqdn + '/', 'GET')
                    LOG.debug('/LoadBalance/%s/%s/ => %s', zone, fqdn, json.dumps(response, indent=4))
                    status = response['data']['status']
                    monitor = response['data']['monitor']
                    self.info['gslb-' + fqdn] = {'status': status, 'gslb': fqdn, 'rawData': monitor}

                    for pool in response['data']['pool']:
                        name = '%s-%s' % (fqdn, pool['label'].replace(' ', '-'))
                        status = '%s:%s:%s' % (pool['status'], pool['serve_mode'], pool['weight'])
                        self.info['pool-' + name] = {'status': status, 'gslb': fqdn, 'rawData': pool}

            LOG.info('Finished object discovery query.')
            LOG.debug('GSLBs and Pools: %s', json.dumps(self.info, indent=4))

            # logout
            rest_iface.execute('/Session/', 'DELETE')

        except Exception as e:
            LOG.error('Failed to discover GSLBs: %s', e)
            self.updating = False
        else:
            # Reached only when discovery ran to completion: the early
            # return on a failed login and the except branch both skip it.
            self.updating = True