def parse_notification(self, message):
    """Parse an SNS-delivered CloudWatch alarm notification into an Alert.

    ``message`` is the raw JSON body of the SNS notification. Returns the
    constructed Alert, or None when the payload has no 'Message' key or
    when transform_alert() says the alert should be suppressed.
    """
    LOG.debug('Parsing CloudWatch notification message...')
    notification = json.loads(message)
    if 'Message' not in notification:
        # Not an alarm payload; nothing to forward.
        return
    alarm = json.loads(notification['Message'])

    event = alarm['AlarmName']
    cloudwatchAlert = Alert(
        alertid=notification['MessageId'],
        resource=alarm['Trigger']['Dimensions'][0]['value'],
        event=event,
        correlate=[],
        group='CloudWatch',
        value=alarm['NewStateValue'],
        severity=self.cw_state_to_severity(alarm['NewStateValue']),
        previous_severity=self.cw_state_to_severity(alarm['OldStateValue']),
        environment=['INFRA'],
        service=[alarm['AWSAccountId']],  # XXX - use transform_alert() to map AWSAccountId to a useful name
        text=alarm['AlarmDescription'],
        event_type='cloudwatchAlarm',
        tags={'Region': alarm['Region']},
        origin=notification['TopicArn'],
        timeout=None,
        threshold_info=alarm['NewStateReason'],
        summary=notification['Subject'],
        create_time=datetime.datetime.strptime(notification['Timestamp'],
                                               '%Y-%m-%dT%H:%M:%S.%fZ'),
        raw_data=notification['Message'],
    )

    if cloudwatchAlert.transform_alert():
        LOG.info('Suppressing %s alert', event)
        LOG.debug('%s', cloudwatchAlert)
        return
    return cloudwatchAlert
def alertDynect(self):
    """Generate Dynect GSLB/pool service alerts from the current poll.

    Walks self.info (current poll results keyed by resource name) and, for
    each resource also present in self.last_info (i.e. seen in at least two
    consecutive polls), builds a serviceAlert and sends it via self.mq
    unless transform_alert() suppresses it or dedup filters it out.
    """
    for resource in self.info:
        # Require the resource in the previous poll too before alerting.
        if resource not in self.last_info:
            continue

        if resource.startswith('gslb-'):
            # gslb status = ok | unk | trouble | failover
            text = 'GSLB status is %s.' % self.info[resource]['status']
            if self.info[resource]['status'] == 'ok':
                event = 'GslbOK'
                severity = severity_code.NORMAL
            else:
                event = 'GslbNotOK'
                severity = severity_code.CRITICAL
            correlate = ['GslbOK', 'GslbNotOK']
        elif resource.startswith('pool-'):
            # pool status = up | unk | down
            # pool serve_mode = obey | always | remove | no
            # pool weight (1-15)
            if 'down' in self.info[resource]['status']:
                event = 'PoolDown'
                severity = severity_code.MAJOR
                text = 'Pool is down'
            elif 'obey' not in self.info[resource]['status']:
                event = 'PoolServe'
                severity = severity_code.MAJOR
                text = 'Pool with an incorrect serve mode'
            elif self.check_weight(self.info[resource]['gslb'], resource) is False:
                event = 'PoolWeightError'
                severity = severity_code.MINOR
                text = 'Pool with an incorrect weight'
            else:
                event = 'PoolUp'
                severity = severity_code.NORMAL
                text = 'Pool status is normal'
            correlate = ['PoolUp', 'PoolDown', 'PoolServe', 'PoolWeightError']
        else:
            LOG.warning('Unknown resource type: %s', resource)
            continue

        # Defaults
        group = 'GSLB'
        value = self.info[resource]['status']
        environment = ['PROD']
        service = ['Network']
        # BUG FIX: was list(); Alert tags are key/value mappings everywhere
        # else in this file (e.g. {'Region': ...}, {'Version': ...}).
        tags = dict()
        timeout = None
        threshold_info = None
        summary = None
        raw_data = self.info[resource]['rawData']

        dynectAlert = Alert(
            resource=resource,
            event=event,
            correlate=correlate,
            group=group,
            value=value,
            severity=severity,
            environment=environment,
            service=service,
            text=text,
            event_type='serviceAlert',
            tags=tags,
            timeout=timeout,
            threshold_info=threshold_info,
            summary=summary,
            raw_data=raw_data,
        )

        suppress = dynectAlert.transform_alert()
        if suppress:
            LOG.info('Suppressing %s alert', dynectAlert.event)
            LOG.debug('%s', dynectAlert)
            continue

        if self.dedup.is_send(dynectAlert):
            self.mq.send(dynectAlert)
def parse_events(self, data):
    """Turn SolarWinds Orion event rows into Alert (or Heartbeat) objects.

    Accepts either a sequence of row objects or a single bare row (detected
    by the absence of a ``.c0`` attribute on the first element). Suppressed
    rows are dropped; rows whose alert type is 'Heartbeat' are converted to
    Heartbeat objects. Returns the resulting list (empty for empty input).
    """
    LOG.debug('Parsing solarwinds event data...')
    LOG.debug(data)
    try:
        data[0]
    except IndexError:
        return []
    try:
        data[0].c0
    except AttributeError:
        # Single bare row passed in; normalise to a list of rows.
        data = [data]

    alerts = []
    for row in data:
        LOG.debug(row)
        event = row.c4.replace(" ", "")
        alert = Alert(
            resource='%s:%s' % (row.c2, row.c3.lower()),
            event=event,
            correlate=SOLAR_WINDS_CORRELATED_EVENTS.get(event, None),
            group='Orion',
            value='%s' % row.c6,
            severity=SOLAR_WINDS_SEVERITY_LEVELS.get(row.c7, None),
            environment=['INFRA'],
            service=['Network'],
            text='%s' % row.c5,
            event_type='solarwindsAlert',
            tags=None,
            threshold_info=None,
            summary=None,
            timeout=None,
            create_time=datetime.datetime.strptime(row.c1[:-5] + 'Z',
                                                   '%Y-%m-%dT%H:%M:%S.%fZ'),
            raw_data=repr(row),
        )
        if alert.transform_alert():
            LOG.info('Suppressing %s alert', alert.event)
            LOG.debug('%s', alert)
            continue
        if alert.get_type() == 'Heartbeat':
            alert = Heartbeat(origin=alert.origin, version='n/a', timeout=alert.timeout)
        alerts.append(alert)
    return alerts
urlmonAlert = Alert( resource=resource, event=event, correlate=correlate, group=group, value=value, severity=severity, environment=environment, service=service, text=text, event_type='serviceAlert', tags=tags, threshold_info=threshold_info, ) suppress = urlmonAlert.transform_alert() if suppress: LOG.info('Suppressing %s alert', urlmonAlert.event) LOG.debug('%s', urlmonAlert) elif self.dedup.is_send(urlmonAlert): self.mq.send(urlmonAlert) self.queue.task_done() LOG.info('%s check complete.', self.getName()) self.queue.task_done() class UrlmonMessage(MessageHandler): def __init__(self, mq):
urlmonAlert = Alert( resource=resource, event=event, correlate=correlate, group=group, value=value, severity=severity, environment=environment, service=service, text=text, event_type='serviceAlert', tags=tags, threshold_info=threshold_info, ) suppress = urlmonAlert.transform_alert() if suppress: LOG.info('Suppressing %s alert', urlmonAlert.event) LOG.debug('%s', urlmonAlert) elif self.dedup.is_send(urlmonAlert): self.mq.send(urlmonAlert) self.queue.task_done() LOG.info('%s check complete.', self.getName()) self.queue.task_done() class UrlmonMessage(MessageHandler):
def parse_snmptrap(data):
    """Parse a snmptrapd traphandle payload into an Alert (or Heartbeat).

    ``data`` is the multi-line text handed over by snmptrapd: lines starting
    with '$' carry pre-substituted trap variables ($B hostname, $A agent IP,
    $x/$X date/time, $q trap sub-id, $s version, ...); the remaining lines
    are '~%~'-separated varbinds. Returns None when the trap is suppressed.
    """
    pdu_data = data.splitlines()
    varbind_list = pdu_data[:]

    # Collect the '$'-prefixed trap variables; each consumed line is popped
    # off the front of varbind_list so only varbind lines remain afterwards.
    trapvars = dict()
    for line in pdu_data:
        if line.startswith('$'):
            special, value = line.split(None, 1)
            trapvars[special] = value
            varbind_list.pop(0)

    # Map the numeric $s code to a human-readable SNMP version string.
    # NOTE(review): if '$s' is absent, `version` is never bound and the
    # LOG.debug/`if version ==` below would raise NameError — presumably
    # snmptrapd always supplies $s; confirm before hardening.
    if '$s' in trapvars:
        if trapvars['$s'] == '0':
            version = 'SNMPv1'
        elif trapvars['$s'] == '1':
            version = 'SNMPv2c'
        elif trapvars['$s'] == '2':
            version = 'SNMPv2u'  # not supported
        else:
            version = 'SNMPv3'
        trapvars['$s'] = version

    # Get varbinds
    varbinds = dict()
    idx = 0
    for varbind in '\n'.join(varbind_list).split('~%~'):
        if varbind == '':
            break
        idx += 1
        try:
            oid, value = varbind.split(None, 1)
        except ValueError:
            # Varbind with no value part — keep the OID, empty value.
            oid = varbind
            value = ''
        varbinds[oid] = value
        trapvars['$' + str(idx)] = value  # $n
        LOG.debug('$%s %s', str(idx), value)

    trapvars['$q'] = trapvars['$q'].lstrip('.')  # if numeric, remove leading '.'
    trapvars['$#'] = str(idx)

    LOG.debug('varbinds = %s', varbinds)
    LOG.debug('version = %s', version)

    correlate = list()

    if version == 'SNMPv1':
        # Decode the generic-trap number ($w) into the standard trap name,
        # pairing up/down style traps for correlation.
        if trapvars['$w'] == '0':
            trapvars['$O'] = 'coldStart'
            correlate = ['coldStart', 'warmStart']
        elif trapvars['$w'] == '1':
            trapvars['$O'] = 'warmStart'
            correlate = ['coldStart', 'warmStart']
        elif trapvars['$w'] == '2':
            trapvars['$O'] = 'linkDown'
            correlate = ['linkUp', 'linkDown']
        elif trapvars['$w'] == '3':
            trapvars['$O'] = 'linkUp'
            correlate = ['linkUp', 'linkDown']
        elif trapvars['$w'] == '4':
            trapvars['$O'] = 'authenticationFailure'
        elif trapvars['$w'] == '5':
            trapvars['$O'] = 'egpNeighborLoss'
        elif trapvars['$w'] == '6':  # enterpriseSpecific(6)
            if trapvars['$q'].isdigit():  # XXX - specific trap number was not decoded
                trapvars['$O'] = '%s.0.%s' % (trapvars['$N'], trapvars['$q'])
            else:
                trapvars['$O'] = trapvars['$q']
    elif version == 'SNMPv2c':
        # SNMPv2c traps carry the trap OID in varbind $2; derive the v1-style
        # generic number ($w) and a display name ($W) from it.
        if 'coldStart' in trapvars['$2']:
            trapvars['$w'] = '0'
            trapvars['$W'] = 'Cold Start'
        elif 'warmStart' in trapvars['$2']:
            trapvars['$w'] = '1'
            trapvars['$W'] = 'Warm Start'
        elif 'linkDown' in trapvars['$2']:
            trapvars['$w'] = '2'
            trapvars['$W'] = 'Link Down'
        elif 'linkUp' in trapvars['$2']:
            trapvars['$w'] = '3'
            trapvars['$W'] = 'Link Up'
        elif 'authenticationFailure' in trapvars['$2']:
            trapvars['$w'] = '4'
            trapvars['$W'] = 'Authentication Failure'
        elif 'egpNeighborLoss' in trapvars['$2']:
            trapvars['$w'] = '5'
            trapvars['$W'] = 'EGP Neighbor Loss'
        else:
            trapvars['$w'] = '6'
            trapvars['$W'] = 'Enterprise Specific'
        trapvars['$O'] = trapvars['$2']  # SNMPv2-MIB::snmpTrapOID.0

    LOG.debug('trapvars = %s', trapvars)

    LOG.info('%s-Trap-PDU %s from %s at %s %s', version, trapvars['$O'],
             trapvars['$B'], trapvars['$x'], trapvars['$X'])

    # Resource selection: prefer hostname ($B), then agent address ($A),
    # then the IP extracted from the transport string ($b), else '<NONE>'.
    if trapvars['$B'] != '<UNKNOWN>':
        resource = trapvars['$B']
    elif trapvars['$A'] != '0.0.0.0':
        resource = trapvars['$A']
    else:
        m = re.match(r'UDP: \[(\d+\.\d+\.\d+\.\d+)\]', trapvars['$b'])
        if m:
            resource = m.group(1)
        else:
            resource = '<NONE>'

    # Defaults
    event = trapvars['$O']
    severity = severity_code.NORMAL
    group = 'SNMP'
    value = trapvars['$w']
    text = trapvars['$W']
    environment = ['INFRA']
    service = ['Network']
    tags = {'Version': version}
    timeout = None
    threshold_info = None
    summary = None
    # Trap timestamp comes from the $x date and $X time fields.
    create_time = datetime.datetime.strptime('%sT%s.000Z' % (trapvars['$x'],
                                                             trapvars['$X']),
                                             '%Y-%m-%dT%H:%M:%S.%fZ')

    snmptrapAlert = Alert(
        resource=resource,
        event=event,
        correlate=correlate,
        group=group,
        value=value,
        severity=severity,
        environment=environment,
        service=service,
        text=text,
        event_type='snmptrapAlert',
        tags=tags,
        timeout=timeout,
        threshold_info=threshold_info,
        summary=summary,
        create_time=create_time,
        raw_data=data,
    )

    suppress = snmptrapAlert.transform_alert(trapoid=trapvars['$O'],
                                             trapvars=trapvars,
                                             varbinds=varbinds)
    if suppress:
        LOG.info('Suppressing %s SNMP trap', snmptrapAlert.event)
        LOG.debug('%s', snmptrapAlert)
        return

    # Apply any configured trapvar-based field substitutions.
    snmptrapAlert.translate_alert(trapvars)

    if snmptrapAlert.get_type() == 'Heartbeat':
        snmptrapAlert = Heartbeat(origin=snmptrapAlert.origin, version='n/a',
                                  timeout=snmptrapAlert.timeout)

    return snmptrapAlert
def run(self):
    """Pinger worker loop: consume ping requests, ping, and raise alerts.

    Queue items are (environment, service, resource, retries, queue_time)
    tuples; a falsy item is the shutdown sentinel. Every dequeued item is
    matched by exactly one queue.task_done() call so queue.join() can
    complete (including the sentinel, acknowledged after the loop).
    """
    while True:
        LOG.debug('Waiting on input queue...')
        item = self.queue.get()

        if not item:
            LOG.info('%s is shutting down.', self.getName())
            break

        environment, service, resource, retries, queue_time = item

        # Drop requests that sat in the queue longer than one polling loop.
        if time.time() - queue_time > CONF.loop_every:
            LOG.warning('Ping request to %s expired after %d seconds.',
                        resource, int(time.time() - queue_time))
            self.queue.task_done()
            continue

        LOG.info('%s pinging %s...', self.getName(), resource)
        if retries > 1:
            # Quick probe while retries remain.
            rc, rtt, loss, stdout = self.pinger(resource, count=2, timeout=5)
        else:
            # Final attempt: more packets and the configured max timeout.
            rc, rtt, loss, stdout = self.pinger(
                resource, count=5, timeout=CONF.ping_max_timeout)

        if rc != PING_OK and retries:
            LOG.info('Retrying ping %s %s more times', resource, retries)
            self.queue.put(
                (environment, service, resource, retries - 1, time.time()))
            self.queue.task_done()
            continue

        if rc == PING_OK:
            avg_rtt, max_rtt = rtt  # renamed: don't shadow builtin max()
            self.carbon.metric_send(
                'alert.pinger.%s.avgRoundTrip' % resource, avg_rtt)
            self.carbon.metric_send(
                'alert.pinger.%s.maxRoundTrip' % resource, max_rtt)
            self.carbon.metric_send(
                'alert.pinger.%s.availability' % resource, 100.0)
            if avg_rtt > CONF.ping_slow_critical:
                event = 'PingSlow'
                severity = severity_code.CRITICAL
                text = 'Node responded to ping in %s ms avg (> %s ms)' % (
                    avg_rtt, CONF.ping_slow_critical)
            elif avg_rtt > CONF.ping_slow_warning:
                event = 'PingSlow'
                severity = severity_code.WARNING
                text = 'Node responded to ping in %s ms avg (> %s ms)' % (
                    avg_rtt, CONF.ping_slow_warning)
            else:
                event = 'PingOK'
                severity = severity_code.NORMAL
                text = 'Node responding to ping avg/max %s/%s ms.' % tuple(
                    rtt)
            value = '%s/%s ms' % tuple(rtt)
        elif rc == PING_FAILED:
            event = 'PingFailed'
            severity = severity_code.MAJOR
            text = 'Node did not respond to ping or timed out within %s seconds' % CONF.ping_max_timeout
            value = '%s%% packet loss' % loss
            self.carbon.metric_send(
                'alert.pinger.%s.availability' % resource,
                100.0 - float(loss))
        elif rc == PING_ERROR:
            event = 'PingError'
            severity = severity_code.WARNING
            text = 'Could not ping node %s.' % resource
            value = stdout
            self.carbon.metric_send(
                'alert.pinger.%s.availability' % resource, 0.0)
        else:
            LOG.warning('Unknown ping return code: %s', rc)
            # BUG FIX: acknowledge the item before skipping it, as every
            # other continue path does; otherwise queue.join() hangs.
            self.queue.task_done()
            continue

        # Defaults
        resource += ':icmp'
        group = 'Ping'
        correlate = _PING_ALERTS
        timeout = None
        threshold_info = None
        summary = None
        raw_data = stdout

        pingAlert = Alert(
            resource=resource,
            event=event,
            correlate=correlate,
            group=group,
            value=value,
            severity=severity,
            environment=environment,
            service=service,
            text=text,
            event_type='serviceAlert',
            tags=None,
            timeout=timeout,
            threshold_info=threshold_info,
            summary=summary,
            raw_data=raw_data,
        )

        suppress = pingAlert.transform_alert()
        if suppress:
            LOG.info('Suppressing %s alert', pingAlert.event)
            LOG.debug('%s', pingAlert)
        elif self.dedup.is_send(pingAlert):
            self.mq.send(pingAlert)

        self.queue.task_done()
        LOG.info('%s ping %s complete.', self.getName(), resource)

    # Acknowledge the shutdown sentinel dequeued before break.
    self.queue.task_done()
def parse_notification(self, message):
    """Build a cloudwatchAlarm Alert from a raw SNS notification body.

    Returns None when no 'Message' key is present or when the alert is
    suppressed by transform_alert(); otherwise returns the Alert.
    """
    LOG.debug('Parsing CloudWatch notification message...')
    notification = json.loads(message)
    try:
        # EAFP: missing 'Message' means this is not an alarm notification.
        alarm = json.loads(notification['Message'])
    except KeyError:
        return

    alarm_name = alarm['AlarmName']
    new_state = alarm['NewStateValue']
    fields = dict(
        alertid=notification['MessageId'],
        resource=alarm['Trigger']['Dimensions'][0]['value'],
        event=alarm_name,
        correlate=list(),
        group='CloudWatch',
        value=new_state,
        severity=self.cw_state_to_severity(new_state),
        previous_severity=self.cw_state_to_severity(alarm['OldStateValue']),
        environment=['INFRA'],
        # XXX - use transform_alert() to map AWSAccountId to a useful name
        service=[alarm['AWSAccountId']],
        text=alarm['AlarmDescription'],
        event_type='cloudwatchAlarm',
        tags={'Region': alarm['Region']},
        origin=notification['TopicArn'],
        timeout=None,
        threshold_info=alarm['NewStateReason'],
        summary=notification['Subject'],
        create_time=datetime.datetime.strptime(notification['Timestamp'],
                                               '%Y-%m-%dT%H:%M:%S.%fZ'),
        raw_data=notification['Message'],
    )
    cloudwatch_alert = Alert(**fields)

    if cloudwatch_alert.transform_alert():
        LOG.info('Suppressing %s alert', alarm_name)
        LOG.debug('%s', cloudwatch_alert)
        return
    return cloudwatch_alert
def run(self):
    """Pinger worker loop: consume ping requests, ping, and raise alerts.

    Queue items are (environment, service, resource, retries, queue_time)
    tuples; a falsy item is the shutdown sentinel. Every dequeued item is
    matched by exactly one queue.task_done() call so queue.join() can
    complete (including the sentinel, acknowledged after the loop).
    """
    while True:
        LOG.debug("Waiting on input queue...")
        item = self.queue.get()

        if not item:
            LOG.info("%s is shutting down.", self.getName())
            break

        environment, service, resource, retries, queue_time = item

        # Drop requests that sat in the queue longer than one polling loop.
        if time.time() - queue_time > CONF.loop_every:
            LOG.warning("Ping request to %s expired after %d seconds.", resource, int(time.time() - queue_time))
            self.queue.task_done()
            continue

        LOG.info("%s pinging %s...", self.getName(), resource)
        if retries > 1:
            # Quick probe while retries remain.
            rc, rtt, loss, stdout = self.pinger(resource, count=2, timeout=5)
        else:
            # Final attempt: more packets and the configured max timeout.
            rc, rtt, loss, stdout = self.pinger(resource, count=5, timeout=CONF.ping_max_timeout)

        if rc != PING_OK and retries:
            LOG.info("Retrying ping %s %s more times", resource, retries)
            self.queue.put((environment, service, resource, retries - 1, time.time()))
            self.queue.task_done()
            continue

        if rc == PING_OK:
            avg, max_rtt = rtt  # renamed: don't shadow builtin max()
            self.carbon.metric_send("alert.pinger.%s.avgRoundTrip" % resource, avg)
            self.carbon.metric_send("alert.pinger.%s.maxRoundTrip" % resource, max_rtt)
            self.carbon.metric_send("alert.pinger.%s.availability" % resource, 100.0)
            if avg > CONF.ping_slow_critical:
                event = "PingSlow"
                severity = severity_code.CRITICAL
                text = "Node responded to ping in %s ms avg (> %s ms)" % (avg, CONF.ping_slow_critical)
            elif avg > CONF.ping_slow_warning:
                event = "PingSlow"
                severity = severity_code.WARNING
                text = "Node responded to ping in %s ms avg (> %s ms)" % (avg, CONF.ping_slow_warning)
            else:
                event = "PingOK"
                severity = severity_code.NORMAL
                text = "Node responding to ping avg/max %s/%s ms." % tuple(rtt)
            value = "%s/%s ms" % tuple(rtt)
        elif rc == PING_FAILED:
            event = "PingFailed"
            severity = severity_code.MAJOR
            text = "Node did not respond to ping or timed out within %s seconds" % CONF.ping_max_timeout
            value = "%s%% packet loss" % loss
            self.carbon.metric_send("alert.pinger.%s.availability" % resource, 100.0 - float(loss))
        elif rc == PING_ERROR:
            event = "PingError"
            severity = severity_code.WARNING
            text = "Could not ping node %s." % resource
            value = stdout
            self.carbon.metric_send("alert.pinger.%s.availability" % resource, 0.0)
        else:
            LOG.warning("Unknown ping return code: %s", rc)
            # BUG FIX: acknowledge the item before skipping it, as every
            # other continue path does; otherwise queue.join() hangs.
            self.queue.task_done()
            continue

        # Defaults
        resource += ":icmp"
        group = "Ping"
        correlate = _PING_ALERTS
        timeout = None
        threshold_info = None
        summary = None
        raw_data = stdout

        pingAlert = Alert(
            resource=resource,
            event=event,
            correlate=correlate,
            group=group,
            value=value,
            severity=severity,
            environment=environment,
            service=service,
            text=text,
            event_type="serviceAlert",
            tags=None,
            timeout=timeout,
            threshold_info=threshold_info,
            summary=summary,
            raw_data=raw_data,
        )

        suppress = pingAlert.transform_alert()
        if suppress:
            LOG.info("Suppressing %s alert", pingAlert.event)
            LOG.debug("%s", pingAlert)
        elif self.dedup.is_send(pingAlert):
            self.mq.send(pingAlert)

        self.queue.task_done()
        LOG.info("%s ping %s complete.", self.getName(), resource)

    # Acknowledge the shutdown sentinel dequeued before break.
    self.queue.task_done()
def parse_events(self, data):
    """Parse SolarWinds Orion event data into a list of alerts/heartbeats.

    ``data`` may be a sequence of row objects or one bare row; an empty
    sequence yields []. Suppressed rows are skipped and Heartbeat-typed
    alerts are converted to Heartbeat objects before being collected.
    """
    LOG.debug('Parsing solarwinds event data...')
    LOG.debug(data)
    try:
        data[0]
    except IndexError:
        return []
    try:
        data[0].c0
    except AttributeError:
        # A single row object was supplied; wrap it for uniform handling.
        data = [data]

    parsed = list()
    for row in data:
        LOG.debug(row)
        event_name = row.c4.replace(" ", "")
        when = datetime.datetime.strptime(row.c1[:-5] + 'Z',
                                          '%Y-%m-%dT%H:%M:%S.%fZ')
        fields = dict(
            resource='%s:%s' % (row.c2, row.c3.lower()),
            event=event_name,
            correlate=SOLAR_WINDS_CORRELATED_EVENTS.get(event_name, None),
            group='Orion',
            value='%s' % row.c6,
            severity=SOLAR_WINDS_SEVERITY_LEVELS.get(row.c7, None),
            environment=['INFRA'],
            service=['Network'],
            text='%s' % row.c5,
            event_type='solarwindsAlert',
            tags=None,
            threshold_info=None,
            summary=None,
            timeout=None,
            create_time=when,
            raw_data=repr(row),
        )
        candidate = Alert(**fields)

        if candidate.transform_alert():
            LOG.info('Suppressing %s alert', candidate.event)
            LOG.debug('%s', candidate)
            continue

        if candidate.get_type() == 'Heartbeat':
            candidate = Heartbeat(origin=candidate.origin, version='n/a',
                                  timeout=candidate.timeout)
        parsed.append(candidate)

    return parsed
def alertDynect(self):
    """Raise Dynect GSLB/pool serviceAlerts for resources seen in the
    current poll that were also present in the previous one."""
    for resource in self.info:
        if resource not in self.last_info:
            # Need two consecutive polls before alerting on a resource.
            continue

        current = self.info[resource]
        status = current['status']

        if resource.startswith('gslb-'):
            # gslb status = ok | unk | trouble | failover
            text = 'GSLB status is %s.' % status
            if status == 'ok':
                event, severity = 'GslbOK', severity_code.NORMAL
            else:
                event, severity = 'GslbNotOK', severity_code.CRITICAL
            correlate = ['GslbOK', 'GslbNotOK']
        elif resource.startswith('pool-'):
            # pool status = up | unk | down
            # pool serve_mode = obey | always | remove | no
            # pool weight (1-15)
            if 'down' in status:
                event, severity = 'PoolDown', severity_code.MAJOR
                text = 'Pool is down'
            elif 'obey' not in status:
                event, severity = 'PoolServe', severity_code.MAJOR
                text = 'Pool with an incorrect serve mode'
            elif self.check_weight(current['gslb'], resource) is False:
                event, severity = 'PoolWeightError', severity_code.MINOR
                text = 'Pool with an incorrect weight'
            else:
                event, severity = 'PoolUp', severity_code.NORMAL
                text = 'Pool status is normal'
            correlate = ['PoolUp', 'PoolDown', 'PoolServe', 'PoolWeightError']
        else:
            LOG.warning('Unknown resource type: %s', resource)
            continue

        dynectAlert = Alert(
            resource=resource,
            event=event,
            correlate=correlate,
            group='GSLB',
            value=status,
            severity=severity,
            environment=['PROD'],
            service=['Network'],
            text=text,
            event_type='serviceAlert',
            tags={},
            timeout=None,
            threshold_info=None,
            summary=None,
            raw_data=current['rawData'],
        )

        if dynectAlert.transform_alert():
            LOG.info('Suppressing %s alert', dynectAlert.event)
            LOG.debug('%s', dynectAlert)
            continue

        if self.dedup.is_send(dynectAlert):
            self.mq.send(dynectAlert)