def parse_notification(self, message):
    """Parse an SNS-delivered CloudWatch alarm notification into an Alert.

    :param message: raw JSON string of the SNS notification envelope
    :returns: an Alert, or None when the envelope has no 'Message' payload
              or when transform_alert() flags the alert for suppression
    """
    LOG.debug('Parsing CloudWatch notification message...')
    notification = json.loads(message)
    if 'Message' in notification:
        # The SNS 'Message' field is itself a JSON-encoded CloudWatch alarm.
        alarm = json.loads(notification['Message'])
    else:
        return

    # Defaults
    alertid = notification['MessageId']
    # NOTE(review): assumes the first Trigger dimension identifies the
    # resource -- confirm for multi-dimension alarms.
    resource = alarm['Trigger']['Dimensions'][0]['value']
    event = alarm['AlarmName']
    severity = self.cw_state_to_severity(alarm['NewStateValue'])
    previous_severity = self.cw_state_to_severity(alarm['OldStateValue'])
    group = 'CloudWatch'
    value = alarm['NewStateValue']
    text = alarm['AlarmDescription']
    environment = ['INFRA']
    service = [alarm['AWSAccountId']]  # XXX - use transform_alert() to map AWSAccountId to a useful name
    tags = {'Region': alarm['Region']}
    correlate = list()
    origin = notification['TopicArn']
    timeout = None
    threshold_info = alarm['NewStateReason']
    summary = notification['Subject']
    # SNS timestamps are ISO-8601 with milliseconds and a literal 'Z'.
    create_time = datetime.datetime.strptime(notification['Timestamp'], '%Y-%m-%dT%H:%M:%S.%fZ')
    raw_data = notification['Message']

    cloudwatchAlert = Alert(
        alertid=alertid,
        resource=resource,
        event=event,
        correlate=correlate,
        group=group,
        value=value,
        severity=severity,
        previous_severity=previous_severity,
        environment=environment,
        service=service,
        text=text,
        event_type='cloudwatchAlarm',
        tags=tags,
        origin=origin,
        timeout=timeout,
        threshold_info=threshold_info,
        summary=summary,
        create_time=create_time,
        raw_data=raw_data,
    )

    # Site-specific transform rules may decide the alert should be dropped.
    suppress = cloudwatchAlert.transform_alert()
    if suppress:
        LOG.info('Suppressing %s alert', event)
        LOG.debug('%s', cloudwatchAlert)
        return

    return cloudwatchAlert
def test_alert_receive_now(self):
    """Stamping an alert via receive_now() must set a datetime receive_time."""
    stamped = Alert(self.RESOURCE,
                    self.EVENT,
                    severity=self.SEVERITY,
                    environment=self.ENVIRONMENT)
    stamped.receive_now()
    self.assertIsInstance(stamped.receive_time, datetime.datetime)
def test_alert_translate(self):
    """Translating trap variables must rewrite both alert text and tags."""
    translated = Alert(self.RESOURCE, self.EVENT, text=self.TEXT, tags=self.TAGS)
    translated.translate_alert(self.trapvars)
    self.assertEquals(translated.text, 'foo is foo, bar was baz')
    self.assertEquals(translated.tags, {'Foo': '--foo--', 'Bar': 'bar'})
def create_alert():
    """Create a new alert from the JSON request body.

    Returns a JSON error response when the body cannot be parsed.
    """
    # Create a new alert
    try:
        newAlert = Alert.parse_alert(request.data)
    except ValueError as e:
        # 'except X as e' is valid on Python 2.6+ and required on Python 3;
        # the original 'except ValueError, e' form is Python-2 only.
        return jsonify(response={"status": "error", "message": str(e)})
def create_alert():
    """Create a new alert from the JSON request body.

    Returns a flat JSON error response when the body cannot be parsed.
    """
    # Create a new alert
    try:
        incomingAlert = Alert.parse_alert(request.data)
    except ValueError as e:
        # 'except X as e' is valid on Python 2.6+ and required on Python 3;
        # the original 'except ValueError, e' form is Python-2 only.
        return jsonify(status="error", message=str(e))
def on_message(self, headers, body):
    """Hold important alerts on a delay queue for email notification.

    Alerts are held for _EMAIL_HOLD_TIME seconds; an alert that clears
    (returns to NORMAL) while on hold is de-queued, so no email is sent
    for it.
    """
    LOG.debug("Received: %s", body)
    try:
        mailAlert = Alert.parse_alert(body)
    except ValueError:
        # Unparseable bodies are silently dropped.
        return

    alertid = mailAlert.get_id()
    severity = mailAlert.get_severity()
    previous_severity = mailAlert.previous_severity

    # Queue only alerts that are, or were, CRITICAL/MAJOR.  A previously
    # important alert is still processed so its clearance can cancel a
    # pending email below.
    if severity in [severity_code.CRITICAL, severity_code.MAJOR]:
        LOG.info('%s : Queue email because alert severity is important', alertid)
    elif previous_severity in [severity_code.CRITICAL, severity_code.MAJOR]:
        LOG.info('%s : Queue email because alert severity was important', alertid)
    else:
        LOG.info('%s : Do not queue email, not important enough', alertid)
        return

    hold_time = time.time() + _EMAIL_HOLD_TIME
    if alertid in self.onhold:
        if severity == severity_code.NORMAL:
            # Alert cleared while on hold - cancel the pending email.
            LOG.info('%s : De-queue alert because it has been cleared', alertid)
            del self.onhold[alertid]
        else:
            # Still firing - push the hold deadline out again.
            LOG.info('%s : Extend queue on-hold time to %s', alertid, hold_time)
            self.onhold[alertid] = (mailAlert, hold_time)
    else:
        LOG.info('%s : Queued alert on hold until %s', alertid, hold_time)
        self.onhold[alertid] = (mailAlert, hold_time)
def on_message(self, headers, body):
    """Validate headers and route the message onto the worker queue."""
    # Reject messages lacking the mandatory routing headers.
    if 'type' not in headers or 'correlation-id' not in headers:
        LOG.warning(
            'Malformed header missing "type" or "correlation-id": %s',
            headers)
        self.statsd.metric_send('alerta.alerts.rejected', 1)
        return

    LOG.info("Received %s %s", headers['type'], headers['correlation-id'])
    LOG.debug("Received body : %s", body)

    if headers['type'] == 'Heartbeat':
        heartbeat = Heartbeat.parse_heartbeat(body)
        if not heartbeat:
            return
        heartbeat.receive_now()
        LOG.debug('Queueing successfully parsed heartbeat %s', heartbeat.get_body())
        self.queue.put(heartbeat)
        return

    # Everything that is not a heartbeat is treated as an alert.
    try:
        alert = Alert.parse_alert(body)
    except ValueError:
        self.statsd.metric_send('alerta.alerts.rejected', 1)
        return
    if alert:
        alert.receive_now()
        LOG.debug('Queueing successfully parsed alert %s', alert.get_body())
        self.queue.put(alert)
def on_message(self, headers, body):
    """Queue a parsed Heartbeat or Alert from a validated MQ message."""
    required = ('type', 'correlation-id')
    if any(key not in headers for key in required):
        LOG.warning('Malformed header missing "type" or "correlation-id": %s', headers)
        self.statsd.metric_send('alerta.alerts.rejected', 1)
        return

    LOG.info("Received %s %s", headers['type'], headers['correlation-id'])
    LOG.debug("Received body : %s", body)

    if headers['type'] == 'Heartbeat':
        hb = Heartbeat.parse_heartbeat(body)
        if hb:
            hb.receive_now()
            LOG.debug('Queueing successfully parsed heartbeat %s', hb.get_body())
            self.queue.put(hb)
    else:
        try:
            incoming = Alert.parse_alert(body)
        except ValueError:
            self.statsd.metric_send('alerta.alerts.rejected', 1)
            return
        if incoming:
            incoming.receive_now()
            LOG.debug('Queueing successfully parsed alert %s', incoming.get_body())
            self.queue.put(incoming)
def main(self):
    """Send a heartbeat or an exception alert and return its id."""
    if CONF.heartbeat:
        # Tags, when supplied, are concatenated into a version tag.
        version_tag = ''.join(CONF.tags) if CONF.tags else None
        heartbeat = Heartbeat(origin=CONF.origin,
                              version=version_tag or Version,
                              timeout=CONF.timeout)
        LOG.debug(repr(heartbeat))
        api = ApiClient()
        api.send(heartbeat)
        return heartbeat.get_id()

    # Build an exception alert entirely from command-line configuration.
    exceptionAlert = Alert(
        resource=CONF.resource,
        event=CONF.event,
        correlate=CONF.correlate,
        group=CONF.group,
        value=CONF.value,
        status=CONF.status,
        severity=CONF.severity,
        environment=CONF.environment,
        service=CONF.service,
        text=CONF.text,
        event_type=CONF.event_type,
        tags=CONF.tags,
        origin=CONF.origin,
        threshold_info='n/a',  # TODO(nsatterl): make this configurable?
        summary=CONF.summary,
        timeout=CONF.timeout,
        raw_data='n/a',  # TODO(nsatterl): make this configurable?
        more_info=CONF.more_info,
        graph_urls=CONF.graph_urls,
    )
    LOG.debug(repr(exceptionAlert))
    api = ApiClient()
    api.send(exceptionAlert)
    return exceptionAlert.get_id()
def main(self):
    """Send a heartbeat or an exception alert and return its id."""
    if CONF.heartbeat:
        # Version comes from the 'Version' tag when present.
        hb = Heartbeat(origin=CONF.origin,
                       version=CONF.tags.get('Version', Version),
                       timeout=CONF.timeout)
        LOG.debug(hb)
        api = ApiClient()
        api.send(hb)
        return hb.get_id()

    # Build an exception alert entirely from command-line configuration.
    exceptionAlert = Alert(
        resource=CONF.resource,
        event=CONF.event,
        correlate=CONF.correlate,
        group=CONF.group,
        value=CONF.value,
        status=CONF.status,
        severity=CONF.severity,
        environment=CONF.environment,
        service=CONF.service,
        text=CONF.text,
        event_type=CONF.event_type,
        tags=CONF.tags,
        origin=CONF.origin,
        threshold_info='n/a',  # TODO(nsatterl): make this configurable?
        summary=CONF.summary,
        timeout=CONF.timeout,
        raw_data='n/a',  # TODO(nsatterl): make this configurable?
        more_info=CONF.more_info,
        graph_urls=CONF.graph_urls,
    )
    LOG.debug(repr(exceptionAlert))
    api = ApiClient()
    api.send(exceptionAlert)
    return exceptionAlert.get_id()
def test_alert_defaults(self):
    """An alert built from just resource/event must carry default values."""
    alert = Alert(self.RESOURCE, self.EVENT)
    checks = [
        (alert.previous_severity, self.PREVIOUS_SEVERITY),
        (alert.repeat, self.REPEAT),
        (alert.duplicate_count, self.DUPLICATE_COUNT),
        (alert.timeout, self.TIMEOUT),
    ]
    for actual, expected in checks:
        self.assertEquals(actual, expected)
def test_save_and_tag_alert(self):
    """Saving then tagging an alert must persist the tags in the database."""
    new_alert = Alert(self.RESOURCE, self.EVENT, receive_time=datetime.datetime.utcnow())
    saved_id = self.db.save_alert(new_alert)
    self.assertIsInstance(saved_id, basestring)
    self.db.tag_alert(new_alert.alertid, self.TAGS)
    stored = self.db.get_alert(new_alert.alertid)
    self.assertItemsEqual(stored.tags, [self.TAGS])
def test_alert_with_some_values(self):
    """An alert with severity/environment set must keep them and default the rest."""
    partial = Alert(self.RESOURCE, self.EVENT, severity=self.SEVERITY, environment=self.ENVIRONMENT)
    checks = [
        (partial.resource, self.RESOURCE),
        (partial.event, self.EVENT),
        (partial.group, 'Misc'),
        (partial.severity, self.SEVERITY),
        (partial.environment, self.ENVIRONMENT),
    ]
    for actual, expected in checks:
        self.assertEquals(actual, expected)
def get_alerts(self, query=None, fields=None, sort=None, limit=0):
    """Query the alerts collection and return matches as Alert objects.

    :param query: mongo query document (defaults to match-all)
    :param fields: field projection passed to find()
    :param sort: sort specification
    :param limit: maximum documents to return; 0 means no limit
    :returns: list of Alert objects, or None when nothing matched
    """
    query = query or dict()
    fields = fields or list()
    sort = sort or dict()

    responses = self.db.alerts.find(query, fields=fields, sort=sort).limit(limit)

    if not responses:
        # NOTE(review): find() returns a cursor object; whether an empty
        # cursor is falsy depends on the pymongo version -- confirm this
        # branch can actually trigger.
        LOG.warning(
            'Alert not found with query = %s, sort = %s, limit = %s',
            query, sort, limit)
        return None

    alerts = list()
    for response in responses:
        # Map the camelCase mongo document fields onto Alert kwargs.
        alerts.append(
            Alert(
                alertid=response['_id'],
                resource=response['resource'],
                event=response['event'],
                correlate=response['correlatedEvents'],
                group=response['group'],
                value=response['value'],
                status=response['status'],
                severity=response['severity'],
                previous_severity=response['previousSeverity'],
                environment=response['environment'],
                service=response['service'],
                text=response['text'],
                event_type=response['type'],
                tags=response['tags'],
                origin=response['origin'],
                repeat=response['repeat'],
                duplicate_count=response['duplicateCount'],
                threshold_info=response['thresholdInfo'],
                summary=response['summary'],
                timeout=response['timeout'],
                last_receive_id=response['lastReceiveId'],
                create_time=response['createTime'],
                expire_time=response['expireTime'],
                receive_time=response['receiveTime'],
                last_receive_time=response['lastReceiveTime'],
                trend_indication=response['trendIndication'],
                raw_data=response['rawData'],
                more_info=response['moreInfo'],
                graph_urls=response['graphUrls'],
                history=response['history'],
            ))
    return alerts
def on_message(self, headers, body):
    """Relay status changes of 'pagerduty'-tagged alerts to PagerDuty."""
    LOG.debug("Received: %s", body)
    try:
        pdAlert = Alert.parse_alert(body)
    except ValueError:
        return

    # Only alerts explicitly tagged for PagerDuty are forwarded.
    if 'pagerduty' not in pdAlert.tags:
        return

    status = pdAlert.status
    if status == status_code.OPEN:
        self.pd.trigger_event(pdAlert)
    elif status == status_code.ACK:
        self.pd.acknowledge_event(pdAlert)
    elif status == status_code.CLOSED:
        self.pd.resolve_event(pdAlert)
def on_message(self, headers, body):
    """Index a received alert as a logstash-style document in elasticsearch."""
    LOG.debug("Received: %s", body)
    try:
        logAlert = Alert.parse_alert(body)
    except ValueError:
        return
    if not logAlert:
        return

    LOG.info('%s : [%s] %s', logAlert.last_receive_id, logAlert.status, logAlert.summary)

    # resource is formatted '<host>:<path>'; split it for logstash fields.
    source_host, _, source_path = logAlert.resource.partition(':')
    document = {
        '@message': logAlert.summary,
        '@source': logAlert.resource,
        '@source_host': source_host,
        '@source_path': source_path,
        '@tags': logAlert.tags,
        '@timestamp': logAlert.last_receive_time,
        '@type': logAlert.event_type,
        '@fields': logAlert.get_body()
    }
    LOG.debug('Index payload %s', document)

    # The index name is date-based: es_index is a strftime pattern.
    index_url = "http://%s:%s/%s/%s" % (
        CONF.es_host, CONF.es_port,
        datetime.datetime.utcnow().strftime(CONF.es_index),
        logAlert.event_type)
    LOG.debug('Index URL: %s', index_url)

    try:
        response = urllib2.urlopen(
            index_url, json.dumps(document, cls=DateEncoder)).read()
    except Exception as e:
        LOG.error('%s : Alert indexing to %s failed - %s',
                  logAlert.last_receive_id, index_url, e)
        return

    try:
        es_id = json.loads(response)['_id']
        LOG.info('%s : Alert indexed at %s/%s',
                 logAlert.last_receive_id, index_url, es_id)
    except Exception as e:
        # BUG FIX: the original passed only one argument for two '%s'
        # placeholders (which makes logging raise a formatting error) and
        # misspelled 'response'.
        LOG.error('%s : Could not parse elasticsearch response: %s',
                  logAlert.last_receive_id, e)
def on_message(self, headers, body):
    """Send a PRIVMSG to the configured IRC channel for each alert.

    Uses a token bucket to rate-limit outgoing messages.
    """
    if not self.tokens.get_token():
        # Token bucket exhausted - drop the alert rather than flood IRC.
        LOG.warning('%s : No tokens left, rate limiting this alert', headers['correlation-id'])
        return

    LOG.debug("Received: %s", body)
    try:
        ircAlert = Alert.parse_alert(body)
    except ValueError:
        return

    if ircAlert:
        LOG.info('%s : Send IRC message to %s', ircAlert.get_id(), CONF.irc_channel)
        try:
            msg = 'PRIVMSG %s :%s [%s] %s' % (CONF.irc_channel,
                                              ircAlert.get_id(short=True),
                                              ircAlert.status,
                                              ircAlert.summary)
            self.irc.send(msg + '\r\n')
        except Exception as e:
            # 'except X as e' is valid on Python 2.6+ and required on
            # Python 3; the original comma form is Python-2 only.
            LOG.error('%s : IRC send failed - %s', ircAlert.get_id(), e)
def on_message(self, headers, body):
    """Route a parsed Heartbeat or *Alert message onto the worker queue."""
    LOG.info("Received %s %s", headers['type'], headers['correlation-id'])
    LOG.debug("Received body : %s", body)

    msg_type = headers['type']
    if msg_type == 'Heartbeat':
        heartbeat = Heartbeat.parse_heartbeat(body)
        if not heartbeat:
            return
        heartbeat.receive_now()
        LOG.debug('Queueing successfully parsed heartbeat %s', heartbeat.get_body())
        self.queue.put(heartbeat)
    elif msg_type.endswith('Alert'):
        try:
            parsed = Alert.parse_alert(body)
        except ValueError:
            self.statsd.metric_send('alerta.alerts.rejected', 1)
            return
        if not parsed:
            return
        parsed.receive_now()
        LOG.debug('Queueing successfully parsed alert %s', parsed.get_body())
        self.queue.put(parsed)
def on_message(self, headers, body):
    """Index a received alert as a logstash-style document in elasticsearch."""
    LOG.debug("Received: %s", body)
    try:
        logAlert = Alert.parse_alert(body)
    except ValueError:
        return
    if not logAlert:
        return

    LOG.info('%s : [%s] %s', logAlert.last_receive_id, logAlert.status, logAlert.summary)

    # resource is formatted '<host>:<path>'; split it for logstash fields.
    source_host, _, source_path = logAlert.resource.partition(':')
    document = {
        '@message': logAlert.summary,
        '@source': logAlert.resource,
        '@source_host': source_host,
        '@source_path': source_path,
        '@tags': logAlert.tags,
        '@timestamp': logAlert.last_receive_time,
        '@type': logAlert.event_type,
        '@fields': logAlert.get_body()
    }
    LOG.debug('Index payload %s', document)

    # The index name is date-based: es_index is a strftime pattern.
    index_url = "http://%s:%s/%s/%s" % (CONF.es_host, CONF.es_port,
                                        datetime.datetime.utcnow().strftime(CONF.es_index),
                                        logAlert.event_type)
    LOG.debug('Index URL: %s', index_url)

    try:
        response = urllib2.urlopen(index_url, json.dumps(document, cls=DateEncoder)).read()
    except Exception as e:
        LOG.error('%s : Alert indexing to %s failed - %s',
                  logAlert.last_receive_id, index_url, e)
        return

    try:
        es_id = json.loads(response)['_id']
        LOG.info('%s : Alert indexed at %s/%s',
                 logAlert.last_receive_id, index_url, es_id)
    except Exception as e:
        # BUG FIX: the original passed only one argument for two '%s'
        # placeholders (which makes logging raise a formatting error) and
        # misspelled 'response'.
        LOG.error('%s : Could not parse elasticsearch response: %s',
                  logAlert.last_receive_id, e)
def on_message(self, headers, body):
    """Hold important alerts on a delay queue for email notification."""
    LOG.debug("Received: %s", body)
    try:
        mailAlert = Alert.parse_alert(body)
    except ValueError:
        return

    alertid = mailAlert.get_id()
    severity = mailAlert.get_severity()
    previous_severity = mailAlert.previous_severity

    important = [severity_code.CRITICAL, severity_code.MAJOR]
    if severity in important:
        LOG.info('%s : Queue email because alert severity is important', alertid)
    elif previous_severity in important:
        LOG.info('%s : Queue email because alert severity was important', alertid)
    else:
        LOG.info('%s : Do not queue email, not important enough', alertid)
        return

    hold_time = time.time() + _EMAIL_HOLD_TIME
    if alertid not in self.onhold:
        # First sighting - start the hold timer.
        LOG.info('%s : Queued alert on hold until %s', alertid, hold_time)
        self.onhold[alertid] = (mailAlert, hold_time)
    elif severity == severity_code.NORMAL:
        # Alert cleared while on hold - cancel the pending email.
        LOG.info('%s : De-queue alert because it has been cleared', alertid)
        del self.onhold[alertid]
    else:
        # Still firing - push the hold deadline out again.
        LOG.info('%s : Extend queue on-hold time to %s', alertid, hold_time)
        self.onhold[alertid] = (mailAlert, hold_time)
def on_message(self, headers, body):
    """Forward alert status changes to PagerDuty, keyed by alert id.

    Alerts that originated from the PagerDuty webhook itself are ignored
    to avoid feedback loops.
    """
    LOG.debug("Received: %s", body)
    try:
        pdAlert = Alert.parse_alert(body)
    except ValueError:
        return

    # do not trigger new incidents from updates
    if pdAlert.origin == 'pagerduty/webhook':
        return

    # Idiom fix: test membership on the dict directly instead of .keys().
    if 'pagerduty' not in pdAlert.tags:
        return

    LOG.info('PagerDuty Incident %s status %s', pdAlert.get_id(), pdAlert.status)

    incident_key = pdAlert.get_id()
    if pdAlert.status == status_code.OPEN:
        self.pd.trigger_event(pdAlert, incident_key=incident_key)
    elif pdAlert.status == status_code.ACK:
        self.pd.acknowledge_event(pdAlert, incident_key=incident_key)
    elif pdAlert.status == status_code.CLOSED:
        self.pd.resolve_event(pdAlert, incident_key=incident_key)
def parse_notification(self, message):
    """Turn an SNS CloudWatch notification into an Alert (None if suppressed)."""
    LOG.debug('Parsing CloudWatch notification message...')
    envelope = json.loads(message)
    if 'Message' not in envelope:
        return
    # The SNS 'Message' field is itself a JSON-encoded CloudWatch alarm.
    alarm = json.loads(envelope['Message'])

    event = alarm['AlarmName']
    state = alarm['NewStateValue']

    cloudwatchAlert = Alert(
        alertid=envelope['MessageId'],
        resource=alarm['Trigger']['Dimensions'][0]['value'],
        event=event,
        correlate=list(),
        group='CloudWatch',
        value=state,
        severity=self.cw_state_to_severity(state),
        previous_severity=self.cw_state_to_severity(alarm['OldStateValue']),
        environment=['INFRA'],
        service=[alarm['AWSAccountId']],  # XXX - use transform_alert() to map AWSAccountId to a useful name
        text=alarm['AlarmDescription'],
        event_type='cloudwatchAlarm',
        tags={'Region': alarm['Region']},
        origin=envelope['TopicArn'],
        timeout=None,
        threshold_info=alarm['NewStateReason'],
        summary=envelope['Subject'],
        create_time=datetime.datetime.strptime(envelope['Timestamp'], '%Y-%m-%dT%H:%M:%S.%fZ'),
        raw_data=envelope['Message'],
    )

    if cloudwatchAlert.transform_alert():
        LOG.info('Suppressing %s alert', event)
        LOG.debug('%s', cloudwatchAlert)
        return

    return cloudwatchAlert
def get_alert(self, alertid=None, environment=None, resource=None, event=None, severity=None):
    """Look up a single alert document and return it as an Alert.

    Lookup strategies, in priority order:
      1. by alert id prefix (matching either _id or lastReceiveId),
      2. by environment/resource/event/severity,
      3. by environment/resource/event.

    :returns: an Alert, or None when nothing matches
    """
    if alertid:
        # Prefix regex so callers may pass a shortened id.
        query = {'$or': [{'_id': {'$regex': '^' + alertid}},
                         {'lastReceiveId': {'$regex': '^' + alertid}}]}
    elif severity:
        query = {"environment": environment, "resource": resource,
                 "event": event, "severity": severity}
    else:
        query = {"environment": environment, "resource": resource,
                 "event": event}

    response = self.db.alerts.find_one(query)
    LOG.debug('db.alerts.findOne(query=%s)', query)

    if not response:
        LOG.warning(
            'Alert not found with environment, resource, event, severity = %s %s %s %s',
            environment, resource, event, severity)
        return None

    # Map camelCase document fields onto Alert kwargs; missing fields
    # become None.
    return Alert(
        resource=response.get('resource', None),
        event=response.get('event', None),
        correlate=response.get('correlatedEvents', None),
        group=response.get('group', None),
        value=response.get('value', None),
        status=response.get('status', None),
        severity=response.get('severity', None),
        previous_severity=response.get('previousSeverity', None),
        environment=response.get('environment', None),
        service=response.get('service', None),
        text=response.get('text', None),
        event_type=response.get('type', None),
        tags=response.get('tags', None),
        origin=response.get('origin', None),
        repeat=response.get('repeat', None),
        duplicate_count=response.get('duplicateCount', None),
        threshold_info=response.get('thresholdInfo', None),
        summary=response.get('summary', None),
        timeout=response.get('timeout', None),
        alertid=response.get('_id', None),
        last_receive_id=response.get('lastReceiveId', None),
        create_time=response.get('createTime', None),
        expire_time=response.get('expireTime', None),
        receive_time=response.get('receiveTime', None),
        last_receive_time=response.get('lastReceiveTime', None),
        trend_indication=response.get('trendIndication', None),
        raw_data=response.get('rawData', None),
        more_info=response.get('moreInfo', None),
        graph_urls=response.get('graphUrls', None),
        history=response.get('history', None),
    )
def update_status(self, alertid=None, alert=None, status=None, text=None):
    """Atomically set an alert's status and append a history entry.

    The alert is identified either by id prefix (alertid) or, when alertid
    is not given, by the environment/resource/event (or correlated event)
    of the supplied alert.

    :returns: the updated Alert (history field excluded), or None when no
              document matched
    """
    if alertid:
        # Prefix regex so callers may pass a shortened id.
        query = {'$or': [{'_id': {'$regex': '^' + alertid}},
                         {'lastReceiveId': {'$regex': '^' + alertid}}]}
    else:
        query = {"environment": alert.environment,
                 "resource": alert.resource,
                 '$or': [{"event": alert.event},
                         {"correlatedEvents": alert.event}]}

    # History timestamps are stored timezone-aware in UTC.
    update_time = datetime.datetime.utcnow()
    update_time = update_time.replace(tzinfo=pytz.utc)

    # FIXME - no native find_and_modify method in this version of pymongo
    no_obj_error = "No matching object found"
    response = self.db.command("findAndModify", 'alerts',
                               allowable_errors=[no_obj_error],
                               query=query,
                               update={'$set': {"status": status},
                                       '$push': {"history": {"status": status,
                                                             "updateTime": update_time,
                                                             "text": text,
                                                             }}},
                               multi=False,
                               new=True,
                               fields={"history": 0})['value']

    if not response:
        LOG.warn('Alert %s not found - could not update status to %s', alertid, status)
        return

    # Map the camelCase document fields back onto an Alert object.
    return Alert(
        alertid=response['_id'],
        resource=response['resource'],
        event=response['event'],
        correlate=response['correlatedEvents'],
        group=response['group'],
        value=response['value'],
        status=response['status'],
        severity=response['severity'],
        previous_severity=response['previousSeverity'],
        environment=response['environment'],
        service=response['service'],
        text=response['text'],
        event_type=response['type'],
        tags=response['tags'],
        origin=response['origin'],
        repeat=response['repeat'],
        duplicate_count=response['duplicateCount'],
        threshold_info=response['thresholdInfo'],
        summary=response['summary'],
        timeout=response['timeout'],
        last_receive_id=response['lastReceiveId'],
        create_time=response['createTime'],
        expire_time=response['expireTime'],
        receive_time=response['receiveTime'],
        last_receive_time=response['lastReceiveTime'],
        trend_indication=response['trendIndication'],
        raw_data=response['rawData'],
        more_info=response['moreInfo'],
        graph_urls=response['graphUrls'],
    )
def duplicate_alert(self, alert): update = { "correlatedEvents": alert.correlate, "group": alert.group, "value": alert.value, "service": alert.service, "text": alert.text, "tags": alert.tags, "origin": alert.origin, "repeat": True, "thresholdInfo": alert.threshold_info, "summary": alert.summary, "timeout": alert.timeout, "lastReceiveId": alert.alertid, "expireTime": alert.expire_time, "lastReceiveTime": alert.receive_time, "rawData": alert.raw_data, "moreInfo": alert.more_info, "graphUrls": alert.graph_urls, } # FIXME - no native find_and_modify method in this version of pymongo no_obj_error = "No matching object found" response = self.db.command("findAndModify", 'alerts', allowable_errors=[no_obj_error], query={ "environment": alert.environment, "resource": alert.resource, "event": alert.event }, update={ '$set': update, '$inc': { "duplicateCount": 1 } }, new=True, fields={"history": 0})['value'] return Alert( alertid=response['_id'], resource=response['resource'], event=response['event'], correlate=response['correlatedEvents'], group=response['group'], value=response['value'], status=response['status'], severity=response['severity'], previous_severity=response['previousSeverity'], environment=response['environment'], service=response['service'], text=response['text'], event_type=response['type'], tags=response['tags'], origin=response['origin'], repeat=response['repeat'], duplicate_count=response['duplicateCount'], threshold_info=response['thresholdInfo'], summary=response['summary'], timeout=response['timeout'], last_receive_id=response['lastReceiveId'], create_time=response['createTime'], expire_time=response['expireTime'], receive_time=response['receiveTime'], last_receive_time=response['lastReceiveTime'], trend_indication=response['trendIndication'], raw_data=response['rawData'], more_info=response['moreInfo'], graph_urls=response['graphUrls'], )
try: data = json.loads(request.data) except Exception, e: return jsonify(response={"status": "error", "message": str(e)}) try: newAlert = Alert( resource=data.get('resource', None), event=data.get('event', None), correlate=data.get('correlatedEvents', None), group=data.get('group', None), value=data.get('value', None), severity=severity_code.parse_severity(data.get('severity', None)), environment=data.get('environment', None), service=data.get('service', None), text=data.get('text', None), event_type=data.get('type', 'exceptionAlert'), tags=data.get('tags', None), origin=data.get('origin', None), threshold_info=data.get('thresholdInfo', None), timeout=data.get('timeout', None), alertid=data.get('id', None), raw_data=data.get('rawData', None), more_info=data.get('moreInfo', None), graph_urls=data.get('graphUrls', None), ) except ValueError, e: return jsonify(response={"status": "error", "message": str(e)}) LOG.debug('New alert %s', newAlert) mq.send(newAlert)
def alertDynect(self):
    """Generate alerts from polled Dynect GSLB and pool state.

    Iterates self.info (current poll) and only alerts on resources that
    were also present in self.last_info (previous poll), so brand-new
    resources do not alert on first sight.
    """
    for resource in self.info:

        # Skip resources not seen in the previous poll.
        if resource not in self.last_info:
            continue

        if resource.startswith('gslb-'):

            # gslb status = ok | unk | trouble | failover
            text = 'GSLB status is %s.' % self.info[resource]['status']

            if self.info[resource]['status'] == 'ok':
                event = 'GslbOK'
                severity = severity_code.NORMAL
            else:
                event = 'GslbNotOK'
                severity = severity_code.CRITICAL
            correlate = ['GslbOK', 'GslbNotOK']

        elif resource.startswith('pool-'):

            # pool status = up | unk | down
            # pool serve_mode = obey | always | remove | no
            # pool weight (1-15)

            if 'down' in self.info[resource]['status']:
                event = 'PoolDown'
                severity = severity_code.MAJOR
                text = 'Pool is down'
            elif 'obey' not in self.info[resource]['status']:
                event = 'PoolServe'
                severity = severity_code.MAJOR
                text = 'Pool with an incorrect serve mode'
            elif self.check_weight(self.info[resource]['gslb'], resource) is False:
                event = 'PoolWeightError'
                severity = severity_code.MINOR
                text = 'Pool with an incorrect weight'
            else:
                event = 'PoolUp'
                severity = severity_code.NORMAL
                text = 'Pool status is normal'
            correlate = ['PoolUp', 'PoolDown', 'PoolServe', 'PoolWeightError']

        else:
            LOG.warning('Unknown resource type: %s', resource)
            continue

        # Defaults
        group = 'GSLB'
        value = self.info[resource]['status']
        environment = ['PROD']
        service = ['Network']
        tags = dict()
        timeout = None
        threshold_info = None
        summary = None
        raw_data = self.info[resource]['rawData']

        dynectAlert = Alert(
            resource=resource,
            event=event,
            correlate=correlate,
            group=group,
            value=value,
            severity=severity,
            environment=environment,
            service=service,
            text=text,
            event_type='serviceAlert',
            tags=tags,
            timeout=timeout,
            threshold_info=threshold_info,
            summary=summary,
            raw_data=raw_data,
        )

        # Site-specific transform rules may flag the alert for suppression.
        suppress = dynectAlert.transform_alert()
        if suppress:
            LOG.info('Suppressing %s alert', dynectAlert.event)
            LOG.debug('%s', dynectAlert)
            continue

        # De-duplicate before publishing to the message queue.
        if self.dedup.is_send(dynectAlert):
            self.mq.send(dynectAlert)
def test_alert_with_all_values(self):
    """Every constructor argument must round-trip onto the alert object."""
    alert = Alert(resource=self.RESOURCE,
                  event=self.EVENT,
                  correlate=self.CORRELATE,
                  group=self.GROUP,
                  value=self.VALUE,
                  status=self.STATUS,
                  severity=self.SEVERITY,
                  previous_severity=self.PREVIOUS_SEVERITY,
                  environment=self.ENVIRONMENT,
                  service=self.SERVICE,
                  text=self.TEXT,
                  event_type=self.EVENT_TYPE,
                  tags=self.TAGS,
                  origin=self.ORIGIN,
                  repeat=self.REPEAT,
                  duplicate_count=self.DUPLICATE_COUNT,
                  threshold_info=self.THRESHOLD_INFO,
                  summary=self.SUMMARY,
                  timeout=self.TIMEOUT,
                  alertid=self.ALERTID,
                  create_time=self.CREATE_TIME,
                  expire_time=self.EXPIRE_TIME,
                  receive_time=self.RECEIVE_TIME,
                  last_receive_time=self.RECEIVE_TIME,
                  trend_indication=self.TREND_INDICATION,
                  raw_data=self.RAW_DATA,
                  more_info=self.MORE_INFO,
                  graph_urls=self.GRAPH_URLS,
                  history=self.HISTORY)
    # Compare each attribute against the value it was constructed with,
    # in the same order as the original assertions.
    checks = [
        (alert.resource, self.RESOURCE),
        (alert.event, self.EVENT),
        (alert.correlate, self.CORRELATE),
        (alert.group, self.GROUP),
        (alert.value, self.VALUE),
        (alert.status, self.STATUS),
        (alert.severity, self.SEVERITY),
        (alert.previous_severity, self.PREVIOUS_SEVERITY),
        (alert.environment, self.ENVIRONMENT),
        (alert.service, self.SERVICE),
        (alert.text, self.TEXT),
        (alert.event_type, self.EVENT_TYPE),
        (alert.tags, self.TAGS),
        (alert.origin, self.ORIGIN),
        (alert.repeat, self.REPEAT),
        (alert.duplicate_count, self.DUPLICATE_COUNT),
        (alert.threshold_info, self.THRESHOLD_INFO),
        (alert.summary, self.SUMMARY),
        (alert.timeout, self.TIMEOUT),
        (alert.alertid, self.ALERTID),
        (alert.last_receive_id, self.ALERTID),
        (alert.create_time, self.CREATE_TIME),
        (alert.expire_time, self.EXPIRE_TIME),
        (alert.receive_time, self.RECEIVE_TIME),
        (alert.trend_indication, self.TREND_INDICATION),
        (alert.raw_data, self.RAW_DATA),
        (alert.more_info, self.MORE_INFO),
        (alert.graph_urls, self.GRAPH_URLS),
        (alert.history, self.HISTORY),
    ]
    for actual, expected in checks:
        self.assertEquals(actual, expected)
def parse_events(self, data):
    """Convert SolarWinds (Orion) SOAP row data into a list of alert objects.

    :param data: row(s) returned by the SolarWinds query; either a list of
                 row objects or a single bare row
    :returns: list of Alert (or Heartbeat) objects; empty list when there
              are no events
    """
    LOG.debug('Parsing solarwinds event data...')
    LOG.debug(data)

    # No rows at all -> nothing to parse.
    try:
        data[0]
    except IndexError:
        return []

    # A single row comes back bare (no .c0 on the container); wrap it so
    # the loop below can treat both shapes uniformly.
    try:
        data[0].c0
    except AttributeError:
        data = [data]

    solarwindsAlerts = list()

    for row in data:
        LOG.debug(row)

        # NOTE(review): columns c1..c7 are positional fields of the Orion
        # event row -- c1 timestamp, c2/c3 node, c4 event name, c5 text,
        # c6 value, c7 severity level; confirm against the query schema.
        event = row.c4.replace(" ", "")
        correlate = SOLAR_WINDS_CORRELATED_EVENTS.get(event, None)
        resource = '%s:%s' % (row.c2, row.c3.lower())
        severity = SOLAR_WINDS_SEVERITY_LEVELS.get(row.c7, None)
        group = 'Orion'
        value = '%s' % row.c6
        text = '%s' % row.c5
        environment = ['INFRA']
        service = ['Network']
        tags = None
        timeout = None
        threshold_info = None
        summary = None
        raw_data = repr(row)
        # Strip the trailing UTC-offset suffix before parsing the timestamp.
        create_time = datetime.datetime.strptime(row.c1[:-5] + 'Z', '%Y-%m-%dT%H:%M:%S.%fZ')

        solarwindsAlert = Alert(
            resource=resource,
            event=event,
            correlate=correlate,
            group=group,
            value=value,
            severity=severity,
            environment=environment,
            service=service,
            text=text,
            event_type='solarwindsAlert',
            tags=tags,
            threshold_info=threshold_info,
            summary=summary,
            timeout=timeout,
            create_time=create_time,
            raw_data=raw_data,
        )

        # Site-specific transform rules may flag the alert for suppression.
        suppress = solarwindsAlert.transform_alert()
        if suppress:
            LOG.info('Suppressing %s alert', solarwindsAlert.event)
            LOG.debug('%s', solarwindsAlert)
            continue

        # Transform rules may reclassify an event as a heartbeat.
        if solarwindsAlert.get_type() == 'Heartbeat':
            solarwindsAlert = Heartbeat(origin=solarwindsAlert.origin,
                                        version='n/a',
                                        timeout=solarwindsAlert.timeout)

        solarwindsAlerts.append(solarwindsAlert)

    return solarwindsAlerts
def run(self):
    """Worker loop: pull ping requests off the queue, ping, and alert.

    Queue items are (environment, service, resource, retries, queue_time)
    tuples; a falsy item is the shutdown sentinel.
    """
    while True:
        LOG.debug("Waiting on input queue...")
        item = self.queue.get()

        # Falsy sentinel -> shut this worker down.
        if not item:
            LOG.info("%s is shutting down.", self.getName())
            break

        environment, service, resource, retries, queue_time = item

        # Drop requests that sat on the queue longer than one poll loop.
        if time.time() - queue_time > CONF.loop_every:
            LOG.warning("Ping request to %s expired after %d seconds.",
                        resource, int(time.time() - queue_time))
            self.queue.task_done()
            continue

        LOG.info("%s pinging %s...", self.getName(), resource)
        # Retry attempts use a short ping; the final attempt a longer one.
        if retries > 1:
            rc, rtt, loss, stdout = self.pinger(resource, count=2, timeout=5)
        else:
            rc, rtt, loss, stdout = self.pinger(resource, count=5, timeout=CONF.ping_max_timeout)

        # Failed but retries remain: re-queue with one fewer retry.
        if rc != PING_OK and retries:
            LOG.info("Retrying ping %s %s more times", resource, retries)
            self.queue.put((environment, service, resource, retries - 1, time.time()))
            self.queue.task_done()
            continue

        if rc == PING_OK:
            # rtt is (avg, max); note 'max' shadows the builtin here.
            avg, max = rtt
            self.carbon.metric_send("alert.pinger.%s.avgRoundTrip" % resource, avg)
            self.carbon.metric_send("alert.pinger.%s.maxRoundTrip" % resource, max)
            self.carbon.metric_send("alert.pinger.%s.availability" % resource, 100.0)

            if avg > CONF.ping_slow_critical:
                event = "PingSlow"
                severity = severity_code.CRITICAL
                text = "Node responded to ping in %s ms avg (> %s ms)" % (avg, CONF.ping_slow_critical)
            elif avg > CONF.ping_slow_warning:
                event = "PingSlow"
                severity = severity_code.WARNING
                text = "Node responded to ping in %s ms avg (> %s ms)" % (avg, CONF.ping_slow_warning)
            else:
                event = "PingOK"
                severity = severity_code.NORMAL
                text = "Node responding to ping avg/max %s/%s ms." % tuple(rtt)
            value = "%s/%s ms" % tuple(rtt)
        elif rc == PING_FAILED:
            event = "PingFailed"
            severity = severity_code.MAJOR
            text = "Node did not respond to ping or timed out within %s seconds" % CONF.ping_max_timeout
            value = "%s%% packet loss" % loss
            self.carbon.metric_send("alert.pinger.%s.availability" % resource, 100.0 - float(loss))
        elif rc == PING_ERROR:
            event = "PingError"
            severity = severity_code.WARNING
            text = "Could not ping node %s." % resource
            value = stdout
            self.carbon.metric_send("alert.pinger.%s.availability" % resource, 0.0)
        else:
            LOG.warning("Unknown ping return code: %s", rc)
            # NOTE(review): this continue skips queue.task_done(), which
            # would make queue.join() hang if it ever fires -- confirm.
            continue

        # Defaults
        resource += ":icmp"
        group = "Ping"
        correlate = _PING_ALERTS
        timeout = None
        threshold_info = None
        summary = None
        raw_data = stdout

        pingAlert = Alert(
            resource=resource,
            event=event,
            correlate=correlate,
            group=group,
            value=value,
            severity=severity,
            environment=environment,
            service=service,
            text=text,
            event_type="serviceAlert",
            tags=None,
            timeout=timeout,
            threshold_info=threshold_info,
            summary=summary,
            raw_data=raw_data,
        )

        # Suppress via transform rules, otherwise de-duplicate and send.
        suppress = pingAlert.transform_alert()
        if suppress:
            LOG.info("Suppressing %s alert", pingAlert.event)
            LOG.debug("%s", pingAlert)
        elif self.dedup.is_send(pingAlert):
            self.mq.send(pingAlert)

        self.queue.task_done()
        LOG.info("%s ping %s complete.", self.getName(), resource)

    # Acknowledge the shutdown sentinel itself.
    self.queue.task_done()
def metric_check(self, rules):
    """Evaluate each Ganglia threshold rule against current metric data.

    For every rule: fetch the metrics it references, substitute metric values
    into the rule's value/thresholdInfo/text templates per matching resource,
    evaluate the resulting value expression, compare it against each
    'severity:op:threshold' entry and send an alert on the first match.

    NOTE(review): rule['value'] and the threshold comparisons are run through
    eval() — safe only if rules come from trusted configuration; never feed
    this untrusted rule files.
    """
    for rule in rules:
        # Check rule is valid
        if len(rule['thresholdInfo']) != len(rule['text']):
            LOG.warning(
                'Skipping invalid rule %s - MUST define alert text for each threshold.', rule['event'])
            continue
        # Get list of metrics required to evaluate each rule
        params = dict()
        if 'filter' in rule and rule['filter'] is not None:
            params[rule['filter']] = 1
        # Collect every '$metric' token referenced by the rule templates
        # (the '$now' pseudo-metric is handled locally, not fetched).
        for s in (' '.join(rule['text']), ' '.join(rule['thresholdInfo']), rule['value']):
            matches = re.findall('\$([a-z0-9A-Z_]+)', s)
            for m in matches:
                if m != 'now':
                    params['metric=' + m] = 1
        metric_filter = '&'.join(params.keys())
        LOG.debug('Metric filter = %s', metric_filter)
        # Get metric data for each rule
        response = GangliaDaemon.get_metrics(metric_filter)
        LOG.debug('Ganglia API response: %s', response)
        # Make non-metric substitutions in value, thresholdInfo and text
        now = int(time.time())
        rule['value'] = re.sub('\$now', str(now), rule['value'])
        idx = 0
        for threshold in rule['thresholdInfo']:
            rule['thresholdInfo'][idx] = re.sub('\$now', str(now), threshold)
            idx += 1
        idx = 0
        for text in rule['text']:
            rule['text'][idx] = re.sub(
                '\$now', time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(now)), text)
            idx += 1
        # Per-resource working state built up while scanning the metrics.
        metric = dict()
        for m in response:
            # Make metric-based substitutions in resource eg. per instance, host or cluster
            resource = re.sub('\$instance', m.get('instance', '__NA__'), rule['resource'])
            resource = re.sub('\$host', m.get('host', '__NA__'), resource)
            resource = re.sub('\$cluster', m.get('cluster', '__NA__'), resource)
            # '__NA__' survives only if the rule asked for a dimension the
            # metric does not have, i.e. this metric can't match the rule.
            if '__NA__' in resource:
                LOG.debug('Metric %s doesnt match resource rule %s', m['id'], rule['resource'])
                continue
            LOG.debug('Metric %s matches rule %s => %s', m['id'], rule['resource'], resource)
            # Don't generate cluster alerts from host-based metrics
            if 'host' in m and not '$host' in rule['resource']:
                LOG.debug(
                    'Skipping host-based metric for cluster-based rule')
                continue
            # Build up info for alert if metric value triggers threshold
            if resource not in metric:
                metric[resource] = dict()
            if 'thresholdInfo' not in metric[resource]:
                # Copy so per-resource substitutions don't mutate the rule.
                metric[resource]['thresholdInfo'] = list(
                    rule['thresholdInfo'])
                LOG.debug('Set thresholdInfo to %s', metric[resource]['thresholdInfo'])
            if 'text' not in metric[resource]:
                metric[resource]['text'] = list(rule['text'])
                LOG.debug('Set text to %s', metric[resource]['text'])
            if m['metric'] in rule['value']:
                # Determine service and environment from rule if given
                if 'environment' in rule:
                    metric[resource]['environment'] = [rule['environment']]
                else:
                    metric[resource]['environment'] = [m['environment']]
                LOG.debug('Set environment for alert to %s', metric[resource]['environment'])
                if 'service' in rule:
                    metric[resource]['service'] = [rule['service']]
                else:
                    metric[resource]['service'] = [m['service']]
                LOG.debug('Set service for alert to %s', metric[resource]['service'])
                # Use raw metric value, or sum or average if aggregated metric
                if 'value' in m:
                    v = GangliaDaemon.quote(m['value'])  # raw value
                elif rule['value'].endswith('.sum'):
                    v = GangliaDaemon.quote(
                        m['sum'])  # aggregated sum value if "<metric>.sum"
                else:
                    try:
                        v = "%.1f" % (float(m['sum']) / float(m['num'])
                                      )  # average of aggregate value
                    except ZeroDivisionError:
                        v = 0.0
                LOG.debug('Value for %s on %s is %s', m['id'], resource, v)
                # If no value assign rule value
                if 'value' not in metric[resource]:
                    metric[resource]['value'] = rule['value']
                # Replace '$metric' (or '$metric.sum') with the numeric value.
                metric[resource]['value'] = re.sub(
                    '\$%s(\.sum)?' % m['metric'], str(v), metric[resource]['value'])
                metric[resource]['units'] = m['units']
                # Assign tags
                metric[resource]['tags'] = list()
                metric[resource]['tags'].extend(rule['tags'])
                metric[resource]['tags'].append('cluster:%s' % m['cluster'])
                if 'tags' in m and m['tags'] is not None:
                    metric[resource]['tags'].extend(m['tags'])
                # Assign graph URL
                # NOTE(review): key checked is 'graphUrl' but the list is
                # stored under 'graphUrls' — looks like a latent typo; verify
                # against the upstream Ganglia daemon before changing.
                if 'graphUrl' not in metric[resource]:
                    metric[resource]['graphUrls'] = list()
                if 'graphUrl' in m:
                    metric[resource]['graphUrls'].append(m['graphUrl'])
                for g in rule['graphs']:
                    if '$host' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['graphUrls'].append(
                            '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                            '/graph.php?c=%s&h=%s&m=%s&r=1day&v=0&z=default'
                            % (m['cluster'], m['host'], g))
                    if '$cluster' in rule['resource'] and 'graphUrl' in m:
                        metric[resource]['graphUrls'].append(
                            '/'.join(m['graphUrl'].rsplit('/', 2)[0:2]) +
                            '/graph.php?c=%s&m=%s&r=1day&v=0&z=default'
                            % (m['cluster'], g))
                metric[resource]['moreInfo'] = ''
                if '$host' in rule['resource'] and 'graphUrl' in m:
                    metric[resource]['moreInfo'] = '/'.join(
                        m['graphUrl'].rsplit('/', 2)
                        [0:2]) + '/?c=%s&h=%s' % (m['cluster'], m['host'])
                if '$cluster' in rule['resource'] and 'graphUrl' in m:
                    metric[resource]['moreInfo'] = '/'.join(
                        m['graphUrl'].rsplit(
                            '/', 2)[0:2]) + '/?c=%s' % m['cluster']
            # Substitutions for threshold info
            if m['metric'] in ''.join(rule['thresholdInfo']):
                LOG.debug('Text to be substituted: %s', ''.join(rule['thresholdInfo']))
                if 'value' in m:
                    v = GangliaDaemon.quote(m['value'])
                elif rule['value'].endswith('.sum'):
                    v = GangliaDaemon.quote(m['sum'])
                else:
                    try:
                        v = "%.1f" % (float(m['sum']) / float(m['num']))
                    except ZeroDivisionError:
                        v = 0.0
                idx = 0
                for threshold in metric[resource]['thresholdInfo']:
                    metric[resource]['thresholdInfo'][idx] = re.sub(
                        '\$%s(\.sum)?' % m['metric'], str(v), threshold)
                    idx += 1
            # Substitutions for text
            if m['metric'] in ''.join(rule['text']):
                LOG.debug('Text to be substituted: %s', ''.join(rule['text']))
                if 'value' in m:
                    v = GangliaDaemon.quote(m['value'])
                elif rule['value'].endswith('.sum'):
                    v = GangliaDaemon.quote(m['sum'])
                else:
                    try:
                        v = "%.1f" % (float(m['sum']) / float(m['num']))
                    except ZeroDivisionError:
                        v = 0.0
                # Render timestamp metrics as human-readable local time.
                if m['type'] == 'timestamp' or m['units'] == 'timestamp':
                    v = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(float(v)))
                LOG.debug('Metric resource text %s', metric)
                idx = 0
                for text in metric[resource]['text']:
                    metric[resource]['text'][idx] = re.sub(
                        '\$%s(\.sum)?' % m['metric'], str(v), text)
                    idx += 1
        LOG.debug('end of metric loop')
        for resource in metric:
            LOG.debug('Calculate final value for resource %s', resource)
            index = 0
            try:
                # Evaluate the fully-substituted arithmetic expression.
                calculated_value = eval(metric[resource]['value'])
            except KeyError:
                LOG.warning(
                    'Could not calculate %s value for %s because %s is not being reported',
                    rule['event'], resource, rule['value'])
                continue
            except (SyntaxError, NameError):
                LOG.error(
                    'Could not calculate %s value for %s => eval(%s)',
                    rule['event'], resource, metric[resource]['value'])
                continue
            except ZeroDivisionError:
                LOG.debug(
                    'Could not calculate %s value for %s => eval(%s) (division by zero). Setting to 0 instead.',
                    rule['event'], resource, metric[resource]['value'])
                calculated_value = 0
            except Exception:
                LOG.error(
                    'Could not calculate %s value for %s => eval(%s) (threw unknown exception)',
                    rule['event'], resource, metric[resource]['value'])
                continue
            LOG.debug('Calculated value for resource %s => %s', resource, calculated_value)
            # Compare final value with each threshold
            for ti in metric[resource]['thresholdInfo']:
                # Each entry is 'severity:operator:threshold'.
                severity, op, threshold = ti.split(':')
                rule_eval = '%s %s %s' % (
                    GangliaDaemon.quote(calculated_value), op, threshold)
                try:
                    result = eval(rule_eval)
                except SyntaxError:
                    LOG.error(
                        'Could not evaluate %s threshold for %s => eval(%s)',
                        rule['event'], resource, rule_eval)
                    result = False
                if result:
                    event = rule['event']
                    group = rule['group']
                    value = "%s%s" % (calculated_value,
                                      GangliaDaemon.format_units(
                                          metric[resource]['units']))
                    environment = metric[resource]['environment']
                    service = metric[resource]['service']
                    text = metric[resource]['text'][index]
                    tags = metric[resource]['tags']
                    threshold_info = ','.join(rule['thresholdInfo'])
                    more_info = metric[resource]['moreInfo']
                    graph_urls = metric[resource]['graphUrls']
                    gangliaAlert = Alert(
                        resource=resource,
                        event=event,
                        group=group,
                        value=value,
                        severity=severity,
                        environment=environment,
                        service=service,
                        text=text,
                        event_type='gangliaAlert',
                        tags=tags,
                        threshold_info=threshold_info,
                        more_info=more_info,
                        graph_urls=graph_urls,
                        raw_data=
                        '',  # TODO(nsatterl): put raw metric values used to do calculation here
                    )
                    if self.dedup.is_send(gangliaAlert):
                        self.mq.send(gangliaAlert)
                    break  # First match wins
                index += 1
def parse_events(self, data):
    """Turn raw SolarWinds (Orion) event rows into Alert/Heartbeat objects.

    Returns a list of alerts; suppressed events are skipped and heartbeat
    events are converted to Heartbeat messages.
    """
    LOG.debug('Parsing solarwinds event data...')
    LOG.debug(data)
    # An empty result set produces no alerts.
    try:
        data[0]
    except IndexError:
        return []
    # A single event arrives as a bare row object rather than a list; wrap it.
    try:
        data[0].c0
    except AttributeError:
        data = [data]
    parsed = list()
    for row in data:
        LOG.debug(row)
        event_name = row.c4.replace(" ", "")
        alert = Alert(
            resource='%s:%s' % (row.c2, row.c3.lower()),
            event=event_name,
            correlate=SOLAR_WINDS_CORRELATED_EVENTS.get(event_name, None),
            group='Orion',
            value='%s' % row.c6,
            severity=SOLAR_WINDS_SEVERITY_LEVELS.get(row.c7, None),
            environment=['INFRA'],
            service=['Network'],
            text='%s' % row.c5,
            event_type='solarwindsAlert',
            tags=None,
            threshold_info=None,
            summary=None,
            timeout=None,
            # Orion timestamps carry a trailing offset; trim it and treat as Zulu.
            create_time=datetime.datetime.strptime(row.c1[:-5] + 'Z', '%Y-%m-%dT%H:%M:%S.%fZ'),
            raw_data=repr(row),
        )
        if alert.transform_alert():
            LOG.info('Suppressing %s alert', alert.event)
            LOG.debug('%s', alert)
            continue
        if alert.get_type() == 'Heartbeat':
            alert = Heartbeat(origin=alert.origin, version='n/a', timeout=alert.timeout)
        parsed.append(alert)
    return parsed
# NOTE(review): fragment of a urlmon check handler — 'event', 'value',
# 'severity', 'warn_thold' and 'crit_thold' are assigned by code outside
# this chunk; confirm against the enclosing function.
resource = check['resource']
correlate = _HTTP_ALERTS
group = 'Web'
environment = check['environment']
service = check['service']
text = text  # self-assignment; 'text' was computed earlier in the enclosing function
tags = check.get('tags', dict())
# Human-readable description of the warning/critical response-time thresholds.
threshold_info = "%s : RT > %d RT > %d x %s" % (check['url'], warn_thold, crit_thold, check.get('count', 1))
urlmonAlert = Alert(
    resource=resource,
    event=event,
    correlate=correlate,
    group=group,
    value=value,
    severity=severity,
    environment=environment,
    service=service,
    text=text,
    event_type='serviceAlert',
    tags=tags,
    threshold_info=threshold_info,
)
suppress = urlmonAlert.transform_alert()
if suppress:
    LOG.info('Suppressing %s alert', urlmonAlert.event)
    LOG.debug('%s', urlmonAlert)
elif self.dedup.is_send(urlmonAlert):
    # Only forward if the de-duplicator says the alert state changed.
    self.mq.send(urlmonAlert)
def run(self):
    """Consume ping requests from the input queue until a falsy (sentinel) item is received.

    Each queue item is a (environment, service, resource, retries, queue_time)
    tuple.  Results are pushed to carbon as metrics and, unless suppressed or
    de-duplicated, sent on the message queue as a serviceAlert.
    """
    while True:
        LOG.debug('Waiting on input queue...')
        item = self.queue.get()
        if not item:
            # Falsy item is the shutdown sentinel.
            LOG.info('%s is shutting down.', self.getName())
            break
        environment, service, resource, retries, queue_time = item
        # Drop requests that waited longer than one polling loop.
        if time.time() - queue_time > CONF.loop_every:
            LOG.warning('Ping request to %s expired after %d seconds.', resource,
                        int(time.time() - queue_time))
            self.queue.task_done()
            continue
        LOG.info('%s pinging %s...', self.getName(), resource)
        if retries > 1:
            # Quick probe while retries remain; full probe on the final attempt.
            rc, rtt, loss, stdout = self.pinger(resource, count=2, timeout=5)
        else:
            rc, rtt, loss, stdout = self.pinger(
                resource, count=5, timeout=CONF.ping_max_timeout)
        if rc != PING_OK and retries:
            # Re-queue with a fresh timestamp and one fewer retry.
            LOG.info('Retrying ping %s %s more times', resource, retries)
            self.queue.put(
                (environment, service, resource, retries - 1, time.time()))
            self.queue.task_done()
            continue
        if rc == PING_OK:
            avg, max_rtt = rtt  # renamed from 'max' to avoid shadowing the builtin
            self.carbon.metric_send(
                'alert.pinger.%s.avgRoundTrip' % resource, avg)
            self.carbon.metric_send(
                'alert.pinger.%s.maxRoundTrip' % resource, max_rtt)
            self.carbon.metric_send(
                'alert.pinger.%s.availability' % resource, 100.0)
            if avg > CONF.ping_slow_critical:
                event = 'PingSlow'
                severity = severity_code.CRITICAL
                text = 'Node responded to ping in %s ms avg (> %s ms)' % (
                    avg, CONF.ping_slow_critical)
            elif avg > CONF.ping_slow_warning:
                event = 'PingSlow'
                severity = severity_code.WARNING
                text = 'Node responded to ping in %s ms avg (> %s ms)' % (
                    avg, CONF.ping_slow_warning)
            else:
                event = 'PingOK'
                severity = severity_code.NORMAL
                text = 'Node responding to ping avg/max %s/%s ms.' % tuple(rtt)
            value = '%s/%s ms' % tuple(rtt)
        elif rc == PING_FAILED:
            event = 'PingFailed'
            severity = severity_code.MAJOR
            text = 'Node did not respond to ping or timed out within %s seconds' % CONF.ping_max_timeout
            value = '%s%% packet loss' % loss
            self.carbon.metric_send(
                'alert.pinger.%s.availability' % resource, 100.0 - float(loss))
        elif rc == PING_ERROR:
            event = 'PingError'
            severity = severity_code.WARNING
            text = 'Could not ping node %s.' % resource
            value = stdout
            self.carbon.metric_send(
                'alert.pinger.%s.availability' % resource, 0.0)
        else:
            LOG.warning('Unknown ping return code: %s', rc)
            # BUG FIX: mark the item done before skipping it; previously this
            # branch continued without task_done(), so queue.join() would
            # block forever after an unknown return code.
            self.queue.task_done()
            continue
        # Defaults
        resource += ':icmp'
        group = 'Ping'
        correlate = _PING_ALERTS
        timeout = None
        threshold_info = None
        summary = None
        raw_data = stdout
        pingAlert = Alert(
            resource=resource,
            event=event,
            correlate=correlate,
            group=group,
            value=value,
            severity=severity,
            environment=environment,
            service=service,
            text=text,
            event_type='serviceAlert',
            tags=None,
            timeout=timeout,
            threshold_info=threshold_info,
            summary=summary,
            raw_data=raw_data,
        )
        suppress = pingAlert.transform_alert()
        if suppress:
            LOG.info('Suppressing %s alert', pingAlert.event)
            LOG.debug('%s', pingAlert)
        elif self.dedup.is_send(pingAlert):
            self.mq.send(pingAlert)
        self.queue.task_done()
        LOG.info('%s ping %s complete.', self.getName(), resource)
    # Acknowledge the shutdown sentinel itself.
    self.queue.task_done()
# NOTE(review): fragment of a urlmon check handler — 'resource', 'event',
# 'value', 'severity', 'warn_thold' and 'crit_thold' are assigned by code
# outside this chunk; confirm against the enclosing function.
correlate = _HTTP_ALERTS
group = 'Web'
environment = check['environment']
service = check['service']
text = text  # self-assignment; 'text' was computed earlier in the enclosing function
tags = check.get('tags', dict())
# Human-readable description of the warning/critical response-time thresholds.
threshold_info = "%s : RT > %d RT > %d x %s" % (
    check['url'], warn_thold, crit_thold, check.get('count', 1))
urlmonAlert = Alert(
    resource=resource,
    event=event,
    correlate=correlate,
    group=group,
    value=value,
    severity=severity,
    environment=environment,
    service=service,
    text=text,
    event_type='serviceAlert',
    tags=tags,
    threshold_info=threshold_info,
)
suppress = urlmonAlert.transform_alert()
if suppress:
    LOG.info('Suppressing %s alert', urlmonAlert.event)
    LOG.debug('%s', urlmonAlert)
elif self.dedup.is_send(urlmonAlert):
    # Only forward if the de-duplicator says the alert state changed.
    self.mq.send(urlmonAlert)
def main(self):
    """Run the configured Nagios plugin (or emit a heartbeat) and build the message.

    Maps the plugin's return code to an alerta severity, splits the plugin
    output into short text, long text and performance data, and constructs
    either a Heartbeat or a nagiosAlert in `msg`.

    NOTE(review): `msg` is presumably dispatched by code beyond this chunk.
    """
    if CONF.heartbeat:
        msg = Heartbeat(version=Version)
    else:
        # Run Nagios plugin check
        args = shlex.split(
            os.path.join(CONF.nagios_plugins, CONF.nagios_cmd))
        LOG.info('Running %s', ' '.join(args))
        try:
            check = subprocess.Popen(args, stdout=subprocess.PIPE)
        except Exception as e:  # BUG FIX: 'except Exception, e' is Python-2-only syntax
            LOG.error('Nagios check did not execute: %s', e)
            sys.exit(1)
        stdout = check.communicate()[0]
        rc = check.returncode
        LOG.debug('Nagios plugin %s => %s (rc=%d)', CONF.nagios_cmd, stdout, rc)
        # Standard Nagios plugin exit codes: 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN
        if rc == 0:
            severity = severity_code.NORMAL
        elif rc == 1:
            severity = severity_code.WARNING
        elif rc == 2:
            severity = severity_code.CRITICAL
        elif rc == 3:
            severity = severity_code.UNKNOWN
        else:
            rc = -1
            severity = severity_code.INDETERMINATE
        # Parse Nagios plugin check output: "short text | perfdata" on the
        # first line, optional long text / extra perfdata on later lines.
        text = ''
        long_text = ''
        perf_data = ''
        extra_perf_data = False
        for num, line in enumerate(stdout.split('\n'), start=1):
            if num == 1:
                if '|' in line:
                    text = line.split('|')[0].rstrip(' ')
                    perf_data = line.split('|')[1]
                    value = perf_data.split(';')[0].lstrip(' ')
                else:
                    text = line
                    value = 'rc=%s' % rc
            else:
                if '|' in line:
                    long_text += line.split('|')[0]
                    perf_data += line.split('|')[1]
                    extra_perf_data = True
                elif extra_perf_data is False:
                    long_text += line
                else:
                    # Once a later '|' is seen, trailing lines are all perfdata.
                    perf_data += line
        LOG.debug('Short Output: %s', text)
        LOG.debug('Long Output: %s', long_text)
        LOG.debug('Perf Data: %s', perf_data)
        graph_urls = None
        msg = Alert(
            resource=CONF.resource,
            event=CONF.event,
            correlate=CONF.correlate,
            group=CONF.group,
            value=value,
            severity=severity,
            environment=CONF.environment,
            service=CONF.service,
            text=text + ' ' + long_text,
            event_type='nagiosAlert',
            tags=CONF.tags,
            threshold_info=CONF.nagios_cmd,
            timeout=CONF.timeout,
            raw_data=stdout,
            more_info=perf_data,
            graph_urls=graph_urls,
        )
def parse_snmptrap(data):
    """Parse snmptrapd format-string output into an Alert (or Heartbeat).

    `data` is the raw text handed over by snmptrapd: leading '$x value'
    lines carry the standard trap variables, followed by '~%~'-separated
    varbinds.  Returns the alert, a Heartbeat for heartbeat traps, or None
    when the trap is suppressed.
    """
    pdu_data = data.splitlines()
    varbind_list = pdu_data[:]
    trapvars = dict()
    # Peel off the '$x value' trap-variable lines; what remains is varbinds.
    for line in pdu_data:
        if line.startswith('$'):
            special, value = line.split(None, 1)
            trapvars[special] = value
            varbind_list.pop(0)
    # BUG FIX: default the version so a trap without a '$s' line no longer
    # raises NameError at the LOG.debug()/comparisons below.
    version = 'unknown'
    if '$s' in trapvars:
        if trapvars['$s'] == '0':
            version = 'SNMPv1'
        elif trapvars['$s'] == '1':
            version = 'SNMPv2c'
        elif trapvars['$s'] == '2':
            version = 'SNMPv2u'  # not supported
        else:
            version = 'SNMPv3'
        trapvars['$s'] = version
    # Get varbinds
    varbinds = dict()
    idx = 0
    for varbind in '\n'.join(varbind_list).split('~%~'):
        if varbind == '':
            break
        idx += 1
        try:
            oid, value = varbind.split(None, 1)
        except ValueError:
            oid = varbind
            value = ''
        varbinds[oid] = value
        trapvars['$' + str(idx)] = value  # $n
        LOG.debug('$%s %s', str(idx), value)
    trapvars['$q'] = trapvars['$q'].lstrip('.')  # if numeric, remove leading '.'
    trapvars['$#'] = str(idx)
    LOG.debug('varbinds = %s', varbinds)
    LOG.debug('version = %s', version)
    correlate = list()
    if version == 'SNMPv1':
        # Map the SNMPv1 generic-trap number ($w) to a symbolic trap name ($O).
        if trapvars['$w'] == '0':
            trapvars['$O'] = 'coldStart'
            correlate = ['coldStart', 'warmStart']
        elif trapvars['$w'] == '1':
            trapvars['$O'] = 'warmStart'
            correlate = ['coldStart', 'warmStart']
        elif trapvars['$w'] == '2':
            trapvars['$O'] = 'linkDown'
            correlate = ['linkUp', 'linkDown']
        elif trapvars['$w'] == '3':
            trapvars['$O'] = 'linkUp'
            correlate = ['linkUp', 'linkDown']
        elif trapvars['$w'] == '4':
            trapvars['$O'] = 'authenticationFailure'
        elif trapvars['$w'] == '5':
            trapvars['$O'] = 'egpNeighborLoss'
        elif trapvars['$w'] == '6':  # enterpriseSpecific(6)
            if trapvars['$q'].isdigit():  # XXX - specific trap number was not decoded
                trapvars['$O'] = '%s.0.%s' % (trapvars['$N'], trapvars['$q'])
            else:
                trapvars['$O'] = trapvars['$q']
    elif version == 'SNMPv2c':
        # Derive the SNMPv1-style generic-trap number/name from snmpTrapOID.
        if 'coldStart' in trapvars['$2']:
            trapvars['$w'] = '0'
            trapvars['$W'] = 'Cold Start'
        elif 'warmStart' in trapvars['$2']:
            trapvars['$w'] = '1'
            trapvars['$W'] = 'Warm Start'
        elif 'linkDown' in trapvars['$2']:
            trapvars['$w'] = '2'
            trapvars['$W'] = 'Link Down'
        elif 'linkUp' in trapvars['$2']:
            trapvars['$w'] = '3'
            trapvars['$W'] = 'Link Up'
        elif 'authenticationFailure' in trapvars['$2']:
            trapvars['$w'] = '4'
            trapvars['$W'] = 'Authentication Failure'
        elif 'egpNeighborLoss' in trapvars['$2']:
            trapvars['$w'] = '5'
            trapvars['$W'] = 'EGP Neighbor Loss'
        else:
            trapvars['$w'] = '6'
            trapvars['$W'] = 'Enterprise Specific'
        trapvars['$O'] = trapvars['$2']  # SNMPv2-MIB::snmpTrapOID.0
    LOG.debug('trapvars = %s', trapvars)
    LOG.info('%s-Trap-PDU %s from %s at %s %s', version, trapvars['$O'],
             trapvars['$B'], trapvars['$x'], trapvars['$X'])
    # Prefer the agent hostname ($B), then agent address ($A), then the
    # transport address embedded in $b.
    if trapvars['$B'] != '<UNKNOWN>':
        resource = trapvars['$B']
    elif trapvars['$A'] != '0.0.0.0':
        resource = trapvars['$A']
    else:
        m = re.match(r'UDP: \[(\d+\.\d+\.\d+\.\d+)\]', trapvars['$b'])
        if m:
            resource = m.group(1)
        else:
            resource = '<NONE>'
    # Defaults
    event = trapvars['$O']
    severity = severity_code.NORMAL
    group = 'SNMP'
    value = trapvars['$w']
    text = trapvars['$W']
    environment = 'PROD'
    service = ['Network']
    tags = [version]
    timeout = None
    create_time = datetime.datetime.strptime(
        '%sT%s.000Z' % (trapvars['$x'], trapvars['$X']), '%Y-%m-%dT%H:%M:%S.%fZ')
    snmptrapAlert = Alert(
        resource=resource,
        event=event,
        correlate=correlate,
        group=group,
        value=value,
        severity=severity,
        environment=environment,
        service=service,
        text=text,
        event_type='snmptrapAlert',
        tags=tags,
        timeout=timeout,
        create_time=create_time,
        raw_data=data,
    )
    suppress = Transformers.normalise_alert(snmptrapAlert, trapoid=trapvars['$O'],
                                            trapvars=trapvars, varbinds=varbinds)
    if suppress:
        LOG.info('Suppressing %s SNMP trap', snmptrapAlert.event)
        LOG.debug('%s', snmptrapAlert)
        return
    SnmpTrapHandler.translate_alert(snmptrapAlert, trapvars)
    if snmptrapAlert.get_type() == 'Heartbeat':
        snmptrapAlert = Heartbeat(origin=snmptrapAlert.origin,
                                  tags=[__version__],
                                  timeout=snmptrapAlert.timeout)
    return snmptrapAlert
def parse_snmptrap(data):
    """Parse snmptrapd format-string output into an Alert (or Heartbeat).

    `data` is the raw text handed over by snmptrapd: leading '$x value'
    lines carry the standard trap variables, followed by '~%~'-separated
    varbinds.  Returns the alert, a Heartbeat for heartbeat traps, or None
    when the trap is suppressed.
    """
    pdu_data = data.splitlines()
    varbind_list = pdu_data[:]
    trapvars = dict()
    # Peel off the '$x value' trap-variable lines; what remains is varbinds.
    for line in pdu_data:
        if line.startswith('$'):
            special, value = line.split(None, 1)
            trapvars[special] = value
            varbind_list.pop(0)
    # BUG FIX: default the version so a trap without a '$s' line no longer
    # raises NameError at the LOG.debug()/comparisons below.
    version = 'unknown'
    if '$s' in trapvars:
        if trapvars['$s'] == '0':
            version = 'SNMPv1'
        elif trapvars['$s'] == '1':
            version = 'SNMPv2c'
        elif trapvars['$s'] == '2':
            version = 'SNMPv2u'  # not supported
        else:
            version = 'SNMPv3'
        trapvars['$s'] = version
    # Get varbinds
    varbinds = dict()
    idx = 0
    for varbind in '\n'.join(varbind_list).split('~%~'):
        if varbind == '':
            break
        idx += 1
        try:
            oid, value = varbind.split(None, 1)
        except ValueError:
            oid = varbind
            value = ''
        varbinds[oid] = value
        trapvars['$' + str(idx)] = value  # $n
        LOG.debug('$%s %s', str(idx), value)
    trapvars['$q'] = trapvars['$q'].lstrip('.')  # if numeric, remove leading '.'
    trapvars['$#'] = str(idx)
    LOG.debug('varbinds = %s', varbinds)
    LOG.debug('version = %s', version)
    correlate = list()
    if version == 'SNMPv1':
        # Map the SNMPv1 generic-trap number ($w) to a symbolic trap name ($O).
        if trapvars['$w'] == '0':
            trapvars['$O'] = 'coldStart'
            correlate = ['coldStart', 'warmStart']
        elif trapvars['$w'] == '1':
            trapvars['$O'] = 'warmStart'
            correlate = ['coldStart', 'warmStart']
        elif trapvars['$w'] == '2':
            trapvars['$O'] = 'linkDown'
            correlate = ['linkUp', 'linkDown']
        elif trapvars['$w'] == '3':
            trapvars['$O'] = 'linkUp'
            correlate = ['linkUp', 'linkDown']
        elif trapvars['$w'] == '4':
            trapvars['$O'] = 'authenticationFailure'
        elif trapvars['$w'] == '5':
            trapvars['$O'] = 'egpNeighborLoss'
        elif trapvars['$w'] == '6':  # enterpriseSpecific(6)
            if trapvars['$q'].isdigit():  # XXX - specific trap number was not decoded
                trapvars['$O'] = '%s.0.%s' % (trapvars['$N'], trapvars['$q'])
            else:
                trapvars['$O'] = trapvars['$q']
    elif version == 'SNMPv2c':
        # Derive the SNMPv1-style generic-trap number/name from snmpTrapOID.
        if 'coldStart' in trapvars['$2']:
            trapvars['$w'] = '0'
            trapvars['$W'] = 'Cold Start'
        elif 'warmStart' in trapvars['$2']:
            trapvars['$w'] = '1'
            trapvars['$W'] = 'Warm Start'
        elif 'linkDown' in trapvars['$2']:
            trapvars['$w'] = '2'
            trapvars['$W'] = 'Link Down'
        elif 'linkUp' in trapvars['$2']:
            trapvars['$w'] = '3'
            trapvars['$W'] = 'Link Up'
        elif 'authenticationFailure' in trapvars['$2']:
            trapvars['$w'] = '4'
            trapvars['$W'] = 'Authentication Failure'
        elif 'egpNeighborLoss' in trapvars['$2']:
            trapvars['$w'] = '5'
            trapvars['$W'] = 'EGP Neighbor Loss'
        else:
            trapvars['$w'] = '6'
            trapvars['$W'] = 'Enterprise Specific'
        trapvars['$O'] = trapvars['$2']  # SNMPv2-MIB::snmpTrapOID.0
    LOG.debug('trapvars = %s', trapvars)
    LOG.info('%s-Trap-PDU %s from %s at %s %s', version, trapvars['$O'],
             trapvars['$B'], trapvars['$x'], trapvars['$X'])
    # Prefer the agent hostname ($B), then agent address ($A), then the
    # transport address embedded in $b.
    if trapvars['$B'] != '<UNKNOWN>':
        resource = trapvars['$B']
    elif trapvars['$A'] != '0.0.0.0':
        resource = trapvars['$A']
    else:
        m = re.match(r'UDP: \[(\d+\.\d+\.\d+\.\d+)\]', trapvars['$b'])
        if m:
            resource = m.group(1)
        else:
            resource = '<NONE>'
    # Defaults
    event = trapvars['$O']
    severity = severity_code.NORMAL
    group = 'SNMP'
    value = trapvars['$w']
    text = trapvars['$W']
    environment = ['INFRA']
    service = ['Network']
    tags = {'Version': version}
    timeout = None
    threshold_info = None
    summary = None
    create_time = datetime.datetime.strptime(
        '%sT%s.000Z' % (trapvars['$x'], trapvars['$X']), '%Y-%m-%dT%H:%M:%S.%fZ')
    snmptrapAlert = Alert(
        resource=resource,
        event=event,
        correlate=correlate,
        group=group,
        value=value,
        severity=severity,
        environment=environment,
        service=service,
        text=text,
        event_type='snmptrapAlert',
        tags=tags,
        timeout=timeout,
        threshold_info=threshold_info,
        summary=summary,
        create_time=create_time,
        raw_data=data,
    )
    suppress = snmptrapAlert.transform_alert(trapoid=trapvars['$O'],
                                             trapvars=trapvars,
                                             varbinds=varbinds)
    if suppress:
        LOG.info('Suppressing %s SNMP trap', snmptrapAlert.event)
        LOG.debug('%s', snmptrapAlert)
        return
    snmptrapAlert.translate_alert(trapvars)
    if snmptrapAlert.get_type() == 'Heartbeat':
        snmptrapAlert = Heartbeat(origin=snmptrapAlert.origin,
                                  version='n/a',
                                  timeout=snmptrapAlert.timeout)
    return snmptrapAlert
def alertDynect(self):
    """Raise GSLB/pool alerts from the polled Dynect state in self.info.

    Resources are only alerted once a previous poll exists in
    self.last_info; gslb-* and pool-* resources map to their own event
    types, everything else is logged and skipped.
    """
    for resource in self.info:
        # Skip resources seen for the first time this poll.
        if resource not in self.last_info:
            continue
        if resource.startswith('gslb-'):
            # gslb status = ok | unk | trouble | failover
            text = 'GSLB status is %s.' % self.info[resource]['status']
            if self.info[resource]['status'] == 'ok':
                event = 'GslbOK'
                severity = severity_code.NORMAL
            else:
                event = 'GslbNotOK'
                severity = severity_code.CRITICAL
            # NOTE(review): nesting reconstructed from mangled source — this
            # is assumed to apply to both branches; confirm against upstream.
            correlate = ['GslbOK', 'GslbNotOK']
        elif resource.startswith('pool-'):
            # pool status = up | unk | down
            # pool serve_mode = obey | always | remove | no
            # pool weight (1-15)
            if 'down' in self.info[resource]['status']:
                event = 'PoolDown'
                severity = severity_code.MAJOR
                text = 'Pool is down'
            elif 'obey' not in self.info[resource]['status']:
                event = 'PoolServe'
                severity = severity_code.MAJOR
                text = 'Pool with an incorrect serve mode'
            elif self.check_weight(self.info[resource]['gslb'], resource) is False:
                event = 'PoolWeightError'
                severity = severity_code.MINOR
                text = 'Pool with an incorrect weight'
            else:
                event = 'PoolUp'
                severity = severity_code.NORMAL
                text = 'Pool status is normal'
            correlate = ['PoolUp', 'PoolDown', 'PoolServe', 'PoolWeightError']
        else:
            LOG.warning('Unknown resource type: %s', resource)
            continue
        # Defaults
        group = 'GSLB'
        value = self.info[resource]['status']
        environment = ['PROD']
        service = ['Network']
        tags = list()
        timeout = None
        threshold_info = None
        summary = None
        raw_data = self.info[resource]['rawData']
        dynectAlert = Alert(
            resource=resource,
            event=event,
            correlate=correlate,
            group=group,
            value=value,
            severity=severity,
            environment=environment,
            service=service,
            text=text,
            event_type='serviceAlert',
            tags=tags,
            timeout=timeout,
            threshold_info=threshold_info,
            summary=summary,
            raw_data=raw_data,
        )
        suppress = dynectAlert.transform_alert()
        if suppress:
            LOG.info('Suppressing %s alert', dynectAlert.event)
            LOG.debug('%s', dynectAlert)
            continue
        # Only forward if the de-duplicator says the alert state changed.
        if self.dedup.is_send(dynectAlert):
            self.mq.send(dynectAlert)
# NOTE(review): tail of an AWS/EC2 check handler — 'resource', 'event',
# 'correlate', 'group', 'value', 'severity', 'environment', 'service',
# 'text' and 'tags' are assigned by code outside this chunk.
timeout = None
threshold_info = None
summary = None
raw_data = None
more_info = None
graph_urls = None
awsAlert = Alert(
    resource=resource,
    event=event,
    correlate=correlate,
    group=group,
    value=value,
    severity=severity,
    environment=environment,
    service=service,
    text=text,
    event_type='cloudAlert',
    tags=tags,
    timeout=timeout,
    threshold_info=threshold_info,
    summary=summary,
    raw_data=raw_data,
    more_info=more_info,
    graph_urls=graph_urls,
)
# Only forward if the de-duplicator says the alert state changed.
if self.dedup.is_send(awsAlert):
    self.mq.send(awsAlert)