def alert_tunnel_up(outside_ip,status_message,last_status_change, aws_acc=None,gwid=None,gwip=None,vpnid=None): ''' Report tunnel up status to alerta only if a down status was sent ''' api = ApiClient(endpoint=alerta_endpoint) alertres = vpnid+','+gwid+','+outside_ip status_file =tempdir+'/'+alertres.replace(',','_')+'.down' if not os.path.exists(status_file): # if file does not exists it means it wasn't down. no point in sending the alert. return alert = Alert( resource=alertres, event='TunnelUp', correlate=['TunnelDown'], group='aws', environment='Production', service=[aws_acc], severity='normal', value=status_message, text=aws_acc+' : Tunnel '+outside_ip +' up since '+last_status_change+'.'+'endpoint: '+gwip, tags=['aws'], attributes={'customer': 'The Guardian', 'account' : aws_acc,'GatewayId' : gwid+' [ '+gwip+' ]','vpnId' : vpnid, 'TunnelOutsideIp' : outside_ip} ) #print alert try: api.send(alert) os.remove (status_file ) except Exception as e: print e
class SnmpTrapHandler(object):
    """Read one SNMP trap from stdin, convert it and forward it to alerta."""

    def __init__(self):
        # Client is created in run() so environment variables are read
        # at run time rather than construction time.
        self.api = None

    def run(self):
        endpoint = os.environ.get('ALERTA_ENDPOINT', 'http://localhost:8080')
        key = os.environ.get('ALERTA_API_KEY', None)
        self.api = ApiClient(endpoint=endpoint, key=key)

        data = sys.stdin.read()
        LOG.info('snmptrapd -> %r', data)
        # Best-effort decode: snmptrapd output may contain arbitrary bytes.
        data = unicode(data, 'utf-8', errors='ignore')
        LOG.debug('unicoded -> %s', data)

        snmptrapAlert = SnmpTrapHandler.parse_snmptrap(data)
        if snmptrapAlert:
            try:
                self.api.send(snmptrapAlert)
            except Exception as e:
                LOG.warning('Failed to send alert: %s', e)

        LOG.debug('Send heartbeat...')
        heartbeat = Heartbeat(tags=[__version__])
        try:
            self.api.send(heartbeat)
        except Exception as e:
            LOG.warning('Failed to send heartbeat: %s', e)
def main():
    """Supervisord event-listener loop: forward state changes to alerta."""
    api = ApiClient()
    listener = Listener()
    while True:
        listener.send_cmd('READY\n')
        headers, body = listener.wait()
        event = headers['eventname']

        if event.startswith('TICK'):
            # Periodic TICK_* events become heartbeats, not alerts.
            supervisorAlert = Heartbeat(
                origin='supervisord',
                tags=[headers['ver'], event]
            )
        else:
            # Map the process-state suffix to an alerta severity.
            for suffix, level in (('FATAL', 'critical'),
                                  ('BACKOFF', 'warning'),
                                  ('EXITED', 'minor')):
                if event.endswith(suffix):
                    severity = level
                    break
            else:
                severity = 'normal'
            supervisorAlert = Alert(
                resource=body['processname'],
                environment='Production',
                service=['supervisord'],
                event=event,
                correlate=[
                    'PROCESS_STATE_STARTING', 'PROCESS_STATE_RUNNING',
                    'PROCESS_STATE_BACKOFF', 'PROCESS_STATE_STOPPING',
                    'PROCESS_STATE_EXITED', 'PROCESS_STATE_STOPPED',
                    'PROCESS_STATE_FATAL', 'PROCESS_STATE_UNKNOWN'
                ],
                value='serial=%s' % headers['serial'],
                severity=severity,
                origin=headers['server'],
                text='State changed from %s to %s.' % (body['from_state'], event),
                raw_data='%s\n\n%s' % (json.dumps(headers), json.dumps(body))
            )

        try:
            api.send(supervisorAlert)
        except Exception as e:
            listener.log_stderr(e)
            listener.send_cmd('RESULT 4\nFAIL')
        else:
            listener.send_cmd('RESULT 2\nOK')
def alert(resource,event,text,value,severity,status,go=False): api = ApiClient(endpoint='http://alert.localhost/api', key='UszE5hI_hx5pXKcsCP_2&1DIs&9_Ve*k') #2h expired alert_info = Alert(resource=resource, event=event,text=text,group='ir',environment="Production",service=["localhost"],status=status,timeout='2880',value=value,severity=severity) t = api.send(alert_info) if not go: print 'alert info:',alert_info print t
def alert_rawData(resource,event,text,value,rawData,severity,status,go=False): api = ApiClient(endpoint='http://alert.localhost/api', key='UszE5hI_hx5pXKcsCP_2&1DIs&9_Ve*k') #alert = Alert(resource='irdev', event='searchServerDown',text='The search server is down.',group='ir',environment="Production",service=["localhost"],status='open',timeout=86400,value="query1",severity="major") #alert = Alert(resource='irdev', event='searchServerDown',text='The search server is down.',group='ir',environment="Development",service=["localhost"],status='open',timeout=86400,value="query1",severity="major") #2h expired alert_info = Alert(resource=resource, event=event,text=text,rawData=rawData,group='ir',environment="Production",service=["localhost"],status=status,timeout='2880',value=value,severity=severity) t = api.send(alert_info) if not go: print 'alert info:',alert_info print t
def alert_tunnel_down(outside_ip,status_message,last_status_change, aws_acc=None,gwid=None,gwip=None,vpnid=None,severity='minor'): ''' Report tunnel down status to alerta only if we haven't already sent an alert ''' api = ApiClient(endpoint=alerta_endpoint) alertres = vpnid+','+gwid+','+outside_ip status_file =tempdir+'/'+alertres.replace(',','_')+'.down' count = 1 if os.path.exists(status_file) and not severity == 'critical': # if file does exists, it means its already down and an alert has been sent, check number of counts and send again after 10min try: count = get_down_count(status_file) count = int(count) + 1 except Exception as e: count = 10 record_status(status_file,str(count)) if count >= 5: #assume cron is running every 2 min, send down alert every 10 min. count = 1 else: return alert = Alert( resource=alertres, event='TunnelDown', correlate=['TunnelUp'], group='aws', environment='Production', service=[aws_acc], severity=severity, value=status_message, text=aws_acc+' : Tunnel '+outside_ip +' Down since '+last_status_change+'.'+' endpoint: '+gwip, tags=['aws'], attributes={'customer': 'The Guardian', 'account' : aws_acc,'GatewayId' : gwid+' [ '+gwip+' ]','vpnId' : vpnid, 'TunnelOutsideIp' : outside_ip} ) #print alert try: api.send(alert) record_status(status_file,str(count)) except Exception as e: print e
def run(self):
    """Drain held alerts as their hold time expires; heartbeat every ~20s."""
    api = ApiClient(endpoint=OPTIONS["endpoint"], key=OPTIONS["key"])
    keep_alive = 0
    while not self.should_stop:
        # Snapshot the keys: entries may be removed concurrently.
        for alertid in list(on_hold.keys()):
            try:
                alert, hold_time = on_hold[alertid]
            except KeyError:
                continue
            if time.time() > hold_time:
                self.send_email(alert)
                try:
                    del on_hold[alertid]
                except KeyError:
                    continue
        if keep_alive >= 10:
            tag = OPTIONS["smtp_host"] or "alerta-mailer"
            api.send(Heartbeat(tags=[tag]))
            keep_alive = 0
        keep_alive += 1
        time.sleep(2)
class CloudWatch(object):
    """Consume CloudWatch alarm notifications from SQS and relay them to alerta."""

    def __init__(self):
        self.api = ApiClient()
        try:
            connection = boto.sqs.connect_to_region(
                AWS_REGION,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
        except boto.exception.SQSError as e:
            LOG.error('SQS API call failed: %s', e)
            sys.exit(1)
        try:
            self.sqs = connection.create_queue(AWS_SQS_QUEUE)
            self.sqs.set_message_class(RawMessage)
        except boto.exception.SQSError as e:
            LOG.error('SQS queue error: %s', e)
            sys.exit(1)

    def run(self):
        """Long-poll the queue forever, forwarding alarms and heartbeating."""
        while True:
            LOG.debug('Waiting for CloudWatch alarms on %s...', AWS_SQS_QUEUE)
            try:
                notification = self.sqs.read(wait_time_seconds=20)
            except boto.exception.SQSError as e:
                LOG.warning('Could not read from queue: %s', e)
                time.sleep(20)
                continue
            if notification:
                cloudwatchAlert = self.parse_notification(notification)
                try:
                    self.api.send(cloudwatchAlert)
                except Exception as e:
                    LOG.warning('Failed to send alert: %s', e)
                # Delete regardless of send outcome so the message is not re-read.
                self.sqs.delete_message(notification)
            LOG.debug('Send heartbeat...')
            heartbeat = Heartbeat(tags=[__version__])
            try:
                self.api.send(heartbeat)
            except Exception as e:
                LOG.warning('Failed to send heartbeat: %s', e)

    def parse_notification(self, notification):
        """Convert an SQS-wrapped SNS alarm notification into an alerta Alert.

        Returns None when the alarm payload carries no 'Trigger' section.
        """
        notification = json.loads(notification.get_body())
        alarm = json.loads(notification['Message'])
        if 'Trigger' not in alarm:
            return
        dimension = alarm['Trigger']['Dimensions'][0]
        return Alert(
            resource='%s:%s' % (dimension['name'], dimension['value']),
            event=alarm['AlarmName'],
            correlate=[],
            group='CloudWatch',
            value=alarm['Trigger']['MetricName'],
            severity=self.cw_state_to_severity(alarm['NewStateValue']),
            environment='Production',
            service=[AWS_ACCOUNT_ID.get(alarm['AWSAccountId'],
                                        'AWSAccountId:' + alarm['AWSAccountId'])],
            text=alarm['AlarmDescription'],
            event_type='cloudwatchAlarm',
            tags=[alarm['Trigger']['Namespace']],
            attributes={
                'awsMessageId': notification['MessageId'],
                'awsRegion': alarm['Region'],
                'thresholdInfo': alarm['NewStateReason']
            },
            origin=notification['TopicArn'],
            timeout=None,
            create_time=datetime.datetime.strptime(notification['Timestamp'],
                                                   '%Y-%m-%dT%H:%M:%S.%fZ'),
            raw_data=notification['Message'],
        )

    @staticmethod
    def cw_state_to_severity(state):
        """Map a CloudWatch alarm state to an alerta severity."""
        return {
            'ALARM': 'major',
            'INSUFFICIENT_DATA': 'warning',
            'OK': 'normal',
        }.get(state, 'unknown')
class AlertCommand(object):
    """Implements the alerta CLI sub-commands against an alerta API endpoint."""

    def __init__(self):
        self.api = ApiClient()

    def set(self, endpoint, key):
        # Re-bind the client once endpoint/key are known (after arg parsing).
        self.api = ApiClient(endpoint=endpoint, key=key)

    def send(self, args):
        """Build an alert from parsed CLI args and submit it; exit(1) on error."""
        try:
            alert = Alert(resource=args.resource,
                          event=args.event,
                          environment=args.environment,
                          severity=args.severity,
                          correlate=args.correlate,
                          status=args.status,
                          service=args.service,
                          group=args.group,
                          value=args.value,
                          text=args.text,
                          tags=args.tags,
                          attributes=dict([attrib.split('=') for attrib in args.attributes]),
                          origin=args.origin,
                          event_type=args.event_type,
                          timeout=args.timeout,
                          raw_data=args.raw_data)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        try:
            response = self.api.send(alert)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        if response['status'] == 'ok':
            if 'alert' not in response:
                info = response['message']
            elif response['alert']['repeat']:
                info = "%s duplicates" % response['alert']['duplicateCount']
            else:
                info = "%s -> %s" % (response['alert']['previousSeverity'],
                                     response['alert']['severity'])
            print("{} ({})".format(response['id'], info))
        else:
            LOG.error(response['message'])
            sys.exit(1)

    def heartbeat(self, args):
        """Submit a heartbeat; exit(1) on error."""
        try:
            heartbeat = Heartbeat(origin=args.origin,
                                  tags=args.tags,
                                  timeout=args.timeout)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        try:
            response = self.api.send(heartbeat)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        if response['status'] == 'ok':
            print(response['id'])
        else:
            LOG.error(response['message'])
            sys.exit(1)

    def query(self, args, from_date=None):
        """Print alerts matching the filters; return the 'lastTime' watermark."""
        response = self._alerts(args.filters, from_date)
        alerts = response['alerts']
        if args.output == "json":
            print(json.dumps(alerts, indent=4))
            sys.exit(0)
        for alert in reversed(alerts):
            a = AlertDocument.parse_alert(alert)
            line_color = ''
            end_color = _ENDC
            if args.color:
                line_color = _COLOR_MAP.get(a.severity, _COLOR_MAP['unknown'])
            print(
                line_color +
                '{0}|{1}|{2}|{3:5d}|{4}|{5:<5s}|{6:<10s}|{7:<18s}|{8:12s}|{9:16s}|{10:12s}'
                .format(
                    a.id[0:8],
                    a.get_date('last_receive_time', 'local', args.timezone),
                    a.severity,
                    a.duplicate_count,
                    a.customer or "-",
                    a.environment,
                    ','.join(a.service),
                    a.resource,
                    a.group,
                    a.event,
                    a.value) + end_color)
            print(line_color + ' |{}'.format(a.text) + end_color)
            if args.details:
                print(line_color + ' severity | {} -> {}'.format(
                    a.previous_severity, a.severity) + end_color)
                print(line_color + ' trend | {}'.format(a.trend_indication) + end_color)
                print(line_color + ' status | {}'.format(a.status) + end_color)
                print(line_color + ' resource | {}'.format(a.resource) + end_color)
                print(line_color + ' group | {}'.format(a.group) + end_color)
                print(line_color + ' event | {}'.format(a.event) + end_color)
                print(line_color + ' value | {}'.format(a.value) + end_color)
                print(line_color + ' tags | {}'.format(' '.join(a.tags)) + end_color)
                for key, value in a.attributes.items():
                    print(line_color + ' {} | {}'.format(key.ljust(10), value) + end_color)
                latency = a.receive_time - a.create_time
                print(line_color + ' time created | {}'.format(
                    a.get_date('create_time', 'iso', args.timezone)) + end_color)
                print(line_color + ' time received | {}'.format(
                    a.get_date('receive_time', 'iso', args.timezone)) + end_color)
                print(line_color + ' last received | {}'.format(
                    a.get_date('last_receive_time', 'iso', args.timezone)) + end_color)
                # FIX: timedelta.microseconds only holds the sub-second part,
                # so latencies >= 1s were under-reported; use total_seconds().
                print(line_color + ' latency | {}ms'.format(
                    int(latency.total_seconds() * 1000)) + end_color)
                print(line_color + ' timeout | {}s'.format(a.timeout) + end_color)
                print(line_color + ' alert id | {}'.format(a.id) + end_color)
                print(line_color + ' last recv id | {}'.format(a.last_receive_id) + end_color)
                print(line_color + ' customer | {}'.format(a.customer) + end_color)
                print(line_color + ' environment | {}'.format(a.environment) + end_color)
                print(line_color + ' service | {}'.format(','.join(a.service)) + end_color)
                print(line_color + ' resource | {}'.format(a.resource) + end_color)
                print(line_color + ' type | {}'.format(a.event_type) + end_color)
                print(line_color + ' repeat | {}'.format(a.repeat) + end_color)
                print(line_color + ' origin | {}'.format(a.origin) + end_color)
                print(line_color + ' correlate | {}'.format(','.join(a.correlate)) + end_color)
        return response.get('lastTime', '')

    def watch(self, args):
        """Repeatedly query, passing the previous watermark as from-date."""
        from_date = None
        while True:
            from_date = self.query(args, from_date)
            try:
                time.sleep(2)
            except (KeyboardInterrupt, SystemExit):
                sys.exit(0)

    def top(self, args):
        """Run the curses 'top'-style screen."""
        screen = Screen(endpoint=args.endpoint, key=args.key)
        try:
            screen.run()
        except RuntimeError as e:
            screen._reset()
            print(e)
            sys.exit(1)
        except (KeyboardInterrupt, SystemExit):
            screen.w.running = False
            screen._reset()
            print('Exiting...')
            sys.exit(0)

    def raw(self, args):
        """Print the rawData of matching alerts."""
        response = self._alerts(args.filters)
        alerts = response['alerts']
        if args.output == "json":
            print(json.dumps(alerts, indent=4))
            sys.exit(0)
        for alert in reversed(alerts):
            line_color = ''
            end_color = _ENDC
            print(line_color + '%s' % alert['rawData'] + end_color)

    def history(self, args):
        """Print severity/status change history for matching alerts."""
        response = self._history(args.filters)
        history = response['history']
        if args.output == "json":
            print(json.dumps(history, indent=4))
            sys.exit(0)
        for hist in history:
            line_color = ''
            end_color = _ENDC
            update_time = datetime.strptime(hist.get('updateTime', None),
                                            '%Y-%m-%dT%H:%M:%S.%fZ')
            if 'severity' in hist:
                if args.color:
                    line_color = _COLOR_MAP.get(hist['severity'], _COLOR_MAP['unknown'])
                print(line_color + '%s|%s|%s|%s|%-5s|%-10s|%-18s|%s|%s|%s|%s' % (
                    hist['id'][0:8],
                    update_time.strftime('%Y/%m/%d %H:%M:%S'),
                    hist['severity'],
                    hist['customer'],
                    hist['environment'],
                    ','.join(hist['service']),
                    hist['resource'],
                    hist['group'],
                    hist['event'],
                    hist['value'],
                    hist['text']) + end_color)
            if 'status' in hist:
                print(line_color + '%s|%s|%s|%s|%-5s|%-10s|%-18s|%s|%s|%s|%s' % (
                    hist['id'][0:8],
                    update_time.strftime('%Y/%m/%d %H:%M:%S'),
                    hist['status'],
                    hist['customer'],
                    hist['environment'],
                    ','.join(hist['service']),
                    hist['resource'],
                    hist['group'],
                    hist['event'],
                    'n/a',
                    hist['text']) + end_color)

    def _bulk_action(self, alerts, total, action):
        """Run *action* over alert ids with an in-place percentage meter.

        Shared by tag/untag/ack/unack/close/delete, which previously each
        carried a copy of this loop. Exits(1) on the first failed action.
        """
        for i, alert in enumerate(alerts):
            pct = int(100.0 * i / total)
            sys.stdout.write("%3d%% (%d/%d)" % (pct, i, total))
            sys.stdout.flush()
            sys.stdout.write("\b" * (8 + len(str(i)) + len(str(total))))
            try:
                action(alert['id'])
            except Exception as e:
                print()
                LOG.error(e)
                sys.exit(1)
        sys.stdout.write("100%% (%d/%d), done.\n" % (total, total))

    def tag(self, args):
        sys.stdout.write("Counting alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("Tagging alerts: ")
        self._bulk_action(alerts, total,
                          lambda alert_id: self.api.tag_alert(alert_id, args.tags))

    def untag(self, args):
        sys.stdout.write("Counting alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("Un-tagging alerts: ")
        self._bulk_action(alerts, total,
                          lambda alert_id: self.api.untag_alert(alert_id, args.tags))

    def ack(self, args):
        sys.stdout.write("Counting alerts: ")
        response = self._counts(args.filters)
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("Acking alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        self._bulk_action(alerts, total, self.api.ack_alert)

    def unack(self, args):
        sys.stdout.write("Counting alerts: ")
        response = self._counts(args.filters)
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("un-Acking alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        self._bulk_action(alerts, total, self.api.unack_alert)

    def close(self, args):
        sys.stdout.write("Counting alerts: ")
        response = self._counts(args.filters)
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("Closing alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        self._bulk_action(alerts, total, self.api.close_alert)

    def delete(self, args):
        sys.stdout.write("Counting alerts: ")
        response = self._counts(args.filters)
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("Deleting alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        self._bulk_action(alerts, total, self.api.delete_alert)

    def status(self, args):
        """Print server metrics (gauges, counters, timers, then text)."""
        response = self._status()
        metrics = response['metrics']
        print('{:<28} {:<8} {:<26} {:10} {}'.format('METRIC', 'TYPE', 'NAME', 'VALUE', 'AVG'))
        for metric in [m for m in metrics if m['type'] in ['gauge', 'counter', 'timer']]:
            if metric['type'] == 'gauge':
                print('{0:<28} {1:<8} {2:<26} {3:<10}'.format(
                    metric['title'], metric['type'],
                    metric['group'] + '.' + metric['name'], metric['value']))
            else:
                value = metric.get('count', 0)
                count = int(metric['count'])
                # FIX: guard against ZeroDivisionError on zero-count timers.
                avg = int(metric['totalTime']) * 1.0 / count if count else 0.0
                print('{0:<28} {1:<8} {2:<26} {3:<10} {4:-3.2f} ms'.format(
                    metric['title'], metric['type'],
                    metric['group'] + '.' + metric['name'], value, avg))
        for metric in [m for m in metrics if m['type'] == 'text']:
            print('{0:<28} {1:<8} {2:<26} {3:<10}'.format(
                metric['title'], metric['type'],
                metric['group'] + '.' + metric['name'], metric['value']))

    def heartbeats(self, args):
        """List heartbeats; with --alert, raise fail/slow/ok alerts for each."""
        response = self._heartbeats()
        heartbeats = response['heartbeats']
        print('{:<28} {:<26} {:<19} {:>8} {:7} {}'.format(
            'ORIGIN', 'TAGS', 'CREATED', 'LATENCY', 'TIMEOUT', 'SINCE'))
        for heartbeat in heartbeats:
            hb = HeartbeatDocument.parse_heartbeat(heartbeat)
            # FIX: use total_seconds() -- .microseconds drops whole seconds
            # and .seconds wraps at 24h, under-reporting stale heartbeats.
            latency = int((hb.receive_time - hb.create_time).total_seconds() * 1000)
            since = datetime.utcnow() - hb.receive_time
            timeout_exceeded = since.total_seconds() > hb.timeout
            since = since - timedelta(microseconds=since.microseconds)
            latency_exceeded = latency > MAX_LATENCY
            print('{:<28} {:<26} {} {}{:6}ms {:6}s {}{}'.format(
                hb.origin,
                ' '.join(hb.tags),
                hb.get_date('create_time', 'local', args.timezone),
                '*' if latency_exceeded else ' ',
                latency,
                hb.timeout,
                '*' if timeout_exceeded else ' ',
                since))
            if args.alert:
                if timeout_exceeded:
                    alert = Alert(
                        resource=hb.origin,
                        event='HeartbeatFail',
                        correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
                        group='System',
                        environment='Production',
                        service=['Alerta'],
                        severity='major',
                        value='{}'.format(since),
                        text='Heartbeat not received in {} seconds'.format(hb.timeout),
                        tags=hb.tags,
                        type='heartbeatAlert')
                elif latency_exceeded:
                    alert = Alert(
                        resource=hb.origin,
                        event='HeartbeatSlow',
                        correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
                        group='System',
                        environment='Production',
                        service=['Alerta'],
                        severity='major',
                        value='{}ms'.format(latency),
                        text='Heartbeat took more than {}ms to be processed'.format(MAX_LATENCY),
                        tags=hb.tags,
                        type='heartbeatAlert')
                else:
                    alert = Alert(
                        resource=hb.origin,
                        event='HeartbeatOK',
                        correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
                        group='System',
                        environment='Production',
                        service=['Alerta'],
                        severity='normal',
                        value='',
                        text='Heartbeat OK',
                        tags=hb.tags,
                        type='heartbeatAlert')
                # NOTE(review): self.send() expects parsed CLI args; passing an
                # Alert relies on attribute-name overlap -- confirm intended.
                self.send(alert)

    def blackout(self, args):
        """Create a blackout period from command-line args."""
        if '.' not in args.start:
            # API expects fractional seconds; normalise 'Z' -> '.000Z'.
            args.start = args.start.replace('Z', '.000Z')
        blackout = {
            "environment": args.environment,
            "resource": args.resource,
            "service": args.service,
            "event": args.event,
            "group": args.group,
            "tags": args.tags,
            "startTime": args.start,
            "duration": args.duration
        }
        try:
            response = self.api.blackout_alerts(blackout)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        if response['status'] == 'ok':
            print(response['blackout'])
        else:
            LOG.error(response['message'])
            sys.exit(1)

    def blackouts(self, args):
        """List blackouts; with --purge, delete expired ones as they are listed."""
        response = self.api.get_blackouts()
        blackouts = response['blackouts']
        print('{:<8} {:<16} {:<16} {:<16} {:<16} {:16} {:16} {:24} {:8} {:19} {}'
              .format('ID', 'CUSTOMER', 'ENVIRONMENT', 'SERVICE', 'RESOURCE',
                      'EVENT', 'GROUP', 'TAGS', 'STATUS', 'START', 'DURATION'))
        for blackout in blackouts:
            start_time = datetime.strptime(blackout['startTime'],
                                           '%Y-%m-%dT%H:%M:%S.%fZ')
            tz = pytz.timezone(args.timezone)
            if args.purge and blackout['status'] == 'expired':
                response = self.api.delete_blackout(blackout['id'])
                if response['status'] == 'ok':
                    blackout['status'] = 'deleted'
                else:
                    blackout['status'] = 'error'
            print('{:<8} {:<16} {:<16} {:16} {:16} {:16} {:16} {:24} {:8} {} {}s'
                  .format(
                      blackout['id'][:8],
                      blackout.get('customer', '*'),
                      blackout.get('environment', '*'),
                      ','.join(blackout.get('service', '*')),
                      blackout.get('resource', '*'),
                      blackout.get('event', '*'),
                      blackout.get('group', '*'),
                      ' '.join(blackout.get('tags', '*')),
                      blackout['status'],
                      start_time.replace(tzinfo=pytz.UTC).astimezone(tz)
                      .strftime('%Y/%m/%d %H:%M:%S'),
                      blackout['duration']))

    @staticmethod
    def _build(filters, from_date=None, to_date=None):
        """Turn 'key=value' filter strings into a query tuple list.

        Appends a default sort-by only when the filters did not supply one.
        """
        if filters:
            query = [tuple(x.split('=', 1)) for x in filters if '=' in x]
        else:
            query = list()
        if from_date:
            query.append(('from-date', from_date))
        if to_date:
            query.append(('to-date', to_date))
        # FIX: query holds (key, value) tuples, so the old membership test
        # ('sort-by' not in query) was always true and a user-supplied
        # sort-by filter was duplicated; compare against the keys instead.
        if 'sort-by' not in [k for k, _ in query]:
            query.append(('sort-by', 'lastReceiveTime'))
        return query

    @staticmethod
    def _check(response):
        # Shared error handling for API responses that carry a status field.
        if response['status'] == "error":
            LOG.error(response['message'])
            sys.exit(1)
        return response

    def _alerts(self, filters, from_date=None, to_date=None):
        query = self._build(filters, from_date, to_date)
        try:
            response = self.api.get_alerts(query)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        return self._check(response)

    def _counts(self, filters, from_date=None, to_date=None):
        query = self._build(filters, from_date, to_date)
        try:
            response = self.api.get_counts(query)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        return self._check(response)

    def _history(self, filters, from_date=None, to_date=None):
        query = self._build(filters, from_date, to_date)
        try:
            response = self.api.get_history(query)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        return self._check(response)

    def _heartbeats(self):
        try:
            response = self.api.get_heartbeats()
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        return self._check(response)

    def _status(self):
        try:
            response = self.api.get_status()
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        return response

    def help(self, args):
        pass

    def uptime(self, args):
        """Print server local time and uptime in 'N days HH:MM' form."""
        response = self._status()
        now = datetime.fromtimestamp(int(response['time']) / 1000.0)
        d = datetime(1, 1, 1) + timedelta(seconds=int(response['uptime']) / 1000.0)
        print('{0} up {1} days {2:02d}:{3:02d}'.format(
            now.strftime('%H:%M'), d.day - 1, d.hour, d.minute))

    def version(self, args):
        """Print server, client and requests library versions."""
        response = self._status()
        print('{0} {1}'.format(
            response['application'],
            response['version'],
        ))
        print('alerta client {0}'.format(__version__))
        print('requests {0}'.format(requests.__version__))
# ### Same Thing, Python style

# In[ ]:

from alerta.api import ApiClient
from alerta.alert import Alert

# Send a minor test alert against a local alerta instance.
api = ApiClient(endpoint='http://localhost:8090')
alert = Alert(
    resource='localhost',
    event='VolUnavailable',
    service=['Filesystem'],
    environment='Production',
    value='ERROR',
    severity='minor',
)
res = api.send(alert)

# ## Custom Alerts

# ### Remember, you can do amazing stuff…

# In[ ]:

import utils

utils.volume_is_mounted('/Volumes/Intenso64')

# In[ ]:

utils.internet_available()

# In[ ]:
#!/usr/bin/env python # coding=utf-8 from alerta.api import ApiClient from alerta.alert import Alert api = ApiClient(endpoint='http://alert.localhost/api', key='UszE5hI_hx5pXKcsCP_2&1DIs&9_Ve*k') #alert = Alert(resource='irdev', event='searchServerDown',text='The search server is down.',group='ir',environment="Production",service=["localhost"],status='open',timeout=86400,value="query1",severity="major") alert = Alert(resource='irdev', event='searchServerDown',text='The search server is down.',group='ir',environment="Development",service=["localhost"],status='open',timeout=86400,value="query1",severity="major") print alert t = api.send(alert) print t
class CloudWatch(object):
    """Relay CloudWatch alarm notifications from an SQS queue into alerta."""

    def __init__(self):
        self.api = ApiClient()
        try:
            connection = boto.sqs.connect_to_region(
                AWS_REGION,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY
            )
        except boto.exception.SQSError as e:
            LOG.error('SQS API call failed: %s', e)
            sys.exit(1)
        try:
            self.sqs = connection.create_queue(AWS_SQS_QUEUE)
            self.sqs.set_message_class(RawMessage)
        except boto.exception.SQSError as e:
            LOG.error('SQS queue error: %s', e)
            sys.exit(1)

    def run(self):
        """Long-poll the queue forever; forward alarms and send heartbeats."""
        while True:
            LOG.debug('Waiting for CloudWatch alarms on %s...', AWS_SQS_QUEUE)
            try:
                notification = self.sqs.read(wait_time_seconds=20)
            except boto.exception.SQSError as e:
                LOG.warning('Could not read from queue: %s', e)
                time.sleep(20)
                continue
            if notification:
                cloudwatchAlert = self.parse_notification(notification)
                try:
                    self.api.send(cloudwatchAlert)
                except Exception as e:
                    LOG.warning('Failed to send alert: %s', e)
                # Remove the message so it is not processed twice.
                self.sqs.delete_message(notification)
            LOG.debug('Send heartbeat...')
            try:
                self.api.send(Heartbeat(tags=[__version__]))
            except Exception as e:
                LOG.warning('Failed to send heartbeat: %s', e)

    def parse_notification(self, notification):
        """Turn an SNS-wrapped CloudWatch alarm into an Alert (None if no Trigger)."""
        notification = json.loads(notification.get_body())
        alarm = json.loads(notification['Message'])
        if 'Trigger' not in alarm:
            return
        trigger = alarm['Trigger']
        first_dim = trigger['Dimensions'][0]
        account = alarm['AWSAccountId']
        return Alert(
            resource='%s:%s' % (first_dim['name'], first_dim['value']),
            event=alarm['AlarmName'],
            correlate=list(),
            group='CloudWatch',
            value=trigger['MetricName'],
            severity=self.cw_state_to_severity(alarm['NewStateValue']),
            environment='Production',
            service=[AWS_ACCOUNT_ID.get(account, 'AWSAccountId:' + account)],
            text=alarm['AlarmDescription'],
            event_type='cloudwatchAlarm',
            tags=[trigger['Namespace']],
            attributes={
                'awsMessageId': notification['MessageId'],
                'awsRegion': alarm['Region'],
                'thresholdInfo': alarm['NewStateReason']
            },
            origin=notification['TopicArn'],
            timeout=None,
            create_time=datetime.datetime.strptime(notification['Timestamp'],
                                                   '%Y-%m-%dT%H:%M:%S.%fZ'),
            raw_data=notification['Message'],
        )

    @staticmethod
    def cw_state_to_severity(state):
        """Map a CloudWatch alarm state string to an alerta severity."""
        if state == 'ALARM':
            return 'major'
        if state == 'INSUFFICIENT_DATA':
            return 'warning'
        if state == 'OK':
            return 'normal'
        return 'unknown'
get_ipython().system(u' cd $ALERTA_TEST_DIR && ./miniconda2/bin/alerta --endpoint-url "http://localhost:8090" delete')

# ### Same Thing, Python style

# In[ ]:

from alerta.api import ApiClient
from alerta.alert import Alert

# Send a minor test alert against a local alerta instance.
api = ApiClient(endpoint='http://localhost:8090')
alert = Alert(
    resource='localhost',
    event='VolUnavailable',
    service=['Filesystem'],
    environment='Production',
    value='ERROR',
    severity='minor',
)
res = api.send(alert)

# ## Custom Alerts

# ### Remember, you can do amazing stuff…

# In[ ]:

import utils

utils.volume_is_mounted('/Volumes/Intenso64')

# In[ ]:

utils.internet_available()
class AlertCommand(object):
    """Implements the alerta CLI sub-commands.

    Each public method maps to one sub-command and receives the parsed
    argparse namespace as ``args``.  Methods print human-readable (or JSON)
    output to stdout and call ``sys.exit(1)`` on any API failure.
    """

    def __init__(self):
        # Default client; replaced via set() once endpoint/key are known.
        self.api = ApiClient()

    def set(self, endpoint, key):
        """Re-bind the API client to an explicit endpoint and API key."""
        self.api = ApiClient(endpoint=endpoint, key=key)

    def send(self, args):
        """'send' sub-command: create an alert from CLI args and submit it."""
        try:
            alert = Alert(
                resource=args.resource,
                event=args.event,
                environment=args.environment,
                severity=args.severity,
                correlate=args.correlate,
                status=args.status,
                service=args.service,
                group=args.group,
                value=args.value,
                text=args.text,
                tags=args.tags,
                # CLI attributes arrive as 'key=value' strings.
                attributes=dict([attrib.split('=') for attrib in args.attributes]),
                origin=args.origin,
                event_type=args.event_type,
                timeout=args.timeout,
                raw_data=args.raw_data
            )
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        try:
            response = self.api.send(alert)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        if response['status'] == 'ok':
            # Summarise what happened server-side: duplicate count for a
            # repeat, severity transition otherwise.
            if not 'alert' in response:
                info = response['message']
            elif response['alert']['repeat']:
                info = "%s duplicates" % response['alert']['duplicateCount']
            else:
                info = "%s -> %s" % (response['alert']['previousSeverity'], response['alert']['severity'])
            print("{} ({})".format(response['id'], info))
        else:
            LOG.error(response['message'])
            sys.exit(1)

    def heartbeat(self, args):
        """'heartbeat' sub-command: submit a heartbeat and print its id."""
        try:
            heartbeat = Heartbeat(
                origin=args.origin,
                tags=args.tags,
                timeout=args.timeout
            )
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        try:
            response = self.api.send(heartbeat)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        if response['status'] == 'ok':
            print(response['id'])
        else:
            LOG.error(response['message'])
            sys.exit(1)

    def query(self, args, from_date=None):
        """'query' sub-command: list matching alerts, one table row each.

        Returns the server-supplied 'lastTime' so watch() can poll
        incrementally.  With --output json, dumps raw alerts and exits.
        """
        response = self._alerts(args.filters, from_date)
        alerts = response['alerts']
        if args.output == "json":
            print(json.dumps(alerts, indent=4))
            sys.exit(0)
        # Server returns newest-first; reverse so newest prints last.
        for alert in reversed(alerts):
            a = AlertDocument.parse_alert(alert)
            line_color = ''
            end_color = _ENDC
            if args.color:
                line_color = _COLOR_MAP.get(a.severity, _COLOR_MAP['unknown'])
            print(line_color + '{0}|{1}|{2}|{3:5d}|{4}|{5:<5s}|{6:<10s}|{7:<18s}|{8:12s}|{9:16s}|{10:12s}'.format(
                a.id[0:8],
                a.get_date('last_receive_time', 'local', args.timezone),
                a.severity,
                a.duplicate_count,
                a.customer or "-",
                a.environment,
                ','.join(a.service),
                a.resource,
                a.group,
                a.event,
                a.value) + end_color)
            print(line_color + ' |{}'.format(a.text) + end_color)
            if args.details:
                # Verbose per-alert breakdown, one field per line.
                print(line_color + ' severity | {} -> {}'.format(a.previous_severity, a.severity) + end_color)
                print(line_color + ' trend | {}'.format(a.trend_indication) + end_color)
                print(line_color + ' status | {}'.format(a.status) + end_color)
                print(line_color + ' resource | {}'.format(a.resource) + end_color)
                print(line_color + ' group | {}'.format(a.group) + end_color)
                print(line_color + ' event | {}'.format(a.event) + end_color)
                print(line_color + ' value | {}'.format(a.value) + end_color)
                print(line_color + ' tags | {}'.format(' '.join(a.tags)) + end_color)
                for key, value in a.attributes.items():
                    print(line_color + ' {} | {}'.format(key.ljust(10), value) + end_color)
                latency = a.receive_time - a.create_time
                print(line_color + ' time created | {}'.format(a.get_date('create_time', 'iso', args.timezone)) + end_color)
                print(line_color + ' time received | {}'.format(a.get_date('receive_time', 'iso', args.timezone)) + end_color)
                print(line_color + ' last received | {}'.format(a.get_date('last_receive_time', 'iso', args.timezone)) + end_color)
                # NOTE(review): timedelta.microseconds is only the sub-second
                # component, so latencies >= 1s print wrongly; consider
                # latency.total_seconds() * 1000 — confirm intent.
                print(line_color + ' latency | {}ms'.format((latency.microseconds / 1000)) + end_color)
                print(line_color + ' timeout | {}s'.format(a.timeout) + end_color)
                print(line_color + ' alert id | {}'.format(a.id) + end_color)
                print(line_color + ' last recv id | {}'.format(a.last_receive_id) + end_color)
                print(line_color + ' customer | {}'.format(a.customer) + end_color)
                print(line_color + ' environment | {}'.format(a.environment) + end_color)
                print(line_color + ' service | {}'.format(','.join(a.service)) + end_color)
                print(line_color + ' resource | {}'.format(a.resource) + end_color)
                print(line_color + ' type | {}'.format(a.event_type) + end_color)
                print(line_color + ' repeat | {}'.format(a.repeat) + end_color)
                print(line_color + ' origin | {}'.format(a.origin) + end_color)
                print(line_color + ' correlate | {}'.format(','.join(a.correlate)) + end_color)
        return response.get('lastTime', '')

    def watch(self, args):
        """'watch' sub-command: re-run query() every 2s from the last time."""
        from_date = None
        while True:
            from_date = self.query(args, from_date)
            try:
                time.sleep(2)
            except (KeyboardInterrupt, SystemExit):
                sys.exit(0)

    def top(self, args):
        """'top' sub-command: run the curses Screen until interrupted."""
        screen = Screen(endpoint=args.endpoint, key=args.key)
        try:
            screen.run()
        except RuntimeError as e:
            # Restore the terminal before reporting the error.
            screen._reset()
            print(e)
            sys.exit(1)
        except (KeyboardInterrupt, SystemExit):
            screen.w.running = False
            screen._reset()
            print('Exiting...')
            sys.exit(0)

    def raw(self, args):
        """'raw' sub-command: print the rawData of each matching alert."""
        response = self._alerts(args.filters)
        alerts = response['alerts']
        if args.output == "json":
            print(json.dumps(alerts, indent=4))
            sys.exit(0)
        for alert in reversed(alerts):
            line_color = ''
            end_color = _ENDC
            print(line_color + '%s' % alert['rawData'] + end_color)

    def history(self, args):
        """'history' sub-command: print severity/status change records."""
        response = self._history(args.filters)
        history = response['history']
        if args.output == "json":
            print(json.dumps(history, indent=4))
            sys.exit(0)
        for hist in history:
            line_color = ''
            end_color = _ENDC
            # NOTE(review): strptime(None, ...) raises if 'updateTime' is
            # absent, despite the .get() default — confirm the key is
            # always present in history records.
            update_time = datetime.strptime(hist.get('updateTime', None), '%Y-%m-%dT%H:%M:%S.%fZ')
            if 'severity' in hist:
                # Severity-change record.
                if args.color:
                    line_color = _COLOR_MAP.get(hist['severity'], _COLOR_MAP['unknown'])
                print(line_color + '%s|%s|%s|%s|%-5s|%-10s|%-18s|%s|%s|%s|%s' % (
                    hist['id'][0:8],
                    update_time.strftime('%Y/%m/%d %H:%M:%S'),
                    hist['severity'],
                    hist['customer'],
                    hist['environment'],
                    ','.join(hist['service']),
                    hist['resource'],
                    hist['group'],
                    hist['event'],
                    hist['value'],
                    hist['text']
                ) + end_color)
            if 'status' in hist:
                # Status-change record (no value, so print 'n/a').
                print(line_color + '%s|%s|%s|%s|%-5s|%-10s|%-18s|%s|%s|%s|%s' % (
                    hist['id'][0:8],
                    update_time.strftime('%Y/%m/%d %H:%M:%S'),
                    hist['status'],
                    hist['customer'],
                    hist['environment'],
                    ','.join(hist['service']),
                    hist['resource'],
                    hist['group'],
                    hist['event'],
                    'n/a',
                    hist['text']
                ) + end_color)

    def tag(self, args):
        """'tag' sub-command: add tags to every matching alert.

        Writes an in-place percentage progress indicator using backspaces.
        """
        sys.stdout.write("Counting alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("Tagging alerts: ")
        for i, alert in enumerate(alerts):
            pct = int(100.0 * i / total)
            sys.stdout.write("%3d%% (%d/%d)" % (pct, i, total))
            sys.stdout.flush()
            # Backspace over what was just written so the next update
            # overwrites it in place.
            sys.stdout.write("\b" * (8 + len(str(i)) + len(str(total))))
            try:
                self.api.tag_alert(alert['id'], args.tags)
            except Exception as e:
                print()
                LOG.error(e)
                sys.exit(1)
        sys.stdout.write("100%% (%d/%d), done.\n" % (total, total))

    def untag(self, args):
        """'untag' sub-command: remove tags from every matching alert."""
        sys.stdout.write("Counting alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("Un-tagging alerts: ")
        for i, alert in enumerate(alerts):
            pct = int(100.0 * i / total)
            sys.stdout.write("%3d%% (%d/%d)" % (pct, i, total))
            sys.stdout.flush()
            sys.stdout.write("\b" * (8 + len(str(i)) + len(str(total))))
            try:
                self.api.untag_alert(alert['id'], args.tags)
            except Exception as e:
                print()
                LOG.error(e)
                sys.exit(1)
        sys.stdout.write("100%% (%d/%d), done.\n" % (total, total))

    def ack(self, args):
        """'ack' sub-command: acknowledge every matching alert."""
        sys.stdout.write("Counting alerts: ")
        response = self._counts(args.filters)
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("Acking alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        for i, alert in enumerate(alerts):
            pct = int(100.0 * i / total)
            sys.stdout.write("%3d%% (%d/%d)" % (pct, i, total))
            sys.stdout.flush()
            sys.stdout.write("\b" * (8 + len(str(i)) + len(str(total))))
            try:
                self.api.ack_alert(alert['id'])
            except Exception as e:
                print()
                LOG.error(e)
                sys.exit(1)
        sys.stdout.write("100%% (%d/%d), done.\n" % (total, total))

    def unack(self, args):
        """'unack' sub-command: un-acknowledge every matching alert."""
        sys.stdout.write("Counting alerts: ")
        response = self._counts(args.filters)
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("un-Acking alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        for i, alert in enumerate(alerts):
            pct = int(100.0 * i / total)
            sys.stdout.write("%3d%% (%d/%d)" % (pct, i, total))
            sys.stdout.flush()
            sys.stdout.write("\b" * (8 + len(str(i)) + len(str(total))))
            try:
                self.api.unack_alert(alert['id'])
            except Exception as e:
                print()
                LOG.error(e)
                sys.exit(1)
        sys.stdout.write("100%% (%d/%d), done.\n" % (total, total))

    def close(self, args):
        """'close' sub-command: close every matching alert."""
        sys.stdout.write("Counting alerts: ")
        response = self._counts(args.filters)
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("Closing alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        for i, alert in enumerate(alerts):
            pct = int(100.0 * i / total)
            sys.stdout.write("%3d%% (%d/%d)" % (pct, i, total))
            sys.stdout.flush()
            sys.stdout.write("\b" * (8 + len(str(i)) + len(str(total))))
            try:
                self.api.close_alert(alert['id'])
            except Exception as e:
                print()
                LOG.error(e)
                sys.exit(1)
        sys.stdout.write("100%% (%d/%d), done.\n" % (total, total))

    def delete(self, args):
        """'delete' sub-command: delete every matching alert."""
        sys.stdout.write("Counting alerts: ")
        response = self._counts(args.filters)
        total = response['total']
        sys.stdout.write("%s, done.\n" % total)
        sys.stdout.write("Deleting alerts: ")
        response = self._alerts(args.filters)
        alerts = response['alerts']
        for i, alert in enumerate(alerts):
            pct = int(100.0 * i / total)
            sys.stdout.write("%3d%% (%d/%d)" % (pct, i, total))
            sys.stdout.flush()
            sys.stdout.write("\b" * (8 + len(str(i)) + len(str(total))))
            try:
                self.api.delete_alert(alert['id'])
            except Exception as e:
                print()
                LOG.error(e)
                sys.exit(1)
        sys.stdout.write("100%% (%d/%d), done.\n" % (total, total))

    def status(self, args):
        """'status' sub-command: print server metrics as a table."""
        response = self._status()
        metrics = response['metrics']
        print('{:<28} {:<8} {:<26} {:10} {}'.format('METRIC', 'TYPE', 'NAME', 'VALUE', 'AVG'))
        for metric in [m for m in metrics if m['type'] in ['gauge', 'counter', 'timer']]:
            if metric['type'] == 'gauge':
                print('{0:<28} {1:<8} {2:<26} {3:<10}'.format(metric['title'], metric['type'], metric['group'] + '.' + metric['name'], metric['value']))
            else:
                # Counters/timers: show count plus mean time per event.
                value = metric.get('count', 0)
                avg = int(metric['totalTime']) * 1.0 / int(metric['count'])
                print('{0:<28} {1:<8} {2:<26} {3:<10} {4:-3.2f} ms'.format(metric['title'], metric['type'], metric['group'] + '.' + metric['name'], value, avg))
        for metric in [m for m in metrics if m['type'] == 'text']:
            print('{0:<28} {1:<8} {2:<26} {3:<10}'.format(metric['title'], metric['type'], metric['group'] + '.' + metric['name'], metric['value']))

    def heartbeats(self, args):
        """'heartbeats' sub-command: list heartbeats; optionally raise
        HeartbeatFail/Slow/OK alerts with --alert."""
        response = self._heartbeats()
        heartbeats = response['heartbeats']
        print('{:<28} {:<26} {:<19} {:>8} {:7} {}'.format('ORIGIN', 'TAGS', 'CREATED', 'LATENCY', 'TIMEOUT', 'SINCE'))
        for heartbeat in heartbeats:
            hb = HeartbeatDocument.parse_heartbeat(heartbeat)
            # NOTE(review): .microseconds is only the sub-second component,
            # so latency is wrong for deltas >= 1s; likely should be
            # total_seconds() * 1000 — confirm.
            latency = (hb.receive_time - hb.create_time).microseconds / 1000
            since = datetime.utcnow() - hb.receive_time
            # Drop the microsecond component for clean display.
            since = since - timedelta(microseconds=since.microseconds)
            latency_exceeded = latency > MAX_LATENCY
            # NOTE(review): .seconds ignores whole days, so a heartbeat
            # stale by >24h can appear fresh — confirm intent.
            timeout_exceeded = since.seconds > hb.timeout
            print('{:<28} {:<26} {} {}{:6}ms {:6}s {}{}'.format(
                hb.origin,
                ' '.join(hb.tags),
                hb.get_date('create_time', 'local', args.timezone),
                '*' if latency_exceeded else ' ',
                latency,
                hb.timeout,
                '*' if timeout_exceeded else ' ',
                since
            ))
            if args.alert:
                # Raise a correlated alert reflecting heartbeat health.
                if timeout_exceeded:
                    alert = Alert(
                        resource=hb.origin,
                        event='HeartbeatFail',
                        correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
                        group='System',
                        environment='Production',
                        service=['Alerta'],
                        severity='major',
                        value='{}'.format(since),
                        text='Heartbeat not received in {} seconds'.format(hb.timeout),
                        tags=hb.tags,
                        type='heartbeatAlert'
                    )
                elif latency_exceeded:
                    alert = Alert(
                        resource=hb.origin,
                        event='HeartbeatSlow',
                        correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
                        group='System',
                        environment='Production',
                        service=['Alerta'],
                        severity='major',
                        value='{}ms'.format(latency),
                        text='Heartbeat took more than {}ms to be processed'.format(MAX_LATENCY),
                        tags=hb.tags,
                        type='heartbeatAlert'
                    )
                else:
                    alert = Alert(
                        resource=hb.origin,
                        event='HeartbeatOK',
                        correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
                        group='System',
                        environment='Production',
                        service=['Alerta'],
                        severity='normal',
                        value='',
                        text='Heartbeat OK',
                        tags=hb.tags,
                        type='heartbeatAlert'
                    )
                # NOTE(review): self.send() expects an argparse namespace,
                # not an Alert; it re-reads .resource/.attributes etc. off
                # this object — verify this round-trip is intended (perhaps
                # self.api.send(alert) was meant).
                self.send(alert)

    def blackout(self, args):
        """'blackout' sub-command: create a suppression window."""
        # Server expects fractional seconds in the ISO timestamp.
        if '.' not in args.start:
            args.start = args.start.replace('Z', '.000Z')
        try:
            blackout = {
                "environment": args.environment,
                "resource": args.resource,
                "service": args.service,
                "event": args.event,
                "group": args.group,
                "tags": args.tags,
                "startTime": args.start,
                "duration": args.duration
            }
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        try:
            response = self.api.blackout_alerts(blackout)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        if response['status'] == 'ok':
            print(response['blackout'])
        else:
            LOG.error(response['message'])
            sys.exit(1)

    def blackouts(self, args):
        """'blackouts' sub-command: list blackouts; --purge deletes
        expired ones as it goes."""
        response = self.api.get_blackouts()
        blackouts = response['blackouts']
        print('{:<8} {:<16} {:<16} {:<16} {:<16} {:16} {:16} {:24} {:8} {:19} {}'.format('ID', 'CUSTOMER', 'ENVIRONMENT', 'SERVICE', 'RESOURCE', 'EVENT', 'GROUP', 'TAGS', 'STATUS', 'START', 'DURATION'))
        for blackout in blackouts:
            start_time = datetime.strptime(blackout['startTime'], '%Y-%m-%dT%H:%M:%S.%fZ')
            tz = pytz.timezone(args.timezone)
            if args.purge and blackout['status'] == 'expired':
                response = self.api.delete_blackout(blackout['id'])
                if response['status'] == 'ok':
                    blackout['status'] = 'deleted'
                else:
                    blackout['status'] = 'error'
            print('{:<8} {:<16} {:<16} {:16} {:16} {:16} {:16} {:24} {:8} {} {}s'.format(
                blackout['id'][:8],
                blackout.get('customer', '*'),
                blackout.get('environment', '*'),
                ','.join(blackout.get('service', '*')),
                blackout.get('resource', '*'),
                blackout.get('event', '*'),
                blackout.get('group', '*'),
                ' '.join(blackout.get('tags', '*')),
                blackout['status'],
                # Render the UTC start time in the requested timezone.
                start_time.replace(tzinfo=pytz.UTC).astimezone(tz).strftime('%Y/%m/%d %H:%M:%S'),
                blackout['duration']
            ))

    @staticmethod
    def _build(filters, from_date=None, to_date=None):
        """Build the query-parameter list from CLI filters and date range."""
        if filters:
            query = [tuple(x.split('=', 1)) for x in filters if '=' in x]
        else:
            query = list()
        if from_date:
            query.append(('from-date', from_date))
        if to_date:
            query.append(('to-date', to_date))
        # Default sort order unless the caller already supplied one.
        if 'sort-by' not in query:
            query.append(('sort-by', 'lastReceiveTime'))
        return query

    def _alerts(self, filters, from_date=None, to_date=None):
        """GET alerts matching the filters; exit(1) on any failure."""
        query = self._build(filters, from_date, to_date)
        try:
            response = self.api.get_alerts(query)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        if response['status'] == "error":
            LOG.error(response['message'])
            sys.exit(1)
        return response

    def _counts(self, filters, from_date=None, to_date=None):
        """GET alert counts matching the filters; exit(1) on failure."""
        query = self._build(filters, from_date, to_date)
        try:
            response = self.api.get_counts(query)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        if response['status'] == "error":
            LOG.error(response['message'])
            sys.exit(1)
        return response

    def _history(self, filters, from_date=None, to_date=None):
        """GET alert history matching the filters; exit(1) on failure."""
        query = self._build(filters, from_date, to_date)
        try:
            response = self.api.get_history(query)
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        if response['status'] == "error":
            LOG.error(response['message'])
            sys.exit(1)
        return response

    def _heartbeats(self):
        """GET all heartbeats; exit(1) on failure."""
        try:
            response = self.api.get_heartbeats()
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        if response['status'] == "error":
            LOG.error(response['message'])
            sys.exit(1)
        return response

    def _status(self):
        """GET server status/metrics; exit(1) on connection failure."""
        try:
            response = self.api.get_status()
        except Exception as e:
            LOG.error(e)
            sys.exit(1)
        return response

    def help(self, args):
        # Help output is handled by argparse; nothing to do here.
        pass

    def uptime(self, args):
        """'uptime' sub-command: print server time and uptime (days hh:mm)."""
        response = self._status()
        # Server reports epoch millis; uptime is also in millis.
        now = datetime.fromtimestamp(int(response['time']) / 1000.0)
        # Add the uptime to a year-1 epoch so day/hour/minute fall out of
        # the resulting datetime (d.day - 1 = whole days of uptime).
        d = datetime(1, 1, 1) + timedelta(seconds=int(response['uptime']) / 1000.0)
        print('{0} up {1} days {2:02d}:{3:02d}'.format(
            now.strftime('%H:%M'),
            d.day - 1,
            d.hour,
            d.minute
        ))

    def version(self, args):
        """'version' sub-command: print server, client and requests versions."""
        response = self._status()
        print('{0} {1}'.format(
            response['application'],
            response['version'],
        ))
        print('alerta client {0}'.format(__version__))
        print('requests {0}'.format(requests.__version__))