def run():
    """Start the daemon: configure logging, claim the pid file, connect to the broker.

    Side effects:
        * Configures the root logger to write to ``LOGFILE``.
        * Exits the process with status 1 when ``PIDFILE`` names a pid that
          ``os.kill(pid, 0)`` can signal.
        * Writes the current pid to ``PIDFILE``.
        * Rebinds the module-level ``conn`` to a new ``stomp.Connection``.

    A failure to connect to the broker is logged and not re-raised.
    """
    global conn

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s alert-ganglia[%(process)d] %(levelname)s - %(message)s",
        filename=LOGFILE
    )
    logging.info('Starting up Alert Ganglia version %s', __version__)

    # Write pid file if not already running
    if os.path.isfile(PIDFILE):
        with open(PIDFILE) as pid_file:
            pid = pid_file.read().strip()
        try:
            # Signal 0 performs the existence check without delivering a signal.
            os.kill(int(pid), 0)
        except (OSError, ValueError):
            # OSError: the pid could not be signalled, so the pid file is stale.
            # ValueError: the pid file is empty or garbled; treat it as stale
            # instead of crashing at start-up.
            # NOTE(review): a process owned by another user also raises OSError
            # (EPERM) and is treated as "not running" here - confirm that is
            # acceptable for this deployment.
            pass
        else:
            logging.error('Process with pid %s already exists, exiting', pid)
            sys.exit(1)

    with open(PIDFILE, 'w') as pid_file:
        pid_file.write(str(os.getpid()))

    # Connect to message broker
    logging.info('Connect to broker')
    try:
        conn = stomp.Connection(
            BROKER_LIST,
            reconnect_sleep_increase=5.0,
            reconnect_sleep_max=120.0,
            reconnect_attempts_max=20
        )
        conn.set_listener('', MessageHandler())
        conn.start()
        conn.connect(wait=True)
    except Exception as e:
        logging.error('Stomp connection error: %s', e)
def get_metrics(filter):
    """Request metrics matching ``filter`` from the Ganglia API on ``API_SERVER``.

    ``filter`` is appended verbatim as the query string of the request URL.

    Returns an empty dict when the request raises ``urllib2.URLError``.
    NOTE(review): on success the visible body falls off the end and therefore
    returns None without reading the response - confirm this is intended.
    """
    endpoint = "http://%s/ganglia/app/v1/metrics?%s" % (API_SERVER, filter)
    logging.info('Metric request %s', endpoint)

    try:
        reply = urllib2.urlopen(endpoint, None, REQUEST_TIMEOUT)
    except urllib2.URLError as err:
        logging.error('Could not retrieve metric data from %s - %s', endpoint, err)
        return {}
def init_rules():
    """Load the alert rules from ``RULESFILE``.

    Returns:
        The parsed YAML document. An empty list is returned when the file
        cannot be opened or parsed (the error is logged) or when the document
        is empty.
    """
    rules = list()

    logging.info('Loading rules...')
    try:
        # Close the handle deterministically instead of leaving it to the GC.
        with open(RULESFILE) as rules_file:
            # NOTE(review): yaml.load without a safe Loader can construct
            # arbitrary Python objects. If RULESFILE can be written by anyone
            # less trusted than this process, switch to yaml.safe_load.
            loaded = yaml.load(rules_file)
    except Exception as e:
        logging.error('Failed to load alert rules: %s', e)
    else:
        # An empty YAML document parses to None; keep the empty-list default
        # so callers can always iterate over the result.
        if loaded is not None:
            rules = loaded

    return rules
def init_config():
    """Parse the YAML configuration in ``CONFIGFILE``.

    Any failure to open or parse the file is logged and swallowed.

    NOTE(review): ``owners``, ``USERNAME`` and ``PASSWORD`` are declared global
    but the visible body never assigns them, and the parsed ``config`` is not
    used - confirm whether the assignments are missing.
    """
    global owners, USERNAME, PASSWORD

    logging.info("Loading config.")
    try:
        # Close the handle deterministically instead of leaving it to the GC.
        with open(CONFIGFILE) as config_file:
            # NOTE(review): yaml.load without a safe Loader can construct
            # arbitrary Python objects; prefer yaml.safe_load if CONFIGFILE is
            # not fully trusted.
            config = yaml.load(config_file)
    except Exception as e:
        logging.error("Failed to load alert config: %s", e)
def init_tokens():
    """Reset every owner's notification allowance to ``INITIAL_TOKENS``.

    For each key of the module-level ``owners`` the ``tokens`` mapping gets
    two entries, keyed ``(owner, "sms")`` and ``(owner, "email")``.  Any
    exception raised while doing so is logged and swallowed.
    """
    global tokens

    try:
        for name in owners:
            for channel in ("sms", "email"):
                tokens[name, channel] = INITIAL_TOKENS
    except Exception as err:
        logging.error("Failed to initialize tokens %s", err)
def send_notify(alertid):
    """Notify the owners named in an alert's tags, spending one token per message.

    Every tag of ``alert[alertid]`` that starts with ``"sms:"`` or ``"email:"``
    is resolved to an owner via ``who_to_notify``.  If that owner still has a
    token for the channel, one is taken from ``tokens`` and the notification is
    sent with ``sms_notify`` / ``email_notify``; otherwise an error is logged
    and the tag is skipped.  Other tags are ignored.

    Args:
        alertid: key of the alert in the module-level ``alert`` mapping.

    Any exception is logged and swallowed, so one failing notification does not
    propagate to the caller.
    """
    # Bound up front so the error handler can log it even when the failure
    # happens before the first tag has been processed.
    message = None
    try:
        for tag in alert[alertid]["tags"]:
            if tag.startswith("sms:"):
                channel = "sms"
            elif tag.startswith("email:"):
                channel = "email"
            else:
                continue

            who = who_to_notify(tag)
            message = alert[alertid]["summary"]

            # Check and spend the token under the lock so two threads cannot
            # both observe the last token, and always release the lock even if
            # the lookup raises.
            _Lock.acquire()
            try:
                remaining = tokens[who, channel]
                has_token = remaining > 0
                if has_token:
                    remaining -= 1
                    tokens[who, channel] = remaining
            finally:
                _Lock.release()

            # Report exhaustion only for the channel this tag asked for.
            if not has_token:
                logging.error(
                    "%s run out of %s tokens. Failed to notify %s.",
                    who, channel, alert[alertid]["lastReceiveId"]
                )
                continue

            # Log the count of the channel that was actually charged.
            logging.debug("Taken a %s token from %s, there are only %d left", channel, who, remaining)
            if channel == "sms":
                sms_notify(alertid, USERNAME, PASSWORD, owners[who]["mobile"])
            else:
                email_notify(alertid, owners[who]["email"])
    except Exception as e:
        # The alert lookup may itself be what failed, so fetch the receive id
        # defensively rather than raising from inside the handler.
        try:
            receive_id = alert[alertid]["lastReceiveId"]
        except Exception:
            receive_id = alertid
        logging.error('Notify sending failed for "%s" - %s - %s', receive_id, message, e)
def send_heartbeat():
    """Publish a heartbeat message to ``ALERT_QUEUE`` on the module-level ``conn``.

    The JSON payload carries a fresh random UUID, the current UTC time with
    millisecond precision, the origin (``__program__`` and the host name) and
    ``__version__``.  The same UUID is sent as the ``correlation-id`` header.
    Any exception raised while sending is logged and swallowed.
    """
    global conn

    beat_id = str(uuid.uuid4())  # random UUID
    now = datetime.datetime.utcnow()
    millis = now.microsecond // 1000

    headers = {
        'type': "heartbeat",
        'correlation-id': beat_id,
    }
    heartbeat = {
        'id': beat_id,
        'type': "heartbeat",
        'createTime': now.replace(microsecond=0).isoformat() + ".%03dZ" % millis,
        'origin': "%s/%s" % (__program__, os.uname()[1]),
        'version': __version__,
    }

    try:
        conn.send(json.dumps(heartbeat), headers, destination=ALERT_QUEUE)
        host_and_port = conn.get_host_and_port()
        logging.info('%s : Heartbeat sent to %s:%s', beat_id, host_and_port[0], str(host_and_port[1]))
    except Exception as err:
        logging.error('Failed to send heartbeat to broker %s', err)
def on_error(self, headers, body):
    """Log the body of a received error at ERROR level.

    ``self`` and ``headers`` are accepted but not used.
    """
    logging.error("Received an error %s", body)
def _alert_fields(details, local_stamp):
    """Return the (label, value) rows shared by the text and HTML bodies, in display order."""
    fields = [
        ("Create Time", local_stamp),
        ("Resource", details["resource"]),
        ("Environment", ",".join(details["environment"])),
        ("Service", ",".join(details["service"])),
        ("Event Name", details["event"]),
        ("Event Group", details["group"]),
        ("Event Value", details["value"]),
        ("Severity", "%s -> %s" % (details["previousSeverity"], details["severity"])),
        ("Status", details["status"]),
        ("Text", details["text"]),
    ]
    # Optional attributes are only shown when the alert carries them.
    for label, key in (("Threshold Info", "thresholdInfo"), ("Duplicate Count", "duplicateCount")):
        if key in details:
            fields.append((label, details[key]))
    return fields


def _email_text_body(details, fields, graphs):
    """Render the plain-text alternative of the alert email."""
    parts = [
        "[%s] %s\n" % (details["status"], details["summary"]),
        "Alert Details\n",
        "Alert ID: %s\n" % (details["id"],),
    ]
    parts.extend("%s: %s\n" % field for field in fields)
    if "moreInfo" in details:
        parts.append("More Info: %s\n" % (details["moreInfo"],))
    parts.append("Historical Data\n")
    parts.extend("%s\n" % (g,) for g in graphs)
    parts.append("Raw Alert\n")
    parts.append("%s\n" % (json.dumps(details),))
    parts.append("Generated by %s on %s at %s\n" % (
        "alert-notify.py",
        os.uname()[1],
        datetime.datetime.now().strftime("%a %d %b %H:%M:%S"),
    ))
    return "".join(parts)


def _email_html_body(details, fields, graphs, graph_cid):
    """Render the HTML alternative; each graph becomes an inline ``cid:`` image."""
    # NOTE(review): alert values are interpolated into the markup unescaped,
    # exactly as before - consider escaping them if alerts can carry markup.
    heading = ('<tr><td><p align="left" style="font-size:18px;line-height:22px;'
               'color:#c25130;font-weight:bold;">%s</p>\n')
    row = "<tr><td><b>%s:</b></td><td>%s</td></tr>\n"

    parts = [
        '<p><table border="0" cellpadding="0" cellspacing="0" width="100%">\n',  # table used to center email
        '<tr><td bgcolor="#ffffff" align="center">\n',
        '<table border="0" cellpadding="0" cellspacing="0" width="700">\n',  # table used to set width of email
        '<tr><td bgcolor="#425470"><p align="center" style="font-size:24px;color:#d9fffd;font-weight:bold;">'
        '<strong>[%s] %s</strong></p>\n' % (details["status"], details["summary"]),
        heading % ("Alert Details",),
        "<table>\n",
        '<tr><td><b>Alert ID:</b></td><td><a href="%s/alerta/details.php?id=%s" target="_blank">%s</a></td></tr>\n'
        % (API_SERVER, details["id"], details["id"]),
    ]
    parts.extend(row % field for field in fields)
    if "moreInfo" in details:
        parts.append(
            '<tr><td><b>More Info:</b></td><td><a href="%s">ganglia</a></td></tr>\n' % (details["moreInfo"],)
        )
    parts.append("</table>\n")
    parts.append("</td></tr>\n")
    parts.append(heading % ("Historical Data",))
    parts.extend('<tr><td><img src="cid:' + graph_cid[g] + '"></td></tr>\n' for g in graphs)
    parts.append(heading % ("Raw Alert",))
    parts.append(
        '<tr><td><p align="left" style="font-family: \'Courier New\', Courier, monospace">%s</p></td></tr>\n'
        % (json.dumps(details),)
    )
    # NOTE(review): the text body signs itself "alert-notify.py" while this one
    # says "alert-mailer.py"; both strings are preserved from the original.
    parts.append("<tr><td>Generated by %s on %s at %s</td></tr>\n" % (
        "alert-mailer.py",
        os.uname()[1],
        datetime.datetime.now().strftime("%a %d %b %H:%M:%S"),
    ))
    parts.append("</table>")
    parts.append("</td></tr></table>")
    parts.append("</td></tr></table>")
    return "".join(parts)


def email_notify(alertid, email):
    """Email a multipart (text + HTML + inline graphs) rendering of an alert.

    Args:
        alertid: key of the alert in the module-level ``alert`` mapping.
        email: recipient, used both as the ``To`` header and as the SMTP
            envelope recipient.

    The alert's ``createTime`` is parsed as UTC and shown in ``TIMEZONE``.
    Each URL in the optional ``graphs`` list is downloaded and attached as an
    inline image; a graph that cannot be fetched or attached is logged and
    skipped.  SMTP and socket errors while sending are logged, not raised.

    Raises:
        KeyError: if the alert or one of its mandatory fields is missing.
        ValueError: if ``createTime`` does not match the expected format.
    """
    details = alert[alertid]

    create_time = datetime.datetime.strptime(details["createTime"], "%Y-%m-%dT%H:%M:%S.%fZ")
    create_time = create_time.replace(tzinfo=pytz.utc)
    local_time = create_time.astimezone(pytz.timezone(TIMEZONE))
    local_stamp = local_time.strftime("%Y/%m/%d %H:%M:%S")

    graphs = details["graphs"] if "graphs" in details else []
    # One Content-ID per graph ties the <img> tag to its attachment.
    graph_cid = dict((g, str(uuid.uuid4())) for g in graphs)
    fields = _alert_fields(details, local_stamp)

    text = _email_text_body(details, fields, graphs)
    logging.debug("Raw Text: %s", text)
    html = _email_html_body(details, fields, graphs, graph_cid)
    logging.debug("HTML Text %s", html)

    msg_root = MIMEMultipart("related")
    msg_root["Subject"] = "[%s] %s" % (details["status"], details["summary"])
    msg_root["From"] = ALERTER_MAIL
    msg_root["To"] = email
    msg_root.preamble = "This is a multi-part message in MIME format."

    msg_alt = MIMEMultipart("alternative")
    msg_root.attach(msg_alt)
    msg_alt.attach(MIMEText(text, "plain"))
    msg_alt.attach(MIMEText(html, "html"))

    for g in graphs:
        try:
            response = urllib2.urlopen(g)
            try:
                image = response.read()
            finally:
                response.close()
            msg_img = MIMEImage(image)
            logging.debug("graph cid %s", graph_cid[g])
            msg_img.add_header("Content-ID", "<" + graph_cid[g] + ">")
            msg_root.attach(msg_img)
        except Exception as e:
            # Graphs are best-effort: send the mail without this one, but leave
            # a trace instead of swallowing the failure silently.
            logging.warning("%s : Could not attach graph %s - %s", details["lastReceiveId"], g, e)

    try:
        logging.info("%s : Send email to %s", details["lastReceiveId"], email)
        s = smtplib.SMTP(SMTP_SERVER)
        try:
            s.sendmail(ALERTER_MAIL, email, msg_root.as_string())
        finally:
            # Close the session even when sendmail raises.
            s.quit()
    except (smtplib.SMTPException, IOError) as e:
        # IOError also covers socket-level failures such as a refused connection.
        logging.error("%s : Sendmail failed - %s", details["lastReceiveId"], e)
# Module-level state; both start empty and are not touched by the code below.
currentState = dict()
previousSeverity = dict()


def get_metrics(filter):
    """Fetch the metrics matching ``filter`` from the Ganglia API on ``API_SERVER``.

    Args:
        filter: query string appended verbatim to the metrics URL.

    Returns:
        The ``metrics`` member of the API response.  An empty dict is returned,
        after logging the reason, when the request fails, the response cannot
        be read or parsed, or the API reports ``status == 'error'``.
    """
    url = "http://%s/ganglia/app/v1/metrics?%s" % (API_SERVER, filter)
    logging.info('Metric request %s', url)

    try:
        r = urllib2.urlopen(url, None, REQUEST_TIMEOUT)
    except urllib2.URLError as e:
        logging.error('Could not retrieve metric data from %s - %s', url, e)
        return dict()

    try:
        if r.getcode() is None:
            logging.error('Error during connection or data transfer (timeout=%d)', REQUEST_TIMEOUT)
            return dict()
        try:
            response = json.loads(r.read())['response']
        except (ValueError, KeyError, TypeError, IOError) as e:
            # A truncated, non-JSON or unexpectedly shaped body is handled like
            # every other failure here: log it and return no metrics.
            logging.error('Could not parse metric data from %s - %s', url, e)
            return dict()
    finally:
        # Release the connection on every path.
        r.close()

    if response['status'] == 'error':
        logging.error('No metrics retreived - %s', response['message'])
        return dict()

    logging.info('Retreived %s matching metrics in %ss', response['total'], response['time'])
    return response['metrics']


class MessageHandler(object):
    """Listener object; the only callback visible here handles error frames."""

    def on_error(self, headers, body):
        """Log the body of a received error; ``headers`` is not used."""
        logging.error('Received an error %s', body)