Exemple #1
0
    def run():
        """Start Alert Ganglia: configure logging, claim the pid file, connect to the broker.

        Exits with status 1 when the pid file names a process that can still be
        signalled. A broker connection failure is logged, not raised.
        """
        global conn

        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s alert-ganglia[%(process)d] %(levelname)s - %(message)s",
                            filename=LOGFILE)
        logging.info('Starting up Alert Ganglia version %s', __version__)

        # Write pid file if not already running
        if os.path.isfile(PIDFILE):
            with open(PIDFILE) as pidfile:
                pid = pidfile.read().strip()
            try:
                # Signal 0 delivers nothing; it only checks that the pid can be signalled.
                os.kill(int(pid), 0)
            except (OSError, ValueError):
                # OSError: the recorded pid could not be signalled (stale pid file).
                # ValueError: the pid file is empty or does not hold a number.
                pass
            else:
                logging.error('Process with pid %s already exists, exiting', pid)
                sys.exit(1)
        with open(PIDFILE, 'w') as pidfile:
            pidfile.write(str(os.getpid()))

        # Connect to message broker
        logging.info('Connect to broker')
        try:
            conn = stomp.Connection(
                BROKER_LIST,
                reconnect_sleep_increase=5.0,
                reconnect_sleep_max=120.0,
                reconnect_attempts_max=20
            )
            conn.set_listener('', MessageHandler())
            conn.start()
            conn.connect(wait=True)
        except Exception as e:
            logging.error('Stomp connection error: %s', e)
Exemple #2
0
def get_metrics(filter):
    """Request the metrics selected by ``filter`` from the Ganglia endpoint on API_SERVER.

    ``filter`` is appended verbatim as the URL query string. When the request
    raises ``urllib2.URLError`` the failure is logged and an empty dict is
    returned.
    """
    url = "http://%s/ganglia/app/v1/metrics?%s" % (API_SERVER, filter)
    logging.info('Metric request %s', url)

    try:
        r = urllib2.urlopen(url, None, REQUEST_TIMEOUT)
    except urllib2.URLError as err:
        logging.error('Could not retrieve metric data from %s - %s', url, err)
        return {}
Exemple #3
0
def init_rules():
    """Load the alert rules from RULESFILE.

    Returns:
        The parsed YAML content of RULESFILE, or an empty list when the file
        cannot be read or parsed, or when it is empty.
    """
    rules = list()

    logging.info('Loading rules...')
    try:
        with open(RULESFILE) as rules_file:
            # NOTE(review): yaml.load can construct arbitrary Python objects;
            # prefer yaml.safe_load unless the rules file relies on custom tags.
            loaded = yaml.load(rules_file)
    except Exception as e:
        logging.error('Failed to load alert rules: %s', e)
        return rules

    # An empty YAML document parses to None; keep returning a list in that case.
    if loaded is not None:
        rules = loaded
    return rules
Exemple #4
0
def init_config():
    """Load the alert configuration from CONFIGFILE.

    A read or parse failure is logged and otherwise ignored.
    """
    global owners, USERNAME, PASSWORD

    logging.info("Loading config.")

    try:
        with open(CONFIGFILE) as config_file:
            # NOTE(review): yaml.load can construct arbitrary Python objects;
            # prefer yaml.safe_load unless the config relies on custom tags.
            config = yaml.load(config_file)
    except Exception as e:
        logging.error("Failed to load alert config: %s", e)
Exemple #5
0
def init_tokens():
    """Give every owner INITIAL_TOKENS "sms" tokens and INITIAL_TOKENS "email" tokens.

    Entries are stored in ``tokens`` under ``(owner, channel)`` keys. Any
    failure is logged and swallowed.
    """
    global tokens

    try:
        for owner in owners:
            for channel in ("sms", "email"):
                tokens[owner, channel] = INITIAL_TOKENS
    except Exception as err:
        logging.error("Failed to initialize tokens %s", err)
Exemple #6
0
def send_notify(alertid):
    """Send an SMS or e-mail for every ``sms:`` / ``email:`` tag on an alert.

    Each notification spends one token from ``tokens[who, channel]``. When the
    recipient has no token left for that channel the notification is skipped
    and an error is logged. Any exception raised while notifying is logged and
    swallowed.
    """
    global tokens, hold

    # Pre-set so the except handler can always format its log line, even when
    # the failure happens before these values are known.
    message = None
    receive_id = None

    try:
        details = alert[alertid]
        receive_id = details.get("lastReceiveId")

        for tag in details["tags"]:
            if tag.startswith("sms:"):
                channel = "sms"
            elif tag.startswith("email:"):
                channel = "email"
            else:
                continue

            who = who_to_notify(tag)
            message = details["summary"]

            # Check and spend the token under the lock so concurrent callers
            # cannot both take the last one; ``with`` also releases on error.
            with _Lock:
                remaining = tokens[who, channel]
                granted = remaining > 0
                if granted:
                    remaining -= 1
                    tokens[who, channel] = remaining

            if channel == "sms":
                if granted:
                    logging.debug("Taken a sms token from %s, there are only %d left", who, remaining)
                    sms_notify(alertid, USERNAME, PASSWORD, owners[who]["mobile"])
                else:
                    logging.error("%s run out of sms tokens. Failed to notify %s.", who, receive_id)
            else:
                if granted:
                    logging.debug("Taken a email token from %s, there are only %d left", who, remaining)
                    email_notify(alertid, owners[who]["email"])
                else:
                    logging.error("%s run out of email tokens. Failed to notify %s.", who, receive_id)

    except Exception as e:
        logging.error('Notify sending failed for "%s" - %s - %s', receive_id, message, e)
Exemple #7
0
def send_heartbeat():
    """Publish a heartbeat message for this program to ALERT_QUEUE.

    The message carries a fresh UUID, a UTC creation time with millisecond
    precision, the origin (``__program__``/hostname) and ``__version__``.
    A send failure is logged, not raised.
    """
    global conn

    heartbeat_id = str(uuid.uuid4())  # random UUID
    created = datetime.datetime.utcnow()

    # Whole-second ISO timestamp followed by the milliseconds and a "Z" suffix.
    whole_seconds = created.replace(microsecond=0).isoformat()
    create_time = "%s.%03dZ" % (whole_seconds, created.microsecond // 1000)

    headers = {
        'type': "heartbeat",
        'correlation-id': heartbeat_id,
    }

    heartbeat = {
        'id': heartbeat_id,
        'type': "heartbeat",
        'createTime': create_time,
        'origin': "%s/%s" % (__program__, os.uname()[1]),
        'version': __version__,
    }

    try:
        conn.send(json.dumps(heartbeat), headers, destination=ALERT_QUEUE)
        host_and_port = conn.get_host_and_port()
        logging.info('%s : Heartbeat sent to %s:%s', heartbeat_id, host_and_port[0], str(host_and_port[1]))
    except Exception as err:
        logging.error('Failed to send heartbeat to broker %s', err)
Exemple #8
0
 def on_error(self, headers, body):
     """Log ``body`` at error level; ``headers`` is not used."""
     logging.error('Received an error %s', body)
Exemple #9
0
def _html_escape(value):
    """Render ``value`` as ``%s`` would, then escape ``&``, ``<``, ``>`` and ``"`` for HTML."""
    from xml.sax.saxutils import escape

    return escape("%s" % (value,), {'"': "&quot;"})


def _alert_fields(details, local_time):
    """Return the (label, value) rows shared by the text and HTML bodies, in display order."""
    fields = [
        ("Create Time", local_time.strftime("%Y/%m/%d %H:%M:%S")),
        ("Resource", details["resource"]),
        ("Environment", ",".join(details["environment"])),
        ("Service", ",".join(details["service"])),
        ("Event Name", details["event"]),
        ("Event Group", details["group"]),
        ("Event Value", details["value"]),
        ("Severity", "%s -> %s" % (details["previousSeverity"], details["severity"])),
        ("Status", details["status"]),
        ("Text", details["text"]),
    ]
    # Optional rows, shown only when the alert carries them.
    for key, label in (("thresholdInfo", "Threshold Info"), ("duplicateCount", "Duplicate Count")):
        if key in details:
            fields.append((label, details[key]))
    return fields


def _email_text_body(details, fields, graphs, generated_at):
    """Build the plain-text alternative of the alert e-mail."""
    parts = [
        "[%s] %s\n" % (details["status"], details["summary"]),
        "Alert Details\n",
        "Alert ID: %s\n" % (details["id"],),
    ]
    parts.extend("%s: %s\n" % (label, value) for label, value in fields)
    if "moreInfo" in details:
        parts.append("More Info: %s\n" % (details["moreInfo"],))
    parts.append("Historical Data\n")
    parts.extend("%s\n" % (g,) for g in graphs)
    parts.append("Raw Alert\n")
    parts.append("%s\n" % (json.dumps(details),))
    parts.append("Generated by %s on %s at %s\n" % ("alert-notify.py", os.uname()[1], generated_at))
    return "".join(parts)


def _email_html_body(details, fields, graph_cids, generated_at):
    """Build the HTML alternative of the alert e-mail; alert values are HTML-escaped."""
    esc = _html_escape
    heading = (
        '<tr><td><p align="left" style="font-size:18px;line-height:22px;color:#c25130;font-weight:bold;">%s</p>\n'
    )

    parts = [
        '<p><table border="0" cellpadding="0" cellspacing="0" width="100%">\n',  # table used to center email
        '<tr><td bgcolor="#ffffff" align="center">\n',
        '<table border="0" cellpadding="0" cellspacing="0" width="700">\n',  # table used to set width of email
        '<tr><td bgcolor="#425470"><p align="center" style="font-size:24px;color:#d9fffd;font-weight:bold;"><strong>[%s] %s</strong></p>\n'
        % (esc(details["status"]), esc(details["summary"])),
        heading % "Alert Details",
        "<table>\n",
        '<tr><td><b>Alert ID:</b></td><td><a href="%s/alerta/details.php?id=%s" target="_blank">%s</a></td></tr>\n'
        % (API_SERVER, esc(details["id"]), esc(details["id"])),
    ]
    parts.extend(
        "<tr><td><b>%s:</b></td><td>%s</td></tr>\n" % (label, esc(value)) for label, value in fields
    )
    if "moreInfo" in details:
        parts.append(
            '<tr><td><b>More Info:</b></td><td><a href="%s">ganglia</a></td></tr>\n' % (esc(details["moreInfo"]),)
        )
    parts.append("</table>\n")
    parts.append("</td></tr>\n")
    parts.append(heading % "Historical Data")
    # Each image refers to an attachment by its Content-ID.
    parts.extend('<tr><td><img src="cid:' + cid + '"></td></tr>\n' for cid in graph_cids)
    parts.append(heading % "Raw Alert")
    parts.append(
        '<tr><td><p align="left" style="font-family: \'Courier New\', Courier, monospace">%s</p></td></tr>\n'
        % (esc(json.dumps(details)),)
    )
    # NOTE(review): the plain-text footer names "alert-notify.py" while this one
    # names "alert-mailer.py"; both are kept as found - confirm which is intended.
    parts.append(
        "<tr><td>Generated by %s on %s at %s</td></tr>\n" % ("alert-mailer.py", os.uname()[1], generated_at)
    )
    parts.append("</table>")
    parts.append("</td></tr></table>")
    parts.append("</td></tr></table>")
    return "".join(parts)


def email_notify(alertid, email):
    """E-mail the details of ``alert[alertid]`` to ``email``.

    The message is multipart/related: a plain-text and an HTML alternative,
    plus one inline image per URL listed under the alert's "graphs" key.
    A graph that cannot be downloaded or attached is logged and skipped.
    ``smtplib.SMTPException`` raised while sending is logged, not raised.

    Args:
        alertid: key into the module-level ``alert`` mapping.
        email: recipient address, used for the "To" header and the envelope.

    Raises:
        KeyError: when the alert lacks one of the mandatory fields.
    """
    from contextlib import closing

    details = alert[alertid]
    recipient = email
    receive_id = details["lastReceiveId"]

    # createTime is parsed as UTC and shown in the configured TIMEZONE.
    created = datetime.datetime.strptime(details["createTime"], "%Y-%m-%dT%H:%M:%S.%fZ")
    local_time = created.replace(tzinfo=pytz.utc).astimezone(pytz.timezone(TIMEZONE))

    graphs = details.get("graphs") or []
    graph_cid = dict((g, str(uuid.uuid4())) for g in graphs)
    fields = _alert_fields(details, local_time)
    generated_at = datetime.datetime.now().strftime("%a %d %b %H:%M:%S")

    text = _email_text_body(details, fields, graphs, generated_at)
    logging.debug("Raw Text: %s", text)

    html = _email_html_body(details, fields, [graph_cid[g] for g in graphs], generated_at)
    logging.debug("HTML Text %s", html)

    msg_root = MIMEMultipart("related")
    msg_root["Subject"] = "[%s] %s" % (details["status"], details["summary"])
    msg_root["From"] = ALERTER_MAIL
    msg_root["To"] = recipient
    msg_root.preamble = "This is a multi-part message in MIME format."

    msg_alt = MIMEMultipart("alternative")
    msg_root.attach(msg_alt)
    msg_alt.attach(MIMEText(text, "plain"))
    msg_alt.attach(MIMEText(html, "html"))

    for g in graphs:
        # Best effort: a missing graph must not prevent the alert from being sent.
        try:
            with closing(urllib2.urlopen(g)) as reply:
                image = reply.read()
            msg_img = MIMEImage(image)
        except Exception as e:
            logging.warning("%s : Could not attach graph %s - %s", receive_id, g, e)
            continue
        logging.debug("graph cid %s", graph_cid[g])
        msg_img.add_header("Content-ID", "<" + graph_cid[g] + ">")
        msg_root.attach(msg_img)

    try:
        logging.info("%s : Send email to %s", receive_id, recipient)
        s = smtplib.SMTP(SMTP_SERVER)
        try:
            s.sendmail(ALERTER_MAIL, recipient, msg_root.as_string())
            s.quit()
        finally:
            # Releases the socket when sendmail or quit raised; harmless after quit().
            s.close()
    except smtplib.SMTPException as e:
        logging.error("%s : Sendmail failed - %s", receive_id, e)
Exemple #10
0
# Module-level mutable lookup tables; both start out empty.
# NOTE(review): nothing visible here fills or reads them - confirm what they are keyed by.
currentState = {}
previousSeverity = {}


def get_metrics(filter):
    """Fetch the metrics selected by ``filter`` from the Ganglia endpoint on API_SERVER.

    ``filter`` is appended verbatim as the URL query string.

    Returns:
        The ``metrics`` value of the JSON response, or an empty dict when the
        request fails, the body cannot be read or parsed, or the response
        reports ``status == 'error'``. Every failure is logged.
    """
    from contextlib import closing

    url = "http://%s/ganglia/app/v1/metrics?%s" % (API_SERVER, filter)
    logging.info('Metric request %s', url)

    try:
        r = urllib2.urlopen(url, None, REQUEST_TIMEOUT)
    except urllib2.URLError as e:
        logging.error('Could not retrieve metric data from %s - %s', url, e)
        return dict()

    # closing() releases the connection on every exit path below.
    with closing(r):
        if r.getcode() is None:
            logging.error('Error during connection or data transfer (timeout=%d)', REQUEST_TIMEOUT)
            return dict()

        try:
            response = json.loads(r.read())['response']
        except (IOError, ValueError, KeyError, TypeError) as e:
            # Read failure, non-JSON body, or JSON without a 'response' member.
            logging.error('Invalid or incomplete metric response from %s - %s', url, e)
            return dict()

    if response['status'] == 'error':
        logging.error('No metrics retreived - %s', response['message'])
        return dict()

    logging.info('Retreived %s matching metrics in %ss', response['total'], response['time'])

    return response['metrics']


class MessageHandler(object):
    """Listener whose ``on_error`` callback logs the error body it is given."""

    def on_error(self, headers, body):
        """Log ``body`` at error level; ``headers`` is not used."""
        logging.error("Received an error %s", body)