def main():
    parser = build_opt_parser()
    (options, args) = parser.parse_args()
    if not args:
        parser.error(
            "must specify an action: trigger, acknowledge, or resolve")
    if not options.service_key:
        parser.error("service key is required")

    action = args[0]

    description = options.description
    if action == "trigger":
        if description in (None, "-"):
            description = sys.stdin.read()
        if not description:
            sys.stderr.write("Action trigger requires a description\n")
            sys.exit(1)
    elif action in ("acknowledge", "resolve"):
        if not options.incident_key:
            sys.stderr.write("Action %s requires an incident key\n" % action)
            sys.exit(1)
        if description == "-":
            description = sys.stdin.read()

    pg = PagerDuty(options.service_key)
    try:
        ik = getattr(pg, action)(
            description=description,
            incident_key=options.incident_key,
            details=options.details,
        )
    except PagerDutyException as exc:
        sys.stderr.write(str(exc) + "\n")
        sys.exit(2)
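
The getattr(pg, action) dispatch above only works because trigger, acknowledge, and resolve share the same keyword signature. Below is a minimal sketch of the equivalent direct calls; the import path and the sample values are assumptions and not part of the original script, only the method names and keywords are taken from main() above.

# A minimal sketch, not part of the original CLI.
from pagerduty import PagerDuty, PagerDutyException  # assumed import path

pg = PagerDuty("example-service-key")  # hypothetical service key
try:
    # the return value (ik in main() above) is the incident key used for follow-ups
    incident_key = pg.trigger(description="disk full on web-01",
                              incident_key=None, details=None)
    pg.acknowledge(description="looking into it",
                   incident_key=incident_key, details=None)
    pg.resolve(description="disk cleaned up",
               incident_key=incident_key, details=None)
except PagerDutyException as exc:
    print(exc)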
Example #2
    def resolve(self, alert):
        if self.provider == 'pd':
            pagerduty = PagerDuty(self.service_key)
            pagerduty.resolve(unicode(alert.key()))
        elif self.provider == 'xmpp':
            self.xmpp_send('RESOLVE')

    def trigger(self, alert):
        if self.provider == 'pd':
            pagerduty = PagerDuty(self.service_key)
            pagerduty.trigger(self.alert_text, unicode(alert.key()), alert.description)
        elif self.provider == 'xmpp':
            self.xmpp_send('TRIGGER')

    def get(self, token, prefix):
        if token != config.CRON_PASSWORD:
            self.write('nope')
            return
        else:
            # App Engine cron jobs don't always run exactly on the minute,
            # so make sure all times are evenly divisible by 60
            run_time = (int(math.floor(time.time())) // 60) * 60
            alerts = AlertManager.get_all_alerts_systemwide(prefix=prefix)
            for alert in alerts:
                if not alert.active:
                    continue

                # TODO what if this times out?
                if alert.last_run == 0 or alert.state == 'C' or (run_time - alert.last_run) >= alert.threshold_time_secs:
                    # this is a blanket try/catch so misconfigured endpoints, etc. don't impact other alerts.
                    try:
                        user = UserManager.get_user(alert.email)
                        oauth_client = lib.oauth.Client(user.subdomain)
                        token = oauth_client.generate_token(user.access_token_key, user.access_token_secret)

                        # to create a dummy saved search, POST to
                        # http://davidlanstein.frontend-david1.office.loggly.net/api/savedsearches/create
                        # with this data:
                        # name=foo&context={"search_type":"search", "terms":"ivan tam", "from":"NOW-1DAY", "until":"NOW", "inputs":["logglyapp","logglyweb"], "order":"desc", "buckets": null, "highlighting":true, "rows":20, "start":0, "page":0, "command_string":null}
                        saved_searches = SavedSearchManager.get_all_saved_searches(user)

                        found = False
                        for saved_search in saved_searches:
                            if saved_search.id == alert.saved_search:
                                found = True
                                break
                        if not found:
                            # search was deleted, perhaps?
                            logging.warn({
                                'module': 'controllers.cron',
                                'message': 'Alert with id \'%s\' is associated with saved search \'%s\', '
                                           'which no longer exists.' % (unicode(alert.key()), alert.saved_search)
                            })
                            continue

                        qs = Loggly.build_search_query_string(saved_search.context, alert.threshold_time_secs)

                        try:
                            search_result = json.loads(oauth_client.make_request(token, 'http://%s.%s/api/facets/date?%s' % \
                                (user.subdomain, config.LOGGLY_DOMAIN, qs), 'GET'))
                        except Exception as e:
                            logging.error({'module': 'controllers.cron', 'traceback': traceback.format_exc()})
                            # input name in saved search doesn't exist anymore, etc.
                            continue

                        if alert.threshold_operator == 'gt':
                            fire_alert = search_result['numFound'] > alert.threshold_count
                        elif alert.threshold_operator == 'lt':
                            fire_alert = search_result['numFound'] < alert.threshold_count
                        else:
                            fire_alert = search_result['numFound'] == alert.threshold_count

                        if not fire_alert:
                            if alert.state == 'C':
                                alert.state = 'N'
                                alert.last_state_change = run_time
                                alert_json = {
                                    'sound': alert.sound,
                                    'description': tornado.escape.xhtml_escape(alert.description),
                                    'name': tornado.escape.xhtml_escape(alert.name),
                                    'state': alert.state,
                                    'key': unicode(alert.key()),
                                    'muted': alert.muted,
                                    'last_state_change': alert.last_state_change
                                }
                                alert_channel = hashlib.md5('alertbirds' + alert.subdomain).hexdigest()
                                pusher_client = pusher.Pusher(app_id=config.PUSHER_APP_ID, key=config.PUSHER_KEY, secret=config.PUSHER_SECRET)
                                result = pusher_client[alert_channel].trigger('chirp', data=alert_json)

                                if alert.endpoint:
                                    endpoint = EndpointManager.get_endpoint(alert.endpoint, alert.email)
                                    pagerduty = PagerDuty(endpoint.service_key)
                                    pagerduty.resolve(unicode(alert.key()))
                        else:
                            if alert.state == 'N':
                                alert.state = 'C'
                                alert.last_state_change = run_time
                            logging.warn({
                                'module': 'controllers.cron',
                                'message': 'Alert with id \'%s\' is in a critical state.' % unicode(alert.key())
                            })
                            alert_json = {
                                'sound': alert.sound,
                                'description': tornado.escape.xhtml_escape(alert.description),
                                'name': tornado.escape.xhtml_escape(alert.name),
                                'state': alert.state,
                                'key': unicode(alert.key()),
                                'muted': alert.muted,
                                'last_state_change': alert.last_state_change
                            }
                            alert_channel = hashlib.md5('alertbirds' + alert.subdomain).hexdigest()
                            pusher_client = pusher.Pusher(app_id=config.PUSHER_APP_ID, key=config.PUSHER_KEY, secret=config.PUSHER_SECRET)
                            result = pusher_client[alert_channel].trigger('chirp', data=alert_json)

                            if alert.endpoint:
                                endpoint = EndpointManager.get_endpoint(alert.endpoint, alert.email)
                                pagerduty = PagerDuty(endpoint.service_key)
                                pagerduty.trigger(endpoint.alert_text, unicode(alert.key()), alert.description)

                        # if pagerduty is experiencing an outage, still re-run next minute
                        # that's why we set last_run at the bottom
                        alert.last_run = run_time
                        alert.put()
        
                    except Exception as e:
                        # endpoint misconfigured, who knows what else.  don't impact other users.
                        logging.error({'module': 'controllers.cron', 'traceback': traceback.format_exc()})
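
Stripped of the Loggly query and the notification plumbing, the handler above is a small two-state machine: while an alert stays critical it re-fires on every run, and it resolves exactly once, on the critical-to-normal transition. A condensed sketch of just that decision logic; the function names and the returned action strings are illustrative and do not appear in the original code.

def evaluate_threshold(operator, threshold_count, num_found):
    # mirrors the gt / lt / default-equality comparison in the handler above
    if operator == 'gt':
        return num_found > threshold_count
    if operator == 'lt':
        return num_found < threshold_count
    return num_found == threshold_count

def next_action(state, fire_alert):
    # state is 'N' (normal) or 'C' (critical), as in the handler above;
    # returns (new_state, action) where action is 'trigger', 'resolve', or None
    if fire_alert:
        return 'C', 'trigger'    # chirps and pages on every run while critical
    if state == 'C':
        return 'N', 'resolve'    # resolves only on the C -> N transition
    return 'N', None             # already normal, nothing to send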
Example #6
def run():
    '''
    Worker runner that checks for alerts.
    '''

    global notifier_proxy, settings
    args = get_args_from_cli()
    alerts, settings = get_config(args.config)

    # setting up logging
    if 'log_level' not in settings:
        settings['log_level'] = logging.WARNING
    else:
        settings['log_level'] = settings['log_level'].upper()

    if 'log_format' not in settings:
        settings['log_format'] = '%(asctime)s %(name)s %(levelname)s %(message)s'

    if 'log_datefmt' not in settings:
        settings['log_datefmt'] = '%Y-%m-%d %H:%M:%S'

    logging.basicConfig(filename=settings.get('log_file', None),
                        level=settings['log_level'],
                        format=settings['log_format'],
                        datefmt=settings['log_datefmt'])

    log.info('graphite-alerts started')
    log.debug('Command line arguments:')
    log.debug(args)

    log.debug('Initializing redis at %s', args.redisurl)
    STORAGE = RedisStorage(redis, args.redisurl)

    notifier_proxy.add_notifier(LogNotifier(STORAGE))
    notifier_proxy.add_notifier(ConsoleNotifier(STORAGE))

    settings['graphite_url'] = args.graphite_url or settings['graphite_url']
    if settings['graphite_url'].endswith('/'):
        settings['graphite_url'] = settings['graphite_url'][:-1]
    settings['pagerduty_key'] = args.pagerduty_key or settings['pagerduty_key']
    log.debug('graphite_url: %s', settings['graphite_url'])
    log.debug('pagerduty_key: %s', settings['pagerduty_key'])

    if settings['pagerduty_key']:
        pagerduty_client = PagerDuty(settings['pagerduty_key'])
        notifier_proxy.add_notifier(
            PagerdutyNotifier(pagerduty_client, STORAGE))

    if args.hipchat_key:
        hipchat = HipchatNotifier(HipChat(args.hipchat_key), STORAGE)
        hipchat.add_room(settings['hipchat_room'])
        notifier_proxy.add_notifier(hipchat)

    while True:
        start_time = time.time()
        seen_alert_targets = set()
        for alert in alerts:
            check_for_alert(alert)

        remove_old_seen_alerts()

        # what if cron should trigger us ?
        time_diff = time.time() - start_time
        sleep_for = 60 - time_diff
        if sleep_for > 0:
            log.info('Sleeping for %s seconds at %s', sleep_for,
                     datetime.utcnow())
            time.sleep(sleep_for)
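
run() pulls graphite_url and pagerduty_key from the settings mapping returned by get_config (command-line flags can override them) and fills in logging defaults for the rest. A hypothetical settings dict covering just the keys the function touches; the real config format lives in get_config and is not shown here.

# Illustrative only: the key names are taken from run() above, the values are made up.
settings = {
    'graphite_url': 'https://graphite.example.com/',  # a trailing slash is stripped
    'pagerduty_key': 'example-service-key',           # a falsy value disables the PagerDuty notifier
    'hipchat_room': 'Ops',                            # only read when a HipChat key is passed on the CLI
    'log_level': 'info',                              # upper-cased before logging.basicConfig
    'log_format': '%(asctime)s %(name)s %(levelname)s %(message)s',
    'log_datefmt': '%Y-%m-%d %H:%M:%S',
    'log_file': '/var/log/graphite-alerts.log',       # omit to log to stderr
}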
Example #7
import os

import redis
import requests
import requests.exceptions

from alerts import get_alerts
from graphite_data_record import GraphiteDataRecord
from graphite_target import get_records
from hipchat_notifier import HipchatNotifier
from level import Level
from notifier_proxy import NotifierProxy
from pagerduty_notifier import PagerdutyNotifier
from redis_storage import RedisStorage

STORAGE = RedisStorage(redis, os.getenv('REDISTOGO_URL'))

pg_key = os.getenv('PAGERDUTY_KEY')
pagerduty_client = PagerDuty(pg_key)

GRAPHITE_URL = os.getenv('GRAPHITE_URL')

notifier_proxy = NotifierProxy()
notifier_proxy.add_notifier(PagerdutyNotifier(pagerduty_client, STORAGE))

if 'HIPCHAT_KEY' in os.environ:
    hipchat = HipchatNotifier(HipChat(os.getenv('HIPCHAT_KEY')), STORAGE)
    hipchat.add_room(os.getenv('HIPCHAT_ROOM'))
    notifier_proxy.add_notifier(hipchat)

ALERT_TEMPLATE = r"""{{level}} alert for {{alert.name}} {{record.target}}.  The
current value is {{current_value}} which passes the {{threshold_level|lower}} value of
{{threshold_value}}. Go to {{graph_url}}.
{% if docs_url %}Documentation: {{docs_url}}{% endif %}.