def main():
    """Command-line entry point: trigger, acknowledge, or resolve a
    PagerDuty incident.

    Exit codes: 1 for missing/invalid input, 2 for a PagerDuty API error.
    """
    parser = build_opt_parser()
    (options, args) = parser.parse_args()
    if not args:
        parser.error(
            "must specify an action: trigger, acknowledge, or resolve")
    if not options.service_key:
        parser.error("service key is required")
    action = args[0]
    # BUGFIX: reject unknown actions up front with a usage message instead
    # of crashing later with AttributeError on getattr(pg, action).
    if action not in ("trigger", "acknowledge", "resolve"):
        parser.error(
            "unknown action %r: must be trigger, acknowledge, or resolve"
            % action)
    description = options.description
    if action == "trigger":
        # No description (or the conventional "-") means: read it from stdin.
        if description in (None, "-"):
            description = sys.stdin.read()
        if not description:
            sys.stderr.write("Action trigger requires a description\n")
            sys.exit(1)
    else:  # acknowledge / resolve
        if not options.incident_key:
            sys.stderr.write("Action %s requires an incident key\n" % action)
            sys.exit(1)
        if description == "-":
            description = sys.stdin.read()
    pg = PagerDuty(options.service_key)
    try:
        ik = getattr(pg, action)(
            description=description,
            incident_key=options.incident_key,
            details=options.details,
        )
    except PagerDutyException as exc:
        sys.stderr.write(str(exc) + "\n")
        sys.exit(2)
def get(self, token, prefix):
    """Cron endpoint: scan all alerts, evaluate each one's saved search
    against Loggly, and push state changes to Pusher / PagerDuty.

    token  -- shared secret; request is rejected unless it equals
              config.CRON_PASSWORD.
    prefix -- forwarded to AlertManager.get_all_alerts_systemwide to
              restrict which alerts this cron shard processes.
    """
    if token != config.CRON_PASSWORD:
        self.write('nope')
        return
    # App Engine cron jobs don't always run exactly on the minute,
    # so make sure all times are evenly divisible by 60.
    run_time = (int(math.floor(time.time())) // 60) * 60
    alerts = AlertManager.get_all_alerts_systemwide(prefix=prefix)
    for alert in alerts:
        if not alert.active:
            continue
        # TODO what if this times out?
        # Re-evaluate when never run, currently critical, or the
        # threshold window has elapsed since the last run.
        if alert.last_run == 0 or alert.state == 'C' or \
                (run_time - alert.last_run) >= alert.threshold_time_secs:
            # this is a blanket try/catch so misconfigured endpoints,
            # etc. don't impact other alerts.
            try:
                user = UserManager.get_user(alert.email)
                oauth_client = lib.oauth.Client(user.subdomain)
                # BUGFIX: distinct name so the OAuth token doesn't clobber
                # the `token` parameter (the cron password guard above).
                oauth_token = oauth_client.generate_token(
                    user.access_token_key, user.access_token_secret)
                # to create a dummy saved search, POST to
                # http://davidlanstein.frontend-david1.office.loggly.net/api/savedsearches/create
                # with this data:
                # name=foo&context={"search_type":"search", "terms":"ivan tam", "from":"NOW-1DAY", "until":"NOW", "inputs":["logglyapp","logglyweb"], "order":"desc", "buckets": null, "highlighting":true, "rows":20, "start":0, "page":0, "command_string":null}
                saved_searches = SavedSearchManager.get_all_saved_searches(
                    user)
                found = False
                for saved_search in saved_searches:
                    if saved_search.id == alert.saved_search:
                        found = True
                        break
                if not found:
                    # search was deleted, perhaps?
                    # BUGFIX: log alert.saved_search -- the loop variable
                    # `saved_search` is whichever search happened to come
                    # last (and unbound if the user has no searches).
                    logging.warn({
                        'module': 'controllers.cron',
                        'message': 'Alert with id \'%s\' is associated with saved search \'%s\', which no longer exists.'
                        % (unicode(alert.key()), alert.saved_search)})
                    continue
                qs = Loggly.build_search_query_string(
                    saved_search.context, alert.threshold_time_secs)
                try:
                    search_result = json.loads(oauth_client.make_request(
                        oauth_token,
                        'http://%s.%s/api/facets/date?%s'
                        % (user.subdomain, config.LOGGLY_DOMAIN, qs),
                        'GET'))
                except Exception:
                    # input name in saved search doesn't exist anymore, etc.
                    logging.error({
                        'module': 'controllers.cron',
                        'traceback': traceback.format_exc()})
                    continue
                if alert.threshold_operator == 'gt':
                    fire_alert = search_result[
                        'numFound'] > alert.threshold_count
                elif alert.threshold_operator == 'lt':
                    fire_alert = search_result[
                        'numFound'] < alert.threshold_count
                else:
                    fire_alert = search_result[
                        'numFound'] == alert.threshold_count
                # NOTE(review): the notification code below runs on every
                # evaluation (not only on state transitions) so transient
                # PagerDuty failures are retried next minute -- consistent
                # with the last_run comment at the bottom; confirm against
                # the original indentation.
                if not fire_alert:
                    if alert.state == 'C':
                        alert.state = 'N'
                        alert.last_state_change = run_time
                    alert_json = {
                        'sound': alert.sound,
                        'description': tornado.escape.xhtml_escape(
                            alert.description),
                        'name': tornado.escape.xhtml_escape(alert.name),
                        'state': alert.state,
                        'key': unicode(alert.key()),
                        'muted': alert.muted,
                        'last_state_change': alert.last_state_change}
                    alert_channel = hashlib.md5(
                        'alertbirds' + alert.subdomain).hexdigest()
                    pusher_client = pusher.Pusher(
                        app_id=config.PUSHER_APP_ID,
                        key=config.PUSHER_KEY,
                        secret=config.PUSHER_SECRET)
                    result = pusher_client[alert_channel].trigger(
                        'chirp', data=alert_json)
                    if alert.endpoint:
                        endpoint = EndpointManager.get_endpoint(
                            alert.endpoint, alert.email)
                        pagerduty = PagerDuty(endpoint.service_key)
                        pagerduty.resolve(unicode(alert.key()))
                else:
                    if alert.state == 'N':
                        alert.state = 'C'
                        alert.last_state_change = run_time
                    logging.warn({
                        'module': 'controllers.cron',
                        'message': 'Alert with id \'%s\' is in a critical state.'
                        % unicode(alert.key())})
                    alert_json = {
                        'sound': alert.sound,
                        'description': tornado.escape.xhtml_escape(
                            alert.description),
                        'name': tornado.escape.xhtml_escape(alert.name),
                        'state': alert.state,
                        'key': unicode(alert.key()),
                        'muted': alert.muted,
                        'last_state_change': alert.last_state_change}
                    alert_channel = hashlib.md5(
                        'alertbirds' + alert.subdomain).hexdigest()
                    pusher_client = pusher.Pusher(
                        app_id=config.PUSHER_APP_ID,
                        key=config.PUSHER_KEY,
                        secret=config.PUSHER_SECRET)
                    result = pusher_client[alert_channel].trigger(
                        'chirp', data=alert_json)
                    if alert.endpoint:
                        endpoint = EndpointManager.get_endpoint(
                            alert.endpoint, alert.email)
                        pagerduty = PagerDuty(endpoint.service_key)
                        pagerduty.trigger(endpoint.alert_text,
                                          unicode(alert.key()),
                                          alert.description)
                # if pagerduty is experiencing an outage, still re-run next
                # minute; that's why we set last_run at the bottom.
                alert.last_run = run_time
                alert.put()
            except Exception:
                # endpoint misconfigured, who knows what else. don't
                # impact other users.
                logging.error({
                    'module': 'controllers.cron',
                    'traceback': traceback.format_exc()})
def resolve(self, alert):
    """Send a RESOLVE for `alert` through this endpoint's provider."""
    if self.provider == 'pd':
        # PagerDuty incidents are keyed by the alert's datastore key.
        PagerDuty(self.service_key).resolve(unicode(alert.key()))
    elif self.provider == 'xmpp':
        self.xmpp_send('RESOLVE')
def trigger(self, alert):
    """Fire `alert` through this endpoint's provider."""
    if self.provider == 'pd':
        client = PagerDuty(self.service_key)
        client.trigger(self.alert_text, unicode(alert.key()),
                       alert.description)
    elif self.provider == 'xmpp':
        self.xmpp_send('TRIGGER')
def get(self, token, prefix):
    """Cron endpoint: scan all alerts, evaluate each one's saved search
    against Loggly, and push state changes to Pusher / PagerDuty.

    token  -- shared secret; request is rejected unless it equals
              config.CRON_PASSWORD.
    prefix -- forwarded to AlertManager.get_all_alerts_systemwide to
              restrict which alerts this cron shard processes.
    """
    if token != config.CRON_PASSWORD:
        self.write('nope')
        return
    # App Engine cron jobs don't always run exactly on the minute,
    # so make sure all times are evenly divisible by 60.
    run_time = (int(math.floor(time.time())) // 60) * 60
    alerts = AlertManager.get_all_alerts_systemwide(prefix=prefix)
    for alert in alerts:
        if not alert.active:
            continue
        # TODO what if this times out?
        # Re-evaluate when never run, currently critical, or the
        # threshold window has elapsed since the last run.
        if alert.last_run == 0 or alert.state == 'C' or \
                (run_time - alert.last_run) >= alert.threshold_time_secs:
            # this is a blanket try/catch so misconfigured endpoints,
            # etc. don't impact other alerts.
            try:
                user = UserManager.get_user(alert.email)
                oauth_client = lib.oauth.Client(user.subdomain)
                # BUGFIX: distinct name so the OAuth token doesn't clobber
                # the `token` parameter (the cron password guard above).
                oauth_token = oauth_client.generate_token(
                    user.access_token_key, user.access_token_secret)
                # to create a dummy saved search, POST to
                # http://davidlanstein.frontend-david1.office.loggly.net/api/savedsearches/create
                # with this data:
                # name=foo&context={"search_type":"search", "terms":"ivan tam", "from":"NOW-1DAY", "until":"NOW", "inputs":["logglyapp","logglyweb"], "order":"desc", "buckets": null, "highlighting":true, "rows":20, "start":0, "page":0, "command_string":null}
                saved_searches = SavedSearchManager.get_all_saved_searches(user)
                found = False
                for saved_search in saved_searches:
                    if saved_search.id == alert.saved_search:
                        found = True
                        break
                if not found:
                    # search was deleted, perhaps?
                    # BUGFIX: log alert.saved_search -- the loop variable
                    # `saved_search` is whichever search happened to come
                    # last (and unbound if the user has no searches).
                    logging.warn({'module': 'controllers.cron',
                                  'message': 'Alert with id \'%s\' is associated with saved search \'%s\', which no longer exists.'
                                  % (unicode(alert.key()), alert.saved_search)})
                    continue
                qs = Loggly.build_search_query_string(saved_search.context,
                                                      alert.threshold_time_secs)
                try:
                    search_result = json.loads(oauth_client.make_request(
                        oauth_token,
                        'http://%s.%s/api/facets/date?%s'
                        % (user.subdomain, config.LOGGLY_DOMAIN, qs),
                        'GET'))
                except Exception:
                    # input name in saved search doesn't exist anymore, etc.
                    logging.error({'module': 'controllers.cron',
                                   'traceback': traceback.format_exc()})
                    continue
                if alert.threshold_operator == 'gt':
                    fire_alert = search_result['numFound'] > alert.threshold_count
                elif alert.threshold_operator == 'lt':
                    fire_alert = search_result['numFound'] < alert.threshold_count
                else:
                    fire_alert = search_result['numFound'] == alert.threshold_count
                # NOTE(review): the notification code below runs on every
                # evaluation (not only on state transitions) so transient
                # PagerDuty failures are retried next minute -- consistent
                # with the last_run comment at the bottom; confirm against
                # the original indentation.
                if not fire_alert:
                    if alert.state == 'C':
                        alert.state = 'N'
                        alert.last_state_change = run_time
                    alert_json = {'sound': alert.sound,
                                  'description': tornado.escape.xhtml_escape(alert.description),
                                  'name': tornado.escape.xhtml_escape(alert.name),
                                  'state': alert.state,
                                  'key': unicode(alert.key()),
                                  'muted': alert.muted,
                                  'last_state_change': alert.last_state_change}
                    alert_channel = hashlib.md5('alertbirds' + alert.subdomain).hexdigest()
                    pusher_client = pusher.Pusher(app_id=config.PUSHER_APP_ID,
                                                  key=config.PUSHER_KEY,
                                                  secret=config.PUSHER_SECRET)
                    result = pusher_client[alert_channel].trigger('chirp', data=alert_json)
                    if alert.endpoint:
                        endpoint = EndpointManager.get_endpoint(alert.endpoint, alert.email)
                        pagerduty = PagerDuty(endpoint.service_key)
                        pagerduty.resolve(unicode(alert.key()))
                else:
                    if alert.state == 'N':
                        alert.state = 'C'
                        alert.last_state_change = run_time
                    logging.warn({'module': 'controllers.cron',
                                  'message': 'Alert with id \'%s\' is in a critical state.'
                                  % unicode(alert.key())})
                    alert_json = {'sound': alert.sound,
                                  'description': tornado.escape.xhtml_escape(alert.description),
                                  'name': tornado.escape.xhtml_escape(alert.name),
                                  'state': alert.state,
                                  'key': unicode(alert.key()),
                                  'muted': alert.muted,
                                  'last_state_change': alert.last_state_change}
                    alert_channel = hashlib.md5('alertbirds' + alert.subdomain).hexdigest()
                    pusher_client = pusher.Pusher(app_id=config.PUSHER_APP_ID,
                                                  key=config.PUSHER_KEY,
                                                  secret=config.PUSHER_SECRET)
                    result = pusher_client[alert_channel].trigger('chirp', data=alert_json)
                    if alert.endpoint:
                        endpoint = EndpointManager.get_endpoint(alert.endpoint, alert.email)
                        pagerduty = PagerDuty(endpoint.service_key)
                        pagerduty.trigger(endpoint.alert_text,
                                          unicode(alert.key()),
                                          alert.description)
                # if pagerduty is experiencing an outage, still re-run next
                # minute; that's why we set last_run at the bottom.
                alert.last_run = run_time
                alert.put()
            except Exception:
                # endpoint misconfigured, who knows what else. don't
                # impact other users.
                logging.error({'module': 'controllers.cron',
                               'traceback': traceback.format_exc()})
def run():
    '''Worker runner: configure logging and notifiers, then poll all
    configured alerts roughly once a minute, forever.'''
    global notifier_proxy, settings
    args = get_args_from_cli()
    alerts, settings = get_config(args.config)

    # Logging defaults: only fill in what the config file left unset.
    if 'log_level' not in settings:
        settings['log_level'] = logging.WARNING
    else:
        settings['log_level'] = settings['log_level'].upper()
    if 'log_format' not in settings:
        settings['log_format'] = '%(asctime)s %(name)s %(levelname)s %(message)s'
    if 'log_datefmt' not in settings:
        settings['log_datefmt'] = '%Y-%m-%d %H:%M:%S'
    logging.basicConfig(filename=settings.get('log_file', None),
                        level=settings['log_level'],
                        format=settings['log_format'],
                        datefmt=settings['log_datefmt'])
    log.info('graphite-alerts started')
    log.debug('Command line arguments:')
    log.debug(args)

    log.debug('Initializing redis at %s', args.redisurl)
    STORAGE = RedisStorage(redis, args.redisurl)
    notifier_proxy.add_notifier(LogNotifier(STORAGE))
    notifier_proxy.add_notifier(ConsoleNotifier(STORAGE))

    # Command-line flags override config-file values.
    settings['graphite_url'] = args.graphite_url or settings['graphite_url']
    if settings['graphite_url'].endswith('/'):
        settings['graphite_url'] = settings['graphite_url'][:-1]
    settings['pagerduty_key'] = args.pagerduty_key or settings['pagerduty_key']
    log.debug('graphite_url: %s', settings['graphite_url'])
    log.debug('pagerduty_key: %s', settings['pagerduty_key'])

    if settings['pagerduty_key']:
        pagerduty_client = PagerDuty(settings['pagerduty_key'])
        notifier_proxy.add_notifier(
            PagerdutyNotifier(pagerduty_client, STORAGE))

    if args.hipchat_key:
        hipchat = HipchatNotifier(HipChat(args.hipchat_key), STORAGE)
        hipchat.add_room(settings['hipchat_room'])
        notifier_proxy.add_notifier(hipchat)

    while True:
        start_time = time.time()
        # NOTE(review): assigned but never read in this function -- if
        # check_for_alert() relies on a *global* of the same name, this
        # local has no effect; verify against that function.
        seen_alert_targets = set()
        for alert in alerts:
            check_for_alert(alert)
        remove_old_seen_alerts()
        # what if cron should trigger us ?
        elapsed = time.time() - start_time
        sleep_for = 60 - elapsed
        # BUGFIX: time.sleep() rejects negative values, so a pass that
        # took longer than a minute used to crash the worker; now we just
        # start the next pass immediately.
        if sleep_for > 0:
            log.info('Sleeping for %s seconds at %s',
                     sleep_for, datetime.utcnow())
            time.sleep(sleep_for)
import requests import requests.exceptions from alerts import get_alerts from graphite_data_record import GraphiteDataRecord from graphite_target import get_records from hipchat_notifier import HipchatNotifier from level import Level from notifier_proxy import NotifierProxy from pagerduty_notifier import PagerdutyNotifier from redis_storage import RedisStorage STORAGE = RedisStorage(redis, os.getenv('REDISTOGO_URL')) pg_key = os.getenv('PAGERDUTY_KEY') pagerduty_client = PagerDuty(pg_key) GRAPHITE_URL = os.getenv('GRAPHITE_URL') notifier_proxy = NotifierProxy() notifier_proxy.add_notifier(PagerdutyNotifier(pagerduty_client, STORAGE)) if 'HIPCHAT_KEY' in os.environ: hipchat = HipchatNotifier(HipChat(os.getenv('HIPCHAT_KEY')), STORAGE) hipchat.add_room(os.getenv('HIPCHAT_ROOM')) notifier_proxy.add_notifier(hipchat) ALERT_TEMPLATE = r"""{{level}} alert for {{alert.name}} {{record.target}}. The current value is {{current_value}} which passes the {{threshold_level|lower}} value of {{threshold_value}}. Go to {{graph_url}}. {% if docs_url %}Documentation: {{docs_url}}{% endif %}.