def check_condition(condition, datapoints):
    """Evaluate a rule condition against fetched datapoints and notify core.

    Applies the condition's operator/aggregate to the datapoint values,
    tracks state transitions (triggered/untriggered), and sends WARNING/OK
    notifications to core, honoring the condition's reminder schedule.

    condition -- rule condition object; mutated and saved on state changes
                 and on successful notifications.
    datapoints -- iterable of (value, timestamp) pairs; timestamps are
                  ignored here, only values are aggregated.
    """
    lbl = "%s:%s [%s]" % (condition.uuid, condition.rule_id, condition)
    # extract value from series and apply operator
    triggered, value = compute(condition.operator, condition.aggregate,
                               [val for val, timestamp in datapoints],
                               condition.value)
    # condition state changed
    if triggered != condition.state:
        condition.state = triggered
        condition.state_since = time()
        # if condition untriggered and no trigger notification previously sent,
        # set level to 1 so that we don't send OK to core (in case condition
        # uses custom reminder list where first notification happens later).
        if not triggered and condition.notification_level == 0:
            condition.notification_level = 1
        else:
            condition.notification_level = 0
        if triggered:
            # if condition just got triggered, issue a new incident_id
            condition.incident_id = uuid.uuid4().hex
        # persist the state transition immediately, before any notification
        condition.save()
    # logs are gooood
    since_str = "always"
    if condition.state_since:
        since_str = tdelta_to_str(time() - condition.state_since)
        # tdelta_to_str returning an empty/falsy string is treated as a
        # sub-resolution delta, i.e. the state changed "just now"
        if since_str:
            since_str += " ago"
        else:
            since_str = "just now"
    msg = "%s is %s since %s (value=%s, level=%d)" % (
        lbl, condition.state, since_str, value, condition.notification_level)
    # notify core if necessary
    reminder_list = condition.reminder_list or config.REMINDER_LIST
    if condition.state and len(reminder_list) > condition.notification_level:
        # triggered and a reminder is still pending at the current level
        duration = time() - condition.state_since
        next_notification = reminder_list[condition.notification_level]
        next_notification += condition.reminder_offset
        if duration < next_notification:
            # reminder not yet due; log and wait for a later run
            log.info(msg)
            return
        try:
            notify_core(condition, value)
        except Exception as exc:
            # don't advance notification level if notification failed
            log.error("%s - FAILED to send WARNING: %r", msg, exc)
            return
        log.info("%s - sent WARNING", msg)
        condition.notification_level += 1
        condition.save()
    elif not condition.state and not condition.notification_level:
        # untriggered and a WARNING was previously sent (level 0): send OK
        try:
            notify_core(condition, value)
        except Exception as exc:
            # don't advance notification level if notification failed
            log.error("%s - FAILED to send OK: %r", msg, exc)
            return
        log.info("%s - sent OK", msg)
        # level 1 marks the OK as delivered so it isn't re-sent
        condition.notification_level = 1
        condition.save()
    else:
        # nothing to notify; just log current state
        log.info(msg)
def check_condition(condition, datapoints):
    """Evaluate a rule condition and push WARNING/OK notifications to core.

    The condition's operator/aggregate is applied to the datapoint values;
    state transitions are persisted, and notifications follow the
    condition's reminder schedule.

    condition -- rule condition object; mutated and saved on transitions
                 and successful notifications.
    datapoints -- iterable of (value, timestamp) pairs; only values are
                  used for evaluation.
    """
    label = "%s:%s [%s]" % (condition.uuid, condition.rule_id, condition)
    # Feed only the values into the operator/aggregate computation.
    series = [v for v, _ in datapoints]
    is_triggered, current = compute(condition.operator, condition.aggregate,
                                    series, condition.value)

    if is_triggered != condition.state:
        # State flipped: record the transition and reset bookkeeping.
        condition.state = is_triggered
        condition.state_since = time()
        if is_triggered:
            condition.notification_level = 0
            # A fresh trigger gets a brand new incident id.
            condition.incident_id = uuid.uuid4().hex
        elif condition.notification_level == 0:
            # Untriggered without a WARNING ever sent: bump the level so no
            # OK is pushed to core (custom reminder lists may delay the
            # first notification past this point).
            condition.notification_level = 1
        else:
            condition.notification_level = 0
        condition.save()

    # Human-readable age of the current state, for the log line.
    age_text = "always"
    if condition.state_since:
        age_text = tdelta_to_str(time() - condition.state_since)
        age_text = age_text + " ago" if age_text else "just now"
    summary = "%s is %s since %s (value=%s, level=%d)" % (
        label, condition.state, age_text, current,
        condition.notification_level)

    reminders = condition.reminder_list or config.REMINDER_LIST
    if condition.state and condition.notification_level < len(reminders):
        # Triggered with a reminder still pending at the current level.
        elapsed = time() - condition.state_since
        due_after = (reminders[condition.notification_level]
                     + condition.reminder_offset)
        if elapsed < due_after:
            # Not due yet; a later run will send it.
            log.info(summary)
            return
        try:
            notify_core(condition, current)
        except Exception as exc:
            # Keep the level unchanged so the WARNING is retried.
            log.error("%s - FAILED to send WARNING: %r", summary, exc)
            return
        log.info("%s - sent WARNING", summary)
        condition.notification_level += 1
        condition.save()
        return

    if not condition.state and not condition.notification_level:
        # Untriggered while a WARNING had been sent: deliver the OK.
        try:
            notify_core(condition, current)
        except Exception as exc:
            # Keep the level unchanged so the OK is retried.
            log.error("%s - FAILED to send OK: %r", summary, exc)
            return
        log.info("%s - sent OK", summary)
        # Level 1 marks the OK as delivered so it isn't re-sent.
        condition.notification_level = 1
        condition.save()
        return

    # Nothing to notify; just log the current state.
    log.info(summary)
def check_machine(machine, rule_id=''):
    """Check all conditions for given machine with a single graphite query.

    If rule_id is specified, only that rule will be checked.

    machine -- machine object providing uuid, rules, get_condition() and
               lock_n_load(); activated state may be mutated and saved.
    rule_id -- optional single rule to check instead of all rules.
    """
    # legacy metric names mapped to their current graphite targets
    old_targets = {
        'cpu': 'cpu.total.nonidle',
        'load': 'load.shortterm',
        'ram': 'memory.nonfree_percent',
        'disk-read': 'disk.total.disk_octets.read',
        'disk-write': 'disk.total.disk_octets.write',
        'network-rx': 'interface.total.if_octets.rx',
        'network-tx': 'interface.total.if_octets.tx',
    }
    handler = MultiHandler(machine.uuid)
    # check if machine activated
    if not machine.activated:
        if handler.check_head():
            log.info("%s just got activated after %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
            with machine.lock_n_load():
                machine.activated = True
                machine.save()
            # Use a distinct loop variable here: reusing `rule_id` would
            # clobber the parameter, making the `rules` selection below
            # check only the last rule right after activation.
            for rid in machine.rules:
                condition = machine.get_condition(rid)
                # grace period before freshly activated rules start firing
                condition.active_after = time() + 30
                condition.save()
        else:
            log.info("%s not activated since %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
            return

    # gather all conditions, grouped by graphite target
    conditions = {}
    rules = [rule_id] if rule_id else machine.rules
    for rule_id in rules:
        lbl = "%s/%s" % (machine.uuid, rule_id)
        try:
            condition = machine.get_condition(rule_id)
        except ConditionNotFoundError:
            log.warning("%s condition not found, probably rule just got "
                        "updated, will check on next run", lbl)
            continue
        lbl = "%s [%s]" % (lbl, condition)
        target = old_targets.get(condition.metric, condition.metric)
        if condition.operator not in ('gt', 'lt'):
            log.error("%s unknown operator '%s'", lbl, condition.operator)
            continue
        if not condition.aggregate:
            # default missing aggregate to 'all' and persist the fix
            log.warning("%s setting aggregate to 'all'", lbl)
            condition.aggregate = 'all'
            condition.save()
        if condition.aggregate not in ('all', 'any', 'avg'):
            log.error("%s unknown aggregate '%s'", lbl, condition.aggregate)
            continue
        if condition.active_after > time():
            log.info("%s not yet active", lbl)
            continue
        conditions.setdefault(target, []).append(condition)
    if not conditions:
        log.warning("%s no rules found", machine.uuid)
        return

    # single graphite query for all targets
    try:
        data = handler.get_data(conditions.keys(), start='-90sec')
    except GraphiteError as exc:
        log.warning("%s error fetching stats %r", machine.uuid, exc)
        return

    # check all conditions
    for item in data:
        target = item['_requested_target']
        if target not in conditions:
            log.warning("%s get data returned unexpected target %s",
                        machine.uuid, target)
            continue
        # drop null datapoints before evaluating
        datapoints = [(val, ts) for val, ts in item['datapoints']
                      if val is not None]
        for condition in conditions.pop(target):
            if not datapoints:
                log.warning("%s/%s [%s] no data for rule",
                            machine.uuid, condition.rule_id, condition)
                continue
            check_condition(condition, datapoints)
    # any targets left were not returned by graphite at all
    if conditions:
        for target in conditions:
            for cond in conditions[target]:
                if target == "nodata":
                    # if nodata rule didn't return any datapoints, the whisper
                    # files must be missing, so make the rule true
                    check_condition(cond, [(1, 0)])
                else:
                    log.warning("%s/%s [%s] target not found for rule",
                                machine.uuid, cond.rule_id, cond)
def check_machine(machine, rule_id=""):
    """Check all conditions for given machine with a single graphite query.

    If rule_id is specified, only that rule will be checked.

    machine -- machine object providing uuid, rules, get_condition() and
               lock_n_load(); activated state may be mutated and saved.
    rule_id -- optional single rule to check instead of all rules.
    """
    # legacy metric names mapped to their current graphite targets
    old_targets = {
        "cpu": "cpu.total.nonidle",
        "load": "load.shortterm",
        "ram": "memory.nonfree_percent",
        "disk-read": "disk.total.disk_octets.read",
        "disk-write": "disk.total.disk_octets.write",
        "network-rx": "interface.total.if_octets.rx",
        "network-tx": "interface.total.if_octets.tx",
    }
    handler = MultiHandler(machine.uuid)
    # check if machine activated
    if not machine.activated:
        if handler.check_head():
            log.info("%s just got activated after %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
            with machine.lock_n_load():
                machine.activated = True
                machine.save()
            # NOTE(review): this loop clobbers the `rule_id` parameter and
            # execution falls through to the rule gathering below — after a
            # fresh activation, `rules = [rule_id] if rule_id else ...` may
            # therefore check only the last rule. Confirm whether a distinct
            # loop variable is intended here.
            for rule_id in machine.rules:
                condition = machine.get_condition(rule_id)
                # grace period before freshly activated rules start firing
                condition.active_after = time() + 30
                condition.save()
        else:
            log.info("%s not activated since %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
            return
    # gather all conditions, grouped by graphite target
    conditions = {}
    rules = [rule_id] if rule_id else machine.rules
    for rule_id in rules:
        lbl = "%s/%s" % (machine.uuid, rule_id)
        try:
            condition = machine.get_condition(rule_id)
        except ConditionNotFoundError:
            log.warning("%s condition not found, probably rule just got "
                        "updated, will check on next run", lbl)
            continue
        lbl = "%s [%s]" % (lbl, condition)
        target = old_targets.get(condition.metric, condition.metric)
        # NOTE(review): dead commented-out code below — consider removing.
        ## if "%(head)s." not in target:
        ##     target = "%(head)s." + target
        if condition.operator not in ("gt", "lt"):
            log.error("%s unknown operator '%s'", lbl, condition.operator)
            continue
        if not condition.aggregate:
            # default missing aggregate to 'all' and persist the fix
            log.warning("%s setting aggregate to 'all'", lbl)
            condition.aggregate = "all"
            condition.save()
        if condition.aggregate not in ("all", "any", "avg"):
            log.error("%s unknown aggregate '%s'", lbl, condition.aggregate)
            continue
        if condition.active_after > time():
            log.info("%s not yet active", lbl)
            continue
        if target not in conditions:
            conditions[target] = [condition]
        else:
            conditions[target].append(condition)
    if not conditions:
        log.warning("%s no rules found", machine.uuid)
        return
    # single graphite query for all targets
    try:
        data = handler.get_data(conditions.keys(), start="-90sec")
    except GraphiteError as exc:
        log.warning("%s error fetching stats %r", machine.uuid, exc)
        return
    # check all conditions
    for item in data:
        target = item["_requested_target"]
        if target not in conditions:
            log.warning("%s get data returned unexpected target %s",
                        machine.uuid, target)
            continue
        # drop null datapoints before evaluating
        datapoints = [(val, ts) for val, ts in item["datapoints"]
                      if val is not None]
        for condition in conditions.pop(target):
            if not datapoints:
                log.warning("%s/%s [%s] no data for rule",
                            machine.uuid, condition.rule_id, condition)
                continue
            check_condition(condition, datapoints)
    # any targets left were not returned by graphite at all
    if conditions:
        for target in conditions:
            for cond in conditions[target]:
                if target == "nodata":
                    # if nodata rule didn't return any datapoints, the whisper
                    # files must be missing, so make the rule true
                    check_condition(cond, [(1, 0)])
                else:
                    log.warning("%s/%s [%s] target not found for rule",
                                machine.uuid, cond.rule_id, cond)