def api_set_root(self, request, id, root):
    """Attach alarm `id` to alarm `root` as its root cause.

    Returns the not-found response when either alarm cannot be resolved,
    otherwise ``True``.
    """
    alarm = get_alarm(id)
    if not alarm:
        # Without this guard a missing alarm failed with AttributeError on
        # `alarm.set_root` instead of producing a not-found response.
        return self.response_not_found()
    r = get_alarm(root)
    if not r:
        return self.response_not_found()
    alarm.set_root(r)
    return True
def api_clear(self, request, id):
    """Clear an active alarm on behalf of the requesting user.

    Returns the not-found response for an unknown alarm, an error dict when
    the alarm class is not user-clearable, otherwise ``True`` (also when the
    alarm is not active and nothing had to be cleared).
    """
    alarm = get_alarm(id)
    if not alarm:
        # get_alarm() yields a falsy value for unknown ids; answer with
        # not-found rather than failing on `alarm.alarm_class`.
        return self.response_not_found()
    if not alarm.alarm_class.user_clearable:
        return {"status": False, "error": "Deny clear alarm by user"}
    if alarm.status == "A":
        alarm.clear_alarm("Cleared by %s" % request.user)
    return True
def api_escalation_alarm(self, request, id):
    """Start escalation watching for an active alarm.

    Returns the not-found response for an unknown alarm, ``{"status": True}``
    when escalations were triggered, or an error dict when the alarm is not
    active.
    """
    alarm = get_alarm(id)
    if not alarm:
        # Unknown id: avoid AttributeError on `alarm.status`.
        return self.response_not_found()
    if alarm.status == "A":
        AlarmEscalation.watch_escalations(alarm)
        return {"status": True}
    return {"status": False, "error": "The alarm is not active at the moment"}
def wait_tt(alarm_id):
    """Poll the trouble ticket an alarm was escalated to.

    When the ticket is reported as resolved the alarm is force-cleared;
    in every other case (including a failed ticket lookup) the job asks
    to be retried after `config.escalator.wait_tt_check_interval`.
    """
    logger.info("[%s] Checking escalated TT", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
        return
    if alarm.status == "C":
        logger.info("[%s] Alarm is closed, skipping", alarm_id)
        return
    # escalation_tt is stored as "<tt system name>:<tt id>"
    system_name, ticket_id = alarm.escalation_tt.split(":")
    tt_system = TTSystem.get_by_name(system_name)
    if not tt_system:
        logger.error("Unknown TT system: %s", system_name)
        return
    backend = tt_system.get_system()
    ticket = None
    try:
        ticket = backend.get_tt(ticket_id)
    except backend.TTError as e:
        # A failed lookup leaves `ticket` empty, so the retry path is taken
        logger.error("Cannot get TT info: %s", e)
    if not (ticket and ticket["resolved"]):
        Job.retry_after(config.escalator.wait_tt_check_interval, msg="Next check")
        return
    # Close alarm
    alarm.clear_alarm(
        "Closed by TT %s" % alarm.escalation_tt,
        ts=ticket.get("close_ts", datetime.datetime.now()),
        force=True,
    )
def handle_check(self, check_alarms=None, *args, **kwargs):
    """Run `check_alarm` for every alarm id in `check_alarms`.

    Ids that cannot be resolved are reported through `self.print` and skipped.
    """
    check_alarms = check_alarms or []
    for a_id in check_alarms:
        alarm = get_alarm(a_id)
        if alarm:
            self.check_alarm(alarm)
        else:
            # Report the requested id: `alarm` is falsy in this branch, so
            # formatting it never identified which alarm was missing.
            self.print("ERROR: Alarm %s is not found. Skipping" % a_id)
def api_clear(self, request, id, msg=""):
    """Clear an active alarm on behalf of the requesting user.

    `msg` is appended to the clear message. Returns the not-found response
    for an unknown alarm, an error dict when the alarm class is not
    user-clearable, otherwise ``True``.
    """
    alarm = get_alarm(id)
    if not alarm:
        # Unknown id: avoid AttributeError on `alarm.alarm_class`.
        return self.response_not_found()
    if not alarm.alarm_class.user_clearable:
        return {"status": False, "error": "Deny clear alarm by user"}
    if alarm.status == "A":
        alarm.clear_alarm(
            "Cleared by %s: %s" % (request.user, msg),
            source=request.user.username,
        )
    return True
def close_oo_alarm(alarm_id, timestamp, *args, **kwargs):
    """Clear an alarm as out-of-order at `timestamp`.

    Does nothing when the alarm cannot be found or is no longer active.
    """
    logger.info("[close_oo_alarm|%s] Closing alarm", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        # The alarm may have disappeared before this job ran; skip instead
        # of failing on `alarm.status`.
        logger.info("[close_oo_alarm|%s] Missing alarm, skipping", alarm_id)
        return
    if alarm.status != "A":
        logger.info("[close_oo_alarm|%s] Already closed, skipping", alarm_id)
        return
    alarm.clear_alarm(message="Cleared as out-of-order", ts=timestamp)
    metrics["oo_pings_closed"] += 1
def dereference(self, id):
    """Resolve `id` to an alarm the current user may access.

    Superusers get whatever `get_alarm` returns. Other users get the alarm
    only when their domains intersect the alarm's `adm_path`; otherwise the
    "no_such_alarm" error metric is incremented and ``None`` is returned.
    """
    a = get_alarm(id)
    if a is None:
        # Unknown id: previously non-superusers hit AttributeError on
        # `a.adm_path`; superusers already received None here.
        return None
    if self.current_user.is_superuser:
        return a
    if set(self.get_user_domains()) & set(a.adm_path):
        return a
    metrics["error", ("type", "no_such_alarm")] += 1
    return None
def api_unsubscribe(self, request, id):
    """Unsubscribe the requesting user from an active alarm.

    Returns the not-found response for an unknown alarm, an empty list for a
    non-active alarm, otherwise the alarm's remaining subscribers.
    """
    target = get_alarm(id)
    if not target:
        return self.response_not_found()
    if target.status == "A":
        target.unsubscribe(request.user)
        remaining = self.get_alarm_subscribers(target)
        return remaining
    # Only active alarms carry subscriptions to report
    return []
def api_escalation_alarm(self, request, id):
    """Start escalation watching for an active alarm.

    Returns the not-found response for an unknown alarm, ``{'status': True}``
    when escalations were triggered, or an error dict when the alarm is not
    active.
    """
    alarm = get_alarm(id)
    if not alarm:
        # Unknown id: avoid AttributeError on `alarm.status`.
        return self.response_not_found()
    if alarm.status == "A":
        AlarmEscalation.watch_escalations(alarm)
        return {'status': True}
    return {
        'status': False,
        'error': 'The alarm is not active at the moment'
    }
def api_unacknowledge(self, request, id, msg=""):
    """Remove the acknowledgement from an active alarm.

    Unknown and non-active alarms both produce the not-found response.
    An alarm without `ack_ts` yields a failure dict; otherwise the alarm is
    unacknowledged by the requesting user and ``{"status": True}`` returned.
    """
    target = get_alarm(id)
    if not target or target.status != "A":
        return self.response_not_found()
    if target.ack_ts:
        target.unacknowledge(request.user, msg=msg)
        return {"status": True}
    return {"status": False, "message": "Already unacknowledged"}
def api_acknowledge(self, request, id):
    """Acknowledge an active alarm on behalf of the requesting user.

    Unknown and non-active alarms both produce the not-found response.
    An alarm that already has `ack_ts` yields a failure dict naming the
    acknowledging user; otherwise ``{"status": True}`` is returned.
    """
    target = get_alarm(id)
    if not target or target.status != "A":
        return self.response_not_found()
    if not target.ack_ts:
        target.acknowledge(request.user)
        return {"status": True}
    already = "Already acknowledged by %s" % target.ack_user
    return {"status": False, "message": already}
def handle_run(self, run_alarms=None, limit=0, *args, **kwargs):
    """Send the given active alarms to the escalator.

    `limit`, when non-zero, throttles submission by sleeping ``60.0 / limit``
    seconds after each submitted alarm. Cleared and unknown alarms are
    reported through `self.print` and skipped.
    """
    run_alarms = run_alarms or []
    if limit:
        delay = 60.0 / limit
    for a_id in run_alarms:
        alarm = get_alarm(a_id)
        if alarm and alarm.status == "A":
            self.print("Sending alarm %s to escalator" % alarm.id)
            self.run_alarm(alarm)
            if limit:
                time.sleep(delay)
        elif alarm:
            self.print("ERROR: Alarm %s is cleared. Skipping" % alarm)
        else:
            # Report the requested id: `alarm` is falsy in this branch, so
            # formatting it never identified which alarm was missing.
            self.print("ERROR: Alarm %s is not found. Skipping" % a_id)
def check_close_consequence(alarm_id):
    """Detach a still-active alarm from its root and trigger escalations.

    Missing and already closed alarms are only logged.
    """
    logger.info("[%s] Checking close", alarm_id)
    consequence = get_alarm(alarm_id)
    if consequence is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
    elif consequence.status == "C":
        logger.info("[%s] Alarm is closed. Check passed", alarm_id)
    else:
        # Detach root
        logger.info("[%s] Alarm is active. Detaching root", alarm_id)
        consequence.root = None
        consequence.log_message("Detached from root for not recovered", to_save=True)
        metrics["detached_root"] += 1
        # Trigger escalations
        AlarmEscalation.watch_escalations(consequence)
def get_object(cls, id):
    """Build the dict representation of alarm `id`.

    Raises KeyError when the alarm cannot be resolved. The base fields are
    filled here; the `_apply_*` helpers then extend the dict in a fixed order.
    """
    alarm = get_alarm(id)
    if not alarm:
        raise KeyError()
    doc = dict(
        id=str(alarm.id),
        timestamp=cls.qs(alarm.timestamp),
        severity=alarm.severity,
        reopens=alarm.reopens,
    )
    if alarm.root:
        doc["root"] = str(alarm.root)
    if alarm.status == "C":
        doc["clear_timestamp"] = cls.qs(alarm.clear_timestamp)
    # Helpers are looked up one at a time so the call sequence stays the same
    for helper_name in (
        "_apply_managed_object",
        "_apply_alarm_class",
        "_apply_vars",
        "_apply_escalation",
        "_apply_services",
    ):
        getattr(cls, helper_name)(alarm, doc)
    return doc
def handle_close(self, close_alarms=None, *args, **kwargs):
    """Schedule a TT close notification for each escalated active alarm.

    Alarms that are not active or carry no `escalation_tt`, and ids that
    cannot be resolved, are reported through `self.print` and skipped.
    """
    close_alarms = close_alarms or []
    for a_id in close_alarms:
        alarm = get_alarm(a_id)
        if alarm and alarm.status == "A" and alarm.escalation_tt:
            self.print("Sending TT close for alarm %s to escalator" % alarm.id)
            call_later(
                "noc.services.escalator.escalation.notify_close",
                scheduler="escalator",
                pool=alarm.managed_object.escalator_shard,
                alarm_id=alarm.id,
                tt_id=alarm.escalation_tt,
                subject="Closed",
                body="Closed",
                notification_group_id=None,
                close_tt=False,
            )
        elif alarm:
            self.print("ERROR: Alarm %s is not escalated. Skipping" % alarm)
        else:
            # Report the requested id: `alarm` is falsy in this branch, so
            # formatting it never identified which alarm was missing.
            self.print("ERROR: Alarm %s is not found. Skipping" % a_id)
def get_alarms(self):
    """Return the alarm tree around `self.object` as a flat, levelled list.

    The tree is rooted at the topmost resolvable ancestor of `self.object`
    and contains the chain down to `self.object` plus all of its descendants.
    Every returned alarm gets `_level` (depth from the top) and
    `service_summary` attributes set.
    """

    def get_children(ca):
        # Recursively collect alarms (active and archived) rooted at `ca`
        ca._children = []
        for ac in [ActiveAlarm, ArchivedAlarm]:
            for a in ac.objects.filter(root=ca.id):
                ca._children.append(a)
                get_children(a)

    def flatten(ca, r, level):
        # Depth-first walk; children are ordered by timestamp
        ca._level = level
        ca.service_summary = {
            "service": SummaryItem.items_to_dict(ca.direct_services),
            "subscriber": SummaryItem.items_to_dict(ca.direct_subscribers),
        }
        r.append(ca)
        if hasattr(ca, "_children"):
            for c in sorted(ca._children, key=operator.attrgetter("timestamp")):
                flatten(c, r, level + 1)

    # Step upwards
    top = self.object
    while top and top.root:
        parent = get_alarm(top.root)
        if not parent:
            # Root reference cannot be resolved: keep the last resolved
            # alarm as the top. Previously the failed lookup result leaked
            # out of the loop and flatten() was called with None.
            break
        parent._children = [top]
        top = parent
    # Fill children
    get_children(self.object)
    # Flatten
    result = []
    flatten(top, result, 0)
    return result
def api_event(self, request, id):
    """Build the detailed dict representation of event `id`.

    Returns the not-found response when the event cannot be resolved.
    The result starts from `instance_to_dict(event)` and is extended with
    class texts and variables (statuses "A"/"S" only), managed object
    properties, the event log, related alarms and plugin data.
    """
    event = get_event(id)
    if not event:
        return self.response_not_found()
    d = self.instance_to_dict(event)
    # Optional fields default to None so every key is always present
    dd = dict(
        (v, None)
        for v in (
            "body",
            "symptoms",
            "probable_causes",
            "recommended_actions",
            "log",
            "vars",
            "resolved_vars",
            "raw_vars",
        )
    )
    if event.status in ("A", "S"):
        dd["body"] = event.body
        dd["symptoms"] = event.event_class.symptoms
        dd["probable_causes"] = event.event_class.probable_causes
        dd["recommended_actions"] = event.event_class.recommended_actions
        # Fill vars
        # Variables declared by the event class come first, with their
        # descriptions; the remaining ones follow sorted by name.
        left = set(event.vars)
        vars = []
        for ev in event.event_class.vars:
            if ev.name in event.vars:
                vars += [(ev.name, event.vars[ev.name], ev.description)]
                left.remove(ev.name)
        vars += [(v, event.vars[v], None) for v in sorted(left)]
        dd["vars"] = vars
        # Fill resolved vars
        vars = []
        is_trap = event.raw_vars.get("source") == "SNMP Trap"
        for v in sorted(event.resolved_vars):
            desc = None
            # Only trap variables containing "::" get a MIB description
            if is_trap and "::" in v:
                desc = MIB.get_description(v)
            vars += [(v, event.resolved_vars[v], desc)]
        dd["resolved_vars"] = vars
    dd["raw_vars"] = sorted(event.raw_vars.items())
    # Managed object properties
    mo = event.managed_object
    d["managed_object_address"] = mo.address
    d["managed_object_profile"] = mo.profile.name
    d["managed_object_platform"] = mo.platform.name if mo.platform else ""
    d["managed_object_version"] = mo.version.version if mo.version else ""
    d["segment"] = mo.segment.name
    d["segment_id"] = str(mo.segment.id)
    d["tags"] = mo.tags
    # Log
    if event.log:
        dd["log"] = [
            {
                "timestamp": self.to_json(l.timestamp),
                "from_status": l.from_status,
                "to_status": l.to_status,
                "message": l.message,
            }
            for l in event.log
        ]
    # Merge the optional fields collected above into the result
    d.update(dd)
    # Get alarms
    if event.status in ("A", "S"):
        alarms = []
        for a_id in event.alarms:
            a = get_alarm(a_id)
            if not a:
                continue
            # Role of this event for the alarm: "O" when it is the alarm's
            # opening event, "C" when it is the closing event, "" otherwise
            if a.opening_event == event.id:
                role = "O"
            elif a.closing_event == event.id:
                role = "C"
            else:
                role = ""
            alarms += [
                {
                    "id": str(a.id),
                    "status": a.status,
                    "alarm_class": str(a.alarm_class.id),
                    "alarm_class__label": a.alarm_class.name,
                    "subject": a.subject,
                    "role": role,
                    "timestamp": self.to_json(a.timestamp),
                }
            ]
        d["alarms"] = alarms
    # Apply plugins
    if event.status in ("A", "S") and event.event_class.plugins:
        plugins = []
        for p in event.event_class.plugins:
            if p.name in self.plugins:
                plugin = self.plugins[p.name]
                # NOTE: `dd` is reused here for the plugin's data dict
                dd = plugin.get_data(event, p.config)
                # A "plugins" entry is accumulated separately; everything
                # else the plugin returned is merged into the result
                if "plugins" in dd:
                    plugins += dd["plugins"]
                    del dd["plugins"]
                d.update(dd)
        if plugins:
            d["plugins"] = plugins
    elif event.status == "F":
        # Enable traceback plugin for failed events
        d["traceback"] = event.traceback
        d["plugins"] = [("NOC.fm.event.plugins.Traceback", {})]
    return d
def escalate(alarm_id, escalation_id, escalation_delay, *args, **kwargs):
    """Run the escalation steps matching `escalation_delay` for an alarm.

    The alarm is skipped when it is missing, closed, or not a root cause.
    For every step of escalation `escalation_id` whose delay equals
    `escalation_delay` and whose conditions match, a TT may be created,
    escalated consequences may be commented and a notification may be sent.
    After the loop the alarm is re-read to detect a close that happened
    during escalation.

    :param alarm_id: Id of the alarm to escalate
    :param escalation_id: Id of the AlarmEscalation to apply
    :param escalation_delay: Delay selecting which steps of the chain to run
    """

    def log(message, *args):
        # Log to the service logger and to the alarm's own log
        msg = message % args
        logger.info("[%s] %s", alarm_id, msg)
        alarm.log_message(msg, to_save=True)

    def summary_to_list(summary, model):
        # Convert summary items to dicts ordered by
        # (profile display order, descending summary)
        r = []
        for k in summary:
            p = model.get_by_id(k.profile)
            if not p or getattr(p, "show_in_summary", True) is False:
                continue
            r += [{
                "profile": p.name,
                "summary": k.summary,
                "order": (getattr(p, "display_order", 100), -k.summary),
            }]
        return sorted(r, key=operator.itemgetter("order"))

    logger.info("[%s] Performing escalations", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
        metrics["escalation_missed_alarm"] += 1
        return
    if alarm.status == "C":
        logger.info("[%s] Alarm is closed, skipping", alarm_id)
        metrics["escalation_already_closed"] += 1
        return
    if alarm.root:
        log("[%s] Alarm is not root cause, skipping", alarm_id)
        metrics["escalation_alarm_is_not_root"] += 1
        return
    #
    escalation = AlarmEscalation.get_by_id(escalation_id)
    if not escalation:
        log("Escalation %s is not found, skipping", escalation_id)
        metrics["escalation_not_found"] += 1
        return
    if alarm.managed_object.tt_system:
        sample = alarm.managed_object.tt_system.telemetry_sample
    else:
        sample = PARENT_SAMPLE
    # NOTE: `ctx` is rebound to the template context dict inside the loop;
    # the span object itself is not referenced again in this function.
    with Span(client="escalator", sample=sample) as ctx:
        alarm.set_escalation_context()
        # Evaluate escalation chain
        mo = alarm.managed_object
        # Bound up front so the post-loop checks cannot hit an unbound name
        # when no step reached TT creation.
        tt_id = None
        for a in escalation.escalations:
            if a.delay != escalation_delay:
                continue  # Try other type
            # Check administrative domain
            if a.administrative_domain and a.administrative_domain.id not in alarm.adm_path:
                continue
            # Check severity
            if a.min_severity and alarm.severity < a.min_severity:
                continue
            # Check selector
            if a.selector and not SelectorCache.is_in_selector(mo, a.selector):
                continue
            # Check time pattern
            if a.time_pattern and not a.time_pattern.match(alarm.timestamp):
                continue
            # Render escalation message
            if not a.template:
                log("No escalation template, skipping")
                continue
            # Check global limits
            # @todo: Move into escalator service
            # @todo: Process per-ttsystem limits
            ets = datetime.datetime.now() - datetime.timedelta(
                seconds=config.escalator.ets)
            ae = ActiveAlarm._get_collection().count_documents(
                {"escalation_ts": {"$gte": ets}})
            ae += ArchivedAlarm._get_collection().count_documents(
                {"escalation_ts": {"$gte": ets}})
            if ae >= config.escalator.tt_escalation_limit:
                logger.error(
                    "Escalation limit exceeded (%s/%s). Skipping",
                    ae,
                    config.escalator.tt_escalation_limit,
                )
                metrics["escalation_throttled"] += 1
                alarm.set_escalation_error(
                    "Escalation limit exceeded (%s/%s). Skipping" %
                    (ae, config.escalator.tt_escalation_limit))
                return
            # Check whether consequences has escalations
            cons_escalated = sorted(alarm.iter_escalated(),
                                    key=operator.attrgetter("timestamp"))
            affected_objects = sorted(alarm.iter_affected(),
                                      key=operator.attrgetter("name"))
            #
            segment = alarm.managed_object.segment
            if segment.is_redundant:
                uplinks = alarm.managed_object.data.uplinks
                lost_redundancy = len(uplinks) > 1
                affected_subscribers = summary_to_list(
                    segment.total_subscribers, SubscriberProfile)
                affected_services = summary_to_list(segment.total_services,
                                                    ServiceProfile)
            else:
                lost_redundancy = False
                affected_subscribers = []
                affected_services = []
            # Template rendering context
            ctx = {
                "alarm": alarm,
                "affected_objects": affected_objects,
                "cons_escalated": cons_escalated,
                "total_objects": summary_to_list(alarm.total_objects,
                                                 ManagedObjectProfile),
                "total_subscribers": summary_to_list(alarm.total_subscribers,
                                                     SubscriberProfile),
                "total_services": summary_to_list(alarm.total_services,
                                                  ServiceProfile),
                "tt": None,
                "lost_redundancy": lost_redundancy,
                "affected_subscribers": affected_subscribers,
                "affected_services": affected_services,
            }
            # Escalate to TT
            if a.create_tt and mo.can_escalate():
                tt_id = None
                if alarm.escalation_tt:
                    log("Already escalated with TT #%s", alarm.escalation_tt)
                else:
                    pre_reason = escalation.get_pre_reason(mo.tt_system)
                    active_maintenance = Maintenance.get_object_maintenance(mo)
                    if active_maintenance:
                        for m in active_maintenance:
                            log(
                                "Object is under maintenance: %s (%s-%s)",
                                m.subject,
                                m.start,
                                m.stop,
                            )
                        metrics["escalation_stop_on_maintenance"] += 1
                    elif pre_reason is not None:
                        subject = a.template.render_subject(**ctx)
                        body = a.template.render_body(**ctx)
                        logger.debug(
                            "[%s] Escalation message:\nSubject: %s\n%s",
                            alarm_id, subject, body)
                        log("Creating TT in system %s", mo.tt_system.name)
                        tts = mo.tt_system.get_system()
                        try:
                            try:
                                tt_id = tts.create_tt(
                                    queue=mo.tt_queue,
                                    obj=mo.tt_system_id,
                                    reason=pre_reason,
                                    subject=subject,
                                    body=body,
                                    login="******",
                                    timestamp=alarm.timestamp,
                                )
                            except TemporaryTTError as e:
                                metrics["escalation_tt_retry"] += 1
                                log(
                                    "Temporary error detected. Retry after %ss",
                                    RETRY_TIMEOUT)
                                mo.tt_system.register_failure()
                                # NOTE(review): presumably raises to
                                # reschedule the job - confirm
                                Job.retry_after(get_next_retry(), str(e))
                            ctx["tt"] = "%s:%s" % (mo.tt_system.name, tt_id)
                            alarm.escalate(
                                ctx["tt"],
                                close_tt=a.close_tt,
                                wait_tt=ctx["tt"] if a.wait_tt else None,
                            )
                            if tts.promote_group_tt and a.promote_group_tt:
                                # Create group TT
                                log("Promoting to group tt")
                                gtt = tts.create_group_tt(
                                    tt_id, alarm.timestamp)
                                # Append affected objects
                                for ao in alarm.iter_affected():
                                    if ao.can_escalate(True):
                                        if ao.tt_system == mo.tt_system:
                                            log(
                                                "Appending object %s to group tt %s",
                                                ao.name, gtt)
                                            try:
                                                tts.add_to_group_tt(
                                                    gtt, ao.tt_system_id)
                                            except TTError as e:
                                                alarm.set_escalation_error(
                                                    "[%s] %s" % (mo.tt_system.name, e))
                                        else:
                                            log(
                                                "Cannot append object %s to group tt %s: Belongs to other TT system",
                                                ao.name,
                                                gtt,
                                            )
                                    else:
                                        log(
                                            "Cannot append object %s to group tt %s: Escalations are disabled",
                                            ao.name,
                                            gtt,
                                        )
                            metrics["escalation_tt_create"] += 1
                        except TTError as e:
                            log("Failed to create TT: %s", e)
                            metrics["escalation_tt_fail"] += 1
                            alarm.log_message("Failed to escalate: %s" % e,
                                              to_save=True)
                            alarm.set_escalation_error("[%s] %s" %
                                                       (mo.tt_system.name, e))
                    else:
                        log("Cannot find pre reason")
                        metrics["escalation_tt_fail"] += 1
                if tt_id and cons_escalated:
                    # Notify consequences
                    for ca in cons_escalated:
                        c_tt_name, c_tt_id = ca.escalation_tt.split(":")
                        cts = TTSystem.get_by_name(c_tt_name)
                        if cts:
                            tts = cts.get_system()
                            try:
                                log("Appending comment to TT %s", tt_id)
                                tts.add_comment(c_tt_id,
                                                body="Covered by TT %s" % tt_id,
                                                login="******")
                                metrics["escalation_tt_comment"] += 1
                            except NotImplementedError:
                                log(
                                    "Cannot add comment to %s: Feature not implemented",
                                    ca.escalation_tt,
                                )
                                metrics["escalation_tt_comment_fail"] += 1
                            except TTError as e:
                                log("Failed to add comment to %s: %s",
                                    ca.escalation_tt, e)
                                metrics["escalation_tt_comment_fail"] += 1
                        else:
                            log(
                                "Failed to add comment to %s: Invalid TT system",
                                ca.escalation_tt)
                            metrics["escalation_tt_comment_fail"] += 1
            # Send notification
            if a.notification_group and mo.can_notify():
                subject = a.template.render_subject(**ctx)
                body = a.template.render_body(**ctx)
                logger.debug("[%s] Notification message:\nSubject: %s\n%s",
                             alarm_id, subject, body)
                log("Sending notification to group %s",
                    a.notification_group.name)
                a.notification_group.notify(subject, body)
                alarm.set_clear_notification(a.notification_group,
                                             a.clear_template)
                metrics["escalation_notify"] += 1
            #
            if a.stop_processing:
                logger.debug("Stopping processing")
                break
        # Re-read the alarm to detect a close that raced with escalation
        nalarm = get_alarm(alarm_id)
        if nalarm and nalarm.status == "C":
            nalarm.log_message(
                "Alarm has been closed during escalation. Try to deescalate")
            logger.info(
                "[%s] Alarm has been closed during escalation. Try to deescalate",
                alarm.id)
            metrics["escalation_closed_while_escalated"] += 1
            if tt_id and not nalarm.escalation_tt:
                nalarm.escalation_ts = datetime.datetime.now()
                nalarm.escalation_tt = "%s:%s" % (mo.tt_system.name, tt_id)
                nalarm.save()
            if not nalarm.escalation_close_ts and not nalarm.escalation_close_error:
                notify_close(
                    alarm_id=alarm_id,
                    tt_id=nalarm.escalation_tt,
                    subject="Closing",
                    body="Closing",
                    notification_group_id=alarm.clear_notification_group.id
                    if alarm.clear_notification_group else None,
                    close_tt=alarm.close_tt,
                )
        elif nalarm and nalarm.status == "A" and not nalarm.escalation_tt and tt_id:
            # Compare the status, not the alarm object itself: the object
            # never equals "A", so this diagnostic could not fire before.
            logger.error("[%s] Alarm without escalation TT: %s", alarm.id, tt_id)
        logger.info("[%s] Escalations loop end", alarm_id)
def notify_close(alarm_id, tt_id, subject, body, notification_group_id, close_tt=False):
    """Propagate an alarm close to its trouble ticket and notification group.

    :param alarm_id: Id of the closed alarm
    :param tt_id: Escalation TT as "<tt system name>:<tt id>"; falsy to skip
        the TT part
    :param subject: Subject for the TT close/comment and the notification
    :param body: Body for the TT close/comment and the notification
    :param notification_group_id: Notification group id; falsy to skip
    :param close_tt: Close the TT when true, otherwise only add a comment
    """

    def log(message, *args):
        msg = message % args
        logger.info("[%s] %s", alarm_id, msg)

    if tt_id:
        alarm = get_alarm(alarm_id)
        if alarm:
            # The alarm may be missing; every later use is guarded the same
            # way, so do not fail here before reaching the TT system.
            alarm.set_escalation_close_ctx()
        if (alarm and alarm.status == "C"
                and (alarm.escalation_close_ts or alarm.escalation_close_error)):
            log("Alarm is already deescalated")
            metrics["escalation_already_deescalated"] += 1
            return
        with Span(client="escalator", sample=PARENT_SAMPLE):
            c_tt_name, c_tt_id = tt_id.split(":")
            cts = TTSystem.get_by_name(c_tt_name)
            if cts:
                tts = cts.get_system()
                if close_tt:
                    # Close tt
                    try:
                        log("Closing TT %s", tt_id)
                        tts.close_tt(c_tt_id, subject=subject, body=body, login="******")
                        metrics["escalation_tt_close"] += 1
                        if alarm:
                            alarm.close_escalation()
                    except TemporaryTTError as e:
                        log("Temporary error detected while closing tt %s: %s", tt_id, e)
                        metrics["escalation_tt_close_retry"] += 1
                        # Record the failure before asking for a retry.
                        # NOTE(review): Job.retry_after presumably raises to
                        # reschedule the job, which would have skipped this
                        # bookkeeping when it came last - confirm.
                        cts.register_failure()
                        if alarm:
                            alarm.set_escalation_close_error(
                                "[%s] %s" % (alarm.managed_object.tt_system.name, e))
                        Job.retry_after(get_next_retry(), str(e))
                    except TTError as e:
                        log("Failed to close tt %s: %s", tt_id, e)
                        metrics["escalation_tt_close_fail"] += 1
                        if alarm:
                            alarm.set_escalation_close_error(
                                "[%s] %s" % (alarm.managed_object.tt_system.name, e))
                else:
                    # Append comment to tt
                    try:
                        log("Appending comment to TT %s", tt_id)
                        tts.add_comment(c_tt_id, subject=subject, body=body, login="******")
                        metrics["escalation_tt_comment"] += 1
                    except TTError as e:
                        log("Failed to add comment to %s: %s", tt_id, e)
                        metrics["escalation_tt_comment_fail"] += 1
            else:
                log("Failed to add comment to %s: Invalid TT system", tt_id)
                metrics["escalation_tt_comment_fail"] += 1
    if notification_group_id:
        notification_group = NotificationGroup.get_by_id(notification_group_id)
        if notification_group:
            log("Sending notification to group %s", notification_group.name)
            notification_group.notify(subject, body)
            metrics["escalation_notify"] += 1
        else:
            log("Invalid notification group %s", notification_group_id)
def api_post(self, request, id, msg):
    """Append `msg` to the alarm log on behalf of the requesting user.

    Returns the not-found response for an unknown alarm, otherwise ``True``.
    """
    alarm = get_alarm(id)
    if not alarm:
        # The response must be returned; without `return` execution fell
        # through to `alarm.log_message` on a missing alarm.
        return self.response_not_found()
    alarm.log_message(msg, source=request.user.username)
    return True
def api_clear(self, request, id):
    """Clear an active alarm on behalf of the requesting user.

    Returns the not-found response for an unknown alarm, otherwise ``True``
    (also when the alarm is not active and nothing had to be cleared).
    """
    alarm = get_alarm(id)
    if not alarm:
        # Unknown id: avoid AttributeError on `alarm.status`.
        return self.response_not_found()
    if alarm.status == "A":
        alarm.clear_alarm("Cleared by %s" % request.user)
    return True
def api_post(self, request, id, msg):
    """Append `msg`, prefixed with the username, to the alarm log.

    Returns the not-found response for an unknown alarm, otherwise ``True``.
    """
    alarm = get_alarm(id)
    if not alarm:
        # The response must be returned; without `return` execution fell
        # through to `alarm.log_message` on a missing alarm.
        return self.response_not_found()
    alarm.log_message("%s: %s" % (request.user.username, msg))
    return True
def api_alarm(self, request, id):
    """Build the detailed dict representation of alarm `id`.

    Returns the not-found response when the alarm cannot be resolved.
    The result starts from `instance_to_dict(alarm)` and is extended with
    class texts, managed object properties, the alarm log, related events,
    nested alarms, subscribers (active alarms only) and plugin data.
    """
    alarm = get_alarm(id)
    if not alarm:
        # The response must be returned; without `return` execution fell
        # through and failed on the missing alarm.
        return self.response_not_found()
    user = request.user
    d = self.instance_to_dict(alarm)
    d["body"] = alarm.body
    d["symptoms"] = alarm.alarm_class.symptoms
    d["probable_causes"] = alarm.alarm_class.probable_causes
    d["recommended_actions"] = alarm.alarm_class.recommended_actions
    d["vars"] = sorted(alarm.vars.items())
    d["status"] = alarm.status
    d["status__label"] = {"A": "Active", "C": "Cleared"}[alarm.status]
    # Managed object properties
    mo = alarm.managed_object
    d["managed_object_address"] = mo.address
    d["managed_object_profile"] = mo.profile.name
    d["managed_object_platform"] = mo.platform.name if mo.platform else ""
    d["managed_object_version"] = mo.version.version if mo.version else ""
    d["segment"] = mo.segment.name
    d["segment_id"] = str(mo.segment.id)
    d["segment_path"] = " | ".join(
        NetworkSegment.get_by_id(p).name
        for p in NetworkSegment.get_path(mo.segment))
    if mo.container:
        # Walk up the container chain; the topmost object (the one without
        # a container) is not included in the path
        cp = []
        c = mo.container.id
        while c:
            try:
                o = Object.objects.get(id=c)
                if o.container:
                    cp.insert(0, o.name)
                c = o.container.id if o.container else None
            except DoesNotExist:
                break
        d["container_path"] = " | ".join(cp)
        if not self.location(mo.container.id)[0]:
            d["address_path"] = None
        else:
            d["address_path"] = ", ".join(self.location(mo.container.id))
    d["tags"] = mo.tags
    # Log
    if alarm.log:
        d["log"] = [{
            "timestamp": self.to_json(entry.timestamp),
            "from_status": entry.from_status,
            "to_status": entry.to_status,
            "message": entry.message
        } for entry in alarm.log]
    # Events
    events = []
    for ec in ActiveEvent, ArchivedEvent:
        for e in ec.objects.filter(alarms=alarm.id):
            events += [{
                "id": str(e.id),
                "event_class": str(e.event_class.id),
                "event_class__label": e.event_class.name,
                "timestamp": self.to_json(e.timestamp),
                "status": e.status,
                "managed_object": e.managed_object.id,
                "managed_object__label": e.managed_object.name,
                "subject": e.subject
            }]
    if events:
        d["events"] = events
    # Alarms
    children = self.get_nested_alarms(alarm)
    if children:
        d["alarms"] = {"expanded": True, "children": children}
    # Subscribers
    if alarm.status == "A":
        d["subscribers"] = self.get_alarm_subscribers(alarm)
        d["is_subscribed"] = user in alarm.subscribers
    # Apply plugins
    plugins = []
    # Build a fresh list: `+=` on the class's own `plugins` list appended
    # the diagnostic plugin to the alarm class in place on every call.
    acp = list(alarm.alarm_class.plugins or []) + [self.diagnostic_plugin]
    for p in acp:
        if p.name in self.plugins:
            plugin = self.plugins[p.name]
            dd = plugin.get_data(alarm, p.config)
            # A "plugins" entry is accumulated separately; everything else
            # the plugin returned is merged into the result
            if "plugins" in dd:
                plugins += dd["plugins"]
                del dd["plugins"]
            d.update(dd)
    if plugins:
        d["plugins"] = plugins
    return d