def api_escalation_alarm(self, request, id):
    alarm = get_alarm(id)
    if alarm.status == "A":
        AlarmEscalation.watch_escalations(alarm)
        return {"status": True}
    else:
        return {"status": False, "error": "The alarm is not active at the moment"}
def check_close_consequence(alarm_id):
    logger.info("[%s] Checking close", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
        return
    if alarm.status == "C":
        logger.info("[%s] Alarm is closed. Check passed", alarm_id)
        return
    # Detach root
    logger.info("[%s] Alarm is active. Detaching root", alarm_id)
    alarm.root = None
    alarm.log_message("Detached from root: alarm did not recover", to_save=True)
    metrics["detached_root"] += 1
    # Trigger escalations
    AlarmEscalation.watch_escalations(alarm)
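check_close_consequence is only meaningful some time after the root alarm clears, so the consequence gets a grace period to recover on its own. A minimal sketch of deferring the check with the call_later scheduler used elsewhere in this module; the job path and grace period below are illustrative assumptions, not taken from the source:

def schedule_close_consequence_check(consequence_alarm_id, grace_seconds=300):
    # Hypothetical job path and delay; call_later forwards keyword
    # arguments to the job, so alarm_id reaches check_close_consequence
    call_later(
        "noc.custom.handlers.check_close_consequence",
        delay=grace_seconds,
        scheduler="scheduler",
        alarm_id=consequence_alarm_id,
    )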
def alarm_escalation(alarm, mo, ctx):
    now = datetime.datetime.now()
    for esc in AlarmEscalation.get_class_escalations(alarm["alarm_class"]):
        for e_item in esc.escalations:
            # Check administrative domain
            if (e_item.administrative_domain
                    and e_item.administrative_domain.id not in mo.object.administrative_domain.get_path()):
                continue
            # Check severity
            if e_item.min_severity and alarm["severity"] < e_item.min_severity:
                continue
            # Check selector
            if e_item.selector and not SelectorCache.is_in_selector(mo.object, e_item.selector):
                continue
            logger.info(
                "%s Watch for %s after %s seconds",
                alarm["alarm_class"], esc.name, e_item.delay
            )
            et = now + datetime.timedelta(seconds=e_item.delay)
            if et > now:
                delay = (et - now).total_seconds()
            else:
                delay = None
            if e_item.notification_group:
                a = ActiveAlarm.objects.filter(
                    managed_object=mo.object,
                    vars__path=alarm["vars"]["path"]
                ).first()
                if a:
                    logger.info("Alarm notification already sent, skipping")
                    break
                subject = e_item.template.render_subject(**ctx)
                body = e_item.template.render_body(**ctx)
                logger.debug("Notification message:\nSubject: %s\n%s", subject, body)
                call_later(
                    "noc.custom.handlers.thresholds.thresholdsnotification.threshold_escalation",
                    delay=delay,
                    scheduler="scheduler",
                    notification_group_id=e_item.notification_group.id,
                    subject=subject,
                    body=body
                )
                return {
                    "notification_group": e_item.notification_group,
                    "clear_template": e_item.clear_template,
                    "message": "Sending message to: %s" % e_item.notification_group.name
                }
            #
            if e_item.stop_processing:
                logger.debug("Stopping processing")
                break
def escalate(alarm_id, escalation_id, escalation_delay, *args, **kwargs):
    def log(message, *args):
        msg = message % args
        logger.info("[%s] %s", alarm_id, msg)
        alarm.log_message(msg, to_save=True)

    def summary_to_list(summary, model):
        r = []
        for k in summary:
            p = model.get_by_id(k.profile)
            if not p or getattr(p, "show_in_summary", True) is False:
                continue
            r += [{
                "profile": p.name,
                "summary": k.summary,
                "order": (getattr(p, "display_order", 100), -k.summary),
            }]
        return sorted(r, key=operator.itemgetter("order"))

    logger.info("[%s] Performing escalations", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
        metrics["escalation_missed_alarm"] += 1
        return
    if alarm.status == "C":
        logger.info("[%s] Alarm is closed, skipping", alarm_id)
        metrics["escalation_already_closed"] += 1
        return
    if alarm.root:
        log("Alarm is not root cause, skipping")
        metrics["escalation_alarm_is_not_root"] += 1
        return
    #
    escalation = AlarmEscalation.get_by_id(escalation_id)
    if not escalation:
        log("Escalation %s is not found, skipping", escalation_id)
        metrics["escalation_not_found"] += 1
        return
    if alarm.managed_object.tt_system:
        sample = alarm.managed_object.tt_system.telemetry_sample
    else:
        sample = PARENT_SAMPLE
    with Span(client="escalator", sample=sample) as ctx:
        alarm.set_escalation_context()
        # Evaluate escalation chain
        mo = alarm.managed_object
        tt_id = None  # Keep defined for the post-loop deescalation check
        for a in escalation.escalations:
            if a.delay != escalation_delay:
                continue  # Try other type
            # Check administrative domain
            if a.administrative_domain and a.administrative_domain.id not in alarm.adm_path:
                continue
            # Check severity
            if a.min_severity and alarm.severity < a.min_severity:
                continue
            # Check selector
            if a.selector and not SelectorCache.is_in_selector(mo, a.selector):
                continue
            # Check time pattern
            if a.time_pattern and not a.time_pattern.match(alarm.timestamp):
                continue
            # Render escalation message
            if not a.template:
                log("No escalation template, skipping")
                continue
            # Check global limits
            # @todo: Move into escalator service
            # @todo: Process per-ttsystem limits
            ets = datetime.datetime.now() - datetime.timedelta(seconds=config.escalator.ets)
            ae = ActiveAlarm._get_collection().count_documents({"escalation_ts": {"$gte": ets}})
            ae += ArchivedAlarm._get_collection().count_documents({"escalation_ts": {"$gte": ets}})
            if ae >= config.escalator.tt_escalation_limit:
                logger.error(
                    "Escalation limit exceeded (%s/%s). Skipping",
                    ae, config.escalator.tt_escalation_limit,
                )
                metrics["escalation_throttled"] += 1
                alarm.set_escalation_error(
                    "Escalation limit exceeded (%s/%s). Skipping"
                    % (ae, config.escalator.tt_escalation_limit)
                )
                return
            # Check whether consequences have escalations
            cons_escalated = sorted(alarm.iter_escalated(), key=operator.attrgetter("timestamp"))
            affected_objects = sorted(alarm.iter_affected(), key=operator.attrgetter("name"))
            #
            segment = alarm.managed_object.segment
            if segment.is_redundant:
                uplinks = alarm.managed_object.data.uplinks
                lost_redundancy = len(uplinks) > 1
                affected_subscribers = summary_to_list(segment.total_subscribers, SubscriberProfile)
                affected_services = summary_to_list(segment.total_services, ServiceProfile)
            else:
                lost_redundancy = False
                affected_subscribers = []
                affected_services = []
            #
            ctx = {
                "alarm": alarm,
                "affected_objects": affected_objects,
                "cons_escalated": cons_escalated,
                "total_objects": summary_to_list(alarm.total_objects, ManagedObjectProfile),
                "total_subscribers": summary_to_list(alarm.total_subscribers, SubscriberProfile),
                "total_services": summary_to_list(alarm.total_services, ServiceProfile),
                "tt": None,
                "lost_redundancy": lost_redundancy,
                "affected_subscribers": affected_subscribers,
                "affected_services": affected_services,
            }
            # Escalate to TT
            if a.create_tt and mo.can_escalate():
                tt_id = None
                if alarm.escalation_tt:
                    log("Already escalated with TT #%s", alarm.escalation_tt)
                else:
                    pre_reason = escalation.get_pre_reason(mo.tt_system)
                    active_maintenance = Maintenance.get_object_maintenance(mo)
                    if active_maintenance:
                        for m in active_maintenance:
                            log(
                                "Object is under maintenance: %s (%s-%s)",
                                m.subject, m.start, m.stop,
                            )
                        metrics["escalation_stop_on_maintenance"] += 1
                    elif pre_reason is not None:
                        subject = a.template.render_subject(**ctx)
                        body = a.template.render_body(**ctx)
                        logger.debug(
                            "[%s] Escalation message:\nSubject: %s\n%s",
                            alarm_id, subject, body
                        )
                        log("Creating TT in system %s", mo.tt_system.name)
                        tts = mo.tt_system.get_system()
                        try:
                            try:
                                tt_id = tts.create_tt(
                                    queue=mo.tt_queue,
                                    obj=mo.tt_system_id,
                                    reason=pre_reason,
                                    subject=subject,
                                    body=body,
                                    login="******",
                                    timestamp=alarm.timestamp,
                                )
                            except TemporaryTTError as e:
                                metrics["escalation_tt_retry"] += 1
                                log("Temporary error detected. Retry after %ss", RETRY_TIMEOUT)
                                mo.tt_system.register_failure()
                                Job.retry_after(get_next_retry(), str(e))
                            ctx["tt"] = "%s:%s" % (mo.tt_system.name, tt_id)
                            alarm.escalate(
                                ctx["tt"],
                                close_tt=a.close_tt,
                                wait_tt=ctx["tt"] if a.wait_tt else None,
                            )
                            if tts.promote_group_tt and a.promote_group_tt:
                                # Create group TT
                                log("Promoting to group tt")
                                gtt = tts.create_group_tt(tt_id, alarm.timestamp)
                                # Append affected objects
                                for ao in alarm.iter_affected():
                                    if ao.can_escalate(True):
                                        if ao.tt_system == mo.tt_system:
                                            log("Appending object %s to group tt %s", ao.name, gtt)
                                            try:
                                                tts.add_to_group_tt(gtt, ao.tt_system_id)
                                            except TTError as e:
                                                alarm.set_escalation_error(
                                                    "[%s] %s" % (mo.tt_system.name, e)
                                                )
                                        else:
                                            log(
                                                "Cannot append object %s to group tt %s: "
                                                "Belongs to other TT system",
                                                ao.name, gtt,
                                            )
                                    else:
                                        log(
                                            "Cannot append object %s to group tt %s: "
                                            "Escalations are disabled",
                                            ao.name, gtt,
                                        )
                            metrics["escalation_tt_create"] += 1
                        except TTError as e:
                            log("Failed to create TT: %s", e)
                            metrics["escalation_tt_fail"] += 1
                            alarm.log_message("Failed to escalate: %s" % e, to_save=True)
                            alarm.set_escalation_error("[%s] %s" % (mo.tt_system.name, e))
                    else:
                        log("Cannot find pre reason")
                        metrics["escalation_tt_fail"] += 1
                if tt_id and cons_escalated:
                    # Notify consequences
                    for ca in cons_escalated:
                        c_tt_name, c_tt_id = ca.escalation_tt.split(":")
                        cts = TTSystem.get_by_name(c_tt_name)
                        if cts:
                            tts = cts.get_system()
                            try:
                                log("Appending comment to TT %s", tt_id)
                                tts.add_comment(c_tt_id, body="Covered by TT %s" % tt_id, login="******")
                                metrics["escalation_tt_comment"] += 1
                            except NotImplementedError:
                                log(
                                    "Cannot add comment to %s: Feature not implemented",
                                    ca.escalation_tt,
                                )
                                metrics["escalation_tt_comment_fail"] += 1
                            except TTError as e:
                                log("Failed to add comment to %s: %s", ca.escalation_tt, e)
                                metrics["escalation_tt_comment_fail"] += 1
                        else:
                            log("Failed to add comment to %s: Invalid TT system", ca.escalation_tt)
                            metrics["escalation_tt_comment_fail"] += 1
            # Send notification
            if a.notification_group and mo.can_notify():
                subject = a.template.render_subject(**ctx)
                body = a.template.render_body(**ctx)
                logger.debug(
                    "[%s] Notification message:\nSubject: %s\n%s",
                    alarm_id, subject, body
                )
                log("Sending notification to group %s", a.notification_group.name)
                a.notification_group.notify(subject, body)
                alarm.set_clear_notification(a.notification_group, a.clear_template)
                metrics["escalation_notify"] += 1
            #
            if a.stop_processing:
                logger.debug("Stopping processing")
                break
        nalarm = get_alarm(alarm_id)
        if nalarm and nalarm.status == "C":
            nalarm.log_message("Alarm has been closed during escalation. Try to deescalate")
            logger.info(
                "[%s] Alarm has been closed during escalation. Try to deescalate",
                alarm.id
            )
            metrics["escalation_closed_while_escalated"] += 1
            if tt_id and not nalarm.escalation_tt:
                nalarm.escalation_ts = datetime.datetime.now()
                nalarm.escalation_tt = "%s:%s" % (mo.tt_system.name, tt_id)
                nalarm.save()
            if not nalarm.escalation_close_ts and not nalarm.escalation_close_error:
                notify_close(
                    alarm_id=alarm_id,
                    tt_id=nalarm.escalation_tt,
                    subject="Closing",
                    body="Closing",
                    notification_group_id=alarm.clear_notification_group.id
                    if alarm.clear_notification_group else None,
                    close_tt=alarm.close_tt,
                )
        elif nalarm and nalarm.status == "A" and not nalarm.escalation_tt and tt_id:
            logger.error("[%s] Alarm without escalation TT: %s", alarm.id, tt_id)
        logger.info("[%s] Escalations loop end", alarm_id)
def run_alarm(self, alarm):
    AlarmEscalation.watch_escalations(alarm)
def raise_alarm(self, r, e):
    managed_object = self.eval_expression(r.managed_object, event=e)
    if not managed_object:
        self.logger.info("Empty managed object, ignoring")
        return
    # @todo: Make configurable
    if not managed_object.is_managed:
        self.logger.info("Managed object is not managed. Do not raise alarm")
        return
    if e.managed_object.id != managed_object.id:
        metrics["alarm_change_mo"] += 1
        self.logger.info("Changing managed object to %s", managed_object.name)
    discriminator, vars = r.get_vars(e)
    if r.unique:
        assert discriminator is not None
        a = ActiveAlarm.objects.filter(
            managed_object=managed_object.id,
            discriminator=discriminator
        ).first()
        if not a:
            # Try to reopen alarm
            a = ArchivedAlarm.objects.filter(
                managed_object=managed_object.id,
                discriminator=discriminator,
                control_time__gte=e.timestamp
            ).first()
            if a:
                # Reopen alarm
                self.logger.info(
                    "[%s|%s|%s] %s reopens alarm %s(%s)",
                    e.id, managed_object.name, managed_object.address,
                    e.event_class.name, a.alarm_class.name, a.id
                )
                a = a.reopen("Reopened by disposition rule '%s'" % r.u_name)
                metrics["alarm_reopen"] += 1
        if a:
            # Active alarm found, refresh
            self.logger.info(
                "[%s|%s|%s] Contributing event %s to active alarm %s(%s)",
                e.id, managed_object.name, managed_object.address,
                e.event_class.name, a.alarm_class.name, a.id
            )
            # Contribute event to alarm
            e.contribute_to_alarm(a)
            if e.timestamp < a.timestamp:
                # Set to earlier date
                a.timestamp = e.timestamp
                a.save()
            elif e.timestamp > a.last_update:
                # Refresh last update
                a.last_update = e.timestamp
                a.save()
            metrics["alarm_contribute"] += 1
            return
    # Calculate alarm coverage
    summary = ServiceSummary.get_object_summary(managed_object)
    summary["object"] = {managed_object.object_profile.id: 1}
    #
    severity = max(ServiceSummary.get_severity(summary), 1)
    self.logger.info(
        "[%s|%s|%s] %s: Calculated alarm severity is: %s",
        e.id, managed_object.name, managed_object.address,
        r.u_name, severity
    )
    # Create new alarm
    a = ActiveAlarm(
        timestamp=e.timestamp,
        last_update=e.timestamp,
        managed_object=managed_object.id,
        alarm_class=r.alarm_class,
        severity=severity,
        vars=vars,
        discriminator=discriminator,
        direct_services=SummaryItem.dict_to_items(summary["service"]),
        direct_subscribers=SummaryItem.dict_to_items(summary["subscriber"]),
        total_objects=ObjectSummaryItem.dict_to_items(summary["object"]),
        total_services=SummaryItem.dict_to_items(summary["service"]),
        total_subscribers=SummaryItem.dict_to_items(summary["subscriber"]),
        log=[
            AlarmLog(
                timestamp=datetime.datetime.now(),
                from_status="A",
                to_status="A",
                message="Alarm risen from event %s(%s) by rule '%s'" % (
                    str(e.id), str(e.event_class.name), r.u_name
                )
            )
        ],
        opening_event=e.id
    )
    a.save()
    e.contribute_to_alarm(a)
    self.logger.info(
        "[%s|%s|%s] %s raises alarm %s(%s): %r",
        e.id, managed_object.name, managed_object.address,
        e.event_class.name, a.alarm_class.name, a.id, a.vars
    )
    metrics["alarm_raise"] += 1
    self.correlate(r, a)
    # Notify about new alarm
    if not a.root:
        a.managed_object.event(
            a.managed_object.EV_ALARM_RISEN,
            {
                "alarm": a,
                "subject": a.subject,
                "body": a.body,
                "symptoms": a.alarm_class.symptoms,
                "recommended_actions": a.alarm_class.recommended_actions,
                "probable_causes": a.alarm_class.probable_causes
            },
            delay=a.alarm_class.get_notification_delay()
        )
    # Gather diagnostics when necessary
    AlarmDiagnosticConfig.on_raise(a)
    # Watch for escalations, when necessary
    if config.correlator.auto_escalation and not a.root:
        AlarmEscalation.watch_escalations(a)
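For unique rules, raise_alarm deduplicates on the (managed_object, discriminator) pair: a second matching event contributes to the existing alarm and only widens its time window instead of raising a new one. A self-contained sketch of that rule with stub objects (not the real ActiveAlarm model):

class StubAlarm(object):
    # Stand-in carrying only the two timestamps the dedup rule touches
    def __init__(self, ts):
        self.timestamp = ts    # when the alarm condition first appeared
        self.last_update = ts  # when it was last reported

def contribute_event(alarm, event_ts):
    # Mirrors the branch in raise_alarm: never raise a duplicate alarm,
    # just extend the existing alarm's time window
    if event_ts < alarm.timestamp:
        alarm.timestamp = event_ts
    elif event_ts > alarm.last_update:
        alarm.last_update = event_ts
    return alarm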