def watch_escalations(cls, alarm):
    """
    Schedule an escalation job for every escalation step applicable to the alarm

    A step is skipped when its administrative domain is not in the alarm's
    ``adm_path``, when the alarm severity is below the step's ``min_severity``
    or when the managed object is not in the step's selector.

    :param alarm: Alarm to watch (reads alarm_class, adm_path, severity,
        managed_object, timestamp and id)
    :return: None
    """
    current_time = datetime.datetime.now()
    for chain in cls.get_class_escalations(alarm.alarm_class):
        for step in chain.escalations:
            # Administrative domain filter
            if (
                step.administrative_domain
                and step.administrative_domain.id not in alarm.adm_path
            ):
                continue
            # Severity filter
            if step.min_severity and alarm.severity < step.min_severity:
                continue
            # Selector filter
            if step.selector and not SelectorCache.is_in_selector(
                alarm.managed_object, step.selector
            ):
                continue
            logger.debug(
                "[%s] Watch for %s after %s seconds", alarm.id, chain.name, step.delay
            )
            # Delay is counted from the alarm timestamp;
            # a moment that is already in the past means "no delay"
            fire_at = alarm.timestamp + datetime.timedelta(seconds=step.delay)
            delay = (fire_at - current_time).total_seconds() if fire_at > current_time else None
            call_later(
                "noc.services.escalator.escalation.escalate",
                scheduler="escalator",
                pool=alarm.managed_object.escalator_shard,
                delay=delay,
                max_runs=chain.max_escalation_retries,
                alarm_id=alarm.id,
                escalation_id=chain.id,
                escalation_delay=step.delay,
            )
def alarm_escalation(alarm, mo, ctx):
    """
    Find the first applicable escalation step with a notification group
    and schedule a threshold notification for it

    :param alarm: Mapping describing the alarm; the keys ``alarm_class``,
        ``severity`` and ``vars`` (with a nested ``path``) are read
    :param mo: Wrapper exposing the managed object as ``mo.object``
    :param ctx: Keyword arguments passed to the template renderers
    :return: dict with ``notification_group``, ``clear_template`` and
        ``message`` keys when a notification has been scheduled,
        None otherwise
    """
    started = datetime.datetime.now()
    alarm_class = alarm['alarm_class']
    for chain in AlarmEscalation.get_class_escalations(alarm_class):
        for step in chain.escalations:
            # Administrative domain filter
            if (
                step.administrative_domain
                and step.administrative_domain.id
                not in mo.object.administrative_domain.get_path()
            ):
                continue
            # Severity filter
            if step.min_severity and alarm['severity'] < step.min_severity:
                continue
            # Selector filter
            if step.selector and not SelectorCache.is_in_selector(mo.object, step.selector):
                continue
            logger.info(
                "%s Watch for %s after %s seconds", alarm['alarm_class'], chain.name, step.delay
            )
            # Delay is counted from the current moment
            fire_at = started + datetime.timedelta(seconds=step.delay)
            delay = (fire_at - started).total_seconds() if fire_at > started else None
            if step.notification_group:
                active = ActiveAlarm.objects.filter(
                    managed_object=mo.object, vars__path=alarm['vars']["path"]
                ).first()
                if active:
                    # An active alarm already exists for this object and path:
                    # leave this escalation chain
                    logger.info("Alarm already sending")
                    break
                subject = step.template.render_subject(**ctx)
                body = step.template.render_body(**ctx)
                logger.debug("Notification message:\nSubject: %s\n%s", subject, body)
                call_later(
                    "noc.custom.handlers.thresholds.thresholdsnotification.threshold_escalation",
                    delay=delay,
                    scheduler="scheduler",
                    notification_group_id=step.notification_group.id,
                    subject=subject,
                    body=body,
                )
                return {
                    "notification_group": step.notification_group,
                    "clear_template": step.clear_template,
                    "message": "Sending message to : %s" % step.notification_group.name,
                }
            #
            if step.stop_processing:
                logger.debug("Stopping processing")
                break
def watch_escalations(cls, alarm, timestamp_policy="a"):
    """
    Schedule an escalation job for every escalation step applicable to the alarm

    :param alarm: Alarm to watch (reads alarm_class, adm_path, severity,
        managed_object, timestamp and id)
    :param timestamp_policy: Passed through to the escalation job.
        The value "c" additionally shifts the job: the delay is at least
        120 seconds, plus another 120 seconds when the step is still
        in the future
    :return: None
    """
    current_time = datetime.datetime.now()
    for chain in cls.get_class_escalations(alarm.alarm_class):
        for step in chain.escalations:
            # Administrative domain filter
            if (
                step.administrative_domain
                and step.administrative_domain.id not in alarm.adm_path
            ):
                continue
            # Severity filter
            if step.min_severity and alarm.severity < step.min_severity:
                continue
            # Selector filter
            if step.selector and not SelectorCache.is_in_selector(
                alarm.managed_object, step.selector
            ):
                continue
            logger.debug(
                "[%s] Watch for %s after %s seconds", alarm.id, chain.name, step.delay
            )
            fire_at = alarm.timestamp + datetime.timedelta(seconds=step.delay)
            in_future = fire_at > current_time
            if timestamp_policy == "c":
                # If escalation with current timestamp - shift consequence after main escalation
                if in_future:
                    delay = max((fire_at - current_time).total_seconds(), 120) + 120
                else:
                    delay = 120
                logger.info(
                    "[%s] Watch escalation with create new timestamp policy, after %s seconds",
                    alarm.id,
                    delay,
                )
            elif in_future:
                delay = (fire_at - current_time).total_seconds()
            else:
                delay = None
            call_later(
                "noc.services.escalator.escalation.escalate",
                scheduler="escalator",
                pool=alarm.managed_object.escalator_shard,
                delay=delay,
                max_runs=chain.max_escalation_retries,
                alarm_id=alarm.id,
                escalation_id=chain.id,
                escalation_delay=step.delay,
                timestamp_policy=timestamp_policy,
            )
            if step.stop_processing:
                # Remaining steps of this chain are not scheduled
                break
def on_clear(cls, alarm):
    """
    Collect the on-clear diagnostics configured for the alarm class,
    submit one job per distinct delay and clear the alarm diagnostics

    :param alarm: Alarm being cleared
    :return: None
    """
    jobs_by_delay = defaultdict(list)
    for diag in cls.get_class_diagnostics(alarm.alarm_class):
        # Selector filter
        if diag.selector and not SelectorCache.is_in_selector(
            alarm.managed_object, diag.selector
        ):
            continue
        # Root-only diagnostics are skipped for alarms having a root
        if diag.only_root and alarm.root:
            continue
        if not diag.enable_on_clear:
            continue
        if diag.on_clear_script:
            jobs_by_delay[diag.on_clear_delay].append(
                {"script": diag.on_clear_script, "header": diag.on_clear_header}
            )
        if diag.on_clear_action:
            jobs_by_delay[diag.on_clear_delay].append(
                {"action": diag.on_clear_action.id, "header": diag.on_clear_header}
            )
        if diag.on_clear_handler:
            jobs_by_delay[diag.on_clear_delay].append(
                {"handler": diag.on_clear_handler, "header": diag.on_clear_header}
            )
    # Submit on_clear job
    for delay, jobs in jobs_by_delay.items():
        call_later(
            "noc.fm.models.alarmdiagnosticconfig.on_clear",
            scheduler="correlator",
            pool=alarm.managed_object.pool.name,
            delay=delay,
            alarm=alarm.id,
            cfg=jobs,
        )
    AlarmDiagnostic.clear_diagnostics(alarm)
def on_raise(cls, alarm):
    """
    Collect the on-raise and periodic diagnostics configured for the
    alarm class and submit one job per distinct delay / interval

    :param alarm: Alarm being raised
    :return: None
    """
    raise_jobs = defaultdict(list)
    periodic_jobs = defaultdict(list)
    for diag in cls.get_class_diagnostics(alarm.alarm_class):
        # Selector filter
        if diag.selector and not SelectorCache.is_in_selector(
            alarm.managed_object, diag.selector
        ):
            continue
        # Root-only diagnostics are skipped for alarms having a root
        if diag.only_root and alarm.root:
            continue
        if diag.enable_on_raise:
            if diag.on_raise_script:
                raise_jobs[diag.on_raise_delay].append(
                    {"script": diag.on_raise_script, "header": diag.on_raise_header}
                )
            if diag.on_raise_action:
                raise_jobs[diag.on_raise_delay].append(
                    {"action": diag.on_raise_action.name, "header": diag.on_raise_header}
                )
            if diag.on_raise_handler:
                raise_jobs[diag.on_raise_delay].append(
                    {"handler": diag.on_raise_handler, "header": diag.on_raise_header}
                )
        if diag.enable_periodic:
            if diag.periodic_script:
                periodic_jobs[diag.periodic_interval].append(
                    {"script": diag.periodic_script, "header": diag.periodic_header}
                )
            if diag.periodic_action:
                periodic_jobs[diag.periodic_interval].append(
                    {"action": diag.periodic_action.name, "header": diag.periodic_header}
                )
            if diag.periodic_handler:
                periodic_jobs[diag.periodic_interval].append(
                    {"handler": diag.periodic_handler, "header": diag.periodic_header}
                )
    # Submit on_raise job
    for delay, jobs in raise_jobs.items():
        call_later(
            "noc.fm.models.alarmdiagnosticconfig.on_raise",
            scheduler="correlator",
            pool=alarm.managed_object.pool.name,
            delay=delay,
            alarm=alarm.id,
            cfg=jobs,
        )
    # Submit periodic job
    for interval, jobs in periodic_jobs.items():
        call_later(
            "noc.fm.models.alarmdiagnosticconfig.periodic",
            scheduler="correlator",
            max_runs=PERIODIC_JOB_MAX_RUNS,
            pool=alarm.managed_object.pool.name,
            delay=interval,
            alarm=alarm.id,
            cfg={"cfg": jobs, "delay": interval},
        )
def escalate(alarm_id, escalation_id, escalation_delay, *args, **kwargs):
    """
    Perform the escalation steps of one escalation chain for an alarm

    Only the chain items whose ``delay`` equals ``escalation_delay`` are
    processed. For every applicable item a trouble ticket may be created
    (optionally promoted to a group TT), already escalated consequences
    are commented and a notification may be sent. Finally the alarm is
    reloaded: when it has been closed in the meantime, the close
    notification is issued.

    :param alarm_id: Id of the alarm to escalate
    :param escalation_id: Id of the AlarmEscalation chain
    :param escalation_delay: Delay of the chain items to process
    :return: None
    """

    def log(message, *args):
        # Write both to the service log and to the alarm's own log
        msg = message % args
        logger.info("[%s] %s", alarm_id, msg)
        alarm.log_message(msg, to_save=True)

    def summary_to_list(summary, model):
        # Convert summary items to template-friendly dicts,
        # ordered by profile display order, then by descending summary
        r = []
        for k in summary:
            p = model.get_by_id(k.profile)
            if not p or getattr(p, "show_in_summary", True) is False:
                continue
            r.append(
                {
                    "profile": p.name,
                    "summary": k.summary,
                    "order": (getattr(p, "display_order", 100), -k.summary),
                }
            )
        return sorted(r, key=operator.itemgetter("order"))

    logger.info("[%s] Performing escalations", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
        metrics["escalation_missed_alarm"] += 1
        return
    if alarm.status == "C":
        logger.info("[%s] Alarm is closed, skipping", alarm_id)
        metrics["escalation_already_closed"] += 1
        return
    if alarm.root:
        log("[%s] Alarm is not root cause, skipping", alarm_id)
        metrics["escalation_alarm_is_not_root"] += 1
        return
    #
    escalation = AlarmEscalation.get_by_id(escalation_id)
    if not escalation:
        log("Escalation %s is not found, skipping", escalation_id)
        metrics["escalation_not_found"] += 1
        return
    if alarm.managed_object.tt_system:
        sample = alarm.managed_object.tt_system.telemetry_sample
    else:
        sample = PARENT_SAMPLE
    with Span(client="escalator", sample=sample):
        alarm.set_escalation_context()
        # Evaluate escalation chain
        mo = alarm.managed_object
        # tt_id is read after the loop, so it must be bound even when
        # no item reaches the TT creation branch
        tt_id = None
        for a in escalation.escalations:
            if a.delay != escalation_delay:
                continue  # Try other type
            # Check administrative domain
            if a.administrative_domain and a.administrative_domain.id not in alarm.adm_path:
                continue
            # Check severity
            if a.min_severity and alarm.severity < a.min_severity:
                continue
            # Check selector
            if a.selector and not SelectorCache.is_in_selector(mo, a.selector):
                continue
            # Check time pattern
            if a.time_pattern and not a.time_pattern.match(alarm.timestamp):
                continue
            # Render escalation message
            if not a.template:
                log("No escalation template, skipping")
                continue
            # Check global limits
            # @todo: Move into escalator service
            # @todo: Process per-ttsystem limits
            ets = datetime.datetime.now() - datetime.timedelta(seconds=config.escalator.ets)
            ae = ActiveAlarm._get_collection().count_documents({"escalation_ts": {"$gte": ets}})
            ae += ArchivedAlarm._get_collection().count_documents(
                {"escalation_ts": {"$gte": ets}}
            )
            if ae >= config.escalator.tt_escalation_limit:
                logger.error(
                    "Escalation limit exceeded (%s/%s). Skipping",
                    ae,
                    config.escalator.tt_escalation_limit,
                )
                metrics["escalation_throttled"] += 1
                alarm.set_escalation_error(
                    "Escalation limit exceeded (%s/%s). Skipping"
                    % (ae, config.escalator.tt_escalation_limit)
                )
                return
            # Check whether consequences has escalations
            cons_escalated = sorted(alarm.iter_escalated(), key=operator.attrgetter("timestamp"))
            affected_objects = sorted(alarm.iter_affected(), key=operator.attrgetter("name"))
            #
            segment = alarm.managed_object.segment
            if segment.is_redundant:
                uplinks = alarm.managed_object.data.uplinks
                lost_redundancy = len(uplinks) > 1
                affected_subscribers = summary_to_list(
                    segment.total_subscribers, SubscriberProfile
                )
                affected_services = summary_to_list(segment.total_services, ServiceProfile)
            else:
                lost_redundancy = False
                affected_subscribers = []
                affected_services = []
            # Template rendering context
            ctx = {
                "alarm": alarm,
                "affected_objects": affected_objects,
                "cons_escalated": cons_escalated,
                "total_objects": summary_to_list(alarm.total_objects, ManagedObjectProfile),
                "total_subscribers": summary_to_list(alarm.total_subscribers, SubscriberProfile),
                "total_services": summary_to_list(alarm.total_services, ServiceProfile),
                "tt": None,
                "lost_redundancy": lost_redundancy,
                "affected_subscribers": affected_subscribers,
                "affected_services": affected_services,
            }
            # Escalate to TT
            if a.create_tt and mo.can_escalate():
                tt_id = None
                if alarm.escalation_tt:
                    log("Already escalated with TT #%s", alarm.escalation_tt)
                else:
                    pre_reason = escalation.get_pre_reason(mo.tt_system)
                    active_maintenance = Maintenance.get_object_maintenance(mo)
                    if active_maintenance:
                        for m in active_maintenance:
                            log(
                                "Object is under maintenance: %s (%s-%s)",
                                m.subject,
                                m.start,
                                m.stop,
                            )
                        metrics["escalation_stop_on_maintenance"] += 1
                    elif pre_reason is not None:
                        subject = a.template.render_subject(**ctx)
                        body = a.template.render_body(**ctx)
                        logger.debug(
                            "[%s] Escalation message:\nSubject: %s\n%s", alarm_id, subject, body
                        )
                        log("Creating TT in system %s", mo.tt_system.name)
                        tts = mo.tt_system.get_system()
                        try:
                            try:
                                tt_id = tts.create_tt(
                                    queue=mo.tt_queue,
                                    obj=mo.tt_system_id,
                                    reason=pre_reason,
                                    subject=subject,
                                    body=body,
                                    login="******",
                                    timestamp=alarm.timestamp,
                                )
                            except TemporaryTTError as e:
                                metrics["escalation_tt_retry"] += 1
                                log("Temporary error detected. Retry after %ss", RETRY_TIMEOUT)
                                mo.tt_system.register_failure()
                                # NOTE(review): presumably raises to reschedule the job,
                                # so the code below is not reached - confirm
                                Job.retry_after(get_next_retry(), str(e))
                            ctx["tt"] = "%s:%s" % (mo.tt_system.name, tt_id)
                            alarm.escalate(
                                ctx["tt"],
                                close_tt=a.close_tt,
                                wait_tt=ctx["tt"] if a.wait_tt else None,
                            )
                            if tts.promote_group_tt and a.promote_group_tt:
                                # Create group TT
                                log("Promoting to group tt")
                                gtt = tts.create_group_tt(tt_id, alarm.timestamp)
                                # Append affected objects
                                for ao in alarm.iter_affected():
                                    if ao.can_escalate(True):
                                        if ao.tt_system == mo.tt_system:
                                            log("Appending object %s to group tt %s", ao.name, gtt)
                                            try:
                                                tts.add_to_group_tt(gtt, ao.tt_system_id)
                                            except TTError as e:
                                                alarm.set_escalation_error(
                                                    "[%s] %s" % (mo.tt_system.name, e)
                                                )
                                        else:
                                            log(
                                                "Cannot append object %s to group tt %s: Belongs to other TT system",
                                                ao.name,
                                                gtt,
                                            )
                                    else:
                                        log(
                                            "Cannot append object %s to group tt %s: Escalations are disabled",
                                            ao.name,
                                            gtt,
                                        )
                            metrics["escalation_tt_create"] += 1
                        except TTError as e:
                            log("Failed to create TT: %s", e)
                            metrics["escalation_tt_fail"] += 1
                            alarm.log_message("Failed to escalate: %s" % e, to_save=True)
                            alarm.set_escalation_error("[%s] %s" % (mo.tt_system.name, e))
                    else:
                        log("Cannot find pre reason")
                        metrics["escalation_tt_fail"] += 1
                    if tt_id and cons_escalated:
                        # Notify consequences
                        for ca in cons_escalated:
                            # TT reference is "<system name>:<tt id>";
                            # split on the first colon only, the id itself may contain colons
                            c_tt_name, c_tt_id = ca.escalation_tt.split(":", 1)
                            cts = TTSystem.get_by_name(c_tt_name)
                            if cts:
                                tts = cts.get_system()
                                try:
                                    log("Appending comment to TT %s", tt_id)
                                    tts.add_comment(
                                        c_tt_id, body="Covered by TT %s" % tt_id, login="******"
                                    )
                                    metrics["escalation_tt_comment"] += 1
                                except NotImplementedError:
                                    log(
                                        "Cannot add comment to %s: Feature not implemented",
                                        ca.escalation_tt,
                                    )
                                    metrics["escalation_tt_comment_fail"] += 1
                                except TTError as e:
                                    log("Failed to add comment to %s: %s", ca.escalation_tt, e)
                                    metrics["escalation_tt_comment_fail"] += 1
                            else:
                                log(
                                    "Failed to add comment to %s: Invalid TT system",
                                    ca.escalation_tt,
                                )
                                metrics["escalation_tt_comment_fail"] += 1
            # Send notification
            if a.notification_group and mo.can_notify():
                subject = a.template.render_subject(**ctx)
                body = a.template.render_body(**ctx)
                logger.debug("[%s] Notification message:\nSubject: %s\n%s", alarm_id, subject, body)
                log("Sending notification to group %s", a.notification_group.name)
                a.notification_group.notify(subject, body)
                alarm.set_clear_notification(a.notification_group, a.clear_template)
                metrics["escalation_notify"] += 1
            #
            if a.stop_processing:
                logger.debug("Stopping processing")
                break
        # Reload the alarm: it may have been closed while escalation was running
        nalarm = get_alarm(alarm_id)
        if nalarm and nalarm.status == "C":
            nalarm.log_message("Alarm has been closed during escalation. Try to deescalate")
            logger.info("[%s] Alarm has been closed during escalation. Try to deescalate", alarm.id)
            metrics["escalation_closed_while_escalated"] += 1
            if tt_id and not nalarm.escalation_tt:
                nalarm.escalation_ts = datetime.datetime.now()
                nalarm.escalation_tt = "%s:%s" % (mo.tt_system.name, tt_id)
                nalarm.save()
            if not nalarm.escalation_close_ts and not nalarm.escalation_close_error:
                notify_close(
                    alarm_id=alarm_id,
                    tt_id=nalarm.escalation_tt,
                    subject="Closing",
                    body="Closing",
                    notification_group_id=alarm.clear_notification_group.id
                    if alarm.clear_notification_group
                    else None,
                    close_tt=alarm.close_tt,
                )
        elif nalarm and nalarm.status == "A" and not nalarm.escalation_tt and tt_id:
            # TT has been created, but the active alarm does not refer to it
            logger.error("[%s] Alarm without escalation TT: %s", alarm.id, tt_id)
        logger.info("[%s] Escalations loop end", alarm_id)
def check_alarm(self, alarm):
    """
    Print a step-by-step trace of how the alarm would be escalated

    Every escalation chain bound to the alarm class is walked and the
    outcome of each check (administrative domain, severity, selector,
    time pattern, template) is printed via ``self.print``. Messages are
    rendered and printed; no TT is created and no notification is sent.

    :param alarm: Alarm to check
    :return: None
    """

    def summary_to_list(summary, model):
        # Convert summary items to template-friendly dicts, largest summary first
        r = []
        for k in summary:
            p = model.get_by_id(k.profile)
            if not p or getattr(p, "show_in_summary", True) is False:
                continue
            r.append({"profile": p.name, "summary": k.summary})
        return sorted(r, key=lambda x: -x["summary"])

    def iter_consequences(alarm):
        """
        Generator yielding all consequences alarm
        """
        for ac in [ArchivedAlarm, ActiveAlarm]:
            for a in ac.objects.filter(root=alarm.id):
                yield a
                yield from a.iter_consequences()

    def iter_affected(alarm):
        """
        Generator yielding all affected managed objects
        """
        seen = {alarm.managed_object}
        yield alarm.managed_object
        for a in iter_consequences(alarm):
            if a.managed_object not in seen:
                seen.add(a.managed_object)
                yield a.managed_object

    def iter_escalated(alarm):
        """
        Generator yielding all escalated consequences
        """
        for a in iter_consequences(alarm):
            if a.escalation_tt:
                yield a

    mo = alarm.managed_object
    self.print("-" * 72)
    self.print("Alarm Id : %s Time: %s" % (alarm.id, alarm.timestamp))
    self.print("Class : %s" % alarm.alarm_class.name)
    self.print("Object : %s Platform: %s IP: %s" % (mo.name, mo.platform, mo.address))
    # Build administrative domain path, topmost domain first
    c = mo.administrative_domain
    adm_domains = [c]
    while c.parent:
        c = c.parent
        adm_domains.insert(0, c)
    self.print(
        "Adm. Dom.: %s (%s)"
        % (" | ".join(c.name for c in adm_domains), " | ".join(str(c.id) for c in adm_domains))
    )
    escalations = list(
        AlarmEscalation.objects.filter(alarm_classes__alarm_class=alarm.alarm_class.id)
    )
    if not escalations:
        self.print("@ No matched escalations")
        return
    for esc in escalations:
        self.print("[Chain: %s]" % esc.name)
        if alarm.root:
            self.print(" @ Not a root cause (Root Id: %s)" % alarm.root)
            continue
        for e in esc.escalations:
            self.print(" [After %ss]" % e.delay)
            # Check administrative domain
            if e.administrative_domain and e.administrative_domain.id not in alarm.adm_path:
                self.print(
                    " @ Administrative domain mismatch (%s not in %s)"
                    % (e.administrative_domain.id, alarm.adm_path)
                )
                continue
            # Check severity
            if e.min_severity and alarm.severity < e.min_severity:
                self.print(" @ Severity mismatch: %s < %s" % (alarm.severity, e.min_severity))
                continue
            # Check selector
            if e.selector and not SelectorCache.is_in_selector(mo, e.selector):
                self.print(" @ Selector mismatch (%s required)" % (e.selector.name))
                continue
            # Check time pattern
            if e.time_pattern and not e.time_pattern.match(alarm.timestamp):
                self.print(" @ Time pattern mismatch (%s required)" % (e.time_pattern.name))
                continue
            # Render escalation message
            if not e.template:
                self.print(" @ No escalation template")
                continue
            # Check whether consequences has escalations
            cons_escalated = sorted(iter_escalated(alarm), key=operator.attrgetter("timestamp"))
            affected_objects = sorted(iter_affected(alarm), key=operator.attrgetter("name"))
            # Template rendering context
            ctx = {
                "alarm": alarm,
                "affected_objects": affected_objects,
                "cons_escalated": cons_escalated,
                "total_objects": summary_to_list(alarm.total_objects, ManagedObjectProfile),
                "total_subscribers": summary_to_list(alarm.total_subscribers, SubscriberProfile),
                "total_services": summary_to_list(alarm.total_services, ServiceProfile),
                "tt": None,
            }
            if e.create_tt:
                self.print(" Creating TT")
                tt_system = mo.tt_system
                if not tt_system:
                    self.print(" @ No TT System. Cannot escalate")
                elif not mo.can_escalate():
                    self.print(" @ Escalation disabled by policy")
                else:
                    tts = tt_system.get_system()
                    self.print(
                        " TT System: %s Mapped Id: %s" % (tt_system.name, mo.tt_system_id)
                    )
                    subject = e.template.render_subject(**ctx)
                    body = e.template.render_body(**ctx)
                    self.print(" @ Create network TT")
                    self.print(" | Subject: %s" % subject)
                    self.print(" |")
                    self.print(" | %s" % body.replace("\n", "\n | "))
                    # Placeholder id: the TT is only reported, never created
                    tt_id = "<NETWORK TT>"
                    ctx["tt"] = "%s:%s" % (tt_system.name, tt_id)
                    # alarm.escalate(ctx["tt"], close_tt=e.close_tt)
                    if tts.promote_group_tt:
                        self.print(" Promoting group TT")
                        self.print(" @ Create Group TT")
                        # Add objects
                        for o in alarm.iter_affected():
                            if o.can_escalate(depended=True):
                                if o.tt_system == mo.tt_system:
                                    self.print(
                                        " @ Add to group TT %s. Remote Id: %s"
                                        % (o.name, o.tt_system_id)
                                    )
                                else:
                                    # The format string needs a %s for the object name;
                                    # without it the % operator raises TypeError
                                    self.print(
                                        " @ Cannot add to group TT %s. Belongs to other TT system"
                                        % o.name
                                    )
                            else:
                                self.print(
                                    " @ Cannot add to group TT %s. Escalations are disabled"
                                    % (o.name)
                                )
            if e.notification_group:
                if mo.can_notify():
                    subject = e.template.render_subject(**ctx)
                    body = e.template.render_body(**ctx)
                    self.print(
                        " @ Sending notification to group '%s'" % e.notification_group.name
                    )
                    self.print(" | Subject: %s" % subject)
                    self.print(" |")
                    self.print(" | %s" % body.replace("\n", "\n | "))
                else:
                    self.print(" @ Notification disabled by policy")
            if e.stop_processing:
                self.print(" @ Stop processing")
                break