Example #1
0
File: views.py Project: nbashev/noc
 def api_escalation_alarm(self, request, id):
     """
     Start escalation watching for a single alarm.

     :param request: Incoming request object (not used by this handler)
     :param id: Alarm identifier, passed to get_alarm()
     :return: {"status": True} when watch_escalations() was invoked for an
         active alarm, otherwise {"status": False, "error": <reason>}
     """
     alarm = get_alarm(id)
     # get_alarm() may return None for a missing alarm; report it instead of
     # failing with AttributeError on `alarm.status`
     if alarm is None:
         return {"status": False, "error": "Alarm not found"}
     if alarm.status != "A":
         return {"status": False, "error": "The alarm is not active at the moment"}
     AlarmEscalation.watch_escalations(alarm)
     return {"status": True}
Example #2
0
 def api_escalation_alarm(self, request, id):
     """
     Start escalation watching for a single alarm.

     :param request: Incoming request object (not used by this handler)
     :param id: Alarm identifier, passed to get_alarm()
     :return: {'status': True} when watch_escalations() was invoked for an
         active alarm, otherwise {'status': False, 'error': <reason>}
     """
     alarm = get_alarm(id)
     # get_alarm() may return None for a missing alarm; report it instead of
     # failing with AttributeError on `alarm.status`
     if alarm is None:
         return {
             'status': False,
             'error': 'Alarm not found'
         }
     if alarm.status != "A":
         return {
             'status': False,
             'error': 'The alarm is not active at the moment'
         }
     AlarmEscalation.watch_escalations(alarm)
     return {'status': True}
Example #3
0
def check_close_consequence(alarm_id):
    """
    Check whether an alarm has been closed and, when it is still open,
    detach it from its root and start escalation watching for it.

    :param alarm_id: Identifier of the alarm to check
    :return: None
    """
    logger.info("[%s] Checking close", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
    elif alarm.status == "C":
        logger.info("[%s] Alarm is closed. Check passed", alarm_id)
    else:
        # Alarm is still open: drop the link to the root alarm and
        # treat it as an alarm of its own from now on
        logger.info("[%s] Alarm is active. Detaching root", alarm_id)
        alarm.root = None
        alarm.log_message("Detached from root for not recovered", to_save=True)
        metrics["detached_root"] += 1
        # Trigger escalations
        AlarmEscalation.watch_escalations(alarm)
Example #4
0
def alarm_escalation(alarm, mo, ctx):
    """
    Schedule a threshold notification for the first matching escalation item.

    Walks every escalation returned for alarm["alarm_class"]. The first item
    that passes the administrative domain, severity and selector checks and
    has a notification group gets its template rendered with *ctx* and the
    threshold_escalation handler scheduled through call_later().

    :param alarm: Mapping with "alarm_class", "severity" and "vars" keys
        (vars["path"] is read once an item with a notification group matches)
    :param mo: Object exposing the managed object as ``mo.object``
    :param ctx: Keyword arguments passed to the template renderers
    :return: Dict with "notification_group", "clear_template" and "message"
        keys when a notification was scheduled, otherwise None
    """
    started = datetime.datetime.now()
    for escalation in AlarmEscalation.get_class_escalations(alarm['alarm_class']):
        for item in escalation.escalations:
            # Check administrative domain
            domain = item.administrative_domain
            if domain and domain.id not in mo.object.administrative_domain.get_path():
                continue
            # Check severity
            if item.min_severity and alarm['severity'] < item.min_severity:
                continue
            # Check selector
            if item.selector and not SelectorCache.is_in_selector(mo.object, item.selector):
                continue
            logger.info(
                "%s Watch for %s after %s seconds",
                alarm['alarm_class'], escalation.name, item.delay
            )
            # Positive delays are passed on in seconds, anything else as None
            due = started + datetime.timedelta(seconds=item.delay)
            delay = (due - started).total_seconds() if due > started else None
            group = item.notification_group
            if group:
                duplicate = ActiveAlarm.objects.filter(
                    managed_object=mo.object,
                    vars__path=alarm['vars']["path"],
                ).first()
                if duplicate:
                    logger.info("Alarm already sending")
                    # Leaves the current escalation only; the outer loop
                    # continues with the next one
                    break
                # NOTE(review): item.template is used without a None check
                # -- confirm that a template is mandatory for these items
                subject = item.template.render_subject(**ctx)
                body = item.template.render_body(**ctx)
                logger.debug("Notification message:\nSubject: %s\n%s", subject, body)

                call_later(
                    "noc.custom.handlers.thresholds.thresholdsnotification.threshold_escalation",
                    delay=delay,
                    scheduler="scheduler",
                    notification_group_id=group.id,
                    subject=subject,
                    body=body
                )
                return {
                    "notification_group": group,
                    "clear_template": item.clear_template,
                    "message": "Sending message to : %s" % group.name,
                }
            #
            if item.stop_processing:
                logger.debug("Stopping processing")
                break
Example #5
0
def escalate(alarm_id, escalation_id, escalation_delay, *args, **kwargs):
    """
    Run one escalation step for an alarm.

    Loads the alarm and the AlarmEscalation, then processes every escalation
    item whose delay equals *escalation_delay* and whose administrative
    domain, severity, selector and time pattern conditions match. A matching
    item may create a TT (optionally promoted to a group TT), comment the TTs
    of already escalated consequences and send a notification. Afterwards the
    alarm is re-read and, when it was closed in the meantime, a close
    notification is requested through notify_close().

    :param alarm_id: Identifier of the alarm to escalate
    :param escalation_id: Identifier of the AlarmEscalation to apply
    :param escalation_delay: Only items with exactly this delay are processed
    :param args: Not used
    :param kwargs: Not used
    :return: None
    """
    def log(message, *args):
        # Report to the process log (prefixed with the alarm id) and to the
        # alarm's own log
        msg = message % args
        logger.info("[%s] %s", alarm_id, msg)
        alarm.log_message(msg, to_save=True)

    def summary_to_list(summary, model):
        # Resolve summary items to profile names, skip unknown profiles and
        # those with show_in_summary=False, order by (display_order, -summary)
        r = []
        for k in summary:
            p = model.get_by_id(k.profile)
            if not p or getattr(p, "show_in_summary", True) is False:
                continue
            r.append({
                "profile": p.name,
                "summary": k.summary,
                "order": (getattr(p, "display_order", 100), -k.summary),
            })
        return sorted(r, key=operator.itemgetter("order"))

    logger.info("[%s] Performing escalations", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
        metrics["escalation_missed_alarm"] += 1
        return
    if alarm.status == "C":
        logger.info("[%s] Alarm is closed, skipping", alarm_id)
        metrics["escalation_already_closed"] += 1
        return
    if alarm.root:
        # log() already prefixes the message with the alarm id
        log("Alarm is not root cause, skipping")
        metrics["escalation_alarm_is_not_root"] += 1
        return
    #
    escalation = AlarmEscalation.get_by_id(escalation_id)
    if not escalation:
        log("Escalation %s is not found, skipping", escalation_id)
        metrics["escalation_not_found"] += 1
        return
    if alarm.managed_object.tt_system:
        sample = alarm.managed_object.tt_system.telemetry_sample
    else:
        sample = PARENT_SAMPLE
    with Span(client="escalator", sample=sample):
        alarm.set_escalation_context()
        # Evaluate escalation chain
        mo = alarm.managed_object
        # Must be bound before the loop: it is read after the loop even when
        # no item matched or no item creates a TT
        tt_id = None
        for a in escalation.escalations:
            if a.delay != escalation_delay:
                continue  # Try other type
            # Check administrative domain
            if a.administrative_domain and a.administrative_domain.id not in alarm.adm_path:
                continue
            # Check severity
            if a.min_severity and alarm.severity < a.min_severity:
                continue
            # Check selector
            if a.selector and not SelectorCache.is_in_selector(mo, a.selector):
                continue
            # Check time pattern
            if a.time_pattern and not a.time_pattern.match(alarm.timestamp):
                continue
            # Render escalation message
            if not a.template:
                log("No escalation template, skipping")
                continue
            # Check global limits: count alarms (active and archived)
            # escalated within the last config.escalator.ets seconds
            # @todo: Move into escalator service
            # @todo: Process per-ttsystem limits
            ets = datetime.datetime.now() - datetime.timedelta(
                seconds=config.escalator.ets)
            ae = ActiveAlarm._get_collection().count_documents(
                {"escalation_ts": {"$gte": ets}})
            ae += ArchivedAlarm._get_collection().count_documents(
                {"escalation_ts": {"$gte": ets}})
            if ae >= config.escalator.tt_escalation_limit:
                logger.error(
                    "Escalation limit exceeded (%s/%s). Skipping",
                    ae,
                    config.escalator.tt_escalation_limit,
                )
                metrics["escalation_throttled"] += 1
                alarm.set_escalation_error(
                    "Escalation limit exceeded (%s/%s). Skipping" %
                    (ae, config.escalator.tt_escalation_limit))
                return
            # Check whether consequences has escalations
            cons_escalated = sorted(alarm.iter_escalated(),
                                    key=operator.attrgetter("timestamp"))
            affected_objects = sorted(alarm.iter_affected(),
                                      key=operator.attrgetter("name"))
            #
            segment = alarm.managed_object.segment
            if segment.is_redundant:
                uplinks = alarm.managed_object.data.uplinks
                lost_redundancy = len(uplinks) > 1
                affected_subscribers = summary_to_list(
                    segment.total_subscribers, SubscriberProfile)
                affected_services = summary_to_list(segment.total_services,
                                                    ServiceProfile)
            else:
                lost_redundancy = False
                affected_subscribers = []
                affected_services = []
            # Template rendering context
            ctx = {
                "alarm": alarm,
                "affected_objects": affected_objects,
                "cons_escalated": cons_escalated,
                "total_objects": summary_to_list(
                    alarm.total_objects, ManagedObjectProfile),
                "total_subscribers": summary_to_list(
                    alarm.total_subscribers, SubscriberProfile),
                "total_services": summary_to_list(
                    alarm.total_services, ServiceProfile),
                "tt": None,
                "lost_redundancy": lost_redundancy,
                "affected_subscribers": affected_subscribers,
                "affected_services": affected_services,
            }
            # Escalate to TT
            if a.create_tt and mo.can_escalate():
                tt_id = None
                if alarm.escalation_tt:
                    log("Already escalated with TT #%s", alarm.escalation_tt)
                else:
                    pre_reason = escalation.get_pre_reason(mo.tt_system)
                    active_maintenance = Maintenance.get_object_maintenance(mo)
                    if active_maintenance:
                        for m in active_maintenance:
                            log(
                                "Object is under maintenance: %s (%s-%s)",
                                m.subject,
                                m.start,
                                m.stop,
                            )
                        metrics["escalation_stop_on_maintenance"] += 1
                    elif pre_reason is not None:
                        subject = a.template.render_subject(**ctx)
                        body = a.template.render_body(**ctx)
                        logger.debug(
                            "[%s] Escalation message:\nSubject: %s\n%s",
                            alarm_id, subject, body)
                        log("Creating TT in system %s", mo.tt_system.name)
                        tts = mo.tt_system.get_system()
                        try:
                            try:
                                tt_id = tts.create_tt(
                                    queue=mo.tt_queue,
                                    obj=mo.tt_system_id,
                                    reason=pre_reason,
                                    subject=subject,
                                    body=body,
                                    login="******",
                                    timestamp=alarm.timestamp,
                                )
                            except TemporaryTTError as e:
                                metrics["escalation_tt_retry"] += 1
                                log(
                                    "Temporary error detected. Retry after %ss",
                                    RETRY_TIMEOUT)
                                mo.tt_system.register_failure()
                                # NOTE(review): presumably raises to abort
                                # this run; otherwise the code below runs
                                # with tt_id=None -- confirm
                                Job.retry_after(get_next_retry(), str(e))
                            ctx["tt"] = "%s:%s" % (mo.tt_system.name, tt_id)
                            alarm.escalate(
                                ctx["tt"],
                                close_tt=a.close_tt,
                                wait_tt=ctx["tt"] if a.wait_tt else None,
                            )
                            if tts.promote_group_tt and a.promote_group_tt:
                                # Create group TT
                                log("Promoting to group tt")
                                gtt = tts.create_group_tt(
                                    tt_id, alarm.timestamp)
                                # Append affected objects
                                for ao in alarm.iter_affected():
                                    if ao.can_escalate(True):
                                        if ao.tt_system == mo.tt_system:
                                            log(
                                                "Appending object %s to group tt %s",
                                                ao.name, gtt)
                                            try:
                                                tts.add_to_group_tt(
                                                    gtt, ao.tt_system_id)
                                            except TTError as e:
                                                alarm.set_escalation_error(
                                                    "[%s] %s" %
                                                    (mo.tt_system.name, e))
                                        else:
                                            log(
                                                "Cannot append object %s to group tt %s: Belongs to other TT system",
                                                ao.name,
                                                gtt,
                                            )
                                    else:
                                        log(
                                            "Cannot append object %s to group tt %s: Escalations are disabled",
                                            ao.name,
                                            gtt,
                                        )
                            metrics["escalation_tt_create"] += 1
                        except TTError as e:
                            log("Failed to create TT: %s", e)
                            metrics["escalation_tt_fail"] += 1
                            alarm.log_message("Failed to escalate: %s" % e,
                                              to_save=True)
                            alarm.set_escalation_error("[%s] %s" %
                                                       (mo.tt_system.name, e))
                    else:
                        log("Cannot find pre reason")
                        metrics["escalation_tt_fail"] += 1
                if tt_id and cons_escalated:
                    # Notify consequences
                    for ca in cons_escalated:
                        # escalation_tt is stored as "<tt system name>:<tt id>";
                        # split once so a colon inside the id is kept
                        c_tt_name, c_tt_id = ca.escalation_tt.split(":", 1)
                        cts = TTSystem.get_by_name(c_tt_name)
                        if cts:
                            tts = cts.get_system()
                            try:
                                log("Appending comment to TT %s", tt_id)
                                tts.add_comment(c_tt_id,
                                                body="Covered by TT %s" %
                                                tt_id,
                                                login="******")
                                metrics["escalation_tt_comment"] += 1
                            except NotImplementedError:
                                log(
                                    "Cannot add comment to %s: Feature not implemented",
                                    ca.escalation_tt,
                                )
                                metrics["escalation_tt_comment_fail"] += 1
                            except TTError as e:
                                log("Failed to add comment to %s: %s",
                                    ca.escalation_tt, e)
                                metrics["escalation_tt_comment_fail"] += 1
                        else:
                            log(
                                "Failed to add comment to %s: Invalid TT system",
                                ca.escalation_tt)
                            metrics["escalation_tt_comment_fail"] += 1
            # Send notification
            if a.notification_group and mo.can_notify():
                subject = a.template.render_subject(**ctx)
                body = a.template.render_body(**ctx)
                logger.debug("[%s] Notification message:\nSubject: %s\n%s",
                             alarm_id, subject, body)
                log("Sending notification to group %s",
                    a.notification_group.name)
                a.notification_group.notify(subject, body)
                alarm.set_clear_notification(a.notification_group,
                                             a.clear_template)
                metrics["escalation_notify"] += 1
            #
            if a.stop_processing:
                logger.debug("Stopping processing")
                break
        # Re-read the alarm: it may have been closed while escalating
        nalarm = get_alarm(alarm_id)
        if nalarm and nalarm.status == "C":
            nalarm.log_message(
                "Alarm has been closed during escalation. Try to deescalate")
            logger.info(
                "[%s] Alarm has been closed during escalation. Try to deescalate",
                alarm.id)
            metrics["escalation_closed_while_escalated"] += 1
            if tt_id and not nalarm.escalation_tt:
                nalarm.escalation_ts = datetime.datetime.now()
                nalarm.escalation_tt = "%s:%s" % (mo.tt_system.name, tt_id)
                nalarm.save()
            if not nalarm.escalation_close_ts and not nalarm.escalation_close_error:
                notify_close(
                    alarm_id=alarm_id,
                    tt_id=nalarm.escalation_tt,
                    subject="Closing",
                    body="Closing",
                    notification_group_id=alarm.clear_notification_group.id
                    if alarm.clear_notification_group else None,
                    close_tt=alarm.close_tt,
                )
        elif nalarm and nalarm.status == "A" and not nalarm.escalation_tt and tt_id:
            # A TT was created but is not recorded on the still active alarm
            logger.error("[%s] Alarm without escalation TT: %s", alarm.id,
                         tt_id)
        logger.info("[%s] Escalations loop end", alarm_id)
Example #6
0
 def run_alarm(self, alarm):
     """
     Start escalation watching for *alarm*.

     Delegates to AlarmEscalation.watch_escalations(); nothing is returned.

     :param alarm: Alarm passed through to watch_escalations()
     """
     AlarmEscalation.watch_escalations(alarm)
Example #7
0
 def raise_alarm(self, r, e):
     """
     Raise an alarm for event *e* according to rule *r*.

     For unique rules an existing active alarm with the same discriminator
     (or an archived one that can still be reopened) receives the event
     instead of a new alarm being created. Otherwise a new ActiveAlarm is
     saved, correlated, announced through the managed object's event hook
     (root-less alarms only), passed to diagnostics and, when
     auto-escalation is enabled, to escalation watching.

     :param r: Disposition rule providing managed_object expression,
         get_vars(), unique, alarm_class and u_name
     :param e: Event that triggers the alarm
     :return: None
     """
     managed_object = self.eval_expression(r.managed_object, event=e)
     if not managed_object:
         self.logger.info("Empty managed object, ignoring")
         return
     # @todo: Make configurable
     if not managed_object.is_managed:
         self.logger.info("Managed object is not managed. Do not raise alarm")
         return
     if e.managed_object.id != managed_object.id:
         metrics["alarm_change_mo"] += 1
         self.logger.info("Changing managed object to %s", managed_object.name)
     discriminator, alarm_vars = r.get_vars(e)
     if r.unique:
         assert discriminator is not None
         alarm = ActiveAlarm.objects.filter(
             managed_object=managed_object.id,
             discriminator=discriminator,
         ).first()
         if not alarm:
             # No active alarm: look for an archived one to reopen
             alarm = ArchivedAlarm.objects.filter(
                 managed_object=managed_object.id,
                 discriminator=discriminator,
                 control_time__gte=e.timestamp,
             ).first()
             if alarm:
                 # Reopen alarm
                 self.logger.info(
                     "[%s|%s|%s] %s reopens alarm %s(%s)",
                     e.id,
                     managed_object.name,
                     managed_object.address,
                     e.event_class.name,
                     alarm.alarm_class.name,
                     alarm.id,
                 )
                 alarm = alarm.reopen(
                     "Reopened by disposition rule '%s'" % r.u_name)
                 metrics["alarm_reopen"] += 1
         if alarm:
             # Active alarm found, refresh
             self.logger.info(
                 "[%s|%s|%s] Contributing event %s to active alarm %s(%s)",
                 e.id,
                 managed_object.name,
                 managed_object.address,
                 e.event_class.name,
                 alarm.alarm_class.name,
                 alarm.id,
             )
             # Contribute event to alarm
             e.contribute_to_alarm(alarm)
             if e.timestamp < alarm.timestamp:
                 # Event predates the alarm: move the alarm start back
                 alarm.timestamp = e.timestamp
                 alarm.save()
             elif e.timestamp > alarm.last_update:
                 # Event is newer than the last update: refresh it
                 alarm.last_update = e.timestamp
                 alarm.save()
             metrics["alarm_contribute"] += 1
             return
     # Calculate alarm coverage
     summary = ServiceSummary.get_object_summary(managed_object)
     summary["object"] = {managed_object.object_profile.id: 1}
     # Severity is never below 1
     severity = max(ServiceSummary.get_severity(summary), 1)
     self.logger.info(
         "[%s|%s|%s] %s: Calculated alarm severity is: %s",
         e.id,
         managed_object.name,
         managed_object.address,
         r.u_name,
         severity,
     )
     # Create new alarm
     alarm = ActiveAlarm(
         timestamp=e.timestamp,
         last_update=e.timestamp,
         managed_object=managed_object.id,
         alarm_class=r.alarm_class,
         severity=severity,
         vars=alarm_vars,
         discriminator=discriminator,
         direct_services=SummaryItem.dict_to_items(summary["service"]),
         direct_subscribers=SummaryItem.dict_to_items(summary["subscriber"]),
         total_objects=ObjectSummaryItem.dict_to_items(summary["object"]),
         total_services=SummaryItem.dict_to_items(summary["service"]),
         total_subscribers=SummaryItem.dict_to_items(summary["subscriber"]),
         log=[
             AlarmLog(
                 timestamp=datetime.datetime.now(),
                 from_status="A",
                 to_status="A",
                 message="Alarm risen from event %s(%s) by rule '%s'" %
                 (str(e.id), str(e.event_class.name), r.u_name),
             )
         ],
         opening_event=e.id,
     )
     alarm.save()
     e.contribute_to_alarm(alarm)
     self.logger.info(
         "[%s|%s|%s] %s raises alarm %s(%s): %r",
         e.id,
         managed_object.name,
         managed_object.address,
         e.event_class.name,
         alarm.alarm_class.name,
         alarm.id,
         alarm.vars,
     )
     metrics["alarm_raise"] += 1
     self.correlate(r, alarm)
     # Notify about new alarm
     if not alarm.root:
         payload = {
             "alarm": alarm,
             "subject": alarm.subject,
             "body": alarm.body,
             "symptoms": alarm.alarm_class.symptoms,
             "recommended_actions": alarm.alarm_class.recommended_actions,
             "probable_causes": alarm.alarm_class.probable_causes,
         }
         alarm.managed_object.event(
             alarm.managed_object.EV_ALARM_RISEN,
             payload,
             delay=alarm.alarm_class.get_notification_delay(),
         )
     # Gather diagnostics when necessary
     AlarmDiagnosticConfig.on_raise(alarm)
     # Watch for escalations, when necessary
     if config.correlator.auto_escalation and not alarm.root:
         AlarmEscalation.watch_escalations(alarm)