Exemple #1
0
 def watch_escalations(cls, alarm):
     """
     Schedule an escalation job for every escalation item matching the alarm

     Items are filtered by administrative domain, minimal severity and
     selector. Each surviving item is submitted to the "escalator" scheduler
     via ``call_later``.

     :param alarm: Alarm to watch; its alarm_class, adm_path, severity,
         managed_object, timestamp and id are read
     :return: None
     """
     started = datetime.datetime.now()
     for chain in cls.get_class_escalations(alarm.alarm_class):
         for item in chain.escalations:
             # Item is bound to an administrative domain outside of the alarm's path
             domain = item.administrative_domain
             if domain and domain.id not in alarm.adm_path:
                 continue
             # Item demands a higher severity than the alarm has
             if item.min_severity and alarm.severity < item.min_severity:
                 continue
             # Item's selector does not cover the managed object
             if item.selector:
                 if not SelectorCache.is_in_selector(alarm.managed_object, item.selector):
                     continue
             logger.debug("[%s] Watch for %s after %s seconds", alarm.id, chain.name, item.delay)
             fire_at = alarm.timestamp + datetime.timedelta(seconds=item.delay)
             # A deadline which is not in the future leaves the delay unset
             remaining = (fire_at - started).total_seconds() if fire_at > started else None
             call_later(
                 "noc.services.escalator.escalation.escalate",
                 scheduler="escalator",
                 pool=alarm.managed_object.escalator_shard,
                 delay=remaining,
                 max_runs=chain.max_escalation_retries,
                 alarm_id=alarm.id,
                 escalation_id=chain.id,
                 escalation_delay=item.delay,
             )
Exemple #2
0
def alarm_escalation(alarm, mo, ctx):
    """
    Schedule a threshold notification for the first matching escalation item.

    Escalation items are filtered by administrative domain, minimal severity
    and selector. For the first surviving item having a notification group
    the message is rendered and submitted via ``call_later``.

    :param alarm: Mapping; the "alarm_class", "severity" and "vars" keys are read
    :param mo: Object whose ``object`` attribute is used as the managed object
    :param ctx: Mapping expanded into keyword arguments of the template renderers
    :return: Dict with "notification_group", "clear_template" and "message"
        keys when a notification has been scheduled, None otherwise
    """
    started = datetime.datetime.now()
    for chain in AlarmEscalation.get_class_escalations(alarm["alarm_class"]):
        for item in chain.escalations:
            # Administrative domain filter
            domain = item.administrative_domain
            if domain and domain.id not in mo.object.administrative_domain.get_path():
                continue
            # Severity filter
            if item.min_severity and alarm["severity"] < item.min_severity:
                continue
            # Selector filter
            if item.selector and not SelectorCache.is_in_selector(mo.object, item.selector):
                continue
            logger.info(
                "%s Watch for %s after %s seconds", alarm["alarm_class"], chain.name, item.delay
            )
            # The deadline is counted from the current time, not from an alarm timestamp
            fire_at = started + datetime.timedelta(seconds=item.delay)
            wait = (fire_at - started).total_seconds() if fire_at > started else None
            if item.notification_group:
                existing = ActiveAlarm.objects.filter(
                    managed_object=mo.object, vars__path=alarm["vars"]["path"]
                ).first()
                if existing:
                    # Leaves only the current chain; following chains are still evaluated
                    logger.info("Alarm already sending")
                    break
                subject = item.template.render_subject(**ctx)
                body = item.template.render_body(**ctx)
                logger.debug("Notification message:\nSubject: %s\n%s", subject, body)

                call_later(
                    "noc.custom.handlers.thresholds.thresholdsnotification.threshold_escalation",
                    delay=wait,
                    scheduler="scheduler",
                    notification_group_id=item.notification_group.id,
                    subject=subject,
                    body=body,
                )
                return {
                    "notification_group": item.notification_group,
                    "clear_template": item.clear_template,
                    "message": "Sending message to : %s" % item.notification_group.name,
                }
            if item.stop_processing:
                logger.debug("Stopping processing")
                break
Exemple #3
0
 def watch_escalations(cls, alarm, timestamp_policy="a"):
     """
     Schedule an escalation job for every escalation item matching the alarm

     Items are filtered by administrative domain, minimal severity and
     selector. Processing of a chain stops after an item with
     ``stop_processing`` set has been scheduled.

     :param alarm: Alarm to watch; its alarm_class, adm_path, severity,
         managed_object, timestamp and id are read
     :param timestamp_policy: Forwarded to the escalation job. With "c" the
         job is always delayed: 120 seconds when the deadline has passed,
         otherwise the remaining time (at least 120) plus another 120 seconds
     :return: None
     """
     started = datetime.datetime.now()
     for chain in cls.get_class_escalations(alarm.alarm_class):
         for item in chain.escalations:
             # Administrative domain filter
             domain = item.administrative_domain
             if domain and domain.id not in alarm.adm_path:
                 continue
             # Severity filter
             if item.min_severity and alarm.severity < item.min_severity:
                 continue
             # Selector filter
             if item.selector:
                 if not SelectorCache.is_in_selector(alarm.managed_object, item.selector):
                     continue
             logger.debug("[%s] Watch for %s after %s seconds", alarm.id, chain.name, item.delay)
             fire_at = alarm.timestamp + datetime.timedelta(seconds=item.delay)
             pending = fire_at > started
             if timestamp_policy == "c":
                 # If escalation with current timestamp - shift consequence after main escalation
                 if pending:
                     wait = max((fire_at - started).total_seconds(), 120) + 120
                 else:
                     wait = 120
                 logger.info(
                     "[%s] Watch escalation with create new timestamp policy, after %s seconds",
                     alarm.id,
                     wait,
                 )
             elif pending:
                 wait = (fire_at - started).total_seconds()
             else:
                 wait = None
             call_later(
                 "noc.services.escalator.escalation.escalate",
                 scheduler="escalator",
                 pool=alarm.managed_object.escalator_shard,
                 delay=wait,
                 max_runs=chain.max_escalation_retries,
                 alarm_id=alarm.id,
                 escalation_id=chain.id,
                 escalation_delay=item.delay,
                 timestamp_policy=timestamp_policy,
             )
             if item.stop_processing:
                 break
Exemple #4
0
 def on_clear(cls, alarm):
     """
     Submit clear jobs

     Collects on-clear scripts, actions and handlers of all diagnostic
     configs matching the alarm, grouped by their delay, submits one
     "correlator" job per delay and finally clears the alarm's diagnostics.

     :param alarm: Alarm being cleared
     :return: None
     """
     jobs_by_delay = defaultdict(list)
     for diag in cls.get_class_diagnostics(alarm.alarm_class):
         # Config is restricted to a selector the managed object is not in
         if diag.selector:
             if not SelectorCache.is_in_selector(alarm.managed_object, diag.selector):
                 continue
         # Root-only config while the alarm has a root set
         if diag.only_root and alarm.root:
             continue
         if not diag.enable_on_clear:
             continue
         if diag.on_clear_script:
             jobs_by_delay[diag.on_clear_delay].append(
                 {"script": diag.on_clear_script, "header": diag.on_clear_header}
             )
         if diag.on_clear_action:
             jobs_by_delay[diag.on_clear_delay].append(
                 {"action": diag.on_clear_action.id, "header": diag.on_clear_header}
             )
         if diag.on_clear_handler:
             jobs_by_delay[diag.on_clear_delay].append(
                 {"handler": diag.on_clear_handler, "header": diag.on_clear_header}
             )
     # Submit on_clear job, one per distinct delay
     for delay, jobs in jobs_by_delay.items():
         call_later(
             "noc.fm.models.alarmdiagnosticconfig.on_clear",
             scheduler="correlator",
             pool=alarm.managed_object.pool.name,
             delay=delay,
             alarm=alarm.id,
             cfg=jobs,
         )
     AlarmDiagnostic.clear_diagnostics(alarm)
Exemple #5
0
 def on_raise(cls, alarm):
     """
     Submit raise and periodic jobs

     Collects on-raise and periodic scripts, actions and handlers of all
     diagnostic configs matching the alarm. On-raise entries are grouped by
     their delay, periodic entries by their interval; one "correlator" job
     is submitted per group.

     :param alarm: Alarm being raised
     :return: None
     """
     raise_jobs = defaultdict(list)
     periodic_jobs = defaultdict(list)
     for diag in cls.get_class_diagnostics(alarm.alarm_class):
         # Config is restricted to a selector the managed object is not in
         if diag.selector:
             if not SelectorCache.is_in_selector(alarm.managed_object, diag.selector):
                 continue
         # Root-only config while the alarm has a root set
         if diag.only_root and alarm.root:
             continue
         if diag.enable_on_raise:
             if diag.on_raise_script:
                 raise_jobs[diag.on_raise_delay].append(
                     {"script": diag.on_raise_script, "header": diag.on_raise_header}
                 )
             if diag.on_raise_action:
                 raise_jobs[diag.on_raise_delay].append(
                     {"action": diag.on_raise_action.name, "header": diag.on_raise_header}
                 )
             if diag.on_raise_handler:
                 raise_jobs[diag.on_raise_delay].append(
                     {"handler": diag.on_raise_handler, "header": diag.on_raise_header}
                 )
         if diag.enable_periodic:
             if diag.periodic_script:
                 periodic_jobs[diag.periodic_interval].append(
                     {"script": diag.periodic_script, "header": diag.periodic_header}
                 )
             if diag.periodic_action:
                 periodic_jobs[diag.periodic_interval].append(
                     {"action": diag.periodic_action.name, "header": diag.periodic_header}
                 )
             if diag.periodic_handler:
                 periodic_jobs[diag.periodic_interval].append(
                     {"handler": diag.periodic_handler, "header": diag.periodic_header}
                 )
     # Submit on_raise job, one per distinct delay
     for delay, jobs in raise_jobs.items():
         call_later(
             "noc.fm.models.alarmdiagnosticconfig.on_raise",
             scheduler="correlator",
             pool=alarm.managed_object.pool.name,
             delay=delay,
             alarm=alarm.id,
             cfg=jobs,
         )
     # Submit periodic job, one per distinct interval
     for interval, jobs in periodic_jobs.items():
         call_later(
             "noc.fm.models.alarmdiagnosticconfig.periodic",
             scheduler="correlator",
             max_runs=PERIODIC_JOB_MAX_RUNS,
             pool=alarm.managed_object.pool.name,
             delay=interval,
             alarm=alarm.id,
             cfg={"cfg": jobs, "delay": interval},
         )
Exemple #6
0
def escalate(alarm_id, escalation_id, escalation_delay, *args, **kwargs):
    """
    Run the escalation chain items scheduled with the given delay for an alarm.

    The call is skipped (with a metric increment) when the alarm is missing,
    closed or has a root set, or when the escalation cannot be found.
    For every chain item whose delay equals *escalation_delay* and whose
    filters match, the function optionally creates a trouble ticket (TT),
    comments the TTs of already escalated consequences and sends a
    notification. Afterwards the alarm is re-read and, if it has been closed
    in the meantime, ``notify_close`` is called.

    :param alarm_id: Id of the alarm to escalate
    :param escalation_id: Id of the AlarmEscalation to evaluate
    :param escalation_delay: Only chain items with exactly this delay are processed
    :param args: Not used
    :param kwargs: Not used
    :return: None
    """

    def log(message, *args):
        # Write the message both to the service log and to the alarm's own log
        msg = message % args
        logger.info("[%s] %s", alarm_id, msg)
        alarm.log_message(msg, to_save=True)

    def summary_to_list(summary, model):
        # Convert summary items to a list of dicts ordered by the profile's
        # display order and then by descending summary value
        r = []
        for k in summary:
            p = model.get_by_id(k.profile)
            if not p or getattr(p, "show_in_summary", True) is False:
                continue
            r += [{
                "profile": p.name,
                "summary": k.summary,
                "order": (getattr(p, "display_order", 100), -k.summary),
            }]
        return sorted(r, key=operator.itemgetter("order"))

    logger.info("[%s] Performing escalations", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
        metrics["escalation_missed_alarm"] += 1
        return
    if alarm.status == "C":
        logger.info("[%s] Alarm is closed, skipping", alarm_id)
        metrics["escalation_already_closed"] += 1
        return
    if alarm.root:
        # NOTE(review): log() already prefixes the logger record with the
        # alarm id, so the id shows up twice for this message
        log("[%s] Alarm is not root cause, skipping", alarm_id)
        metrics["escalation_alarm_is_not_root"] += 1
        return
    #
    escalation = AlarmEscalation.get_by_id(escalation_id)
    if not escalation:
        log("Escalation %s is not found, skipping", escalation_id)
        metrics["escalation_not_found"] += 1
        return
    if alarm.managed_object.tt_system:
        sample = alarm.managed_object.tt_system.telemetry_sample
    else:
        sample = PARENT_SAMPLE
    # The span object itself is not used inside the block, so it is not bound
    # to a name (it used to be shadowed by the template context below)
    with Span(client="escalator", sample=sample):
        alarm.set_escalation_context()
        # Evaluate escalation chain
        mo = alarm.managed_object
        # Must be bound before the loop: it is read after the loop even when
        # no item reaches the TT creation branch
        tt_id = None
        for a in escalation.escalations:
            if a.delay != escalation_delay:
                continue  # Try other type
            # Check administrative domain
            if a.administrative_domain and a.administrative_domain.id not in alarm.adm_path:
                continue
            # Check severity
            if a.min_severity and alarm.severity < a.min_severity:
                continue
            # Check selector
            if a.selector and not SelectorCache.is_in_selector(mo, a.selector):
                continue
            # Check time pattern
            if a.time_pattern and not a.time_pattern.match(alarm.timestamp):
                continue
            # Render escalation message
            if not a.template:
                log("No escalation template, skipping")
                continue
            # Check global limits
            # @todo: Move into escalator service
            # @todo: Process per-ttsystem limits
            ets = datetime.datetime.now() - datetime.timedelta(seconds=config.escalator.ets)
            # Count active and archived alarms escalated since `ets`
            ae = ActiveAlarm._get_collection().count_documents(
                {"escalation_ts": {"$gte": ets}}
            )
            ae += ArchivedAlarm._get_collection().count_documents(
                {"escalation_ts": {"$gte": ets}}
            )
            if ae >= config.escalator.tt_escalation_limit:
                logger.error(
                    "Escalation limit exceeded (%s/%s). Skipping",
                    ae,
                    config.escalator.tt_escalation_limit,
                )
                metrics["escalation_throttled"] += 1
                alarm.set_escalation_error(
                    "Escalation limit exceeded (%s/%s). Skipping"
                    % (ae, config.escalator.tt_escalation_limit)
                )
                return
            # Check whether consequences has escalations
            cons_escalated = sorted(alarm.iter_escalated(), key=operator.attrgetter("timestamp"))
            affected_objects = sorted(alarm.iter_affected(), key=operator.attrgetter("name"))
            #
            segment = alarm.managed_object.segment
            if segment.is_redundant:
                uplinks = alarm.managed_object.data.uplinks
                lost_redundancy = len(uplinks) > 1
                affected_subscribers = summary_to_list(
                    segment.total_subscribers, SubscriberProfile
                )
                affected_services = summary_to_list(segment.total_services, ServiceProfile)
            else:
                lost_redundancy = False
                affected_subscribers = []
                affected_services = []
            # Template rendering context
            ctx = {
                "alarm": alarm,
                "affected_objects": affected_objects,
                "cons_escalated": cons_escalated,
                "total_objects": summary_to_list(alarm.total_objects, ManagedObjectProfile),
                "total_subscribers": summary_to_list(alarm.total_subscribers, SubscriberProfile),
                "total_services": summary_to_list(alarm.total_services, ServiceProfile),
                "tt": None,
                "lost_redundancy": lost_redundancy,
                "affected_subscribers": affected_subscribers,
                "affected_services": affected_services,
            }
            # Escalate to TT
            if a.create_tt and mo.can_escalate():
                tt_id = None
                if alarm.escalation_tt:
                    log("Already escalated with TT #%s", alarm.escalation_tt)
                else:
                    pre_reason = escalation.get_pre_reason(mo.tt_system)
                    active_maintenance = Maintenance.get_object_maintenance(mo)
                    if active_maintenance:
                        for m in active_maintenance:
                            log(
                                "Object is under maintenance: %s (%s-%s)",
                                m.subject,
                                m.start,
                                m.stop,
                            )
                        metrics["escalation_stop_on_maintenance"] += 1
                    elif pre_reason is not None:
                        subject = a.template.render_subject(**ctx)
                        body = a.template.render_body(**ctx)
                        logger.debug(
                            "[%s] Escalation message:\nSubject: %s\n%s", alarm_id, subject, body
                        )
                        log("Creating TT in system %s", mo.tt_system.name)
                        tts = mo.tt_system.get_system()
                        try:
                            try:
                                tt_id = tts.create_tt(
                                    queue=mo.tt_queue,
                                    obj=mo.tt_system_id,
                                    reason=pre_reason,
                                    subject=subject,
                                    body=body,
                                    login="******",
                                    timestamp=alarm.timestamp,
                                )
                            except TemporaryTTError as e:
                                metrics["escalation_tt_retry"] += 1
                                log("Temporary error detected. Retry after %ss", RETRY_TIMEOUT)
                                mo.tt_system.register_failure()
                                # NOTE(review): presumably Job.retry_after raises to
                                # abort this run; otherwise tt_id is still None
                                # below - confirm
                                Job.retry_after(get_next_retry(), str(e))
                            ctx["tt"] = "%s:%s" % (mo.tt_system.name, tt_id)
                            alarm.escalate(
                                ctx["tt"],
                                close_tt=a.close_tt,
                                wait_tt=ctx["tt"] if a.wait_tt else None,
                            )
                            if tts.promote_group_tt and a.promote_group_tt:
                                # Create group TT
                                log("Promoting to group tt")
                                gtt = tts.create_group_tt(tt_id, alarm.timestamp)
                                # Append affected objects
                                for ao in alarm.iter_affected():
                                    if ao.can_escalate(True):
                                        if ao.tt_system == mo.tt_system:
                                            log(
                                                "Appending object %s to group tt %s",
                                                ao.name,
                                                gtt,
                                            )
                                            try:
                                                tts.add_to_group_tt(gtt, ao.tt_system_id)
                                            except TTError as e:
                                                alarm.set_escalation_error(
                                                    "[%s] %s" % (mo.tt_system.name, e)
                                                )
                                        else:
                                            log(
                                                "Cannot append object %s to group tt %s: Belongs to other TT system",
                                                ao.name,
                                                gtt,
                                            )
                                    else:
                                        log(
                                            "Cannot append object %s to group tt %s: Escalations are disabled",
                                            ao.name,
                                            gtt,
                                        )
                            metrics["escalation_tt_create"] += 1
                        except TTError as e:
                            log("Failed to create TT: %s", e)
                            metrics["escalation_tt_fail"] += 1
                            alarm.log_message("Failed to escalate: %s" % e, to_save=True)
                            alarm.set_escalation_error("[%s] %s" % (mo.tt_system.name, e))
                    else:
                        log("Cannot find pre reason")
                        metrics["escalation_tt_fail"] += 1
                if tt_id and cons_escalated:
                    # Notify consequences
                    for ca in cons_escalated:
                        # escalation_tt is built as "<tt system name>:<tt id>";
                        # split only once so a colon inside the TT id does not
                        # break unpacking
                        c_tt_name, c_tt_id = ca.escalation_tt.split(":", 1)
                        cts = TTSystem.get_by_name(c_tt_name)
                        if cts:
                            tts = cts.get_system()
                            try:
                                log("Appending comment to TT %s", tt_id)
                                tts.add_comment(
                                    c_tt_id, body="Covered by TT %s" % tt_id, login="******"
                                )
                                metrics["escalation_tt_comment"] += 1
                            except NotImplementedError:
                                log(
                                    "Cannot add comment to %s: Feature not implemented",
                                    ca.escalation_tt,
                                )
                                metrics["escalation_tt_comment_fail"] += 1
                            except TTError as e:
                                log("Failed to add comment to %s: %s", ca.escalation_tt, e)
                                metrics["escalation_tt_comment_fail"] += 1
                        else:
                            log(
                                "Failed to add comment to %s: Invalid TT system",
                                ca.escalation_tt,
                            )
                            metrics["escalation_tt_comment_fail"] += 1
            # Send notification
            if a.notification_group and mo.can_notify():
                subject = a.template.render_subject(**ctx)
                body = a.template.render_body(**ctx)
                logger.debug(
                    "[%s] Notification message:\nSubject: %s\n%s", alarm_id, subject, body
                )
                log("Sending notification to group %s", a.notification_group.name)
                a.notification_group.notify(subject, body)
                alarm.set_clear_notification(a.notification_group, a.clear_template)
                metrics["escalation_notify"] += 1
            #
            if a.stop_processing:
                logger.debug("Stopping processing")
                break
        # Re-read the alarm to detect a close which happened during escalation
        nalarm = get_alarm(alarm_id)
        if nalarm and nalarm.status == "C":
            nalarm.log_message("Alarm has been closed during escalation. Try to deescalate")
            logger.info(
                "[%s] Alarm has been closed during escalation. Try to deescalate", alarm.id
            )
            metrics["escalation_closed_while_escalated"] += 1
            if tt_id and not nalarm.escalation_tt:
                nalarm.escalation_ts = datetime.datetime.now()
                nalarm.escalation_tt = "%s:%s" % (mo.tt_system.name, tt_id)
                nalarm.save()
            if not nalarm.escalation_close_ts and not nalarm.escalation_close_error:
                notify_close(
                    alarm_id=alarm_id,
                    tt_id=nalarm.escalation_tt,
                    subject="Closing",
                    body="Closing",
                    notification_group_id=alarm.clear_notification_group.id
                    if alarm.clear_notification_group
                    else None,
                    close_tt=alarm.close_tt,
                )
        elif nalarm and nalarm.status == "A" and not nalarm.escalation_tt and tt_id:
            # Compare the alarm's status, not the alarm object itself
            logger.error("[%s] Alarm without escalation TT: %s", alarm.id, tt_id)
        logger.info("[%s] Escalations loop end", alarm_id)
Exemple #7
0
    def check_alarm(self, alarm):
        """
        Print a dry-run report of the escalations which would apply to an alarm

        For every AlarmEscalation matching the alarm's class the chain items
        are checked against the same filters as the real escalation
        (administrative domain, severity, selector, time pattern, template)
        and the would-be TT and notification messages are rendered and
        printed via ``self.print``. Nothing is created or sent.

        :param alarm: Alarm to check
        :return: None
        """

        def summary_to_list(summary, model):
            # Convert summary items to a list of dicts ordered by descending summary
            r = []
            for k in summary:
                p = model.get_by_id(k.profile)
                if not p or getattr(p, "show_in_summary", True) is False:
                    continue
                r += [{"profile": p.name, "summary": k.summary}]
            return sorted(r, key=lambda x: -x["summary"])

        def iter_consequences(alarm):
            """
            Generator yielding all consequences alarm
            """
            for ac in [ArchivedAlarm, ActiveAlarm]:
                for a in ac.objects.filter(root=alarm.id):
                    yield a
                    for ca in a.iter_consequences():
                        yield ca

        def iter_affected(alarm):
            """
            Generator yielding all affected managed objects
            """
            seen = {alarm.managed_object}
            yield alarm.managed_object
            for a in iter_consequences(alarm):
                if a.managed_object not in seen:
                    seen.add(a.managed_object)
                    yield a.managed_object

        def iter_escalated(alarm):
            """
            Generator yielding all escalated consequences
            """
            for a in iter_consequences(alarm):
                if a.escalation_tt:
                    yield a

        mo = alarm.managed_object
        self.print("-" * 72)
        self.print("Alarm Id : %s  Time: %s" % (alarm.id, alarm.timestamp))
        self.print("Class    : %s" % alarm.alarm_class.name)
        self.print("Object   : %s  Platform: %s  IP: %s" % (mo.name, mo.platform, mo.address))
        # Build the administrative domain path, topmost ancestor first
        c = mo.administrative_domain
        adm_domains = [c]
        while c.parent:
            c = c.parent
            adm_domains.insert(0, c)
        self.print(
            "Adm. Dom.: %s (%s)"
            % (
                " | ".join(d.name for d in adm_domains),
                " | ".join(str(d.id) for d in adm_domains),
            )
        )
        escalations = list(
            AlarmEscalation.objects.filter(alarm_classes__alarm_class=alarm.alarm_class.id)
        )
        if not escalations:
            self.print("@ No matched escalations")
            return
        for esc in escalations:
            self.print("[Chain: %s]" % esc.name)
            if alarm.root:
                self.print("    @ Not a root cause (Root Id: %s)" % alarm.root)
                continue
            for e in esc.escalations:
                self.print("    [After %ss]" % e.delay)
                # Check administrative domain
                if e.administrative_domain and e.administrative_domain.id not in alarm.adm_path:
                    self.print(
                        "    @ Administrative domain mismatch (%s not in %s)"
                        % (e.administrative_domain.id, alarm.adm_path)
                    )
                    continue
                # Check severity
                if e.min_severity and alarm.severity < e.min_severity:
                    self.print(
                        "    @ Severity mismatch: %s < %s" % (alarm.severity, e.min_severity)
                    )
                    continue
                # Check selector
                if e.selector and not SelectorCache.is_in_selector(mo, e.selector):
                    self.print("    @ Selector mismatch (%s required)" % (e.selector.name))
                    continue
                # Check time pattern
                if e.time_pattern and not e.time_pattern.match(alarm.timestamp):
                    self.print("    @ Time pattern mismatch (%s required)" % (e.time_pattern.name))
                    continue
                # Render escalation message
                if not e.template:
                    self.print("    @ No escalation template")
                    continue
                # Check whether consequences has escalations
                cons_escalated = sorted(iter_escalated(alarm), key=operator.attrgetter("timestamp"))
                affected_objects = sorted(iter_affected(alarm), key=operator.attrgetter("name"))
                # Template rendering context
                ctx = {
                    "alarm": alarm,
                    "affected_objects": affected_objects,
                    "cons_escalated": cons_escalated,
                    "total_objects": summary_to_list(alarm.total_objects, ManagedObjectProfile),
                    "total_subscribers": summary_to_list(
                        alarm.total_subscribers, SubscriberProfile
                    ),
                    "total_services": summary_to_list(alarm.total_services, ServiceProfile),
                    "tt": None,
                }
                if e.create_tt:
                    self.print("    Creating TT")
                    tt_system = mo.tt_system
                    if not tt_system:
                        self.print("    @ No TT System. Cannot escalate")
                    elif not mo.can_escalate():
                        self.print("    @ Escalation disabled by policy")
                    else:
                        tts = tt_system.get_system()
                        self.print(
                            "    TT System: %s  Mapped Id: %s" % (tt_system.name, mo.tt_system_id)
                        )
                        subject = e.template.render_subject(**ctx)
                        body = e.template.render_body(**ctx)
                        self.print("    @ Create network TT")
                        self.print("    | Subject: %s" % subject)
                        self.print("    |")
                        self.print("    | %s" % body.replace("\n", "\n    | "))
                        # Placeholder id: no TT is created by the check
                        tt_id = "<NETWORK TT>"
                        ctx["tt"] = "%s:%s" % (tt_system.name, tt_id)
                        if tts.promote_group_tt:
                            self.print("    Promoting group TT")
                            self.print("    @ Create Group TT")
                            # Add objects
                            # NOTE(review): uses the alarm's own iter_affected(),
                            # not the local iter_affected() helper - confirm intended
                            for o in alarm.iter_affected():
                                if o.can_escalate(depended=True):
                                    if o.tt_system == mo.tt_system:
                                        self.print(
                                            "    @ Add to group TT %s. Remote Id: %s"
                                            % (o.name, o.tt_system_id)
                                        )
                                    else:
                                        # The format string needs a placeholder for the
                                        # object name, otherwise "%" raises TypeError
                                        self.print(
                                            "    @ Cannot add to group TT %s. Belongs to other TT system"
                                            % o.name
                                        )
                                else:
                                    self.print(
                                        "    @ Cannot add to group TT %s. Escalations are disabled"
                                        % (o.name)
                                    )
                if e.notification_group:
                    if mo.can_notify():
                        subject = e.template.render_subject(**ctx)
                        body = e.template.render_body(**ctx)
                        self.print(
                            "    @ Sending notification to group '%s'" % e.notification_group.name
                        )
                        self.print("    | Subject: %s" % subject)
                        self.print("    |")
                        self.print("    | %s" % body.replace("\n", "\n    | "))
                    else:
                        self.print("    @ Notification disabled by policy")
                if e.stop_processing:
                    self.print("    @ Stop processing")
                    break