def wait_tt(alarm_id):
    logger.info("[%s] Checking escalated TT", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
        return
    if alarm.status == "C":
        logger.info("[%s] Alarm is closed, skipping", alarm_id)
        return
    c_tt_name, c_tt_id = alarm.escalation_tt.split(":")
    cts = TTSystem.get_by_name(c_tt_name)
    if not cts:
        logger.error("Unknown TT system: %s", c_tt_name)
        return
    ti = None
    tts = cts.get_system()
    try:
        ti = tts.get_tt(c_tt_id)
    except tts.TTError as e:
        logger.error("Cannot get TT info: %s", e)
    if ti and ti["resolved"]:
        # Close alarm
        alarm.clear_alarm(
            "Closed by TT %s" % alarm.escalation_tt,
            ts=ti.get("close_ts", datetime.datetime.now()),
            force=True,
        )
    else:
        Job.retry_after(config.escalator.wait_tt_check_interval, msg="Next check")

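# Illustrative sketch (not NOC code): wait_tt() above follows a "check once,
# then reschedule" pattern -- it never blocks waiting for the remote trouble
# ticket system, it just re-enters the scheduler via Job.retry_after(). A
# minimal self-contained model of that pattern, with a fake ticket store and
# the standard-library sched module standing in for TTSystem/Job (all names
# below are hypothetical stand-ins, not part of the API shown above):
import datetime
import sched
import time


class FakeTTStore:
    """Pretend TT backend: a ticket becomes resolved after a few polls."""

    def __init__(self):
        self.polls = 0

    def get_tt(self, tt_id):
        self.polls += 1
        return {"resolved": self.polls >= 3, "close_ts": datetime.datetime.now()}


def schedule_check(scheduler, store, tt_id, interval=1.0):
    def check():
        ti = store.get_tt(tt_id)
        if ti["resolved"]:
            print("TT %s resolved at %s, closing alarm" % (tt_id, ti["close_ts"]))
        else:
            # Equivalent of Job.retry_after(...): re-enter the queue later
            print("TT %s still open, next check in %ss" % (tt_id, interval))
            scheduler.enter(interval, 1, check)

    scheduler.enter(0, 1, check)


if __name__ == "__main__":
    s = sched.scheduler(time.time, time.sleep)
    schedule_check(s, FakeTTStore(), "TT-1", interval=0.1)
    s.run()
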
def ensure_discovery_jobs(self):
    if self.profile and self.profile.discovery_interval > 0:
        Job.submit("scheduler", self.DISCOVERY_JOB, key=self.id, keep_ts=True)
    else:
        Job.remove("scheduler", self.DISCOVERY_JOB, key=self.id)

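# Illustrative sketch (not NOC code): ensure_discovery_jobs() reconciles the
# desired state ("profile wants discovery") with scheduler state by always
# issuing either submit or remove, so repeated calls are idempotent. The
# dict-backed scheduler below is a hypothetical stand-in for the Job API; in
# the real call, keep_ts=True additionally preserves an existing job's
# next-run timestamp instead of rescheduling it.
scheduled_jobs = {}  # (job_class, key) -> job record


def ensure_job(job_class, key, enabled):
    if enabled:
        # submit is "create if missing"; an existing entry is left untouched,
        # loosely mirroring keep_ts=True
        scheduled_jobs.setdefault((job_class, key), {"key": key})
    else:
        # remove is "delete if present"
        scheduled_jobs.pop((job_class, key), None)


if __name__ == "__main__":
    ensure_job("discovery_job", 42, enabled=True)   # schedules the job
    ensure_job("discovery_job", 42, enabled=True)   # no-op, already present
    ensure_job("discovery_job", 42, enabled=False)  # removes it again
    print(scheduled_jobs)  # -> {}
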
def periodic(alarm, cfg, *args, **kwargs):
    a = get_alarm(alarm)
    if not a:
        logger.info("[%s] Alarm is not found, skipping", alarm)
        return
    if a.status == "C":
        logger.info("[%s] Alarm is closed, skipping", alarm)
        return
    AlarmDiagnosticConfig.get_diag(a, cfg["cfg"], "R")
    if cfg.get("delay"):
        Job.retry_after(delay=cfg["delay"])

def api_stop_discovery(self, request, id):
    o = self.get_object_or_404(ManagedObject, id=id)
    if not o.has_access(request.user):
        return self.response_forbidden("Access denied")
    r = ujson.loads(request.body).get("names", [])
    for name, jcls in self.DISCOVERY_JOBS:
        if name not in r:
            continue
        if not getattr(o.object_profile, "enable_%s_discovery" % name):
            continue  # Disabled by profile
        Job.remove("discovery", jcls, key=o.id, pool=o.pool.name)
    return {"success": True}

def instance_to_dict(self, mo, fields=None):
    job = Job.get_job_data(
        "discovery",
        jcls="noc.services.discovery.jobs.box.job.BoxDiscoveryJob",
        key=mo.id,
        pool=mo.pool.name,
    )
    last_update = mo.config.get_revisions(reverse=True)
    if last_update:
        last_update = humanize_distance(last_update[0].ts)
    last_success = "--"
    last_status = None
    if job:
        last_success = humanize_distance(job["last"]) if "last" in job else "--"
        last_status = job["ls"] if "ls" in job else None
    return {
        "id": str(mo.id),
        "name": mo.name,
        "profile_name": mo.profile.name,
        "last_success": last_success,
        "status": job["s"] if job else "--",
        "last_status": last_status,
        "last_update": last_update if last_update else None,
    }

def apply_discovery_jobs(profile_id, box_changed, periodic_changed):
    def iter_objects():
        pool_cache = cachetools.LRUCache(
            maxsize=200,
            missing=lambda x: Pool.objects.get(id=x)
        )
        for o_id, is_managed, pool_id in profile.managedobject_set.values_list(
            "id", "is_managed", "pool"
        ):
            yield o_id, is_managed, pool_cache[pool_id]

    try:
        profile = ManagedObjectProfile.objects.get(id=profile_id)
    except ManagedObjectProfile.DoesNotExist:
        return
    for mo_id, is_managed, pool in iter_objects():
        if box_changed:
            if profile.enable_box_discovery and is_managed:
                Job.submit(
                    "discovery",
                    "noc.services.discovery.jobs.box.job.BoxDiscoveryJob",
                    key=mo_id,
                    pool=pool
                )
            else:
                Job.remove(
                    "discovery",
                    "noc.services.discovery.jobs.box.job.BoxDiscoveryJob",
                    key=mo_id,
                    pool=pool
                )
        if periodic_changed:
            if profile.enable_periodic_discovery and is_managed:
                Job.submit(
                    "discovery",
                    "noc.services.discovery.jobs.periodic.job.PeriodicDiscoveryJob",
                    key=mo_id,
                    pool=pool
                )
            else:
                Job.remove(
                    "discovery",
                    "noc.services.discovery.jobs.periodic.job.PeriodicDiscoveryJob",
                    key=mo_id,
                    pool=pool
                )

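# Illustrative note (not NOC code): apply_discovery_jobs() memoizes Pool
# lookups with cachetools.LRUCache(missing=...) so that iterating many managed
# objects does not issue one Pool query per object. The missing= callback is
# not available in recent cachetools releases; a hypothetical modern sketch of
# the same memoization idea, using only the standard library:
from functools import lru_cache


@lru_cache(maxsize=200)
def get_pool(pool_id):
    # Stand-in for Pool.objects.get(id=pool_id)
    return {"id": pool_id, "name": "pool-%s" % pool_id}


if __name__ == "__main__":
    # Repeated lookups for the same pool id hit the cache, not the "database"
    print(get_pool(1) is get_pool(1))  # True
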
def api_discovery(self, request, id):
    from noc.core.scheduler.job import Job
    o = self.get_object_or_404(ManagedObject, id=id)
    if not o.has_access(request.user):
        return self.response_forbidden("Access denied")
    link_count = defaultdict(int)
    for link in Link.object_links(o):
        m = link.discovery_method or ""
        if "+" in m:
            m = m.split("+")[0]
        link_count[m] += 1
    r = [{
        "name": "ping",
        "enable_profile": o.object_profile.enable_ping,
        "status": o.get_status(),
        "last_run": None,
        "last_status": None,
        "next_run": None,
        "jcls": None,
    }]
    for name, jcls in self.DISCOVERY_JOBS:
        job = Job.get_job_data(
            "discovery",
            jcls=jcls,
            key=o.id,
            pool=o.pool.name
        ) or {}
        d = {
            "name": name,
            "enable_profile": getattr(o.object_profile, "enable_%s_discovery" % name),
            "status": job.get(Job.ATTR_STATUS),
            "last_run": self.to_json(job.get(Job.ATTR_LAST)),
            "last_status": job.get(Job.ATTR_LAST_STATUS),
            "next_run": self.to_json(job.get(Job.ATTR_TS)),
            "jcls": jcls,
        }
        r += [d]
    return r

def process_origin_route(self):
    """
    Update origin -> route
    :return:
    """
    # Get AS with discovered routes
    discoverable_as = set(
        "AS%s" % a.asn
        for a in AS.objects.all()
        if a.profile and a.profile.enable_discovery_prefix_whois_route
    )
    # as -> [(prefix, description)]
    as_routes = defaultdict(list)
    if discoverable_as:
        logger.info(
            "Collecting prefix discovery information for AS: %s",
            ", ".join(a for a in discoverable_as)
        )

        def parser(f, fields=None):
            for obj in self.parse_rpsl(f):
                if obj and "route" in obj and "origin" in obj:
                    origin = obj["origin"][0]
                    if origin in discoverable_as:
                        as_routes[origin] += [(
                            obj["route"][0],
                            "\n".join(obj["descr"]) if "descr" in obj else None
                        )]
                yield obj
    else:
        parser = self.parse_rpsl
    #
    r = defaultdict(list)
    if self.use_ripe:
        logger.info("Processing RIPE origin -> route")
        v = self.update_from_rpsl(self.RIPE_ROUTE_ORIGIN, r, "route", "origin", False, parser)
        logger.info("Processed RIPE origin -> route: %d records" % v)
    if self.use_arin:
        logger.info("Processing ARIN origin -> route")
        v = self.update_from_rpsl(self.ARIN, r, "route", "origin", False, parser)
        logger.info("Processed ARIN origin -> route: %d records" % v)
    if self.use_radb:
        logger.info("Processing RADb origin -> route")
        v = self.update_from_rpsl(self.RADB, r, "route", "origin", False, parser)
        logger.info("Processed RADb origin -> route: %d records" % v)
    if r:
        import noc.lib.nosql  # noqa Connect to MongoDB
        # Upload to database
        logger.info("Updating noc.whois.origin.route collection")
        count = WhoisOriginRoute.upload(r)
        logger.info("%d records written into noc.whois.origin.route collection" % count)
    if as_routes:
        import noc.lib.nosql  # noqa Connect to MongoDB
        delay = 0
        for a in as_routes:
            logger.info("[%s] Sending %d prefixes to AS discovery", a, len(as_routes[a]))
            Job.submit(
                "scheduler",
                self.JCLS_WHOIS_PREFIX,
                key=AS.get_by_asn(int(a[2:])).id,
                delta=delay,
                data={
                    "whois_route": as_routes[a]
                }
            )
            delay += self.PER_AS_DELAY

def escalate(alarm_id, escalation_id, escalation_delay, *args, **kwargs):
    def log(message, *args):
        msg = message % args
        logger.info("[%s] %s", alarm_id, msg)
        alarm.log_message(msg, to_save=True)

    def summary_to_list(summary, model):
        r = []
        for k in summary:
            p = model.get_by_id(k.profile)
            if not p or getattr(p, "show_in_summary", True) is False:
                continue
            r += [{
                "profile": p.name,
                "summary": k.summary,
                "order": (getattr(p, "display_order", 100), -k.summary),
            }]
        return sorted(r, key=operator.itemgetter("order"))

    logger.info("[%s] Performing escalations", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
        metrics["escalation_missed_alarm"] += 1
        return
    if alarm.status == "C":
        logger.info("[%s] Alarm is closed, skipping", alarm_id)
        metrics["escalation_already_closed"] += 1
        return
    if alarm.root:
        log("Alarm is not root cause, skipping")
        metrics["escalation_alarm_is_not_root"] += 1
        return
    #
    escalation = AlarmEscalation.get_by_id(escalation_id)
    if not escalation:
        log("Escalation %s is not found, skipping", escalation_id)
        metrics["escalation_not_found"] += 1
        return
    if alarm.managed_object.tt_system:
        sample = alarm.managed_object.tt_system.telemetry_sample
    else:
        sample = PARENT_SAMPLE
    tt_id = None  # Stays None unless an escalation action creates a TT
    with Span(client="escalator", sample=sample) as ctx:
        alarm.set_escalation_context()
        # Evaluate escalation chain
        mo = alarm.managed_object
        for a in escalation.escalations:
            if a.delay != escalation_delay:
                continue  # Try other type
            # Check administrative domain
            if a.administrative_domain and a.administrative_domain.id not in alarm.adm_path:
                continue
            # Check severity
            if a.min_severity and alarm.severity < a.min_severity:
                continue
            # Check selector
            if a.selector and not SelectorCache.is_in_selector(mo, a.selector):
                continue
            # Check time pattern
            if a.time_pattern and not a.time_pattern.match(alarm.timestamp):
                continue
            # Render escalation message
            if not a.template:
                log("No escalation template, skipping")
                continue
            # Check global limits
            # @todo: Move into escalator service
            # @todo: Process per-ttsystem limits
            ets = datetime.datetime.now() - datetime.timedelta(
                seconds=config.escalator.ets)
            ae = ActiveAlarm._get_collection().count_documents(
                {"escalation_ts": {"$gte": ets}})
            ae += ArchivedAlarm._get_collection().count_documents(
                {"escalation_ts": {"$gte": ets}})
            if ae >= config.escalator.tt_escalation_limit:
                logger.error(
                    "Escalation limit exceeded (%s/%s). Skipping",
                    ae,
                    config.escalator.tt_escalation_limit,
                )
                metrics["escalation_throttled"] += 1
                alarm.set_escalation_error(
                    "Escalation limit exceeded (%s/%s). Skipping"
                    % (ae, config.escalator.tt_escalation_limit))
                return
            # Check whether consequences have escalations
            cons_escalated = sorted(alarm.iter_escalated(),
                                    key=operator.attrgetter("timestamp"))
            affected_objects = sorted(alarm.iter_affected(),
                                      key=operator.attrgetter("name"))
            #
            segment = alarm.managed_object.segment
            if segment.is_redundant:
                uplinks = alarm.managed_object.data.uplinks
                lost_redundancy = len(uplinks) > 1
                affected_subscribers = summary_to_list(
                    segment.total_subscribers, SubscriberProfile)
                affected_services = summary_to_list(
                    segment.total_services, ServiceProfile)
            else:
                lost_redundancy = False
                affected_subscribers = []
                affected_services = []
            #
            ctx = {
                "alarm": alarm,
                "affected_objects": affected_objects,
                "cons_escalated": cons_escalated,
                "total_objects": summary_to_list(
                    alarm.total_objects, ManagedObjectProfile),
                "total_subscribers": summary_to_list(
                    alarm.total_subscribers, SubscriberProfile),
                "total_services": summary_to_list(
                    alarm.total_services, ServiceProfile),
                "tt": None,
                "lost_redundancy": lost_redundancy,
                "affected_subscribers": affected_subscribers,
                "affected_services": affected_services,
            }
            # Escalate to TT
            if a.create_tt and mo.can_escalate():
                tt_id = None
                if alarm.escalation_tt:
                    log("Already escalated with TT #%s", alarm.escalation_tt)
                else:
                    pre_reason = escalation.get_pre_reason(mo.tt_system)
                    active_maintenance = Maintenance.get_object_maintenance(mo)
                    if active_maintenance:
                        for m in active_maintenance:
                            log(
                                "Object is under maintenance: %s (%s-%s)",
                                m.subject,
                                m.start,
                                m.stop,
                            )
                        metrics["escalation_stop_on_maintenance"] += 1
                    elif pre_reason is not None:
                        subject = a.template.render_subject(**ctx)
                        body = a.template.render_body(**ctx)
                        logger.debug(
                            "[%s] Escalation message:\nSubject: %s\n%s",
                            alarm_id, subject, body)
                        log("Creating TT in system %s", mo.tt_system.name)
                        tts = mo.tt_system.get_system()
                        try:
                            try:
                                tt_id = tts.create_tt(
                                    queue=mo.tt_queue,
                                    obj=mo.tt_system_id,
                                    reason=pre_reason,
                                    subject=subject,
                                    body=body,
                                    login="******",
                                    timestamp=alarm.timestamp,
                                )
                            except TemporaryTTError as e:
                                metrics["escalation_tt_retry"] += 1
                                log("Temporary error detected. Retry after %ss",
                                    RETRY_TIMEOUT)
                                mo.tt_system.register_failure()
                                Job.retry_after(get_next_retry(), str(e))
                            ctx["tt"] = "%s:%s" % (mo.tt_system.name, tt_id)
                            alarm.escalate(
                                ctx["tt"],
                                close_tt=a.close_tt,
                                wait_tt=ctx["tt"] if a.wait_tt else None,
                            )
                            if tts.promote_group_tt and a.promote_group_tt:
                                # Create group TT
                                log("Promoting to group tt")
                                gtt = tts.create_group_tt(tt_id, alarm.timestamp)
                                # Append affected objects
                                for ao in alarm.iter_affected():
                                    if ao.can_escalate(True):
                                        if ao.tt_system == mo.tt_system:
                                            log("Appending object %s to group tt %s",
                                                ao.name, gtt)
                                            try:
                                                tts.add_to_group_tt(gtt, ao.tt_system_id)
                                            except TTError as e:
                                                alarm.set_escalation_error(
                                                    "[%s] %s" % (mo.tt_system.name, e))
                                        else:
                                            log(
                                                "Cannot append object %s to group tt %s: "
                                                "Belongs to other TT system",
                                                ao.name,
                                                gtt,
                                            )
                                    else:
                                        log(
                                            "Cannot append object %s to group tt %s: "
                                            "Escalations are disabled",
                                            ao.name,
                                            gtt,
                                        )
                            metrics["escalation_tt_create"] += 1
                        except TTError as e:
                            log("Failed to create TT: %s", e)
                            metrics["escalation_tt_fail"] += 1
                            alarm.log_message("Failed to escalate: %s" % e, to_save=True)
                            alarm.set_escalation_error("[%s] %s" % (mo.tt_system.name, e))
                    else:
                        log("Cannot find pre reason")
                        metrics["escalation_tt_fail"] += 1
                if tt_id and cons_escalated:
                    # Notify consequences
                    for ca in cons_escalated:
                        c_tt_name, c_tt_id = ca.escalation_tt.split(":")
                        cts = TTSystem.get_by_name(c_tt_name)
                        if cts:
                            tts = cts.get_system()
                            try:
                                log("Appending comment to TT %s", tt_id)
                                tts.add_comment(
                                    c_tt_id,
                                    body="Covered by TT %s" % tt_id,
                                    login="******")
                                metrics["escalation_tt_comment"] += 1
                            except NotImplementedError:
                                log("Cannot add comment to %s: Feature not implemented",
                                    ca.escalation_tt)
                                metrics["escalation_tt_comment_fail"] += 1
                            except TTError as e:
                                log("Failed to add comment to %s: %s",
                                    ca.escalation_tt, e)
                                metrics["escalation_tt_comment_fail"] += 1
                        else:
                            log("Failed to add comment to %s: Invalid TT system",
                                ca.escalation_tt)
                            metrics["escalation_tt_comment_fail"] += 1
            # Send notification
            if a.notification_group and mo.can_notify():
                subject = a.template.render_subject(**ctx)
                body = a.template.render_body(**ctx)
                logger.debug("[%s] Notification message:\nSubject: %s\n%s",
                             alarm_id, subject, body)
                log("Sending notification to group %s", a.notification_group.name)
                a.notification_group.notify(subject, body)
                alarm.set_clear_notification(a.notification_group, a.clear_template)
                metrics["escalation_notify"] += 1
            #
            if a.stop_processing:
                logger.debug("Stopping processing")
                break
    nalarm = get_alarm(alarm_id)
    if nalarm and nalarm.status == "C":
        nalarm.log_message("Alarm has been closed during escalation. Try to deescalate")
        logger.info("[%s] Alarm has been closed during escalation. Try to deescalate",
                    alarm.id)
        metrics["escalation_closed_while_escalated"] += 1
        if tt_id and not nalarm.escalation_tt:
            nalarm.escalation_ts = datetime.datetime.now()
            nalarm.escalation_tt = "%s:%s" % (mo.tt_system.name, tt_id)
            nalarm.save()
        if not nalarm.escalation_close_ts and not nalarm.escalation_close_error:
            notify_close(
                alarm_id=alarm_id,
                tt_id=nalarm.escalation_tt,
                subject="Closing",
                body="Closing",
                notification_group_id=alarm.clear_notification_group.id
                if alarm.clear_notification_group else None,
                close_tt=alarm.close_tt,
            )
    elif nalarm and nalarm.status == "A" and not nalarm.escalation_tt and tt_id:
        logger.error("[%s] Alarm without escalation TT: %s", alarm.id, tt_id)
    logger.info("[%s] Escalations loop end", alarm_id)

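# Illustrative sketch (not NOC code): the "Check global limits" step in
# escalate() above throttles TT creation by counting how many alarms were
# escalated within a sliding window (config.escalator.ets seconds) and bailing
# out once tt_escalation_limit is reached. A minimal in-memory version of that
# sliding-window check (all names below are hypothetical):
import datetime
from collections import deque


class EscalationThrottle:
    def __init__(self, limit, window_seconds):
        self.limit = limit
        self.window = datetime.timedelta(seconds=window_seconds)
        self.timestamps = deque()  # escalation timestamps, oldest first

    def allow(self, now=None):
        now = now or datetime.datetime.now()
        # Drop escalations that fell out of the window
        while self.timestamps and now - self.timestamps[0] >= self.window:
            self.timestamps.popleft()
        if len(self.timestamps) >= self.limit:
            return False  # analogue of the "escalation_throttled" branch
        self.timestamps.append(now)
        return True


if __name__ == "__main__":
    throttle = EscalationThrottle(limit=2, window_seconds=60)
    print([throttle.allow() for _ in range(3)])  # [True, True, False]
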
def notify_close(alarm_id, tt_id, subject, body, notification_group_id, close_tt=False):
    def log(message, *args):
        msg = message % args
        logger.info("[%s] %s", alarm_id, msg)

    if tt_id:
        alarm = get_alarm(alarm_id)
        alarm.set_escalation_close_ctx()
        if (alarm and alarm.status == "C"
                and (alarm.escalation_close_ts or alarm.escalation_close_error)):
            log("Alarm is already deescalated")
            metrics["escalation_already_deescalated"] += 1
            return
        with Span(client="escalator", sample=PARENT_SAMPLE):
            c_tt_name, c_tt_id = tt_id.split(":")
            cts = TTSystem.get_by_name(c_tt_name)
            if cts:
                tts = cts.get_system()
                if close_tt:
                    # Close tt
                    try:
                        log("Closing TT %s", tt_id)
                        tts.close_tt(c_tt_id, subject=subject, body=body, login="******")
                        metrics["escalation_tt_close"] += 1
                        if alarm:
                            alarm.close_escalation()
                    except TemporaryTTError as e:
                        log("Temporary error detected while closing tt %s: %s", tt_id, e)
                        metrics["escalation_tt_close_retry"] += 1
                        Job.retry_after(get_next_retry(), str(e))
                        cts.register_failure()
                        if alarm:
                            alarm.set_escalation_close_error(
                                "[%s] %s" % (alarm.managed_object.tt_system.name, e))
                    except TTError as e:
                        log("Failed to close tt %s: %s", tt_id, e)
                        metrics["escalation_tt_close_fail"] += 1
                        if alarm:
                            alarm.set_escalation_close_error(
                                "[%s] %s" % (alarm.managed_object.tt_system.name, e))
                else:
                    # Append comment to tt
                    try:
                        log("Appending comment to TT %s", tt_id)
                        tts.add_comment(c_tt_id, subject=subject, body=body, login="******")
                        metrics["escalation_tt_comment"] += 1
                    except TTError as e:
                        log("Failed to add comment to %s: %s", tt_id, e)
                        metrics["escalation_tt_comment_fail"] += 1
            else:
                log("Failed to add comment to %s: Invalid TT system", tt_id)
                metrics["escalation_tt_comment_fail"] += 1
    if notification_group_id:
        notification_group = NotificationGroup.get_by_id(notification_group_id)
        if notification_group:
            log("Sending notification to group %s", notification_group.name)
            notification_group.notify(subject, body)
            metrics["escalation_notify"] += 1
        else:
            log("Invalid notification group %s", notification_group_id)

def remove_job(self):
    logger.info("Removing job")
    Job.remove("scheduler", self.JCLS)

def submit_job(self):
    logger.info("Submitting job")
    Job.submit("scheduler", self.JCLS)

def wipe(o):
    if not hasattr(o, "id"):
        try:
            o = ManagedObject.objects.get(id=o)
        except ManagedObject.DoesNotExist:
            return True
    log = PrefixLoggerAdapter(logger, str(o.id))
    # Wiping discovery tasks
    log.debug("Wiping discovery tasks")
    for j in [
        ManagedObject.BOX_DISCOVERY_JOB,
        ManagedObject.PERIODIC_DISCOVERY_JOB
    ]:
        Job.remove("discovery", j, key=o.id, pool=o.pool.name)
    # Wiping FM events
    log.debug("Wiping events")
    FailedEvent.objects.filter(managed_object=o.id).delete()
    ActiveEvent.objects.filter(managed_object=o.id).delete()
    ArchivedEvent.objects.filter(managed_object=o.id).delete()
    # Wiping alarms
    log.debug("Wiping alarms")
    for ac in (ActiveAlarm, ArchivedAlarm):
        for a in ac.objects.filter(managed_object=o.id):
            # Relink root causes
            my_root = a.root
            for iac in (ActiveAlarm, ArchivedAlarm):
                for ia in iac.objects.filter(root=a.id):
                    ia.root = my_root
                    ia.save()
            # Delete alarm
            a.delete()
    # Wiping MAC DB
    log.debug("Wiping MAC DB")
    MACDB._get_collection().remove({"managed_object": o.id})
    # Wiping discovery id cache
    log.debug("Wiping discovery id")
    DiscoveryID._get_collection().remove({"object": o.id})
    # Wiping interfaces, subs and links
    # Wipe links
    log.debug("Wiping links")
    for i in Interface.objects.filter(managed_object=o.id):
        # @todo: Remove aggregated links correctly
        Link.objects.filter(interfaces=i.id).delete()
    #
    log.debug("Wiping subinterfaces")
    SubInterface.objects.filter(managed_object=o.id).delete()
    log.debug("Wiping interfaces")
    Interface.objects.filter(managed_object=o.id).delete()
    log.debug("Wiping forwarding instances")
    ForwardingInstance.objects.filter(managed_object=o.id).delete()
    # Unbind from IPAM
    log.debug("Unbind from IPAM")
    for a in Address.objects.filter(managed_object=o):
        a.managed_object = None
        a.save()
    # Wipe object status
    log.debug("Wiping object status")
    ObjectStatus.objects.filter(object=o.id).delete()
    # Wipe outages
    log.debug("Wiping outages")
    Outage.objects.filter(object=o.id).delete()
    # Wipe uptimes
    log.debug("Wiping uptimes")
    Uptime.objects.filter(object=o.id).delete()
    # Wipe reboots
    log.debug("Wiping reboots")
    Reboot.objects.filter(object=o.id).delete()
    # Delete Managed Object's capabilities
    log.debug("Wiping capabilities")
    ObjectCapabilities.objects.filter(object=o.id).delete()
    # Delete Managed Object's attributes
    log.debug("Wiping attributes")
    ManagedObjectAttribute.objects.filter(managed_object=o).delete()
    # Finally delete object and config
    log.debug("Finally wiping object")
    o.delete()
    log.debug("Done")

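# Illustrative sketch (not NOC code): the alarm-wiping loop above re-links root
# causes before deleting an alarm -- every alarm whose root is the deleted one
# is re-pointed to the deleted alarm's own root, so the correlation tree stays
# consistent. A minimal in-memory model of that re-parenting (hypothetical
# Alarm class and a plain list instead of Mongo collections):
class Alarm:
    def __init__(self, aid, root=None):
        self.id = aid
        self.root = root  # id of the parent (root-cause) alarm, or None


def delete_with_relink(alarms, alarm_id):
    victim = next(a for a in alarms if a.id == alarm_id)
    for a in alarms:
        if a.root == victim.id:
            a.root = victim.root  # re-parent children to the victim's root
    alarms.remove(victim)


if __name__ == "__main__":
    alarms = [Alarm(1), Alarm(2, root=1), Alarm(3, root=2)]
    delete_with_relink(alarms, 2)
    print([(a.id, a.root) for a in alarms])  # [(1, None), (3, 1)]
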