async def on_activate(self):
    self.slot_number, self.total_slots = await self.acquire_slot()
    self.is_distributed = self.total_slots > 1
    # Prepare scheduler
    if self.is_distributed:
        self.logger.info("Enabling distributed mode: Slot %d/%d", self.slot_number, self.total_slots)
        ifilter = {"shard": {"$mod": [self.total_slots, self.slot_number]}}
    else:
        self.logger.info("Enabling standalone mode")
        ifilter = None
    self.scheduler = Scheduler(
        self.name,
        pool=config.pool,
        reset_running=True,
        max_threads=config.correlator.max_threads,
        # @fixme: should this be configurable?
        submit_threshold=100,
        max_chunk=100,
        filter=ifilter,
    )
    self.scheduler.correlator = self
    self.scheduler.run()
    # Subscribe to the stream in a separate task, letting on_activate terminate
    self.loop.create_task(
        self.subscribe_stream("dispose.%s" % config.pool, self.slot_number, self.on_dispose_event)
    )
def get_job_avg():
    """
    Calculate the average execution time of discovery jobs
    :return:
    """
    job_map = {
        "noc.services.discovery.jobs.periodic.job.PeriodicDiscoveryJob": "periodic",
        "noc.services.discovery.jobs.box.job.BoxDiscoveryJob": "box",
    }
    r = {}
    match = {"ldur": {"$gte": 0.05}}  # Exclude runs where ping failed and discovery was skipped
    for pool in Pool.objects.filter():
        coll = Scheduler("discovery", pool=pool.name).get_collection()
        r[pool] = {
            job_map[c["_id"]]: c["avg_dur"]
            for c in coll.aggregate(
                [{"$match": match}, {"$group": {"_id": "$jcls", "avg_dur": {"$avg": "$ldur"}}}]
            )
            if c["_id"] in job_map  # Guard against unknown job classes (was: if c["_id"])
        }
    return r
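A minimal usage sketch (pool names and figures are illustrative, not from the source): the helper returns a mapping of Pool to the average `ldur`, in seconds, of its box and periodic discovery jobs.

    # Illustrative only: iterate the {Pool: {"box": ..., "periodic": ...}} mapping
    for pool, avg in get_job_avg().items():
        print("%s: box=%.1fs, periodic=%.1fs" % (pool.name, avg.get("box", 0.0), avg.get("periodic", 0.0)))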
def on_activate(self):
    self.scheduler = Scheduler(
        "scheduler",
        reset_running=True,
        max_threads=config.scheduler.max_threads,
        ioloop=self.ioloop,
    )
    self.scheduler.run()
def scheduler(s):
    scheduler, pool = "scheduler", "default"
    if "." in s:
        scheduler, pool = s.split(".")
    if scheduler in SHARDING_SCHEDULER:
        # raise argparse.ArgumentTypeError("Scheduler: %s, not supporting sharding")
        return Scheduler(scheduler, pool=pool).get_collection()
    return Scheduler(scheduler).get_collection()
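A usage sketch for the argparse type callable above (assuming "discovery" is listed in SHARDING_SCHEDULER): "NAME.POOL" resolves to the sharded scheduler's MongoDB collection for that pool, while a bare name falls back to the defaults.

    coll = scheduler("discovery.default")  # discovery scheduler, pool "default"
    coll = scheduler("scheduler")          # plain scheduler, default pool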
def get_data(self, **kwargs):
    old = datetime.datetime.now() - datetime.timedelta(minutes=self.STALE_INTERVAL)
    data = []
    for pool in Pool._get_collection().find({}, {"_id": 0, "name": 1}):
        scheduler = Scheduler("discovery", pool=pool["name"])
        for r in scheduler.get_collection().find(
            {"runs": {"$gt": 1}, "jcls": {"$regex": "_discovery$"}, "st": {"$lte": old}}
        ):
            mo = ManagedObject.get_by_id(r["key"])
            if not mo or not mo.is_managed:
                continue
            msg = ""
            if r["tb"]:
                tb = r["tb"]
                if "text" in tb and "code" in tb:
                    if tb["text"].endswith("END OF TRACEBACK"):
                        tb["text"] = "Job crashed"
                    msg = "(%s) %s" % (tb["text"], tb["code"])
            data += [
                [
                    mo.administrative_domain.name,
                    mo.name,
                    mo.profile.name,
                    mo.platform.name,
                    mo.version.name,
                    mo.address,
                    mo.segment.name,
                    r["jcls"],
                    humanize_distance(r["st"]),
                    msg,
                ]
            ]
    return self.from_dataset(
        title=self.title,
        columns=[
            _("Admin. Domain"),
            _("Object"),
            _("Profile"),
            _("Platform"),
            _("Version"),
            _("Address"),
            _("Segment"),
            _("Job"),
            _("Last Success"),
            _("Reason"),
        ],
        data=sorted(data),
        enumerate=True,
    )
def count(self):
    scheduler = Scheduler(self.scheduler, pool=self.pool).get_collection()
    find = {Job.ATTR_KEY: {"$in": list(self.mos_filter.values_list("id", flat=True))}}
    if self.pipeline and "$match" in self.pipeline[0]:
        find.update(self.pipeline[0]["$match"])
    # count() was removed in pymongo 4; count_documents() is the supported call
    return scheduler.count_documents(find)
class SchedulerService(TornadoService):
    name = "scheduler"
    leader_lock_name = "scheduler"
    use_mongo = True

    async def on_activate(self):
        self.scheduler = Scheduler(
            "scheduler", reset_running=True, max_threads=config.scheduler.max_threads
        )
        self.scheduler.run()
def __iter__(self):
    mos_ids = list(self.mos_filter.values_list("id", flat=True))
    self.pipeline = [{"$match": {Job.ATTR_KEY: {"$in": mos_ids}}}] + self.pipeline
    scheduler = Scheduler(self.scheduler, pool=self.pool).get_collection()
    yield from scheduler.aggregate(self.pipeline)
class SchedulerService(Service):
    name = "scheduler"
    leader_group_name = "scheduler"
    leader_lock_name = "scheduler"

    @tornado.gen.coroutine
    def on_activate(self):
        self.scheduler = Scheduler(
            "scheduler",
            reset_running=True,
            max_threads=config.scheduler.max_threads,
            ioloop=self.ioloop,
        )
        self.scheduler.run()
def fix():
    for p in Pool.objects.all():
        s = Scheduler("discovery", pool=p.name)
        c = s.get_collection()
        if not c.count_documents({}):
            continue
        # Remove unused schedule fields
        c.update_many(
            {"jcls": "noc.services.discovery.jobs.periodic.job.PeriodicDiscoveryJob"},
            {"$unset": {"ctx": "", "ctv": ""}},
        )
        c.database.command({"compact": c.name, "force": True})
def on_activate(self):
    self.scheduler = Scheduler(
        self.name,
        pool=config.pool,
        reset_running=True,
        max_threads=config.correlator.max_threads,
        ioloop=self.ioloop,
        # @fixme: should this be configurable?
        submit_threshold=100,
        max_chunk=100,
    )
    self.scheduler.correlator = self
    self.subscribe(
        "correlator.dispose.%s" % config.pool,
        "dispose",
        self.on_dispose_event,
        max_in_flight=config.correlator.max_threads,
    )
    self.scheduler.run()
def call_later(
    name: str,
    delay: Optional[float] = None,
    scheduler: str = "scheduler",
    pool: Optional[str] = None,
    job_class: str = DEFAULT_JOB_CLASS,
    shard: Optional[int] = None,
    max_runs: Optional[int] = None,
    **kwargs,
):
    """
    Schedule callable *name* to run in the scheduler process
    :param name: Full callable name
    :param delay: Delay in seconds
    :param scheduler: Scheduler name
    :param pool: Pool name
    :param job_class: Job class
    :param shard: Sharding key
    :param max_runs: Maximum number of retries
    """
    scheduler = Scheduler(scheduler, pool=pool)
    data = kwargs or {}
    ts = datetime.datetime.now()
    if delay:
        ts += datetime.timedelta(seconds=delay)
    # Process sharding
    if shard is None:
        shard = dict_hash_int_args(job_class=job_class, name=name, pool=pool, **kwargs)
        shard = (shard if shard >= 0 else -shard) % 0x7FFFFFFF
    #
    set_op = {Job.ATTR_TS: ts}
    iset_op = {
        Job.ATTR_STATUS: Job.S_WAIT,
        Job.ATTR_RUNS: 0,
        Job.ATTR_FAULTS: 0,
        Job.ATTR_OFFSET: 0,
        Job.ATTR_SHARD: shard,
    }
    if max_runs:
        iset_op[Job.ATTR_MAX_RUNS] = max_runs
    if data:
        set_op[Job.ATTR_DATA] = {k: v for k, v in data.items() if not k.startswith("_")}
    q = {Job.ATTR_CLASS: job_class, Job.ATTR_KEY: name}
    for k in list(data):
        if k.startswith("_"):
            # Hidden attribute: match on it directly, not under the data field
            q[k] = data[k]
            continue
        q["%s.%s" % (Job.ATTR_DATA, k)] = data[k]
    op = {"$set": set_op, "$setOnInsert": iset_op}
    logger.info("Delayed call to %s(%s) in %ss", name, data, delay or "0")
    logger.debug("update(%s, %s, upsert=True)", q, op)
    scheduler.get_collection().update_one(q, op, upsert=True)
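A hedged usage sketch (the callable path and the report_id keyword are hypothetical): schedule a call 60 seconds from now; because the job document is upserted by class, name, and kwargs, re-submitting the same call reschedules the existing job rather than duplicating it.

    call_later(
        "noc.custom.handlers.ping_report",  # hypothetical callable path
        delay=60,
        pool="default",
        max_runs=3,
        report_id="r-42",  # stored under the job's data field and used in the match query
    )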
def on_activate(self):
    self.slot_number, self.total_slots = yield self.acquire_slot()
    if self.total_slots > 1:
        self.logger.info("Enabling distributed mode: Slot %d/%d", self.slot_number, self.total_slots)
        ifilter = {"key": {"$mod": [self.total_slots, self.slot_number]}}
    else:
        self.logger.info("Enabling standalone mode")
        ifilter = None
    self.scheduler = Scheduler(
        "discovery",
        pool=config.pool,
        reset_running=True,
        max_threads=config.discovery.max_threads,
        ioloop=self.ioloop,
        filter=ifilter,
        service=self,
        sample=config.discovery.sample,
    )
    self.scheduler.run()
def run_job(self, job, mo, checks):
    if job == "segment":
        scheduler = Scheduler("scheduler", pool=None, service=ServiceStub())
    else:
        scheduler = Scheduler("discovery", pool=mo.pool.name, service=ServiceStub())
    jcls = self.jcls[job]
    # Try to dereference the job
    job_args = scheduler.get_collection().find_one({Job.ATTR_CLASS: jcls, Job.ATTR_KEY: mo.id})
    if job_args:
        self.print("Job ID: %s" % job_args["_id"])
    else:
        job_args = {Job.ATTR_ID: "fakeid", Job.ATTR_KEY: mo.id}
    job_args["_checks"] = checks
    job = get_handler(jcls)(scheduler, job_args)
    if job.context_version:
        ctx_key = job.get_context_cache_key()
        self.print("Loading job context from %s" % ctx_key)
        ctx = cache.get(ctx_key, version=job.context_version)
        if not ctx:
            self.print("Job context is empty")
        job.load_context(ctx)
    sample = 1 if self.trace else 0
    with Span(sample=sample):
        job.dereference()
        job.handler()
    if sample:
        spans = get_spans()
        self.print("Spans:")
        self.print("\n".join(str(s) for s in spans))
    if scheduler.service.metrics:
        self.print("Collected CH data:")
        for t in scheduler.service.metrics:
            self.print("Table: %s" % t)
            self.print("\n".join(str(x) for x in scheduler.service.metrics[t]))
    if job.context_version and job.context:
        self.print("Saving job context to %s" % ctx_key)
        scheduler.cache_set(key=ctx_key, value=job.context, version=job.context_version)
        scheduler.apply_cache_ops()
        time.sleep(3)
def run_job(jcls, mo, checks):
    scheduler = Scheduler("discovery", pool=mo.pool.name, service=get_service(mo.pool))
    job_args = {Job.ATTR_ID: "fakeid", Job.ATTR_KEY: mo.id, "_checks": checks}
    job = jcls(scheduler, job_args)
    # Inject beef managed object
    job.dereference()
    job.object = mo
    # Run job
    job.handler()
    assert not job.problems
def call_later(name, delay=None, scheduler="scheduler", pool=None, max_runs=None, **kwargs):
    """
    Run callable *name* in the scheduler process
    :param name: Full callable name
    :param delay: Delay in seconds
    :param scheduler: Scheduler name
    :param pool: Pool name
    :param max_runs: Maximum number of retries
    """
    scheduler = Scheduler(scheduler, pool=pool)
    data = kwargs or {}
    ts = datetime.datetime.now()
    if delay:
        ts += datetime.timedelta(seconds=delay)
    set_op = {Job.ATTR_TS: ts}
    iset_op = {
        Job.ATTR_STATUS: Job.S_WAIT,
        Job.ATTR_RUNS: 0,
        Job.ATTR_FAULTS: 0,
        Job.ATTR_OFFSET: 0,
    }
    if max_runs:
        iset_op[Job.ATTR_MAX_RUNS] = max_runs
    if data:
        set_op[Job.ATTR_DATA] = data
    q = {Job.ATTR_CLASS: "noc.core.scheduler.calljob.CallJob", Job.ATTR_KEY: name}
    for k in data:
        q["%s.%s" % (Job.ATTR_DATA, k)] = data[k]
    op = {"$set": set_op, "$setOnInsert": iset_op}
    logger.info("Delayed call to %s(%s) in %ss", name, data, delay or "0")
    logger.debug("update(%s, %s, upsert=True)", q, op)
    scheduler.get_collection().update(q, op, upsert=True)
class DiscoveryService(Service):
    name = "discovery"
    leader_group_name = "discovery-%(pool)s"
    pooled = True
    require_nsq_writer = True
    use_mongo = True
    process_name = "noc-%(name).10s-%(pool).5s"

    def __init__(self):
        super(DiscoveryService, self).__init__()
        self.send_callback = None
        self.slot_number = 0
        self.total_slots = 0

    @tornado.gen.coroutine
    def on_activate(self):
        self.slot_number, self.total_slots = yield self.acquire_slot()
        if self.total_slots > 1:
            self.logger.info("Enabling distributed mode: Slot %d/%d", self.slot_number, self.total_slots)
            ifilter = {"key": {"$mod": [self.total_slots, self.slot_number]}}
        else:
            self.logger.info("Enabling standalone mode")
            ifilter = None
        self.scheduler = Scheduler(
            "discovery",
            pool=config.pool,
            reset_running=True,
            max_threads=config.discovery.max_threads,
            ioloop=self.ioloop,
            filter=ifilter,
            service=self,
            sample=config.discovery.sample,
        )
        self.scheduler.run()

    def get_mon_data(self):
        r = super(DiscoveryService, self).get_mon_data()
        if self.scheduler:
            self.scheduler.apply_metrics(r)
        return r
def add_arguments(self, parser):
    parser.add_argument(
        "--scheduler",
        "-s",
        dest="scheduler",
        default=Scheduler("scheduler").get_collection(),
        type=self.scheduler,
        help="Select scheduler. For sharded use SCHEDULER_NAME.SHARD_NAME",
    )
    parser.add_argument(
        "--format",
        "-f",
        dest="store",
        # action="format",
        choices=["json", "csv"],
        help="Set output format",
    )
    subparsers = parser.add_subparsers(dest="cmd")
    # list command
    list_parser = subparsers.add_parser("list")
    list_parser.add_argument("--name", help="Job name in scheduler")
    list_parser.add_argument("key", nargs=argparse.REMAINDER, help="List of job keys")
    get_parser = subparsers.add_parser("get")
    get_parser.add_argument("--id", help="Job id in scheduler")
    subparsers.add_parser("set")
    # Parse Job Field
    reschedule = subparsers.add_parser("reschedule", help="Shift jobs to an interval")
    reschedule.add_argument("--name", help="Job name in scheduler")
    reschedule.add_argument("--start", type=self.valid_date, help="Start of target interval")
    reschedule.add_argument("--end", type=self.valid_date, help="End of target interval")
    reschedule.add_argument("--force", default=False, action="store_true", help="Really do reschedule")
    reschedule.add_argument("key", nargs=argparse.REMAINDER, help="List of job keys")
    parser.add_argument("infile", nargs="?", type=argparse.FileType("r"), default=sys.stdin)
def apply_shards(self):
    # Get shard settings
    shard_threads = defaultdict(int)
    shard_threads[DEFAULT_TTSYSTEM_SHARD] = config.escalator.max_threads
    for s in TTSystem.objects.all():
        if not s.is_active:
            continue
        shard_threads[s.shard_name] += s.max_threads
    # Run shard schedulers
    for sn in shard_threads:
        self.logger.info("Running shard %s (%d threads)", sn, shard_threads[sn])
        self.shards[sn] = Scheduler(
            "escalator",
            pool=sn,
            reset_running=True,
            max_threads=shard_threads[sn],
            ioloop=self.ioloop,
            service=self,
            sample=config.escalator.sample,
        )
        self.shards[sn].run()
def add_arguments(self, parser):
    parser.add_argument(
        "--scheduler",
        "-s",
        dest="scheduler",
        default=Scheduler("scheduler").get_collection(),
        type=self.scheduler,
        help="Select scheduler. For sharded use SCHEDULER_NAME.SHARD_NAME",
    )
    parser.add_argument(
        "--format",
        "-f",
        dest="store",
        # action="format",
        choices=["json", "csv"],
        help="Set output format",
    )
    subparsers = parser.add_subparsers(dest="cmd")
    # list command
    list_parser = subparsers.add_parser("list")
    list_parser.add_argument("--name", help="Job name in scheduler")
    list_parser.add_argument("key", nargs=argparse.REMAINDER, help="List of job keys")
    get_parser = subparsers.add_parser("get")
    get_parser.add_argument("--id", help="Job id in scheduler")
    subparsers.add_parser("set")
    estimate = subparsers.add_parser("estimate")
    estimate.add_argument("--device-count", type=int, default=0, help="Device count")
    estimate.add_argument("--box-interval", type=int, default=65400, help="Box discovery interval (in seconds)")
    estimate.add_argument(
        "--periodic-interval",
        type=int,
        default=300,
        help="Periodic discovery interval (in seconds)",
    )
    # Parse Job Field
    reschedule = subparsers.add_parser("reschedule", help="Shift jobs to an interval")
    reschedule.add_argument("--name", help="Job name in scheduler")
    reschedule.add_argument("--start", type=self.valid_date, help="Start of target interval")
    reschedule.add_argument("--end", type=self.valid_date, help="End of target interval")
    reschedule.add_argument("--force", default=False, action="store_true", help="Really do reschedule")
    reschedule.add_argument("key", nargs=argparse.REMAINDER, help="List of job keys")
    parser.add_argument("infile", nargs="?", type=argparse.FileType("r"), default=sys.stdin)
class CorrelatorService(TornadoService):
    name = "correlator"
    pooled = True
    use_mongo = True
    process_name = "noc-%(name).10s-%(pool).5s"

    def __init__(self):
        super().__init__()
        self.version = version.version
        self.rules = {}  # event_class -> [Rule]
        self.back_rules = {}  # event_class -> [Rule]
        self.triggers = {}  # alarm_class -> [Trigger1, .., TriggerN]
        self.rca_forward = {}  # alarm_class -> [RCA condition, ..., RCA condition]
        self.rca_reverse = defaultdict(set)  # alarm_class -> set([alarm_class])
        #
        self.slot_number = 0
        self.total_slots = 0
        self.is_distributed = False
        # Scheduler
        self.scheduler: Optional[Scheduler] = None
        # Locks
        self.topo_rca_lock: Optional[RCALock] = None

    async def on_activate(self):
        self.slot_number, self.total_slots = await self.acquire_slot()
        self.is_distributed = self.total_slots > 1
        # Prepare scheduler
        if self.is_distributed:
            self.logger.info("Enabling distributed mode: Slot %d/%d", self.slot_number, self.total_slots)
            ifilter = {"shard": {"$mod": [self.total_slots, self.slot_number]}}
        else:
            self.logger.info("Enabling standalone mode")
            ifilter = None
        self.scheduler = Scheduler(
            self.name,
            pool=config.pool,
            reset_running=True,
            max_threads=config.correlator.max_threads,
            # @fixme: should this be configurable?
            submit_threshold=100,
            max_chunk=100,
            filter=ifilter,
        )
        self.scheduler.correlator = self
        self.scheduler.run()
        # Subscribe to the stream in a separate task, letting on_activate terminate
        self.loop.create_task(
            self.subscribe_stream("dispose.%s" % config.pool, self.slot_number, self.on_dispose_event)
        )

    def on_start(self):
        """
        Load rules from database just after loading config
        """
        super().on_start()
        connect()  # use_mongo connects after on_start
        self.load_rules()
        self.load_triggers()
        self.load_rca_rules()

    def load_rules(self):
        """
        Load rules from database
        """
        self.logger.debug("Loading rules")
        self.rules = {}
        self.back_rules = {}
        nr = 0
        nbr = 0
        for c in EventClass.objects.all():
            if c.disposition:
                r = []
                for dr in c.disposition:
                    rule = Rule(c, dr)
                    r += [rule]
                    nr += 1
                    if dr.combo_condition != "none" and dr.combo_window:
                        for cc in dr.combo_event_classes:
                            try:
                                self.back_rules[cc.id] += [dr]
                            except KeyError:
                                self.back_rules[cc.id] = [dr]
                            nbr += 1
                self.rules[c.id] = r
        self.logger.debug("%d rules are loaded. %d combos" % (nr, nbr))

    def load_triggers(self):
        self.logger.info("Loading triggers")
        self.triggers = {}
        n = 0
        cn = 0
        ec = [(c.name, c.id) for c in AlarmClass.objects.all()]
        for t in AlarmTrigger.objects.filter(is_enabled=True):
            self.logger.debug("Trigger '%s' for classes:" % t.name)
            for c_name, c_id in ec:
                if re.search(t.alarm_class_re, c_name, re.IGNORECASE):
                    try:
                        self.triggers[c_id] += [Trigger(t)]
                    except KeyError:
                        self.triggers[c_id] = [Trigger(t)]
                    cn += 1
                    self.logger.debug(" %s" % c_name)
            n += 1
        self.logger.info("%d triggers have been loaded to %d classes" % (n, cn))

    def load_rca_rules(self):
        """
        Load root cause analysis rules
        """
        self.logger.info("Loading RCA Rules")
        n = 0
        self.rca_forward = {}
        self.rca_reverse = {}
        for a in AlarmClass.objects.filter(root_cause__0__exists=True):
            if not a.root_cause:
                continue
            self.rca_forward[a.id] = []
            for c in a.root_cause:
                rc = RCACondition(a, c)
                self.rca_forward[a.id] += [rc]
                if rc.root.id not in self.rca_reverse:
                    self.rca_reverse[rc.root.id] = []
                self.rca_reverse[rc.root.id] += [rc]
                n += 1
        self.logger.info("%d RCA Rules have been loaded" % n)

    def mark_as_failed(self, event):
        """
        Write error log and mark event as failed
        """
        self.logger.error("Failed to process event %s" % str(event.id))
        # Prepare traceback
        t, v, tb = sys.exc_info()
        now = datetime.datetime.now()
        r = ["UNHANDLED EXCEPTION (%s)" % str(now)]
        r += [str(t), str(v)]
        r += [format_frames(get_traceback_frames(tb))]
        r = "\n".join(r)
        event.mark_as_failed(version=self.version, traceback=r)

    def set_root_cause(self, a):
        """
        Search for root cause and set, if found
        :returns: Boolean. True, if root cause set
        """
        for rc in self.rca_forward[a.alarm_class.id]:
            # Check condition
            if not rc.check_condition(a):
                continue
            # Check match condition
            q = rc.get_match_condition(a)
            root = ActiveAlarm.objects.filter(**q).first()
            if root:
                # Root cause found
                self.logger.info("%s is root cause for %s (Rule: %s)", root.id, a.id, rc.name)
                metrics["alarm_correlated_rule"] += 1
                a.set_root(root, rca_type=RCA_RULE)
                return True
        return False

    def set_reverse_root_cause(self, a):
        """
        Set *a* as root cause for existing events
        :param a:
        :return:
        """
        found = False
        for rc in self.rca_reverse[a.alarm_class.id]:
            # Check reverse match condition
            q = rc.get_reverse_match_condition(a)
            for ca in ActiveAlarm.objects.filter(**q):
                # Check condition
                if not rc.check_condition(ca):
                    continue
                # Try to set root cause
                qq = rc.get_match_condition(ca, id=a.id)
                rr = ActiveAlarm.objects.filter(**qq).first()
                if rr:
                    # Reverse root cause found
                    self.logger.info("%s is root cause for %s (Reverse rule: %s)", a.id, ca.id, rc.name)
                    metrics["alarm_correlated_rule"] += 1
                    ca.set_root(a, rca_type=RCA_RULE)
                    found = True
        return found

    async def raise_alarm(self, r, e):
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info("Empty managed object, ignoring")
            return
        # @todo: Make configurable
        if not managed_object.is_managed:
            self.logger.info("Managed object is not managed. Do not raise alarm")
            return
        if e.managed_object.id != managed_object.id:
            metrics["alarm_change_mo"] += 1
            self.logger.info("Changing managed object to %s", managed_object.name)
        discriminator, vars = r.get_vars(e)
        if r.unique:
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id, discriminator=discriminator
            ).first()
            if not a:
                # Try to reopen alarm
                a = ArchivedAlarm.objects.filter(
                    managed_object=managed_object.id,
                    discriminator=discriminator,
                    control_time__gte=e.timestamp,
                ).first()
                if a:
                    # Reopen alarm
                    self.logger.info(
                        "[%s|%s|%s] %s reopens alarm %s(%s)",
                        e.id,
                        managed_object.name,
                        managed_object.address,
                        e.event_class.name,
                        a.alarm_class.name,
                        a.id,
                    )
                    a = a.reopen("Reopened by disposition rule '%s'" % r.u_name)
                    metrics["alarm_reopen"] += 1
            if a:
                # Active alarm found, refresh
                self.logger.info(
                    "[%s|%s|%s] Contributing event %s to active alarm %s(%s)",
                    e.id,
                    managed_object.name,
                    managed_object.address,
                    e.event_class.name,
                    a.alarm_class.name,
                    a.id,
                )
                # Contribute event to alarm
                e.contribute_to_alarm(a)
                if e.timestamp < a.timestamp:
                    # Set to earlier date
                    a.timestamp = e.timestamp
                    a.save()
                elif e.timestamp > a.last_update:
                    # Refresh last update
                    a.last_update = e.timestamp
                    a.save()
                metrics["alarm_contribute"] += 1
                return
        # Calculate alarm coverage
        summary = ServiceSummary.get_object_summary(managed_object)
        summary["object"] = {managed_object.object_profile.id: 1}
        #
        severity = max(ServiceSummary.get_severity(summary), 1)
        self.logger.info(
            "[%s|%s|%s] %s: Calculated alarm severity is: %s",
            e.id,
            managed_object.name,
            managed_object.address,
            r.u_name,
            severity,
        )
        # Create new alarm
        direct_services = SummaryItem.dict_to_items(summary["service"])
        direct_subscribers = SummaryItem.dict_to_items(summary["subscriber"])
        a = ActiveAlarm(
            timestamp=e.timestamp,
            last_update=e.timestamp,
            managed_object=managed_object.id,
            alarm_class=r.alarm_class,
            severity=severity,
            vars=vars,
            discriminator=discriminator,
            direct_services=direct_services,
            direct_subscribers=direct_subscribers,
            total_objects=ObjectSummaryItem.dict_to_items(summary["object"]),
            total_services=direct_services,
            total_subscribers=direct_subscribers,
            log=[
                AlarmLog(
                    timestamp=datetime.datetime.now(),
                    from_status="A",
                    to_status="A",
                    message="Alarm risen from event %s(%s) by rule '%s'"
                    % (str(e.id), str(e.event_class.name), r.u_name),
                )
            ],
            opening_event=e.id,
        )
        a.save()
        e.contribute_to_alarm(a)
        self.logger.info(
            "[%s|%s|%s] %s raises alarm %s(%s): %r",
            e.id,
            managed_object.name,
            managed_object.address,
            e.event_class.name,
            a.alarm_class.name,
            a.id,
            a.vars,
        )
        metrics["alarm_raise"] += 1
        await self.correlate(r, a)
        # Notify about new alarm
        if not a.root:
            a.managed_object.event(
                a.managed_object.EV_ALARM_RISEN,
                {
                    "alarm": a,
                    "subject": a.subject,
                    "body": a.body,
                    "symptoms": a.alarm_class.symptoms,
                    "recommended_actions": a.alarm_class.recommended_actions,
                    "probable_causes": a.alarm_class.probable_causes,
                },
                delay=a.alarm_class.get_notification_delay(),
            )
        # Gather diagnostics when necessary
        AlarmDiagnosticConfig.on_raise(a)
        # Watch for escalations, when necessary
        if config.correlator.auto_escalation and not a.root:
            AlarmEscalation.watch_escalations(a)

    async def correlate(self, r, a):
        # Topology RCA
        if a.alarm_class.topology_rca:
            await self.topology_rca(a)
        # Rule-based RCA
        if a.alarm_class.id in self.rca_forward:
            # Check alarm is a consequence of existing one
            self.set_root_cause(a)
        if a.alarm_class.id in self.rca_reverse:
            # Check alarm is the root cause for existing ones
            self.set_reverse_root_cause(a)
        # Call handlers
        for h in a.alarm_class.get_handlers():
            try:
                has_root = bool(a.root)
                h(a)
                if not has_root and a.root:
                    self.logger.info(
                        "[%s|%s|%s] Set root to %s (handler %s)",
                        a.id,
                        a.managed_object.name,
                        a.managed_object.address,
                        a.root,
                        h,
                    )
            except Exception:  # noqa. A handler may raise anything
                error_report()
                metrics["error", ("type", "alarm_handler")] += 1
        # Call triggers if necessary
        if r.alarm_class.id in self.triggers:
            for t in self.triggers[r.alarm_class.id]:
                try:
                    t.call(a)
                except:  # noqa. A trigger may raise anything
                    error_report()
        #
        if not a.severity:
            # Alarm severity has been reset to 0 by handlers
            # Silently drop alarm
            self.logger.debug("Alarm severity is 0, dropping")
            a.delete()
            metrics["alarm_drop"] += 1
            return

    def clear_alarm(self, r, e):
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info(
                "[%s|Unknown|Unknown] Referred to unknown managed object, ignoring", e.id
            )
            metrics["unknown_object"] += 1
            return
        if r.unique:
            discriminator, vars = r.get_vars(e)
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id, discriminator=discriminator
            ).first()
            if a:
                self.logger.info(
                    "[%s|%s|%s] %s clears alarm %s(%s)",
                    e.id,
                    managed_object.name,
                    managed_object.address,
                    e.event_class.name,
                    a.alarm_class.name,
                    a.id,
                )
                e.contribute_to_alarm(a)
                a.closing_event = e.id
                a.last_update = max(a.last_update, e.timestamp)
                a.clear_alarm("Cleared by disposition rule '%s'" % r.u_name, ts=e.timestamp)
                metrics["alarm_clear"] += 1

    def get_delayed_event(self, r, e):
        """
        Check whether all delayed conditions are met
        :param r: Delayed rule
        :param e: Event which can trigger delayed rule
        """
        # @todo: Rewrite to scheduler
        discriminator, vars = r.get_vars(e)
        ws = e.timestamp - datetime.timedelta(seconds=r.combo_window)
        de = ActiveEvent.objects.filter(
            managed_object=e.managed_object_id,
            event_class=r.event_class,
            discriminator=discriminator,
            timestamp__gte=ws,
        ).first()
        if not de:
            # No starting event
            return None
        # Probable starting event found, get all interesting following event
        # classes
        fe = [
            ee.event_class.id
            for ee in ActiveEvent.objects.filter(
                managed_object=e.managed_object_id,
                event_class__in=r.combo_event_classes,
                discriminator=discriminator,
                timestamp__gte=ws,
            ).order_by("timestamp")
        ]
        if r.combo_condition == "sequence":
            # Exact match (NB: was `self.combo_event_classes`, which is undefined on the service)
            if fe == r.combo_event_classes:
                return de
        elif r.combo_condition == "all":
            # All present
            if not any([c for c in r.combo_event_classes if c not in fe]):
                return de
        elif r.combo_condition == "any":
            # Any found
            if fe:
                return de
        return None

    def eval_expression(self, expression, **kwargs):
        """
        Evaluate expression in given context
        """
        env = {"re": re, "utils": utils}
        env.update(kwargs)
        return eval(expression, {}, env)

    async def on_dispose_event(self, msg: Message) -> None:
        """
        Called on new dispose message
        """
        data = orjson.loads(msg.value)
        event_id = data["event_id"]
        hint = data["event"]
        self.logger.info("[%s] Receiving message", event_id)
        metrics["alarm_dispose"] += 1
        try:
            event = ActiveEvent.from_json(hint)
            event.timestamp = event.timestamp.replace(tzinfo=None)
            await self.dispose_event(event)
        except Exception:
            metrics["alarm_dispose_error"] += 1
            error_report()
        finally:
            if self.topo_rca_lock:
                # Release pending RCA Lock
                await self.topo_rca_lock.release()
                self.topo_rca_lock = None

    async def dispose_event(self, e):
        """
        Dispose event according to disposition rule
        """
        event_id = str(e.id)
        self.logger.info("[%s] Disposing", event_id)
        drc = self.rules.get(e.event_class.id)
        if not drc:
            self.logger.info(
                "[%s] No disposition rules for class %s, skipping", event_id, e.event_class.name
            )
            return
        # Apply disposition rules
        for r in drc:
            if self.eval_expression(r.condition, event=e):
                # Process action
                if r.action == "drop":
                    self.logger.info("[%s] Dropped by action", event_id)
                    e.delete()
                    return
                elif r.action == "ignore":
                    self.logger.info("[%s] Ignored by action", event_id)
                    return
                elif r.action == "raise" and r.combo_condition == "none":
                    await self.raise_alarm(r, e)
                elif r.action == "clear" and r.combo_condition == "none":
                    self.clear_alarm(r, e)
                if r.action in ("raise", "clear"):
                    # Write discriminator if can trigger delayed event
                    if r.unique and r.event_class.id in self.back_rules:
                        discriminator, vars = r.get_vars(e)
                        e.discriminator = discriminator
                        e.save()
                    # Process delayed combo conditions
                    if e.event_class.id in self.back_rules:
                        for br in self.back_rules[e.event_class.id]:
                            de = self.get_delayed_event(br, e)
                            if de:
                                if br.action == "raise":
                                    await self.raise_alarm(br, de)
                                elif br.action == "clear":
                                    self.clear_alarm(br, de)
                if r.stop_disposition:
                    break
        self.logger.info("[%s] Disposition complete", event_id)

    async def topology_rca(self, alarm):
        """
        Topology-based RCA
        :param alarm:
        :return:
        """

        def can_correlate(a1, a2):
            """
            Check if alarms can be correlated together (within correlation window)
            """
            return (
                not config.correlator.topology_rca_window
                or (a1.timestamp - a2.timestamp).total_seconds()
                <= config.correlator.topology_rca_window
            )

        def all_uplinks_failed(a1):
            """
            Check if all uplinks for the alarm have failed
            """
            if not a1.uplinks:
                return False
            return sum(1 for mo in a1.uplinks if mo in neighbor_alarms) == len(a1.uplinks)

        def get_root(a1) -> Optional[ActiveAlarm]:
            """
            Get root cause for failed uplinks. Considering all uplinks are failed.
            Uplinks are ordered according to path length. Return first applicable
            """
            for u in a1.uplinks:
                na = neighbor_alarms[u]
                if can_correlate(a1, na):
                    return na
            return None

        def get_neighboring_alarms(ca: ActiveAlarm) -> Dict[int, ActiveAlarm]:
            r = {
                na.managed_object.id: na
                for na in ActiveAlarm.objects.filter(
                    alarm_class=ca.alarm_class.id, rca_neighbors__in=[ca.managed_object.id]
                )
            }
            # Add current alarm to correlate downlink alarms properly
            r[alarm.managed_object.id] = ca
            return r

        def iter_downlink_alarms(a1):
            """
            Yield all downlink alarms
            """
            mo = a1.managed_object.id
            for na in neighbor_alarms.values():
                if na.uplinks and mo in na.uplinks:
                    yield na

        def correlate_uplinks(ca: ActiveAlarm) -> bool:
            """
            Correlate with uplink alarms if all uplinks are faulty.
            """
            if not all_uplinks_failed(ca):
                return False
            self.logger.info("[%s] All uplinks are faulty. Correlating", ca.id)
            ra = get_root(ca)
            if not ra:
                return False
            self.logger.info("[%s] Set root to %s", ca.id, ra.id)
            ca.set_root(ra, rca_type=RCA_TOPOLOGY)
            metrics["alarm_correlated_topology"] += 1
            return True

        def correlate_merge_downlinks(ca: ActiveAlarm) -> bool:
            """
            Downlink merge correlation
            """
            if not ca.uplinks or not ca.rca_neighbors:
                return False
            dlm_neighbors = {mo: w for mo, w in zip(ca.rca_neighbors, ca.dlm_windows) if w > 0}
            dlm_candidates = set(neighbor_alarms) & set(dlm_neighbors)
            if not dlm_candidates:
                return False
            # Get possible candidates
            t0 = ca.timestamp
            candidates = list(
                sorted(
                    (
                        neighbor_alarms[mo]
                        for mo in dlm_candidates
                        if (t0 - neighbor_alarms[mo].timestamp).total_seconds() <= dlm_neighbors[mo]
                    ),
                    key=operator.attrgetter("timestamp"),
                )
            )
            if not candidates:
                return False
            ra = candidates[0]
            self.logger.info("[%s] Set root to %s (downlink merge)", ca.id, ra.id)
            ca.set_root(ra, rca_type=RCA_DOWNLINK_MERGE)
            metrics["alarm_correlated_topology"] += 1
            return True

        self.logger.debug("[%s] Topology RCA", alarm.id)
        # Acquire lock
        if self.is_distributed:
            # Set lock until the end of dispose
            mo = alarm.managed_object
            self.topo_rca_lock = RCALock(mo.data.rca_neighbors + [mo.id])
            await self.topo_rca_lock.acquire()
        # Get neighboring alarms
        neighbor_alarms = get_neighboring_alarms(alarm)
        # Correlate current alarm
        correlate_uplinks(alarm) or correlate_merge_downlinks(alarm)
        # Correlate all downlink alarms
        for a in iter_downlink_alarms(alarm):
            correlate_uplinks(a)
        self.logger.debug("[%s] Correlation completed", alarm.id)
def get_scheduler(cls):
    return Scheduler(cls.SCHEDULER)
class CorrelatorService(Service):
    name = "correlator"
    pooled = True
    leader_lock_name = "correlator-%(pool)s"
    process_name = "noc-%(name).10s-%(pool).5s"

    def __init__(self):
        super(CorrelatorService, self).__init__()
        self.version = version.version
        self.rules = {}  # event_class -> [Rule]
        self.back_rules = {}  # event_class -> [Rule]
        self.triggers = {}  # alarm_class -> [Trigger1, .., TriggerN]
        self.rca_forward = {}  # alarm_class -> [RCA condition, ..., RCA condition]
        self.rca_reverse = defaultdict(set)  # alarm_class -> set([alarm_class])
        self.scheduler = None
        self.rca_lock = Lock()
        self.topology_rca_lock = Lock()

    def on_activate(self):
        self.scheduler = Scheduler(
            self.name,
            pool=config.pool,
            reset_running=True,
            max_threads=config.correlator.max_threads,
            ioloop=self.ioloop,
            # @fixme: should this be configurable?
            submit_threshold=100,
            max_chunk=100,
        )
        self.scheduler.correlator = self
        self.subscribe(
            "correlator.dispose.%s" % config.pool,
            "dispose",
            self.on_dispose_event,
            max_in_flight=config.correlator.max_threads,
        )
        self.scheduler.run()

    def on_start(self):
        """
        Load rules from database just after loading config
        """
        super(CorrelatorService, self).on_start()
        self.load_rules()
        self.load_triggers()
        self.load_rca_rules()

    def load_rules(self):
        """
        Load rules from database
        """
        self.logger.debug("Loading rules")
        self.rules = {}
        self.back_rules = {}
        nr = 0
        nbr = 0
        for c in EventClass.objects.all():
            if c.disposition:
                r = []
                for dr in c.disposition:
                    rule = Rule(c, dr)
                    r += [rule]
                    nr += 1
                    if dr.combo_condition != "none" and dr.combo_window:
                        for cc in dr.combo_event_classes:
                            try:
                                self.back_rules[cc.id] += [dr]
                            except KeyError:
                                self.back_rules[cc.id] = [dr]
                            nbr += 1
                self.rules[c.id] = r
        self.logger.debug("%d rules are loaded. %d combos" % (nr, nbr))

    def load_triggers(self):
        self.logger.info("Loading triggers")
        self.triggers = {}
        n = 0
        cn = 0
        ec = [(c.name, c.id) for c in AlarmClass.objects.all()]
        for t in AlarmTrigger.objects.filter(is_enabled=True):
            self.logger.debug("Trigger '%s' for classes:" % t.name)
            for c_name, c_id in ec:
                if re.search(t.alarm_class_re, c_name, re.IGNORECASE):
                    try:
                        self.triggers[c_id] += [Trigger(t)]
                    except KeyError:
                        self.triggers[c_id] = [Trigger(t)]
                    cn += 1
                    self.logger.debug(" %s" % c_name)
            n += 1
        self.logger.info("%d triggers have been loaded to %d classes" % (n, cn))

    def load_rca_rules(self):
        """
        Load root cause analysis rules
        """
        self.logger.info("Loading RCA Rules")
        n = 0
        self.rca_forward = {}
        self.rca_reverse = {}
        for a in AlarmClass.objects.filter(root_cause__0__exists=True):
            if not a.root_cause:
                continue
            self.rca_forward[a.id] = []
            for c in a.root_cause:
                rc = RCACondition(a, c)
                self.rca_forward[a.id] += [rc]
                if rc.root.id not in self.rca_reverse:
                    self.rca_reverse[rc.root.id] = []
                self.rca_reverse[rc.root.id] += [rc]
                n += 1
        self.logger.info("%d RCA Rules have been loaded" % n)

    def mark_as_failed(self, event):
        """
        Write error log and mark event as failed
        """
        self.logger.error("Failed to process event %s" % str(event.id))
        # Prepare traceback
        t, v, tb = sys.exc_info()
        now = datetime.datetime.now()
        r = ["UNHANDLED EXCEPTION (%s)" % str(now)]
        r += [str(t), str(v)]
        r += [format_frames(get_traceback_frames(tb))]
        r = "\n".join(r)
        event.mark_as_failed(version=self.version, traceback=r)

    def set_root_cause(self, a):
        """
        Search for root cause and set, if found
        :returns: Boolean. True, if root cause set
        """
        for rc in self.rca_forward[a.alarm_class.id]:
            # Check condition
            if not rc.check_condition(a):
                continue
            # Check match condition
            q = rc.get_match_condition(a)
            root = ActiveAlarm.objects.filter(**q).first()
            if root:
                # Root cause found
                self.logger.info("%s is root cause for %s (Rule: %s)", root.id, a.id, rc.name)
                metrics["alarm_correlated_rule"] += 1
                a.set_root(root)
                return True
        return False

    def set_reverse_root_cause(self, a):
        """
        Set *a* as root cause for existing events
        :param a:
        :return:
        """
        found = False
        for rc in self.rca_reverse[a.alarm_class.id]:
            # Check reverse match condition
            q = rc.get_reverse_match_condition(a)
            for ca in ActiveAlarm.objects.filter(**q):
                # Check condition
                if not rc.check_condition(ca):
                    continue
                # Try to set root cause
                qq = rc.get_match_condition(ca, id=a.id)
                rr = ActiveAlarm.objects.filter(**qq).first()
                if rr:
                    # Reverse root cause found
                    self.logger.info("%s is root cause for %s (Reverse rule: %s)", a.id, ca.id, rc.name)
                    metrics["alarm_correlated_rule"] += 1
                    ca.set_root(a)
                    found = True
        return found

    def raise_alarm(self, r, e):
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info("Empty managed object, ignoring")
            return
        # @todo: Make configurable
        if not managed_object.is_managed:
            self.logger.info("Managed object is not managed. Do not raise alarm")
            return
        if e.managed_object.id != managed_object.id:
            metrics["alarm_change_mo"] += 1
            self.logger.info("Changing managed object to %s", managed_object.name)
        discriminator, vars = r.get_vars(e)
        if r.unique:
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id, discriminator=discriminator
            ).first()
            if not a:
                # Try to reopen alarm
                a = ArchivedAlarm.objects.filter(
                    managed_object=managed_object.id,
                    discriminator=discriminator,
                    control_time__gte=e.timestamp,
                ).first()
                if a:
                    # Reopen alarm
                    self.logger.info(
                        "[%s|%s|%s] %s reopens alarm %s(%s)",
                        e.id,
                        managed_object.name,
                        managed_object.address,
                        e.event_class.name,
                        a.alarm_class.name,
                        a.id,
                    )
                    a = a.reopen("Reopened by disposition rule '%s'" % r.u_name)
                    metrics["alarm_reopen"] += 1
            if a:
                # Active alarm found, refresh
                self.logger.info(
                    "[%s|%s|%s] Contributing event %s to active alarm %s(%s)",
                    e.id,
                    managed_object.name,
                    managed_object.address,
                    e.event_class.name,
                    a.alarm_class.name,
                    a.id,
                )
                # Contribute event to alarm
                e.contribute_to_alarm(a)
                if e.timestamp < a.timestamp:
                    # Set to earlier date
                    a.timestamp = e.timestamp
                    a.save()
                elif e.timestamp > a.last_update:
                    # Refresh last update
                    a.last_update = e.timestamp
                    a.save()
                metrics["alarm_contribute"] += 1
                return
        # Calculate alarm coverage
        summary = ServiceSummary.get_object_summary(managed_object)
        summary["object"] = {managed_object.object_profile.id: 1}
        #
        severity = max(ServiceSummary.get_severity(summary), 1)
        self.logger.info(
            "[%s|%s|%s] %s: Calculated alarm severity is: %s",
            e.id,
            managed_object.name,
            managed_object.address,
            r.u_name,
            severity,
        )
        # Create new alarm
        a = ActiveAlarm(
            timestamp=e.timestamp,
            last_update=e.timestamp,
            managed_object=managed_object.id,
            alarm_class=r.alarm_class,
            severity=severity,
            vars=vars,
            discriminator=discriminator,
            direct_services=SummaryItem.dict_to_items(summary["service"]),
            direct_subscribers=SummaryItem.dict_to_items(summary["subscriber"]),
            total_objects=ObjectSummaryItem.dict_to_items(summary["object"]),
            total_services=SummaryItem.dict_to_items(summary["service"]),
            total_subscribers=SummaryItem.dict_to_items(summary["subscriber"]),
            log=[
                AlarmLog(
                    timestamp=datetime.datetime.now(),
                    from_status="A",
                    to_status="A",
                    message="Alarm risen from event %s(%s) by rule '%s'"
                    % (str(e.id), str(e.event_class.name), r.u_name),
                )
            ],
            opening_event=e.id,
        )
        a.save()
        e.contribute_to_alarm(a)
        self.logger.info(
            "[%s|%s|%s] %s raises alarm %s(%s): %r",
            e.id,
            managed_object.name,
            managed_object.address,
            e.event_class.name,
            a.alarm_class.name,
            a.id,
            a.vars,
        )
        metrics["alarm_raise"] += 1
        self.correlate(r, a)
        # Notify about new alarm
        if not a.root:
            a.managed_object.event(
                a.managed_object.EV_ALARM_RISEN,
                {
                    "alarm": a,
                    "subject": a.subject,
                    "body": a.body,
                    "symptoms": a.alarm_class.symptoms,
                    "recommended_actions": a.alarm_class.recommended_actions,
                    "probable_causes": a.alarm_class.probable_causes,
                },
                delay=a.alarm_class.get_notification_delay(),
            )
        # Gather diagnostics when necessary
        AlarmDiagnosticConfig.on_raise(a)
        # Watch for escalations, when necessary
        if config.correlator.auto_escalation and not a.root:
            AlarmEscalation.watch_escalations(a)

    def correlate(self, r, a):
        # Topology RCA
        if a.alarm_class.topology_rca:
            with self.topology_rca_lock:
                self.topology_rca(a)
        # Rule-based RCA
        with self.rca_lock:
            if a.alarm_class.id in self.rca_forward:
                # Check alarm is a consequence of existing one
                self.set_root_cause(a)
            if a.alarm_class.id in self.rca_reverse:
                # Check alarm is the root cause for existing ones
                self.set_reverse_root_cause(a)
        # Call handlers
        for h in a.alarm_class.get_handlers():
            try:
                has_root = bool(a.root)
                h(a)
                if not has_root and a.root:
                    self.logger.info(
                        "[%s|%s|%s] Set root to %s (handler %s)",
                        a.id,
                        a.managed_object.name,
                        a.managed_object.address,
                        a.root,
                        h,
                    )
            except:  # noqa. A handler may raise anything
                error_report()
                metrics["error", ("type", "alarm_handler")] += 1
        # Call triggers if necessary
        if r.alarm_class.id in self.triggers:
            for t in self.triggers[r.alarm_class.id]:
                try:
                    t.call(a)
                except:  # noqa. A trigger may raise anything
                    error_report()
        #
        if not a.severity:
            # Alarm severity has been reset to 0 by handlers
            # Silently drop alarm
            self.logger.debug("Alarm severity is 0, dropping")
            a.delete()
            metrics["alarm_drop"] += 1
            return

    def clear_alarm(self, r, e):
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info(
                "[%s|Unknown|Unknown] Referred to unknown managed object, ignoring", e.id
            )
            metrics["unknown_object"] += 1
            return
        if r.unique:
            discriminator, vars = r.get_vars(e)
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id, discriminator=discriminator
            ).first()
            if a:
                self.logger.info(
                    "[%s|%s|%s] %s clears alarm %s(%s)",
                    e.id,
                    managed_object.name,
                    managed_object.address,
                    e.event_class.name,
                    a.alarm_class.name,
                    a.id,
                )
                e.contribute_to_alarm(a)
                a.closing_event = e.id
                a.last_update = max(a.last_update, e.timestamp)
                a.clear_alarm("Cleared by disposition rule '%s'" % r.u_name, ts=e.timestamp)
                metrics["alarm_clear"] += 1

    def get_delayed_event(self, r, e):
        """
        Check whether all delayed conditions are met
        :param r: Delayed rule
        :param e: Event which can trigger delayed rule
        """
        # @todo: Rewrite to scheduler
        discriminator, vars = r.get_vars(e)
        ws = e.timestamp - datetime.timedelta(seconds=r.combo_window)
        de = ActiveEvent.objects.filter(
            managed_object=e.managed_object_id,
            event_class=r.event_class,
            discriminator=discriminator,
            timestamp__gte=ws,
        ).first()
        if not de:
            # No starting event
            return None
        # Probable starting event found, get all interesting following event
        # classes
        fe = [
            ee.event_class.id
            for ee in ActiveEvent.objects.filter(
                managed_object=e.managed_object_id,
                event_class__in=r.combo_event_classes,
                discriminator=discriminator,
                timestamp__gte=ws,
            ).order_by("timestamp")
        ]
        if r.combo_condition == "sequence":
            # Exact match (NB: was `self.combo_event_classes`, which is undefined on the service)
            if fe == r.combo_event_classes:
                return de
        elif r.combo_condition == "all":
            # All present
            if not any([c for c in r.combo_event_classes if c not in fe]):
                return de
        elif r.combo_condition == "any":
            # Any found
            if fe:
                return de
        return None

    def eval_expression(self, expression, **kwargs):
        """
        Evaluate expression in given context
        """
        env = {"re": re, "utils": utils}
        env.update(kwargs)
        return eval(expression, {}, env)

    def on_dispose_event(self, message, event_id, event=None, *args, **kwargs):
        """
        Called on new dispose message
        """
        self.logger.info("[%s] Receiving message", event_id)
        message.enable_async()
        self.get_executor("max").submit(self.dispose_worker, message, event_id, event)

    def dispose_worker(self, message, event_id, event_hint=None):
        metrics["alarm_dispose"] += 1
        try:
            if event_hint:
                event = self.get_event_from_hint(event_hint)
            else:
                event = self.lookup_event(event_id)
            if event:
                self.dispose_event(event)
        except Exception:
            metrics["alarm_dispose_error"] += 1
            error_report()
        self.ioloop.add_callback(message.finish)

    def lookup_event(self, event_id):
        """
        Lookup event by id. Uses cache heating effect from classifier
        :param event_id:
        :return: ActiveEvent instance or None
        """
        self.logger.info("[%s] Lookup event", event_id)
        e = ActiveEvent.get_by_id(event_id)
        if not e:
            self.logger.info("[%s] Event not found, skipping", event_id)
            metrics["event_lookup_failed"] += 1
        metrics["event_lookups"] += 1
        return e

    def get_event_from_hint(self, hint):
        """
        Get ActiveEvent from json hint
        :param hint:
        :return:
        """
        metrics["event_hints"] += 1
        e = ActiveEvent.from_json(hint)
        # Prevent TypeError: can't compare offset-naive and offset-aware datetimes
        # when calculating alarm timestamp
        e.timestamp = e.timestamp.replace(tzinfo=None)
        return e

    def dispose_event(self, e):
        """
        Dispose event according to disposition rule
        """
        event_id = str(e.id)
        self.logger.info("[%s] Disposing", event_id)
        drc = self.rules.get(e.event_class.id)
        if not drc:
            self.logger.info(
                "[%s] No disposition rules for class %s, skipping", event_id, e.event_class.name
            )
            return
        # Apply disposition rules
        for r in drc:
            if self.eval_expression(r.condition, event=e):
                # Process action
                if r.action == "drop":
                    self.logger.info("[%s] Dropped by action", event_id)
                    e.delete()
                    return
                elif r.action == "ignore":
                    self.logger.info("[%s] Ignored by action", event_id)
                    return
                elif r.action == "raise" and r.combo_condition == "none":
                    self.raise_alarm(r, e)
                elif r.action == "clear" and r.combo_condition == "none":
                    self.clear_alarm(r, e)
                if r.action in ("raise", "clear"):
                    # Write discriminator if can trigger delayed event
                    if r.unique and r.event_class.id in self.back_rules:
                        discriminator, vars = r.get_vars(e)
                        e.discriminator = discriminator
                        e.save()
                    # Process delayed combo conditions
                    if e.event_class.id in self.back_rules:
                        for br in self.back_rules[e.event_class.id]:
                            de = self.get_delayed_event(br, e)
                            if de:
                                if br.action == "raise":
                                    self.raise_alarm(br, de)
                                elif br.action == "clear":
                                    self.clear_alarm(br, de)
                if r.stop_disposition:
                    break
        self.logger.info("[%s] Disposition complete", event_id)

    def topology_rca(self, alarm, seen=None):
        def can_correlate(a1, a2):
            return (
                not config.correlator.topology_rca_window
                or total_seconds(a1.timestamp - a2.timestamp)
                <= config.correlator.topology_rca_window
            )

        self.logger.debug("[%s] Topology RCA", alarm.id)
        seen = seen or set()
        if alarm.root or alarm.id in seen:
            self.logger.debug("[%s] Already correlated", alarm.id)
            return  # Already correlated
        seen.add(alarm.id)
        # Get neighboring alarms
        na = {}
        # Downlinks
        q = Q(alarm_class=alarm.alarm_class.id, uplinks=alarm.managed_object.id)
        # Uplinks
        # @todo: Try to use $graphLookup to find affinity alarms
        if alarm.uplinks:
            q |= Q(alarm_class=alarm.alarm_class.id, managed_object__in=list(alarm.uplinks))
        for a in ActiveAlarm.objects.filter(q):
            na[a.managed_object.id] = a
        # Correlate with uplinks
        if alarm.uplinks and len([na[o] for o in alarm.uplinks if o in na]) == len(alarm.uplinks):
            # All uplinks are faulty
            # Uplinks are ordered according to path length
            # Correlate with the first applicable
            self.logger.info("[%s] All uplinks are faulty. Correlating", alarm.id)
            for u in alarm.uplinks:
                a = na[u]
                if can_correlate(alarm, a):
                    self.logger.info("[%s] Set root to %s", alarm.id, a.id)
                    alarm.set_root(a)
                    metrics["alarm_correlated_topology"] += 1
                    break
        # Correlate neighbors' alarms
        for d in na:
            self.topology_rca(na[d], seen)
        self.logger.debug("[%s] Correlation completed", alarm.id)