def get_availability(self, days):
    now = datetime.datetime.now()
    d = datetime.timedelta(days=days)
    b = now - d
    outages = defaultdict(int)
    q = Q(start__gte=b) | Q(stop__gte=b) | Q(stop__exists=False)
    for o in Outage.objects.filter(q):
        start = max(o.start, b)
        stop = o.stop if o.stop else now
        outages[o.object] += total_seconds(stop - start)
    td = total_seconds(d)
    # Normalize to percents
    return dict((o, (td - outages[o]) * 100.0 / td) for o in outages)
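# NOTE: every snippet here calls a project-level total_seconds() helper instead
# of timedelta.total_seconds(), presumably for Python 2.6 compatibility.
# A minimal sketch of such a helper (an assumption, not the project's actual
# implementation), using the same arithmetic the stdlib documents for
# timedelta.total_seconds():
def total_seconds(td):
    """Return datetime.timedelta `td` as a float number of seconds."""
    return (td.microseconds + (td.seconds + td.days * 86400) * 10 ** 6) / 10.0 ** 6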
def get_data(self, duration, **kwargs):
    now = datetime.datetime.now()
    d = datetime.timedelta(seconds=int(duration))
    b = now - d
    outages = defaultdict(list)
    otime = defaultdict(int)
    q = Q(start__gte=b) | Q(stop__gte=b) | Q(stop__exists=False)
    for o in Outage.objects.filter(q):
        start = max(o.start, b)
        stop = o.stop if o.stop else now
        outages[o.object] += [o]
        otime[o.object] += total_seconds(stop - start)
    td = total_seconds(d)
    # Load managed objects
    mos = list(otime)
    chunk = 500
    mo = {}
    while mos:
        for o in ManagedObject.objects.filter(id__in=mos[:chunk]):
            mo[o.id] = o
        mos = mos[chunk:]
    r = []
    for o in sorted(otime, key=lambda x: -otime[x]):
        m = mo.get(o)
        if not m:
            continue  # Hanging Outage
        dt = otime[o]
        downtime = "%02d:%02d:%02d" % ((dt // 3600) % 24, (dt // 60) % 60, dt % 60)
        if dt >= 86400:
            downtime = "%dd %s" % (dt // 86400, downtime)
        r += [(m.name, m.profile_name, m.platform, m.is_managed,
               m.get_status(), downtime,
               float(td - dt) * 100 / td, len(outages[o]))]
    return self.from_dataset(
        title=self.title,
        columns=[
            "Object", "Profile", "Platform",
            TableColumn("Managed", format="bool"),
            TableColumn("Status", format="bool"),
            TableColumn("Downtime", align="right"),
            TableColumn("Availability", align="right", format="percent"),
            TableColumn("Downs", align="right", format="integer")
        ],
        data=r,
        enumerate=True
    )
def watch_escalations(cls, alarm):
    now = datetime.datetime.now()
    for esc in cls.get_class_escalations(alarm.alarm_class):
        for e_item in esc.escalations:
            # Check administrative domain
            if (e_item.administrative_domain and
                    e_item.administrative_domain.id not in alarm.adm_path):
                continue
            # Check severity
            if e_item.min_severity and alarm.severity < e_item.min_severity:
                continue
            # Check selector
            if e_item.selector and not SelectorCache.is_in_selector(
                    alarm.managed_object, e_item.selector):
                continue
            logger.debug("[%s] Watch for %s after %s seconds",
                         alarm.id, esc.name, e_item.delay)
            et = alarm.timestamp + datetime.timedelta(seconds=e_item.delay)
            if et > now:
                delay = total_seconds(et - now)
            else:
                delay = None
            call_later(
                "noc.services.escalator.escalation.escalate",
                scheduler="escalator",
                pool=alarm.managed_object.escalator_shard,
                delay=delay,
                max_runs=esc.max_escalation_retries,
                alarm_id=alarm.id,
                escalation_id=esc.id,
                escalation_delay=e_item.delay
            )
def get_availability(start_date, stop_date, skip_zero_avail=False):
    # now = datetime.datetime.now()
    b = start_date
    d = stop_date
    outages = defaultdict(list)
    td = total_seconds(d - b)
    # q = Q(start__gte=b) | Q(stop__gte=b) | Q(stop__exists=False)
    q = (Q(start__gte=b) | Q(stop__gte=b) | Q(stop__exists=False)) & Q(start__lt=d)
    for o in Outage.objects.filter(q):
        start = max(o.start, b)
        stop = o.stop if (o.stop and o.stop < d) else d
        if total_seconds(stop - start) == td and skip_zero_avail:
            continue
        outages[o.object] += [total_seconds(stop - start)]
    # Normalize to percents
    return dict(
        (o, ((td - sum(outages[o])) * 100.0 / td,
             int(sum(outages[o])),
             len(outages[o])))
        for o in outages
    )
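# Hypothetical usage sketch for get_availability() above: it returns a mapping
#   {object_id: (availability_percent, downtime_seconds, outage_count)}
# covering only objects that had at least one outage in the interval.
# The dates below are illustrative.
import datetime

start = datetime.datetime(2018, 1, 1)
stop = datetime.datetime(2018, 2, 1)
for object_id, (percent, downtime, downs) in get_availability(start, stop, skip_zero_avail=True).items():
    print("%s: %.2f%% available, %ds down, %d outages" % (object_id, percent, downtime, downs))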
def get_interval(self):
    if isinstance(self.schedule["interval"], (int, long)):
        # Migrate IntervalJob to MultiIntervalJob
        interval = [(None, self.schedule["interval"])]
        dt = 0
    else:
        interval = self.schedule["interval"]
        dt = total_seconds(datetime.datetime.now() - self.schedule["scheduled"])
    # Find appropriate time range
    for t, i in interval:
        if t is None or t > dt:
            return i
def forwards(self):
    db = get_db()
    bulk = db.noc.fm.uptimes.initialize_unordered_bulk_op()
    n = 0
    for d in db.noc.fm.uptimes.find({}):
        bulk.find({
            "_id": d["_id"]
        }).update({
            "$set": {
                "last_value": float(total_seconds(d["last"] - d["start"]))
            }
        })
        n += 1
    if n:
        bulk.execute()
def forwards(self):
    db = get_db()
    bulk = []
    for d in db.noc.fm.uptimes.find({}):
        bulk += [UpdateOne({"_id": d["_id"]}, {
            "$set": {
                "last_value": float(total_seconds(d["last"] - d["start"]))
            }
        })]
    if bulk:
        print("Committing changes to database")
        try:
            db.noc.fm.uptimes.bulk_write(bulk)
            print("Database has been synced")
        except BulkWriteError as e:
            print("Bulk write error: '%s'" % e.details)
            print("Stopping check")
def run(self):
    with Span(server=self.scheduler.name,
              service=self.attrs[self.ATTR_CLASS],
              sample=self.attrs.get(self.ATTR_SAMPLE, 0),
              in_label=self.attrs.get(self.ATTR_KEY, "")):
        self.start_time = perf_counter()
        if self.is_retries_exceeded():
            self.logger.info("[%s|%s] Retries exceeded. Remove job",
                             self.name, self.attrs[Job.ATTR_ID])
            self.remove_job()
            return
        self.logger.info(
            "[%s] Starting at %s (Lag %.2fms)",
            self.name, self.scheduler.scheduler_id,
            total_seconds(datetime.datetime.now() - self.attrs[self.ATTR_TS]) * 1000.0
        )
        # Run handler
        status = self.E_EXCEPTION
        delay = None
        with Span(service="job.dereference"):
            try:
                ds = self.dereference()
                can_run = self.can_run()
            except Exception as e:
                self.logger.error("Unknown error during dereference: %s", e)
                ds = None
                can_run = False
        if ds:
            with Span(service="job.run"):
                if can_run:
                    try:
                        data = self.attrs.get(self.ATTR_DATA) or {}
                        result = self.handler(**data)
                        if tornado.gen.is_future(result):
                            # Wait for future
                            result = yield result
                        status = self.E_SUCCESS
                    except RetryAfter as e:
                        self.logger.info("Retry after %ss: %s", e.delay, e)
                        status = self.E_RETRY
                        delay = e.delay
                    except self.failed_exceptions:
                        status = self.E_FAILED
                    except Exception:
                        error_report()
                        status = self.E_EXCEPTION
                else:
                    self.logger.info("Deferred")
                    status = self.E_DEFERRED
        elif ds is not None:
            self.logger.info("Cannot dereference")
            status = self.E_DEREFERENCE
        self.duration = perf_counter() - self.start_time
        self.logger.info("Completed. Status: %s (%.2fms)",
                         self.STATUS_MAP.get(status, status),
                         self.duration * 1000)
        # Schedule next run
        if delay is None:
            with Span(service="job.schedule_next"):
                self.schedule_next(status)
        else:
            with Span(service="job.schedule_retry"):
                # Retry
                if self.context_version:
                    ctx = self.context or None
                    ctx_key = self.get_context_cache_key()
                else:
                    ctx = None
                    ctx_key = None
                self.scheduler.set_next_run(
                    self.attrs[self.ATTR_ID],
                    status=status,
                    ts=datetime.datetime.now() + datetime.timedelta(seconds=delay),
                    duration=self.duration,
                    context_version=self.context_version,
                    context=ctx,
                    context_key=ctx_key
                )
def duration(self):
    """
    Logged event duration in seconds
    """
    return total_seconds(self.timestamp - self.start_timestamp)
def extract(self):
    nr = 0
    # Get reboots
    r = Reboot._get_collection().aggregate([
        {
            "$match": {
                "ts": {
                    "$gt": self.start - self.reboot_interval,
                    "$lte": self.stop
                }
            }
        },
        {
            "$sort": {
                "ts": 1
            }
        },
        {
            "$group": {
                "_id": "$object",
                "reboots": {
                    "$push": "$ts"
                }
            }
        }
    ])
    # object -> [ts1, .., tsN]
    reboots = dict((d["_id"], d["reboots"]) for d in r)
    #
    for d in self.iter_data():
        mo = ManagedObject.get_by_id(d["managed_object"])
        if not mo:
            continue
        # Process reboot data
        o_reboots = reboots.get(d["managed_object"], [])
        n_reboots = hits_in_range(
            o_reboots,
            d["timestamp"] - self.reboot_interval,
            d["clear_timestamp"]
        )
        #
        self.alarm_stream.push(
            ts=d["timestamp"],
            close_ts=d["clear_timestamp"],
            duration=max(0, int(total_seconds(d["clear_timestamp"] - d["timestamp"]))),
            alarm_id=str(d["_id"]),
            root=str(d.get("root") or ""),
            alarm_class=AlarmClass.get_by_id(d["alarm_class"]),
            severity=d["severity"],
            reopens=d.get("reopens") or 0,
            direct_services=sum(ss["summary"] for ss in d.get("direct_services", [])),
            direct_subscribers=sum(ss["summary"] for ss in d.get("direct_subscribers", [])),
            total_objects=sum(ss["summary"] for ss in d.get("total_objects", [])),
            total_services=sum(ss["summary"] for ss in d.get("total_services", [])),
            total_subscribers=sum(ss["summary"] for ss in d.get("total_subscribers", [])),
            escalation_ts=d.get("escalation_ts"),
            escalation_tt=d.get("escalation_tt"),
            managed_object=mo,
            pool=mo.pool,
            ip=mo.address,
            profile=mo.profile,
            object_profile=mo.object_profile,
            vendor=mo.vendor,
            platform=mo.platform,
            version=mo.version,
            administrative_domain=mo.administrative_domain,
            segment=mo.segment,
            container=mo.container,
            x=mo.x,
            y=mo.y,
            reboots=n_reboots,
            services=[{
                "profile": ServiceProfile.get_by_id(ss["profile"]).bi_id,
                "summary": ss["summary"]
            } for ss in d.get("direct_services", [])],
            subscribers=[{
                "profile": SubscriberProfile.get_by_id(ss["profile"]).bi_id,
                "summary": ss["summary"]
            } for ss in d.get("direct_subscribers", [])],
            # location=mo.container.get_address_text()
        )
        nr += 1
        self.last_ts = d["clear_timestamp"]
    self.alarm_stream.finish()
    return nr
def can_correlate(a1, a2):
    return (
        not config.correlator.topology_rca_window or
        total_seconds(a1.timestamp - a2.timestamp) <= config.correlator.topology_rca_window
    )
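# Illustrative note (hypothetical values): with
# config.correlator.topology_rca_window = 120 seconds, alarms raised 90 s apart
# may be correlated, alarms 300 s apart may not, and a window of 0 (falsy)
# disables the time constraint entirely.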
def register(cls, managed_object, uptime):
    """
    Register uptime
    :param managed_object: Managed object reference
    :param uptime: Registered uptime in seconds
    """
    if not uptime:
        return
    oid = managed_object.id
    now = datetime.datetime.now()
    delta = datetime.timedelta(seconds=uptime)
    logger.debug("[%s] Register uptime %s", managed_object.name, delta)
    # Update data
    c = cls._get_collection()
    d = c.find_one({"object": oid, "stop": None})
    if d:
        # Check for reboot
        is_rebooted = False
        if d["last_value"] > uptime:
            # Check for counter wrapping
            # Get wrapped delta
            dl = cls.FWRAP - d["last_value"] + uptime
            # Get timestamp delta
            tsd = total_seconds(now - d["last"])
            if abs(dl - tsd) > tsd * cls.WPREC:
                is_rebooted = True
            else:
                logger.debug("Counter wrap detected")
        if is_rebooted:
            # Reboot registered
            # Closing existing uptime
            ts = now - delta
            logger.debug("[%s] Closing uptime (%s - %s, delta %s)",
                         managed_object.name, d["start"], ts - cls.SEC, delta)
            c.update({"_id": d["_id"]}, {"$set": {"stop": ts - cls.SEC}})
            # Start new uptime
            logger.debug("[%s] Starting new uptime from %s",
                         managed_object.name, ts)
            c.insert({
                "object": oid,
                "start": ts,
                "stop": None,
                "last": now,
                "last_value": uptime
            })
            #
            Reboot.register(managed_object, ts, d["last"])
        else:
            logger.debug("[%s] Refreshing existing uptime (%s - %s)",
                         managed_object.name, d["start"], now)
            c.update({"_id": d["_id"]}, {"$set": {
                "last": now,
                "last_value": uptime
            }})
    else:
        # First uptime
        logger.debug("[%s] First uptime from %s", managed_object.name, now)
        c.insert({
            "object": oid,
            "start": now - delta,
            "stop": None,
            "last": now,
            "last_value": uptime
        })
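# Illustrative arithmetic for the counter-wrap check in register() above.
# All numbers are hypothetical, and the real cls.FWRAP / cls.WPREC values are
# assumptions: if the agent reports a 32-bit sysUpTime (hundredths of a second),
# the counter wraps at 2 ** 32 / 100 seconds (~497 days).
FWRAP = 2 ** 32 / 100.0      # assumed wrap point, seconds
WPREC = 0.05                 # assumed tolerance, 5%
last_value = 42949000.0      # uptime reported at the previous poll
uptime = 500.0               # uptime reported now (smaller than last_value)
tsd = 1200.0                 # wall-clock seconds between the two polls
dl = FWRAP - last_value + uptime   # growth expected if the counter merely wrapped (~1173 s)
is_rebooted = abs(dl - tsd) > tsd * WPREC
# Here abs(1173 - 1200) = 27 <= 60, so this is treated as a wrap, not a reboot.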
def get_data(self, request, duration, from_date=None, to_date=None, **kwargs):
    now = datetime.datetime.now()
    if not from_date:
        duration = 1
    if int(duration):
        self.logger.info("Use duration\n")
        d = datetime.timedelta(seconds=int(duration))
        b = now - d
        q = Q(start__gte=b) | Q(stop__gte=b) | Q(stop__exists=False)
    else:
        b = datetime.datetime.strptime(from_date, "%d.%m.%Y")
        q = Q(start__gte=b) | Q(stop__gte=b) | Q(stop__exists=False)
        if to_date:
            if from_date == to_date:
                t1 = datetime.datetime.strptime(to_date, "%d.%m.%Y") + datetime.timedelta(1)
            else:
                t1 = datetime.datetime.strptime(to_date, "%d.%m.%Y")
        else:
            t1 = now
        q &= Q(start__lte=t1) | Q(stop__lte=t1)
        d = datetime.timedelta(seconds=int((t1 - b).total_seconds()))
    outages = defaultdict(list)
    otime = defaultdict(int)
    for o in Outage.objects.filter(q):
        start = max(o.start, b)
        stop = o.stop if o.stop else now
        outages[o.object] += [o]
        otime[o.object] += total_seconds(stop - start)
    td = total_seconds(d)
    if not request.user.is_superuser:
        for mo in ManagedObject.objects.exclude(
                administrative_domain__in=UserAccess.get_domains(request.user)):
            if mo.id in otime:
                otime.pop(mo.id)
    # Load managed objects
    mos = list(otime)
    chunk = 500
    mo = {}
    while mos:
        for o in ManagedObject.objects.filter(id__in=mos[:chunk]):
            mo[o.id] = o
        mos = mos[chunk:]
    r = []
    for o in sorted(otime, key=lambda x: -otime[x]):
        m = mo.get(o)
        if not m:
            continue  # Hanging Outage
        dt = min(td, otime[o])
        downtime = "%02d:%02d:%02d" % ((dt // 3600) % 24, (dt // 60) % 60, dt % 60)
        if dt >= 86400:
            downtime = "%dd %s" % (dt // 86400, downtime)
        if td:
            avail = float(td - dt) * 100 / td
        else:
            avail = 0
        r += [(
            m.name,
            m.address,
            m.profile.name,
            m.platform.name if m.platform else "",
            _("Yes") if m.is_managed else _("No"),
            _("Yes") if m.get_status() else _("No"),
            downtime,
            avail,
            len(outages[o])
        )]
    return self.from_dataset(
        title=self.title,
        columns=[
            _("Managed Object"), _("Address"), _("Profile"), _("Platform"),
            TableColumn(_("Managed"), align="right"),
            TableColumn(_("Status"), align="right"),
            TableColumn(_("Downtime"), align="right"),
            TableColumn(_("Availability"), align="right", format="percent"),
            TableColumn(_("Downs"), align="right", format="integer")
        ],
        data=r,
        enumerate=True
    )
def process_mrtasks(self):
    """
    Process Map/Reduce tasks
    """
    def map_callback(mt_id, result=None, error=None):
        try:
            mt = MapTask.objects.get(id=mt_id)
        except MapTask.DoesNotExist:
            self.logger.error("Late answer for map task %d is ignored" % mt_id)
            return
        if error:
            # Process non-fatal reasons
            TIMEOUTS = {
                ERR_ACTIVATOR_NOT_AVAILABLE: 10,
                ERR_OVERLOAD: 10,
                ERR_DOWN: 30,
            }
            if error.code in TIMEOUTS:
                # Any non-fatal reason requires a retry
                timeout = TIMEOUTS[error.code]
                variation = 2
                timeout += random.randint(-timeout / variation, timeout / variation)
                next_try = (datetime.datetime.now() +
                            datetime.timedelta(seconds=timeout))
                if error.code in (ERR_OVERLOAD, ERR_ACTIVATOR_NOT_AVAILABLE):
                    next_retries = mt.retries_left
                else:
                    next_retries = mt.retries_left - 1
                if mt.retries_left and (not mt.task or next_try < mt.task.stop_time):
                    # Check we're still in task time and have retries left
                    self.log_mrt(logging.INFO, task=mt, status="retry")
                    mt.next_try = next_try
                    mt.retries_left = next_retries
                    mt.status = "W"
                    mt.save()
                    return
            mt.status = "F"
            mt.script_result = dict(code=error.code, text=error.text)
            self.log_mrt(logging.INFO, task=mt, status="failed",
                         code=error.code, error=error.text)
        else:
            mt.status = "C"
            mt.script_result = result
            self.log_mrt(logging.INFO, task=mt, status="completed")
        mt.save()

    # Additional stack frame to store mt_id in a closure
    def exec_script(mt):
        kwargs = {}
        if mt.script_params:
            kwargs = mt.script_params
        self.log_mrt(logging.INFO, task=mt, status="running", args=kwargs)
        self.script(mt.managed_object, mt.map_script,
                    lambda result=None, error=None: map_callback(mt.id, result, error),
                    timeout=mt.script_timeout, **kwargs)

    def fail_task(mt, code, text):
        mt.status = "F"
        mt.script_result = dict(code=code, text=text)
        try:
            mt.save()
        except Exception:
            pass  # Can raise integrity error if MRT is gone
        self.log_mrt(logging.INFO, task=mt, status="failed", code=code, error=text)

    t = datetime.datetime.now()
    # self.logger.debug("Processing MRT schedules")
    # Reset rates
    sae_mrt_rate = 0
    shard_mrt_rate = {}  # shard_id -> count
    throttled_shards = set()  # shard_id
    self.blocked_pools = set()  # Reset block status
    # Run tasks
    qs = {"status": "W", "next_try__lte": t}
    if not self.single_shard:
        qs["managed_object__activator__shard__is_active"] = True
        qs["managed_object__activator__shard__name__in"] = self.shards
    for mt in MapTask.objects.filter(**qs)\
            .order_by("next_try")\
            .select_related("activator", "managed_object")\
            .select_for_update():
        # Check object is managed
        if not mt.managed_object.is_managed:
            fail_task(mt, ERR_OBJECT_NOT_MANAGED, "Object is not managed")
            continue
        # Check reduce task is still valid
        is_valid_reduce = True
        try:
            mt.task
        except ReduceTask.DoesNotExist:
            is_valid_reduce = False
        # Check for task timeouts
        if not is_valid_reduce or (mt.task and mt.task.stop_time < t):
            fail_task(mt, ERR_TIMEOUT, text="Timed out")
            continue
        # Check blocked pools
        if mt.managed_object.activator.name in self.blocked_pools:
            # Silently skip task until next round
            self.logger.debug("Delaying task to the blocked pool '%s'" %
                              mt.managed_object.activator.name)
            continue
        # Check for global rate limit
        if self.max_mrt_rate_per_sae:
            if sae_mrt_rate > self.max_mrt_rate_per_sae:
                self.log_mrt(logging.INFO, task=mt, status="throttled",
                             msg="Per-SAE rate limit exceeded "
                                 "(%d)" % self.max_mrt_rate_per_sae)
                break
            sae_mrt_rate += 1
        # Check for shard rate limit
        if self.max_mrt_rate_per_shard:
            s_id = mt.managed_object.activator.shard.id
            if s_id in throttled_shards:
                # Shard is throttled, do not log
                continue
            sr = shard_mrt_rate.get(s_id, 0) + 1
            if sr > self.max_mrt_rate_per_shard:
                # Log and throttle shard
                self.log_mrt(logging.INFO, task=mt, status="throttled",
                             msg="Per-shard rate limit exceeded "
                                 "(%d)" % self.max_mrt_rate_per_shard)
                throttled_shards.add(s_id)
            else:
                shard_mrt_rate[s_id] = sr
        mt.status = "R"
        mt.save()
        exec_script(mt)
    dt = total_seconds(datetime.datetime.now() - t)
    # self.logger.debug("MRT Schedules processed in %ss" % dt)
    if dt > self.mrt_schedule_interval:
        self.logger.error("SAE is overloaded by MRT scheduling (took %ss)" % dt)