def make_call(url, body, limit=3):
    """
    POST *body* to *url* and hand the reply back via ``tornado.gen.Return``.

    This is a generator: it yields the ``fetch(...)`` call and, when a
    redirect is followed, a nested ``make_call(...)``.

    :param url: Address the request is sent to
    :param body: Request payload
    :param limit: Number of 307 redirects that may still be followed
    :raises RPCException: A 307 reply arrived while *limit* is exhausted
    :raises RPCHTTPError: Status is none of 200, 307, 598, 599
    """
    out_headers = {
        "X-NOC-Calling-Service": self._service.name,
        "Content-Type": "text/json",
    }
    sampled = 1 if span_ctx and span_id else 0
    with Span(
        server=self._service_name,
        service=method,
        sample=sampled,
        context=span_ctx,
        parent=span_id,
    ) as span:
        if sampled:
            # Pass tracing identifiers along with the request
            out_headers.update(
                {
                    "X-NOC-Span-Ctx": span.span_context,
                    "X-NOC-Span": span.span_id,
                }
            )
        status, reply_headers, payload = yield fetch(
            url,
            method="POST",
            headers=out_headers,
            body=body,
            connect_timeout=CONNECT_TIMEOUT,
            request_timeout=REQUEST_TIMEOUT,
        )
        # Success: reply payload is the result
        if status == 200:
            raise tornado.gen.Return(payload)
        # Redirect: the reply payload becomes the body of the next request
        if status == 307:
            if not limit:
                raise RPCException("Redirects limit exceeded")
            location = reply_headers.get("location")
            self._logger.debug("Redirecting to %s", location)
            redirected = yield make_call(location, payload, limit - 1)
            raise tornado.gen.Return(redirected)
        # Everything below is recorded on the span as an error
        span.error_code = status
        if status in (598, 599):
            self._logger.debug("Timed out")
            raise tornado.gen.Return(None)
        raise RPCHTTPError("HTTP Error %s: %s" % (status, body))
def execute(self, path, method, **kwargs):
    """
    Submit the request on an IO loop and return ``self.result``.

    Resets ``buffer`` and ``error``, stores *path* and *method* on the
    instance and runs ``self.submit()`` to completion inside a Span.

    :param path: Stored as ``self.path``
    :param method: Stored as ``self.method``; also used as the span label
    :param kwargs: Accepted but not used here
    :return: ``self.result``
    :raises: ``self.error`` when it is set after ``submit()`` finishes
    """
    self.buffer = b""
    self.path = path
    self.method = method
    self.error = None
    with Span(
        server=self.script.credentials.get("address"),
        service=self.name,
        in_label=self.method,
    ) as span:
        with IOLoopContext() as io_loop:
            io_loop.run_until_complete(self.submit())
        if not self.error:
            return self.result
        # Attach the failure to the span (when one is available) and re-raise
        if span:
            span.error_text = str(self.error)
        raise self.error
def handler(self, **kwargs):
    """
    Run the check sequence for ``self.object`` inside a sampled Span.

    Resolver, SNMP-suggest, profile and CLI-suggest checks run first. If the
    auth profile still asks for suggestion after the CLI-suggest check, the
    handler stops. Otherwise the remaining checks run, within a CLI session
    when CLI access is preferred and sessions are allowed.

    :param kwargs: Accepted but not used here
    """

    def suggest_requested():
        # Evaluated on every call: the checks below are run between calls
        profile = self.object.auth_profile
        return profile and profile.enable_suggest

    with Span(sample=self.object.box_telemetry_sample), bulk_datastream_changes():
        has_cli = "C" in self.object.get_access_preference()
        ResolverCheck(self).run()
        if suggest_requested():
            SuggestSNMPCheck(self).run()
        if self.object.object_profile.enable_box_discovery_profile:
            ProfileCheck(self).run()
        if has_cli and suggest_requested():
            SuggestCLICheck(self).run()
            if suggest_requested():
                # Still suggest
                self.logger.info("Cannot choose valid credentials. Stopping")
                return
        # Run remaining checks
        if not (has_cli and self.allow_sessions()):
            self.run_checks()
            return
        self.logger.debug("Using CLI sessions")
        with self.object.open_session():
            self.run_checks()
def run_job(self, job, mo, checks):
    """
    Run a single discovery job for a managed object and print diagnostics.

    :param job: Key into ``self.jcls`` selecting the job class name
    :param mo: Managed object; ``mo.pool.name`` and ``mo.id`` are used
    :param checks: Stored in the job arguments under ``"_checks"``
    """
    scheduler = Scheduler("discovery", pool=mo.pool.name, service=ServiceStub())
    jcls = self.jcls[job]
    # Try to dereference job
    job_args = scheduler.get_collection().find_one(
        {Job.ATTR_CLASS: jcls, Job.ATTR_KEY: mo.id}
    )
    if job_args:
        self.print("Job ID: %s" % job_args["_id"])
    else:
        # No stored job record: run with placeholder arguments
        job_args = {Job.ATTR_ID: "fakeid", Job.ATTR_KEY: mo.id}
    job_args["_checks"] = checks
    job = get_handler(jcls)(scheduler, job_args)
    if job.context_version:
        ctx_key = job.get_context_cache_key()
        self.print("Loading job context from %s" % ctx_key)
        ctx = cache.get(ctx_key, version=job.context_version)
        if not ctx:
            self.print("Job context is empty")
        job.load_context(ctx)
    sample = 1 if self.trace else 0
    with Span(sample=sample):
        job.dereference()
        job.handler()
    if sample:
        spans = get_spans()
        self.print("Spans:")
        self.print("\n".join(spans))
    if scheduler.service.metrics:
        self.print("Collected CH data:")
        for f in scheduler.service.metrics:
            # Format the message here, like every other self.print call in
            # this method, instead of passing the value as an extra argument
            self.print("Fields: %s" % f)
            self.print("\n".join(scheduler.service.metrics[f]))
    if job.context_version and job.context:
        # ctx_key was set above under the same job.context_version condition
        self.print("Saving job context to %s" % ctx_key)
        scheduler.cache_set(key=ctx_key, value=job.context, version=job.context_version)
        scheduler.apply_cache_ops()
        time.sleep(3)
def run(self):
    """
    Execute the job once and schedule what happens next.

    Steps, all inside one Span: drop the job when retries are exceeded,
    dereference it, call ``self.handler`` (waiting on the result when it is
    a future), then either schedule the next run or, when the handler
    raised ``RetryAfter``, schedule a retry after the requested delay.

    This is a generator: it yields the handler result when that is a future.
    """
    with Span(server=self.scheduler.name, service=self.attrs[self.ATTR_CLASS], sample=self.attrs.get(self.ATTR_SAMPLE, 0), in_label=self.attrs.get(self.ATTR_KEY, "")):
        self.start_time = perf_counter()
        if self.is_retries_exceeded():
            self.logger.info("[%s|%s] Retries exceeded. Remove job", self.name, self.attrs[Job.ATTR_ID])
            self.remove_job()
            return
        self.logger.info(
            "[%s] Starting at %s (Lag %.2fms)", self.name, self.scheduler.scheduler_id, total_seconds(datetime.datetime.now() - self.attrs[self.ATTR_TS]) * 1000.0)
        # Run handler
        # Status stays E_EXCEPTION unless one of the branches below replaces it
        status = self.E_EXCEPTION
        # delay is set only by RetryAfter; None selects the normal scheduling path
        delay = None
        with Span(service="job.dereference"):
            try:
                ds = self.dereference()
                can_run = self.can_run()
            except Exception as e:
                self.logger.error("Unknown error during dereference: %s", e)
                # ds = None (as opposed to another falsy value) skips both
                # branches below, leaving status at E_EXCEPTION
                ds = None
                can_run = False
        if ds:
            with Span(service="job.run"):
                if can_run:
                    try:
                        data = self.attrs.get(self.ATTR_DATA) or {}
                        result = self.handler(**data)
                        if tornado.gen.is_future(result):
                            # Wait for future
                            result = yield result
                        status = self.E_SUCCESS
                    except RetryAfter as e:
                        self.logger.info("Retry after %ss: %s", e.delay, e)
                        status = self.E_RETRY
                        delay = e.delay
                    except self.failed_exceptions:
                        status = self.E_FAILED
                    except Exception:
                        error_report()
                        status = self.E_EXCEPTION
                else:
                    self.logger.info("Deferred")
                    status = self.E_DEFERRED
        elif ds is not None:
            # Falsy but not None: dereference() ran without raising and reported failure
            self.logger.info("Cannot dereference")
            status = self.E_DEREFERENCE
        self.duration = perf_counter() - self.start_time
        self.logger.info("Completed. Status: %s (%.2fms)", self.STATUS_MAP.get(status, status), self.duration * 1000)
        # Schedule next run
        if delay is None:
            with Span(service="job.schedule_next"):
                self.schedule_next(status)
        else:
            with Span(service="job.schedule_retry"):
                # Retry
                # Context is passed along only when the job declares a context version
                if self.context_version:
                    ctx = self.context or None
                    ctx_key = self.get_context_cache_key()
                else:
                    ctx = None
                    ctx_key = None
                self.scheduler.set_next_run(
                    self.attrs[self.ATTR_ID], status=status, ts=datetime.datetime.now() + datetime.timedelta(seconds=delay), duration=self.duration, context_version=self.context_version, context=ctx, context_key=ctx_key)
def post(self, *args, **kwargs):
    """
    Request is the list of
    {
        id: <managed object id>,
        script: <script name>,
        args: <arguments>
    }

    Each entry having both ``id`` and ``script`` is run via
    ``self.run_script`` and its result is written as a chunk as soon as
    it is available. Entries whose object is outside the user's access
    scope get an "Access denied" chunk and are not run.

    :param args: Accepted but not used here
    :param kwargs: Accepted but not used here
    :return:
    """
    metrics["mrt_requests"] += 1
    # Parse request
    req = ujson.loads(self.request.body)
    # Disable nginx proxy buffering
    self.set_header("X-Accel-Buffering", "no")
    # Object ids
    ids = {int(d["id"]) for d in req if "id" in d and "script" in d}
    logger.info(
        "Run task on parralels: %d (Max concurrent %d), for User: %s",
        len(req),
        self.CONCURRENCY,
        self.current_user,
    )
    # Check access
    qs = ManagedObject.objects.filter(id__in=list(ids))
    if not self.current_user.is_superuser:
        adm_domains = UserAccess.get_domains(self.current_user)
        qs = qs.filter(administrative_domain__in=adm_domains)
    # From here on ids maps each permitted object id to its bi_id
    ids = dict(qs.values_list("id", "bi_id"))
    with Span(
        sample=int(config.mrt.enable_command_logging),
        server="MRT",
        service="post",
        client=self.current_user,
        in_label=req,
    ) as span:
        if self.service.use_telemetry:
            logger.info(
                "[%s] Enable telemetry for task, user: %s", span.span_id, self.current_user
            )
        futures = []
        for d in req:
            if "id" not in d or "script" not in d:
                continue
            oid = int(d["id"])
            if oid not in ids:
                yield self.write_chunk({"id": str(d["id"]), "error": "Access denied"})
                metrics["mrt_access_denied"] += 1
                # Do not schedule the script for an object the user cannot access
                continue
            if len(futures) >= config.mrt.max_concurrency:
                # Concurrency limit reached: wait for one task and free its slot
                wi = tornado.gen.WaitIterator(*futures)
                r = yield next(wi)
                del futures[wi.current_index]
                yield self.write_chunk(r)
            futures.append(
                self.run_script(
                    oid,
                    d["script"],
                    d.get("args"),
                    span_id=span.span_id,
                    bi_id=ids.get(oid),
                )
            )
        # Wait for rest
        wi = tornado.gen.WaitIterator(*futures)
        while not wi.done():
            r = yield next(wi)
            yield self.write_chunk(r)
    logger.info("Done")
def handler(self, whois_route=None, **kwargs):
    """
    Run ``PrefixCheck`` for this job inside an unsampled Span.

    :param whois_route: When truthy, stored as the ``whois_route`` artefact
        before the check runs
    :param kwargs: Accepted but not used here
    """
    if whois_route:
        self.set_artefact("whois_route", whois_route)
    with Span(sample=0):
        prefix_check = PrefixCheck(self)
        prefix_check.run()
def handler(self):
    # type: () -> Tuple[int, Dict]
    """
    Decode and validate the JSON request, resolve both ends of the path
    and trace the paths between them.

    :return: ``(http_status, reply)`` tuple. The reply always has a
        ``status`` flag; failures carry ``error``, success carries ``paths``;
        ``time`` is the duration of the path search in seconds.
    """
    # Decode request
    try:
        req = ujson.loads(self.request.body)
    except ValueError:
        return 400, {"status": False, "error": "Cannot decode JSON"}
    # Validate
    try:
        req = Request.clean(req)
    except ValueError as e:
        return 400, {"status": False, "error": "Bad request: %s" % e}
    # Find start of path
    try:
        with Span(in_label="start_of_path"):
            start, start_iface = self.get_object_and_interface(**req["from"])
    except ValueError as e:
        return 404, {"status": False, "error": "Failed to find start of path: %s" % e}
    # Find end of path
    if "level" in req["to"]:
        goal = ManagedObjectLevelGoal(req["to"]["level"])
        end_iface = None
    else:
        try:
            with Span(in_label="end_of_path"):
                end, end_iface = self.get_object_and_interface(**req["to"])
            goal = ManagedObjectGoal(end)
        except ValueError as e:
            return 404, {"status": False, "error": "Failed to find end of path: %s" % e}
    # Trace the path
    if req.get("config"):
        max_depth = req["config"]["max_depth"]
        n_shortest = req["config"]["n_shortest"]
    else:
        max_depth = MAX_DEPTH_DEFAULT
        n_shortest = N_SHORTEST_DEFAULT
    error = None
    with Span(in_label="find_path"):
        t0 = perf_counter()
        try:
            paths = list(
                self.iter_paths(
                    start,
                    start_iface,
                    goal,
                    end_iface,
                    constraints=self.get_constraints(
                        start, start_iface, req.get("constraints")
                    ),
                    max_depth=max_depth,
                    n_shortest=n_shortest,
                )
            )
        except ValueError as e:
            error = str(e)
        dt = perf_counter() - t0
    if error:
        # Report the saved message: the exception name is not available
        # outside its except clause on Python 3
        return 404, {"status": False, "error": error, "time": dt}
    return 200, {"status": True, "paths": paths, "time": dt}
def escalate(alarm_id, escalation_id, escalation_delay, *args, **kwargs):
    """
    Perform the escalation steps configured for an alarm at a given delay.

    For every escalation item matching *escalation_delay* and the alarm's
    administrative domain, severity, selector and time pattern, this may
    create a trouble ticket, promote it to a group ticket, comment on
    tickets of already escalated consequences and notify a group.
    If the alarm turns out to be closed afterwards, ``notify_close`` is called.

    :param alarm_id: Id passed to ``get_alarm``
    :param escalation_id: Id passed to ``AlarmEscalation.get_by_id``
    :param escalation_delay: Only items with exactly this ``delay`` are processed
    :param args: Accepted but not used here
    :param kwargs: Accepted but not used here
    """

    def log(message, *args):
        # Log to the service log and to the alarm's own log
        msg = message % args
        logger.info("[%s] %s", alarm_id, msg)
        alarm.log_message(msg, to_save=True)

    def summary_to_list(summary, model):
        # Turn summary items into dicts sorted by (display_order, -summary),
        # skipping unknown profiles and those with show_in_summary set to False
        r = []
        for k in summary:
            p = model.get_by_id(k.profile)
            if not p or getattr(p, "show_in_summary", True) is False:
                continue
            r.append(
                {
                    "profile": p.name,
                    "summary": k.summary,
                    "order": (getattr(p, "display_order", 100), -k.summary),
                }
            )
        return sorted(r, key=operator.itemgetter("order"))

    logger.info("[%s] Performing escalations", alarm_id)
    alarm = get_alarm(alarm_id)
    if alarm is None:
        logger.info("[%s] Missing alarm, skipping", alarm_id)
        metrics["escalation_missed_alarm"] += 1
        return
    if alarm.status == "C":
        logger.info("[%s] Alarm is closed, skipping", alarm_id)
        metrics["escalation_already_closed"] += 1
        return
    if alarm.root:
        log("[%s] Alarm is not root cause, skipping", alarm_id)
        metrics["escalation_alarm_is_not_root"] += 1
        return
    #
    escalation = AlarmEscalation.get_by_id(escalation_id)
    if not escalation:
        log("Escalation %s is not found, skipping", escalation_id)
        metrics["escalation_not_found"] += 1
        return
    if alarm.managed_object.tt_system:
        sample = alarm.managed_object.tt_system.telemetry_sample
    else:
        sample = PARENT_SAMPLE
    with Span(client="escalator", sample=sample):
        alarm.set_escalation_context()
        # Evaluate escalation chain
        mo = alarm.managed_object
        # Must be defined for the post-loop checks even when no item
        # reaches the TT creation branch
        tt_id = None
        for a in escalation.escalations:
            if a.delay != escalation_delay:
                continue  # Try other type
            # Check administrative domain
            if a.administrative_domain and a.administrative_domain.id not in alarm.adm_path:
                continue
            # Check severity
            if a.min_severity and alarm.severity < a.min_severity:
                continue
            # Check selector
            if a.selector and not SelectorCache.is_in_selector(mo, a.selector):
                continue
            # Check time pattern
            if a.time_pattern and not a.time_pattern.match(alarm.timestamp):
                continue
            # Render escalation message
            if not a.template:
                log("No escalation template, skipping")
                continue
            # Check global limits
            # @todo: Move into escalator service
            # @todo: Process per-ttsystem limits
            ets = datetime.datetime.now() - datetime.timedelta(seconds=config.escalator.ets)
            ae = ActiveAlarm._get_collection().count_documents({"escalation_ts": {"$gte": ets}})
            ae += ArchivedAlarm._get_collection().count_documents(
                {"escalation_ts": {"$gte": ets}}
            )
            if ae >= config.escalator.tt_escalation_limit:
                logger.error(
                    "Escalation limit exceeded (%s/%s). Skipping",
                    ae,
                    config.escalator.tt_escalation_limit,
                )
                metrics["escalation_throttled"] += 1
                alarm.set_escalation_error(
                    "Escalation limit exceeded (%s/%s). Skipping"
                    % (ae, config.escalator.tt_escalation_limit)
                )
                return
            # Check whether consequences has escalations
            cons_escalated = sorted(
                alarm.iter_escalated(), key=operator.attrgetter("timestamp")
            )
            affected_objects = sorted(alarm.iter_affected(), key=operator.attrgetter("name"))
            #
            segment = alarm.managed_object.segment
            if segment.is_redundant:
                uplinks = alarm.managed_object.data.uplinks
                lost_redundancy = len(uplinks) > 1
                affected_subscribers = summary_to_list(
                    segment.total_subscribers, SubscriberProfile
                )
                affected_services = summary_to_list(segment.total_services, ServiceProfile)
            else:
                lost_redundancy = False
                affected_subscribers = []
                affected_services = []
            # Template rendering context
            ctx = {
                "alarm": alarm,
                "affected_objects": affected_objects,
                "cons_escalated": cons_escalated,
                "total_objects": summary_to_list(alarm.total_objects, ManagedObjectProfile),
                "total_subscribers": summary_to_list(alarm.total_subscribers, SubscriberProfile),
                "total_services": summary_to_list(alarm.total_services, ServiceProfile),
                "tt": None,
                "lost_redundancy": lost_redundancy,
                "affected_subscribers": affected_subscribers,
                "affected_services": affected_services,
            }
            # Escalate to TT
            if a.create_tt and mo.can_escalate():
                tt_id = None
                if alarm.escalation_tt:
                    log("Already escalated with TT #%s", alarm.escalation_tt)
                else:
                    pre_reason = escalation.get_pre_reason(mo.tt_system)
                    active_maintenance = Maintenance.get_object_maintenance(mo)
                    if active_maintenance:
                        for m in active_maintenance:
                            log(
                                "Object is under maintenance: %s (%s-%s)",
                                m.subject,
                                m.start,
                                m.stop,
                            )
                        metrics["escalation_stop_on_maintenance"] += 1
                    elif pre_reason is not None:
                        subject = a.template.render_subject(**ctx)
                        body = a.template.render_body(**ctx)
                        logger.debug(
                            "[%s] Escalation message:\nSubject: %s\n%s", alarm_id, subject, body
                        )
                        log("Creating TT in system %s", mo.tt_system.name)
                        tts = mo.tt_system.get_system()
                        try:
                            try:
                                tt_id = tts.create_tt(
                                    queue=mo.tt_queue,
                                    obj=mo.tt_system_id,
                                    reason=pre_reason,
                                    subject=subject,
                                    body=body,
                                    login="******",
                                    timestamp=alarm.timestamp,
                                )
                            except TemporaryTTError as e:
                                metrics["escalation_tt_retry"] += 1
                                log("Temporary error detected. Retry after %ss", RETRY_TIMEOUT)
                                mo.tt_system.register_failure()
                                Job.retry_after(get_next_retry(), str(e))
                            ctx["tt"] = "%s:%s" % (mo.tt_system.name, tt_id)
                            alarm.escalate(
                                ctx["tt"],
                                close_tt=a.close_tt,
                                wait_tt=ctx["tt"] if a.wait_tt else None,
                            )
                            if tts.promote_group_tt and a.promote_group_tt:
                                # Create group TT
                                log("Promoting to group tt")
                                gtt = tts.create_group_tt(tt_id, alarm.timestamp)
                                # Append affected objects
                                for ao in alarm.iter_affected():
                                    if ao.can_escalate(True):
                                        if ao.tt_system == mo.tt_system:
                                            log(
                                                "Appending object %s to group tt %s",
                                                ao.name,
                                                gtt,
                                            )
                                            try:
                                                tts.add_to_group_tt(gtt, ao.tt_system_id)
                                            except TTError as e:
                                                alarm.set_escalation_error(
                                                    "[%s] %s" % (mo.tt_system.name, e)
                                                )
                                        else:
                                            log(
                                                "Cannot append object %s to group tt %s: Belongs to other TT system",
                                                ao.name,
                                                gtt,
                                            )
                                    else:
                                        log(
                                            "Cannot append object %s to group tt %s: Escalations are disabled",
                                            ao.name,
                                            gtt,
                                        )
                            metrics["escalation_tt_create"] += 1
                        except TTError as e:
                            log("Failed to create TT: %s", e)
                            metrics["escalation_tt_fail"] += 1
                            alarm.log_message("Failed to escalate: %s" % e, to_save=True)
                            alarm.set_escalation_error("[%s] %s" % (mo.tt_system.name, e))
                    else:
                        log("Cannot find pre reason")
                        metrics["escalation_tt_fail"] += 1
                if tt_id and cons_escalated:
                    # Notify consequences
                    for ca in cons_escalated:
                        c_tt_name, c_tt_id = ca.escalation_tt.split(":")
                        cts = TTSystem.get_by_name(c_tt_name)
                        if cts:
                            tts = cts.get_system()
                            try:
                                log("Appending comment to TT %s", tt_id)
                                tts.add_comment(
                                    c_tt_id, body="Covered by TT %s" % tt_id, login="******"
                                )
                                metrics["escalation_tt_comment"] += 1
                            except NotImplementedError:
                                log(
                                    "Cannot add comment to %s: Feature not implemented",
                                    ca.escalation_tt,
                                )
                                metrics["escalation_tt_comment_fail"] += 1
                            except TTError as e:
                                log("Failed to add comment to %s: %s", ca.escalation_tt, e)
                                metrics["escalation_tt_comment_fail"] += 1
                        else:
                            log(
                                "Failed to add comment to %s: Invalid TT system",
                                ca.escalation_tt,
                            )
                            metrics["escalation_tt_comment_fail"] += 1
            # Send notification
            if a.notification_group and mo.can_notify():
                subject = a.template.render_subject(**ctx)
                body = a.template.render_body(**ctx)
                logger.debug(
                    "[%s] Notification message:\nSubject: %s\n%s", alarm_id, subject, body
                )
                log("Sending notification to group %s", a.notification_group.name)
                a.notification_group.notify(subject, body)
                alarm.set_clear_notification(a.notification_group, a.clear_template)
                metrics["escalation_notify"] += 1
            #
            if a.stop_processing:
                logger.debug("Stopping processing")
                break
        # Re-read the alarm: it may have been closed while escalation was running
        nalarm = get_alarm(alarm_id)
        if nalarm and nalarm.status == "C":
            nalarm.log_message("Alarm has been closed during escalation. Try to deescalate")
            logger.info(
                "[%s] Alarm has been closed during escalation. Try to deescalate", alarm.id
            )
            metrics["escalation_closed_while_escalated"] += 1
            if tt_id and not nalarm.escalation_tt:
                nalarm.escalation_ts = datetime.datetime.now()
                nalarm.escalation_tt = "%s:%s" % (mo.tt_system.name, tt_id)
                nalarm.save()
            if not nalarm.escalation_close_ts and not nalarm.escalation_close_error:
                notify_close(
                    alarm_id=alarm_id,
                    tt_id=nalarm.escalation_tt,
                    subject="Closing",
                    body="Closing",
                    notification_group_id=alarm.clear_notification_group.id
                    if alarm.clear_notification_group
                    else None,
                    close_tt=alarm.close_tt,
                )
        elif nalarm and nalarm.status == "A" and not nalarm.escalation_tt and tt_id:
            # Compare the status field; comparing the alarm object itself
            # with "A" can never match
            logger.error("[%s] Alarm without escalation TT: %s", alarm.id, tt_id)
        logger.info("[%s] Escalations loop end", alarm_id)
def notify_close(alarm_id, tt_id, subject, body, notification_group_id, close_tt=False):
    """
    Close (or comment on) the trouble ticket of an alarm and send the
    closing notification.

    :param alarm_id: Id passed to ``get_alarm``; the alarm may be missing
    :param tt_id: Ticket reference in ``<tt system name>:<ticket id>`` form;
        ticket processing is skipped when falsy
    :param subject: Subject for the ticket update and the notification
    :param body: Body for the ticket update and the notification
    :param notification_group_id: Id passed to ``NotificationGroup.get_by_id``;
        no notification is sent when falsy
    :param close_tt: Close the ticket when true, otherwise append a comment
    """

    def log(message, *args):
        msg = message % args
        logger.info("[%s] %s", alarm_id, msg)

    if tt_id:
        alarm = get_alarm(alarm_id)
        # The alarm may be missing (every later use is guarded by
        # `if alarm`), so guard this call as well
        if alarm:
            alarm.set_escalation_close_ctx()
        if (
            alarm
            and alarm.status == "C"
            and (alarm.escalation_close_ts or alarm.escalation_close_error)
        ):
            log("Alarm is already deescalated")
            metrics["escalation_already_deescalated"] += 1
            return
        with Span(client="escalator", sample=PARENT_SAMPLE):
            c_tt_name, c_tt_id = tt_id.split(":")
            cts = TTSystem.get_by_name(c_tt_name)
            if cts:
                tts = cts.get_system()
                if close_tt:
                    # Close tt
                    try:
                        log("Closing TT %s", tt_id)
                        tts.close_tt(c_tt_id, subject=subject, body=body, login="******")
                        metrics["escalation_tt_close"] += 1
                        if alarm:
                            alarm.close_escalation()
                    except TemporaryTTError as e:
                        log("Temporary error detected while closing tt %s: %s", tt_id, e)
                        metrics["escalation_tt_close_retry"] += 1
                        # Register the failure before asking for a retry, the
                        # same order escalate() uses for temporary TT errors
                        cts.register_failure()
                        Job.retry_after(get_next_retry(), str(e))
                        # NOTE(review): if Job.retry_after raises, the lines
                        # below never run - confirm whether that is intended
                        if alarm:
                            alarm.set_escalation_close_error(
                                "[%s] %s" % (alarm.managed_object.tt_system.name, e)
                            )
                    except TTError as e:
                        log("Failed to close tt %s: %s", tt_id, e)
                        metrics["escalation_tt_close_fail"] += 1
                        if alarm:
                            alarm.set_escalation_close_error(
                                "[%s] %s" % (alarm.managed_object.tt_system.name, e)
                            )
                else:
                    # Append comment to tt
                    try:
                        log("Appending comment to TT %s", tt_id)
                        tts.add_comment(c_tt_id, subject=subject, body=body, login="******")
                        metrics["escalation_tt_comment"] += 1
                    except TTError as e:
                        log("Failed to add comment to %s: %s", tt_id, e)
                        metrics["escalation_tt_comment_fail"] += 1
            else:
                log("Failed to add comment to %s: Invalid TT system", tt_id)
                metrics["escalation_tt_comment_fail"] += 1
    if notification_group_id:
        notification_group = NotificationGroup.get_by_id(notification_group_id)
        if notification_group:
            log("Sending notification to group %s", notification_group.name)
            notification_group.notify(subject, body)
            metrics["escalation_notify"] += 1
        else:
            log("Invalid notification group %s", notification_group_id)
def post(self, *args, **kwargs):
    """
    Handle one JSON RPC call: decode the request, resolve the API method,
    invoke it inside a Span and write the JSON reply.

    The method runs through ``executor.submit`` when it declares an
    ``executor`` attribute, otherwise it is called directly; a future
    result is waited for. A ``Redirect`` result is answered with status 307.

    :param args: Accepted but not used here
    :param kwargs: Accepted but not used here
    """
    request_headers = self.request.headers
    span_ctx = request_headers.get("X-NOC-Span-Ctx", 0)
    span_id = request_headers.get("X-NOC-Span", 0)
    # Sample only when the caller supplied both tracing headers
    sample = 1 if span_ctx and span_id else 0
    # Parse JSON
    try:
        req = ujson.loads(self.request.body)
    except ValueError as e:
        self.api_error(e)
        raise tornado.gen.Return()
    # Parse request
    call_id = req.get("id")
    params = req.get("params", [])
    method = req.get("method")
    if not method or not hasattr(self.api_class, method):
        self.api_error("Invalid method: '%s'" % method, id=call_id)
        raise tornado.gen.Return()
    api = self.api_class(self.service, self.request, self)
    target = getattr(api, method)
    # Only methods flagged with a truthy `api` attribute may be called
    if not getattr(target, "api", False):
        self.api_error("Method is not callable: '%s'" % method, id=call_id)
        raise tornado.gen.Return()
    calling_service = self.request.headers.get(self.CALLING_SERVICE_HEADER, "unknown")
    self.service.logger.debug(
        "[RPC call from %s] %s.%s(%s)", calling_service, api.name, method, params
    )
    in_label = None
    if config.features.forensic:
        label_getter = getattr(api, "%s_get_label" % method, None)
        if label_getter:
            in_label = label_getter(*params)
    with Span(
        server=self.service.name,
        service="api.%s" % method,
        sample=sample,
        parent=span_id,
        context=span_ctx,
        in_label=in_label,
    ) as span:
        try:
            if getattr(target, "executor", ""):
                # Threadpool version
                pool = self.service.get_executor(target.executor)
                result = pool.submit(target, *params)
            else:
                # Serialized version
                result = target(*params)
            if tornado.gen.is_future(result):
                result = yield result
            if isinstance(result, Redirect):
                # Redirect protocol extension
                self.set_status(307, "Redirect")
                self.set_header("Location", result.location)
                reply = {"id": call_id, "method": result.method, "params": result.params}
            else:
                # Dump output
                reply = {"id": call_id, "error": None, "result": result}
            self.write(ujson.dumps(reply))
        except NOCError as e:
            span.error_code = e.code
            span.error_text = str(e)
            self.api_error("Failed: %s" % e, id=call_id, code=e.code)
        except Exception as e:
            error_report()
            span.error_code = ERR_UNKNOWN
            span.error_text = str(e)
            self.api_error("Failed: %s" % e, id=call_id)
def handle(self, script, object_name, arguments, pretty, yaml_o, use_snmp, access_preference, update_spec, beef_output, *args, **options):
    """
    Run a script against a managed object and print the result.

    :param script: Sequence whose first item is the script name; the object's
        profile name is prepended when the name contains no dot
    :param object_name: Sequence whose first item identifies the object
    :param arguments: Raw script arguments, parsed by ``get_script_args``
    :param pretty: Print the result with ``pprint``
    :param yaml_o: Dump the result as YAML to stdout (when *pretty* is false)
    :param use_snmp: When false, SNMP credentials and capability are removed
    :param access_preference: Stored in the credentials when truthy
    :param update_spec: When truthy, passed to ``self.update_spec``
    :param beef_output: When truthy, the ``get_beef`` script is run and its
        data is written to this path
    :param args: Ignored; the name is reused for the parsed script arguments
    :param options: Accepted but not used here
    """
    # Get object
    obj = self.get_object(object_name[0])
    # Build credentials
    credentials = self.get_credentials(obj)
    # Parse arguments
    args = self.get_script_args(arguments)
    # Load script
    script = script[0]
    if "." not in script:
        script = "%s.%s" % (obj.profile.name, script)
    script_class = loader.get_script(script)
    if not script_class:
        # Report the requested name; script_class is always falsy here
        self.die("Failed to load script %s" % script)
    # Get capabilities
    caps = obj.get_caps()
    # Strip SNMP access when it is not to be used
    if not use_snmp:
        if "snmp_ro" in credentials:
            del credentials["snmp_ro"]
        if "SNMP" in caps:
            del caps["SNMP"]
    if access_preference:
        credentials["access_preference"] = access_preference
    # Get version info
    if obj.version:
        version = {
            "vendor": obj.vendor.name if obj.vendor else None,
            "platform": obj.platform.name if obj.platform else None,
            "version": obj.version.version if obj.version else None,
            "image": obj.software_image if obj.software_image else None,
        }
    else:
        version = None
    # Run script
    service = ServiceStub(pool=obj.pool.name)
    scr = script_class(
        service=service,
        credentials=credentials,
        capabilities=caps,
        args=args,
        version=version,
        timeout=3600,
        name=script,
    )
    # Sample the span only when a spec or beef is produced from this run
    span_sample = 1 if update_spec or beef_output else 0
    with Span(sample=span_sample):
        result = scr.run()
    if pretty:
        pprint.pprint(result)
    elif yaml_o:
        import sys

        yaml.dump(result, sys.stdout)
    else:
        self.stdout.write("%s\n" % result)
    if update_spec:
        self.update_spec(update_spec, scr)
    if beef_output:
        spec = self.update_spec(update_spec, scr, save=False)
        beef_script_name = "%s.%s" % (obj.profile.name, "get_beef")
        bef_script_class = loader.get_script(beef_script_name)
        beef_scr = bef_script_class(
            service=service,
            credentials=credentials,
            capabilities=caps,
            args={"spec": spec.get_spec_request()},
            version=version,
            timeout=3600,
            name=beef_script_name,
        )
        bdata = beef_scr.run()
        beef = Beef.from_json(bdata)
        storage = StorageStub("osfs:///")
        sdata = beef.get_data(decode=True)
        with storage.open_fs() as fs:
            # NOTE(review): bytes(<str>) without an encoding raises TypeError
            # on Python 3 - confirm which interpreter this command targets
            fs.writebytes(beef_output, bytes(yaml.dump(sdata)))
def handler(self, **kwargs):
    """
    Run ``MACDiscoveryCheck`` for this job inside an unsampled Span.

    :param kwargs: Accepted but not used here
    """
    with Span(sample=0):
        mac_check = MACDiscoveryCheck(self)
        mac_check.run()