class PingService(Service):
    """
    Pooled service which periodically pings configured addresses,
    tracks up/down status of every probe and spools status-change
    events to the ``events.<pool>`` topic.

    Probe settings are received via the "cfgping" datastream and
    are applied through update_probe()/delete_probe().
    """
    name = "ping"
    # NOTE(review): confirm this attribute is still required alongside
    # the slot-based sharding performed in on_activate()
    leader_group_name = "ping-%(pool)s"
    pooled = True
    require_nsq_writer = True
    process_name = "noc-%(name).10s-%(pool).5s"

    # Event class to send, keyed by ping result (True means reply received)
    PING_CLS = {
        True: "NOC | Managed Object | Ping OK",
        False: "NOC | Managed Object | Ping Failed"
    }

    def __init__(self):
        super(PingService, self).__init__()
        self.messages = []  # Spooled events, flushed by send_messages()
        self.send_callback = None
        self.mappings_callback = None
        self.metrics_callback = None
        self.probes = {}  # mo id -> ProbeSetting
        self.ping = None
        self.is_throttled = False
        self.slot_number = 0
        self.total_slots = 0

    @tornado.gen.coroutine
    def on_activate(self):
        """
        Acquire a slot, open ping sockets, start the periodic message
        sender and begin tracking object mappings
        """
        # Acquire slot
        self.slot_number, self.total_slots = yield self.acquire_slot()
        if self.total_slots > 1:
            self.logger.info("Enabling distributed mode: Slot %d/%d",
                             self.slot_number, self.total_slots)
        else:
            self.logger.info("Enabling standalone mode")
        self.logger.info("Setting nice level to -20")
        try:
            os.nice(-20)
        except OSError as e:
            # Best effort: continue with the current priority
            self.logger.info("Cannot set nice level to -20: %s", e)
        #
        metrics["down_objects"] = 0
        # Open ping sockets
        self.ping = Ping(self.ioloop, tos=config.ping.tos)
        # Send spooled messages every 250ms
        self.logger.debug("Stating message sender task")
        self.send_callback = tornado.ioloop.PeriodicCallback(
            self.send_messages,
            # @fixme have to be configured
            250,
            self.ioloop
        )
        self.send_callback.start()
        # Start tracking changes
        self.ioloop.add_callback(self.get_object_mappings)

    def get_mon_data(self):
        """
        Extend base monitoring data with the current throttling state
        """
        r = super(PingService, self).get_mon_data()
        r["throttled"] = self.is_throttled
        return r

    def register_message(self, object, timestamp, data):
        """
        Spool message to be sent

        :param object: Managed object id (ping_check passes ps.id)
        :param timestamp: Event timestamp (ping_check passes time.time())
        :param data: Event body
        """
        self.messages.append({
            "ts": timestamp,
            "object": object,
            "data": data
        })

    @tornado.gen.coroutine
    def send_messages(self):
        """
        Periodic task to send collected messages to classifier
        """
        if self.messages:
            # Swap the spool first, so messages registered later
            # go to the next batch
            messages, self.messages = self.messages, []
            self.mpub("events.%s" % config.pool, messages)

    @tornado.gen.coroutine
    def get_object_mappings(self):
        """
        Subscribe and track datastream changes
        """
        # Register RPC aliases
        client = PingDataStreamClient("cfgping", service=self)
        # Track stream changes
        while True:
            self.logger.info("Starting to track object mappings")
            try:
                yield client.query(
                    limit=config.ping.ds_limit,
                    filters=[
                        "pool(%s)" % config.pool,
                        "shard(%d,%d)" % (self.slot_number, self.total_slots)
                    ],
                    block=1
                )
            except NOCError as e:
                self.logger.info("Failed to get object mappings: %s", e)
                # Back off before retrying
                yield tornado.gen.sleep(1)

    def update_probe(self, data):
        """
        Create the probe or change the existing one

        :param data: Probe settings, must contain "id" key
        """
        if data["id"] in self.probes:
            self._change_probe(data)
        else:
            self._create_probe(data)

    def delete_probe(self, id):
        """
        Stop and remove the probe. Unknown ids are silently ignored

        :param id: Probe (managed object) id
        """
        if id not in self.probes:
            return
        ps = self.probes[id]
        self.logger.info("Delete probe: %s", ps.address)
        ps.task.stop()
        ps.task = None
        del self.probes[id]
        metrics["ping_probe_delete"] += 1
        # Probe was counted as down only when its status is known and False
        if ps.status is not None and not ps.status:
            metrics["down_objects"] -= 1
        metrics["ping_objects"] = len(self.probes)

    def _create_probe(self, data):
        """
        Create new ping probe
        """
        self.logger.info("Create probe: %s (%ds)", data["address"], data["interval"])
        ps = ProbeSetting(**data)
        self.probes[data["id"]] = ps
        # Callback time is passed as interval * 1000
        pt = PeriodicOffsetCallback(
            functools.partial(self.ping_check, ps),
            ps.interval * 1000
        )
        ps.task = pt
        pt.start()
        metrics["ping_probe_create"] += 1
        metrics["ping_objects"] = len(self.probes)

    def _change_probe(self, data):
        """
        Apply changed settings to the existing probe in place
        """
        self.logger.info("Update probe: %s (%ds)", data["address"], data["interval"])
        ps = self.probes[data["id"]]
        if ps.interval != data["interval"]:
            ps.task.set_callback_time(data["interval"] * 1000)
        if ps.address != data["address"]:
            self.logger.info("Changing address: %s -> %s", ps.address, data["address"])
            ps.address = data["address"]
        ps.update(**data)
        metrics["ping_probe_update"] += 1
        metrics["ping_objects"] = len(self.probes)

    @tornado.gen.coroutine
    def ping_check(self, ps):
        """
        Perform ping check and set result

        Updates check counters, maintains the down objects counter and
        throttling state, spools a status event when the status to be sent
        differs from the last sent one, and registers RTT/attempts metrics.

        :param ps: ProbeSetting to check
        """
        if ps.id not in self.probes:
            return
        address = ps.address
        t0 = time.time()
        metrics["ping_check_total"] += 1
        if ps.time_cond:
            dt = datetime.datetime.fromtimestamp(t0)
            # SECURITY(review): time_cond is evaluated as a Python expression
            # with the current local time bound to T. It must come
            # from a trusted source only
            if not eval(ps.time_cond, {"T": dt}):
                metrics["ping_check_skips"] += 1
                return
        rtt, attempts = yield self.ping.ping_check_rtt(
            ps.address,
            policy=ps.policy,
            size=ps.size,
            count=ps.count,
            timeout=ps.timeout
        )
        s = rtt is not None
        if s:
            metrics["ping_check_success"] += 1
        else:
            metrics["ping_check_fail"] += 1
        # Probe may be deleted or replaced while the ping was in flight.
        # Drop the stale result, otherwise down_objects drifts and an event
        # is generated for an object which is no longer served
        if self.probes.get(ps.id) is not ps:
            return
        if ps and s != ps.status:
            if s:
                # Only probes with known status were counted as down.
                # First successful check (status is None) must not
                # drive the counter below zero
                if ps.status is not None:
                    metrics["down_objects"] -= 1
            else:
                metrics["down_objects"] += 1
            total_objects = float(metrics["ping_objects"])
            if config.ping.throttle_threshold and total_objects > 0:
                # Process throttling
                down_ratio = float(metrics["down_objects"]) * 100.0 / total_objects
                if self.is_throttled:
                    restore_ratio = (config.ping.restore_threshold or
                                     config.ping.throttle_threshold)
                    if down_ratio <= restore_ratio:
                        self.logger.info(
                            "Leaving throttling mode (%s%% <= %s%%)",
                            down_ratio, restore_ratio
                        )
                        self.is_throttled = False
                        # @todo: Send unthrottling message
                elif down_ratio > config.ping.throttle_threshold:
                    self.logger.info(
                        "Entering throttling mode (%s%% > %s%%)",
                        down_ratio, config.ping.throttle_threshold
                    )
                    self.is_throttled = True
                    # @todo: Send throttling message
            ts = " (Throttled)" if self.is_throttled else ""
            self.logger.info("[%s] Changing status to %s%s", address, s, ts)
            ps.status = s
        # Events are suppressed while throttled; sent_status is left
        # untouched, so the change is reported after throttling ends
        if ps and not self.is_throttled and s != ps.sent_status:
            self.register_message(
                ps.id,
                t0,
                {
                    "source": "system",
                    "$event": {
                        "class": self.PING_CLS[s],
                        "vars": {}
                    }
                }
            )
            ps.sent_status = s
        self.logger.debug("[%s] status=%s rtt=%s", address, s, rtt)
        # Send RTT and attempts metrics
        to_report_rtt = rtt is not None and ps.report_rtt
        if (to_report_rtt or ps.report_attempts) and ps.bi_id:
            lt = time.localtime(t0)
            # First item is joined with the column names into a dotted key;
            # presumably it is the target table name - TODO confirm
            fields = ["ping", "date", "ts", "managed_object"]
            values = [
                time.strftime("%Y-%m-%d", lt),
                time.strftime("%Y-%m-%d %H:%M:%S", lt),
                str(ps.bi_id)
            ]
            if to_report_rtt:
                fields += ["rtt"]
                # Presumably seconds -> microseconds - TODO confirm rtt units
                values += [str(int(rtt * 1000000))]
            if ps.report_attempts:
                fields += ["attempts"]
                values += [str(attempts)]
            self.register_metrics(".".join(fields), ["\t".join(values)])
class Command(BaseCommand):
    """
    Ping IPv4 addresses given on the command line and/or read from files,
    using a pool of concurrent workers. Prints RTT or FAIL per address.
    """
    def add_arguments(self, parser):
        """
        Declare command-line arguments
        """
        parser.add_argument(
            "--in",
            action="append",
            dest="input",
            help="File with addresses"
        )
        parser.add_argument(
            "--jobs",
            action="store",
            type=int,
            default=100,
            dest="jobs",
            help="Concurrent jobs"
        )
        parser.add_argument(
            "addresses",
            nargs=argparse.REMAINDER,
            help="Object name"
        )

    def handle(self, input, addresses, jobs, *args, **options):
        """
        Collect addresses, start the workers and wait until all
        addresses are processed

        :param input: List of file names with one address per line, or None
        :param addresses: Addresses passed directly on the command line
        :param jobs: Number of concurrent ping workers
        """
        # Direct addresses, everything which is not IPv4 is silently dropped
        self.addresses = set(a for a in addresses if is_ipv4(a))
        # Read addresses from files
        for fn in input or []:
            try:
                with open(fn) as f:
                    for line in f:
                        line = line.strip()
                        if is_ipv4(line):
                            self.addresses.add(line)
            except (IOError, OSError) as e:
                # open() raises IOError on Python 2,
                # which is not a subclass of OSError there
                self.die("Cannot read file %s: %s\n" % (fn, e))
        # Ping
        if config.features.use_uvlib:
            from tornaduv import UVLoop
            self.stderr.write("Using libuv\n")
            tornado.ioloop.IOLoop.configure(UVLoop)
        self.ioloop = IOLoop.current()
        self.ping = Ping(io_loop=self.ioloop)
        # At least one worker is required, otherwise nobody drains
        # the queue and ping_task() never finishes
        self.jobs = max(jobs, 1)
        self.queue = tornado.queues.Queue(self.jobs)
        for _ in range(self.jobs):
            self.ioloop.spawn_callback(self.ping_worker)
        self.ioloop.run_sync(self.ping_task)

    @tornado.gen.coroutine
    def ping_task(self):
        """
        Feed addresses to the workers, then stop them and wait for completion
        """
        for a in self.addresses:
            yield self.queue.put(a)
        # One None sentinel per worker to make it exit
        for _ in range(self.jobs):
            yield self.queue.put(None)
        yield self.queue.join()

    @tornado.gen.coroutine
    def ping_worker(self):
        """
        Take addresses from the queue and ping them until None is received
        """
        while True:
            a = yield self.queue.get()
            if a:
                rtt, attempts = yield self.ping.ping_check_rtt(a, count=1, timeout=1000)
                # None means no reply; zero RTT is still a successful reply
                if rtt is not None:
                    self.stdout.write("%s %.2fms\n" % (a, rtt * 1000))
                else:
                    self.stdout.write("%s FAIL\n" % a)
            # Sentinel must be acknowledged too, so queue.join() may finish
            self.queue.task_done()
            if not a:
                break