def grok_graphite_data(self, data):
    """Parse a raw graphite plaintext payload ("<metric> <value> <timestamp>\n"...).

    Each metric is hashed to find the 'ts' group node that owns it:
    - if we own it, the value is stored in our local time-series backend;
    - otherwise the original line is stacked and bulk-forwarded over UDP
      to the owner's graphite port.

    :param data: raw payload as received from the graphite listener
    """
    STATS.incr('ts.graphite.grok.data', 1)
    # owner node uuid -> list of raw lines to relay to it
    forwards = {}
    for line in data.splitlines():
        elts = line.split(' ')
        elts = [s.strip() for s in elts if s.strip()]
        if len(elts) != 3:
            # Malformed line: skip it but keep processing the rest of the
            # payload. (A bare return here would also silently drop every
            # forward already stacked for the other nodes.)
            continue
        mname, value, timestamp = elts[0], elts[1], elts[2]
        # NOTE(review): sha1() on a str only works on Python 2 — on Python 3
        # mname would need encoding first; confirm target runtime.
        hkey = hashlib.sha1(mname).hexdigest()
        ts_node_manager = gossiper.find_group_node('ts', hkey)
        # if it's me that manage this key, I add it in my backend
        if ts_node_manager == gossiper.uuid:
            self.logger.debug("I am the TS node manager")
            try:
                timestamp = int(timestamp)
            except ValueError:
                # Bad timestamp: drop only this line, not the whole payload
                continue
            value = to_best_int_float(value)
            if value is None:
                continue
            tsmgr.tsb.add_value(timestamp, mname, value)
        # not me? stack a forwarder
        else:
            self.logger.debug("The node manager for this Ts is ", ts_node_manager)
            forwards.setdefault(ts_node_manager, []).append(line)

    for (uuid, lst) in forwards.items():
        node = gossiper.get(uuid)
        # maybe the node disapear? bail out, we are not lucky
        if node is None:
            continue
        # First pack the lines into ~1KB UDP packets
        packets = []
        buf = ''
        for line in lst:
            buf += line + '\n'
            if len(buf) > 1024:
                packets.append(buf)
                buf = ''
        if buf != '':
            packets.append(buf)
        # UDP relay; close the socket even if sendto blows up
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            for packet in packets:
                # do NOT use the node['port'], it's the internal communication, not the graphite one!
                sock.sendto(packet, (node['addr'], self.graphite_port))
        finally:
            sock.close()
def lookup_for_nodes(self, dom):
    """Answer a DNS question against the gossip cluster.

    The queried name must end with our managed domain; the remaining prefix
    is expected to be "<value>.<filter>.<zone>" where <filter> is either
    'group' or 'name'. Returns the list of matching node IP addresses
    (possibly empty).
    """
    self.logger.debug('Querying %s for managed domaine: %s' % (dom, self.domain))
    # We only serve names under the domain we manage
    if not self.domain.endswith(dom):
        self.logger.debug('Domain %s is not matching managed domain: %s' % (dom, self.domain))
        return []
    search = self.domain[:-len(dom)]
    # split into sname.service.datacenter
    self.logger.debug("Lookup for search %s" % search)
    parts = search.split('.', 2)
    if len(parts) != 3:
        self.logger.error('Bad query, not 3 dots in %s' % search)
        return []
    # NOTE: zone is currently ignored
    filter_value, filter_type, zone = parts
    # Filter type must be one of:
    # - group => look for a group
    # - name  => look for a name (or a display name)
    if filter_type not in ('group', 'name'):
        self.logger.error('This module do not manage this DNS query type: %s' % filter_type)
        return []
    self.logger.debug('Looking in %s nodes' % len(gossiper.nodes))
    if filter_type == 'group':
        matching_uuids = gossiper.find_group_nodes(filter_value)
    else:  # filter by name / display name
        matching_uuids = gossiper.find_nodes_by_name_or_display_name(filter_value)
    # Now resolve uuids to real nodes and collect their addresses
    addresses = []
    for node_uuid in matching_uuids:
        node = gossiper.get(node_uuid)
        if node is None:  # magic thread disapearance
            continue
        node_ip = self._get_node_ip(node)
        if node_ip is not None:
            addresses.append(node_ip)
    self.logger.debug('DNS return %s' % addresses)
    return addresses
def insert_node_into_grafana(self, nuuid):
    """Declare a cluster node as a graphite datasource in Grafana.

    Best-effort: silently returns if the node vanished from the gossip
    ring, logs (without raising) if Grafana cannot be reached.
    """
    node = gossiper.get(nuuid)
    # The node can disappear between the triggering event and now
    if node is None:
        return
    data_source_name = "%s--opsbro--%s" % (node['name'], nuuid)
    entry = {
        "name": data_source_name,
        "type": "graphite",
        "url": "http://%s:%d" % (node['addr'], node['port']),
        "access": "proxy",
    }
    uri = '%s/api/datasources' % (self.uri)
    try:
        reply = httper.post(uri, params=entry, headers=self.__get_headers())
        self.logger.debug("Result insert", reply)
    except get_http_exceptions() as exp:
        self.logger.error('Cannot connect to grafana datasources: %s' % exp)
        return
def main_thread(self):
    """Main shinken-export loop.

    Waits for the detector to have run (node groups must be known) and for
    the module to be enabled, does a full first-pass generation of node
    config files, then loops: exports states as passive checks and applies
    incremental node add/delete/change events, reloading the monitoring
    daemon when asked to.
    """
    # If the detector did not run, we are not sure about the groups of the local node
    # so wait for it to be run, so we can generate shinken file ok from start
    while not detecter.did_run:
        time.sleep(1)
    self.enabled = self.get_parameter('enabled')
    while not self.enabled:
        self.enabled = self.get_parameter('enabled')
        time.sleep(1)
    if self.cfg_path is not None:
        self.clean_cfg_dir()
    # First look at all nodes in the gossip ring and regenerate them.
    # Snapshot the keys: the gossip thread can mutate the dict while we
    # iterate (a live view would raise RuntimeError on Python 3).
    node_keys = list(gossiper.nodes.keys())
    for nid in node_keys:
        n = gossiper.get(nid)
        if n is None:
            continue
        self.generate_node_file(n)
    while not stopper.is_stop():
        self.logger.debug('Shinken loop, regenerate [%s]' % self.regenerate_flag)
        # If we can, export all states into the nagios/shinken daemon as passive checks
        self.export_all_states()
        time.sleep(1)
        # If not initialized, skip this loop turn
        if self.cfg_path is None or gossiper is None:
            continue
        # If nothing to do in configuration, skip it too
        if not self.regenerate_flag:
            continue
        self.logger.info('Shinken callback raised, managing events: %s' % self.node_changes)
        # Take ownership of the pending events before processing them
        self.regenerate_flag = False
        node_ids = self.node_changes
        self.node_changes = []
        for (evt, nid) in node_ids:
            n = gossiper.get(nid)
            if evt == 'new-node':
                if n is None:
                    # maybe someone just delete the node?
                    continue
                self.logger.info('Manage new node %s' % n)
                self.generate_node_file(n)
                self.export_states_uuids.add(nid)
            elif evt == 'delete-node':
                self.logger.info('Removing deleted node %s' % nid)
                self.clean_node_files(nid)
            elif evt == 'change-node':
                if n is None:
                    # node deleted between the event and now: nothing to regenerate
                    continue
                self.logger.info('A node did change, updating its configuration. Node %s' % nid)
                self.generate_node_file(n)
                self.export_states_uuids.add(nid)
        # If we need to reload and have a reload command, do it
        if self.reload_flag and self.reload_command:
            self.reload_flag = False
            rc, stdout, stderr = exec_command(self.reload_command)
            stdout += stderr
            if rc != 0:
                self.logger.error('Cannot reload monitoring daemon: %s' % stdout)
                # NOTE(review): this return exits the whole thread on a single
                # failed reload — confirm this "give up" behavior is intended.
                return
            self.logger.info('Monitoring daemon reload: OK')
            payload = {'type': 'shinken-restart'}
            gossiper.stack_event_broadcast(payload)
def launch_statsd_udp_listener(self):
    """Blocking thread loop: receive statsd UDP datagrams and aggregate them.

    On each pass it re-checks gossip group membership to decide whether the
    listener should be active (closing/reopening the socket as needed),
    then parses every "<name>:<value>|<type>[|@rate]" line of a datagram.
    Metrics owned by another 'ts' node are relayed over UDP; our own are
    accumulated into self.gauges / self.timers / self.counters under
    self.stats_lock.
    """
    while not stopper.is_stop():
        # Enablement can change at runtime with gossip group membership
        if_group = self.get_parameter('enabled_if_group')
        self.enabled = gossiper.is_in_group(if_group)
        # Ok, if we are not enabled, so not even talk to statsd
        if not self.enabled:
            self.__close_socket()
            time.sleep(1)
            continue
        # maybe we were enabled, then not, then again, if so re-prepare
        if self.udp_sock is None:
            self.__open_socket()
            # Maybe we f**k on the socket or the numpy lib (maybe installation in progress)
            if self.udp_sock is None:
                self.logger.error('Seems that the socket or numpy are not realy, postpone the module initialiation')
                time.sleep(1)
                continue
        try:
            # Max UDP datagram size we accept in one read
            data, addr = self.udp_sock.recvfrom(65535)
        except socket.timeout:
            # loop until we got something (socket has a timeout so the
            # stop flag is re-checked regularly)
            continue
        # NOTE(review): this logger seems to accept multiple positional args
        # (custom wrapper) — confirm against the project logger API.
        self.logger.debug("UDP: received message:", data, addr)
        # No data? bail out :)
        if len(data) == 0:
            continue
        self.logger.debug("GETDATA", data)
        # One datagram can carry several metric lines
        for line in data.splitlines():
            # avoid invalid lines: a statsd line must contain a '|'
            if '|' not in line:
                continue
            # split only once: left = "name:value", right = "type[|@rate]"
            elts = line.split('|', 1)
            # invalid, no type in the right part
            if len(elts) == 1:
                continue
            _name_value = elts[0].strip()
            # maybe it's an invalid name...
            if ':' not in _name_value:
                continue
            _nvs = _name_value.split(':')
            if len(_nvs) != 2:
                continue
            mname = _nvs[0].strip()
            # We have a real value, so we will allow now smaller wait time
            self.did_have_metrics = True
            # Two cases: it's for me or not — ownership is decided by
            # consistent hashing of the metric name over the 'ts' group.
            # NOTE(review): sha1() on a str only works on Python 2; on
            # Python 3 mname would need encoding — confirm target runtime.
            hkey = hashlib.sha1(mname).hexdigest()
            ts_node_manager = gossiper.find_group_node('ts', hkey)
            # if it's NOT me that manage this key, relay it to the owner
            if ts_node_manager != gossiper.uuid:
                node = gossiper.get(ts_node_manager)
                # threads are dangerous things... (node may have vanished)
                if node is None:
                    continue
                # TODO: do bulk send of this, like for graphite
                sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                # do NOT use the node['port'], it's the internal communication, not the graphite one!
                sock.sendto(line, (node['addr'], self.statsd_port))
                sock.close()
                continue
            # Here we are sure it's really for us, so manage it :)
            value = to_best_int_float(_nvs[1].strip())
            if not mname or value is None:
                continue
            # Look at the type of the data
            _type = elts[1].strip()
            if len(_type) == 0:
                continue
            ## Gauge: <metric name>:<value>|g
            elif _type == 'g':
                self.nb_data += 1
                self.logger.log('GAUGE', mname, value)
                with self.stats_lock:
                    gentry = self.gauges.get(mname, None)
                    if gentry is None:
                        # stored as (sum, nb, min, max)
                        gentry = (0.0, 0, None, None)
                    _sum, nb, _min, _max = gentry
                    _sum += value
                    nb += 1
                    if _min is None or value < _min:
                        _min = value
                    if _max is None or value > _max:
                        _max = value
                    self.gauges[mname] = (_sum, nb, _min, _max)
                    self.logger.debug('NEW GAUGE', mname, self.gauges[mname])
            ## Timers: <metric name>:<value>|ms
            ## But also
            ## Histograms: <metric name>:<value>|h
            elif _type == 'ms' or _type == 'h':
                self.logger.debug('timers', mname, value)
                # TODO: avoid the SET each time
                timer = self.timers.get(mname, [])
                timer.append(value)
                self.timers[mname] = timer
            ## Counters: <metric name>:<value>|c[|@<sample rate>]
            elif _type == 'c':
                self.nb_data += 1
                self.logger.info('COUNTER', mname, value, "rate", 1)
                with self.stats_lock:
                    # stored as (accumulated value, number of increments)
                    cvalue, ccount = self.counters.get(mname, (0, 0))
                    self.counters[mname] = (cvalue + value, ccount + 1)
                    self.logger.debug('NEW COUNTER', mname, self.counters[mname])
            ## Meters: <metric name>:<value>|m
            elif _type == 'm':
                self.logger.debug('METERs', mname, value)
            else:
                # unknow type, maybe a c[|@<sample rate>]
                if _type[0] == 'c':
                    self.nb_data += 1
                    if not '|' in _type:
                        continue
                    srate = _type.split('|')[1].strip()
                    if len(srate) == 0 or srate[0] != '@':
                        continue
                    try:
                        rate = float(srate[1:])
                    except ValueError:
                        continue
                    # Invalid rate, 0.0 is invalid too ;)
                    if rate <= 0.0 or rate > 1.0:
                        continue
                    self.logger.debug('COUNTER', mname, value, "rate", rate)
                    with self.stats_lock:
                        cvalue, ccount = self.counters.get(mname, (0, 0))
                        self.logger.debug('INCR counter', (value / rate))
                        # Scale the sampled value back up by 1/rate
                        self.counters[mname] = (cvalue + (value / rate), ccount + 1 / rate)
                        self.logger.debug('NEW COUNTER', mname, self.counters[mname])
def do_render(targets, _from):
    """Graphite-compatible /render endpoint handler (closure: uses `self`,
    `response` and `abort` from the enclosing scope).

    Parses the `_from` relative/absolute time spec, then for each target
    either serves the datapoints from the local TS backend + KV store (when
    this node owns the metric, or the owner is gone) or relays the query
    over HTTP to the owning node. Returns a JSON string shaped like
    graphite: [{"target": ..., "datapoints": [(value, ts), ...]}, ...].
    """
    response.content_type = 'application/json'
    if not targets:
        return abort(400, 'Invalid target')
    # Default past values, round at an hour
    now = int(time.time())
    pastraw = int(time.time()) - 86400
    past = divmod(pastraw, 3600)[0] * 3600
    found = False
    # Try -Xd (days)
    m = re.match(r'-(\d*)d', _from, re.M | re.I)
    if m:
        found = True
        nbdays = int(m.group(1))
        pastraw = int(time.time()) - (nbdays * 86400)
        past = divmod(pastraw, 86400)[0] * 86400
    # Try -Xh (hours)
    # NOTE(review): this pattern also matches '-Xhours' (re.match is a
    # prefix match), so the -Xhours branch below looks unreachable — confirm.
    m = re.match(r'-(\d*)h', _from, re.M | re.I)
    if m:
        found = True
        nbhours = int(m.group(1))
        pastraw = int(time.time()) - (nbhours * 3600)
        past = divmod(pastraw, 3600)[0] * 3600
    # Try -Xhours
    if not found:
        m = re.match(r'-(\d*)hours', _from, re.M | re.I)
        if m:
            found = True
            nbhours = int(m.group(1))
            pastraw = int(time.time()) - (nbhours * 3600)
            past = divmod(pastraw, 3600)[0] * 3600
    # Try -Xmin (minutes)
    if not found:
        m = re.match(r'-(\d*)min', _from, re.M | re.I)
        if m:
            found = True
            nbminutes = int(m.group(1))
            pastraw = int(time.time()) - (nbminutes * 60)
            past = divmod(pastraw, 60)[0] * 60
    # absolute value maybe?
    # NOTE(review): r'(\d*)' matches the empty string, so `found` becomes
    # True for any input and int('') would raise ValueError — confirm.
    if not found:
        m = re.match(r'(\d*)', _from, re.M | re.I)
        if m:
            found = True
            past = divmod(int(m.group(1)), 3600)[0] * 3600
    if not found:
        return abort(400, 'Invalid range')
    # Ok now got the good values
    res = []
    for target in targets:
        # Which 'ts' node owns this metric?
        nuuid = gossiper.find_group_node('ts', target)
        n = None
        if nuuid:
            n = gossiper.get(nuuid)
        nname = ''
        if n:
            nname = n['name']
        self.logger.debug('HTTP ts: target %s is managed by %s(%s)' % (target, nname, nuuid))
        # that's me or the other is no more there?
        if nuuid == gossiper.uuid or n is None:
            self.logger.debug('HTTP ts: /render, my job to manage %s' % target)
            # Maybe I am also the TS manager of these data?
            # if so, get the TS backend data for this
            min_e = hour_e = day_e = None
            self.logger.debug('HTTP RENDER founded TS %s' % tsmgr.tsb.data)
            min_e = tsmgr.tsb.data.get('min::%s' % target, None)
            hour_e = tsmgr.tsb.data.get('hour::%s' % target, None)
            day_e = tsmgr.tsb.data.get('day::%s' % target, None)
            self.logger.debug('HTTP TS RENDER, FOUNDED TS data %s %s %s' % (min_e, hour_e, day_e))
            # Get from the past, but start at the good hours offset,
            # walking hour by hour up to now
            t = past
            r = []
            while t < now:
                # Maybe the time match a hour we got in memory, if so take there
                if hour_e and hour_e['hour'] == t:
                    self.logger.debug('HTTP TS RENDER match memory HOUR, take this value instead')
                    # copy instead of cherrypick, because it can move/append
                    raw_values = hour_e['values'][:]
                    for i in range(60):
                        # One datapoint per minute of the hour
                        e = raw_values[i]
                        tt = t + 60 * i
                        r.append((e, tt))
                        if e:
                            self.logger.debug('GOT NOT NULL VALUE from RENDER MEMORY cache %s:%s' % (e, tt))
                else:
                    # no memory match, go look in the KV part
                    ukey = '%s::h%d' % (target, t)
                    raw64 = kvmgr.get_key(ukey)
                    if raw64 is None:
                        # No archive for this hour: emit null datapoints
                        for i in range(60):
                            tt = t + 60 * i
                            r.append((None, tt))
                    else:
                        # Archived hour: base64-wrapped pickle blob.
                        # NOTE(review): pickle.loads on KV data is only safe
                        # if the KV store is fully trusted — confirm.
                        raw = base64.b64decode(raw64)
                        v = pickle.loads(raw)
                        raw_values = v['values']
                        for i in range(60):
                            e = raw_values[i]
                            tt = t + 60 * i
                            r.append((e, tt))
                # Ok now the new hour :)
                t += 3600
            # Now build the final thing
            res.append({"target": target, "datapoints": r})
        else:
            # someone else job, relay the question over HTTP
            uri = 'http://%s:%s/render/?target=%s&from=%s' % (n['addr'], n['port'], target, _from)
            try:
                self.logger.debug('TS: (get /render) relaying to %s: %s' % (n['name'], uri))
                r = httper.get(uri)
                self.logger.debug('TS: get /render founded (%d)' % len(r))
                v = jsoner.loads(r)
                self.logger.debug("TS /render relay GOT RETURN", v, "AND RES", res)
                res.extend(v)
                self.logger.debug("TS /render res is now", res)
            except get_http_exceptions() as exp:
                # Best-effort: a dead relay target just contributes nothing
                self.logger.debug('TS: /render relay error asking to %s: %s' % (n['name'], str(exp)))
                continue
    self.logger.debug('TS RENDER FINALLY RETURN', res)
    return jsoner.dumps(res)
def launch_collector_thread(self):
    """Background loop of the imrane module.

    When this node is in the collector group, watch the 'imrane' collector
    for fresh results and push them over HTTP to a randomly chosen node of
    the aggregator group. Sleeps one second between passes.
    """
    last_collector_check = 0
    while not stopper.is_stop():
        group = self.get_parameter('collector-group')
        # Only nodes of the collector group do any work
        if not gossiper.is_in_group(group):
            self.logger.debug('IMRANE: not a collector thread')
            time.sleep(1)
            continue
        self.logger.debug('IMRANE: collector loop')
        self.logger.debug('IMRANE: manage: %s' % self.queue)
        # Locate the 'imrane' collector among all registered collectors
        imrane_collector = next((c for c in collectormgr.collectors.values() if c['name'] == 'imrane'), None)
        if imrane_collector is None:
            self.logger.error('IMRANE: cannot find the imrane collector, skiping this loop')
            time.sleep(1)
            continue
        # Maybe this collector did not run since we last look at it, if so, skip it
        last_check = imrane_collector['last_check']
        if last_check == last_collector_check:
            self.logger.debug('IMRANE: the collector did not run since the last loop, skiping this turn')
            time.sleep(1)
            continue
        last_collector_check = last_check
        results = imrane_collector['results']
        self.logger.info('IMRANE: collector result: %s' % results)
        our_node_name = gossiper.get(gossiper.uuid)['name']
        candidates = gossiper.find_group_nodes(self.get_parameter('agregator-group'))
        if len(candidates) == 0:
            self.logger.error('IMRANE ERROR: there are no agregator nodes, skiping data sending')
            time.sleep(1)
            continue
        target = gossiper.get(random.choice(candidates))
        if target is None:
            # oups: thread race bug — the node left between lookup and get
            time.sleep(1)
            continue
        address = target['addr']
        port = target['port']
        self.logger.info('IMRANE: did choose %s (%s:%s) for sending' % (target['display_name'], address, port))
        uri = 'http://%s:%s/imrane' % (address, port)
        try:
            reply = httper.post(uri,
                                params={'results': results, 'from': our_node_name},
                                headers={'Content-Type': 'application/json;charset=UTF-8'})
            self.logger.debug("Result insert", reply)
        except get_http_exceptions() as exp:
            self.logger.error('Cannot connect to agregator: %s' % exp)
        # always sleep to not hammer the CPU
        time.sleep(1)