Example #1
 def grok_graphite_data(self, data):
     STATS.incr('ts.graphite.grok.data', 1)
     forwards = {}
     for line in data.splitlines():
         elts = line.split(' ')
         elts = [s.strip() for s in elts if s.strip()]
         
         if len(elts) != 3:
             continue  # malformed line: skip it rather than drop the whole batch
         mname, value, timestamp = elts
         hkey = hashlib.sha1(mname.encode('utf-8')).hexdigest()  # sha1 needs bytes on Python 3
         ts_node_manager = gossiper.find_group_node('ts', hkey)
         # if I am the one that manages this key, add it to my backend
         if ts_node_manager == gossiper.uuid:
             self.logger.debug("I am the TS node manager")
             try:
                 timestamp = int(timestamp)
             except ValueError:
                 continue  # bad timestamp: skip this line only
             value = to_best_int_float(value)
             if value is None:
                 continue
             tsmgr.tsb.add_value(timestamp, mname, value)
         # not me? queue the line to forward it to the owner
         else:
             self.logger.debug("The node manager for this TS is", ts_node_manager)
             forwards.setdefault(ts_node_manager, []).append(line)
     
     for (uuid, lst) in forwards.items():
         node = gossiper.get(uuid)
         # maybe the node disappeared? bail out, we are not lucky
         if node is None:
             continue
         packets = []
         # first compute the packets
         buf = ''
         for line in lst:
             buf += line + '\n'
             if len(buf) > 1024:
                 packets.append(buf)
                 buf = ''
         if buf != '':
             packets.append(buf)
         
         # UDP
         sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
         for packet in packets:
             # do NOT use the node['port'], it's the internal communication, not the graphite one!
             sock.sendto(packet.encode('utf-8'), (node['addr'], self.graphite_port))  # bytes for Python 3
         sock.close()
         
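
Note: Example #1 handles the Graphite plaintext protocol (`<metric> <value> <timestamp>`) and routes each metric by a consistent hash of its name. A minimal, standalone sketch of just the per-line parsing (using plain float() instead of the module's to_best_int_float helper) could look like this:

def parse_graphite_line(line):
    # Graphite plaintext protocol: '<metric> <value> <timestamp>'
    elts = [s.strip() for s in line.split(' ') if s.strip()]
    if len(elts) != 3:
        return None  # malformed line
    mname, raw_value, raw_ts = elts
    try:
        return mname, float(raw_value), int(raw_ts)
    except ValueError:
        return None  # non-numeric value or timestamp

# parse_graphite_line('local.cpu.user 42 1500000000')
# -> ('local.cpu.user', 42.0, 1500000000)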
Example #2
    def lookup_for_nodes(self, dom):
        self.logger.debug('Querying %s for managed domain: %s' %
                          (dom, self.domain))
        if not dom.endswith(self.domain):
            self.logger.debug('Domain %s is not matching managed domain: %s' %
                              (dom, self.domain))
            return []
        search = dom[:-len(self.domain)]
        # split into sname.service.datacenter
        self.logger.debug("Lookup for search %s" % search)
        elts = search.split('.', 2)
        if len(elts) != 3:
            self.logger.error('Bad query, not 3 dots in %s' % search)
            return []
        # NOTE: zone is currently ignored
        zone = elts[2]
        # The filter type must be one of two values:
        # - group => look for a group
        # - name  => look for a name (or a display name)
        filter_type = elts[1]
        filter_value = elts[0]
        if filter_type not in ('group', 'name'):
            self.logger.error(
                'This module does not manage this DNS query type: %s' %
                filter_type)
            return []

        self.logger.debug('Looking in %s nodes' % len(gossiper.nodes))
        r = []

        if filter_type == 'group':
            group = filter_value
            valid_filter_node_uuids = gossiper.find_group_nodes(group)

        else:  # filter by name
            name = filter_value
            valid_filter_node_uuids = gossiper.find_nodes_by_name_or_display_name(
                name)

        # Now look for real node & addr/ip
        for node_uuid in valid_filter_node_uuids:
            node = gossiper.get(node_uuid)
            if node is None:  # magic thread disappearance
                continue
            addr = self._get_node_ip(node)
            if addr is not None:
                r.append(addr)
        self.logger.debug('DNS return %s' % r)
        return r
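
Note: the decomposition above expects query names shaped like `<value>.<filter_type>.<zone>.<managed domain>` with the usual DNS trailing dot. A self-contained sketch of that split, with hypothetical inputs, not the module's code:

def split_dns_query(dom, managed_domain):
    # e.g. dom = 'redis.group.dc1.opsbro.', managed_domain = 'opsbro.'
    if not dom.endswith(managed_domain):
        return None  # not ours to answer
    search = dom[:-len(managed_domain)]  # 'redis.group.dc1.'
    elts = search.split('.', 2)
    if len(elts) != 3:
        return None  # need value.type.zone
    filter_value, filter_type, zone = elts
    if filter_type not in ('group', 'name'):
        return None
    return filter_value, filter_type, zone

# split_dns_query('redis.group.dc1.opsbro.', 'opsbro.')
# -> ('redis', 'group', 'dc1.')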
Example #3
 def insert_node_into_grafana(self, nuuid):
     node = gossiper.get(nuuid)
     if node is None:
         return
     name = node['name']
     addr = node['addr']
     port = node['port']
     data_source_name = "%s--opsbro--%s" % (name, nuuid)
     entry = {
         "name": data_source_name,
         "type": "graphite",
         "url": "http://%s:%d" % (addr, port),
         "access": "proxy"
     }
     uri = '%s/api/datasources' % (self.uri)
     try:
         r = httper.post(uri, params=entry, headers=self.__get_headers())
         self.logger.debug("Result insert", r)
     except get_http_exceptions() as exp:
         self.logger.error('Cannot connect to grafana datasources: %s' %
                           exp)
         return
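
Note: the call above relies on opsbro's httper wrapper. A rough stdlib equivalent of the same Grafana datasource POST, where the URL, port, and API key are placeholders rather than values from the module, might be:

import json
try:
    from urllib.request import Request, urlopen  # Python 3
except ImportError:
    from urllib2 import Request, urlopen  # Python 2

entry = {"name": "node--opsbro--some-uuid", "type": "graphite",
         "url": "http://127.0.0.1:2003", "access": "proxy"}
req = Request('http://127.0.0.1:3000/api/datasources',
              data=json.dumps(entry).encode('utf-8'),
              headers={'Content-Type': 'application/json',
                       'Authorization': 'Bearer <grafana-api-key>'})
print(urlopen(req).read())  # Request with a data payload is sent as a POST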
Example #4
    def main_thread(self):
        # If the detector did not run, we are not sure about the groups of the local node,
        # so wait for it to run so we can generate the shinken files correctly from the start
        while not detecter.did_run:
            time.sleep(1)

        self.enabled = self.get_parameter('enabled')
        while not self.enabled:
            self.enabled = self.get_parameter('enabled')
            time.sleep(1)

        if self.cfg_path is not None:
            self.clean_cfg_dir()
            # First look at all nodes in the gossip ring and regenerate them
            node_keys = list(gossiper.nodes.keys())  # snapshot: the dict can change while we iterate
            for nid in node_keys:
                n = gossiper.get(nid)
                if n is None:
                    continue
                self.generate_node_file(n)

        while not stopper.is_stop():
            self.logger.debug('Shinken loop, regenerate [%s]' %
                              self.regenerate_flag)

            # If we can, export all states into the nagios/shinken daemon as passive checks
            self.export_all_states()

            time.sleep(1)

            # If not initialized, skip this loop turn
            if self.cfg_path is None or gossiper is None:
                continue

            # If nothing to do in configuration, skip it too
            if not self.regenerate_flag:
                continue

            self.logger.info('Shinken callback raised, managing events: %s' %
                             self.node_changes)
            # Set that we will manage all now
            self.regenerate_flag = False
            node_ids = self.node_changes
            self.node_changes = []
            for (evt, nid) in node_ids:
                n = gossiper.get(nid)
                if evt == 'new-node':
                    if n is None:  # maybe someone just deleted the node?
                        continue
                    self.logger.info('Manage new node %s' % n)
                    self.generate_node_file(n)
                    self.export_states_uuids.add(nid)
                elif evt == 'delete-node':
                    self.logger.info('Removing deleted node %s' % nid)
                    self.clean_node_files(nid)
                elif evt == 'change-node':
                    if n is None:  # the node may have been deleted meanwhile
                        continue
                    self.logger.info(
                        'A node did change, updating its configuration. Node %s'
                        % nid)
                    self.generate_node_file(n)
                    self.export_states_uuids.add(nid)

            # If we need to reload and have a reload command, do it
            if self.reload_flag and self.reload_command:
                self.reload_flag = False
                rc, stdout, stderr = exec_command(self.reload_command)
                stdout += stderr
                if rc != 0:
                    self.logger.error('Cannot reload monitoring daemon: %s' %
                                      stdout)
                    return

                self.logger.info('Monitoring daemon reload: OK')
                payload = {'type': 'shinken-restart'}
                gossiper.stack_event_broadcast(payload)
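
Note: one pattern worth calling out in the loop above is the swap-and-drain of self.node_changes: the pending list is taken and replaced before processing, so events raised by other threads during processing land in the fresh list instead of being lost. Reduced to its essence with hypothetical names (this leans on CPython's atomic list assignment; a lock would be stricter):

pending_events = []  # other threads append ('new-node', nid) style tuples

def drain_and_handle():
    global pending_events
    batch, pending_events = pending_events, []  # take the batch, reset for new arrivals
    for (evt, nid) in batch:
        print('handling', evt, nid)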
Example #5
    def launch_statsd_udp_listener(self):
        while not stopper.is_stop():

            if_group = self.get_parameter('enabled_if_group')
            self.enabled = gossiper.is_in_group(if_group)

            # OK, if we are not enabled, do not even talk to statsd
            if not self.enabled:
                self.__close_socket()
                time.sleep(1)
                continue

            # maybe we were enabled, then disabled, then enabled again; if so, re-open the socket
            if self.udp_sock is None:
                self.__open_socket()

            # Maybe the socket or the numpy lib is broken (maybe an installation is in progress)
            if self.udp_sock is None:
                self.logger.error(
                    'Seems that the socket or numpy is not ready, postponing the module initialization'
                )
                time.sleep(1)
                continue
            try:
                data, addr = self.udp_sock.recvfrom(
                    65535)  # read up to the max UDP datagram size
            except socket.timeout:  # loop until we got something
                continue

            self.logger.debug("UDP: received message:", data, addr)
            # No data? bail out :)
            if len(data) == 0:
                continue
            self.logger.debug("GETDATA", data)

            data = data.decode('utf-8', errors='ignore')  # recvfrom gives bytes on Python 3
            for line in data.splitlines():
                # avoid invalid lines
                if '|' not in line:
                    continue
                elts = line.split('|', 1)
                # invalid, no type in the right part
                if len(elts) == 1:
                    continue

                _name_value = elts[0].strip()
                # maybe it's an invalid name...
                if ':' not in _name_value:
                    continue
                _nvs = _name_value.split(':')
                if len(_nvs) != 2:
                    continue
                mname = _nvs[0].strip()

                # We have a real value, so we now allow a smaller wait time
                self.did_have_metrics = True

                # Two cases: it's for me or not
                hkey = hashlib.sha1(mname.encode('utf-8')).hexdigest()  # sha1 needs bytes on Python 3
                ts_node_manager = gossiper.find_group_node('ts', hkey)
                # not me? forward the line to the node that owns this key
                if ts_node_manager != gossiper.uuid:
                    node = gossiper.get(ts_node_manager)
                    # threads are dangerous things...
                    if node is None:
                        continue

                    # TODO: do bulk send of this, like for graphite
                    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                    # do NOT use the node['port'], it's the internal communication, not the graphite one!
                    sock.sendto(line.encode('utf-8'), (node['addr'], self.statsd_port))  # bytes for Python 3
                    sock.close()
                    continue

                # Here we are sure it's really for us, so manage it :)
                value = to_best_int_float(_nvs[1].strip())
                if not mname or value is None:
                    continue

                # Look at the type of the data
                _type = elts[1].strip()
                if len(_type) == 0:
                    continue

                ## Gauge: <metric name>:<value>|g
                if _type == 'g':
                    self.nb_data += 1
                    self.logger.log('GAUGE', mname, value)
                    with self.stats_lock:
                        gentry = self.gauges.get(mname, None)
                        if gentry is None:
                            # sum, nb, min, max
                            gentry = (0.0, 0, None, None)
                        _sum, nb, _min, _max = gentry
                        _sum += value
                        nb += 1
                        if _min is None or value < _min:
                            _min = value
                        if _max is None or value > _max:
                            _max = value
                        self.gauges[mname] = (_sum, nb, _min, _max)
                        self.logger.debug('NEW GAUGE', mname,
                                          self.gauges[mname])

                ## Timers: <metric name>:<value>|ms
                ## But also
                ## Histograms: <metric name>:<value>|h
                elif _type == 'ms' or _type == 'h':
                    self.logger.debug('timers', mname, value)
                    # TODO: avoid the SET each time
                    timer = self.timers.get(mname, [])
                    timer.append(value)
                    self.timers[mname] = timer
                ## Counters: <metric name>:<value>|c[|@<sample rate>]
                elif _type == 'c':
                    self.nb_data += 1
                    self.logger.info('COUNTER', mname, value, "rate", 1)
                    with self.stats_lock:
                        cvalue, ccount = self.counters.get(mname, (0, 0))
                        self.counters[mname] = (cvalue + value, ccount + 1)
                        self.logger.debug('NEW COUNTER', mname,
                                          self.counters[mname])
                ## Meters: <metric name>:<value>|m
                elif _type == 'm':
                    self.logger.debug('METERs', mname, value)
                else:  # unknown type, maybe a c[|@<sample rate>]
                    if _type[0] == 'c':
                        self.nb_data += 1
                        if '|' not in _type:
                            continue
                        srate = _type.split('|')[1].strip()
                        if len(srate) == 0 or srate[0] != '@':
                            continue
                        try:
                            rate = float(srate[1:])
                        except ValueError:
                            continue
                        # Invalid rate, 0.0 is invalid too ;)
                        if rate <= 0.0 or rate > 1.0:
                            continue
                        self.logger.debug('COUNTER', mname, value, "rate",
                                          rate)
                        with self.stats_lock:
                            cvalue, ccount = self.counters.get(mname, (0, 0))
                            self.logger.debug('INCR counter', (value / rate))
                            self.counters[mname] = (cvalue + (value / rate),
                                                    ccount + 1 / rate)
                            self.logger.debug('NEW COUNTER', mname,
                                              self.counters[mname])
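
Note: the listener above decodes the statsd wire format `<metric name>:<value>|<type>[|@<sample rate>]` inline. A compact standalone parser that mirrors those checks (a sketch, not the module's code) could be:

def parse_statsd_line(line):
    # '<name>:<value>|<type>[|@<rate>]', e.g. 'hits:2|c|@0.5'
    if '|' not in line or ':' not in line:
        return None
    name_value, type_part = line.split('|', 1)
    nvs = name_value.strip().split(':')
    if len(nvs) != 2:
        return None
    try:
        value = float(nvs[1].strip())
    except ValueError:
        return None
    rate = 1.0
    if '|' in type_part:  # optional sampling rate, e.g. 'c|@0.5'
        type_part, srate = type_part.split('|', 1)
        if not srate.startswith('@'):
            return None
        try:
            rate = float(srate[1:])
        except ValueError:
            return None
        if not 0.0 < rate <= 1.0:
            return None  # 0.0 is invalid too
    return nvs[0].strip(), value, type_part.strip(), rate

# parse_statsd_line('hits:2|c|@0.5') -> ('hits', 2.0, 'c', 0.5)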
Example #6
 def do_render(targets, _from):
     response.content_type = 'application/json'
     
     if not targets:
         return abort(400, 'Invalid target')
     # Default past values, round at an hour
     now = int(time.time())
     pastraw = int(time.time()) - 86400
     past = divmod(pastraw, 3600)[0] * 3600
     
     found = False
     # Try -Xd
     m = re.match(r'-(\d+)d', _from, re.M | re.I)  # \d+ : an empty count must not match (int('') would crash)
     if m:
         found = True
         nbdays = int(m.group(1))
         pastraw = int(time.time()) - (nbdays * 86400)
         past = divmod(pastraw, 86400)[0] * 86400
     # Try -Xh
     m = re.match(r'-(\d+)h', _from, re.M | re.I)
     if m:
         found = True
         nbhours = int(m.group(1))
         pastraw = int(time.time()) - (nbhours * 3600)
         past = divmod(pastraw, 3600)[0] * 3600
     # Try -Xhours
     if not found:
         m = re.match(r'-(\d+)hours', _from, re.M | re.I)
         if m:
             found = True
             nbhours = int(m.group(1))
             pastraw = int(time.time()) - (nbhours * 3600)
             past = divmod(pastraw, 3600)[0] * 3600
     # Try -Xmin
     if not found:
         m = re.match(r'-(\d+)min', _from, re.M | re.I)
         if m:
             found = True
             nbminutes = int(m.group(1))
             pastraw = int(time.time()) - (nbminutes * 60)
             past = divmod(pastraw, 60)[0] * 60
     # absolute value maybe?
     if not found:
         m = re.match(r'(\d+)', _from, re.M | re.I)  # \d+ : otherwise any string matches empty and int('') crashes
         if m:
             found = True
             past = divmod(int(m.group(1)), 3600)[0] * 3600
     
     if not found:
         return abort(400, 'Invalid range')
     
     # Ok now got the good values
     res = []
     for target in targets:
         
         nuuid = gossiper.find_group_node('ts', target)
         n = None
         if nuuid:
             n = gossiper.get(nuuid)
         nname = ''
         if n:
             nname = n['name']
         self.logger.debug('HTTP ts: target %s is managed by %s(%s)' % (target, nname, nuuid))
     # is it me, or is the other node no longer there?
         if nuuid == gossiper.uuid or n is None:
             self.logger.debug('HTTP ts: /render, my job to manage %s' % target)
             
         # Maybe I am also the TS manager of this data? If so, get the TS backend data for it
             min_e = hour_e = day_e = None
             
         self.logger.debug('HTTP RENDER found TS %s' % tsmgr.tsb.data)
             min_e = tsmgr.tsb.data.get('min::%s' % target, None)
             hour_e = tsmgr.tsb.data.get('hour::%s' % target, None)
             day_e = tsmgr.tsb.data.get('day::%s' % target, None)
         self.logger.debug('HTTP TS RENDER, FOUND TS data %s %s %s' % (min_e, hour_e, day_e))
             
         # Walk from the past up to now, starting at the proper hour offset
             t = past
             r = []
             
             while t < now:
             # Maybe the time matches an hour we have in memory; if so, take it from there
                 if hour_e and hour_e['hour'] == t:
                     self.logger.debug('HTTP TS RENDER match memory HOUR, take this value instead')
                 raw_values = hour_e['values'][:]  # take a copy, because the live list can move/append
                     for i in range(60):
                     # Get the value and the time
                         e = raw_values[i]
                         tt = t + 60 * i
                         r.append((e, tt))
                         if e:
                             self.logger.debug('GOT NOT NULL VALUE from RENDER MEMORY cache %s:%s' % (e, tt))
             else:  # no memory match, go look in the KV store
                     ukey = '%s::h%d' % (target, t)
                     raw64 = kvmgr.get_key(ukey)
                     if raw64 is None:
                         for i in range(60):
                             # Get the value and the time
                             tt = t + 60 * i
                             r.append((None, tt))
                     else:
                         raw = base64.b64decode(raw64)
                         v = pickle.loads(raw)
                         raw_values = v['values']
                         for i in range(60):
                             # Get the value and the time
                             e = raw_values[i]
                             tt = t + 60 * i
                             r.append((e, tt))
                 # Ok now the new hour :)
                 t += 3600
             # Now build the final thing
             res.append({"target": target, "datapoints": r})
     else:  # someone else's job, relay the question
             uri = 'http://%s:%s/render/?target=%s&from=%s' % (n['addr'], n['port'], target, _from)
             try:
                 self.logger.debug('TS: (get /render) relaying to %s: %s' % (n['name'], uri))
                 r = httper.get(uri)
             self.logger.debug('TS: get /render found (%d)' % len(r))
                 v = jsoner.loads(r)
                 self.logger.debug("TS /render relay GOT RETURN", v, "AND RES", res)
                 res.extend(v)
                 self.logger.debug("TS /render res is now", res)
             except get_http_exceptions() as exp:
                 self.logger.debug('TS: /render relay error asking to %s: %s' % (n['name'], str(exp)))
                 continue
     
     self.logger.debug('TS RENDER FINALLY RETURN', res)
     return jsoner.dumps(res)
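
Note: the chain of re.match calls above recognizes `-Xd`, `-Xh`, `-Xhours`, `-Xmin` and absolute epoch values for `from`. The same behavior fits in one table-driven helper (a sketch under the same rounding rules, not the code actually used):

import re
import time

# (pattern, seconds per unit, rounding period)
_RANGES = [
    (re.compile(r'-(\d+)d$', re.I), 86400, 86400),
    (re.compile(r'-(\d+)hours$', re.I), 3600, 3600),
    (re.compile(r'-(\d+)h$', re.I), 3600, 3600),
    (re.compile(r'-(\d+)min$', re.I), 60, 60),
]

def parse_from(_from):
    now = int(time.time())
    for regex, unit, period in _RANGES:
        m = regex.match(_from)
        if m:
            past = now - int(m.group(1)) * unit
            return (past // period) * period  # round down to the period
    if _from.isdigit():  # absolute epoch, rounded at the hour
        return (int(_from) // 3600) * 3600
    return None  # invalid range

# parse_from('-90min') -> the epoch 90 minutes ago, rounded to the minute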
Example #7
    def launch_collector_thread(self):
        last_collector_check = 0
        while not stopper.is_stop():
            collector_group = self.get_parameter('collector-group')
            collector_enabled = gossiper.is_in_group(collector_group)

            if not collector_enabled:
                self.logger.debug('IMRANE: not a collector thread')
                time.sleep(1)
                continue
            self.logger.debug('IMRANE: collector loop')
            self.logger.debug('IMRANE: manage: %s' % self.queue)
            imrane_collector = None
            for collector in collectormgr.collectors.values():
                name = collector['name']
                if name == 'imrane':
                    imrane_collector = collector
                    break
            if imrane_collector is None:
                self.logger.error(
                    'IMRANE: cannot find the imrane collector, skipping this loop'
                )
                time.sleep(1)
                continue

            # Maybe this collector did not run since we last looked at it; if so, skip this turn
            last_check = imrane_collector['last_check']
            if last_check == last_collector_check:
                self.logger.debug(
                    'IMRANE: the collector did not run since the last loop, skipping this turn'
                )
                time.sleep(1)
                continue
            last_collector_check = last_check

            results = imrane_collector['results']
            self.logger.info('IMRANE: collector result: %s' % results)

            our_node = gossiper.get(gossiper.uuid)
            our_node_name = our_node['name']

            agregator_group = self.get_parameter('agregator-group')
            agregator_nodes = gossiper.find_group_nodes(agregator_group)
            if len(agregator_nodes) == 0:
                self.logger.error(
                    'IMRANE ERROR: there are no agregator nodes, skipping data sending'
                )
                time.sleep(1)
                continue

            agregator_node_uuid = random.choice(agregator_nodes)
            agregator_node = gossiper.get(agregator_node_uuid)
            if agregator_node is None:  # oops: thread race bug
                time.sleep(1)
                continue

            address = agregator_node['addr']
            port = agregator_node['port']
            display_name = agregator_node['display_name']
            self.logger.info('IMRANE: did choose %s (%s:%s) for sending' %
                             (display_name, address, port))

            uri = 'http://%s:%s/imrane' % (address, port)
            try:
                r = httper.post(
                    uri,
                    params={
                        'results': results,
                        'from': our_node_name
                    },
                    headers={'Content-Type': 'application/json;charset=UTF-8'})
                self.logger.debug("Result insert", r)
            except get_http_exceptions() as exp:
                self.logger.error('Cannot connect to agregator: %s' % exp)

            # always sleep to not hammer the CPU
            time.sleep(1)
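
Note: a small detail that keeps this loop cheap is that last_collector_check only advances when the collector's own last_check changes, so each collector run is shipped at most once. That dedup plus the vanish-guarded random pick, reduced to a sketch with hypothetical names:

import random

last_seen_check = 0

def maybe_ship(collector, aggregator_uuids, get_node):
    global last_seen_check
    if collector['last_check'] == last_seen_check:
        return None  # the collector did not run again: nothing new to ship
    last_seen_check = collector['last_check']
    # Pick a random aggregator, guarding against a thread-race disappearance
    node = get_node(random.choice(aggregator_uuids)) if aggregator_uuids else None
    return (node, collector['results']) if node else None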