def get_stats(self):
        """Retrieves stats from ceph pools"""

        ceph_cluster = "%s-%s" % (self.prefix, self.cluster)

        data = { ceph_cluster: {} }

        stats_output = None
        try:
            stats_output = subprocess.check_output('ceph osd pool stats -f json', shell=True)
        except Exception as exc:
            collectd.error("ceph-io: failed to ceph pool stats :: %s :: %s"
                    % (exc, traceback.format_exc()))
            return

        if stats_output is None:
            collectd.error('ceph-io: failed to run ceph osd pool stats :: output was None')
            return

        json_stats_data = json.loads(stats_output)

        # push osd pool stats results
        for pool in json_stats_data:
            pool_key = "pool-%s" % pool['pool_name']
            data[ceph_cluster][pool_key] = {}
            pool_data = data[ceph_cluster][pool_key]
            for stat in ('read_bytes_sec', 'write_bytes_sec', 'op_per_sec'):
                pool_data[stat] = pool['client_io_rate'][stat] if stat in pool['client_io_rate'] else 0

        return data
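    # Illustrative only -- the cluster and pool names below are assumptions,
    # not values taken from this module.  With prefix "ceph" and cluster
    # "main", the returned structure is expected to look roughly like:
    #
    #   {'ceph-main': {'pool-rbd':  {'read_bytes_sec': 0,
    #                                'write_bytes_sec': 1024,
    #                                'op_per_sec': 3},
    #                  'pool-data': {'read_bytes_sec': 0,
    #                                'write_bytes_sec': 0,
    #                                'op_per_sec': 0}}}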
def logger(t, msg):
    if t == 'err':
        collectd.error('%s: %s' % (NAME, msg))
    elif t == 'warn':
        collectd.warning('%s: %s' % (NAME, msg))
    elif t == 'verb' and VERBOSE_LOGGING == True:
        collectd.info('%s: %s' % (NAME, msg))
def fetch_json(url):
    """Fetch json from url"""
    try:
        return json.load(urllib2.urlopen(url, timeout=5))
    except urllib2.URLError, e:
        collectd.error("mesos-tasks plugin: Error connecting to %s - %r" % (url, e))
        return None
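# Usage sketch for fetch_json -- the endpoint below is only an example of a
# mesos statistics URL, not a value taken from this module:
#
#   stats = fetch_json('http://localhost:5051/monitor/statistics.json')
#   if stats is None:
#       pass  # connection failed; the error has already been logged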
def plugin_write(vl, config):
    try:
        session = boto3.session.Session(region_name=config.aws_region)
        client_config = botocore.client.Config(connect_timeout=5, read_timeout=5)
        client = session.client('cloudwatch', config=client_config)
        metrics_list = list(metrics(vl, config))
        ts = datetime.fromtimestamp(vl.time)
        data = []

        for i, v in enumerate(vl.values):
            fullname, unit, dims = metrics_list[i]
            name = fullname[:255]
            if len(name) < len(fullname):
                collectd.warning('Metric name was truncated for CloudWatch: {}'.format(fullname))

            data.append(dict(
                MetricName=name,
                Timestamp=ts,
                Value=v,
                Unit=unit,
                Dimensions=dims
            ))

        client.put_metric_data(Namespace=vl.plugin, MetricData=data)
    except Exception, e:
        collectd.error(str(e))
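# Hedged registration sketch: collectd invokes write callbacks as f(vl, data),
# so a config object carrying the attributes plugin_write() expects (only
# "aws_region" is shown; the real config class is an assumption) can be passed
# as the optional "data" argument of register_write:
#
#   class _ExampleConfig(object):
#       aws_region = 'us-east-1'
#
#   collectd.register_write(plugin_write, _ExampleConfig())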
    def get_stats(self):
        """Retrieves stats regarding latency to write to a test pool"""

        ceph_cluster = "%s-%s" % (self.prefix, self.cluster)

        data = { ceph_cluster: {} }

        output = None
        try:
            output = subprocess.check_output(
              "timeout 30s rados -p data bench 10 write -t 1 -b 65536 2>/dev/null | grep -i latency | awk '{print 1000*$3}'", shell=True)
        except Exception as exc:
            collectd.error("ceph-latency: failed to run rados bench :: %s :: %s"
                    % (exc, traceback.format_exc()))
            return

        if output is None:
            collectd.error('ceph-latency: failed to run rados bench :: output was None')
            return

        results = output.split('\n')
        # push values
        data[ceph_cluster]['cluster'] = {}
        data[ceph_cluster]['cluster']['avg_latency'] = results[0]
        data[ceph_cluster]['cluster']['stddev_latency'] = results[1]
        data[ceph_cluster]['cluster']['max_latency'] = results[2]
        data[ceph_cluster]['cluster']['min_latency'] = results[3]

        return data
def kairosdb_send_http_data(data, json):
    collectd.debug('Json=%s' % json)
    data['lock'].acquire()
    
    if not kairosdb_connect(data):
        data['lock'].release()
        collectd.warning('kairosdb_writer: no connection to kairosdb server')
        return

    response = ''
    try:
        headers = {'Content-type': 'application/json', 'Connection': 'keep-alive'}
        data['conn'].request('POST', '/api/v1/datapoints', json, headers)
        res = data['conn'].getresponse()
        response = res.read()
        collectd.debug('Response code: %d' % res.status)

        if res.status == 204:
            exit_code = True
        else:
            collectd.error(response)
            exit_code = False

    except httplib.ImproperConnectionState, e:
        collectd.error('Lost connection to kairosdb server: %s' % e.message)
        data['conn'] = None
        exit_code = False

    except httplib.HTTPException, e:
        collectd.error('Error sending http data: %s' % e.message)
        if response:
            collectd.error(response)
        exit_code = False

    except Exception, e:
        collectd.error('Error sending http data: %s' % str(e))
        exit_code = False

    # the lock was acquired at the top of this function, so release it on
    # every exit path before handing back the result
    data['lock'].release()
    return exit_code
def kairosdb_config(c):
    global host, port, host_separator, \
        metric_separator, lowercase_metric_names, protocol, \
        tags_map, metric_name, add_host_tag, formatter, uri
        
    for child in c.children:
        if child.key == 'AddHostTag':
            add_host_tag = child.values[0]
        elif child.key == 'KairosDBURI':
            uri = child.values[0]
        elif child.key == 'TypesDB':
            for v in child.values:
                kairosdb_parse_types_file(v)
        elif child.key == 'LowercaseMetricNames':
            lowercase_metric_names = child.values[0]
        elif child.key == 'MetricName':
            metric_name = str(child.values[0])
        elif child.key == 'HostSeparator':
            host_separator = child.values[0]
        elif child.key == 'MetricSeparator':
            metric_separator = child.values[0]
        elif child.key == 'Formatter':
            formatter_path = child.values[0]
            try:
                formatter = imp.load_source('formatter', formatter_path)
                # formatter = source.Formatter()
            except:
                raise Exception('Could not load formatter %s %s' % (formatter_path, format_exc()))
        elif child.key == 'Tags':
            for v in child.values:
                tag_parts = v.split("=")
                if len(tag_parts) == 2:
                    tags_map[tag_parts[0]] = tag_parts[1]
                else:
                    collectd.error("Invalid tag: %s" % tag)
def collect_buddyinfo():
    if os.path.exists(buddy_fname):
        with open(buddy_fname) as f:
            for line in f:
               match = re_buddyinfo.search(line)
               if not match:
                  continue
               node = match.group('node')
               zone = match.group('zone')
               free_pages = match.group('pages').strip().split()
               stats_current[(node, zone, 'val')] = free_pages
               stats_current[(node, zone, 'ts')] = time.time()
               key_val = dict(zip(white_list, free_pages))
               metric = collectd.Values()
               metric.host = host_name
               metric.plugin = 'buddyinfo'
               metric.plugin_instance = node
               metric.type = 'gauge'
               for k in range(0, len(white_list)):
                  metric.type_instance = 'zone_' + zone + '.' 
                  metric.type_instance += white_list[k]
                  metric.values = [free_pages[k]]
                  metric.dispatch()
    else:
        collectd.error('buddyinfo: procfs path: %s does not exist' 
                       % (buddy_fname))
def kairosdb_connect(data):
    #collectd.info(repr(data))
    if not data['conn'] and protocol == 'http':
        data['conn'] = httplib.HTTPConnection(data['host'], data['port'])
        return True
        
    elif not data['conn'] and protocol == 'https':
        data['conn'] = httplib.HTTPSConnection(data['host'], data['port'])
        return True

    elif not data['conn'] and protocol == 'telnet':
        # only attempt reconnect every 10 seconds if protocol of type Telnet
        now = time()
        if now - data['last_connect_time'] < 10:
            return False

        data['last_connect_time'] = now
        collectd.info('connecting to %s:%s' % (data['host'], data['port']))
        try:
            data['conn'] = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            data['conn'].connect((data['host'], data['port']))
            return True
        except:
            collectd.error('error connecting socket: %s' % format_exc())
            return False
    else:
        return True
def read_callback():
    log_verbose('Read callback called')
    for port_str in REDIS_PORTS.split(','):
        port=int(port_str)
        info = fetch_info(port)

        if not info:
            collectd.error('redis plugin: No info received')
            continue

        # send high-level values
        dispatch_value(info, 'uptime_in_seconds','gauge',port)
        dispatch_value(info, 'connected_clients', 'gauge',port)
        dispatch_value(info, 'connected_slaves', 'gauge',port)
        dispatch_value(info, 'blocked_clients', 'gauge',port)
        dispatch_value(info, 'evicted_keys', 'gauge',port)
        dispatch_value(info, 'used_memory', 'bytes',port)
        dispatch_value(info, 'changes_since_last_save', 'gauge',port)
        dispatch_value(info, 'total_connections_received', 'counter',port,
                       'connections_received')
        dispatch_value(info, 'total_commands_processed', 'counter',port,
                       'commands_processed')

        # database and vm stats
        for key in info:
            if key.startswith('vm_stats_'):
                dispatch_value(info, key, 'gauge',port)
            if key.startswith('db'):
                dispatch_value(info[key], 'keys', 'gauge',port, '%s-keys' % key)
def plugin_read(unused_input_data=None):
  """Handles collectd's 'read' interface for mlab plugin."""

  vs_prefix = _PROC_VIRTUAL
  vs_dlimits = read_vsys_data('vs_xid_dlimits', _VSYS_FRONTEND_VERSION)
  report_cpuavg_for_system(_PROC_STAT)
  report_meta_metrics(_PROC_PID_STAT)
  uptime = read_system_uptime()
  for entry in os.listdir(vs_prefix):
    entry_path = os.path.join(vs_prefix, entry)
    if not os.path.isdir(entry_path):
      continue

    if entry not in _vs_xid_names:
      init_vserver_xid_names()  # Try reloading names to get new vserver names.
      # Skip, if still not present.
      if entry not in _vs_xid_names:
        collectd.error(('mlab: no vserver name found for xid %s after '
                        'reloading names.') % entry)
        continue

    vs_name = _vs_xid_names[entry]
    if vs_name in _config_exclude_slices:
      # Do not collect any stats for this slice.
      continue

    vs_host = slicename_to_hostname(vs_name)

    report_cpu_for_vserver(vs_host, entry_path)
    report_network_for_vserver(vs_host, entry_path)
    report_limits_for_vserver(vs_host, entry_path)
    report_threads_for_vserver(vs_host, entry_path, uptime)
    if entry in vs_dlimits:
      report_quota_for_vserver(vs_host, vs_dlimits[entry])
def read_vsys_data(command, version):
  """Runs vsys 'command' and returns results as dict.

  See command notes for description of returned data format.

  Args:
    command: str, name of script or command to execute in vsys backend.
    version: int, expected version of backend response.
  Returns:
    dict, results of 'command'.
  """
  # Send request through vsys (for slice context).
  data = read_vsys_data_direct(command)

  if 'data' not in data:
    collectd.error('%s: returned value has no "data" field.' % command)
    return {}

  if 'version' not in data:
    collectd.error('%s: returned value has no "version" field.' % command)
    return {}

  if 'message_type' in data and data['message_type'] != command:
    collectd.error('Returned message_type does not match request.')
    collectd.error('Requested: %s' % command)
    collectd.error('Received : %s' % data['message_type'])
    return {}

  if data['version'] != version:
    msg = '%s: version mismatch: found (%d), expected (%d)' % (
          command, data['version'], version)
    collectd.warning(msg)

  return data['data']
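# For reference, a well-formed vsys backend response carries the three fields
# checked above; the xids and payload here are made up:
#
#   {'version': 1,
#    'message_type': 'vs_xid_dlimits',
#    'data': {'510': {...}, '511': {...}}}
#
# Only the contents of the 'data' field are handed back to the caller.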
def init_stats_cache():
   global white_list

   if os.path.exists(buddy_fname):
      num_buckets = 0
      with open(buddy_fname) as f:
         for line in f:
            match = re_buddyinfo.search(line)
            if not match:
               collectd.error('buddyinfo: unknown line pattern: %s' % (line))
               continue
            node = match.group('node')
            zone = match.group('zone')
            free_pages = match.group('pages').strip().split()
            num_buckets = len(free_pages)
            if node not in node_list:
               node_list.append(node)
            if zone not in zone_list:
               zone_list.append(zone)
            stats_cache[(node, zone, 'val')] = free_pages
            stats_cache[(node, zone, 'ts')] = time.time()
      for i in range(0, num_buckets):
         white_list.append('free_pages_' + str(4*2**i) + 'K')
      collectd.info('buddyinfo: node_list : %s' % (node_list))
      collectd.info('buddyinfo: zone_list : %s' % (zone_list))
      collectd.info('buddyinfo: white_list: %s' % (white_list))
   else:
      collectd.info('buddyinfo: init_stats_cache: path: %s does not exist' 
                    % (buddy_fname))
def get_stats(socket_path):
    """
        Makes two calls to haproxy to fetch server info and server stats.
        Returns a list of (metric name, value, dimensions) tuples, where
        dimensions is a dict of dimension names to values, or None.
    """
    if socket_path is None:
        collectd.error("Socket configuration parameter is undefined. Couldn't get the stats")
        return
    stats = []
    haproxy = HAProxySocket(socket_path)

    try:
        server_info = haproxy.get_server_info()
        server_stats = haproxy.get_server_stats()
    except socket.error:
        collectd.warning(
            'status err Unable to connect to HAProxy socket at %s' %
            socket_path)
        return stats

    for key, val in server_info.iteritems():
        try:
            stats.append((key, int(val), None))
        except (TypeError, ValueError):
            pass
    for statdict in server_stats:
        if not (statdict['svname'].lower() in PROXY_MONITORS or statdict['pxname'].lower() in PROXY_MONITORS):
            continue
        for metricname, val in statdict.items():
            try:
                stats.append((metricname, int(val), {'proxy_name': statdict['pxname'], 'service_name': statdict['svname']}))
            except (TypeError, ValueError):
                pass
    return stats
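# Illustrative return value (proxy names and numbers are examples only):
# info-level entries carry no dimensions, per-proxy entries carry the proxy
# and service names as dimensions:
#
#   [('CurrConns', 12, None),
#    ('scur', 3, {'proxy_name': 'web-frontend', 'service_name': 'FRONTEND'})]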
 def config(self, cfg):
     if "Module.interval" in cfg:
         try:
             self.interval = int(cfg["Module.interval"][0])
             collectd.info("MetricWriteTracker.interval == {}".format(self.interval))
         except ValueError:
             collectd.error("module {0}, interval parameter must be an integer".format(self.__module__))
def read_properties(*cmd):
    properties = AutoVivification()
    out = None
    real_cmd = (CONF['scli_wrap'],CONF['scli_user'],CONF['scli_password']) + cmd
    my_verbose('Executing command: %s %s ******* %s' % (CONF['scli_wrap'], CONF['scli_user'], " ".join(str(v) for v in cmd)))

    try:
        out = subprocess.check_output(real_cmd, stderr=subprocess.STDOUT)
        my_debug('scli output: ' + out)
    except Exception as e:
        collectd.error('ScaleIO: error on executing scli command %s --- %s' %
            (e, traceback.format_exc()))
        return

    if 'Failed to connect to MDM 127.0.0.1:6611' in out:
        my_verbose('plugin is running on non-primary/active MDM, skipping data collection')
        return

    group_name = None
    group_regex = re.compile("^([^\s]+)\s([^:]+)")
    kv_regex = re.compile("^\s+([^\s]+)\s+(.*)$")
    for line in out.split('\n'):
        new_group_match = group_regex.match(line)
        if new_group_match:
            group_name = new_group_match.group(2)
        else:
            kv_match = kv_regex.match(line)
            if kv_match:
                properties[group_name][kv_match.group(1)] = kv_match.group(2)

    my_verbose('Read properties: %s' % (json.dumps(properties)))
    rectify_dict(properties)
    my_debug('Properties after rectify: %s' % (json.dumps(properties)))
    return properties
    def dispatch(self, stats):
        """
        Dispatches the given stats.

        stats should be something like:

        {'plugin': {'plugin_instance': {'type': {'type_instance': <value>, ...}}}}
        """
        if not stats:
            collectd.error("%s: failed to retrieve stats" % self.prefix)
            return

        self.logdebug("dispatching %d new stats :: %s" % (len(stats), stats))
        try:
            for plugin in stats.keys():
                for plugin_instance in stats[plugin].keys():
                    for type in stats[plugin][plugin_instance].keys():
                        type_value = stats[plugin][plugin_instance][type]
                        if not isinstance(type_value, dict):
                            self.dispatch_value(plugin, plugin_instance, type, None, type_value)
                        else:
                            for type_instance in stats[plugin][plugin_instance][type].keys():
                                self.dispatch_value(plugin, plugin_instance,
                                        type, type_instance,
                                        stats[plugin][plugin_instance][type][type_instance])
        except Exception as exc:
            collectd.error("%s: failed to dispatch values :: %s :: %s"
                    % (self.prefix, exc, traceback.format_exc()))
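    # Example of the stats shape this method accepts, mirroring the docstring
    # (the plugin and instance names are illustrative):
    #
    #   self.dispatch({'ceph-main': {'pool-rbd': {'gauge': {'read_bytes_sec': 0,
    #                                                       'op_per_sec': 2}}}})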
def read():
    #collectd.info("read")
    
    for name in CONFIG:
        
      while True:
        #collectd.info("querying: " + name)
        try:
            switch = ENV.get_switch(name)

            v1 = collectd.Values(plugin='wemo')
            v1.type = 'power'
            v1.type_instance = 'power'
            v1.plugin_instance = name

            power = switch.current_power/1000.0

            collectd.info("Got power from %s = %fW" % (name, power))

            v1.values = [power]
            v1.dispatch()
        except UnknownDevice:
            collectd.error("Unknown device: " + name)
        except ConnectionError:
            ENV.start()
            ENV.discover()
            continue

        break

    env = None
def dispatch_data(data, composite_key, type, plugin_instance=None, type_instance=None):
    """Dispatch the data"""
    log_verbose('In Dispatch')
    log_verbose('===========')

    if plugin_instance is None:
        plugin_instance = 'unknown_splunkData'
        collectd.error('splunkData plugin: Data key not found: %s' % composite_key)

    if type_instance is None:
        type_instance = composite_key

    # split the composite key into its components
    keys = composite_key.split('/')

    # assign the value of the first data item to value;
    # this speeds up the recursive lookup done below in the for loop
    key = keys.pop(0)
    value = data[key]

    # recurse until the final value is found
    for key in keys:
        value = value[key]

    log_verbose('Sending Value: %s=%s' % (type_instance, value))

    val = collectd.Values(plugin='splunkData')
    val.type = type
    val.type_instance = type_instance
    val.plugin_instance = plugin_instance
    val.values = [value]
    val.dispatch()
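# Usage sketch -- the payload and composite key are hypothetical:
#
#   payload = {'messages': {'indexed': 1534}}
#   dispatch_data(payload, 'messages/indexed', 'gauge', 'indexer01')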
def logger(t, msg):
    if t == "err":
        collectd.error("%s: %s" % (NAME, msg))
    if t == "warn":
        collectd.warning("%s: %s" % (NAME, msg))
    elif t == "verb" and VERBOSE_LOGGING == True:
        collectd.info("%s: %s" % (NAME, msg))
def read_callback():
    """Get stats for all the servers in the cluster."""
    for conf in CONFIGS:
        for host in conf['hosts']:
            try:
                zk = ZooKeeperServer(host, conf['port'])
                stats = zk.get_stats()
                for k, v in stats.items():
                    try:
                        val = collectd.Values(plugin='zookeeper',
                                              meta={'0': True})
                        val.type = 'counter' if k in COUNTERS else 'gauge'
                        val.type_instance = k
                        val.values = [v]
                        val.plugin_instance = conf['instance']
                        val.dispatch()
                    except (TypeError, ValueError):
                        collectd.error(('error dispatching stat; host=%s, '
                                        'key=%s, val=%s') % (host, k, v))
                        pass
            except socket.error:
                # Ignore because the cluster can still work even
                # if some servers fail completely.
                # This error should be also visible in a variable
                # exposed by the server in the statistics.
                log('unable to connect to server "%s"' % (host))

    return stats
def read(data=None):
    starttime = time.time()

    gnocchi = client.Client(session=keystone_session)
    try:
        status = gnocchi.status.get()
        metric = collectd.Values()
        metric.plugin = 'gnocchi_status'
        metric.interval = INTERVAL
        metric.type = 'gauge'
        metric.type_instance = 'measures'
        metric.values = [status['storage']['summary']['measures']]
        metric.dispatch()

        metric = collectd.Values()
        metric.plugin = 'gnocchi_status'
        metric.interval = INTERVAL
        metric.type = 'gauge'
        metric.type_instance = 'metrics'
        metric.values = [status['storage']['summary']['metrics']]
        metric.dispatch()
    except Exception as err:
        collectd.error(
            'gnocchi_status: Exception getting status: {}'
            .format(err))

    timediff = time.time() - starttime
    if timediff > INTERVAL:
        collectd.warning(
            'gnocchi_status: Took: {} > {}'
            .format(round(timediff, 2), INTERVAL))
 def zope_read(self, data=None):
     self.logger('verb', 'read_callback')
     for metric in self._metrics:
         try:
             s = self.connect()
         except:
             collectd.error('Failed to connect to %s:%s' % (self._zmonitor_hostname,
                                                          self._zmonitor_port))
             return
         self.logger('verb', 'fetch %s' % metric)
         metricid = self.metric_configs[metric].get('alias', metric)
         s.sendall("%s\n" % metricid)
         output = None
         while 1:
             data = s.recv(1024)
             if data == "":
                 break
             else:
                 output = data
         s.close()
         if output is not None:
             data = self.strip_data(output, metric)
         self.logger('verb', 'got %s' % data)
         if data == '' or data is None:
             collectd.error('Received no data for %s' % metric)
             return
         values = collectd.Values(type=self.metric_configs[metric]['type'], plugin='zope')
         values.dispatch(plugin_instance='%s' % self._cluster_name,
                         type_instance=self.metric_configs[metric]['type_instance'],
                         values=(data, ))
def read_callback():
    log_verbose('Read callback called')
    info = fetch_info()

    if not info:
        collectd.error('redis plugin: No info received')
        return

    # send high-level values
    dispatch_value(info, 'uptime_in_seconds','gauge')
    dispatch_value(info, 'connected_clients', 'gauge')
    dispatch_value(info, 'connected_slaves', 'gauge')
    dispatch_value(info, 'blocked_clients', 'gauge')
    dispatch_value(info, 'used_memory', 'bytes')
    dispatch_value(info, 'mem_fragmentation_ratio', 'gauge')
    dispatch_value(info, 'changes_since_last_save', 'gauge')
    dispatch_value(info, 'total_connections_received', 'counter',
                   'connections_received')
    dispatch_value(info, 'total_commands_processed', 'counter',
                   'commands_processed')
    dispatch_value(info, 'used_cpu_sys', 'counter', 'cpu_sys')
    dispatch_value(info, 'used_cpu_user', 'counter', 'cpu_user')
    dispatch_value(info, 'used_cpu_sys_children', 'counter', 'cpu_sys_children')
    dispatch_value(info, 'used_cpu_user_children', 'counter', 'cpu_user_children')
    
    # database and vm stats
    for key in info:
        if key.startswith('vm_stats_'):
            dispatch_value(info, key, 'gauge')
        if key.startswith('db'):
            dispatch_value(info[key], 'keys', 'gauge', '%s-keys' % key)
 def logger(self, t, msg):
     if t == 'err':
         collectd.error('%s: %s' % (self.name, msg))
     elif t == 'warn':
         collectd.warning('%s: %s' % (self.name, msg))
     elif t == 'verb' and self.verbose:
         collectd.info('%s: %s' % (self.name, msg))
    def get_stats(self):
        """Retrieves stats from ceph pgs"""

        ceph_cluster = "%s" % self.cluster

        data = { ceph_cluster: { 'pg': { } }  }
        output = None
        try:
            output = subprocess.check_output('ceph pg dump --format json',shell=True)
        except Exception as exc:
            collectd.error("ceph-pg: failed to ceph pg dump :: %s :: %s"
                    % (exc, traceback.format_exc()))
            return

        if output is None:
            collectd.error('ceph-pg: failed to run ceph pg dump :: output was None')
            return
        
        json_data = json.loads(output)

        pg_data = data[ceph_cluster]['pg']
        # number of pgs in each possible state
        for pg in json_data['pg_stats']:
            for state in pg['state'].split('+'):
                if state not in pg_data:
                    pg_data[state] = 0
                pg_data[state] += 1
    
        return data
 def err(self, message):
     """ Log an error message """
     fmsg = '%s:ERR %s' % (self.plugin_name, message)
     if not self.debug_mode:
         collectd.error(fmsg)
     else:
         print(fmsg)
def fetch_stats(conf):
    try:
        result = json.load(urllib2.urlopen(conf['mesos_url'], timeout=10))
    except urllib2.URLError, e:
        collectd.error('%s plugin: Error connecting to %s - %r' %
                       (PREFIX, conf['mesos_url'], e))
        return None
    def get_stats(self):
        """Retrieves stats from ceph buckets"""

        ceph_cluster = "%s-%s" % (self.prefix, self.cluster)

        data = { ceph_cluster: {} }

        stats_output = None
        try:
            stats_output = subprocess.check_output('radosgw-admin bucket stats', shell=True)
        except Exception as exc:
            collectd.error("ceph-rgw-bucket: failed to ceph pool stats :: %s :: %s"
                    % (exc, traceback.format_exc()))
            return

        if stats_output is None:
            collectd.error('ceph-rgw-bucket: failed to run radosgw-admin bucket stats :: output was None')
            return

        json_stats_data = json.loads(stats_output)

        # rgw bucket stats results

        for bucket in json_stats_data:
            bucket_key = "bucket-%s" % bucket['bucket']
            data[ceph_cluster][bucket_key] = {}
            bucket_data = data[ceph_cluster][bucket_key]
            for stat in ('size_kb', 'size_kb_actual', 'num_objects'):
                if 'rgw.main' in bucket['usage']:
                    bucket_data[stat] = bucket['usage']['rgw.main'].get(stat, 0)

        return data
def dispatch_value(info, key, type, plugin_instance=None, type_instance=None):
    """Read a key from info response data and dispatch a value"""


    if key not in info:
        collectd.warning('redis_info plugin: Info key not found: %s' % key)
        return

    if plugin_instance is None:
        plugin_instance = 'unknown redis'
        collectd.error('redis_info plugin: plugin_instance is not set, Info key: %s' % key)

    if not type_instance:
        type_instance = key

    try:
        value = int(info[key])
    except ValueError:
        value = float(info[key])

    log_verbose('Sending value: %s=%s' % (type_instance, value))

    val = collectd.Values(plugin='redis_info')
    val.type = type
    val.type_instance = type_instance
    val.plugin_instance = plugin_instance
    val.values = [value]
    val.meta={'0': True}
    val.dispatch()
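# Usage sketch for dispatch_value -- the info dict is whatever the redis INFO
# fetch returned, and the plugin_instance value is illustrative:
#
#   dispatch_value(info, 'used_memory', 'bytes', plugin_instance='6379')
#   dispatch_value(info, 'total_commands_processed', 'counter',
#                  plugin_instance='6379', type_instance='commands_processed')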
 def error(self, msg):
     collectd.error('{name}: {msg}'.format(name=PLUGIN_NAME, msg=msg))
def config_callback(conf):
    """Receive configuration block"""
    project_name = "demo"
    project_domainid = "default"
    user_domainid = "default"
    region_name = None
    interval = 10
    testing = False
    ssl_verify = True
    OPENSTACK_CLIENT = {}
    plugin_conf = {}
    custom_dimensions = {}
    http_timeout = None
    request_batch_size = 5
    nova_list_servers_search_opts = {}

    query_server_metrics = True
    query_hypervisor_metrics = True

    required_keys = frozenset(("authurl", "username", "password"))

    for node in conf.children:
        try:
            if node.key.lower() in required_keys:
                plugin_conf[node.key.lower()] = node.values[0]
            elif node.key.lower() == "projectname":
                project_name = node.values[0]
            elif node.key.lower() == "projectdomainid":
                project_domainid = node.values[0]
            elif node.key.lower() == "userdomainid":
                user_domainid = node.values[0]
            elif node.key.lower() == "regionname":
                if node.values[0]:
                    region_name = node.values[0]
            elif node.key.lower() == "dimension":
                if len(node.values) == 2:
                    custom_dimensions.update({node.values[0]: node.values[1]})
                else:
                    collectd.warning(
                        "WARNING: Check configuration setting for %s" %
                        node.key)
            elif node.key.lower() == "interval":
                interval = node.values[0]
            elif node.key.lower() == "sslverify":
                ssl_verify = node.values[0]
            elif node.key.lower() == "httptimeout":
                http_timeout = node.values[0]
            elif node.key.lower() == "requestbatchsize":
                request_batch_size = int(node.values[0])
            elif node.key.lower() == "queryservermetrics":
                query_server_metrics = node.values[0]
            elif node.key.lower() == "queryhypervisormetrics":
                query_hypervisor_metrics = node.values[0]
            elif node.key.lower() == "novalistserverssearchopts":
                nova_list_servers_search_opts = yaml.load(
                    node.values[0], Loader=yaml.FullLoader)
                if not isinstance(nova_list_servers_search_opts, dict):
                    raise TypeError(
                        "NovaListSeverSearchOpts must be a string representation of yaml mapping. Received {0}."
                        .format(node.values[0]))
            elif node.key.lower() == "testing":
                testing = node.values[0]
        except Exception as e:
            collectd.error(
                "Failed to load the configuration {0} due to {1}".format(
                    node.key, e))
            raise e

    OPENSTACK_CLIENT["query_server_metrics"] = query_server_metrics
    OPENSTACK_CLIENT["query_hypervisor_metrics"] = query_hypervisor_metrics

    for key in required_keys:
        try:
            plugin_conf[key]
        except KeyError:
            raise KeyError("Missing required config setting: %s" % key)

    try:
        novametrics = NovaMetrics(
            auth_url=plugin_conf["authurl"],
            username=plugin_conf["username"],
            password=plugin_conf["password"],
            project_name=project_name,
            project_domain_id=project_domainid,
            user_domain_id=user_domainid,
            region_name=region_name,
            ssl_verify=ssl_verify,
            http_timeout=http_timeout,
            request_batch_size=request_batch_size,
            list_servers_search_opts=nova_list_servers_search_opts,
        )
        OPENSTACK_CLIENT["nova"] = novametrics

        cindermetrics = CinderMetrics(
            auth_url=plugin_conf["authurl"],
            username=plugin_conf["username"],
            password=plugin_conf["password"],
            project_name=project_name,
            project_domain_id=project_domainid,
            user_domain_id=user_domainid,
            region_name=region_name,
            ssl_verify=ssl_verify,
            http_timeout=http_timeout,
        )
        OPENSTACK_CLIENT["cinder"] = cindermetrics

        neutronmetrics = NeutronMetrics(plugin_conf["authurl"],
                                        plugin_conf["username"],
                                        plugin_conf["password"], project_name,
                                        project_domainid, user_domainid,
                                        region_name, ssl_verify, http_timeout)
        OPENSTACK_CLIENT["neutron"] = neutronmetrics
        OPENSTACK_CLIENT["custdims"] = custom_dimensions

    except Exception as e:
        collectd.error(
            "Failed to authenticate Openstack client due to {0}".format(e))

    if testing:
        return plugin_conf, OPENSTACK_CLIENT

    collectd.register_read(read_callback,
                           interval,
                           data=OPENSTACK_CLIENT,
                           name=project_name)
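# A hedged example of the corresponding collectd configuration; the module
# name, endpoint and credentials are placeholders:
#
#   <Module "openstack_metrics">
#       AuthURL "http://keystone:5000/v3"
#       Username "collectd"
#       Password "secret"
#       ProjectName "demo"
#       RegionName "RegionOne"
#       Interval 30
#       Dimension "environment" "dev"
#   </Module>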
    processed_stats = docker_dependency_resolver.get_DockerFormatter().process_stats(raw_stats)

    for container_name, container_stats in processed_stats.iteritems():
        timestamp = None

        if 'read' in container_stats:
            timestamp = container_stats['read']
            del (container_stats['read'])

        for metric_name, metric_value in container_stats.iteritems():
            collectd_dependency_resolver.get_Exporter().export(container_name, metric_name, metric_value, timestamp)

    docker_dependency_resolver.get_ContainerStatsStreamPool().keep_streams_running(running_container_names)


def init():
    collectd.register_read(read)


try:
    collectd_dependency_resolver = CollectdDependencyResolver.get_Resolver(collectd)

    docker_dependency_resolver = DockerDependencyResolver.get_Resolver(collectd_dependency_resolver.get_Logger(), socket_url, timeout)

    collectd.register_config(configure)
    collectd.register_init(init)

except Exception as exception:
    collectd.error('collectd-docker-stats-plugin: plugin stopped because of exception: {}'.format(exception.message))
    def kairosdb_write(self, values, data=None):
        # noinspection PyBroadException
        try:
            # collectd.info(repr(v))
            if values.type not in self.types:
                collectd.warning(
                    'kairosdb_writer: do not know how to handle type %s. do you have all your types.db files configured?'
                    % values.type)
                return

            v_type = self.types[values.type]

            if len(v_type) != len(values.values):
                collectd.warning(
                    'kairosdb_writer: differing number of values for type %s' %
                    values.type)
                return

            hostname = values.host.replace('.', self.host_separator)

            tags = self.tags_map.copy()
            if self.add_host_tag:
                tags['host'] = hostname

            plugin = values.plugin
            plugin_instance = ''
            if values.plugin_instance:
                plugin_instance = self.sanitize_field(values.plugin_instance)

            type_name = values.type
            type_instance = ''
            if values.type_instance:
                type_instance = self.sanitize_field(values.type_instance)

            # collectd.info('plugin: %s plugin_instance: %s type: %s type_instance: %s' % (plugin, plugin_instance, type_name, type_instance))

            default_name = self.metric_name % {
                'host': hostname,
                'plugin': plugin,
                'plugin_instance': plugin_instance,
                'type': type_name,
                'type_instance': type_instance
            }

            if self.pluginsToFormatter and plugin in self.pluginsToFormatter:
                name, tags = self.pluginsToFormatter[plugin].format_metric(
                    self.metric_name, tags, hostname, plugin, plugin_instance,
                    type_name, type_instance)
            elif self.formatter:
                name, tags = self.formatter.format_metric(
                    self.metric_name, tags, hostname, plugin, plugin_instance,
                    type_name, type_instance)
            else:
                name = default_name

            # Remove dots for missing pieces
            name = name.replace('..', '.')
            name = name.rstrip('.')

            # collectd.info('Metric: %s' % name)

            type_list = list(v_type)
            values_list = list(values.values)

            if plugin in self.convert_rates:
                i = 0
                type_list = []
                values_list = []
                for value in values.values:
                    if self.is_counter(v_type[i]):
                        counter = "%s.%s" % (default_name, v_type[i][0])

                        with data['lock']:
                            if value is not None:
                                if counter in self.counters_map:
                                    old_value = self.counters_map[counter]
                                    try:
                                        rate = (value - old_value['value']) / (
                                            values.time -
                                            old_value['timestamp'])
                                        values_list.append(rate)
                                        type_list.append([
                                            v_type[i][0] + '_rate', 'GAUGE',
                                            '0', 'U'
                                        ])
                                    except ZeroDivisionError:
                                        collectd.error(
                                            "Timestamp values are identical (caused divide by error) for %s"
                                            + default_name)
                                self.counters_map[counter] = {
                                    'value': value,
                                    'timestamp': values.time
                                }
                    else:
                        values_list.append(value)
                        type_list.append(v_type[i])
                    i += 1

            if self.protocol == 'http' or self.protocol == 'https':
                self.kairosdb_write_http_metrics(data, type_list, values.time,
                                                 values_list, name, tags)
            else:
                self.kairosdb_write_telnet_metrics(data, type_list,
                                                   values.time, values_list,
                                                   name, tags)
        except Exception:
            collectd.error(traceback.format_exc())
def init_func():
    """Init the plugin."""

    # do nothing till config is complete.
    if obj.config_complete() is False:
        return pc.PLUGIN_PASS

    if obj._node_ready is False:
        obj.node_ready()
        return pc.PLUGIN_PASS

    obj.hostname = socket.gethostname()

    # Determine the full list of logical cpus for this host
    obj.logical_cpus = get_logical_cpus()

    # Determine the subset of logical platform cpus that we want to monitor
    obj.cpu_list = get_platform_cpulist()
    if obj.debug:
        collectd.info('%s configured platform cpu list: %r'
                      % (PLUGIN_DEBUG, obj.cpu_list))

    # Ensure that the platform cpus are a subset of actual logical cpus
    if not (all(x in obj.logical_cpus for x in obj.cpu_list)):
        collectd.error('%s cpulist %r is not a subset of host logical cpus %r'
                       % (PLUGIN, obj.cpu_list, obj.logical_cpus))
        return pc.PLUGIN_FAIL

    # Monitor all logical cpus if no platform cpus have been specified
    if not obj.cpu_list:
        obj.cpu_list = obj.logical_cpus
    obj.number_platform_cpus = len(obj.cpu_list)

    collectd.info('%s found %d cpus total; monitoring %d cpus, cpu list: %s'
                  % (PLUGIN,
                     len(obj.logical_cpus),
                     obj.number_platform_cpus,
                     pc.format_range_set(obj.cpu_list)))

    # Check schedstat version
    version = 0
    try:
        with open(SCHEDSTAT, 'r') as f:
            line = f.readline()
            match = re_schedstat_version.search(line)
            if match:
                version = int(match.group(1))
    except Exception as err:
        collectd.error('%s Cannot read schedstat, error=%s' % (PLUGIN, err))
        return pc.PLUGIN_FAIL
    if version != SCHEDSTAT_SUPPORTED_VERSION:
        obj.schedstat_supported = False
        collectd.error('%s unsupported schedstat version [%d]'
                       % (PLUGIN, version))
        return pc.PLUGIN_FAIL

    # Gather initial cputime state information.
    update_cpu_data(init=True)

    obj.init_completed()
    return pc.PLUGIN_PASS
def update_cpu_data(init=False):
    """Gather cputime info and Update platform cpu occupancy metrics.

    This gathers current per-cpu cputime information from schedstats
    and per-cgroup cputime information from cgroup cpuacct.

    This calculates the average cpu occupancy of the platform cores
    since this routine was last run.
    """

    # Get epoch time in floating seconds
    now = time.time()

    # Calculate elapsed time delta since last run
    obj.elapsed_ms = float(pc.ONE_THOUSAND) * (now - obj._t0[TIMESTAMP])

    # Prevent calling this routine too frequently (<= 1 sec)
    if not init and obj.elapsed_ms <= 1000.0:
        return

    t1 = {}
    t1[TIMESTAMP] = now
    if obj.schedstat_supported:
        # Get current per-cpu cumulative cputime usage from /proc/schedstat.
        cputimes = read_schedstat()
        for cpu in obj.cpu_list:
            t1[cpu] = cputimes[cpu]
    else:
        return

    # Get current cpuacct usages based on cgroup hierarchy
    t1_cpuacct = get_cpuacct()

    # Refresh the k8s pod information if we have discovered new cgroups
    cg_pods = set(t1_cpuacct[pc.GROUP_PODS].keys())
    if not cg_pods.issubset(obj.k8s_pods):
        if obj.debug:
            collectd.info('%s Refresh k8s pod information.' % (PLUGIN_DEBUG))
        obj.k8s_pods = set()
        pods = obj._k8s_client.kube_get_local_pods()
        for i in pods:
            # NOTE: parent pod cgroup name contains annotation config.hash as
            # part of its name, otherwise it contains the pod uid.
            uid = i.metadata.uid
            if ((i.metadata.annotations) and
                    (pc.POD_ANNOTATION_KEY in i.metadata.annotations)):
                hash_uid = i.metadata.annotations.get(pc.POD_ANNOTATION_KEY,
                                                      None)
                if hash_uid:
                    if obj.debug:
                        collectd.info('%s POD_ANNOTATION_KEY: '
                                      'hash=%s, uid=%s, '
                                      'name=%s, namespace=%s, qos_class=%s'
                                      % (PLUGIN_DEBUG,
                                         hash_uid,
                                         i.metadata.uid,
                                         i.metadata.name,
                                         i.metadata.namespace,
                                         i.status.qos_class))
                    uid = hash_uid

            obj.k8s_pods.add(uid)
            if uid not in obj._cache:
                obj._cache[uid] = pc.POD_object(i.metadata.uid,
                                                i.metadata.name,
                                                i.metadata.namespace,
                                                i.status.qos_class)
    # Remove stale _cache entries
    remove_uids = set(obj._cache.keys()) - obj.k8s_pods
    for uid in remove_uids:
        del obj._cache[uid]

    # Save initial state information
    if init:
        obj._t0 = copy.deepcopy(t1)
        obj._t0_cpuacct = copy.deepcopy(t1_cpuacct)
        return

    # Aggregate cputime delta for platform logical cpus using integer math
    cputime_ms = 0.0
    for cpu in obj.cpu_list:
        # Paranoia check, we should never hit this.
        if cpu not in obj._t0:
            collectd.error('%s cputime initialization error' % (PLUGIN))
            break
        cputime_ms += float(t1[cpu] - obj._t0[cpu])
    cputime_ms /= float(pc.ONE_MILLION)

    # Calculate average occupancy of platform logical cpus
    occupancy = 0.0
    if obj.number_platform_cpus > 0 and obj.elapsed_ms > 0:
        occupancy = float(pc.ONE_HUNDRED) * float(cputime_ms) \
            / float(obj.elapsed_ms) / obj.number_platform_cpus
    else:
        occupancy = 0.0
    obj._data[PLATFORM_CPU_PERCENT] = occupancy
    if obj.debug:
        collectd.info('%s %s elapsed = %.1f ms, cputime = %.1f ms, '
                      'n_cpus = %d, occupancy = %.2f %%'
                      % (PLUGIN_DEBUG,
                         PLATFORM_CPU_PERCENT,
                         obj.elapsed_ms,
                         cputime_ms,
                         obj.number_platform_cpus,
                         occupancy))

    # Calculate cpuacct delta for cgroup hierarchy, dropping transient cgroups
    cpuacct = {}
    for i in t1_cpuacct.keys():
        cpuacct[i] = {}
        for k, v in t1_cpuacct[i].items():
            if i in obj._t0_cpuacct and k in obj._t0_cpuacct[i]:
                cpuacct[i][k] = v - obj._t0_cpuacct[i][k]
            else:
                cpuacct[i][k] = v

    # Summarize cpuacct usage for various groupings
    for g in pc.OVERALL_GROUPS:
        cpuacct[pc.GROUP_OVERALL][g] = 0.0

    # Aggregate cpuacct usage by K8S pod
    for uid in cpuacct[pc.GROUP_PODS]:
        acct = cpuacct[pc.GROUP_PODS][uid]
        if uid in obj._cache:
            pod = obj._cache[uid]
        else:
            collectd.warning('%s uid %s not found' % (PLUGIN, uid))
            continue

        # K8S platform system usage, i.e., essential: kube-system
        if pod.namespace in pc.K8S_NAMESPACE_SYSTEM:
            cpuacct[pc.GROUP_OVERALL][pc.GROUP_K8S_SYSTEM] += acct

        # K8S platform addons usage, i.e., non-essential: monitor, openstack
        if pod.namespace in pc.K8S_NAMESPACE_ADDON:
            cpuacct[pc.GROUP_OVERALL][pc.GROUP_K8S_ADDON] += acct

    # Calculate base cpuacct usage (i.e., base tasks, exclude K8S and VMs)
    # e.g., docker, system.slice, user.slice
    for name in cpuacct[pc.GROUP_FIRST]:
        if name in pc.BASE_GROUPS:
            cpuacct[pc.GROUP_OVERALL][pc.GROUP_BASE] += \
                cpuacct[pc.GROUP_FIRST][name]
        elif name not in pc.BASE_GROUPS_EXCLUDE:
            collectd.warning('%s could not find cgroup: %s' % (PLUGIN, name))

    # Calculate platform cpuacct usage (this excludes apps)
    for g in pc.PLATFORM_GROUPS:
        cpuacct[pc.GROUP_OVERALL][pc.GROUP_PLATFORM] += \
            cpuacct[pc.GROUP_OVERALL][g]

    # Calculate cgroup based occupancy for overall groupings
    for g in pc.OVERALL_GROUPS:
        cputime_ms = \
            float(cpuacct[pc.GROUP_OVERALL][g]) / float(pc.ONE_MILLION)
        occupancy = float(pc.ONE_HUNDRED) * float(cputime_ms) \
            / float(obj.elapsed_ms) / obj.number_platform_cpus
        obj._data[g] = occupancy
        if obj.debug:
            collectd.info('%s %s elapsed = %.1f ms, cputime = %.1f ms, '
                          'n_cpus = %d, occupancy = %.2f %%'
                          % (PLUGIN_DEBUG,
                             g,
                             obj.elapsed_ms,
                             cputime_ms,
                             obj.number_platform_cpus,
                             occupancy))

    # Update t0 state for the next sample collection
    obj._t0 = copy.deepcopy(t1)
    obj._t0_cpuacct = copy.deepcopy(t1_cpuacct)
def load_json(resp, url):
    try:
        return json.loads(resp)
    except ValueError, e:
        collectd.error("Error parsing JSON for API call (%s) %s" % (e, url))
        return None
def configure_callback(conf):
    """
    Configuration callback. These are accepted configs:
    Server: hostname or ip adresss
    Port: HEC port
    Token: HEC token
    QueueSize: Number, maximum metrics buffer
    SSL: true to use HTTPS
    VerifySSL: True to enable SSL verification
    CertFile: Public key of the signing authority
    Dimension: specify dimension for metrics
      i.e.  Dimension "location:phoenix"
            Dimension "type:dev"
    SplunkMetricTransform: true to use Splunk metric format
    :param conf: configration tree
    """
    dimension_list = []
    for node in conf.children:
        config_key = node.key.lower()
        if config_key == 'server':
            CONFIG['server'] = node.values[0]
        elif config_key == 'port':
            try:
                CONFIG['port'] = int(node.values[0])
            except Exception:
                collectd.error('Invalid type of Port, number is required.')
        elif config_key == 'token':
            CONFIG['token'] = node.values[0]
        elif config_key == 'ssl':
            ssl_val = node.values[0]
            if ssl_val in ['1', 'True']:
                CONFIG['ssl'] = True
            elif ssl_val in ['0', 'False']:
                CONFIG['ssl'] = False
            else:
                collectd.error('Invalid type of ssl, boolean is required.')
        elif config_key == 'verifyssl':
            ssl_val = node.values[0]
            if ssl_val in ['1', 'True']:
                CONFIG['verify_ssl'] = True
            elif ssl_val in ['0', 'False']:
                CONFIG['verify_ssl'] = False
            else:
                collectd.error('Invalid type of VerifySSL, boolean is required.')
        elif config_key == 'queuesize':
            try:
                queue_size = int(node.values[0])
                CONFIG['queue_size'] = queue_size
            except Exception:
                collectd.error(
                    'Invalid type of queue size, number is required.')
        elif config_key == 'batchsize':
            try:
                batch_size = int(node.values[0])
                CONFIG['batch_size'] = batch_size
            except Exception:
                collectd.error(
                    'Invalid type of batch size, number is required.')
        elif config_key == 'certfile':
            CONFIG['cert_file'] = node.values[0]
        elif config_key == 'dimension':
            # if dimension value is empty, we continue
            if (len(node.values) == 0):
                collectd.error("Dimension value is empty")
                continue

            try:
                (key, value) = node.values[0].split(':')
            except ValueError:
                collectd.error("Invalid dimension values: %s" % (node.values))
                continue
            dimension_list.append('%s=%s' % (key, value))
        elif config_key == 'splunkmetrictransform':
            should_transform = node.values[0]
            if should_transform in ['1', 'True']:
                CONFIG['splunk_metric_transform'] = True
            elif should_transform in ['0', 'False']:
                CONFIG['splunk_metric_transform'] = False
            else:
                collectd.error(
                    'Invalid type of splunk metric transform, boolean is required'
                )
        else:
            collectd.error('Not supported config key: %s' % (config_key))

    CONFIG[DIMENSION_LIST_KEY] = dimension_list
    collectd.debug("Setting configuration completed.")
    collectd.debug("configuration: {}".format(CONFIG))
 def get_stats(self):
     collectd.error('Not implemented, should be subclassed')
            data[ceph_cluster][osd_id]['snap_trim_queue_len'] = osd[
                'snap_trim_queue_len']
            data[ceph_cluster][osd_id]['num_snap_trimming'] = osd[
                'num_snap_trimming']
            data[ceph_cluster][osd_id]['apply_latency_ms'] = osd[
                'fs_perf_stat']['apply_latency_ms']
            data[ceph_cluster][osd_id]['commit_latency_ms'] = osd[
                'fs_perf_stat']['commit_latency_ms']

        return data


try:
    plugin = CephPGPlugin()
except Exception as exc:
    collectd.error("ceph-pg: failed to initialize ceph pg plugin :: %s :: %s" %
                   (exc, traceback.format_exc()))


def configure_callback(conf):
    """Received configuration information"""
    plugin.config_callback(conf)
    collectd.register_read(read_callback, plugin.interval)


def read_callback():
    """Callback triggerred by collectd on read"""
    plugin.read_callback()


collectd.register_init(CephPGPlugin.reset_sigchld)
collectd.register_config(configure_callback)
def _send_data(config):
    protocol = 'http'
    if config['ssl'] is True:
        protocol = 'https'

    server_uri = '%s://%s:%s/services/collector' % (protocol, config['server'],
                                                    config['port'])

    headers = ('Authorization: Splunk ' +
               config['token'] if CURRENT_OS == 'darwin' else {
                   'Authorization': 'Splunk ' + config['token']
               })
    metrics = []
    while True:
        if not metrics:
            count = 0
            start = time.time()
            timeout = config['timeout'] / 2
            while (time.time() - start <=
                   timeout) and count < config['batch_size']:
                # ITOA-8109: Queue api comes with condition/lock menchanism that handles the
                # case when queue is empty. If queue is empty then it puts this resource to waiting.
                # this way we are not in a infinite loop.
                # source: https://hg.python.org/cpython/file/3a1db0d2747e/Lib/Queue.py#l150
                try:
                    value = config['metric_queue'].get(timeout=timeout)
                    new_metrics = _build_splunk_metrics(value)
                    for m in new_metrics:
                        metrics.append(json.dumps(m))
                        count += 1
                except Queue.Empty:
                    pass

        # If there is message in queue
        try:
            payload = ''.join(metrics)
            collectd.info(
                "payload data to be sent to Splunk: {}".format(payload))
            if config['verify_ssl'] is False:
                if CURRENT_OS == 'darwin':
                    args = [
                        'curl', '-k', server_uri, '-H', headers, '-d', payload
                    ]
                    process = subprocess.Popen(args,
                                               shell=False,
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)
                    stdout, stderr = process.communicate()
                else:
                    response = requests.post(server_uri,
                                             data=payload,
                                             headers=headers,
                                             verify=False)
                    if response.status_code != requests.codes.ok:
                        collectd.error(
                            'Failed sending metrics to Splunk. Response code:{}, response content:{}'
                            .format(response.status_code, response.content))

            else:
                if CURRENT_OS == 'darwin':
                    args = [
                        'curl', '-k', '--cert', config['cert_file'],
                        server_uri, '-H', headers, '-d', payload
                    ]
                    process = subprocess.Popen(args,
                                               shell=False,
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)
                    stdout, stderr = process.communicate()
                else:
                    response = requests.post(server_uri,
                                             data=payload,
                                             headers=headers,
                                             verify=config['cert_file'])
                    if response.status_code != requests.codes.ok:
                        collectd.error(
                            'Failed sending metric to Splunk. Response code:{}, response content:{}'
                            .format(response.status_code, response.content))

            if config['disable_ssl_warning'] and CURRENT_OS != 'darwin':
                requests.packages.urllib3.disable_warnings(
                    InsecureRequestWarning)
                config['disable_ssl_warning'] = False
        except Exception, e:
            collectd.error('Failed sending metric to Splunk HEC: {}'.format(
                str(e)))
            # Try again in 3 seconds
            time.sleep(3)

        metrics = []
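

# Illustrative sketch only: _send_data() above expects a config dict with the keys
# it reads (server, port, token, ssl, verify_ssl, cert_file, disable_ssl_warning,
# timeout, batch_size, metric_queue) and a queue fed by a collectd write callback.
# The names hec_config and write_callback are assumptions for illustration.
import threading
import Queue

hec_config = {
    'server': 'splunk.example.com',      # placeholder HEC endpoint
    'port': '8088',
    'token': 'REPLACE-WITH-HEC-TOKEN',   # placeholder token
    'ssl': True,
    'verify_ssl': False,
    'cert_file': None,
    'disable_ssl_warning': True,
    'timeout': 60,
    'batch_size': 100,
    'metric_queue': Queue.Queue(),
}


def write_callback(vl, data=None):
    # hand the collectd value list to the sender loop through the queue
    hec_config['metric_queue'].put(vl)


sender = threading.Thread(target=_send_data, args=(hec_config,))
sender.daemon = True    # do not block collectd shutdown
sender.start()
collectd.register_write(write_callback)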
    def read(self):
        """
        read() method collects the data from libvirt and dispatch it
        """
        self.establish_connection()

        if self.error == FAILURE:
            collectd.error("Empty stats dispatch, failed to open connection.")
            return

        for domain in self.conn.listAllDomains(0):
            if not domain.isActive():
                collectd.warning("Failed to collectd interface "
                                 "stats for VM %s, VM is not running!" %
                                 domain.name())
                continue
            tree = ElementTree.fromstring(domain.XMLDesc())
            interfaces = [
                i.get("dev") for i in tree.findall('devices/interface/target')
            ]

            total_rx_pkts = 0
            total_tx_pkts = 0
            total_rx_drops = 0
            total_tx_drops = 0
            total_rx_bytes = 0
            total_tx_bytes = 0

            for iface in interfaces:
                if iface:
                    collectd.info(
                        "Collecting stats for '%s' interface of VM: %s" %
                        (iface, domain.name()))
                    nic_data = self.collectd_nic_stats(domain, iface)
                    self.prev_iface_data[nic_data[IFACE_NAME]] = deepcopy(
                        nic_data)
                    dispatch(nic_data)
                    collectd.info("Data for interface: %s of VM: %s is "
                                  "dispatched" % (iface, domain.name()))

                    total_rx_pkts = total_rx_pkts + nic_data[RX_PKTS]
                    total_tx_pkts = total_tx_pkts + nic_data[TX_PKTS]
                    total_rx_drops = total_rx_drops + nic_data[RX_DROPS]
                    total_tx_drops = total_tx_drops + nic_data[TX_DROPS]
                    total_rx_bytes = total_rx_bytes + nic_data[RX_BYTES]
                    total_tx_bytes = total_tx_bytes + nic_data[TX_BYTES]

            interface = {}
            interface[TYPE] = AGGREGATE
            interface[RX_PKTS] = total_rx_pkts
            interface[TX_PKTS] = total_tx_pkts
            interface[RX_DROPS] = total_rx_drops
            interface[TX_DROPS] = total_tx_drops
            interface[RX_BYTES] = total_rx_bytes
            interface[TX_BYTES] = total_tx_bytes
            self.add_aggregate(domain, interface)
            self.prev_iface_agg[domain.name()] = interface
            dispatch(interface)

        if not self.error:
            self.conn.close()
def log_err(msg):
    collectd.error('COMPAL: ' + msg)
    def get_stats(self):
        """Retrieves stats from ceph pools"""

        ceph_cluster = "%s-%s" % (self.prefix, self.cluster)

        data = {ceph_cluster: {}}

        stats_output = None
        try:
            osd_pool_cmdline = 'ceph osd pool stats -f json --cluster ' + self.cluster
            stats_output = subprocess.check_output(osd_pool_cmdline,
                                                   shell=True)
            cephdf_cmdline = 'ceph df -f json --cluster ' + self.cluster
            df_output = subprocess.check_output(cephdf_cmdline, shell=True)
        except Exception as exc:
            collectd.error("ceph-pool: failed to ceph pool stats :: %s :: %s" %
                           (exc, traceback.format_exc()))
            return

        if stats_output is None:
            collectd.error(
                'ceph-pool: failed to ceph osd pool stats :: output was None')

        if df_output is None:
            collectd.error('ceph-pool: failed to ceph df :: output was None')

        json_stats_data = json.loads(stats_output)
        json_df_data = json.loads(df_output)

        # push osd pool stats results
        for pool in json_stats_data:
            pool_key = "pool-%s" % pool['pool_name']
            data[ceph_cluster][pool_key] = {}
            pool_data = data[ceph_cluster][pool_key]
            for stat in ('read_bytes_sec', 'write_bytes_sec', 'op_per_sec'):
                pool_data[stat] = pool['client_io_rate'][stat] if pool[
                    'client_io_rate'].has_key(stat) else 0

        # push df results
        for pool in json_df_data['pools']:
            pool_data = data[ceph_cluster]["pool-%s" % pool['name']]
            for stat in ('bytes_used', 'kb_used', 'objects'):
                pool_data[stat] = pool['stats'][stat] if pool['stats'].has_key(
                    stat) else 0

        # push totals from df
        data[ceph_cluster]['cluster'] = {}
        if json_df_data['stats'].has_key('total_bytes'):
            # ceph 0.84+
            data[ceph_cluster]['cluster']['total_space'] = int(
                json_df_data['stats']['total_bytes'])
            data[ceph_cluster]['cluster']['total_used'] = int(
                json_df_data['stats']['total_used_bytes'])
            data[ceph_cluster]['cluster']['total_avail'] = int(
                json_df_data['stats']['total_avail_bytes'])
        else:
            # ceph < 0.84
            data[ceph_cluster]['cluster']['total_space'] = int(
                json_df_data['stats']['total_space']) * 1024.0
            data[ceph_cluster]['cluster']['total_used'] = int(
                json_df_data['stats']['total_used']) * 1024.0
            data[ceph_cluster]['cluster']['total_avail'] = int(
                json_df_data['stats']['total_avail']) * 1024.0

        return data
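

# Abridged sketch (not verbatim command output) of the JSON shapes parsed above;
# only the fields this method reads are shown and the values are placeholders.
example_pool_stats_json = [
    {"pool_name": "data",
     "client_io_rate": {"read_bytes_sec": 0, "write_bytes_sec": 0, "op_per_sec": 0}},
]
example_df_json = {
    "stats": {"total_bytes": 0, "total_used_bytes": 0, "total_avail_bytes": 0},
    "pools": [{"name": "data", "stats": {"bytes_used": 0, "kb_used": 0, "objects": 0}}],
}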
def get_metrics(conf):
    info = fetch_info(conf)

    if not info:
        collectd.error('redis plugin: No info received')
        return

    plugin_instance = conf['instance']
    if plugin_instance is None:
        plugin_instance = '{host}:{port}'.format(host=conf['host'],
                                                 port=conf['port'])

    # send high-level values
    dispatch_value(info, 'uptime_in_seconds', 'counter', plugin_instance)
    dispatch_value(info, 'connected_clients', 'counter', plugin_instance)
    dispatch_value(info, 'connected_slaves', 'counter', plugin_instance)
    dispatch_value(info, 'blocked_clients', 'counter', plugin_instance)
    dispatch_value(info, 'evicted_keys', 'counter', plugin_instance)
    dispatch_value(info, 'expired_keys', 'counter', plugin_instance)
    dispatch_value(info, 'used_memory', 'bytes', plugin_instance)
    dispatch_value(info, 'used_memory_rss', 'bytes', plugin_instance)
    dispatch_value(info, 'used_memory_peak', 'bytes', plugin_instance)
    dispatch_value(info, 'mem_fragmentation_ratio', 'gauge', plugin_instance)
    dispatch_value(info, 'changes_since_last_save', 'counter', plugin_instance)
    dispatch_value(info, 'total_connections_received', 'counter',
                   plugin_instance, 'connections_received')
    dispatch_value(info, 'total_commands_processed', 'counter',
                   plugin_instance, 'commands_processed')

    dispatch_value(info, 'instantaneous_ops_per_sec', 'counter',
                   plugin_instance, 'instantaneous_ops')
    dispatch_value(info, 'rejected_connections', 'counter', plugin_instance)
    dispatch_value(info, 'pubsub_channels', 'counter', plugin_instance)
    dispatch_value(info, 'pubsub_patterns', 'counter', plugin_instance)
    dispatch_value(info, 'latest_fork_usec', 'counter', plugin_instance)

    # send keyspace hits and misses, if they exist
    if 'keyspace_hits' in info:
        dispatch_value(info, 'keyspace_hits', 'derive', plugin_instance)
    if 'keyspace_misses' in info:
        dispatch_value(info, 'keyspace_misses', 'derive', plugin_instance)

    # send replication stats, but only if they exist (some belong to master only, some to slaves only)
    if 'master_repl_offset' in info:
        dispatch_value(info, 'master_repl_offset', 'gauge', plugin_instance)
    if 'master_last_io_seconds_ago' in info:
        dispatch_value(info, 'master_last_io_seconds_ago', 'gauge',
                       plugin_instance)
    if 'slave_repl_offset' in info:
        dispatch_value(info, 'slave_repl_offset', 'gauge', plugin_instance)

    # database and vm stats
    for key in info:
        if key.startswith('repl_'):
            dispatch_value(info, key, 'gauge', plugin_instance)
        if key.startswith('vm_stats_'):
            dispatch_value(info, key, 'gauge', plugin_instance)
        if key.startswith('db'):
            dispatch_value(info[key], 'keys', 'counter', plugin_instance,
                           '%s-keys' % key)
        if key.startswith('slave'):
            dispatch_value(info[key], 'delay', 'gauge', plugin_instance,
                           '%s-delay' % key)
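

# Hedged sketch of a dispatch_value-style helper matching the call sites above
# (a dict, a key, a collectd type, the plugin instance and an optional type
# instance); the redis plugin's real helper may differ in details such as the
# plugin name used here.
def _dispatch_value_sketch(info, key, val_type, plugin_instance, type_instance=None):
    if key not in info:
        return
    val = collectd.Values(plugin='redis_info')
    val.type = val_type
    val.type_instance = type_instance or key
    val.plugin_instance = plugin_instance
    val.dispatch(values=[float(info[key])])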
def read_func():

    # ntp query is for controllers only
    if tsc.nodetype != 'controller':
        return 0

    if obj.init_complete is False:
        init_func()
        return 0

    # get a list of provisioned ntp servers
    _get_ntp_servers()

    # nothing to do while there are no provisioned NTP servers
    if len(obj.server_list_conf) == 0:
        return 0

    # Do NTP Query
    data = subprocess.check_output([PLUGIN_EXEC, PLUGIN_EXEC_OPTIONS])

    # Keep this FIT test code but make it commented out for security
    #
    # if os.path.exists('/var/run/fit/ntpq_data'):
    #    data = ''
    #    collectd.info("%s using ntpq FIT data" % PLUGIN)
    #    with open('/var/run/fit/ntpq_data', 'r') as infile:
    #        for line in infile:
    #            data += line

    if not data:
        collectd.error("%s no data from query" % PLUGIN)
        return 0

    # Get the ntp query output into a list of lines
    obj.ntpq = data.split('\n')

    # keep track of changes ; only log on changes
    reachable_list_changed = False
    unreachable_list_changed = False

    # Manage the selected server name
    #
    # save the old value so we can print a log if the selected server changes
    if obj.selected_server:
        obj.selected_server_save = obj.selected_server
    # always assume no selected server ; till it's learned
    obj.selected_server = ''

    # start with a fresh empty list for this new run to populate
    obj.server_list_ntpq = []

    # Loop through the ntpq output.
    # Ignore the first 2 lines ; just header data.
    for i in range(2, len(obj.ntpq)):

        # Ignore empty lines and lines that are too short to hold an address.
        # An IPv4 address is at least 7 characters long, an IPv6 address at
        # least 2; add 1 character for the availability flag.
        if len(obj.ntpq[i]) < 3:
            continue

        # log the ntpq output ; minus the 2 lines of header
        collectd.info("NTPQ: %s" % obj.ntpq[i])

        # Unreachable servers are ones whose line starts with a space
        ip = ''
        if obj.ntpq[i][0] == ' ':
            # get the ip address
            # example format of line:['', '132.163.4.102', '', '', '.INIT.',
            # get ip from index [1] of the list
            unreachable = obj.ntpq[i].split(' ')[1]
            if unreachable:
                # check to see if it's a controller ip
                # we skip over controller ips
                if _is_controller(unreachable) is False:
                    _add_ip_to_ntpq_server_list(unreachable)
                    if unreachable not in obj.unreachable_servers:
                        if _raise_alarm(unreachable) is False:
                            unreachable_list_changed = True
                            # if the FM call to raise the alarm worked then
                            # add this ip to the unreachable list if it's not
                            # already in it
                            _add_unreachable_server(unreachable)

        # Reachable servers are ones whose line starts with a '+'
        elif obj.ntpq[i][0] == '+':
            # remove the '+' and get the ip
            ip = obj.ntpq[i].split(' ')[0][1:]

        elif obj.ntpq[i][0] == '*':
            # remove the '*' and get the ip
            cols = obj.ntpq[i].split(' ')
            ip = cols[0][1:]
            if ip:
                ip_family = _is_ip_address(ip)
                obj.peer_selected = _is_controller(ip)
                if ip != obj.selected_server and obj.alarm_raised is True:
                    # a new ntp server is selected, old alarm may not be
                    # valid
                    _clear_base_alarm()
                if obj.peer_selected is False:
                    if obj.selected_server:
                        # don't update the selected server if more selections
                        # are found; go with the first one found.
                        collectd.info("%s additional selected server found"
                                      " '%s'; current selection is '%s'" %
                                      (PLUGIN, ip, obj.selected_server))
                    else:
                        # update the selected server list
                        obj.selected_server = ip
                        collectd.debug("%s selected server is '%s'" %
                                       (PLUGIN, obj.selected_server))
                else:
                    # refer to peer
                    refid = ''
                    for i in range(1, len(cols)):
                        if cols[i] != '':
                            refid = cols[i]
                            break

                    if refid not in ('', '127.0.0.1') and \
                            not _is_controller(refid) and \
                            socket.AF_INET == ip_family:
                        # ipv4: the peer controller refers to a time source that
                        # is not itself or a controller (this node)
                        obj.selected_server = ip
                        collectd.debug("peer controller has a reliable "
                                       "source")

        # anything else is unreachable
        else:
            unreachable = obj.ntpq[i][1:].split(' ')[0]
            if _is_controller(unreachable) is False:
                _add_ip_to_ntpq_server_list(unreachable)
                if unreachable not in obj.unreachable_servers:
                    if _raise_alarm(unreachable) is False:
                        unreachable_list_changed = True
                        # if the FM call to raise the alarm worked then
                        # add this ip to the unreachable list if it's not
                        # already in it
                        _add_unreachable_server(unreachable)

        if ip:
            # if the ip is valid then manage it
            if _is_controller(ip) is False:
                _add_ip_to_ntpq_server_list(ip)
                # add the ip to the reachable servers list
                # if its not already there
                if ip not in obj.reachable_servers:
                    obj.reachable_servers.append(ip)
                    reachable_list_changed = True
                # make sure this IP is no longer in the unreachable
                # list and that alarms for it are cleared
                _remove_ip_from_unreachable_list(ip)

    _cleanup_stale_servers()

    if obj.selected_server:
        if obj.selected_server != obj.selected_server_save:
            collectd.info(
                "%s selected server changed from '%s' to '%s'" %
                (PLUGIN, obj.selected_server_save, obj.selected_server))
        obj.selected_server_save = obj.selected_server
        if obj.alarm_raised is True:
            _clear_base_alarm()

    elif obj.alarm_raised is False:
        if obj.peer_selected:
            collectd.info("%s peer is selected" % PLUGIN)
        else:
            collectd.error("%s no selected server" % PLUGIN)
        if _raise_alarm() is False:
            obj.selected_server_save = 'None'

    # only log and act on changes
    if reachable_list_changed is True:
        if obj.reachable_servers:
            collectd.info("%s reachable servers: %s" %
                          (PLUGIN, obj.reachable_servers))
            if obj.alarm_raised is True:
                if obj.selected_server and obj.reachable_servers:
                    _clear_base_alarm()
        else:
            collectd.error("%s no reachable servers" % PLUGIN)
            _raise_alarm()

    # only log changes
    if unreachable_list_changed is True:
        if obj.unreachable_servers:
            collectd.info("%s unreachable servers: %s" %
                          (PLUGIN, obj.unreachable_servers))
        else:
            collectd.info("%s all servers are reachable" % PLUGIN)

    # The sample published to the database is simply the number
    # of reachable servers if one is selected
    if not obj.selected_server:
        sample = 0
    else:
        sample = len(obj.reachable_servers)

    # Dispatch usage value to collectd
    val = collectd.Values(host=obj.hostname)
    val.plugin = 'ntpq'
    val.type = 'absolute'
    val.type_instance = 'reachable'
    val.dispatch(values=[sample])

    return 0
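

# Hedged sketch of an _is_ip_address-style helper consistent with how its return
# value is compared against socket.AF_INET above; the plugin's actual
# implementation may differ.
import socket

def _is_ip_address_sketch(ip):
    try:
        socket.inet_pton(socket.AF_INET, ip)
        return socket.AF_INET
    except socket.error:
        pass
    try:
        socket.inet_pton(socket.AF_INET6, ip)
        return socket.AF_INET6
    except socket.error:
        return None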
    def read_config(self, cfg):
        """Initializes variables from conf files."""
        for children in cfg.children:
            if children.key == INTERVAL:
                self.interval = children.values[0]
            elif children.key == USE_REST_API:
                use_rest_api = int(children.values[0])
            elif children.key == USER:
                self.username = children.values[0]
            elif children.key == PASSWORD:
                self.password = children.values[0]

        host, port, index = self.get_elastic_search_details()
        elastic["host"] = host
        elastic["port"] = port
        indices["workflow"] = index
        appname = self.get_app_name()
        tag_app_name['oozie'] = appname
        self.cluster_name = self.get_cluster()

        job_history_server["port"] = "19888"
        timeline_server["port"] = "8188"
        oozie["port"] = "11000"
        self.hdfs_port = "50070"
        if not os.path.isdir(jobhistory_copy_dir):
            try:
                os.mkdir(jobhistory_copy_dir)
            except:
                collectd.error("Unable to create job history directory %s" %
                               jobhistory_copy_dir)

        timezone = self.get_time_zone()
        if not timezone:
            collectd.error("Unable to get timezone")

        if self.cluster_name and timezone and self.is_service_running(
            ["OOZIE", "MAPREDUCE2", "SPARK2", "HDFS"]):
            job_history_host = self.get_hadoop_service_details(
                self.url_knox + "/" + self.cluster_name +
                "/services/MAPREDUCE2/components/HISTORYSERVER")
            if job_history_host:
                job_history_server["host"] = job_history_host[0]
            else:
                collectd.error("Unable to get Job_history ip")
            timeline_host = self.get_hadoop_service_details(
                self.url_knox + "/" + self.cluster_name +
                "/services/YARN/components/APP_TIMELINE_SERVER")
            if timeline_host:
                timeline_server["host"] = timeline_host[0]
            else:
                collectd.error("Unable to get timeline_server ip")
            oozie_host = self.get_hadoop_service_details(
                self.url_knox + "/" + self.cluster_name +
                "/services/OOZIE/components/OOZIE_SERVER")
            if oozie_host:
                oozie["host"] = oozie_host[0]
            else:
                collectd.error("Unable to get oozie ip")
            self.hdfs_hosts = self.get_hadoop_service_details(
                self.url_knox + "/" + self.cluster_name +
                "/services/HDFS/components/NAMENODE")
            if self.hdfs_hosts:
                if len(self.hdfs_hosts) == 2:
                    hdfs["url"] = "http://{0}:{1};http://{2}:{3}".format(
                        self.hdfs_hosts[0], self.hdfs_port, self.hdfs_hosts[1],
                        self.hdfs_port)
                else:
                    hdfs["url"] = "http://{0}:{1}".format(
                        self.hdfs_hosts[0], self.hdfs_port)
                hdfs['timezone'] = timezone
                hdfs["user"] = "******"
            else:
                collectd.error("Unable to get hdfs ips")
            if job_history_host and timeline_host and oozie_host and self.hdfs_hosts:
                self.update_config_file(use_rest_api, jobhistory_copy_dir)
                self.is_config_updated = 1
                initialize_app()
                initialize_app_elastic()
        else:
            collectd.error("Unable to get cluster name")
    def get_redis_details(self):
        details_dict={}
        stats_dict={}
        persistence_dict = {}
        cpu_dict = {}
        final_redis_dict={}
        try:
            server_details = self.redis_client.info(section="server")
            if server_details:
                details_dict["version"] = server_details.get("redis_version",None)
                details_dict["buildId"] = server_details.get("redis_build_id",None)
                details_dict["mode"] = server_details.get("redis_mode",None)
                details_dict["os"] = server_details.get("os")
                details_dict["tcpPort"] = server_details.get("tcp_port")
                details_dict["runId"] = server_details.get("run_id")
                details_dict["upTime"] = server_details.get("uptime_in_seconds",None)
            server_conn_details = self.redis_client.info("clients")
            if server_conn_details:
                stats_dict["clientLongestOutputList"] = server_conn_details.get("client_longest_output_list")
                stats_dict["clientBiggestInputBuf"] = server_conn_details.get("client_biggest_input_buf")
                stats_dict["blockedClients"] = server_conn_details.get("blocked_clients",0)
                stats_dict["connectedClients"] = server_conn_details.get("connected_clients",0)
            server_stats = self.redis_client.info(section="stats")
            if server_stats:
                input_bytes = None
                try:
                    input_bytes = round(server_stats.get("total_net_input_bytes",0) / (1024.0 * 1024.0), 2)
                except Exception as e:
                    collectd.error("Error in getting total input bytes due to %s" % str(e))
                output_bytes = None
                try:
                    output_bytes = round(server_stats.get("total_net_output_bytes",0) / (1024.0 * 1024.0), 2)
                except Exception as e:
                    collectd.error("Error in getting total input bytes due to %s" % str(e))
                stats_dict["clusterEnabled"] = True if self.redis_client.info().get("cluster_enabled")== 1 else False
                stats_dict["instantaneousInputKbps"] = server_stats.get("instantaneous_input_kbps",0.0)
                stats_dict["instantaneousOutputKbps"] = server_stats.get("instantaneous_output_kbps",0.0)
                if self.pollCounter <= 1:
                    self.previousData["syncFull"] = server_stats.get("sync_full",0)
                    self.previousData["syncPartialOk"] = server_stats.get("sync_partial_ok",0)
                    self.previousData["syncPartialErr"] = server_stats.get("sync_partial_err",0)
                    self.previousData["totalConnReceived"] = server_stats.get("total_connections_received",0)
                    self.previousData["totalCommandsProcessed"] = server_stats.get("total_commands_processed",0)
                    self.previousData["totalNetInputBytes"] = input_bytes
                    self.previousData["totalNetOuputBytes"] = output_bytes
                    self.previousData["keyspaceHits"] = server_stats.get("keyspace_hits",0)
                    self.previousData["keyspaceMisses"] = server_stats.get("keyspace_misses",0)
                    self.previousData["expiredKeys"] = server_stats.get("expired_keys",0)
                    self.previousData["evictedKeys"] = server_stats.get("evicted_keys",0)
                    self.previousData["rejectedConn"] = server_stats.get("rejected_connections",0)
                    stats_dict["syncFull"]=0
                    stats_dict["syncPartialOk"]=0
                    stats_dict["syncPartialErr"]=0
                    stats_dict["totalConnReceived"] = 0
                    stats_dict["totalCommandsProcessed"] = 0
                    stats_dict["totalNetInputBytes"] = 0
                    stats_dict["totalNetOutputBytes"] = 0
                    stats_dict["keyspaceHits"] = 0
                    stats_dict["keyspaceMisses"] = 0
#                    stats_dict["keyspaceHitRate"] = 0.0
                    stats_dict["keyspaceMissRate"] = 0.0
                    stats_dict["writeThroughput"] = 0.0
                    stats_dict["readThroughput"] = 0.0
                else:
                    stats_dict["syncFull"]=server_stats.get("sync_full",0)-self.previousData["syncFull"]
                    stats_dict["syncPartialOk"]= server_stats.get("sync_partial_ok",0)-self.previousData["syncPartialOk"]
                    stats_dict["syncPartialErr"]=server_stats.get("sync_partial_err",0)-self.previousData["syncPartialErr"]
                    stats_dict["rejectedConn"] = server_stats.get("rejected_connections",0) - self.previousData["rejectedConn"]
                    stats_dict["expiredKeys"] = server_stats.get("expired_keys",0) - self.previousData["expiredKeys"]
                    stats_dict["evictedKeys"] = server_stats.get("evicted_keys",0) - self.previousData["evictedKeys"]
                    stats_dict["totalConnReceived"] = server_stats.get("total_connections_received",0) - self.previousData["totalConnReceived"]
                    stats_dict["totalCommandsProcessed"] = server_stats.get("total_commands_processed",0) - self.previousData["totalCommandsProcessed"]
                    stats_dict["totalNetInputBytes"] = input_bytes - self.previousData["totalNetInputBytes"]
                    stats_dict["totalNetOutputBytes"] = output_bytes - self.previousData["totalNetOuputBytes"]
                    stats_dict["keyspaceHits"] = server_stats.get("keyspace_hits",0) - self.previousData["keyspaceHits"]
                    stats_dict["keyspaceMisses"] = server_stats.get("keyspace_misses",0) - self.previousData["keyspaceMisses"]
#                   if ((stats_dict["keyspaceHits"] > 0) or (stats_dict["keyspaceMisses"] > 0)):
#                        keyspace_dict["keyspaceHitRate"] = round(float(stats_dict["keyspaceHits"] / (keyspace_dict["keyspaceHits"] + stats_dict["keyspaceMisses"])), 2)
#                        stats_dict["keyspaceMissRate"] = round(float(stats_dict["keyspaceMisses"] / (stats_dict["keyspaceHits"] + stats_dict["keyspaceMisses"])), 2)
#                    else:
#                        stats_dict["keyspaceHitRate"] = 0
#                        stats_dict["keyspaceMissRate"] = 0
                    stats_dict["readThroughput"] = round(float(float(stats_dict["totalNetInputBytes"]) / int(self.interval)), 2)
                    stats_dict["writeThroughput"] = round(float(float(stats_dict["totalNetOutputBytes"]) / int(self.interval)), 2)
                    self.previousData["syncFull"] = server_stats.get("sync_full",0)
                    self.previousData["syncPartialOk"] = server_stats.get("sync_partial_ok",0)
                    self.previousData["syncPartialErr"] = server_stats.get("sync_partial_err",0)
                    self.previousData["totalConnReceived"] = server_stats.get("total_connections_received",0)
                    self.previousData["totalCommandsProcessed"] = server_stats.get("total_commands_processed",0)
                    self.previousData["totalNetInputBytes"] = input_bytes
                    self.previousData["totalNetOutputBytes"] = output_bytes
                    self.previousData["keyspaceHits"] = server_stats.get("keyspace_hits",0)
                    self.previousData["keyspaceMisses"] = server_stats.get("keyspace_misses",0)
                    self.previousData["expiredKeys"] = server_stats.get("expired_keys",0)
                    self.previousData["evictedKeys"] = server_stats.get("evicted_keys",0)
                    self.previousData["rejectedConn"] = server_stats.get("rejected_connections",0)
                keyspace_details = self.redis_client.info("keyspace")
                if keyspace_details:
                    totalk = 0
                    dbcount = 0
                    for k, v in keyspace_details.items():
                        totalk += int(v["keys"])
                        dbcount += 1
                    stats_dict["totKeys"] = totalk
                else:
                    collectd.error("No Key details found")
                    stats_dict["totKeys"] = 0

                outlis = subprocess.check_output(["redis-cli", "--intrinsic-latency", "1"]).split()
                if len(outlis) > 0:
                    try:
                        stats_dict["latency"] = float(outlis[-16])
                    except ValueError:
                        collectd.error("No latency details found")
                        stats_dict["latency"] = 0
            memory_stats = self.redis_client.info(section="memory")
            if memory_stats:
                stats_dict["usedMemoryPeak"] = round(memory_stats.get("used_memory_peak",0) / (1024.0 * 1024.0), 2)
                stats_dict["totalSystemMemory"] = round(memory_stats.get("total_system_memory",0) / (1024.0 * 1024.0), 2)
                stats_dict["memFragmentationRatio"] = memory_stats.get("mem_fragmentation_ratio",0)
                stats_dict["memoryAllocator"] = memory_stats.get("mem_allocator",None)
                stats_dict["maxmemoryPolicy"]  =  memory_stats.get("maxmemory_policy")
                stats_dict["memoryUsed"] = round(memory_stats.get("used_memory",0) / (1024.0 * 1024.0), 2)
            else:
                collectd.error("No memory stats found")
                pass
            cpu_stats = self.redis_client.info(section="cpu")
            if cpu_stats:
                if self.pollCounter <= 1:
                    self.previousData["usedCpuSys"] = cpu_stats.get("used_cpu_sys",0.0)
                    self.previousData["usedCpuUser"] = cpu_stats.get("used_cpu_user",0.0)
                    self.previousData["usedCpuUserChildren"] = cpu_stats.get("used_cpu_user_children",0.0)
                    self.previousData["usedCpuSysChildren"] = cpu_stats.get("used_cpu_sys_children",0.0)
                    details_dict["usedCpuSys"] = 0.0
                    details_dict["usedCpuUser"]= 0.0
                    details_dict["usedCpuUserChildren"]= 0.0
                    details_dict["usedCpuSysChildren"]= 0.0
                else:
                    details_dict["usedCpuSys"] = cpu_stats.get("used_cpu_sys",0.0)-self.previousData["usedCpuSys"]
                    details_dict["usedCpuUser"] =  cpu_stats.get("used_cpu_user",0.0)- self.previousData["usedCpuUser"]
                    details_dict["usedCpuUserChildren"] = cpu_stats.get("used_cpu_user_children",0.0) -  self.previousData["usedCpuUserChildren"]
                    details_dict["usedCpuSysChildren"] = cpu_stats.get("used_cpu_sys_children",0.0) - self.previousData["usedCpuSysChildren"]
                    self.previousData["usedCpuSys"] = cpu_stats.get("used_cpu_sys",0.0)
                    self.previousData["usedCpuUser"] = cpu_stats.get("used_cpu_user",0.0)
                    self.previousData["usedCpuUserChildren"] = cpu_stats.get("used_cpu_user_children",0.0)
                    self.previousData["usedCpuSysChildren"] = cpu_stats.get("used_cpu_sys_children",0.0)
            persistence_stats = self.redis_client.info(section="persistence")
            if persistence_stats:
                persistence_dict["aofEnabled"] = True if persistence_stats.get("aof_enabled") == 1 else False
                persistence_dict["aofRewriteInProgress"] = True if persistence_stats.get("aof_rewrite_in_progress") == 1 else False
                persistence_dict["aofLastWriteStatus"] = persistence_stats.get("aof_last_write_status","Failed")
                persistence_dict["aofRewriteScheduled"] = True if persistence_stats.get("aof_rewrite_scheduled") == 1 else False
                persistence_dict["aofCurrentSize"] = round(persistence_stats.get("aof_current_size",0)/1024, 2)
                persistence_dict["aofBufferLength"] = round(persistence_stats.get("aof_buffer_length",0)/ 1024, 2)
                persistence_dict["rdbBgsaveInProgress"] = True if persistence_stats.get("rdb_bgsave_in_progress") == 1 else False
                persistence_dict["rdbLastSaveTime"] = persistence_stats.get("rdb_last_save_time",0)
                persistence_dict["rdbLastBgsaveStatus"] = persistence_stats.get("rdb_last_bgsave_status","Failed")
                persistence_dict["loadingStartTime"] = persistence_stats.get("loading_start_time",0)
                persistence_dict["loadingTotalKBytes"] = round(persistence_stats.get("loading_total_bytes",0.0)/(1024.0), 2)
                if self.pollCounter <= 1:
                    self.previousData["loadingLoadedKBytes"] = persistence_stats.get("loading_loaded_bytes",0.0)
                    persistence_dict["loadingLoadedKBytes"] = 0.0
                else:
                    persistence_dict["loadingLoadedKBytes"] = round((persistence_stats.get("loading_loaded_bytes",0.0)-self.previousData["loadingLoadedKBytes"])/(1024.0), 2)
                    self.previousData["loadingLoadedKBytes"] = persistence_stats.get("loading_loaded_bytes",0.0)
                persistence_dict["loadingLoadedPerc"] = int(persistence_stats.get("loading_loaded_perc",0))
                persistence_dict["loadingEtaSeconds"] = int(persistence_stats.get("loading_eta_seconds",0))
            persistence_dict[PLUGINTYPE] = "redisPersistence"
            rep_stats = self.redis_client.info(section="replication")
            if rep_stats:
                details_dict["role"] = rep_stats.get("role",None)
                stats_dict["connectedSlaves"] = rep_stats.get("connected_slaves",0)
                details_dict["replBacklogActive"] = True if rep_stats.get("repl_backlog_active") == 1 else False
                if self.pollCounter <= 1:
                    self.previousData["replBacklogHistlen"] = int(rep_stats.get("repl_backlog_histlen",0))
                    details_dict["replBacklogHistlen"] = 0
                else:
                    details_dict["replBacklogHistlen"] = int((rep_stats.get("repl_backlog_histlen",0)-self.previousData["replBacklogHistlen"])/1024)
                    self.previousData["replBacklogHistlen"] = rep_stats.get("repl_backlog_histlen",0)
                details_dict["masterLinkStatus"] = rep_stats.get("master_link_status", None)
                stats_dict["masterLastIOSecsAgo"] = rep_stats.get("master_last_io_seconds_ago", None)
                details_dict["masterLinkDownSinceSecs"] = rep_stats.get("master_link_down_since_seconds", None)
            details_dict[PLUGINTYPE] = "redisDetails"
            stats_dict[PLUGINTYPE] = "redisStat"
            final_redis_dict["redisPersistence"] = persistence_dict
            final_redis_dict["redisDetails"] = details_dict
            final_redis_dict["redisStat"] = stats_dict
            self.add_common_params(final_redis_dict)
            return final_redis_dict
        except Exception as err:
            collectd.error("Unable to fetch the details due to %s" % str(err))
            return final_redis_dict
    except socket.error, e:
        collectd.error('redis_info plugin: Error connecting to %s:%d - %r'
                       % (REDIS_HOST, REDIS_PORT, e))
        return None

    fp = s.makefile('r')

    if REDIS_AUTH is not None:
        log_verbose('Sending auth command')
        s.sendall('auth %s\r\n' % (REDIS_AUTH))

        status_line = fp.readline()
        if not status_line.startswith('+OK'):
            # -ERR invalid password
            # -ERR Client sent AUTH, but no password is set
            collectd.error('redis_info plugin: Error sending auth to %s:%d - %r'
                           % (REDIS_HOST, REDIS_PORT, status_line))
            return None

    log_verbose('Sending info command')
    s.sendall('info\r\n')

    status_line = fp.readline()
    content_length = int(status_line[1:-1]) # status_line looks like: $<content_length>
    data = fp.read(content_length)
    log_verbose('Received data: %s' % data)
    s.close()

    linesep = '\r\n' if '\r\n' in data else '\n'
    return parse_info(data.split(linesep))
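
# Hedged sketch of a parse_info-style helper for the raw INFO payload returned
# above; the plugin's real helper may split db/keyspace and slave lines into
# sub-dictionaries rather than keeping them as plain strings.
def _parse_info_sketch(info_lines):
    info = {}
    for line in info_lines:
        if not line or line.startswith('#') or ':' not in line:
            continue
        key, _, value = line.partition(':')
        info[key] = value
    return info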

                                data[self.prefix][name]['memory_mb'] * data[self.prefix]['cluster']['config']['AllocationRatioRam']
                            data[self.prefix][name]['memory_mb_overcommit_withreserve'] = \
                                data[self.prefix][name]['memory_mb_overcommit'] - data[self.prefix]['cluster']['config']['ReservedNodeRamMB']
                            data[self.prefix][name]['vcpus_overcommit'] = \
                                data[self.prefix][name]['vcpus'] * data[self.prefix]['cluster']['config']['AllocationRatioCores']
                            data[self.prefix][name]['vcpus_overcommit_withreserve'] = \
                                data[self.prefix][name]['vcpus_overcommit'] - data[self.prefix]['cluster']['config']['ReservedNodeCores']

        return data


try:
    plugin = NovaPlugin()
except Exception as exc:
    collectd.error(
        "openstack-nova: failed to initialize nova plugin :: %s :: %s" %
        (exc, traceback.format_exc()))


def configure_callback(conf):
    """Received configuration information"""
    plugin.config_callback(conf)


def read_callback():
    """Callback triggerred by collectd on read"""
    plugin.read_callback()


collectd.register_config(configure_callback)
collectd.register_read(read_callback, plugin.interval)
        results = output.split('\n')
        # push values
        data[ceph_cluster]['cluster'] = {}
        data[ceph_cluster]['cluster']['avg_latency'] = results[0]
        data[ceph_cluster]['cluster']['stddev_latency'] = results[1]
        data[ceph_cluster]['cluster']['max_latency'] = results[2]
        data[ceph_cluster]['cluster']['min_latency'] = results[3]

        return data


try:
    plugin = CephLatencyPlugin()
except Exception as exc:
    collectd.error(
        "ceph-latency: failed to initialize ceph latency plugin :: %s :: %s" %
        (exc, traceback.format_exc()))


def configure_callback(conf):
    """Received configuration information"""
    plugin.config_callback(conf)
    collectd.register_read(read_callback, plugin.interval)


def read_callback():
    """Callback triggerred by collectd on read"""
    plugin.read_callback()


collectd.register_config(configure_callback)
    def poll(self):
        node_stats = dict()
        try:
            # swagger-stats timestamps are expressed in milliseconds
            endTime = int(time.time() * 1000)
            startTime = endTime - (int(self.interval) * 1000)
            error_response = False
            log_stats = True
            url_api_request = "http://localhost:{}/swagger-stats/stats?fields=apistats".format(
                self.port)
            response = requests.get(url_api_request)
            final_json_to_be_dispatched = list()
            if response.status_code == 200:
                collectd.info(
                    'Plugin nodejsapi: Response code 200 received for apistats'
                )
                content = response.content
                #response_json = json.loads(content)
                response_json = ast.literal_eval(content)
                swagger_api_stats = response_json.get("apistats")
                api_req_stats = dict()
                for path_key in swagger_api_stats.keys():
                    path = path_key
                    method_info = swagger_api_stats[path]
                    method_level_info = dict()
                    for method in method_info.keys():
                        req_method = method
                        req_method_details = method_info[method]
                        api_req = {}
                        api_req["requests"] = req_method_details.get(
                            "requests")
                        api_req["responses"] = req_method_details.get(
                            "responses")
                        api_req["redirect"] = req_method_details.get(
                            "redirect")
                        api_req["total_time"] = req_method_details.get(
                            "total_time")
                        api_req["success"] = req_method_details.get("success")
                        api_req["errors"] = req_method_details.get("errors")
                        api_req["total_req_clength"] = req_method_details.get(
                            "total_req_clength")
                        api_req["total_res_clength"] = req_method_details.get(
                            "total_res_clength")
                        method_level_info[req_method] = api_req
                    api_req_stats[path] = method_level_info

                if self.previous_data:
                    for key_path in api_req_stats.keys():
                        api_stats = dict()
                        if key_path in self.previous_data.keys():
                            method_info = api_req_stats[key_path]
                            for method, method_details in method_info.items():
                                if method in self.previous_data[key_path].keys(
                                ):
                                    if (method_details.get("requests") -
                                            self.previous_data[key_path]
                                        [method]["requests"]) < 0:
                                        log_stats = False  # don't log long requests and errors when the app server is restarted
                                        final_json_to_be_dispatched = list()
                                        break
                                    api_stats["requests"] = method_details.get(
                                        "requests") - self.previous_data[
                                            key_path][method]["requests"]
                                    api_stats[
                                        "responses"] = method_details.get(
                                            "responses") - self.previous_data[
                                                key_path][method]["responses"]
                                    api_stats["redirect"] = method_details.get(
                                        "redirect") - self.previous_data[
                                            key_path][method]["redirect"]
                                    api_stats[
                                        "total_time"] = method_details.get(
                                            "total_time") - self.previous_data[
                                                key_path][method]["total_time"]
                                    api_stats["success"] = method_details.get(
                                        "success") - self.previous_data[
                                            key_path][method]["success"]
                                    api_stats["errors"] = method_details.get(
                                        "errors") - self.previous_data[
                                            key_path][method]["errors"]
                                    api_stats[
                                        "total_req_clength"] = method_details.get(
                                            "total_req_clength"
                                        ) - self.previous_data[key_path][
                                            method]["total_req_clength"]
                                    api_stats["method"] = method
                                    api_stats["path"] = key_path
                                else:
                                    api_stats = copy.deepcopy(method_details)
                                    api_stats["method"] = method
                                    api_stats["path"] = key_path
                                if int(api_stats["requests"]) != 0:
                                    final_json_to_be_dispatched.append(
                                        api_stats)

                        else:
                            method_info = api_req_stats[key_path]
                            for method, method_details in method_info.items():
                                api_stats = copy.deepcopy(method_details)
                                api_stats["method"] = method
                                api_stats["path"] = key_path
                                final_json_to_be_dispatched.append(api_stats)
                else:
                    log_stats = False  # don't log long requests and errors on the first poll

                self.previous_data = copy.deepcopy(api_req_stats)

            else:
                error_response = True
            node_stats[API_STATS] = final_json_to_be_dispatched

            url_last_errors = "http://localhost:{}/swagger-stats/stats?fields=lasterrors".format(
                self.port)
            response = requests.get(url_last_errors)
            error_stats = list()
            if response.status_code == 200 and log_stats:  # don't log data for first interval
                collectd.info(
                    'Plugin nodejsapi: Response code 200 received for lasterrors'
                )
                content = response.content
                # response_json = json.loads(content)
                response_json = ast.literal_eval(content)
                swagger_error_stats = response_json.get("lasterrors")
                for error in swagger_error_stats:
                    if (error.get("startts") <= endTime
                            and error.get("startts") >= startTime):
                        error_stat = {}
                        error_stat["path"] = error.get("api").get("path")
                        error_stat["method"] = error.get("method")
                        error_stat["error_code"] = ((
                            error.get("http")).get("response")).get("code")
                        error_stats.append(error_stat)
            else:
                error_response = True
            node_stats[ERROR_STATS] = error_stats

            url_long_request = "http://localhost:{}/swagger-stats/stats?fields=longestreq".format(
                self.port)
            response = requests.get(url_long_request)
            long_req_stats = list()
            if response.status_code == 200 and log_stats:  # don't log data for first interval
                collectd.info(
                    'Plugin nodejsapi: Response code 200 received for longestreq'
                )
                content = response.content
                # response_json = json.loads(content)
                response_json = ast.literal_eval(content)
                swagger_long_req_stats = response_json.get("longestreq")
                for long_req in swagger_long_req_stats:
                    if (long_req.get("startts") <= endTime
                            and long_req.get("startts") >= startTime):
                        long_req_stat = {}
                        long_req_stat["path"] = long_req.get("api").get("path")
                        long_req_stat["method"] = long_req.get("method")
                        long_req_stat["responsetime"] = long_req.get(
                            "responsetime")
                        long_req_stats.append(long_req_stat)
            else:
                error_response = True
            node_stats[LONG_REQUEST_STATS] = long_req_stats

            # send an empty value if no data is present for the given time interval
            if not error_response and len(error_stats) == 0 and len(
                    long_req_stats) == 0 and len(
                        final_json_to_be_dispatched) == 0:
                long_req_stat = {}
                long_req_stat["path"] = None
                long_req_stat["method"] = None
                long_req_stat["responsetime"] = None
                long_req_stats.append(long_req_stat)
                node_stats[LONG_REQUEST_STATS] = long_req_stats

        except Exception as ex:
            collectd.error(
                'Error collecting nodejsapi application stats : %s ' %
                ex.message)
        return node_stats
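
    # Illustrative sketch only: one way a read callback could hand the stats
    # returned by poll() to collectd; the plugin's real dispatch (field names,
    # value types, meta handling) is not part of this snippet and may differ.
    def read_sketch(self):
        node_stats = self.poll()
        for api_stat in node_stats.get(API_STATS, []):
            val = collectd.Values(plugin='nodejsapi')
            val.type = 'gauge'
            val.type_instance = '%s_%s_requests' % (api_stat.get('method'),
                                                    api_stat.get('path'))
            val.dispatch(values=[api_stat.get('requests', 0)])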
    v.dispatch(values=[value if new_data else 0])


def read():
    global gw_stat_file, last_time

    # Read the stats file
    try:
        with open(gw_stat_file, 'r') as f:
            gw_stats = load(f)
    except IOError, e:
        collectd.error('ttn_gw plugin: Cannot read gateway stats file %s' % e)
        collectd.error('ttn_gw plugin: (gateway not running?)')
        return
    except ValueError:
        collectd.error('ttn_gw plugin: Cannot parse gateway stats file')
        return

    new_data = False
    if last_time != gw_stats['time']:
        new_data = True
        last_time = gw_stats['time']

    current = gw_stats['current']
    keys = (
        'up_radio_packets_received',
        'up_radio_packets_crc_good',
        'up_radio_packets_crc_bad',
        'up_radio_packets_crc_absent',
        'up_radio_packets_dropped',
        'up_radio_packets_forwarded',
 def populate_disk_details(self, vol_name, brick_host, brick_path):
     try:
         device_to_partitions = {}
         brick_devices, brick_device_partitions, mount_point = \
             self.get_brick_devices(brick_path)
         if not (brick_devices or brick_device_partitions):
             collectd.error(
                 'Failed to fetch device details for brick %s:%s'
                 ' of volume %s' % (
                     brick_host,
                     brick_path,
                     vol_name
                 )
             )
             return
         for device in brick_devices:
             partition_name_re = re.compile('%s[0-9]+' % device)
             device_partitions = []
             for partition in brick_device_partitions:
                 if partition_name_re.match(partition):
                     device_partitions = device_to_partitions.get(
                         device,
                         []
                     )
                     device_partitions.append(partition)
             device_to_partitions[device] = device_partitions
         for brick_device, partitions in device_to_partitions.iteritems():
             # Collect disk read and write octets
             # Push to cluster->volume->node->brick tree
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_octets.read' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'read_bytes'
             )
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_octets.write' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'write_bytes'
             )
             # Push to cluster->node->brick tree
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_octets.read' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'read_bytes'
             )
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_octets.write' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'write_bytes'
             )
             # Collect disk read and write io
             # Push cluster->volume->host->brick tree
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_ops.read' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'read_count'
             )
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_ops.write' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'write_count'
             )
             # Push to cluster->node->brick tree
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_ops.read' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'read_count'
             )
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_ops.write' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'write_count'
             )
             # Collect disk read and write latency
             # Push to cluster->volume->node->brick tree
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_time.read' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'read_time'
             )
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_time.write' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'write_time'
             )
             # Push to cluster->node->brick tree
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_time.read' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'read_time'
             )
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'disk_time.write' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = self.get_interval_disk_io_stat(
                 brick_device,
                 partitions,
                 'write_time'
             )
             # Collect disk utilization
             # Push to cluster->volume->node->brick tree
             disk_usage = self.get_disk_usage(brick_device)
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'utilization.used' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.used
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'utilization.total' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.total
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'utilization.percent_used' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.percent
             # Push to cluster->node->brick tree
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'utilization.used' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.used
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'utilization.total' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.total
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'utilization.percent_used' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.percent
             # Collect disk mount-point utilization
             if not mount_point:
                 return
             disk_usage = self.get_disk_usage(mount_point)
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'mount_utilization.used' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.used
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'mount_utilization.total' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.total
             self.brick_details[
                 'clusters.%s.volumes.%s.nodes.%s.bricks.%s.device.%s.'
                 'mount_utilization.percent_used' % (
                     self.CONFIG['integration_id'],
                     vol_name,
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.percent
             # Push to cluster->node->brick tree
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'mount_utilization.used' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.used
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'mount_utilization.total' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.total
             self.brick_details[
                 'clusters.%s.nodes.%s.bricks.%s.device.%s.'
                 'mount_utilization.percent_used' % (
                     self.CONFIG['integration_id'],
                     brick_host.replace('.', '_'),
                     brick_path.replace('/', '|'),
                     brick_device.replace('/dev/', '')
                 )
             ] = disk_usage.percent
     except (AttributeError, KeyError):
         collectd.error(
              'Failed to populate_disk_details for volume %s, brick %s. '
             'Error %s' % (
                 vol_name,
                 brick_path,
                 traceback.format_exc()
             )
         )
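
# The get_interval_disk_io_stat() and get_disk_usage() helpers called above
# are not part of this snippet. Below is a minimal sketch of what they might
# look like, assuming psutil as the backing library (the stat names
# read_bytes/write_bytes/read_count/write_count/read_time/write_time match
# psutil.disk_io_counters fields). This is an illustration only, not the
# plugin's actual implementation; names ending in _sketch are hypothetical.
import psutil

_last_io_sample = {}


def get_interval_disk_io_stat_sketch(brick_device, partitions, stat_key):
    """Return the change in one disk_io_counters field since the last call."""
    # psutil keys per-disk counters by short device name ('sdb1', not
    # '/dev/sdb1'); the partitions argument is accepted for signature
    # compatibility but unused in this sketch.
    disk = brick_device.replace('/dev/', '')
    counters = psutil.disk_io_counters(perdisk=True)
    if disk not in counters:
        return 0
    current = getattr(counters[disk], stat_key)
    previous = _last_io_sample.get((disk, stat_key), current)
    _last_io_sample[(disk, stat_key)] = current
    return current - previous


def get_disk_usage_sketch(path):
    """Return an object exposing .total, .used and .percent for a path."""
    # psutil.disk_usage() expects a mount point; a caller passing a raw
    # device would first have to resolve it to its mount point.
    return psutil.disk_usage(path)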
Example #56
def dns_request_conf(config=None):
    """Collectd Plugin Configuration Parsing"""
    global Queries, Nameserver_Cache

    collectd.debug("config: {}".format(str(config.key)))
    Queries = {}
    for request in config.children:
        collectd.debug("C: {} = {}".format(request.key, request.values[0]))
        request_name = request.values[0]
        Queries[request_name] = {'recordtype': 'A', 'timeout': DEFAULT_TIMEOUT}
        for c in [
                x for x in request.children
                if x.key.lower() in ('query', 'server', 'timeout', 'sourceip',
                                     'sourceport')
        ]:
            collectd.debug("Queries[{}][{}] values: {}".format(
                request_name, c.key.lower(), c.values[0]))
            Queries[request_name][c.key.lower()] = c.values[0]

    collectd.debug("QUERIES: {}".format(Queries.keys()))
    required_args = set(['query', 'server', 'timeout'])
    for q, query in Queries.items():
        actual_args = set(query.keys())

        if not required_args.issubset(actual_args):
            collectd.warning("Request '{}' is missing either a Query, "
                             "Server or Timeout value ({}). Skipping.".format(
                                 q, query))
            query['skip'] = True

        if (('server' in query.keys())
                and query['server'] not in Nameserver_Cache.keys()):
            try:
                resolver = dns.resolver.Resolver()
                results = resolver.query(query['server'], 'A')
                collectd.debug("RESULTS {}: {}".format(query['server'],
                                                       results))
                if results:
                    Nameserver_Cache[query['server']] = str(results[0])
            except dns.exception.DNSException as e:
                collectd.warning("Unable to determine the IP of the server "
                                 "'{}', supplied in request '{}': {}".format(
                                     query['server'], q, e))
                query['skip'] = True

        # Validate the SourceIP to see that it makes sense
        # (it's an IP and we can bind to it)
        if ('sourceip' in query.keys()):
            try:
                ip_addr = ipaddress.ip_address(unicode(query['sourceip']))
                if (type(ip_addr) is ipaddress.IPv6Address):
                    test_sock = socket.socket(socket.AF_INET6)
                elif (type(ip_addr) is ipaddress.IPv4Address):
                    test_sock = socket.socket(socket.AF_INET)
                else:
                    raise ValueError("'{}' isn't an IPv4 or IPv6 address"
                                     "!?".format(query['sourceip']))
                # Bind to the supplied source IP (ephemeral port) to confirm
                # it is local and usable, then release the socket.
                test_sock.bind((query['sourceip'], 0))
                test_sock.close()
            except (ValueError, socket.error) as v:
                collectd.error("Source IP in '{}' ({}) doesn't look valid!"
                               " {}".format(q, query['sourceip'], v))
                query['skip'] = True

        # Validate the SourcePort to see that it makes sense:
        # It's in a good range and we can bind to it.
        # Binding's a little redundant, but it'll complain if we don't
        # have permission, or if the port's already in use, where checking the
        # range will only tell us if the port is a reasonable number
        if ('sourceport' in query.keys()):
            source_port = int(query['sourceport'])
            if (source_port < 0 or source_port > 65535):
                query['skip'] = True
                collectd.warning(
                    "Invalid source port '{:d}' provided. Skipping "
                    "the DNS query for '{}' [{}]".format(
                        source_port, query['query'], query['recordtype']))

            else:
                # Confirm the port can actually be bound; warn and skip the
                # query if it cannot.
                try:
                    test_sock = socket.socket(socket.AF_INET)
                    test_sock.bind(('', source_port))
                    test_sock.close()
                except socket.error as err:
                    query['skip'] = True
                    collectd.warning(
                        "Unable to bind source port '{:d}' ({}). Skipping "
                        "the DNS query for '{}' [{}]".format(
                            source_port, err, query['query'],
                            query['recordtype']))

    collectd.debug("QUERIES: {}".format(Queries))
Example #57
def admin_socket_query(admin_socket, cmd):
    """Send a command to a Ceph daemon admin socket and return parsed JSON.

    NOTE: the original listing is truncated before this point; the function
    name and signature are a reconstruction, not the plugin's verbatim code.
    """
    try:
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.connect(admin_socket)
    except socket.error, e:
        collectd.error('ERROR: ceph plugin: Connecting to %s: - %r' %
                       (admin_socket, e))
        return None

    sock.sendall(cmd)

    try:
        length = struct.unpack('>i', sock.recv(4))[0]
        json_data = json.loads(sock.recv(length))
    except Exception as err:
        collectd.error('ERROR: ceph plugin: Unable to parse json: %r' %
                       (err, ))
        json_data = {}
    finally:
        sock.close()
        return json_data
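
# A hypothetical caller for the admin-socket helper above. 'perf dump' is a
# real Ceph admin socket command, but the exact wire format a daemon expects
# (plain string vs. JSON envelope, trailing NUL) varies between Ceph
# releases, so treat this purely as an illustration of the helper's
# interface; the function name and default socket path are assumptions.
def read_osd_perf_counters(admin_socket='/var/run/ceph/ceph-osd.0.asok'):
    perf = admin_socket_query(admin_socket, 'perf dump\0')
    if not perf:
        collectd.warning('ceph plugin: no perf counters returned from %s' %
                         admin_socket)
    return perf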


def configure_callback(conf):
    """ Collectd configuration callback """

    global CEPH_ADMIN_SOCKET
    for node in conf.children:
        if node.key == 'AdminSocket':
            CEPH_ADMIN_SOCKET = node.values[0]
        else:
            collectd.warning('WARNING: ceph plugin: Unknown config key: %s.' %
                             node.key)


def fetch_info(conf):
    """Connect to a Redis instance and return its raw INFO response.

    NOTE: the original listing is truncated/spliced at this point; the
    function name and the connection attempt below are a reconstructed
    sketch, while the error handling and everything that follows is the
    original snippet.
    """
    try:
        s = socket.create_connection((conf['host'], conf['port']))
    except socket.error as e:
        collectd.error('redis_info plugin: Error connecting to %s:%d - %r' %
                       (conf['host'], conf['port'], e))
        return None

    fp = s.makefile('r')

    if conf['auth'] is not None:
        log_verbose('Sending auth command')
        s.sendall('auth %s\r\n' % (conf['auth']))

        status_line = fp.readline()
        if not status_line.startswith('+OK'):
            # -ERR invalid password
            # -ERR Client sent AUTH, but no password is set
            collectd.error(
                'redis_info plugin: Error sending auth to %s:%d - %r' %
                (conf['host'], conf['port'], status_line))
            return None

    log_verbose('Sending info command')
    s.sendall('info\r\n')

    status_line = fp.readline()

    if status_line.startswith('-'):
        collectd.error('redis_info plugin: Error response from %s:%d - %r' %
                       (conf['host'], conf['port'], status_line))
        s.close()
        return None

    # NOTE: the original listing is truncated here; the remainder is a
    # reconstructed sketch of the usual ending (read the bulk reply that
    # follows the "$<length>" status line and return it).
    content_length = int(status_line[1:-1])
    data = fp.read(content_length)
    s.close()
    return data
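
# A minimal sketch of how the raw INFO payload returned above could be turned
# into a key/value dict -- an illustration, not the plugin's actual parser.
# Redis INFO output is one "key:value" pair per line, with '#' marking
# section headers.
def parse_redis_info_sketch(payload):
    info = {}
    for line in payload.splitlines():
        line = line.strip()
        if not line or line.startswith('#') or ':' not in line:
            continue
        key, _, value = line.partition(':')
        info[key] = value
    return info
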
 def get_metrics(self):
     try:
         ret_val = {}
         volumes = self.CLUSTER_TOPOLOGY.get('volumes', [])
         # Push brick-level client connection counts
         volumes_list = []
         for volume in volumes:
             brick_found_for_curr_node = False
             for sub_volume_index, sub_volume_bricks in volume.get(
                 'bricks',
                 {}
             ).iteritems():
                 for brick in sub_volume_bricks:
                     brick_hostname = \
                         tendrl_glusterfs_utils.find_brick_host(
                             self.etcd_client,
                             self.CONFIG['integration_id'],
                             brick.get('hostname')
                         )
                     if brick_hostname:
                         brick_ip = socket.gethostbyname(brick_hostname)
                         if (
                             brick_ip == socket.gethostbyname(
                                 self.CONFIG['peer_name']
                             ) or
                             brick_hostname == self.CONFIG['peer_name']
                         ):
                             brick_found_for_curr_node = True
                             # Push brick client connections
                             ret_val[
                                 'clusters.%s.volumes.%s.nodes.%s.'
                                 'bricks.%s.'
                                 'connections_count' % (
                                     self.CONFIG['integration_id'],
                                     volume.get('name', ''),
                                     self.CONFIG['peer_name'].replace(
                                         '.', '_'),
                                     brick['path'].replace(
                                         '/', self.brick_path_separator
                                     )
                                 )
                             ] = brick['connections_count']
             if brick_found_for_curr_node:
                 # Update rebalance info only for these volumes
                 volumes_list.append(volume.get('name', ''))
         # push rebalance info
         rebalance_info = self._get_rebalance_info()
         for vol_name in rebalance_info:
             if vol_name in volumes_list:
                 # Push volume-wise snapshot counts
                 ret_val[
                     'clusters.%s.volumes.%s.snap_count' % (
                         self.CONFIG['integration_id'],
                         vol_name
                     )
                 ] = rebalance_info[vol_name]['snap_count']
                 # Push rebalance bytes progress
                 ret_val[
                     'clusters.%s.volumes.%s.nodes.%s.rebalance_bytes' % (
                         self.CONFIG['integration_id'],
                         vol_name,
                         self.CONFIG['peer_name'].replace('.', '_')
                     )
                 ] = rebalance_info[vol_name]['rebalance_data']
                 # Push rebalance files progress
                 ret_val[
                     'clusters.%s.volumes.%s.nodes.%s.rebalance_files' % (
                         self.CONFIG['integration_id'],
                         vol_name,
                         self.CONFIG['peer_name'].replace('.', '_')
                     )
                 ] = rebalance_info[vol_name]['rebalance_files']
                 # Push rebalance failures
                 ret_val[
                     'clusters.%s.volumes.%s.nodes.%s.'
                     'rebalance_failures' % (
                         self.CONFIG['integration_id'],
                         vol_name,
                         self.CONFIG['peer_name'].replace('.', '_')
                     )
                 ] = rebalance_info[vol_name]['rebalance_failures']
                 # Push rebalance skipped
                 ret_val[
                     'clusters.%s.volumes.%s.nodes.%s.rebalance_skipped' % (
                         self.CONFIG['integration_id'],
                         vol_name,
                         self.CONFIG['peer_name'].replace('.', '_')
                     )
                 ] = rebalance_info[vol_name]['rebalance_skipped']
         return ret_val
     except (AttributeError, KeyError, ValueError):
         collectd.error(
             'Failed to fetch counters. Error %s\n\n' % (
                 traceback.format_exc()
             )
         )
         return {}
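
# A minimal sketch of how the flat metric dict returned by get_metrics()
# could be dispatched to collectd, assuming each dotted key is used as the
# type_instance of a gauge. The function name and the plugin identifier are
# assumptions; the real plugin may shape the value identifiers differently.
def dispatch_metrics_sketch(metrics, plugin_name='glusterfs'):
    for name, value in metrics.items():
        val = collectd.Values(plugin=plugin_name)
        val.type = 'gauge'
        val.type_instance = name
        val.values = [value]
        val.dispatch()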
Example #60
def init_func():

    # ntp query is for controllers only
    if tsc.nodetype != 'controller':
        return 0

    # do nothing till config is complete.
    if obj.config_complete() is False:
        return 0

    # get current hostname
    obj.hostname = obj.gethostname()
    if not obj.hostname:
        collectd.error("%s failed to get hostname" % PLUGIN)
        return 1

    obj.base_eid = 'host=' + obj.hostname + '.ntp'
    collectd.debug("%s on %s with entity id '%s'" %
                   (PLUGIN, obj.hostname, obj.base_eid))

    # get a list of provisioned ntp servers
    _get_ntp_servers()

    # manage existing alarms.
    try:
        alarms = api.get_faults_by_id(PLUGIN_ALARMID)

    except Exception as ex:
        collectd.error("%s 'get_faults_by_id' exception ; %s ; %s" %
                       (PLUGIN, PLUGIN_ALARMID, ex))
        return 0

    if alarms:
        for alarm in alarms:
            eid = alarm.entity_instance_id
            # ignore alarms not for this host
            if obj.hostname not in eid:
                continue

            # maintain only the base alarm.
            if alarm.entity_instance_id != obj.base_eid:
                # clear any ntp server specific alarms over process restart
                # this is done to avoid the potential for stuck ntp ip alarms
                collectd.info("%s clearing found startup alarm '%s'" %
                              (PLUGIN, alarm.entity_instance_id))
                try:
                    api.clear_fault(PLUGIN_ALARMID, alarm.entity_instance_id)
                except Exception as ex:
                    collectd.error(
                        "%s 'clear_fault' exception ; %s:%s ; %s" %
                        (PLUGIN, PLUGIN_ALARMID, alarm.entity_instance_id, ex))
                    return 0

            else:
                obj.alarm_raised = True
                collectd.info(
                    "%s found alarm %s:%s" %
                    (PLUGIN, PLUGIN_ALARMID, alarm.entity_instance_id))

                # ensure the base alarm is cleared if there are no
                # provisioned servers.
                if not obj.server_list_conf:
                    _clear_base_alarm()

    else:
        collectd.info("%s no major startup alarms found" % PLUGIN)

    obj.init_completed()
    return 0
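
# A minimal sketch of the _clear_base_alarm() helper referenced in
# init_func(), based only on the fault API calls visible above -- an
# assumption, not the plugin's actual implementation (hence the _sketch
# suffix).
def _clear_base_alarm_sketch():
    try:
        api.clear_fault(PLUGIN_ALARMID, obj.base_eid)
        obj.alarm_raised = False
        collectd.info("%s cleared alarm %s:%s" %
                      (PLUGIN, PLUGIN_ALARMID, obj.base_eid))
    except Exception as ex:
        collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" %
                       (PLUGIN, PLUGIN_ALARMID, obj.base_eid, ex))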