Example #1
def main():
    if USER != "root":
        utils.drop_privileges(user=USER)

    last_scan = time.time() - SCAN_INTERVAL

    while True:
        ts = time.time()

        # We haven't looked for ZooKeeper instances recently, so scan again
        if ts - last_scan > SCAN_INTERVAL:
            instances = scan_zk_instances()
            last_scan = ts

        if not instances:
            return 13  # Ask tcollector not to respawn us

        # Iterate over every zookeeper instance and get statistics
        for ip, port, tcp_version in instances:
            tags = "port=%s" % port

            sock = connect_socket(tcp_version, port)
            if sock is None:
                continue

            sock.send("mntr\n")
            data = sock.recv(1024)
            for stat in data.splitlines():
                parts = stat.split()
                if len(parts) < 2:
                    continue  # skip blank or truncated lines in the mntr response
                metric, value = parts[0], parts[1]
                if metric in KEYS:
                    print_stat(metric, ts, value, tags)
            sock.close()

        time.sleep(COLLECTION_INTERVAL)
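
Note: print_stat, scan_zk_instances, and connect_socket are helpers defined elsewhere in this collector and are not shown. Every collector in this listing writes data points to stdout in the tcollector/OpenTSDB line format, "metric timestamp value [tag=value ...]". A minimal sketch of what such a print_stat helper might look like under that assumption (the "zookeeper." prefix is illustrative, not confirmed by the source):

def print_stat(metric, ts, value, tags=""):
    # Emit one data point per line; tcollector forwards these lines to OpenTSDB.
    # Format: <metric> <epoch seconds> <value> [tag1=val1 tag2=val2 ...]
    if value is not None:
        print "zookeeper.%s %d %s %s" % (metric, ts, value, tags)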
Example #2
def main(argv):
  utils.drop_privileges()
  socket.setdefaulttimeout(DEFAULT_TIMEOUT)
  servers = []

  if json is None:
    utils.err("This collector requires the `json' Python module.")
    return 1

  for conf in elasticsearch_conf.get_servers():
    server = HTTPConnection(*conf)
    try:
      server.connect()
    except socket.error as exc:
      if exc.errno == errno.ECONNREFUSED:
        continue
      raise
    servers.append(server)

  if len(servers) == 0:
    return 13  # No ES running, ask tcollector to not respawn us.

  lock = threading.Lock()
  while True:
    threads = []
    for server in servers:
      status = node_status(server)
      version = status["version"]["number"]
      t = threading.Thread(target = _collect_server, args = (server, version, lock))
      t.start()
      threads.append(t)
    for thread in threads:
      thread.join()
    time.sleep(COLLECTION_INTERVAL)
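
The lock handed to each thread is presumably there to serialize writes to stdout so lines collected by concurrent threads don't interleave. A rough, hypothetical sketch of a _collect_server worker under that assumption (node_stats is an assumed helper returning a flat metric dict, not part of the original):

def _collect_server(server, version, lock):
  ts = int(time.time())
  stats = node_stats(server, version)  # hypothetical: {"metric.name": value, ...}
  with lock:
    # Hold the lock while printing so output from other threads can't interleave.
    for metric, value in stats.items():
      print "elasticsearch.%s %d %s" % (metric, ts, value)
    sys.stdout.flush()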
Example #3
def main():
    utils.drop_privileges()
    if pymongo is None:
        print >>sys.stderr, "error: Python module `pymongo' is missing"
        return 13

    c = pymongo.Connection(host=HOST, port=PORT)

    while True:
        res = c.admin.command('serverStatus')
        ts = int(time.time())

        for base_metric, tags in TAG_METRICS:
            for tag in tags:
                print 'mongo.%s %d %s type=%s' % (base_metric, ts,
                                                  res[base_metric][tag], tag)
        for metric in METRICS:
            cur = res
            try:
                for m in metric.split('.'):
                    cur = cur[m]
            except KeyError:
                continue
            print 'mongo.%s %d %s' % (metric, ts, cur)

        sys.stdout.flush()
        time.sleep(INTERVAL)
Example #4
def main():
  utils.drop_privileges()
  pid = find_couchbase_pid()
  if not pid:
    utils.err("Error: Either couchbase-server is not running or file (%s)"
        " doesn't exist" % COUCHBASE_INITFILE)
    return 13

  conf_file = find_conf_file(pid)
  if not conf_file:
    utils.err("Error: Can't find config file (%s)" % conf_file)
    return 13

  bin_dir = find_bindir_path(conf_file)
  if not bin_dir:
    utils.err("Error: Can't find bindir path in config file")
    return 13

  while True:
    # List the buckets on every pass so that datapoints from any newly
    # created bucket are collected.
    buckets = list_bucket(bin_dir)
    for b in buckets:
      collect_stats(bin_dir, b)
    time.sleep(COLLECTION_INTERVAL)
Example #5
def main(argv):
  if not (flume_conf and flume_conf.enabled() and flume_conf.get_settings()):
    sys.exit(13)

  settings = flume_conf.get_settings()

  if settings['default_timeout']:
    DEFAULT_TIMEOUT = settings['default_timeout']

  if settings['collection_interval']:
    COLLECTION_INTERVAL = settings['collection_interval']

  if settings['flume_host']:
    FLUME_HOST = settings['flume_host']

  if settings['flume_port']:
    FLUME_PORT = settings['flume_port']

  utils.drop_privileges()
  socket.setdefaulttimeout(DEFAULT_TIMEOUT)
  server = httplib.HTTPConnection(FLUME_HOST, FLUME_PORT)
  try:
    server.connect()
  except socket.error, (erno, e):
    if erno == errno.ECONNREFUSED:
      return 13  # No Flume server available, ask tcollector to not respawn us.
    raise
Example #6
def main():
    """netfilter main loop"""

    utils.drop_privileges()

    if os.path.isdir(basedir):
        while True:
            ts = int(time.time())

            for s in STATS:
                try:
                    f = open(basedir + "/" + s, 'r')
                    value = f.readline().rstrip()
                    print("proc.sys.net.ipv4.netfilter.%s %d %s" % (s, ts, value))
                    f.close()
                except:
                    # brute'ish, but should keep the collector reasonably future
                    # proof if some of the stats disappear between kernel module
                    # versions
                    continue

            sys.stdout.flush()
            time.sleep(interval)
    else:
        print("%s does not exist - ip_conntrack probably missing" % basedir)
        sys.exit(13)  # we signal tcollector to not run us
Example #7
def main():
    """ifstat main loop"""
    interval = 15

    f_netdev = open("/proc/net/dev", "r")
    utils.drop_privileges()

    # We just care about ethN interfaces.  We specifically
    # want to avoid bond interfaces, because interface
    # stats are still kept on the child interfaces when
    # you bond.  By skipping bond we avoid double counting.
    while True:
        f_netdev.seek(0)
        ts = int(time.time())
        for line in f_netdev:
            m = re.match("\s+(eth\d+):(.*)", line)
            if not m:
                continue
            intf = m.group(1)
            stats = m.group(2).split(None)
            def direction(i):
                if i >= 8:
                    return "out"
                return "in"
            for i in xrange(16):
                print ("proc.net.%s %d %s iface=%s direction=%s"
                       % (FIELDS[i], ts, stats[i], intf, direction(i)))

        sys.stdout.flush()
        time.sleep(interval)
Example #8
def main():
    if utils is not None:
        utils.drop_privileges()
    while True:
        RabbitCollector().get_metrics()
        sys.stdout.flush()
        time.sleep(INTERVAL)
Example #9
def main():
	utils.drop_privileges()

	while True:

		try:
			response =  requests.get(STATUS_URL)
		except requests.exceptions.RequestException, error:
			print "%s error retrieving %s %s" %(METRIC_BASENAME, STATUS_URL, str(error))
			time.sleep(COLLECTION_INTERVAL)
			continue
			
		timestamp = int(time.time())
		lines = [ l.strip() for l in response.text.split("\n") ][:-1]

		print "%s.conn.active %d %s" %(METRIC_BASENAME, timestamp, lines[0].split(":")[-1].strip())

		(cAccepts, cHandled, cRequests) = ( i for i in lines[2].split() if i != "")
		print "%s.conn.accepts %d %s" %(METRIC_BASENAME, timestamp, cAccepts)
		print "%s.conn.handled %d %s" %(METRIC_BASENAME, timestamp, cHandled)
		print "%s.requests %d %s" %(METRIC_BASENAME, timestamp, cRequests)

		(_, conReads, _, conWrites, _, conWaiting)= (c.strip() for c in lines[-1].split())
		print "%s.conn.state %d %s type=reading" %(METRIC_BASENAME, timestamp, conReads)
		print "%s.conn.state %d %s type=writing" %(METRIC_BASENAME, timestamp, conWrites)
		print "%s.conn.state %d %s type=waiting" %(METRIC_BASENAME, timestamp, conWaiting)

		sys.stdout.flush()
		time.sleep(COLLECTION_INTERVAL)
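
The slicing above assumes the standard nginx stub_status payload. A small worked sketch against a sample response shows which line each index picks up (METRIC_BASENAME would be something like "nginx"; the numbers are made up):

sample = ("Active connections: 291\n"
          "server accepts handled requests\n"
          " 16630948 16630948 31070465\n"
          "Reading: 6 Writing: 179 Waiting: 106\n")
lines = [l.strip() for l in sample.split("\n")][:-1]
print lines[0].split(":")[-1].strip()            # "291"  -> the .conn.active value
print lines[2].split()                           # accepts, handled, requests counters
print [c.strip() for c in lines[-1].split()]     # Reading/Writing/Waiting fields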
Example #10
def main():
    utils.drop_privileges()
    if psutil is None:
        print >>sys.stderr, "error: python module `psutil' is missing"
        return 13
    if not len(PROCS) and not len(PYTHON_PROCS):
        print >>sys.stderr, "error: no PROCS or PYTHON_PROCS specified, " \
                            "create psconf module"
        return 14

    while True:

        lines = []
        for proc in psutil.process_iter():
            if len(PYTHON_PROCS) and proc.name() == PYTHON_INTERP:
                for pp in PYTHON_PROCS:
                    if proc.cmdline()[1].endswith(pp):
                        add_metrics(proc, lines, pp)
            elif proc.name() in PROCS:
                add_metrics(proc, lines, proc.name())

        if len(lines):
            for l in lines:
                print l

        sys.stdout.flush()
        time.sleep(INTERVAL)
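
add_metrics and the psconf-supplied PROCS/PYTHON_PROCS lists are not shown here. A hedged sketch of one possible add_metrics, using psutil calls that exist in current releases (cpu_percent, memory_info, num_threads); the metric names are purely illustrative:

def add_metrics(proc, lines, name):
    # Hypothetical helper: append one tcollector-style line per stat for this process.
    ts = int(time.time())
    try:
        mem = proc.memory_info()
        lines.append("ps.cpu.percent %d %s name=%s" % (ts, proc.cpu_percent(interval=None), name))
        lines.append("ps.mem.rss %d %d name=%s" % (ts, mem.rss, name))
        lines.append("ps.threads %d %d name=%s" % (ts, proc.num_threads(), name))
    except psutil.NoSuchProcess:
        pass  # the process exited while we were sampling it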
Example #11
def main():
    """ntpstats main loop"""

    if not (ntpstat_conf and ntpstat_conf.enabled()):
        sys.exit(13)

    utils.drop_privileges()

    while True:
        ts = int(time.time())
        try:
            ntp_proc = subprocess.Popen(["ntpq", "-p"], stdout=subprocess.PIPE)
        except OSError, e:
            if e.errno == errno.ENOENT:
                # looks like ntpq is not available, stop using this collector
                sys.exit(13) # we signal tcollector to stop using this
            raise

        stdout, _ = ntp_proc.communicate()
        if ntp_proc.returncode == 0:
            offset = None
            for line in stdout.split("\n"):
                if not line:
                    continue
                fields = line.split()
                if len(fields) <= 0:
                    continue
                if fields[0].startswith("*"):
                    offset = fields[8]
                    continue
            if offset is not None:
                print("ntp.offset %d %s" % (ts, offset))
        else:
            print >> sys.stderr, "ntpq -p, returned %r" % (ntp_proc.returncode)

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
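
fields[8] is the offset column of `ntpq -p` output, and the peer the host is synchronized to is the row prefixed with "*". A worked sketch on one sample line (host name and numbers are made up):

line = "*ntp1.example.com 10.0.0.1  2 u  910 1024  377  0.341  -0.028   0.099"
fields = line.split()
# columns: remote refid st t when poll reach delay offset jitter
print fields[0].startswith("*")   # True  -> this is the peer we are synced to
print fields[8]                   # "-0.028" -> reported offset in milliseconds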
Example #12
def main():
    utils.drop_privileges()

    while True:
        try:
            if vstats == "all":
                stats = subprocess.Popen(["varnishstat", "-1", "-j"], stdout=subprocess.PIPE)
            else:
                fields = ",".join(vstats)
                stats = subprocess.Popen(["varnishstat", "-1", "-f" + fields, "-j"], stdout=subprocess.PIPE)
        except OSError, e:
            # Die and signal to tcollector not to run this script.
            sys.stderr.write("Error: %s\n" % e)
            sys.exit(13)

        metrics = ""
        for line in stats.stdout.readlines():
            metrics += line
        metrics = json.loads(metrics)

        timestamp = ""
        if use_varnishstat_timestamp:
            pattern = "%Y-%m-%dT%H:%M:%S"
            timestamp = int(time.mktime(time.strptime(metrics["timestamp"], pattern)))
        else:
            timestamp = time.time()

        for k, v in metrics.iteritems():
            if k != "timestamp":
                metric_name = metric_prefix + "." + k
                print "%s %d %s %s" % (metric_name, timestamp, v["value"], ",".join(tags))

        sys.stdout.flush()
        time.sleep(interval)
Example #13
def main():
 # ignore SIGCHLD, prevent the zombie apocalypse
 signal.signal(signal.SIGCHLD, signal.SIG_IGN)

 utils.drop_privileges()
 bad_regex = re.compile("[,()]+")  # avoid forbidden by TSD symbols

 while True:
    try:
      if vstats == "all":
        stats = subprocess.Popen(
          ["varnishstat", "-1", "-x"],
          stdout=subprocess.PIPE,
        )
      else:
        fields = ",".join(vstats)
        stats = subprocess.Popen(
          ["varnishstat", "-1", "-f" + fields, "-x"],
          stdout=subprocess.PIPE,
        )
    except OSError, e:
      # Die and signal to tcollector not to run this script.
      sys.stderr.write("Error: %s\n" % e)
      sys.exit(13)

    metrics = ""
    for line in stats.stdout.readlines():
      metrics += line
    metrics = ET.fromstringlist(metrics)

    timestamp = ""
    if use_varnishstat_timestamp:
      pattern = "%Y-%m-%dT%H:%M:%S"
      timestamp = int(time.mktime(time.strptime(metrics['timestamp'], pattern)))
    else:
      timestamp = time.time()

    for stat in metrics.findall('stat'):
      tags = ""
      k = stat.findtext('name')
      if bad_regex.search(k) is None:
        stattype = stat.findtext('type')
        if stattype is None:
          metric_name = metric_prefix + "." + k
        elif stattype == "LCK":
          metric_name = metric_prefix + ".locks." + k
          ident = stat.findtext('ident')
          tags = "ident=" + ident
        elif stattype == "SMA":
          metric_name = metric_prefix + ".storage." + k
          ident = stat.findtext('ident')
          tags = "ident=" + ident
        else:
          continue
        print "%s %d %s %s" % \
          (metric_name, timestamp, stat.findtext('value'), tags)

    sys.stdout.flush()
    time.sleep(interval)
Example #14
def main():
    utils.drop_privileges()
    if BinLogStreamReader is None:
        utils.err("error: Python module `pymysqlreplication' is missing")
        return 1
    settings = zabbix_bridge_conf.get_settings()

    # Set blocking to True if you want to block and wait for the next event at
    # the end of the stream
    stream = BinLogStreamReader(connection_settings=settings['mysql'],
                                server_id=settings['slaveid'],
                                only_events=[WriteRowsEvent],
                                resume_stream=True,
                                blocking=True)

    db_filename = settings['sqlitedb']
    dbcache = sqlite3.connect(':memory:')
    cachecur = dbcache.cursor()
    cachecur.execute("ATTACH DATABASE '%s' as 'dbfile'" % (db_filename,))
    cachecur.execute('CREATE TABLE zabbix_cache AS SELECT * FROM dbfile.zabbix_cache')
    cachecur.execute('CREATE UNIQUE INDEX uniq_zid on zabbix_cache (id)')

    # tcollector.zabbix_bridge namespace for internal Zabbix bridge metrics.
    log_pos = 0
    key_lookup_miss = 0
    sample_last_ts = int(time.time())
    last_key_lookup_miss = 0

    for binlogevent in stream:
        if binlogevent.schema == settings['mysql']['db']:
            table = binlogevent.table
            log_pos = binlogevent.packet.log_pos
            if table == 'history' or table == 'history_uint':
                for row in binlogevent.rows:
                    r = row['values']
                    itemid = r['itemid']
                    cachecur.execute('SELECT id, key, host, proxy FROM zabbix_cache WHERE id=?', (itemid,))
                    row = cachecur.fetchone()
                    if (row is not None):
                        print("zbx.%s %d %s host=%s proxy=%s" % (row[1], r['clock'], r['value'], row[2], row[3]))
                        if ((int(time.time()) - sample_last_ts) > settings['internal_metric_interval']): # Sample internal metrics @ 10s intervals
                            sample_last_ts = int(time.time())
                            print("tcollector.zabbix_bridge.log_pos %d %s" % (sample_last_ts, log_pos))
                            print("tcollector.zabbix_bridge.key_lookup_miss %d %s" % (sample_last_ts, key_lookup_miss))
                            print("tcollector.zabbix_bridge.timestamp_drift %d %s" % (sample_last_ts, (sample_last_ts - r['clock'])))
                            if ((key_lookup_miss - last_key_lookup_miss) > settings['dbrefresh']):
                                print("tcollector.zabbix_bridge.key_lookup_miss_reload %d %s" % (sample_last_ts, (key_lookup_miss - last_key_lookup_miss)))
                                cachecur.execute('DROP TABLE zabbix_cache')
                                cachecur.execute('CREATE TABLE zabbix_cache AS SELECT * FROM dbfile.zabbix_cache')
                                cachecur.execute('CREATE UNIQUE INDEX uniq_zid on zabbix_cache (id)')
                                last_key_lookup_miss = key_lookup_miss
                    else:
                        # TODO: Consider https://wiki.python.org/moin/PythonDecoratorLibrary#Retry
                        utils.err("error: Key lookup miss for %s" % (itemid))
                        key_lookup_miss += 1
                sys.stdout.flush()

    dbcache.close()
    stream.close()
Example #15
def main():
    """Main loop"""

    if USER != "root":
        utils.drop_privileges(user=USER)
    sys.stdin.close()

    interval = 15

    # we scan for instances here to see if there are any redis servers
    # running on this machine...
    last_scan = time.time()
    instances = scan_for_instances()  # port:name
    if not len(instances):
        return 13
    if not has_redis:
        sys.stderr.write("Found %d instance(s) to monitor, but the Python"
                         " Redis module isn't installed.\n" % len(instances))
        return 1

    def print_stat(metric, value, tags=""):
        if value is not None:
            print "redis.%s %d %s %s" % (metric, ts, value, tags)

    dbre = re.compile("^db\d+$")

    while True:
        ts = int(time.time())

        # if we haven't looked for redis instances recently, let's do that
        if ts - last_scan > SCAN_INTERVAL:
            instances = scan_for_instances()
            last_scan = ts

        # now iterate over every instance and gather statistics
        for port in instances:
            tags = "cluster=%s port=%d" % (instances[port], port)

            # connect to the instance and attempt to gather info
            r = redis.Redis(host="127.0.0.1", port=port)
            info = r.info()
            for key in KEYS:
                if key in info:
                    print_stat(key, info[key], tags)

            # per database metrics
            for db in filter(dbre.match, info.keys()):
                for db_metric in info[db].keys():
                    print_stat(db_metric, info[db][db_metric], "%s db=%s" % (tags, db))

            # get some instant latency information
            # TODO: might be nice to get 95th, 99th, etc here?
            start_time = time.time()
            r.ping()
            print_stat("latency", time.time() - start_time, tags)

        sys.stdout.flush()
        time.sleep(interval)
Example #16
 def loop(self):
     utils.drop_privileges()
     if json is None:
         utils.err("This collector requires the `json' Python module.")
         return 13  # Ask tcollector not to respawn us
     while True:
         self.emit()
         time.sleep(self.delay)
     return 0
Example #17
def main():
    utils.drop_privileges()

    while True:
        processes = ProcessTable()
        processes.update()
        collect_tcollect_stats(processes)

        time.sleep(COLLECTION_INTERVAL)
Example #18
def main():
    """dfstats main loop"""

    utils.drop_privileges()
    while True:
        ts = int(time.time())
        # 1kblocks
        df_proc = subprocess.Popen(["df", "-PlTk"], stdout=subprocess.PIPE)
        stdout, _ = df_proc.communicate()
        if df_proc.returncode == 0:
            for line in stdout.split("\n"):  # pylint: disable=E1103
                fields = line.split()
                # skip header/blank lines
                if not line or not fields[2].isdigit():
                    continue
                # Skip mounts/types we don't care about.
                # Most of this stuff is of type tmpfs, but we don't
                # want to blacklist all tmpfs since sometimes it's
                # used for active filesystems (/var/run, /tmp)
                # that we do want to track.
                if fields[1] in ("debugfs", "devtmpfs"):
                    continue
                if fields[6] == "/dev":
                    continue
                # /dev/shm, /lib/init_rw, /lib/modules, etc
                # if fields[6].startswith(("/lib/", "/dev/")):  # python2.5+
                if fields[6].startswith("/lib/"):
                    continue
                if fields[6].startswith("/dev/"):
                    continue

                mount = fields[6]
                print("df.1kblocks.total %d %s mount=%s fstype=%s" % (ts, fields[2], mount, fields[1]))
                print("df.1kblocks.used %d %s mount=%s fstype=%s" % (ts, fields[3], mount, fields[1]))
                print("df.1kblocks.free %d %s mount=%s fstype=%s" % (ts, fields[4], mount, fields[1]))
        else:
            print >>sys.stderr, "df -Pltk returned %r" % df_proc.returncode

        ts = int(time.time())
        # inodes
        df_proc = subprocess.Popen(["df", "-PlTi"], stdout=subprocess.PIPE)
        stdout, _ = df_proc.communicate()
        if df_proc.returncode == 0:
            for line in stdout.split("\n"):  # pylint: disable=E1103
                fields = line.split()
                if not line or not fields[2].isdigit():
                    continue

                mount = fields[6]
                print("df.inodes.total %d %s mount=%s fstype=%s" % (ts, fields[2], mount, fields[1]))
                print("df.inodes.used %d %s mount=%s fstype=%s" % (ts, fields[3], mount, fields[1]))
                print("df.inodes.free %d %s mount=%s fstype=%s" % (ts, fields[4], mount, fields[1]))
        else:
            print >>sys.stderr, "df -Plti returned %r" % df_proc.returncode

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
Example #19
def main(argv):
  utils.drop_privileges()
  socket.setdefaulttimeout(DEFAULT_TIMEOUT)
  server = httplib.HTTPConnection(ES_HOST, ES_PORT)
  try:
    server.connect()
  except socket.error, (erno, e):
    if erno == errno.ECONNREFUSED:
      return 13  # No ES running, ask tcollector to not respawn us.
    raise
Example #20
def main(args):
    utils.drop_privileges()
    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us
    hbase_service = HBaseMaster()
    while True:
        hbase_service.emit()
        time.sleep(90)
    return 0
Example #21
def main(args):
    utils.drop_privileges()
    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us
    datanode_service = HadoopDataNode()
    while True:
        datanode_service.emit()
        time.sleep(15)
    return 0
Example #22
def main(args):
    utils.drop_privileges()
    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us
    rm_node_service = HadoopResourceManager()
    while True:
        rm_node_service.emit()
        time.sleep(90)
    return 0
Example #23
def main():
    try:
        utils.drop_privileges()
        # collect period 60 secs
        url = "http://localhost:9999/stats.txt?period=60"
        response = urllib2.urlopen(url)
        content = response.read()
        process(content)

    except Exception:
        pass
Example #24
def main():
    utils.drop_privileges()

    monitors_dict = {role: None for role in DRUID_ROLES}

    while True:
        for role, monitor in monitors_dict.items():
            if monitor is None or monitor.poll() is not None:
                monitors_dict[role] = spawn_monitor(role)

        time.sleep(5)
Example #25
def main():
    if not (udp_bridge_conf and udp_bridge_conf.enabled()):
      sys.exit(13)
    utils.drop_privileges()

    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        sock.bind((HOST, PORT))
    except socket.error, msg:
        sys.stderr.write('could not open socket: %s\n' % msg)
        sys.exit(1)
Example #26
def main():
    if not (graphite_bridge_conf and graphite_bridge_conf.enabled()):
      sys.exit(13)
    utils.drop_privileges()

    server = GraphiteServer((HOST, PORT), GraphiteHandler)
    server.daemon_threads = True
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        server.shutdown()
        server.server_close()
Example #27
def main(argv):
  utils.drop_privileges()
  socket.setdefaulttimeout(DEFAULT_TIMEOUT)
  if json is None:
    err("This collector requires the `json' Python module.")
    return 1

  while True:
    ts = int(time.time())
    output_stats("impalad",ts,worker_stats())
    output_stats("statestored",ts,statestore_stats())
    output_stats("catalogd",ts,catalog_stats())
    time.sleep(COLLECTION_INTERVAL)
Example #28
def main(argv):
    utils.drop_privileges(user=USER)

    # Build the classpath.
    dir = os.path.dirname(sys.argv[0])
    jar = os.path.normpath(dir + "/../lib/jmx-1.0.jar")
    if not os.path.exists(jar):
        print >>sys.stderr, "WTF?!  Can't run, %s doesn't exist" % jar
        return 13
    classpath = [jar]
    for jar in CLASSPATH:
        if os.path.exists(jar):
            classpath.append(jar)
    classpath = ":".join(classpath)

    jpid = "worker"
    jps = subprocess.check_output("/usr/bin/jps").split("\n")
    for item in jps:
      vals = item.split(" ")
      if len(vals) == 2:
        if vals[1] == "worker":
          jmx = subprocess.Popen(
             [JAVA, "-enableassertions", "-enablesystemassertions",
              "-Xmx64m", "-cp", classpath, "com.stumbleupon.monitoring.jmx",
              vals[0]
              ], stdout=subprocess.PIPE).communicate()[0]

          if len(jmx) > 0:
            topologyPos=jmx.find("userevents:type=JmxMetricsConsumer")
            if topologyPos != -1:
              beans = [x.split("\t")[0] for x in jmx.split("\n")]
              #Check if there is a name
              topologyName="userevents"
              taskId=0

              for bean in beans:
                if bean.startswith('userevents'):
                  stormInfo=bean.split(',')
                  for stormDetail in stormInfo:
                    if stormDetail.startswith('name'):
                      topologyName=stormDetail.split('=')[1]
                    elif stormDetail.startswith('task'):
                      taskId=stormDetail.split('=')[1]

              t = Thread(target=processJMX, args=(vals[0], topologyName, taskId, classpath))
              t.daemon = True # thread dies with the program
              t.start()


    time.sleep(30)
    return 0  # Ask the tcollector to re-spawn us.
Example #29
def main():
    """ifstat main loop"""

    f_netdev = open("/proc/net/dev")
    utils.drop_privileges()

    # We just care about ethN and emN interfaces.  We specifically
    # want to avoid bond interfaces, because interface
    # stats are still kept on the child interfaces when
    # you bond.  By skipping bond we avoid double counting.
    while True:
        f_netdev.seek(0)
        ts = int(time.time())
        for line in f_netdev:
            m = re.match(r'''
                \s*
                (
                    eth?\d+ |
                    em\d+_\d+/\d+ | em\d+_\d+ | em\d+ |
                    p\d+p\d+_\d+/\d+ | p\d+p\d+_\d+ | p\d+p\d+ |
                    (?:   # Start of 'predictable network interface names'
                        (?:en|sl|wl|ww)
                        (?:
                            b\d+ |           # BCMA bus
                            c[0-9a-f]+ |     # CCW bus group
                            o\d+(?:d\d+)? |  # On-board device
                            s\d+(?:f\d+)?(?:d\d+)? |  # Hotplug slots
                            x[0-9a-f]+ |     # Raw MAC address
                            p\d+s\d+(?:f\d+)?(?:d\d+)? | # PCI geographic loc
                            p\d+s\d+(?:f\d+)?(?:u\d+)*(?:c\d+)?(?:i\d+)? # USB
                         )
                    )
                ):(.*)''', line, re.VERBOSE)
            if not m:
                continue
            intf = m.group(1)
            stats = m.group(2).split(None)

            def direction(i):
                if i >= 8:
                    return "out"
                return "in"
            for i in xrange(16):
                print("proc.net.%s %d %s iface=%s direction=%s"
                      % (FIELDS[i], ts, stats[i], intf, direction(i)))

        sys.stdout.flush()
        time.sleep(interval)
Example #30
def main(argv):
  if not (flume_conf and flume_conf.enabled() and flume_conf.get_settings()):
    # Status code 13 tells the parent tcollector not to respawn this collector
    return 13

  settings = flume_conf.get_settings()

  if (settings['default_timeout']):
    DEFAULT_TIMEOUT = settings['default_timeout']

  if (settings['flume_host']):
    FLUME_HOST = settings['flume_host']

  if (settings['flume_port']):
    FLUME_PORT = settings['flume_port']

  utils.drop_privileges()
  socket.setdefaulttimeout(DEFAULT_TIMEOUT)
  server = httplib.HTTPConnection(FLUME_HOST, FLUME_PORT)
  try:
    server.connect()
  except:
    # Nothing really wrong if the Flume server is unavailable, we should just try again next time.
    return 0

  if json is None:
    err("This collector requires the `json' Python module.")
    return 1

  def printmetric(component, metric, value, **tags):
    if tags:
      tags = " " + " ".join("%s=%s" % (name, value)
                            for name, value in tags.iteritems())
    else:
      tags = ""
    print ("flume.%s.%s %d %s %s" % (component, metric, ts, value, tags))

  # Get the metrics
  ts = int(time.time())  # In case last call took a while.
  stats = flume_metrics(server)

  for component in stats:
    (component_type, name) = component.split(".")
    tags = {"type": name}
    for metric, value in stats[component].items():
      if metric not in EXCLUDE:
        printmetric(component_type.lower(), metric, value, **tags)
  return 0
Example #31
def main():
    if not (tcp_bridge_conf and tcp_bridge_conf.enabled()):
        print >> sys.stderr, 'not enabled, or tcp_bridge_conf unavailable'
        sys.exit(13)
    utils.drop_privileges()

    def printm(string, time, value):
        out.write(m_namespace + string + ' ' + str(time) + ' ' + str(value) +
                  '\n')

    def printmetrics():
        global m_delay
        global m_last

        ts = int(time.time())
        if ts > m_last + m_delay:
            printm('lines_read', ts, m_lines)
            printm('connections_processed', ts, m_connections)
            printm('processing_time', ts, m_ptime)
            printm('active', ts, 1)
            m_last = ts

    def clientthread(connection):
        global m_lines
        global m_connections
        global m_ptime

        start = time.time()
        f = connection.makefile()
        while True:
            data = f.readline()

            if not data:
                break

            data = removePut(data)
            out.write(data)
            m_lines += 1

        f.close()
        connection.close()

        end = time.time()
        m_ptime += (end - start)
        m_connections += 1
        printmetrics()

    def removePut(line):
        if line.startswith('put '):
            return line[4:]
        else:
            return line

    try:
        if tcp_bridge_conf.port():
            PORT = tcp_bridge_conf.port()

        if tcp_bridge_conf.host():
            HOST = tcp_bridge_conf.host()

        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.bind((HOST, PORT))
        sock.listen(1)

    except socket.error, msg:
        utils.err('could not open socket: %s' % msg)
        sys.exit(1)
Example #32
def main(argv):
    utils.drop_privileges(user=USER)
    # Build the classpath.
    dir = os.path.dirname(sys.argv[0])
    jar = os.path.normpath(dir + "/../lib/jmx-1.0.jar")
    if not os.path.exists(jar):
        print >>sys.stderr, "WTF?!  Can't run, %s doesn't exist" % jar
        return 13
    classpath = [jar]
    for jar in CLASSPATH:
        if os.path.exists(jar):
            classpath.append(jar)
    classpath = ":".join(classpath)

    jpid = "HRegionServer"
    jps = subprocess.check_output("jps").split("\n")
    for item in jps:
      vals = item.split(" ")
      if len(vals) == 2:
        if vals[1] == "HRegionServer":
          jpid = vals[0]
          break

    # in HBase 0.94 the mbean domain is hadoop
    # in HBase 0.96 it is Hadoop (capital H)
    jmx = subprocess.Popen(
        [JAVA, "-enableassertions", "-enablesystemassertions",  # safe++
         "-Xmx64m",  # Low RAM limit, to avoid stealing too much from prod.
         "-cp", classpath, "com.stumbleupon.monitoring.jmx",
         "--watch", INTERVAL , "--long", "--timestamp",
         jpid,  # Name of the process.
         # The remaining arguments are pairs (mbean_regexp, attr_regexp).
         # The first regexp is used to match one or more MBeans, the 2nd
         # to match one or more attributes of the MBeans matched.
         "[Hh]adoop", "",                  # All HBase / hadoop metrics.
         "Memory$", "",                    # Heap stats
         "Threading", "Count|Time$",       # Number of threads and CPU time.
         "OperatingSystem", "OpenFile",    # Number of open files.
         "GarbageCollector", "Collection", # GC runs and time spent GCing.
         ], stdout=subprocess.PIPE, bufsize=1)
    do_on_signal(signal.SIGINT, kill, jmx)
    do_on_signal(signal.SIGPIPE, kill, jmx)
    do_on_signal(signal.SIGTERM, kill, jmx)
    try:
        prev_timestamp = 0
        while True:
            line = jmx.stdout.readline()

            if not line and jmx.poll() is not None:
                break  # Nothing more to read and process exited.
            elif len(line) < 4:
                print >>sys.stderr, "invalid line (too short): %r" % line
                continue

            try:
                timestamp, metric, value, mbean = line.split("\t", 3)
            except ValueError, e:
                # Temporary workaround for jmx.jar not printing these lines we
                # don't care about anyway properly.
                if "java.lang.String" not in line:
                    print >>sys.stderr, "Can't split line: %r" % line
                continue

            # Sanitize the timestamp.
            try:
                timestamp = int(timestamp)
                if timestamp < time.time() - 600:
                    raise ValueError("timestamp too old: %d" % timestamp)
                if timestamp < prev_timestamp:
                    raise ValueError("timestamp out of order: prev=%d, new=%d"
                                     % (prev_timestamp, timestamp))
            except ValueError, e:
                print >>sys.stderr, ("Invalid timestamp on line: %r -- %s"
                                     % (line, e))
                continue
            prev_timestamp = timestamp

            if metric in IGNORED_METRICS:
              continue

            tags = ""
            # The JMX metrics have per-request-type metrics like so:
            #   metricNameNumOps
            #   metricNameMinTime
            #   metricNameMaxTime
            #   metricNameAvgTime
            # Group related metrics together in the same metric name, use tags
            # to separate the different request types, so we end up with:
            #   numOps op=metricName
            #   avgTime op=metricName
            # etc, which makes it easier to graph things with the TSD.
            if metric.endswith("MinTime"):  # We don't care about the minimum
                continue                    # time taken by operations.
            elif metric.startswith("tbl."): # Per-table/region/cf metrics
                continue                    # ignore for now, too much spam
            elif "BlockedSeconds" in metric or "LatencyHistogram" in metric: 
                continue                    # ignore for now, too much spam
            elif metric.endswith("KB"): 
                metric = metric[:-2]
                # Try converting to bytes
                try:
                  value = float(value) * 1024
                except ValueError, e:
                  value = 0
Example #33
# These are the jmx handlers we'll be using.
from bbm.jvm import jvm_collector
from bbm.tomcat import tomcat_collector

signal.signal(signal.SIGCHLD, signal.SIG_IGN)


# The sonos uses an embedded tomcat with the webapp name set to "Tomcat"; we
# rewrite the webapp name to "sonos"
def renamer(v):
    if v.metric.startswith("tomcat."):
        v.tags = map(
            lambda t: "webapp=sonos" if t.startswith("webapp=") else t, v.tags)
    return v


# Find the pid of the bbm-sonos server
pgrep = subprocess.check_output([
    "/usr/bin/pgrep", "-f", "-u", "bbm-sonos", "/usr/share/bbm-sonos/sonos.war"
])
jpid = pgrep.rstrip("\n")
if jpid == "":
    sys.exit(1)

# We can change over to the bbm-sonos user for security
utils.drop_privileges(user="******")

RunCollector(start_jmx_collector(15, jpid, jvm_collector + tomcat_collector,
                                 renamer),
             extraTags=["application=sonos"])
Example #34
def main():
    """dfstats main loop"""
    try:
        f_mounts = open("/proc/mounts", "r")
    except IOError as e:
        utils.err("error: can't open /proc/mounts: %s" % e)
        return 13  # Ask tcollector to not respawn us

    utils.drop_privileges()

    while True:
        devices = []
        f_mounts.seek(0)
        ts = int(time.time())

        for line in f_mounts:
            # Docs come from the fstab(5)
            # fs_spec     # Mounted block special device or remote filesystem
            # fs_file     # Mount point
            # fs_vfstype  # File system type
            # fs_mntops   # Mount options
            # fs_freq     # Dump(8) utility flags
            # fs_passno   # Order in which filesystem checks are done at reboot time
            try:
                fs_spec, fs_file, fs_vfstype, fs_mntops, fs_freq, fs_passno = line.split(
                    None)
            except ValueError as e:
                utils.err("error: can't parse line at /proc/mounts: %s" % e)
                continue

            if fs_spec == "none":
                continue
            elif fs_vfstype in FSTYPE_IGNORE or fs_vfstype.startswith("fuse."):
                continue
            # startswith(tuple) avoided to preserve support of Python 2.4
            elif fs_file.startswith("/dev") or fs_file.startswith("/sys") or \
                  fs_file.startswith("/proc") or fs_file.startswith("/lib") or \
                  fs_file.startswith("net:") or fs_file.startswith("/var/lib/kubelet"):
                continue

            # keep /dev/xxx device with shorter fs_file (remove mount binds)
            device_found = False
            if fs_spec.startswith("/dev"):
                for device in devices:
                    if fs_spec == device[0]:
                        device_found = True
                        if len(fs_file) < len(device[1]):
                            device[1] = fs_file
                        break
                if not device_found:
                    devices.append([fs_spec, fs_file, fs_vfstype])
            else:
                devices.append([fs_spec, fs_file, fs_vfstype])

        for device in devices:
            fs_spec, fs_file, fs_vfstype = device
            try:
                r = os.statvfs(fs_file)
            except OSError as e:
                utils.err("can't get info for mount point: %s: %s" %
                          (fs_file, e))
                continue

            used = r.f_blocks - r.f_bfree

            # conditional expression avoided to preserve support of Python 2.4
            # percent_used = 100 if r.f_blocks == 0 else used * 100.0 / r.f_blocks
            if r.f_blocks == 0:
                percent_used = 100
            else:
                percent_used = used * 100.0 / r.f_blocks

            print("df.bytes.total %d %s mount=%s fstype=%s" %
                  (ts, r.f_frsize * r.f_blocks, fs_file, fs_vfstype))
            print("df.bytes.used %d %s mount=%s fstype=%s" %
                  (ts, r.f_frsize * used, fs_file, fs_vfstype))
            print("df.bytes.percentused %d %s mount=%s fstype=%s" %
                  (ts, percent_used, fs_file, fs_vfstype))
            print("df.bytes.free %d %s mount=%s fstype=%s" %
                  (ts, r.f_frsize * r.f_bfree, fs_file, fs_vfstype))

            used = r.f_files - r.f_ffree

            # percent_used = 100 if r.f_files == 0 else used * 100.0 / r.f_files
            if r.f_files == 0:
                percent_used = 100
            else:
                percent_used = used * 100.0 / r.f_files

            print("df.inodes.total %d %s mount=%s fstype=%s" %
                  (ts, r.f_files, fs_file, fs_vfstype))
            print("df.inodes.used %d %s mount=%s fstype=%s" %
                  (ts, used, fs_file, fs_vfstype))
            print("df.inodes.percentused %d %s mount=%s fstype=%s" %
                  (ts, percent_used, fs_file, fs_vfstype))
            print("df.inodes.free %d %s mount=%s fstype=%s" %
                  (ts, r.f_ffree, fs_file, fs_vfstype))

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
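
For reference, the byte figures above come straight from os.statvfs: capacity is f_frsize * f_blocks and free space is f_frsize * f_bfree. A small standalone sketch of the same arithmetic on a single mount point (the path is just an example):

import os

r = os.statvfs("/")
total_bytes = r.f_frsize * r.f_blocks
free_bytes = r.f_frsize * r.f_bfree
used_bytes = total_bytes - free_bytes
if r.f_blocks == 0:
    percent_used = 100
else:
    percent_used = used_bytes * 100.0 / total_bytes
print("df on / : total=%d used=%d free=%d (%.1f%% used)"
      % (total_bytes, used_bytes, free_bytes, percent_used))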
Example #35


def err(msg):
  print >> sys.stderr, msg


def main():
  """dfstats main loop"""
  try:
    f_mounts = open("/proc/mounts", "r")
  except IOError, e:
    err("error: can't open /proc/mounts: %s" % e)
    return 13 # Ask tcollector to not respawn us

  utils.drop_privileges()

  while True:
    devices = []
    f_mounts.seek(0)
    ts = int(time.time())

    for line in f_mounts:
      # Docs come from the fstab(5)
      # fs_spec     # Mounted block special device or remote filesystem
      # fs_file     # Mount point
      # fs_vfstype  # File system type
      # fs_mntops   # Mount options
      # fs_freq     # Dump(8) utility flags
      # fs_passno   # Order in which filesystem checks are done at reboot time
      try:
Example #36
def main():
    """procstats main loop"""

    f_uptime = open("/proc/uptime", "r")
    f_meminfo = open("/proc/meminfo", "r")
    f_vmstat = open("/proc/vmstat", "r")
    f_stat = open("/proc/stat", "r")
    f_loadavg = open("/proc/loadavg", "r")
    f_entropy_avail = open("/proc/sys/kernel/random/entropy_avail", "r")
    f_interrupts = open("/proc/interrupts", "r")

    f_scaling = "/sys/devices/system/cpu/cpu%s/cpufreq/cpuinfo_%s_freq"
    f_scaling_min  = dict([])
    f_scaling_max  = dict([])
    f_scaling_cur  = dict([])
    for cpu in glob.glob("/sys/devices/system/cpu/cpu[0-9]*/cpufreq/cpuinfo_cur_freq"):
        m = re.match("/sys/devices/system/cpu/cpu([0-9]*)/cpufreq/cpuinfo_cur_freq", cpu)
        if not m:
            continue
        cpu_no = m.group(1)
        sys.stderr.write(f_scaling % (cpu_no,"min"))
        f_scaling_min[cpu_no] = open(f_scaling % (cpu_no,"min"), "r")
        f_scaling_max[cpu_no] = open(f_scaling % (cpu_no,"max"), "r")
        f_scaling_cur[cpu_no] = open(f_scaling % (cpu_no,"cur"), "r")

    numastats = open_sysfs_numa_stats()
    utils.drop_privileges()

    while True:
        # proc.uptime
        f_uptime.seek(0)
        ts = int(time.time())
        for line in f_uptime:
            m = re.match("(\S+)\s+(\S+)", line)
            if m:
                print "proc.uptime.total %d %s" % (ts, m.group(1))
                print "proc.uptime.now %d %s" % (ts, m.group(2))

        # proc.meminfo
        f_meminfo.seek(0)
        ts = int(time.time())
        for line in f_meminfo:
            m = re.match("(\w+):\s+(\d+)\s+(\w+)", line)
            if m:
                if m.group(3).lower() == 'kb':
                    # convert from kB to B for easier graphing
                    value = str(int(m.group(2)) * 1000)
                else:
                    value = m.group(2)
                print ("proc.meminfo.%s %d %s"
                        % (m.group(1).lower(), ts, value))

        # proc.vmstat
        f_vmstat.seek(0)
        ts = int(time.time())
        for line in f_vmstat:
            m = re.match("(\w+)\s+(\d+)", line)
            if not m:
                continue
            if m.group(1) in ("pgpgin", "pgpgout", "pswpin",
                              "pswpout", "pgfault", "pgmajfault"):
                print "proc.vmstat.%s %d %s" % (m.group(1), ts, m.group(2))

        # proc.stat
        f_stat.seek(0)
        ts = int(time.time())
        for line in f_stat:
            m = re.match("(\w+)\s+(.*)", line)
            if not m:
                continue
            if m.group(1).startswith("cpu"):
                cpu_m = re.match("cpu(\d+)", m.group(1))
                if cpu_m:
                    metric_percpu = '.percpu'
                    tags = ' cpu=%s' % cpu_m.group(1)
                else:
                    metric_percpu = ''
                    tags = ''
                fields = m.group(2).split()
                cpu_types = ['user', 'nice', 'system', 'idle', 'iowait',
                    'irq', 'softirq', 'guest', 'guest_nice']

                # We use zip to ignore fields that don't exist.
                for value, field_name in zip(fields, cpu_types):
                    print "proc.stat.cpu%s %d %s type=%s%s" % (metric_percpu,
                        ts, value, field_name, tags)
            elif m.group(1) == "intr":
                print ("proc.stat.intr %d %s"
                        % (ts, m.group(2).split()[0]))
            elif m.group(1) == "ctxt":
                print "proc.stat.ctxt %d %s" % (ts, m.group(2))
            elif m.group(1) == "processes":
                print "proc.stat.processes %d %s" % (ts, m.group(2))
            elif m.group(1) == "procs_blocked":
                print "proc.stat.procs_blocked %d %s" % (ts, m.group(2))

        f_loadavg.seek(0)
        ts = int(time.time())
        for line in f_loadavg:
            m = re.match("(\S+)\s+(\S+)\s+(\S+)\s+(\d+)/(\d+)\s+", line)
            if not m:
                continue
            print "proc.loadavg.1min %d %s" % (ts, m.group(1))
            print "proc.loadavg.5min %d %s" % (ts, m.group(2))
            print "proc.loadavg.15min %d %s" % (ts, m.group(3))
            print "proc.loadavg.runnable %d %s" % (ts, m.group(4))
            print "proc.loadavg.total_threads %d %s" % (ts, m.group(5))

        f_entropy_avail.seek(0)
        ts = int(time.time())
        for line in f_entropy_avail:
            print "proc.kernel.entropy_avail %d %s" % (ts, line.strip())

        f_interrupts.seek(0)
        ts = int(time.time())
        # Get number of CPUs from description line.
        num_cpus = len(f_interrupts.readline().split())
        for line in f_interrupts:
            cols = line.split()

            irq_type = cols[0].rstrip(":")
            if irq_type.isalnum():
                if irq_type.isdigit():
                    if cols[-2] == "PCI-MSI-edge" and "eth" in cols[-1]:
                        irq_type = cols[-1]
                    else:
                        continue  # Interrupt type is just a number, ignore.
                for i, val in enumerate(cols[1:]):
                    if i >= num_cpus:
                        # All values read, remaining cols contain textual
                        # description
                        break
                    if not val.isdigit():
                        # something is weird, there should only be digit values
                        sys.stderr.write("Unexpected interrupts value %r in"
                                         " %r: " % (val, cols))
                        break
                    print ("proc.interrupts %s %s type=%s cpu=%s"
                           % (ts, val, irq_type, i))

        print_numa_stats(numastats)

        # Print scaling stats
        ts = int(time.time())
        for cpu_no in f_scaling_min.keys():
            f = f_scaling_min[cpu_no]
            f.seek(0)
            for line in f:
                print "proc.scaling.min %d %s cpu=%s" % (ts, line.rstrip('\n'), cpu_no)
        ts = int(time.time())
        for cpu_no in f_scaling_max.keys():
            f = f_scaling_max[cpu_no]
            f.seek(0)
            for line in f:
                print "proc.scaling.max %d %s cpu=%s" % (ts, line.rstrip('\n'), cpu_no)
        ts = int(time.time())
        for cpu_no in f_scaling_cur.keys():
            f = f_scaling_cur[cpu_no]
            f.seek(0)
            for line in f:
                print "proc.scaling.cur %d %s cpu=%s" % (ts, line.rstrip('\n'), cpu_no)

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
Example #37
def main():
    """Main loop"""
    sys.stdin.close()

    interval = 15
    page_size = resource.getpagesize()

    try:
        sockstat = open("/proc/net/sockstat")
        netstat = open("/proc/net/netstat")
        snmp = open("/proc/net/snmp")
    except IOError as e:
        print("open failed: %s" % e, file=sys.stderr)
        return 13  # Ask tcollector to not re-start us.
    utils.drop_privileges()

    # Note: up until v2.6.37-rc2 most of the values were 32 bits.
    # The first value is pretty useless since it accounts for some
    # socket types but not others.  So we don't report it because it's
    # more confusing than anything else and it's not well documented
    # what type of sockets are or aren't included in this count.
    regexp = re.compile("sockets: used \d+\n"
                        "TCP: inuse (?P<tcp_inuse>\d+) orphan (?P<orphans>\d+)"
                        " tw (?P<tw_count>\d+) alloc (?P<tcp_sockets>\d+)"
                        " mem (?P<tcp_pages>\d+)\n"
                        "UDP: inuse (?P<udp_inuse>\d+)"
                        # UDP memory accounting was added in v2.6.25-rc1
                        "(?: mem (?P<udp_pages>\d+))?\n"
                        # UDP-Lite (RFC 3828) was added in v2.6.20-rc2
                        "(?:UDPLITE: inuse (?P<udplite_inuse>\d+)\n)?"
                        "RAW: inuse (?P<raw_inuse>\d+)\n"
                        "FRAG: inuse (?P<ip_frag_nqueues>\d+)"
                        " memory (?P<ip_frag_mem>\d+)\n")

    def print_sockstat(metric,
                       value,
                       tags=""):  # Note: tags must start with ' '
        if value is not None:
            print("net.sockstat.%s %d %s%s" % (metric, ts, value, tags))

    # If a line in /proc/net/{netstat,snmp} doesn't start with a word in that
    # dict, we'll ignore it.  We use the value to build the metric name.
    known_statstypes = {
        "TcpExt:": "tcp",
        "IpExt:": "ip",  # We don't collect anything from here for now.
        "Ip:": "ip",  # We don't collect anything from here for now.
        "Icmp:": "icmp",  # We don't collect anything from here for now.
        "IcmpMsg:": "icmpmsg",  # We don't collect anything from here for now.
        "Tcp:": "tcp",  # We don't collect anything from here for now.
        "Udp:": "udp",
        "UdpLite:": "udplite",  # We don't collect anything from here for now.
        "Arista:": "arista",  # We don't collect anything from here for now.
    }

    # Any stat in /proc/net/{netstat,snmp} that doesn't appear in this dict will
    # be ignored.  If we find a match, we'll use the (metricname, tags).
    tcp_stats = {
        # An application wasn't able to accept a connection fast enough, so
        # the kernel couldn't store an entry in the queue for this connection.
        # Instead of dropping it, it sent a cookie to the client.
        "SyncookiesSent": ("syncookies", "type=sent"),
        # After sending a cookie, it came back to us and passed the check.
        "SyncookiesRecv": ("syncookies", "type=received"),
        # After sending a cookie, it came back to us but looked invalid.
        "SyncookiesFailed": ("syncookies", "type=failed"),
        # When a socket is using too much memory (rmem), the kernel will first
        # discard any out-of-order packet that has been queued (with SACK).
        "OfoPruned": ("memory.prune", "type=drop_ofo_queue"),
        # If the kernel is really really desperate and cannot give more memory
        # to this socket even after dropping the ofo queue, it will simply
        # discard the packet it received.  This is Really Bad.
        "RcvPruned": ("memory.prune", "type=drop_received"),
        # We waited for another packet to send an ACK, but didn't see any, so
        # a timer ended up sending a delayed ACK.
        "DelayedACKs": ("delayedack", "type=sent"),
        # We wanted to send a delayed ACK but failed because the socket was
        # locked.  So the timer was reset.
        "DelayedACKLocked": ("delayedack", "type=locked"),
        # We sent a delayed and duplicated ACK because the remote peer
        # retransmitted a packet, thinking that it didn't get to us.
        "DelayedACKLost": ("delayedack", "type=lost"),
        # We completed a 3WHS but couldn't put the socket on the accept queue,
        # so we had to discard the connection.
        "ListenOverflows": ("failed_accept", "reason=full_acceptq"),
        # We couldn't accept a connection because one of: we had no route to
        # the destination, we failed to allocate a socket, we failed to
        # allocate a new local port bind bucket.  Note: this counter
        # also include all the increments made to ListenOverflows...
        "ListenDrops": ("failed_accept", "reason=other"),
        # A packet was lost and we used Forward RTO-Recovery to retransmit.
        "TCPForwardRetrans": ("retransmit", "type=forward"),
        # A packet was lost and we fast-retransmitted it.
        "TCPFastRetrans": ("retransmit", "type=fast"),
        # A packet was lost and we retransmitted after a slow start.
        "TCPSlowStartRetrans": ("retransmit", "type=slowstart"),
        # A packet was lost and we recovered after a fast retransmit.
        "TCPRenoRecovery": ("packetloss.recovery", "type=fast_retransmit"),
        # A packet was lost and we recovered by using selective
        # acknowledgements.
        "TCPSackRecovery": ("packetloss.recovery", "type=sack"),
        # We detected re-ordering using FACK (Forward ACK -- the highest
        # sequence number known to have been received by the peer when using
        # SACK -- FACK is used during congestion control).
        "TCPFACKReorder": ("reording", "detectedby=fack"),
        # We detected re-ordering using SACK.
        "TCPSACKReorder": ("reording", "detectedby=sack"),
        # We detected re-ordering using fast retransmit.
        "TCPRenoReorder": ("reording", "detectedby=fast_retransmit"),
        # We detected re-ordering using the timestamp option.
        "TCPTSReorder": ("reording", "detectedby=timestamp"),
        # We detected some erroneous retransmits and undid our CWND reduction.
        "TCPFullUndo": ("congestion.recovery", "type=full_undo"),
        # We detected some erroneous retransmits, a partial ACK arrived while
        # we were fast retransmitting, so we were able to partially undo some
        # of our CWND reduction.
        "TCPPartialUndo": ("congestion.recovery", "type=hoe_heuristic"),
        # We detected some erroneous retransmits, a D-SACK arrived and ACK'ed
        # all the retransmitted data, so we undid our CWND reduction.
        "TCPDSACKUndo": ("congestion.recovery", "type=sack"),
        # We detected some erroneous retransmits, a partial ACK arrived, so we
        # undid our CWND reduction.
        "TCPLossUndo": ("congestion.recovery", "type=ack"),
        # We received an unexpected SYN so we sent a RST to the peer.
        "TCPAbortOnSyn": ("abort", "type=unexpected_syn"),
        # We were in FIN_WAIT1 yet we received a data packet with a sequence
        # number that's beyond the last one for this connection, so we RST'ed.
        "TCPAbortOnData": ("abort", "type=data_after_fin_wait1"),
        # We received data but the user has closed the socket, so we have no
        # wait of handing it to them, so we RST'ed.
        "TCPAbortOnClose": ("abort", "type=data_after_close"),
        # This is Really Bad.  It happens when there are too many orphaned
        # sockets (not attached a FD) and the kernel has to drop a connection.
        # Sometimes it will send a reset to the peer, sometimes it wont.
        "TCPAbortOnMemory": ("abort", "type=out_of_memory"),
        # The connection timed out really hard.
        "TCPAbortOnTimeout": ("abort", "type=timeout"),
        # We killed a socket that was closed by the application and lingered
        # around for long enough.
        "TCPAbortOnLinger": ("abort", "type=linger"),
        # We tried to send a reset, probably during one of the TCPAbort*
        # situations above, but we failed e.g. because we couldn't allocate
        # enough memory (very bad).
        "TCPAbortFailed": ("abort.failed", None),
        # Number of times a socket was put in "memory pressure" due to a non
        # fatal memory allocation failure (reduces the send buffer size etc).
        "TCPMemoryPressures": ("memory.pressure", None),
        # We got a completely invalid SACK block and discarded it.
        "TCPSACKDiscard": ("invalid_sack", "type=invalid"),
        # We got a duplicate SACK while retransmitting so we discarded it.
        "TCPDSACKIgnoredOld": ("invalid_sack", "type=retransmit"),
        # We got a duplicate SACK and discarded it.
        "TCPDSACKIgnoredNoUndo": ("invalid_sack", "type=olddup"),
        # We received something but had to drop it because the socket's
        # receive queue was full.
        "TCPBacklogDrop": ("receive.queue.full", None),
    }
    known_stats = {
        "tcp": tcp_stats,
        "ip": {},
        "icmp": {},
        "icmpmsg": {},
        "udp": {
            # Total UDP datagrams received by this host
            "InDatagrams": ("datagrams", "direction=in"),
            # UDP datagrams received on a port with no listener
            "NoPorts": ("errors", "direction=in reason=noport"),
            # Total UDP datagrams that could not be delivered to an application
            # Note: this counter also increments for RcvbufErrors
            "InErrors": ("errors", "direction=in reason=other"),
            # Total UDP datagrams sent from this host
            "OutDatagrams": ("datagrams", "direction=out"),
            # Datagrams dropped because there wasn't enough socket buffer
            # memory to receive them
            "RcvbufErrors": ("errors", "direction=in reason=nomem"),
            # Datagrams dropped because there wasn't enough socket buffer
            # memory to transmit them
            "SndbufErrors": ("errors", "direction=out reason=nomem"),
        },
        "udplite": {},
        "arista": {},
    }

    def print_netstat(statstype, metric, value, tags=""):
        if tags:
            space = " "
        else:
            tags = space = ""
        print("net.stat.%s.%s %d %s%s%s" %
              (statstype, metric, ts, value, space, tags))

    def parse_stats(stats, filename):
        statsdikt = {}
        # /proc/net/{netstat,snmp} have an awkward column-oriented format.  It
        # looks like this:
        #   Header: SomeMetric OtherMetric
        #   Header: 1 2
        #   OtherHeader: ThirdMetric FooBar
        #   OtherHeader: 42 51
        #   OtherHeader: FourthMetric
        #   OtherHeader: 4
        # We first pair the lines together, then create a dict for each type:
        #   {"SomeMetric": "1", "OtherMetric": "2"}
        lines = stats.splitlines()
        assert len(lines) % 2 == 0, repr(lines)
        for header, data in zip(*(iter(lines), ) * 2):
            header = header.split()
            data = data.split()
            assert header[0] == data[0], repr((header, data))
            assert len(header) == len(data), repr((header, data))
            if header[0] not in known_statstypes:
                print("Unrecoginized line in %s:"
                      " %r (file=%r)" % (filename, header, stats),
                      file=sys.stderr)
                continue
            statstype = header.pop(0)
            data.pop(0)
            stats = dict(zip(header, data))
            statsdikt.setdefault(known_statstypes[statstype], {}).update(stats)
        for statstype, stats in statsdikt.items():
            # Undo the kernel's double counting
            if "ListenDrops" in stats:
                stats["ListenDrops"] = int(stats["ListenDrops"]) - int(
                    stats.get("ListenOverflows", 0))
            elif "RcvbufErrors" in stats:
                stats["InErrors"] = int(stats.get("InErrors", 0)) - int(
                    stats["RcvbufErrors"])
            for stat, (metric, tags) in known_stats[statstype].items():
                value = stats.get(stat)
                if value is not None:
                    print_netstat(statstype, metric, value, tags)

    while True:
        ts = int(time.time())
        sockstat.seek(0)
        netstat.seek(0)
        snmp.seek(0)
        data = sockstat.read()
        netstats = netstat.read()
        snmpstats = snmp.read()
        m = re.match(regexp, data)
        if not m:
            print("Cannot parse sockstat: %r" % data, file=sys.stderr)
            return 13

        # The difference between the first two values is the number of
        # sockets allocated vs the number of sockets actually in use.
        print_sockstat("num_sockets", m.group("tcp_sockets"), " type=tcp")
        print_sockstat("num_timewait", m.group("tw_count"))
        print_sockstat("sockets_inuse", m.group("tcp_inuse"), " type=tcp")
        print_sockstat("sockets_inuse", m.group("udp_inuse"), " type=udp")
        print_sockstat("sockets_inuse", m.group("udplite_inuse"),
                       " type=udplite")
        print_sockstat("sockets_inuse", m.group("raw_inuse"), " type=raw")

        print_sockstat("num_orphans", m.group("orphans"))
        print_sockstat("memory",
                       int(m.group("tcp_pages")) * page_size, " type=tcp")
        if m.group("udp_pages") is not None:
            print_sockstat("memory",
                           int(m.group("udp_pages")) * page_size, " type=udp")
        print_sockstat("memory", m.group("ip_frag_mem"), " type=ipfrag")
        print_sockstat("ipfragqueues", m.group("ip_frag_nqueues"))

        parse_stats(netstats, netstat.name)
        parse_stats(snmpstats, snmp.name)

        sys.stdout.flush()
        time.sleep(interval)
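
A minimal, standalone sketch of the header/data pairing that parse_stats above depends on; the sample input and the helper name pair_proc_net_lines are illustrative only, not part of the collector.

def pair_proc_net_lines(text):
    """Pairs 'Header: names...' / 'Header: values...' lines into dicts."""
    lines = text.splitlines()
    assert len(lines) % 2 == 0, repr(lines)
    result = {}
    for header, data in zip(*(iter(lines),) * 2):
        header = header.split()
        data = data.split()
        assert header[0] == data[0], repr((header, data))
        # The header word (e.g. "Udp:") keys a dict of metric name -> value.
        result.setdefault(header[0].rstrip(":"), {}).update(
            zip(header[1:], data[1:]))
    return result

sample = ("Udp: InDatagrams NoPorts InErrors OutDatagrams\n"
          "Udp: 1000 2 5 900")
print(pair_proc_net_lines(sample))
# -> {'Udp': {'InDatagrams': '1000', 'NoPorts': '2', 'InErrors': '5',
#             'OutDatagrams': '900'}}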
Example #38
0
def main():
    """iostats main loop."""
    f_diskstats = open("/proc/diskstats")
    HZ = get_system_hz()
    itv = 1.0
    utils.drop_privileges()

    while True:
        f_diskstats.seek(0)
        ts = int(time.time())
        itv = read_uptime()[1]
        for line in f_diskstats:
            # maj, min, devicename, [list of stats, see above]
            values = line.split(None)
            # shortcut the deduper and just skip disks that
            # haven't done a single read.  This eliminates a bunch
            # of loopback, ramdisk, and cdrom devices but still
            # lets us report on the rare case that we actually use
            # a ramdisk.
            if values[3] == "0":
                continue

            if int(values[1]) % 16 == 0 and int(values[0]) > 1:
                metric = "iostat.disk."
            else:
                metric = "iostat.part."

            device = values[2]
            if len(values) == 14:
                # full stats line
                for i in range(11):
                    print("%s%s %d %s dev=%s" %
                          (metric, FIELDS_DISK[i], ts, values[i + 3], device))

                ret = is_device(device, 0)
                # if a device or a partition, calculate the svctm/await/util
                if ret:
                    stats = dict(zip(FIELDS_DISK, values[3:]))
                    nr_ios = float(stats.get("read_requests")) + \
                        float(stats.get("write_requests"))
                    tput = (nr_ios * float(HZ) / float(itv))
                    util = (float(stats.get("msec_total")) * float(HZ) /
                            float(itv))
                    svctm = 0.0
                    await_ = 0.0

                    if tput:
                        svctm = util / tput

                    if nr_ios:
                        rd_ticks = stats.get("msec_read")
                        wr_ticks = stats.get("msec_write")
                        await_ = (float(rd_ticks) +
                                  float(wr_ticks)) / float(nr_ios)
                    print("%s%s %d %.2f dev=%s" %
                          (metric, "svctm", ts, svctm, device))
                    print("%s%s %d %.2f dev=%s" %
                          (metric, "await", ts, await, device))
                    print("%s%s %d %.2f dev=%s" %
                          (metric, "util", ts, float(util / 1000.0), device))

            elif len(values) == 7:
                # partial stats line
                for i in range(4):
                    print("%s%s %d %s dev=%s" %
                          (metric, FIELDS_PART[i], ts, values[i + 3], device))
            else:
                print("Cannot parse /proc/diskstats line: ",
                      line,
                      file=sys.stderr)
                continue

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
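
A worked sketch of the svctm/await/util arithmetic in the loop above, using made-up counter values; the helper name derive_iostat is not part of the collector.

def derive_iostat(read_requests, write_requests, msec_read, msec_write,
                  msec_total, hz, itv):
    """Reproduces the derived-metric math above on raw counter values."""
    nr_ios = float(read_requests) + float(write_requests)
    tput = nr_ios * float(hz) / float(itv)
    util = float(msec_total) * float(hz) / float(itv)
    svctm = util / tput if tput else 0.0      # average service time per I/O
    await_ = ((float(msec_read) + float(msec_write)) / nr_ios
              if nr_ios else 0.0)             # average wait per request (ms)
    return svctm, await_, util / 1000.0       # util scaled as in the print above

# Invented counters: 1200 reads, 300 writes, 5400/2100 ms spent reading/writing.
print(derive_iostat(1200, 300, 5400, 2100, 6000, hz=100, itv=360000))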
Example #39
0
from bbm.jmx import JMXPattern

# These are the jmx handlers we'll be using.
from bbm.jvm import jvm_collector

signal.signal(signal.SIGCHLD, signal.SIG_IGN)

# Find the pid of the activemq broker
pgrep = subprocess.check_output(
    ["/usr/bin/pgrep", "-u", "activemq", "-f", "xbean:activemq.xml"])
jpid = pgrep.rstrip("\n")
if jpid == "":
    sys.exit(1)

# We can change over to the bbm-core-api user for security
utils.drop_privileges(user="******")


def rewriter(v):
    if v.metric.startswith("jmx.org.apache.activemq."):
        # Strip off the leading metric prefix
        v.metric = v.metric[len("jmx.org.apache.activemq."):]
        metrictype = None
        for t in v.tags:
            if t.startswith("Type="):
                metrictype = t[len("Type="):].lower()
                break

        if metrictype is None:
            return []
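
A self-contained sketch of the prefix-strip and Type-tag lookup that rewriter() above performs; the Value namedtuple stands in for the real bbm metric object, which this example doesn't show.

import collections

Value = collections.namedtuple("Value", ["metric", "tags"])
PREFIX = "jmx.org.apache.activemq."

def extract_type(v):
    """Strips the ActiveMQ JMX prefix and pulls the lowercased Type= tag."""
    if not v.metric.startswith(PREFIX):
        return v.metric, None
    metric = v.metric[len(PREFIX):]
    metrictype = None
    for t in v.tags:
        if t.startswith("Type="):
            metrictype = t[len("Type="):].lower()
            break
    return metric, metrictype

v = Value("jmx.org.apache.activemq.Broker.TotalEnqueueCount",
          ["Type=Broker", "BrokerName=localhost"])
print(extract_type(v))  # -> ('Broker.TotalEnqueueCount', 'broker')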
Example #40
0
def main():
    """Main loop"""

    if USER != "root":
        utils.drop_privileges(user=USER)
    sys.stdin.close()

    config = redis_stats_conf.get_config()
    interval = config['collection_interval']

    # we scan for instances here to see if there are any redis servers
    # running on this machine...
    last_scan = time.time()
    instances = scan_for_instances()  # port:name
    if not len(instances):
        return 13
    if not has_redis:
        sys.stderr.write("Found %d instance(s) to monitor, but the Python"
                         " Redis module isn't installed.\n" % len(instances))
        return 1

    def print_stat(metric, value, tags=""):
        if value is not None:
            print "redis.%s %d %s %s" % (metric, ts, value, tags)

    dbre = re.compile(r"^db\d+$")

    while True:
        ts = int(time.time())

        # if we haven't looked for redis instances recently, let's do that
        if ts - last_scan > SCAN_INTERVAL:
            instances = scan_for_instances()
            last_scan = ts

        # now iterate over every instance and gather statistics
        for port in instances:
            tags = "cluster=%s port=%d" % (instances[port], port)

            # connect to the instance and attempt to gather info
            r = redis.Redis(host="127.0.0.1", port=port)
            try:
                info = r.info()
                for key in KEYS:
                    if key in info:
                        print_stat(key, info[key], tags)

                # per database metrics
                for db in filter(dbre.match, info.keys()):
                    for db_metric in info[db].keys():
                        print_stat(db_metric, info[db][db_metric],
                                   "%s db=%s" % (tags, db))

                # get some instant latency information
                # TODO: might be nice to get 95th, 99th, etc here?
                start_time = time.time()
                r.ping()
                print_stat("latency", time.time() - start_time, tags)
            finally:
                r.connection_pool.disconnect()

        sys.stdout.flush()
        time.sleep(interval)
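
A short sketch of how the per-database branch of the loop above flattens redis INFO output into tagged datapoints; the info dict mimics what redis-py's info() returns, but the values are invented.

import re
import time

dbre = re.compile(r"^db\d+$")
info = {
    "connected_clients": 7,
    "db0": {"keys": 1500, "expires": 12},
    "db1": {"keys": 30, "expires": 0},
}
ts = int(time.time())
tags = "cluster=main port=6379"
for db in filter(dbre.match, info.keys()):
    for db_metric, value in info[db].items():
        # Same output format as print_stat above, with the db tag appended.
        print("redis.%s %d %s %s db=%s" % (db_metric, ts, value, tags, db))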
Example #41
0
#!/usr/bin/python

import signal
import sys
import subprocess
from collectors.lib import utils
from bbm import RunCollector
from bbm.jmx import start_jmx_collector

# These are the jmx handlers we'll be using.
from bbm.jvm import jvm_collector
from bbm.jetty import jetty_collector

signal.signal(signal.SIGCHLD, signal.SIG_IGN)

# Find the pid of the bbm-events-api server
pgrep = subprocess.check_output([
    "/usr/bin/pgrep", "-u", "bbm-events-api", "-f",
    "/etc/bbm/bbm-events-api.yml"
])
jpid = pgrep.rstrip("\n")
if jpid == "":
    sys.exit(1)

# We can change over to the tomcat7 user for security
utils.drop_privileges(user="******")

RunCollector(start_jmx_collector(15, jpid, jvm_collector + jetty_collector),
             extraTags=["application=events-api"])
Example #42
0
            for tag in self.txns_max.keys():
                data = data + [
                    TSDBMetricData("timings.txns.max", self.txns_max[tag],
                                   tag.split(" "))
                ]
            self.time_max = {}
            self.db_max = {}
            self.txns_max = {}
            self.memc_max = {}
            self.oldtime = newtime


#            for tag in self.ends_hash.keys():
#                data = data + [TSDBMetricData("streams.duration", self.ends_hash[tag],tag.split(" "))]
#
#        if (newtime - self.olduserstime) >= (1000 * 60 * 5): # Only output users stats every 5 minutes
#            for tag in self.users_hash.keys():
#                data = data + [TSDBMetricData("streams.users.5min", len(self.users_hash[tag]),tag.split(" "))]
#            self.users_hash = {}
#            self.olduserstime = newtime

        return data

utils.drop_privileges(user="******")

parser = LogParser()
RunCollector(start_dated_files_collector("/var/log/java", "*/*/*-timing.log",
                                         "%Y/%m/%Y%m%d-timing.log",
                                         parser.ParseLine),
             exitOnFinished=False)
Example #43
0
from bbm.c3p0 import c3p0_collector

signal.signal(signal.SIGCHLD, signal.SIG_IGN)


# The core-api uses an embedded tomcat with the webapp name set to "Tomcat";
# we'll rewrite the webapp name to "coreapi".
def renamer(v):
    if v.metric.startswith("tomcat."):
        v.tags = map(
            lambda t: "webapp=coreapi"
            if t.startswith("webapp=") else t, v.tags)
    return v


# Find the pid of the bbm-core-api server
pgrep = subprocess.check_output([
    "/usr/bin/pgrep", "-f", "-u", "bbm-core-api",
    "/usr/share/bbm-core-api/ROOT.war"
])
jpid = pgrep.rstrip("\n")
if jpid == "":
    sys.exit(1)

# We can change over to the bbm-core-api user for security
utils.drop_privileges(user="******")

RunCollector(start_jmx_collector(
    15, jpid, jvm_collector + tomcat_collector + c3p0_collector, renamer),
             extraTags=["application=coreapi"])
Example #44
0
from bbm.jvm import jvm_collector
from bbm.tomcat import tomcat_collector

signal.signal(signal.SIGCHLD, signal.SIG_IGN)


# The bbm-admin uses an embedded tomcat with the webapp name set to "Tomcat";
# we'll rewrite the webapp name to "admin".
def renamer(v):
    if v.metric.startswith("tomcat."):
        v.tags = map(
            lambda t: "webapp=admin" if t.startswith("webapp=") else t, v.tags)
    return v


# Find the pid of the bbm-admin server
pgrep = subprocess.check_output([
    "/usr/bin/pgrep", "-f", "-u", "bbm-admin",
    "/usr/share/bbm-admin/admin-assembly-1.0.jar"
])
jpid = pgrep.rstrip("\n")
if jpid == "":
    sys.exit(1)

# We can change over to the bbm-admin user for security
utils.drop_privileges(user="******")

RunCollector(start_jmx_collector(15, jpid, jvm_collector + tomcat_collector,
                                 renamer),
             extraTags=["application=admin"])
Example #45
0
def main():
    """iostats main loop."""
    init_stats = {
        "read_requests": 0,
        "read_merged": 0,
        "read_sectors": 0,
        "msec_read": 0,
        "write_requests": 0,
        "write_merged": 0,
        "write_sectors": 0,
        "msec_write": 0,
        "ios_in_progress": 0,
        "msec_total": 0,
        "msec_weighted_total": 0,
    }
    prev_stats = dict()
    f_diskstats = open("/proc/diskstats")
    HZ = get_system_hz()
    itv = 1.0
    utils.drop_privileges()

    while True:
        f_diskstats.seek(0)
        ts = int(time.time())
        itv = read_uptime()[0]
        for line in f_diskstats:
            # maj, min, devicename, [list of stats, see above]
            values = line.split(None)
            # shortcut the deduper and just skip disks that
            # haven't done a single read.  This eliminates a bunch
            # of loopback, ramdisk, and cdrom devices but still
            # lets us report on the rare case that we actually use
            # a ramdisk.
            if values[3] == "0":
                continue

            if int(values[1]) % 16 == 0 and int(values[0]) > 1:
                metric = "iostat.disk."
            else:
                metric = "iostat.part."

            device = values[2]
            if len(values) == 14:
                # full stats line
                for i in range(11):
                    print("%s%s %d %s dev=%s" %
                          (metric, FIELDS_DISK[i], ts, values[i + 3], device))
                    if FIELDS_DISK[i] == "read_sectors":
                        if PY3:
                            v = int(
                                values[i + 3]) * get_device_sector_size(device)
                        else:
                            # noinspection PyCompatibility
                            v = long(values[i + 3]) * get_device_sector_size(
                                device)  # pylint:disable=undefined-variable
                        print("%s%s %d %s dev=%s" %
                              (metric, "read_bytes", ts, v, device))
                    if FIELDS_DISK[i] == "write_sectors":
                        if PY3:
                            v = int(
                                values[i + 3]) * get_device_sector_size(device)
                        else:
                            # noinspection PyCompatibility
                            v = long(values[i + 3]) * get_device_sector_size(
                                device)  # pylint:disable=undefined-variable
                        print("%s%s %d %s dev=%s" %
                              (metric, "write_bytes", ts, v, device))

                ret = is_device(device, 0)
                # if a device or a partition, calculate the svctm/await/util
                if ret:
                    stats = dict(zip(FIELDS_DISK, values[3:]))
                    if device not in prev_stats:
                        prev_stats[device] = init_stats
                    rd_ios = float(stats.get("read_requests"))
                    wr_ios = float(stats.get("write_requests"))
                    nr_ios = rd_ios + wr_ios
                    prev_rd_ios = float(
                        prev_stats[device].get("read_requests"))
                    prev_wr_ios = float(
                        prev_stats[device].get("write_requests"))
                    prev_nr_ios = prev_rd_ios + prev_wr_ios
                    tput = ((nr_ios - prev_nr_ios) * float(HZ) / float(itv))
                    util = ((float(stats.get("msec_total")) -
                             float(prev_stats[device].get("msec_total"))) *
                            float(HZ) / float(itv))
                    svctm = 0.0
                    await_ = 0.0
                    r_await = 0.0
                    w_await = 0.0

                    if tput:
                        svctm = util / tput

                    rd_ticks = stats.get("msec_read")
                    wr_ticks = stats.get("msec_write")
                    prev_rd_ticks = prev_stats[device].get("msec_read")
                    prev_wr_ticks = prev_stats[device].get("msec_write")
                    if rd_ios != prev_rd_ios:
                        r_await = (float(rd_ticks) - float(prev_rd_ticks)
                                   ) / float(rd_ios - prev_rd_ios)
                    if wr_ios != prev_wr_ios:
                        w_await = (float(wr_ticks) - float(prev_wr_ticks)
                                   ) / float(wr_ios - prev_wr_ios)
                    if nr_ios != prev_nr_ios:
                        await_ = (float(rd_ticks) + float(wr_ticks) -
                                  float(prev_rd_ticks) - float(prev_wr_ticks)
                                  ) / float(nr_ios - prev_nr_ios)
                    print("%s%s %d %.2f dev=%s" %
                          (metric, "svctm", ts, svctm, device))
                    print("%s%s %d %.2f dev=%s" %
                          (metric, "r_await", ts, r_await, device))
                    print("%s%s %d %.2f dev=%s" %
                          (metric, "w_await", ts, w_await, device))
                    print("%s%s %d %.2f dev=%s" %
                          (metric, "await", ts, await_, device))
                    print("%s%s %d %.2f dev=%s" %
                          (metric, "util", ts, float(util / 1000.0), device))

                    prev_stats[device] = copy.deepcopy(stats)

            elif len(values) == 7:
                # partial stats line
                for i in range(4):
                    print("%s%s %d %s dev=%s" %
                          (metric, FIELDS_PART[i], ts, values[i + 3], device))
            else:
                print("Cannot parse /proc/diskstats line: ",
                      line,
                      file=sys.stderr)
                continue

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
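
A worked sketch of the delta-based r_await/w_await/await math used in the loop above; the two snapshots are fabricated numbers, not real /proc/diskstats counters.

prev = {"read_requests": 1000, "write_requests": 400,
        "msec_read": 5000, "msec_write": 3000}
curr = {"read_requests": 1100, "write_requests": 450,
        "msec_read": 5600, "msec_write": 3400}

rd_ios = curr["read_requests"] - prev["read_requests"]                # 100 reads
wr_ios = curr["write_requests"] - prev["write_requests"]              # 50 writes
r_await = (curr["msec_read"] - prev["msec_read"]) / float(rd_ios)     # 6.00 ms
w_await = (curr["msec_write"] - prev["msec_write"]) / float(wr_ios)   # 8.00 ms
await_ = ((curr["msec_read"] + curr["msec_write"]
           - prev["msec_read"] - prev["msec_write"])
          / float(rd_ios + wr_ios))                                   # 6.67 ms
print("r_await=%.2f w_await=%.2f await=%.2f" % (r_await, w_await, await_))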