def main():
    """Poll local ZooKeeper instances and print their ``mntr`` statistics.

    Instances are re-discovered whenever the previous scan is older than
    SCAN_INTERVAL seconds. Only metrics listed in KEYS are reported.

    Returns:
        13 when no ZooKeeper instance is found, which asks tcollector not to
        respawn this collector. Otherwise the function loops forever.
    """
    if USER != "root":
        utils.drop_privileges(user=USER)

    # Start with "never scanned" so the first pass always scans, instead of
    # relying on two consecutive clock reads being different.
    instances = []
    last_scan = None

    while True:
        ts = time.time()

        # We haven't looked for zookeeper instances recently, let's do that.
        if last_scan is None or ts - last_scan > SCAN_INTERVAL:
            instances = scan_zk_instances()
            last_scan = ts

        if not instances:
            return 13  # Ask tcollector not to respawn us

        # Iterate over every zookeeper instance and get statistics.
        for _ip, port, tcp_version in instances:
            tags = "port=%s" % port

            sock = connect_socket(tcp_version, port)
            if sock is None:
                continue

            # Always release the socket, even when the exchange fails.
            try:
                sock.send("mntr\n")
                data = sock.recv(1024)
            finally:
                sock.close()

            for stat in data.splitlines():
                fields = stat.split()
                if len(fields) < 2:
                    # Blank or truncated line: nothing usable to report.
                    continue
                metric, value = fields[0], fields[1]
                if metric in KEYS:
                    print_stat(metric, ts, value, tags)

        time.sleep(COLLECTION_INTERVAL)
def main(argv):
    """Collect statistics from every reachable configured Elasticsearch server.

    Returns 1 when the ``json`` module is unavailable and 13 when no server
    accepts a connection (asking tcollector not to respawn the collector);
    otherwise it loops forever, polling all servers in parallel threads.
    """
    utils.drop_privileges()
    socket.setdefaulttimeout(DEFAULT_TIMEOUT)
    reachable = []

    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 1

    for conn_args in elasticsearch_conf.get_servers():
        connection = HTTPConnection(*conn_args)
        try:
            connection.connect()
        except socket.error as exc:
            # A refused connection only means nothing listens there; any
            # other socket failure is unexpected and propagates.
            if exc.errno != errno.ECONNREFUSED:
                raise
        else:
            reachable.append(connection)

    if not reachable:
        return 13  # No ES running, ask tcollector to not respawn us.

    lock = threading.Lock()
    while True:
        workers = []
        for connection in reachable:
            version = node_status(connection)["version"]["number"]
            worker = threading.Thread(target=_collect_server,
                                      args=(connection, version, lock))
            worker.start()
            workers.append(worker)
        # Wait for the whole round to finish before sleeping.
        for worker in workers:
            worker.join()
        time.sleep(COLLECTION_INTERVAL)
def main():
    """Print MongoDB ``serverStatus`` metrics every INTERVAL seconds.

    Returns 13 when the ``pymongo`` module is missing; otherwise loops forever.
    """
    utils.drop_privileges()
    if pymongo is None:
        sys.stderr.write("error: Python module `pymongo' is missing\n")
        return 13

    connection = pymongo.Connection(host=HOST, port=PORT)

    while True:
        status = connection.admin.command('serverStatus')
        now = int(time.time())

        # Metrics reported once per tag value, tagged with ``type``.
        for base_metric, tag_names in TAG_METRICS:
            for tag_name in tag_names:
                print('mongo.%s %d %s type=%s'
                      % (base_metric, now, status[base_metric][tag_name], tag_name))

        # Dotted metric names are paths into the nested status document;
        # a path missing from this server's status is silently skipped.
        for metric in METRICS:
            node = status
            try:
                for part in metric.split('.'):
                    node = node[part]
            except KeyError:
                pass
            else:
                print('mongo.%s %d %s' % (metric, now, node))

        sys.stdout.flush()
        time.sleep(INTERVAL)
def main():
    """Collect Couchbase statistics for every bucket, forever.

    Returns 13 when the server pid, its config file or the bindir path cannot
    be found.
    """
    utils.drop_privileges()

    server_pid = find_couchbase_pid()
    if not server_pid:
        utils.err("Error: Either couchbase-server is not running or file (%s)"
                  " doesn't exist" % COUCHBASE_INITFILE)
        return 13

    conf_file = find_conf_file(server_pid)
    if not conf_file:
        utils.err("Error: Can't find config file (%s)" % conf_file)
        return 13

    bin_dir = find_bindir_path(conf_file)
    if not bin_dir:
        utils.err("Error: Can't find bindir path in config file")
        return 13

    while True:
        # The bucket list is refreshed on every pass so that buckets created
        # after start-up are picked up too.
        for bucket in list_bucket(bin_dir):
            collect_stats(bin_dir, bucket)
        time.sleep(COLLECTION_INTERVAL)
def main(argv):
    """Apply the Flume settings and open a connection to the Flume server.

    Non-empty values from ``flume_conf.get_settings()`` override the module
    level DEFAULT_TIMEOUT, COLLECTION_INTERVAL, FLUME_HOST and FLUME_PORT.

    Exits with status 13 when the collector is not configured or enabled and
    returns 13 when the connection is refused (both ask tcollector not to
    respawn this collector). Any other socket error propagates.
    """
    # Without this declaration the assignments below would create locals and
    # an empty setting would leave the name unbound when it is read later.
    global DEFAULT_TIMEOUT, COLLECTION_INTERVAL, FLUME_HOST, FLUME_PORT

    if not (flume_conf and flume_conf.enabled() and flume_conf.get_settings()):
        sys.exit(13)

    settings = flume_conf.get_settings()

    if settings['default_timeout']:
        DEFAULT_TIMEOUT = settings['default_timeout']
    # Each override is guarded by its own key (the interval used to be
    # guarded by 'default_timeout').
    if settings['collection_interval']:
        COLLECTION_INTERVAL = settings['collection_interval']
    if settings['flume_host']:
        FLUME_HOST = settings['flume_host']
    if settings['flume_port']:
        FLUME_PORT = settings['flume_port']

    utils.drop_privileges()
    socket.setdefaulttimeout(DEFAULT_TIMEOUT)
    server = httplib.HTTPConnection(FLUME_HOST, FLUME_PORT)
    try:
        server.connect()
    except socket.error as exc:
        # Inspect errno instead of unpacking the exception: socket errors
        # such as timeouts do not carry an (errno, message) pair.
        if exc.errno == errno.ECONNREFUSED:
            return 13  # No Flume server available, ask tcollector to not respawn us.
        raise
def main():
    """netfilter main loop

    Prints one ``proc.sys.net.ipv4.netfilter.<stat>`` data point per entry of
    STATS that can be read from ``basedir``, every ``interval`` seconds.
    Exits with status 13 when ``basedir`` is not a directory.
    """
    utils.drop_privileges()

    if not os.path.isdir(basedir):
        # Report on stderr (stdout carries data points) and actually include
        # the directory name in the message.
        sys.stderr.write("%s does not exist - ip_conntrack probably missing\n"
                         % basedir)
        sys.exit(13)  # we signal tcollector to not run us

    while True:
        ts = int(time.time())
        for stat in STATS:
            try:
                with open(basedir + "/" + stat, 'r') as f:
                    value = f.readline().rstrip()
            except (IOError, OSError):
                # Some stats disappear between kernel module versions; skip
                # the ones that cannot be read, but do not swallow
                # KeyboardInterrupt/SystemExit the way a bare except would.
                continue
            print("proc.sys.net.ipv4.netfilter.%s %d %s" % (stat, ts, value))

        sys.stdout.flush()
        time.sleep(interval)
def main():
    """ifstat main loop

    Every 15 seconds, prints the 16 counters of each ethN line found in
    /proc/net/dev, tagged with the interface and the direction.
    """
    interval = 15

    f_netdev = open("/proc/net/dev", "r")
    utils.drop_privileges()

    # Only ethN interfaces are reported. Bond interfaces are deliberately
    # left out: the child interfaces keep their own counters, so reporting
    # the bond as well would count the traffic twice.
    eth_line = re.compile(r"\s+(eth\d+):(.*)")

    while True:
        f_netdev.seek(0)
        ts = int(time.time())
        for line in f_netdev:
            match = eth_line.match(line)
            if match is None:
                continue
            intf = match.group(1)
            counters = match.group(2).split(None)
            for idx in range(16):
                # The first eight columns are receive counters, the rest
                # are transmit counters.
                if idx >= 8:
                    way = "out"
                else:
                    way = "in"
                print("proc.net.%s %d %s iface=%s direction=%s"
                      % (FIELDS[idx], ts, counters[idx], intf, way))

        sys.stdout.flush()
        time.sleep(interval)
def main():
    """Emit RabbitMQ metrics every INTERVAL seconds, forever.

    Privileges are dropped first when the ``utils`` module is available.
    """
    if utils is not None:
        utils.drop_privileges()

    while True:
        # A fresh collector object is built for every pass.
        collector = RabbitCollector()
        collector.get_metrics()
        sys.stdout.flush()
        time.sleep(INTERVAL)
def main():
    """Poll the status page at STATUS_URL and print connection metrics.

    The page is expected to have the active-connection count on its first
    line, the accepts/handled/requests counters on its third line and the
    ``Reading/Writing/Waiting`` counters on its last non-empty line.

    Fetch failures and unparsable pages are reported on stderr (stdout is
    reserved for data points) and retried after COLLECTION_INTERVAL seconds.
    """
    utils.drop_privileges()
    while True:
        try:
            response = requests.get(STATUS_URL)
        except requests.exceptions.RequestException as error:
            sys.stderr.write("%s error retrieving %s %s\n"
                             % (METRIC_BASENAME, STATUS_URL, str(error)))
            time.sleep(COLLECTION_INTERVAL)
            continue

        timestamp = int(time.time())
        # The body ends with a newline, so the final split element is dropped.
        lines = [l.strip() for l in response.text.split("\n")][:-1]

        # Parse everything before printing anything, so a malformed page
        # never produces a partial set of data points.
        try:
            active = lines[0].split(":")[-1].strip()
            accepts, handled, handled_requests = lines[2].split()
            _, reading, _, writing, _, waiting = lines[-1].split()
        except (IndexError, ValueError):
            sys.stderr.write("%s unexpected status format from %s\n"
                             % (METRIC_BASENAME, STATUS_URL))
            time.sleep(COLLECTION_INTERVAL)
            continue

        print("%s.conn.active %d %s" % (METRIC_BASENAME, timestamp, active))
        print("%s.conn.accepts %d %s" % (METRIC_BASENAME, timestamp, accepts))
        print("%s.conn.handled %d %s" % (METRIC_BASENAME, timestamp, handled))
        print("%s.requests %d %s" % (METRIC_BASENAME, timestamp, handled_requests))
        print("%s.conn.state %d %s type=reading" % (METRIC_BASENAME, timestamp, reading))
        print("%s.conn.state %d %s type=writing" % (METRIC_BASENAME, timestamp, writing))
        print("%s.conn.state %d %s type=waiting" % (METRIC_BASENAME, timestamp, waiting))

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
def main():
    """Print metrics for the processes named in PROCS and PYTHON_PROCS.

    A process whose name is PYTHON_INTERP is matched by the suffix of its
    first command-line argument against PYTHON_PROCS; any other process is
    matched by name against PROCS.

    Returns 13 when ``psutil`` is missing and 14 when neither PROCS nor
    PYTHON_PROCS is configured; otherwise loops forever.
    """
    utils.drop_privileges()
    if psutil is None:
        sys.stderr.write("error: python module `psutil' is missing\n")
        return 13
    if not PROCS and not PYTHON_PROCS:
        sys.stderr.write("error: no PROCS or PYTHON_PROCS specified, "
                         "create psconf module\n")
        return 14

    while True:
        lines = []
        for proc in psutil.process_iter():
            try:
                name = proc.name()
                if PYTHON_PROCS and name == PYTHON_INTERP:
                    cmdline = proc.cmdline()
                    # An interpreter started without a script has no
                    # argument to match against.
                    if len(cmdline) < 2:
                        continue
                    for pp in PYTHON_PROCS:
                        if cmdline[1].endswith(pp):
                            add_metrics(proc, lines, pp)
                elif name in PROCS:
                    add_metrics(proc, lines, name)
            except psutil.NoSuchProcess:
                # The process exited while it was being inspected.
                continue

        if lines:
            for l in lines:
                print(l)
            sys.stdout.flush()
        time.sleep(INTERVAL)
def main():
    """ntpstats main loop

    Runs ``ntpq -p`` every COLLECTION_INTERVAL seconds and prints the ninth
    column of the peer line marked with ``*`` as ``ntp.offset``. Nothing is
    printed for a pass in which no line carries that marker.

    Exits with status 13 when the collector is not enabled or ``ntpq`` is not
    installed.
    """
    if not (ntpstat_conf and ntpstat_conf.enabled()):
        sys.exit(13)

    utils.drop_privileges()

    while True:
        ts = int(time.time())
        try:
            ntp_proc = subprocess.Popen(["ntpq", "-p"], stdout=subprocess.PIPE)
        except OSError as e:
            if e.errno == errno.ENOENT:
                # ntpq is not available, stop using this collector.
                sys.exit(13)  # we signal tcollector to stop using this
            raise

        stdout, _ = ntp_proc.communicate()
        if ntp_proc.returncode == 0:
            # Reset on every pass so a missing "*" line neither raises
            # NameError nor re-reports the previous pass's value.
            offset = None
            for line in stdout.split("\n"):
                fields = line.split()
                if len(fields) > 8 and fields[0].startswith("*"):
                    offset = fields[8]
            if offset is not None:
                print("ntp.offset %d %s" % (ts, offset))
        else:
            sys.stderr.write("ntpq -p, returned %r\n" % (ntp_proc.returncode))

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
def main():
    """Print the counters reported by ``varnishstat -1 -j``, forever.

    All counters are requested when ``vstats`` is "all"; otherwise only the
    fields listed in ``vstats``. Exits with status 13 when ``varnishstat``
    cannot be started.
    """
    utils.drop_privileges()

    while True:
        if vstats == "all":
            command = ["varnishstat", "-1", "-j"]
        else:
            command = ["varnishstat", "-1", "-f" + ",".join(vstats), "-j"]

        try:
            stats = subprocess.Popen(command, stdout=subprocess.PIPE)
        except OSError as e:
            # Die and signal to tcollector not to run this script.
            sys.stderr.write("Error: %s\n" % e)
            sys.exit(13)

        # communicate() reads the whole output and waits for the child, so
        # finished varnishstat processes do not pile up as zombies.
        output, _ = stats.communicate()
        metrics = json.loads(output)

        if use_varnishstat_timestamp:
            pattern = "%Y-%m-%dT%H:%M:%S"
            timestamp = int(time.mktime(time.strptime(metrics["timestamp"], pattern)))
        else:
            timestamp = time.time()

        for key, entry in metrics.items():
            if key == "timestamp":
                continue
            metric_name = metric_prefix + "." + key
            print("%s %d %s %s"
                  % (metric_name, timestamp, entry["value"], ",".join(tags)))

        sys.stdout.flush()
        time.sleep(interval)
def main():
    """Print the counters reported by ``varnishstat -1 -x``, forever.

    Plain counters are reported under ``metric_prefix``; counters of type
    ``LCK`` and ``SMA`` go under ``.locks`` and ``.storage`` with an
    ``ident`` tag. Counters of any other type, and counters whose name
    contains a comma or parenthesis, are skipped.

    Exits with status 13 when ``varnishstat`` cannot be started.
    """
    # ignore SIGCHLD, prevent the zombie apocalypse
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    utils.drop_privileges()
    bad_regex = re.compile("[,()]+")  # avoid forbidden by TSD symbols

    while True:
        if vstats == "all":
            command = ["varnishstat", "-1", "-x"]
        else:
            command = ["varnishstat", "-1", "-f" + ",".join(vstats), "-x"]

        try:
            stats = subprocess.Popen(command, stdout=subprocess.PIPE)
        except OSError as e:
            # Die and signal to tcollector not to run this script.
            sys.stderr.write("Error: %s\n" % e)
            sys.exit(13)

        output, _ = stats.communicate()
        root = ET.fromstring(output)

        if use_varnishstat_timestamp:
            pattern = "%Y-%m-%dT%H:%M:%S"
            # The timestamp is read as an XML attribute of the root element;
            # an Element cannot be indexed with a string key.
            timestamp = int(time.mktime(time.strptime(root.get('timestamp'), pattern)))
        else:
            timestamp = time.time()

        for stat in root.findall('stat'):
            name = stat.findtext('name')
            if bad_regex.search(name) is not None:
                continue

            tags = ""
            stattype = stat.findtext('type')
            if stattype is None:
                metric_name = metric_prefix + "." + name
            elif stattype == "LCK":
                metric_name = metric_prefix + ".locks." + name
                tags = "ident=" + stat.findtext('ident')
            elif stattype == "SMA":
                metric_name = metric_prefix + ".storage." + name
                tags = "ident=" + stat.findtext('ident')
            else:
                continue

            print("%s %d %s %s"
                  % (metric_name, timestamp, stat.findtext('value'), tags))

        sys.stdout.flush()
        time.sleep(interval)
def _load_zabbix_cache(cursor, replace=False):
    """(Re)build the in-memory ``zabbix_cache`` table from the attached file."""
    if replace:
        cursor.execute('DROP TABLE zabbix_cache')
    cursor.execute('CREATE TABLE zabbix_cache AS SELECT * FROM dbfile.zabbix_cache')
    cursor.execute('CREATE UNIQUE INDEX uniq_zid on zabbix_cache (id)')


def _relay_events(stream, cachecur, settings):
    """Print one data point per history row read from the binlog stream."""
    # tcollector.zabbix_bridge namespace for internal Zabbix bridge metrics.
    log_pos = 0
    key_lookup_miss = 0
    sample_last_ts = int(time.time())
    last_key_lookup_miss = 0

    for binlogevent in stream:
        if binlogevent.schema != settings['mysql']['db']:
            continue

        log_pos = binlogevent.packet.log_pos
        if binlogevent.table not in ('history', 'history_uint'):
            continue

        for row in binlogevent.rows:
            r = row['values']
            itemid = r['itemid']
            cachecur.execute('SELECT id, key, host, proxy FROM zabbix_cache WHERE id=?',
                             (itemid,))
            cached = cachecur.fetchone()
            if cached is None:
                # TODO: Consider https://wiki.python.org/moin/PythonDecoratorLibrary#Retry
                utils.err("error: Key lookup miss for %s" % (itemid))
                key_lookup_miss += 1
                continue

            print("zbx.%s %d %s host=%s proxy=%s"
                  % (cached[1], r['clock'], r['value'], cached[2], cached[3]))

            # Internal metrics are sampled at most once per configured interval.
            if (int(time.time()) - sample_last_ts) <= settings['internal_metric_interval']:
                continue
            sample_last_ts = int(time.time())
            print("tcollector.zabbix_bridge.log_pos %d %s" % (sample_last_ts, log_pos))
            print("tcollector.zabbix_bridge.key_lookup_miss %d %s"
                  % (sample_last_ts, key_lookup_miss))
            print("tcollector.zabbix_bridge.timestamp_drift %d %s"
                  % (sample_last_ts, (sample_last_ts - r['clock'])))

            # Too many new misses since the last reload: refresh the cache
            # from the on-disk database.
            new_misses = key_lookup_miss - last_key_lookup_miss
            if new_misses > settings['dbrefresh']:
                print("tcollector.zabbix_bridge.key_lookup_miss_reload %d %s"
                      % (sample_last_ts, new_misses))
                _load_zabbix_cache(cachecur, replace=True)
                last_key_lookup_miss = key_lookup_miss

        sys.stdout.flush()


def main():
    """Relay Zabbix history rows from the MySQL binlog as data points.

    Item ids are resolved through an in-memory copy of the ``zabbix_cache``
    table of the SQLite file named by ``settings['sqlitedb']``.

    Returns 1 when the binlog reader module is missing. The SQLite
    connection and the binlog stream are closed when the stream ends or an
    error propagates.
    """
    utils.drop_privileges()
    if BinLogStreamReader is None:
        utils.err("error: Python module `pymysqlreplication' is missing")
        return 1
    settings = zabbix_bridge_conf.get_settings()

    # Set blocking to True if you want to block and wait for the next event at
    # the end of the stream
    stream = BinLogStreamReader(connection_settings=settings['mysql'],
                                server_id=settings['slaveid'],
                                only_events=[WriteRowsEvent],
                                resume_stream=True,
                                blocking=True)
    try:
        dbcache = sqlite3.connect(':memory:')
        try:
            cachecur = dbcache.cursor()
            # Bind the file name rather than formatting it into the SQL, so
            # a path containing a quote cannot break the statement.
            cachecur.execute("ATTACH DATABASE ? AS dbfile", (settings['sqlitedb'],))
            _load_zabbix_cache(cachecur)
            _relay_events(stream, cachecur, settings)
        finally:
            dbcache.close()
    finally:
        stream.close()
def main():
    """Main loop

    Prints Redis INFO statistics and a ping latency for every local instance,
    every 15 seconds. Returns 13 when no instance is found and 1 when
    instances exist but the Python Redis module is not installed.
    """
    if USER != "root":
        utils.drop_privileges(user=USER)
    sys.stdin.close()

    interval = 15

    # Look for redis servers running on this machine before doing anything
    # else; the result maps port -> name.
    last_scan = time.time()
    instances = scan_for_instances()
    if not instances:
        return 13
    if not has_redis:
        sys.stderr.write("Found %d instance(s) to monitor, but the Python"
                         " Redis module isn't installed.\n" % len(instances))
        return 1

    def print_stat(metric, value, tags=""):
        # ``ts`` is the timestamp of the pass currently running in the loop.
        if value is None:
            return
        print("redis.%s %d %s %s" % (metric, ts, value, tags))

    dbre = re.compile(r"^db\d+$")

    while True:
        ts = int(time.time())

        # Refresh the instance list when the last scan has become too old.
        if ts - last_scan > SCAN_INTERVAL:
            instances = scan_for_instances()
            last_scan = ts

        for port in instances:
            tags = "cluster=%s port=%d" % (instances[port], port)

            client = redis.Redis(host="127.0.0.1", port=port)
            info = client.info()

            for key in KEYS:
                if key in info:
                    print_stat(key, info[key], tags)

            # Per database metrics live under keys named db<N>.
            db_names = [name for name in info.keys() if dbre.match(name)]
            for db_name in db_names:
                db_tags = "%s db=%s" % (tags, db_name)
                for db_metric, db_value in info[db_name].items():
                    print_stat(db_metric, db_value, db_tags)

            # Instant latency of a single PING round trip.
            # TODO: might be nice to get 95th, 99th, etc here?
            started = time.time()
            client.ping()
            print_stat("latency", time.time() - started, tags)

        sys.stdout.flush()
        time.sleep(interval)
def loop(self):
    """Call ``self.emit()`` every ``self.delay`` seconds, forever.

    Returns 13 when the ``json`` module is unavailable, which asks tcollector
    not to respawn the collector.
    """
    utils.drop_privileges()

    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us

    running = True
    while running:
        self.emit()
        time.sleep(self.delay)
    return 0
def main():
    """Collect statistics from a fresh process table every COLLECTION_INTERVAL."""
    utils.drop_privileges()

    while True:
        # The table is rebuilt from scratch on every pass.
        table = ProcessTable()
        table.update()
        collect_tcollect_stats(table)
        time.sleep(COLLECTION_INTERVAL)
def _df_rows(flags, label):
    """Run ``df <flags>``; return (timestamp, rows of at least 7 fields).

    ``label`` is the flag spelling used in the stderr message when ``df``
    exits with a non-zero status, in which case no rows are returned.
    """
    ts = int(time.time())
    df_proc = subprocess.Popen(["df", flags], stdout=subprocess.PIPE)
    stdout, _ = df_proc.communicate()
    if df_proc.returncode != 0:
        sys.stderr.write("df %s returned %r\n" % (label, df_proc.returncode))
        return ts, []

    rows = []
    for line in stdout.split("\n"):  # pylint: disable=E1103
        fields = line.split()
        # Skip the header, blank lines and anything too short to carry the
        # seven columns used below (this used to raise IndexError).
        if len(fields) < 7 or not fields[2].isdigit():
            continue
        rows.append(fields)
    return ts, rows


def main():
    """dfstats main loop

    Every COLLECTION_INTERVAL seconds, prints total/used/free 1k-blocks and
    inodes for each local filesystem listed by ``df``. The block pass skips
    debugfs/devtmpfs and mounts at /dev, /dev/* and /lib/*; the inode pass
    reports every row.
    """
    utils.drop_privileges()

    while True:
        # 1kblocks
        ts, rows = _df_rows("-PlTk", "-Pltk")
        for fields in rows:
            fstype, mount = fields[1], fields[6]
            # Most of the uninteresting mounts are tmpfs, but tmpfs as a
            # whole is not skipped because it also backs filesystems worth
            # tracking (/var/run, /tmp).
            if fstype in ("debugfs", "devtmpfs"):
                continue
            if mount == "/dev":
                continue
            # /dev/shm, /lib/init_rw, /lib/modules, etc
            if mount.startswith("/lib/") or mount.startswith("/dev/"):
                continue
            print("df.1kblocks.total %d %s mount=%s fstype=%s"
                  % (ts, fields[2], mount, fstype))
            print("df.1kblocks.used %d %s mount=%s fstype=%s"
                  % (ts, fields[3], mount, fstype))
            print("df.1kblocks.free %d %s mount=%s fstype=%s"
                  % (ts, fields[4], mount, fstype))

        # inodes
        ts, rows = _df_rows("-PlTi", "-Plti")
        for fields in rows:
            fstype, mount = fields[1], fields[6]
            print("df.inodes.total %d %s mount=%s fstype=%s"
                  % (ts, fields[2], mount, fstype))
            print("df.inodes.used %d %s mount=%s fstype=%s"
                  % (ts, fields[3], mount, fstype))
            print("df.inodes.free %d %s mount=%s fstype=%s"
                  % (ts, fields[4], mount, fstype))

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
def main(argv):
    """Open a connection to the Elasticsearch server at ES_HOST:ES_PORT.

    Returns 13 when the connection is refused, which asks tcollector not to
    respawn this collector. Any other socket error propagates.
    """
    utils.drop_privileges()
    socket.setdefaulttimeout(DEFAULT_TIMEOUT)
    server = httplib.HTTPConnection(ES_HOST, ES_PORT)
    try:
        server.connect()
    except socket.error as exc:
        # Inspect errno instead of unpacking the exception: socket errors
        # such as timeouts do not carry an (errno, message) pair.
        if exc.errno == errno.ECONNREFUSED:
            return 13  # No ES running, ask tcollector to not respawn us.
        raise
def main(args):
    """Emit HBase master metrics every 90 seconds, forever.

    Returns 13 when the ``json`` module is unavailable, which asks tcollector
    not to respawn the collector.
    """
    utils.drop_privileges()

    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us

    service = HBaseMaster()
    running = True
    while running:
        service.emit()
        time.sleep(90)
    return 0
def main(args):
    """Emit Hadoop DataNode metrics every 15 seconds, forever.

    Returns 13 when the ``json`` module is unavailable, which asks tcollector
    not to respawn the collector.
    """
    utils.drop_privileges()

    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us

    service = HadoopDataNode()
    running = True
    while running:
        service.emit()
        time.sleep(15)
    return 0
def main(args):
    """Emit Hadoop ResourceManager metrics every 90 seconds, forever.

    Returns 13 when the ``json`` module is unavailable, which asks tcollector
    not to respawn the collector.
    """
    utils.drop_privileges()

    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us

    service = HadoopResourceManager()
    running = True
    while running:
        service.emit()
        time.sleep(90)
    return 0
def main():
    """Fetch the 60-second stats page from localhost:9999 and process it.

    This is a best-effort, single-shot collection: any failure is reported on
    stderr instead of being raised, and the function returns None either way.
    """
    import sys  # local import: only needed for the error report below

    # collect period 60 secs
    url = "http://localhost:9999/stats.txt?period=60"
    try:
        utils.drop_privileges()
        response = urllib2.urlopen(url)
        try:
            content = response.read()
        finally:
            # Release the HTTP connection even when the read fails.
            response.close()
        process(content)
    except Exception as exc:
        # Still non-fatal, but no longer silent.
        sys.stderr.write("error: could not collect stats from %s: %s\n" % (url, exc))
def main():
    """Keep one monitor running per Druid role, checking every 5 seconds.

    A monitor is (re)spawned for a role when none has been started yet or
    when the current one's ``poll()`` reports a result.
    """
    utils.drop_privileges()

    # Every role starts without a monitor.
    monitors_dict = dict.fromkeys(DRUID_ROLES)

    while True:
        for role, monitor in monitors_dict.items():
            needs_spawn = monitor is None or monitor.poll() is not None
            if needs_spawn:
                monitors_dict[role] = spawn_monitor(role)
        time.sleep(5)
def main():
    """Bind the UDP bridge socket on (HOST, PORT).

    Exits with status 13 when the bridge is not configured or not enabled,
    and with status 1 when the socket cannot be opened.
    """
    enabled = udp_bridge_conf and udp_bridge_conf.enabled()
    if not enabled:
        sys.exit(13)

    utils.drop_privileges()

    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        sock.bind((HOST, PORT))
    except socket.error as msg:
        sys.stderr.write('could not open socket: %s\n' % msg)
        sys.exit(1)
def main():
    """Serve the Graphite bridge on (HOST, PORT) until interrupted.

    Exits with status 13 when the bridge is not configured or not enabled.
    On KeyboardInterrupt the server is shut down and its socket closed.
    """
    enabled = graphite_bridge_conf and graphite_bridge_conf.enabled()
    if not enabled:
        sys.exit(13)

    utils.drop_privileges()

    server = GraphiteServer((HOST, PORT), GraphiteHandler)
    server.daemon_threads = True
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        server.shutdown()
        server.server_close()
def main(argv):
    """Report impalad, statestored and catalogd statistics, forever.

    Returns 1 when the ``json`` module is unavailable.
    """
    utils.drop_privileges()
    socket.setdefaulttimeout(DEFAULT_TIMEOUT)

    if json is None:
        err("This collector requires the `json' Python module.")
        return 1

    while True:
        ts = int(time.time())
        # Each fetch runs right before its own output, in this fixed order.
        for daemon, fetch in (("impalad", worker_stats),
                              ("statestored", statestore_stats),
                              ("catalogd", catalog_stats)):
            output_stats(daemon, ts, fetch())
        time.sleep(COLLECTION_INTERVAL)
def main(argv): utils.drop_privileges(user=USER) # Build the classpath. dir = os.path.dirname(sys.argv[0]) jar = os.path.normpath(dir + "/../lib/jmx-1.0.jar") if not os.path.exists(jar): print >>sys.stderr, "WTF?! Can't run, %s doesn't exist" % jar return 13 classpath = [jar] for jar in CLASSPATH: if os.path.exists(jar): classpath.append(jar) classpath = ":".join(classpath) jpid = "worker" jps = subprocess.check_output("/usr/bin/jps").split("\n") for item in jps: vals = item.split(" ") if len(vals) == 2: if vals[1] == "worker": jmx = subprocess.Popen( [JAVA, "-enableassertions", "-enablesystemassertions", "-Xmx64m", "-cp", classpath, "com.stumbleupon.monitoring.jmx", vals[0] ], stdout=subprocess.PIPE).communicate()[0] if len(jmx) > 0: topologyPos=jmx.find("userevents:type=JmxMetricsConsumer") if topologyPos != -1: beans = [x.split("\t")[0] for x in jmx.split("\n")] #Check if there is a name topologyName="userevents" taskId=0 for bean in beans: if bean.startswith('userevents'): stormInfo=bean.split(',') for stormDetail in stormInfo: if stormDetail.startswith('name'): topologyName=stormDetail.split('=')[1] elif stormDetail.startswith('task'): taskId=stormDetail.split('=')[1] t = Thread(target=processJMX, args=(vals[0], topologyName, taskId, classpath)) t.daemon = True # thread dies with the program t.start() time.sleep(30) return 0 # Ask the tcollector to re-spawn us.
def main():
    """ifstat main loop

    Every ``interval`` seconds, prints the 16 counters of each matching
    interface line of /proc/net/dev, tagged with interface and direction.
    """
    f_netdev = open("/proc/net/dev")
    utils.drop_privileges()

    # Only ethN, emN, pNpM and 'predictable' names are reported. Bond
    # interfaces are deliberately left out: the child interfaces keep their
    # own counters, so reporting the bond as well would count traffic twice.
    iface_line = re.compile(r'''
        \s*
        (
            eth?\d+ |
            em\d+_\d+/\d+ | em\d+_\d+ | em\d+ |
            p\d+p\d+_\d+/\d+ | p\d+p\d+_\d+ | p\d+p\d+ |
            (?:   # Start of 'predictable network interface names'
                (?:en|sl|wl|ww)
                (?:
                    b\d+ |                           # BCMA bus
                    c[0-9a-f]+ |                     # CCW bus group
                    o\d+(?:d\d+)? |                  # On-board device
                    s\d+(?:f\d+)?(?:d\d+)? |         # Hotplug slots
                    x[0-9a-f]+ |                     # Raw MAC address
                    p\d+s\d+(?:f\d+)?(?:d\d+)? |     # PCI geographic loc
                    p\d+s\d+(?:f\d+)?(?:u\d+)*(?:c\d+)?(?:i\d+)? # USB
                )
            )
        ):(.*)''', re.VERBOSE)

    while True:
        f_netdev.seek(0)
        ts = int(time.time())
        for line in f_netdev:
            match = iface_line.match(line)
            if match is None:
                continue
            intf = match.group(1)
            counters = match.group(2).split(None)
            for idx in range(16):
                # The first eight columns are receive counters, the rest
                # are transmit counters.
                if idx >= 8:
                    way = "out"
                else:
                    way = "in"
                print("proc.net.%s %d %s iface=%s direction=%s"
                      % (FIELDS[idx], ts, counters[idx], intf, way))

        sys.stdout.flush()
        time.sleep(interval)
def main(argv):
    """Fetch the Flume metrics once and print them as ``flume.*`` data points.

    Non-empty values from ``flume_conf.get_settings()`` override the module
    level DEFAULT_TIMEOUT, FLUME_HOST and FLUME_PORT.

    Returns:
        13 when the collector is not configured or enabled (tcollector then
        does not respawn it), 1 when the ``json`` module is missing, and 0
        otherwise, including when the Flume server cannot be reached.
    """
    # Without this declaration the assignments below would create locals and
    # an empty setting would leave the name unbound when it is read later.
    global DEFAULT_TIMEOUT, FLUME_HOST, FLUME_PORT

    if not (flume_conf and flume_conf.enabled() and flume_conf.get_settings()):
        # Status code 13 tells the parent tcollector not to respawn this collector
        return 13

    settings = flume_conf.get_settings()

    if settings['default_timeout']:
        DEFAULT_TIMEOUT = settings['default_timeout']
    if settings['flume_host']:
        FLUME_HOST = settings['flume_host']
    if settings['flume_port']:
        FLUME_PORT = settings['flume_port']

    utils.drop_privileges()
    socket.setdefaulttimeout(DEFAULT_TIMEOUT)
    server = httplib.HTTPConnection(FLUME_HOST, FLUME_PORT)
    try:
        server.connect()
    except socket.error:
        # Nothing really wrong if the Flume server is unavailable, we should
        # just try again next time. Only socket failures are treated this
        # way, so an interrupt or a programming error is not swallowed.
        return 0

    if json is None:
        err("This collector requires the `json' Python module.")
        return 1

    ts = int(time.time())  # In case last call took a while.

    def printmetric(component, metric, value, **tags):
        if tags:
            tags = " " + " ".join("%s=%s" % (tag_name, tag_value)
                                  for tag_name, tag_value in tags.items())
        else:
            tags = ""
        print("flume.%s.%s %d %s %s" % (component, metric, ts, value, tags))

    # Get the metrics
    stats = flume_metrics(server)

    for component in stats:
        # Keys have the form "<TYPE>.<name>"; split on the first dot only so
        # a name that itself contains a dot does not abort the run.
        component_type, name = component.split(".", 1)
        tags = {"type": name}
        for metric, value in stats[component].items():
            if metric not in EXCLUDE:
                printmetric(component_type.lower(), metric, value, **tags)
    return 0
def main():
    """Open the TCP bridge listening socket and define its helper callbacks.

    A port or host returned by ``tcp_bridge_conf`` overrides the module
    level PORT / HOST.

    Exits with status 13 when the bridge is not configured or not enabled,
    and with status 1 when the socket cannot be opened.
    """
    # Without this declaration the assignments below would create locals and
    # a config that returns nothing would leave the name unbound at bind().
    global HOST, PORT

    if not (tcp_bridge_conf and tcp_bridge_conf.enabled()):
        sys.stderr.write('not enabled, or tcp_bridge_conf unavilable\n')
        sys.exit(13)
    utils.drop_privileges()

    def printm(string, time, value):
        """Write one internal metric line to ``out``."""
        out.write(m_namespace + string + ' ' + str(time) + ' ' + str(value) + '\n')

    def printmetrics():
        """Report the bridge's own counters, at most once every ``m_delay``."""
        global m_delay
        global m_last

        ts = int(time.time())
        if ts > m_last + m_delay:
            printm('lines_read', ts, m_lines)
            printm('connections_processed', ts, m_connections)
            printm('processing_time', ts, m_ptime)
            printm('active', ts, 1)
            m_last = ts

    def clientthread(connection):
        """Copy every line received on ``connection`` to ``out``."""
        global m_lines
        global m_connections
        global m_ptime

        start = time.time()
        f = connection.makefile()
        while True:
            data = f.readline()
            if not data:
                break
            data = removePut(data)
            out.write(data)
            m_lines += 1
        f.close()
        connection.close()

        end = time.time()
        m_ptime += (end - start)
        m_connections += 1
        printmetrics()

    def removePut(line):
        """Strip a leading 'put ' command from ``line``, if present."""
        if line.startswith('put '):
            return line[4:]
        else:
            return line

    try:
        if tcp_bridge_conf.port():
            PORT = tcp_bridge_conf.port()
        if tcp_bridge_conf.host():
            HOST = tcp_bridge_conf.host()

        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.bind((HOST, PORT))
        sock.listen(1)
    except socket.error as msg:
        utils.err('could not open socket: %s' % msg)
        sys.exit(1)
def main(argv): utils.drop_privileges(user=USER) # Build the classpath. dir = os.path.dirname(sys.argv[0]) jar = os.path.normpath(dir + "/../lib/jmx-1.0.jar") if not os.path.exists(jar): print >>sys.stderr, "WTF?! Can't run, %s doesn't exist" % jar return 13 classpath = [jar] for jar in CLASSPATH: if os.path.exists(jar): classpath.append(jar) classpath = ":".join(classpath) jpid = "HRegionServer" jps = subprocess.check_output("jps").split("\n") for item in jps: vals = item.split(" ") if len(vals) == 2: if vals[1] == "HRegionServer": jpid = vals[0] break # in HBase 0.94 the mbean domain is hadoop # in HBase 0.96 it is Hadoop (captical H) jmx = subprocess.Popen( [JAVA, "-enableassertions", "-enablesystemassertions", # safe++ "-Xmx64m", # Low RAM limit, to avoid stealing too much from prod. "-cp", classpath, "com.stumbleupon.monitoring.jmx", "--watch", INTERVAL , "--long", "--timestamp", jpid, # Name of the process. # The remaining arguments are pairs (mbean_regexp, attr_regexp). # The first regexp is used to match one or more MBeans, the 2nd # to match one or more attributes of the MBeans matched. "[Hh]adoop", "", # All HBase / hadoop metrics. "Memory$", "", # Heap stats "Threading", "Count|Time$", # Number of threads and CPU time. "OperatingSystem", "OpenFile", # Number of open files. "GarbageCollector", "Collection", # GC runs and time spent GCing. ], stdout=subprocess.PIPE, bufsize=1) do_on_signal(signal.SIGINT, kill, jmx) do_on_signal(signal.SIGPIPE, kill, jmx) do_on_signal(signal.SIGTERM, kill, jmx) try: prev_timestamp = 0 while True: line = jmx.stdout.readline() if not line and jmx.poll() is not None: break # Nothing more to read and process exited. elif len(line) < 4: print >>sys.stderr, "invalid line (too short): %r" % line continue try: timestamp, metric, value, mbean = line.split("\t", 3) except ValueError, e: # Temporary workaround for jmx.jar not printing these lines we # don't care about anyway properly. 
if "java.lang.String" not in line: print >>sys.stderr, "Can't split line: %r" % line continue # Sanitize the timestamp. try: timestamp = int(timestamp) if timestamp < time.time() - 600: raise ValueError("timestamp too old: %d" % timestamp) if timestamp < prev_timestamp: raise ValueError("timestamp out of order: prev=%d, new=%d" % (prev_timestamp, timestamp)) except ValueError, e: print >>sys.stderr, ("Invalid timestamp on line: %r -- %s" % (line, e)) continue prev_timestamp = timestamp if metric in IGNORED_METRICS: continue tags = "" # The JMX metrics have per-request-type metrics like so: # metricNameNumOps # metricNameMinTime # metricNameMaxTime # metricNameAvgTime # Group related metrics together in the same metric name, use tags # to separate the different request types, so we end up with: # numOps op=metricName # avgTime op=metricName # etc, which makes it easier to graph things with the TSD. if metric.endswith("MinTime"): # We don't care about the minimum continue # time taken by operations. elif metric.startswith("tbl."): # Per-table/region/cf metrics continue # ignore for now, too much spam elif "BlockedSeconds" in metric or "LatencyHistogram" in metric: continue # ignore for now, too much spam elif metric.endswith("KB"): metric = metric[:-2] # Try converting to bytes try: value = float(value) * 1024 except ValueError, e: value = 0
# JMX handlers used by this collector.
from bbm.jvm import jvm_collector
from bbm.tomcat import tomcat_collector

signal.signal(signal.SIGCHLD, signal.SIG_IGN)


# Sonos runs an embedded Tomcat whose webapp name is set to "Tomcat", so the
# webapp tag of every tomcat.* metric is rewritten to "sonos".
def renamer(v):
    """Replace the ``webapp=`` tag of tomcat metrics with ``webapp=sonos``."""
    if v.metric.startswith("tomcat."):
        retagged = []
        for tag in v.tags:
            if tag.startswith("webapp="):
                retagged.append("webapp=sonos")
            else:
                retagged.append(tag)
        v.tags = retagged
    return v


# Find the pid of the bbm-sonos server.
pgrep_command = [
    "/usr/bin/pgrep", "-f", "-u", "bbm-sonos", "/usr/share/bbm-sonos/sonos.war"
]
pgrep = subprocess.check_output(pgrep_command)
jpid = pgrep.rstrip("\n")
if jpid == "":
    sys.exit(1)

# The pid lookup is done, so switch to the unprivileged user for security.
utils.drop_privileges(user="******")

collector = start_jmx_collector(15, jpid, jvm_collector + tomcat_collector, renamer)
RunCollector(collector, extraTags=["application=sonos"])
def main():
    """dfstats main loop

    Every COLLECTION_INTERVAL seconds, reads /proc/mounts, keeps the mounts
    of interest and prints byte and inode usage obtained from ``os.statvfs``.

    Returns 13 when /proc/mounts cannot be opened, which asks tcollector not
    to respawn the collector.
    """
    try:
        f_mounts = open("/proc/mounts", "r")
    except IOError as e:
        utils.err("error: can't open /proc/mounts: %s" % e)
        return 13  # Ask tcollector to not respawn us

    utils.drop_privileges()

    # Mount points under these prefixes are never reported.
    skipped_prefixes = ("/dev", "/sys", "/proc", "/lib", "net:",
                        "/var/lib/kubelet")

    while True:
        devices = []
        f_mounts.seek(0)
        ts = int(time.time())

        for line in f_mounts:
            # Field meanings, from fstab(5):
            #   fs_spec     mounted block special device or remote filesystem
            #   fs_file     mount point
            #   fs_vfstype  file system type
            #   fs_mntops   mount options
            #   fs_freq     dump(8) utility flags
            #   fs_passno   order of filesystem checks at reboot time
            try:
                (fs_spec, fs_file, fs_vfstype,
                 fs_mntops, fs_freq, fs_passno) = line.split(None)
            except ValueError as e:
                utils.err("error: can't parse line at /proc/mounts: %s" % e)
                continue

            if (fs_spec == "none"
                    or fs_vfstype in FSTYPE_IGNORE
                    or fs_vfstype.startswith("fuse.")
                    or fs_file.startswith(skipped_prefixes)):
                continue

            entry = [fs_spec, fs_file, fs_vfstype]
            if not fs_spec.startswith("/dev"):
                devices.append(entry)
                continue

            # A /dev/xxx device is listed once, under its shortest mount
            # point, which removes bind mounts of the same device.
            for known in devices:
                if known[0] == fs_spec:
                    if len(fs_file) < len(known[1]):
                        known[1] = fs_file
                    break
            else:
                devices.append(entry)

        for fs_spec, fs_file, fs_vfstype in devices:
            try:
                r = os.statvfs(fs_file)
            except OSError as e:
                utils.err("can't get info for mount point: %s: %s" % (fs_file, e))
                continue

            used_blocks = r.f_blocks - r.f_bfree
            # A filesystem with no blocks at all is reported as 100% used.
            if r.f_blocks == 0:
                blocks_pct = 100
            else:
                blocks_pct = used_blocks * 100.0 / r.f_blocks

            used_inodes = r.f_files - r.f_ffree
            # Likewise for a filesystem with no inodes.
            if r.f_files == 0:
                inodes_pct = 100
            else:
                inodes_pct = used_inodes * 100.0 / r.f_files

            datapoints = (
                ("bytes.total", r.f_frsize * r.f_blocks),
                ("bytes.used", r.f_frsize * used_blocks),
                ("bytes.percentused", blocks_pct),
                ("bytes.free", r.f_frsize * r.f_bfree),
                ("inodes.total", r.f_files),
                ("inodes.used", used_inodes),
                ("inodes.percentused", inodes_pct),
                ("inodes.free", r.f_ffree),
            )
            for name, value in datapoints:
                print("df.%s %d %s mount=%s fstype=%s"
                      % (name, ts, value, fs_file, fs_vfstype))

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
]) def err(msg): print >> sys.stderr, msg def main(): """dfstats main loop""" try: f_mounts = open("/proc/mounts", "r") except IOError, e: err("error: can't open /proc/mounts: %s" % e) return 13 # Ask tcollector to not respawn us utils.drop_privileges() while True: devices = [] f_mounts.seek(0) ts = int(time.time()) for line in f_mounts: # Docs come from the fstab(5) # fs_spec # Mounted block special device or remote filesystem # fs_file # Mount point # fs_vfstype # File system type # fs_mntops # Mount options # fs_freq # Dump(8) utility flags # fs_passno # Order in which filesystem checks are done at reboot time try:
def main():
    """procstats main loop.

    Opens the /proc and cpufreq sysfs files once (before privileges are
    dropped), then every COLLECTION_INTERVAL seconds rewinds each file and
    prints proc.* data points on stdout.  Never returns.

    All output goes through the single-argument ``print(...)`` form, which
    behaves identically as a Python 2 print statement and as a Python 3
    function call.
    """
    f_uptime = open("/proc/uptime", "r")
    f_meminfo = open("/proc/meminfo", "r")
    f_vmstat = open("/proc/vmstat", "r")
    f_stat = open("/proc/stat", "r")
    f_loadavg = open("/proc/loadavg", "r")
    f_entropy_avail = open("/proc/sys/kernel/random/entropy_avail", "r")
    f_interrupts = open("/proc/interrupts", "r")

    f_scaling = "/sys/devices/system/cpu/cpu%s/cpufreq/cpuinfo_%s_freq"
    f_scaling_min = {}
    f_scaling_max = {}
    f_scaling_cur = {}
    for cpu in glob.glob("/sys/devices/system/cpu/cpu[0-9]*/cpufreq/cpuinfo_cur_freq"):
        m = re.match(r"/sys/devices/system/cpu/cpu([0-9]*)/cpufreq/cpuinfo_cur_freq", cpu)
        if not m:
            continue
        cpu_no = m.group(1)
        # NOTE(review): this writes the path to stderr without a newline and
        # looks like leftover debugging output -- confirm before removing.
        sys.stderr.write(f_scaling % (cpu_no, "min"))
        f_scaling_min[cpu_no] = open(f_scaling % (cpu_no, "min"), "r")
        f_scaling_max[cpu_no] = open(f_scaling % (cpu_no, "max"), "r")
        f_scaling_cur[cpu_no] = open(f_scaling % (cpu_no, "cur"), "r")

    numastats = open_sysfs_numa_stats()
    utils.drop_privileges()

    # Column order of the "cpu" lines in /proc/stat.  'steal' sits between
    # 'softirq' and 'guest'; leaving it out mislabels steal time as guest
    # and guest time as guest_nice.
    cpu_types = ['user', 'nice', 'system', 'idle', 'iowait', 'irq',
                 'softirq', 'steal', 'guest', 'guest_nice']
    vmstat_keys = ("pgpgin", "pgpgout", "pswpin", "pswpout",
                   "pgfault", "pgmajfault")
    scaling_files = (("min", f_scaling_min),
                     ("max", f_scaling_max),
                     ("cur", f_scaling_cur))

    while True:
        # proc.uptime
        f_uptime.seek(0)
        ts = int(time.time())
        for line in f_uptime:
            m = re.match(r"(\S+)\s+(\S+)", line)
            if m:
                print("proc.uptime.total %d %s" % (ts, m.group(1)))
                print("proc.uptime.now %d %s" % (ts, m.group(2)))

        # proc.meminfo
        f_meminfo.seek(0)
        ts = int(time.time())
        for line in f_meminfo:
            m = re.match(r"(\w+):\s+(\d+)\s+(\w+)", line)
            if m:
                if m.group(3).lower() == 'kb':
                    # convert from kB to B for easier graphing; the kernel's
                    # "kB" in /proc/meminfo is 1024 bytes, not 1000.
                    value = str(int(m.group(2)) * 1024)
                else:
                    value = m.group(2)
                print("proc.meminfo.%s %d %s"
                      % (m.group(1).lower(), ts, value))

        # proc.vmstat
        f_vmstat.seek(0)
        ts = int(time.time())
        for line in f_vmstat:
            m = re.match(r"(\w+)\s+(\d+)", line)
            if not m:
                continue
            if m.group(1) in vmstat_keys:
                print("proc.vmstat.%s %d %s" % (m.group(1), ts, m.group(2)))

        # proc.stat
        f_stat.seek(0)
        ts = int(time.time())
        for line in f_stat:
            m = re.match(r"(\w+)\s+(.*)", line)
            if not m:
                continue
            key = m.group(1)
            if key.startswith("cpu"):
                # "cpu" is the aggregate line, "cpuN" a per-CPU line.
                cpu_m = re.match(r"cpu(\d+)", key)
                if cpu_m:
                    metric_percpu = '.percpu'
                    tags = ' cpu=%s' % cpu_m.group(1)
                else:
                    metric_percpu = ''
                    tags = ''
                fields = m.group(2).split()
                # We use zip to ignore fields that don't exist.
                for value, field_name in zip(fields, cpu_types):
                    print("proc.stat.cpu%s %d %s type=%s%s"
                          % (metric_percpu, ts, value, field_name, tags))
            elif key == "intr":
                # Only the first number (the grand total) is reported.
                print("proc.stat.intr %d %s" % (ts, m.group(2).split()[0]))
            elif key == "ctxt":
                print("proc.stat.ctxt %d %s" % (ts, m.group(2)))
            elif key == "processes":
                print("proc.stat.processes %d %s" % (ts, m.group(2)))
            elif key == "procs_blocked":
                print("proc.stat.procs_blocked %d %s" % (ts, m.group(2)))

        # proc.loadavg
        f_loadavg.seek(0)
        ts = int(time.time())
        for line in f_loadavg:
            m = re.match(r"(\S+)\s+(\S+)\s+(\S+)\s+(\d+)/(\d+)\s+", line)
            if not m:
                continue
            print("proc.loadavg.1min %d %s" % (ts, m.group(1)))
            print("proc.loadavg.5min %d %s" % (ts, m.group(2)))
            print("proc.loadavg.15min %d %s" % (ts, m.group(3)))
            print("proc.loadavg.runnable %d %s" % (ts, m.group(4)))
            print("proc.loadavg.total_threads %d %s" % (ts, m.group(5)))

        # proc.kernel.entropy_avail
        f_entropy_avail.seek(0)
        ts = int(time.time())
        for line in f_entropy_avail:
            print("proc.kernel.entropy_avail %d %s" % (ts, line.strip()))

        # proc.interrupts
        f_interrupts.seek(0)
        ts = int(time.time())
        # Get number of CPUs from description line.
        num_cpus = len(f_interrupts.readline().split())
        for line in f_interrupts:
            cols = line.split()
            irq_type = cols[0].rstrip(":")
            if not irq_type.isalnum():
                continue
            if irq_type.isdigit():
                if cols[-2] == "PCI-MSI-edge" and "eth" in cols[-1]:
                    irq_type = cols[-1]
                else:
                    continue  # Interrupt type is just a number, ignore.
            for i, val in enumerate(cols[1:]):
                if i >= num_cpus:
                    # All values read, remaining cols contain textual
                    # description
                    break
                if not val.isdigit():
                    # something is weird, there should only be digit values
                    sys.stderr.write("Unexpected interrupts value %r in"
                                     " %r: " % (val, cols))
                    break
                print("proc.interrupts %s %s type=%s cpu=%s"
                      % (ts, val, irq_type, i))

        print_numa_stats(numastats)

        # Print scaling stats (one timestamp per min/max/cur group).
        for kind, handles in scaling_files:
            ts = int(time.time())
            for cpu_no in handles:
                f = handles[cpu_no]
                f.seek(0)
                for line in f:
                    print("proc.scaling.%s %d %s cpu=%s"
                          % (kind, ts, line.rstrip('\n'), cpu_no))

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
def main():
    """Main loop.

    Every ``interval`` seconds, re-reads /proc/net/sockstat,
    /proc/net/netstat and /proc/net/snmp and prints net.sockstat.* and
    net.stat.* data points on stdout.

    Returns 13 (asking tcollector not to restart the collector) when one of
    the files cannot be opened or when sockstat does not match the expected
    layout; otherwise loops forever.
    """
    sys.stdin.close()

    interval = 15
    page_size = resource.getpagesize()

    try:
        sockstat = open("/proc/net/sockstat")
        netstat = open("/proc/net/netstat")
        snmp = open("/proc/net/snmp")
    except IOError as e:
        print("open failed: %s" % e, file=sys.stderr)
        return 13  # Ask tcollector to not re-start us.
    utils.drop_privileges()

    # Note: up until v2.6.37-rc2 most of the values were 32 bits.
    # The first value is pretty useless since it accounts for some
    # socket types but not others.  So we don't report it because it's
    # more confusing than anything else and it's not well documented
    # what type of sockets are or aren't included in this count.
    regexp = re.compile(r"sockets: used \d+\n"
                        r"TCP: inuse (?P<tcp_inuse>\d+) orphan (?P<orphans>\d+)"
                        r" tw (?P<tw_count>\d+) alloc (?P<tcp_sockets>\d+)"
                        r" mem (?P<tcp_pages>\d+)\n"
                        r"UDP: inuse (?P<udp_inuse>\d+)"
                        # UDP memory accounting was added in v2.6.25-rc1
                        r"(?: mem (?P<udp_pages>\d+))?\n"
                        # UDP-Lite (RFC 3828) was added in v2.6.20-rc2
                        r"(?:UDPLITE: inuse (?P<udplite_inuse>\d+)\n)?"
                        r"RAW: inuse (?P<raw_inuse>\d+)\n"
                        r"FRAG: inuse (?P<ip_frag_nqueues>\d+)"
                        r" memory (?P<ip_frag_mem>\d+)\n")

    def print_sockstat(metric, value, tags=""):  # Note: tags must start with ' '
        # `ts` is read from the enclosing scope; it is set at the top of
        # every pass of the main loop below.
        if value is not None:
            print("net.sockstat.%s %d %s%s" % (metric, ts, value, tags))

    # If a line in /proc/net/{netstat,snmp} doesn't start with a word in that
    # dict, we'll ignore it.  We use the value to build the metric name.
    known_statstypes = {
        "TcpExt:": "tcp",
        "IpExt:": "ip",  # We don't collect anything from here for now.
        "Ip:": "ip",  # We don't collect anything from here for now.
        "Icmp:": "icmp",  # We don't collect anything from here for now.
        "IcmpMsg:": "icmpmsg",  # We don't collect anything from here for now.
        "Tcp:": "tcp",  # We don't collect anything from here for now.
        "Udp:": "udp",
        "UdpLite:": "udplite",  # We don't collect anything from here for now.
        "Arista:": "arista",  # We don't collect anything from here for now.
    }

    # Any stat in /proc/net/{netstat,snmp} that doesn't appear in this dict will
    # be ignored.  If we find a match, we'll use the (metricname, tags).
    tcp_stats = {
        # An application wasn't able to accept a connection fast enough, so
        # the kernel couldn't store an entry in the queue for this connection.
        # Instead of dropping it, it sent a cookie to the client.
        "SyncookiesSent": ("syncookies", "type=sent"),
        # After sending a cookie, it came back to us and passed the check.
        "SyncookiesRecv": ("syncookies", "type=received"),
        # After sending a cookie, it came back to us but looked invalid.
        "SyncookiesFailed": ("syncookies", "type=failed"),
        # When a socket is using too much memory (rmem), the kernel will first
        # discard any out-of-order packet that has been queued (with SACK).
        "OfoPruned": ("memory.prune", "type=drop_ofo_queue"),
        # If the kernel is really really desperate and cannot give more memory
        # to this socket even after dropping the ofo queue, it will simply
        # discard the packet it received.  This is Really Bad.
        "RcvPruned": ("memory.prune", "type=drop_received"),
        # We waited for another packet to send an ACK, but didn't see any, so
        # a timer ended up sending a delayed ACK.
        "DelayedACKs": ("delayedack", "type=sent"),
        # We wanted to send a delayed ACK but failed because the socket was
        # locked.  So the timer was reset.
        "DelayedACKLocked": ("delayedack", "type=locked"),
        # We sent a delayed and duplicated ACK because the remote peer
        # retransmitted a packet, thinking that it didn't get to us.
        "DelayedACKLost": ("delayedack", "type=lost"),
        # We completed a 3WHS but couldn't put the socket on the accept queue,
        # so we had to discard the connection.
        "ListenOverflows": ("failed_accept", "reason=full_acceptq"),
        # We couldn't accept a connection because one of: we had no route to
        # the destination, we failed to allocate a socket, we failed to
        # allocate a new local port bind bucket.  Note: this counter
        # also include all the increments made to ListenOverflows...
        "ListenDrops": ("failed_accept", "reason=other"),
        # A packet was lost and we used Forward RTO-Recovery to retransmit.
        "TCPForwardRetrans": ("retransmit", "type=forward"),
        # A packet was lost and we fast-retransmitted it.
        "TCPFastRetrans": ("retransmit", "type=fast"),
        # A packet was lost and we retransmitted after a slow start.
        "TCPSlowStartRetrans": ("retransmit", "type=slowstart"),
        # A packet was lost and we recovered after a fast retransmit.
        "TCPRenoRecovery": ("packetloss.recovery", "type=fast_retransmit"),
        # A packet was lost and we recovered by using selective
        # acknowledgements.
        "TCPSackRecovery": ("packetloss.recovery", "type=sack"),
        # We detected re-ordering using FACK (Forward ACK -- the highest
        # sequence number known to have been received by the peer when using
        # SACK -- FACK is used during congestion control).
        "TCPFACKReorder": ("reording", "detectedby=fack"),
        # We detected re-ordering using SACK.
        "TCPSACKReorder": ("reording", "detectedby=sack"),
        # We detected re-ordering using fast retransmit.
        "TCPRenoReorder": ("reording", "detectedby=fast_retransmit"),
        # We detected re-ordering using the timestamp option.
        "TCPTSReorder": ("reording", "detectedby=timestamp"),
        # We detected some erroneous retransmits and undid our CWND reduction.
        "TCPFullUndo": ("congestion.recovery", "type=full_undo"),
        # We detected some erroneous retransmits, a partial ACK arrived while
        # we were fast retransmitting, so we were able to partially undo some
        # of our CWND reduction.
        "TCPPartialUndo": ("congestion.recovery", "type=hoe_heuristic"),
        # We detected some erroneous retransmits, a D-SACK arrived and ACK'ed
        # all the retransmitted data, so we undid our CWND reduction.
        "TCPDSACKUndo": ("congestion.recovery", "type=sack"),
        # We detected some erroneous retransmits, a partial ACK arrived, so we
        # undid our CWND reduction.
        "TCPLossUndo": ("congestion.recovery", "type=ack"),
        # We received an unexpected SYN so we sent a RST to the peer.
        "TCPAbortOnSyn": ("abort", "type=unexpected_syn"),
        # We were in FIN_WAIT1 yet we received a data packet with a sequence
        # number that's beyond the last one for this connection, so we RST'ed.
        "TCPAbortOnData": ("abort", "type=data_after_fin_wait1"),
        # We received data but the user has closed the socket, so we have no
        # way of handing it to them, so we RST'ed.
        "TCPAbortOnClose": ("abort", "type=data_after_close"),
        # This is Really Bad.  It happens when there are too many orphaned
        # sockets (not attached to a FD) and the kernel has to drop a
        # connection.  Sometimes it will send a reset to the peer, sometimes
        # it won't.
        "TCPAbortOnMemory": ("abort", "type=out_of_memory"),
        # The connection timed out really hard.
        "TCPAbortOnTimeout": ("abort", "type=timeout"),
        # We killed a socket that was closed by the application and lingered
        # around for long enough.
        "TCPAbortOnLinger": ("abort", "type=linger"),
        # We tried to send a reset, probably during one of the TCPAbort*
        # situations above, but we failed e.g. because we couldn't allocate
        # enough memory (very bad).
        "TCPAbortFailed": ("abort.failed", None),
        # Number of times a socket was put in "memory pressure" due to a non
        # fatal memory allocation failure (reduces the send buffer size etc).
        "TCPMemoryPressures": ("memory.pressure", None),
        # We got a completely invalid SACK block and discarded it.
        "TCPSACKDiscard": ("invalid_sack", "type=invalid"),
        # We got a duplicate SACK while retransmitting so we discarded it.
        "TCPDSACKIgnoredOld": ("invalid_sack", "type=retransmit"),
        # We got a duplicate SACK and discarded it.
        "TCPDSACKIgnoredNoUndo": ("invalid_sack", "type=olddup"),
        # We received something but had to drop it because the socket's
        # receive queue was full.
        "TCPBacklogDrop": ("receive.queue.full", None),
    }
    known_stats = {
        "tcp": tcp_stats,
        "ip": {},
        "icmp": {},
        "icmpmsg": {},
        "udp": {
            # Total UDP datagrams received by this host
            "InDatagrams": ("datagrams", "direction=in"),
            # UDP datagrams received on a port with no listener
            "NoPorts": ("errors", "direction=in reason=noport"),
            # Total UDP datagrams that could not be delivered to an application
            # Note: this counter also increments for RcvbufErrors
            "InErrors": ("errors", "direction=in reason=other"),
            # Total UDP datagrams sent from this host
            "OutDatagrams": ("datagrams", "direction=out"),
            # Datagrams for which not enough socket buffer memory to receive
            "RcvbufErrors": ("errors", "direction=in reason=nomem"),
            # Datagrams for which not enough socket buffer memory to transmit
            "SndbufErrors": ("errors", "direction=out reason=nomem"),
        },
        "udplite": {},
        "arista": {},
    }

    def print_netstat(statstype, metric, value, tags=""):
        # Emits one net.stat.* point; `tags` may be None or empty, in which
        # case nothing (not even a separating space) follows the value.
        if tags:
            space = " "
        else:
            tags = space = ""
        print("net.stat.%s.%s %d %s%s%s"
              % (statstype, metric, ts, value, space, tags))

    def parse_stats(stats, filename):
        """Parse the text of /proc/net/{netstat,snmp} and print known stats.

        `stats` is the whole file content, `filename` is only used in the
        diagnostic printed for unknown line types.
        """
        statsdikt = {}
        # /proc/net/{netstat,snmp} have an awkward column-oriented format.  It
        # looks like this:
        #   Header: SomeMetric OtherMetric
        #   Header: 1 2
        #   OtherHeader: ThirdMetric FooBar
        #   OtherHeader: 42 51
        #   OtherHeader: FourthMetric
        #   OtherHeader: 4
        # We first pair the lines together, then create a dict for each type:
        #   {"SomeMetric": "1", "OtherMetric": "2"}
        lines = stats.splitlines()
        assert len(lines) % 2 == 0, repr(lines)
        for header, data in zip(*(iter(lines),) * 2):
            header = header.split()
            data = data.split()
            assert header[0] == data[0], repr((header, data))
            assert len(header) == len(data), repr((header, data))
            if header[0] not in known_statstypes:
                # `stats` must still be the raw file text here, so the parsed
                # pairs below are kept under a different name.
                print("Unrecoginized line in %s:"
                      " %r (file=%r)" % (filename, header, stats),
                      file=sys.stderr)
                continue
            statstype = header.pop(0)
            data.pop(0)
            pairs = dict(zip(header, data))
            statsdikt.setdefault(known_statstypes[statstype], {}).update(pairs)
        for statstype, values in statsdikt.items():
            # Undo the kernel's double counting
            if "ListenDrops" in values:
                values["ListenDrops"] = int(values["ListenDrops"]) - int(
                    values.get("ListenOverflows", 0))
            elif "RcvbufErrors" in values:
                values["InErrors"] = int(values.get("InErrors", 0)) - int(
                    values["RcvbufErrors"])
            for stat, (metric, tags) in known_stats[statstype].items():
                value = values.get(stat)
                if value is not None:
                    print_netstat(statstype, metric, value, tags)

    while True:
        ts = int(time.time())
        sockstat.seek(0)
        netstat.seek(0)
        snmp.seek(0)
        data = sockstat.read()
        netstats = netstat.read()
        snmpstats = snmp.read()
        m = regexp.match(data)
        if not m:
            print("Cannot parse sockstat: %r" % data, file=sys.stderr)
            return 13

        # The difference between the first two values is the number of
        # sockets allocated vs the number of sockets actually in use.
        print_sockstat("num_sockets", m.group("tcp_sockets"), " type=tcp")
        print_sockstat("num_timewait", m.group("tw_count"))
        print_sockstat("sockets_inuse", m.group("tcp_inuse"), " type=tcp")
        print_sockstat("sockets_inuse", m.group("udp_inuse"), " type=udp")
        print_sockstat("sockets_inuse", m.group("udplite_inuse"), " type=udplite")
        print_sockstat("sockets_inuse", m.group("raw_inuse"), " type=raw")

        print_sockstat("num_orphans", m.group("orphans"))
        # The kernel reports TCP/UDP memory in pages; convert to bytes.
        print_sockstat("memory", int(m.group("tcp_pages")) * page_size,
                       " type=tcp")
        if m.group("udp_pages") is not None:
            print_sockstat("memory", int(m.group("udp_pages")) * page_size,
                           " type=udp")
        print_sockstat("memory", m.group("ip_frag_mem"), " type=ipfrag")
        print_sockstat("ipfragqueues", m.group("ip_frag_nqueues"))

        parse_stats(netstats, netstat.name)
        parse_stats(snmpstats, snmp.name)

        sys.stdout.flush()
        time.sleep(interval)
def main():
    """iostats main loop.

    Every COLLECTION_INTERVAL seconds, re-reads /proc/diskstats and prints
    iostat.disk.* / iostat.part.* data points on stdout.  For entries that
    is_device() accepts, svctm/await/util are derived from the raw counters.
    Never returns.
    """
    f_diskstats = open("/proc/diskstats")
    HZ = get_system_hz()
    utils.drop_privileges()

    while True:
        f_diskstats.seek(0)
        ts = int(time.time())
        # NOTE(review): this takes the second value returned by
        # read_uptime(); confirm that is the intended interval.
        itv = read_uptime()[1]
        for line in f_diskstats:
            # maj, min, devicename, [list of stats, see above]
            values = line.split(None)
            # shortcut the deduper and just skip disks that
            # haven't done a single read.  This eliminates a bunch
            # of loopback, ramdisk, and cdrom devices but still
            # lets us report on the rare case that we actually use
            # a ramdisk.
            if values[3] == "0":
                continue

            if int(values[1]) % 16 == 0 and int(values[0]) > 1:
                metric = "iostat.disk."
            else:
                metric = "iostat.part."

            device = values[2]
            if len(values) == 14:
                # full stats line
                for i in range(11):
                    print("%s%s %d %s dev=%s"
                          % (metric, FIELDS_DISK[i], ts, values[i + 3], device))

                ret = is_device(device, 0)
                # if a device or a partition, calculate the svctm/await/util
                if ret:
                    stats = dict(zip(FIELDS_DISK, values[3:]))
                    nr_ios = (float(stats.get("read_requests"))
                              + float(stats.get("write_requests")))
                    tput = (nr_ios * float(HZ) / float(itv))
                    util = (float(stats.get("msec_total"))
                            * float(HZ) / float(itv))
                    svctm = 0.0
                    # Named await_ because `await` is a reserved keyword
                    # from Python 3.7 on; the emitted metric name is
                    # unchanged.
                    await_ = 0.0

                    if tput:
                        svctm = util / tput

                    if nr_ios:
                        rd_ticks = stats.get("msec_read")
                        wr_ticks = stats.get("msec_write")
                        await_ = ((float(rd_ticks) + float(wr_ticks))
                                  / float(nr_ios))
                    print("%s%s %d %.2f dev=%s"
                          % (metric, "svctm", ts, svctm, device))
                    print("%s%s %d %.2f dev=%s"
                          % (metric, "await", ts, await_, device))
                    print("%s%s %d %.2f dev=%s"
                          % (metric, "util", ts, float(util / 1000.0), device))

            elif len(values) == 7:
                # partial stats line
                for i in range(4):
                    print("%s%s %d %s dev=%s"
                          % (metric, FIELDS_PART[i], ts, values[i + 3], device))
            else:
                # Written via sys.stderr.write so the same line is valid on
                # Python 2 and 3; the text matches what the print statement
                # produced (note the two spaces and the trailing newline).
                sys.stderr.write(
                    "Cannot parse /proc/diskstats line:  %s\n" % line)
                continue

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
from bbm.jmx import JMXPattern # These are the jmx handlers we'll be using. from bbm.jvm import jvm_collector signal.signal(signal.SIGCHLD, signal.SIG_IGN) # Find the pid of the bbm-core-api server pgrep = subprocess.check_output( ["/usr/bin/pgrep", "-u", "activemq", "-f", "xbean:activemq.xml"]) jpid = pgrep.rstrip("\n") if jpid == "": sys.exit(1) # We can change over to hte bbm-core-api user for secturity utils.drop_privileges(user="******") def rewriter(v): if v.metric.startswith("jmx.org.apache.activemq."): # Strip off leading v.metric = v.metric[len("jmx.org.apache.activemq."):] metrictype = None for t in v.tags: if t.startswith("Type="): metrictype = t[len("Type="):].lower() break if metrictype == None: return []
def main():
    """Main loop.

    Scans for local Redis instances, then every ``collection_interval``
    seconds (from redis_stats_conf) connects to each one on 127.0.0.1 and
    prints redis.* data points on stdout.

    Returns 13 when no instance is found and 1 when instances exist but the
    Python redis module is unavailable; otherwise loops forever.
    """
    if USER != "root":
        utils.drop_privileges(user=USER)
    sys.stdin.close()

    config = redis_stats_conf.get_config()
    interval = config['collection_interval']

    # we scan for instances here to see if there are any redis servers
    # running on this machine...
    last_scan = time.time()
    instances = scan_for_instances()  # port:name
    if not instances:
        return 13
    if not has_redis:
        sys.stderr.write("Found %d instance(s) to monitor, but the Python"
                         " Redis module isn't installed.\n" % len(instances))
        return 1

    def print_stat(metric, value, tags=""):
        # `ts` comes from the enclosing scope and is refreshed on every pass
        # of the loop below.  The single-argument print(...) form works both
        # as a Python 2 statement and a Python 3 call.
        if value is not None:
            print("redis.%s %d %s %s" % (metric, ts, value, tags))

    dbre = re.compile(r"^db\d+$")

    while True:
        ts = int(time.time())

        # if we haven't looked for redis instances recently, let's do that
        if ts - last_scan > SCAN_INTERVAL:
            instances = scan_for_instances()
            last_scan = ts

        # now iterate over every instance and gather statistics
        for port in instances:
            tags = "cluster=%s port=%d" % (instances[port], port)

            # connect to the instance and attempt to gather info
            r = redis.Redis(host="127.0.0.1", port=port)
            try:
                info = r.info()
                for key in KEYS:
                    if key in info:
                        print_stat(key, info[key], tags)

                # per database metrics
                for db in [name for name in info if dbre.match(name)]:
                    db_tags = "%s db=%s" % (tags, db)
                    for db_metric in info[db]:
                        print_stat(db_metric, info[db][db_metric], db_tags)

                # get some instant latency information
                # TODO: might be nice to get 95th, 99th, etc here?
                start_time = time.time()
                r.ping()
                print_stat("latency", time.time() - start_time, tags)
            finally:
                # Always release the connection, even when info()/ping()
                # raised; the exception itself still propagates.
                r.connection_pool.disconnect()

        sys.stdout.flush()
        time.sleep(interval)
#!/usr/bin/python import signal import sys import subprocess from collectors.lib import utils from bbm import RunCollector from bbm.jmx import start_jmx_collector # These are the jmx handlers we'll be using. from bbm.jvm import jvm_collector from bbm.jetty import jetty_collector signal.signal(signal.SIGCHLD, signal.SIG_IGN) # Find the pid of the tomcat server pgrep = subprocess.check_output([ "/usr/bin/pgrep", "-u", "bbm-events-api", "-f", "/etc/bbm/bbm-events-api.yml" ]) jpid = pgrep.rstrip("\n") if jpid == "": sys.exit(1) # We can change over to tomcat7 user for secturity utils.drop_privileges(user="******") RunCollector(start_jmx_collector(15, jpid, jvm_collector + jetty_collector), extraTags=["application=events-api"])
for tag in self.txns_max.keys(): data = data + [ TSDBMetricData("timings.txns.max", self.txns_max[tag], tag.split(" ")) ] self.time_max = {} self.db_max = {} self.txns_max = {} self.memc_max = {} self.oldtime = newtime # for tag in self.ends_hash.keys(): # data = data + [TSDBMetricData("streams.duration", self.ends_hash[tag],tag.split(" "))] # # if (newtime - self.olduserstime) >= (1000 * 60 * 5): # Only output users stats every 5 minutes # for tag in self.users_hash.keys(): # data = data + [TSDBMetricData("streams.users.5min", len(self.users_hash[tag]),tag.split(" "))] # self.users_hash = {} # self.olduserstime = newtime return data utils.drop_privileges(user="******") parser = LogParser() RunCollector(start_dated_files_collector("/var/log/java", "*/*/*-timing.log", "%Y/%m/%Y%m%d-timing.log", parser.ParseLine), exitOnFinished=False)
from bbm.c3p0 import c3p0_collector

# Ignore SIGCHLD for the lifetime of this script.
signal.signal(signal.SIGCHLD, signal.SIG_IGN)


# The core-api uses an embedded tomcat with the webapp name set to "Tomcat";
# we rewrite the webapp name to "coreapi".
def renamer(v):
    """Rewrite the webapp tag of tomcat.* metrics to ``webapp=coreapi``.

    `v` is a metric value object with ``metric`` (str) and ``tags`` (iterable
    of "key=value" strings) attributes.  For metrics whose name starts with
    "tomcat.", every tag starting with "webapp=" is replaced; all other tags
    and all other metrics are left untouched.  Returns the same object.
    """
    if v.metric.startswith("tomcat."):
        # A list comprehension keeps v.tags a real list on every Python
        # version (map() would return a one-shot iterator on Python 3).
        v.tags = ["webapp=coreapi" if t.startswith("webapp=") else t
                  for t in v.tags]
    return v


# Find the pid of the bbm-core-api server: a process owned by user
# bbm-core-api whose command line mentions the ROOT.war path.
pgrep = subprocess.check_output([
    "/usr/bin/pgrep", "-f", "-u", "bbm-core-api",
    "/usr/share/bbm-core-api/ROOT.war"
])
jpid = pgrep.rstrip("\n")
# Nothing to monitor: exit with status 1.
if jpid == "":
    sys.exit(1)

# We can change over to the bbm-core-api user for security
utils.drop_privileges(user="******")

RunCollector(start_jmx_collector(
    15, jpid, jvm_collector + tomcat_collector + c3p0_collector, renamer),
    extraTags=["application=coreapi"])
from bbm.jvm import jvm_collector
from bbm.tomcat import tomcat_collector

# Ignore SIGCHLD for the lifetime of this script.
signal.signal(signal.SIGCHLD, signal.SIG_IGN)


# The bbm-admin uses an embedded tomcat with the webapp name set to "Tomcat";
# we rewrite the webapp name to "admin".
def renamer(v):
    """Rewrite the webapp tag of tomcat.* metrics to ``webapp=admin``.

    `v` is a metric value object with ``metric`` (str) and ``tags`` (iterable
    of "key=value" strings) attributes.  For metrics whose name starts with
    "tomcat.", every tag starting with "webapp=" is replaced; all other tags
    and all other metrics are left untouched.  Returns the same object.
    """
    if v.metric.startswith("tomcat."):
        # A list comprehension keeps v.tags a real list on every Python
        # version (map() would return a one-shot iterator on Python 3).
        v.tags = ["webapp=admin" if t.startswith("webapp=") else t
                  for t in v.tags]
    return v


# Find the pid of the bbm-admin server: a process owned by user bbm-admin
# whose command line mentions the admin assembly jar.
pgrep = subprocess.check_output([
    "/usr/bin/pgrep", "-f", "-u", "bbm-admin",
    "/usr/share/bbm-admin/admin-assembly-1.0.jar"
])
jpid = pgrep.rstrip("\n")
# Nothing to monitor: exit with status 1.
if jpid == "":
    sys.exit(1)

# We can change over to the bbm-admin user for security
utils.drop_privileges(user="******")

RunCollector(start_jmx_collector(15, jpid, jvm_collector + tomcat_collector,
                                 renamer),
             extraTags=["application=admin"])
def main():
    """iostats main loop.

    Every COLLECTION_INTERVAL seconds, re-reads /proc/diskstats and prints
    iostat.disk.* / iostat.part.* data points on stdout.  For entries that
    is_device() accepts, svctm, r_await, w_await, await and util are derived
    from the difference between the current counters and the ones remembered
    from the previous pass.  Never returns.
    """
    # Baseline used the first time a device is seen.
    init_stats = {
        "read_requests": 0,
        "read_merged": 0,
        "read_sectors": 0,
        "msec_read": 0,
        "write_requests": 0,
        "write_merged": 0,
        "write_sectors": 0,
        "msec_write": 0,
        "ios_in_progress": 0,
        "msec_total": 0,
        "msec_weighted_total": 0,
    }
    # Sector counters that are additionally reported in bytes.
    byte_metric_for = {
        "read_sectors": "read_bytes",
        "write_sectors": "write_bytes",
    }
    prev_stats = dict()
    f_diskstats = open("/proc/diskstats")
    HZ = get_system_hz()
    itv = 1.0
    utils.drop_privileges()

    while True:
        f_diskstats.seek(0)
        ts = int(time.time())
        itv = read_uptime()[0]
        for line in f_diskstats:
            # maj, min, devicename, [list of stats, see above]
            values = line.split(None)
            # shortcut the deduper and just skip disks that
            # haven't done a single read.  This eliminates a bunch
            # of loopback, ramdisk, and cdrom devices but still
            # lets us report on the rare case that we actually use
            # a ramdisk.
            if values[3] == "0":
                continue

            is_whole_disk = int(values[1]) % 16 == 0 and int(values[0]) > 1
            if is_whole_disk:
                metric = "iostat.disk."
            else:
                metric = "iostat.part."

            device = values[2]
            field_count = len(values)

            if field_count == 7:
                # partial stats line
                for idx in range(4):
                    print("%s%s %d %s dev=%s"
                          % (metric, FIELDS_PART[idx], ts, values[idx + 3],
                             device))
                continue

            if field_count != 14:
                print("Cannot parse /proc/diskstats line: ", line,
                      file=sys.stderr)
                continue

            # full stats line
            for idx in range(11):
                field = FIELDS_DISK[idx]
                raw = values[idx + 3]
                print("%s%s %d %s dev=%s" % (metric, field, ts, raw, device))

                bytes_metric = byte_metric_for.get(field)
                if bytes_metric is None:
                    continue
                if PY3:
                    sectors = int(raw)
                else:
                    # noinspection PyCompatibility
                    sectors = long(raw)  # pylint:disable=undefined-variable
                v = sectors * get_device_sector_size(device)
                print("%s%s %d %s dev=%s"
                      % (metric, bytes_metric, ts, v, device))

            # if a device or a partition, calculate the svctm/await/util
            if not is_device(device, 0):
                continue

            stats = dict(zip(FIELDS_DISK, values[3:]))
            previous = prev_stats.setdefault(device, init_stats)

            rd_ios = float(stats.get("read_requests"))
            wr_ios = float(stats.get("write_requests"))
            nr_ios = rd_ios + wr_ios
            prev_rd_ios = float(previous.get("read_requests"))
            prev_wr_ios = float(previous.get("write_requests"))
            prev_nr_ios = prev_rd_ios + prev_wr_ios

            tput = ((nr_ios - prev_nr_ios) * float(HZ) / float(itv))
            util = ((float(stats.get("msec_total"))
                     - float(previous.get("msec_total")))
                    * float(HZ) / float(itv))

            svctm = util / tput if tput else 0.0

            rd_ticks = stats.get("msec_read")
            wr_ticks = stats.get("msec_write")
            prev_rd_ticks = previous.get("msec_read")
            prev_wr_ticks = previous.get("msec_write")

            r_await = 0.0
            if rd_ios != prev_rd_ios:
                r_await = ((float(rd_ticks) - float(prev_rd_ticks))
                           / float(rd_ios - prev_rd_ios))
            w_await = 0.0
            if wr_ios != prev_wr_ios:
                w_await = ((float(wr_ticks) - float(prev_wr_ticks))
                           / float(wr_ios - prev_wr_ios))
            await_ = 0.0
            if nr_ios != prev_nr_ios:
                await_ = ((float(rd_ticks) + float(wr_ticks)
                           - float(prev_rd_ticks) - float(prev_wr_ticks))
                          / float(nr_ios - prev_nr_ios))

            derived = (
                ("svctm", svctm),
                ("r_await", r_await),
                ("w_await", w_await),
                ("await", await_),
                ("util", float(util / 1000.0)),
            )
            for name, number in derived:
                print("%s%s %d %.2f dev=%s"
                      % (metric, name, ts, number, device))

            # Remember this pass's counters for the next delta.
            prev_stats[device] = copy.deepcopy(stats)

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)