def main():
    """dfstats main loop.

    Opens /proc/mounts for reading.

    Returns:
        13 when /proc/mounts cannot be opened; that exit code asks tcollector
        not to respawn this collector.
    """
    try:
        f_mounts = open("/proc/mounts", "r")
    except IOError as e:  # "as" form works on Python 2.6+ and Python 3
        utils.err("error: can't open /proc/mounts: %s" % e)
        return 13  # Ask tcollector to not respawn us
def find_conf_file(pid):
    """Returns config file for couchbase-server.

    Args:
        pid: Process id whose /proc/<pid>/cmdline is opened.

    Returns:
        None when the cmdline file cannot be opened (the error is logged).
    """
    try:
        fd = open('/proc/%s/cmdline' % pid)
    except IOError as e:  # "as" form works on Python 2.6+ and Python 3
        utils.err("Couchbase (pid %s) went away ? %s" % (pid, e))
        return
def find_databases(dbs=None):
    """Returns a map of dbname (string) to DB instances to monitor.

    Args:
        dbs: A map of dbname (string) to DB instances already monitored.
            This map will be modified in place if it's not None.

    Returns:
        The map passed in as ``dbs`` (or a new dict when ``dbs`` is None),
        extended with one DB instance per newly reachable socket file.
    """
    sockfiles = find_sockfiles()
    if dbs is None:
        dbs = {}
    for sockfile in sockfiles:
        dbname = get_dbname(sockfile)
        if dbname in dbs:
            continue  # already monitored
        if not dbname:
            continue  # name could not be derived from the socket path
        try:
            db = mysql_connect(sockfile)
            cursor = db.cursor()
            cursor.execute("SELECT VERSION()")
        except (EnvironmentError, EOFError, RuntimeError, socket.error,
                MySQLdb.MySQLError) as e:
            utils.err("Couldn't connect to %s: %s" % (sockfile, e))
            continue
        version = cursor.fetchone()[0]
        dbs[dbname] = DB(sockfile, dbname, db, cursor, version)
    # Callers use the result directly (dbs = find_databases()), so the map
    # has to be handed back explicitly.
    return dbs
def read_socket(sock):
    """Read the output of ``show stat`` from the HAProxy stats socket.

    A new connection to DEFAULT_SOCKET is opened for every attempt, because
    HAProxy is assumed to close the connection after answering and a closed
    socket object cannot be reconnected. Up to three attempts are made.

    Args:
        sock: Ignored; kept for backward compatibility. The function always
            opens its own connection to DEFAULT_SOCKET.

    Returns:
        The response split into a list of lines.

    Raises:
        IOError: the error from the last attempt when all three attempts
            fail (socket.error is an IOError subclass on Python 2.6+).
    """
    last_error = None
    for attempt in range(3):
        conn = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            conn.connect(DEFAULT_SOCKET)
            conn.send("show stat\n")
            # Collect per attempt so a stream broken mid-way never leaks
            # partial data into the next try.
            chunks = []
            data = conn.recv(4096)
            while data:
                chunks.append(data)
                data = conn.recv(4096)
            return "".join(chunks).split("\n")
        except IOError as error:
            last_error = error
            utils.err("Error: Connection to HAProxy socket lost: %s (%d)"
                      % (error, attempt))
        finally:
            # Always release the descriptor, on success and on failure.
            conn.close()
    raise last_error
def main(args):
    """Collects and dumps stats from a MySQL server.

    Args:
        args: Command-line arguments (unused).

    Returns:
        13 when there is no socket file to monitor (asks tcollector to not
        respawn us), 1 when the MySQLdb module is missing, 2 when stdout
        reports a broken pipe. Otherwise the loop never returns.
    """
    if not find_sockfiles():  # Nothing to monitor.
        return 13  # Ask tcollector to not respawn us.
    if MySQLdb is None:
        utils.err("error: Python module `MySQLdb' is missing")
        return 1

    last_db_refresh = now()
    dbs = find_databases()
    while True:
        ts = now()
        if ts - last_db_refresh >= DB_REFRESH_INTERVAL:
            find_databases(dbs)
            last_db_refresh = ts

        errs = []
        for dbname, db in dbs.items():
            try:
                collect(db)
            except (EnvironmentError, EOFError, RuntimeError, socket.error,
                    MySQLdb.MySQLError) as e:
                if isinstance(e, IOError) and e.errno == errno.EPIPE:
                    # Exit on a broken pipe. There's no point in continuing
                    # because no one will read our stdout anyway.
                    return 2
                utils.err("error: failed to collect data from %s: %s" % (db, e))
                errs.append(dbname)

        # Drop failed databases only after the iteration is over so the map
        # is not modified while it is being walked.
        for dbname in errs:
            del dbs[dbname]

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
def main():
    """Locate the running HAProxy and poll its stats socket forever.

    Returns:
        13 (asks tcollector to not respawn us) when HAProxy is not running,
        when its config file cannot be found, or when it does not listen on
        a unix domain socket. Otherwise the polling loop never returns.
    """
    haproxy = haproxy_pid()
    if not haproxy:
        utils.err("Error: HAProxy is not running")
        return 13  # Ask tcollector to not respawn us.

    config_path = find_conf_file(haproxy)
    if not config_path:
        return 13

    stats_path = find_sock_file(config_path)
    if stats_path is None:
        utils.err("Error: HAProxy is not listening on any unix domain socket")
        return 13

    conn = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    conn.connect(stats_path)
    # Switch haproxy to interactive mode; without it haproxy closes the
    # connection after the first command.
    # See haproxy documentation section 9.2. Unix Socket commands.
    conn.send("prompt\n")

    while True:
        collect_stats(conn)
        time.sleep(COLLECTION_INTERVAL)
def main():
    """Call read_impala_log() once per second, forever.

    Returns:
        13 when the json module is unavailable; otherwise never returns.
    """
    if json is not None:
        while True:
            read_impala_log()
            time.sleep(1)

    utils.err("This collector requires the 'json' Python module.")
    return 13
def cloudwatch_query_metric(cloudwatch, region, metric):
    """Query one CloudWatch metric and queue a line per datapoint statistic.

    Args:
        cloudwatch: Client object exposing ``get_metric_statistics``.
        region: Region value forwarded to ``build_tag_list``.
        metric: Mapping with ``Namespace``, ``MetricName`` and ``Dimensions``.

    Every formatted line is checked with ``validate_line_parses``; valid
    lines are put on ``sendQueue`` and invalid ones are logged.
    """
    window_end = datetime.datetime.utcnow()
    window_start = window_end - datetime.timedelta(seconds=COLLECTION_INTERVAL)

    # TODO: statistics no longer need to be one at at time so refactor that
    query = dict(
        Namespace=metric["Namespace"],
        MetricName=metric["MetricName"],
        Dimensions=metric["Dimensions"],
        StartTime=window_start,
        EndTime=window_end,
        Period=300,
        Statistics=list(STATISTICS),
        Unit='Count',
    )
    response = cloudwatch.get_metric_statistics(**query)

    for datapoint in response['Datapoints']:
        for statistic in STATISTICS:
            timestamp = format_timestamp(str(datapoint['Timestamp']))
            value = int(datapoint[statistic])
            metric_name, tags = build_tag_list(
                metric['MetricName'].lower(), region, metric['Dimensions'])
            namespace = metric["Namespace"].lower().replace('/', '.')
            fields = (namespace, metric_name, statistic.lower(),
                      str(timestamp), str(value), tags)
            output = "%s.%s.%s %s %s %s" % fields

            if not validate_line_parses(output):
                utils.err("Invalid Line: %s" % output)
                continue
            sendQueue.put({'timestamp': timestamp, 'output': output})
def main(argv):
    """Connect to the configured ElasticSearch servers and poll them forever.

    Args:
        argv: Command-line arguments (unused).

    Returns:
        1 when the json module is missing, 13 when no server accepted a
        connection (asks tcollector to not respawn us). Otherwise the
        collection loop never returns.

    Raises:
        socket.error: for any connection failure other than ECONNREFUSED.
    """
    # NOTE(review): `self` is not defined in this module-level function, so
    # this line looks like it raises NameError -- confirm the intended
    # logger argument for utils.lower_privileges.
    with utils.lower_privileges(self._logger):
        socket.setdefaulttimeout(DEFAULT_TIMEOUT)
        servers = []

        if json is None:
            utils.err("This collector requires the `json' Python module.")
            return 1

        for conf in elasticsearch_conf.get_servers():
            server = httplib.HTTPConnection(*conf)
            try:
                server.connect()
            except socket.error as exc:
                # Read .errno instead of tuple-unpacking the exception:
                # unpacking fails on errors carrying a single argument
                # (e.g. timeouts) and is not valid Python 3 syntax.
                if exc.errno == errno.ECONNREFUSED:
                    continue
                raise
            servers.append(server)

        if not servers:
            return 13  # No ES running, ask tcollector to not respawn us.

        # Use a server that actually connected; the loop variable may still
        # point at a server that refused the connection.
        status = node_status(servers[-1])
        version = status["version"]["number"]

        while True:
            for server in servers:
                _collect_server(server, version)
            time.sleep(COLLECTION_INTERVAL)
def main(argv):
    """Poll every reachable ElasticSearch server, one thread per server.

    Args:
        argv: Command-line arguments (unused).

    Returns:
        1 when the json module is missing, 13 when no server accepted a
        connection. Otherwise the collection loop never returns.

    Raises:
        socket.error: for any connection failure other than ECONNREFUSED.
    """
    utils.drop_privileges()
    socket.setdefaulttimeout(DEFAULT_TIMEOUT)

    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 1

    servers = []
    for conf in elasticsearch_conf.get_servers():
        candidate = HTTPConnection(*conf)
        try:
            candidate.connect()
        except socket.error as exc:
            if exc.errno != errno.ECONNREFUSED:
                raise
            continue  # nothing listening there, try the next one
        servers.append(candidate)

    if not servers:
        return 13  # No ES running, ask tcollector to not respawn us.

    lock = threading.Lock()
    while True:
        workers = []
        for es in servers:
            version = node_status(es)["version"]["number"]
            worker = threading.Thread(target=_collect_server,
                                      args=(es, version, lock))
            worker.start()
            workers.append(worker)
        # Wait for the whole round before sleeping.
        for worker in workers:
            worker.join()
        time.sleep(COLLECTION_INTERVAL)
def process_metric(self, timestamp, metric, tags, value, mbean_domain, mbean_properties):
    """Route one JMX sample to the right emitter based on its mbean domain.

    Only ``java.lang`` and domains starting with ``kafka`` are accepted;
    anything else is logged and dropped. Producer and consumer metrics are
    delegated to their dedicated methods. JVM and broker metrics are emitted
    as ``<domain>.<service>.<metric>``, with any mbean properties appended
    to the tags.
    """
    is_jvm = mbean_domain == "java.lang"
    if not (is_jvm or mbean_domain.startswith("kafka")):
        utils.err("Unexpected mbean domain = %r" % mbean_domain)
        return

    if is_jvm:
        # The "type" property becomes the service name and is removed from
        # the properties that end up as tags.
        jmx_service = mbean_properties.pop("type", "jvm")
    elif mbean_domain == "kafka.producer":
        # Kafka producer metrics
        self._process_kafka_producer_metric(timestamp, metric, tags, value,
                                            mbean_domain, mbean_properties)
        return
    elif mbean_domain == "kafka.consumer":
        # Kafka consumer metrics
        self._process_kafka_consumer_metric(timestamp, metric, tags, value,
                                            mbean_domain, mbean_properties)
        return
    elif mbean_domain.startswith("kafka."):
        # Kafka broker metrics: the last domain component is the fallback
        # service name, and the "kafka." prefix is dropped from the domain.
        fallback_service = mbean_domain.split(".")[-1]
        mbean_domain = mbean_domain[len("kafka."):]
        jmx_service = mbean_properties.get("type", fallback_service)
    else:
        return

    if mbean_properties:
        extra_tags = " ".join(key + "=" + val
                              for key, val in mbean_properties.iteritems())
        tags = tags + " " + extra_tags

    jmx_service = JmxMonitor.SHORT_SERVICE_NAMES.get(jmx_service, jmx_service)
    metric = ".".join([mbean_domain, jmx_service.lower(), metric])
    self.emit(metric, timestamp, value, tags)
def main(argv):
    """Connect to the configured ElasticSearch servers and poll them forever.

    Args:
        argv: Command-line arguments (unused).

    Returns:
        1 when the json module is missing, 13 when no server accepted a
        connection (asks tcollector to not respawn us). Otherwise the
        collection loop never returns.

    Raises:
        socket.error: for any connection failure other than ECONNREFUSED.
    """
    # NOTE(review): `self` is not defined in this module-level function, so
    # this line looks like it raises NameError -- confirm the intended
    # logger argument for utils.lower_privileges.
    with utils.lower_privileges(self._logger):
        socket.setdefaulttimeout(DEFAULT_TIMEOUT)
        servers = []

        if json is None:
            utils.err("This collector requires the `json' Python module.")
            return 1

        for conf in elasticsearch_conf.get_servers():
            server = httplib.HTTPConnection(*conf)
            try:
                server.connect()
            except socket.error as exc:
                # Read .errno instead of tuple-unpacking the exception:
                # unpacking fails on errors carrying a single argument
                # (e.g. timeouts) and is not valid Python 3 syntax.
                if exc.errno == errno.ECONNREFUSED:
                    continue
                raise
            servers.append(server)

        if not servers:
            return 13  # No ES running, ask tcollector to not respawn us.

        # Use a server that actually connected; the loop variable may still
        # point at a server that refused the connection.
        status = node_status(servers[-1])
        version = status["version"]["number"]

        while True:
            for server in servers:
                _collect_server(server, version)
            time.sleep(COLLECTION_INTERVAL)
def get_metrics(webserver_url, username, password, params):
    """Fetch a JSON document over HTTP and return its ``data`` member.

    Args:
        webserver_url: URL to request.
        username: User name for HTTP authentication.
        password: Password for HTTP authentication.
        params: Query parameters passed to ``requests.get``.

    Returns:
        The value stored under the ``data`` key of the decoded response, or
        13 (tell tcollector to not respawn) when the server answers with an
        HTTP error status.

    Raises:
        requests.exceptions.ConnectionError: when the request cannot be
            sent; the error is logged first.
        KeyError: when the decoded response has no ``data`` key.
    """
    try:
        # NOTE: certificate verification is disabled (verify=False).
        r = requests.get(webserver_url, auth=(username, password),
                         verify=False, params=params)
    except requests.exceptions.ConnectionError as error:
        # sys.stderr.write is valid on both Python 2 and 3, unlike the
        # "print >> sys.stderr" statement.
        sys.stderr.write("Error connecting: %s\n" % error)
        utils.err("Connection error: %s" % error)
        raise

    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError as error:
        sys.stderr.write("Request was not successful: %s\n" % error)
        utils.err("HTTP error getting metrics from '%s' - %s"
                  % (webserver_url, error))
        return 13  # tell tcollector to not respawn

    response = r.json()
    try:
        data = response['data']
    except KeyError:
        sys.stderr.write("Did not get a 'data' key in the response.\n")
        sys.stderr.write("%s\n" % (response,))
        raise
    return data
def find_sock_file(conf_file):
    """Returns the unix socket file of haproxy.

    Args:
        conf_file: Path of the HAProxy configuration file to open.

    Returns:
        None when the configuration file cannot be opened (the error is
        logged).
    """
    try:
        fd = open(conf_file)
    except IOError as e:  # "as" form works on Python 2.6+ and Python 3
        utils.err("Error: %s. Config file path is relative: %s" % (e, conf_file))
        return None
def find_bindir_path(config_file):
    """Returns the bin directory path.

    Args:
        config_file: Path of the configuration file to open.

    Returns:
        None when the configuration file cannot be opened (the error is
        logged).
    """
    try:
        fd = open(config_file)
    except IOError as e:  # "as" form works on Python 2.6+ and Python 3
        utils.err("Error for Config file (%s): %s" % (config_file, e))
        return None
def main():
    """Report libvirt VM metrics in an endless loop.

    Each round lists all domains in random order, processes each one and
    then prints the number of successfully processed VMs.

    Returns:
        ERROR_CODE_DONT_RETRY when the hypervisor connection cannot be
        opened or a LibvirtVmProcessingError is raised. Otherwise the loop
        never returns.
    """
    try:
        check_imports()
        hypervisor = libvirt.openReadOnly(LIBVIRT_URI)
        if hypervisor is None:
            utils.err("Failed to open connection to the hypervisor")
            return ERROR_CODE_DONT_RETRY

        while True:
            domains = hypervisor.listAllDomains()
            random.shuffle(domains)
            pids = get_pids()

            # count only successfully processed VMs
            processed = sum(
                1 for vm in domains
                if process_domain(vm, pids.get(vm.UUIDString())))

            # write libvirt.vm.count metric
            print("%s %d %s" % (FIELDS["count"], int(time.time()), processed))
            sys.stdout.flush()
            time.sleep(INTERVAL)
    except LibvirtVmProcessingError as err:
        utils.err(err.value)
        return ERROR_CODE_DONT_RETRY
def main(args):
    """Emit HadoopYarnNodeManager metrics to stdout at a fixed interval.

    Args:
        args: Full argument vector; element 0 is skipped and the rest is
            parsed for ``--host``, ``--port`` and ``--interval``.

    Returns:
        13 when the json module is missing (asks tcollector not to respawn
        us). Otherwise the emit loop never returns.
    """
    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us

    utils.drop_privileges()

    parser = argparse.ArgumentParser()
    parser.add_argument('-H', '--host', default='localhost',
                        help='Host to connect to (default: localhost)')
    parser.add_argument('-P', '--port', default=8042, type=int,
                        help='Port to connect to (default: 8042)')
    parser.add_argument('-i', '--interval', default=90, type=int,
                        help='Interval at which to emit metrics')
    options = parser.parse_args(args[1:])

    yarn_service = HadoopYarnNodeManager(host=options.host, port=options.port)
    pause = options.interval
    while True:
        yarn_service.emit()
        time.sleep(pause)
    return 0
def find_conf_file(pid):
    """Returns the conf file of haproxy.

    Runs ``ps --no-headers -o cmd <pid>`` to read the command line of the
    HAProxy process.

    Args:
        pid: Process id, passed to ``ps`` as a command-line argument.

    Returns:
        None when ``ps`` exits with a non-zero status (the error is logged).
    """
    try:
        output = subprocess.check_output(["ps", "--no-headers", "-o", "cmd", pid])
    except subprocess.CalledProcessError as e:  # Python 2.6+/3 syntax
        utils.err("HAProxy (pid %s) went away? %s" % (pid, e))
        return None
def main(argv):
    """Poll every reachable ElasticSearch server, one thread per server.

    Args:
        argv: Command-line arguments (unused).

    Returns:
        1 when the json module is missing, 13 when no server accepted a
        connection. Otherwise the collection loop never returns.

    Raises:
        socket.error: for any connection failure other than ECONNREFUSED.
    """
    utils.drop_privileges()
    socket.setdefaulttimeout(DEFAULT_TIMEOUT)

    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 1

    servers = []
    for conf in elasticsearch_conf.get_servers():
        connection = HTTPConnection(*conf)
        try:
            connection.connect()
        except socket.error as exc:
            if exc.errno != errno.ECONNREFUSED:
                raise
            continue  # refused: no ES instance on this address
        servers.append(connection)

    if not servers:
        return 13  # No ES running, ask tcollector to not respawn us.

    lock = threading.Lock()
    while True:
        running = []
        for connection in servers:
            status = node_status(connection)
            collector = threading.Thread(
                target=_collect_server,
                args=(connection, status["version"]["number"], lock))
            collector.start()
            running.append(collector)
        # Let the whole round finish before sleeping.
        for collector in running:
            collector.join()
        time.sleep(COLLECTION_INTERVAL)
def process_gc_log(collector):
    """Process the GC log lines appended since the previous call.

    Args:
        collector: Mutable dict of collector state. Keys read here:
            ``prefix``, ``log_dir``, ``log_name_pattern``, ``current_file``,
            ``current_file_pos``, ``timestamp``, ``gensize`` and ``count``.
            ``current_file`` and ``current_file_pos`` are updated in place.

    Returns:
        None when a new log file is seen for the first time (only its end
        position is recorded), 0 otherwise. Exceptions raised while
        processing are logged with their traceback, not propagated.
    """
    prefix = collector['prefix']

    # get latest gc log to process
    gc_log = get_latest_gc_log(collector['log_dir'], collector['log_name_pattern'])

    # update current_file and current_file_pos if this is the first time to
    # process the gc log
    if collector['current_file'] != gc_log:
        collector['current_file'] = gc_log
        with open(gc_log, 'rb') as file_handler:
            collector['current_file_pos'] = get_file_end(file_handler)
        return

    try:
        with open(gc_log, 'rb') as file_handler:
            pos = collector['current_file_pos']
            collector['current_file_pos'] = get_file_end(file_handler)
            file_handler.seek(pos)

            # Do not use foreach loop because inside function process_gc_record
            # will call file_handler.readline(). The reason is that some GC
            # event are multiline and need to be processed as a whole
            while True:
                line = file_handler.readline()
                if len(line) == 0:
                    break
                pattern_name, matcher = match_pattern(line)
                if pattern_name == GC_START_TIME_PATTERN:
                    year, month, day, hour, minute, second, timezone = [
                        int(matcher.group(i)) for i in range(1, 8)]
                    cause = matcher.group(8)
                    timestamp = true_unix_timestamp(year, month, day, hour,
                                                    minute, second, timezone)
                    process_gc_record(prefix, file_handler, timestamp, cause,
                                      collector)
                else:
                    unmatched_gc_log(line)

        current_timestamp_in_sec = int(time.time())

        # Single-argument print(...) behaves the same on Python 2 and 3.
        if collector['timestamp'] is not None:
            for gen, value in collector['gensize'].items():
                print("%s.gc.g1.gensize %s %s gen=%s" % (
                    prefix, current_timestamp_in_sec, value, gen))

        # publish gc event count metrics
        for event, value in collector['count'].items():
            print("%s.gc.g1.event.count %s %s event=%s" % (
                prefix, current_timestamp_in_sec, value, event))
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        utils.err(''.join(
            traceback.format_exception(exc_type, exc_value, exc_traceback)))

    return 0
def main():
    """Stream Zabbix history rows from the MySQL binlog and print them.

    An in-memory SQLite copy of ``zabbix_cache`` maps each item id to its
    key, host and proxy. Rows whose item id is missing from the cache are
    logged and counted. Internal bridge metrics are printed periodically,
    and the cache is rebuilt when the number of new misses exceeds
    ``settings['dbrefresh']``.

    Returns:
        1 when the pymysqlreplication module is missing; otherwise None
        once the binlog stream ends.
    """
    utils.drop_privileges()
    if BinLogStreamReader is None:
        utils.err("error: Python module `pymysqlreplication' is missing")
        return 1
    settings = zabbix_bridge_conf.get_settings()

    # Set blocking to True if you want to block and wait for the next event at
    # the end of the stream
    stream = BinLogStreamReader(connection_settings=settings['mysql'],
                                server_id=settings['slaveid'],
                                only_events=[WriteRowsEvent],
                                resume_stream=True,
                                blocking=True)

    dbcache = sqlite3.connect(':memory:')
    cachecur = dbcache.cursor()

    def build_cache():
        # Copy the on-disk cache table into memory and index it by item id.
        cachecur.execute('CREATE TABLE zabbix_cache AS SELECT * FROM dbfile.zabbix_cache')
        cachecur.execute('CREATE UNIQUE INDEX uniq_zid on zabbix_cache (id)')

    cachecur.execute("ATTACH DATABASE '%s' as 'dbfile'" % (settings['sqlitedb'],))
    build_cache()

    # tcollector.zabbix_bridge namespace for internal Zabbix bridge metrics.
    log_pos = 0
    key_lookup_miss = 0
    sample_last_ts = int(time.time())
    last_key_lookup_miss = 0

    for binlogevent in stream:
        if binlogevent.schema != settings['mysql']['db']:
            continue
        table = binlogevent.table
        log_pos = binlogevent.packet.log_pos
        if table not in ('history', 'history_uint'):
            continue

        for event_row in binlogevent.rows:
            values = event_row['values']
            itemid = values['itemid']
            cachecur.execute('SELECT id, key, host, proxy FROM zabbix_cache WHERE id=?', (itemid,))
            cached = cachecur.fetchone()
            if cached is None:
                # TODO: Consider https://wiki.python.org/moin/PythonDecoratorLibrary#Retry
                utils.err("error: Key lookup miss for %s" % (itemid))
                key_lookup_miss += 1
                continue

            print("zbx.%s %d %s host=%s proxy=%s"
                  % (cached[1], values['clock'], values['value'], cached[2], cached[3]))

            # Sample internal metrics @ 10s intervals
            if (int(time.time()) - sample_last_ts) > settings['internal_metric_interval']:
                sample_last_ts = int(time.time())
                print("tcollector.zabbix_bridge.log_pos %d %s" % (sample_last_ts, log_pos))
                print("tcollector.zabbix_bridge.key_lookup_miss %d %s" % (sample_last_ts, key_lookup_miss))
                print("tcollector.zabbix_bridge.timestamp_drift %d %s"
                      % (sample_last_ts, (sample_last_ts - values['clock'])))
                new_misses = key_lookup_miss - last_key_lookup_miss
                if new_misses > settings['dbrefresh']:
                    print("tcollector.zabbix_bridge.key_lookup_miss_reload %d %s"
                          % (sample_last_ts, new_misses))
                    cachecur.execute('DROP TABLE zabbix_cache')
                    build_cache()
                    last_key_lookup_miss = key_lookup_miss

        sys.stdout.flush()

    dbcache.close()
    stream.close()
def find_conf_file(pid):
    """Returns the conf file of haproxy.

    Runs ``ps --no-headers -o cmd <pid>`` to read the command line of the
    HAProxy process.

    Args:
        pid: Process id, passed to ``ps`` as a command-line argument.

    Returns:
        None when ``ps`` exits with a non-zero status (the error is logged).
    """
    try:
        output = subprocess.check_output(
            ["ps", "--no-headers", "-o", "cmd", pid])
    except subprocess.CalledProcessError as e:  # Python 2.6+/3 syntax
        utils.err("HAProxy (pid %s) went away? %s" % (pid, e))
        return None
def main(args):
    """Collects and dumps stats from a PostgreSQL server.

    Args:
        args: Command-line arguments (unused).

    Returns:
        13 when the database connection cannot be set up; that exit code
        asks tcollector to not respawn us.
    """
    try:
        db = postgresqlutils.connect()
    except Exception as e:  # "as" form works on Python 2.6+ and Python 3
        utils.err("error: Could not initialize collector : %s" % (e))
        return 13  # Ask tcollector to not respawn us
def main():
    """ifstat main loop.

    Opens /proc/net/dev for reading.

    Returns:
        13 when /proc/net/dev cannot be opened; that exit code asks
        tcollector not to respawn this collector.
    """
    try:
        f_netdev = open("/proc/net/dev")
    except IOError as e:  # "as" form works on Python 2.6+ and Python 3
        utils.err("error: can't open /proc/net/dev: %s" % e)
        return 13  # Ask tcollector to not respawn us
def main():
    """Stream Zabbix history rows from the MySQL binlog and print them.

    An in-memory SQLite copy of ``zabbix_cache`` maps each item id to its
    key, host and proxy. Rows whose item id is missing from the cache are
    logged and counted. Internal bridge metrics are printed periodically,
    and the cache is rebuilt when the number of new misses exceeds
    ``settings['dbrefresh']``.

    Returns:
        1 when the pymysqlreplication module is missing; otherwise None
        once the binlog stream ends.
    """
    utils.drop_privileges()
    if BinLogStreamReader is None:
        utils.err("error: Python module `pymysqlreplication' is missing")
        return 1
    settings = zabbix_bridge_conf.get_settings()

    # Set blocking to True if you want to block and wait for the next event at
    # the end of the stream
    stream = BinLogStreamReader(connection_settings=settings['mysql'],
                                server_id=settings['slaveid'],
                                only_events=[WriteRowsEvent],
                                resume_stream=True,
                                blocking=True)

    db_filename = settings['sqlitedb']
    dbcache = sqlite3.connect(':memory:')
    cachecur = dbcache.cursor()
    cachecur.execute("ATTACH DATABASE '%s' as 'dbfile'" % (db_filename,))
    cachecur.execute('CREATE TABLE zabbix_cache AS SELECT * FROM dbfile.zabbix_cache')
    cachecur.execute('CREATE UNIQUE INDEX uniq_zid on zabbix_cache (id)')

    # tcollector.zabbix_bridge namespace for internal Zabbix bridge metrics.
    log_pos = 0
    key_lookup_miss = 0
    sample_last_ts = int(time.time())
    last_key_lookup_miss = 0

    for binlogevent in stream:
        if binlogevent.schema == settings['mysql']['db']:
            table = binlogevent.table
            log_pos = binlogevent.packet.log_pos
            if table == 'history' or table == 'history_uint':
                for row in binlogevent.rows:
                    r = row['values']
                    itemid = r['itemid']
                    cachecur.execute('SELECT id, key, host, proxy FROM zabbix_cache WHERE id=?', (itemid,))
                    row = cachecur.fetchone()
                    if row is not None:
                        # Single-argument print(...) behaves the same on
                        # Python 2 and Python 3.
                        print("zbx.%s %d %s host=%s proxy=%s" % (row[1], r['clock'], r['value'], row[2], row[3]))
                        # Sample internal metrics @ 10s intervals
                        if (int(time.time()) - sample_last_ts) > settings['internal_metric_interval']:
                            sample_last_ts = int(time.time())
                            print("tcollector.zabbix_bridge.log_pos %d %s" % (sample_last_ts, log_pos))
                            print("tcollector.zabbix_bridge.key_lookup_miss %d %s" % (sample_last_ts, key_lookup_miss))
                            print("tcollector.zabbix_bridge.timestamp_drift %d %s" % (sample_last_ts, (sample_last_ts - r['clock'])))
                            if (key_lookup_miss - last_key_lookup_miss) > settings['dbrefresh']:
                                print("tcollector.zabbix_bridge.key_lookup_miss_reload %d %s" % (
                                    sample_last_ts, (key_lookup_miss - last_key_lookup_miss)))
                                cachecur.execute('DROP TABLE zabbix_cache')
                                cachecur.execute('CREATE TABLE zabbix_cache AS SELECT * FROM dbfile.zabbix_cache')
                                # Dropping the table drops its index too;
                                # recreate it so id lookups stay indexed
                                # after a reload, as at startup.
                                cachecur.execute('CREATE UNIQUE INDEX uniq_zid on zabbix_cache (id)')
                                last_key_lookup_miss = key_lookup_miss
                    else:
                        # TODO: Consider https://wiki.python.org/moin/PythonDecoratorLibrary#Retry
                        utils.err("error: Key lookup miss for %s" % (itemid))
                        key_lookup_miss += 1
                sys.stdout.flush()

    dbcache.close()
    stream.close()
def get_dbname(sockfile):
    """Returns the name of the DB based on the path to the socket file.

    Args:
        sockfile: Path to a MySQL socket file.

    Returns:
        "default" for any path listed in DEFAULT_SOCKFILES, the ``<name>``
        part of a ``/mysql-<name>/<file>.sock`` path otherwise, or None
        (after logging) when the path matches neither form.
    """
    if sockfile in DEFAULT_SOCKFILES:
        return "default"
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    m = re.search(r"/mysql-(.+)/[^.]+\.sock$", sockfile)
    if not m:
        utils.err("error: couldn't guess the name of the DB for " + sockfile)
        return None
    return m.group(1)
def loop(self):
    """Drop privileges, then call ``self.emit()`` every ``self.delay`` seconds.

    Returns:
        13 when the json module is missing (asks tcollector not to respawn
        us). Otherwise the loop never returns.
    """
    utils.drop_privileges()
    if json is not None:
        while True:
            self.emit()
            time.sleep(self.delay)

    utils.err("This collector requires the `json' Python module.")
    return 13  # Ask tcollector not to respawn us
def validate_config():
    """Exit with status 13 unless the CloudWatch collector is usable.

    The collector counts as unconfigured when either credential still holds
    its placeholder value and no AWS profile is set. It must also be enabled
    in the configuration module.
    """
    aws_profile = aws_cloudwatch_conf.get_aws_profile()
    access_key, secret_access_key = aws_cloudwatch_conf.get_accesskey_secretkey()

    has_placeholder = (access_key == '<access_key_id>'
                       or secret_access_key == '<secret_access_key>')
    if has_placeholder and aws_profile is None:
        utils.err("Cloudwatch Collector is not configured\n")
        sys.exit(13)

    if aws_cloudwatch_conf.enabled:
        return
    utils.err("Cloudwatch Collector is not enabled\n")
    sys.exit(13)
def collect(db):
    """Collects and prints stats.

    Here we collect only general info, for full list of data for collection
    see http://www.postgresql.org/docs/9.2/static/monitoring-stats.html

    Args:
        db: Open database connection providing ``cursor()``.

    Returns:
        2 when writing to stdout fails with a broken pipe, None otherwise.
        Other environment, EOF, runtime or socket errors are logged.
    """
    try:
        cursor = db.cursor()

        # general statics
        cursor.execute(
            "SELECT pg_stat_database.*, pg_database_size"
            " (pg_database.datname) AS size FROM pg_database JOIN"
            " pg_stat_database ON pg_database.datname ="
            " pg_stat_database.datname WHERE pg_stat_database.datname"
            " NOT IN ('template0', 'template1', 'postgres')")
        ts = time.time()
        stats = cursor.fetchall()

        # datid | datname | numbackends | xact_commit | xact_rollback | blks_read | blks_hit | tup_returned | tup_fetched | tup_inserted | tup_updated | tup_deleted | conflicts | temp_files | temp_bytes | deadlocks | blk_read_time | blk_write_time | stats_reset | size
        result = {}
        for stat in stats:
            database = stat[1]
            result[database] = stat

        for database in result:
            # Columns 0 and 1 are datid and datname; metrics start at 2.
            for i in range(2, len(cursor.description)):
                metric = cursor.description[i].name
                value = result[database][i]
                # Compare for equality: `metric in ("stats_reset")` was a
                # substring test against a plain string, not a tuple.
                if metric == "stats_reset":
                    continue
                try:
                    print("postgresql.%s %i %s database=%s"
                          % (metric, ts, value, database))
                except EnvironmentError:
                    # Let I/O errors reach the outer handler so a broken
                    # pipe is detected instead of being swallowed here.
                    raise
                except Exception:
                    utils.err("got here")
                    continue

        # connections
        cursor.execute("SELECT datname, count(datname) FROM pg_stat_activity"
                       " GROUP BY pg_stat_activity.datname")
        ts = time.time()
        connections = cursor.fetchall()

        for database, connection in connections:
            print("postgresql.connections %i %s database=%s"
                  % (ts, connection, database))

    except (EnvironmentError, EOFError, RuntimeError, socket.error) as e:
        if isinstance(e, IOError) and e.errno == errno.EPIPE:
            # exit on a broken pipe. There is no point in continuing
            # because no one will read our stdout anyway.
            return 2
        utils.err("error: failed to collect data: %s" % e)
def main(args):
    """Emit HBaseMaster metrics every 90 seconds.

    Args:
        args: Command-line arguments (unused).

    Returns:
        13 when the json module is missing (asks tcollector not to respawn
        us). Otherwise the emit loop never returns.
    """
    utils.drop_privileges()
    if json is not None:
        service = HBaseMaster()
        while True:
            service.emit()
            time.sleep(90)

    utils.err("This collector requires the `json' Python module.")
    return 13  # Ask tcollector not to respawn us
def main(args):
    """Emit HadoopResourceManager metrics every 90 seconds.

    Args:
        args: Command-line arguments (unused).

    Returns:
        13 when the json module is missing (asks tcollector not to respawn
        us). Otherwise the emit loop never returns.
    """
    utils.drop_privileges()

    json_missing = json is None
    if json_missing:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us

    resource_manager = HadoopResourceManager()
    pause_seconds = 90
    while True:
        resource_manager.emit()
        time.sleep(pause_seconds)
def validate_line_parses(line):
    """Check that ``line`` has the form ``metric timestamp value [tag=value ...]``.

    Args:
        line: Candidate output line.

    Returns:
        True when the line matches; False (after logging the line) otherwise.
    """
    # Raw strings keep \s, \d and \S as regex escapes instead of invalid
    # string escape sequences.
    parsed = re.match(r'^([-_./a-zA-Z0-9]+)\s+'  # Metric name.
                      r'(\d+\.?\d+)\s+'  # Timestamp.
                      r'(\S+?)'  # Value (int or float).
                      r'((?:\s+[-_./a-zA-Z0-9]+=[-_./a-zA-Z0-9]+)*)$',  # Tags
                      line)
    if parsed is None:
        utils.err("invalid data: %s \n" % (line))
        return False
    return True
def main(args):
    """Emit HadoopNameNode metrics every 90 seconds.

    Args:
        args: Command-line arguments (unused).

    Returns:
        13 when the json module is missing (asks tcollector not to respawn
        us). Otherwise the emit loop never returns.
    """
    utils.drop_privileges()
    if json is not None:
        namenode = HadoopNameNode()
        while True:
            namenode.emit()
            time.sleep(90)

    utils.err("This collector requires the `json' Python module.")
    return 13  # Ask tcollector not to respawn us
def collect():
    """Collects HTTP latencies in milliseconds from a list of ports in configuration.

    One line ``<metric> <timestamp> <latency>`` is printed per configured
    URL. A failure for one URL is logged and does not stop the remaining
    URLs from being measured.
    """
    ts = time.time()
    try:
        # .items() works on Python 2 and 3; iteritems() is Python 2 only.
        targets = httpconf.urls().items()
    except Exception as e:
        utils.err("error: something wrong happened in http: %s" % e)
        return

    for metric, url in targets:
        try:
            response = requests.get(url)
            latency = response.elapsed.total_seconds() * 1000
            print("%s %i %f" % (metric, ts, latency))
        except Exception as e:
            # Best effort per URL: report it and move on to the next one.
            utils.err("error: something wrong happened in http: %s" % e)
def postgres_connect(sockdir):
    """Connects to the PostgreSQL server using the specified socket file.

    Args:
        sockdir: Value passed to psycopg2 as the ``host`` parameter.

    Returns:
        The connection returned by ``psycopg2.connect``, or None when an
        environment, EOF, runtime or socket error occurs (it is logged).
    """
    user, password = postgresqlconf.get_user_password()

    try:
        # Keyword arguments replace the hand-built connection string, whose
        # placeholders no longer matched its four arguments. They also avoid
        # quoting problems with special characters in the credentials.
        return psycopg2.connect(host=sockdir,
                                user=user,
                                password=password,
                                connect_timeout=CONNECT_TIMEOUT,
                                dbname="postgres")
    except (EnvironmentError, EOFError, RuntimeError, socket.error) as e:
        utils.err("Couldn't connect to DB :%s" % (e))
def main(args):
    """Emit HadoopDataNode metrics every 15 seconds.

    Args:
        args: Command-line arguments (unused).

    Returns:
        13 when the json module is missing (asks tcollector not to respawn
        us). Otherwise the emit loop never returns.
    """
    utils.drop_privileges()

    json_missing = json is None
    if json_missing:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us

    datanode = HadoopDataNode()
    pause_seconds = 15
    while True:
        datanode.emit()
        time.sleep(pause_seconds)
def collect(db):
    """Collects and prints stats.

    Here we collect only general info, for full list of data for collection
    see http://www.postgresql.org/docs/9.2/static/monitoring-stats.html

    Args:
        db: Open database connection providing ``cursor()``.

    Returns:
        2 when writing to stdout fails with a broken pipe, None otherwise.
        Other environment, EOF, runtime or socket errors are logged.
    """
    try:
        cursor = db.cursor()

        # general statics
        cursor.execute("SELECT pg_stat_database.*, pg_database_size"
                       " (pg_database.datname) AS size FROM pg_database JOIN"
                       " pg_stat_database ON pg_database.datname ="
                       " pg_stat_database.datname WHERE pg_stat_database.datname"
                       " NOT IN ('template0', 'template1', 'postgres')")
        ts = time.time()
        stats = cursor.fetchall()

        # datid | datname | numbackends | xact_commit | xact_rollback | blks_read | blks_hit | tup_returned | tup_fetched | tup_inserted | tup_updated | tup_deleted | conflicts | temp_files | temp_bytes | deadlocks | blk_read_time | blk_write_time | stats_reset | size
        result = {}
        for stat in stats:
            database = stat[1]
            result[database] = stat

        for database in result:
            # Columns 0 and 1 are datid and datname; metrics start at 2.
            for i in range(2, len(cursor.description)):
                metric = cursor.description[i].name
                value = result[database][i]
                # Compare for equality: `metric in ("stats_reset")` was a
                # substring test against a plain string, not a tuple.
                if metric == "stats_reset":
                    continue
                try:
                    print("postgresql.%s %i %s database=%s"
                          % (metric, ts, value, database))
                except EnvironmentError:
                    # Let I/O errors reach the outer handler so a broken
                    # pipe is detected instead of being swallowed here.
                    raise
                except Exception:
                    # Log through utils.err like the rest of this function.
                    utils.err("got here")
                    continue

        # connections
        cursor.execute("SELECT datname, count(datname) FROM pg_stat_activity"
                       " GROUP BY pg_stat_activity.datname")
        ts = time.time()
        connections = cursor.fetchall()

        for database, connection in connections:
            print("postgresql.connections %i %s database=%s"
                  % (ts, connection, database))

    except (EnvironmentError, EOFError, RuntimeError, socket.error) as e:
        if isinstance(e, IOError) and e.errno == errno.EPIPE:
            # exit on a broken pipe. There is no point in continuing
            # because no one will read our stdout anyway.
            return 2
        utils.err("error: failed to collect data: %s" % e)
def scan_zk_instances():
    """Finding out all the running instances of zookeeper.

    - Using netstat, finds out all listening java processes.
    - Figures out ZK instances among java processes by looking for the
      string "org.apache.zookeeper.server.quorum.QuorumPeerMain" in cmdline.
    - Sends "ruok" to each candidate and keeps those answering "imok".

    Returns:
        A list of ``[ip, port, tcp_version]`` entries, one per responding
        instance; empty when netstat cannot be run.
    """
    instances = []
    try:
        listen_sock = subprocess.check_output(["netstat", "-lnpt"],
                                              stderr=subprocess.PIPE)
    except (OSError, subprocess.CalledProcessError):
        # OSError covers a netstat binary that cannot be found or executed.
        utils.err("netstat directory doesn't exist in PATH variable")
        return instances

    for line in listen_sock.split("\n"):
        if "java" not in line:
            continue

        fields = line.split()
        if len(fields) < 7:
            continue
        tcp_version = fields[0]
        m = re.match(r"(.+):(\d+)", fields[3])
        pid_text = fields[6].split("/")[0]
        if m is None or not pid_text.isdigit():
            # Unparsable address or hidden PID: skip the line.
            continue
        ip = m.group(1)
        port = int(m.group(2))
        pid = int(pid_text)

        try:
            with open("/proc/%d/cmdline" % pid) as fd:
                cmdline = fd.readline()
        except IOError:
            continue  # process went away or is not readable
        if "org.apache.zookeeper.server.quorum.QuorumPeerMain" not in cmdline:
            continue

        if tcp_version == "tcp6":
            family = socket.AF_INET6
        else:
            family = socket.AF_INET

        data = ""
        sock = None
        try:
            sock = socket.socket(family, socket.SOCK_STREAM)
            sock.settimeout(0.5)
            sock.connect((ip, port))
            sock.send("ruok\n")
            data = sock.recv(1024)
        except (EnvironmentError, socket.error):
            # Best-effort probe: an unreachable instance is simply skipped.
            pass
        finally:
            if sock is not None:
                sock.close()

        if data == "imok":
            instances.append([ip, port, tcp_version])

    return instances
def find_conf_file(pid):
    """Returns config file for couchbase-server.

    The path is the first double-quoted string that follows ``config_path``
    in ``/proc/<pid>/cmdline``.

    Returns:
        The extracted path, or None (after logging) when the cmdline file
        cannot be opened.
    """
    try:
        cmdline_file = open('/proc/%s/cmdline' % pid)
    except IOError as e:
        utils.err("Couchbase (pid %s) went away ? %s" % (pid, e))
        return

    # The context manager closes the file even if parsing raises.
    with cmdline_file:
        after_marker = cmdline_file.read().split("config_path")[1]
        quoted_parts = after_marker.split('"')
        return quoted_parts[1]
def connect_socket(tcp_version, port):
    """Open a TCP connection to the loopback address on ``port``.

    Args:
        tcp_version: "tcp6" selects IPv6 (``::1``); any other value selects
            IPv4 (``127.0.0.1``).
        port: TCP port to connect to.

    Connection errors are logged, not raised.
    """
    sock = None
    if tcp_version == "tcp6":
        sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
        ipaddr = '::1'
    else:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ipaddr = '127.0.0.1'
    try:
        sock.connect((ipaddr, port))
    except Exception as err:  # "as" form works on Python 2.6+ and Python 3
        utils.err(err)
    # NOTE(review): the socket is never returned or closed in the visible
    # code -- confirm whether `return sock` is missing here.
def get_role_status():
    """Query the local role checker and report its answer.

    Runs ``curl localhost:3300/checkSlave`` and logs the outcome.

    Returns:
        1 when the command succeeds and its output does not contain "not"
        (case-insensitive); 0 in every other case.
    """
    ms_checker_host = "localhost:3300"
    check_slave_cmd = "curl " + ms_checker_host + "/checkSlave"
    status, output = commands.getstatusoutput(check_slave_cmd)

    if output == "" or status != 0:
        utils.err("Error checking mysql role, status %s" % status)
        return 0

    utils.err("INFO: status msg: %s" % output)
    if "not" in output.lower():
        return 0
    return 1
def scan_zk_instances():
    """Finding out all the running instances of zookeeper.

    - Using netstat, finds out all listening java processes.
    - Figures out ZK instances among java processes by looking for the
      string "org.apache.zookeeper.server.quorum.QuorumPeerMain" in cmdline.
    - Sends "ruok" over the loopback address to each candidate and keeps
      those answering "imok".

    Returns:
        A list of ``[ip, port, tcp_version]`` entries, one per responding
        instance, where ``ip`` is the loopback address that was probed;
        empty when netstat cannot be run.
    """
    instances = []
    try:
        listen_sock = subprocess.check_output(["netstat", "-lnpt"],
                                              stderr=subprocess.PIPE)
    except (OSError, subprocess.CalledProcessError):
        # OSError covers a netstat binary that cannot be found or executed.
        utils.err("netstat directory doesn't exist in PATH variable")
        return instances

    for line in listen_sock.split("\n"):
        if "java" not in line:
            continue

        fields = line.split()
        if len(fields) < 7:
            continue
        tcp_version = fields[0]
        m = re.match(r"(.+):(\d+)", fields[3])
        pid_text = fields[6].split("/")[0]
        if m is None or not pid_text.isdigit():
            # Unparsable address or hidden PID: skip the line.
            continue
        ip = m.group(1)
        port = int(m.group(2))
        pid = int(pid_text)

        try:
            with open("/proc/%d/cmdline" % pid) as fd:
                cmdline = fd.readline()
        except IOError:
            continue  # process went away or is not readable
        if "org.apache.zookeeper.server.quorum.QuorumPeerMain" not in cmdline:
            continue

        # Always probe through loopback, using the matching address family.
        if tcp_version == "tcp6" or ip == "::":
            family, ip = socket.AF_INET6, "::1"
        else:
            family, ip = socket.AF_INET, "127.0.0.1"

        data = ""
        sock = None
        try:
            sock = socket.socket(family, socket.SOCK_STREAM)
            sock.settimeout(0.5)
            sock.connect((ip, port))
            sock.send("ruok\n")
            data = sock.recv(1024)
        except (EnvironmentError, socket.error):
            # Best-effort probe: an unreachable instance is simply skipped.
            pass
        finally:
            if sock is not None:
                sock.close()

        if data == "imok":
            instances.append([ip, port, tcp_version])

    return instances
def process_gc_log(collector):
    """Process the GC log lines appended since the previous call.

    Args:
        collector: Mutable dict of collector state. Keys read here:
            ``prefix``, ``log_dir``, ``log_name_pattern``, ``current_file``,
            ``current_file_pos``, ``timestamp``, ``gensize`` and ``count``.
            ``current_file`` and ``current_file_pos`` are updated in place.

    Returns:
        None when a new log file is seen for the first time (only its end
        position is recorded), 0 otherwise. Exceptions raised while
        processing are logged with their traceback, not propagated.
    """
    prefix = collector['prefix']

    # get latest gc log to process
    gc_log = get_latest_gc_log(collector['log_dir'], collector['log_name_pattern'])

    # update current_file and current_file_pos if this is the first time to
    # process the gc log
    if collector['current_file'] != gc_log:
        collector['current_file'] = gc_log
        with open(gc_log, 'rb') as file_handler:
            collector['current_file_pos'] = get_file_end(file_handler)
        return

    try:
        with open(gc_log, 'rb') as file_handler:
            pos = collector['current_file_pos']
            collector['current_file_pos'] = get_file_end(file_handler)
            file_handler.seek(pos)

            # Do not use foreach loop because inside function process_gc_record
            # will call file_handler.readline(). The reason is that some GC
            # event are multiline and need to be processed as a whole
            while True:
                line = file_handler.readline()
                if len(line) == 0:
                    break
                pattern_name, matcher = match_pattern(line)
                if pattern_name == GC_START_TIME_PATTERN:
                    year, month, day, hour, minute, second, timezone = [
                        int(matcher.group(i)) for i in range(1, 8)]
                    cause = matcher.group(8)
                    timestamp = true_unix_timestamp(year, month, day, hour,
                                                    minute, second, timezone)
                    process_gc_record(prefix, file_handler, timestamp, cause,
                                      collector)
                else:
                    unmatched_gc_log(line)

        current_timestamp_in_sec = int(time.time())

        # Single-argument print(...) behaves the same on Python 2 and 3.
        if collector['timestamp'] is not None:
            for gen, value in collector['gensize'].items():
                print("%s.gc.g1.gensize %s %s gen=%s" % (
                    prefix, current_timestamp_in_sec, value, gen))

        # publish gc event count metrics
        for event, value in collector['count'].items():
            print("%s.gc.g1.event.count %s %s event=%s" % (
                prefix, current_timestamp_in_sec, value, event))
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        utils.err(''.join(
            traceback.format_exception(exc_type, exc_value, exc_traceback)))

    return 0
def find_bindir_path(config_file):
    """Returns the bin directory path.

    Looks for the first line starting with ``{path_config_bindir`` and
    returns the first double-quoted string after its first comma.

    Returns:
        The extracted path, or None when the file cannot be opened (the
        error is logged) or contains no such line.
    """
    try:
        config = open(config_file)
    except IOError as e:
        utils.err("Error for Config file (%s): %s" % (config_file, e))
        return None

    # The context manager closes the file on every exit path.
    with config:
        for entry in config:
            if not entry.startswith("{path_config_bindir"):
                continue
            value_part = entry.split(",")[1]
            return value_part.split('"')[1]
def main(args):
    """Collects and dumps stats from a PostgreSQL server.

    Args:
        args: Command-line arguments (unused).

    Returns:
        13 when the database connection cannot be set up (asks tcollector
        to not respawn us). Otherwise the collection loop never returns.
    """
    try:
        connection = postgresqlutils.connect()
    except Exception as e:
        utils.err("error: Could not initialize collector : %s" % (e))
        return 13  # Ask tcollector to not respawn us
    else:
        while True:
            collect(connection)
            sys.stdout.flush()
            time.sleep(COLLECTION_INTERVAL)
def process_domain(domain, pid):
    """Process one domain (vm).

    Args:
        domain: libvirt domain object.
        pid: PID associated with the domain; may be falsy when unknown.

    Returns:
        True when the metrics were gathered and printed. False when the
        domain is skipped (inactive, no PID, PID gone) or when gathering
        raises LibvirtVmDataError.
    """
    # Decide whether the vm has to be skipped, and why.
    if domain.isActive() != 1:
        skip_reason = "Domain %s is inactive. Skipping." % domain.name()
    elif not pid:
        skip_reason = "Cannot find PID for domain %s. Skipping." % domain.name()
    elif not psutil.pid_exists(pid):
        skip_reason = ("PID %d no longer exists for domain %s. Skipping."
                       % (pid, domain.name()))
    else:
        skip_reason = None

    if skip_reason is not None:
        utils.err(skip_reason)
        return False

    # populate vm structure with metrics
    try:
        record = {}
        record[FIELDS["cpu_time"]] = get_cpu_time(pid)
        record[FIELDS["cpu_load"]] = get_cpu_load(pid)
        record[FIELDS["memory"]] = get_memory(domain)
        record[FIELDS["max_memory"]] = domain.maxMemory()
        record[FIELDS["max_vcpus"]] = domain.maxVcpus()

        description = BeautifulSoup(domain.XMLDesc())
        record[TAG_DEPLOY_ID] = domain.name()
        record[TAG_TYPE] = get_type(domain, description)
        record.update(get_network_traffic(domain, description))
        record.update(get_disk_io(domain, description))
    except LibvirtVmDataError as err:
        utils.err(err.value)
        return False

    print_vm(record)
    return True