def parse_single_line(self, inode):
    """ apply socket-specific parsing rules """
    result = None
    (socket_type, line) = self.sockets[inode]
    if socket_type == 'unix':
        # we are interested in everything in the last field
        # note that it may contain spaces or other separator characters
        fields = line.split(None, self.unix_socket_header_len - 1)
        socket_path = fields[-1]
        # check that it looks like a PostgreSQL socket
        match = re.search(r'(.*?)/\.s\.PGSQL\.(\d+)$', socket_path)
        if match:
            # path - port
            result = (socket_type,) + match.groups(1)
        else:
            logger.warning('unix socket name is not recognized as belonging to PostgreSQL: {0}'.format(socket_path))
    else:
        address_port = line.split()[1]
        (address_hex, port_hex) = address_port.split(':')
        port = self._hex_to_int_str(port_hex)
        if socket_type == 'tcp6':
            address = self._hex_to_ipv6(address_hex)
        elif socket_type == 'tcp':
            address = self._hex_to_ip(address_hex)
        else:
            logger.error('unrecognized socket type: {0}'.format(socket_type))
        result = (socket_type, address, port)
    return result
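# Illustrative sketch (not part of the original module): how the tcp/tcp6 branch of
# parse_single_line() above decodes a /proc/net/tcp entry. The second whitespace-separated
# field is "address:port", both parts in hex, with the IPv4 address in little-endian byte
# order, so '0100007F:1538' stands for 127.0.0.1:5432. The function below is a hypothetical
# stand-in that only mirrors what _hex_to_ip() and _hex_to_int_str() are expected to do.
def _example_decode_tcp_entry(address_port='0100007F:1538'):
    address_hex, port_hex = address_port.split(':')
    port = int(port_hex, 16)  # 5432
    # reverse the byte order to obtain the dotted-quad form
    octets = [str(int(address_hex[i:i + 2], 16)) for i in (6, 4, 2, 0)]
    return '.'.join(octets), port  # ('127.0.0.1', 5432)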
def read_configuration(config_file_name):
    # read PostgreSQL connection options
    config_data = {}
    if not config_file_name:
        return None
    config = ConfigParser.ConfigParser()
    f = config.read(config_file_name)
    if not f:
        logger.error('Configuration file {0} is empty or not found'.format(config_file_name))
        return None
    # get through all defined databases
    for section in config.sections():
        config_data[section] = {}
        for argname in ('port', 'host', 'user', 'dbname'):
            try:
                val = config.get(section, argname)
            except ConfigParser.NoOptionError:
                val = None
            # might happen also if the option is there, but the value is not set
            if val is not None:
                config_data[section][argname] = val
    return config_data
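# Example (assumed layout, not taken from the original sources) of the INI-style file that
# read_configuration() consumes: one section per monitored instance, each option among
# port, host, user and dbname optional. The file name is hypothetical.
#
#   [production]
#   host = /var/run/postgresql
#   port = 5432
#   user = postgres
#   dbname = postgres
#
# read_configuration('pg_view.conf') would then return
# {'production': {'host': '/var/run/postgresql', 'port': '5432',
#                 'user': 'postgres', 'dbname': 'postgres'}}
# (ConfigParser.get() returns strings, so the port stays a string here).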
def read_socket_file(self, filename):
    """ read file content, produce a dict of socket inode -> line """
    socket_type = filename.split('/')[-1]
    try:
        with open(filename) as fp:
            data = fp.readlines()
    except os.error as e:
        logger.error('unable to read from {0}: OS reported {1}'.format(filename, e))
        # nothing to parse if the file could not be read
        return
    # remove the header
    header = (data.pop(0)).split()
    if socket_type == 'unix':
        self.unix_socket_header_len = len(header)
    indexes = [i for i, name in enumerate(header) if name.lower() == 'inode']
    if len(indexes) != 1:
        logger.error('attribute \'inode\' in the header of {0} is not unique or missing: {1}'.format(filename, header))
    else:
        inode_idx = indexes[0]
        if socket_type != 'unix':
            # for a tcp socket, 2 pairs of fields (tx_queue:rx_queue and tr:tm->when)
            # are separated by colons and not spaces
            inode_idx -= 2
        for line in data:
            fields = line.split()
            inode = int(fields[inode_idx])
            self.sockets[inode] = [socket_type, line]
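# For reference (sample data, values made up): the first line of /proc/net/unix is a header
# whose token count becomes unix_socket_header_len, and the 'Inode' column is located by
# name rather than by position:
#
#   Num       RefCount Protocol Flags    Type St Inode Path
#   ffff8800c6c04440: 00000002 00000000 00010000 0001 01 16048 /tmp/.s.PGSQL.5432
#
# In /proc/net/tcp and /proc/net/tcp6 the header lists tx_queue, rx_queue, tr and tm->when
# as four separate words, while the data rows join them into two colon-separated fields
# (tx_queue:rx_queue and tr:tm->when); that is why read_socket_file() shifts the inode
# index left by two for the non-unix files.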
def detect_db_connection_arguments(work_directory, pid, version, username, dbname):
    """ Try to detect database connection arguments from the postmaster.pid
        We do this by first extracting useful information from postmaster.pid,
        next reading the postgresql.conf if necessary and, at last,
    """
    conn_args = detect_with_proc_net(pid)
    if not conn_args:
        # if we failed to detect the arguments via the /proc/net/ readings,
        # perhaps we'll get better luck with just peeking into postmaster.pid.
        conn_args = detect_with_postmaster_pid(work_directory, version)
        if not conn_args:
            logger.error('unable to detect connection parameters for the PostgreSQL cluster at {0}'.format(work_directory))
            return None
    # try all acquired connection arguments, starting from unix, then tcp, then tcp over ipv6
    result = pick_connection_arguments(conn_args, username, dbname)
    if len(result) == 0:
        logger.error('unable to connect to PostgreSQL cluster at {0} using any of '
                     'the detected connection options: {1}'.format(work_directory, conn_args))
        return None
    return result
def _read_cpus():
    cpus = 0
    try:
        cpus = cpu_count()
    except:
        logger.error('multiprocessing does not support cpu_count')
    return {'cores': cpus}
def get_io_data(pnames):
    """ Retrieve raw data from /proc/diskstats (transformations are performed via io_list_transformation) """
    result = {}
    found = 0  # stop if we found records for all partitions
    total = len(pnames)
    fp = None
    try:
        fp = open(PartitionStatCollector.DISK_STAT_FILE, 'rU')
        for line in fp:
            elements = line.split()
            for pname in pnames:
                if pname in elements:
                    result[pname] = elements
                    found += 1
                    if found == total:
                        break
            if found == total:
                break
    except Exception:
        logger.error('Unable to read {0}'.format(PartitionStatCollector.DISK_STAT_FILE))
        result = {}
    finally:
        fp and fp.close()
    return result
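# Illustrative sketch (sample line with made-up values, not from the original sources):
# a /proc/diskstats record starts with major, minor and the device name, so a simple
# membership test on the split fields is enough to find a partition and keep the whole
# record for the later io_list_transformation step.
def _example_diskstats_match():
    line = '   8       1 sda1 1295 421 91790 1250 212 710 7384 990 0 1580 2240'
    elements = line.split()
    return 'sda1' in elements  # True: get_io_data() would store the full field list for 'sda1'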
def validate_list_out(l):
    """ If the list element doesn't supply an out column - remove it """
    # iterate over a copy, since removing elements from the list being looped over
    # would skip the entries immediately following the removed ones
    for col in list(l):
        if 'out' not in col:
            el = l.pop(l.index(col))
            logger.error('Removed {0} column because it did not specify out value'.format(el))
def detect_with_proc_net(pid):
    inodes = fetch_socket_inodes_for_process(pid)
    parser = ProcNetParser()
    result = parser.match_socket_inodes(inodes)
    if not result or len(result) == 0:
        logger.error('could not detect connection string from /proc/net for postgres process {0}'.format(pid))
        return None
    return result
def _load_avg_status(row, col, val, bound):
    if val is not None:
        loads = str(val).split()
        if len(loads) != 3:
            logger.error('load average value is not in the "1min 5min 15min" format')
        for x in loads:
            f = float(x)
            if f > bound:
                return True
    return False
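# Example inputs (hypothetical values) for the check above: the value is expected to carry
# the 1, 5 and 15 minute load averages as a single whitespace-separated string, and the
# function reports True as soon as any of them exceeds the bound.
#
#   _load_avg_status(row, col, '0.25 1.80 5.10', 4.0)  ->  True   (5.10 > 4.0)
#   _load_avg_status(row, col, '0.25 1.80 2.10', 4.0)  ->  False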
def _read_proc(self, pid, is_backend, is_active):
    """ see man 5 proc for details (/proc/[pid]/stat) """
    result = {}
    raw_result = {}
    fp = None
    # read raw data from /proc/[pid]/stat, /proc/[pid]/cmdline and /proc/[pid]/io
    for ftyp, fname in zip(('stat', 'cmd', 'io'),
                           ('/proc/{0}/stat', '/proc/{0}/cmdline', '/proc/{0}/io')):
        try:
            fp = open(fname.format(pid), 'rU')
            if ftyp == 'stat':
                raw_result[ftyp] = fp.read().strip().split()
            if ftyp == 'cmd':
                # large number of trailing \x00 returned by python
                raw_result[ftyp] = fp.readline().strip('\x00').strip()
            if ftyp == 'io':
                proc_stat_io_read = {}
                for line in fp:
                    x = [e.strip(':') for e in line.split()]
                    if len(x) < 2:
                        logger.error('{0} content not in the "name: value" form: {1}'.format(fname.format(pid), line))
                        continue
                    else:
                        proc_stat_io_read[x[0]] = int(x[1])
                raw_result[ftyp] = proc_stat_io_read
        except IOError:
            logger.warning('Unable to read {0}, process data will be unavailable'.format(fname.format(pid)))
            return None
        finally:
            fp and fp.close()
    # Assume we managed to read the row if we can get its PID
    for cat in 'stat', 'io':
        result.update(self._transform_input(raw_result.get(cat, {} if cat == 'io' else [])))
    # generated columns
    result['cmdline'] = raw_result.get('cmd', None)
    if not is_backend:
        result['type'], action = self._get_psinfo(result['cmdline'])
        if action:
            result['query'] = action
    else:
        result['type'] = 'backend'
    if is_active or not is_backend:
        result['uss'] = self._get_memory_usage(pid)
    return result
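# Sample /proc/[pid]/io content (illustrative excerpt) for the 'io' branch of _read_proc()
# above; each line is "name: value" and is folded into a dict of integers:
#
#   rchar: 323934931
#   wchar: 323929600
#   read_bytes: 319488
#   write_bytes: 323932160
#
# The 'stat' branch, by contrast, receives /proc/[pid]/stat as one whitespace-separated
# field list (see man 5 proc), and the 'cmd' branch gets the command line with its
# NUL separators stripped.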
def _read_uptime(self):
    fp = None
    raw_result = []
    try:
        fp = open(HostStatCollector.UPTIME_FILE, 'rU')
        raw_result = fp.read().split()
    except:
        logger.error('Unable to read uptime from {0}'.format(HostStatCollector.UPTIME_FILE))
    finally:
        fp and fp.close()
    return self._transform_input(raw_result, self.transform_uptime_data)
def read_postmaster_pid(work_directory, dbname):
    """ Parses the postgres directory tree and extracts the pid of the postmaster process """
    fp = None
    try:
        fp = open('{0}/postmaster.pid'.format(work_directory))
        pid = fp.readline().strip()
    except:
        # XXX: do not bail out in case we are collecting data for multiple PostgreSQL clusters
        logger.error('Unable to read postmaster.pid for {name} at {wd}\n'
                     ' HINT: make sure Postgres is running'.format(name=dbname, wd=work_directory))
        return None
    finally:
        if fp is not None:
            fp.close()
    return pid
def get_du_data(self, wd):
    data_size = 0
    xlog_size = 0
    result = {'data': [], 'xlog': []}
    try:
        data_size = self.run_du(wd, BLOCK_SIZE)
        xlog_size = self.run_du(wd + '/pg_xlog/', BLOCK_SIZE)
    except Exception as e:
        logger.error('Unable to read free space information for the pg_xlog and data directories '
                     'for the directory {0}: {1}'.format(wd, e))
    else:
        # XXX: why do we pass the block size there?
        result['data'] = str(data_size), wd
        result['xlog'] = str(xlog_size), wd + '/pg_xlog'
    return result
def get_du_data(self, wd):
    data_size = 0
    xlog_size = 0
    result = {'data': [], 'xlog': []}
    try:
        data_size = self.run_du(wd, BLOCK_SIZE)
        xlog_size = self.run_du(wd + self.wal_directory, BLOCK_SIZE)
    except Exception as e:
        logger.error('Unable to read free space information for the pg_xlog and data directories '
                     'for the directory {0}: {1}'.format(wd, e))
    else:
        # XXX: why do we pass the block size there?
        result['data'] = str(data_size), wd
        result['xlog'] = str(xlog_size), wd + self.wal_directory
    return result
def do_loop(screen, groups, output_method, collectors, consumer):
    """ Display output (or pass it through to ncurses) """
    if output_method == OUTPUT_METHOD.curses:
        if screen is None:
            logger.error('No parent screen is passed to the curses application')
            sys.exit(1)
        else:
            # initialize the curses output class.
            output = CursesOutput(screen)
            if not output.is_color_supported:
                logger.error('Curses output requires a terminal that supports color')
                sys.exit(1)
    else:
        output = CommonOutput()
    while 1:
        # process input:
        consumer.consume()
        for st in collectors:
            if output_method == OUTPUT_METHOD.curses and not poll_keys(screen, output):
                # bail out immediately
                return
            st.set_units_display(flags.display_units)
            st.set_ignore_autohide(not flags.autohide_fields)
            st.set_notrim(flags.notrim)
            process_single_collector(st)
            if output_method == OUTPUT_METHOD.curses and not poll_keys(screen, output):
                return
        if output_method == OUTPUT_METHOD.curses:
            process_groups(groups)
        # in the non-curses cases display actually shows the data and refresh
        # clears the screen, so we need to refresh before display to clear the old data.
        if options.clear_screen and output_method != OUTPUT_METHOD.curses:
            output.refresh()
        for st in collectors:
            output.display(st.output(output_method))
        # in the curses case, refresh shows the data queued by display
        if output_method == OUTPUT_METHOD.curses:
            output.refresh()
        if not flags.realtime:
            time.sleep(consts.TICK_LENGTH)
def cook_row(self, row, header, method):
    cooked_vals = []
    if not self.cook_function.get(method):
        return row
    if len(row) != len(header):
        logger.error('Unable to cook row with non-matching number of header and value columns: '
                     'row {0} header {1}'.format(row, header))
    cook_fn = self.cook_function[method]
    for no, val in enumerate(row):
        # it might be tempting to just get the column from output_transform_data using
        # the header, but it's wrong: see _produce_output_name for details. This, of
        # course, assumes the number of columns in the output_transform_data is the
        # same as in row: thus, we need to avoid filtering rows in the collector.
        newval = cook_fn(val, header[no], self.output_transform_data[no])
        cooked_vals.append(newval)
    return cooked_vals
def _read_proc_stat(self):
    """ see man 5 proc for details (/proc/stat). We don't parse cpu info here """
    raw_result = {}
    result = {}
    try:
        fp = open(SystemStatCollector.PROC_STAT_FILENAME, 'rU')
        # split /proc/stat into the name - value pairs
        for line in fp:
            elements = line.strip().split()
            if len(elements) > 2:
                raw_result[elements[0]] = elements[1:]
            elif len(elements) > 1:
                raw_result[elements[0]] = elements[1]
            # otherwise, the line is probably empty or bogus and should be skipped
        result = self._transform_input(raw_result)
    except IOError:
        logger.error('Unable to read {0}, global data will be unavailable'.format(self.PROC_STAT_FILENAME))
    return result
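# Sample /proc/stat fragment (illustrative values) for the parser above: multi-value lines
# such as 'cpu' are stored as lists, single-value lines such as 'ctxt' or 'procs_running'
# as scalars, and anything shorter is skipped:
#
#   cpu  84282 747 20805 1437558 2831 0 172 0 0 0
#   ctxt 53123243
#   btime 1508893507
#   procs_running 2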
def fetch_socket_inodes_for_process(pid):
    """ read /proc/[pid]/fd and get those that correspond to sockets """
    inodes = []
    fd_dir = '/proc/{0}/fd'.format(pid)
    if not os.access(fd_dir, os.R_OK):
        logger.warning("unable to read {0}".format(fd_dir))
    else:
        for link in glob.glob('{0}/*'.format(fd_dir)):
            if not os.access(link, os.F_OK):
                logger.warning("unable to access link {0}".format(link))
                continue
            try:
                target = os.readlink(link)
            except Exception:
                logger.error("couldn't read link {0}".format(link))
            else:
                # socket:[8430]
                match = re.search(r'socket:\[(\d+)\]', target)
                if match:
                    inodes.append(int(match.group(1)))
    return inodes
def detect_with_postmaster_pid(work_directory, version):
    # PostgreSQL 9.0 doesn't have enough data
    result = {}
    if version is None or version == 9.0:
        return None
    PID_FILE = '{0}/postmaster.pid'.format(work_directory)
    # try to access the socket directory
    if not os.access(work_directory, os.R_OK | os.X_OK):
        logger.warning('cannot access PostgreSQL cluster directory {0}: permission denied'.format(work_directory))
        return None
    try:
        with open(PID_FILE, 'rU') as fp:
            lines = fp.readlines()
    except os.error as e:
        logger.error('could not read {0}: {1}'.format(PID_FILE, e))
        return None
    if len(lines) < 6:
        logger.error('{0} seems to be truncated, unable to read connection information'.format(PID_FILE))
        return None
    port = lines[3].strip()
    unix_socket_path = lines[4].strip()
    if unix_socket_path != '':
        result['unix'] = [(unix_socket_path, port)]
    tcp_address = lines[5].strip()
    if tcp_address != '':
        if tcp_address == '*':
            tcp_address = '127.0.0.1'
        result['tcp'] = [(tcp_address, port)]
    if len(result) == 0:
        logger.error('could not acquire a socket postmaster at {0} is listening on'.format(work_directory))
        return None
    return result
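# Illustrative sketch (hypothetical values): postmaster.pid on 9.1+ is documented to contain
# the postmaster pid, the data directory, the start timestamp, the port, the unix socket
# directory and the first listen address on its first six lines, which is what the
# lines[3], lines[4] and lines[5] lookups in detect_with_postmaster_pid() rely on.
def _example_postmaster_pid_parse():
    lines = ['4124\n', '/var/lib/postgresql/9.6/main\n', '1508893507\n',
             '5432\n', '/var/run/postgresql\n', '*\n']
    port = lines[3].strip()              # '5432'
    unix_socket_path = lines[4].strip()  # '/var/run/postgresql'
    tcp_address = lines[5].strip()       # '*', which the code maps to 127.0.0.1
    return port, unix_socket_path, tcp_address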
def _read_memory_data():
    """ Read relevant data from /proc/meminfo. We are interested in the following fields:
        MemTotal, MemFree, Buffers, Cached, Dirty, CommitLimit, Committed_AS
    """
    result = {}
    fp = None
    try:
        fp = open(MemoryStatCollector.MEMORY_STAT_FILE, 'rU')
        for l in fp:
            vals = l.strip().split()
            if len(vals) >= 2:
                name, val = vals[:2]
                # if we have units of measurement different from kB - transform the result
                if len(vals) == 3 and vals[2] in ('mB', 'gB'):
                    if vals[2] == 'mB':
                        val += '0' * 3
                    if vals[2] == 'gB':
                        val += '0' * 6
                if len(str(name)) > 1:
                    result[str(name)[:-1]] = val
                else:
                    logger.error('name is too short: {0}'.format(str(name)))
            else:
                logger.error('/proc/meminfo string is not name value: {0}'.format(vals))
    except:
        logger.error('Unable to read /proc/meminfo memory statistics. Check your permissions')
        return result
    finally:
        if fp is not None:
            fp.close()
    return result
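# Illustrative sketch of the unit handling above: /proc/meminfo normally reports values in
# kB, so the mB/gB branch is defensive; padding the value string with zeroes converts such
# a value back to kB, and the trailing ':' is stripped from the field name. The sample line
# and its unit are hypothetical.
def _example_meminfo_normalization():
    vals = 'MemTotal: 7867 mB'.strip().split()
    name, val = vals[:2]
    if len(vals) == 3 and vals[2] == 'mB':
        val += '0' * 3
    return {name[:-1]: val}  # {'MemTotal': '7867000'}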
def establish_user_defined_connection(instance, conn, clusters):
    """ connect the database and get all necessary options like pid and work_directory
        we use port, host and socket_directory, preferring socket over TCP connections
    """
    # establish a new connection
    try:
        pgcon = psycopg2.connect(**conn)
    except Exception as e:
        logger.error('failed to establish connection to {0} via {1}'.format(instance, conn))
        logger.error('PostgreSQL exception: {0}'.format(e))
        return None
    # get the database version from the pgcon properties
    dbver = dbversion_as_float(pgcon)
    cur = pgcon.cursor()
    cur.execute('show data_directory')
    work_directory = cur.fetchone()[0]
    cur.close()
    pgcon.commit()
    # now, when we have the work directory, acquire the pid of the postmaster.
    pid = read_postmaster_pid(work_directory, instance)
    if pid is None:
        logger.error('failed to read pid of the postmaster on {0}'.format(conn))
        return None
    # check that we don't have the same pid already in the accumulated results.
    # for instance, a user may specify 2 different sets of connection options for
    # the same database (one for the unix_socket_directory and another for the host)
    pids = [opt['pid'] for opt in clusters if 'pid' in opt]
    if pid in pids:
        duplicate_instance = [opt['name'] for opt in clusters if 'pid' in opt and opt.get('pid', 0) == pid][0]
        logger.error('duplicate connection options detected for databases '
                     '{0} and {1}, same pid {2}, skipping {0}'.format(instance, duplicate_instance, pid))
        pgcon.close()
        return True
    # now we have all components to create a cluster descriptor
    desc = make_cluster_desc(name=instance, version=dbver, workdir=work_directory,
                             pid=pid, pgcon=pgcon, conn=conn)
    clusters.append(desc)
    return True
def warn_non_optional_column(colname):
    logger.error('Column {0} is not optional, but input row has no value for it'.format(colname))
def main():
    global options

    # bail out if we are not running Linux
    if platform.system() != 'Linux':
        print('Non-Linux database hosts are not supported at the moment. Cannot continue')
        sys.exit(243)

    if not psycopg2_available:
        print('Unable to import psycopg2 module, please, install it (python-psycopg2). Cannot continue')
        sys.exit(254)

    options, args = parse_args()
    consts.TICK_LENGTH = options.tick
    output_method = options.output_method

    if not output_method_is_valid(output_method):
        print('Unsupported output method: {0}'.format(output_method))
        print('Valid output methods are: {0}'.format(','.join(get_valid_output_methods())))
        sys.exit(1)

    if output_method == OUTPUT_METHOD.curses and not curses_available:
        print('Curses output is selected, but curses are unavailable, falling back to console output')
        output_method = OUTPUT_METHOD.console

    # set basic logging
    setup_logger(options)

    clusters = []
    config = read_configuration(options.config_file) if options.config_file else None
    dbversion = None
    # configuration file takes priority over the rest of database connection information sources.
    if config:
        for instance in config:
            if options.instance and instance != options.instance:
                continue
            # pass already acquired connections to make sure we only list unique clusters.
            host = config[instance].get('host')
            port = config[instance].get('port')
            conn = build_connection(host, port, config[instance].get('user'), config[instance].get('dbname'))
            if not establish_user_defined_connection(instance, conn, clusters):
                logger.error('failed to acquire details about '
                             'the database cluster {0}, the server will be skipped'.format(instance))
    elif options.host:
        # connect to the database using the connection string supplied from command-line
        conn = build_connection(options.host, options.port, options.username, options.dbname)
        instance = options.instance or "default"
        if not establish_user_defined_connection(instance, conn, clusters):
            logger.error("unable to continue with cluster {0}".format(instance))
    elif options.use_service and options.instance:
        # connect to the database using the service name
        if not establish_user_defined_connection(options.instance, {'service': options.instance}, clusters):
            logger.error("unable to continue with cluster {0}".format(options.instance))
    else:
        # do autodetection
        postmasters = get_postmasters_directories()
        # get all PostgreSQL instances
        for result_work_dir, data in postmasters.items():
            (ppid, dbversion, dbname) = data
            # if user requested a specific database name and version - don't try to connect to others
            if options.instance:
                if dbname != options.instance or not result_work_dir or not ppid:
                    continue
                if options.version is not None and dbversion != options.version:
                    continue
            try:
                conndata = detect_db_connection_arguments(result_work_dir, ppid, dbversion,
                                                          options.username, options.dbname)
                if conndata is None:
                    continue
                host = conndata['host']
                port = conndata['port']
                conn = build_connection(host, port, options.username, options.dbname)
                pgcon = psycopg2.connect(**conn)
            except Exception as e:
                logger.error('PostgreSQL exception {0}'.format(e))
                pgcon = None
            if pgcon:
                desc = make_cluster_desc(name=dbname, version=dbversion, workdir=result_work_dir,
                                         pid=ppid, pgcon=pgcon, conn=conn)
                clusters.append(desc)

    collectors = []
    groups = {}
    try:
        if len(clusters) == 0:
            logger.error('No suitable PostgreSQL instances detected, exiting...')
            logger.error('hint: use -v for details, '
                         'or specify connection parameters manually in the configuration file (-c)')
            sys.exit(1)

        # initialize the disks stat collector process and create an exchange queue
        q = JoinableQueue(1)
        work_directories = [cl['wd'] for cl in clusters if 'wd' in cl]
        dbversion = dbversion or clusters[0]['ver']

        collector = DetachedDiskStatCollector(q, work_directories, dbversion)
        collector.start()
        consumer = DiskCollectorConsumer(q)

        collectors.append(HostStatCollector())
        collectors.append(SystemStatCollector())
        collectors.append(MemoryStatCollector())
        for cl in clusters:
            part = PartitionStatCollector(cl['name'], cl['ver'], cl['wd'], consumer)
            pg = PgstatCollector(cl['pgcon'], cl['reconnect'], cl['pid'], cl['name'], cl['ver'], options.pid)
            groupname = cl['wd']
            groups[groupname] = {'pg': pg, 'partitions': part}
            collectors.append(part)
            collectors.append(pg)

        # we don't want to mix diagnostics messages with useful output, so we log the former into a file.
        disable_logging_to_stderr()
        loop(collectors, consumer, groups, output_method)
        enable_logging_to_stderr()
    except KeyboardInterrupt:
        pass
    except curses.error:
        print(traceback.format_exc())
        if 'SSH_CLIENT' in os.environ and 'SSH_TTY' not in os.environ:
            print('Unable to initialize curses. Make sure you supply -t option (force pseudo-tty allocation) to ssh')
    except:
        print(traceback.format_exc())
    finally:
        sys.exit(0)
def warn_non_optional_column(colname): logger.error( 'Column {0} is not optional, but input row has no value for it'. format(colname))
def get_postmasters_directories():
    """ detect all postmasters running and get their pids """
    pg_pids = []
    postmasters = {}
    pg_proc_stat = {}
    # get all 'number' directories from /proc/ and sort them
    for f in glob.glob('/proc/[0-9]*/stat'):
        # make sure the particular pid is accessible to us
        if not os.access(f, os.R_OK):
            continue
        try:
            with open(f, 'rU') as fp:
                stat_fields = fp.read().strip().split()
        except Exception:
            logger.error('failed to read {0}'.format(f))
            continue
        # read PostgreSQL processes. Avoid zombies
        if len(stat_fields) < STAT_FIELD.st_start_time + 1 or stat_fields[STAT_FIELD.st_process_name] not in \
                ('(postgres)', '(postmaster)') or stat_fields[STAT_FIELD.st_state] == 'Z':
            if stat_fields[STAT_FIELD.st_state] == 'Z':
                logger.warning('zombie process {0}'.format(f))
            if len(stat_fields) < STAT_FIELD.st_start_time + 1:
                logger.error('{0} output is too short'.format(f))
            continue
        # convert interesting fields to int
        for no in STAT_FIELD.st_pid, STAT_FIELD.st_ppid, STAT_FIELD.st_start_time:
            stat_fields[no] = int(stat_fields[no])
        pid = stat_fields[STAT_FIELD.st_pid]
        pg_proc_stat[pid] = stat_fields
        pg_pids.append(pid)
    # we have a pid -> stat fields map, and an array of all pids.
    # sort pids array by the start time of the process, so that we
    # minimize the number of looks into /proc/../cmdline later:
    # the idea is that processes starting earlier are likely to be
    # parent ones.
    pg_pids.sort(key=lambda pid: pg_proc_stat[pid][STAT_FIELD.st_start_time])
    for pid in pg_pids:
        st = pg_proc_stat[pid]
        ppid = st[STAT_FIELD.st_ppid]
        # if parent is also a postgres process - no way this is a postmaster
        if ppid in pg_pids:
            continue
        link_filename = '/proc/{0}/cwd'.format(pid)
        # now get its data directory in the /proc/[pid]/cmdline
        if not os.access(link_filename, os.R_OK):
            logger.warning('potential postmaster work directory file {0} is not accessible'.format(link_filename))
            continue
        # now read the actual directory, check this is accessible to us and belongs to PostgreSQL
        # additionally, we check that we haven't seen this directory before, in case the check
        # for a parent pid still produces a postmaster child. Be extra careful to catch all exceptions
        # at this phase, we don't want one bad postmaster to be the reason of tool's failure for the
        # other good ones.
        try:
            pg_dir = os.readlink(link_filename)
        except os.error as e:
            logger.error('unable to readlink {0}: OS reported {1}'.format(link_filename, e))
            continue
        if pg_dir in postmasters:
            continue
        if not os.access(pg_dir, os.R_OK):
            logger.warning('unable to access the PostgreSQL candidate directory {0}, have to skip it'.format(pg_dir))
            continue
        # if PG_VERSION file is missing, this is not a postgres directory
        PG_VERSION_FILENAME = '{0}/PG_VERSION'.format(link_filename)
        if not os.access(PG_VERSION_FILENAME, os.R_OK):
            logger.warning('PostgreSQL candidate directory {0} is missing PG_VERSION file, have to skip it'.format(pg_dir))
            continue
        try:
            fp = open(PG_VERSION_FILENAME, 'rU')
            val = fp.read().strip()
            if val is not None and len(val) >= 2:
                version = float(val)
        except os.error:
            logger.error('unable to read version number from PG_VERSION directory {0}, have to skip it'.format(pg_dir))
            continue
        except ValueError:
            logger.error('PG_VERSION doesn\'t contain a valid version number: {0}'.format(val))
            continue
        else:
            dbname = get_dbname_from_path(pg_dir)
            postmasters[pg_dir] = [pid, version, dbname]
    return postmasters
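# A minimal usage sketch (assumed, not from the original sources): get_postmasters_directories()
# maps every detected data directory to [pid, version, dbname], e.g.
#
#   {'/var/lib/postgresql/9.6/main': [4124, 9.6, 'main']}
#
# which is exactly what the autodetection branch of main() unpacks as
# (ppid, dbversion, dbname) for each result_work_dir.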