def exec_diagnose(servers, errors_list):
    """
    Diagnostic command: gathers information from backup server
    and from all the configured servers.

    Gathered information should be used for support and problems detection.
    The collected data is serialised as JSON (sorted keys, 4-space indent)
    and emitted through ``output.info``.

    :param dict(str,frabit.server.Server) servers: list of configured servers
    :param list errors_list: list of global errors
    """
    # global section. info about frabit server
    global_config = dict(frabit.__config__._global_config)
    global_config['errors_list'] = errors_list
    try:
        # basic system info
        system_info = filesystem.UnixLocalCommand().get_system_info()
    except CommandFailedException as e:
        system_info = {'error': repr(e)}
    system_info['frabit_ver'] = frabit.__version__
    system_info['timestamp'] = datetime.datetime.now()

    diagnosis = {
        'global': {
            'config': global_config,
            'system_info': system_info,
        },
        'servers': {},
    }

    # per server section
    for name in sorted(servers):
        server = servers[name]
        if server is None:
            output.error("Unknown server '{}'".format(name))
            continue
        try:
            # server configuration.
            # vars() returns the live attribute dictionary of the config
            # object: work on a shallow copy, so that dropping the 'config'
            # entry from the report does not delete the attribute from the
            # server configuration itself.
            server_config = dict(vars(server.config))
            server_config.pop('config', None)
            server_diagnosis = {'config': server_config}
            diagnosis['servers'][name] = server_diagnosis

            # server system info
            if server.config.ssh_command:
                try:
                    command = filesystem.UnixRemoteCommand(
                        ssh_command=server.config.ssh_command,
                        path=server.path)
                    server_diagnosis['system_info'] = (
                        command.get_system_info())
                except FsOperationFailed:
                    # Best effort: the remote system info is simply
                    # left out of the report when it cannot be collected
                    pass

            # frabit status information for the server
            server_diagnosis['status'] = server.get_remote_status()

            # backup list
            server_diagnosis['backups'] = server.get_available_backups(
                BackupInfo.STATUS_ALL)

            # wal status
            server_diagnosis['wals'] = {
                'last_archived_wal_per_timeline':
                    server.backup_manager.get_latest_archived_wals_info(),
            }
        finally:
            # Release any PostgreSQL resource, even when one of the
            # probes above raises
            server.close()

    output.info(json.dumps(diagnosis, cls=FrabitEncoder, indent=4,
                           sort_keys=True))
def sync_wals(args):
    """
    Command that synchronises WAL files from a master to a passive node

    :param args: command line arguments namespace, used to select the server
    """
    target_server = get_server(args)
    try:
        target_server.sync_wals()
    except SyncError as sync_failure:
        # Only the error message of a SyncError is reported,
        # so that no stack trace ends up in the log
        output.error(sync_failure)
    output.close_and_exit()
def delete(args):
    """
    Delete a backup

    :param args: command line arguments namespace, used to select
        the server and the backup
    """
    server = get_server(args)

    # parse_backup_id() hands back the object returned by
    # server.get_backup(), which is what delete_backup() receives
    backup_info = parse_backup_id(server, args)
    with closing(server):
        deleted = server.delete_backup(backup_info)
        if not deleted:
            failure = "Cannot delete backup (%s %s)" % (
                server.config.name, backup_info)
            output.error(failure)
    output.close_and_exit()
def kill(self, process_info, retries=10):
    """
    Kill a process

    Sends SIGINT to the process, then probes the PID with the null signal
    (signal number 0) up to ``retries`` times, waiting one second after
    every probe that finds the process still alive.

    :param ProcessInfo process_info: representation of the process
        we want to kill
    :param int retries: number of times the method will check
        if the process is still alive
    :return: True if killed successfully (or if the process did not
        exist anymore), False otherwise
    :rtype: bool
    """
    pid = process_info.pid

    # Try to kill the process
    try:
        _logger.debug("Sending SIGINT to PID {}".format(pid))
        os.kill(pid, signal.SIGINT)
        _logger.debug("os.kill call succeeded")
    except OSError as e:
        _logger.debug("os.kill call failed: {}".format(e))
        # The process doesn't exist. It has probably just terminated.
        if e.errno == errno.ESRCH:
            return True
        # Something unexpected has happened
        output.error("{}".format(e))
        return False

    # Check if the process has been killed. The fastest (and maybe safest)
    # way is to send a kill with 0 as signal: no signal is delivered, only
    # the existence/permission checks are performed.
    # If the call raises OSError with ESRCH, the process has been
    # killed successfully, otherwise it is still alive.
    # NOTE: the literal 0 is used instead of signal.SIG_DFL, which is a
    # handler disposition and not a signal number (it merely happens to
    # have the integer value 0).
    for _ in range(retries):
        try:
            _logger.debug(
                "Checking with signal 0 if PID {} is still alive".format(pid))
            os.kill(pid, 0)
            _logger.debug("os.kill call succeeded")
        except OSError as e:
            _logger.debug("os.kill call failed: {}".format(e))
            # If the process doesn't exist, we are done.
            if e.errno == errno.ESRCH:
                return True
            # Something unexpected has happened
            output.error("{}".format(e))
            return False
        time.sleep(1)

    _logger.debug(
        "The PID {pid} has not been terminated after {retries} retries".format(
            pid=pid, retries=retries))
    return False
def list_files(args):
    """
    List all the files for a single backup

    Every entry produced by ``get_list_of_files(args.target)`` is emitted
    through ``output.info`` with ``log=False``.

    :param args: command line arguments namespace, used to select
        the server, the backup and the listing target
    """
    server = get_server(args)

    # Retrieves the backup
    selected_backup = parse_backup_id(server, args)
    try:
        for entry in selected_backup.get_list_of_files(args.target):
            output.info(entry, log=False)
    except BinlogHasPurged as failure:
        error_template = (
            'invalid xlog segment name %r\n'
            'HINT: Please run "barman rebuild-xlogdb %s" '
            'to solve this issue')
        output.error(error_template, force_str(failure), server.config.name)
    output.close_and_exit()
def sync_info(args):
    """
    Output the internal synchronisation status.
    Used to sync_backup with a passive node

    When the ``primary`` attribute of args is set to a true value, the
    primary node information is printed as JSON; otherwise the
    ``sync_status`` method of the server is invoked.

    :param args: command line arguments namespace
    """
    server = get_server(args)
    try:
        if not getattr(args, 'primary', False):
            server.sync_status(args.last_wal, args.last_position)
        else:
            # called with the --primary option
            node_info = server.primary_node_info(args.last_wal,
                                                 args.last_position)
            rendered = json.dumps(node_info, cls=FrabitEncoder, indent=4)
            output.info(rendered, log=False)
    except SyncError as sync_failure:
        # Only the error message of a SyncError is reported,
        # so that no stack trace ends up in the log
        output.error(sync_failure)
    output.close_and_exit()
def parse_backup_id(server, args):
    """
    Parses backup IDs including special words such as latest, oldest, etc.

    'latest' and 'last' resolve to the last backup of the server, 'oldest'
    and 'first' to the first one; any other value is used verbatim.
    Exit with error if the backup id doesn't exist.

    :param Server server: server object to search for the required backup
    :param args: command line arguments namespace
    :return: the object returned by ``server.get_backup`` for the
        resolved backup ID
    """
    requested = args.backup_id
    if requested in ('latest', 'last'):
        resolved_id = server.get_last_backup_id()
    elif requested in ('oldest', 'first'):
        resolved_id = server.get_first_backup_id()
    else:
        resolved_id = requested

    found = server.get_backup(resolved_id)
    if found is not None:
        return found

    # Report the name as typed by the user, not the resolved one
    output.error("Unknown backup '%s' for server '%s'",
                 args.backup_id, server.config.name)
    output.close_and_exit()
    return found
def receive_wal(args):
    """
    Start a receive-wal process.
    The process uses the streaming protocol to receive WAL files
    from the PostgreSQL server.

    Depending on the flags found in args, the command instead stops the
    running process, or creates / drops the replication slot.

    :param args: command line arguments namespace
    """
    server = get_server(args)
    if args.stop and args.reset:
        output.error("--stop and --reset options are not compatible")
    elif args.stop:
        # The caller requested to shutdown the receive-wal process:
        # deliver the termination signal
        server.kill('receive-wal')
    else:
        # Every remaining action runs with the server closed afterwards
        with closing(server):
            if args.create_slot:
                server.create_physical_repslot()
            elif args.drop_slot:
                server.drop_repslot()
            else:
                server.receive_wal(reset=args.reset)
    output.close_and_exit()
def manage_server_command(server, name=None,
                          inactive_is_error=False,
                          disabled_is_error=True,
                          skip_inactive=True,
                          skip_disabled=True):
    """
    Standard and consistent method for managing server errors within
    a server command execution.

    With the default flags, inactive and disabled servers are skipped,
    and the messages of a disabled server are emitted as errors.

    :param server: server to be checked for errors (a false value
        means the server is unknown)
    :param str name: name of the server, in a multi-server command
    :param bool inactive_is_error: treat inactive server as error
    :param bool disabled_is_error: treat disabled server as error
    :param bool skip_inactive: skip if inactive
    :param bool skip_disabled: skip if disabled
    :return: True if the command has to be executed on this server
    :rtype: boolean
    """
    # Unknown server (skip it)
    if not server:
        output.error("Unknown server '%s'" % name)
        return False

    config = server.config

    if not config.active:
        # Report inactive server as error
        if inactive_is_error:
            output.error('Inactive server: %s' % config.name)
        if skip_inactive:
            return False

    if config.disabled:
        # Report every message of the disabled server as an error
        if disabled_is_error:
            for problem in config.msg_list:
                output.error(problem)
        if skip_disabled:
            return False

    # All ok, execute the command
    return True
def recover(args):
    """
    Recover a server at a given time, name, LSN or xid

    The command line arguments are validated (backup status, tablespace
    relocation rules, destination directory, recovery targets, network
    compression) and copied onto the server configuration before
    ``server.recover`` is invoked. Any validation failure is reported
    through ``output.error`` followed by ``output.close_and_exit``.

    :param args: command line arguments namespace
    """
    server = get_server(args)

    # Retrieves the backup
    backup_info = parse_backup_id(server, args)
    if backup_info.status not in BackupInfo.STATUS_COPY_DONE:
        output.error("Cannot recover from backup '{id}' of server '{name}': "
                     "backup status is not DONE".format(
                         id=args.backup_id, name=server.config.name))
        output.close_and_exit()

    # decode the tablespace relocation rules
    tablespaces = {}
    if args.tablespace:
        for rule in args.tablespace:
            try:
                # A rule without ':' splits into a single item, which
                # makes dict.update() raise ValueError
                tablespaces.update([rule.split(':', 1)])
            except ValueError:
                output.error(
                    "Invalid tablespace relocation rule '%s'\n"
                    "HINT: The valid syntax for a relocation rule is "
                    "NAME:LOCATION", rule)
                output.close_and_exit()

    # validate the rules against the tablespace list
    valid_tablespaces = []
    if backup_info.tablespaces:
        valid_tablespaces = [
            tablespace_data.name
            for tablespace_data in backup_info.tablespaces
        ]
    for item in tablespaces:
        if item not in valid_tablespaces:
            output.error(
                "Invalid tablespace name '%s'\n"
                "HINT: Please use any of the following "
                "tablespaces: %s", item, ', '.join(valid_tablespaces))
            output.close_and_exit()

    # explicitly disallow the rsync remote syntax (common mistake)
    if ':' in args.destination_directory:
        output.error(
            "The destination directory parameter "
            "cannot contain the ':' character\n"
            "HINT: If you want to do a remote recovery you have to use "
            "the --remote-ssh-command option")
        output.close_and_exit()

    # Command line values override the server configuration
    if args.retry_sleep is not None:
        server.config.basebackup_retry_sleep = args.retry_sleep
    if args.retry_times is not None:
        server.config.basebackup_retry_times = args.retry_times
    if hasattr(args, 'get_wal'):
        recovery_options = server.config.recovery_options
        if args.get_wal:
            recovery_options.add(RecoveryOptions.GET_WAL)
        elif RecoveryOptions.GET_WAL in recovery_options:
            # Only remove the option when it is actually set: removing
            # a missing element would raise KeyError
            recovery_options.remove(RecoveryOptions.GET_WAL)
    if args.jobs is not None:
        server.config.parallel_jobs = args.jobs
    if hasattr(args, 'bwlimit'):
        server.config.bandwidth_limit = args.bwlimit

    # At most one recovery target can be requested
    target_options = (
        'target_tli', 'target_time', 'target_xid',
        'target_lsn', 'target_name', 'target_immediate',
    )
    specified_target_options = sum(
        1 for option in target_options if getattr(args, option))
    if specified_target_options > 1:
        output.error(
            "You cannot specify multiple targets for the recovery operation")
        output.close_and_exit()

    if hasattr(args, 'network_compression'):
        if args.network_compression and args.remote_ssh_command is None:
            output.error("Network compression can only be used with "
                         "remote recovery.\n"
                         "HINT: If you want to do a remote recovery "
                         "you have to use the --remote-ssh-command option")
            output.close_and_exit()
        server.config.network_compression = args.network_compression

    with closing(server):
        try:
            server.recover(backup_info,
                           args.destination_directory,
                           tablespaces=tablespaces,
                           target_tli=args.target_tli,
                           target_time=args.target_time,
                           target_xid=args.target_xid,
                           target_lsn=args.target_lsn,
                           target_name=args.target_name,
                           target_immediate=args.target_immediate,
                           exclusive=args.exclusive,
                           remote_command=args.remote_ssh_command,
                           target_action=getattr(args, 'target_action', None),
                           standby_mode=getattr(args, 'standby_mode', None))
        except RecoveryException as exc:
            output.error(force_str(exc))

    output.close_and_exit()
def main():
    """
    The main method of Frabit

    Builds the command line parser, registers every sub-command and
    dispatches the invocation; the process always terminates through
    ``output.close_and_exit``.
    """
    credits = 'Frabit by Frabit (www.frabit.com)'
    parser = ArghParser(epilog=credits)

    # Global options
    version_text = '{version}\n\nFrabit by Frabit (www.frabit.com)'.format(
        version=frabit.__version__)
    parser.add_argument('-v', '--version', action='version',
                        version=version_text)
    default_config_files = ', '.join(frabit.config.Config.CONFIG_FILES)
    parser.add_argument(
        '-c', '--config',
        help='uses a configuration file (defaults: %s)' % default_config_files,
        default=SUPPRESS)
    parser.add_argument('--color', '--colour',
                        help='Whether to use colors in the output',
                        choices=['never', 'always', 'auto'],
                        default='auto')
    parser.add_argument('--log-level',
                        help='Override the default log level',
                        choices=list(get_log_levels()),
                        default=SUPPRESS)
    parser.add_argument('-q', '--quiet', help='be quiet',
                        action='store_true')
    parser.add_argument('-d', '--debug', help='debug output',
                        action='store_true')
    parser.add_argument('-f', '--format', help='output format',
                        choices=output.AVAILABLE_WRITERS.keys(),
                        default=output.DEFAULT_WRITER)

    # Sub-commands
    commands = [
        archive_wal,
        backup,
        check,
        check_backup,
        cron,
        delete,
        diagnose,
        get_wal,
        list_backup,
        list_files,
        list_server,
        put_wal,
        rebuild_xlogdb,
        receive_wal,
        recover,
        show_backup,
        show_server,
        replication_status,
        status,
        switch_wal,
        switch_xlog,
        sync_info,
        sync_backup,
        sync_wals,
    ]
    parser.add_commands(commands)

    # noinspection PyBroadException
    try:
        parser.dispatch(pre_call=global_config)
    except KeyboardInterrupt:
        output.error("Process interrupted by user (KeyboardInterrupt)")
    except Exception as failure:
        output.exception("%s\nSee log file for more details." % failure)

    # cleanup output API and exit honoring output.error_occurred and
    # output.error_exit_code
    output.close_and_exit()
def get_server_list(args=None, skip_inactive=False, skip_disabled=False,
                    skip_passive=False, on_error_stop=True,
                    suppress_error=False):
    """
    Get the server list from the configuration

    If the args parameter is None or args.server_name is ['all'],
    all the defined servers are returned. Unknown server names are
    mapped to None in the result.

    :param args: an argparse namespace containing a list
        server_name parameter
    :param bool skip_inactive: skip inactive servers when 'all' is required
    :param bool skip_disabled: skip disabled servers when 'all' is required
    :param bool skip_passive: skip passive servers when 'all' is required
    :param bool on_error_stop: stop if an error is found
    :param bool suppress_error: suppress display of errors (e.g. diagnose)
    :rtype: dict[str,Server]
    """
    # This function must be called within a multiple-server context
    assert not args or isinstance(args.server_name, list)

    # Names of all the configured servers, and the configuration
    # errors collected from all of them
    configured_names = frabit.__config__.server_names()
    config_errors = frabit.__config__.servers_msg_list

    # Global errors have higher priority
    if config_errors:
        if not suppress_error:
            for message in config_errors:
                output.error(message)
        # If requested, exit on first error
        if on_error_stop:
            output.close_and_exit()
            # Never reached: kept for clarity
            return {}

    # The whole server list is required when args is None
    # or when the 'all' special name is used
    all_requested = not args or 'all' in args.server_name
    if all_requested:
        # When 'all' is used, it must be the only specified argument
        if args and len(args.server_name) != 1:
            output.error("You cannot use 'all' with other server names")
        requested_names = configured_names
    else:
        # A set counts multiple occurrences only once
        requested_names = set(args.server_name)

    selected = {}
    for requested in requested_names:
        conf = frabit.__config__.get_server(requested)
        if conf is None:
            # Unknown server
            selected[requested] = None
            continue

        candidate = Server(conf)
        # Skip inactive servers, if requested
        if skip_inactive and not candidate.config.active:
            output.info("Skipping inactive server '%s'" % conf.name)
            continue
        # Skip disabled servers, if requested
        if skip_disabled and candidate.config.disabled:
            output.info("Skipping temporarily disabled server '%s'"
                        % conf.name)
            continue
        # Skip passive nodes, if requested
        if skip_passive and candidate.replica_node:
            output.info("Skipping passive server '%s'", conf.name)
            continue
        selected[requested] = candidate

    return selected
def get_server(args, skip_inactive=True, skip_disabled=False,
               skip_passive=False, inactive_is_error=False,
               on_error_stop=True, suppress_error=False):
    """
    Get a single server retrieving its configuration (wraps
    get_server_list())

    Returns a Server object or None if the required server is unknown and
    on_error_stop is False.

    WARNING: this function modifies the 'args' parameter: its server_name
    attribute is replaced by a one-element list holding the original name.

    :param args: an argparse namespace containing a single
        server_name parameter
        WARNING: the function modifies the content of this parameter
    :param bool skip_inactive: do nothing if the server is inactive
    :param bool skip_disabled: do nothing if the server is disabled
    :param bool skip_passive: do nothing if the server is passive
    :param bool inactive_is_error: treat inactive server as error
    :param bool on_error_stop: stop if an error is found
    :param bool suppress_error: suppress display of errors (e.g. diagnose)
    :rtype: Server|None
    """
    # This function must be called within a single-server context
    name = args.server_name
    assert isinstance(name, str)

    # The 'all' special name is forbidden in this context
    if name == 'all':
        output.error("You cannot use 'all' in a single server context")
        output.close_and_exit()
        # Never reached: kept for clarity
        return None

    # get_server_list() expects a list of names
    args.server_name = [name]

    # An inactive server has to be retrieved in order to emit the error,
    # so skip_inactive is reset when inactive_is_error is set
    skip_inactive = skip_inactive & (not inactive_is_error)

    candidates = get_server_list(args,
                                 skip_inactive=skip_inactive,
                                 skip_disabled=skip_disabled,
                                 skip_passive=skip_passive,
                                 on_error_stop=on_error_stop,
                                 suppress_error=suppress_error)

    # The requested server has been excluded from get_server_list result
    if not candidates:
        output.close_and_exit()
        # Never reached: kept for clarity
        return None

    server = candidates[name]

    # Apply the standard validation, which displays the standard error
    # messages for unknown, inactive or disabled servers
    runnable = manage_server_command(server, name, inactive_is_error)
    if on_error_stop and not runnable:
        output.close_and_exit()
        # Never reached: kept for clarity
        return None

    # Returns the filtered server
    return server