def get_server_list(args=None, skip_disabled=False):
    """
    Get the server list from the configuration

    If the args parameter is None or args.server_name[0] is 'all',
    return all defined servers.

    :param args: an argparse namespace containing a list server_name parameter
    :param bool skip_disabled: skip disabled servers when 'all' is required
    """
    server_dict = {}
    if args is None or args.server_name[0] == 'all':
        for conf in barman.__config__.servers():
            server = Server(conf)
            if skip_disabled and not conf.active:
                output.info("Skipping disabled server '%s'", conf.name)
                continue
            server_dict[conf.name] = server
    else:
        for server in args.server_name:
            conf = barman.__config__.get_server(server)
            if conf is None:
                server_dict[server] = None
            else:
                server_dict[server] = Server(conf)
    return server_dict
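# A minimal, self-contained sketch of the selection logic in get_server_list
# above, using plain dicts in place of barman's Config and Server objects.
# All names below (select_servers, the sample configuration) are hypothetical
# and for illustration only.
def select_servers(configured, requested=None, skip_disabled=False):
    """configured: dict name -> {'active': bool}; requested: name list."""
    if requested is None or requested[0] == 'all':
        return {name: conf for name, conf in configured.items()
                if not (skip_disabled and not conf['active'])}
    # Unknown names map to None so the caller can report them
    return {name: configured.get(name) for name in requested}

servers = {'main': {'active': True}, 'old': {'active': False}}
assert select_servers(servers, skip_disabled=True) == \
    {'main': {'active': True}}
assert select_servers(servers, ['old', 'missing']) == \
    {'old': {'active': False}, 'missing': None}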
def is_wal_relevant(self, wal_info, first_backup):
    """
    Check the relevance of a WAL file according to a provided BackupInfo
    (usually the oldest on the server) to ensure that the WAL is newer than
    the start_wal of the backup.

    :param WalFileInfo wal_info: the WAL file we are checking
    :param BackupInfo first_backup: the backup used for the checks
        (usually the oldest available on the server)
    """
    # Skip history files
    if xlog.is_history_file(wal_info.name):
        return True

    # If the WAL file has a timeline smaller than the one of
    # the oldest backup it cannot be used in any way.
    wal_timeline = xlog.decode_segment_name(wal_info.name)[0]
    if wal_timeline < first_backup.timeline:
        output.info("\tThe timeline of the WAL file %s (%s) is lower "
                    "than the one of the oldest backup of "
                    "server %s (%s). Moving the WAL in "
                    "the error directory",
                    wal_info.name, wal_timeline, self.config.name,
                    first_backup.timeline)
        return False
    # Manage xlog segments older than the first backup
    if wal_info.name < first_backup.begin_wal:
        output.info("\tOlder than first backup of server %s. "
                    "Moving the WAL file %s in the error directory",
                    self.config.name, wal_info.name)
        return False
    return True
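# Why the checks above can work on plain string comparison: WAL segment names
# are 24 hexadecimal digits -- 8 for the timeline, 8 for the log id, 8 for
# the segment -- so lexicographic order matches chronological order within a
# timeline, and element [0] of the decoded tuple is the timeline. A hedged
# re-implementation of the decoding, for illustration only (barman's real
# xlog.decode_segment_name is more thorough, e.g. it validates the name):
def decode_segment_name(name):
    return tuple(int(name[i:i + 8], 16) for i in (0, 8, 16))

assert decode_segment_name('000000010000000200000003') == (1, 2, 3)
assert '000000010000000200000003' < '000000010000000200000004'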
def get_wal(args):
    """
    Retrieve WAL_NAME file from SERVER_NAME archive.
    The content will be streamed on standard output unless
    the --output-directory option is specified.
    """
    server = get_server(args, inactive_is_error=True)

    if getattr(args, 'test', None):
        output.info("Ready to retrieve WAL files from the server %s",
                    server.config.name)
        return

    # Retrieve optional arguments. If an argument is not specified,
    # the namespace doesn't contain it due to SUPPRESS default.
    # In that case we pick 'None' using getattr third argument.
    compression = getattr(args, 'compression', None)
    output_directory = getattr(args, 'output_directory', None)
    peek = getattr(args, 'peek', None)

    with closing(server):
        server.get_wal(args.wal_name,
                       compression=compression,
                       output_directory=output_directory,
                       peek=peek)
    output.close_and_exit()
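# The getattr dance above relies on argparse.SUPPRESS: when an option with
# default=argparse.SUPPRESS is not given on the command line, the attribute
# is simply absent from the namespace, so getattr's third argument supplies
# None. A self-contained demonstration:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--peek', default=argparse.SUPPRESS)
args = parser.parse_args([])
assert not hasattr(args, 'peek')
assert getattr(args, 'peek', None) is None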
def _truncate_partial_file_if_needed(self, xlog_segment_size):
    """
    Truncate a .partial WAL file whose size is neither 0 nor
    xlog_segment_size

    :param int xlog_segment_size: expected size of a WAL segment, in bytes
    """
    # Retrieve the partial list (only one is expected)
    partial_files = glob(os.path.join(
        self.config.streaming_wals_directory, "*.partial"))

    # Take the last partial file, ignoring wrongly formatted file names
    last_partial = None
    for partial in partial_files:
        if not is_partial_file(partial):
            continue
        if not last_partial or partial > last_partial:
            last_partial = partial

    # Skip further work if there is no good partial file
    if not last_partial:
        return

    # If size is either 0 or xlog_segment_size everything is fine...
    partial_size = os.path.getsize(last_partial)
    if partial_size == 0 or partial_size == xlog_segment_size:
        return

    # ... otherwise truncate the file to be empty. This is safe because
    # pg_receivewal pads the file to the full size before it starts writing.
    output.info("Truncating partial file %s that has wrong size %s "
                "while %s was expected." %
                (last_partial, partial_size, xlog_segment_size))
    open(last_partial, "wb").close()
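# A runnable demonstration of the truncation rule above, in a throwaway
# directory (16 MiB is PostgreSQL's default WAL segment size; the file name
# is illustrative):
import os
import tempfile

xlog_segment_size = 16 * 1024 * 1024
with tempfile.TemporaryDirectory() as wal_dir:
    partial = os.path.join(wal_dir, '000000010000000200000003.partial')
    with open(partial, 'wb') as f:
        f.write(b'x' * 1234)  # "wrong" size: neither 0 nor a full segment
    if os.path.getsize(partial) not in (0, xlog_segment_size):
        # safe: pg_receivewal pads the file to full size before writing
        open(partial, 'wb').close()
    assert os.path.getsize(partial) == 0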
def exec_diagnose(servers, errors_list):
    """
    Diagnostic command: gathers information from the backup server
    and from all the configured servers.

    Gathered information should be used for support and problem detection

    :param dict(str,barman.server.Server) servers: list of configured servers
    :param list errors_list: list of global errors
    """
    # global section. info about barman server
    diagnosis = {"global": {}, "servers": {}}
    # barman global config
    diagnosis["global"]["config"] = dict(barman.__config__._global_config)
    diagnosis["global"]["config"]["errors_list"] = errors_list
    try:
        command = fs.UnixLocalCommand()
        # basic system info
        diagnosis["global"]["system_info"] = command.get_system_info()
    except CommandFailedException as e:
        diagnosis["global"]["system_info"] = {"error": repr(e)}
    diagnosis["global"]["system_info"]["barman_ver"] = barman.__version__
    diagnosis["global"]["system_info"]["timestamp"] = datetime.datetime.now()
    # per server section
    for name in sorted(servers):
        server = servers[name]
        if server is None:
            output.error("Unknown server '%s'" % name)
            continue
        # server configuration
        diagnosis["servers"][name] = {}
        diagnosis["servers"][name]["config"] = vars(server.config)
        del diagnosis["servers"][name]["config"]["config"]
        # server system info
        if server.config.ssh_command:
            try:
                command = fs.UnixRemoteCommand(
                    ssh_command=server.config.ssh_command,
                    path=server.path)
                diagnosis["servers"][name][
                    "system_info"] = command.get_system_info()
            except FsOperationFailed:
                pass
        # barman status information for the server
        diagnosis["servers"][name]["status"] = server.get_remote_status()
        # backup list
        backups = server.get_available_backups(BackupInfo.STATUS_ALL)
        diagnosis["servers"][name]["backups"] = backups
        # wal status
        diagnosis["servers"][name]["wals"] = {
            "last_archived_wal_per_timeline":
                server.backup_manager.get_latest_archived_wals_info(),
        }
        # Release any PostgreSQL resource
        server.close()
    output.info(json.dumps(diagnosis, cls=BarmanEncoder, indent=4,
                           sort_keys=True),
                log=False)
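# The diagnose document contains values json cannot serialise natively (for
# example the datetime stored under system_info/timestamp), which is why a
# custom encoder is passed as cls=BarmanEncoder. A minimal stand-in with the
# same shape (MiniEncoder is hypothetical; barman's BarmanEncoder covers more
# types):
import datetime
import json

class MiniEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()
        return super().default(obj)

print(json.dumps({'timestamp': datetime.datetime(2024, 1, 1)},
                 cls=MiniEncoder, indent=4, sort_keys=True))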
def receive_wal(self, reset=False):
    """
    Creates a PgReceiveXlog object and issues the pg_receivexlog command
    for a specific server

    :param bool reset: when set, reset the status of receive-wal
    :raise ArchiverFailure: when something goes wrong
    """
    # Ensure the presence of the destination directory
    mkpath(self.config.streaming_wals_directory)

    # Check if this is a reset request
    if reset:
        self._reset_streaming_status()
        return

    # Execute basic sanity checks on PostgreSQL connection
    postgres_status = self.server.streaming.get_remote_status()
    if postgres_status["streaming_supported"] is None:
        raise ArchiverFailure(
            'failed opening the PostgreSQL streaming connection')
    elif not postgres_status["streaming_supported"]:
        raise ArchiverFailure(
            'PostgreSQL version too old (%s < 9.2)' %
            self.server.streaming.server_txt_version)
    # Execute basic sanity checks on pg_receivexlog
    remote_status = self.get_remote_status()
    if not remote_status["pg_receivexlog_installed"]:
        raise ArchiverFailure(
            'pg_receivexlog not present in $PATH')
    if not remote_status['pg_receivexlog_compatible']:
        raise ArchiverFailure(
            'pg_receivexlog version not compatible with '
            'PostgreSQL server version')

    # Make sure we are not wasting precious PostgreSQL resources
    self.server.postgres.close()
    self.server.streaming.close()

    _logger.info('Activating WAL archiving through streaming protocol')
    try:
        output_handler = PgReceiveXlog.make_output_handler(
            self.config.name + ': ')
        receive = PgReceiveXlog(remote_status['pg_receivexlog_path'],
                                self.config.streaming_conninfo,
                                self.config.streaming_wals_directory,
                                out_handler=output_handler,
                                err_handler=output_handler)
        receive.execute()
    except CommandFailedException as e:
        _logger.error(e)
        raise ArchiverFailure("pg_receivexlog exited with an error. "
                              "Check the logs for more information.")
    except KeyboardInterrupt:
        # This is a normal termination, so there is nothing to do besides
        # informing the user.
        output.info('SIGINT received. Terminate gracefully.')
def backup(self):
    """
    Performs a backup for the server
    """
    _logger.debug("initialising backup information")
    self.executor.init()
    backup_info = None
    try:
        # Create the BackupInfo object representing the backup
        backup_info = BackupInfo(
            self.server,
            backup_id=datetime.datetime.now().strftime('%Y%m%dT%H%M%S'))
        backup_info.save()
        self.backup_cache_add(backup_info)
        output.info(
            "Starting backup for server %s in %s",
            self.config.name,
            backup_info.get_basebackup_directory())

        # Run the pre-backup-script if present.
        script = HookScriptRunner(self, 'backup_script', 'pre')
        script.env_from_backup_info(backup_info)
        script.run()

        # Run the pre-backup-retry-script if present.
        retry_script = RetryHookScriptRunner(
            self, 'backup_retry_script', 'pre')
        retry_script.env_from_backup_info(backup_info)
        retry_script.run()

        # Do the backup using the BackupExecutor
        self.executor.backup(backup_info)

        # Compute backup size and fsync it on disk
        self.backup_fsync_and_set_sizes(backup_info)

        # Mark the backup as DONE
        backup_info.set_attribute("status", "DONE")
    # Use BaseException instead of Exception to catch events like
    # KeyboardInterrupt (e.g.: CTRL-C)
    except BaseException as e:
        msg_lines = str(e).strip().splitlines()
        # If the exception has no attached message use the raw type name
        if len(msg_lines) == 0:
            msg_lines = [type(e).__name__]
        if backup_info:
            # Use only the first line of the exception message
            # in the backup_info error field
            backup_info.set_attribute("status", "FAILED")
            backup_info.set_attribute(
                "error",
                "failure %s (%s)" % (
                    self.executor.current_action, msg_lines[0]))
        output.error("Backup failed %s.\nDETAILS: %s\n%s",
                     self.executor.current_action, msg_lines[0],
                     '\n'.join(msg_lines[1:]))
def exec_diagnose(servers, errors_list):
    """
    Diagnostic command: gathers information from the backup server
    and from all the configured servers.

    Gathered information should be used for support and problem detection

    :param dict(str,barman.server.Server) servers: list of configured servers
    :param list errors_list: list of global errors
    """
    # global section. info about barman server
    diagnosis = {'global': {}, 'servers': {}}
    # barman global config
    diagnosis['global']['config'] = dict(barman.__config__._global_config)
    diagnosis['global']['config']['errors_list'] = errors_list
    command = fs.UnixLocalCommand()
    # basic system info
    diagnosis['global']['system_info'] = command.get_system_info()
    diagnosis['global']['system_info']['barman_ver'] = barman.__version__
    # per server section
    for name in sorted(servers):
        server = servers[name]
        if server is None:
            output.error("Unknown server '%s'" % name)
            continue
        # server configuration
        diagnosis['servers'][name] = {}
        diagnosis['servers'][name]['config'] = vars(server.config)
        del diagnosis['servers'][name]['config']['config']
        # server system info
        if server.config.ssh_command:
            try:
                command = fs.UnixRemoteCommand(
                    ssh_command=server.config.ssh_command,
                    path=server.path)
                diagnosis['servers'][name]['system_info'] = (
                    command.get_system_info())
            except FsOperationFailed:
                pass
        # barman status information for the server
        diagnosis['servers'][name]['status'] = server.get_remote_status()
        # backup list
        backups = server.get_available_backups(BackupInfo.STATUS_ALL)
        diagnosis['servers'][name]['backups'] = backups
        # wal status
        diagnosis['servers'][name]['wals'] = {
            'last_archived_wal_per_timeline':
                server.backup_manager.get_latest_archived_wals_info(),
        }
        # Release any PostgreSQL resource
        server.close()
    output.info(json.dumps(diagnosis, cls=BarmanEncoder, indent=4,
                           sort_keys=True))
def _reset_streaming_status(self):
    """
    Reset the status of receive-wal removing any .partial files
    """
    output.info("Resetting receive-wal directory status")
    partial_files = glob(os.path.join(
        self.config.streaming_wals_directory, '*.partial'))
    for partial in partial_files:
        output.info("Removing status file %s" % partial)
        os.unlink(partial)
def list_files(args):
    """
    List all the files for a single backup
    """
    server = get_server(args)
    # Retrieves the backup
    backup_id = parse_backup_id(server, args)
    for line in backup_id.get_list_of_files(args.target):
        output.info(line, log=False)
    output.close_and_exit()
def prepare_tablespaces(self, backup_info, cmd, dest, tablespaces):
    """
    Prepare the directory structure for required tablespaces,
    taking care of tablespaces relocation, if requested.

    :param barman.infofile.BackupInfo backup_info: backup representation
    :param barman.fs.UnixLocalCommand cmd: Object for
        filesystem interaction
    :param str dest: destination dir for the recovery
    :param dict tablespaces: dict of all the tablespaces and their location
    """
    tblspc_dir = os.path.join(dest, 'pg_tblspc')
    try:
        # check for pg_tblspc dir in the recovery destination folder.
        # if it does not exist, create it
        cmd.create_dir_if_not_exists(tblspc_dir)
    except FsOperationFailed as e:
        output.exception("unable to initialise tablespace directory "
                         "'%s': %s", tblspc_dir, e)
        output.close_and_exit()
    for item in backup_info.tablespaces:
        # build the filename of the link under pg_tblspc directory
        pg_tblspc_file = os.path.join(tblspc_dir, str(item.oid))

        # by default a tablespace goes in the same location where
        # it was on the source server when the backup was taken
        location = item.location

        # if a relocation has been requested for this tablespace,
        # use the target directory provided by the user
        if tablespaces and item.name in tablespaces:
            location = tablespaces[item.name]

        try:
            # remove the current link in pg_tblspc, if it exists
            # (raise an exception if it is a directory)
            cmd.delete_if_exists(pg_tblspc_file)
            # create tablespace location, if it does not exist
            # (raise an exception if it is not possible)
            cmd.create_dir_if_not_exists(location)
            # check for write permissions on destination directory
            cmd.check_write_permission(location)
            # create symlink between tablespace and recovery folder
            cmd.create_symbolic_link(location, pg_tblspc_file)
        except FsOperationFailed as e:
            output.exception("unable to prepare '%s' tablespace "
                             "(destination '%s'): %s",
                             item.name, location, e)
            output.close_and_exit()
        output.info("\t%s, %s, %s", item.oid, item.name, location)
def backup_fsync_and_set_sizes(self, backup_info):
    """
    Fsync all files in a backup and set the actual size on disk
    of a backup.

    Also evaluate the deduplication ratio and the deduplicated size if
    applicable.

    :param barman.infofile.BackupInfo backup_info: the backup to update
    """
    # Calculate the base backup size
    self.executor.current_action = "calculating backup size"
    _logger.debug(self.executor.current_action)
    backup_size = 0
    deduplicated_size = 0
    backup_dest = backup_info.get_basebackup_directory()
    for dir_path, _, file_names in os.walk(backup_dest):
        # execute fsync() on the containing directory
        fsync_dir(dir_path)
        # execute fsync() on all the contained files
        for filename in file_names:
            file_path = os.path.join(dir_path, filename)
            file_fd = os.open(file_path, os.O_RDONLY)
            file_stat = os.fstat(file_fd)
            backup_size += file_stat.st_size
            # Exclude hard-linked files (reused from a previous backup)
            # from the deduplicated size
            if file_stat.st_nlink == 1:
                deduplicated_size += file_stat.st_size
            os.fsync(file_fd)
            os.close(file_fd)
    # Save size into BackupInfo object
    backup_info.set_attribute('size', backup_size)
    backup_info.set_attribute('deduplicated_size', deduplicated_size)
    if backup_info.size > 0:
        deduplication_ratio = 1 - (
            float(backup_info.deduplicated_size) / backup_info.size)
    else:
        deduplication_ratio = 0

    if self.config.reuse_backup == 'link':
        output.info(
            "Backup size: %s. Actual size on disk: %s"
            " (-%s deduplication ratio)." % (
                pretty_size(backup_info.size),
                pretty_size(backup_info.deduplicated_size),
                '{percent:.2%}'.format(percent=deduplication_ratio)))
    else:
        output.info("Backup size: %s" % pretty_size(backup_info.size))
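# Worked example of the deduplication arithmetic above: for a 10 GiB backup
# in which only 2 GiB of files have st_nlink == 1 (i.e. were not reused from
# the previous backup via hard links), the reported ratio is 80%.
size = 10 * 1024 ** 3
deduplicated_size = 2 * 1024 ** 3
deduplication_ratio = 1 - float(deduplicated_size) / size
assert '{percent:.2%}'.format(percent=deduplication_ratio) == '80.00%'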
def delete_backup(self, backup):
    """
    Delete a backup

    :param backup: the backup to delete
    """
    available_backups = self.get_available_backups()
    minimum_redundancy = self.server.config.minimum_redundancy
    # Honour minimum required redundancy
    if backup.status == BackupInfo.DONE and \
            minimum_redundancy >= len(available_backups):
        output.warning("Skipping delete of backup %s for server %s "
                       "due to minimum redundancy requirements "
                       "(minimum redundancy = %s, "
                       "current redundancy = %s)",
                       backup.backup_id,
                       self.config.name,
                       len(available_backups),
                       minimum_redundancy)
        return
    output.info("Deleting backup %s for server %s",
                backup.backup_id, self.config.name)
    previous_backup = self.get_previous_backup(backup.backup_id)
    next_backup = self.get_next_backup(backup.backup_id)
    # Delete all the data contained in the backup
    try:
        self.delete_backup_data(backup)
    except OSError as e:
        output.error("Failure deleting backup %s for server %s.\n%s",
                     backup.backup_id, self.config.name, e)
        return
    # Check if we are deleting the first available backup
    if not previous_backup:
        # In the case of exclusive backup (default), removes any WAL
        # files associated to the backup being deleted.
        # In the case of concurrent backup, removes only WAL files
        # prior to the start of the backup being deleted, as they
        # might be useful to any concurrent backup started immediately
        # after.
        remove_until = None  # means to remove all WAL files
        if next_backup:
            remove_until = next_backup
        elif BackupOptions.CONCURRENT_BACKUP in self.config.backup_options:
            remove_until = backup
        output.info("Delete associated WAL segments:")
        for name in self.remove_wal_before_backup(remove_until):
            output.info("\t%s", name)
    # As last action, remove the backup directory,
    # ending the delete operation
    try:
        self.delete_basebackup(backup)
    except OSError as e:
        output.error("Failure deleting backup %s for server %s.\n%s\n"
                     "Please manually remove the '%s' directory",
                     backup.backup_id, self.config.name, e,
                     backup.get_basebackup_directory())
        return
    self.backup_cache_remove(backup)
    output.info("Done")
def exec_diagnose(servers, errors_list):
    """
    Diagnostic command: gathers information from the backup server
    and from all the configured servers.

    Gathered information should be used for support and problem detection

    :param dict(str,barman.server.Server) servers: list of configured servers
    :param list errors_list: list of global errors
    """
    # global section. info about barman server
    diagnosis = {}
    diagnosis['global'] = {}
    diagnosis['servers'] = {}
    # barman global config
    diagnosis['global']['config'] = dict(barman.__config__._global_config)
    diagnosis['global']['config']['errors_list'] = errors_list
    command = fs.UnixLocalCommand()
    # basic system info
    diagnosis['global']['system_info'] = command.get_system_info()
    diagnosis['global']['system_info']['barman_ver'] = barman.__version__
    # per server section
    for name in sorted(servers):
        server = servers[name]
        if server is None:
            output.error("Unknown server '%s'" % name)
            continue
        # server configuration
        diagnosis['servers'][name] = {}
        diagnosis['servers'][name]['config'] = vars(server.config)
        del diagnosis['servers'][name]['config']['config']
        # server system info
        if server.config.ssh_command:
            try:
                command = fs.UnixRemoteCommand(
                    ssh_command=server.config.ssh_command,
                    path=server.path)
                diagnosis['servers'][name]['system_info'] = (
                    command.get_system_info())
            except FsOperationFailed:
                pass
        # barman status information for the server
        diagnosis['servers'][name]['status'] = server.get_remote_status()
        # backup list
        backups = server.get_available_backups(BackupInfo.STATUS_ALL)
        diagnosis['servers'][name]['backups'] = backups
        # Release any PostgreSQL resource
        server.close()
    output.info(json.dumps(diagnosis, cls=BarmanEncoder, indent=4,
                           sort_keys=True))
def cron_retention_policy(self):
    """
    Retention policy management
    """
    if (self.server.enforce_retention_policies and
            self.config.retention_policy_mode == 'auto'):
        available_backups = self.get_available_backups(
            BackupInfo.STATUS_ALL)
        retention_status = self.config.retention_policy.report()
        for bid in sorted(retention_status.keys()):
            if retention_status[bid] == BackupInfo.OBSOLETE:
                output.info(
                    "Enforcing retention policy: removing backup %s for "
                    "server %s" % (bid, self.config.name))
                self.delete_backup(available_backups[bid])
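# Hedged sketch of what retention_policy.report() produces for a
# "REDUNDANCY 2" policy: a backup_id -> status map. Backup ids sort
# chronologically (they are YYYYMMDDTHHMMSS timestamps), so the oldest ids
# come first in the sorted() loop above. The constants and helper below are
# stand-ins for illustration, not barman's actual implementation.
OBSOLETE, VALID = 'OBSOLETE', 'VALID'

def redundancy_report(backup_ids, redundancy=2):
    ordered = sorted(backup_ids)
    keep = set(ordered[-redundancy:])
    return {bid: (VALID if bid in keep else OBSOLETE) for bid in ordered}

report = redundancy_report(['20240101T000000', '20240201T000000',
                            '20240301T000000'])
assert report['20240101T000000'] == OBSOLETE
assert report['20240301T000000'] == VALID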
def cron():
    """
    Run maintenance tasks
    """
    try:
        with lockfile.GlobalCronLock(barman.__config__.barman_lock_directory):
            servers = [Server(conf) for conf in barman.__config__.servers()
                       if conf.active]
            for server in servers:
                server.cron()
    except lockfile.LockFileBusy:
        output.info("Another cron is running")
    except lockfile.LockFilePermissionDenied as e:
        output.error("Permission denied, unable to access '%s'", e)
def cron_retention_policy(self):
    """
    Retention policy management
    """
    if (self.server.enforce_retention_policies and
            self.config.retention_policy_mode == 'auto'):
        available_backups = self.get_available_backups(
            BackupInfo.STATUS_ALL)
        retention_status = self.config.retention_policy.report()
        for bid in sorted(retention_status.keys()):
            if retention_status[bid] == BackupInfo.OBSOLETE:
                output.info(
                    "Enforcing retention policy: removing backup %s for "
                    "server %s" % (bid, self.config.name))
                self.delete_backup(available_backups[bid])
def _retrieve_safe_horizon(self, recovery_info, backup_info, dest):
    """
    Retrieve the safe_horizon for smart copy

    If the target directory contains a previous recovery, it is safe to
    pick the least of the two backup "begin times" (the one we are
    recovering now and the one previously recovered in the target
    directory). Set the value in the given recovery_info dictionary.

    :param dict recovery_info: Dictionary containing all the recovery
        parameters
    :param barman.infofile.BackupInfo backup_info: a backup representation
    :param str dest: recovery destination directory
    """
    # noinspection PyBroadException
    try:
        backup_begin_time = backup_info.begin_time
        # Retrieve previously recovered backup metadata (if available)
        dest_info_txt = recovery_info['cmd'].get_file_content(
            os.path.join(dest, '.barman-recover.info'))
        dest_info = BackupInfo(self.server,
                               info_file=StringIO(dest_info_txt))
        dest_begin_time = dest_info.begin_time
        # Pick the earlier begin time. Both are tz-aware timestamps
        # because the BackupInfo class ensures it.
        safe_horizon = min(backup_begin_time, dest_begin_time)
        output.info("Using safe horizon time for smart rsync copy: %s",
                    safe_horizon)
    except FsOperationFailed as e:
        # Setting safe_horizon to None will effectively disable
        # the time-based part of the smart_copy method. However it is
        # still faster than running all the transfers with checksum
        # enabled.
        #
        # FsOperationFailed means the .barman-recover.info is not
        # available on the destination directory
        safe_horizon = None
        _logger.warning('Unable to retrieve safe horizon time '
                        'for smart rsync copy: %s', e)
    except Exception as e:
        # Same as above, but something failed decoding
        # .barman-recover.info or comparing times, so log the full
        # traceback
        safe_horizon = None
        _logger.exception('Error retrieving safe horizon time '
                          'for smart rsync copy: %s', e)
    recovery_info['safe_horizon'] = safe_horizon
def retrieve_safe_horizon(self, recovery_info, backup_info, dest):
    """
    Retrieve the safe_horizon for smart copy

    If the target directory contains a previous recovery, it is safe to
    pick the least of the two backup "begin times" (the one we are
    recovering now and the one previously recovered in the target
    directory). Set the value in the given recovery_info dictionary.

    :param dict recovery_info: Dictionary containing all the recovery
        parameters
    :param barman.infofile.BackupInfo backup_info: a backup representation
    :param str dest: recovery destination directory
    """
    # noinspection PyBroadException
    try:
        backup_begin_time = backup_info.begin_time
        # Retrieve previously recovered backup metadata (if available)
        dest_info_txt = recovery_info['cmd'].get_file_content(
            os.path.join(dest, '.barman-recover.info'))
        dest_info = BackupInfo(self.server,
                               info_file=StringIO(dest_info_txt))
        dest_begin_time = dest_info.begin_time
        # Pick the earlier begin time. Both are tz-aware timestamps
        # because the BackupInfo class ensures it.
        safe_horizon = min(backup_begin_time, dest_begin_time)
        output.info("Using safe horizon time for smart rsync copy: %s",
                    safe_horizon)
    except FsOperationFailed as e:
        # Setting safe_horizon to None will effectively disable
        # the time-based part of the smart_copy method. However it is
        # still faster than running all the transfers with checksum
        # enabled.
        #
        # FsOperationFailed means the .barman-recover.info is not
        # available on the destination directory
        safe_horizon = None
        _logger.warning('Unable to retrieve safe horizon time '
                        'for smart rsync copy: %s', e)
    except Exception as e:
        # Same as above, but something failed decoding
        # .barman-recover.info or comparing times, so log the full
        # traceback
        safe_horizon = None
        _logger.exception('Error retrieving safe horizon time '
                          'for smart rsync copy: %s', e)
    recovery_info['safe_horizon'] = safe_horizon
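# The safe horizon is just the earlier of two timestamps, but the comparison
# only works because both are tz-aware; comparing a naive and an aware
# datetime raises TypeError, which is why the code above leans on BackupInfo
# guaranteeing tz-awareness. A minimal illustration:
import datetime

current = datetime.datetime(2024, 3, 1, tzinfo=datetime.timezone.utc)
previous = datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)
assert min(current, previous) == previous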
def recover(self, backup_info, dest, tablespaces, target_tli, target_time,
            target_xid, target_name, exclusive, remote_command):
    """
    Performs a recovery of a backup

    :param barman.infofile.BackupInfo backup_info: the backup to recover
    :param str dest: the destination directory
    :param dict[str,str]|None tablespaces: a tablespace
        name -> location map (for relocation)
    :param str|None target_tli: the target timeline
    :param str|None target_time: the target time
    :param str|None target_xid: the target xid
    :param str|None target_name: the target name created previously with
        pg_create_restore_point() function call
    :param bool exclusive: whether the recovery is exclusive or not
    :param str|None remote_command: The remote command to recover
        the base backup, in case of remote backup.
    """
    # Run the cron to be sure the wal catalog is up to date
    # Prepare a map that contains all the objects required for a recovery
    recovery_info = self.setup(backup_info, remote_command, dest)
    output.info("Starting %s restore for server %s using backup %s",
                recovery_info['recovery_dest'], self.server.config.name,
                backup_info.backup_id)
    output.info("Destination directory: %s", dest)

    # Set targets for PITR
    self.set_pitr_targets(recovery_info, backup_info, dest, target_name,
                          target_time, target_tli, target_xid)
    # Retrieve the safe_horizon for smart copy
    self.retrieve_safe_horizon(recovery_info, backup_info, dest)

    # check destination directory. If it doesn't exist, create it
    try:
        recovery_info['cmd'].create_dir_if_not_exists(dest)
    except FsOperationFailed as e:
        output.exception("unable to initialise destination directory "
                         "'%s': %s", dest, e)
        output.close_and_exit()
def list_files(args):
    """
    List all the files for a single backup
    """
    server = get_server(args)
    if server is None:
        output.error("Unknown server '%s'", args.server_name)
        output.close_and_exit()
    # Retrieves the backup
    backup = parse_backup_id(server, args)
    if backup is None:
        output.error("Unknown backup '%s' for server '%s'",
                     args.backup_id, args.server_name)
        output.close_and_exit()
    for line in backup.get_list_of_files(args.target):
        output.info(line, log=False)
    output.close_and_exit()
def backup(self, backup_info):
    """
    Perform a backup for the server - invoked by BackupManager.backup()
    through the generic interface of a BackupExecutor. This implementation
    is responsible for performing a backup through a remote connection
    to the PostgreSQL server via SSH. The specific set of instructions
    depends on both the specific class that derives from SshBackupExecutor
    and the selected strategy (e.g. exclusive backup through Rsync).

    :param barman.infofile.BackupInfo backup_info: backup information
    """
    # Start the backup, all the subsequent code must be wrapped in a
    # try except block which finally issues a backup_stop command
    try:
        self.strategy.start_backup(backup_info)
    except BaseException:
        self._update_action_from_strategy()
        raise

    try:
        # save any metadata changed by start_backup() call
        # This must be inside the try-except, because it could fail
        backup_info.save()

        # If this is the first backup, purge unused WAL files
        previous_backup = self.backup_manager.get_previous_backup(
            backup_info.backup_id)
        if not previous_backup:
            self.backup_manager.remove_wal_before_backup(backup_info)

        output.info("Backup start at xlog location: %s (%s, %08X)",
                    backup_info.begin_xlog,
                    backup_info.begin_wal,
                    backup_info.begin_offset)

        # Start the copy
        self.current_action = "copying files"
        output.info("Copying files.")
        # perform the backup copy, honouring the retry option if set
        self.backup_manager.retry_backup_copy(self.backup_copy,
                                              backup_info)
        output.info("Copy done.")
    except BaseException:
        # we do not need to do anything here besides re-raising the
        # exception. It will be handled in the external try block.
        raise
    else:
        self.current_action = "issuing stop of the backup"
        output.info("Asking PostgreSQL server to finalize the backup.")
    finally:
        try:
            self.strategy.stop_backup(backup_info)
        except BaseException:
            self._update_action_from_strategy()
            raise
def cron():
    """
    Run maintenance tasks
    """
    lockname = os.path.join(barman.__config__.barman_home, '.cron.lock')
    try:
        with lockfile.LockFile(lockname, raise_if_fail=True):
            servers = [Server(conf) for conf in barman.__config__.servers()]
            for server in servers:
                server.cron()
    except lockfile.LockFileBusy:
        output.info("Another cron is running")
    except lockfile.LockFilePermissionDenied:
        output.error("Permission denied, unable to access '%s'", lockname)
    output.close_and_exit()
def list_files(args):
    """
    List all the files for a single backup
    """
    server = get_server(args)
    # Retrieves the backup
    backup_info = parse_backup_id(server, args)
    try:
        for line in backup_info.get_list_of_files(args.target):
            output.info(line, log=False)
    except BadXlogSegmentName as e:
        output.error(
            "invalid xlog segment name %r\n"
            "HINT: Please run \"barman rebuild-xlogdb %s\" "
            "to solve this issue",
            str(e), server.config.name)
    output.close_and_exit()
def list_files(args):
    """
    List all the files for a single backup
    """
    server = get_server(args)
    # Retrieves the backup
    backup_id = parse_backup_id(server, args)
    try:
        for line in backup_id.get_list_of_files(args.target):
            output.info(line, log=False)
    except BadXlogSegmentName as e:
        output.error(
            "invalid xlog segment name %r\n"
            "HINT: Please run \"barman rebuild-xlogdb %s\" "
            "to solve this issue",
            str(e), server.config.name)
    output.close_and_exit()
def test_info_error(self, caplog):
    # preparation
    writer = self._mock_writer()

    msg = 'test message'
    output.info(msg, is_error=True)

    # logging test
    for record in caplog.records:
        assert record.levelname == 'INFO'
        assert record.name == __name__
    assert msg in caplog.text

    # writer test
    writer.error_occurred.assert_called_once_with()
    writer.info.assert_called_once_with(msg)

    # global status test
    assert output.error_occurred
def test_info_error(self, caplog):
    # preparation
    writer = self._mock_writer()

    msg = 'test message'
    output.info(msg, is_error=True)

    # logging test
    for record in caplog.records:
        assert record.levelname == 'INFO'
        assert record.name == __name__
    assert msg in caplog.text

    # writer test
    writer.error_occurred.assert_called_once_with()
    writer.info.assert_called_once_with(msg)

    # global status test
    assert output.error_occurred
def test_info(self, caplog):
    # preparation
    writer = self._mock_writer()

    msg = "test message"
    output.info(msg)

    # logging test
    for record in caplog.records:
        assert record.levelname == "INFO"
        assert record.name == __name__
    assert msg in caplog.text

    # writer test
    assert not writer.error_occurred.called
    writer.info.assert_called_once_with(msg)

    # global status test
    assert not output.error_occurred
def check_directories(self, check_strategy):
    """
    Checks backup directories and creates them if they do not exist

    :param CheckStrategy check_strategy: the strategy for the management
        of the results of the various checks
    """
    if self.config.disabled:
        check_strategy.result(self.config.name, 'directories', False)
        for conflict_paths in self.config.msg_list:
            output.info("\t%s" % conflict_paths)
    else:
        try:
            self._make_directories()
        except OSError as e:
            check_strategy.result(self.config.name, 'directories', False,
                                  "%s: %s" % (e.filename, e.strerror))
        else:
            check_strategy.result(self.config.name, 'directories', True)
def _reset_streaming_status(self, postgres_status, streaming_status):
    """
    Reset the status of receive-wal by removing the .partial file that
    is marking the current position and creating one that is current
    with the PostgreSQL insert location
    """
    current_wal = xlog.location_to_xlogfile_name_offset(
        postgres_status["current_lsn"],
        streaming_status["timeline"],
        postgres_status["xlog_segment_size"],
    )["file_name"]
    restart_wal = current_wal
    if (postgres_status["replication_slot"] and
            postgres_status["replication_slot"].restart_lsn):
        restart_wal = xlog.location_to_xlogfile_name_offset(
            postgres_status["replication_slot"].restart_lsn,
            streaming_status["timeline"],
            postgres_status["xlog_segment_size"],
        )["file_name"]
    restart_path = os.path.join(self.config.streaming_wals_directory,
                                restart_wal)
    restart_partial_path = restart_path + ".partial"
    wal_files = sorted(glob(os.path.join(
        self.config.streaming_wals_directory, "*")), reverse=True)

    # Pick the newer file
    last = None
    for last in wal_files:
        if xlog.is_wal_file(last) or xlog.is_partial_file(last):
            break

    # Check if the status is already up-to-date
    if not last or last == restart_partial_path or last == restart_path:
        output.info("Nothing to do. Position of receive-wal is aligned.")
        return

    if os.path.basename(last) > current_wal:
        output.error(
            "The receive-wal position is ahead of PostgreSQL "
            "current WAL lsn (%s > %s)",
            os.path.basename(last),
            postgres_status["current_lsn"],
        )
        return

    output.info("Resetting receive-wal directory status")
    if xlog.is_partial_file(last):
        output.info("Removing status file %s" % last)
        os.unlink(last)
    output.info("Creating status file %s" % restart_partial_path)
    open(restart_partial_path, "w").close()
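# Hedged sketch of the LSN -> WAL file name mapping performed by
# xlog.location_to_xlogfile_name_offset above, re-derived for illustration:
# the name is timeline, xlog id and segment number, each as 8 hex digits,
# where the segment is the offset part of the LSN divided by the segment
# size. The helper name is hypothetical.
def lsn_to_walfile(lsn, timeline, segment_size=16 * 1024 * 1024):
    xlogid_hex, offset_hex = lsn.split('/')
    segment = int(offset_hex, 16) // segment_size
    return '%08X%08X%08X' % (timeline, int(xlogid_hex, 16), segment)

assert lsn_to_walfile('3/4028A088', 1) == '000000010000000300000040'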
def test_info_with_args(self, caplog):
    # preparation
    writer = self._mock_writer()

    msg = 'test format %02d %s'
    args = (1, '2nd')
    output.info(msg, *args)

    # logging test
    for record in caplog.records:
        assert record.levelname == 'INFO'
        assert record.name == __name__
    assert msg % args in caplog.text

    # writer test
    assert not writer.error_occurred.called
    writer.info.assert_called_once_with(msg, *args)

    # global status test
    assert not output.error_occurred
def put_wal(args):
    """
    Receive a WAL file from SERVER_NAME and securely store it in the
    incoming directory. The file will be read from standard input in
    tar format.
    """
    server = get_server(args, inactive_is_error=True)

    if getattr(args, "test", None):
        output.info("Ready to accept WAL files for the server %s",
                    server.config.name)
        return

    try:
        # Python 3.x
        stream = sys.stdin.buffer
    except AttributeError:
        # Python 2.x
        stream = sys.stdin
    with closing(server):
        server.put_wal(stream)
    output.close_and_exit()
def test_info_with_args(self, caplog):
    # preparation
    writer = self._mock_writer()

    msg = 'test format %02d %s'
    args = (1, '2nd')
    output.info(msg, *args)

    # logging test
    for record in caplog.records:
        assert record.levelname == 'INFO'
        assert record.name == __name__
    assert msg % args in caplog.text

    # writer test
    assert not writer.error_occurred.called
    writer.info.assert_called_once_with(msg, *args)

    # global status test
    assert not output.error_occurred
def sync_info(args):
    """
    Output the internal synchronisation status.
    Used to sync_backup with a passive node
    """
    server = get_server(args)
    try:
        # if called with the --primary option
        if getattr(args, 'primary', False):
            primary_info = server.primary_node_info(args.last_wal,
                                                    args.last_position)
            output.info(json.dumps(primary_info, cls=BarmanEncoder,
                                   indent=4),
                        log=False)
        else:
            server.sync_status(args.last_wal, args.last_position)
    except SyncError as e:
        # Catch SyncError exceptions and output only the error message,
        # preventing the stack trace from being logged
        output.error(e)

    output.close_and_exit()
def exec_diagnose(servers):
    """
    Diagnostic command: gathers information from the backup server
    and from all the configured servers.

    Gathered information should be used for support and problem detection

    :param servers: list of configured servers
    """
    # global section. info about barman server
    diagnosis = {}
    diagnosis['global'] = {}
    diagnosis['servers'] = {}
    # barman global config
    diagnosis['global']['config'] = dict(barman.__config__._global_config)
    command = fs.UnixLocalCommand()
    # basic system info
    diagnosis['global']['system_info'] = command.get_system_info()
    diagnosis['global']['system_info']['barman_ver'] = barman.__version__
    # per server section
    for name in sorted(servers):
        server = servers[name]
        if server is None:
            output.error("Unknown server '%s'" % name)
            continue
        # server configuration
        diagnosis['servers'][name] = {}
        diagnosis['servers'][name]['config'] = vars(server.config)
        del diagnosis['servers'][name]['config']['config']
        # server system info
        command = fs.UnixRemoteCommand(
            ssh_command=server.config.ssh_command)
        diagnosis['servers'][name]['system_info'] = \
            command.get_system_info()
        # barman status information for the server
        diagnosis['servers'][name]['status'] = server.get_remote_status()
        # backup list
        status_filter = BackupInfo.STATUS_NOT_EMPTY
        backups = server.get_available_backups(status_filter)
        diagnosis['servers'][name]['backups'] = backups
    output.info(json.dumps(diagnosis, cls=BarmanEncoder, indent=4,
                           sort_keys=True))
def create_backup_manifest(self):
    """
    Create a manifest file for the backup if it does not exist yet.

    :return: None
    """
    if self.file_manager.file_exist(self._get_manifest_file_path()):
        msg = ("File %s already exists. Skip file creation." %
               self._get_manifest_file_path())
        logging.info(msg)
        output.info(msg)
        return
    self._create_files_metadata()
    str_manifest = self._get_manifest_str()
    # Compute the checksum of the manifest body, which at this point
    # ends with ',' instead of the final '}'
    manifest_checksum = self.checksum_algorithm.checksum_from_str(
        str_manifest)
    last_line = '"Manifest-Checksum": "%s"}\n' % manifest_checksum
    full_manifest = str_manifest + last_line
    self.file_manager.save_content_to_file(self._get_manifest_file_path(),
                                           full_manifest.encode(),
                                           file_mode="wb")
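# Hedged sketch of the finalisation step above: the checksum is computed over
# the manifest body, which at that point still ends with ',' in place of the
# closing '}', and is then appended as the last key. SHA-256 is an assumption
# here; the checksum algorithm is configurable.
import hashlib

str_manifest = '{"PostgreSQL-Backup-Manifest-Version": 1,'
checksum = hashlib.sha256(str_manifest.encode()).hexdigest()
full_manifest = str_manifest + '"Manifest-Checksum": "%s"}\n' % checksum
assert full_manifest.endswith('"}\n')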
def backup(self, backup_info):
    """
    Implementation of the BackupExecutor.backup(backup_info) method.
    Execute the copy of a backup from a remote server using rsync

    :param barman.infofile.BackupInfo backup_info: the object
        representing the backup.
    :returns: the representation of a finalized backup.
    """
    # Start the backup, all the subsequent code must be wrapped in a
    # try except block which finally issues a backup_stop command
    self.start_backup(backup_info)
    try:
        # save any metadata changed by start_backup() call
        # This must be inside the try-except, because it could fail
        backup_info.save()

        # If we are the first backup, purge unused WAL files
        previous_backup = self.backup_manager.get_previous_backup(
            backup_info.backup_id)
        if not previous_backup:
            self.backup_manager.remove_wal_before_backup(backup_info)

        output.info("Backup start at xlog location: %s (%s, %08X)",
                    backup_info.begin_xlog,
                    backup_info.begin_wal,
                    backup_info.begin_offset)

        # Start the copy
        self.current_action = "copying files"
        output.info("Copying files.")
        # perform the backup copy, honouring the retry option if set
        self.backup_manager.retry_backup_copy(self.backup_copy,
                                              backup_info)
        output.info("Copy done.")
    except BaseException:
        # we do not need to do anything here besides re-raising the
        # exception. It will be handled in the external try block.
        raise
    else:
        self.current_action = "issuing stop of the backup"
        output.info("Asking PostgreSQL server to finalize the backup.")
    finally:
        self.stop_backup(backup_info)
        if BackupOptions.CONCURRENT_BACKUP in self.config.backup_options:
            self.current_action = "writing backup label"
            self._write_backup_label(backup_info)
def cron():
    """
    Run maintenance tasks (global command)
    """
    try:
        with lockfile.GlobalCronLock(barman.__config__.barman_lock_directory):
            # Skip inactive and temporarily disabled servers
            servers = get_server_list(skip_inactive=True,
                                      skip_disabled=True)
            for name in sorted(servers):
                server = servers[name]
                # Exception: manage_server_command is not invoked here
                # Normally you would call manage_server_command to check
                # if the server is None and to report inactive and
                # disabled servers, but here we have only active and
                # well configured servers.
                server.cron()
    except lockfile.LockFileBusy:
        output.info("Another cron is running")
    except lockfile.LockFilePermissionDenied as e:
        output.error("Permission denied, unable to access '%s'", e)
def _set_pitr_targets(self, recovery_info, backup_info, dest, target_name,
                      target_time, target_tli, target_xid,
                      target_immediate, target_action):
    """
    Set PITR targets - as specified by the user

    :param dict recovery_info: Dictionary containing all the recovery
        parameters
    :param barman.infofile.BackupInfo backup_info: representation of a
        backup
    :param str dest: destination directory of the recovery
    :param str|None target_name: recovery target name for PITR
    :param str|None target_time: recovery target time for PITR
    :param str|None target_tli: recovery target timeline for PITR
    :param str|None target_xid: recovery target transaction id for PITR
    :param bool|None target_immediate: end recovery as soon as
        consistency is reached
    :param str|None target_action: recovery target action for PITR
    """
    target_epoch = None
    target_datetime = None

    d_immediate = backup_info.version >= 90400 and target_immediate
    d_tli = target_tli and target_tli != backup_info.timeline

    # Detect PITR
    if target_time or target_xid or d_tli or target_name or d_immediate:
        recovery_info['is_pitr'] = True
        targets = {}
        if target_time:
            try:
                target_datetime = dateutil.parser.parse(target_time)
            except ValueError as e:
                raise RecoveryInvalidTargetException(
                    "Unable to parse the target time parameter %r: %s" %
                    (target_time, e))
            except TypeError:
                # this should not happen, but there is a known bug in
                # dateutil.parser.parse() implementation
                # ref: https://bugs.launchpad.net/dateutil/+bug/1247643
                raise RecoveryInvalidTargetException(
                    "Unable to parse the target time parameter %r" %
                    target_time)

            # If the parsed timestamp is naive, forces it to local timezone
            if target_datetime.tzinfo is None:
                target_datetime = target_datetime.replace(
                    tzinfo=dateutil.tz.tzlocal())

            # Check if the target time is reachable from the
            # selected backup
            if backup_info.end_time > target_datetime:
                raise RecoveryInvalidTargetException(
                    "The requested target time %s "
                    "is before the backup end time %s" %
                    (target_datetime, backup_info.end_time))

            ms = target_datetime.microsecond / 1000000.
            target_epoch = time.mktime(target_datetime.timetuple()) + ms
            targets['time'] = str(target_datetime)
        if target_xid:
            targets['xid'] = str(target_xid)
        if target_tli and target_tli != backup_info.timeline:
            targets['timeline'] = str(target_tli)
        if target_name:
            targets['name'] = str(target_name)
        if target_immediate:
            targets['immediate'] = target_immediate

        # Manage the target_action option
        if backup_info.version < 90100:
            if target_action:
                raise RecoveryTargetActionException(
                    "Illegal target action '%s' "
                    "for this version of PostgreSQL" % target_action)
        elif 90100 <= backup_info.version < 90500:
            if target_action == 'pause':
                recovery_info['pause_at_recovery_target'] = "on"
            elif target_action:
                raise RecoveryTargetActionException(
                    "Illegal target action '%s' "
                    "for this version of PostgreSQL" % target_action)
        else:
            if target_action in ('pause', 'shutdown', 'promote'):
                recovery_info['recovery_target_action'] = target_action
            elif target_action:
                raise RecoveryTargetActionException(
                    "Illegal target action '%s' "
                    "for this version of PostgreSQL" % target_action)

        output.info(
            "Doing PITR. Recovery target %s",
            (", ".join(["%s: %r" % (k, v) for k, v in targets.items()])))
        recovery_info['wal_dest'] = os.path.join(dest, 'barman_xlog')

        # With a PostgreSQL version older than 8.4, it is the user's
        # responsibility to delete the "barman_xlog" directory as the
        # restore_command option in recovery.conf is not supported
        if backup_info.version < 80400 and \
                not recovery_info['get_wal']:
            recovery_info['results']['delete_barman_xlog'] = True
    else:
        if target_action:
            raise RecoveryTargetActionException(
                "Can't enable recovery target action when PITR "
                "is not required")

    recovery_info['target_epoch'] = target_epoch
    recovery_info['target_datetime'] = target_datetime
def recover(self, backup_info, dest, tablespaces=None, remote_command=None,
            target_tli=None, target_time=None, target_xid=None,
            target_name=None, target_immediate=False, exclusive=False,
            target_action=None, standby_mode=None):
    """
    Performs a recovery of a backup

    This method should be called in a closing context

    :param barman.infofile.BackupInfo backup_info: the backup to recover
    :param str dest: the destination directory
    :param dict[str,str]|None tablespaces: a tablespace
        name -> location map (for relocation)
    :param str|None remote_command: The remote command to recover
        the base backup, in case of remote backup.
    :param str|None target_tli: the target timeline
    :param str|None target_time: the target time
    :param str|None target_xid: the target xid
    :param str|None target_name: the target name created previously with
        pg_create_restore_point() function call
    :param bool|None target_immediate: end recovery as soon as
        consistency is reached
    :param bool exclusive: whether the recovery is exclusive or not
    :param str|None target_action: The recovery target action
    :param bool|None standby_mode: standby mode
    """
    # Run the cron to be sure the wal catalog is up to date
    # Prepare a map that contains all the objects required for a recovery
    recovery_info = self._setup(backup_info, remote_command, dest)
    output.info("Starting %s restore for server %s using backup %s",
                recovery_info['recovery_dest'], self.server.config.name,
                backup_info.backup_id)
    output.info("Destination directory: %s", dest)
    if remote_command:
        output.info("Remote command: %s", remote_command)

    # If the backup we are recovering is still not validated and we
    # haven't requested the get-wal feature, display a warning message
    if not recovery_info['get_wal']:
        if backup_info.status == BackupInfo.WAITING_FOR_WALS:
            output.warning(
                "IMPORTANT: You have requested a recovery operation for "
                "a backup that does not yet have all the WAL files that "
                "are required for consistency.")

    # Set targets for PITR
    self._set_pitr_targets(recovery_info, backup_info, dest, target_name,
                           target_time, target_tli, target_xid,
                           target_immediate, target_action)

    # Retrieve the safe_horizon for smart copy
    self._retrieve_safe_horizon(recovery_info, backup_info, dest)

    # check destination directory. If it doesn't exist, create it
    try:
        recovery_info['cmd'].create_dir_if_not_exists(dest)
    except FsOperationFailed as e:
        output.error("unable to initialise destination directory "
                     "'%s': %s", dest, e)
        output.close_and_exit()

    # Initialize tablespace directories
    if backup_info.tablespaces:
        self._prepare_tablespaces(backup_info, recovery_info['cmd'],
                                  dest, tablespaces)
    # Copy the base backup
    output.info("Copying the base backup.")
    try:
        self._backup_copy(backup_info, dest, tablespaces, remote_command,
                          recovery_info['safe_horizon'])
    except DataTransferFailure as e:
        output.error("Failure copying base backup: %s", e)
        output.close_and_exit()

    # Copy the backup.info file in the destination as
    # ".barman-recover.info"
    if remote_command:
        try:
            recovery_info['rsync'](backup_info.filename,
                                   ':%s/.barman-recover.info' % dest)
        except CommandFailedException as e:
            output.error('copy of recovery metadata file failed: %s', e)
            output.close_and_exit()
    else:
        backup_info.save(os.path.join(dest, '.barman-recover.info'))

    # Standby mode is not available for PostgreSQL older than 9.0
    if backup_info.version < 90000 and standby_mode:
        raise RecoveryStandbyModeException(
            'standby_mode is available only from PostgreSQL 9.0')

    # Restore the WAL segments. If the get-wal option is set, skip this
    # phase as they will be retrieved using the get-wal command.
    if not recovery_info['get_wal']:
        # If the backup we restored is still waiting for WALs, read the
        # backup info again and check whether it has been validated.
        # Notify the user if it is still not DONE.
        if backup_info.status == BackupInfo.WAITING_FOR_WALS:
            data = BackupInfo(self.server, backup_info.filename)
            if data.status == BackupInfo.WAITING_FOR_WALS:
                output.warning(
                    "IMPORTANT: The backup we have recovered IS NOT "
                    "VALID. Required WAL files for consistency are "
                    "missing. Please verify that WAL archiving is "
                    "working correctly or evaluate using the 'get-wal' "
                    "option for recovery")
        output.info("Copying required WAL segments.")
        try:
            # Retrieve a list of required log files
            required_xlog_files = tuple(
                self.server.get_required_xlog_files(
                    backup_info, target_tli,
                    recovery_info['target_epoch']))

            # Restore WAL segments into the wal_dest directory
            self._xlog_copy(required_xlog_files,
                            recovery_info['wal_dest'],
                            remote_command)
        except DataTransferFailure as e:
            output.error("Failure copying WAL files: %s", e)
            output.close_and_exit()
        except BadXlogSegmentName as e:
            output.error(
                "invalid xlog segment name %r\n"
                "HINT: Please run \"barman rebuild-xlogdb %s\" "
                "to solve this issue",
                force_str(e), self.config.name)
            output.close_and_exit()
        # If WAL files are put directly in the pg_xlog directory,
        # avoid shipping of just recovered files
        # by creating the corresponding archive status file
        if not recovery_info['is_pitr']:
            output.info("Generating archive status files")
            self._generate_archive_status(recovery_info, remote_command,
                                          required_xlog_files)

    # Generate recovery.conf file (only if needed by PITR or get_wal)
    is_pitr = recovery_info['is_pitr']
    get_wal = recovery_info['get_wal']
    if is_pitr or get_wal or standby_mode:
        output.info("Generating recovery.conf")
        self._generate_recovery_conf(recovery_info, backup_info, dest,
                                     target_immediate, exclusive,
                                     remote_command, target_name,
                                     target_time, target_tli, target_xid,
                                     standby_mode)

    # Create archive_status directory if necessary
    archive_status_dir = os.path.join(recovery_info['wal_dest'],
                                      'archive_status')
    try:
        recovery_info['cmd'].create_dir_if_not_exists(archive_status_dir)
    except FsOperationFailed as e:
        output.error("unable to create the archive_status directory "
                     "'%s': %s", archive_status_dir, e)
        output.close_and_exit()

    # As last step, analyse configuration files in order to spot
    # harmful options. Barman performs automatic conversion of
    # some options as well as notifying users of their existence.
    #
    # This operation is performed in three steps:
    # 1) mapping
    # 2) analysis
    # 3) copy
    output.info("Identify dangerous settings in destination directory.")
    self._map_temporary_config_files(recovery_info, backup_info,
                                     remote_command)
    self._analyse_temporary_config_files(recovery_info)
    self._copy_temporary_config_files(dest, remote_command, recovery_info)

    return recovery_info
def _set_pitr_targets(self, recovery_info, backup_info, dest, target_name,
                      target_time, target_tli, target_xid):
    """
    Set PITR targets - as specified by the user

    :param dict recovery_info: Dictionary containing all the recovery
        parameters
    :param barman.infofile.BackupInfo backup_info: representation of a
        backup
    :param str dest: destination directory of the recovery
    :param str|None target_name: recovery target name for PITR
    :param str|None target_time: recovery target time for PITR
    :param str|None target_tli: recovery target timeline for PITR
    :param str|None target_xid: recovery target transaction id for PITR
    """
    target_epoch = None
    target_datetime = None
    if (target_time or
            target_xid or
            (target_tli and target_tli != backup_info.timeline) or
            target_name or
            recovery_info['get_wal']):
        recovery_info['is_pitr'] = True
        targets = {}
        if target_time:
            # noinspection PyBroadException
            try:
                target_datetime = dateutil.parser.parse(target_time)
            except ValueError as e:
                output.error(
                    "unable to parse the target time parameter %r: %s",
                    target_time, e)
                self._teardown(recovery_info)
                output.close_and_exit()
            except Exception:
                # this should not happen, but there is a known bug in
                # dateutil.parser.parse() implementation
                # ref: https://bugs.launchpad.net/dateutil/+bug/1247643
                output.error(
                    "unable to parse the target time parameter %r",
                    target_time)
                output.close_and_exit()
            target_epoch = (time.mktime(target_datetime.timetuple()) +
                            (target_datetime.microsecond / 1000000.))
            targets['time'] = str(target_datetime)
        if target_xid:
            targets['xid'] = str(target_xid)
        if target_tli and target_tli != backup_info.timeline:
            targets['timeline'] = str(target_tli)
        if target_name:
            targets['name'] = str(target_name)
        output.info(
            "Doing PITR. Recovery target %s",
            (", ".join(["%s: %r" % (k, v) for k, v in targets.items()])))
        recovery_info['wal_dest'] = os.path.join(dest, 'barman_xlog')
        # With a PostgreSQL version older than 8.4, it is the user's
        # responsibility to delete the "barman_xlog" directory as the
        # restore_command option in recovery.conf is not supported
        if backup_info.version < 80400 and \
                not recovery_info['get_wal']:
            recovery_info['results']['delete_barman_xlog'] = True
    recovery_info['target_epoch'] = target_epoch
    recovery_info['target_datetime'] = target_datetime
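# Worked example of the target_epoch arithmetic above: mktime() yields whole
# epoch seconds from the local-time tuple, and the sub-second part is carried
# separately by datetime.microsecond.
import datetime
import time

target_datetime = datetime.datetime(2024, 1, 1, 12, 0, 0, 250000)
target_epoch = (time.mktime(target_datetime.timetuple()) +
                (target_datetime.microsecond / 1000000.))
assert target_epoch - time.mktime(target_datetime.timetuple()) == 0.25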
def backup(self, wait=False, wait_timeout=None):
    """
    Performs a backup for the server

    :param bool wait: wait for all the required WAL files to be archived
    :param int|None wait_timeout: the time, in seconds, to wait for the
        required WAL files to be archived
    :return BackupInfo: the generated BackupInfo
    """
    _logger.debug("initialising backup information")
    self.executor.init()
    backup_info = None
    try:
        # Create the BackupInfo object representing the backup
        backup_info = LocalBackupInfo(
            self.server,
            backup_id=datetime.datetime.now().strftime('%Y%m%dT%H%M%S'))
        backup_info.set_attribute('systemid', self.server.systemid)
        backup_info.save()
        self.backup_cache_add(backup_info)
        output.info("Starting backup using %s method for server %s in %s",
                    self.mode,
                    self.config.name,
                    backup_info.get_basebackup_directory())

        # Run the pre-backup-script if present.
        script = HookScriptRunner(self, 'backup_script', 'pre')
        script.env_from_backup_info(backup_info)
        script.run()

        # Run the pre-backup-retry-script if present.
        retry_script = RetryHookScriptRunner(self, 'backup_retry_script',
                                             'pre')
        retry_script.env_from_backup_info(backup_info)
        retry_script.run()

        # Do the backup using the BackupExecutor
        self.executor.backup(backup_info)

        # Create a restore point after a backup
        target_name = 'barman_%s' % backup_info.backup_id
        self.server.postgres.create_restore_point(target_name)

        # Free the Postgres connection
        self.server.postgres.close()

        # Compute backup size and fsync it on disk
        self.backup_fsync_and_set_sizes(backup_info)

        # Mark the backup as WAITING_FOR_WALS
        backup_info.set_attribute("status", BackupInfo.WAITING_FOR_WALS)
    # Use BaseException instead of Exception to catch events like
    # KeyboardInterrupt (e.g.: CTRL-C)
    except BaseException as e:
        msg_lines = force_str(e).strip().splitlines()
        # If the exception has no attached message use the raw type name
        if len(msg_lines) == 0:
            msg_lines = [type(e).__name__]
        if backup_info:
            # Use only the first line of the exception message
            # in the backup_info error field
            backup_info.set_attribute("status", BackupInfo.FAILED)
            backup_info.set_attribute(
                "error",
                "failure %s (%s)" % (
                    self.executor.current_action, msg_lines[0]))
        output.error("Backup failed %s.\nDETAILS: %s",
                     self.executor.current_action, '\n'.join(msg_lines))
    else:
        output.info("Backup end at LSN: %s (%s, %08X)",
                    backup_info.end_xlog,
                    backup_info.end_wal,
                    backup_info.end_offset)
        output.info(
            "Backup completed (start time: %s, elapsed time: %s)",
            self.executor.copy_start_time,
            human_readable_timedelta(
                datetime.datetime.now() - self.executor.copy_start_time))
        # If requested, wait for end_wal to be archived
        if wait:
            try:
                self.server.wait_for_wal(backup_info.end_wal, wait_timeout)
                self.check_backup(backup_info)
            except KeyboardInterrupt:
                # Ignore CTRL-C pressed while waiting for WAL files
                output.info(
                    "Got CTRL-C. Continuing without waiting for '%s' "
                    "to be archived", backup_info.end_wal)
    finally:
        if backup_info:
            backup_info.save()

            # Make sure we are not holding any PostgreSQL connection
            # during the post-backup scripts
            self.server.close()

            # Run the post-backup-retry-script if present.
            try:
                retry_script = RetryHookScriptRunner(
                    self, 'backup_retry_script', 'post')
                retry_script.env_from_backup_info(backup_info)
                retry_script.run()
            except AbortedRetryHookScript as e:
                # Ignore the ABORT_STOP as it is a post-hook operation
                _logger.warning(
                    "Ignoring stop request after receiving "
                    "abort (exit code %d) from post-backup "
                    "retry hook script: %s",
                    e.hook.exit_status, e.hook.script)

            # Run the post-backup-script if present.
            script = HookScriptRunner(self, 'backup_script', 'post')
            script.env_from_backup_info(backup_info)
            script.run()

    output.result('backup', backup_info)
    return backup_info
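# Reduced sketch of the error-message fallback used in the backup() error
# path above: when an exception carries no text, the raw type name is
# reported instead (force_str from barman.utils is replaced by plain str
# here for the sake of the sketch).
def _error_lines(e):
    msg_lines = str(e).strip().splitlines()
    if len(msg_lines) == 0:
        msg_lines = [type(e).__name__]
    return msg_lines

assert _error_lines(KeyboardInterrupt()) == ['KeyboardInterrupt']
assert _error_lines(ValueError('boom')) == ['boom']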
def archive(self, fxlogdb, verbose=True):
    """
    Archive WAL files, discarding duplicates or those that are not valid.

    :param file fxlogdb: File object for xlogdb interactions
    :param boolean verbose: Flag for verbose output
    """
    compressor = self.backup_manager.compression_manager.get_compressor()
    stamp = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
    found = False
    if verbose:
        output.info("Processing xlog segments from %s for %s",
                    self.name,
                    self.config.name,
                    log=False)
    batch = self.get_next_batch()
    for wal_info in batch:
        if not found and not verbose:
            output.info("Processing xlog segments from %s for %s",
                        self.name,
                        self.config.name,
                        log=False)
        found = True

        # Report to the user the WAL file we are archiving
        output.info("\t%s", wal_info.name, log=False)
        _logger.info("Archiving %s/%s", self.config.name, wal_info.name)
        # Archive the WAL file
        try:
            self.archive_wal(compressor, wal_info)
        except MatchingDuplicateWalFile:
            # We already have this file. Simply unlink the file.
            os.unlink(wal_info.orig_filename)
            continue
        except DuplicateWalFile:
            output.info("\tError: %s is already present in server %s. "
                        "File moved to errors directory.",
                        wal_info.name,
                        self.config.name)
            error_dst = os.path.join(
                self.config.errors_directory,
                "%s.%s.duplicate" % (wal_info.name, stamp))
            # TODO: cover corner case of duplication (unlikely,
            # but theoretically possible)
            shutil.move(wal_info.orig_filename, error_dst)
            continue
        except AbortedRetryHookScript as e:
            _logger.warning("Archiving of %s/%s aborted by "
                            "pre_archive_retry_script. "
                            "Reason: %s" % (self.config.name,
                                            wal_info.name,
                                            e))
            return
        # Update the information of the WAL archive with
        # the latest segment
        fxlogdb.write(wal_info.to_xlogdb_line())
        # flush and fsync for every line
        fxlogdb.flush()
        os.fsync(fxlogdb.fileno())
    if not found and verbose:
        output.info("\tno file found", log=False)
    if batch.errors:
        output.info("Some unknown objects have been found while "
                    "processing xlog segments for %s. "
                    "Objects moved to errors directory:",
                    self.config.name,
                    log=False)
        for error in batch.errors:
            output.info("\t%s", error)
            error_dst = os.path.join(
                self.config.errors_directory,
                "%s.%s.unknown" % (os.path.basename(error), stamp))
            shutil.move(error, error_dst)
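# Reduced sketch of the per-line durability pattern used above: each
# xlogdb line is flushed to the OS and fsynced before the next segment is
# processed, so a crash cannot lose the record of an already-archived WAL
# (append_durably is an illustrative helper, not part of Barman).
import os

def append_durably(f, line):
    f.write(line)
    f.flush()             # push Python's buffer to the OS
    os.fsync(f.fileno())  # force the OS to write it to disk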
def delete_backup(self, backup):
    """
    Delete a backup

    :param backup: the backup to delete
    """
    available_backups = self.get_available_backups()
    minimum_redundancy = self.server.config.minimum_redundancy
    # Honour minimum required redundancy
    if backup.status == BackupInfo.DONE and \
            minimum_redundancy >= len(available_backups):
        output.warning("Skipping delete of backup %s for server %s "
                       "due to minimum redundancy requirements "
                       "(minimum redundancy = %s, "
                       "current redundancy = %s)",
                       backup.backup_id,
                       self.config.name,
                       minimum_redundancy,
                       len(available_backups))
        return
    # Keep track of when the delete operation started.
    delete_start_time = datetime.datetime.now()
    output.info("Deleting backup %s for server %s",
                backup.backup_id, self.config.name)
    previous_backup = self.get_previous_backup(backup.backup_id)
    next_backup = self.get_next_backup(backup.backup_id)
    # Delete all the data contained in the backup
    try:
        self.delete_backup_data(backup)
    except OSError as e:
        output.error("Failure deleting backup %s for server %s.\n%s",
                     backup.backup_id, self.config.name, e)
        return
    # Check if we are deleting the first available backup
    if not previous_backup:
        # In the case of exclusive backup (default), removes any WAL
        # files associated to the backup being deleted.
        # In the case of concurrent backup, removes only WAL files
        # prior to the start of the backup being deleted, as they
        # might be useful to any concurrent backup started immediately
        # after.
        remove_until = None  # means to remove all WAL files
        if next_backup:
            remove_until = next_backup
        elif BackupOptions.CONCURRENT_BACKUP in self.config.backup_options:
            remove_until = backup

        timelines_to_protect = set()
        # If remove_until is not set there are no backups left
        if remove_until:
            # Retrieve the list of extra timelines that contain at least
            # a backup. On such timelines we don't want to delete any WAL
            for value in self.get_available_backups(
                    BackupInfo.STATUS_ARCHIVING).values():
                # Ignore the backup that is being deleted
                if value == backup:
                    continue
                timelines_to_protect.add(value.timeline)
            # Remove the timeline of `remove_until` from the list.
            # We have enough information to safely delete unused WAL files
            # on it.
            timelines_to_protect -= set([remove_until.timeline])

        output.info("Delete associated WAL segments:")
        for name in self.remove_wal_before_backup(remove_until,
                                                  timelines_to_protect):
            output.info("\t%s", name)
    # As last action, remove the backup directory,
    # ending the delete operation
    try:
        self.delete_basebackup(backup)
    except OSError as e:
        output.error("Failure deleting backup %s for server %s.\n%s\n"
                     "Please manually remove the '%s' directory",
                     backup.backup_id, self.config.name, e,
                     backup.get_basebackup_directory())
        return
    self.backup_cache_remove(backup)
    # Save the time of the complete removal of the backup
    delete_end_time = datetime.datetime.now()
    output.info("Deleted backup %s (start time: %s, elapsed time: %s)",
                backup.backup_id,
                delete_start_time.ctime(),
                human_readable_timedelta(delete_end_time -
                                         delete_start_time))
def rebuild_xlogdb(self):
    """
    Rebuild the whole xlog database guessing it from the archive content.
    """
    from os.path import isdir, join

    output.info("Rebuilding xlogdb for server %s", self.config.name)
    root = self.config.wals_directory
    default_compression = self.config.compression
    wal_count = label_count = history_count = 0
    # lock the xlogdb as we are about to replace it completely
    with self.server.xlogdb('w') as fxlogdb:
        xlogdb_new = fxlogdb.name + ".new"
        with open(xlogdb_new, 'w') as fxlogdb_new:
            for name in sorted(os.listdir(root)):
                # ignore the xlogdb and its lockfile
                if name.startswith(self.server.XLOG_DB):
                    continue
                fullname = join(root, name)
                if isdir(fullname):
                    # all relevant files are in subdirectories
                    hash_dir = fullname
                    for wal_name in sorted(os.listdir(hash_dir)):
                        fullname = join(hash_dir, wal_name)
                        if isdir(fullname):
                            _logger.warning(
                                'unexpected directory '
                                'rebuilding the wal database: %s',
                                fullname)
                        else:
                            if xlog.is_wal_file(fullname):
                                wal_count += 1
                            elif xlog.is_backup_file(fullname):
                                label_count += 1
                            else:
                                _logger.warning(
                                    'unexpected file '
                                    'rebuilding the wal database: %s',
                                    fullname)
                                continue
                            wal_info = WalFileInfo.from_file(
                                fullname,
                                default_compression=default_compression)
                            fxlogdb_new.write(wal_info.to_xlogdb_line())
                else:
                    # only history files are here
                    if xlog.is_history_file(fullname):
                        history_count += 1
                        wal_info = WalFileInfo.from_file(
                            fullname,
                            default_compression=default_compression)
                        fxlogdb_new.write(wal_info.to_xlogdb_line())
                    else:
                        _logger.warning(
                            'unexpected file '
                            'rebuilding the wal database: %s',
                            fullname)
            os.fsync(fxlogdb_new.fileno())
        shutil.move(xlogdb_new, fxlogdb.name)
        fsync_dir(os.path.dirname(fxlogdb.name))
    output.info('Done rebuilding xlogdb for server %s '
                '(history: %s, backup_labels: %s, wal_file: %s)',
                self.config.name, history_count, label_count, wal_count)
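# Generic sketch of the atomic-replacement pattern used by rebuild_xlogdb
# above (atomic_replace is an illustrative helper, not part of Barman):
# write the new content to a sibling ".new" file, fsync it, rename it over
# the original, then fsync the containing directory so the rename itself
# survives a crash.
import os
import shutil

def atomic_replace(path, lines):
    new_path = path + '.new'
    with open(new_path, 'w') as f:
        f.writelines(lines)
        f.flush()
        os.fsync(f.fileno())
    shutil.move(new_path, path)
    # fsync the directory so the new directory entry is durable
    dir_fd = os.open(os.path.dirname(path) or '.', os.O_RDONLY)
    try:
        os.fsync(dir_fd)
    finally:
        os.close(dir_fd)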
def backup(self):
    """
    Performs a backup for the server
    """
    _logger.debug("initialising backup information")
    self.executor.init()
    backup_info = None
    try:
        # Create the BackupInfo object representing the backup
        backup_info = BackupInfo(
            self.server,
            backup_id=datetime.datetime.now().strftime('%Y%m%dT%H%M%S'))
        backup_info.save()
        self.backup_cache_add(backup_info)
        output.info("Starting backup using %s method for server %s in %s",
                    self.mode,
                    self.config.name,
                    backup_info.get_basebackup_directory())

        # Run the pre-backup-script if present.
        script = HookScriptRunner(self, 'backup_script', 'pre')
        script.env_from_backup_info(backup_info)
        script.run()

        # Run the pre-backup-retry-script if present.
        retry_script = RetryHookScriptRunner(self, 'backup_retry_script',
                                             'pre')
        retry_script.env_from_backup_info(backup_info)
        retry_script.run()

        # Do the backup using the BackupExecutor
        self.executor.backup(backup_info)

        # Compute backup size and fsync it on disk
        self.backup_fsync_and_set_sizes(backup_info)

        # Mark the backup as DONE
        backup_info.set_attribute("status", "DONE")
    # Use BaseException instead of Exception to catch events like
    # KeyboardInterrupt (e.g.: CTRL-C)
    except BaseException as e:
        msg_lines = str(e).strip().splitlines()
        # If the exception has no attached message use the raw type name
        if len(msg_lines) == 0:
            msg_lines = [type(e).__name__]
        if backup_info:
            # Use only the first line of the exception message
            # in the backup_info error field
            backup_info.set_attribute("status", "FAILED")
            backup_info.set_attribute(
                "error",
                "failure %s (%s)" % (
                    self.executor.current_action, msg_lines[0]))
        output.error("Backup failed %s.\nDETAILS: %s\n%s",
                     self.executor.current_action,
                     msg_lines[0],
                     '\n'.join(msg_lines[1:]))
    else:
        output.info("Backup end at LSN: %s (%s, %08X)",
                    backup_info.end_xlog,
                    backup_info.end_wal,
                    backup_info.end_offset)
        output.info(
            "Backup completed (start time: %s, elapsed time: %s)",
            self.executor.copy_start_time,
            human_readable_timedelta(self.executor.copy_end_time -
                                     self.executor.copy_start_time))
        # Create a restore point after a backup
        target_name = 'barman_%s' % backup_info.backup_id
        self.server.postgres.create_restore_point(target_name)
    finally:
        if backup_info:
            backup_info.save()

            # Make sure we are not holding any PostgreSQL connection
            # during the post-backup scripts
            self.server.close()

            # Run the post-backup-retry-script if present.
            try:
                retry_script = RetryHookScriptRunner(
                    self, 'backup_retry_script', 'post')
                retry_script.env_from_backup_info(backup_info)
                retry_script.run()
            except AbortedRetryHookScript as e:
                # Ignore the ABORT_STOP as it is a post-hook operation
                _logger.warning(
                    "Ignoring stop request after receiving "
                    "abort (exit code %d) from post-backup "
                    "retry hook script: %s",
                    e.hook.exit_status, e.hook.script)

            # Run the post-backup-script if present.
            script = HookScriptRunner(self, 'backup_script', 'post')
            script.env_from_backup_info(backup_info)
            script.run()

    output.result('backup', backup_info)
def archive_wal(self, verbose=True):
    """
    Executes WAL maintenance operations, such as archiving and compression

    If verbose is set to False, outputs something only if there is
    at least one file

    :param bool verbose: report even if no actions
    """
    found = False
    compressor = self.compression_manager.get_compressor()
    with self.server.xlogdb('a') as fxlogdb:
        if verbose:
            output.info("Processing xlog segments for %s",
                        self.config.name,
                        log=False)
        # Get the first available backup
        first_backup_id = self.get_first_backup(BackupInfo.STATUS_NOT_EMPTY)
        first_backup = self.server.get_backup(first_backup_id)
        for filename in sorted(glob(
                os.path.join(self.config.incoming_wals_directory, '*'))):
            if not found and not verbose:
                output.info("Processing xlog segments for %s",
                            self.config.name,
                            log=False)
            found = True

            # Create WAL Info object
            wal_info = WalFileInfo.from_file(filename, compression=None)

            # If there are no available backups ...
            if first_backup is None:
                # ... delete xlog segments only for exclusive backups
                if BackupOptions.CONCURRENT_BACKUP \
                        not in self.config.backup_options:
                    # Skipping history files
                    if not xlog.is_history_file(filename):
                        output.info("\tNo base backup available."
                                    " Trashing file %s"
                                    " from server %s",
                                    wal_info.name, self.config.name)
                        os.unlink(filename)
                        continue
            # ... otherwise
            else:
                # ... delete xlog segments older than the first backup
                if wal_info.name < first_backup.begin_wal:
                    # Skipping history files
                    if not xlog.is_history_file(filename):
                        output.info("\tOlder than first backup."
                                    " Trashing file %s"
                                    " from server %s",
                                    wal_info.name, self.config.name)
                        os.unlink(filename)
                        continue

            # Report to the user the WAL file we are archiving
            output.info("\t%s", os.path.basename(filename), log=False)
            _logger.info("Archiving %s/%s",
                         self.config.name,
                         os.path.basename(filename))
            # Archive the WAL file
            try:
                self.cron_wal_archival(compressor, wal_info)
            except AbortedRetryHookScript as e:
                _logger.warning("Archiving of %s/%s aborted by "
                                "pre_archive_retry_script. "
                                "Reason: %s" % (self.config.name,
                                                os.path.basename(filename),
                                                e))
                return

            # Update the information of the WAL archive with
            # the latest segment
            fxlogdb.write(wal_info.to_xlogdb_line())
            # flush and fsync for every line
            fxlogdb.flush()
            os.fsync(fxlogdb.fileno())
        if not found and verbose:
            output.info("\tno file found", log=False)
def delete_backup(self, backup):
    """
    Delete a backup

    :param backup: the backup to delete
    :return bool: True if deleted, False if could not delete the backup
    """
    available_backups = self.get_available_backups(
        status_filter=(BackupInfo.DONE,))
    minimum_redundancy = self.server.config.minimum_redundancy
    # Honour minimum required redundancy
    if backup.status == BackupInfo.DONE and \
            minimum_redundancy >= len(available_backups):
        output.warning("Skipping delete of backup %s for server %s "
                       "due to minimum redundancy requirements "
                       "(minimum redundancy = %s, "
                       "current redundancy = %s)",
                       backup.backup_id,
                       self.config.name,
                       minimum_redundancy,
                       len(available_backups))
        return False
    # Keep track of when the delete operation started.
    delete_start_time = datetime.datetime.now()

    # Run the pre_delete_script if present.
    script = HookScriptRunner(self, 'delete_script', 'pre')
    script.env_from_backup_info(backup)
    script.run()

    # Run the pre_delete_retry_script if present.
    retry_script = RetryHookScriptRunner(self, 'delete_retry_script',
                                         'pre')
    retry_script.env_from_backup_info(backup)
    retry_script.run()

    output.info("Deleting backup %s for server %s",
                backup.backup_id, self.config.name)
    previous_backup = self.get_previous_backup(backup.backup_id)
    next_backup = self.get_next_backup(backup.backup_id)
    # Delete all the data contained in the backup
    try:
        self.delete_backup_data(backup)
    except OSError as e:
        output.error("Failure deleting backup %s for server %s.\n%s",
                     backup.backup_id, self.config.name, e)
        return False
    # Check if we are deleting the first available backup
    if not previous_backup:
        # In the case of exclusive backup (default), removes any WAL
        # files associated to the backup being deleted.
        # In the case of concurrent backup, removes only WAL files
        # prior to the start of the backup being deleted, as they
        # might be useful to any concurrent backup started immediately
        # after.
        remove_until = None  # means to remove all WAL files
        if next_backup:
            remove_until = next_backup
        elif BackupOptions.CONCURRENT_BACKUP in self.config.backup_options:
            remove_until = backup

        timelines_to_protect = set()
        # If remove_until is not set there are no backups left
        if remove_until:
            # Retrieve the list of extra timelines that contain at least
            # a backup. On such timelines we don't want to delete any WAL
            for value in self.get_available_backups(
                    BackupInfo.STATUS_ARCHIVING).values():
                # Ignore the backup that is being deleted
                if value == backup:
                    continue
                timelines_to_protect.add(value.timeline)
            # Remove the timeline of `remove_until` from the list.
            # We have enough information to safely delete unused WAL files
            # on it.
            timelines_to_protect -= set([remove_until.timeline])

        output.info("Delete associated WAL segments:")
        for name in self.remove_wal_before_backup(remove_until,
                                                  timelines_to_protect):
            output.info("\t%s", name)
    # As last action, remove the backup directory,
    # ending the delete operation
    try:
        self.delete_basebackup(backup)
    except OSError as e:
        output.error("Failure deleting backup %s for server %s.\n%s\n"
                     "Please manually remove the '%s' directory",
                     backup.backup_id, self.config.name, e,
                     backup.get_basebackup_directory())
        return False
    self.backup_cache_remove(backup)
    # Save the time of the complete removal of the backup
    delete_end_time = datetime.datetime.now()
    output.info("Deleted backup %s (start time: %s, elapsed time: %s)",
                backup.backup_id,
                delete_start_time.ctime(),
                human_readable_timedelta(delete_end_time -
                                         delete_start_time))

    # Remove the sync lockfile if it exists
    sync_lock = ServerBackupSyncLock(self.config.barman_lock_directory,
                                     self.config.name, backup.backup_id)
    if os.path.exists(sync_lock.filename):
        _logger.debug("Deleting backup sync lockfile: %s" %
                      sync_lock.filename)
        os.unlink(sync_lock.filename)

    # Run the post_delete_retry_script if present.
    try:
        retry_script = RetryHookScriptRunner(self, 'delete_retry_script',
                                             'post')
        retry_script.env_from_backup_info(backup)
        retry_script.run()
    except AbortedRetryHookScript as e:
        # Ignore the ABORT_STOP as it is a post-hook operation
        _logger.warning(
            "Ignoring stop request after receiving "
            "abort (exit code %d) from post-delete "
            "retry hook script: %s", e.hook.exit_status, e.hook.script)

    # Run the post_delete_script if present.
    script = HookScriptRunner(self, 'delete_script', 'post')
    script.env_from_backup_info(backup)
    script.run()

    return True
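# Reduced sketch of the WAL-protection rule applied above, with plain
# timeline numbers instead of BackupInfo objects: WAL files are kept on any
# timeline that still hosts a backup, except the timeline of the boundary
# backup itself, which can be cleaned up safely.
def protected_timelines(remaining_backup_timelines, boundary_timeline):
    return set(remaining_backup_timelines) - {boundary_timeline}

# Backups remain on timelines 1, 2 and 3; the boundary backup sits on
# timeline 2, so only timelines 1 and 3 are fully protected.
assert protected_timelines([1, 2, 2, 3], 2) == {1, 3}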
def get_server_list(args=None, skip_inactive=False, skip_disabled=False,
                    on_error_stop=True, suppress_error=False):
    """
    Get the server list from the configuration

    If the args parameter is None or args.server_name is ['all']
    returns all defined servers

    :param args: an argparse namespace containing a list server_name
        parameter
    :param bool skip_inactive: skip inactive servers when 'all' is required
    :param bool skip_disabled: skip disabled servers when 'all' is required
    :param bool on_error_stop: stop if an error is found
    :param bool suppress_error: suppress display of errors (e.g. diagnose)
    :rtype: dict(str,barman.server.Server|None)
    """
    server_dict = {}

    # This function must be called within a multiple-server context
    assert not args or isinstance(args.server_name, list)

    # Generate the list of servers (required for global errors)
    available_servers = barman.__config__.server_names()

    # Get a list of configuration errors from all the servers
    global_error_list = barman.__config__.servers_msg_list

    # Global errors have higher priority
    if global_error_list:
        # Output the list of global errors
        if not suppress_error:
            for error in global_error_list:
                output.error(error)

        # If requested, exit on first error
        if on_error_stop:
            output.close_and_exit()
            # The following return statement will never be reached
            # but it is here for clarity
            return {}

    # Handle special 'all' server cases
    # - args is None
    # - 'all' special name
    if not args or 'all' in args.server_name:
        # When 'all' is used, it must be the only specified argument
        if args and len(args.server_name) != 1:
            output.error("You cannot use 'all' with other server names")
        servers = available_servers
    else:
        servers = args.server_name

    # Loop through all the requested servers
    for server in servers:
        conf = barman.__config__.get_server(server)
        if conf is None:
            # Unknown server
            server_dict[server] = None
        else:
            server_object = Server(conf)
            # Skip inactive servers, if requested
            if skip_inactive and not server_object.config.active:
                output.info("Skipping inactive server '%s'" % conf.name)
                continue
            # Skip disabled servers, if requested
            if skip_disabled and server_object.config.disabled:
                output.info("Skipping temporarily disabled server '%s'" %
                            conf.name)
                continue
            server_dict[server] = server_object

    return server_dict
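# Hypothetical usage sketch, assuming Barman's configuration has already
# been loaded into barman.__config__: request every configured server while
# skipping inactive ones. Entries mapped to None denote names that are not
# configured.
import argparse

args = argparse.Namespace(server_name=['all'])
for name, server in sorted(get_server_list(args,
                                           skip_inactive=True).items()):
    if server is None:
        output.error("Unknown server '%s'", name)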
def set_pitr_targets(self, recovery_info, backup_info, dest, target_name,
                     target_time, target_tli, target_xid):
    """
    Set PITR targets - as specified by the user

    :param dict recovery_info: Dictionary containing all the recovery
        parameters
    :param barman.infofile.BackupInfo backup_info: representation of a
        backup
    :param str dest: destination directory of the recovery
    :param str|None target_name: recovery target name for PITR
    :param str|None target_time: recovery target time for PITR
    :param str|None target_tli: recovery target timeline for PITR
    :param str|None target_xid: recovery target transaction id for PITR
    """
    target_epoch = None
    target_datetime = None
    if (target_time
            or target_xid
            or (target_tli and target_tli != backup_info.timeline)
            or target_name
            or recovery_info['get_wal']):
        recovery_info['is_pitr'] = True
        targets = {}
        if target_time:
            # noinspection PyBroadException
            try:
                target_datetime = dateutil.parser.parse(target_time)
            except ValueError as e:
                output.exception(
                    "unable to parse the target time parameter %r: %s",
                    target_time, e)
                output.close_and_exit()
            except Exception:
                # this should not happen, but there is a known bug in
                # dateutil.parser.parse() implementation
                # ref: https://bugs.launchpad.net/dateutil/+bug/1247643
                output.exception(
                    "unable to parse the target time parameter %r",
                    target_time)
                output.close_and_exit()
            target_epoch = (
                time.mktime(target_datetime.timetuple()) +
                (target_datetime.microsecond / 1000000.))
            targets['time'] = str(target_datetime)
        if target_xid:
            targets['xid'] = str(target_xid)
        if target_tli and target_tli != backup_info.timeline:
            targets['timeline'] = str(target_tli)
        if target_name:
            targets['name'] = str(target_name)
        output.info(
            "Doing PITR. Recovery target %s",
            (", ".join(["%s: %r" % (k, v) for k, v in targets.items()])))
        recovery_info['wal_dest'] = os.path.join(dest, 'barman_xlog')

        # With a PostgreSQL version older than 8.4, it is the user's
        # responsibility to delete the "barman_xlog" directory as the
        # restore_command option in recovery.conf is not supported
        if backup_info.version < 80400 and \
                not recovery_info['get_wal']:
            recovery_info['results']['delete_barman_xlog'] = True
    recovery_info['target_epoch'] = target_epoch
    recovery_info['target_datetime'] = target_datetime
def recover(self, backup_info, dest, tablespaces, target_tli, target_time,
            target_xid, target_name, exclusive, remote_command):
    """
    Performs a recovery of a backup

    :param barman.infofile.BackupInfo backup_info: the backup to recover
    :param str dest: the destination directory
    :param dict[str,str]|None tablespaces: a tablespace
        name -> location map (for relocation)
    :param str|None target_tli: the target timeline
    :param str|None target_time: the target time
    :param str|None target_xid: the target xid
    :param str|None target_name: the target name created previously with
        pg_create_restore_point() function call
    :param bool exclusive: whether the recovery is exclusive or not
    :param str|None remote_command: The remote command to recover
        the base backup, in case of remote backup.
    """
    # Run the cron to be sure the wal catalog is up to date
    # Prepare a map that contains all the objects required for a recovery
    recovery_info = self.setup(backup_info, remote_command, dest)
    output.info("Starting %s restore for server %s using backup %s",
                recovery_info['recovery_dest'],
                self.server.config.name,
                backup_info.backup_id)
    output.info("Destination directory: %s", dest)

    # Set targets for PITR
    self.set_pitr_targets(recovery_info, backup_info, dest, target_name,
                          target_time, target_tli, target_xid)

    # Retrieve the safe_horizon for smart copy
    self.retrieve_safe_horizon(recovery_info, backup_info, dest)

    # Check the destination directory. If it doesn't exist, create it
    try:
        recovery_info['cmd'].create_dir_if_not_exists(dest)
    except FsOperationFailed as e:
        output.exception("unable to initialise destination directory "
                         "'%s': %s", dest, e)
        output.close_and_exit()

    # Initialize tablespace directories
    if backup_info.tablespaces:
        self.prepare_tablespaces(backup_info,
                                 recovery_info['cmd'],
                                 dest,
                                 tablespaces)
    # Copy the base backup
    output.info("Copying the base backup.")
    try:
        # perform the backup copy, honoring the retry option if set
        self.backup_manager.retry_backup_copy(
            self.basebackup_copy,
            backup_info, dest,
            tablespaces, remote_command,
            recovery_info['safe_horizon'])
    except DataTransferFailure as e:
        output.exception("Failure copying base backup: %s", e)
        output.close_and_exit()

    # Copy the backup.info file into the destination as
    # ".barman-recover.info"
    if remote_command:
        try:
            recovery_info['rsync'](backup_info.filename,
                                   ':%s/.barman-recover.info' % dest)
        except CommandFailedException as e:
            output.exception('copy of recovery metadata file failed: %s',
                             e)
            output.close_and_exit()
    else:
        backup_info.save(os.path.join(dest, '.barman-recover.info'))

    # Restore the WAL segments. If the GET_WAL option is set, skip this
    # phase as they will be retrieved using the get-wal command.
    if not recovery_info['get_wal']:
        output.info("Copying required WAL segments.")
        try:
            # Retrieve a list of required log files
            required_xlog_files = tuple(
                self.server.get_required_xlog_files(
                    backup_info, target_tli,
                    recovery_info['target_epoch']))

            # Restore WAL segments into the wal_dest directory
            self.xlog_copy(required_xlog_files,
                           recovery_info['wal_dest'],
                           remote_command)
        except DataTransferFailure as e:
            output.exception("Failure copying WAL files: %s", e)
            output.close_and_exit()
        except xlog.BadXlogSegmentName as e:
            output.error(
                "invalid xlog segment name %r\n"
                "HINT: Please run \"barman rebuild-xlogdb %s\" "
                "to solve this issue",
                str(e), self.config.name)
            output.close_and_exit()
        # If WAL files are put directly in the pg_xlog directory,
        # avoid shipping of just recovered files
        # by creating the corresponding archive status file
        if not recovery_info['is_pitr']:
            output.info("Generating archive status files")
            self.generate_archive_status(recovery_info,
                                         remote_command,
                                         required_xlog_files)

    # Generate recovery.conf file (only if needed by PITR)
    if recovery_info['is_pitr']:
        output.info("Generating recovery.conf")
        self.generate_recovery_conf(recovery_info, backup_info, dest,
                                    exclusive, remote_command, target_name,
                                    target_time, target_tli, target_xid)

    # Create archive_status directory if necessary
    archive_status_dir = os.path.join(dest, 'pg_xlog', 'archive_status')
    try:
        recovery_info['cmd'].create_dir_if_not_exists(archive_status_dir)
    except FsOperationFailed as e:
        output.exception("unable to create the archive_status directory "
                         "'%s': %s", archive_status_dir, e)
        output.close_and_exit()

    # As last step, analyse configuration files in order to spot
    # harmful options. Barman performs automatic conversion of
    # some options as well as notifying users of their existence.
    #
    # This operation is performed in three steps:
    # 1) mapping
    # 2) analysis
    # 3) copy
    output.info("Identify dangerous settings in destination directory.")
    self.map_temporary_config_files(recovery_info,
                                    backup_info,
                                    remote_command)
    self.analyse_temporary_config_files(recovery_info)
    self.copy_temporary_config_files(dest,
                                     remote_command,
                                     recovery_info)

    # Cleanup operations
    self.teardown(recovery_info)

    return recovery_info
def receive_wal(self, reset=False):
    """
    Creates a PgReceiveXlog object and issues the pg_receivexlog command
    for a specific server

    :param bool reset: When set, reset the status of receive-wal
    :raise ArchiverFailure: when something goes wrong
    """
    # Ensure the presence of the destination directory
    mkpath(self.config.streaming_wals_directory)

    # Check if this is a reset request
    if reset:
        self._reset_streaming_status()
        return

    # Execute basic sanity checks on PostgreSQL connection
    streaming_status = self.server.streaming.get_remote_status()
    if streaming_status["streaming_supported"] is None:
        raise ArchiverFailure(
            'failed opening the PostgreSQL streaming connection '
            'for server %s' % (self.config.name))
    elif not streaming_status["streaming_supported"]:
        raise ArchiverFailure(
            'PostgreSQL version too old (%s < 9.2)' %
            self.server.streaming.server_txt_version)
    # Execute basic sanity checks on pg_receivexlog
    remote_status = self.get_remote_status()
    if not remote_status["pg_receivexlog_installed"]:
        raise ArchiverFailure('pg_receivexlog not present in $PATH')
    if not remote_status['pg_receivexlog_compatible']:
        raise ArchiverFailure('pg_receivexlog version not compatible with '
                              'PostgreSQL server version')

    # Execute sanity check on replication slot usage
    if self.config.slot_name:
        # Check if slots are supported
        if not remote_status['pg_receivexlog_supports_slots']:
            raise ArchiverFailure(
                'Physical replication slot not supported by %s '
                '(9.4 or higher is required)' %
                self.server.streaming.server_txt_version)
        # Check if the required slot exists
        postgres_status = self.server.postgres.get_remote_status()
        if postgres_status['replication_slot'] is None:
            raise ArchiverFailure(
                "replication slot '%s' doesn't exist. "
                "Please execute "
                "'barman receive-wal --create-slot %s'" %
                (self.config.slot_name, self.config.name))
        # Check if the required slot is available
        if postgres_status['replication_slot'].active:
            raise ArchiverFailure(
                "replication slot '%s' is already in use" %
                (self.config.slot_name,))

    # Make sure we are not wasting precious PostgreSQL resources
    self.server.close()

    _logger.info('Activating WAL archiving through streaming protocol')
    try:
        output_handler = PgReceiveXlog.make_output_handler(
            self.config.name + ': ')
        receive = PgReceiveXlog(
            connection=self.server.streaming,
            destination=self.config.streaming_wals_directory,
            command=remote_status['pg_receivexlog_path'],
            version=remote_status['pg_receivexlog_version'],
            app_name=self.config.streaming_archiver_name,
            path=self.server.path,
            slot_name=self.config.slot_name,
            synchronous=remote_status['pg_receivexlog_synchronous'],
            out_handler=output_handler,
            err_handler=output_handler)
        # Finally execute the pg_receivexlog process
        receive.execute()
    except CommandFailedException as e:
        # Retrieve the return code from the exception
        ret_code = e.args[0]['ret']
        if ret_code < 0:
            # If the return code is negative, then pg_receivexlog
            # was terminated by a signal
            msg = ("pg_receivexlog terminated by signal: %s" %
                   abs(ret_code))
        else:
            # Otherwise it terminated with an error code
            msg = ("pg_receivexlog terminated with error code: %s" %
                   ret_code)
        raise ArchiverFailure(msg)
    except KeyboardInterrupt:
        # This is a normal termination, so there is nothing to do besides
        # informing the user.
        output.info('SIGINT received. Terminate gracefully.')
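# Minimal sketch of the exit-status interpretation above: a negative
# return code from the subprocess layer means pg_receivexlog was killed by
# a signal whose number is the absolute value; a non-negative one is the
# program's own error code (describe_exit is an illustrative helper, not
# part of Barman).
def describe_exit(ret_code):
    if ret_code < 0:
        return "pg_receivexlog terminated by signal: %s" % abs(ret_code)
    return "pg_receivexlog terminated with error code: %s" % ret_code

assert describe_exit(-15) == "pg_receivexlog terminated by signal: 15"
assert describe_exit(1) == "pg_receivexlog terminated with error code: 1"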
def archive(self, fxlogdb, verbose=True):
    """
    Archive WAL files, discarding duplicates or those that are not valid.

    :param file fxlogdb: File object for xlogdb interactions
    :param boolean verbose: Flag for verbose output
    """
    compressor = self.backup_manager.compression_manager.get_compressor()
    stamp = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
    processed = 0
    header = "Processing xlog segments from %s for %s" % (
        self.name, self.config.name)

    # Get the next batch of WAL files to be processed
    batch = self.get_next_batch()

    # Analyse the batch and properly log the information
    if batch.size:
        if batch.size > batch.run_size:
            # Batch mode enabled
            _logger.info("Found %s xlog segments from %s for %s."
                         " Archive a batch of %s segments in this run.",
                         batch.size,
                         self.name,
                         self.config.name,
                         batch.run_size)
            header += " (batch size: %s)" % batch.run_size
        else:
            # Single run mode (traditional)
            _logger.info("Found %s xlog segments from %s for %s."
                         " Archive all segments in one run.",
                         batch.size,
                         self.name,
                         self.config.name)
    else:
        _logger.info("No xlog segments found from %s for %s.",
                     self.name,
                     self.config.name)

    # Print the header (verbose mode)
    if verbose:
        output.info(header, log=False)

    # Loop through all available WAL files
    for wal_info in batch:
        # Print the header (non verbose mode)
        if not processed and not verbose:
            output.info(header, log=False)

        # Exit when the archive batch size is reached
        if processed >= batch.run_size:
            _logger.debug("Batch size reached (%s) - "
                          "Exit %s process for %s",
                          batch.batch_size,
                          self.name,
                          self.config.name)
            break

        processed += 1

        # Report to the user the WAL file we are archiving
        output.info("\t%s", wal_info.name, log=False)
        _logger.info("Archiving segment %s of %s from %s: %s/%s",
                     processed, batch.run_size, self.name,
                     self.config.name, wal_info.name)
        # Archive the WAL file
        try:
            self.archive_wal(compressor, wal_info)
        except MatchingDuplicateWalFile:
            # We already have this file. Simply unlink the file.
            os.unlink(wal_info.orig_filename)
            continue
        except DuplicateWalFile:
            output.info("\tError: %s is already present in server %s. "
                        "File moved to errors directory.",
                        wal_info.name,
                        self.config.name)
            error_dst = os.path.join(
                self.config.errors_directory,
                "%s.%s.duplicate" % (wal_info.name, stamp))
            # TODO: cover corner case of duplication (unlikely,
            # but theoretically possible)
            shutil.move(wal_info.orig_filename, error_dst)
            continue
        except AbortedRetryHookScript as e:
            _logger.warning("Archiving of %s/%s aborted by "
                            "pre_archive_retry_script. "
                            "Reason: %s" % (self.config.name,
                                            wal_info.name,
                                            e))
            return
        # Update the information of the WAL archive with
        # the latest segment
        fxlogdb.write(wal_info.to_xlogdb_line())
        # flush and fsync for every line
        fxlogdb.flush()
        os.fsync(fxlogdb.fileno())

    if processed:
        _logger.debug("Archived %s out of %s xlog segments from %s for %s",
                      processed,
                      batch.size,
                      self.name,
                      self.config.name)
    elif verbose:
        output.info("\tno file found", log=False)

    if batch.errors:
        output.info("Some unknown objects have been found while "
                    "processing xlog segments for %s. "
                    "Objects moved to errors directory:",
                    self.config.name,
                    log=False)
        # Log unexpected files
        _logger.warning("Archiver is about to move %s unexpected file(s) "
                        "to errors directory for %s from %s",
                        len(batch.errors),
                        self.config.name,
                        self.name)
        for error in batch.errors:
            basename = os.path.basename(error)
            output.info("\t%s", basename, log=False)
            # Print an informative log line.
            _logger.warning("Moving unexpected file for %s from %s: %s",
                            self.config.name, self.name, basename)
            error_dst = os.path.join(
                self.config.errors_directory,
                "%s.%s.unknown" % (basename, stamp))
            try:
                shutil.move(error, error_dst)
            except IOError as e:
                if e.errno == errno.ENOENT:
                    _logger.warning('%s not found' % error)
def recover(self, backup_info, dest, tablespaces, target_tli, target_time,
            target_xid, target_name, exclusive, remote_command):
    """
    Performs a recovery of a backup

    :param barman.infofile.BackupInfo backup_info: the backup to recover
    :param str dest: the destination directory
    :param dict[str,str]|None tablespaces: a tablespace
        name -> location map (for relocation)
    :param str|None target_tli: the target timeline
    :param str|None target_time: the target time
    :param str|None target_xid: the target xid
    :param str|None target_name: the target name created previously with
        pg_create_restore_point() function call
    :param bool exclusive: whether the recovery is exclusive or not
    :param str|None remote_command: The remote command to recover
        the base backup, in case of remote backup.
    """
    # Run the cron to be sure the wal catalog is up to date
    # Prepare a map that contains all the objects required for a recovery
    recovery_info = self._setup(backup_info, remote_command, dest)
    output.info("Starting %s restore for server %s using backup %s",
                recovery_info['recovery_dest'],
                self.server.config.name,
                backup_info.backup_id)
    output.info("Destination directory: %s", dest)

    # Set targets for PITR
    self._set_pitr_targets(recovery_info, backup_info, dest, target_name,
                           target_time, target_tli, target_xid)

    # Retrieve the safe_horizon for smart copy
    self._retrieve_safe_horizon(recovery_info, backup_info, dest)

    # Check the destination directory. If it doesn't exist, create it
    try:
        recovery_info['cmd'].create_dir_if_not_exists(dest)
    except FsOperationFailed as e:
        output.error("unable to initialise destination directory "
                     "'%s': %s", dest, e)
        output.close_and_exit()

    # Initialize tablespace directories
    if backup_info.tablespaces:
        self._prepare_tablespaces(backup_info,
                                  recovery_info['cmd'],
                                  dest,
                                  tablespaces)
    # Copy the base backup
    output.info("Copying the base backup.")
    try:
        self._backup_copy(backup_info, dest, tablespaces, remote_command,
                          recovery_info['safe_horizon'])
    except DataTransferFailure as e:
        output.error("Failure copying base backup: %s", e)
        output.close_and_exit()

    # Copy the backup.info file into the destination as
    # ".barman-recover.info"
    if remote_command:
        try:
            recovery_info['rsync'](backup_info.filename,
                                   ':%s/.barman-recover.info' % dest)
        except CommandFailedException as e:
            output.error('copy of recovery metadata file failed: %s', e)
            output.close_and_exit()
    else:
        backup_info.save(os.path.join(dest, '.barman-recover.info'))

    # Restore the WAL segments. If the GET_WAL option is set, skip this
    # phase as they will be retrieved using the get-wal command.
    if not recovery_info['get_wal']:
        output.info("Copying required WAL segments.")
        try:
            # Retrieve a list of required log files
            required_xlog_files = tuple(
                self.server.get_required_xlog_files(
                    backup_info, target_tli,
                    recovery_info['target_epoch']))

            # Restore WAL segments into the wal_dest directory
            self._xlog_copy(required_xlog_files,
                            recovery_info['wal_dest'],
                            remote_command)
        except DataTransferFailure as e:
            output.error("Failure copying WAL files: %s", e)
            output.close_and_exit()
        except BadXlogSegmentName as e:
            output.error(
                "invalid xlog segment name %r\n"
                "HINT: Please run \"barman rebuild-xlogdb %s\" "
                "to solve this issue",
                str(e), self.config.name)
            output.close_and_exit()
        # If WAL files are put directly in the pg_xlog directory,
        # avoid shipping of just recovered files
        # by creating the corresponding archive status file
        if not recovery_info['is_pitr']:
            output.info("Generating archive status files")
            self._generate_archive_status(recovery_info,
                                          remote_command,
                                          required_xlog_files)

    # Generate recovery.conf file (only if needed by PITR)
    if recovery_info['is_pitr']:
        output.info("Generating recovery.conf")
        self._generate_recovery_conf(recovery_info, backup_info, dest,
                                     exclusive, remote_command,
                                     target_name, target_time, target_tli,
                                     target_xid)

    # Create archive_status directory if necessary
    archive_status_dir = os.path.join(recovery_info['wal_dest'],
                                      'archive_status')
    try:
        recovery_info['cmd'].create_dir_if_not_exists(archive_status_dir)
    except FsOperationFailed as e:
        output.error("unable to create the archive_status directory "
                     "'%s': %s", archive_status_dir, e)
        output.close_and_exit()

    # As last step, analyse configuration files in order to spot
    # harmful options. Barman performs automatic conversion of
    # some options as well as notifying users of their existence.
    #
    # This operation is performed in three steps:
    # 1) mapping
    # 2) analysis
    # 3) copy
    output.info("Identify dangerous settings in destination directory.")
    self._map_temporary_config_files(recovery_info,
                                     backup_info,
                                     remote_command)
    self._analyse_temporary_config_files(recovery_info)
    self._copy_temporary_config_files(dest,
                                      remote_command,
                                      recovery_info)

    # Cleanup operations
    self._teardown(recovery_info)

    return recovery_info
def handler(line):
    if line:
        if prefix:
            output.info("%s%s", prefix, line)
        else:
            output.info("%s", line)
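# Hypothetical usage sketch: handler has the one-line-per-call shape
# expected by the out_handler/err_handler hooks of the command wrappers
# (see PgReceiveXlog above); prefix is captured from the enclosing scope.
prefix = 'main: '
handler('pg_receivexlog: starting log streaming')  # emits "main: pg_receivexlog: ..."
handler('')  # empty lines are silently dropped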
def rebuild_xlogdb(self):
    """
    Rebuild the whole xlog database guessing it from the archive content.
    """
    from os.path import isdir, join

    output.info("Rebuilding xlogdb for server %s", self.config.name)
    root = self.config.wals_directory
    comp_manager = self.compression_manager
    wal_count = label_count = history_count = 0
    # lock the xlogdb as we are about to replace it completely
    with self.server.xlogdb('w') as fxlogdb:
        xlogdb_new = fxlogdb.name + ".new"
        with open(xlogdb_new, 'w') as fxlogdb_new:
            for name in sorted(os.listdir(root)):
                # ignore the xlogdb and its lockfile
                if name.startswith(self.server.XLOG_DB):
                    continue
                fullname = join(root, name)
                if isdir(fullname):
                    # all relevant files are in subdirectories
                    hash_dir = fullname
                    for wal_name in sorted(os.listdir(hash_dir)):
                        fullname = join(hash_dir, wal_name)
                        if isdir(fullname):
                            _logger.warning(
                                'unexpected directory '
                                'rebuilding the wal database: %s',
                                fullname)
                        else:
                            if xlog.is_wal_file(fullname):
                                wal_count += 1
                            elif xlog.is_backup_file(fullname):
                                label_count += 1
                            elif fullname.endswith('.tmp'):
                                _logger.warning(
                                    'temporary file found '
                                    'rebuilding the wal database: %s',
                                    fullname)
                                continue
                            else:
                                _logger.warning(
                                    'unexpected file '
                                    'rebuilding the wal database: %s',
                                    fullname)
                                continue
                            wal_info = comp_manager.get_wal_file_info(
                                fullname)
                            fxlogdb_new.write(wal_info.to_xlogdb_line())
                else:
                    # only history files are here
                    if xlog.is_history_file(fullname):
                        history_count += 1
                        wal_info = comp_manager.get_wal_file_info(fullname)
                        fxlogdb_new.write(wal_info.to_xlogdb_line())
                    else:
                        _logger.warning(
                            'unexpected file '
                            'rebuilding the wal database: %s',
                            fullname)
            os.fsync(fxlogdb_new.fileno())
        shutil.move(xlogdb_new, fxlogdb.name)
        fsync_dir(os.path.dirname(fxlogdb.name))
    output.info('Done rebuilding xlogdb for server %s '
                '(history: %s, backup_labels: %s, wal_file: %s)',
                self.config.name, history_count, label_count, wal_count)