Beispiel #1
0
def exec_diagnose(servers, errors_list):
    """
    Diagnostic command: gathers information from backup server
    and from all the configured servers.

    The gathered information is emitted through ``output.info`` as a JSON
    document and should be used for support and problems detection.

    :param dict(str,frabit.server.Server) servers: configured servers,
        keyed by name; a ``None`` value marks an unknown server, which is
        reported as an error and skipped
    :param list errors_list: list of global errors
    """
    # global section. info about frabit server
    diagnosis = {'global': {}, 'servers': {}}
    global_section = diagnosis['global']
    # frabit global config (a copy, so that adding 'errors_list' below
    # does not touch the loaded configuration)
    global_section['config'] = dict(frabit.__config__._global_config)
    global_section['config']['errors_list'] = errors_list
    try:
        command = filesystem.UnixLocalCommand()
        # basic system info
        global_section['system_info'] = command.get_system_info()
    except CommandFailedException as e:
        # Keep going: the failure itself is part of the diagnosis
        global_section['system_info'] = {'error': repr(e)}
    global_section['system_info']['frabit_ver'] = frabit.__version__
    global_section['system_info']['timestamp'] = datetime.datetime.now()
    # per server section
    for name in sorted(servers):
        server = servers[name]
        if server is None:
            output.error("Unknown server '{}'".format(name))
            continue
        server_section = {}
        diagnosis['servers'][name] = server_section
        try:
            # server configuration.
            # vars() returns the live attribute dictionary of the config
            # object: copy it before dropping the 'config' entry, otherwise
            # the attribute would be removed from server.config itself.
            server_config = dict(vars(server.config))
            server_config.pop('config', None)
            server_section['config'] = server_config
            # server system info
            if server.config.ssh_command:
                try:
                    command = filesystem.UnixRemoteCommand(
                        ssh_command=server.config.ssh_command,
                        path=server.path)
                    server_section['system_info'] = command.get_system_info()
                except FsOperationFailed:
                    # Best effort: the remote system info is simply left
                    # out of the report when it cannot be collected
                    pass
            # frabit status information for the server
            server_section['status'] = server.get_remote_status()
            # backup list
            backups = server.get_available_backups(BackupInfo.STATUS_ALL)
            server_section['backups'] = backups
            # wal status
            server_section['wals'] = {
                'last_archived_wal_per_timeline':
                    server.backup_manager.get_latest_archived_wals_info(),
            }
        finally:
            # Release any PostgreSQL resource, even if gathering failed
            server.close()
    output.info(json.dumps(diagnosis, cls=FrabitEncoder, indent=4,
                           sort_keys=True))
Beispiel #2
0
def sync_wals(args):
    """
    Command that synchronises WAL files from a master to a passive node

    :param args: command line arguments namespace, used to look up
        the server
    """
    target = get_server(args)
    try:
        target.sync_wals()
    except SyncError as sync_failure:
        # Only the error message is reported for a SyncError, so that
        # no stack trace ends up in the log
        output.error(sync_failure)
    output.close_and_exit()
Beispiel #3
0
def delete(args):
    """
    Delete a backup

    :param args: command line arguments namespace, used to look up
        both the server and the backup
    """
    server = get_server(args)

    # Look up the backup requested on the command line
    backup = parse_backup_id(server, args)
    with closing(server):
        deleted = server.delete_backup(backup)
        if not deleted:
            output.error("Cannot delete backup (%s %s)" %
                         (server.config.name, backup))
    output.close_and_exit()
Beispiel #4
0
    def kill(self, process_info, retries=10):
        """
        Kill a process

        Sends SIGINT to the process, then polls once per second (up to
        ``retries`` times) to verify that it has gone away.

        Returns True if killed successfully False otherwise

        :param ProcessInfo process_info: representation of the process
            we want to kill
        :param int retries: number of times the method will check
            if the process is still alive
        :rtype: bool
        """
        pid = process_info.pid
        # Try to kill the process
        try:
            _logger.debug("Sending SIGINT to PID {}".format(pid))
            os.kill(pid, signal.SIGINT)
            _logger.debug("os.kill call succeeded")
        except OSError as e:
            _logger.debug("os.kill call failed: {}".format(e))
            # The process doesn't exists. It has probably just terminated.
            if e.errno == errno.ESRCH:
                return True
            # Something unexpected has happened
            output.error("{}".format(e))
            return False
        # Check if the process have been killed. the fastest (and maybe safest)
        # way is to send a kill with 0 as signal (the "null signal": nothing
        # is delivered, only the existence and permission checks are done).
        # If the call raises an OSError with ESRCH, the process have been
        # killed successfully, otherwise is still alive.
        # The literal 0 is used instead of signal.SIG_DFL: the latter is a
        # handler disposition, not a signal number, and only happens to
        # have the value 0.
        for _ in range(retries):
            try:
                _logger.debug("Checking with SIG_DFL if PID {} is still alive".format(pid))
                os.kill(pid, 0)
                _logger.debug("os.kill call succeeded")
            except OSError as e:
                _logger.debug("os.kill call failed: {}".format(e))
                # If the process doesn't exists, we are done.
                if e.errno == errno.ESRCH:
                    return True
                # Something unexpected has happened
                output.error("{}".format(e))
                return False
            # Still alive: wait a little before checking again
            time.sleep(1)
        _logger.debug("The PID {pid} has not been terminated after {retries} retries".format(pid=pid,
                                                                                             retries=retries))
        return False
Beispiel #5
0
def list_files(args):
    """
    List all the files for a single backup

    :param args: command line arguments namespace; ``args.target``
        is passed to the backup when asking for its list of files
    """
    server = get_server(args)

    # Look up the backup requested on the command line
    backup = parse_backup_id(server, args)
    try:
        for file_line in backup.get_list_of_files(args.target):
            # Print each entry without writing it to the log
            output.info(file_line, log=False)
    except BinlogHasPurged as purged:
        server_name = server.config.name
        output.error(
            "invalid xlog segment name %r\n"
            "HINT: Please run \"barman rebuild-xlogdb %s\" "
            "to solve this issue", force_str(purged), server_name)
        output.close_and_exit()
Beispiel #6
0
def sync_info(args):
    """
    Output the internal synchronisation status.
    Used to sync_backup with a passive node

    :param args: command line arguments namespace; reads ``last_wal``,
        ``last_position`` and the optional ``primary`` flag
    """
    server = get_server(args)
    # A missing 'primary' attribute counts as "not requested"
    primary_requested = getattr(args, 'primary', False)
    try:
        if not primary_requested:
            server.sync_status(args.last_wal, args.last_position)
        else:
            # --primary: print the primary node information as JSON,
            # without writing it to the log
            info = server.primary_node_info(args.last_wal,
                                            args.last_position)
            rendered = json.dumps(info, cls=FrabitEncoder, indent=4)
            output.info(rendered, log=False)
    except SyncError as sync_failure:
        # Only the error message is reported for a SyncError, so that
        # no stack trace ends up in the log
        output.error(sync_failure)

    output.close_and_exit()
Beispiel #7
0
def parse_backup_id(server, args):
    """
    Resolve the backup requested on the command line, accepting the
    special words 'latest'/'last' and 'oldest'/'first' as well as a
    plain backup ID.

    Reports an error and calls output.close_and_exit() if the backup
    doesn't exist.

    :param Server server: server object to search for the required backup
    :param args: command line arguments namespace
    :return: whatever server.get_backup() returns for the resolved ID
    """
    requested = args.backup_id
    if requested in ('latest', 'last'):
        resolved_id = server.get_last_backup_id()
    elif requested in ('oldest', 'first'):
        resolved_id = server.get_first_backup_id()
    else:
        # Anything else is taken as a literal backup ID
        resolved_id = requested

    backup_info = server.get_backup(resolved_id)
    if backup_info is not None:
        return backup_info

    # Unknown backup: report it using the name given by the user
    output.error("Unknown backup '%s' for server '%s'", args.backup_id,
                 server.config.name)
    output.close_and_exit()
    return backup_info
Beispiel #8
0
def receive_wal(args):
    """
    Start a receive-wal process.
    The process uses the streaming protocol to receive WAL files
    from the PostgreSQL server.

    Depending on the options, the command instead stops a running
    receive-wal process, or creates/drops the replication slot.

    :param args: command line arguments namespace; reads ``stop``,
        ``reset``, ``create_slot`` and ``drop_slot``
    """
    server = get_server(args)
    if args.stop and args.reset:
        output.error("--stop and --reset options are not compatible")
    elif args.stop:
        # The caller requested the shutdown of the receive-wal process:
        # deliver the termination signal
        server.kill('receive-wal')
    else:
        # Every remaining action works on the server, which is closed
        # when the action is over
        with closing(server):
            if args.create_slot:
                server.create_physical_repslot()
            elif args.drop_slot:
                server.drop_repslot()
            else:
                # No special action requested: attempt to start
                server.receive_wal(reset=args.reset)
    output.close_and_exit()
Beispiel #9
0
def manage_server_command(server,
                          name=None,
                          inactive_is_error=False,
                          disabled_is_error=True,
                          skip_inactive=True,
                          skip_disabled=True):
    """
    Common entry point used to decide whether a command has to run
    for a given server, emitting the related errors in a uniform way.

    With the default arguments both inactive and disabled servers are
    skipped, and the messages of a disabled server are reported as errors.

    :param barman.server.Server server: server to be checked for errors
    :param str name: name of the server, in a multi-server command
    :param bool inactive_is_error: treat inactive server as error
    :param bool disabled_is_error: treat disabled server as error
    :param bool skip_inactive: skip if inactive
    :param bool skip_disabled: skip if disabled
    :return: True if the command has to be executed on this server
    :rtype: boolean
    """
    # A falsy server means the requested name is not configured
    if not server:
        output.error("Unknown server '%s'" % name)
        return False

    # Inactive server: optionally report it, optionally skip it
    inactive = not server.config.active
    if inactive and inactive_is_error:
        output.error('Inactive server: %s' % server.config.name)
    if inactive and skip_inactive:
        return False

    # Disabled server: optionally report every message, optionally skip it
    disabled = server.config.disabled
    if disabled and disabled_is_error:
        for message in server.config.msg_list:
            output.error(message)
    if disabled and skip_disabled:
        return False

    # Nothing prevents the execution of the command
    return True
Beispiel #10
0
def recover(args):
    """
    Recover a server at a given time, name, LSN or xid

    Validates the command line options (backup status, tablespace
    relocation rules, destination directory, recovery targets, network
    compression), applies the overrides to the server configuration and
    then runs the recovery. Any validation failure is reported through
    ``output.error`` followed by ``output.close_and_exit()``.

    :param args: command line arguments namespace
    """
    server = get_server(args)

    # Retrieves the backup
    backup_info = parse_backup_id(server, args)
    if backup_info.status not in BackupInfo.STATUS_COPY_DONE:
        output.error("Cannot recover from backup '{id}' of server '{name}': "
                     "backup status is not DONE".format(
                         id=args.backup_id, name=server.config.name))
        output.close_and_exit()

    # decode the tablespace relocation rules
    tablespaces = {}
    if args.tablespace:
        for rule in args.tablespace:
            try:
                # A rule without ':' splits into a single item, which
                # makes dict.update() raise ValueError
                tablespaces.update([rule.split(':', 1)])
            except ValueError:
                output.error(
                    "Invalid tablespace relocation rule '%s'\n"
                    "HINT: The valid syntax for a relocation rule is "
                    "NAME:LOCATION", rule)
                output.close_and_exit()

    # validate the rules against the tablespace list
    valid_tablespaces = []
    if backup_info.tablespaces:
        valid_tablespaces = [
            tablespace_data.name for tablespace_data in backup_info.tablespaces
        ]
    for item in tablespaces:
        if item not in valid_tablespaces:
            output.error(
                "Invalid tablespace name '%s'\n"
                "HINT: Please use any of the following "
                "tablespaces: %s", item, ', '.join(valid_tablespaces))
            output.close_and_exit()

    # explicitly disallow the rsync remote syntax (common mistake)
    if ':' in args.destination_directory:
        output.error(
            "The destination directory parameter "
            "cannot contain the ':' character\n"
            "HINT: If you want to do a remote recovery you have to use "
            "the --remote-ssh-command option")
        output.close_and_exit()

    # Command line overrides of the server configuration
    if args.retry_sleep is not None:
        server.config.basebackup_retry_sleep = args.retry_sleep
    if args.retry_times is not None:
        server.config.basebackup_retry_times = args.retry_times
    if hasattr(args, 'get_wal'):
        recovery_options = server.config.recovery_options
        if args.get_wal:
            recovery_options.add(RecoveryOptions.GET_WAL)
        elif RecoveryOptions.GET_WAL in recovery_options:
            # Remove the option only when it is actually set: an
            # unconditional remove() fails (KeyError on a set) when
            # get-wal is disabled on the command line but was never
            # enabled in the configuration
            recovery_options.remove(RecoveryOptions.GET_WAL)
    if args.jobs is not None:
        server.config.parallel_jobs = args.jobs
    if hasattr(args, 'bwlimit'):
        server.config.bandwidth_limit = args.bwlimit

    # The recovery target options are mutually exclusive
    target_options = [
        'target_tli', 'target_time', 'target_xid', 'target_lsn', 'target_name',
        'target_immediate'
    ]
    specified_target_options = sum(
        1 for option in target_options if getattr(args, option))
    if specified_target_options > 1:
        output.error(
            "You cannot specify multiple targets for the recovery operation")
        output.close_and_exit()

    if hasattr(args, 'network_compression'):
        if args.network_compression and args.remote_ssh_command is None:
            output.error("Network compression can only be used with "
                         "remote recovery.\n"
                         "HINT: If you want to do a remote recovery "
                         "you have to use the --remote-ssh-command option")
            output.close_and_exit()
        server.config.network_compression = args.network_compression

    with closing(server):
        try:
            server.recover(backup_info,
                           args.destination_directory,
                           tablespaces=tablespaces,
                           target_tli=args.target_tli,
                           target_time=args.target_time,
                           target_xid=args.target_xid,
                           target_lsn=args.target_lsn,
                           target_name=args.target_name,
                           target_immediate=args.target_immediate,
                           exclusive=args.exclusive,
                           remote_command=args.remote_ssh_command,
                           target_action=getattr(args, 'target_action', None),
                           standby_mode=getattr(args, 'standby_mode', None))
        except RecoveryException as exc:
            # Report only the message of an expected recovery failure
            output.error(force_str(exc))

    output.close_and_exit()
Beispiel #11
0
def main():
    """
    Entry point of the Frabit command line interface: builds the
    argument parser, registers the commands and dispatches the call.
    """
    parser = ArghParser(epilog='Frabit by Frabit (www.frabit.com)')

    # Global options
    version_text = '{version}\n\nFrabit by Frabit (www.frabit.com)'.format(
        version=frabit.__version__)
    parser.add_argument('-v', '--version',
                        action='version', version=version_text)
    config_help = 'uses a configuration file (defaults: %s)' % \
        ', '.join(frabit.config.Config.CONFIG_FILES)
    parser.add_argument('-c', '--config',
                        help=config_help, default=SUPPRESS)
    parser.add_argument('--color', '--colour',
                        help='Whether to use colors in the output',
                        choices=['never', 'always', 'auto'],
                        default='auto')
    parser.add_argument('--log-level',
                        help='Override the default log level',
                        choices=list(get_log_levels()),
                        default=SUPPRESS)
    parser.add_argument('-q', '--quiet',
                        help='be quiet', action='store_true')
    parser.add_argument('-d', '--debug',
                        help='debug output', action='store_true')
    parser.add_argument('-f', '--format',
                        help='output format',
                        choices=output.AVAILABLE_WRITERS.keys(),
                        default=output.DEFAULT_WRITER)

    # Commands exposed by the command line interface
    commands = [
        archive_wal,
        backup,
        check,
        check_backup,
        cron,
        delete,
        diagnose,
        get_wal,
        list_backup,
        list_files,
        list_server,
        put_wal,
        rebuild_xlogdb,
        receive_wal,
        recover,
        show_backup,
        show_server,
        replication_status,
        status,
        switch_wal,
        switch_xlog,
        sync_info,
        sync_backup,
        sync_wals,
    ]
    parser.add_commands(commands)

    # noinspection PyBroadException
    try:
        parser.dispatch(pre_call=global_config)
    except KeyboardInterrupt:
        output.error("Process interrupted by user (KeyboardInterrupt)")
    except Exception as exc:
        # Top-level boundary: report any other failure through the
        # output API instead of letting it propagate
        output.exception("%s\nSee log file for more details." % exc)

    # cleanup output API and exit honoring output.error_occurred and
    # output.error_exit_code
    output.close_and_exit()
Beispiel #12
0
def get_server_list(args=None,
                    skip_inactive=False,
                    skip_disabled=False,
                    skip_passive=False,
                    on_error_stop=True,
                    suppress_error=False):
    """
    Build the dictionary of the requested servers from the configuration

    When args is None, or args.server_name contains 'all', every
    configured server is taken into account. A requested name that is
    not configured is mapped to None in the result.

    :param args: an argparse namespace containing a list server_name parameter
    :param bool skip_inactive: skip inactive servers when 'all' is required
    :param bool skip_disabled: skip disabled servers when 'all' is required
    :param bool skip_passive: skip passive servers when 'all' is required
    :param bool on_error_stop: stop if an error is found
    :param bool suppress_error: suppress display of errors (e.g. diagnose)
    :rtype: dict[str,Server]
    """
    # This function must be called within a multiple-server context
    assert not args or isinstance(args.server_name, list)

    # Names of every configured server (needed for the 'all' case)
    configured_names = frabit.__config__.server_names()

    # Configuration errors collected from all the servers
    config_errors = frabit.__config__.servers_msg_list

    # Global errors are handled before anything else
    if config_errors:
        if not suppress_error:
            for message in config_errors:
                output.error(message)

        # If requested, exit on first error
        if on_error_stop:
            output.close_and_exit()
            # Never reached, kept for clarity
            return {}

    # The 'all' case is selected either by a missing args
    # or by the 'all' special name
    all_requested = not args or 'all' in args.server_name
    if all_requested:
        # 'all' must be the only specified argument
        if args and len(args.server_name) != 1:
            output.error("You cannot use 'all' with other server names")
        requested_names = configured_names
    else:
        # A set counts multiple occurrences of a name only once
        requested_names = set(args.server_name)

    # Build the result, one requested server at a time
    result = {}
    for server_name in requested_names:
        conf = frabit.__config__.get_server(server_name)
        if conf is None:
            # Unknown server
            result[server_name] = None
            continue

        candidate = Server(conf)
        # Skip inactive servers, if requested
        if skip_inactive and not candidate.config.active:
            output.info("Skipping inactive server '%s'" % conf.name)
            continue
        # Skip disabled servers, if requested
        if skip_disabled and candidate.config.disabled:
            output.info("Skipping temporarily disabled server '%s'" %
                        conf.name)
            continue
        # Skip passive nodes, if requested
        if skip_passive and candidate.replica_node:
            output.info("Skipping passive server '%s'", conf.name)
            continue
        result[server_name] = candidate

    return result
Beispiel #13
0
def get_server(args,
               skip_inactive=True,
               skip_disabled=False,
               skip_passive=False,
               inactive_is_error=False,
               on_error_stop=True,
               suppress_error=False):
    """
    Single-server wrapper around get_server_list(): retrieves the
    server named by args.server_name together with its configuration

    Returns a Server object or None if the required server is unknown and
    on_error_stop is False.

    WARNING: this function modifies the 'args' parameter
    (args.server_name is replaced by a one-element list)

    :param args: an argparse namespace containing a single
        server_name parameter
        WARNING: the function modifies the content of this parameter
    :param bool skip_inactive: do nothing if the server is inactive
    :param bool skip_disabled: do nothing if the server is disabled
    :param bool skip_passive: do nothing if the server is passive
    :param bool inactive_is_error: treat inactive server as error
    :param bool on_error_stop: stop if an error is found
    :param bool suppress_error: suppress display of errors (e.g. diagnose)
    :rtype: Server|None
    """
    # This function must be called within a single-server context
    name = args.server_name
    assert isinstance(name, str)

    # 'all' makes no sense when exactly one server is expected
    if name == 'all':
        output.error("You cannot use 'all' in a single server context")
        output.close_and_exit()
        # Never reached, kept for clarity
        return None

    # get_server_list() expects a list of names
    args.server_name = [name]

    # An inactive server must be retrieved in order to report it as an
    # error, so inactive_is_error switches skip_inactive off
    skip_inactive = skip_inactive & (not inactive_is_error)

    servers = get_server_list(args,
                              skip_inactive=skip_inactive,
                              skip_disabled=skip_disabled,
                              skip_passive=skip_passive,
                              on_error_stop=on_error_stop,
                              suppress_error=suppress_error)

    # An empty result means the requested server has been filtered out
    if not servers:
        output.close_and_exit()
        # Never reached, kept for clarity
        return None

    server = servers[name]

    # Standard validation: emits the standard error messages for an
    # unknown, inactive or disabled server and tells whether the command
    # has to run. If on_error_stop (default) exits
    runnable = manage_server_command(server, name, inactive_is_error)
    if on_error_stop and not runnable:
        output.close_and_exit()
        # Never reached, kept for clarity
        return None

    # The server passed every check
    return server