Example #1
0
def replication_resume(params):
    if not postgresql.is_secondary():
        hookenv.action_fail("Not a hot standby")
        return

    con = postgresql.connect()
    con.autocommit = True

    offset = postgresql.wal_received_offset(con)
    hookenv.action_set(dict(offset=offset))

    cur = con.cursor()
    if postgresql.has_version("10"):
        cur.execute("SELECT pg_is_wal_replay_paused()")
    else:
        cur.execute("SELECT pg_is_xlog_replay_paused()")
    if cur.fetchone()[0] is False:
        # Not a failure, per lp:1670613
        hookenv.action_set(dict(result="Already resumed"))
        return
    if postgresql.has_version("10"):
        cur.execute("SELECT pg_wal_replay_resume()")
    else:
        cur.execute("SELECT pg_xlog_replay_resume()")
    hookenv.action_set(dict(result="Resumed"))
Example #2
0
    def test_has_version(self, version):
        version.return_value = '9.4'
        self.assertTrue(postgresql.has_version('9.1'))
        self.assertTrue(postgresql.has_version('9.4'))
        self.assertFalse(postgresql.has_version('9.5'))

        # New version scheme starting PostgreSQL 10
        version.return_value = '10'
        self.assertTrue(postgresql.has_version('9.6'))
        self.assertFalse(postgresql.has_version('11'))
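
These tests pin down the expected semantics: version strings compare as dotted integer tuples, so '10' sorts above '9.6'. A minimal sketch consistent with the assertions above (the real helper may differ; postgresql.version() returning the running major version is an assumption) is:

def has_version(minimum):
    # Sketch only: compare dotted version strings numerically,
    # so that '10' >= '9.6' holds as the tests expect.
    def as_tuple(ver):
        return tuple(int(part) for part in ver.split("."))
    return as_tuple(postgresql.version()) >= as_tuple(minimum)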
Example #3
0
def postgresql_conf_defaults():
    """Return the postgresql.conf defaults, which we parse from config.yaml"""
    # We load defaults from the extra_pg_conf default in config.yaml,
    # which ensures that they never get out of sync.
    raw = helpers.config_yaml()["options"]["extra_pg_conf"]["default"]
    defaults = postgresql.parse_config(raw)

    # And recalculate some defaults, which could get out of sync.
    # Settings with mandatory minimums like wal_senders are handled
    # later, in ensure_viable_postgresql_conf().
    ram = int(host.get_total_ram() / (1024 * 1024))  # Working in megabytes.

    # Default shared_buffers to 25% of ram, minimum 16MB, maximum 8GB,
    # per current best practice rules of thumb. Rest is cache.
    shared_buffers = max(min(math.ceil(ram * 0.25), 8192), 16)
    effective_cache_size = max(1, ram - shared_buffers)
    defaults["shared_buffers"] = "{} MB".format(shared_buffers)
    defaults["effective_cache_size"] = "{} MB".format(effective_cache_size)

    # PostgreSQL 10 introduces multiple password encryption methods.
    if postgresql.has_version("10"):
        # Change this to scram-sha-256 in the next LTS release, when we
        # can start assuming clients have libpq 10. The setting can of
        # course still be overridden in the config.
        defaults["password_encryption"] = "md5"
    else:
        defaults["password_encryption"] = True

    return defaults
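
Working through the sizing rule above for a few RAM values (in megabytes) shows the 16 MB floor and 8 GB ceiling in action:

import math

for ram in (32, 4096, 65536):
    shared_buffers = max(min(math.ceil(ram * 0.25), 8192), 16)
    effective_cache_size = max(1, ram - shared_buffers)
    print(ram, shared_buffers, effective_cache_size)
# 32 MB RAM    ->   16 MB shared_buffers (floor),    16 MB effective_cache_size
# 4096 MB RAM  -> 1024 MB shared_buffers,          3072 MB effective_cache_size
# 65536 MB RAM -> 8192 MB shared_buffers (ceiling), 57344 MB effective_cache_size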
Example #4
0
def failover():
    if get_following() is None:
        hookenv.log("Failover already in progress", DEBUG)
        return

    # Stop replicating the doomed master, or we risk diverging
    # timelines.

    pg12 = postgresql.has_version("12")
    if pg12:
        path = postgresql.hot_standby_conf_path()
        template = "hot_standby.conf.tmpl"
    else:
        path = postgresql.recovery_conf_path()
        template = "recovery.conf.tmpl"

    templating.render(template,
                      path, {},
                      owner="postgres",
                      group="postgres",
                      perms=0o600)

    if pg12:
        touch(postgresql.hot_standby_signal_path())

    # Kick off a rolling restart to apply the change.
    reactive.set_state("postgresql.cluster.needs_restart")

    # Publish the change after the restart.
    set_following(None)
    reactive.set_state("postgresql.replication.publish_following")
Example #5
0
def update_recovery_conf(follow):
    assert follow != hookenv.local_unit()

    peer_rel = helpers.get_peer_relation()
    follow_relinfo = peer_rel.get(follow)
    assert follow_relinfo is not None, "Invalid upstream {}".format(follow)

    current_follow = get_following()

    if follow != current_follow:
        status_set("maintenance", "Following new unit {}".format(follow))
        set_following(follow)
        # Set this state to defer publication until after the restart.
        reactive.set_state("postgresql.replication.publish_following")

    else:
        # Even though the master is unchanged, we still regenerate
        # recovery.conf in case connection details such as IP addresses
        # have changed.
        hookenv.log("Continuing to follow {}".format(follow))

    pg12 = postgresql.has_version("12")
    if pg12:
        path = postgresql.hot_standby_conf_path()
        template = "hot_standby.conf.tmpl"
    else:
        path = postgresql.recovery_conf_path()
        template = "recovery.conf.tmpl"

    config = hookenv.config()

    data = dict(
        streaming_replication=config["streaming_replication"],
        host=follow_relinfo["host"],
        port=follow_relinfo["port"],
        user=replication_username(),
        password=leader_get("replication_password"),
    )

    if reactive.helpers.is_state("postgresql.wal_e.enabled"):
        data["restore_command"] = wal_e.wal_e_restore_command()

    templating.render(template,
                      path,
                      data,
                      owner="postgres",
                      group="postgres",
                      perms=0o600)

    if pg12:
        touch(postgresql.hot_standby_signal_path())

    # Use @when_file_changed for this when Issue #44 is resolved.
    if reactive.helpers.any_file_changed([path]):
        reactive.set_state("postgresql.cluster.needs_restart")
        if reactive.is_state("postgresql.replication.cloned"):
            reactive.set_state("postgresql.replication.check_following")
Example #6
0
def update_postgresql_conf():
    settings = assemble_postgresql_conf()
    path = postgresql.postgresql_conf_path()

    with open(path, "r") as f:
        pg_conf = f.read()

    start_mark = "### BEGIN JUJU SETTINGS ###"
    end_mark = "### END JUJU SETTINGS ###"

    # Strip the existing settings section, including the markers.
    pg_conf = re.sub(
        r"^\s*{}.*^\s*{}\s*$".format(re.escape(start_mark), re.escape(end_mark)),
        "",
        pg_conf,
        flags=re.I | re.M | re.DOTALL,
    )

    for k in settings:
        # Comment out conflicting options. We could just allow later
        # options to override earlier ones, but this is less surprising.
        pg_conf = re.sub(
            r"^\s*({}[\s=].*)$".format(re.escape(k)),
            r"# juju # \1",
            pg_conf,
            flags=re.M | re.I,
        )

    # Store the updated charm options. This is compared with the
    # live config to detect if a restart is required.
    store = unitdata.kv()
    current_prefix = "postgresql.cluster.pgconf.current."
    store.unsetrange(prefix=current_prefix)
    store.update(settings, prefix=current_prefix)

    # Generate the charm config section, adding it to the end of the
    # config file.
    simple_re = re.compile(r"^[-.\w]+$")
    override_section = [start_mark]
    for k, v in settings.items():
        v = str(v)
        assert "\n" not in v, "Invalid config value {!r}".format(v)
        if simple_re.search(v) is None:
            v = "'{}'".format(v.replace("'", "''"))
        override_section.append("{} = {}".format(k, v))
    if postgresql.has_version("12"):
        override_section.append("include_if_exists '{}'".format(postgresql.hot_standby_conf_path()))
    override_section.append(end_mark)
    pg_conf += "\n" + "\n".join(override_section)

    helpers.rewrite(path, pg_conf)
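
The quoting rule above only wraps values containing characters outside [-.\w]; everything else is written bare. A small standalone demonstration of that logic (illustrative only):

import re

simple_re = re.compile(r"^[-.\w]+$")

def format_setting(k, v):
    v = str(v)
    if simple_re.search(v) is None:
        v = "'{}'".format(v.replace("'", "''"))
    return "{} = {}".format(k, v)

print(format_setting("max_connections", 100))       # max_connections = 100
print(format_setting("shared_buffers", "1024 MB"))  # shared_buffers = '1024 MB'
print(format_setting("archive_command", "cp %p /a'b/%f"))  # quotes doubled: 'cp %p /a''b/%f'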
Example #7
0
def promote_to_master():
    status_set("maintenance", "Promoting to master")
    postgresql.promote()

    set_following(None)
    publish_following()

    while postgresql.is_in_recovery():
        status_set("maintenance", "Waiting for startup")
        time.sleep(1)

    if postgresql.has_version("12"):
        assert not os.path.exists(
            postgresql.hot_standby_signal_path()
        ), "standby.signal still exists after promotion"
    else:
        assert not os.path.exists(
            postgresql.recovery_conf_path()
        ), "recovery.conf still exists after promotion"

    update_replication_states()
    helpers.ping_peers()
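
is_in_recovery() is defined elsewhere in the charm; a plausible sketch, assuming it simply wraps PostgreSQL's pg_is_in_recovery() function, is:

def is_in_recovery():
    # Hypothetical sketch, not the charm's actual implementation.
    cur = postgresql.connect().cursor()
    cur.execute("SELECT pg_is_in_recovery()")
    return cur.fetchone()[0]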
Example #8
0
def ensure_viable_postgresql_conf(opts):
    def force(**kw):
        for k, v in kw.items():
            if opts.get(k) != v:
                hookenv.log("Setting {} to {}".format(k, v), DEBUG)
                opts[k] = v

    config = hookenv.config()
    rels = context.Relations()

    # Number of standby units - count peers and 'master' relations.
    num_standbys = len(helpers.get_peer_relation() or {})
    for rel in rels["master"].values():
        num_standbys += len(rel)

    num_clients = 0
    for rel in list(rels["db"]) + list(rels["db-admin"]):
        num_clients += len(rel)

    # Even without replication, replication slots get used by
    # pg_basebackup(1). Bump up max_wal_senders so things work. It is
    # cheap, so perhaps we should just pump it to several thousand.
    min_wal_senders = num_standbys * 2 + 5
    if min_wal_senders > int(opts.get("max_wal_senders", 0)):
        force(max_wal_senders=min_wal_senders)

    # We used to calculate a minimum max_connections here, ensuring
    # that we had at least one per client and enough for replication
    # and backups. It wasn't much use though, as the major variable
    # is not the number of clients but how many connections the
    # clients open (connection pools of 20 or more are not uncommon).
    # lp:1594667 required the calculation to be synchronized, or just
    # removed. So removed to avoid complexity for dubious gains.
    #
    # max_connections. One per client unit, plus replication.
    # max_wal_senders = int(opts.get('max_wal_senders', 0))
    # assert max_wal_senders > 0
    # min_max_connections = max_wal_senders + max(1, num_clients)
    #
    min_max_connections = 100
    if min_max_connections > int(opts.get("max_connections", 0)):
        force(max_connections=min_max_connections)

    # We want 'hot_standby' at a minimum, as it lets us run
    # pg_basebackup() and it is recommended over the more
    # minimal 'archive'. Is it worth enabling the higher-still
    # 'logical' level only when necessary? How do we detect that?
    force(hot_standby=True)
    if postgresql.has_version("9.4"):
        force(wal_level="logical")
    else:
        force(wal_level="hot_standby")

    # Having two config options for the one setting is confusing. Perhaps
    # we should deprecate this.
    if num_standbys and (int(config["replicated_wal_keep_segments"]) > int(opts.get("wal_keep_segments", 0))):
        force(wal_keep_segments=config["replicated_wal_keep_segments"])

    # Log shipping with WAL-E.
    if config["wal_e_storage_uri"]:
        force(archive_mode="on")  # Boolean pre-9.5, enum 9.5+
        force(archive_command=wal_e.wal_e_archive_command())

    # Log destinations for syslog. This charm only supports standard
    # Debian logging, or Debian + syslog. This will grow more complex in
    # the future, as the local logs are redundant if you are using syslog
    # for log aggregation, and we will want to add csvlog because it is
    # so much easier to parse.
    if context.Relations()["syslog"]:
        force(
            log_destination="stderr,syslog",
            syslog_ident=hookenv.local_unit().replace("/", "_"),
        )
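
For a sense of scale, the minimum above grows slowly with the number of standbys; the worked example below (hypothetical unit counts) shows the floor it enforces:

for num_standbys in (0, 2, 5):
    print(num_standbys, num_standbys * 2 + 5)
# 0 standbys -> at least 5 wal senders (headroom for pg_basebackup)
# 2 standbys -> at least 9
# 5 standbys -> at least 15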
Example #9
0
def wal_e_restore():
    reactive.remove_state("action.wal-e-restore")
    params = hookenv.action_get()
    backup = params["backup-name"].strip().replace("-", "_")
    storage_uri = params["storage-uri"].strip()

    ship_uri = hookenv.config().get("wal_e_storage_uri")
    if storage_uri == ship_uri:
        hookenv.action_fail(
            "The storage-uri parameter is identical to "
            "the wal_e_storage_uri config setting. Your "
            "restoration source cannot be the same as the "
            "folder you are archiving too to avoid corrupting "
            "the backups."
        )
        return

    if not params["confirm"]:
        m = "Recovery from {}.".format(storage_uri)
        if ship_uri:
            m += "\nContents of {} will be destroyed.".format(ship_uri)
        m += "\nExisting local database will be destroyed."
        m += "\nRerun action with 'confirm=true' to proceed."
        hookenv.action_set({"info": m})
        return

    with tempfile.TemporaryDirectory(prefix="wal-e", suffix="envdir") as envdir:
        update_wal_e_env_dir(envdir, storage_uri)

        # Confirm there is a backup to restore
        backups = wal_e_list_backups(envdir)
        if not backups:
            hookenv.action_fail("No backups found at {}".format(storage_uri))
            return
        if backup != "LATEST" and backup not in (b["name"] for b in backups):
            hookenv.action_fail("Backup {} not found".format(backup))
            return

        # Shut down PostgreSQL. Note we want this action to run synchronously,
        # so there is no opportunity to ask permission from the leader. If
        # there are other units cloning this database, those clone operations
        # will fail, which seems preferable to blocking a recovery operation:
        # if we are doing disaster recovery, we generally want to do it right
        # now.
        status_set("maintenance", "Stopping PostgreSQL for backup restoration")
        postgresql.stop()

        # Trash the existing database. It's dangerous to do this first, but
        # we probably need the space.
        data_dir = postgresql.data_dir()  # May be a symlink
        for content in os.listdir(data_dir):
            cpath = os.path.join(data_dir, content)
            if os.path.isdir(cpath) and not os.path.islink(cpath):
                shutil.rmtree(cpath)
            else:
                os.remove(cpath)

        # WAL-E recover
        status_set("maintenance", "Restoring backup {}".format(backup))
        wal_e_run(["backup-fetch", data_dir, backup], envdir=envdir)

        # Create recovery.conf to complete recovery
        is_master = reactive.is_state("postgresql.replication.is_master")
        standby_mode = "off" if is_master else "on"
        if params.get("target-time"):
            target_time = "recovery_target_time='{}'" "".format(params["target-time"])
        else:
            target_time = ""
        target_action = "promote" if is_master else "shutdown"
        immediate = "" if is_master else "recovery_target='immediate'"
        helpers.write(
            postgresql.recovery_conf_path(),
            dedent(
                """\
                             # Managed by Juju. PITR in progress.
                             standby_mode = {}
                             restore_command='{}'
                             recovery_target_timeline = {}
                             recovery_target_action = {}
                             {}
                             {}
                             """
            ).format(
                standby_mode,
                wal_e_restore_command(envdir=envdir),
                params["target-timeline"],
                target_action,
                target_time,
                immediate,
            ),
            mode=0o600,
            user="******",
            group="postgres",
        )

        # Avoid circular import. We could also avoid the import entirely
        # with a sufficiently complex set of handlers in the replication
        # module, but that seems to be a worse solution. Better to break
        # out this action into a separate module.
        from reactive.postgresql import replication

        if is_master:
            if ship_uri:
                # If master, trash the configured wal-e storage. This may
                # contain WAL and backups from the old cluster which will
                # conflict with the new cluster. Hopefully it does not
                # contain anything important, because we have no way to
                # prompt the user for confirmation.
                wal_e_run(["delete", "--confirm", "everything"])

            # Then, wait for recovery and promotion.
            postgresql.start()
            con = postgresql.connect()
            cur = con.cursor()
            while True:
                if postgresql.has_version("10"):
                    cur.execute(
                        """SELECT pg_is_in_recovery(),
                                          pg_last_wal_replay_lsn()"""
                    )
                else:
                    cur.execute(
                        """SELECT pg_is_in_recovery(),
                                          pg_last_xlog_replay_location()"""
                    )
                in_rec, loc = cur.fetchone()
                if not in_rec:
                    break
                status_set("maintenance", "Recovery at {}".format(loc))
                time.sleep(10)
        else:
            # If standby, startup and wait for recovery to complete and
            # shutdown.
            status_set("maintenance", "Recovery")
            # Startup might shutdown immediately and look like a failure.
            postgresql.start(ignore_failure=True)
            # No recovery point status yet for standbys, as we would need
            # to handle connection failures when the DB shuts down. We
            # should do this.
            while postgresql.is_running():
                time.sleep(5)
            replication.update_recovery_conf(follow=replication.get_master())

    # Reactive handlers will deal with the rest of the cleanup.
    # eg. ensuring required users and roles exist
    replication.update_replication_states()
    reactive.remove_state("postgresql.cluster.configured")
    reactive.toggle_state("postgresql.cluster.is_running", postgresql.is_running())
    reactive.remove_state("postgresql.nagios.user_ensured")
    reactive.remove_state("postgresql.replication.replication_user_created")
    reactive.remove_state("postgresql.client.published")
Example #10
0
    def test_has_version(self, version):
        version.return_value = "9.4"
        self.assertTrue(postgresql.has_version("9.1"))
        self.assertTrue(postgresql.has_version("9.4"))
        self.assertFalse(postgresql.has_version("9.5"))
Example #11
0
def ensure_viable_postgresql_conf(opts):
    def force(**kw):
        for k, v in kw.items():
            if opts.get(k) != v:
                hookenv.log('Setting {} to {}'.format(k, v), DEBUG)
                opts[k] = v

    config = hookenv.config()
    rels = context.Relations()

    # Number of standby units - count peers and 'master' relations.
    num_standbys = len(rels.peer or {})
    for rel in rels['master'].values():
        num_standbys += len(rel)

    num_clients = 0
    for rel in list(rels['db']) + list(rels['db-admin']):
        num_clients += len(rel)

    # Even without replication, replication slots get used by
    # pg_basebackup(1). Bump up max_wal_senders so things work. It is
    # cheap, so perhaps we should just pump it to several thousand.
    min_wal_senders = num_standbys * 2 + 5
    if min_wal_senders > int(opts.get('max_wal_senders', 0)):
        force(max_wal_senders=min_wal_senders)

    # We used to calculate a minimum max_connections here, ensuring
    # that we had at least one per client and enough for replication
    # and backups. It wasn't much use though, as the major variable
    # is not the number of clients but how many connections the
    # clients open (connection pools of 20 or more are not uncommon).
    # lp:1594667 required the calculation to be synchronized, or just
    # removed. So removed to avoid complexity for dubious gains.
    #
    # max_connections. One per client unit, plus replication.
    # max_wal_senders = int(opts.get('max_wal_senders', 0))
    # assert max_wal_senders > 0
    # min_max_connections = max_wal_senders + max(1, num_clients)
    #
    min_max_connections = 100
    if min_max_connections > int(opts.get('max_connections', 0)):
        force(max_connections=min_max_connections)

    # We want 'hot_standby' at a minimum, as it lets us run
    # pg_basebackup() and it is recommended over the more
    # minimal 'archive'. Is it worth enabling the higher-still
    # 'logical' level only when necessary? How do we detect that?
    force(hot_standby=True)
    if postgresql.has_version('9.4'):
        force(wal_level='logical')
    else:
        force(wal_level='hot_standby')

    # Having two config options for the one setting is confusing. Perhaps
    # we should deprecate this.
    if num_standbys and (int(config['replicated_wal_keep_segments']) >
                         int(opts.get('wal_keep_segments', 0))):
        force(wal_keep_segments=config['replicated_wal_keep_segments'])

    # Log shipping with WAL-E.
    if config['wal_e_storage_uri']:
        force(archive_mode='on')  # Boolean pre-9.5, enum 9.5+
        force(archive_command=wal_e.wal_e_archive_command())

    # Log destinations for syslog. This charm only supports standard
    # Debian logging, or Debian + syslog. This will grow more complex in
    # the future, as the local logs are redundant if you are using syslog
    # for log aggregation, and we will want to add csvlog because it is
    # so much easier to parse.
    if context.Relations()['syslog']:
        force(log_destination='stderr,syslog',
              syslog_ident=hookenv.local_unit().replace('/', '_'))
Example #12
0
def clone_master():
    master = get_master()
    peer_rel = helpers.get_peer_relation()
    master_relinfo = peer_rel[master]

    # Be paranoid since we are about to destroy data.
    assert not reactive.helpers.is_state("postgresql.replication.is_master")
    assert not reactive.helpers.is_state("postgresql.cluster.is_running")

    # We use realpath on data_dir as it may have been replaced with
    # a symbolic link, so we empty and recreate the actual directory
    # while the links remain in place.
    data_dir = os.path.realpath(postgresql.data_dir())

    if os.path.exists(data_dir):
        hookenv.log("Removing {} in preparation for clone".format(data_dir))
        shutil.rmtree(data_dir)
    helpers.makedirs(data_dir, mode=0o700, user="postgres", group="postgres")

    if postgresql.has_version("10"):
        wal_method = "--wal-method=stream"
    else:
        wal_method = "--xlog-method=stream"
    cmd = [
        "sudo",
        "-H",  # -H needed to locate $HOME/.pgpass
        "-u",
        "postgres",
        "pg_basebackup",
        "-D",
        postgresql.data_dir(),
        "-h",
        master_relinfo["host"],
        "-p",
        master_relinfo["port"],
        "--checkpoint=fast",
        "--progress",
        wal_method,
        "--no-password",
        "--username=_juju_repl",
    ]
    hookenv.log("Cloning {} with {}".format(master, " ".join(cmd)))
    status_set("maintenance", "Cloning {}".format(master))
    try:
        # Switch to a directory the postgres user can access.
        with helpers.switch_cwd("/tmp"):
            subprocess.check_call(cmd, universal_newlines=True)
    except subprocess.CalledProcessError as x:
        hookenv.log("Clone failed with {}".format(x), ERROR)
        # We failed, and the local cluster is broken.
        status_set("blocked", "Failed to clone {}".format(master))
        postgresql.drop_cluster()
        reactive.remove_state("postgresql.cluster.configured")
        reactive.remove_state("postgresql.cluster.created")
        # Terminate. We need this hook to exit, rather than enter a loop.
        raise SystemExit(0)

    update_recovery_conf(follow=master)

    reactive.set_state("postgresql.replication.cloned")
    update_replication_states()
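
For illustration, with a master advertising host 10.0.0.2 and port 5432 (hypothetical values) on a PostgreSQL 10+ unit, the command list above joins to an invocation along the lines of:

# sudo -H -u postgres pg_basebackup -D <data_dir> -h 10.0.0.2 -p 5432 \
#     --checkpoint=fast --progress --wal-method=stream --no-password \
#     --username=_juju_repl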