Example #1
def db_relation_mirror(rel):
    """Non-masters mirror relation information from the master."""
    master = replication.get_master()
    master_keys = ["database", "user", "password", "roles", "schema_user", "schema_password", "extensions"]
    master_info = rel.peers.get(master)
    if master_info is None:
        hookenv.log("Waiting for {} to join {}".format(master, rel))
        return
    hookenv.log("Mirroring {} database credentials from {}".format(rel, master))
    rel.local.update({k: master_info.get(k) for k in master_keys})
Example #2
def block_on_maintenance_mode():
    if hookenv.leader_get("maintenance_mode"):
        master = replication.get_master()
        if master is None:
            msg = "Application in maintenance mode"
        elif master == hookenv.local_unit():
            msg = "Master unit in maintenance mode"
        else:
            msg = "Standby unit in maintenance mode"
        hookenv.status_set("blocked", msg)
        hookenv.log("Application is in maintenance mode, terminating hook",
                    hookenv.WARNING)
        raise SystemExit(0)  # Terminate now without error. hookenv.atexit() not invoked.
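Example #2 only reads the maintenance flag from leader settings; nothing in this section shows how it gets set. A minimal leader-side toggle might look like the sketch below, assuming leader settings are the mechanism and that any non-empty value counts as "on" (which is all the truthiness check in block_on_maintenance_mode() requires); the helper name and stored values are illustrative, not taken from the charm.

from charmhelpers.core import hookenv
from charms import leadership


def set_maintenance_mode(enabled):
    """Hypothetical helper: only the leader may change leader settings."""
    if not hookenv.is_leader():
        hookenv.log("Only the leader can toggle maintenance mode", hookenv.WARNING)
        return
    # An empty value reads back as falsy from leader_get(), clearing the flag.
    leadership.leader_set(maintenance_mode="true" if enabled else "")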
Example #3
def db_relation_mirror(rel):
    """Non-masters mirror relation information from the master."""
    master = replication.get_master()
    master_keys = [
        "database",
        "user",
        "password",
        "roles",
        "schema_user",
        "schema_password",
        "extensions",
    ]
    master_info = rel.peers.get(master)
    if master_info is None:
        hookenv.log("Waiting for {} to join {}".format(master, rel))
        return
    hookenv.log("Mirroring {} database credentials from {}".format(rel, master))
    rel.local.update({k: master_info.get(k) for k in master_keys})
Example #4
def db_relation_common(rel):
    """Publish unit specific relation details."""
    local = rel.local
    if "database" not in local:
        return  # Not yet ready.

    # Version number, allowing clients to adjust or block if their
    # expectations are not met.
    local["version"] = postgresql.version()

    # Calculate the state of this unit. 'standalone' will disappear
    # in a future version of this interface, as this state was
    # only needed to deal with race conditions now solved by
    # Juju leadership. We check for is_primary() rather than
    # the postgresql.replication.is_master reactive state to
    # publish the correct state when we are using manual replication
    # (there might be multiple independent masters, possibly useful for
    # sharding, or perhaps this is a multi master BDR setup).
    if postgresql.is_primary():
        if reactive.helpers.is_state("postgresql.replication.has_peers"):
            local["state"] = "master"
        else:
            local["state"] = "standalone"
    else:
        local["state"] = "hot standby"

    # Host is the private ip address, but this might change and
    # become the address of an attached proxy or alternative peer
    # if this unit is in maintenance.
    local["host"] = ingress_address(local.relname, local.relid)

    # Port will be 5432, unless the user has overridden it or
    # something very weird happened when the packages were installed.
    local["port"] = str(postgresql.port())

    # The list of remote units on this relation granted access.
    # This is to avoid the race condition where a new client unit
    # joins an existing client relation and sees valid credentials,
    # before we have had a chance to grant it access.
    local["allowed-units"] = " ".join(unit for unit, relinfo in rel.items() if len(incoming_addresses(relinfo)) > 0)

    # The list of IP address ranges on this relation granted access.
    # This will replace allowed-units, which does not work with cross
    # model relations due to the anonymization of the external client.
    local["allowed-subnets"] = ",".join(
        sorted({r: True for r in chain(*[incoming_addresses(relinfo) for relinfo in rel.values()])}.keys())
    )

    # v2 protocol. Publish connection strings for this unit and its peers.
    # Clients should use these connection strings in favour of the old
    # host, port, database settings. A single proxy unit can thus
    # publish several end points to clients.
    master = replication.get_master()
    if replication.is_master():
        master_relinfo = local
    else:
        master_relinfo = rel.peers.get(master)
    local["master"] = relinfo_to_cs(master_relinfo)
    all_relinfo = list(rel.peers.values()) if rel.peers else []
    all_relinfo.append(rel.local)
    standbys = filter(
        None,
        [relinfo_to_cs(relinfo) for relinfo in all_relinfo if relinfo.unit != master],
    )
    local["standbys"] = "\n".join(sorted(standbys)) or None
Example #5
def upgrade_charm():
    workloadstatus.status_set("maintenance", "Upgrading charm")

    rels = context.Relations()

    # The master is now appointed by the leader.
    if hookenv.is_leader():
        master = replication.get_master()
        if not master:
            master = hookenv.local_unit()
            peer_rel = helpers.get_peer_relation()
            if peer_rel:
                for peer_relinfo in peer_rel.values():
                    if peer_relinfo.get("state") == "master":
                        master = peer_relinfo.unit
                        break
            hookenv.log("Discovered {} is the master".format(master))
            leadership.leader_set(master=master)

    # The name of this crontab has changed. It will get regenerated.
    if os.path.exists("/etc/cron.d/postgresql"):
        hookenv.log("Removing old crontab")
        os.unlink("/etc/cron.d/postgresql")

    # Older usernames were generated from the relation id,
    # and really old ones contained random components. This made it
    # problematic to restore a database into a fresh environment,
    # because the new usernames would not match the old usernames and
    # none of the database permissions would match. We now generate
    # usernames using just the client service name, so restoring a
    # database into a fresh environment will work provided the service
    # names match. We want to update the old usernames in upgraded
    # services to the new format to improve their disaster recovery
    # story.
    for relname, superuser in [("db", False), ("db-admin", True)]:
        for client_rel in rels[relname].values():
            hookenv.log("Migrating database users for {}".format(client_rel))
            password = client_rel.local.get("password", host.pwgen())
            old_username = client_rel.local.get("user")
            new_username = postgresql.username(client_rel.service, superuser,
                                               False)
            if old_username and old_username != new_username:
                migrate_user(old_username, new_username, password, superuser)
                client_rel.local["user"] = new_username
                client_rel.local["password"] = password

            old_username = client_rel.local.get("schema_user")
            if old_username and old_username != new_username:
                migrate_user(old_username, new_username, password, superuser)
                client_rel.local["schema_user"] = new_username
                client_rel.local["schema_password"] = password

    # Admin relations used to get 'all' published as the database name,
    # which was bogus.
    for client_rel in rels["db-admin"].values():
        if client_rel.local.get("database") == "all":
            client_rel.local["database"] = client_rel.service

    # Reconfigure PostgreSQL and republish client relations.
    reactive.remove_state("postgresql.cluster.configured")
    reactive.remove_state("postgresql.client.published")

    # Don't recreate the cluster.
    reactive.set_state("postgresql.cluster.created")

    # Set the postgresql.replication.cloned flag, so we don't rebuild
    # standbys when upgrading the charm from a pre-reactive version.
    reactive.set_state("postgresql.replication.cloned")

    # Publish which node we are following
    peer_rel = helpers.get_peer_relation()
    if peer_rel and "following" not in peer_rel.local:
        following = unitdata.kv().get("postgresql.replication.following")
        if following is None and not replication.is_master():
            following = replication.get_master()
        peer_rel.local["following"] = following

    # Ensure storage that was attached but ignored is no longer ignored.
    if not reactive.is_state("postgresql.storage.pgdata.attached"):
        if hookenv.storage_list("pgdata"):
            storage.attach()

    # Ensure client usernames and passwords match leader settings.
    for relname in ("db", "db-admin"):
        for rel in rels[relname].values():
            del rel.local["user"]
            del rel.local["password"]

    # Ensure the configure version is cached.
    postgresql.version()

    # Skip checks for pre-existing databases, as that has already happened.
    reactive.set_state("postgresql.cluster.initial-check")

    # Reinstall support scripts
    reactive.remove_state("postgresql.cluster.support-scripts")

    # Ensure that systemd is managing the PostgreSQL process
    if host.init_is_systemd() and not reactive.is_flag_set("postgresql.upgrade.systemd"):
        reactive.set_flag("postgresql.upgrade.systemd")
        if reactive.is_flag_set("postgresql.cluster.is_running"):
            hookenv.log("Restarting PostgreSQL under systemd", hookenv.WARNING)
            reactive.clear_flag("postgresql.cluster.is_running")
            postgresql.stop_pgctlcluster()

    # Update the PGDG source, in case the signing key has changed.
    config = hookenv.config()
    if config["pgdg"]:
        service.add_pgdg_source()
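The username migration above relies on postgresql.username(), which is not shown here. The important property, per the comment block, is that the result is a pure function of the client application name plus the privilege level, so a dump restored into a fresh model keeps its permissions aligned. A sketch of that idea (the prefix and suffixes below are illustrative assumptions, not the charm's actual scheme):

def username(service_or_unit, superuser, replication):
    """Hypothetical sketch: derive a stable role name from the client name."""
    # Strip any unit suffix ("myapp/0" -> "myapp") so every unit of a client
    # application maps to the same role.
    service = service_or_unit.split("/", 1)[0]
    name = "juju_{}".format(service)
    if superuser:
        name += "_admin"
    if replication:
        name += "_repl"
    return name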
Example #6
def upgrade_charm():
    workloadstatus.status_set('maintenance', 'Upgrading charm')

    rels = context.Relations()

    # The master is now appointed by the leader.
    if hookenv.is_leader():
        master = replication.get_master()
        if not master:
            master = hookenv.local_unit()
            if rels.peer:
                for peer_relinfo in rels.peer.values():
                    if peer_relinfo.get('state') == 'master':
                        master = peer_relinfo.unit
                        break
            hookenv.log('Discovered {} is the master'.format(master))
            leadership.leader_set(master=master)

    # The name of this crontab has changed. It will get regenerated.
    if os.path.exists('/etc/cron.d/postgresql'):
        hookenv.log('Removing old crontab')
        os.unlink('/etc/cron.d/postgresql')

    # Older usernames were generated from the relation id,
    # and really old ones contained random components. This made it
    # problematic to restore a database into a fresh environment,
    # because the new usernames would not match the old usernames and
    # none of the database permissions would match. We now generate
    # usernames using just the client service name, so restoring a
    # database into a fresh environment will work provided the service
    # names match. We want to update the old usernames in upgraded
    # services to the new format to improve their disaster recovery
    # story.
    for relname, superuser in [('db', False), ('db-admin', True)]:
        for client_rel in rels[relname].values():
            hookenv.log('Migrating database users for {}'.format(client_rel))
            password = client_rel.local.get('password', host.pwgen())
            old_username = client_rel.local.get('user')
            new_username = postgresql.username(client_rel.service,
                                               superuser, False)
            if old_username and old_username != new_username:
                migrate_user(old_username, new_username, password, superuser)
                client_rel.local['user'] = new_username
                client_rel.local['password'] = password

            old_username = client_rel.local.get('schema_user')
            if old_username and old_username != new_username:
                migrate_user(old_username, new_username, password, superuser)
                client_rel.local['schema_user'] = new_username
                client_rel.local['schema_password'] = password

    # Admin relations used to get 'all' published as the database name,
    # which was bogus.
    for client_rel in rels['db-admin'].values():
        if client_rel.local.get('database') == 'all':
            client_rel.local['database'] = client_rel.service

    # Reconfigure PostgreSQL and republish client relations.
    reactive.remove_state('postgresql.cluster.configured')
    reactive.remove_state('postgresql.client.published')

    # Don't recreate the cluster.
    reactive.set_state('postgresql.cluster.created')

    # Set the postgresql.replication.cloned flag, so we don't rebuild
    # standbys when upgrading the charm from a pre-reactive version.
    reactive.set_state('postgresql.replication.cloned')
Example #7
def wal_e_restore():
    reactive.remove_state("action.wal-e-restore")
    params = hookenv.action_get()
    backup = params["backup-name"].strip().replace("-", "_")
    storage_uri = params["storage-uri"].strip()

    ship_uri = hookenv.config().get("wal_e_storage_uri")
    if storage_uri == ship_uri:
        hookenv.action_fail(
            "The storage-uri parameter is identical to "
            "the wal_e_storage_uri config setting. Your "
            "restoration source cannot be the same as the "
            "folder you are archiving too to avoid corrupting "
            "the backups."
        )
        return

    if not params["confirm"]:
        m = "Recovery from {}.".format(storage_uri)
        if ship_uri:
            m += "\nContents of {} will be destroyed.".format(ship_uri)
        m += "\nExisting local database will be destroyed."
        m += "\nRerun action with 'confirm=true' to proceed."
        hookenv.action_set({"info": m})
        return

    with tempfile.TemporaryDirectory(prefix="wal-e", suffix="envdir") as envdir:
        update_wal_e_env_dir(envdir, storage_uri)

        # Confirm there is a backup to restore
        backups = wal_e_list_backups(envdir)
        if not backups:
            hookenv.action_fail("No backups found at {}".format(storage_uri))
            return
        if backup != "LATEST" and backup not in (b["name"] for b in backups):
            hookenv.action_fail("Backup {} not found".format(backup))
            return

        # Shutdown PostgreSQL. Note we want this action to run synchronously,
        # so there is no opportunity to ask permission from the leader. If
        # there are other units cloning this database, those clone operations
        # will fail. Which seems preferable to blocking a recovery operation
        # in any case, because if we are doing disaster recovery we generally
        # want to do it right now.
        status_set("maintenance", "Stopping PostgreSQL for backup restoration")
        postgresql.stop()

        # Trash the existing database. It's dangerous to do this first, but
        # we probably need the space.
        data_dir = postgresql.data_dir()  # May be a symlink
        for content in os.listdir(data_dir):
            cpath = os.path.join(data_dir, content)
            if os.path.isdir(cpath) and not os.path.islink(cpath):
                shutil.rmtree(cpath)
            else:
                os.remove(cpath)

        # WAL-E recover
        status_set("maintenance", "Restoring backup {}".format(backup))
        wal_e_run(["backup-fetch", data_dir, backup], envdir=envdir)

        # Create recovery.conf to complete recovery
        is_master = reactive.is_state("postgresql.replication.is_master")
        standby_mode = "off" if is_master else "on"
        if params.get("target-time"):
            target_time = "recovery_target_time='{}'" "".format(params["target-time"])
        else:
            target_time = ""
        target_action = "promote" if is_master else "shutdown"
        immediate = "" if is_master else "recovery_target='immediate'"
        helpers.write(
            postgresql.recovery_conf_path(),
            dedent(
                """\
                             # Managed by Juju. PITR in progress.
                             standby_mode = {}
                             restore_command='{}'
                             recovery_target_timeline = {}
                             recovery_target_action = {}
                             {}
                             {}
                             """
            ).format(
                standby_mode,
                wal_e_restore_command(envdir=envdir),
                params["target-timeline"],
                target_action,
                target_time,
                immediate,
            ),
            mode=0o600,
            user="******",
            group="postgres",
        )

        # Avoid circular import. We could also avoid the import entirely
        # with a sufficiently complex set of handlers in the replication
        # module, but that seems to be a worse solution. Better to break
        # out this action into a separate module.
        from reactive.postgresql import replication

        if is_master:
            if ship_uri:
                # If master, trash the configured wal-e storage. This may
                # contain WAL and backups from the old cluster which will
                # conflict with the new cluster. Hopefully it does not
                # contain anything important, because we have no way to
                # prompt the user for confirmation.
                wal_e_run(["delete", "--confirm", "everything"])

            # Then, wait for recovery and promotion.
            postgresql.start()
            con = postgresql.connect()
            cur = con.cursor()
            while True:
                if postgresql.has_version("10"):
                    cur.execute(
                        """SELECT pg_is_in_recovery(),
                                          pg_last_wal_replay_lsn()"""
                    )
                else:
                    cur.execute(
                        """SELECT pg_is_in_recovery(),
                                          pg_last_xlog_replay_location()"""
                    )
                in_rec, loc = cur.fetchone()
                if not in_rec:
                    break
                status_set("maintenance", "Recovery at {}".format(loc))
                time.sleep(10)
        else:
            # If standby, startup and wait for recovery to complete and
            # shutdown.
            status_set("maintenance", "Recovery")
            # Startup might shutdown immediately and look like a failure.
            postgresql.start(ignore_failure=True)
            # No recovery point status yet for standbys, as we would need
            # to handle connection failures when the DB shuts down. We
            # should do this.
            while postgresql.is_running():
                time.sleep(5)
            replication.update_recovery_conf(follow=replication.get_master())

    # Reactive handlers will deal with the rest of the cleanup.
    # eg. ensuring required users and roles exist
    replication.update_replication_states()
    reactive.remove_state("postgresql.cluster.configured")
    reactive.toggle_state("postgresql.cluster.is_running", postgresql.is_running())
    reactive.remove_state("postgresql.nagios.user_ensured")
    reactive.remove_state("postgresql.replication.replication_user_created")
    reactive.remove_state("postgresql.client.published")