def db_relation_mirror(rel):
    """Non-masters mirror relation information from the master."""
    master = replication.get_master()
    master_keys = [
        "database",
        "user",
        "password",
        "roles",
        "schema_user",
        "schema_password",
        "extensions",
    ]
    master_info = rel.peers.get(master)
    if master_info is None:
        hookenv.log("Waiting for {} to join {}".format(master, rel))
        return
    hookenv.log("Mirroring {} database credentials from {}".format(rel, master))
    rel.local.update({k: master_info.get(k) for k in master_keys})

def block_on_maintenance_mode():
    if hookenv.leader_get("maintenance_mode"):
        master = replication.get_master()
        if master is None:
            msg = "Application in maintenance mode"
        elif master == hookenv.local_unit():
            msg = "Master unit in maintenance mode"
        else:
            msg = "Standby unit in maintenance mode"
        hookenv.status_set("blocked", msg)
        hookenv.log("Application is in maintenance mode, terminating hook", hookenv.WARNING)
        raise SystemExit(0)  # Terminate now without error. hookenv.atexit() not invoked.

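# Illustrative sketch only, not part of the charm: how a reactive handler
# might call block_on_maintenance_mode() as an early guard. The handler name
# and the flag it watches are hypothetical.
#
# @reactive.when("config.changed")
# def example_handler():
#     block_on_maintenance_mode()  # raises SystemExit(0) while in maintenance mode
#     ...                          # normal hook work only runs otherwise
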
def db_relation_common(rel):
    """Publish unit specific relation details."""
    local = rel.local
    if "database" not in local:
        return  # Not yet ready.

    # Version number, allowing clients to adjust or block if their
    # expectations are not met.
    local["version"] = postgresql.version()

    # Calculate the state of this unit. 'standalone' will disappear
    # in a future version of this interface, as this state was
    # only needed to deal with race conditions now solved by
    # Juju leadership. We check for is_primary() rather than
    # the postgresql.replication.is_master reactive state to
    # publish the correct state when we are using manual replication
    # (there might be multiple independent masters, possibly useful for
    # sharding, or perhaps this is a multi-master BDR setup).
    if postgresql.is_primary():
        if reactive.helpers.is_state("postgresql.replication.has_peers"):
            local["state"] = "master"
        else:
            local["state"] = "standalone"
    else:
        local["state"] = "hot standby"

    # Host is the private ip address, but this might change and
    # become the address of an attached proxy or alternative peer
    # if this unit is in maintenance.
    local["host"] = ingress_address(local.relname, local.relid)

    # Port will be 5432, unless the user has overridden it or
    # something very weird happened when the packages were installed.
    local["port"] = str(postgresql.port())

    # The list of remote units on this relation granted access.
    # This is to avoid the race condition where a new client unit
    # joins an existing client relation and sees valid credentials,
    # before we have had a chance to grant it access.
    local["allowed-units"] = " ".join(
        unit for unit, relinfo in rel.items() if len(incoming_addresses(relinfo)) > 0
    )

    # The list of IP address ranges on this relation granted access.
    # This will replace allowed-units, which does not work with cross
    # model relations due to the anonymization of the external client.
    local["allowed-subnets"] = ",".join(
        sorted({r: True for r in chain(*[incoming_addresses(relinfo) for relinfo in rel.values()])}.keys())
    )

    # v2 protocol. Publish connection strings for this unit and its peers.
    # Clients should use these connection strings in favour of the old
    # host, port, database settings. A single proxy unit can thus
    # publish several end points to clients.
    master = replication.get_master()
    if replication.is_master():
        master_relinfo = local
    else:
        master_relinfo = rel.peers.get(master)
    local["master"] = relinfo_to_cs(master_relinfo)

    all_relinfo = list(rel.peers.values()) if rel.peers else []
    all_relinfo.append(rel.local)
    standbys = filter(
        None,
        [relinfo_to_cs(relinfo) for relinfo in all_relinfo if relinfo.unit != master],
    )
    local["standbys"] = "\n".join(sorted(standbys)) or None

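# Illustrative sketch only, hypothetical and not part of the charm: how a
# client might consume the v2 keys published by db_relation_common(). The
# function name and the shape of relation_data are assumptions.
#
# def example_parse_v2(relation_data):
#     master_cs = relation_data.get("master")  # master connection string
#     standbys = (relation_data.get("standbys") or "").splitlines()
#     subnets = (relation_data.get("allowed-subnets") or "").split(",")
#     return master_cs, standbys, subnets
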
def upgrade_charm():
    workloadstatus.status_set("maintenance", "Upgrading charm")

    rels = context.Relations()

    # The master is now appointed by the leader.
    if hookenv.is_leader():
        master = replication.get_master()
        if not master:
            master = hookenv.local_unit()
            peer_rel = helpers.get_peer_relation()
            if peer_rel:
                for peer_relinfo in peer_rel.values():
                    if peer_relinfo.get("state") == "master":
                        master = peer_relinfo.unit
                        break
            hookenv.log("Discovered {} is the master".format(master))
            leadership.leader_set(master=master)

    # The name of this crontab has changed. It will get regenerated.
    if os.path.exists("/etc/cron.d/postgresql"):
        hookenv.log("Removing old crontab")
        os.unlink("/etc/cron.d/postgresql")

    # Older usernames were generated from the relation id, and really old
    # ones contained random components. This made it problematic to restore
    # a database into a fresh environment, because the new usernames would
    # not match the old usernames and none of the database permissions would
    # match. We now generate usernames using just the client service name,
    # so restoring a database into a fresh environment will work provided
    # the service names match. We want to update the old usernames in
    # upgraded services to the new format to improve their disaster recovery
    # story. (An illustrative before/after sketch follows this function.)
    for relname, superuser in [("db", False), ("db-admin", True)]:
        for client_rel in rels[relname].values():
            hookenv.log("Migrating database users for {}".format(client_rel))
            password = client_rel.local.get("password", host.pwgen())
            old_username = client_rel.local.get("user")
            new_username = postgresql.username(client_rel.service, superuser, False)
            if old_username and old_username != new_username:
                migrate_user(old_username, new_username, password, superuser)
                client_rel.local["user"] = new_username
                client_rel.local["password"] = password

            old_username = client_rel.local.get("schema_user")
            if old_username and old_username != new_username:
                migrate_user(old_username, new_username, password, superuser)
                client_rel.local["schema_user"] = new_username
                client_rel.local["schema_password"] = password

    # Admin relations used to get 'all' published as the database name,
    # which was bogus.
    for client_rel in rels["db-admin"].values():
        if client_rel.local.get("database") == "all":
            client_rel.local["database"] = client_rel.service

    # Reconfigure PostgreSQL and republish client relations.
    reactive.remove_state("postgresql.cluster.configured")
    reactive.remove_state("postgresql.client.published")

    # Don't recreate the cluster.
    reactive.set_state("postgresql.cluster.created")

    # Set the postgresql.replication.cloned flag, so we don't rebuild
    # standbys when upgrading the charm from a pre-reactive version.
    reactive.set_state("postgresql.replication.cloned")

    # Publish which node we are following
    peer_rel = helpers.get_peer_relation()
    if peer_rel and "following" not in peer_rel.local:
        following = unitdata.kv().get("postgresql.replication.following")
        if following is None and not replication.is_master():
            following = replication.get_master()
        peer_rel.local["following"] = following

    # Ensure storage that was attached but ignored is no longer ignored.
    if not reactive.is_state("postgresql.storage.pgdata.attached"):
        if hookenv.storage_list("pgdata"):
            storage.attach()

    # Ensure client usernames and passwords match leader settings.
    for relname in ("db", "db-admin"):
        for rel in rels[relname].values():
            del rel.local["user"]
            del rel.local["password"]

    # Ensure the configured version is cached.
    postgresql.version()

    # Skip checks for pre-existing databases, as that has already happened.
    reactive.set_state("postgresql.cluster.initial-check")

    # Reinstall support scripts
    reactive.remove_state("postgresql.cluster.support-scripts")

    # Ensure that systemd is managing the PostgreSQL process
    if host.init_is_systemd() and not reactive.is_flag_set("postgresql.upgrade.systemd"):
        reactive.set_flag("postgresql.upgrade.systemd")
        if reactive.is_flag_set("postgresql.cluster.is_running"):
            hookenv.log("Restarting PostgreSQL under systemd", hookenv.WARNING)
            reactive.clear_flag("postgresql.cluster.is_running")
            postgresql.stop_pgctlcluster()

    # Update the PGDG source, in case the signing key has changed.
    config = hookenv.config()
    if config["pgdg"]:
        service.add_pgdg_source()

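# Illustrative sketch only: the intent of the username migration performed in
# upgrade_charm(). The exact naming scheme is whatever postgresql.username()
# produces; the example values below are made up.
#
#   old_username = "juju_db_12_ab3f"  # old style, tied to the relation id
#   new_username = "juju_myclient"    # new style, derived from the client service name
#   migrate_user(old_username, new_username, password, superuser=False)
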
def wal_e_restore():
    reactive.remove_state("action.wal-e-restore")
    params = hookenv.action_get()
    backup = params["backup-name"].strip().replace("-", "_")
    storage_uri = params["storage-uri"].strip()

    ship_uri = hookenv.config().get("wal_e_storage_uri")
    if storage_uri == ship_uri:
        hookenv.action_fail(
            "The storage-uri parameter is identical to "
            "the wal_e_storage_uri config setting. Your "
            "restoration source cannot be the same as the "
            "folder you are archiving to, to avoid corrupting "
            "the backups."
        )
        return

    if not params["confirm"]:
        m = "Recovery from {}.".format(storage_uri)
        if ship_uri:
            m += "\nContents of {} will be destroyed.".format(ship_uri)
        m += "\nExisting local database will be destroyed."
        m += "\nRerun action with 'confirm=true' to proceed."
        hookenv.action_set({"info": m})
        return

    with tempfile.TemporaryDirectory(prefix="wal-e", suffix="envdir") as envdir:
        update_wal_e_env_dir(envdir, storage_uri)

        # Confirm there is a backup to restore
        backups = wal_e_list_backups(envdir)
        if not backups:
            hookenv.action_fail("No backups found at {}".format(storage_uri))
            return
        if backup != "LATEST" and backup not in (b["name"] for b in backups):
            hookenv.action_fail("Backup {} not found".format(backup))
            return

        # Shutdown PostgreSQL. Note we want this action to run synchronously,
        # so there is no opportunity to ask permission from the leader. If
        # there are other units cloning this database, those clone operations
        # will fail, which seems preferable to blocking a recovery operation
        # in any case, because if we are doing disaster recovery we generally
        # want to do it right now.
        status_set("maintenance", "Stopping PostgreSQL for backup restoration")
        postgresql.stop()

        # Trash the existing database. It's dangerous to do this first, but
        # we probably need the space.
        data_dir = postgresql.data_dir()  # May be a symlink
        for content in os.listdir(data_dir):
            cpath = os.path.join(data_dir, content)
            if os.path.isdir(cpath) and not os.path.islink(cpath):
                shutil.rmtree(cpath)
            else:
                os.remove(cpath)

        # WAL-E recover
        status_set("maintenance", "Restoring backup {}".format(backup))
        wal_e_run(["backup-fetch", data_dir, backup], envdir=envdir)

        # Create recovery.conf to complete recovery
        is_master = reactive.is_state("postgresql.replication.is_master")
        standby_mode = "off" if is_master else "on"
        if params.get("target-time"):
            target_time = "recovery_target_time='{}'".format(params["target-time"])
        else:
            target_time = ""
        target_action = "promote" if is_master else "shutdown"
        immediate = "" if is_master else "recovery_target='immediate'"
        helpers.write(
            postgresql.recovery_conf_path(),
            dedent(
                """\
                # Managed by Juju. PITR in progress.
                standby_mode = {}
                restore_command='{}'
                recovery_target_timeline = {}
                recovery_target_action = {}
                {}
                {}
                """
            ).format(
                standby_mode,
                wal_e_restore_command(envdir=envdir),
                params["target-timeline"],
                target_action,
                target_time,
                immediate,
            ),
            mode=0o600,
            user="postgres",
            group="postgres",
        )

        # Avoid circular import. We could also avoid the import entirely
        # with a sufficiently complex set of handlers in the replication
        # module, but that seems to be a worse solution. Better to break
        # out this action into a separate module.
        from reactive.postgresql import replication

        if is_master:
            if ship_uri:
                # If master, trash the configured wal-e storage. This may
                # contain WAL and backups from the old cluster which will
                # conflict with the new cluster. Hopefully it does not
                # contain anything important, because we have no way to
                # prompt the user for confirmation.
wal_e_run(["delete", "--confirm", "everything"]) # Then, wait for recovery and promotion. postgresql.start() con = postgresql.connect() cur = con.cursor() while True: if postgresql.has_version("10"): cur.execute( """SELECT pg_is_in_recovery(), pg_last_wal_replay_lsn()""" ) else: cur.execute( """SELECT pg_is_in_recovery(), pg_last_xlog_replay_location()""" ) in_rec, loc = cur.fetchone() if not in_rec: break status_set("maintenance", "Recovery at {}".format(loc)) time.sleep(10) else: # If standby, startup and wait for recovery to complete and # shutdown. status_set("maintenance", "Recovery") # Startup might shutdown immediately and look like a failure. postgresql.start(ignore_failure=True) # No recovery point status yet for standbys, as we would need # to handle connection failures when the DB shuts down. We # should do this. while postgresql.is_running(): time.sleep(5) replication.update_recovery_conf(follow=replication.get_master()) # Reactive handlers will deal with the rest of the cleanup. # eg. ensuring required users and roles exist replication.update_replication_states() reactive.remove_state("postgresql.cluster.configured") reactive.toggle_state("postgresql.cluster.is_running", postgresql.is_running()) reactive.remove_state("postgresql.nagios.user_ensured") reactive.remove_state("postgresql.replication.replication_user_created") reactive.remove_state("postgresql.client.published")