def create_cluster():
    """Create the local PostgreSQL cluster and flag postgresql.cluster.created."""
    conf_path = postgresql.postgresql_conf_path()
    # Paranoia: the packaging must not have created a default cluster already.
    assert not os.path.exists(conf_path), 'inhibit_default_cluster_creation() failed'
    assert not os.path.exists(postgresql.data_dir())
    postgresql.create_cluster()
    reactive.set_state('postgresql.cluster.created')
def remount():
    """Move PGDATA onto the external volume and symlink the old path to it."""
    # Touching the data directory while PostgreSQL is live would be
    # really, really bad — stop the service first.
    if reactive.is_state('postgresql.cluster.is_running'):
        service.stop()

    old_data_dir = postgresql.data_dir()
    new_data_dir = os.path.join(external_volume_mount, 'postgresql',
                                postgresql.version(), 'main')
    backup_data_dir = '{}-{}'.format(old_data_dir, int(time.time()))

    if os.path.isdir(new_data_dir):
        hookenv.log('Remounting existing database at {}'.format(new_data_dir),
                    WARNING)
    else:
        status_set('maintenance',
                   'Migrating data from {} to {}'.format(old_data_dir,
                                                         new_data_dir))
        helpers.makedirs(new_data_dir, mode=0o770,
                         user='******', group='postgres')
        rsync_cmd = ['rsync', '-av', old_data_dir + '/', new_data_dir + '/']
        hookenv.log('Running {}'.format(' '.join(rsync_cmd)), DEBUG)
        try:
            subprocess.check_call(rsync_cmd)
        except subprocess.CalledProcessError:
            status_set('blocked',
                       'Failed to sync data from {} to {}'
                       ''.format(old_data_dir, new_data_dir))
            return

    # Keep the old tree as a timestamped backup; the original path
    # becomes a symlink into the external volume.
    os.replace(old_data_dir, backup_data_dir)
    os.symlink(new_data_dir, old_data_dir)
    fix_perms(new_data_dir)
    reactive.remove_state('postgresql.storage.needs_remount')
def test_simple_paths(self, version):
    # The trivial path helpers exist for consistency and ease of
    # mocking; all of them should derive from the mocked version.
    version.return_value = "9.9"
    expected = [
        (postgresql.config_dir, "/etc/postgresql/9.9/main"),
        (postgresql.data_dir, "/var/lib/postgresql/9.9/main"),
        (postgresql.postgresql_conf_path, "/etc/postgresql/9.9/main/postgresql.conf"),
        (postgresql.pg_hba_conf_path, "/etc/postgresql/9.9/main/pg_hba.conf"),
        (postgresql.pg_ident_conf_path, "/etc/postgresql/9.9/main/pg_ident.conf"),
        (postgresql.recovery_conf_path, "/var/lib/postgresql/9.9/main/recovery.conf"),
        (postgresql.pg_ctl_path, "/usr/lib/postgresql/9.9/bin/pg_ctl"),
        (postgresql.postgres_path, "/usr/lib/postgresql/9.9/bin/postgres"),
    ]
    for helper, path in expected:
        self.assertEqual(helper(), path)
def test_simple_paths(self, version):
    # Each trivial path helper should build its path from the mocked
    # version; these helpers exist for consistency and ease of mocking.
    version.return_value = '9.9'
    checks = {
        postgresql.config_dir: '/etc/postgresql/9.9/main',
        postgresql.data_dir: '/var/lib/postgresql/9.9/main',
        postgresql.postgresql_conf_path: '/etc/postgresql/9.9/main/postgresql.conf',
        postgresql.pg_hba_conf_path: '/etc/postgresql/9.9/main/pg_hba.conf',
        postgresql.pg_ident_conf_path: '/etc/postgresql/9.9/main/pg_ident.conf',
        postgresql.recovery_conf_path: '/var/lib/postgresql/9.9/main/recovery.conf',
        postgresql.pg_ctl_path: '/usr/lib/postgresql/9.9/bin/pg_ctl',
        postgresql.postgres_path: '/usr/lib/postgresql/9.9/bin/postgres',
    }
    for helper, expected in checks.items():
        self.assertEqual(helper(), expected)
def migrate_pgdata():
    """Relocate PGDATA to the configured storage path.

    Rsync the data from /var/lib/postgresql/9.x/main into the new
    location, keep the old tree as a timestamped backup, and replace
    the original PGDATA with a symlink. The original may already be a
    symlink, either from the block storage broker or manual changes
    by admins.
    """
    # Attempting the migration while PostgreSQL is live would be
    # really, really bad.
    if reactive.is_state('postgresql.cluster.is_running'):
        service.stop()

    old_data_dir = postgresql.data_dir()
    new_data_dir = unitdata.kv().get(pgdata_path_key)
    backup_data_dir = '{}-{}'.format(old_data_dir, int(time.time()))

    if os.path.isdir(new_data_dir):
        # Never happens with Juju storage (at least with 2.0), because
        # old partitions cannot be reused — kept for the future.
        hookenv.log('Remounting existing database at {}'.format(new_data_dir),
                    WARNING)
    else:
        status_set('maintenance',
                   'Migrating data from {} to {}'.format(old_data_dir,
                                                         new_data_dir))
        helpers.makedirs(new_data_dir, mode=0o770,
                         user='******', group='postgres')
        rsync_cmd = ['rsync', '-av', old_data_dir + '/', new_data_dir + '/']
        hookenv.log('Running {}'.format(' '.join(rsync_cmd)), DEBUG)
        try:
            subprocess.check_call(rsync_cmd, universal_newlines=True)
        except subprocess.CalledProcessError:
            status_set('blocked',
                       'Failed to sync data from {} to {}'
                       ''.format(old_data_dir, new_data_dir))
            return

    os.replace(old_data_dir, backup_data_dir)
    os.symlink(new_data_dir, old_data_dir)
    fix_perms(new_data_dir)
    reactive.set_state('postgresql.storage.pgdata.migrated')
def main():
    """Per-hook entry point: guard against pre-existing databases,
    refresh the cluster running state, and force reconfiguration."""
    if not (reactive.is_state("postgresql.cluster.created") or reactive.is_state("postgresql.cluster.initial-check")):
        # We need to check for existence of an existing database,
        # before the main PostgreSQL package has been installed.
        # If there is one, abort rather than risk destroying data.
        # We need to do this here, as the apt layer may pull in
        # the main PostgreSQL package through dependencies, per
        # lp:1749284
        if os.path.exists(postgresql.postgresql_conf_path()):
            hookenv.status_set(
                "blocked",
                "PostgreSQL config from previous install found at {}".format(postgresql.postgresql_conf_path()),
            )
        elif os.path.exists(postgresql.data_dir()):
            # BUG FIX: was postgresql.postgresql.data_dir(), which raised
            # AttributeError instead of setting the blocked status.
            hookenv.status_set(
                "blocked",
                "PostgreSQL database from previous install found at {}".format(postgresql.data_dir()),
            )
        else:
            hookenv.log("No pre-existing PostgreSQL database found")
            reactive.set_state("postgresql.cluster.initial-check")

    # Don't trust this state from the last hook. Daemons may have
    # crashed and servers rebooted since then.
    if reactive.is_state("postgresql.cluster.created"):
        try:
            reactive.toggle_state("postgresql.cluster.is_running", postgresql.is_running())
        except subprocess.CalledProcessError as x:
            if not reactive.is_state("workloadstatus.blocked"):
                # NOTE(review): x.stderr is only populated when the failing
                # call captured stderr — confirm postgresql.is_running does.
                status_set(
                    "blocked",
                    "Local PostgreSQL cluster is corrupt: {}".format(x.stderr),
                )

    # Reconfigure PostgreSQL. While we don't strictly speaking need
    # to do this every hook, we do need to do this almost every hook,
    # since even things like the number of peers or number of clients
    # can affect minimum viable configuration settings.
    reactive.remove_state("postgresql.cluster.configured")

    log_states()  # Debug noise.
def remount():
    """Relocate PGDATA onto the external volume, leaving a symlink behind."""
    if reactive.is_state("postgresql.cluster.is_running"):
        # Attempting this while PostgreSQL is live would be really,
        # really bad.
        service.stop()

    old_data_dir = postgresql.data_dir()
    new_data_dir = os.path.join(
        external_volume_mount, "postgresql", postgresql.version(), "main"
    )
    backup_data_dir = "{}-{}".format(old_data_dir, int(time.time()))

    if os.path.isdir(new_data_dir):
        hookenv.log(
            "Remounting existing database at {}".format(new_data_dir), WARNING
        )
    else:
        status_set(
            "maintenance",
            "Migrating data from {} to {}".format(old_data_dir, new_data_dir),
        )
        helpers.makedirs(
            new_data_dir, mode=0o770, user="******", group="postgres"
        )
        rsync_cmd = ["rsync", "-av", old_data_dir + "/", new_data_dir + "/"]
        hookenv.log("Running {}".format(" ".join(rsync_cmd)), DEBUG)
        try:
            subprocess.check_call(rsync_cmd)
        except subprocess.CalledProcessError:
            status_set(
                "blocked",
                "Failed to sync data from {} to {}"
                "".format(old_data_dir, new_data_dir),
            )
            return

    # Preserve the old tree as a timestamped backup, then point the
    # original path at the new location.
    os.replace(old_data_dir, backup_data_dir)
    os.symlink(new_data_dir, old_data_dir)
    fix_perms(new_data_dir)
    reactive.remove_state("postgresql.storage.needs_remount")
def attach():
    """Record freshly attached PGDATA storage and queue the migration."""
    mount = hookenv.storage_get()['location']
    pgdata = os.path.join(mount, postgresql.version(), 'main')
    kv = unitdata.kv()
    kv.set(pgdata_mount_key, mount)
    kv.set(pgdata_path_key, pgdata)
    hookenv.log('PGDATA storage attached at {}'.format(mount))

    # Reusing an old mount never happens with Juju 2.0 as we can't
    # reuse an old partition. This check is here for the future.
    existingdb = os.path.exists(pgdata)
    fits = shutil.disk_usage(postgresql.data_dir()).used <= shutil.disk_usage(mount).free
    if existingdb or fits:
        apt.queue_install(['rsync'])
        coordinator.acquire('restart')
        reactive.set_state('postgresql.storage.pgdata.attached')
    else:
        hookenv.status_set('blocked', 'Not enough free space in pgdata storage')
def needs_remount():
    """True when the external volume is mounted but PGDATA is not yet a symlink."""
    if not os.path.isdir(external_volume_mount):
        return False
    return not os.path.islink(postgresql.data_dir())
def wal_e_backup_command():
    """Return the shell command line that pushes a base backup via WAL-E."""
    template = 'envdir {} wal-e backup-push {}'
    return template.format(wal_e_env_dir(), postgresql.data_dir())
def wal_e_restore():
    """Juju action: restore the local cluster from a WAL-E backup.

    Reads backup-name, storage-uri, confirm, target-time and
    target-timeline from the action parameters. Destroys the local
    database (and, on the master, the configured WAL-E storage) before
    fetching the backup and replaying WAL to the requested target.
    """
    reactive.remove_state("action.wal-e-restore")
    params = hookenv.action_get()
    # WAL-E stores backup names with underscores.
    backup = params["backup-name"].strip().replace("-", "_")
    storage_uri = params["storage-uri"].strip()
    ship_uri = hookenv.config().get("wal_e_storage_uri")
    # Refuse to restore from the same location we archive to, which
    # would corrupt the backups.
    if storage_uri == ship_uri:
        hookenv.action_fail(
            "The storage-uri parameter is identical to "
            "the wal_e_storage_uri config setting. Your "
            "restoration source cannot be the same as the "
            "folder you are archiving too to avoid corrupting "
            "the backups."
        )
        return
    # Without confirm=true, describe what would be destroyed and stop.
    if not params["confirm"]:
        m = "Recovery from {}.".format(storage_uri)
        if ship_uri:
            m += "\nContents of {} will be destroyed.".format(ship_uri)
        m += "\nExisting local database will be destroyed."
        m += "\nRerun action with 'confirm=true' to proceed."
        hookenv.action_set({"info": m})
        return
    with tempfile.TemporaryDirectory(prefix="wal-e", suffix="envdir") as envdir:
        update_wal_e_env_dir(envdir, storage_uri)

        # Confirm there is a backup to restore
        backups = wal_e_list_backups(envdir)
        if not backups:
            hookenv.action_fail("No backups found at {}".format(storage_uri))
            return
        if backup != "LATEST" and backup not in (b["name"] for b in backups):
            hookenv.action_fail("Backup {} not found".format(backup))
            return

        # Shutdown PostgreSQL. Note we want this action to run synchronously,
        # so there is no opportunity to ask permission from the leader. If
        # there are other units cloning this database, those clone operations
        # will fail. Which seems preferable to blocking a recovery operation
        # in any case, because if we are doing disaster recovery we generally
        # want to do it right now.
        status_set("maintenance", "Stopping PostgreSQL for backup restoration")
        postgresql.stop()

        # Trash the existing database. It's dangerous to do this first, but
        # we probably need the space.
        data_dir = postgresql.data_dir()  # May be a symlink
        # Empty the directory's contents rather than removing the
        # directory itself, so any symlink at data_dir stays valid.
        for content in os.listdir(data_dir):
            cpath = os.path.join(data_dir, content)
            if os.path.isdir(cpath) and not os.path.islink(cpath):
                shutil.rmtree(cpath)
            else:
                os.remove(cpath)

        # WAL-E recover
        status_set("maintenance", "Restoring backup {}".format(backup))
        wal_e_run(["backup-fetch", data_dir, backup], envdir=envdir)

        # Create recovery.conf to complete recovery
        is_master = reactive.is_state("postgresql.replication.is_master")
        standby_mode = "off" if is_master else "on"
        if params.get("target-time"):
            target_time = "recovery_target_time='{}'" "".format(params["target-time"])
        else:
            target_time = ""
        # Masters promote once the target is reached; standbys shut down
        # and recover only to the restore point.
        target_action = "promote" if is_master else "shutdown"
        immediate = "" if is_master else "recovery_target='immediate'"
        helpers.write(
            postgresql.recovery_conf_path(),
            dedent(
                """\
                # Managed by Juju. PITR in progress.
                standby_mode = {}
                restore_command='{}'
                recovery_target_timeline = {}
                recovery_target_action = {}
                {}
                {}
                """
            ).format(
                standby_mode,
                wal_e_restore_command(envdir=envdir),
                params["target-timeline"],
                target_action,
                target_time,
                immediate,
            ),
            mode=0o600,
            user="******",
            group="postgres",
        )

        # Avoid circular import. We could also avoid the import entirely
        # with a sufficiently complex set of handlers in the replication
        # module, but that seems to be a worse solution. Better to break
        # out this action into a separate module.
        from reactive.postgresql import replication

        if is_master:
            if ship_uri:
                # If master, trash the configured wal-e storage. This may
                # contain WAL and backups from the old cluster which will
                # conflict with the new cluster. Hopefully it does not
                # contain anything important, because we have no way to
                # prompt the user for confirmation.
                # NOTE(review): no envdir= here, so this presumably runs
                # against the configured (ship) storage — confirm.
                wal_e_run(["delete", "--confirm", "everything"])
            # Then, wait for recovery and promotion.
            postgresql.start()
            con = postgresql.connect()
            cur = con.cursor()
            # Poll until pg_is_in_recovery() goes false (i.e. promoted).
            while True:
                if postgresql.has_version("10"):
                    cur.execute(
                        """SELECT pg_is_in_recovery(), pg_last_wal_replay_lsn()"""
                    )
                else:
                    cur.execute(
                        """SELECT pg_is_in_recovery(), pg_last_xlog_replay_location()"""
                    )
                in_rec, loc = cur.fetchone()
                if not in_rec:
                    break
                status_set("maintenance", "Recovery at {}".format(loc))
                time.sleep(10)
        else:
            # If standby, startup and wait for recovery to complete and
            # shutdown.
            status_set("maintenance", "Recovery")
            # Startup might shutdown immediately and look like a failure.
            postgresql.start(ignore_failure=True)
            # No recovery point status yet for standbys, as we would need
            # to handle connection failures when the DB shuts down. We
            # should do this.
            while postgresql.is_running():
                time.sleep(5)
            replication.update_recovery_conf(follow=replication.get_master())

    # Reactive handlers will deal with the rest of the cleanup.
    # eg. ensuring required users and roles exist
    replication.update_replication_states()
    reactive.remove_state("postgresql.cluster.configured")
    reactive.toggle_state("postgresql.cluster.is_running", postgresql.is_running())
    reactive.remove_state("postgresql.nagios.user_ensured")
    reactive.remove_state("postgresql.replication.replication_user_created")
    reactive.remove_state("postgresql.client.published")
def wal_e_backup_command():
    """Return the snap-packaged WAL-E base-backup push command line."""
    envdir = wal_e_env_dir()
    data_dir = postgresql.data_dir()
    return "/snap/bin/wal-e.envdir {} /snap/bin/wal-e backup-push {}".format(envdir, data_dir)
def clone_master():
    """Wipe the local cluster and pg_basebackup a fresh copy from the master."""
    master = get_master()
    relinfo = helpers.get_peer_relation()[master]

    # Be paranoid since we are about to destroy data.
    assert not reactive.helpers.is_state("postgresql.replication.is_master")
    assert not reactive.helpers.is_state("postgresql.cluster.is_running")

    # We use realpath on data_dir as it may have been replaced with
    # a symbolic link, so we empty and recreate the actual directory
    # and the links remain in place.
    real_data_dir = os.path.realpath(postgresql.data_dir())
    if os.path.exists(real_data_dir):
        hookenv.log("Removing {} in preparation for clone".format(real_data_dir))
        shutil.rmtree(real_data_dir)
    helpers.makedirs(real_data_dir, mode=0o700, user="******", group="postgres")

    # PostgreSQL 10 renamed the pg_basebackup WAL streaming option.
    wal_method = "--wal-method=stream" if postgresql.has_version("10") else "--xlog-method=stream"
    cmd = [
        "sudo",
        "-H",  # -H needed to locate $HOME/.pgpass
        "-u",
        "postgres",
        "pg_basebackup",
        "-D",
        postgresql.data_dir(),
        "-h",
        relinfo["host"],
        "-p",
        relinfo["port"],
        "--checkpoint=fast",
        "--progress",
        wal_method,
        "--no-password",
        "--username=_juju_repl",
    ]
    hookenv.log("Cloning {} with {}".format(master, " ".join(cmd)))
    status_set("maintenance", "Cloning {}".format(master))
    try:
        # Switch to a directory the postgres user can access.
        with helpers.switch_cwd("/tmp"):
            subprocess.check_call(cmd, universal_newlines=True)
    except subprocess.CalledProcessError as x:
        hookenv.log("Clone failed with {}".format(x), ERROR)
        # We failed, and the local cluster is broken.
        status_set("blocked", "Failed to clone {}".format(master))
        postgresql.drop_cluster()
        reactive.remove_state("postgresql.cluster.configured")
        reactive.remove_state("postgresql.cluster.created")
        # Terminate. We need this hook to exit, rather than enter a loop.
        raise SystemExit(0)

    update_recovery_conf(follow=master)
    reactive.set_state("postgresql.replication.cloned")
    update_replication_states()