def test_start(self, version, service_start, status_set, emit_pg_log):
    version.return_value = '9.9'

    # When it works, it works.
    postgresql.start()
    service_start.assert_called_once_with('postgresql@9.9-main')
    self.assertFalse(emit_pg_log.called)

    # On start failure we block, and terminate whatever hook is running.
    service_start.return_value = False
    with self.assertRaises(SystemExit) as x:
        postgresql.start()
    status_set.assert_called_once_with('blocked', ANY)  # Set blocked.
    self.assertEqual(x.exception.code, 0)  # Terminated without error.
    emit_pg_log.assert_called_once_with()  # Tail of log emitted to the logs.
def test_start(self, version, check_call, status_set, emit_pg_log):
    version.return_value = "9.9"

    # When it works, it works.
    postgresql.start()
    # Both -w and -t options are required to wait for startup.
    # We wait a long time, as startup might take a long time.
    # Maybe we should wait a lot longer.
    check_call.assert_called_once_with(
        ["pg_ctlcluster", "9.9", "main", "start", "--", "-w", "-t", "86400"],
        universal_newlines=True,
    )
    self.assertFalse(emit_pg_log.called)

    # If it is already running, pg_ctlcluster returns code 2, which is
    # not treated as a failure.
    check_call.side_effect = subprocess.CalledProcessError(2, "whoops")
    check_call.reset_mock()
    postgresql.start()
    check_call.assert_called_once_with(
        ["pg_ctlcluster", "9.9", "main", "start", "--", "-w", "-t", "86400"],
        universal_newlines=True,
    )

    # Other failures block the unit and terminate whatever hook is
    # running. Perhaps it is just taking too long to perform recovery
    # after a power outage.
    check_call.side_effect = subprocess.CalledProcessError(42, "whoops")
    with self.assertRaises(SystemExit) as x:
        postgresql.start()
    status_set.assert_called_once_with("blocked", ANY)  # Set blocked.
    self.assertEqual(x.exception.code, 0)  # Terminated without error.
    emit_pg_log.assert_called_once_with()  # Tail of log emitted to the logs.
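# A minimal, self-contained sketch of the start() behaviour the pg_ctlcluster
# variant of the test above exercises. It is not the charm's real code: the
# helper names mirror what the test patches (version, status_set, emit_pg_log),
# but their bodies here are placeholder stubs.
import subprocess
import sys


def version():
    return "9.9"  # Stub: the charm derives this from the installed cluster.


def status_set(state, msg):
    print(state, msg)  # Stub: the charm reports status via hookenv.status_set().


def emit_pg_log():
    pass  # Stub: the charm tails the PostgreSQL log into the unit log.


def start():
    # -w waits for startup to finish; -t allows a very long wait, since the
    # cluster may need to perform crash recovery before accepting connections.
    cmd = ["pg_ctlcluster", version(), "main", "start", "--", "-w", "-t", "86400"]
    try:
        subprocess.check_call(cmd, universal_newlines=True)
    except subprocess.CalledProcessError as x:
        if x.returncode == 2:
            return  # Exit code 2 means the cluster was already running.
        # Any other failure: surface the log tail, block the unit, and end
        # the hook cleanly so the operator can investigate.
        emit_pg_log()
        status_set("blocked", "PostgreSQL failed to start")
        sys.exit(0)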
def start():
    status_set('maintenance', 'Starting PostgreSQL')

    postgresql.start()

    while postgresql.is_primary() and postgresql.is_in_recovery():
        status_set('maintenance', 'Startup recovery')
        time.sleep(1)

    store = unitdata.kv()

    open_ports(store.get('postgresql.cluster.pgconf.live.port') or 5432,
               store.get('postgresql.cluster.pgconf.current.port') or 5432)

    # Update the 'live' config now we know it is in effect. This
    # is used to detect future config changes that require a restart.
    settings = store.getrange('postgresql.cluster.pgconf.current.', strip=True)
    store.unsetrange(prefix='postgresql.cluster.pgconf.live.')
    store.update(settings, prefix='postgresql.cluster.pgconf.live.')

    reactive.set_state('postgresql.cluster.is_running')
    reactive.remove_state('postgresql.cluster.needs_restart')
    reactive.remove_state('postgresql.cluster.needs_reload')
def start(): status_set("maintenance", "Starting PostgreSQL") postgresql.start() while postgresql.is_primary() and postgresql.is_in_recovery(): status_set("maintenance", "Startup recovery") time.sleep(1) store = unitdata.kv() open_ports( store.get("postgresql.cluster.pgconf.live.port") or 5432, store.get("postgresql.cluster.pgconf.current.port") or 5432, ) # Update the 'live' config now we know it is in effect. This # is used to detect future config changes that require a restart. settings = store.getrange("postgresql.cluster.pgconf.current.", strip=True) store.unsetrange(prefix="postgresql.cluster.pgconf.live.") store.update(settings, prefix="postgresql.cluster.pgconf.live.") reactive.set_state("postgresql.cluster.is_running") reactive.remove_state("postgresql.cluster.needs_restart") reactive.remove_state("postgresql.cluster.needs_reload")
def wal_e_restore():
    reactive.remove_state("action.wal-e-restore")
    params = hookenv.action_get()
    backup = params["backup-name"].strip().replace("-", "_")
    storage_uri = params["storage-uri"].strip()
    ship_uri = hookenv.config().get("wal_e_storage_uri")
    if storage_uri == ship_uri:
        hookenv.action_fail(
            "The storage-uri parameter is identical to "
            "the wal_e_storage_uri config setting. Your "
            "restoration source cannot be the same as the "
            "folder you are archiving to, or the backups "
            "would be corrupted."
        )
        return
    if not params["confirm"]:
        m = "Recovery from {}.".format(storage_uri)
        if ship_uri:
            m += "\nContents of {} will be destroyed.".format(ship_uri)
        m += "\nExisting local database will be destroyed."
        m += "\nRerun action with 'confirm=true' to proceed."
        hookenv.action_set({"info": m})
        return

    with tempfile.TemporaryDirectory(prefix="wal-e", suffix="envdir") as envdir:
        update_wal_e_env_dir(envdir, storage_uri)

        # Confirm there is a backup to restore.
        backups = wal_e_list_backups(envdir)
        if not backups:
            hookenv.action_fail("No backups found at {}".format(storage_uri))
            return
        if backup != "LATEST" and backup not in (b["name"] for b in backups):
            hookenv.action_fail("Backup {} not found".format(backup))
            return

        # Shutdown PostgreSQL. Note we want this action to run synchronously,
        # so there is no opportunity to ask permission from the leader. If
        # there are other units cloning this database, those clone operations
        # will fail, which seems preferable to blocking a recovery operation;
        # if we are doing disaster recovery we generally want to do it right
        # now.
        status_set("maintenance", "Stopping PostgreSQL for backup restoration")
        postgresql.stop()

        # Trash the existing database. It is dangerous to do this first, but
        # we probably need the space.
        data_dir = postgresql.data_dir()  # May be a symlink.
        for content in os.listdir(data_dir):
            cpath = os.path.join(data_dir, content)
            if os.path.isdir(cpath) and not os.path.islink(cpath):
                shutil.rmtree(cpath)
            else:
                os.remove(cpath)

        # WAL-E recover.
        status_set("maintenance", "Restoring backup {}".format(backup))
        wal_e_run(["backup-fetch", data_dir, backup], envdir=envdir)

        # Create recovery.conf to complete recovery.
        is_master = reactive.is_state("postgresql.replication.is_master")
        standby_mode = "off" if is_master else "on"
        if params.get("target-time"):
            target_time = "recovery_target_time='{}'".format(params["target-time"])
        else:
            target_time = ""
        target_action = "promote" if is_master else "shutdown"
        immediate = "" if is_master else "recovery_target='immediate'"
        helpers.write(
            postgresql.recovery_conf_path(),
            dedent(
                """\
                # Managed by Juju. PITR in progress.
                standby_mode = {}
                restore_command='{}'
                recovery_target_timeline = {}
                recovery_target_action = {}
                {}
                {}
                """
            ).format(
                standby_mode,
                wal_e_restore_command(envdir=envdir),
                params["target-timeline"],
                target_action,
                target_time,
                immediate,
            ),
            mode=0o600,
            user="postgres",
            group="postgres",
        )

        # Avoid circular import. We could also avoid the import entirely
        # with a sufficiently complex set of handlers in the replication
        # module, but that seems to be a worse solution. Better to break
        # out this action into a separate module.
        from reactive.postgresql import replication

        if is_master:
            if ship_uri:
                # If master, trash the configured wal-e storage. This may
                # contain WAL and backups from the old cluster which will
                # conflict with the new cluster. Hopefully it does not
                # contain anything important, because we have no way to
                # prompt the user for confirmation.
                wal_e_run(["delete", "--confirm", "everything"])

            # Then, wait for recovery and promotion.
            postgresql.start()
            con = postgresql.connect()
            cur = con.cursor()
            while True:
                if postgresql.has_version("10"):
                    cur.execute("SELECT pg_is_in_recovery(), pg_last_wal_replay_lsn()")
                else:
                    cur.execute("SELECT pg_is_in_recovery(), pg_last_xlog_replay_location()")
                in_rec, loc = cur.fetchone()
                if not in_rec:
                    break
                status_set("maintenance", "Recovery at {}".format(loc))
                time.sleep(10)
        else:
            # If standby, startup and wait for recovery to complete and
            # shutdown.
            status_set("maintenance", "Recovery")
            # Startup might shutdown immediately and look like a failure.
            postgresql.start(ignore_failure=True)
            # No recovery point status yet for standbys, as we would need
            # to handle connection failures when the DB shuts down. We
            # should do this.
            while postgresql.is_running():
                time.sleep(5)
            replication.update_recovery_conf(follow=replication.get_master())

    # Reactive handlers will deal with the rest of the cleanup,
    # e.g. ensuring required users and roles exist.
    replication.update_replication_states()
    reactive.remove_state("postgresql.cluster.configured")
    reactive.toggle_state("postgresql.cluster.is_running", postgresql.is_running())
    reactive.remove_state("postgresql.nagios.user_ensured")
    reactive.remove_state("postgresql.replication.replication_user_created")
    reactive.remove_state("postgresql.client.published")