Example #1
    def test_start(self, version, service_start, status_set, emit_pg_log):
        version.return_value = '9.9'

        # When it works, it works.
        postgresql.start()
        service_start.assert_called_once_with('postgresql@9.9-main')
        self.assertFalse(emit_pg_log.called)

        # On start failure we block, and terminate whatever hook is running.
        service_start.return_value = False
        with self.assertRaises(SystemExit) as x:
            postgresql.start()
        status_set.assert_called_once_with('blocked', ANY)  # Set blocked.
        self.assertEqual(x.exception.code, 0)  # Terminated without error
        emit_pg_log.assert_called_once_with()  # Tail of log emitted to logs.
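Example #1 pins down a systemd-based contract: service_start() must be called with the versioned cluster unit, and a False return blocks the unit, emits the log tail, and ends the hook cleanly. A minimal sketch consistent with those assertions, not the charm's actual code (version(), status_set() and emit_pg_log() are the helpers patched out in the test):

def start():
    if service_start('postgresql@{}-main'.format(version())):
        return
    status_set('blocked', 'PostgreSQL failed to start')
    emit_pg_log()  # Surface the tail of the PostgreSQL log.
    raise SystemExit(0)  # End the running hook without error.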
Example #2
    def test_start(self, version, check_call, status_set, emit_pg_log):
        version.return_value = "9.9"

        # When it works, it works.
        postgresql.start()
        # Both -w and -t options are required to wait for startup.
        # We wait a long time, as startup might take a long time.
        # Maybe we should wait a lot longer.
        check_call.assert_called_once_with(
            ["pg_ctlcluster", "9.9", "main", "start", "--", "-w", "-t", "86400"], universal_newlines=True
        )
        self.assertFalse(emit_pg_log.called)

        # If it is already running, pg_ctlcluster returns code 2.
        # That is success as far as we are concerned, not an error.
        check_call.side_effect = subprocess.CalledProcessError(2, "whoops")
        check_call.reset_mock()
        postgresql.start()
        check_call.assert_called_once_with(
            ["pg_ctlcluster", "9.9", "main", "start", "--", "-w", "-t", "86400"], universal_newlines=True
        )

        # Other failures block the unit. Perhaps it is just taking too
        # long to perform recovery after a power outage.
        check_call.side_effect = subprocess.CalledProcessError(42, "whoops")
        with self.assertRaises(SystemExit) as x:
            postgresql.start()
        status_set.assert_called_once_with("blocked", ANY)  # Set blocked.
        self.assertEqual(x.exception.code, 0)  # Terminated without error
        emit_pg_log.assert_called_once_with()  # Tail of log emitted to logs.
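Example #2 swaps the systemd call for pg_ctlcluster and adds a special case: exit code 2 means the cluster is already running and is treated as success. A sketch inferred from the assertions above (again assuming the patched helpers, not the charm's actual implementation):

import subprocess


def start():
    try:
        # -w and -t make pg_ctlcluster wait for startup to complete;
        # a generous timeout allows for long crash recovery.
        subprocess.check_call(
            ["pg_ctlcluster", version(), "main", "start", "--", "-w", "-t", "86400"],
            universal_newlines=True,
        )
    except subprocess.CalledProcessError as x:
        if x.returncode == 2:
            return  # Already running; not a failure.
        status_set("blocked", "PostgreSQL failed to start")
        emit_pg_log()
        raise SystemExit(0)  # End the hook without error.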
Example #3
def start():
    status_set('maintenance', 'Starting PostgreSQL')
    postgresql.start()

    while postgresql.is_primary() and postgresql.is_in_recovery():
        status_set('maintenance', 'Startup recovery')
        time.sleep(1)

    store = unitdata.kv()

    open_ports(store.get('postgresql.cluster.pgconf.live.port') or 5432,
               store.get('postgresql.cluster.pgconf.current.port') or 5432)

    # Update the 'live' config now we know it is in effect. This
    # is used to detect future config changes that require a restart.
    settings = store.getrange('postgresql.cluster.pgconf.current.', strip=True)
    store.unsetrange(prefix='postgresql.cluster.pgconf.live.')
    store.update(settings, prefix='postgresql.cluster.pgconf.live.')

    reactive.set_state('postgresql.cluster.is_running')
    reactive.remove_state('postgresql.cluster.needs_restart')
    reactive.remove_state('postgresql.cluster.needs_reload')
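The 'live' vs 'current' key split above is what lets later handlers detect pending configuration changes cheaply: if the two ranges differ, the last generated config has not yet taken effect. A hypothetical helper built on the same unitdata.kv() API (illustration only, not part of the charm):

def pgconf_pending():
    store = unitdata.kv()
    live = store.getrange('postgresql.cluster.pgconf.live.', strip=True)
    current = store.getrange('postgresql.cluster.pgconf.current.', strip=True)
    # Any difference means a restart or reload is still required.
    return live != current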
Example #5
def wal_e_restore():
    reactive.remove_state("action.wal-e-restore")
    params = hookenv.action_get()
    backup = params["backup-name"].strip().replace("-", "_")
    storage_uri = params["storage-uri"].strip()

    ship_uri = hookenv.config().get("wal_e_storage_uri")
    if storage_uri == ship_uri:
        hookenv.action_fail(
            "The storage-uri parameter is identical to "
            "the wal_e_storage_uri config setting. Your "
            "restoration source cannot be the same as the "
            "folder you are archiving too to avoid corrupting "
            "the backups."
        )
        return

    if not params["confirm"]:
        m = "Recovery from {}.".format(storage_uri)
        if ship_uri:
            m += "\nContents of {} will be destroyed.".format(ship_uri)
        m += "\nExisting local database will be destroyed."
        m += "\nRerun action with 'confirm=true' to proceed."
        hookenv.action_set({"info": m})
        return

    with tempfile.TemporaryDirectory(prefix="wal-e", suffix="envdir") as envdir:
        update_wal_e_env_dir(envdir, storage_uri)

        # Confirm there is a backup to restore
        backups = wal_e_list_backups(envdir)
        if not backups:
            hookenv.action_fail("No backups found at {}".format(storage_uri))
            return
        if backup != "LATEST" and backup not in (b["name"] for b in backups):
            hookenv.action_fail("Backup {} not found".format(backup))
            return

        # Shutdown PostgreSQL. Note we want this action to run synchronously,
        # so there is no opportunity to ask permission from the leader. If
        # there are other units cloning this database, those clone operations
        # will fail. Which seems preferable to blocking a recovery operation
        # in any case, because if we are doing disaster recovery we generally
        # want to do it right now.
        status_set("maintenance", "Stopping PostgreSQL for backup restoration")
        postgresql.stop()

        # Trash the existing database. It's dangerous to do this first, but
        # we probably need the space.
        data_dir = postgresql.data_dir()  # May be a symlink
        for content in os.listdir(data_dir):
            cpath = os.path.join(data_dir, content)
            if os.path.isdir(cpath) and not os.path.islink(cpath):
                shutil.rmtree(cpath)
            else:
                os.remove(cpath)

        # WAL-E recover
        status_set("maintenance", "Restoring backup {}".format(backup))
        wal_e_run(["backup-fetch", data_dir, backup], envdir=envdir)

        # Create recovery.conf to complete recovery
        is_master = reactive.is_state("postgresql.replication.is_master")
        standby_mode = "off" if is_master else "on"
        if params.get("target-time"):
            target_time = "recovery_target_time='{}'" "".format(params["target-time"])
        else:
            target_time = ""
        target_action = "promote" if is_master else "shutdown"
        immediate = "" if is_master else "recovery_target='immediate'"
        helpers.write(
            postgresql.recovery_conf_path(),
            dedent(
                """\
                # Managed by Juju. PITR in progress.
                standby_mode = {}
                restore_command='{}'
                recovery_target_timeline = {}
                recovery_target_action = {}
                {}
                {}
                """
            ).format(
                standby_mode,
                wal_e_restore_command(envdir=envdir),
                params["target-timeline"],
                target_action,
                target_time,
                immediate,
            ),
            mode=0o600,
            user="postgres",
            group="postgres",
        )

        # Avoid circular import. We could also avoid the import entirely
        # with a sufficiently complex set of handlers in the replication
        # module, but that seems to be a worse solution. Better to break
        # out this action into a separate module.
        from reactive.postgresql import replication

        if is_master:
            if ship_uri:
                # If master, trash the configured wal-e storage. This may
                # contain WAL and backups from the old cluster which will
                # conflict with the new cluster. Hopefully it does not
                # contain anything important, because we have no way to
                # prompt the user for confirmation.
                wal_e_run(["delete", "--confirm", "everything"])

            # Then, wait for recovery and promotion.
            postgresql.start()
            con = postgresql.connect()
            cur = con.cursor()
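            # PostgreSQL 10 renamed the xlog functions, hence the two
            # queries: pg_last_xlog_replay_location() became
            # pg_last_wal_replay_lsn().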
            while True:
                if postgresql.has_version("10"):
                    cur.execute(
                        """SELECT pg_is_in_recovery(),
                                          pg_last_wal_replay_lsn()"""
                    )
                else:
                    cur.execute(
                        """SELECT pg_is_in_recovery(),
                                          pg_last_xlog_replay_location()"""
                    )
                in_rec, loc = cur.fetchone()
                if not in_rec:
                    break
                status_set("maintenance", "Recovery at {}".format(loc))
                time.sleep(10)
        else:
            # If standby, startup and wait for recovery to complete and
            # shutdown.
            status_set("maintenance", "Recovery")
            # Startup might shutdown immediately and look like a failure.
            postgresql.start(ignore_failure=True)
            # No recovery point status yet for standbys, as we would need
            # to handle connection failures when the DB shuts down. We
            # should do this.
            while postgresql.is_running():
                time.sleep(5)
            replication.update_recovery_conf(follow=replication.get_master())

    # Reactive handlers will deal with the rest of the cleanup,
    # e.g. ensuring that required users and roles exist.
    replication.update_replication_states()
    reactive.remove_state("postgresql.cluster.configured")
    reactive.toggle_state("postgresql.cluster.is_running", postgresql.is_running())
    reactive.remove_state("postgresql.nagios.user_ensured")
    reactive.remove_state("postgresql.replication.replication_user_created")
    reactive.remove_state("postgresql.client.published")