Example #1
0
def main():
    # Modify the behavior of the PostgreSQL package installation
    # before any packages are installed. We do this here, rather than
    # in handlers, so that extra_packages declared by the operator
    # don't drag in the PostgreSQL packages as dependencies before
    # the environment tweaks have been made.
    if (not reactive.is_state('apt.installed.postgresql-common') and
            not reactive.is_state('postgresql.cluster.inhibited')):
        generate_locale()
        inhibit_default_cluster_creation()
        install_postgresql_packages()
        install_extra_packages()  # Deprecated extra-packages option

    # Don't trust this state from the last hook. Daemons may have
    # crashed and servers rebooted since then.
    if reactive.is_state('postgresql.cluster.created'):
        try:
            reactive.toggle_state('postgresql.cluster.is_running',
                                  postgresql.is_running())
        except subprocess.CalledProcessError as x:
            if not reactive.is_state('workloadstatus.blocked'):
                status_set('blocked',
                           'Local PostgreSQL cluster is corrupt: {}'
                           ''.format(x.stderr))

    # Reconfigure PostgreSQL. While we don't strictly speaking need
    # to do this every hook, we do need to do this almost every hook,
    # since even things like the number of peers or number of clients
    # can affect minimum viable configuration settings.
    reactive.remove_state('postgresql.cluster.configured')

    log_states()  # Debug noise.
Example #2
0
def stop():
    # First try a 'fast' shutdown.
    try:
        subprocess.check_call(['pg_ctlcluster', '--mode', 'fast',
                               version(), 'main', 'stop',
                               '--', '-w', '-t', str(SHUTDOWN_TIMEOUT)],
                              universal_newlines=True)
        return
    except subprocess.CalledProcessError as x:
        if x.returncode == 2:
            return  # The server was not running.

    # If the 'fast' shutdown failed, try an 'immediate' shutdown.
    try:
        hookenv.log('Fast shutdown failed. Attempting immediate shutdown.',
                    WARNING)
        subprocess.check_call(['pg_ctlcluster', '--mode', 'immediate',
                               version(), 'main', 'stop',
                               '--', '-w', '-t', str(SHUTDOWN_TIMEOUT)],
                              universal_newlines=True)
        return
    except subprocess.CalledProcessError as x:
        if x.returncode == 2:
            return  # The server was not running.
        workloadstatus.status_set('blocked', 'Unable to shutdown PostgreSQL')
        raise SystemExit(0)
Example #3
0
def remount():
    if reactive.is_state('postgresql.cluster.is_running'):
        # Attempting this while PostgreSQL is live would be really, really bad.
        service.stop()

    old_data_dir = postgresql.data_dir()
    new_data_dir = os.path.join(external_volume_mount, 'postgresql',
                                postgresql.version(), 'main')
    backup_data_dir = '{}-{}'.format(old_data_dir, int(time.time()))

    if os.path.isdir(new_data_dir):
        hookenv.log('Remounting existing database at {}'.format(new_data_dir),
                    WARNING)
    else:
        status_set('maintenance',
                   'Migrating data from {} to {}'.format(old_data_dir,
                                                         new_data_dir))
        helpers.makedirs(new_data_dir, mode=0o770,
                         user='******', group='postgres')
        try:
            rsync_cmd = ['rsync', '-av',
                         old_data_dir + '/',
                         new_data_dir + '/']
            hookenv.log('Running {}'.format(' '.join(rsync_cmd)), DEBUG)
            subprocess.check_call(rsync_cmd)
        except subprocess.CalledProcessError:
            status_set('blocked',
                       'Failed to sync data from {} to {}'
                       ''.format(old_data_dir, new_data_dir))
            return

    os.replace(old_data_dir, backup_data_dir)
    os.symlink(new_data_dir, old_data_dir)
    fix_perms(new_data_dir)
    reactive.remove_state('postgresql.storage.needs_remount')
Example #4
0
def block_on_invalid_config():
    """
    Sanity check charm configuration, blocking the unit if we have
    bogus config values or config changes the charm does not
    yet (or cannot) support.

    We need to do this before the main reactive loop (@preflight),
    or we risk failing to run handlers that rely on @when_file_changed,
    reactive.helpers.data_changed or similar state tied to
    charmhelpers.core.unitdata transactions.
    """
    valid = True
    config = hookenv.config()

    enums = dict(version=set(['', '9.1', '9.2', '9.3', '9.4', '9.5', '9.6']),
                 package_status=set(['install', 'hold']))
    for key, vals in enums.items():
        config[key] = config[key].lower()  # Rewrite to lower case.
        if config[key] not in vals:
            valid = False
            status_set('blocked',
                       'Invalid value for {} ({!r})'.format(key, config[key]))

    unchangeable_config = ['locale', 'encoding', 'pgdg', 'manual_replication']
    if config._prev_dict is not None:
        for name in unchangeable_config:
            if config.changed(name):
                valid = False
                status_set('blocked',
                           'Cannot change {!r} after install '
                           '(from {!r} to {!r}).'
                           .format(name, config.previous(name),
                                   config[name]))
                config[name] = config.previous(name)
        if config.changed('version') and (config.previous('version') !=
                                          postgresql.version()):
            valid = False
            status_set('blocked',
                       'Cannot change version after install '
                       '(from {!r} to {!r}).'
                       .format(config.previous('version'), config['version']))
            config['version'] = config.previous('version')

    metrics_target = config['metrics_target'].strip()
    if metrics_target:
        if ':' not in metrics_target:
            status_set('blocked',
                       'Invalid metrics_target {}'.format(metrics_target))
            valid = False
        metrics_interval = config['metrics_sample_interval']
        if not metrics_interval:
            status_set('blocked',
                       'metrics_sample_interval is required when '
                       'metrics_target is set')
            valid = False

    if not valid:
        raise SystemExit(0)
Example #5
0
def install():
    # Install WAL-E via snap package. We only do this when relevant
    # configuration options are set to ensure that the main charm works
    # in environments with limited or no snap support (eg. Trusty lxd
    # containers). The wal-e snap is used for log shipping and PITR
    # actions.
    status_set(None, "Installing wal-e snap")
    snap.install("wal-e", classic=True)
Example #6
0
def update_recovery_conf(follow):
    assert follow != hookenv.local_unit()

    peer_rel = helpers.get_peer_relation()
    follow_relinfo = peer_rel.get(follow)
    assert follow_relinfo is not None, "Invalid upstream {}".format(follow)

    current_follow = get_following()

    if follow != current_follow:
        status_set("maintenance", "Following new unit {}".format(follow))
        set_following(follow)
        # Setting the state to defer publication until after restart.
        reactive.set_state("postgresql.replication.publish_following")

    else:
        # Even though the master is unchanged, we still regenerate
        # recovery.conf in case connection details such as IP addresses
        # have changed.
        hookenv.log("Continuing to follow {}".format(follow))

    pg12 = postgresql.has_version("12")
    if pg12:
        path = postgresql.hot_standby_conf_path()
        template = "hot_standby.conf.tmpl"
    else:
        path = postgresql.recovery_conf_path()
        template = "recovery.conf.tmpl"

    config = hookenv.config()

    data = dict(
        streaming_replication=config["streaming_replication"],
        host=follow_relinfo["host"],
        port=follow_relinfo["port"],
        user=replication_username(),
        password=leader_get("replication_password"),
    )

    if reactive.helpers.is_state("postgresql.wal_e.enabled"):
        data["restore_command"] = wal_e.wal_e_restore_command()

    templating.render(template,
                      path,
                      data,
                      owner="postgres",
                      group="postgres",
                      perms=0o600)

    if pg12:
        touch(postgresql.hot_standby_signal_path())

    # Use @when_file_changed for this when Issue #44 is resolved.
    if reactive.helpers.any_file_changed([path]):
        reactive.set_state("postgresql.cluster.needs_restart")
        if reactive.is_state("postgresql.replication.cloned"):
            reactive.set_state("postgresql.replication.check_following")
Example #7
0
def check_following():
    peer_rel = helpers.get_peer_relation()
    following = get_following()
    if peer_rel is None or following is None:
        reactive.remove_state("postgresql.replication.check_following")
        return
    if postgresql.is_replicating(following, user=replication_username()):
        hookenv.log("Replication of {} is confirmed".format(following))
        reactive.remove_state("postgresql.replication.check_following")
    else:
        status_set("blocked", "Replication of {} has failed".format(following))
Example #8
0
def parse_config(unparsed_config, fatal=True):
    """Parse a postgresql.conf style string, returning a dictionary.

    This is a simple key=value format, per section 18.1.2 at
    http://www.postgresql.org/docs/9.4/static/config-setting.html
    """
    scanner = re.compile(
        r"""^\s*
                         (                       # key=value (1)
                           (?:
                              (\w+)              # key (2)
                              (?:\s*=\s*|\s+)    # separator
                           )?
                           (?:
                              ([-.\w]+) |        # simple value (3) or
                              '(                 # quoted value (4)
                                (?:[^']|''|\\')*
                               )(?<!\\)'(?!')
                           )?
                           \s* ([^\#\s].*?)?     # badly quoted value (5)
                         )?
                         (?:\s*\#.*)?$           # comment
                         """,
        re.X,
    )
    parsed = OrderedDict()
    for lineno, line in zip(itertools.count(1), unparsed_config.splitlines()):
        try:
            m = scanner.search(line)
            if m is None:
                raise SyntaxError("Invalid line")
            keqv, key, value, q_value, bad_value = m.groups()
            if not keqv:
                continue
            if key is None:
                raise SyntaxError("Missing key {!r}".format(keqv))
            if bad_value is not None:
                raise SyntaxError("Badly quoted value {!r}".format(bad_value))
            assert value is None or q_value is None
            if q_value is not None:
                value = re.sub(r"''|\\'", "'", q_value)
            if value is not None:
                parsed[key.lower()] = value
            else:
                raise SyntaxError("Missing value")
        except SyntaxError as x:
            if fatal:
                x.lineno = lineno
                x.text = line
                raise x
            workloadstatus.status_set("blocked", "{} line {}: {}".format(x, lineno, line))
            raise SystemExit(0)
    return parsed
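
# Usage sketch (illustration, not part of the charm): feeding parse_config a
# typical postgresql.conf fragment. Keys are lower-cased, trailing comments
# are stripped, and doubled quotes inside quoted values are collapsed. The
# sample settings below are hypothetical.
_sample_conf = "\n".join([
    "max_connections = 100            # trailing comments are stripped",
    "Shared_Buffers 128MB             # whitespace can separate key and value",
    "search_path = '\"$user\", public'",
    "# bare comment lines are ignored",
])
_parsed = parse_config(_sample_conf)
assert _parsed["max_connections"] == "100"
assert _parsed["shared_buffers"] == "128MB"
assert _parsed["search_path"] == '"$user", public'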
Example #9
0
def set_active():
    if postgresql.is_running():
        if replication.is_master():
            msg = 'Live master'
        elif postgresql.is_primary():
            msg = 'Live primary'
        else:
            msg = 'Live secondary'
        status_set('active', '{} ({})'.format(msg, postgresql.point_version()))
    else:
        # PostgreSQL crashed! Maybe bad configuration we failed to
        # pick up, or maybe a full disk. The admin will need to diagnose.
        status_set('blocked', 'PostgreSQL unexpectedly shut down')
Example #10
0
def configure_cluster():
    """Configure the cluster."""
    update_pg_ident_conf()
    update_pg_hba_conf()

    try:
        update_postgresql_conf()
        reactive.set_state("postgresql.cluster.configured")
        hookenv.log("PostgreSQL has been configured")
        if reactive.helpers.is_state("postgresql.cluster.is_running"):
            postgresql_conf_changed()
    except InvalidPgConfSetting as x:
        status_set("blocked", "Invalid postgresql.conf setting {}: {}".format(*x.args))
Example #11
0
def set_active():
    if postgresql.is_running():
        if replication.is_master():
            msg = "Live master"
        elif postgresql.is_primary():
            msg = "Live primary"
        else:
            msg = "Live secondary"
        status_set("active", "{} ({})".format(msg, postgresql.point_version()))
    else:
        # PostgreSQL crashed! Maybe bad configuration we failed to
        # pick up, or maybe a full disk. The admin will need to diagnose.
        status_set("blocked", "PostgreSQL unexpectedly shut down")
Example #12
0
def parse_config(unparsed_config, fatal=True):
    '''Parse a postgresql.conf style string, returning a dictionary.

    This is a simple key=value format, per section 18.1.2 at
    http://www.postgresql.org/docs/9.4/static/config-setting.html
    '''
    scanner = re.compile(r"""^\s*
                         (                       # key=value (1)
                           (?:
                              (\w+)              # key (2)
                              (?:\s*=\s*|\s+)    # separator
                           )?
                           (?:
                              ([-.\w]+) |        # simple value (3) or
                              '(                 # quoted value (4)
                                (?:[^']|''|\\')*
                               )(?<!\\)'(?!')
                           )?
                           \s* ([^\#\s].*?)?     # badly quoted value (5)
                         )?
                         (?:\s*\#.*)?$           # comment
                         """, re.X)
    parsed = OrderedDict()
    for lineno, line in zip(itertools.count(1), unparsed_config.splitlines()):
        try:
            m = scanner.search(line)
            if m is None:
                raise SyntaxError('Invalid line')
            keqv, key, value, q_value, bad_value = m.groups()
            if not keqv:
                continue
            if key is None:
                raise SyntaxError('Missing key {!r}'.format(keqv))
            if bad_value is not None:
                raise SyntaxError('Badly quoted value {!r}'.format(bad_value))
            assert value is None or q_value is None
            if q_value is not None:
                value = re.sub(r"''|\\'", "'", q_value)
            if value is not None:
                parsed[key.lower()] = value
            else:
                raise SyntaxError('Missing value')
        except SyntaxError as x:
            if fatal:
                x.lineno = lineno
                x.text = line
                raise x
            workloadstatus.status_set('blocked',
                                      '{} line {}: {}'.format(x, lineno, line))
            raise SystemExit(0)
    return parsed
Example #13
0
def coordinate_failover():
    """The master has been destroyed. Trigger the failover process."""
    master = get_master()
    rel = helpers.get_peer_relation()

    hookenv.log("Master {} is gone".format(master), WARNING)

    # Per Bug #1417874, the master doesn't know it is dying until it
    # is too late, and standbys learn about their master dying at
    # different times. We need to wait until all remaining units
    # are aware that the master is gone, and are no longer following
    # it. If we fail to do this step, then we risk appointing a new
    # master while some units are still replicating data from
    # the ex-master and we will end up with diverged timelines.
    # Unfortunately, this means failover will not complete until
    # hooks can be run on all remaining units, which could be several
    # hours if maintenance operations are in progress. Once
    # Bug #1417874 is addressed, the departing master
    # can cut off replication to all units simultaneously and we
    # can skip this step and allow failover to occur as soon as the
    # leader learns that the master is gone. Or can we? A network
    # partition could stop the controller seeing the master, and
    # any about-to-depart hooks will not be triggered, with the same
    # problem detailed above. pg_rewind and repmgr may also offer
    # alternatives, repairing the diverged timeline rather than
    # avoiding it. But pg_rewind only copes with timeline switches
    # in PG9.6+, which means we can't promote, which risks wal shipping
    # collisions between the old and new masters.
    waiting_on = set()
    for unit, relinfo in rel.items():
        if relinfo.get("following"):
            hookenv.log("Waiting for {} to stop replicating ex-master"
                        "".format(unit))
            waiting_on.add(unit)
    if rel.local.get("following"):
        # following from the relation, rather than get_following(),
        # to ensure that the change has been applied.
        hookenv.log("Waiting for me to stop replicating ex-master")
        waiting_on.add(hookenv.local_unit())
    if not waiting_on:
        new_master = elect_master()
        hookenv.log("Failing over to new master {}".format(new_master),
                    WARNING)
        set_master(new_master)
    else:
        status_set(
            None,
            "Coordinating failover. Waiting on {}"
            "".format(",".join(sorted(waiting_on))),
        )
Example #14
0
def start():
    try:
        subprocess.check_call(['pg_ctlcluster',
                               version(), 'main', 'start',
                               # These extra options cause pg_ctl to wait
                               # for startup to finish, so we don't have to.
                               '--', '-w', '-t', str(STARTUP_TIMEOUT)],
                              universal_newlines=True)
    except subprocess.CalledProcessError as x:
        if x.returncode == 2:
            return  # The server is already running.
        workloadstatus.status_set('blocked', 'PostgreSQL failed to start')
        emit_pg_log()  # For debugging inscrutable pg_ctlcluster failures.
        raise SystemExit(0)
Example #15
0
def configure_cluster():
    '''Configure the cluster.'''
    update_pg_ident_conf()
    update_pg_hba_conf()

    try:
        update_postgresql_conf()
        reactive.set_state('postgresql.cluster.configured')
        hookenv.log('PostgreSQL has been configured')
        # Use @when_file_changed for this when Issue #44 is resolved.
        if reactive.helpers.is_state('postgresql.cluster.is_running'):
            postgresql_conf_changed()
    except InvalidPgConfSetting as x:
        status_set('blocked',
                   'Invalid postgresql.conf setting {}: {}'.format(*x.args))
Example #16
0
def generate_locale():
    '''Ensure that the requested database locale is available.

    The locale cannot be changed post deployment, as this would involve
    completely destroying and recreating the database.
    '''
    config = hookenv.config()
    if config['locale'] != 'C':
        status_set('maintenance',
                   'Generating {} locale'.format(config['locale']))
        subprocess.check_call(['locale-gen',
                               '{}.{}'.format(hookenv.config('locale'),
                                              hookenv.config('encoding'))],
                              universal_newlines=True)
    reactive.set_state('postgresql.cluster.locale.set')
Example #17
0
def generate_locale():
    """Ensure that the requested database locale is available.

    The locale cannot be changed post deployment, as this would involve
    completely destroying and recreating the database.
    """
    config = hookenv.config()
    if config["locale"] != "C":
        status_set("maintenance", "Generating {} locale".format(config["locale"]))
        subprocess.check_call(
            [
                "locale-gen",
                "{}.{}".format(hookenv.config("locale"), hookenv.config("encoding")),
            ],
            universal_newlines=True,
        )
    reactive.set_state("postgresql.cluster.locale.set")
Example #18
0
def elect_master():
    """Elect a new master after the old one has departed.

    The new master is the secondary that has replayed the most
    WAL data. There must be no hot standbys still replicating
    data from the previous master, or we may end up with diverged
    timelines.

    Note we check replayed wal instead of received wal, because the
    servers have just been restarted with no master, and information
    about received wal has been lost.
    """
    rel = helpers.get_peer_relation()
    local_unit = hookenv.local_unit()

    # The unit with the most advanced WAL offset should be the new master.
    if postgresql.is_running():
        local_offset = postgresql.wal_replay_offset(postgresql.connect())
        offsets = [(local_offset, local_unit)]
    else:
        offsets = []

    for unit, relinfo in rel.items():
        try:
            con = postgresql.connect(user=replication_username(), unit=unit)
            offsets.append((postgresql.wal_replay_offset(con), unit))
        except (psycopg2.Error, postgresql.InvalidConnection) as x:
            hookenv.log(
                "Unable to query replication state of {}: {}"
                "".format(unit, x),
                WARNING,
            )
            # TODO: Signal re-cloning required. Or autodetect
            # based on timeline switch. Or PG9.3+ could use pg_rewind.

    offsets.sort()
    if not offsets:
        # This should only happen if we failover before replication has
        # been setup, like a test suite destroying units without waiting
        # for the initial deployment to complete.
        status_set("blocked", "No candidates for master found!")
        raise SystemExit(0)
    elected_master = offsets[0][1]
    return elected_master
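
# Illustration only (hypothetical data, not charm code): the election rule
# described in the docstring above, stated directly as "the candidate with
# the most WAL replayed wins". The real code relies on the ordering of
# postgresql.wal_replay_offset() values rather than raw byte counts.
_replayed_bytes = {
    "postgresql/0": 0x3000000,   # most WAL replayed; would be elected
    "postgresql/1": 0x2FFF000,
    "postgresql/2": 0x2FFE000,
}
_elected = max(_replayed_bytes, key=_replayed_bytes.get)
assert _elected == "postgresql/0"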
Example #19
0
def migrate_pgdata():
    '''
    Copy the data from /var/lib/postgresql/9.x/main to the
    new path and replace the original PGDATA with a symlink.
    Note that the original may already be a symlink, either from
    the block storage broker or manual changes by admins.
    '''
    if reactive.is_state('postgresql.cluster.is_running'):
        # Attempting this while PostgreSQL is live would be really, really bad.
        service.stop()

    old_data_dir = postgresql.data_dir()
    new_data_dir = unitdata.kv().get(pgdata_path_key)

    backup_data_dir = '{}-{}'.format(old_data_dir, int(time.time()))

    if os.path.isdir(new_data_dir):
        # This never happens with Juju storage, at least with 2.0,
        # because we have no way of reusing old partitions.
        hookenv.log('Remounting existing database at {}'.format(new_data_dir),
                    WARNING)
    else:
        status_set('maintenance',
                   'Migrating data from {} to {}'.format(old_data_dir,
                                                         new_data_dir))
        helpers.makedirs(new_data_dir, mode=0o770,
                         user='******', group='postgres')
        try:
            rsync_cmd = ['rsync', '-av',
                         old_data_dir + '/',
                         new_data_dir + '/']
            hookenv.log('Running {}'.format(' '.join(rsync_cmd)), DEBUG)
            subprocess.check_call(rsync_cmd, universal_newlines=True)
        except subprocess.CalledProcessError:
            status_set('blocked',
                       'Failed to sync data from {} to {}'
                       ''.format(old_data_dir, new_data_dir))
            return

    os.replace(old_data_dir, backup_data_dir)
    os.symlink(new_data_dir, old_data_dir)
    fix_perms(new_data_dir)
    reactive.set_state('postgresql.storage.pgdata.migrated')
Example #20
0
def promote_to_master():
    status_set("maintenance", "Promoting to master")
    postgresql.promote()

    set_following(None)
    publish_following()

    while postgresql.is_in_recovery():
        status_set("maintenance", "Waiting for startup")
        time.sleep(1)

    if postgresql.has_version("12"):
        assert not os.path.exists(postgresql.hot_standby_signal_path()), \
            "standby.signal still exists after promotion"
    else:
        assert not os.path.exists(postgresql.recovery_conf_path()), \
            "recovery.conf still exists after promotion"

    update_replication_states()
    helpers.ping_peers()
Example #21
0
def inhibit_default_cluster_creation():
    '''Stop the PostgreSQL packages from creating the default cluster.

    We can't use the default cluster as it is likely created with an
    incorrect locale and without options such as data checksumming.
    We could just delete it, but then we need to be able to distinguish
    an existing cluster whose data should be preserved from a freshly
    created empty cluster. And why waste time creating it in the first
    place?
    '''
    hookenv.log('Inhibiting PostgreSQL packages from creating default cluster')
    path = postgresql.postgresql_conf_path()
    if os.path.exists(path) and open(path, 'r').read():
        status_set('blocked', 'postgresql.conf already exists')
    else:
        hookenv.log('Inhibiting', DEBUG)
        os.makedirs(os.path.dirname(path), mode=0o755, exist_ok=True)
        with open(path, 'w') as f:
            f.write('# Inhibited')
        reactive.set_state('postgresql.cluster.inhibited')
Example #22
0
def main():
    if not (reactive.is_state("postgresql.cluster.created") or reactive.is_state("postgresql.cluster.initial-check")):
        # We need to check for an existing database before the main
        # PostgreSQL package has been installed.
        # If there is one, abort rather than risk destroying data.
        # We need to do this here, as the apt layer may pull in
        # the main PostgreSQL package through dependencies, per
        # lp:1749284
        if os.path.exists(postgresql.postgresql_conf_path()):
            hookenv.status_set(
                "blocked",
                "PostgreSQL config from previous install found at {}".format(postgresql.postgresql_conf_path()),
            )
        elif os.path.exists(postgresql.data_dir()):
            hookenv.status_set(
                "blocked",
                "PostgreSQL database from previous install found at {}".format(postgresql.postgresql.data_dir()),
            )
        else:
            hookenv.log("No pre-existing PostgreSQL database found")
            reactive.set_state("postgresql.cluster.initial-check")

    # Don't trust this state from the last hook. Daemons may have
    # crashed and servers rebooted since then.
    if reactive.is_state("postgresql.cluster.created"):
        try:
            reactive.toggle_state("postgresql.cluster.is_running", postgresql.is_running())
        except subprocess.CalledProcessError as x:
            if not reactive.is_state("workloadstatus.blocked"):
                status_set(
                    "blocked",
                    "Local PostgreSQL cluster is corrupt: {}".format(x.stderr),
                )

    # Reconfigure PostgreSQL. While we don't strictly speaking need
    # to do this every hook, we do need to do this almost every hook,
    # since even things like the number of peers or number of clients
    # can affect minimum viable configuration settings.
    reactive.remove_state("postgresql.cluster.configured")

    log_states()  # Debug noise.
Example #23
0
def remount():
    if reactive.is_state("postgresql.cluster.is_running"):
        # Attempting this while PostgreSQL is live would be really, really bad.
        service.stop()

    old_data_dir = postgresql.data_dir()
    new_data_dir = os.path.join(external_volume_mount, "postgresql",
                                postgresql.version(), "main")
    backup_data_dir = "{}-{}".format(old_data_dir, int(time.time()))

    if os.path.isdir(new_data_dir):
        hookenv.log("Remounting existing database at {}".format(new_data_dir),
                    WARNING)
    else:
        status_set(
            "maintenance",
            "Migrating data from {} to {}".format(old_data_dir, new_data_dir),
        )
        helpers.makedirs(new_data_dir,
                         mode=0o770,
                         user="******",
                         group="postgres")
        try:
            rsync_cmd = [
                "rsync", "-av", old_data_dir + "/", new_data_dir + "/"
            ]
            hookenv.log("Running {}".format(" ".join(rsync_cmd)), DEBUG)
            subprocess.check_call(rsync_cmd)
        except subprocess.CalledProcessError:
            status_set(
                "blocked",
                "Failed to sync data from {} to {}"
                "".format(old_data_dir, new_data_dir),
            )
            return

    os.replace(old_data_dir, backup_data_dir)
    os.symlink(new_data_dir, old_data_dir)
    fix_perms(new_data_dir)
    reactive.remove_state("postgresql.storage.needs_remount")
Example #24
0
def start():
    status_set('maintenance', 'Starting PostgreSQL')
    postgresql.start()

    while postgresql.is_primary() and postgresql.is_in_recovery():
        status_set('maintenance', 'Startup recovery')
        time.sleep(1)

    store = unitdata.kv()

    open_ports(store.get('postgresql.cluster.pgconf.live.port') or 5432,
               store.get('postgresql.cluster.pgconf.current.port') or 5432)

    # Update the 'live' config now we know it is in effect. This
    # is used to detect future config changes that require a restart.
    settings = store.getrange('postgresql.cluster.pgconf.current.', strip=True)
    store.unsetrange(prefix='postgresql.cluster.pgconf.live.')
    store.update(settings, prefix='postgresql.cluster.pgconf.live.')

    reactive.set_state('postgresql.cluster.is_running')
    reactive.remove_state('postgresql.cluster.needs_restart')
    reactive.remove_state('postgresql.cluster.needs_reload')
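
# Hedged sketch (not the charm's actual handler): one way the 'live'
# settings cached above could later be compared with freshly computed
# 'current' settings to decide whether a full restart is required. The
# helper name and the set of restart-only settings are hypothetical.
def _restart_required(live, current, restart_only=("port", "shared_buffers")):
    # Restart if any restart-only setting differs between what PostgreSQL
    # is running with and what we now want it to run with.
    return any(live.get(k) != current.get(k) for k in restart_only)

assert _restart_required({"port": "5432"}, {"port": "5433"})
assert not _restart_required({"port": "5432"}, {"port": "5432"})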
Example #25
0
def start():
    status_set("maintenance", "Starting PostgreSQL")
    postgresql.start()

    while postgresql.is_primary() and postgresql.is_in_recovery():
        status_set("maintenance", "Startup recovery")
        time.sleep(1)

    store = unitdata.kv()

    open_ports(
        store.get("postgresql.cluster.pgconf.live.port") or 5432,
        store.get("postgresql.cluster.pgconf.current.port") or 5432,
    )

    # Update the 'live' config now we know it is in effect. This
    # is used to detect future config changes that require a restart.
    settings = store.getrange("postgresql.cluster.pgconf.current.", strip=True)
    store.unsetrange(prefix="postgresql.cluster.pgconf.live.")
    store.update(settings, prefix="postgresql.cluster.pgconf.live.")

    reactive.set_state("postgresql.cluster.is_running")
    reactive.remove_state("postgresql.cluster.needs_restart")
    reactive.remove_state("postgresql.cluster.needs_reload")
Example #26
0
def drain_master_and_promote_anointed():
    # Wait until this anointed unit is fully in-sync with the
    # master, and then promote it to master. But first we
    # need to ensure that the master is following us, and that we
    # have no outstanding requests on the restart lock to avoid deadlocking
    # the cluster.
    peer_rel = helpers.get_peer_relation()
    master = get_master()
    if peer_rel is None or master is None:
        return  # Peers all gone? Other handlers will promote.

    if peer_rel[master].get("following") != hookenv.local_unit():
        status_set("waiting",
                   "Waiting for master to follow me, its anointed successor")
        return  # Try again next hook

    # Drain the master
    while True:
        local_offset = postgresql.wal_received_offset(postgresql.connect())
        if local_offset is None:
            # Huh? Should not happen unless the server was unexpectedly
            # restarted.
            break

        try:
            remote_con = postgresql.connect(user=replication_username(),
                                            unit=master)
            remote_offset = postgresql.wal_received_offset(remote_con)
            if remote_offset is None:
                # Huh? Should not happen either, since the master published
                # that it is following us.
                break
        except (psycopg2.Error, postgresql.InvalidConnection) as x:
            status_set(
                "waiting",
                "Waiting to query replication state of {}: {}"
                "".format(master, x),
            )
            time.sleep(1)
            continue

        if local_offset >= remote_offset:
            break  # In sync. Proceed to promotion.

        status_set(
            "waiting",
            "{} bytes to sync before promotion"
            "".format(remote_offset - local_offset),
        )
        time.sleep(1)

    # Promote the anointed to master
    promote_to_master()
    switchover_status()
Example #27
0
def start(ignore_failure=False):
    if host.service_start(service_name()) or ignore_failure:
        return
    workloadstatus.status_set("blocked", "PostgreSQL failed to start")
    emit_pg_log()
    raise SystemExit(0)
Example #28
0
def stop():
    status_set("maintenance", "Stopping PostgreSQL")
    postgresql.stop()
    reactive.remove_state("postgresql.cluster.is_running")
Example #29
0
def stop():
    if not host.service_stop(service_name()):
        workloadstatus.status_set("blocked", "Unable to shutdown PostgreSQL")
        raise SystemExit(0)
Example #30
0
def clone_master():
    master = get_master()
    peer_rel = helpers.get_peer_relation()
    master_relinfo = peer_rel[master]

    # Be paranoid since we are about to destroy data.
    assert not reactive.helpers.is_state("postgresql.replication.is_master")
    assert not reactive.helpers.is_state("postgresql.cluster.is_running")

    # We use realpath on data_dir as it may have been replaced with
    # a symbolic link, so we empty and recreate the actual directory
    # and the links remain in place.
    data_dir = os.path.realpath(postgresql.data_dir())

    if os.path.exists(data_dir):
        hookenv.log("Removing {} in preparation for clone".format(data_dir))
        shutil.rmtree(data_dir)
    helpers.makedirs(data_dir, mode=0o700, user="******", group="postgres")

    if postgresql.has_version("10"):
        wal_method = "--wal-method=stream"
    else:
        wal_method = "--xlog-method=stream"
    cmd = [
        "sudo",
        "-H",  # -H needed to locate $HOME/.pgpass
        "-u",
        "postgres",
        "pg_basebackup",
        "-D",
        postgresql.data_dir(),
        "-h",
        master_relinfo["host"],
        "-p",
        master_relinfo["port"],
        "--checkpoint=fast",
        "--progress",
        wal_method,
        "--no-password",
        "--username=_juju_repl",
    ]
    hookenv.log("Cloning {} with {}".format(master, " ".join(cmd)))
    status_set("maintenance", "Cloning {}".format(master))
    try:
        # Switch to a directory the postgres user can access.
        with helpers.switch_cwd("/tmp"):
            subprocess.check_call(cmd, universal_newlines=True)
    except subprocess.CalledProcessError as x:
        hookenv.log("Clone failed with {}".format(x), ERROR)
        # We failed, and the local cluster is broken.
        status_set("blocked", "Failed to clone {}".format(master))
        postgresql.drop_cluster()
        reactive.remove_state("postgresql.cluster.configured")
        reactive.remove_state("postgresql.cluster.created")
        # Terminate. We need this hook to exit, rather than enter a loop.
        raise SystemExit(0)

    update_recovery_conf(follow=master)

    reactive.set_state("postgresql.replication.cloned")
    update_replication_states()
Example #31
0
def upgrade_charm():
    workloadstatus.status_set("maintenance", "Upgrading charm")

    rels = context.Relations()

    # The master is now appointed by the leader.
    if hookenv.is_leader():
        master = replication.get_master()
        if not master:
            master = hookenv.local_unit()
            peer_rel = helpers.get_peer_relation()
            if peer_rel:
                for peer_relinfo in peer_rel.values():
                    if peer_relinfo.get("state") == "master":
                        master = peer_relinfo.unit
                        break
            hookenv.log("Discovered {} is the master".format(master))
            leadership.leader_set(master=master)

    # The name of this crontab has changed. It will get regenerated.
    if os.path.exists("/etc/cron.d/postgresql"):
        hookenv.log("Removing old crontab")
        os.unlink("/etc/cron.d/postgresql")

    # Older usernames were generated from the relation id,
    # and really old ones contained random components. This made it
    # problematic to restore a database into a fresh environment,
    # because the new usernames would not match the old usernames and
    # none of the database permissions would match. We now generate
    # usernames using just the client service name, so restoring a
    # database into a fresh environment will work provided the service
    # names match. We want to update the old usernames in upgraded
    # services to the new format to improve their disaster recovery
    # story.
    for relname, superuser in [("db", False), ("db-admin", True)]:
        for client_rel in rels[relname].values():
            hookenv.log("Migrating database users for {}".format(client_rel))
            password = client_rel.local.get("password", host.pwgen())
            old_username = client_rel.local.get("user")
            new_username = postgresql.username(client_rel.service, superuser,
                                               False)
            if old_username and old_username != new_username:
                migrate_user(old_username, new_username, password, superuser)
                client_rel.local["user"] = new_username
                client_rel.local["password"] = password

            old_username = client_rel.local.get("schema_user")
            if old_username and old_username != new_username:
                migrate_user(old_username, new_username, password, superuser)
                client_rel.local["schema_user"] = new_username
                client_rel.local["schema_password"] = password

    # Admin relations used to get 'all' published as the database name,
    # which was bogus.
    for client_rel in rels["db-admin"].values():
        if client_rel.local.get("database") == "all":
            client_rel.local["database"] = client_rel.service

    # Reconfigure PostgreSQL and republish client relations.
    reactive.remove_state("postgresql.cluster.configured")
    reactive.remove_state("postgresql.client.published")

    # Don't recreate the cluster.
    reactive.set_state("postgresql.cluster.created")

    # Set the postgresql.replication.cloned flag, so we don't rebuild
    # standbys when upgrading the charm from a pre-reactive version.
    reactive.set_state("postgresql.replication.cloned")

    # Publish which node we are following
    peer_rel = helpers.get_peer_relation()
    if peer_rel and "following" not in peer_rel.local:
        following = unitdata.kv().get("postgresql.replication.following")
        if following is None and not replication.is_master():
            following = replication.get_master()
        peer_rel.local["following"] = following

    # Ensure storage that was attached but ignored is no longer ignored.
    if not reactive.is_state("postgresql.storage.pgdata.attached"):
        if hookenv.storage_list("pgdata"):
            storage.attach()

    # Ensure client usernames and passwords match leader settings.
    for relname in ("db", "db-admin"):
        for rel in rels[relname].values():
            del rel.local["user"]
            del rel.local["password"]

    # Ensure the configured version is cached.
    postgresql.version()

    # Skip checks for pre-existing databases, as that has already happened.
    reactive.set_state("postgresql.cluster.initial-check")

    # Reinstall support scripts
    reactive.remove_state("postgresql.cluster.support-scripts")

    # Ensure that systemd is managing the PostgreSQL process
    if (host.init_is_systemd() and
            not reactive.is_flag_set("postgresql.upgrade.systemd")):
        reactive.set_flag("postgresql.upgrade.systemd")
        if reactive.is_flag_set("postgresql.cluster.is_running"):
            hookenv.log("Restarting PostgreSQL under systemd", hookenv.WARNING)
            reactive.clear_flag("postgresql.cluster.is_running")
            postgresql.stop_pgctlcluster()

    # Update the PGDG source, in case the signing key has changed.
    config = hookenv.config()
    if config["pgdg"]:
        service.add_pgdg_source()
Example #32
0
def depart_storage_relation():
    status_set('blocked',
               'Unable to continue after departing block storage relation')
Example #33
0
def wait_for_peers():
    """Wait if there are no peers and we are not the master."""
    status_set("waiting", "Waiting for peers")
Example #34
0
def wait_for_restart():
    status_set('waiting', 'Waiting for permission to restart')
Example #35
0
def stop():
    status_set('maintenance', 'Stopping PostgreSQL')
    postgresql.stop()
    reactive.remove_state('postgresql.cluster.is_running')
Example #36
0
def wait_for_restart():
    status_set("waiting", "Waiting for permission to restart")
Example #37
0
def block_on_bad_juju():
    if not hookenv.has_juju_version('1.24'):
        status_set('blocked', 'Requires Juju 1.24 or higher')
        # Error state, since we don't have 1.24 to give a nice blocked state.
        raise SystemExit(1)
Example #38
0
def diverged_timeline():
    status_set("maintenance", "Diverged timeline")
    # Don't shutdown without the coordinator lock. Most likely,
    # this unit is being destroyed and shouldn't reclone.
    reactive.set_state("postgresql.cluster.needs_restart")
Example #39
0
def wait_for_failover():
    """Failover in progress."""
    status_set("waiting", "Failover from {}".format(get_master()))
Example #40
0
def wait_for_master():
    """Master appointed but not available to this unit."""
    status_set("waiting", "Waiting for master {}".format(get_master()))
Example #41
0
def upgrade_charm():
    workloadstatus.status_set('maintenance', 'Upgrading charm')

    rels = context.Relations()

    # The master is now appointed by the leader.
    if hookenv.is_leader():
        master = replication.get_master()
        if not master:
            master = hookenv.local_unit()
            if rels.peer:
                for peer_relinfo in rels.peer.values():
                    if peer_relinfo.get('state') == 'master':
                        master = peer_relinfo.unit
                        break
            hookenv.log('Discovered {} is the master'.format(master))
            leadership.leader_set(master=master)

    # The name of this crontab has changed. It will get regenerated.
    if os.path.exists('/etc/cron.d/postgresql'):
        hookenv.log('Removing old crontab')
        os.unlink('/etc/cron.d/postgresql')

    # Older usernames were generated from the relation id,
    # and really old ones contained random components. This made it
    # problematic to restore a database into a fresh environment,
    # because the new usernames would not match the old usernames and
    # none of the database permissions would match. We now generate
    # usernames using just the client service name, so restoring a
    # database into a fresh environment will work provided the service
    # names match. We want to update the old usernames in upgraded
    # services to the new format to improve their disaster recovery
    # story.
    for relname, superuser in [('db', False), ('db-admin', True)]:
        for client_rel in rels[relname].values():
            hookenv.log('Migrating database users for {}'.format(client_rel))
            password = client_rel.local.get('password', host.pwgen())
            old_username = client_rel.local.get('user')
            new_username = postgresql.username(client_rel.service,
                                               superuser, False)
            if old_username and old_username != new_username:
                migrate_user(old_username, new_username, password, superuser)
                client_rel.local['user'] = new_username
                client_rel.local['password'] = password

            old_username = client_rel.local.get('schema_user')
            if old_username and old_username != new_username:
                migrate_user(old_username, new_username, password, superuser)
                client_rel.local['schema_user'] = new_username
                client_rel.local['schema_password'] = password

    # Admin relations used to get 'all' published as the database name,
    # which was bogus.
    for client_rel in rels['db-admin'].values():
        if client_rel.local.get('database') == 'all':
            client_rel.local['database'] = client_rel.service

    # Reconfigure PostgreSQL and republish client relations.
    reactive.remove_state('postgresql.cluster.configured')
    reactive.remove_state('postgresql.client.published')

    # Don't recreate the cluster.
    reactive.set_state('postgresql.cluster.created')

    # Set the postgresql.replication.cloned flag, so we don't rebuild
    # standbys when upgrading the charm from a pre-reactive version.
    reactive.set_state('postgresql.replication.cloned')
Example #42
0
def wal_e_restore():
    reactive.remove_state("action.wal-e-restore")
    params = hookenv.action_get()
    backup = params["backup-name"].strip().replace("-", "_")
    storage_uri = params["storage-uri"].strip()

    ship_uri = hookenv.config().get("wal_e_storage_uri")
    if storage_uri == ship_uri:
        hookenv.action_fail(
            "The storage-uri parameter is identical to "
            "the wal_e_storage_uri config setting. Your "
            "restoration source cannot be the same as the "
            "folder you are archiving too to avoid corrupting "
            "the backups."
        )
        return

    if not params["confirm"]:
        m = "Recovery from {}.".format(storage_uri)
        if ship_uri:
            m += "\nContents of {} will be destroyed.".format(ship_uri)
        m += "\nExisting local database will be destroyed."
        m += "\nRerun action with 'confirm=true' to proceed."
        hookenv.action_set({"info": m})
        return

    with tempfile.TemporaryDirectory(prefix="wal-e", suffix="envdir") as envdir:
        update_wal_e_env_dir(envdir, storage_uri)

        # Confirm there is a backup to restore
        backups = wal_e_list_backups(envdir)
        if not backups:
            hookenv.action_fail("No backups found at {}".format(storage_uri))
            return
        if backup != "LATEST" and backup not in (b["name"] for b in backups):
            hookenv.action_fail("Backup {} not found".format(backup))
            return

        # Shutdown PostgreSQL. Note we want this action to run synchronously,
        # so there is no opportunity to ask permission from the leader. If
        # there are other units cloning this database, those clone operations
        # will fail. Which seems preferable to blocking a recovery operation
        # in any case, because if we are doing disaster recovery we generally
        # want to do it right now.
        status_set("maintenance", "Stopping PostgreSQL for backup restoration")
        postgresql.stop()

        # Trash the existing database. It's dangerous to do this first, but
        # we probably need the space.
        data_dir = postgresql.data_dir()  # May be a symlink
        for content in os.listdir(data_dir):
            cpath = os.path.join(data_dir, content)
            if os.path.isdir(cpath) and not os.path.islink(cpath):
                shutil.rmtree(cpath)
            else:
                os.remove(cpath)

        # WAL-E recover
        status_set("maintenance", "Restoring backup {}".format(backup))
        wal_e_run(["backup-fetch", data_dir, backup], envdir=envdir)

        # Create recovery.conf to complete recovery
        is_master = reactive.is_state("postgresql.replication.is_master")
        standby_mode = "off" if is_master else "on"
        if params.get("target-time"):
            target_time = "recovery_target_time='{}'" "".format(params["target-time"])
        else:
            target_time = ""
        target_action = "promote" if is_master else "shutdown"
        immediate = "" if is_master else "recovery_target='immediate'"
        helpers.write(
            postgresql.recovery_conf_path(),
            dedent(
                """\
                             # Managed by Juju. PITR in progress.
                             standby_mode = {}
                             restore_command='{}'
                             recovery_target_timeline = {}
                             recovery_target_action = {}
                             {}
                             {}
                             """
            ).format(
                standby_mode,
                wal_e_restore_command(envdir=envdir),
                params["target-timeline"],
                target_action,
                target_time,
                immediate,
            ),
            mode=0o600,
            user="******",
            group="postgres",
        )

        # Avoid circular import. We could also avoid the import entirely
        # with a sufficiently complex set of handlers in the replication
        # module, but that seems to be a worse solution. Better to break
        # out this action into a separate module.
        from reactive.postgresql import replication

        if is_master:
            if ship_uri:
                # If master, trash the configured wal-e storage. This may
                # contain WAL and backups from the old cluster which will
                # conflict with the new cluster. Hopefully it does not
                # contain anything important, because we have no way to
                # prompt the user for confirmation.
                wal_e_run(["delete", "--confirm", "everything"])

            # Then, wait for recovery and promotion.
            postgresql.start()
            con = postgresql.connect()
            cur = con.cursor()
            while True:
                if postgresql.has_version("10"):
                    cur.execute(
                        """SELECT pg_is_in_recovery(),
                                          pg_last_wal_replay_lsn()"""
                    )
                else:
                    cur.execute(
                        """SELECT pg_is_in_recovery(),
                                          pg_last_xlog_replay_location()"""
                    )
                in_rec, loc = cur.fetchone()
                if not in_rec:
                    break
                status_set("maintenance", "Recovery at {}".format(loc))
                time.sleep(10)
        else:
            # If standby, startup and wait for recovery to complete and
            # shutdown.
            status_set("maintenance", "Recovery")
            # Startup might shutdown immediately and look like a failure.
            postgresql.start(ignore_failure=True)
            # No recovery point status yet for standbys, as we would need
            # to handle connection failures when the DB shuts down. We
            # should do this.
            while postgresql.is_running():
                time.sleep(5)
            replication.update_recovery_conf(follow=replication.get_master())

    # Reactive handlers will deal with the rest of the cleanup.
    # eg. ensuring required users and roles exist
    replication.update_replication_states()
    reactive.remove_state("postgresql.cluster.configured")
    reactive.toggle_state("postgresql.cluster.is_running", postgresql.is_running())
    reactive.remove_state("postgresql.nagios.user_ensured")
    reactive.remove_state("postgresql.replication.replication_user_created")
    reactive.remove_state("postgresql.client.published")
Example #43
0
def block_on_invalid_config():
    """
    Sanity check charm configuration, blocking the unit if we have
    bogus config values or config changes the charm does not
    yet (or cannot) support.

    We need to do this before the main reactive loop (@preflight),
    or we risk failing to run handlers that rely on @when_file_changed,
    reactive.helpers.data_changed or similar state tied to
    charmhelpers.core.unitdata transactions.
    """
    valid = True
    config = hookenv.config()

    enums = dict(
        version=set(["", "9.5", "9.6", "10", "11", "12"]),
        package_status=set(["install", "hold"]),
    )
    for key, vals in enums.items():
        config[key] = (config.get(key) or "").lower()
        if config[key] not in vals:
            valid = False
            status_set("blocked",
                       "Invalid value for {} ({!r})".format(key, config[key]))

    unchangeable_config = ["locale", "encoding", "manual_replication"]
    if config._prev_dict is not None:
        for name in unchangeable_config:
            if config.changed(name):
                valid = False
                status_set(
                    "blocked",
                    "Cannot change {!r} after install "
                    "(from {!r} to {!r}).".format(name, config.previous(name),
                                                  config[name]),
                )
                config[name] = config.previous(name)
        if config.changed("version") and (config.previous("version") !=
                                          postgresql.version()):
            valid = False
            status_set(
                "blocked",
                "Cannot change version after install "
                "(from {!r} to {!r}).".format(config.previous("version"),
                                              config["version"]),
            )
            config["version"] = config.previous("version")
            valid = False

    metrics_target = config["metrics_target"].strip()
    if metrics_target:
        if ":" not in metrics_target:
            status_set("blocked",
                       "Invalid metrics_target {}".format(metrics_target))
            valid = False
        metrics_interval = config["metrics_sample_interval"]
        if not metrics_interval:
            status_set(
                "blocked",
                "metrics_sample_interval is required when "
                "metrics_target is set",
            )
            valid = False

    if not valid:
        raise SystemExit(0)
Example #44
0
def wait_for_clone():
    status_set("waiting",
               "Waiting for permission to clone {}".format(get_master()))
Example #45
0
def depart_storage_relation():
    status_set("blocked",
               "Unable to continue after departing block storage relation")