def update_nrpe_config():
    """Register this unit's NRPE checks and write the NRPE configuration.

    Calls ``update_nagios_pgpass()``, adds a ``pgsql`` connection check
    and a ``pgsql_backups`` check, writes the configuration out and then
    clears the ``postgresql.nagios.needs_update`` reactive state.
    """
    update_nagios_pgpass()

    nrpe = NRPE()
    user = nagios_username()
    port = postgresql.port()
    pgsql_cmd = "check_pgsql -P {} -l {}".format(port, user)
    nrpe.add_check(shortname="pgsql", description="Check pgsql", check_cmd=pgsql_cmd)

    if reactive.is_state("postgresql.replication.is_master"):
        # TODO: These thresholds should be calculated from the backup
        # schedule, which is difficult since that is specified in
        # crontab format.
        warn_age = 172800
        crit_age = 194400
        backups_cmd = "check_file_age -w {} -c {} -f {}".format(
            warn_age, crit_age, helpers.backups_log_path()
        )
    else:
        # Standbys don't do backups. A check is still generated so that
        # alerts get through to monitoring after a failover.
        backups_cmd = r"check_dummy 0 standby_does_not_backup"

    nrpe.add_check(
        shortname="pgsql_backups",
        description="Check pgsql backups",
        check_cmd=backups_cmd,
    )

    nrpe.write()
    reactive.remove_state("postgresql.nagios.needs_update")
def test_port(self, pgconf_path):
    """postgresql.port() returns the port configured in postgresql.conf.

    ``pgconf_path`` is the patched path lookup; its return value is
    pointed at a temporary file for each scenario.
    """
    # Each scenario: (lines written to the config file, expected port).
    scenarios = (
        (
            ("# Some rubbish\n", " Port = 1234 # Picked by pg_createcluster(1)\n"),
            1234,
        ),
        (
            ("port='1235'\n", "# Some rubbish\n"),
            1235,
        ),
        # Nothing written at all: fallback to default.
        ((), 5432),
    )

    for conf_lines, expected_port in scenarios:
        # The assertion must run while the temporary file still exists.
        with tempfile.NamedTemporaryFile("w") as conf_file:
            for line in conf_lines:
                conf_file.write(line)
            if conf_lines:
                conf_file.flush()
            pgconf_path.return_value = conf_file.name
            self.assertEqual(postgresql.port(), expected_port)
def test_port(self, pgconf_path):
    """Check that postgresql.port() pulls the port from postgresql.conf."""

    def assert_port(conf_text, expected):
        # Point the patched config path at a throwaway file holding
        # conf_text, then compare the parsed port while the file exists.
        with tempfile.NamedTemporaryFile('w') as conf:
            if conf_text:
                conf.write(conf_text)
                conf.flush()
            pgconf_path.return_value = conf.name
            self.assertEqual(postgresql.port(), expected)

    # Mixed-case key, surrounding whitespace and a trailing comment.
    assert_port(
        '# Some rubbish\n'
        ' Port = 1234 # Picked by pg_createcluster(1)\n',
        1234,
    )

    # Quoted value with no whitespace around the equals sign.
    assert_port(
        "port='1235'\n"
        '# Some rubbish\n',
        1235,
    )

    # Empty file: fallback to default.
    assert_port('', 5432)
def db_relation_common(rel):
    """Publish this unit's details on the given client relation.

    Does nothing until ``database`` has been set in the unit's local
    relation data. Otherwise sets ``version``, ``state``, ``host``,
    ``port`` and ``allowed-units`` in ``rel.local``.
    """
    local = rel.local
    if "database" not in local:
        # Not yet ready.
        return

    # Clients can inspect the version and adjust, or block, when their
    # expectations are not met.
    local["version"] = postgresql.version()

    # Work out the state of this unit. 'standalone' will disappear in a
    # future version of this interface; it was only needed to deal with
    # race conditions now solved by Juju leadership. is_primary() is
    # used rather than the postgresql.replication.is_master reactive
    # state so the correct state is published under manual replication
    # (there might be multiple independent masters, possibly useful for
    # sharding, or perhaps this is a multi master BDR setup).
    if not postgresql.is_primary():
        state = "hot standby"
    elif reactive.helpers.is_state("postgresql.replication.has_peers"):
        state = "master"
    else:
        state = "standalone"
    local["state"] = state

    # The host is the private IP address, but this might change and
    # become the address of an attached proxy or alternative peer if
    # this unit is in maintenance.
    local["host"] = hookenv.unit_private_ip()

    # The port will be 5432 unless the user has overridden it, or
    # something very weird happened when the packages were installed.
    local["port"] = str(postgresql.port())

    # Remote units on this relation that have been granted access.
    # Publishing the list avoids the race where a new client unit joins
    # an existing client relation and sees valid credentials before we
    # have had a chance to grant it access.
    granted = [unit for unit, relinfo in rel.items() if "private-address" in relinfo]
    local["allowed-units"] = " ".join(granted)
def db_relation_common(rel):
    """Publish unit specific relation details.

    Does nothing until ``database`` has been set in the unit's local
    relation data. Otherwise sets ``version``, ``state``, ``host``,
    ``port``, ``allowed-units`` and ``allowed-subnets``, plus the v2
    protocol keys ``master`` and ``standbys``, in ``rel.local``.
    """
    local = rel.local
    if "database" not in local:
        return  # Not yet ready.

    # Version number, allowing clients to adjust or block if their
    # expectations are not met.
    local["version"] = postgresql.version()

    # Calculate the state of this unit. 'standalone' will disappear
    # in a future version of this interface, as this state was
    # only needed to deal with race conditions now solved by
    # Juju leadership. We check for is_primary() rather than
    # the postgresql.replication.is_master reactive state to
    # publish the correct state when we are using manual replication
    # (there might be multiple independent masters, possibly useful for
    # sharding, or perhaps this is a multi master BDR setup).
    if postgresql.is_primary():
        if reactive.helpers.is_state("postgresql.replication.has_peers"):
            local["state"] = "master"
        else:
            local["state"] = "standalone"
    else:
        local["state"] = "hot standby"

    # Host is the ingress address for this relation, but this might
    # change and become the address of an attached proxy or alternative
    # peer if this unit is in maintenance.
    local["host"] = ingress_address(local.relname, local.relid)

    # Port will be 5432, unless the user has overridden it or
    # something very weird happened when the packages were installed.
    local["port"] = str(postgresql.port())

    # The list of remote units on this relation granted access.
    # This is to avoid the race condition where a new client unit
    # joins an existing client relation and sees valid credentials,
    # before we have had a chance to grant it access.
    local["allowed-units"] = " ".join(
        unit for unit, relinfo in rel.items() if incoming_addresses(relinfo)
    )

    # The list of IP address ranges on this relation granted access.
    # This will replace allowed-units, which does not work with cross
    # model relations due to the anonymization of the external client.
    # Addresses are de-duplicated and sorted so the published value is
    # stable between hook runs.
    subnets = set(
        chain.from_iterable(incoming_addresses(relinfo) for relinfo in rel.values())
    )
    local["allowed-subnets"] = ",".join(sorted(subnets))

    # v2 protocol. Publish connection strings for this unit and its peers.
    # Clients should use these connection strings in favour of the old
    # host, port, database settings. A single proxy unit can thus
    # publish several end points to clients.
    master = replication.get_master()
    if replication.is_master():
        master_relinfo = local
    else:
        master_relinfo = rel.peers.get(master)
    local["master"] = relinfo_to_cs(master_relinfo)

    # Every unit other than the master is a standby: all peers (when
    # there are any) plus this unit itself.
    all_relinfo = list(rel.peers.values()) if rel.peers else []
    all_relinfo.append(rel.local)
    # Falsy connection strings are dropped before sorting.
    standbys = [
        cs
        for cs in (
            relinfo_to_cs(relinfo) for relinfo in all_relinfo if relinfo.unit != master
        )
        if cs
    ]
    local["standbys"] = "\n".join(sorted(standbys)) or None
def update_nrpe_config():
    """Register this unit's NRPE checks and write the NRPE configuration.

    Steps, in order:

    * call ``update_nagios_pgpass()``;
    * add the ``pgsql`` connection check;
    * install ``find_latest_ready_wal.py`` and make sure its output file
      exists;
    * when both ``wal_archive_warn_threshold`` and
      ``wal_archive_crit_threshold`` are set in the charm config,
      install the cron job, the ``check_latest_ready_wal.py`` plugin and
      the ``pgsql_stale_wal`` check;
    * add the ``pgsql_backups`` check (a real file-age check on the
      master, a dummy always-OK check elsewhere);
    * write the NRPE configuration and clear the
      ``postgresql.nagios.needs_update`` reactive state.
    """
    update_nagios_pgpass()
    nrpe = NRPE()

    user = nagios_username()
    port = postgresql.port()
    nrpe.add_check(
        shortname="pgsql",
        description="Check pgsql",
        check_cmd="check_pgsql -P {} -l {}".format(port, user),
    )

    # Copy the check script which will run cronned as the postgres user.
    # The source path is relative to the current working directory.
    with open("scripts/find_latest_ready_wal.py") as fh:
        wal_script = fh.read()
    wal_script_path = "{}/{}".format(helpers.scripts_dir(), "find_latest_ready_wal.py")
    helpers.write(wal_script_path, wal_script, mode=0o755)

    # Create the output file for the above script if it is missing. With
    # mode 0o644 only the owner may write, and the cron job below runs
    # as postgres, so the file has to be owned by postgres.
    check_output_path = "/var/lib/nagios/postgres-wal-max-age.txt"
    if not os.path.exists(check_output_path):
        helpers.write(check_output_path, b"0\n", mode=0o644, user="postgres", group="postgres")

    # Retrieve the threshold values from the charm config; unset or
    # empty values are treated as 0, which disables the WAL check.
    config = hookenv.config()
    check_warn_threshold = config["wal_archive_warn_threshold"] or 0
    check_crit_threshold = config["wal_archive_crit_threshold"] or 0

    check_cron_path = "/etc/cron.d/postgres-wal-archive-check"
    if check_warn_threshold and check_crit_threshold:
        # Create the cron job to run the script every two minutes. The
        # entry must end with a newline, as cron requires every crontab
        # entry to be newline terminated.
        check_cron = "*/2 * * * * postgres {}\n".format(wal_script_path)
        helpers.write(check_cron_path, check_cron, mode=0o644)

        # Copy the nagios plugin which will check the cronned output.
        with open("scripts/check_latest_ready_wal.py") as fh:
            plugin_script = fh.read()
        plugin_path = "{}/{}".format("/usr/local/lib/nagios/plugins", "check_latest_ready_wal.py")
        helpers.write(plugin_path, plugin_script, mode=0o755)

        # Write the nagios check definition.
        nrpe.add_check(
            shortname="pgsql_stale_wal",
            description="Check for stale WAL backups",
            check_cmd="{} {} {}".format(plugin_path, check_warn_threshold, check_crit_threshold),
        )
    # NOTE(review): when the thresholds are later unset, a previously
    # written cron file and pgsql_stale_wal check are not removed here -
    # confirm whether cleanup happens elsewhere.

    if reactive.is_state("postgresql.replication.is_master"):
        # TODO: These should be calculated from the backup schedule,
        # which is difficult since that is specified in crontab format.
        warn_age = 172800
        crit_age = 194400
        backups_log = helpers.backups_log_path()
        nrpe.add_check(
            shortname="pgsql_backups",
            description="Check pgsql backups",
            check_cmd="check_file_age -w {} -c {} -f {}".format(warn_age, crit_age, backups_log),
        )
    else:
        # Standbys don't do backups. We still generate a check though,
        # to ensure alerts get through to monitoring after a failover.
        nrpe.add_check(
            shortname="pgsql_backups",
            description="Check pgsql backups",
            check_cmd=r"check_dummy 0 standby_does_not_backup",
        )

    nrpe.write()
    reactive.remove_state("postgresql.nagios.needs_update")
def publish_replication_details():
    """Publish this unit's connection details on the peer relation.

    Sets ``host``, ``port`` and ``allowed-units`` in the unit's local
    peer relation data. Does nothing when ``helpers.get_peer_relation()``
    returns None.
    """
    peer = helpers.get_peer_relation()
    if peer is None:
        return

    peer.local["host"] = hookenv.unit_private_ip()
    peer.local["port"] = str(postgresql.port())

    # Every key of the peer relation is listed, sorted for a stable value.
    peer_units = sorted(peer.keys())
    peer.local["allowed-units"] = " ".join(peer_units)