def update_nrpe_checks():
    """Install the RabbitMQ Nagios plugin and register its NRPE check.

    When the ``NAGIOS_PLUGINS`` directory exists, ``check_rabbitmq.py`` is
    rsynced into it from the charm's ``scripts`` directory. The monitoring
    vhost and user are then created through the ``rabbit`` helper and an
    NRPE check invoking the plugin with those credentials is written.

    The monitoring password is generated with ``pwgen`` on first use and
    stored in ``<RABBIT_DIR>/<user>.passwd``; later calls reuse the stored
    value.
    """
    if os.path.isdir(NAGIOS_PLUGINS):
        rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
                           'check_rabbitmq.py'),
              os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py'))

    user = '******'
    vhost = 'nagios'
    password_file = os.path.join(RABBIT_DIR, '%s.passwd' % user)
    if os.path.exists(password_file):
        # Close the handle deterministically instead of leaking it.
        with open(password_file) as stored:
            password = stored.read().strip()
    else:
        # universal_newlines=True yields text rather than bytes, so the
        # freshly generated password has the same type as one re-read from
        # disk and formats cleanly into the check command below.
        password = subprocess.check_output(
            ['pwgen', '64', '1'], universal_newlines=True).strip()
        with open(password_file, 'w') as out:
            out.write(password)

    rabbit.create_vhost(vhost)
    rabbit.create_user(user, password)
    rabbit.grant_permissions(user, vhost)

    nrpe_compat = NRPE()
    nrpe_compat.add_check(
        shortname=rabbit.RABBIT_USER,
        description='Check RabbitMQ',
        check_cmd='{}/check_rabbitmq.py --user {} --password {} --vhost {}'
                  ''.format(NAGIOS_PLUGINS, user, password, vhost)
    )
    nrpe_compat.write()
def update_nrpe_checks():
    """Install the RabbitMQ Nagios plugin and register its NRPE check.

    Steps, in order:

    1. If ``NAGIOS_PLUGINS`` is a directory, rsync ``check_rabbitmq.py``
       from ``$CHARM_DIR/scripts`` into it.
    2. Load the monitoring password from ``<RABBIT_DIR>/<user>.passwd``,
       or generate one with ``pwgen`` and store it there.
    3. Create the vhost and user and grant permissions via ``rabbit``.
    4. Add the NRPE check that runs the plugin, then write the NRPE config.
    """
    if os.path.isdir(NAGIOS_PLUGINS):
        rsync(
            os.path.join(os.getenv('CHARM_DIR'), 'scripts',
                         'check_rabbitmq.py'),
            os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py'))

    user = '******'
    vhost = 'nagios'
    password_file = os.path.join(RABBIT_DIR, '%s.passwd' % user)
    if os.path.exists(password_file):
        # Use a context manager so the file handle is not left open.
        with open(password_file) as stored:
            password = stored.read().strip()
    else:
        cmd = ['pwgen', '64', '1']
        # Ask for text output: bytes would be rendered as "b'...'" when
        # formatted into the check command, and would not match the text
        # value read back from the password file on later runs.
        password = subprocess.check_output(
            cmd, universal_newlines=True).strip()
        with open(password_file, 'w') as out:
            out.write(password)

    rabbit.create_vhost(vhost)
    rabbit.create_user(user, password)
    rabbit.grant_permissions(user, vhost)

    nrpe_compat = NRPE()
    nrpe_compat.add_check(
        shortname=rabbit.RABBIT_USER,
        description='Check RabbitMQ',
        check_cmd='{}/check_rabbitmq.py --user {} --password {} --vhost {}'
                  ''.format(NAGIOS_PLUGINS, user, password, vhost))
    nrpe_compat.write()
def render_checks(self, creds):
    """Render the credentials file and register the Contrail alarms check.

    ``creds`` is passed as the template context when rendering
    ``keystone.yaml`` to ``self.oscreds``. The charm's bundled plugins are
    then rsynced into ``self.plugins_dir`` (created if missing) and an NRPE
    check pointing at ``check_contrail_alarms.py`` is written.
    """
    render(
        source='keystone.yaml',
        target=self.oscreds,
        context=creds,
        owner='nagios',
        group='nagios',
    )
    nrpe = NRPE()

    if not os.path.exists(self.plugins_dir):
        os.makedirs(self.plugins_dir)

    # Trailing slash on the source path is part of the rsync argument.
    bundled_plugins = os.path.join(hookenv.charm_dir(), 'files', 'plugins/')
    host.rsync(bundled_plugins, self.plugins_dir,
               options=['--executability'])

    alarms_plugin = os.path.join(self.plugins_dir,
                                 'check_contrail_alarms.py')
    check_definition = {
        'shortname': 'contrail_alarms',
        'description': 'Check Contrail alarms',
        'check_cmd': alarms_plugin,
    }
    nrpe.add_check(**check_definition)
    nrpe.write()
def configure_rally_check(self):
    """Register the rally NRPE check once per unit.

    The ``rallyconfigured`` flag in the unit's key/value store records that
    the check was already set up; when it is set this method does nothing.
    """
    store = unitdata.kv()
    if not store.get('rallyconfigured', False):
        self.update_rally_checkfiles()

        plugin_path = os.path.join(self.plugins_dir, 'check_rally.py')
        nrpe = NRPE()
        nrpe.add_check(
            shortname='rally',
            description='Check that all rally tests pass',
            check_cmd=plugin_path,
        )
        nrpe.write()

        # Remember that the check exists so later calls return early.
        store.set('rallyconfigured', True)
def update_nrpe_checks():
    """Rewrite the NRPE checks for the MySQL process and connectivity.

    The Nagios hostname is taken from the first ``nrpe-external-master``
    relation entry that provides ``nagios_hostname``; if none does, ``None``
    is passed to ``NRPE``.
    """
    log("Refreshing nrpe checks")

    # First relation entry carrying nagios_hostname wins; default is None.
    hostname = next(
        (
            entry["nagios_hostname"]
            for entry in relations_of_type("nrpe-external-master")
            if "nagios_hostname" in entry
        ),
        None,
    )

    nrpe = NRPE(hostname=hostname)
    nrpe.add_check(
        shortname="mysql_proc",
        description="Check MySQL process",
        check_cmd="check_procs -c 1:1 -C mysqld",
    )

    connectivity_cmd = "check_mysql -u nagios -p {}".format(nagios_password())
    nrpe.add_check(
        shortname="mysql",
        description="Check MySQL connectivity",
        check_cmd=connectivity_cmd,
    )
    nrpe.write()
def update_nagios(svc):
    """Configure the Nagios checks for vault.

    Adds the init-service check for ``vault``, installs the
    ``check_vault_version.py`` plugin under ``/usr/lib/nagios/plugins`` and
    registers the ``vault_version`` NRPE check, then sets the
    ``vault.nrpe.configured`` state.

    :param svc: not referenced by this function.
    """
    status_set('maintenance', 'configuring Nagios checks')
    hostname = get_nagios_hostname()
    current_unit = get_nagios_unit_name()
    nrpe = NRPE(hostname=hostname)
    add_init_service_checks(nrpe, ['vault'], current_unit)

    # Read the bundled plugin through a context manager so the file handle
    # is closed instead of being leaked.
    with open('files/nagios/check_vault_version.py', 'rb') as plugin:
        plugin_source = plugin.read()
    write_file('/usr/lib/nagios/plugins/check_vault_version.py',
               plugin_source,
               perms=0o755)

    nrpe.add_check(
        'vault_version',
        'Check running vault server version is same as installed snap',
        '/usr/lib/nagios/plugins/check_vault_version.py',
    )
    nrpe.write()
    set_state('vault.nrpe.configured')
def update_nrpe_checks():
    """Refresh the NRPE checks for the MySQL process and connectivity.

    If an ``nrpe-external-master`` relation entry provides
    ``nagios_hostname``, the first such value is handed to ``NRPE``;
    otherwise the hostname stays ``None``.
    """
    log('Refreshing nrpe checks')

    # Look for a nagios_hostname set by nrpe; stop at the first one found.
    hostname = None
    for relation in relations_of_type('nrpe-external-master'):
        if 'nagios_hostname' not in relation:
            continue
        hostname = relation['nagios_hostname']
        break

    nrpe = NRPE(hostname=hostname)

    process_cmd = 'check_procs -c 1:1 -C mysqld'
    nrpe.add_check(shortname='mysql_proc',
                   description='Check MySQL process',
                   check_cmd=process_cmd)

    connectivity_cmd = 'check_mysql -u nagios -p {}'.format(nagios_password())
    nrpe.add_check(shortname='mysql',
                   description='Check MySQL connectivity',
                   check_cmd=connectivity_cmd)

    nrpe.write()
def update_nagios(svc):
    """Configure the Nagios checks for vault.

    Retires the deprecated ``vault_version`` check (and deletes its plugin
    file if present), adds the init-service check for ``vault``, installs
    the ``check_vault_health.py`` plugin under ``/usr/lib/nagios/plugins``
    and registers the ``vault_health`` NRPE check, then sets the
    ``vault.nrpe.configured`` state.

    :param svc: not referenced by this function.
    """
    status_set('maintenance', 'configuring Nagios checks')
    hostname = get_nagios_hostname()
    current_unit = get_nagios_unit_name()
    nrpe = NRPE(hostname=hostname)
    remove_deprecated_check(nrpe, ['vault_version'])
    add_init_service_checks(nrpe, ['vault'], current_unit)

    # The old plugin may already be gone; that is not an error.
    try:
        os.remove('/usr/lib/nagios/plugins/check_vault_version.py')
    except FileNotFoundError:
        pass

    # Read the bundled plugin through a context manager so the file handle
    # is closed instead of being leaked.
    with open('files/nagios/check_vault_health.py', 'rb') as plugin:
        plugin_source = plugin.read()
    write_file('/usr/lib/nagios/plugins/check_vault_health.py',
               plugin_source,
               perms=0o755)

    nrpe.add_check(
        'vault_health',
        'Check running vault server version and health',
        '/usr/lib/nagios/plugins/check_vault_health.py',
    )
    nrpe.write()
    set_state('vault.nrpe.configured')
def update_nrpe_config():
    """Write the PostgreSQL NRPE checks and clear the needs-update state.

    Two checks are registered: ``pgsql`` (connectivity) and
    ``pgsql_backups``. The backup check inspects the age of the backups log
    on the master and is a constant-OK dummy everywhere else.
    """
    update_nagios_pgpass()
    nrpe = NRPE()

    user = nagios_username()
    port = postgresql.port()
    nrpe.add_check(
        shortname="pgsql",
        description="Check pgsql",
        check_cmd="check_pgsql -P {} -l {}".format(port, user),
    )

    if reactive.is_state("postgresql.replication.is_master"):
        # TODO: derive these thresholds from the backup schedule; that is
        # hard because the schedule is given in crontab format.
        warn_age = 172800
        crit_age = 194400
        backups_log = helpers.backups_log_path()
        backups_cmd = "check_file_age -w {} -c {} -f {}".format(
            warn_age, crit_age, backups_log
        )
    else:
        # Standbys take no backups, but the check is still defined so
        # alerts reach monitoring after a failover.
        backups_cmd = r"check_dummy 0 standby_does_not_backup"

    nrpe.add_check(
        shortname="pgsql_backups",
        description="Check pgsql backups",
        check_cmd=backups_cmd,
    )

    nrpe.write()
    reactive.remove_state("postgresql.nagios.needs_update")
def update_nrpe_config():
    """Write the PostgreSQL NRPE checks and clear the needs-update state.

    Registers the ``pgsql`` connectivity check, installs the
    ``find_latest_ready_wal.py`` helper script, and, when both WAL archive
    thresholds are set in the charm config, installs a cron job running
    that helper plus the ``pgsql_stale_wal`` check that reads its output.
    Finally registers ``pgsql_backups``: a file-age check on the master, a
    constant-OK dummy otherwise.
    """
    update_nagios_pgpass()
    nrpe = NRPE()

    user = nagios_username()
    port = postgresql.port()
    nrpe.add_check(
        shortname="pgsql",
        description="Check pgsql",
        check_cmd="check_pgsql -P {} -l {}".format(port, user),
    )

    # copy the check script which will run cronned as postgres user
    with open("scripts/find_latest_ready_wal.py") as fh:
        check_script = fh.read()

    check_script_path = "{}/{}".format(helpers.scripts_dir(), "find_latest_ready_wal.py")
    helpers.write(check_script_path, check_script, mode=0o755)

    # create an (empty) file with appropriate permissions for the above
    check_output_path = "/var/lib/nagios/postgres-wal-max-age.txt"
    if not os.path.exists(check_output_path):
        helpers.write(check_output_path, b"0\n", mode=0o644, user="******", group="postgres")

    # retrieve the threshold values from the charm config
    config = hookenv.config()
    check_warn_threshold = config["wal_archive_warn_threshold"] or 0
    check_crit_threshold = config["wal_archive_crit_threshold"] or 0

    check_cron_path = "/etc/cron.d/postgres-wal-archive-check"
    if check_warn_threshold and check_crit_threshold:
        # create the cron job to run the above
        # The entry must end with a newline: cron ignores (or rejects the
        # whole file for) a final line that is not newline-terminated.
        check_cron = "*/2 * * * * postgres {}\n".format(check_script_path)
        helpers.write(check_cron_path, check_cron, mode=0o644)

        # copy the nagios plugin which will check the cronned output
        with open("scripts/check_latest_ready_wal.py") as fh:
            check_script = fh.read()

        # From here on check_script_path refers to the nagios plugin, not
        # the cronned helper installed above.
        check_script_path = "{}/{}".format("/usr/local/lib/nagios/plugins", "check_latest_ready_wal.py")
        helpers.write(check_script_path, check_script, mode=0o755)

        # write the nagios check definition
        nrpe.add_check(
            shortname="pgsql_stale_wal",
            description="Check for stale WAL backups",
            check_cmd="{} {} {}".format(check_script_path, check_warn_threshold, check_crit_threshold),
        )

    if reactive.is_state("postgresql.replication.is_master"):
        # TODO: These should be calculated from the backup schedule,
        # which is difficult since that is specified in crontab format.
        warn_age = 172800
        crit_age = 194400
        backups_log = helpers.backups_log_path()
        nrpe.add_check(
            shortname="pgsql_backups",
            description="Check pgsql backups",
            check_cmd=("check_file_age -w {} -c {} -f {}" "".format(warn_age, crit_age, backups_log)),
        )
    else:
        # Standbys don't do backups. We still generate a check though,
        # to ensure alerts get through to monitoring after a failover.
        nrpe.add_check(
            shortname="pgsql_backups",
            description="Check pgsql backups",
            check_cmd=r"check_dummy 0 standby_does_not_backup",
        )

    nrpe.write()
    reactive.remove_state("postgresql.nagios.needs_update")
def create_endpoint_checks(self, creds):
    """Create an NRPE check for each Keystone catalog endpoint.

    Read the Keystone catalog, and create a check for each endpoint listed.
    If there is a healthcheck endpoint for the API, use that URL, otherwise
    check the url '/'. If SSL, add a check for the cert.

    v2 endpoint needs the 'interface' attribute:
    <Endpoint {'id': 'XXXXX', 'region': 'RegionOne',
               'publicurl': 'http://10.x.x.x:9696', 'service_id': 'YYY',
               'internalurl': 'http://10.x.x.x:9696', 'enabled': True,
               'adminurl': 'http://10.x.x.x:9696'}>

    Endpoints are skipped when no enabled service matches their
    ``service_id``, or when a v2 endpoint exposes none of the
    ``adminurl``/``internalurl``/``publicurl`` attributes.

    :param creds: credentials passed to ``self.get_keystone_client``.
    """
    # provide URLs that can be used for healthcheck for some services
    # This also provides a nasty hack-ish way to add switches if we need
    # for some services.
    health_check_params = {
        'aodh': '/healthcheck',
        'barbican': '/v1 -e Unauthorized',
        'ceilometer': '/ -e Unauthorized -d x-openstack-request-id',
        'cinderv1': '/v1 -e Unauthorized -d x-openstack-request-id',
        'cinderv2': '/v2 -e Unauthorized',
        'cinderv3': '/v3 -e Unauthorized -d x-openstack-request-id',
        'designate': '/v2 -e Unauthorized',
        'glance': '/healthcheck',
        'gnocchi': '/v1 -e Unauthorized',
        'heat': '/v1 -e Unauthorized',
        'keystone': '/healthcheck',
        'nova': '/healthcheck',
        'octavia': '/v2 -e Unauthorized',
        'placement': '/healthcheck -e Unauthorized -d x-openstack-request-id',
        's3': '/healthcheck',
        'swift': self.charm_config.get('swift_check_params', '/'),
    }

    self.get_keystone_client(creds)
    endpoints = self.keystone_endpoints
    services = [svc for svc in self.keystone_services if svc.enabled]
    nrpe = NRPE()
    for endpoint in endpoints:
        endpoint.service_names = [
            x.name for x in services if x.id == endpoint.service_id
        ]
        if not endpoint.service_names:
            # No enabled service owns this endpoint (disabled or absent
            # from the service list); indexing [0] would raise IndexError.
            continue
        service_name = endpoint.service_names[0]
        endpoint.healthcheck_url = health_check_params.get(
            service_name, '/')

        # Note(aluria): glance-simplestreams-sync does not provide an API
        # to check
        if service_name == 'image-stream':
            continue

        if not hasattr(endpoint, 'interface'):
            if service_name == 'keystone':
                # Note(aluria): filter:healthcheck is not configured in v2
                # https://docs.openstack.org/keystone/pike/configuration.html#health-check-middleware
                continue
            # v2 endpoint: adopt the first of admin/internal/public that
            # has a matching '<interface>url' attribute.
            for interface in 'admin internal public'.split():
                old_interface_name = '{}url'.format(interface)
                if not hasattr(endpoint, old_interface_name):
                    continue
                endpoint.interface = interface
                endpoint.url = getattr(endpoint, old_interface_name)
                break
            else:
                # Nothing usable on this endpoint; without this guard the
                # endpoint.url access below would raise AttributeError.
                continue

        check_url = urlparse(endpoint.url)
        if not self.charm_config.get('check_{}_urls'.format(
                endpoint.interface)):
            # Checks for this interface are disabled: drop any that exist.
            nrpe.remove_check(
                shortname='{}_{}'.format(service_name, endpoint.interface))
            if check_url.scheme == 'https':
                nrpe.remove_check(shortname='{}_{}_cert'.format(
                    service_name, endpoint.interface))
            continue

        cmd_params = ['/usr/lib/nagios/plugins/check_http']
        host, port = self._split_url(check_url.netloc, check_url.scheme)
        cmd_params.append('-H {} -p {}'.format(host, port))
        cmd_params.append('-u {}'.format(endpoint.healthcheck_url))

        # if this is https, we want to add a check for cert expiry
        # also need to tell check_http to use TLS
        if check_url.scheme == 'https':
            cmd_params.append('-S')
            # Add an extra check for TLS cert expiry
            cmd_params_cert = cmd_params.copy()
            cmd_params_cert.append('-C {},{}'.format(
                self.charm_config['tls_warn_days'] or 30,
                self.charm_config['tls_crit_days'] or 14))
            nrpe.add_check(
                shortname='{}_{}_cert'.format(service_name,
                                              endpoint.interface),
                description='Certificate expiry check for {} {}'.format(
                    service_name, endpoint.interface),
                check_cmd=' '.join(cmd_params_cert))

        # Add the actual health check for the URL
        nrpe.add_check(
            shortname='{}_{}'.format(service_name, endpoint.interface),
            description='Endpoint url check for {} {}'.format(
                service_name, endpoint.interface),
            check_cmd=' '.join(cmd_params))

    nrpe.write()
def render_checks(self, creds):
    """Render the novarc file and (de)register the OpenStack NRPE checks.

    ``creds`` is the template context for ``nagios.novarc`` and is also
    forwarded to ``self.create_endpoint_checks`` at the end. The Nova
    services check is always added; the Neutron agents, loadbalancers,
    Contrail analytics and DNS checks are added when their corresponding
    setting on ``self`` is truthy/non-empty and removed otherwise.
    """

    def plugin(filename):
        # Path of a plugin file inside the configured plugins directory.
        return os.path.join(self.plugins_dir, filename)

    render(
        source='nagios.novarc',
        target=self.novarc,
        context=creds,
        owner='nagios',
        group='nagios',
    )
    nrpe = NRPE()

    if not os.path.exists(self.plugins_dir):
        os.makedirs(self.plugins_dir)
    self.update_plugins()

    nova_plugin = plugin('check_nova_services.py')
    nova_template = '{} --warn {} --crit {} --skip-aggregates {} {}'
    nova_cmd = nova_template.format(
        nova_plugin,
        self.nova_warn,
        self.nova_crit,
        self.nova_skip_aggregates,
        self.skip_disabled,
    ).strip()
    nrpe.add_check(
        shortname='nova_services',
        description='Check that enabled Nova services are up',
        check_cmd=nova_cmd,
    )

    if not self.is_neutron_agents_check_enabled:
        nrpe.remove_check(shortname='neutron_agents')
    else:
        nrpe.add_check(
            shortname='neutron_agents',
            description='Check that enabled Neutron agents are up',
            check_cmd=plugin('check_neutron_agents.sh'),
        )

    if not self.is_loadbalancers_check_enabled:
        nrpe.remove_check(shortname='loadbalancers')
    else:
        nrpe.add_check(
            shortname='loadbalancers',
            description='Check loadbalancers status',
            check_cmd=plugin('check_loadbalancers.py'),
        )

    if not self.contrail_analytics_vip:
        nrpe.remove_check(shortname='contrail_analytics_alarms')
    else:
        contrail_cmd = '{} --host {}'.format(
            plugin('check_contrail_analytics_alarms.py'),
            self.contrail_analytics_vip,
        )
        nrpe.add_check(
            shortname='contrail_analytics_alarms',
            description='Check Contrail Analytics alarms',
            check_cmd=contrail_cmd,
        )

    if len(self.check_dns) == 0:
        nrpe.remove_check(shortname='dns_multi')
    else:
        # Collapse any run of whitespace between names to a single space.
        dns_names = ' '.join(self.check_dns.split())
        nrpe.add_check(
            shortname='dns_multi',
            description='Check DNS names are resolvable',
            check_cmd='{} {}'.format(plugin('check_dns_multi.sh'), dns_names),
        )

    nrpe.write()
    self.create_endpoint_checks(creds)