def check(args):
    """Emit one boolean status metric per nova service.

    The service list is fetched from the compute API at ``args.ip`` and is
    restricted to ``args.host`` when that is set. A service is reported as
    down only when it is enabled but its state is 'down'.
    """
    endpoint = 'http://{ip}:8774/v3'.format(ip=args.ip)
    try:
        nova = get_nova_client(bypass_url=endpoint)
    # not gathering api status metric here so catch any exception
    except Exception as err:
        # status_err is presumed to terminate the check -- TODO confirm
        status_err(str(err))

    # gather nova service states, optionally narrowed to a single host
    list_kwargs = {'host': args.host} if args.host else {}
    services = nova.services.list(**list_kwargs)

    if not services:
        status_err("No host(s) found in the service list")

    # return all the things
    status_ok()
    for svc in services:
        is_down = svc.status == 'enabled' and svc.state == 'down'
        if args.host:
            metric_name = '%s_status' % svc.binary
        else:
            metric_name = '%s_on_host_%s_status' % (svc.binary, svc.host)
        metric_bool(metric_name, not is_down)
def get_osd_statistics(client=None, keyring=None, osd_ids=None,
                       container_name=None):
    """Emit an ``osd.<id>_up`` boolean metric for each OSD in ``osd_ids``.

    Raises ``maas_common.MaaSException`` when a requested OSD id is not
    present in the OSD dump.
    """
    osd_dump = get_ceph_osd_dump(client=client, keyring=keyring,
                                 container_name=container_name)
    pg_osds_dump = get_ceph_pg_dump_osds(client=client, keyring=keyring,
                                         container_name=container_name)
    for osd_id in osd_ids:
        osd_ref = 'osd.%s' % osd_id

        # First dump entry whose id matches, or None when there is none.
        osd = next(
            (entry for entry in osd_dump['osds'] if entry['osd'] == osd_id),
            None)
        if osd is None:
            msg = 'The OSD ID %s does not exist.' % osd_id
            raise maas_common.MaaSException(msg)

        key = 'up'
        name = '_'.join((osd_ref, key))
        maas_common.metric_bool(name, osd[key])

        # Switch to the matching pg-dump entry; the previous value is kept
        # when no entry matches.
        # NOTE(review): nothing visible here reads this second lookup --
        # the block may be truncated; confirm against the full file.
        osd = next(
            (entry for entry in pg_osds_dump if entry['osd'] == osd_id),
            osd)
def check(auth_ref, args):
    """Probe the local glance API and emit its status and response time.

    A GET is issued against ``/v2/schemas/image`` on ``args.ip``. Connection,
    HTTP and timeout errors mark the API as down; any other exception is
    reported through ``status_err``. The response time is only emitted when
    the API answered successfully.
    """
    # We call get_keystone_client here as there is some logic within to get a
    # new token if previous one is bad.
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    api_endpoint = 'http://{ip}:9292/v2'.format(ip=args.ip)

    s = Session()
    s.headers.update(
        {'Content-type': 'application/json',
         'x-auth-token': auth_token})

    try:
        # Hit something that isn't querying the glance-registry, since we
        # query glance-registry in separate checks
        r = s.get('%s/schemas/image' % api_endpoint, verify=False,
                  timeout=10)
        is_up = r.ok
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
    except Exception as e:
        status_err(str(e))

    status_ok()
    metric_bool('glance_api_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        milliseconds = r.elapsed.total_seconds() * 1000
        # The value is a decimal string, so it is declared as a double
        # rather than an unsigned integer.
        metric('glance_api_local_response_time',
               'double',
               '%.3f' % milliseconds,
               'ms')
def check(args, tenant_id):
    """Report ceilometer API availability and a sample response time.

    The client is built against ``args.ip``. An ``exc.HTTPException`` while
    building it marks the API as down; any other exception goes to
    ``status_err``. On success a meter listing is timed.
    """
    endpoint = 'http://{ip}:8777'.format(ip=args.ip)
    try:
        ceilometer = get_ceilometer_client(endpoint=endpoint)
        is_up = True
    except exc.HTTPException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as err:
        status_err(str(err))
    else:
        # time something arbitrary
        started = time()
        meters = ceilometer.meters.list()
        # Exceptions are only thrown when we iterate over meter
        for meter in meters:
            meter.meter_id
        finished = time()
        milliseconds = (finished - started) * 1000

    status_ok()
    metric_bool('ceilometer_api_local_status', is_up)
    if not is_up:
        # only want to send other metrics if api is up
        return
    metric('ceilometer_api_local_response_time',
           'double',
           '%.3f' % milliseconds,
           'ms')
def main():
    """Collect HP hardware health checks and emit them as boolean metrics.

    The storage CLI is chosen by probing ``/usr/sbin/ssacli`` and then
    ``/usr/sbin/hpssacli``; an error status is reported when neither exists.
    """
    ssacli_bin = None
    for candidate in ('ssacli', 'hpssacli'):
        try:
            os.stat('/usr/sbin/%s' % candidate)
        except Exception:
            continue
        ssacli_bin = candidate
        break

    if ssacli_bin is None:
        # status_err is presumed to terminate the check -- TODO confirm
        maas_common.status_err('Neither ssacli or hpssacli could be found',
                               m_name='hp_monitoring')

    status = {}
    status['hardware_processors_status'] = \
        get_chassis_status('hpasmcli', 'server')
    status['hardware_memory_status'] = get_chassis_status('hpasmcli', 'dimm')
    status['hardware_powersupply_status'] = \
        get_powersupply_status('hpasmcli', 'powersupply')
    status['hardware_disk_status'] = get_drive_status(ssacli_bin)
    status['hardware_controller_status'] = get_controller_status(ssacli_bin)
    status['hardware_controller_cache_status'] = \
        get_controller_cache_status(ssacli_bin)
    status['hardware_controller_battery_status'] = \
        get_controller_battery_status(ssacli_bin)

    maas_common.status_ok(m_name='maas_hwvendor')
    # items() works on both Python 2 and 3; viewitems() is Python 2 only.
    for name, value in status.items():
        maas_common.metric_bool(name, value, m_name='maas_hwvendor')
def check(args):
    """Probe the local nova metadata API and emit status and response time.

    The API counts as up when ``GET /`` on port 8775 of ``args.ip`` succeeds
    and one of the returned lines is exactly ``1.0``. Connection, HTTP and
    timeout errors mark it as down; any other exception goes to
    ``status_err``.
    """
    metadata_endpoint = 'http://{ip}:8775'.format(ip=args.ip)
    is_up = True

    s = requests.Session()

    try:
        # looks like we can only get / (ec2 versions) without specifying
        # an instance ID and other headers
        versions = s.get('%s/' % metadata_endpoint,
                         verify=False,
                         timeout=10)
        milliseconds = versions.elapsed.total_seconds() * 1000
        # Compare against the decoded body: ``content`` is bytes on
        # Python 3, so a str would never be found in its lines.
        if not versions.ok or '1.0' not in versions.text.splitlines():
            is_up = False
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
    except Exception as e:
        status_err(str(e))

    status_ok()
    metric_bool('nova_api_metadata_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        metric('nova_api_metadata_local_response_time',
               'double',
               '%.3f' % milliseconds,
               'ms')
def check(args):
    """Emit cloud resource metrics derived from the nova hypervisors.

    A ``client_success`` metric records whether the compute client could be
    created. For each entry of ``stats_mapping`` the hypervisor attribute
    named by ``stat_name`` is read; ``total_vcpus`` and ``total_memory`` are
    scaled by the allocation ratios taken from ``args``.
    """
    try:
        nova = get_openstack_client('compute')
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_nova')
        status_err(str(e), m_name='maas_nova')
    else:
        metric_bool('client_success', True, m_name='maas_nova')

    # get some cloud stats
    stats = [nova.get_hypervisor(i.id) for i in nova.hypervisors()]
    cloud_stats = collections.defaultdict(dict)
    count = 0
    for stat in stats:
        count += 1
        # Running hypervisor count, readable through the mapping like any
        # other attribute.
        setattr(stat, 'count', count)
        # items() is portable; iteritems() only exists on Python 2.
        for metric_name, vals in stats_mapping.items():
            multiplier = 1
            if metric_name == 'total_vcpus':
                multiplier = args.cpu_allocation_ratio
            elif metric_name == 'total_memory':
                multiplier = args.mem_allocation_ratio
            # NOTE(review): each hypervisor overwrites the previous value,
            # so the last hypervisor wins -- confirm this is intended.
            cloud_stats[metric_name]['value'] = \
                (getattr(stat, vals['stat_name']) * multiplier)
            cloud_stats[metric_name]['unit'] = \
                vals['unit']
            cloud_stats[metric_name]['type'] = \
                vals['type']

    status_ok(m_name='maas_nova')
    for metric_name in cloud_stats:
        metric('cloud_resource_%s' % metric_name,
               cloud_stats[metric_name]['type'],
               cloud_stats[metric_name]['value'],
               cloud_stats[metric_name]['unit'])
def check(auth_ref, args):
    """Emit one boolean metric per cinder service from ``/os-services``.

    A service is reported as down only when it is enabled and its state is
    anything other than 'up'.
    """
    keystone = get_keystone_client(auth_ref)
    token = keystone.auth_token
    volume_endpoint = 'http://{ip}:8776/v1/{tenant}'.format(
        ip=args.ip, tenant=keystone.tenant_id)

    session = requests.Session()
    session.headers.update({
        'Content-type': 'application/json',
        'x-auth-token': token,
    })

    try:
        response = session.get('%s/os-services' % volume_endpoint,
                               verify=False, timeout=10)
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout) as err:
        # status_err is presumed to terminate the check -- TODO confirm
        status_err(str(err))

    if not response.ok:
        status_err('could not get response from cinder api')

    status_ok()
    for svc in response.json()['services']:
        is_down = svc['status'] == 'enabled' and svc['state'] != 'up'
        metric_bool('%s_on_host_%s' % (svc['binary'], svc['host']),
                    not is_down)
def check(auth_ref, args):
    """Report magnum API availability and a sample response time.

    The client targets ``args.ip`` when it is set and uses the client
    defaults otherwise. An ``exc.HttpError`` marks the API as down; any other
    exception goes to ``status_err``.
    """
    endpoint = 'http://{ip}:9511/v1'.format(ip=args.ip)
    try:
        client_kwargs = {'endpoint': endpoint} if args.ip else {}
        magnum = get_magnum_client(**client_kwargs)
        api_is_up = True
    except exc.HttpError:
        api_is_up = False
    # Any other exception presumably isn't an API error
    except Exception as err:
        status_err(str(err))
    else:
        # time something arbitrary
        started = time.time()
        magnum.cluster_templates.list()
        finished = time.time()
        milliseconds = (finished - started) * 1000

    status_ok()
    metric_bool('magnum_api_local_status', api_is_up)
    if not api_is_up:
        # only want to send other metrics if api is up
        return
    metric('magnum_api_local_response_time',
           'double',
           '%.3f' % milliseconds,
           'ms')
def check_process_running(process_names, container_name=None):
    """Report whether each named process is currently running.

    The running-process list is read from the cpu cgroup ``cgroup.procs``
    file of the given LXC container, or of the host when ``container_name``
    is None. One ``<name>_process_status`` boolean metric is emitted per
    entry of ``process_names``.
    """
    if not process_names:
        # Nothing was requested, so there is nothing to check: report an
        # error for the check.
        status_err('No process names provided')

    if container_name is None:
        procs_path = '/sys/fs/cgroup/cpu/cgroup.procs'
    else:
        # Look inside the container's cgroup rather than the parent host's.
        procs_path = os.path.join('/sys/fs/cgroup/cpu/lxc',
                                  container_name,
                                  'cgroup.procs')

    running = get_processes(procs_path)
    if not running:
        # No process list could be obtained for the container or host.
        status_err('Could not get a list of running processes')

    # A process list was fetched, so the check itself succeeded.
    status_ok()

    for wanted in process_names:
        metric_bool('%s_process_status' % wanted, wanted in running)
def check(auth_ref, args):
    """Report glance API availability, response time and image counts.

    The client targets ``args.ip`` when it is set. An ``exc.HTTPException``
    marks the API as down; any other exception goes to ``status_err``. When
    the API is up, one image-count metric is emitted per entry of
    ``IMAGE_STATUSES``.
    """
    endpoint = 'http://{ip}:9292/v1'.format(ip=args.ip)
    try:
        if args.ip:
            glance = get_glance_client(endpoint=endpoint)
        else:
            glance = get_glance_client()
        is_up = True
    except exc.HTTPException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as err:
        status_err(str(err))
    else:
        # time something arbitrary
        started = time.time()
        glance.images.list(search_opts={'all_tenants': 1})
        finished = time.time()
        milliseconds = (finished - started) * 1000

        # gather some metrics: tally the images by status
        images = glance.images.list(search_opts={'all_tenants': 1})
        status_count = collections.Counter(image.status for image in images)

    status_ok()
    metric_bool('glance_api_local_status', is_up)

    # only want to send other metrics if api is up
    if not is_up:
        return
    metric('glance_api_local_response_time',
           'double',
           '%.3f' % milliseconds,
           'ms')
    for status in IMAGE_STATUSES:
        metric('glance_%s_images' % status,
               'uint32',
               status_count[status],
               'images')
def check(auth_ref, args):
    """Report heat API availability and a sample response time.

    The tenant id for the endpoint comes from the keystone client. An
    ``exc.HTTPException`` marks the API as down; any other exception goes to
    ``status_err``.
    """
    keystone = get_keystone_client(auth_ref)
    endpoint = 'http://{ip}:8004/v1/{tenant}'.format(
        ip=args.ip, tenant=keystone.tenant_id)

    try:
        client_kwargs = {'endpoint': endpoint} if args.ip else {}
        heat = get_heat_client(**client_kwargs)
        is_up = True
    except exc.HTTPException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as err:
        status_err(str(err))
    else:
        # time something arbitrary
        started = time.time()
        heat.build_info.build_info()
        finished = time.time()
        milliseconds = (finished - started) * 1000

    status_ok()
    metric_bool('heat_api_local_status', is_up)
    if not is_up:
        # only want to send other metrics if api is up
        return
    metric('heat_api_local_response_time',
           'double',
           '%.3f' % milliseconds,
           'ms')
def main():
    """Gather RabbitMQ metrics through the collector helpers and emit them.

    Each collector receives the shared session and fills ``metrics``.
    Boolean values are emitted negated as ``rabbitmq_<key>_status``; every
    other value is emitted as an int64 ``rabbitmq_<key>`` metric.
    """
    metrics = {}
    # Make a Session to store the auth creds
    session = requests.Session()
    session.auth = (options.username, options.password)

    protocol = 'https' if options.https else 'http'

    _get_connection_metrics(session, metrics, protocol,
                            options.host, options.port)
    _get_overview_metrics(session, metrics, protocol,
                          options.host, options.port)
    _get_node_metrics(session, metrics, protocol,
                      options.host, options.port, options.name)
    _get_queue_metrics(session, metrics, protocol,
                       options.host, options.port)
    _get_consumer_metrics(session, metrics, protocol,
                          options.host, options.port)

    status_ok(m_name='maas_rabbitmq')

    for key, entry in metrics.items():
        value = entry['value']
        if isinstance(value, bool):
            # The status metric carries the inverse of the collected flag.
            metric_bool('rabbitmq_%s_status' % key, not value)
        else:
            metric('rabbitmq_%s' % key, 'int64', value, entry['unit'])
def check(args):
    """Emit one boolean status metric per neutron agent.

    Agents are listed from the neutron API at ``args.hostname`` and are
    restricted to ``args.host`` when that is set. An agent is reported as
    down only when it is administratively up but not alive.
    """
    endpoint = "http://{hostname}:9696".format(hostname=args.hostname)
    try:
        neutron = get_neutron_client(endpoint_url=endpoint)
    # not gathering api status metric here so catch any exception
    except Exception as err:
        status_err(str(err))

    # gather neutron agent states
    if args.host:
        listing = neutron.list_agents(host=args.host)
    else:
        listing = neutron.list_agents()
    agents = listing["agents"]

    if not agents:
        status_err("No host(s) found in the agents list")

    # return all the things
    status_ok()
    for agent in agents:
        is_down = agent["admin_state_up"] and not agent["alive"]
        if args.host:
            metric_name = "%s_status" % agent["binary"]
        else:
            metric_name = "%s_%s_on_host_%s" % (
                agent["binary"], agent["id"], agent["host"])
        metric_bool(metric_name, not is_down)
def check(args):
    """Report keystone v3 API availability, response time and basic counts.

    ``exc.HttpServerError`` and ``exc.ClientException`` mark the API as down;
    any other exception goes to ``status_err``. When the API is up, the
    user count of the 'Default' domain and the project count are emitted
    once each.
    """
    IDENTITY_ENDPOINT = 'http://{ip}:35357/v3'.format(ip=args.ip)

    try:
        keystone = get_keystone_client(endpoint=IDENTITY_ENDPOINT)
        is_up = True
    except (exc.HttpServerError, exc.ClientException):
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # time something arbitrary
        start = time.time()
        keystone.services.list()
        end = time.time()
        milliseconds = (end - start) * 1000

        # gather some vaguely interesting metrics to return
        project_count = len(keystone.projects.list())
        user_count = len(keystone.users.list(domain='Default'))

    status_ok()
    metric_bool('keystone_api_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        metric('keystone_api_local_response_time',
               'double',
               '%.3f' % milliseconds,
               'ms')
        metric('keystone_user_count', 'uint32', user_count, 'users')
        # Emitted once only; a repeated line would duplicate the metric.
        metric('keystone_tenant_count', 'uint32', project_count, 'tenants')
def check(args):
    """Report keystone v2.0 API availability, response time and counts.

    ``exc.HttpServerError`` and ``exc.ClientException`` mark the API as down;
    any other exception goes to ``status_err``. User and tenant counts are
    emitted only when the API is up.
    """
    endpoint = 'http://{ip}:35357/v2.0'.format(ip=args.ip)

    try:
        keystone = get_keystone_client(endpoint=endpoint)
        is_up = True
    except (exc.HttpServerError, exc.ClientException):
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as err:
        status_err(str(err))
    else:
        # time something arbitrary
        started = time()
        keystone.services.list()
        finished = time()
        milliseconds = (finished - started) * 1000

        # gather some vaguely interesting metrics to return
        tenant_count = len(keystone.tenants.list())
        user_count = len(keystone.users.list())

    status_ok()
    metric_bool('keystone_api_local_status', is_up)
    # only want to send other metrics if api is up
    if not is_up:
        return
    metric('keystone_api_local_response_time',
           'double',
           '%.3f' % milliseconds,
           'ms')
    metric('keystone_user_count', 'uint32', user_count)
    metric('keystone_tenant_count', 'uint32', tenant_count)
def check(auth_ref, args):
    """Probe the local cinder API and emit its status and response time.

    A GET is issued against ``/volumes`` for the keystone tenant on
    ``args.ip``. Connection, HTTP and timeout errors are reported through
    ``status_err``; otherwise the status metric reflects ``response.ok``.
    """
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    VOLUME_ENDPOINT = 'http://{ip}:8776/v1/{tenant}'.format(
        ip=args.ip, tenant=keystone.tenant_id)

    s = requests.Session()

    s.headers.update({
        'Content-type': 'application/json',
        'x-auth-token': auth_token,
    })

    try:
        r = s.get('%s/volumes' % VOLUME_ENDPOINT,
                  verify=False,
                  timeout=10)
        is_up = r.ok
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout) as e:
        status_err(str(e))
    else:
        status_ok()
        metric_bool('cinder_api_local_status', is_up)
        # only want to send other metrics if api is up
        if is_up:
            milliseconds = r.elapsed.total_seconds() * 1000
            # The value is a decimal string, so it is declared as a double
            # rather than an unsigned integer.
            metric('cinder_api_local_response_time',
                   'double',
                   '%.3f' % milliseconds,
                   'ms')
def check(args):
    """Report neutron API availability and a sample response time.

    An ``exc.NeutronClientException`` marks the API as down; any other
    exception goes to ``status_err``. On success an agent listing is timed.
    """
    NETWORK_ENDPOINT = 'http://{ip}:9696'.format(ip=args.ip)

    try:
        neutron = get_neutron_client(endpoint_url=NETWORK_ENDPOINT)
        is_up = True
    # if we get a NeutronClientException don't bother sending any other
    # metric: The API IS DOWN
    except exc.NeutronClientException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # time something arbitrary
        start = time()
        neutron.list_agents()
        end = time()
        milliseconds = (end - start) * 1000

    status_ok()
    metric_bool('neutron_api_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        # The value is a decimal string, so it is declared as a double
        # rather than an unsigned integer.
        metric('neutron_api_local_response_time',
               'double',
               '%.3f' % milliseconds,
               'ms')
def check(auth_ref, args):
    """Report magnum API availability and the state of each magnum service.

    A ``client_success`` metric records whether the client could be created.
    An ``exc.HttpError`` marks the API as down; any other exception goes to
    ``status_err``. When the API is up, one ``<binary>_status`` boolean is
    emitted per service, true when its state is 'up'.
    """
    endpoint = 'http://{ip}:9511/v1'.format(ip=args.ip)
    try:
        if args.ip:
            magnum = get_magnum_client(endpoint=endpoint)
        else:
            magnum = get_magnum_client()
        api_is_up = True
    except exc.HttpError:
        api_is_up = False
        metric_bool('client_success', False, m_name='maas_magnum')
    # Any other exception presumably isn't an API error
    except Exception as err:
        metric_bool('client_success', False, m_name='maas_magnum')
        status_err(str(err), m_name='maas_magnum')
    else:
        metric_bool('client_success', True, m_name='maas_magnum')
        services = magnum.mservices.list()

    status_ok(m_name='maas_magnum')
    metric_bool('magnum_api_local_status', api_is_up, m_name='maas_magnum')
    if not api_is_up:
        return
    for svc in services:
        metric_bool('_'.join([svc.binary, 'status']), svc.state == 'up')
def check(auth_ref, args):
    """Probe the local glance registry and emit status and response time.

    A GET is issued against ``/images`` on port 9191 of ``args.ip``.
    Connection, HTTP and timeout errors mark the registry as down; any other
    exception goes to ``status_err``.
    """
    # get_keystone_client is called here because it contains the logic to
    # obtain a new token when the previous one is bad.
    keystone = get_keystone_client(auth_ref)
    token = keystone.auth_token
    registry_endpoint = 'http://{ip}:9191'.format(ip=args.ip)

    session = Session()
    session.headers.update({
        'Content-type': 'application/json',
        'x-auth-token': token,
    })

    try:
        # /images returns a list of public, non-deleted images
        response = session.get('%s/images' % registry_endpoint,
                               verify=False, timeout=10)
        is_up = response.ok
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
    except Exception as err:
        status_err(str(err))

    status_ok()
    metric_bool('glance_registry_local_status', is_up)
    # only want to send other metrics if api is up
    if not is_up:
        return
    milliseconds = response.elapsed.total_seconds() * 1000
    metric('glance_registry_local_response_time',
           'double',
           '%.3f' % milliseconds,
           'ms')
def main():
    """Collect HP hardware health checks and emit them as boolean metrics.

    The storage CLI is chosen by probing ``/usr/sbin/ssacli`` and then
    ``/usr/sbin/hpssacli``; an error status is reported when neither exists.
    """
    try:
        os.stat('/usr/sbin/ssacli')
        ssacli_bin = 'ssacli'
    except Exception:
        try:
            os.stat('/usr/sbin/hpssacli')
            ssacli_bin = 'hpssacli'
        except Exception:
            # status_err is presumed to terminate the check -- TODO confirm
            maas_common.status_err(
                'Neither ssacli or hpssacli could be found',
                m_name='hp_monitoring')

    status = {
        'hardware_processors_status':
            get_chassis_status('hpasmcli', 'server'),
        'hardware_memory_status':
            get_chassis_status('hpasmcli', 'dimm'),
        'hardware_disk_status':
            get_drive_status(ssacli_bin),
        'hardware_controller_status':
            get_controller_status(ssacli_bin),
        'hardware_controller_cache_status':
            get_controller_cache_status(ssacli_bin),
        'hardware_controller_battery_status':
            get_controller_battery_status(ssacli_bin),
    }

    maas_common.status_ok(m_name='maas_hwvendor')
    # items() works on both Python 2 and 3; viewitems() is Python 2 only.
    for name, value in status.items():
        maas_common.metric_bool(name, value, m_name='maas_hwvendor')
def check(auth_ref, args):
    """Report the up/down state of every cinder service.

    Services come from ``/os-services`` on the cinder API at ``args.ip``.
    An enabled service whose state is not 'up' is reported as down; every
    other service is reported as up.
    """
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    base_url = 'http://{ip}:8776/v1/{tenant}'.format(
        ip=args.ip, tenant=keystone.tenant_id)

    http = requests.Session()
    http.headers.update({'Content-type': 'application/json',
                         'x-auth-token': auth_token})

    try:
        reply = http.get('%s/os-services' % base_url,
                         verify=False,
                         timeout=10)
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout) as error:
        # status_err is presumed to terminate the check -- TODO confirm
        status_err(str(error))

    if not reply.ok:
        status_err('could not get response from cinder api')

    status_ok()
    services = reply.json()['services']
    for entry in services:
        if entry['status'] == 'enabled' and entry['state'] != 'up':
            healthy = False
        else:
            healthy = True
        label = '%s_on_host_%s' % (entry['binary'], entry['host'])
        metric_bool(label, healthy)
def check(args):
    """Report nova API availability and a sample response time.

    An ``exc.ClientException`` marks the API as down; any other exception
    goes to ``status_err``. On success a service listing is timed.
    """
    COMPUTE_ENDPOINT = 'http://{ip}:8774/v3'.format(ip=args.ip)

    try:
        nova = get_nova_client(bypass_url=COMPUTE_ENDPOINT)
        is_up = True
    except exc.ClientException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # time something arbitrary
        start = time()
        nova.services.list()
        end = time()
        milliseconds = (end - start) * 1000

    status_ok()
    metric_bool('nova_api_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        # The value is a decimal string, so it is declared as a double
        # rather than an unsigned integer.
        metric('nova_api_local_response_time',
               'double',
               '%.3f' % milliseconds,
               'ms')
def check(args, tenant_id):
    """Report heat API availability, response time and stack count.

    An ``exc.HTTPException`` marks the API as down; any other exception goes
    to ``status_err``. The stack count is emitted only when the API is up.
    """
    endpoint = 'http://{ip}:8004/v1/{tenant}'.format(
        ip=args.ip, tenant=tenant_id)

    try:
        heat = get_heat_client(endpoint=endpoint)
        is_up = True
    except exc.HTTPException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as err:
        status_err(str(err))
    else:
        # time something arbitrary
        started = time()
        heat.build_info.build_info()
        finished = time()
        milliseconds = (finished - started) * 1000

        # Add other metrics: count the stacks by draining the listing
        stack_count = sum(1 for _ in heat.stacks.list())

    status_ok()
    metric_bool('heat_api_local_status', is_up)
    if not is_up:
        # only want to send other metrics if api is up
        return
    metric('heat_api_local_response_time',
           'double',
           '%.3f' % milliseconds,
           'ms')
    metric('heat_stack_count', 'uint32', stack_count, 'stacks')
def check(args):
    """Emit cloud resource metrics derived from the nova hypervisors.

    A ``client_success`` metric records whether the compute client could be
    created. Each ``stats_mapping`` entry names the hypervisor attribute to
    read plus the metric's unit and type; ``total_vcpus`` and
    ``total_memory`` are scaled by the allocation ratios from ``args``.
    """
    try:
        nova = get_openstack_client('compute')
    except Exception as err:
        metric_bool('client_success', False, m_name='maas_nova')
        status_err(str(err), m_name='maas_nova')
    else:
        metric_bool('client_success', True, m_name='maas_nova')

    # get some cloud stats
    hypervisors = [nova.get_hypervisor(h.id) for h in nova.hypervisors()]
    cloud_stats = collections.defaultdict(dict)
    for position, hypervisor in enumerate(hypervisors, 1):
        # Running hypervisor count, readable through the mapping.
        setattr(hypervisor, 'count', position)
        for metric_name, vals in stats_mapping.items():
            if metric_name == 'total_vcpus':
                multiplier = args.cpu_allocation_ratio
            elif metric_name == 'total_memory':
                multiplier = args.mem_allocation_ratio
            else:
                multiplier = 1
            entry = cloud_stats[metric_name]
            # Later hypervisors overwrite the value stored by earlier ones.
            entry['value'] = getattr(hypervisor, vals['stat_name']) * multiplier
            entry['unit'] = vals['unit']
            entry['type'] = vals['type']

    status_ok(m_name='maas_nova')
    for metric_name in cloud_stats:
        entry = cloud_stats[metric_name]
        metric('cloud_resource_%s' % metric_name,
               entry['type'],
               entry['value'],
               entry['unit'])
def bonding_ifaces_check(_):
    """Emit a ``slave_down`` boolean metric for every bonding interface.

    Each file under ``/proc/net/bonding`` is read and scanned for
    "Slave Interface" sections. A bond is reported as having a slave down
    when any slave's following status line does not contain 'up', or when
    the bond has fewer than two slaves in total.
    """
    for bonding_iface in os.listdir("/proc/net/bonding"):
        # universal_newlines makes check_output return text rather than
        # bytes on Python 3, so the str operations below keep working.
        output = subprocess.check_output(
            ['cat', '/proc/net/bonding/%s' % bonding_iface],
            universal_newlines=True)
        lines = output.split('\n')

        has_slave_down = False
        slave_count = 0
        for idx, line in enumerate(lines):
            if not line.startswith("Slave Interface"):
                continue
            slave_count += 1
            # The MII status is read from the line right after the slave
            # header; a missing or malformed line counts as not up.
            status_line = lines[idx + 1] if idx + 1 < len(lines) else ''
            mii_status = status_line.partition(":")[2]
            if 'up' not in mii_status:
                has_slave_down = True

        # Evaluate the slave count once all slaves have been seen, so the
        # first slave of a healthy bond does not flag the bond as degraded.
        if slave_count < 2:
            has_slave_down = True

        metric_bool('host_bonding_iface_%s_slave_down' % bonding_iface,
                    has_slave_down)
def main():
    """Check that a Holland backup from today or yesterday exists.

    Emits ``holland_backup_status`` and, via ``print_metrics``, the size of
    the most recent backup set divided by 1024.
    """
    args = parse_args()
    galera_container = args.galera_container_name
    holland_bin = args.holland_binary
    holland_bs = args.holland_backupset

    # Read the date once so both stamps derive from the same day.
    current_day = datetime.date.today()
    today = current_day.strftime('%Y%m%d')
    yesterday = (current_day -
                 datetime.timedelta(days=1)).strftime('%Y%m%d')

    # Get completed Holland backup set
    backupsets = \
        container_holland_lb_check(galera_container, holland_bin, holland_bs)

    # Test each stamp explicitly: ``yesterday or today in name`` is always
    # truthy because ``yesterday`` is a non-empty string.
    has_recent_backup = any(
        yesterday in backup[0] or today in backup[0]
        for backup in backupsets)

    if has_recent_backup:
        status_ok()
        metric_bool('holland_backup_status', True)
    else:
        # Emit the failure metric before status_err, which is presumed to
        # terminate the check -- TODO confirm.
        metric_bool('holland_backup_status', False)
        status_err('Could not find Holland backup from %s or %s'
                   % (yesterday, today))

    # Print metric about last backup
    print_metrics('holland_backup_size',
                  float(backupsets[-1][1]) / 1024)
def main():
    """Check that a Holland backup from today or yesterday exists.

    Reads its settings from the module-level ``args``. Emits
    ``holland_backup_status`` and, via ``print_metrics``, the size of the
    most recent backup set divided by 1024 with one decimal place.
    """
    galera_hostname = args.galera_hostname
    holland_bin = args.holland_binary
    holland_bs = args.holland_backupset

    # Read the date once so both stamps derive from the same day.
    current_day = datetime.date.today()
    today = current_day.strftime('%Y%m%d')
    yesterday = (current_day -
                 datetime.timedelta(days=1)).strftime('%Y%m%d')

    # Get completed Holland backup set
    backupsets = \
        holland_lb_check(galera_hostname, holland_bin, holland_bs)

    # Test each stamp explicitly: ``yesterday or today in name`` is always
    # truthy because ``yesterday`` is a non-empty string.
    has_recent_backup = any(
        yesterday in backup[0] or today in backup[0]
        for backup in backupsets)

    if has_recent_backup:
        status_ok(m_name='maas_holland')
        metric_bool('holland_backup_status', True, m_name='maas_holland')
    else:
        metric_bool('holland_backup_status', False, m_name='maas_holland')
        status_err('Could not find Holland backup from %s or %s'
                   % (yesterday, today), m_name='maas_holland')

    # Print metric about last backup
    print_metrics('holland_backup_size',
                  "{0:.1f}".format(float(backupsets[-1][1]) / 1024))
def check(args):
    """Report nova API availability, response time and server state counts.

    An ``exc.ClientException`` marks the API as down; any other exception
    goes to ``status_err``. When the API is up, one count metric is emitted
    per entry of ``SERVER_STATUSES``.
    """
    endpoint = 'http://{ip}:8774/v3'.format(ip=args.ip)

    try:
        nova = get_nova_client(bypass_url=endpoint)
        is_up = True
    except exc.ClientException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as err:
        status_err(str(err))
    else:
        # time something arbitrary
        started = time()
        nova.services.list()
        finished = time()
        milliseconds = (finished - started) * 1000

        # gather some metrics: tally the servers by status
        status_count = collections.Counter(
            server.status for server in nova.servers.list())

    status_ok()
    metric_bool('nova_api_local_status', is_up)
    # only want to send other metrics if api is up
    if not is_up:
        return
    metric('nova_api_local_response_time',
           'double',
           '%.3f' % milliseconds,
           'ms')
    for status in SERVER_STATUSES:
        metric('nova_servers_in_state_%s' % status,
               'uint32',
               status_count[status])
def check(args):
    """Probe the local nova metadata API and emit status and response time.

    The API counts as up when ``GET /`` on port 8775 of ``args.ip`` succeeds
    and one of the returned lines is exactly ``1.0``. When up, the response
    time is emitted as a metric and also passed to ``metric_influx``.
    """
    metadata_endpoint = 'http://{ip}:8775'.format(ip=args.ip)
    is_up = True

    s = requests.Session()

    try:
        # looks like we can only get / (ec2 versions) without specifying
        # an instance ID and other headers
        versions = s.get('%s/' % metadata_endpoint,
                         verify=False,
                         timeout=10)
        milliseconds = versions.elapsed.total_seconds() * 1000
        # Compare against the decoded body: ``content`` is bytes on
        # Python 3, so a str would never be found in its lines.
        if not versions.ok or '1.0' not in versions.text.splitlines():
            is_up = False
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
    except Exception as e:
        status_err(str(e))

    metric_values = dict()

    status_ok()
    metric_bool('nova_api_metadata_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        metric('nova_api_metadata_local_response_time',
               'double',
               '%.3f' % milliseconds,
               'ms')
        metric_values['nova_api_metadata_local_response_time'] = \
            ('%.3f' % milliseconds)
        # NOTE(review): the influx write sits inside the is_up branch so an
        # empty field set is never sent -- confirm against the intended
        # layout.
        metric_influx(INFLUX_MEASUREMENT_NAME, metric_values)
def check(auth_ref, args):
    """Report how many ironic nodes exist and how many are not in maintenance.

    An ``exc.ClientException`` while creating the client marks the API as
    down and suppresses the node metrics. Any other exception emits a failed
    ``client_success`` metric, reports the error and returns.
    """
    ironic_endpoint = '{protocol}://{ip}:{port}/v1'.format(
        ip=args.ip, protocol=args.protocol, port=args.port)
    try:
        client_kwargs = {'endpoint': ironic_endpoint} if args.ip else {}
        ironic = get_ironic_client(**client_kwargs)
        is_up = True
    except exc.ClientException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as err:
        metric_bool('client_success', False, m_name='maas_ironic')
        status_err(str(err), m_name='maas_ironic')
        return
    else:
        metric_bool('client_success', True, m_name='maas_ironic')
        # pass limit=0 to list all nodes list without pagination
        all_nodes = ironic.node.list(limit=0)

    status_ok(m_name='maas_ironic')

    if not is_up:
        return

    maintenance_count = sum(1 for node in all_nodes if node.maintenance)
    total_nodes = len(all_nodes)
    metric('ironic_up_nodes_count', 'uint32',
           total_nodes - maintenance_count)
    metric('ironic_total_nodes_count', 'uint32', total_nodes)
def check(args):
    """Emit one boolean status metric per neutron agent.

    Agents are listed from the neutron API at ``args.hostname`` and are
    restricted to ``args.host`` when that is set. Dots in the metric name
    are replaced with underscores. An agent is reported as down only when it
    is administratively up but not alive.
    """
    endpoint = 'http://{hostname}:9696'.format(hostname=args.hostname)
    try:
        neutron = get_neutron_client(endpoint_url=endpoint)
    # not gathering api status metric here so catch any exception
    except Exception as err:
        status_err(str(err))

    # gather neutron agent states
    filters = {'host': args.host} if args.host else {}
    agents = neutron.list_agents(**filters)['agents']

    if not agents:
        status_err("No host(s) found in the agents list")

    # return all the things
    status_ok()
    for agent in agents:
        if agent['admin_state_up'] and not agent['alive']:
            alive = False
        else:
            alive = True

        if args.host:
            label = '%s_status' % agent['binary']
        else:
            label = '%s_%s_on_host_%s' % (agent['binary'],
                                          agent['id'],
                                          agent['host'])
        metric_bool(label.replace(".", "_"), alive)
def check(args):
    """Emit one boolean status metric per neutron agent.

    Agents are listed from the neutron API at ``args.ip`` and are restricted
    to ``args.host`` when that is set. An agent is reported as down only
    when it is administratively up but not alive.
    """
    network_endpoint = 'http://{ip}:9696'.format(ip=args.ip)
    try:
        neutron = get_neutron_client(endpoint_url=network_endpoint)
    # not gathering api status metric here so catch any exception
    except Exception as error:
        status_err(str(error))

    # gather neutron agent states
    single_host = args.host
    if single_host:
        response = neutron.list_agents(host=single_host)
    else:
        response = neutron.list_agents()
    agents = response['agents']

    if not agents:
        status_err("No host(s) found in the agents list")

    # return all the things
    status_ok()
    for agent in agents:
        down = agent['admin_state_up'] and not agent['alive']
        if args.host:
            metric_name = '%s_status' % agent['binary']
        else:
            metric_name = '%s_%s_on_host_%s' % (agent['binary'],
                                                agent['id'],
                                                agent['host'])
        metric_bool(metric_name, not down)
def check(auth_ref, args):
    """Probe the local glance registry and emit status and response time.

    A GET is issued against ``/images`` on port 9191 of ``args.ip``.
    Connection, HTTP and timeout errors mark the registry as down; any other
    exception goes to ``status_err``.
    """
    # We call get_keystone_client here as there is some logic within to get a
    # new token if previous one is bad.
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    registry_endpoint = 'http://{ip}:9191'.format(ip=args.ip)

    s = Session()

    s.headers.update({
        'Content-type': 'application/json',
        'x-auth-token': auth_token,
    })

    try:
        # /images returns a list of public, non-deleted images
        r = s.get('%s/images' % registry_endpoint, verify=False, timeout=10)
        is_up = r.ok
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
    except Exception as e:
        status_err(str(e))

    status_ok()
    metric_bool('glance_registry_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        milliseconds = r.elapsed.total_seconds() * 1000
        # The elapsed time is fractional, so it is emitted as a formatted
        # double with its unit rather than as a raw float typed uint32.
        metric('glance_registry_local_response_time',
               'double',
               '%.3f' % milliseconds,
               'ms')
def check(args):
    """Report the up/down state of cinder services.

    Services come from ``/os-services`` on the block storage endpoint. When
    ``args.host`` is set, only services on that host (including
    ``host@backend`` entries) are kept and metrics are named per binary and
    backend; otherwise every service is reported per binary and host. An
    enabled service whose state is not 'up' is reported as down.
    """
    cinder = get_openstack_client('block_storage')
    volume_endpoint = '%s/os-services' % str(cinder.get_endpoint())

    try:
        # We cannot do /os-services?host=X as cinder returns a hostname of
        # X@lvm for cinder-volume binary
        resp = cinder.session.get(volume_endpoint, timeout=180)
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout) as e:
        metric_bool('client_success', False, m_name='maas_cinder')
        status_err(str(e), m_name='maas_cinder')

    if not resp.ok:
        metric_bool('client_success', False, m_name='maas_cinder')
        # Same m_name as every other call in this check.
        status_err(
            'Could not get response from Cinder API',
            m_name='maas_cinder'
        )
    else:
        metric_bool('client_success', True, m_name='maas_cinder')

    services = resp.json()['services']

    # We need to match against a host of X and X@lvm (or whatever backend)
    if args.host:
        backend_prefix = ''.join((args.host, '@'))
        services = [service for service in services
                    if (service['host'].startswith(backend_prefix) or
                        service['host'] == args.host)]

    if len(services) == 0:
        status_err(
            'No host(s) found in the service list',
            m_name='maas_cinder'
        )

    status_ok(m_name='maas_cinder')

    for service in services:
        service_is_up = not (service['status'] == 'enabled' and
                             service['state'] != 'up')
        if args.host:
            name = '%s_status' % service['binary']
            if '@' in service['host']:
                # Split on the first '@' only, so a backend name that
                # itself contains '@' does not break the unpacking.
                _, backend = service['host'].split('@', 1)
                name = '%s-%s_status' % (service['binary'], backend)
        else:
            name = '%s_on_host_%s' % (service['binary'], service['host'])
        metric_bool(name, service_is_up)
def check():
    """Report neutron API availability, response time and object counts.

    Settings come from the module-level ``CONFIGS``. When the API is up, the
    response time and the number of networks, agents, routers and subnets
    are emitted. If anything raises, a false status metric is emitted and
    the exception is re-raised.
    """
    try:
        endpoint = 'http://{ip}:9696'.format(ip=CONFIGS['ip'])

        try:
            if CONFIGS['ip']:
                neutron = get_neutron_client(endpoint_url=endpoint)
            else:
                neutron = get_neutron_client()
            is_up = True
        # if we get a NeutronClientException don't bother sending
        # any other metric The API IS DOWN
        except exc.NeutronClientException:
            is_up = False
        # Any other exception presumably isn't an API error
        except Exception as err:
            status_err(str(err))
        else:
            # time something arbitrary
            started = time.time()
            neutron.list_agents()
            finished = time.time()
            milliseconds = (finished - started) * 1000

            # gather some metrics
            networks = len(neutron.list_networks()['networks'])
            agents = len(neutron.list_agents()['agents'])
            routers = len(neutron.list_routers()['routers'])
            subnets = len(neutron.list_subnets()['subnets'])

        status_ok()
        metric_bool(PLUGIN, 'neutron_api_local_status', is_up,
                    graphite_host=CONFIGS['graphite_host'],
                    graphite_port=CONFIGS['graphite_port'])
        # only want to send other metrics if api is up
        if is_up:
            readings = (
                ('neutron_api_local_response_time', '%.3f' % milliseconds),
                ('neutron_networks', networks),
                ('neutron_agents', agents),
                ('neutron_routers', routers),
                ('neutron_subnets', subnets),
            )
            for reading_name, reading_value in readings:
                metric(PLUGIN, reading_name, reading_value,
                       graphite_host=CONFIGS['graphite_host'],
                       graphite_port=CONFIGS['graphite_port'])
    except:
        # Deliberately bare: whatever was raised, report the API as down
        # and then propagate the original exception unchanged.
        metric_bool(PLUGIN, 'neutron_api_local_status', False,
                    graphite_host=CONFIGS['graphite_host'],
                    graphite_port=CONFIGS['graphite_port'])
        raise
def check(auth_ref, args):
    """Emit whether a process whose name contains 'ironic-conducto' exists.

    The scan stops at the first matching process.
    """
    name = "ironic-conductor_status"
    conductor_running = any(
        'ironic-conducto' in proc.name()
        for proc in psutil.process_iter())
    metric_bool(name, conductor_running)
def check(auth_ref, args):
    """Publish the state of nova services as ``Yes``/``No`` string metrics.

    A nova client is built against port 8774 of ``args.hostname``. If that
    fails, a "cannot reach API" metric is emitted for every entry in
    ``NOVA_SERVICE_TYPE_LIST`` and the function returns. Otherwise each
    listed service is reported as ``No`` when it is enabled but down, or
    disabled with a reason containing ``auto``; everything else is ``Yes``.

    :param auth_ref: passed to ``get_keystone_client``.
    :param args: parsed arguments; ``protocol``, ``hostname`` and ``host``
        are read.
    """
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    tenant_id = keystone.tenant_id

    version_parts = get_os_component_major_api_version('nova')
    nova_version = '.'.join(str(part) for part in version_parts)

    endpoint_template = '{protocol}://{hostname}:8774/v{version}/{tenant_id}'
    compute_endpoint = endpoint_template.format(
        protocol=args.protocol,
        hostname=args.hostname,
        version=nova_version,
        tenant_id=tenant_id,
    )

    try:
        nova = get_nova_client(auth_token=auth_token,
                               bypass_url=compute_endpoint)
    except Exception as e:
        # not gathering api status metric here so catch any exception
        metric_bool('client_success', False, m_name='maas_nova')
        for nova_service_type in NOVA_SERVICE_TYPE_LIST:
            metric('%s_status' % nova_service_type,
                   'string',
                   '%s cannot reach API' % nova_service_type,
                   m_name='maas_nova')
        status_err_no_exit(str(e), m_name='maas_nova')
        return

    metric_bool('client_success', True, m_name='maas_nova')

    # gather nova service states, restricted to one host when requested
    list_kwargs = {'host': args.host} if args.host else {}
    services = nova.services.list(**list_kwargs)

    if len(services) == 0:
        status_err("No host(s) found in the service list", m_name='maas_nova')

    # return all the things
    status_ok(m_name='maas_nova')
    for service in services:
        status = service.status.lower()
        if status == 'enabled':
            is_down = service.state.lower() == 'down'
        elif status == 'disabled':
            reason = service.disabled_reason
            is_down = bool(reason) and 'auto' in reason.lower()
        else:
            is_down = False
        service_is_up = "No" if is_down else "Yes"

        if args.host:
            name = '%s_status' % service.binary
        else:
            name = '%s_on_host_%s_status' % (service.binary, service.host)

        metric(name, 'string', service_is_up, m_name='maas_nova')
def get_mon_statistics(client=None, keyring=None, host=None,
                       container_name=None):
    """Emit ``mon_in_quorum`` for the monitor named ``host``.

    :param client: forwarded to ``get_ceph_status``.
    :param keyring: forwarded to ``get_ceph_status``.
    :param host: monitor name to look up in the status ``monmap``.
    :param container_name: forwarded to ``get_ceph_status``.
    :raises IndexError: if no monitor in the monmap is named ``host``.
    """
    status = get_ceph_status(client=client,
                             keyring=keyring,
                             container_name=container_name)

    matching = [entry for entry in status['monmap']['mons']
                if entry['name'] == host]
    # The monitor is in quorum when its rank appears in the quorum list.
    rank = matching[0]['rank']
    maas_common.metric_bool('mon_in_quorum', rank in status['quorum'])
def main(args):
    """Check that the requested processes are running in the container.

    When ``args.processes`` is empty a failed ``container_success`` metric
    and an error status are emitted first.

    :param args: parsed arguments; ``processes`` and ``container`` are read.
    """
    process_names = args.processes

    if not process_names:
        # Nothing to look for: no process names were given on the command
        # line.
        metric_bool('container_success', False, m_name='maas_container')
        status_err('No executable names supplied', m_name='maas_container')

    check_process_running(container_name=args.container,
                          process_names=process_names)
def check(args):
    """Check the neutron metadata proxy for every DHCP-enabled network.

    Picks the first container returned by the ``FIND_CONTAINER`` command,
    lists the networks that have a ``network:dhcp`` port and runs
    ``SERVICE_CHECK`` inside the container for each network's ``qdhcp-``
    namespace. A failed command counts as a failure unless its output
    contains ``404 Not Found``.

    :param args: parsed arguments; ``neutron_host``, ``protocol`` and
        ``port`` are read.
    """
    # identify the container we will use for monitoring
    try:
        container = subprocess.check_output(FIND_CONTAINER).splitlines()[0]
    except (IndexError, subprocess.CalledProcessError):
        metric_bool('agents_found', False, m_name='maas_neutron')
        status_err('no running neutron agents containers found',
                   m_name='maas_neutron')
    else:
        metric_bool('agents_found', True, m_name='maas_neutron')

    network_endpoint = '{protocol}://{host}:{port}'.format(
        host=args.neutron_host,
        protocol=args.protocol,
        port=args.port
    )

    try:
        neutron = get_neutron_client(endpoint_url=network_endpoint)
    except Exception as e:
        # not gathering api status metric here so catch any exception
        metric_bool('client_success', False, m_name='maas_neutron')
        status_err(str(e), m_name='maas_neutron')
    else:
        metric_bool('client_success', True, m_name='maas_neutron')

    # only check networks which have a port with DHCP enabled
    dhcp_ports = neutron.list_ports(device_owner='network:dhcp')['ports']
    net_ids = {port['network_id'] for port in dhcp_ports}

    # perform checks for each identified network
    failures = []
    for net_id in net_ids:
        check_cmd = SERVICE_CHECK % ('qdhcp-%s' % net_id)
        command = shlex.split(
            'lxc-attach -n %s -- %s' % (container, check_cmd))
        try:
            subprocess.check_output(command, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            # HTTP 404 response indicates the service is responsive.
            # this is the expected response because the maas testing host IP
            # is used to look up metadata and no metadata exists for this IP
            if '404 Not Found' not in e.output:
                failures.append(net_id)

    is_ok = not failures
    metric_bool('neutron-metadata-agent-proxy_status', is_ok,
                m_name='maas_neutron')

    if not is_ok:
        status_err('neutron metadata agent proxies fail on host %s '
                   'net_ids: %s' % (container, ','.join(failures)),
                   m_name='maas_neutron')
    else:
        status_ok(m_name='maas_neutron')
def main():
    """Collect processor, memory and disk status and emit them as metrics.

    All three readings are gathered first; ``status_ok`` is reported and
    then each reading is emitted as a boolean metric.
    """
    # Built in the same order as before: processors, memory, disks.
    readings = {
        'hardware_processors_status': get_hpasmcli_status('server'),
        'hardware_memory_status': get_hpasmcli_status('dimm'),
        'hardware_disk_status': get_drive_status(),
    }

    maas_common.status_ok()
    for metric_name, healthy in readings.viewitems():
        maas_common.metric_bool(metric_name, healthy)
def check(auth_ref, args):
    """Report the state of Cinder services as boolean metrics.

    Requests ``/os-services`` from the v1 volume endpoint on port 8776 of
    ``args.hostname`` and emits one ``metric_bool`` per service. A service
    is reported as down only when it is ``enabled`` and its state is not
    ``up``.

    :param auth_ref: passed to ``get_keystone_client``.
    :param args: parsed arguments; ``hostname`` and ``host`` are read. When
        ``host`` is set, only services on that host (or ``host@backend``)
        are reported.
    """
    keystone = get_keystone_client(auth_ref)
    token = keystone.auth_token
    endpoint = "http://{hostname}:8776/v1/{tenant}".format(
        hostname=args.hostname, tenant=keystone.tenant_id)

    session = requests.Session()
    session.headers.update(
        {"Content-type": "application/json", "x-auth-token": token})

    try:
        # We cannot do /os-services?host=X as cinder returns a hostname of
        # X@lvm for cinder-volume binary
        r = session.get("%s/os-services" % endpoint, verify=False, timeout=10)
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout) as e:
        status_err(str(e))

    if not r.ok:
        status_err("Could not get response from Cinder API")

    services = r.json()["services"]

    # We need to match against a host of X and X@lvm (or whatever backend)
    if args.host:
        prefix = args.host + "@"
        services = [
            svc for svc in services
            if svc["host"].startswith(prefix) or svc["host"] == args.host
        ]

    if len(services) == 0:
        status_err("No host(s) found in the service list")

    status_ok()

    for svc in services:
        is_down = svc["status"] == "enabled" and svc["state"] != "up"

        if args.host:
            # Single-host mode: name by binary, plus backend if present.
            name = "%s_status" % svc["binary"]
            if "@" in svc["host"]:
                _, backend = svc["host"].split("@")
                name = "%s-%s_status" % (svc["binary"], backend)
        else:
            name = "%s_on_host_%s" % (svc["binary"], svc["host"])

        metric_bool(name, not is_down)
def check(auth_ref, args):
    """Probe the Cinder API and publish its status and volume statistics.

    Requests ``/volumes/detail`` and ``/snapshots/detail`` from the v1
    volume endpoint on port 8776 of ``args.ip``. ``cinder_api_local_status``
    is True only when both responses are ok. Response time and per-status
    volume/snapshot counts are emitted only in that case.

    :param auth_ref: passed to ``get_keystone_client``.
    :param args: parsed arguments; only ``args.ip`` is read.
    """
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    VOLUME_ENDPOINT = (
        'http://{ip}:8776/v1/{tenant}'.format(
            ip=args.ip, tenant=keystone.tenant_id)
    )

    s = requests.Session()

    s.headers.update(
        {'Content-type': 'application/json',
         'x-auth-token': auth_token})

    try:
        vol = s.get('%s/volumes/detail' % VOLUME_ENDPOINT,
                    verify=False,
                    timeout=10)
        milliseconds = vol.elapsed.total_seconds() * 1000
        snap = s.get('%s/snapshots/detail' % VOLUME_ENDPOINT,
                     verify=False,
                     timeout=10)
        is_up = vol.ok and snap.ok
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
    except Exception as e:
        status_err(str(e))
    else:
        # Only parse the bodies of successful responses: an error response
        # is not guaranteed to be JSON or to carry these keys, and the
        # statistics are only ever emitted when is_up is True.
        if is_up:
            volumes = vol.json()['volumes']
            vol_status_count = collections.Counter(
                v['status'] for v in volumes)
            total_vols = len(volumes)

            snapshots = snap.json()['snapshots']
            snap_status_count = collections.Counter(
                v['status'] for v in snapshots)
            total_snaps = len(snapshots)

    status_ok()
    metric_bool('cinder_api_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        metric('cinder_api_local_response_time',
               'double',
               '%.3f' % milliseconds,
               'ms')
        metric('cinder_total_volumes', 'uint32', total_vols, 'volumes')
        for status in VOLUME_STATUSES:
            metric('cinder_%s_volumes' % status,
                   'uint32',
                   vol_status_count[status], 'volumes')
        metric('cinder_total_snapshots', 'uint32', total_snaps, 'snapshots')
        # Snapshots are bucketed by the same VOLUME_STATUSES list.
        for status in VOLUME_STATUSES:
            metric('cinder_%s_snaps' % status,
                   'uint32',
                   snap_status_count[status], 'snapshots')
def get_mon_statistics(report=None, host=None):
    """Emit quorum membership and health for the monitor named ``host``.

    :param report: mapping that is indexed by ``monmap``, ``quorum`` and
        ``health``.
    :param host: monitor name to look up.
    :raises IndexError: if no monitor in the monmap is named ``host``.
    """
    candidates = [m for m in report['monmap']['mons'] if m['name'] == host]
    in_quorum = candidates[0]['rank'] in report['quorum']
    maas_common.metric_bool('mon_in_quorum', in_quorum)

    health_services = report['health']['health']['health_services']
    entry = next(
        (mon for mon in health_services[0]['mons'] if mon['name'] == host),
        None,
    )
    # Falls back to 0 when the health section has no entry for this host.
    health_status = 0 if entry is None else STATUSES[entry['health']]
    maas_common.metric('mon_health', 'uint32', health_status)
def check(args):
    """Emit ``ironic-conductor_status`` from the local process table.

    The metric is True when any running process has a name containing
    ``'ironic-conducto'`` and False otherwise.

    :param args: not used by this check.
    """
    # NOTE(npawelek): API calls for conductor status are only available
    # in ironic v1.49 and onward. Instead, we look for the process
    # directly until it becomes available within the API.
    name = "ironic-conductor_status"
    conductor_running = False

    for proc in psutil.process_iter():
        try:
            proc_name = proc.name()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process went away (or became unreadable) between being
            # listed and being inspected. Skip it rather than letting the
            # whole check fail on an unrelated process.
            continue

        if 'ironic-conducto' in proc_name:
            conductor_running = True
            break

    metric_bool(name, conductor_running)
def get_mon_statistics(client=None, keyring=None, host=None):
    """Emit quorum membership and health for the monitor named ``host``.

    :param client: forwarded to ``get_ceph_status``.
    :param keyring: forwarded to ``get_ceph_status``.
    :param host: monitor name to look up.
    :raises IndexError: if no monitor in the monmap is named ``host``.
    """
    status = get_ceph_status(client=client, keyring=keyring)

    named = [entry for entry in status['monmap']['mons']
             if entry['name'] == host]
    rank = named[0]['rank']
    maas_common.metric_bool('mon_in_quorum', rank in status['quorum'])

    mon_health = status['health']['health']['health_services'][0]['mons']
    found = next((item for item in mon_health if item['name'] == host), None)
    # Stays at 0 when the health section does not mention this host.
    if found is None:
        health_status = 0
    else:
        health_status = STATUSES[found['health']]
    maas_common.metric('mon_health', 'uint32', health_status)
def check(args):
    """Run the process status checks for the neutron agents.

    When ``on_lxc_container`` is false the checks run against the local
    hostname. Otherwise every LXC container whose name contains
    ``neutron_agents`` is looked up and checked in turn. If there is none, a
    failed ``agents_found`` metric and an error status are emitted.

    :param args: not used by this check.
    """
    if not on_lxc_container:
        check_process_statuses(socket.gethostname())
        return

    agent_containers = []
    for container_name in lxc.list_containers():
        if 'neutron_agents' in container_name:
            metric_bool('agents_found', True, m_name='maas_neutron')
            agent_containers.append(container_name)

    if not agent_containers:
        metric_bool('agents_found', False, m_name='maas_neutron')
        status_err('no running neutron agents containers found',
                   m_name='maas_neutron')
        return

    for container_name in agent_containers:
        # Get the container's init PID.
        try:
            c = lxc.Container(container_name)
            # An init PID of -1 is treated as "container not found".
            if c.init_pid == -1:
                metric_bool('container_success', False,
                            m_name='maas_neutron_agent_container')
                status_err(
                    'Could not find PID for container {}'.format(
                        container_name
                    ),
                    m_name='maas_neutron_agent_container'
                )
        except Exception as e:
            metric_bool('container_success', False,
                        m_name='maas_neutron_agent_container')
            status_err(
                'Container lookup failed on "{}". ERROR: "{}"'
                .format(
                    container_name,
                    e
                ),
                m_name='maas_neutron_agent_container'
            )
        else:
            metric_bool('container_success', True,
                        m_name='maas_neutron_agent_container')

            # c is the lxc container instance of this container_name
            check_process_statuses(container_name, c)