def check(args):
    """Check the managed-k8s API locally by listing pods with kubectl.

    Emits ``client_success``, ``managed_k8_api_local_status`` and, when the
    API answers, a response-time metric.
    """
    try:
        start = datetime.datetime.now()
        # this is lame but the k8 python client did not work
        ret = subprocess.check_output([
            'kubectl',
            "--kubeconfig=%s" % args.kubeconfig,
            "--namespace=%s" % RACKSPACE_SYSTEM_NS,
            'get', 'pods'
        ])
        end = datetime.datetime.now()
        # check_output() returns bytes on Python 3 -- decode before
        # splitting, otherwise split('\n') raises TypeError.
        # if rack system is empty something is terribly wrong
        api_is_up = (len(ret.decode().split('\n')) > 1)
    except subprocess.CalledProcessError:
        api_is_up = False
        metric_bool('client_success', False, m_name='maas_managed_k8')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_managed_k8')
        status_err(str(e), m_name='maas_managed_k8')
    else:
        metric_bool('client_success', True, m_name='maas_managed_k8')
        dt = (end - start)
        milliseconds = (dt.microseconds + dt.seconds * 10**6) / 10**3

    status_ok(m_name='maas_managed_k8')
    metric_bool('managed_k8_api_local_status', api_is_up,
                m_name='maas_managed_k8')
    if api_is_up:
        # only want to send other metrics if api is up
        metric('managed_k8_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def main():
    """Gather RabbitMQ metrics over the management API and emit them."""
    metrics = {}
    # One Session carries the basic-auth credentials for every request.
    session = requests.Session()
    session.auth = (options.username, options.password)
    scheme = 'https' if options.https else 'http'
    host, port = options.host, options.port

    _get_connection_metrics(session, metrics, scheme, host, port)
    _get_overview_metrics(session, metrics, scheme, host, port)
    _get_node_metrics(session, metrics, scheme, host, port, options.name)
    _get_queue_metrics(session, metrics, scheme, host, port)
    _get_consumer_metrics(session, metrics, scheme, host, port)

    status_ok(m_name='maas_rabbitmq')
    for name, data in metrics.items():
        value = data['value']
        if value is True or value is False:
            # Boolean values are reported inverted as *_status flags.
            metric_bool('rabbitmq_%s_status' % name, not value)
        else:
            metric('rabbitmq_%s' % name, 'int64', value, data['unit'])
def check(args):
    """Emit cloud-wide nova hypervisor capacity metrics.

    Applies the CPU/memory allocation ratios from ``args`` to the raw
    hypervisor totals. Uses Python 3 dict iteration (``items()``) --
    ``iteritems()``/``iterkeys()`` no longer exist -- matching the
    modernized variant of this check elsewhere in the codebase.
    """
    try:
        nova = get_openstack_client('compute')
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_nova')
        status_err(str(e), m_name='maas_nova')
    else:
        metric_bool('client_success', True, m_name='maas_nova')
        # get some cloud stats
        stats = [nova.get_hypervisor(i.id) for i in nova.hypervisors()]
        cloud_stats = collections.defaultdict(dict)
        count = 0
        for stat in stats:
            count += 1
            setattr(stat, 'count', count)
            for metric_name, vals in stats_mapping.items():
                multiplier = 1
                if metric_name == 'total_vcpus':
                    multiplier = args.cpu_allocation_ratio
                elif metric_name == 'total_memory':
                    multiplier = args.mem_allocation_ratio
                cloud_stats[metric_name]['value'] = \
                    (getattr(stat, vals['stat_name']) * multiplier)
                cloud_stats[metric_name]['unit'] = \
                    vals['unit']
                cloud_stats[metric_name]['type'] = \
                    vals['type']
        status_ok(m_name='maas_nova')
        for metric_name in cloud_stats:
            metric('cloud_resource_%s' % metric_name,
                   cloud_stats[metric_name]['type'],
                   cloud_stats[metric_name]['value'],
                   cloud_stats[metric_name]['unit'])
def check(auth_ref, args):
    """Probe the designate API (local or catalog endpoint) and time it."""
    DESIGNATE_ENDPOINT = '{protocol}://{ip}:9001/'.format(
        protocol=args.protocol, ip=args.ip)
    try:
        if args.ip:
            endpoint = DESIGNATE_ENDPOINT
        else:
            endpoint = get_endpoint_url_for_service('dns', auth_ref,
                                                    'internal')
        # time something arbitrary
        start = datetime.datetime.now()
        response = requests.get(endpoint)
        end = datetime.datetime.now()
        api_is_up = (response.status_code == 200)
    except (requests.HTTPError, requests.Timeout, requests.ConnectionError):
        api_is_up = False
        metric_bool('client_success', False, m_name='maas_designate')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_designate')
        status_err(str(e), m_name='maas_designate')
    else:
        metric_bool('client_success', True, m_name='maas_designate')
        elapsed = end - start
        milliseconds = (elapsed.microseconds +
                        elapsed.seconds * 10**6) / 10**3

    status_ok(m_name='maas_designate')
    metric_bool('designate_api_local_status', api_is_up,
                m_name='maas_designate')
    if api_is_up:
        # only want to send other metrics if api is up
        metric('designate_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(auth_ref, args):
    """Check the local cinder v1 API and report its status and latency."""
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    VOLUME_ENDPOINT = ('http://{ip}:8776/v1/{tenant}'.format(
        ip=args.ip, tenant=keystone.tenant_id))

    session = requests.Session()
    session.headers.update({
        'Content-type': 'application/json',
        'x-auth-token': auth_token
    })
    try:
        response = session.get('%s/volumes' % VOLUME_ENDPOINT,
                               verify=False, timeout=10)
        is_up = response.ok
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout) as e:
        status_err(str(e))
    else:
        status_ok()
        metric_bool('cinder_api_local_status', is_up)
        # only want to send other metrics if api is up
        if is_up:
            milliseconds = response.elapsed.total_seconds() * 1000
            metric('cinder_api_local_response_time', 'uint32',
                   '%.3f' % milliseconds, 'ms')
def main(args):
    """Check a local memcached instance and emit status/version metrics.

    Fails the check outright (``status_err``) when the observed memcached
    version is not one this plugin was tested against.
    """
    bind_ip = str(args.ip)
    port = args.port
    is_up = True
    try:
        stats = item_stats(bind_ip, port)
        current_version = stats['version']
    except (TypeError, IndexError):
        is_up = False
        metric_bool('client_success', False, m_name='maas_memcached')
    else:
        is_up = True
        metric_bool('client_success', True, m_name='maas_memcached')
        if current_version not in VERSIONS:
            status_err('This plugin has only been tested with version %s '
                       'of memcached, and you are using version %s'
                       % (VERSIONS, current_version),
                       m_name='maas_memcached')

    status_ok(m_name='maas_memcached')
    metric_bool('memcache_api_local_status', is_up, m_name='maas_memcached')
    if is_up:
        # items(): dict.iteritems() does not exist on Python 3.
        for m, u in MEMCACHE_METRICS.items():
            metric('memcache_%s' % m, 'uint64', stats[m], u)
def check(auth_ref, args):
    """Check the local glance-api endpoint and report status/latency."""
    # We call get_keystone_client here as there is some logic within to get a
    # new token if previous one is bad.
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    api_endpoint = 'http://{ip}:9292/v2'.format(ip=args.ip)

    session = Session()
    session.headers.update({'Content-type': 'application/json',
                            'x-auth-token': auth_token})
    try:
        # Hit something that isn't querying the glance-registry, since we
        # query glance-registry in separate checks
        response = session.get('%s/schemas/image' % api_endpoint,
                               verify=False, timeout=10)
        is_up = response.ok
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
    except Exception as e:
        status_err(str(e))

    status_ok()
    metric_bool('glance_api_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        milliseconds = response.elapsed.total_seconds() * 1000
        metric('glance_api_local_response_time', 'uint32',
               '%.3f' % milliseconds, 'ms')
def check(auth_ref, args):
    """Emit cloud-wide nova hypervisor statistics.

    Uses Python 3 dict iteration (``items()`` / plain iteration) instead of
    the removed ``iteritems()``/``iterkeys()``.
    """
    keystone = get_keystone_client(auth_ref)
    tenant_id = keystone.tenant_id
    COMPUTE_ENDPOINT = ('http://{ip}:8774/v2/{tenant_id}'.format(
        ip=args.ip, tenant_id=tenant_id))
    try:
        if args.ip:
            nova = get_nova_client(bypass_url=COMPUTE_ENDPOINT)
        else:
            nova = get_nova_client()
    except Exception as e:
        status_err(str(e))
    else:
        # get some cloud stats
        stats = nova.hypervisor_stats.statistics()
        cloud_stats = collections.defaultdict(dict)
        for metric_name, vals in stats_mapping.items():
            cloud_stats[metric_name]['value'] = \
                getattr(stats, vals['stat_name'])
            cloud_stats[metric_name]['unit'] = \
                vals['unit']
            cloud_stats[metric_name]['type'] = \
                vals['type']
        status_ok()
        for metric_name in cloud_stats:
            metric('cloud_resource_%s' % metric_name,
                   cloud_stats[metric_name]['type'],
                   cloud_stats[metric_name]['value'],
                   cloud_stats[metric_name]['unit'])
def check(auth_ref, args):
    """Check the local glance-registry endpoint and report status/latency."""
    # We call get_keystone_client here as there is some logic within to get a
    # new token if previous one is bad.
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    registry_endpoint = 'http://{ip}:9191'.format(ip=args.ip)

    session = Session()
    session.headers.update({'Content-type': 'application/json',
                            'x-auth-token': auth_token})
    try:
        # /images returns a list of public, non-deleted images
        response = session.get('%s/images' % registry_endpoint,
                               verify=False, timeout=10)
        is_up = response.ok
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
    except Exception as e:
        status_err(str(e))

    status_ok()
    metric_bool('glance_registry_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        milliseconds = response.elapsed.total_seconds() * 1000
        metric('glance_registry_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(args):
    """Check the local nova metadata API by listing the EC2 versions.

    On Python 3 ``Response.content`` is ``bytes``; it must be decoded
    before comparing its lines against the str ``'1.0'`` (otherwise the
    membership test always fails), as done in the modernized variant of
    this check.
    """
    metadata_endpoint = ('http://{ip}:8775'.format(ip=args.ip))
    is_up = True

    s = requests.Session()
    try:
        # looks like we can only get / (ec2 versions) without specifying
        # an instance ID and other headers
        versions = s.get('%s/' % metadata_endpoint,
                         verify=False,
                         timeout=10)
        milliseconds = versions.elapsed.total_seconds() * 1000
        if not versions.ok or '1.0' not in versions.content.decode(
                ).splitlines():
            is_up = False
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout) as e:
        is_up = False
    except Exception as e:
        status_err(str(e))

    status_ok()
    metric_bool('nova_api_metadata_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        metric('nova_api_metadata_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check_process_running(process_names, container_name=None):
    """Check to see if processes are running.

    Check if each of the processes in process_names are in a list of running
    processes in the specified container name, or on this host.
    """
    if not process_names:
        # The caller has not provided a value for process_names, which gives
        # us nothing to do. Return an error for the check.
        status_err('No process names provided')

    if container_name is None:
        procs_path = '/sys/fs/cgroup/cpu/cgroup.procs'
    else:
        # Checking for processes in a container, not the parent host
        procs_path = os.path.join('/sys/fs/cgroup/cpu/lxc',
                                  container_name, 'cgroup.procs')

    procs = get_processes(procs_path)
    if not procs:
        # Unable to get a list of process names for the container or host.
        status_err('Could not get a list of running processes')

    # Report the presence of each process from the command line in the
    # running process list for the host or specified container.
    for name in process_names:
        metric('process_check', '%s_process_status' % name,
               str(int(name in procs)))
def check(args):
    """Check the local ironic API by listing nodes.

    Emits ``client_success`` on every path (the requests-exception branch
    previously omitted it, unlike the sibling heat/octavia checks), plus
    ``ironic_api_local_status`` and a latency metric when up.
    """
    ironic = get_openstack_client('baremetal')
    try:
        ironic_local_endpoint = generate_local_endpoint(
            str(ironic.get_endpoint()), args.ip, args.port,
            args.protocol, '/nodes')
        resp = ironic.session.get(ironic_local_endpoint)
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
        metric_bool('client_success', False, m_name='maas_ironic')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_ironic')
        status_err(str(e), m_name='maas_ironic')
    else:
        is_up = resp.status_code == 200
        milliseconds = resp.elapsed.total_seconds() * 1000
        metric_bool('client_success', True, m_name='maas_ironic')

    status_ok(m_name='maas_ironic')
    metric_bool('ironic_api_local_status', is_up, m_name='maas_ironic')
    if is_up:
        metric('ironic_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(args):
    """Check the local heat API via its /build_info endpoint."""
    heat = get_openstack_client('orchestration')
    try:
        build_info_url = generate_local_endpoint(
            str(heat.get_endpoint()), args.ip, args.port,
            args.protocol, '/build_info'
        )
        response = heat.session.get(build_info_url)
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
        metric_bool('client_success', False, m_name='maas_heat')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_heat')
        status_err(str(e), m_name='maas_heat')
    else:
        is_up = True
        milliseconds = response.elapsed.total_seconds() * 1000
        metric_bool('client_success', True, m_name='maas_heat')

    status_ok(m_name='maas_heat')
    metric_bool('heat_api_local_status', is_up, m_name='maas_heat')
    if is_up:
        # only want to send other metrics if api is up
        metric('heat_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(auth_ref, args):
    """Check the local ironic API (fixed-port variant) and time a listing."""
    IRONIC_ENDPOINT = ('http://{ip}:6385/v1'.format(ip=args.ip))
    try:
        if args.ip:
            ironic = get_ironic_client(endpoint=IRONIC_ENDPOINT)
        else:
            ironic = get_ironic_client()
        is_up = True
    except exc.ClientException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_ironic')
        status_err(str(e), m_name='maas_ironic')
    else:
        metric_bool('client_success', True, m_name='maas_ironic')
        # time something arbitrary
        start = time.time()
        ironic.node.list()
        milliseconds = (time.time() - start) * 1000

    status_ok(m_name='maas_ironic')
    metric_bool('ironic_api_local_status', is_up, m_name='maas_ironic')
    if is_up:
        # only want to send other metrics if api is up
        metric('ironic_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(auth_ref, args):
    """Check the local heat API (fixed-port variant) and time build_info."""
    keystone = get_keystone_client(auth_ref)
    tenant_id = keystone.tenant_id
    HEAT_ENDPOINT = ('http://{ip}:8004/v1/{tenant}'.format
                     (ip=args.ip, tenant=tenant_id))
    try:
        if args.ip:
            heat = get_heat_client(endpoint=HEAT_ENDPOINT)
        else:
            heat = get_heat_client()
        is_up = True
    except exc.HTTPException as e:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # time something arbitrary
        start = time()
        heat.build_info.build_info()
        milliseconds = (time() - start) * 1000

    status_ok()
    metric_bool('heat_api_local_status', is_up)
    if is_up:
        # only want to send other metrics if api is up
        metric('heat_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(args):
    """Count octavia load balancers stuck in provisioning_status ERROR."""
    octavia = get_openstack_client('load_balancer')
    try:
        if args.ip:
            octavia_local_endpoint = generate_local_endpoint(
                str(octavia.get_endpoint()), args.ip, args.port,
                args.protocol, '/lbaas/loadbalancers'
            )
        response = octavia.session.get(octavia_local_endpoint, timeout=180)
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
        metric_bool('client_success', False, m_name='maas_octavia')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_octavia')
        status_err(str(e), m_name='maas_octavia')
    else:
        is_up = response.ok
        metric_bool('client_success', True, m_name='maas_octavia')

    status_ok(m_name='maas_octavia')
    metric_bool('octavia_api_local_status', is_up, m_name='maas_octavia')
    if is_up:
        lbs = response.json()['loadbalancers']
        errored = sum(1 for lb in lbs
                      if lb['provisioning_status'] == 'ERROR')
        # only want to send other metrics if api is up
        # NOTE(review): the 'ms' unit looks like a copy/paste from the
        # response-time metric -- this is a count; confirm before changing.
        metric('octavia_num_lb_in_error_status', 'uint32', errored, 'ms')
def check(auth_ref, args):
    """Check the local heat API (protocol/port aware) and time build_info."""
    keystone = get_keystone_client(auth_ref)
    tenant_id = keystone.tenant_id
    heat_endpoint = ('{protocol}://{ip}:{port}/v1/{tenant}'.format(
        ip=args.ip, tenant=tenant_id, protocol=args.protocol,
        port=args.port))
    try:
        if args.ip:
            heat = get_heat_client(endpoint=heat_endpoint)
        else:
            heat = get_heat_client()
        is_up = True
    except exc.HTTPException as e:
        is_up = False
        metric_bool('client_success', False, m_name='maas_heat')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_heat')
        status_err(str(e), m_name='maas_heat')
    else:
        metric_bool('client_success', True, m_name='maas_heat')
        # time something arbitrary
        start = time.time()
        heat.build_info.build_info()
        milliseconds = (time.time() - start) * 1000

    status_ok(m_name='maas_heat')
    metric_bool('heat_api_local_status', is_up, m_name='maas_heat')
    if is_up:
        # only want to send other metrics if api is up
        metric('heat_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(args):
    """Check the local octavia API and report its status and latency."""
    octavia = get_openstack_client('load_balancer')
    try:
        if args.ip:
            octavia_local_endpoint = generate_local_endpoint(
                str(octavia.get_endpoint()), args.ip, args.port,
                args.protocol, '/lbaas/loadbalancers?limit=1')
        response = octavia.session.get(octavia_local_endpoint, timeout=180)
    except (exc.HTTPError, exc.Timeout, exc.ConnectionError):
        is_up = False
        metric_bool('client_success', False, m_name='maas_octavia')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_octavia')
        status_err(str(e), m_name='maas_octavia')
    else:
        is_up = response.ok
        metric_bool('client_success', True, m_name='maas_octavia')
        milliseconds = response.elapsed.total_seconds() * 1000

    status_ok(m_name='maas_octavia')
    metric_bool('octavia_api_local_status', is_up, m_name='maas_octavia')
    if is_up:
        # only want to send other metrics if api is up
        metric('octavia_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(args):
    """Check the local glance registry endpoint via the image client.

    The success-path metrics are emitted from an ``else`` clause: previously
    ``client_success True`` was sent unconditionally, so a connection
    failure reported BOTH ``client_success False`` and ``True``.
    """
    glance = get_openstack_client('image')
    try:
        # Remove version from returned endpoint
        glance_endpoint = str(glance.get_endpoint().rsplit('/', 2)[0])
        local_registry_url = generate_local_endpoint(
            glance_endpoint, args.ip, args.port, args.protocol,
            '/images'
        )
        resp = glance.session.get(local_registry_url, timeout=180)
        milliseconds = resp.elapsed.total_seconds() * 1000
        is_up = resp.status_code == 200
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
        metric_bool('client_success', False, m_name='maas_glance')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_glance')
        status_err(str(e), m_name='maas_glance')
    else:
        metric_bool('client_success', True, m_name='maas_glance')

    status_ok(m_name='maas_glance')
    metric_bool('glance_registry_local_status', is_up, m_name='maas_glance')
    # Only send remaining metrics if the API is up
    if is_up:
        metric('glance_registry_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(args):
    """Check the local neutron API and report its status and latency."""
    NETWORK_ENDPOINT = 'http://{ip}:9696'.format(ip=args.ip)
    try:
        neutron = get_neutron_client(endpoint_url=NETWORK_ENDPOINT)
        is_up = True
    # if we get a NeutronClientException don't bother sending any other
    # metric. The API IS DOWN
    except exc.NeutronClientException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # time something arbitrary
        start = time()
        neutron.list_agents()
        milliseconds = (time() - start) * 1000

    status_ok()
    metric_bool('neutron_api_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        metric('neutron_api_local_response_time', 'uint32',
               '%.3f' % milliseconds, 'ms')
def check(args):
    """Check the local keystone admin API and report status plus latency."""
    IDENTITY_ENDPOINT = 'http://{ip}:35357/v2.0'.format(ip=args.ip)
    try:
        keystone = get_keystone_client(endpoint=IDENTITY_ENDPOINT)
        is_up = True
    except (exc.HttpServerError, exc.ClientException):
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # time something arbitrary
        start = time()
        keystone.services.list()
        milliseconds = (time() - start) * 1000

    status_ok()
    metric_bool('keystone_api_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        metric('keystone_api_local_response_time', 'uint32',
               '%.3f' % milliseconds, 'ms')
def check(auth_ref, args):
    """Check the local glance-registry endpoint (requests.Session variant)."""
    # We call get_keystone_client here as there is some logic within to get a
    # new token if previous one is bad.
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    registry_endpoint = 'http://{ip}:9191'.format(ip=args.ip)

    session = requests.Session()
    session.headers.update({
        'Content-type': 'application/json',
        'x-auth-token': auth_token
    })
    try:
        # /images returns a list of public, non-deleted images
        response = session.get('%s/images' % registry_endpoint,
                               verify=False, timeout=10)
        is_up = response.ok
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
    except Exception as e:
        status_err(str(e))

    status_ok()
    metric_bool('glance_registry_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        milliseconds = response.elapsed.total_seconds() * 1000
        metric('glance_registry_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(args):
    """Count octavia load balancers whose provisioning_status is ERROR."""
    octavia = get_openstack_client('load_balancer')
    try:
        if args.ip:
            octavia_local_endpoint = generate_local_endpoint(
                str(octavia.get_endpoint()), args.ip, args.port,
                args.protocol, '/lbaas/loadbalancers')
        response = octavia.session.get(octavia_local_endpoint, timeout=180)
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
        metric_bool('client_success', False, m_name='maas_octavia')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_octavia')
        status_err(str(e), m_name='maas_octavia')
    else:
        is_up = response.ok
        metric_bool('client_success', True, m_name='maas_octavia')

    status_ok(m_name='maas_octavia')
    metric_bool('octavia_api_local_status', is_up, m_name='maas_octavia')
    if is_up:
        lbs = response.json()['loadbalancers']
        errored = sum(1 for lb in lbs
                      if lb['provisioning_status'] == 'ERROR')
        # only want to send other metrics if api is up
        # NOTE(review): unit 'ms' on a count metric looks copy/pasted from
        # the response-time metric -- confirm before changing.
        metric('octavia_num_lb_in_error_status', 'uint32', errored, 'ms')
def check(args, tenant_id):
    """Check the local heat API for *tenant_id* and time build_info."""
    HEAT_ENDPOINT = ('http://{ip}:8004/v1/{tenant}'.format
                     (ip=args.ip, tenant=tenant_id))
    try:
        heat = get_heat_client(endpoint=HEAT_ENDPOINT)
        is_up = True
    except exc.HTTPException as e:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # time something arbitrary
        start = time()
        heat.build_info.build_info()
        milliseconds = (time() - start) * 1000

    status_ok()
    metric_bool('heat_api_local_status', is_up)
    if is_up:
        # only want to send other metrics if api is up
        metric('heat_api_local_response_time', 'uint32',
               '%.3f' % milliseconds, 'ms')
def check(auth_ref, args):
    """Check the local ironic API (protocol/port aware) and time a listing."""
    ironic_endpoint = ('{protocol}://{ip}:{port}/v1'.format(
        ip=args.ip, protocol=args.protocol, port=args.port))
    try:
        if args.ip:
            ironic = get_ironic_client(endpoint=ironic_endpoint)
        else:
            ironic = get_ironic_client()
        is_up = True
    except exc.ClientException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_ironic')
        status_err(str(e), m_name='maas_ironic')
    else:
        metric_bool('client_success', True, m_name='maas_ironic')
        # time something arbitrary
        start = time.time()
        ironic.node.list()
        milliseconds = (time.time() - start) * 1000

    status_ok(m_name='maas_ironic')
    metric_bool('ironic_api_local_status', is_up, m_name='maas_ironic')
    if is_up:
        # only want to send other metrics if api is up
        metric('ironic_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(auth_ref, args):
    """Report up/down state for every cinder service.

    Queries ``/os-services`` on the local cinder v1 API. When ``args.host``
    is given, only services on that host are reported (matching both bare
    hostnames and ``host@backend`` forms) and metric names encode the
    backend; otherwise every service is reported with the host embedded in
    the metric name. Exits via status_err() on connection failure, a bad
    response, or an empty (filtered) service list.
    """
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    VOLUME_ENDPOINT = ('http://{hostname}:8776/v1/{tenant}'.format(
        hostname=args.hostname, tenant=keystone.tenant_id))
    s = requests.Session()
    s.headers.update({
        'Content-type': 'application/json',
        'x-auth-token': auth_token
    })
    try:
        # We cannot do /os-services?host=X as cinder returns a hostname of
        # X@lvm for cinder-volume binary
        r = s.get('%s/os-services' % VOLUME_ENDPOINT,
                  verify=False, timeout=10)
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout) as e:
        status_err(str(e))
    if not r.ok:
        status_err('Could not get response from Cinder API')
    services = r.json()['services']
    # We need to match against a host of X and X@lvm (or whatever backend)
    if args.host:
        backend = ''.join((args.host, '@'))
        services = [service for service in services
                    if (service['host'].startswith(backend) or
                        service['host'] == args.host)]
    if len(services) == 0:
        status_err('No host(s) found in the service list')
    if args.host:
        for service in services:
            # A service counts as down only when it is enabled yet not 'up';
            # disabled services are deliberately reported as up.
            service_is_up = True
            name = '%s_status' % service['binary']
            if service['status'] == 'enabled' and service['state'] != 'up':
                service_is_up = False
            if '@' in service['host']:
                # Encode the backend (the part after '@') into the name so
                # multiple cinder-volume backends stay distinguishable.
                [host, backend] = service['host'].split('@')
                name = '%s-%s_status' % (service['binary'], backend)
            metric('cinder_service', name, str(int(service_is_up)))
    else:
        for service in services:
            service_is_up = True
            if service['status'] == 'enabled' and service['state'] != 'up':
                service_is_up = False
            # Unfiltered mode: one metric per (binary, host) pair.
            name = '%s_on_host_%s' % (service['binary'], service['host'])
            metric('cinder_service', name, str(int(service_is_up)))
def check(args):
    """Check the local nova API and report its status and latency."""
    COMPUTE_ENDPOINT = 'http://{ip}:8774/v3'.format(ip=args.ip)
    try:
        nova = get_nova_client(bypass_url=COMPUTE_ENDPOINT)
        is_up = True
    except exc.ClientException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # time something arbitrary
        start = time()
        nova.services.list()
        milliseconds = (time() - start) * 1000

    status_ok()
    metric_bool('nova_api_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        metric('nova_api_local_response_time', 'uint32',
               '%.3f' % milliseconds, 'ms')
def get_rgw_checkup(client, keyring=None, rgw_address=None,
                    container_name=None):
    """Probe the radosgw endpoint and emit its up/down state.

    ``client`` and ``keyring`` are accepted for signature parity with the
    other ceph check helpers but are not used here.
    """
    status = get_ceph_rgw_hostcheck(rgw_address,
                                    container_name=container_name)
    maas_common.metric('rgw_up', 'uint32', status)
def check(auth_ref, args):
    """Check the local magnum API and time a cluster-template listing."""
    MAGNUM_ENDPOINT = 'http://{ip}:9511/v1'.format(ip=args.ip,)
    try:
        if args.ip:
            magnum = get_magnum_client(endpoint=MAGNUM_ENDPOINT)
        else:
            magnum = get_magnum_client()
        api_is_up = True
    except exc.HttpError as e:
        api_is_up = False
        metric_bool('client_success', False, m_name='maas_magnum')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_magnum')
        status_err(str(e), m_name='maas_magnum')
    else:
        metric_bool('client_success', True, m_name='maas_magnum')
        # time something arbitrary
        start = time.time()
        magnum.cluster_templates.list()
        milliseconds = (time.time() - start) * 1000

    status_ok(m_name='maas_magnum')
    metric_bool('magnum_api_local_status', api_is_up, m_name='maas_magnum')
    if api_is_up:
        # only want to send other metrics if api is up
        metric('magnum_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(args):
    """Check the local nova API; also report server counts per state."""
    COMPUTE_ENDPOINT = 'http://{ip}:8774/v3'.format(ip=args.ip)
    try:
        nova = get_nova_client(bypass_url=COMPUTE_ENDPOINT)
        is_up = True
    except exc.ClientException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # time something arbitrary
        start = time()
        nova.services.list()
        milliseconds = (time() - start) * 1000
        # gather some metrics
        status_count = collections.Counter(
            [server.status for server in nova.servers.list()])

    status_ok()
    metric_bool('nova_api_local_status', is_up)
    # only want to send other metrics if api is up
    if is_up:
        metric('nova_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
        for state in SERVER_STATUSES:
            metric('nova_servers_in_state_%s' % state, 'uint32',
                   status_count[state])
def check(args):
    """Check the local ironic API by listing nodes.

    Emits ``client_success`` on every path (the requests-exception branch
    previously omitted it, unlike the sibling heat/octavia checks), plus
    ``ironic_api_local_status`` and a latency metric when up.
    """
    ironic = get_openstack_client('baremetal')
    try:
        ironic_local_endpoint = generate_local_endpoint(
            str(ironic.get_endpoint()), args.ip, args.port,
            args.protocol, '/nodes'
        )
        resp = ironic.session.get(ironic_local_endpoint)
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout):
        is_up = False
        metric_bool('client_success', False, m_name='maas_ironic')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_ironic')
        status_err(str(e), m_name='maas_ironic')
    else:
        is_up = resp.status_code == 200
        milliseconds = resp.elapsed.total_seconds() * 1000
        metric_bool('client_success', True, m_name='maas_ironic')

    status_ok(m_name='maas_ironic')
    metric_bool('ironic_api_local_status', is_up, m_name='maas_ironic')
    if is_up:
        metric('ironic_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(args):
    """Emit cloud-wide nova hypervisor capacity metrics (py3 variant)."""
    try:
        nova = get_openstack_client('compute')
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_nova')
        status_err(str(e), m_name='maas_nova')
    else:
        metric_bool('client_success', True, m_name='maas_nova')
        # get some cloud stats
        stats = [nova.get_hypervisor(i.id) for i in nova.hypervisors()]
        cloud_stats = collections.defaultdict(dict)
        count = 0
        for stat in stats:
            count += 1
            setattr(stat, 'count', count)
            for metric_name, vals in iter(stats_mapping.items()):
                # vcpu/memory totals are scaled by the configured
                # overcommit ratios; everything else passes through.
                if metric_name == 'total_vcpus':
                    multiplier = args.cpu_allocation_ratio
                elif metric_name == 'total_memory':
                    multiplier = args.mem_allocation_ratio
                else:
                    multiplier = 1
                entry = cloud_stats[metric_name]
                entry['value'] = (getattr(stat, vals['stat_name']) *
                                  multiplier)
                entry['unit'] = vals['unit']
                entry['type'] = vals['type']
        status_ok(m_name='maas_nova')
        for metric_name in iter(cloud_stats):
            entry = cloud_stats[metric_name]
            metric('cloud_resource_%s' % metric_name,
                   entry['type'], entry['value'], entry['unit'])
def check(args):
    """Check the local nova metadata API by listing the EC2 versions."""
    metadata_endpoint = ('{protocol}://{ip}:{port}'.format(
        ip=args.ip, protocol=args.protocol, port=args.port))
    is_up = True

    session = requests.Session()
    try:
        # looks like we can only get / (ec2 versions) without specifying
        # an instance ID and other headers
        versions = session.get('%s/' % metadata_endpoint,
                               verify=False, timeout=180)
        milliseconds = versions.elapsed.total_seconds() * 1000
        if not versions.ok or '1.0' not in versions.content.decode(
                ).splitlines():
            is_up = False
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout) as e:
        is_up = False
        metric_bool('client_success', False, m_name='maas_nova')
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_nova')
        status_err(str(e), m_name='maas_nova')
    else:
        metric_bool('client_success', True, m_name='maas_nova')

    status_ok(m_name='maas_nova')
    metric_bool('nova_api_metadata_local_status', is_up,
                m_name='maas_nova')
    # only want to send other metrics if api is up
    if is_up:
        metric('nova_api_metadata_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(auth_ref, args):
    """Check the local magnum API and time a cluster-template listing."""
    MAGNUM_ENDPOINT = 'http://{ip}:9511/v1'.format(ip=args.ip, )
    try:
        if args.ip:
            magnum = get_magnum_client(endpoint=MAGNUM_ENDPOINT)
        else:
            magnum = get_magnum_client()
        api_is_up = True
    except exc.HttpError as e:
        api_is_up = False
        metric_bool('client_success', False, m_name='maas_magnum')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_magnum')
        status_err(str(e), m_name='maas_magnum')
    else:
        metric_bool('client_success', True, m_name='maas_magnum')
        # time something arbitrary
        start = time.time()
        magnum.cluster_templates.list()
        milliseconds = (time.time() - start) * 1000

    status_ok(m_name='maas_magnum')
    metric_bool('magnum_api_local_status', api_is_up, m_name='maas_magnum')
    if api_is_up:
        # only want to send other metrics if api is up
        metric('magnum_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(auth_ref, args):
    """Report how many ironic nodes are up vs. in maintenance."""
    ironic_endpoint = ('{protocol}://{ip}:{port}/v1'.format(
        ip=args.ip, protocol=args.protocol, port=args.port))
    try:
        if args.ip:
            ironic = get_ironic_client(endpoint=ironic_endpoint)
        else:
            ironic = get_ironic_client()
        is_up = True
    except exc.ClientException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_ironic')
        status_err(str(e), m_name='maas_ironic')
        return
    else:
        metric_bool('client_success', True, m_name='maas_ironic')
        # pass limit=0 to list all nodes list without pagination
        all_nodes = ironic.node.list(limit=0)

    status_ok(m_name='maas_ironic')
    if is_up:
        in_maintenance = len([node for node in all_nodes
                              if node.maintenance])
        total_nodes = len(all_nodes)
        metric('ironic_up_nodes_count', 'uint32',
               total_nodes - in_maintenance)
        metric('ironic_total_nodes_count', 'uint32', total_nodes)
def check(auth_ref, args):
    """Check the octavia API (local or catalog endpoint) and time it.

    The local endpoint must NOT include ``/v1``: the request path below
    already appends ``/v1/loadbalancers``, so the previous value produced
    ``http://ip:9876/v1/v1/loadbalancers`` whenever ``args.ip`` was set.
    """
    OCTAVIA_ENDPOINT = 'http://{ip}:9876'.format(ip=args.ip,)
    try:
        if args.ip:
            endpoint = OCTAVIA_ENDPOINT
        else:
            endpoint = get_endpoint_url_for_service(
                'load-balancer', auth_ref, 'internal')
        # time something arbitrary
        start = datetime.datetime.now()
        r = requests.get(endpoint + "/v1/loadbalancers?limit=1")
        end = datetime.datetime.now()
        api_is_up = (r.status_code == 200)
    except (requests.HTTPError, requests.Timeout, requests.ConnectionError):
        api_is_up = False
        metric_bool('client_success', False, m_name='maas_octavia')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_octavia')
        status_err(str(e), m_name='maas_octavia')
    else:
        metric_bool('client_success', True, m_name='maas_octavia')
        dt = (end - start)
        milliseconds = (dt.microseconds + dt.seconds * 10 ** 6) / 10 ** 3

    status_ok(m_name='maas_octavia')
    metric_bool('octavia_api_local_status', api_is_up,
                m_name='maas_octavia')
    if api_is_up:
        # only want to send other metrics if api is up
        metric('octavia_api_local_response_time', 'double',
               '%.3f' % milliseconds, 'ms')
def check(args, tenant_id):
    """Check the local Ceilometer API and emit response-time metrics.

    ``tenant_id`` is accepted for signature parity with the sibling
    checks but is not used here.
    """
    CEILOMETER_ENDPOINT = 'http://{ip}:8777'.format(ip=args.ip)
    try:
        ceilometer = get_ceilometer_client(endpoint=CEILOMETER_ENDPOINT)
        is_up = True
    except exc.HTTPException as e:
        # NOTE(review): on this path the ``else`` block below is skipped,
        # so ceilometer_api_local_status is never emitted for the "down"
        # case — confirm that is intended.
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # time something arbitrary
        start = time()
        meters = ceilometer.meters.list()
        # Exceptions are only thrown when we iterate over meter
        [i.meter_id for i in meters]
        end = time()
        milliseconds = (end - start) * 1000
        status_ok()
        metric_bool('ceilometer_api_local_status', is_up)
        if is_up:
            # only want to send other metrics if api is up
            metric('ceilometer_api_local_response_time', 'double',
                   '%.3f' % milliseconds, 'ms')
def check(auth_ref, args):
    """Collect hypervisor statistics from the Nova API and emit one
    cloud_resource_* metric per entry in ``stats_mapping``."""
    keystone = get_keystone_client(auth_ref)
    tenant_id = keystone.tenant_id
    COMPUTE_ENDPOINT = (
        'http://{ip}:8774/v2/{tenant_id}'.format(ip=args.ip,
                                                 tenant_id=tenant_id)
    )
    try:
        if args.ip:
            nova = get_nova_client(bypass_url=COMPUTE_ENDPOINT)
        else:
            nova = get_nova_client()
    except Exception as e:
        status_err(str(e))
    else:
        # pull the aggregate hypervisor statistics once
        stats = nova.hypervisor_stats.statistics()
        cloud_stats = collections.defaultdict(dict)
        for metric_name, vals in stats_mapping.iteritems():
            # build the whole entry in one assignment
            cloud_stats[metric_name] = {
                'value': getattr(stats, vals['stat_name']),
                'unit': vals['unit'],
                'type': vals['type'],
            }
        status_ok()
        for metric_name, data in cloud_stats.iteritems():
            metric('cloud_resource_%s' % metric_name,
                   data['type'], data['value'], data['unit'])
def get_health_checks(client=None, keyring=None, section=None,
                      container_name=None, deploy_osp=False):
    """Emit one uint32 metric per detailed health check in *section*.

    Checks present in the cluster health report are mapped through
    STATUSES by severity; absent checks are reported as HEALTH_OK.
    """
    ceph_status = get_ceph_status(client=client,
                                  keyring=keyring,
                                  container_name=container_name,
                                  deploy_osp=deploy_osp)
    reported = ceph_status['health']['checks']
    for check_name in DETAILED_CHECKS[section]:
        if check_name in reported:
            value = STATUSES[reported[check_name]['severity']]
        else:
            # a check missing from the report means it is not firing
            value = STATUSES['HEALTH_OK']
        metric(check_name, 'uint32', value)
def check(args):
    """Probe the Designate mDNS service with a UDP DNS query.

    Emits client_success, designate_mdns_local_status and, when the
    service answers, designate_mdns_local_response_time.
    """
    try:
        # query example.org; any response with a defined rcode (0..16)
        # counts as "up" — we only care that the service answered
        start = datetime.datetime.now()
        query = dns.message.make_query("example.org", "A")
        answer = dns.query.udp(query, timeout=5, where=args.ip, port=5354)
        end = datetime.datetime.now()
        mdns_is_up = answer.rcode() <= 16
    except dns.exception.Timeout:
        mdns_is_up = False
        metric_bool('client_success', False, m_name='maas_designate')
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_designate')
        status_err(str(e), m_name='maas_designate')
    else:
        metric_bool('client_success', True, m_name='maas_designate')
        elapsed = end - start
        milliseconds = (elapsed.microseconds
                        + elapsed.seconds * 10 ** 6) / 10 ** 3
        status_ok(m_name='maas_designate')
        metric_bool('designate_mdns_local_status', mdns_is_up,
                    m_name='maas_designate')
        if mdns_is_up:
            # latency is only meaningful when the service responded
            metric('designate_mdns_local_response_time', 'double',
                   '%.3f' % milliseconds, 'ms')
def check(auth_ref, args):
    """Time a trivial Heat API call and emit legacy-style metrics."""
    keystone = get_keystone_client(auth_ref)
    tenant_id = keystone.tenant_id
    HEAT_ENDPOINT = ('http://{ip}:8004/v1/{tenant}'.format(ip=args.ip,
                                                           tenant=tenant_id))
    try:
        if args.ip:
            heat = get_heat_client(endpoint=HEAT_ENDPOINT)
        else:
            heat = get_heat_client()
        is_up = True
    except exc.HTTPException as e:
        # NOTE(review): the ``else`` block below is skipped on this path,
        # so no status metric is ever emitted when the API is down —
        # confirm that is intended.
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # time something arbitrary
        start = time.time()
        heat.build_info.build_info()
        end = time.time()
        milliseconds = (end - start) * 1000
        # NOTE(review): this uses an older three-argument metric() form
        # than the sibling checks — presumably an earlier plugin API;
        # verify against the metric() helper in use here.
        metric('heat_api', 'heat_api_local_status', str(int(is_up)))
        if is_up:
            # only want to send other metrics if api is up
            metric('heat_api', 'heat_api_local_response_time',
                   '%.3f' % milliseconds)
def check(auth_ref, args):
    """Check the local Glance API: status, latency and per-state image
    counts (one glance_<status>_images metric per IMAGE_STATUSES entry).
    """
    GLANCE_ENDPOINT = ('http://{ip}:9292/v1'.format(ip=args.ip))
    try:
        if args.ip:
            glance = get_glance_client(endpoint=GLANCE_ENDPOINT)
        else:
            glance = get_glance_client()
        is_up = True
    except exc.HTTPException:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # images.list() returns a lazy generator, so it must be consumed
        # for the request to actually happen.  The original timed an
        # un-iterated generator (measuring ~nothing) and then issued a
        # second, untimed listing for the counts; do one listing and
        # reuse it for both.
        start = time.time()
        images = list(glance.images.list(search_opts={'all_tenants': 1}))
        end = time.time()
        milliseconds = (end - start) * 1000
        # gather some metrics from the single listing
        status_count = collections.Counter([s.status for s in images])
        status_ok()
        metric_bool('glance_api_local_status', is_up)
        # only want to send other metrics if api is up
        if is_up:
            metric('glance_api_local_response_time', 'double',
                   '%.3f' % milliseconds, 'ms')
            for status in IMAGE_STATUSES:
                metric('glance_%s_images' % status, 'uint32',
                       status_count[status], 'images')
def check(args):
    """Check the Octavia API via its (local) endpoint and emit metrics.

    Emits client_success, octavia_api_local_status and, when the API
    responds, octavia_api_local_response_time.
    """
    octavia = get_openstack_client('load_balancer')
    try:
        if args.ip:
            octavia_local_endpoint = generate_local_endpoint(
                str(octavia.get_endpoint()), args.ip, args.port,
                args.protocol, '/lbaas/loadbalancers?limit=1'
            )
        else:
            # Fall back to the catalog endpoint.  The original left
            # ``resp`` unassigned on this path, which raised an uncaught
            # NameError in the ``else`` clause below.
            octavia_local_endpoint = (
                str(octavia.get_endpoint()) + '/lbaas/loadbalancers?limit=1')
        resp = octavia.session.get(octavia_local_endpoint, timeout=180)
    except (exc.HTTPError, exc.Timeout, exc.ConnectionError):
        is_up = False
        metric_bool('client_success', False, m_name='maas_octavia')
    # Any other exception presumably isn't an API error
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_octavia')
        status_err(str(e), m_name='maas_octavia')
    else:
        is_up = resp.ok
        metric_bool('client_success', True, m_name='maas_octavia')
        milliseconds = resp.elapsed.total_seconds() * 1000
        status_ok(m_name='maas_octavia')
        metric_bool('octavia_api_local_status', is_up, m_name='maas_octavia')
        if is_up:
            # only want to send other metrics if api is up
            metric('octavia_api_local_response_time', 'double',
                   '%.3f' % milliseconds, 'ms')
def check(args, tenant_id):
    """Check the local Heat API: status, latency and stack count."""
    HEAT_ENDPOINT = 'http://{ip}:8004/v1/{tenant}'.format(
        ip=args.ip, tenant=tenant_id)
    try:
        heat = get_heat_client(endpoint=HEAT_ENDPOINT)
        is_up = True
    except exc.HTTPException as e:
        is_up = False
    # Any other exception presumably isn't an API error
    except Exception as e:
        status_err(str(e))
    else:
        # build_info is a cheap call we can time
        start = time()
        heat.build_info.build_info()
        elapsed_ms = (time() - start) * 1000
        # gather the remaining metrics
        stack_count = len(list(heat.stacks.list()))
        status_ok()
        metric_bool('heat_api_local_status', is_up)
        if is_up:
            # only send the rest when the API answered
            metric('heat_api_local_response_time', 'double',
                   '%.3f' % elapsed_ms, 'ms')
            metric('heat_stack_count', 'uint32', stack_count, 'stacks')
def get_cluster_statistics(client=None, keyring=None, container_name=None):
    """Emit cluster-wide Ceph metrics: health, map epochs, OSD and PG
    counts, and capacity figures (reported in KiB)."""
    ceph_status = get_ceph_status(client=client, keyring=keyring,
                                  container_name=container_name)
    health = ceph_status['health']
    osdmap = ceph_status['osdmap']['osdmap']
    pgmap = ceph_status['pgmap']
    metrics = []

    # Overall cluster health: luminous+ exposes health.status, older
    # releases health.overall_status.
    health_key = 'status' if 'status' in health else 'overall_status'
    metrics.append({'name': 'cluster_health',
                    'type': 'uint32',
                    'value': STATUSES[health[health_key]]})

    # Epochs of the mon and osd maps
    metrics.append({'name': 'monmap_epoch', 'type': 'uint32',
                    'value': ceph_status['monmap']['epoch']})
    metrics.append({'name': 'osdmap_epoch', 'type': 'uint32',
                    'value': osdmap['epoch']})

    # OSD counts per state
    for state, count in (('total', osdmap['num_osds']),
                         ('up', osdmap['num_up_osds']),
                         ('in', osdmap['num_in_osds'])):
        metrics.append({'name': 'osds_%s' % state, 'type': 'uint32',
                        'value': count})

    # Cluster capacity and utilisation, converted from bytes to KiB
    for name, key in (('osds_kb_used', 'bytes_used'),
                      ('osds_kb_avail', 'bytes_avail'),
                      ('osds_kb', 'bytes_total')):
        metrics.append({'name': name, 'type': 'uint64',
                        'value': pgmap[key] / 1024})

    # Total PGs and the number that are active+clean
    active_clean = 0
    for state in pgmap['pgs_by_state']:
        if state['state_name'] == 'active+clean':
            active_clean = state['count']
            break
    for name, count in (('pgs_total', pgmap['num_pgs']),
                        ('pgs_active_clean', active_clean)):
        metrics.append({'name': name, 'type': 'uint32', 'value': count})

    # Submit everything gathered above
    for m in metrics:
        maas_common.metric(m['name'], m['type'], m['value'])
def check(auth_ref, args):
    """Check the local Cinder API: status, latency and per-status volume
    and snapshot counts.

    Queries /volumes/detail and /snapshots/detail with the keystone
    token and emits cinder_* metrics.
    """
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    VOLUME_ENDPOINT = ('http://{ip}:8776/v1/{tenant}'.format
                       (ip=args.ip, tenant=keystone.tenant_id))
    s = requests.Session()
    s.headers.update(
        {'Content-type': 'application/json',
         'x-auth-token': auth_token})
    try:
        vol = s.get('%s/volumes/detail' % VOLUME_ENDPOINT,
                    verify=False, timeout=5)
        # latency is taken from the first (volumes) request only
        milliseconds = vol.elapsed.total_seconds() * 1000
        snap = s.get('%s/snapshots/detail' % VOLUME_ENDPOINT,
                     verify=False, timeout=5)
        is_up = vol.ok and snap.ok
    except (exc.ConnectionError, exc.HTTPError, exc.Timeout) as e:
        # NOTE(review): ``exc`` here is presumably requests.exceptions —
        # verify, since the Session above raises requests exceptions.
        is_up = False
        metric_bool('client_success', False, m_name='maas_cinder')
    except Exception as e:
        # any other exception presumably isn't an API error
        metric_bool('client_success', False, m_name='maas_cinder')
        status_err(str(e), m_name='maas_cinder')
    else:
        metric_bool('client_success', True, m_name='maas_cinder')
        # gather some metrics
        vol_statuses = [v['status'] for v in vol.json()['volumes']]
        vol_status_count = collections.Counter(vol_statuses)
        total_vols = len(vol.json()['volumes'])
        snap_statuses = [v['status'] for v in snap.json()['snapshots']]
        snap_status_count = collections.Counter(snap_statuses)
        total_snaps = len(snap.json()['snapshots'])
        status_ok(m_name='maas_cinder')
        metric_bool('cinder_api_local_status', is_up, m_name='maas_cinder')
        # only want to send other metrics if api is up
        if is_up:
            metric('cinder_api_local_response_time', 'double',
                   '%.3f' % milliseconds, 'ms')
            metric('total_cinder_volumes', 'uint32', total_vols, 'volumes')
            for status in VOLUME_STATUSES:
                metric('cinder_%s_volumes' % status, 'uint32',
                       vol_status_count[status], 'volumes')
            metric('total_cinder_snapshots', 'uint32', total_snaps,
                   'snapshots')
            for status in VOLUME_STATUSES:
                metric('cinder_%s_snaps' % status, 'uint32',
                       snap_status_count[status], 'snapshots')
def main():
    """Collect metrics and submit them, reporting overall status first."""
    try:
        metrics = get_metrics()
    except maas_common.MaaSException as e:
        maas_common.status_err(str(e))
    else:
        maas_common.status_ok()
        # emit every gathered metric as a uint32
        for name in metrics.viewkeys():
            maas_common.metric(name, 'uint32', metrics[name]['value'])
def main():
    """Count documents matching the configured query and emit the total."""
    opts = parse_args()
    configure(opts)
    hits = get_count_for_querystring(build_query(opts))
    status_ok()
    metric('HITS', 'uint32', hits)
def main():
    """Emit error/warning counts from the most recent galera log index.

    NOTE(review): ``options`` is not defined in this function; it is
    presumably a module-level name populated before ``main()`` runs —
    confirm, since the sibling entry points call ``parse_args()``
    themselves.
    """
    configure(options)
    latest = most_recent_index()
    num_errors = get_number_of('ERROR', latest)
    num_warnings = get_number_of('WARN*', latest)
    status_ok(m_name='maas_galera')
    metric('NUMBER_OF_LOG_ERRORS', 'uint32', num_errors)
    metric('NUMBER_OF_LOG_WARNINGS', 'uint32', num_warnings)
def get_poller_fd_details():
    """Report the poller process's open-FD count and its hard FD limit."""
    proc = _get_poller_proc()
    if proc is None:
        # no poller process found — nothing to report
        return
    metric("maas_poller_fd_count", "uint32", proc.num_fds())
    # rlimit() returns (soft, hard); the hard limit is the real ceiling
    soft, hard = proc.rlimit(psutil.RLIMIT_NOFILE)
    metric("maas_poller_fd_max", "uint32", hard)
def get_mon_statistics(report=None, host=None):
    """Emit quorum membership and health for the monitor named *host*."""
    matching = [m for m in report['monmap']['mons'] if m['name'] == host]
    in_quorum = matching[0]['rank'] in report['quorum']
    maas_common.metric_bool('mon_in_quorum', in_quorum)
    # default to 0 when the host is absent from the health report
    health_status = 0
    mons = report['health']['health']['health_services'][0]['mons']
    for entry in mons:
        if entry['name'] == host:
            health_status = STATUSES[entry['health']]
            break
    maas_common.metric('mon_health', 'uint32', health_status)
def main():
    """Count ERROR and WARN* entries in the newest log index and emit them."""
    opts, _ = parse_args()
    configure(opts)
    index = most_recent_index()
    errors = get_number_of('ERROR', index)
    warnings = get_number_of('WARN*', index)
    status_ok()
    metric('NUMBER_OF_LOG_ERRORS', 'uint32', errors)
    metric('NUMBER_OF_LOG_WARNINGS', 'uint32', warnings)
def get_mon_statistics(client=None, keyring=None, host=None):
    """Emit quorum membership and health for monitor *host*, fetching the
    cluster status itself."""
    status = get_ceph_status(client=client, keyring=keyring)
    matching = [m for m in status['monmap']['mons'] if m['name'] == host]
    maas_common.metric_bool('mon_in_quorum',
                            matching[0]['rank'] in status['quorum'])
    # default to 0 when the host is absent from the health report
    health = 0
    for mon in status['health']['health']['health_services'][0]['mons']:
        if mon['name'] == host:
            health = STATUSES[mon['health']]
            break
    maas_common.metric('mon_health', 'uint32', health)
def check_for_failed_actions():
    """Scan `pcs status` output for failure keywords and emit a string
    metric describing cluster health."""
    output = check_command('pcs', 'status')
    # any of these words in the status output indicates trouble
    bad_words = re.compile(
        "Failed|Stopped|Notice|Fail|Error|Warning|Faulty",
        flags=re.IGNORECASE)
    if bad_words.search(output):
        metric('pacemaker_failed_actions', 'string',
               'Errors in pacemaker cluster')
    else:
        metric('pacemaker_failed_actions', 'string',
               'Pacemaker cluster is OK')