def recon_output(for_ring, options=None, swift_recon_path=None,
                 deploy_osp=False):
    """Execute swift-recon for a ring and return the interesting lines.

    Runs swift-recon either directly on the host, inside a docker
    container (OSP deployments), or inside an LXC container, then strips
    separator and header lines from the captured output.

    :param str for_ring: Which ring to run swift-recon on
    :param list options: Command line options with which to run swift-recon
    :param str swift_recon_path: Optional directory containing swift-recon
    :param bool deploy_osp: True when monitoring an OSP (docker) deployment
    :returns: Strings from output that are most important
    :rtype: list
    """
    # Pick the container (if any) that hosts the recon command.
    monitored_container = get_container_name(deploy_osp, for_ring)

    recon_cmd = [os.path.join(swift_recon_path or "", 'swift-recon'),
                 for_ring]
    recon_cmd.extend(options or [])
    joined_cmd = ' '.join(recon_cmd)

    if not monitored_container:
        # No container: run swift-recon directly on this host.
        assembled = '{command_options}'.format(command_options=joined_cmd)
    elif deploy_osp:
        # OSP swift services live in docker containers.
        assembled = '{container_exec_command} {command_options}'.format(
            container_exec_command='docker exec {}'.format(
                monitored_container),
            command_options=joined_cmd)
    else:
        # Otherwise attach to the LXC container and run via bash.
        assembled = '{container_exec_command} {command_options}'.format(
            container_exec_command='lxc-attach -n {} -- bash -c'.format(
                monitored_container),
            command_options='"{}"'.format(joined_cmd))

    try:
        out = subprocess.check_output(shlex.split(assembled))
    except subprocess.CalledProcessError as error:
        # If attaching/executing fails, report the error but emit no
        # metrics rather than raising red-herring alarms.
        status_err_no_exit("Attach container command failed: %s" % str(error),
                           m_name='maas_swift')
        return []
    # Drop blank lines and the '==' / '-' separator lines.
    return filter(lambda line: line and not line.startswith(('==', '-')),
                  out.split('\n'))
def check(auth_ref, args):
    """Emit MaaS metrics describing the state of each nova service.

    Builds a nova client against the version-discovered compute endpoint
    and reports one string metric per service ("Yes"/"No" up state).

    :param auth_ref: Keystone auth reference used to build the client
    :param args: Parsed CLI arguments providing protocol/hostname/host
    """
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    tenant_id = keystone.tenant_id
    nova_version = '.'.join(
        map(str, get_os_component_major_api_version('nova')))
    COMPUTE_ENDPOINT = (
        '{protocol}://{hostname}:8774/v{version}/{tenant_id}'.format(
            protocol=args.protocol,
            hostname=args.hostname,
            version=nova_version,
            tenant_id=tenant_id))

    try:
        nova = get_nova_client(auth_token=auth_token,
                               bypass_url=COMPUTE_ENDPOINT)
    # not gathering api status metric here so catch any exception
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_nova')
        for nova_service_type in NOVA_SERVICE_TYPE_LIST:
            metric('%s_status' % nova_service_type,
                   'string',
                   '%s cannot reach API' % nova_service_type,
                   m_name='maas_nova')
        status_err_no_exit(str(e), m_name='maas_nova')
        return
    metric_bool('client_success', True, m_name='maas_nova')

    # gather nova service states, optionally scoped to a single host
    service_kwargs = {'host': args.host} if args.host else {}
    services = nova.services.list(**service_kwargs)

    if not services:
        status_err("No host(s) found in the service list",
                   m_name='maas_nova')

    # return all the things
    status_ok(m_name='maas_nova')
    for service in services:
        is_up = "Yes"
        status = service.status.lower()
        if status == 'enabled':
            if service.state.lower() == 'down':
                is_up = "No"
        elif status == 'disabled':
            # Services auto-disabled by nova count as down.
            reason = service.disabled_reason
            if reason and 'auto' in reason.lower():
                is_up = "No"
        if args.host:
            metric_name = '%s_status' % service.binary
        else:
            metric_name = '%s_on_host_%s_status' % (service.binary,
                                                    service.host)
        metric(metric_name, 'string', is_up, m_name='maas_nova')
def check(args):
    """Report neutron agent liveness metrics to MaaS.

    Builds a neutron client against the deployment's network endpoint,
    gathers the agent list (optionally scoped to ``args.host`` or
    ``args.fqdn``) and emits one string metric per agent describing
    whether it is up.

    :param args: Parsed CLI arguments providing ``protocol``,
        ``hostname`` and the optional ``host``/``fqdn`` filters.
    """
    NETWORK_ENDPOINT = '{protocol}://{hostname}:9696'.format(
        protocol=args.protocol, hostname=args.hostname)
    try:
        neutron = get_neutron_client(endpoint_url=NETWORK_ENDPOINT)
    # not gathering api status metric here so catch any exception
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_neutron')
        for neutron_agent_type in NEUTRON_AGENT_TYPE_LIST:
            metric('%s_status' % neutron_agent_type,
                   'string',
                   '%s cannot reach API' % neutron_agent_type,
                   m_name='maas_neutron')
        status_err_no_exit(str(e), m_name='maas_neutron')
        return
    else:
        metric_bool('client_success', True, m_name='maas_neutron')

    # gather neutron service states
    if args.host:
        agents = neutron.list_agents(host=args.host)['agents']
    elif args.fqdn:
        agents = neutron.list_agents(host=args.fqdn)['agents']
    else:
        agents = neutron.list_agents()['agents']

    if len(agents) == 0:
        metric_bool('agents_found', False, m_name='maas_neutron')
        status_err("No host(s) found in the agents list",
                   m_name='maas_neutron')
    else:
        metric_bool('agents_found', True, m_name='maas_neutron')

    # return all the things
    status_ok(m_name='maas_neutron')
    for agent in agents:
        agent_is_up = "Yes"
        # An agent that is administratively up but not reporting alive is
        # considered down.
        if agent['admin_state_up'] and not agent['alive']:
            agent_is_up = "No"
        if args.host:
            name = '%s_status' % agent['binary']
        elif args.fqdn:
            # Fix: the original used '%z_status', an invalid printf-style
            # format specifier that raises ValueError at runtime.
            name = '%s_status' % agent['binary']
        else:
            name = '%s_%s_on_host_%s' % (agent['binary'],
                                         agent['id'],
                                         agent['host'])
        metric(name, 'string', agent_is_up, m_name='maas_neutron')
def check_process_statuses(container_or_host_name, container=None):
    """Emit boolean metrics for the expected OVS/neutron agent processes.

    Inspects the process table (scoped to the container's init PID when a
    container is supplied) and reports, for each expected process name,
    whether a matching command line was found.

    :param container_or_host_name: Name of the container or host being
        checked (used only by the caller for context).
    :param container: Optional lxc container object; when given, only
        processes under its ``init_pid`` are examined.
    """
    process_names = ['ovsdb-server', 'ovs-vswitchd',
                     'neutron-openvswitch-agent']
    if container is None:
        pid = None
    else:
        pid = container.init_pid
    # Get the processes within the neutron agent container (or a
    # compute host).
    procs = get_processes(parent_pid=pid)

    # Make a list of command lines from each PID. There's a
    # chance that one or more PIDs may have exited already and
    # this causes a NoSuchProcess exception.
    cmdlines = []
    for proc in procs:
        # Fix: initialise before the try block so the except handler
        # cannot hit a NameError (or report a stale value from a prior
        # iteration) when proc.cmdline() raises before assignment.
        cmdline_check_value = None
        try:
            # In psutil 1.2.1, cmdline is an attribute, but in
            # 5.x, it's now a callable method.
            cmdline_check = getattr(proc, "cmdline", None)
            if callable(cmdline_check):
                cmdline_check_value = proc.cmdline()
            else:
                cmdline_check_value = proc.cmdline
            cmdlines.append(map(os.path.basename, cmdline_check_value))
        except Exception as e:
            status_err_no_exit('Error while retrieving process %s, ERROR: %s'
                               % (cmdline_check_value, str(e)),
                               m_name='maas_neutron')

    # Loop through the process names provided on the command line to
    # see if ovsdb-server, ovs-vswitchd, and neutron-openvswitch
    # exist on the system or in a container.
    # suppress some character which throw MaaS off
    # ovsdb-server and ovs-vswitchd are not directly in the command
    # line parsing so we use condition
    # `process_name in x or process_name in x[0]` pattern
    pattern = re.compile(r'[^-\w]+')
    for process_name in process_names:
        matches = [x for x in cmdlines
                   if process_name in x or (
                       len(x) > 0 and process_name in x[0])]
        metric_bool('%s_process_status' % pattern.sub('', process_name),
                    len(matches) > 0)
def recon_output(for_ring, options=None, swift_recon_path=None):
    """Run swift-recon inside the swift proxy container and filter output.

    Locates the first running swift proxy LXC container, attaches to it
    to execute swift-recon for the requested ring, and returns only the
    lines carrying metric data (separator/header lines are dropped).

    :param str for_ring: Which ring to run swift-recon on
    :param list options: Command line options with which to run swift-recon
    :param str swift_recon_path: Optional directory containing swift-recon
    :returns: Strings from output that are most important
    :rtype: list
    """
    # identify the container we will use for monitoring
    list_cmd = shlex.split('lxc-ls -1 --running ".*(swift_proxy|swift)"')
    try:
        running = subprocess.check_output(list_cmd)
        proxy_container = running.splitlines()[0]
    except (IndexError, subprocess.CalledProcessError):
        status_err('no running swift proxy containers found',
                   m_name='maas_swift')

    recon_cmd = [os.path.join(swift_recon_path or "", 'swift-recon'),
                 for_ring]
    recon_cmd.extend(options or [])
    attach_cmd = shlex.split('lxc-attach -n %s -- bash -c "%s"'
                             % (proxy_container, ' '.join(recon_cmd)))
    try:
        out = subprocess.check_output(attach_cmd)
    except subprocess.CalledProcessError as error:
        # in case attach command fails we return no metrics rather than
        # letting it fail to give out red herring alarms
        status_err_no_exit("Attach container command failed: %s" % str(error),
                           m_name='maas_swift')
        return []
    return filter(lambda line: line and not line.startswith(('==', '-')),
                  out.split('\n'))
def _get_node_metrics(session, metrics, protocol, host, port, name):
    """Populate ``metrics`` with node-level rabbitmq data.

    Fetches the management API node listing, verifies (when CLUSTERED)
    that the cluster is complete, partition-free and fully linked, then
    copies the NODES_METRICS values for this node into ``metrics``.

    :param session: Requests session used to talk to the management API
    :param dict metrics: Mapping updated in place with node metrics
    :param str protocol: URL scheme for the management API
    :param str host: Management API host
    :param port: Management API port
    :param str name: Optional node name override; defaults to this host
    """
    response = _get_rabbit_json(session, NODES_URL % (protocol, host, port))
    # Either use the option provided by the commandline flag or the current
    # hostname
    name = '@' + (name or hostname())
    is_cluster_member = False
    # Ensure this node is a member of the cluster
    nodes_matching_name = [n for n in response if n['name'].endswith(name)]
    is_cluster_member = any(nodes_matching_name)
    if CLUSTERED:
        if len(response) < CLUSTER_SIZE:
            status_err_no_exit('cluster too small', m_name='maas_rabbitmq')
        if not is_cluster_member:
            status_err_no_exit('{0} not a member of the cluster'.format(name),
                               m_name='maas_rabbitmq')
        # Fix: the default must be a list; len(0) raises TypeError when a
        # node entry lacks the 'partitions' key.
        if sum([len(n.get('partitions', [])) for n in response]):
            status_err_no_exit('At least one partition found in the rabbit '
                               'cluster', m_name='maas_rabbitmq')
        if any([len(n.get('cluster_links', [])) != CLUSTER_SIZE - 1
                for n in response]):
            status_err_no_exit('At least one rabbit node is missing a cluster'
                               ' link', m_name='maas_rabbitmq')
    for k, v in NODES_METRICS.items():
        metrics[k] = {'value': nodes_matching_name[0][k], 'unit': v}
def _get_node_metrics(session, metrics, protocol, host, port, name):
    """Populate ``metrics`` with node-level rabbitmq data.

    Fetches the management API node listing, runs the cluster health
    checks when CLUSTERED is set (size, membership, partitions, cluster
    links), and copies this node's NODES_METRICS values into ``metrics``.

    :param session: Requests session used to talk to the management API
    :param dict metrics: Mapping updated in place with node metrics
    :param str protocol: URL scheme for the management API
    :param str host: Management API host
    :param port: Management API port
    :param str name: Optional node name override; defaults to this host
    """
    response = _get_rabbit_json(session, NODES_URL % (protocol, host, port))
    # Either use the option provided by the commandline flag or the current
    # hostname
    name = '@' + (name or hostname())
    is_cluster_member = False
    # Ensure this node is a member of the cluster
    nodes_matching_name = [n for n in response if n['name'].endswith(name)]
    is_cluster_member = any(nodes_matching_name)
    if CLUSTERED:
        if len(response) < CLUSTER_SIZE:
            status_err_no_exit('cluster too small', m_name='maas_rabbitmq')
        if not is_cluster_member:
            status_err_no_exit('{0} not a member of the cluster'.format(name),
                               m_name='maas_rabbitmq')
        # Fix: the default must be a list; len(0) raises TypeError when a
        # node entry lacks the 'partitions' key.
        if sum([len(n.get('partitions', [])) for n in response]):
            status_err_no_exit(
                'At least one partition found in the rabbit '
                'cluster', m_name='maas_rabbitmq')
        if any([
            len(n.get('cluster_links', [])) != CLUSTER_SIZE - 1
            for n in response
        ]):
            status_err_no_exit(
                'At least one rabbit node is missing a cluster'
                ' link', m_name='maas_rabbitmq')
    for k, v in NODES_METRICS.items():
        metrics[k] = {'value': nodes_matching_name[0][k], 'unit': v}
def check(auth_ref, args):
    """Emit MaaS metrics for the up/down state of each nova service.

    Targets the fixed v2.1 compute endpoint and reports one string
    metric per service.

    :param auth_ref: Keystone auth reference used to build the client
    :param args: Parsed CLI arguments providing protocol/hostname/host
    """
    keystone = get_keystone_client(auth_ref)
    auth_token = keystone.auth_token
    tenant_id = keystone.tenant_id
    COMPUTE_ENDPOINT = (
        '{protocol}://{hostname}:8774/v2.1/{tenant_id}'
        .format(protocol=args.protocol,
                hostname=args.hostname,
                tenant_id=tenant_id)
    )

    try:
        nova = get_nova_client(auth_token=auth_token,
                               bypass_url=COMPUTE_ENDPOINT)
    # not gathering api status metric here so catch any exception
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_nova')
        for nova_service_type in NOVA_SERVICE_TYPE_LIST:
            metric('%s_status' % nova_service_type,
                   'string',
                   '%s cannot reach API' % nova_service_type,
                   m_name='maas_nova')
        status_err_no_exit(str(e), m_name='maas_nova')
        return
    metric_bool('client_success', True, m_name='maas_nova')

    # gather nova service states, optionally scoped to a single host
    service_kwargs = {'host': args.host} if args.host else {}
    services = nova.services.list(**service_kwargs)

    if not services:
        status_err("No host(s) found in the service list",
                   m_name='maas_nova')

    # return all the things
    status_ok(m_name='maas_nova')
    for service in services:
        # Only an enabled-but-down service is reported as not up here.
        down = service.status == 'enabled' and service.state == 'down'
        is_up = "No" if down else "Yes"
        if args.host:
            metric_name = '%s_status' % service.binary
        else:
            metric_name = '%s_on_host_%s_status' % (service.binary,
                                                    service.host)
        metric(metric_name, 'string', is_up, m_name='maas_nova')
def check(args):
    """Report the up/down status of each nova service as MaaS metrics.

    Uses the generic openstack compute client; services may be filtered
    to a single host via ``args.host``.

    :param args: Parsed CLI arguments providing the optional host filter
    """
    nova = get_openstack_client('compute')

    try:
        if args.host:
            services = [svc for svc in nova.services()
                        if svc.host == args.host]
        else:
            services = list(nova.services())
    # not gathering api status metric here so catch any exception
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_nova')
        for nova_service_type in NOVA_SERVICE_TYPE_LIST:
            metric('%s_status' % nova_service_type,
                   'string',
                   '%s cannot reach API' % nova_service_type,
                   m_name='maas_nova')
        status_err_no_exit(str(e), m_name='maas_nova')
        return
    metric_bool('client_success', True, m_name='maas_nova')

    if not services:
        status_err("No host(s) found in the service list",
                   m_name='maas_nova')

    # return all the things
    status_ok(m_name='maas_nova')
    for service in services:
        is_up = "Yes"
        status = service.status.lower()
        if status == 'enabled':
            if service.state.lower() == 'down':
                is_up = "No"
        elif status == 'disabled':
            try:
                # Services auto-disabled by nova count as down; the
                # attribute may be absent on some service records.
                if service.disabled_reason:
                    if 'auto' in service.disabled_reason.lower():
                        is_up = "No"
            except AttributeError:
                pass
        if args.host:
            metric_name = '%s_status' % service.binary
        else:
            metric_name = '%s_on_host_%s_status' % (service.binary,
                                                    service.host)
        metric(metric_name, 'string', is_up, m_name='maas_nova')
def check(args):
    """Emit a MaaS status metric for every neutron agent.

    Uses the generic openstack network client; agents may be scoped via
    ``args.host`` or ``args.fqdn``.

    :param args: Parsed CLI arguments with the optional host/fqdn filters
    """
    neutron = get_openstack_client('network')

    try:
        if args.host:
            agents = list(neutron.agents(host=args.host))
        elif args.fqdn:
            agents = list(neutron.agents(host=args.fqdn))
        else:
            agents = list(neutron.agents())
    # An API status metric is not gathered so catch any exception
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_neutron')
        for neutron_agent_type in NEUTRON_AGENT_TYPE_LIST:
            metric('%s_status' % neutron_agent_type,
                   'string',
                   '%s cannot reach API' % neutron_agent_type,
                   m_name='maas_neutron')
        status_err_no_exit(str(e), m_name='maas_neutron')
        return
    metric_bool('client_success', True, m_name='maas_neutron')

    if not agents:
        status_err("No host(s) found in the agents list",
                   m_name='maas_neutron')

    # Return all the things
    status_ok(m_name='maas_neutron')
    for agent in agents:
        # Admin-enabled but not reporting alive means the agent is down.
        up = "No" if (agent['is_admin_state_up']
                      and not agent['is_alive']) else "Yes"
        if args.host or args.fqdn:
            metric_name = '%s_status' % agent['binary']
        else:
            metric_name = '%s_%s_on_host_%s' % (agent['binary'],
                                                agent['id'],
                                                agent['host'])
        metric(metric_name, 'string', up, m_name='maas_neutron')
def check(args):
    """Check the neutron OVS agent and its supporting processes.

    Reports the API-visible liveness of the neutron-openvswitch-agent,
    then verifies the expected OVS processes are running either inside
    the neutron LXC container(s) or directly on the host.

    :param args: Parsed CLI arguments with ``host``/``fqdn`` filters.
    """
    neutron = get_openstack_client('network')
    try:
        # Gather neutron agent states
        if args.host:
            agents = [i for i in neutron.agents(host=args.host)]
        elif args.fqdn:
            agents = [i for i in neutron.agents(host=args.fqdn)]
        else:
            agents = [i for i in neutron.agents()]
    # An API status metric is not gathered so catch any exception
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_neutron')
        metric('%s_status' % "neutron-openvswitch-agent",
               'string',
               '%s cannot reach API' % "neutron-openvswitch-agent",
               m_name='maas_neutron')
        status_err_no_exit(str(e), m_name='maas_neutron')
        return
    else:
        metric_bool('client_success', True, m_name='maas_neutron')

    try:
        ovs_agent = next(a for a in agents if 'openvswitch' in a['binary'])
    except StopIteration:
        status_err("No host(s) found in the agents list",
                   m_name='maas_neutron')
    else:
        # Return all the things
        status_ok(m_name='maas_neutron')
        agent_is_up = "Yes"
        if ovs_agent['is_admin_state_up'] and not ovs_agent['is_alive']:
            agent_is_up = "No"
        if args.host:
            name = '%s_status' % ovs_agent['binary']
        elif args.fqdn:
            name = '%s_status' % ovs_agent['binary']
        else:
            name = '%s_%s_on_host_%s' % (ovs_agent['binary'],
                                         ovs_agent['id'],
                                         ovs_agent['host'])
        metric(name, 'string', agent_is_up, m_name='maas_neutron')

    if on_lxc_container:
        all_containers = lxc.list_containers()
        neutron_containers_list = []
        neutron_agent_containers_list = []

        # NOTE(npawelek): The neutron container architecture was
        # refactored in recent versions removing all neutron containers
        # with the exception of one, or even using baremetal directly.
        # Since logic is looking for the presence of LXC, we do not need
        # to account for baremetal here.
        for container in all_containers:
            if 'neutron_agents' in container:
                neutron_agent_containers_list.append(container)
            if 'neutron' in container:
                neutron_containers_list.append(container)

        if len(neutron_containers_list) == 1 and \
                'neutron_server' in neutron_containers_list[0]:
            valid_containers = neutron_containers_list
        elif len(neutron_agent_containers_list) > 0:
            valid_containers = neutron_agent_containers_list
        else:
            # Fix: was `valid_containers = 0`, which made the
            # len() call below raise TypeError instead of reporting
            # the missing-container error.
            valid_containers = []

        if len(valid_containers) == 0:
            status_err('no neutron agent or server containers found',
                       m_name='maas_neutron')
            return

        for container in valid_containers:
            # Get the neutron_agent_container's init PID.
            try:
                c = lxc.Container(container)
                # If the container wasn't found, exit now.
                if c.init_pid == -1:
                    metric_bool('container_success', False,
                                m_name='maas_neutron')
                    status_err('Could not find PID for container {}'.format(
                        container), m_name='maas_neutron')
            except (Exception, SystemError) as e:
                metric_bool('container_success', False,
                            m_name='maas_neutron')
                status_err(
                    'Container lookup failed on "{}". ERROR: "{}"'.format(
                        container, e), m_name='maas_neutron')
            else:
                metric_bool('container_success', True,
                            m_name='maas_neutron')
                # c is the lxc container instance of this
                # neutron_agent_container
                check_process_statuses(container, c)
    else:
        ovs_agent_host = socket.gethostname()
        check_process_statuses(ovs_agent_host)
def check(args):
    """Check the neutron OVS agent via the API and the process table.

    First reports whether the neutron-openvswitch-agent is alive
    according to the network API, then confirms the expected OVS
    processes exist inside the neutron LXC container(s) or on the host.

    :param args: Parsed CLI arguments with ``host``/``fqdn`` filters.
    """
    neutron = get_openstack_client('network')
    try:
        # Gather neutron agent states
        if args.host:
            agents = [i for i in neutron.agents(host=args.host)]
        elif args.fqdn:
            agents = [i for i in neutron.agents(host=args.fqdn)]
        else:
            agents = [i for i in neutron.agents()]
    # An API status metric is not gathered so catch any exception
    except Exception as e:
        metric_bool('client_success', False, m_name='maas_neutron')
        metric('%s_status' % "neutron-openvswitch-agent",
               'string',
               '%s cannot reach API' % "neutron-openvswitch-agent",
               m_name='maas_neutron')
        status_err_no_exit(str(e), m_name='maas_neutron')
        return
    else:
        metric_bool('client_success', True, m_name='maas_neutron')

    try:
        ovs_agent = next(
            a for a in agents if 'openvswitch' in a['binary']
        )
    except StopIteration:
        status_err("No host(s) found in the agents list",
                   m_name='maas_neutron')
    else:
        # Return all the things
        status_ok(m_name='maas_neutron')
        agent_is_up = "Yes"
        if ovs_agent['is_admin_state_up'] and not ovs_agent['is_alive']:
            agent_is_up = "No"
        if args.host:
            name = '%s_status' % ovs_agent['binary']
        elif args.fqdn:
            name = '%s_status' % ovs_agent['binary']
        else:
            name = '%s_%s_on_host_%s' % (ovs_agent['binary'],
                                         ovs_agent['id'],
                                         ovs_agent['host'])
        metric(name, 'string', agent_is_up, m_name='maas_neutron')

    if on_lxc_container:
        all_containers = lxc.list_containers()
        neutron_containers_list = []
        neutron_agent_containers_list = []

        # NOTE(npawelek): The neutron container architecture was
        # refactored in recent versions removing all neutron containers
        # with the exception of one, or even using baremetal directly.
        # Since logic is looking for the presence of LXC, we do not need
        # to account for baremetal here.
        for container in all_containers:
            if 'neutron_agents' in container:
                neutron_agent_containers_list.append(container)
            if 'neutron' in container:
                neutron_containers_list.append(container)

        if len(neutron_containers_list) == 1 and \
                'neutron_server' in neutron_containers_list[0]:
            valid_containers = neutron_containers_list
        elif len(neutron_agent_containers_list) > 0:
            valid_containers = neutron_agent_containers_list
        else:
            # Fix: was `valid_containers = 0`; len(0) raises TypeError
            # below instead of producing the intended status_err.
            valid_containers = []

        if len(valid_containers) == 0:
            status_err('no neutron agent or server containers found',
                       m_name='maas_neutron')
            return

        for container in valid_containers:
            # Get the neutron_agent_container's init PID.
            try:
                c = lxc.Container(container)
                # If the container wasn't found, exit now.
                if c.init_pid == -1:
                    metric_bool('container_success', False,
                                m_name='maas_neutron')
                    status_err(
                        'Could not find PID for container {}'.format(
                            container
                        ), m_name='maas_neutron'
                    )
            except (Exception, SystemError) as e:
                metric_bool('container_success', False,
                            m_name='maas_neutron')
                status_err(
                    'Container lookup failed on "{}". ERROR: "{}"'
                    .format(
                        container, e
                    ), m_name='maas_neutron'
                )
            else:
                metric_bool('container_success', True,
                            m_name='maas_neutron')
                # c is the lxc container instance of this
                # neutron_agent_container
                check_process_statuses(container, c)
    else:
        ovs_agent_host = socket.gethostname()
        check_process_statuses(ovs_agent_host)
def recon_output(for_ring, options=None, swift_recon_path=None,
                 deploy_osp=False):
    """Run swift-recon and filter out extraneous printed lines.

    Example (abridged)::

        >>> recon_output('account', '-r')
        ['[2014-11-21 00:25:16] Checking on replication',
         '[replication_failure] low: 0, high: 0, avg: 0.0, ...',
         ...]

    :param str for_ring: Which ring to run swift-recon on
    :param list options: Command line options with which to run swift-recon
    :param str swift_recon_path: Optional directory containing swift-recon
    :param bool deploy_osp: True for OSP (docker-based) deployments
    :returns: Strings from output that are most important
    :rtype: list
    """
    # identify the container we will use for monitoring
    container = get_container_name(deploy_osp, for_ring)
    # Build the swift-recon command line for the requested ring.
    command = [os.path.join(swift_recon_path or "", 'swift-recon'), for_ring]
    command.extend(options or [])
    command_options = ' '.join(command)
    if not container:
        # No container found: run swift-recon directly on this host.
        _full_command = '{command_options}'.format(
            command_options=command_options
        )
    elif deploy_osp:
        # OSP deployments run swift inside docker containers.
        _full_command = '{container_exec_command} {command_options}'.format(
            container_exec_command='docker exec {}'.format(
                container
            ),
            command_options=command_options
        )
    else:
        # Otherwise attach to the LXC container and run via bash.
        _full_command = '{container_exec_command} {command_options}'.format(
            container_exec_command='lxc-attach -n {} -- bash -c'.format(
                container
            ),
            command_options='"{}"'.format(command_options)
        )
    full_command = shlex.split(_full_command)
    try:
        out = subprocess.check_output(full_command)
    except subprocess.CalledProcessError as error:
        # in case attach command fails we return no metrics rather than
        # letting it fail to give out red herring alarms
        status_err_no_exit("Attach container command failed: %s" % str(error),
                           m_name='maas_swift')
        return []
    # Drop blank lines and the '==' / '-' separator lines from the output.
    return filter(lambda s: s and not s.startswith(('==', '-')),
                  out.split('\n'))