Beispiel #1
0
    def all_healthy(self):
        """
        Poll the pacemaker resource checks until all of them pass.

        Every attempt runs all the resource health checks. When they are
        not all successful the method waits one second, re-reads the pcs
        resources table into ``self.pcs_df`` and tries again.

        :return: True as soon as every resource check passes; when all the
            attempts are exhausted ``tobiko.fail`` is called instead
        """
        max_attempts = 360
        # attempts are numbered from 1 so the retry log reads "1 of 360"
        for attempt_number in range(1, max_attempts + 1):

            try:
                # a list (not a generator) is used on purpose: every check
                # is executed even when an earlier one already failed
                if all([
                        self.rabbitmq_resource_healthy(),
                        self.galera_resource_healthy(),
                        self.redis_resource_healthy(),
                        self.vips_resource_healthy(),
                        self.ha_proxy_cinder_healthy(),
                        self.ovn_resource_healthy()
                ]):
                    LOG.info("pcs status checks: all resources are"
                             " in healthy state")
                    return True
                else:

                    LOG.info("pcs status check: not all resources are "
                             "in healthy "
                             "state")
                    raise PcsResourceException()
            except PcsResourceException:
                # reread pcs status
                LOG.info('Retrying pacemaker resource checks attempt '
                         '{} of {}'.format(attempt_number, max_attempts))
                time.sleep(1)
                self.pcs_df = get_pcs_resources_table()
        # exhausted all retries
        tobiko.fail('pcs cluster is not in a healthy state')
Beispiel #2
0
def find_msg_in_file(node, logfile, message, rotated=False):
    """Search for the message in the logfile

    The file is read with ``grep`` on the node, using sudo. The message is
    shell-quoted before being passed to ``grep`` so that white spaces or
    shell meta-characters in it cannot alter the executed command.

    :param node: Node the container is running on
    :type node: class: tobiko.openstack.topology.OpenStackTopologyNode
    :param logfile: Path of the logfile
    :type logfile: string
    :param message: Message to search for
    :type message: string
    :param rotated: Variable to flag that log file has to be rotated
        so the name will be ended by '.1'
    :type rotated: bool
    :return: True if the grep output (stripped) is equal to the message,
        False otherwise
    :rtype: bool
    """
    # local import: only needed here to quote the grep pattern
    import shlex

    suffix = ".1" if rotated else ""
    LOG.debug(f'Searching for {message} in {logfile}{suffix} on {node.name}')
    # the log file path is intentionally left unquoted, as before
    result = sh.execute(
        f'grep -h {shlex.quote(str(message))} {logfile}{suffix}',
        ssh_client=node.ssh_client,
        expect_exit_status=None, sudo=True)
    if result.stderr:
        # anything on stderr is reported as a read failure
        tobiko.fail(f'Failed to read {logfile} on {node.name}:\n'
                    f'{result.stderr}')
    return result.stdout.strip() == message
Beispiel #3
0
def get_overcloud_container(container_name=None,
                            container_host=None,
                            partial_container_name=None):
    """Get a container object by name, optionally on a specified host.

    The lookup is done on the table returned by
    ``list_containers_objects_df()``:

    - when ``partial_container_name`` is given together with
      ``container_host`` (or without any ``container_name``), containers
      whose name contains it are selected;
    - otherwise containers whose name is exactly ``container_name`` are
      selected;
    - when ``container_host`` is given the selection is restricted to it.

    :param container_name: exact container name
    :param container_host: name of the host the container has to be on
    :param partial_container_name: sub-string of the container name
    :return: the first matching container object; ``tobiko.fail`` is
        called when nothing matches
    """
    containers_df = list_containers_objects_df()

    # A partial name used to be honoured only together with a host; without
    # a host the lookup silently searched for a container named "None".
    use_partial_name = bool(partial_container_name) and (
        bool(container_host) or container_name is None)

    if use_partial_name:
        selected_df = containers_df[
            containers_df['container_name'].str.contains(
                partial_container_name)]
    else:
        selected_df = containers_df.query(
            'container_name == "{container_name}"'.format(
                container_name=container_name))

    if container_host:
        selected_df = selected_df.query(
            'container_host == "{container_host}"'.format(
                container_host=container_host))

    container_objects = selected_df['container_object']
    if not container_objects.empty:
        return container_objects.values[0]

    # report the name that was actually searched for
    searched_name = (partial_container_name if use_partial_name
                     else container_name)
    tobiko.fail('container {} not found!'.format(searched_name))
0
 def assert_not_transmitted(self):
     """Fail when ``self.transmitted`` is truthy, pass silently otherwise."""
     if not self.transmitted:
         return
     # the message template is filled in by tobiko.fail with the keywords
     message = ("{transmitted!r} package(s) has been transmitted to "
                "{destination!r}")
     tobiko.fail(message,
                 transmitted=self.transmitted,
                 destination=self.destination)
Beispiel #5
0
 def assert_not_replied(self):
     """Fail when ``self.received`` is truthy, pass silently otherwise."""
     if not self.received:
         return
     # the message template is filled in by tobiko.fail with the keywords
     message = ("{received!r} reply package(s) has been received from "
                "{destination!r}")
     tobiko.fail(message,
                 received=self.received,
                 destination=self.destination)
Beispiel #6
0
 def test_public_ips(self):
     """Ping every topology node public IP and check no IP is shared.

     Fails when two different node objects have the same ``public_ip``.
     """
     nodes_by_ip = {}
     for node in self.topology.nodes:
         ping.ping(node.public_ip).assert_replied()
         # the first node seen with an IP owns it
         owner = nodes_by_ip.setdefault(node.public_ip, node)
         if owner is node:
             continue
         tobiko.fail(f"Nodes {node.name} and {owner.name} have the "
                     f"same IP: {node.public_ip}")
Beispiel #7
0
 def test_hostnames(self):
     """Check node host names match node names and are not duplicated.

     For every topology node the host name is obtained over SSH; it has to
     start with the node name and it must not be the host name of another
     node.
     """
     nodes_by_hostname = {}
     for node in self.topology.nodes:
         hostname = sh.get_hostname(ssh_client=node.ssh_client)
         self.assertTrue(hostname.startswith(node.name))
         # the first node seen with a host name owns it
         owner = nodes_by_hostname.setdefault(hostname, node)
         if owner is node:
             continue
         tobiko.fail(f"Nodes {node.name} and {owner.name} have the "
                     f"same hostname: {hostname}")
Beispiel #8
0
def run_container_config_validations():
    """Validate configuration parameters inside overcloud containers.

    The expected ML2 parameters depend on ``neutron.has_ovn()``. Every
    parameter is read with ``crudini --get`` executed inside the container
    on each node of the node group; the check fails when the expected value
    is not contained in the obtained one.
    """

    # TODO add here any generic configuration validation
    if neutron.has_ovn():
        ml2_param_validations = [{'section': 'ml2',
                                  'param': 'mechanism_drivers',
                                  'expected_value': 'ovn'},
                                 {'section': 'ml2',
                                  'param': 'type_drivers',
                                  'expected_value': 'geneve'},
                                 {'section': 'ovn',
                                  'param': 'ovn_l3_mode',
                                  'expected_value': 'True'},
                                 {'section': 'ovn',
                                  'param': 'ovn_metadata_enabled',
                                  'expected_value': 'True'}]
    else:
        ml2_param_validations = [{'section': 'ml2',
                                  'param': 'mechanism_drivers',
                                  'expected_value': 'openvswitch'}]

    validations = [{'node_group': 'controller',
                    'container_name': 'neutron_api',
                    'config_file': '/etc/neutron/plugins/ml2/ml2_conf.ini',
                    'param_validations': ml2_param_validations}]

    runtime_name = get_container_runtime_name()
    for validation in validations:
        # command prefix shared by all the parameters of this validation
        crudini_get = (f"{runtime_name} exec -uroot "
                       f"{validation['container_name']} crudini "
                       f"--get {validation['config_file']} ")
        nodes = topology.list_openstack_nodes(
            group=validation['node_group'])
        for node in nodes:
            for expected in validation['param_validations']:
                command = (crudini_get +
                           f"{expected['section']} {expected['param']}")
                execution = sh.execute(command,
                                       ssh_client=node.ssh_client,
                                       sudo=True)
                obtained_value = execution.stdout.strip()
                # substring match: the expected value only has to be
                # contained in the obtained one
                if expected['expected_value'] in obtained_value:
                    continue
                tobiko.fail(f"Expected {expected['param']} value: "
                            f"{expected['expected_value']}\n"
                            f"Obtained {expected['param']} value: "
                            f"{obtained_value}")
        LOG.info("Configuration verified:\n"
                 f"node group: {validation['node_group']}\n"
                 f"container: {validation['container_name']}\n"
                 f"config file: {validation['config_file']}")
Beispiel #9
0
 def test_network_namespaces(self):
     """Check no node lists the same network namespace more than once.

     For every topology node the network namespaces are listed over SSH
     together with their IP addresses; the test fails when a namespace
     name appears twice in the list of a node.
     """
     for node in self.topology.nodes:
         ips_by_namespace = {}
         for namespace in ip.list_network_namespaces(
                 ssh_client=node.ssh_client):
             current_ips = ip.list_ip_addresses(
                 ssh_client=node.ssh_client,
                 network_namespace=namespace)
             # identity check: a different object was already registered
             # for this namespace name
             first_ips = ips_by_namespace.setdefault(namespace, current_ips)
             if first_ips is current_ips:
                 continue
             tobiko.fail(f"Duplicate network namespace {namespace} in "
                         f"node {node.name}: {first_ips}, {current_ips}")
Beispiel #10
0
 def test_network_namespaces(self):
     """Check no node lists the same network namespace more than once.

     For every topology node the network namespaces are listed over SSH
     together with their IP addresses; the test fails when a namespace
     name appears twice in the list of a node.
     """
     for node in self.topology.nodes:
         ips_by_namespace = {}
         for namespace in ip.list_network_namespaces(
                 ssh_client=node.ssh_client):
             current_ips = ip.list_ip_addresses(
                 ssh_client=node.ssh_client,
                 network_namespace=namespace)
             # identity check: a different object was already registered
             # for this namespace name
             first_ips = ips_by_namespace.setdefault(namespace, current_ips)
             if first_ips is current_ips:
                 continue
             tobiko.fail("Duplicate network namespace {!r} in node "
                         "{!r}: {!r}, {!r}", namespace, node.name,
                         first_ips, current_ips)
Beispiel #11
0
 def assert_vlan_is_unreachable(self,
                                ip_version: int = None,
                                timeout: tobiko.Seconds = None,
                                ssh_client: ssh.SSHClientType = None):
     """Assert the fixed IPs of the VLAN port are not reachable.

     :param ip_version: passed to ``list_vlan_fixed_ips`` to select the IPs
     :param timeout: passed to ``ping.assert_unreachable_hosts``
     :param ssh_client: client the ping is executed from; when None
         ``self.vlan_ssh_proxy_client`` is used
     """
     fixed_ips = self.list_vlan_fixed_ips(ip_version=ip_version)
     if not fixed_ips:
         # without any IP there is nothing to ping: this is a failure, and
         # the message now says "no IP" instead of the misleading "any IP"
         tobiko.fail(f'Server {self.stack_name} has no IP on VLAN port')

     if ssh_client is None:
         ssh_client = self.vlan_ssh_proxy_client
     ping.assert_unreachable_hosts(fixed_ips,
                                   ssh_client=ssh_client,
                                   timeout=timeout)
Beispiel #12
0
def stop_all_instances():
    """Shut off every server listed by nova and fail if one is not SHUTOFF.

    After each shut off request the function sleeps 3 seconds, logs the
    server name, status and hypervisor host name and fails when the status
    is not 'SHUTOFF'.
    """
    for server in nova.list_servers():
        stopped_server = nova.shutoff_server(server)
        time.sleep(3)
        hypervisor = stopped_server._info[  # pylint: disable=W0212
            'OS-EXT-SRV-ATTR:hypervisor_hostname']
        server_info = 'instance {nova_instance} is {state} on {host}'.format(
            nova_instance=stopped_server.name,
            state=stopped_server.status,
            host=hypervisor)
        LOG.info(server_info)
        if stopped_server.status == 'SHUTOFF':
            continue
        tobiko.fail(server_info)
Beispiel #13
0
def get_pcs_resources_table(timeout=720, interval=2) -> pandas.DataFrame:
    """
    get pcs status from a controller and parse it
    to have it's resources states in check
       returns :
       rabbitmq-bundle-0    (ocf::heartbeat:rabbitmq-cluster):      Started con
       troller-0
     ip-10.0.0.101  (ocf::heartbeat:IPaddr2):       Started controller-1
       openstack-cinder-volume-docker-0     (ocf::heartbeat:docker):        Sta
       rted controller-0

    The command is retried every ``interval`` seconds, for up to ``timeout``
    seconds, while its output cannot be parsed as a four columns table.

    :param timeout: maximum time in seconds spent retrying
    :param interval: pause in seconds between two attempts
    :return: dataframe of pcs resources stats table with columns
        resource, resource_type, resource_state and overcloud_node;
        ``tobiko.fail`` is called when no table could be parsed
    """
    failures: typing.List[str] = []
    # stays None until a table has been parsed successfully
    table: typing.Optional[pandas.DataFrame] = None
    start = time.time()

    ssh_client = get_random_controller_ssh_client()

    # prevent pcs table read failure while pacemaker is starting
    while time.time() - start < timeout:
        failures = []
        try:
            output = sh.execute("sudo pcs status resources |grep ocf",
                                ssh_client=ssh_client,
                                expect_exit_status=None).stdout
            # remove the first column when it only includes '*' characters
            output = output.replace('*', '').strip()
            stream = io.StringIO(output)
            # sep=r'\s+' is the documented equivalent of the deprecated
            # delim_whitespace=True option
            table = pandas.read_csv(stream, sep=r'\s+', header=None)
            table.columns = [
                'resource', 'resource_type', 'resource_state', 'overcloud_node'
            ]
        except ValueError:
            # raised on empty output or on unexpected number of columns
            table = None
            pcs_status_raw = sh.execute("sudo pcs status ",
                                        ssh_client=ssh_client,
                                        expect_exit_status=None).stdout
            failures.append(f'pcs status table import failed : '
                            f'pcs status stdout:\n {pcs_status_raw}')
            LOG.info('Retrying , timeout at: {}'.format(timeout -
                                                        (time.time() - start)))
            time.sleep(interval)
        else:
            break
    # exhausted all retries (or no attempt was made at all)
    if failures or table is None:
        tobiko.fail('pcs status table import error\n' + '\n'.join(failures))

    LOG.debug("Got pcs status :\n%s", table)
    return table
Beispiel #14
0
    def test_extra_dhcp_opts(self):
        """Check the ``domain-name`` extra DHCP option is applied to the VM.

        The option value is read from the extra DHCP options of the stack
        port; the test fails when the option is missing. The domain (with
        double quotes removed) then has to be the argument of a ``search``
        line in the ``/etc/resolv.conf`` file of the VM.
        """
        extra_dhcp_options = neutron.get_port_extra_dhcp_opts(
            self.stack.port_id)
        for option in extra_dhcp_options:
            if 'domain-name' == option['opt_name']:
                domain = option['opt_value'].replace('"', '')
                break
        else:
            tobiko.fail('No extra-dhcp-opt found for domain-name')

        vm_resolv_conf = sh.execute('cat /etc/resolv.conf',
                                    ssh_client=self.stack.ssh_client).stdout
        # escape the domain: its dots must only match literal dots
        search_line = r'^search\s+{domain}$'.format(
            domain=re.escape(domain))
        self.assertIsNotNone(
            re.search(search_line, vm_resolv_conf, re.MULTILINE))
Beispiel #15
0
def assert_downloaded_file(file_name: str,
                           headers_file_name: str,
                           ssh_client: ssh.SSHClientType = None,
                           sudo: bool = None):
    """Check the size of a downloaded file against its headers file.

    The last entry returned by ``read_headers_file`` is used. When it has a
    content length, the file size is asserted to match it; when reading the
    headers file raises ``sh.ShellCommandFailed`` the check fails.

    :param file_name: path of the downloaded file
    :param headers_file_name: path of the file holding the headers
    :param ssh_client: passed to the helpers reading the files
    :param sudo: passed to the helpers reading the files
    """
    try:
        last_header = read_headers_file(headers_file_name=headers_file_name,
                                        ssh_client=ssh_client,
                                        sudo=sudo)[-1]
    except sh.ShellCommandFailed as ex:
        tobiko.fail(f"Error reading headers file '{headers_file_name}': {ex}")
    else:
        expected_size = last_header.content_length
        if expected_size is None:
            # no content length available: nothing to compare with
            return
        sh.assert_file_size(file_size=expected_size,
                            file_name=file_name,
                            ssh_client=ssh_client,
                            sudo=sudo)
Beispiel #16
0
    def basic_overcloud_processes_running(self):
        """
        Poll the overcloud processes table until every process listed in
        ``self.processes_to_check`` is found in ``self.oc_procs_df``.

        When a process is missing the method waits one second, refreshes
        ``self.oc_procs_df`` and checks all the processes again, for up to
        600 attempts.

        :return: True as soon as all the processes are found; when all the
            attempts are exhausted ``tobiko.fail`` is called instead
        """
        # single source for the attempts count, so that the retry log
        # always reports the real limit
        max_attempts = 600

        for attempt_number in range(1, max_attempts + 1):

            try:

                for process_name in self.processes_to_check:
                    # osp16/python3 process is "neutron-server:"
                    if process_name == 'neutron-server' and \
                            self.oc_procs_df.query('PROCESS=="{}"'.format(
                                process_name)).empty:
                        process_name = 'neutron-server:'
                    if not self.oc_procs_df.query(
                            'PROCESS=="{}"'.format(process_name)).empty:
                        LOG.info("overcloud processes status checks: "
                                 "process {} is  "
                                 "in running state".format(process_name))
                        continue
                    else:
                        LOG.info(
                            "Failure : overcloud processes status checks:"
                            "process {} is not running ".format(process_name))
                        raise OvercloudProcessesException(
                            process_error="process {} is not running ".format(
                                process_name))
                # if all procs are running we can return true
                return True
            except OvercloudProcessesException:
                LOG.info('Retrying overcloud processes checks attempt '
                         '{} of {}'.format(attempt_number, max_attempts))
                time.sleep(1)
                self.oc_procs_df = overcloud.get_overcloud_nodes_dataframe(
                    get_overcloud_node_processes_table)
        # exhausted all retries
        tobiko.fail('Not all overcloud processes are running !\n')
Beispiel #17
0
def check_ping_statistics(failure_limit=10):
    """Gets a list of ping_vm_log files and
    iterates their lines, checks if max ping
    failures have been reached per fip=file

    Every non-blank line of a log file is parsed as a JSON document; a line
    counts as a failure when its 'transmitted' and 'received' values
    differ. Each file is renamed as checked once it has been processed.

    :param failure_limit: number of failures in one file that makes the
        check fail
    """
    # iterate over ping_vm_log files:
    for filename in list(get_vm_ping_log_files()):
        failure_counter = 0
        # line that made the counter reach the limit, if any
        limit_reached_line = None
        with io.open(filename, 'rt') as fd:
            LOG.info(f'checking ping log file: {filename}, '
                     f'failure_limit is :{failure_limit}')
            # iterate lazily instead of loading the whole file in memory
            for raw_line in fd:
                raw_line = raw_line.strip()
                if not raw_line:
                    # tolerate blank lines (e.g. a trailing new line)
                    continue
                ping_line = json.loads(raw_line)
                if ping_line['transmitted'] != ping_line['received']:
                    failure_counter += 1
                    LOG.info(f'found ping failure: {ping_line}')
                    if failure_counter >= failure_limit:
                        limit_reached_line = ping_line
                        break

        # rename only after the file has been closed
        if limit_reached_line is not None:
            rename_ping_staistics_file_to_checked(filename)
            tobiko.fail(f'{failure_counter} pings failure found '
                        f'to vm fip destination: '
                        f'{limit_reached_line["destination"]}')
        # NOTE(review): this is also logged when some failures were found
        # but their number stayed below failure_limit
        LOG.info(f'no failures in ping log file: {filename}')
        rename_ping_staistics_file_to_checked(filename)
Beispiel #18
0
def check_vm_evacuations(vms_df_old=None, compute_host=None, timeout=600,
                         interval=2, check_no_evacuation=False):
    """check evacuation of vms
    input: old and new vms_state_tables dfs

    The current VMs table is fetched every ``interval`` seconds, for up to
    ``timeout`` seconds, and the host of every VM of ``vms_df_old`` is
    compared with its current one.

    :param vms_df_old: VMs table taken before the evacuation
    :param compute_host: passed to ``get_compute_vms_df``
    :param timeout: maximum time in seconds spent retrying
    :param interval: pause in seconds between two attempts
    :param check_no_evacuation: when False a VM fails the check if its host
        did not change; when True it fails if its host changed
    :return: None as soon as all the VMs pass the check; ``tobiko.fail`` is
        called if some VM still fails when the timeout expires
    """
    failures = []
    start = time.time()

    while time.time() - start < timeout:
        failures = []
        vms_df_new = get_compute_vms_df(compute_host)
        for vm_id in vms_df_old.vm_id.to_list():
            old_vm_host = vm_location(vm_id, vms_df_old)
            new_vm_host = vm_location(vm_id, vms_df_new)

            if check_no_evacuation:
                failed = bool(old_vm_host != new_vm_host)
            else:
                failed = bool(old_vm_host == new_vm_host)

            if failed:
                failures.append(
                    'failed vm evacuations: {}\n\n'.format(vm_info(vm_id,
                                                           vms_df_old)))

        # the verdict is taken only after ALL the VMs have been checked:
        # doing it inside the loop returned success after the first VM
        if not failures:
            LOG.info(vms_df_old.to_string())
            LOG.info('All vms were evacuated!')
            return

        LOG.info('Failed nova evacuation:\n {}'.format(failures))
        LOG.info('Not all nova vms evacuated ..')
        LOG.info('Retrying , timeout at: {}'
                 .format(timeout-(time.time() - start)))
        time.sleep(interval)
    # exhausted all retries
    if failures:
        tobiko.fail(
            'failed vm evacuations:\n{!s}', '\n'.join(failures))
Beispiel #19
0
def test_neutron_agents_are_alive(timeout=300., interval=5.) \
        -> tobiko.Selection[neutron.NeutronAgentType]:
    """Wait for all Neutron agents to be alive and return them.

    The agents list is fetched on every attempt of a ``tobiko.retry`` loop.
    Errors fetching the list and agents with ``alive=False`` are tolerated
    until the last attempt, when respectively the error is re-raised or the
    check fails.

    :param timeout: passed to ``tobiko.retry``
    :param interval: passed to ``tobiko.retry``
    :return: the agents list obtained by the successful attempt
    """
    neutron_errors = (neutron.ServiceUnavailable,
                      neutron.NeutronClientException,
                      exceptions.connection.ConnectFailure)

    for attempt in tobiko.retry(timeout=timeout, interval=interval):
        LOG.debug("Look for unhealthy Neutron agents...")
        try:
            # get Neutron agent list
            agents = neutron.list_agents()
        except neutron_errors as ex:
            if attempt.is_last:
                raise
            # retry because Neutron server could still be unavailable
            # after a disruption
            LOG.debug(f"Waiting for neutron service... ({ex})")
            continue  # Let retry

        if not agents.with_items(alive=False):
            LOG.debug(f"All {len(agents)} Neutron agents are alive.")
            break

        # note: the details include the whole agents list
        agents_details = json.dumps(agents, indent=4, sort_keys=True)
        if attempt.is_last:
            tobiko.fail("Unhealthy agent(s) found:\n"
                        f"{agents_details}\n")
        else:
            # retry because some Neutron agent could still be unavailable
            # after a disruption
            LOG.debug("Waiting for Neutron agents to get alive...\n"
                      f"{agents_details}")
    else:
        raise RuntimeError("Retry loop broken")

    return agents
Beispiel #20
0
def assert_containers_running(group,
                              expected_containers,
                              full_name=True,
                              bool_check=False):
    """assert that all containers specified in the list are running
    on the specified openstack group(controller or compute etc..)
    if bool_check is True then return only True or false without failing

    :param group: openstack nodes group whose nodes are checked
    :param expected_containers: names of the containers expected to run
    :param full_name: when True container names have to match exactly,
        otherwise it is enough for them to contain the expected name
    :param bool_check: when True failures are reported by returning False
        instead of calling ``tobiko.fail``
    :return: True when exactly one running container is found for every
        expected container on every node, False on failures when
        ``bool_check`` is True, None when ``is_docker()`` is true
    """

    if is_docker():
        LOG.info('not checking common containers since we are on docker')
        return

    failures = []

    openstack_nodes = topology.list_openstack_nodes(group=group)
    for node in openstack_nodes:
        node_containers = list_node_containers(ssh_client=node.ssh_client)
        containers_list_df = pandas.DataFrame(
            get_container_states_list(node_containers),
            columns=['container_host', 'container_name', 'container_state'])
        # check that the containers are present
        LOG.info('node: {} containers list : {}'.format(
            node.name, containers_list_df.to_string(index=False)))
        for container in expected_containers:
            # get container attrs dataframe
            if full_name:
                container_attrs = containers_list_df.query(
                    'container_name == "{}"'.format(container))
            else:
                container_attrs = containers_list_df[containers_list_df[
                    'container_name'].str.contains(container)]
            # check if the container exists
            LOG.info('checking container: {}'.format(container))
            if container_attrs.empty:
                failures.append(
                    'expected container {} not found on node {} ! : \n\n'.
                    format(container, node.name))
                continue

            # if container exists, check it is running
            # only one running container is expected
            container_running_attrs = container_attrs.query(
                'container_state=="running"')
            if container_running_attrs.empty:
                # several containers can match (e.g. partial names): join
                # their states instead of requiring a single value
                container_states = ', '.join(
                    str(state)
                    for state in container_attrs.container_state.values)
                failures.append(
                    'expected container {} is not running on node {} , '
                    'its state is {}! : \n\n'.format(
                        container, node.name, container_states))
            elif len(container_running_attrs) > 1:
                failures.append(
                    'only one running container {} was expected on '
                    'node {}, but got {}! : \n\n'.format(
                        container, node.name,
                        len(container_running_attrs)))

    if not bool_check and failures:
        tobiko.fail('container states mismatched:\n{!s}', '\n'.join(failures))

    elif bool_check and failures:
        return False

    else:
        LOG.info('All specified containers are in running state! ')
        return True
Beispiel #21
0
def assert_equal_containers_state(expected_containers_list=None,
                                  timeout=120,
                                  interval=2,
                                  recreate_expected=False):
    """Compare the current overcloud container states with expected ones.

    The expected states come from ``expected_containers_list`` when given,
    otherwise from the ``expected_containers_file`` CSV file. When
    ``recreate_expected`` is true, or when there is neither a list nor a
    file, the current states are saved to file and nothing is compared.

    The comparison is retried every ``interval`` seconds, for up to
    ``timeout`` seconds; if the states still differ the check fails.
    """
    state_columns = ['container_host', 'container_name', 'container_state']

    # without any reference (or when asked to) only record the current state
    if recreate_expected or not (
            expected_containers_list
            or os.path.exists(expected_containers_file)):
        save_containers_state_to_file(list_containers())
        return

    if expected_containers_list:
        expected_df = pandas.DataFrame(
            get_container_states_list(expected_containers_list),
            columns=state_columns)
    elif os.path.exists(expected_containers_file):
        expected_df = pandas.read_csv(expected_containers_file)

    failures = []
    start = time.time()
    error_info = 'Output explanation: left_only is the original state, ' \
                 'right_only is the new state'

    while time.time() - start < timeout:
        failures = []
        actual_df = list_containers_df()

        LOG.info('expected_containers_list_df: {} '.format(
            expected_df.to_string(index=False)))
        LOG.info('actual_containers_list_df: {} '.format(
            actual_df.to_string(index=False)))

        # rows that differ between the expected and the actual containers
        changed_df = dataframe_difference(expected_df, actual_df)
        if changed_df.empty:
            LOG.info("assert_equal_containers_state :"
                     " OK, all containers are on the same state")
            return

        failures.append(
            'expected containers changed state ! : '
            '\n\n{}\n{}'.format(changed_df.to_string(index=False),
                                error_info))
        LOG.info('container states mismatched:\n{}\n'.format(failures))
        time.sleep(interval)
        # clear cache to obtain new data
        list_node_containers.cache_clear()

    if failures:
        tobiko.fail('container states mismatched:\n{!s}', '\n'.join(failures))
Beispiel #22
0
 def assert_not_transmitted(self):
     """Fail when ``self.transmitted`` is truthy, pass silently otherwise."""
     if not self.transmitted:
         return
     failure_message = (f"{self.transmitted} package(s) has been "
                        f"transmitted to {self.destination}")
     tobiko.fail(failure_message)
Beispiel #23
0
def assert_unreachable_hosts(hosts, **params):
    """Fail when ``_ping.list_reachable_hosts`` reports any of the hosts.

    :param hosts: hosts passed to ``_ping.list_reachable_hosts``
    :param params: extra keyword parameters forwarded to it unchanged
    """
    reached = _ping.list_reachable_hosts(hosts, **params)
    if not reached:
        return
    tobiko.fail("Reached host(s): {!r}", reached)
Beispiel #24
0
def assert_reachable_hosts(hosts, **params):
    """Fail when ``_ping.list_unreachable_hosts`` reports any of the hosts.

    :param hosts: hosts passed to ``_ping.list_unreachable_hosts``
    :param params: extra keyword parameters forwarded to it unchanged
    """
    not_reached = _ping.list_unreachable_hosts(hosts, **params)
    if not not_reached:
        return
    tobiko.fail("Unable to reach host(s): {!r}", not_reached)
Beispiel #25
0
 def assert_not_replied(self):
     """Fail when ``self.received`` is truthy, pass silently otherwise."""
     if not self.received:
         return
     failure_message = (f"{self.received} reply package(s) has been received "
                        f"from {self.destination}")
     tobiko.fail(failure_message)