Example #1
    def test_lock_with_max_vms_simplex(self, simplex_only):
        vms_num = host_helper.get_max_vms_supported(host='controller-0')
        vm_helper.ensure_vms_quotas(vms_num=vms_num)

        LOG.tc_step(
            "Boot {} vms with various storage settings".format(vms_num))
        vms = vm_helper.boot_vms_various_types(cleanup='function',
                                               vms_num=vms_num)

        LOG.tc_step("Lock vm host on simplex system")
        HostsToRecover.add('controller-0')
        host_helper.lock_host('controller-0')

        LOG.tc_step("Ensure vms are in {} state after locked host come "
                    "online".format(VMStatus.STOPPED))
        vm_helper.wait_for_vms_values(vms,
                                      value=VMStatus.STOPPED,
                                      fail_ok=False)

        LOG.tc_step("Unlock host on simplex system")
        host_helper.unlock_host(host='controller-0')

        LOG.tc_step("Ensure vms are Active and Pingable from NatBox")
        vm_helper.wait_for_vms_values(vms,
                                      value=VMStatus.ACTIVE,
                                      fail_ok=False,
                                      timeout=600)
        for vm in vms:
            vm_helper.wait_for_vm_pingable_from_natbox(
                vm, timeout=VMTimeout.DHCP_RETRY)
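
The simplex_only fixture is not shown in this snippet; a minimal sketch, assuming the system_helper.is_aio_simplex() helper used in later examples on this page (the import path is an assumption):

from pytest import fixture, skip

from keywords import system_helper  # import path is an assumption


@fixture()
def simplex_only():
    # Skip the test unless the lab is an all-in-one simplex system
    if not system_helper.is_aio_simplex():
        skip("Test requires an AIO simplex system")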
Example #2
def test_set_hosts_storage_backing_min(instance_backing, number_of_hosts):
    """
    Modify hosts' storage backing if needed so that the system has the
    minimum number of hosts with the given instance backing

    Args:
        instance_backing: storage backing to configure hosts with
        number_of_hosts: 'one', 'two' or 'all'

    Test Steps:
        - Calculate the hosts to be configured based on test params
        - Configure hosts to meet given criteria
        - Check number of hosts in given instance backing is as specified

    """
    LOG.tc_step("Determine the hosts to configure")
    hosts = host_helper.get_up_hypervisors()
    hosts_len = len(hosts)
    host_num_mapping = {'all': hosts_len, 'two': 2, 'one': 1}
    number_of_hosts = host_num_mapping[number_of_hosts]

    hosts_with_backing = host_helper.get_hosts_in_storage_backing(
        instance_backing)
    if len(hosts_with_backing) >= number_of_hosts:
        LOG.info("Already have {} hosts in {} backing. Do nothing".format(
            len(hosts_with_backing), instance_backing))
        return

    candidate_hosts = get_candidate_hosts(number_of_hosts=number_of_hosts)

    number_to_config = number_of_hosts - len(hosts_with_backing)
    hosts_to_config = list(set(candidate_hosts) -
                           set(hosts_with_backing))[0:number_to_config]

    LOG.tc_step(
        "Delete any vms with best effort to prepare for the system "
        "configuration change")
    vm_helper.delete_vms(fail_ok=True)

    LOG.tc_step("Configure following hosts to {} backing: {}".format(
        hosts_to_config, instance_backing))
    for host in hosts_to_config:
        HostsToRecover.add(host)
        host_helper.set_host_storage_backing(host=host,
                                             inst_backing=instance_backing,
                                             unlock=False,
                                             wait_for_configured=False)

    host_helper.unlock_hosts(hosts_to_config,
                             check_hypervisor_up=True,
                             fail_ok=False)

    LOG.tc_step("Waiting for hosts in {} aggregate".format(instance_backing))
    for host in hosts_to_config:
        host_helper.wait_for_host_in_instance_backing(
            host, storage_backing=instance_backing)

    LOG.tc_step("Check number of {} hosts is at least {}".format(
        instance_backing, number_of_hosts))
    assert number_of_hosts <= len(host_helper.get_hosts_in_storage_backing(instance_backing)), \
        "Number of {} hosts is less than {} after configuration".format(instance_backing, number_of_hosts)
Example #3
def host_to_modify(request):
    """
    Select a hypervisor from existing hosts to test

    Args:
        request: pytest arg

    Returns (str): hostname

    """

    target_host = host_helper.get_up_hypervisors()[0]
    original_backing = host_helper.get_host_instance_backing(host=target_host)

    # Ensure unlock attempt on target_host after running all test cases using this fixture
    HostsToRecover.add(target_host, scope='module')

    def revert_host():
        LOG.fixture_step("Revert {} storage backing to {} if needed".format(
            target_host, original_backing))
        host_helper.set_host_storage_backing(target_host,
                                             inst_backing=original_backing,
                                             check_first=True,
                                             lock=True,
                                             unlock=True)

    request.addfinalizer(revert_host)

    return target_host
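
A test consuming this fixture simply accepts it as an argument; a hypothetical example (the 'remote' backing value is an assumption):

def test_host_backing_change(host_to_modify):
    # The fixture returns the hostname and reverts its backing on teardown
    host_helper.set_host_storage_backing(host_to_modify,
                                         inst_backing='remote',
                                         lock=True,
                                         unlock=True)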
Example #4
def test_force_reboot_host(host_type):
    """
    Verify force reboot host

    Test Steps:
        - Select a host per given type. If type is controller, select standby
            controller.
        - Force reboot the selected host
        - Wait for the host to recover and become ready

    """

    LOG.tc_step("Select a {} node from system if any".format(host_type))
    hosts = system_helper.get_hosts(availability=(HostAvailState.AVAILABLE,
                                                  HostAvailState.DEGRADED),
                                    personality=host_type)
    if not hosts:
        skip("No available or degraded {} host found on system".format(
            host_type))

    host = hosts[0]
    LOG.tc_step("Force reboot {} host: {}".format(host_type, host))
    HostsToRecover.add(host)
    host_helper.reboot_hosts(hostnames=host)
    host_helper.wait_for_hosts_ready(host)
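
A hypothetical parametrization for this test; the three personalities appear throughout the other examples on this page:

from pytest import mark


@mark.parametrize('host_type', ['controller', 'compute', 'storage'])
def test_force_reboot_host(host_type):
    ...  # body as shown above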
Example #5
    def test_invalid_huge_page_input(self, get_host, proc, pages):
        """
        (From sysinv testplan item 55: invalid inputs for number of
        hugepages will be rejected in the GUI)
        Give an invalid huge page number on a compute node and verify that
        the modification is rejected

        Setup:
            - check that there are at least two compute nodes

        Test Steps:
            - lock compute node
            - modify the huge page on the locked compute node
            - unlock the compute node
            - compare the huge page number with the expected huge page number

        Teardown:
            - It might be a good idea to reset the host memory to what it was before

        """
        host_to_modify = get_host

        LOG.tc_step("Lock host")
        HostsToRecover.add(host_to_modify, scope='class')
        host_helper.lock_host(host_to_modify)

        # config the page number after lock the compute node
        LOG.tc_step(
            'Attempt to modify host memory with invalid page input and ensure it is rejected'
        )
        args = "{} {} {}".format(host_to_modify, proc, pages)
        code, output = cli.system('host-memory-modify', args, fail_ok=True)

        assert 1 == code, "host-memory-modify allows invalid args: {}".format(
            args)
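
A hypothetical parametrization with invalid inputs, applied to the method inside its test class; the exact argument format accepted by 'system host-memory-modify' is an assumption:

from pytest import mark


@mark.parametrize(('proc', 'pages'), [
    (0, '-2M -1'),      # negative 2M page count (assumed format)
    (0, '-1G abc'),     # non-numeric 1G page count (assumed format)
])
def test_invalid_huge_page_input(self, get_host, proc, pages):
    ...  # body as shown above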
Example #6
    def config_host_func(host, modify_func, revert_func=None, *args, **kwargs):

        HostsToRecover.add(host, scope=scope)
        LOG.fixture_step("({}) Lock host: {}".format(scope, host))
        host_helper.lock_host(host=host, swact=True)

        # add teardown before running modify (as long as host is locked
        # successfully) in case modify or unlock fails.
        if revert_func is not None:

            def revert_host():
                LOG.fixture_step("({}) Lock host: {}".format(scope, host))
                host_helper.lock_host(host=host, swact=True)
                try:
                    LOG.fixture_step("({}) Execute revert function: {}".format(
                        scope, revert_func))
                    revert_func(host)
                finally:
                    LOG.fixture_step("({}) Unlock host: {}".format(
                        scope, host))
                    # Put it in finally block in case revert_func fails -
                    # host will still be unlocked for other tests.
                    host_helper.unlock_host(host=host)

            request.addfinalizer(revert_host)

        LOG.fixture_step("({}) Execute modify function: {}".format(
            scope, modify_func))
        modify_func(host, *args, **kwargs)

        LOG.fixture_step("({}) Unlock host: {}".format(scope, host))
        host_helper.unlock_host(host=host)
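
The enclosing fixture presumably returns config_host_func so a test can lock a host, apply an arbitrary change and unlock it again; a hypothetical caller (the hostname and memory values are assumptions, modify_host_memory usage follows a later example on this page):

def modify_mem(host):
    host_helper.modify_host_memory(host, proc=0, gib_4k_range=(2, 4))


def revert_mem(host):
    host_helper.modify_host_memory(host, proc=0, gib_4k_range=(4, 6))


config_host_func('compute-1', modify_func=modify_mem, revert_func=revert_mem)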
Example #7
def lock_controller():
    LOG.fixture_step(
        "Ensure system has no standby controller available for swact")
    standby = system_helper.get_standby_controller_name()

    if standby:
        HostsToRecover.add(standby)
        host_helper.lock_host(standby, swact=False)
Example #8
def test_restapi_sysinv_modify_cpu(prepare_modify_cpu):
    """
    TC2043
    Modify cpu parameters through API

    Test Steps:
        - Lock a compute
        - Apply the profile to the locked compute
        - Unlock compute and verify that the correct changes were made

    Teardown:
        - Delete cpu profile
        - Revert cpu changes

    """
    hostname, uuid, iprofile_uuid = prepare_modify_cpu
    headers = get_headers()

    url = html_helper.create_url(IP_ADDR, HTTPPort.SYS_PORT, HTTPPort.SYS_VER,
                                 "ihosts")
    hosts = html_helper.get_request(url=url, headers=headers,
                                    verify=False)['ihosts']
    found = False
    for host in hosts:
        if host['uuid'] == uuid:
            found = True
            break

    assert found, "FAIL: {} is not listed in the API".format(hostname)

    LOG.tc_step("Locking {} via restAPI".format(hostname))
    url = html_helper.create_url(IP_ADDR, HTTPPort.SYS_PORT, HTTPPort.SYS_VER,
                                 "ihosts/{}".format(uuid))
    lock_data = [{"path": "/action", "value": "lock", "op": "replace"}]
    HostsToRecover.add(hostname, scope='function')
    html_helper.patch_request(url=url,
                              headers=headers,
                              data=lock_data,
                              verify=False)

    system_helper.wait_for_host_values(hostname,
                                       timeout=HostTimeout.LOCK,
                                       administrative=HostAdminState.LOCKED)

    hostinfo = html_helper.get_request(url=url, headers=headers, verify=False)
    assert 'locked' == hostinfo[
        'administrative'], "FAIL: Couldn't lock {}".format(hostname)

    LOG.tc_step("Modify {} vSwitch cpu using CLI".format(hostname))
    res, out = host_helper.modify_host_cpu(hostname,
                                           'Platform',
                                           p0=2,
                                           p1=2,
                                           fail_ok=True)
    assert 1 == res, "FAIL: platform cpu modify passed with invalid config option"

    res, out = host_helper.modify_host_cpu(hostname, 'Platform', p0=2)
    assert 0 == res, "FAIL: platform cpu modify failed by cli"
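
Unlocking over the REST API would mirror the lock request above; a sketch under the assumption that the same patch semantics apply:

unlock_data = [{"path": "/action", "value": "unlock", "op": "replace"}]
html_helper.patch_request(url=url, headers=headers, data=unlock_data,
                          verify=False)
# Reusing the lock timeout constant here is an assumption
system_helper.wait_for_host_values(hostname,
                                   timeout=HostTimeout.LOCK,
                                   administrative=HostAdminState.UNLOCKED)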
Example #9
    def test_reboot_only_host(self, get_zone):
        """
        Test reboot only hypervisor on the system

        Args:
            get_zone: fixture to create stxauto aggregate, to ensure vms can
            only be booted on one host

        Setups:
            - If more than 1 hypervisor: Create stxauto aggregate and add
            one host to the aggregate

        Test Steps:
            - Launch various vms on target host
                - vm booted from cinder volume,
                - vm booted from glance image,
                - vm booted from glance image, and have an extra cinder
                volume attached after launch,
                - vm booted from cinder volume with ephemeral and swap disks
            - sudo reboot -f only host
            - Check host is recovered
            - Check vms are recovered and reachable from NatBox

        """
        zone = get_zone

        LOG.tc_step("Launch 5 vms in {} zone".format(zone))
        vms = vm_helper.boot_vms_various_types(avail_zone=zone,
                                               cleanup='function')
        target_host = vm_helper.get_vm_host(vm_id=vms[0])
        for vm in vms[1:]:
            vm_host = vm_helper.get_vm_host(vm)
            assert target_host == vm_host, "VMs are not booted on same host"

        LOG.tc_step("Reboot -f from target host {}".format(target_host))
        HostsToRecover.add(target_host)
        host_helper.reboot_hosts(target_host)

        LOG.tc_step("Check vms are in Active state after host come back up")
        res, active_vms, inactive_vms = vm_helper.wait_for_vms_values(
            vms=vms, value=VMStatus.ACTIVE, timeout=600)

        vms_host_err = []
        for vm in vms:
            if vm_helper.get_vm_host(vm) != target_host:
                vms_host_err.append(vm)

        assert not vms_host_err, "Following VMs are not on the same host {}: " \
                                 "{}\nVMs did not reach Active state: {}". \
            format(target_host, vms_host_err, inactive_vms)

        assert not inactive_vms, "VMs did not reach Active state after " \
                                 "evacuated to other host: " \
                                 "{}".format(inactive_vms)

        LOG.tc_step("Check VMs are pingable from NatBox after evacuation")
        vm_helper.wait_for_vm_pingable_from_natbox(
            vms, timeout=VMTimeout.DHCP_RETRY)
Example #10
def test_system_persist_over_host_reboot(host_type, stx_openstack_required):
    """
    Validate the inventory summary over a reboot of one of the hosts to see if data persists over the reboot

    Test Steps:
        - capture the inventory summary for the list of hosts via system service-list and neutron agent-list
        - reboot the selected host
        - Wait for reboot to complete
        - Validate key items from inventory persist over reboot

    """
    if host_type == 'controller':
        host = system_helper.get_active_controller_name()
    elif host_type == 'compute':
        if system_helper.is_aio_system():
            skip("No compute host for AIO system")

        host = None
    else:
        hosts = system_helper.get_hosts(personality='storage')
        if not hosts:
            skip(msg="Lab has no storage nodes. Skip rebooting storage node.")

        host = hosts[0]

    LOG.tc_step("Pre-check for system status")
    system_helper.wait_for_services_enable()
    up_hypervisors = host_helper.get_up_hypervisors()
    network_helper.wait_for_agents_healthy(hosts=up_hypervisors)

    LOG.tc_step("Launch a vm")
    vm_id = vm_helper.boot_vm(cleanup='function')[1]
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

    if host is None:
        host = vm_helper.get_vm_host(vm_id)

    LOG.tc_step("Reboot a {} node and wait for reboot completes: {}".format(host_type, host))
    HostsToRecover.add(host)
    host_helper.reboot_hosts(host)
    host_helper.wait_for_hosts_ready(host)

    LOG.tc_step("Check vm is still active and pingable after {} reboot".format(host))
    vm_helper.wait_for_vm_status(vm_id, status=VMStatus.ACTIVE, fail_ok=False)
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm_id, timeout=VMTimeout.DHCP_RETRY)

    LOG.tc_step("Check neutron agents and system services are in good state after {} reboot".format(host))
    network_helper.wait_for_agents_healthy(up_hypervisors)
    system_helper.wait_for_services_enable()

    if host in up_hypervisors:
        LOG.tc_step("Check {} can still host vm after reboot".format(host))
        if vm_helper.get_vm_host(vm_id) != host:
            time.sleep(30)
            vm_helper.live_migrate_vm(vm_id, destination_host=host)
Example #11
def test_force_lock_with_non_mig_vms(add_host_to_zone):
    """
    Test force lock host with non-migrate-able vms on it

    Prerequisites:
        - Minimum of two up hypervisors
    Test Setups:
        - Add admin role to primary tenant
        - Create cgcsauto aggregate
        - Add host_under_test to cgcsauto aggregate
        - Create flavor for vms_to_test with storage_backing support by host_under_test
        - Create vms_to_test on host_under_test that can be live migrated
    Test Steps:
        - Force lock target host
        - Verify force lock returns 0
        - Verify VMs cannot find a host to boot and are in error state
        - Unlock locked target host
        - Verify VMs are active on host once it is up and available
        - Verify VMs can be pinged
    Test Teardown:
        - Remove admin role from primary tenant
        - Delete created vms
        - Remove host_under_test from cgcsauto aggregate
    """
    storage_backing, host_under_test = add_host_to_zone

    # Create flavor with storage_backing the host_under_test supports
    flavor_id = nova_helper.create_flavor(storage_backing=storage_backing)[1]

    # Boot VMs on the host using the above flavor.
    LOG.tc_step("Boot VM on {}".format(host_under_test))
    vm_id = vm_helper.boot_vm(vm_host=host_under_test,
                              flavor=flavor_id,
                              avail_zone='cgcsauto',
                              cleanup='function')[1]
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

    # Force lock host that VMs are booted on.
    LOG.tc_step("Force lock {}".format(host_under_test))
    HostsToRecover.add(host_under_test)
    lock_code, lock_output = host_helper.lock_host(host_under_test, force=True)
    assert lock_code == 0, "Failed to lock {}. Details: {}".format(
        host_under_test, lock_output)

    vm_helper.wait_for_vm_values(vm_id, fail_ok=False, **{'status': 'ERROR'})

    host_helper.unlock_host(host_under_test)

    vm_helper.wait_for_vm_values(vm_id,
                                 timeout=300,
                                 fail_ok=False,
                                 **{'status': 'ACTIVE'})
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id,
                                               timeout=VMTimeout.DHCP_RETRY)
Example #12
def get_hugepage_pod_file():
    """
    Fixture used to return the hugepage deployment file

        - Get compute-0 if it exists, else the standby controller
        - Check if 2M hugepages are configured, else check if 1G is
            configured; otherwise lock the host, configure 2G of 1G
            hugepages and unlock it
        - Call the modify_yaml function to modify the yaml
          file with the values
        - Scp the modified file to the host to deploy the hugepages pod
        - Delete the hugepages pod from the host after the test

    """
    if system_helper.is_aio_duplex():
        hostname = system_helper.get_standby_controller_name()
    else:
        hostname = system_helper.get_hypervisors()[0]
    LOG.fixture_step("Checking hugepage values on {}".format(hostname))
    proc_id = 0
    out = host_helper.get_host_memories(hostname,
                                        ('app_hp_avail_2M', 'app_hp_avail_1G'),
                                        proc_id)
    if out[proc_id][0] > 0:
        hugepage_val = "{}Mi".format(out[proc_id][0])
        hugepage_str = "hugepages-2Mi"
    elif out[proc_id][1] > 0:
        hugepage_val = "{}Gi".format(out[proc_id][1])
        hugepage_str = "hugepages-1Gi"
    else:
        hugepage_val = "{}Gi".format(2)
        cmd = "{} -1G {}".format(proc_id, 2)
        hugepage_str = "hugepages-1Gi"
        HostsToRecover.add(hostname)
        host_helper.lock_host(hostname)
        LOG.fixture_step("Configuring hugepage values {} on {}".format(
            hugepage_val, hostname))
        cli.system('host-memory-modify {} {}'.format(hostname, cmd),
                   ssh_client=None,
                   auth_info=Tenant.get('admin_platform'))
        host_helper.unlock_host(hostname)
    LOG.fixture_step("{} {} pod will be configured on {} proc id {}".format(
        hugepage_str, hugepage_val, hostname, proc_id))
    file_dir, file_name = modify_yaml("utils/test_files/",
                                      "hugepages_pod.yaml", hugepage_str,
                                      hugepage_val)
    source_path = "{}/{}".format(file_dir, file_name)
    home_dir = HostLinuxUser.get_home()
    common.scp_from_localhost_to_active_controller(source_path,
                                                   dest_path=home_dir)
    yield file_name
    LOG.fixture_step("Delete hugepages pod")
    kube_helper.delete_resources(resource_names="hugepages-pod")
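
modify_yaml is not shown on this page; a minimal sketch of what it might do, assuming PyYAML is available and the pod spec's first container carries the hugepage resource limit:

import os

import yaml  # PyYAML; the dependency is an assumption


def modify_yaml(file_dir, file_name, hugepage_str, hugepage_val):
    """Set the hugepage resource limit in a pod yaml and write a new file."""
    with open(os.path.join(file_dir, file_name)) as f:
        pod = yaml.safe_load(f)

    # Assumed structure: the first container's limits hold the hugepage key
    limits = pod['spec']['containers'][0]['resources']['limits']
    limits[hugepage_str] = hugepage_val

    new_name = 'modified_{}'.format(file_name)
    with open(os.path.join(file_dir, new_name), 'w') as f:
        yaml.safe_dump(pod, f)
    return file_dir, new_name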
Example #13
def test_evacuate_vms_stress(add_hosts_to_zone):
    """
    Test evacuate vms with various vm storage configs and host instance backing configs

    Args:
        add_hosts_to_zone: test fixture that returns the storage backing
            under test and the hosts added to the cgcsauto zone

    Skip conditions:
        - Less than two hosts configured with storage backing under test

    Setups:
        - Add admin role to primary tenant (module)

    Test Steps:
        - Create flv_rootdisk without ephemeral or swap disks, and set storage backing extra spec
        - Create flv_ephemswap with ephemeral AND swap disks, and set storage backing extra spec
        - Boot following vms on same host and wait for them to be pingable from NatBox:
            - Boot vm1 from volume with flavor flv_rootdisk
            - Boot vm2 from volume with flavor flv_localdisk
            - Boot vm3 from image with flavor flv_rootdisk
            - Boot vm4 from image with flavor flv_rootdisk, and attach a volume to it
            - Boot vm5 from image with flavor flv_localdisk
        - power-off host from vlm
        - Ensure evacuation for all 5 vms are successful (vm host changed, active state, pingable from NatBox)
        - Repeat above evacuation steps

    Teardown:
        - Delete created vms, volumes, flavors
        - Remove admin role from primary tenant (module)

    """
    storage_backing, hosts = add_hosts_to_zone
    zone = 'cgcsauto'

    HostsToRecover.add(hosts)

    initial_host = hosts[0]

    vms = vm_helper.boot_vms_various_types(storage_backing=storage_backing, target_host=initial_host, avail_zone=zone)

    target_host = initial_host

    for i in range(100):
        post_host = hosts[0] if target_host != hosts[0] else hosts[1]
        LOG.info("\n===============Iteration {}============".format(i+1))
        vm_helper.evacuate_vms(target_host, vms, wait_for_host_up=True, post_host=post_host, timeout=720, vlm=True,
                               ping_vms=True)

        target_host = post_host
        LOG.info("Rest for 120 seconds before next evacuation")
        time.sleep(120)
Example #14
def test_lock_unlock_host(host_type):
    """
    Verify lock unlock host

    Test Steps:
        - Select a host per given type. If type is controller, select
            standby controller.
        - Lock selected host and ensure it is successfully locked
        - Unlock selected host and ensure it is successfully unlocked

    """
    LOG.tc_step("Select a {} node from system if any".format(host_type))
    if host_type == 'controller':
        if system_helper.is_aio_simplex():
            host = 'controller-0'
        else:
            host = system_helper.get_standby_controller_name()
            assert host, "No standby controller available"

    else:
        if host_type == 'compute' and system_helper.is_aio_system():
            skip("No compute host on AIO system")
        elif host_type == 'storage' and not system_helper.is_storage_system():
            skip("System does not have storage nodes")

        hosts = system_helper.get_hosts(personality=host_type,
                                        availability=HostAvailState.AVAILABLE,
                                        operational=HostOperState.ENABLED)

        assert hosts, "No good {} host on system".format(host_type)
        host = hosts[0]

    LOG.tc_step("Lock {} host - {} and ensure it is successfully "
                "locked".format(host_type, host))
    HostsToRecover.add(host)
    host_helper.lock_host(host, swact=False)

    # wait for services to stabilize before unlocking
    time.sleep(20)

    # unlock standby controller node and verify controller node is
    # successfully unlocked
    LOG.tc_step("Unlock {} host - {} and ensure it is successfully "
                "unlocked".format(host_type, host))
    host_helper.unlock_host(host)
Example #15
def test_force_lock_with_mig_vms(get_hosts_with_backing):
    """
    Test force lock host with migrate-able vms on it

    Prerequisites:
        - Minimum of two hosts supporting the same storage backing.
    Test Setups:
        - Add admin role to primary tenant
        - Boot various VMs on host_under_test that can be live migrated
    Test Steps:
        - Get status info from VMs
        - Force lock target host
        - Verify force lock returns 0
        - Wait until VMs are active on a secondary host
        - Verify VMs can be pinged
    Test Teardown:
        - Remove admin role from primary tenant
        - Delete created vms
        - Unlock locked target host(s)
    """
    storage_backing, host_under_test = get_hosts_with_backing

    # Boot VMs on the host.
    LOG.tc_step("Boot VMs on {}".format(host_under_test))
    vm_ids = vm_helper.boot_vms_various_types(storage_backing=storage_backing,
                                              target_host=host_under_test,
                                              cleanup='function')

    # Force lock host that VMs are booted on
    LOG.tc_step("Force lock {}".format(host_under_test))
    HostsToRecover.add(host_under_test)
    lock_code, lock_output = host_helper.lock_host(host_under_test,
                                                   force=True,
                                                   check_first=False)
    assert lock_code == 0, "Failed to force lock {}. Details: {}".format(
        host_under_test, lock_output)

    # Expect VMs to migrate off force-locked host (non-gracefully)
    LOG.tc_step(
        "Wait for 'Active' status of VMs after host force lock completes")
    vm_helper.wait_for_vms_values(vm_ids, fail_ok=False)

    for vm in vm_ids:
        vm_helper.wait_for_vm_pingable_from_natbox(
            vm, timeout=VMTimeout.DHCP_RETRY)
Example #16
def test_host_operations_with_custom_kubectl_app(deploy_delete_kubectl_app):
    """
    Test create, delete custom app via kubectl run cmd
    Args:
        deploy_delete_kubectl_app: fixture

    Setups:
        - Create kubectl app via kubectl run

    Test Steps:
        - If duplex: swact and verify pod still Running
        - Lock/unlock controller and verify pod still Running

    Teardown:
        - Delete kubectl deployment and service
        - Verify pod is removed

    """
    app_name, pod_name = deploy_delete_kubectl_app
    active, standby = system_helper.get_active_standby_controllers()

    if standby:
        LOG.tc_step("Swact active controller and verify {} test app is "
                    "running ".format(pod_name))
        host_helper.swact_host()
        kube_helper.wait_for_pods_status(pod_names=pod_name,
                                         namespace='default',
                                         fail_ok=False)

    LOG.tc_step("Lock/unlock {} and verify {} test app is "
                "running.".format(active, pod_name))
    HostsToRecover.add(active)
    host_helper.lock_host(active, swact=False)

    # wait for services to stabilize before unlocking
    time.sleep(20)

    host_helper.unlock_host(active)
    pod_name = kube_helper.get_pods(field='NAME',
                                    namespace='default',
                                    name=app_name,
                                    strict=False)[0]
    kube_helper.wait_for_pods_status(pod_names=pod_name,
                                     namespace=None,
                                     fail_ok=False)
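
The deploy_delete_kubectl_app fixture is not shown; a heavily hedged sketch, assuming the app is created with kubectl run on the active controller (names, image and the exec_cmd call are all assumptions):

from pytest import fixture


@fixture()
def deploy_delete_kubectl_app():
    app_name = 'hello-kubectl'   # hypothetical name
    pod_name = app_name          # with --restart=Never the pod takes the app name

    con_ssh = ControllerClient.get_active_controller()
    con_ssh.exec_cmd('kubectl run {} --image=busybox --restart=Never '
                     '-- sleep 3600'.format(app_name))
    kube_helper.wait_for_pods_status(pod_names=pod_name,
                                     namespace='default',
                                     fail_ok=False)

    yield app_name, pod_name

    kube_helper.delete_resources(resource_names=pod_name)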
Example #17
        def cleanup():

            if not system_helper.is_storage_system():
                skip("This test requires a storage system")

            profiles_created = self._pop_cleanup_list('profile')
            old_new_types = self._pop_cleanup_list('local_storage_type')

            # Add hosts to module level recovery fixture in case of modify or unlock fail in following class level
            # recovery attempt.
            for item in old_new_types:
                HostsToRecover.add(item[0], scope='module')

            exceptions = []
            try:
                LOG.fixture_step("(class) Delete created storage profiles")
                while profiles_created:
                    storage_helper.delete_storage_profile(
                        profile=profiles_created.pop())

            except Exception as e:
                LOG.exception(e)
                exceptions.append(e)

            try:
                LOG.fixture_step(
                    "(class) Revert local storage backing for {}".format(
                        old_new_types))
                while old_new_types:
                    host_to_revert, old_type, _ = old_new_types.pop()
                    LOG.info("Revert {} local storage to {}".format(
                        host_to_revert, old_type))
                    host_helper.set_host_storage_backing(host=host_to_revert,
                                                         inst_backing=old_type,
                                                         unlock=True)

            except Exception as e:
                LOG.exception(e)
                exceptions.append(e)

            assert not exceptions, "Failure occurred. Errors: {}".format(
                exceptions)
Example #18
def test_controllerfs_mod_when_host_locked():
    """
    This test attempts to modify controllerfs value while one of the
    controllers is locked.  All controller filesystem modification attempts
    should be rejected when any one of the controllers is not available.

    Arguments:
    - None

    Test Steps:
    1.  Lock standby controller or only controller (in the case of AIO systems)
    2.  Attempt to modify controller filesystem.  This should be rejected.

    Assumptions:
    - None

    Teardown:
    - Unlock controller
    """

    if system_helper.is_aio_simplex():
        target_host = "controller-0"
    else:
        target_host = system_helper.get_standby_controller_name()

    host_helper.lock_host(target_host)
    HostsToRecover.add(target_host, scope="function")

    drbdfs_val = {}
    fs = "database"
    LOG.tc_step("Determine the current filesystem size")
    drbdfs_val[fs] = storage_helper.get_controllerfs_values(fs)[0]
    LOG.info("Current value of {} is {}".format(fs, drbdfs_val[fs]))
    drbdfs_val[fs] = int(drbdfs_val[fs]) + 1
    LOG.info("Will attempt to increase the value of {} to {}".format(
        fs, drbdfs_val[fs]))

    LOG.tc_step("Increase the size of filesystems")
    code = storage_helper.modify_controllerfs(fail_ok=True, **drbdfs_val)[0]
    assert 1 == code, "Filesystem modify succeeded while failure is expected: {}".format(
        drbdfs_val)
Example #19
def ensure_sufficient_4k_pages(request):
    """
    Ensure there are enough 4k pages on the processors of two hypervisors
    with the given storage backing

    Returns:

    """
    # 600000 4k pages on any processor means more than 2G (~536871 4k pages) in total.

    storage_backing = request.param
    hypervisors = host_helper.get_hosts_in_storage_backing(
        storage_backing=storage_backing)
    if len(hypervisors) < 2:
        skip("Less than two hypersvisors with {} instance backing".format(
            storage_backing))

    hypervisors = hypervisors[:2]
    LOG.fixture_step(
        "Configure {} with sufficient 4k pages".format(hypervisors))

    for host in hypervisors:
        LOG.fixture_step(
            "Modify 4k page numbers to 600000 for {}".format(host))
        num_4k_pages = host_helper.get_host_memories(host, 'app_total_4K')
        for proc, pages_4k in num_4k_pages.items():
            if pages_4k[0] > 1024 * 1024 / 4:
                break
        else:
            proc_to_set = 1 if len(num_4k_pages) > 1 else 0
            HostsToRecover.add(host, scope='module')
            host_helper.lock_host(host, swact=True)
            host_helper.modify_host_memory(host,
                                           proc=proc_to_set,
                                           gib_4k_range=(2, 4))
            host_helper.unlock_host(host,
                                    check_hypervisor_up=True,
                                    check_webservice_up=True)

    return storage_backing, hypervisors
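
Since the fixture reads request.param, it is presumably parametrized indirectly; a hypothetical wiring (the backing names are assumptions):

from pytest import fixture, mark


@fixture(params=['local_image', 'remote'])
def ensure_sufficient_4k_pages(request):
    ...  # body as shown above


def test_with_sufficient_4k_pages(ensure_sufficient_4k_pages):
    storage_backing, hypervisors = ensure_sufficient_4k_pages
    ...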
Example #20
def test_apply_storage_profile_negative(create_storage_profile, personality):

    if personality == 'controller':
        host_name = system_helper.get_standby_controller_name()
        assert host_name, "No standby controller available on system"
    else:
        host_name = host_helper.get_up_hypervisors()[0]

    # For storage systems, skip test if ceph isn't healthy
    if len(system_helper.get_storage_nodes()) > 0:
        ceph_healthy = storage_helper.is_ceph_healthy()
        if not ceph_healthy:
            skip('Skipping due to ceph not being healthy')

    profile_name = create_storage_profile['profile_name']
    origin_disk_num = create_storage_profile['disk_num']
    disks_num = len(storage_helper.get_host_disks(host_name, 'device_node'))

    # Expected rejection messages; an extra one applies when the profile
    # has more disks than the host.
    expt_err_list = [
        "Please check if host's disks match profile criteria",
        "Failed to create storage function. Host personality must be 'storage'",
    ]
    if disks_num < origin_disk_num - 1:
        expt_err_list.append("profile has more disks than host does")

    positional_arg = host_name + ' ' + profile_name

    HostsToRecover.add(host_name)
    host_helper.lock_host(host_name, swact=True)
    exitcode, output = cli.system('host-apply-storprofile',
                                  positional_arg,
                                  fail_ok=True)
    host_helper.unlock_host(host_name)

    assert exitcode == 1 and any(expt in output for expt in expt_err_list)
Example #21
def _test_status_firewall_reboot():
    """
    Test iptables status after reboot of controller

    Test Steps:
        - Stop iptables service
        - Confirm iptables service has stopped
        - Reboot the controller being tested
        - Confirm iptables service is online
        - Repeat for second controller
    """
    LOG.tc_step("Getting the controller(s)")
    controllers = system_helper.get_controllers()
    for controller in controllers:
        with host_helper.ssh_to_host(controller) as con_ssh:
            LOG.tc_step("Stopping iptables service")
            cmd = 'service iptables stop'
            con_ssh.exec_sudo_cmd(cmd)
            LOG.tc_step("checking iptables status")
            cmd = 'service iptables status'
            code, output = con_ssh.exec_sudo_cmd(cmd)
            # Check membership per-string; a bare string literal on the left
            # of 'or' is always truthy and would make the assert a no-op.
            assert 'Active: inactive' in output or 'Active: failed' in output, \
                "iptables service did not stop running on host {}".format(controller)

        LOG.tc_step("Rebooting {}".format(controller))
        HostsToRecover.add(controller)
        host_helper.reboot_hosts(controller)

        with host_helper.ssh_to_host(controller) as con_ssh:
            LOG.tc_step(
                "Checking iptables status on host {} after reboot".format(
                    controller))
            cmd = 'service iptables status | grep --color=never Active'
            code, output = con_ssh.exec_sudo_cmd(cmd)
            assert 'active' in output, "iptables service did not start after reboot on host {}".format(
                controller)
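
A hypothetical, stricter variant of the stop check above, run inside the same ssh_to_host block, could query systemctl directly:

code, output = con_ssh.exec_sudo_cmd('systemctl is-active iptables')
assert output.strip() in ('inactive', 'failed'), \
    "iptables service still active on {}".format(controller)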
Example #22
def test_attempt_host_unlock_during_partition_creation():
    """
    This test attempts to unlock a host while a partition is being created.  It
    is expected to fail.

    Assumptions:
    * There's some free disk space available

    Test steps:
    * Query the hosts to determine disk space
    * Lock host
    * Create a partition but don't wait for completion
    * Attempt to unlock the host that is hosting the partition that is created

    Teardown:
    * Delete created partitions

    DISABLED since unlock while creating is not blocked.

    """

    global partitions_to_restore
    partitions_to_restore = {}

    computes = system_helper.get_hosts(personality="compute")
    hosts = system_helper.get_controllers() + computes

    # Filter out active controller
    active_controller = system_helper.get_active_controller_name()
    print("This is active controller: {}".format(active_controller))
    hosts.remove(active_controller)

    usable_disks = False
    for host in hosts:
        disks = storage_helper.get_host_disks(host)
        free_disks = storage_helper.get_host_disks_with_free_space(host, disks)
        if not free_disks:
            continue

        for uuid in free_disks:
            size_gib = float(free_disks[uuid])
            if size_gib < 2.0:
                LOG.info("Skip this disk due to insufficient space")
                continue

            LOG.tc_step("Lock {} and create a partition for disk {}".format(
                host, uuid))
            HostsToRecover.add(host)
            host_helper.lock_host(host)
            usable_disks = True
            LOG.info("Creating partition on {}".format(host))
            rc, out = storage_helper.create_host_partition(host,
                                                           uuid,
                                                           int(size_gib),
                                                           wait=False)
            uuid = table_parser.get_value_two_col_table(
                table_parser.table(out), "uuid")
            partitions_to_restore[host] = []
            partitions_to_restore[host].append(uuid)

            LOG.tc_step(
                "Attempt to unlock host and ensure it's rejected when partition is "
                "being created")
            rc_ = host_helper.unlock_host(host,
                                          fail_ok=True,
                                          check_first=False)[0]
            assert rc_ != 0, "Unlock attempt unexpectedly passed"

            LOG.tc_step("wait for partition to be created")
            storage_helper.wait_for_host_partition_status(host=host,
                                                          uuid=uuid,
                                                          timeout=CP_TIMEOUT)

            container_helper.wait_for_apps_status(apps='platform-integ-apps',
                                                  status=AppStatus.APPLIED,
                                                  check_interval=10)
            # Only test one disk on each host
            break
        # Do it on one host only
        break

    if not usable_disks:
        skip("Did not find disks with sufficient space to test with.")
Example #23
def test_patch_orch_with_ignored_alarms(patch_orchestration_setup, patch_function_check, ignored_alarm_texts):
    """
    This test verifies the patch orchestration operation in the presence of alarms that are normally ignored by the
    orchestration. These alarms are '200.001', '700.004', '900.001', '900.005' and '900.101'. This test generates the
    host lock alarm (200.001) and the VM stopped alarm (700.004) before executing the patch orchestration.
    Args:
        patch_orchestration_setup:
        patch_function_check:
        ignored_alarm_texts:

    Returns:

    """
    vms = patch_function_check
    patches, controllers, computes, storages = patch_orchestration_setup
    hosts = controllers + computes + storages
    patch_id = patching_helper.parse_test_patches(patch_ids=patches, search_str='INSVC_ALLNODES')[0]

    if 'HOST_LOCK' in ignored_alarm_texts and len(hosts) < 2:
        skip("Not enough hosts present in the system")

    if 'HOST_LOCK' in ignored_alarm_texts:
        host = hosts[-1]
        HostsToRecover.add(host)
        LOG.info("Lock host {} to generate 200.001 alarm".format(host))
        host_helper.lock_host(host)
        system_helper.wait_for_alarm(alarm_id='200.001', fail_ok=False)
        LOG.info("Host {} is locked and 200.001 alarm is generated".format(host))

    vm_id_to_stop = None
    if 'VM_STOP' in ignored_alarm_texts:
        vm_id_to_stop = vms[0]
        LOG.info("Stop VM {} to generate 700.004 alarm".format(vm_id_to_stop))
        vm_helper.stop_vms(vm_id_to_stop)
        system_helper.wait_for_alarm(alarm_id='700.004')

    patch_file = patches[patch_id]

    LOG.tc_step("Upload patch file {}".format(patch_file))
    uploaded_id = patching_helper.upload_patches(patch_files=patch_file)[1][0]
    assert patch_id == uploaded_id, "Expected patch {} and uploaded patch {} mismatch".format(patch_id, uploaded_id)
    LOG.info("Patch {} uploaded".format(uploaded_id))

    LOG.tc_step("Apply patch {}".format(uploaded_id))
    applied = patching_helper.apply_patches(patch_ids=[uploaded_id])[1]
    LOG.info("Patch {} applied".format(applied))

    LOG.tc_step("Install patch {} through orchestration".format(uploaded_id))
    patching_helper.wait_for_affecting_alarms_gone()
    run_patch_orchestration_strategy()
    LOG.info("Install patch through orchestration completed for patch {}".format(applied))
    host_helper.wait_for_hosts_ready(hosts=hosts)

    LOG.tc_step("Check vms after patch is installed.")
    if vm_id_to_stop:
        vm_helper.start_vms(vm_id_to_stop)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id_to_stop)
    check_vms(vms)

    LOG.tc_step("Remove test patch {}".format(applied))
    if vm_id_to_stop:
        vm_helper.stop_vms(vm_id_to_stop)

    patching_helper.remove_patches(patch_ids=applied)

    LOG.tc_step("Remove patch through orchestration: {}".format(applied))
    run_patch_orchestration_strategy(alarm_restrictions='relaxed')
    LOG.info("Apply/Remove through patch orchestration completed for patch {}".format(applied))

    LOG.tc_step("Check vms after patch removed: {}.".format(applied))
    if vm_id_to_stop:
        vm_helper.start_vms(vm_id_to_stop)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id_to_stop)
    check_vms(vms)
Example #24
def test_sensorgroup_power_cycle(host,
                                 eventlevel,
                                 action,
                                 expected_host_state,
                                 expected_alarm_state,
                                 event_type,
                                 suppressionlevel, sensor_data_fit):
    """
    Verify that the sensorgroup action taken for an event is valid.

    Test Steps:
        - Get a sensorgroup to test
        - Set the event level and expected action
        - trigger an out-of-scope event for that sensorgroup
        - verify that the expected action is taken

    """
    bmc_hosts = sensor_data_fit
    if host not in bmc_hosts:
        skip("{} is not configured with BMC sensor".format(host))

    global HOST
    HOST = host

    if suppressionlevel == 'suppressed':
        # global SUPPRESSED
        # SUPPRESSED = host
        suppress = True
    else:
        suppress = False

    expt_severity = eventlevel.split('_')[-1] if 'yes' in expected_alarm_state else None

    # Get a sensor to validate
    sensorgroup_name = random.choice(bmc_helper.get_sensor_names(host, sensor_group=True))
    for i in range(4):
        LOG.info("################## iter {} #########################".format(i+1))
        LOG.tc_step("Validating that sensorgroup: {} "
                    "can be set to sensor action: {} "
                    "for event level: {}".format(sensorgroup_name, action,
                                                 eventlevel))

        # Set the event level and action
        bmc_helper.modify_sensorgroup(host, sensorgroup_name, value='name', suppress=suppress, audit_interval=10,
                                      **{eventlevel: action})

        # Get a sensor that is part of the sensorgroup
        sensor_name = bmc_helper.get_first_sensor_from_sensorgroup(sensorgroup_name, host)
        entity_id = 'host={}.sensor={}'.format(host, sensor_name)

        LOG.tc_step("Trigger event for sensorgroup: {} and sensor name: {}".
                    format(sensorgroup_name, sensor_name))
        if action in ['power-cycle', 'reset']:
            HostsToRecover.add(host)

        start_time = common.get_date_in_format()
        bmc_helper.trigger_event(host, sensor_name, event_type)

        LOG.tc_step("Check sensor status and alarm for {}".format(sensor_name))
        if expected_alarm_state == 'yes_alarm':
            system_helper.wait_for_alarm(alarm_id=EventLogID.BMC_SENSOR_ACTION, entity_id=entity_id,
                                         severity=expt_severity, timeout=60, strict=False, fail_ok=False)
        else:
            events = system_helper.wait_for_events(timeout=60, num=10, event_log_id=EventLogID.BMC_SENSOR_ACTION,
                                                   entity_instance_id=entity_id, start=start_time, state='log',
                                                   severity=expt_severity, fail_ok=True, strict=False)
            if expected_alarm_state == 'yes_log':
                assert events, "No event log found for {} {} {} event".format(host, sensorgroup_name, eventlevel)
            else:
                assert not events, "Event logged unexpectedly for sensor on {}".format(host)
                system_helper.wait_for_alarm_gone(EventLogID.BMC_SENSOR_ACTION, entity_id=entity_id, strict=False,
                                                  timeout=5, fail_ok=False)

        LOG.tc_step("Check the host status for sensor: {}".format(sensor_name))
        host_state_timeout = 120
        if action == 'reset':
            host_state_timeout = 1080  # 15 min reset interval in between two reset triggers
        system_helper.wait_for_host_values(host, timeout=host_state_timeout, fail_ok=False,
                                           availability=expected_host_state)
        if action == 'power-cycle':
            system_helper.wait_for_host_values(host, timeout=20, task=HostTask.POWER_CYCLE, strict=False)

        LOG.tc_step("Check the alarm clears and host in available state after clearing events")
        bmc_helper.clear_events(host)
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.BMC_SENSOR_ACTION, entity_id=host, strict=False,
                                          timeout=60)
        wait_time = 3000 if action == 'power-cycle' else HostTimeout.REBOOT
        expt_states = {'availability': 'available'}
        strict = True
        if action == 'power-cycle' and i == 3:
            wait_time = 1200
            strict = False
            expt_states = {'availability': HostAvailState.POWER_OFF,
                           'operational': HostOperState.DISABLED,
                           'administrative': HostAdminState.UNLOCKED,
                           'task': HostTask.POWER_DOWN}

        system_helper.wait_for_host_values(host, fail_ok=False, timeout=wait_time, strict=strict, **expt_states)

    LOG.tc_step("Power on {} after test ends".format(host))
    host_helper.lock_host(host=host)
    host_helper.power_on_host(host=host)
    HOST = ''
Example #25
def _test_increase_ceph_mon():
    """
    Increase the size of ceph-mon.  Only applicable to a storage system.

    Fails until CGTS-8216

    Test steps:
    1.  Determine the current size of ceph-mon
    2.  Attempt to modify ceph-mon to invalid values
    3.  Check if there is free space to increase ceph-mon
    4.  Attempt to increase ceph-mon
    5.  Wait for config out-of-date alarms to raise
    6.  Lock/unlock all affected nodes (controllers and storage)
    7.  Wait for alarms to clear
    8.  Check that ceph-mon has the correct updated value

    Enhancement:
    1.  Possibly check there is enough disk space for ceph-mon to increase.  Not sure if
    this is required since there always seems to be some space on the rootfs.

    """
    table_ = table_parser.table(cli.system("ceph-mon-list")[1])
    ceph_mon_gib = table_parser.get_values(table_, "ceph_mon_gib",
                                           **{"hostname": "controller-0"})[0]
    LOG.info("ceph_mon_gib is currently: {}".format(ceph_mon_gib))

    LOG.tc_step("Attempt to modify ceph-mon to invalid values")
    invalid_cmg = ['19', '41', 'fds']
    for value in invalid_cmg:
        host = "controller-0"
        cli.system("ceph-mon-modify {} ceph_mon_gib={}".format(host, value),
                   fail_ok=True)

    if int(ceph_mon_gib) >= 30:
        skip("Insufficient disk space to execute test")

    ceph_mon_gib_avail = 40 - int(ceph_mon_gib)
    new_ceph_mon_gib = math.trunc(ceph_mon_gib_avail / 10) + int(ceph_mon_gib)

    LOG.tc_step("Increase ceph_mon_gib to {}".format(new_ceph_mon_gib))
    hosts = system_helper.get_controllers()
    for host in hosts:
        cli.system("ceph-mon-modify {} ceph_mon_gib={}".format(
            host, new_ceph_mon_gib))
        # We only need to do this for one controller now and it applies to both
        break

    LOG.info("Wait for expected alarms to appear")
    storage_hosts = system_helper.get_storage_nodes()
    total_hosts = hosts + storage_hosts
    for host in total_hosts:
        system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                     entity_id="host={}".format(host))

    LOG.tc_step("Lock/unlock all affected nodes")
    for host in storage_hosts:
        HostsToRecover.add(host)
        host_helper.lock_host(host)
        host_helper.unlock_host(host)
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
            entity_id="host={}".format(host))
        time.sleep(10)

    standby = system_helper.get_standby_controller_name()
    active = system_helper.get_active_controller_name()
    HostsToRecover.add(standby)
    host_helper.lock_host(standby)
    host_helper.unlock_host(standby)
    system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                      entity_id="host={}".format(standby))
    time.sleep(10)
    host_helper.swact_host(active)
    HostsToRecover.add(active)
    host_helper.lock_host(active)
    host_helper.unlock_host(active)
    system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                      entity_id="host={}".format(active))

    table_ = table_parser.table(cli.system("ceph-mon-list")[1])
    ceph_mon_gib = table_parser.get_values(table_, "ceph_mon_gib",
                                           **{"hostname": "controller-0"})[0]
    # The re-queried value should now match the requested size; comparing
    # the string to an int with != would always pass.
    assert int(ceph_mon_gib) == new_ceph_mon_gib, "ceph-mon did not change"
Example #26
def _test_storage_profile(personality, from_backing, to_backing):
    """
    This test creates a storage profile and then applies it to a node with
    identical hardware, assuming one exists.

    Storage profiles do not apply on controller nodes.  Storage profiles can be
    applied on controller+compute nodes, compute nodes and storage nodes.

    Arguments:
    - personality (string) - controller, compute or storage
    - from_backing (string) - image, remote or None
    - to_backing (string) - image, remote or None

    Test Steps:
    1.  Query system and determine which nodes have compatible hardware.
    2.  Create a storage profile on one of those nodes
    3.  Apply the created storage profile on a compatible node*
    4.  Ensure the storage profiles have been successfully applied.

    * If the node is a compute node or a controller+compute, we will also change
      the backend if required for additional coverage.

    Returns:
    - Nothing
    """

    global PROFILES_TO_DELETE
    PROFILES_TO_DELETE = []

    # Skip if test is not applicable to hardware under test
    if personality == 'controller' and not system_helper.is_aio_system():
        skip("Test does not apply to controller hosts without subtype compute")

    hosts = system_helper.get_hosts(personality=personality)
    if not hosts:
        skip("No hosts of type {} available".format(personality))

    if (from_backing == "remote" or to_backing
            == "remote") and not system_helper.is_storage_system():
        skip("This test doesn't apply to systems without storage hosts")

    LOG.tc_step("Identify hardware compatible hosts")
    hash_to_hosts = get_hw_compatible_hosts(hosts)

    # Pick the hardware group that has the most compatible hosts
    current_size = 0
    candidate_hosts = []
    for value in hash_to_hosts:
        candidate_size = len(hash_to_hosts[value])
        if candidate_size > current_size:
            current_size = candidate_size
            candidate_hosts = hash_to_hosts[value]
    LOG.info(
        "This is the total set of candidate hosts: {}".format(candidate_hosts))

    if len(candidate_hosts) < 2:
        skip("Insufficient hardware compatible hosts to run test")

    # Rsync lab setup dot files between controllers
    con_ssh = ControllerClient.get_active_controller()
    _rsync_files_to_con1(con_ssh=con_ssh, file_to_check="force.txt")

    # Take the hardware compatible hosts and check if any of them already have
    # the backend that we want.  This will save us test time.
    new_to_backing = None
    if personality == "compute":
        from_hosts = []
        to_hosts = []
        for host in candidate_hosts:
            host_backing = host_helper.get_host_instance_backing(host)
            if host_backing == from_backing:
                from_hosts.append(host)
            elif host_backing == to_backing:
                to_hosts.append(host)
            else:
                pass
        LOG.info(
            "Candidate hosts that already have the right from backing {}: {}".
            format(from_backing, from_hosts))
        LOG.info(
            "Candidate hosts that already have the right to backing {}: {}".
            format(to_backing, to_hosts))

        # Determine what hosts to use
        if not from_hosts and to_hosts:
            to_host = random.choice(to_hosts)
            candidate_hosts.remove(to_host)
            from_host = random.choice(candidate_hosts)
        elif not to_hosts and from_hosts:
            from_host = random.choice(from_hosts)
            candidate_hosts.remove(from_host)
            to_host = random.choice(candidate_hosts)
        elif not to_hosts and not from_hosts:
            to_host = random.choice(candidate_hosts)
            candidate_hosts.remove(to_host)
            from_host = random.choice(candidate_hosts)
        else:
            to_host = random.choice(to_hosts)
            from_host = random.choice(from_hosts)

        LOG.info("From host is: {}".format(from_host))
        LOG.info("To host is: {}".format(to_host))

        LOG.tc_step(
            "Check from host backing and convert to {} if necessary".format(
                from_backing))
        host_helper.set_host_storage_backing(from_host, from_backing)
        system_helper.wait_for_host_values(
            from_host,
            availability=HostAvailState.AVAILABLE,
            timeout=120,
            fail_ok=False)

        LOG.tc_step(
            "Check to host backing and convert to {} if necessary".format(
                to_backing))
        new_to_backing = host_helper.set_host_storage_backing(
            to_host, to_backing)
    elif personality == "controller":
        # For now, we don't want to host reinstall controller-0 since it will default to
        # pxeboot, but this could be examined as a possible enhancement.
        from_host = "controller-0"
        to_host = "controller-1"

        LOG.info("From host is: {}".format(from_host))
        LOG.info("To host is: {}".format(to_host))

        LOG.tc_step(
            "Check from host backing and convert to {} if necessary".format(
                from_backing))
        host_helper.set_host_storage_backing(from_host, from_backing)

        LOG.tc_step(
            "Check to host backing and convert to {} if necessary".format(
                to_backing))
        new_to_backing = host_helper.set_host_storage_backing(
            to_host, to_backing)
    else:
        # Backing doesn't apply to storage nodes so just pick from compatible hardware
        from_host = random.choice(candidate_hosts)
        candidate_hosts.remove(from_host)
        to_host = random.choice(candidate_hosts)

    LOG.tc_step(
        "Create storage and interface profiles on the from host {}".format(
            from_host))
    prof_name = 'storprof_{}_{}'.format(
        from_host, time.strftime('%Y%m%d_%H%M%S', time.localtime()))
    storage_helper.create_storage_profile(from_host, profile_name=prof_name)
    PROFILES_TO_DELETE.append(prof_name)

    # Deleting VMs in case the remaining host(s) cannot handle all VMs
    # migrating on lock, particularly important in the case of AIO-DX systems.
    LOG.tc_step(
        "Delete all VMs and lock the host before applying the storage profile")
    vm_helper.delete_vms()
    HostsToRecover.add(to_host, scope='function')
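    # Register to_host so teardown can recover it if the test aborts while
    # the host is still locked or mid-configuration.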
    system_helper.wait_for_host_values(from_host,
                                       availability=HostAvailState.AVAILABLE,
                                       timeout=120,
                                       fail_ok=False)
    system_helper.wait_for_host_values(to_host,
                                       availability=HostAvailState.AVAILABLE,
                                       timeout=120,
                                       fail_ok=False)

    # Negative test #1 - attempt to apply profile on unlocked host (should be rejected)
    LOG.tc_step('Apply the storage-profile {} onto unlocked host:{}'.format(
        prof_name, to_host))
    cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
    rc, msg = cli.system(cmd, fail_ok=True)
    assert rc != 0, msg
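    # swact=True moves services off to_host first in case it is the active
    # controller, which cannot be locked directly.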
    host_helper.lock_host(to_host, swact=True)

    # 3 conditions to watch for: no partitions, ready partitions and in-use
    # partitions on the compute.  If in-use, delete and freshly install host.
    # If ready, delete all ready partitions to make room for potentially new
    # partitions.  If no partitions, just delete nova-local lvg.
    if personality == "compute":

        # Negative test #2 - attempt to apply profile onto host with existing
        # nova-local (should be rejected)
        LOG.tc_step(
            'Apply the storage-profile {} onto host with existing nova-local:{}'
            .format(prof_name, to_host))
        cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
        rc, msg = cli.system(cmd, fail_ok=True)
        assert rc != 0, msg

        # If we were simply switching backing (without applying a storage
        # profile), the nova-local lvg deletion can be omitted according to design
        LOG.tc_step("Delete nova-local lvg on to host {}".format(to_host))
        cli.system("host-lvg-delete {} nova-local".format(to_host))

        in_use = storage_helper.get_host_partitions(to_host, "In-Use")

        if in_use:

            # Negative test #3 - attempt to apply profile onto host with existing
            # in-use partitions (should be rejected)
            LOG.tc_step('Apply the storage-profile {} onto host with existing '
                        'in-use partitions: {}'.format(prof_name, to_host))
            cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
            rc, msg = cli.system(cmd, fail_ok=True)
            assert rc != 0, msg

            LOG.tc_step(
                "In-use partitions found.  Must delete the host and freshly install before proceeding."
            )
            LOG.info("Host {} has in-use partitions {}".format(
                to_host, in_use))
            lab = InstallVars.get_install_var("LAB")
            lab.update(create_node_dict(lab['compute_nodes'], 'compute'))
            lab['boot_device_dict'] = create_node_boot_dict(lab['name'])
            install_helper.open_vlm_console_thread(to_host)

            LOG.tc_step("Delete the host {}".format(to_host))
            cli.system("host-bulk-export")
            cli.system("host-delete {}".format(to_host))
            # Confirm the deletion actually removed the host from inventory
            assert to_host not in system_helper.get_hosts(), \
                "Host {} still present after deletion".format(to_host)

            cli.system("host-bulk-add hosts.xml")
            system_helper.wait_for_host_values(
                to_host, timeout=6000, availability=HostAvailState.ONLINE)

            wait_for_disks(to_host)

        ready = storage_helper.get_host_partitions(to_host, "Ready")
        if ready:
            LOG.tc_step(
                "Ready partitions have been found.  Must delete them before profile application"
            )
            LOG.info("Host {} has Ready partitions {}".format(to_host, ready))
            for uuid in reversed(ready):
                storage_helper.delete_host_partition(to_host, uuid)
            # Don't bother restoring in this case since the system should be
            # functional after profile is applied.

        LOG.tc_step('Apply the storage-profile {} onto host:{}'.format(
            prof_name, to_host))
        cli.system('host-apply-storprofile {} {}'.format(to_host, prof_name))

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

        to_host_backing = host_helper.get_host_instance_backing(to_host)
        LOG.info("To host backing was {} and is now {}".format(
            new_to_backing, to_host_backing))
        assert to_host_backing == from_backing, "Host backing was not changed on storage profile application"

    if personality == "storage":
        if not storage_helper.is_ceph_healthy():
            skip("Cannot run test when ceph is not healthy")

        LOG.tc_step("Delete the host {}".format(to_host))
        cli.system("host-bulk-export")
        cli.system("host-delete {}".format(to_host))
        cli.system("host-bulk-add hosts.xml")
        system_helper.wait_for_host_values(to_host,
                                           timeout=6000,
                                           availability=HostAvailState.ONLINE)

        wait_for_disks(to_host)

        LOG.tc_step('Apply the storage-profile {} onto host:{}'.format(
            prof_name, to_host))
        cli.system('host-apply-storprofile {} {}'.format(to_host, prof_name))

        # Re-provision interfaces through lab_setup.sh
        LOG.tc_step("Reprovision the host as necessary")
        files = ['interfaces']
        con_ssh = ControllerClient.get_active_controller()
        delete_lab_setup_files(con_ssh, to_host, files)

        rc, msg = install_helper.run_lab_setup()
        assert rc == 0, msg

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

    if personality == "controller":

        # Note, install helper doesn't work on all labs.  Some labs don't
        # display BIOS type which causes install helper to fail
        lab = InstallVars.get_install_var("LAB")
        lab.update(create_node_dict(lab['controller_nodes'], 'controller'))
        lab['boot_device_dict'] = create_node_boot_dict(lab['name'])
        install_helper.open_vlm_console_thread(to_host)

        LOG.tc_step("Delete the host {}".format(to_host))
        cli.system("host-bulk-export")
        cli.system("host-delete {}".format(to_host))
        # Confirm the deletion actually removed the host from inventory
        assert to_host not in system_helper.get_hosts(), \
            "Host {} still present after deletion".format(to_host)

        cli.system("host-bulk-add hosts.xml")
        system_helper.wait_for_host_values(to_host,
                                           timeout=6000,
                                           availability=HostAvailState.ONLINE)

        wait_for_disks(to_host)

        LOG.tc_step("Apply the storage-profile {} onto host:{}".format(
            prof_name, to_host))
        cli.system("host-apply-storprofile {} {}".format(to_host, prof_name))

        # Need to re-provision everything on node through lab_setup (except storage)
        LOG.tc_step("Reprovision the host as necessary")
        files = [
            'interfaces', 'cinder_device', 'vswitch_cpus', 'shared_cpus',
            'extend_cgts_vg', 'addresses'
        ]
        con_ssh = ControllerClient.get_active_controller()
        delete_lab_setup_files(con_ssh, to_host, files)

        rc, msg = install_helper.run_lab_setup()
        assert rc == 0, msg

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

        to_host_backing = host_helper.get_host_instance_backing(to_host)
        LOG.info("To host backing was {} and is now {}".format(
            new_to_backing, to_host_backing))
        assert to_host_backing == from_backing, "Host backing was not changed on storage profile application"
Exemple #27
0
def test_storgroup_semantic_checks():
    """
    This test validates CEPH semantic checks as they apply to storage nodes in
    a replication group.

    Args:
        - None

    Setup:
        - Requires a system with storage nodes (minimum of 2)
        - Requires TiS Release 3 and up

    Test Steps:
        1.  Lock one storage node in a storage node pair
        2.  Check the appropriate alarms are raised
        3.  Check OSDs are down on the storage node
        4.  Check that CEPH is no longer healthy
        5.  Attempt to lock the other node and ensure it is rejected
        6.  Attempt to force lock the other node and ensure it is rejected
        7.  If the storage node is a storage monitor, attempt to lock and force
            lock the controllers
        8.  Unlock the storage node in the storage node pair
        9.  Check that the alarms are cleared
        10.  Check that OSDs are up
        11.  Check that CEPH is healthy

    Defects this addresses:
        1.  CGTS-4286 Unexpected allowing lock action on storage node peergroup
            when redundancy lost
        2.  CGTS-3494 Some OSDs observed to be up on locked storage node
        3.  CGTS-3643 Able to lock standby controller despite only two CEPH
            monitors being available
        4.  CGTS-2690 Storage: Force locking a controller should be rejected when storage
            is locked.
    """

    con_ssh = ControllerClient.get_active_controller()

    table_ = table_parser.table(cli.system('storage-backend-show ceph-store')[1])
    capabilities = table_parser.get_value_two_col_table(table_, 'capabilities')
    # capabilities is rendered as a dict-like string, assumed here to look
    # like "{'replication': '2', ...}"; parse the replication factor out of
    # it rather than indexing into the raw value, which would only return a
    # single character
    match = re.search(r"(?<!_)replication\D*(\d+)", str(capabilities))
    assert match, "Cannot parse replication factor from: {}".format(capabilities)
    replication_factor = match.group(1)
    LOG.info("The replication factor is: {}".format(replication_factor))

    # We want to test storage-0 since it is a ceph monitor
    # Then we want to test another storage host in another group.  The choice
    # depends on the replication factor.
    storage_nodes = ["storage-0"]
    if replication_factor == "3":
        storage_nodes.append("storage-3")

    if replication_factor == "2" and len(storage_nodes) > 2:
        storage_nodes.append("storage-2")

    LOG.info("Storage hosts under test are: {}".format(storage_nodes))

    for host in storage_nodes:
        LOG.tc_step('Lock {}:'.format(host))
        HostsToRecover.add(host, scope='function')
        rtn_code, out = host_helper.lock_host(host)
        assert rtn_code == 0, out

        LOG.tc_step("Verify CEPH cluster health reflects the OSD being down")
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy, "ceph is not healthy"

        LOG.tc_step('Check that alarms are raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that the loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that the ceph health warning alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        hosts = []
        if host == 'storage-0':
            hosts.append('controller-0')
            hosts.append('controller-1')

        for node in hosts:
            LOG.tc_step('Attempt to lock the {}'.format(node))
            HostsToRecover.add(node)
            rtn_code, out = host_helper.lock_host(node, fail_ok=True)
            assert 1 == rtn_code, out

            LOG.tc_step('Attempt to force lock {}'.format(node))
            rtn_code, out = host_helper.lock_host(node, force=True, fail_ok=True)
            assert 1 == rtn_code, out

        LOG.tc_step('Unlock storage host {}'.format(host))
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        LOG.info("Check if alarms have cleared")
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert ceph_healthy, "ceph is not healthy"

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg
Exemple #28
0
def _test_system_alarm_on_host_lock():
    """
    Verify fm event-list command in the system upon host-lock

    Scenario:
    1. Execute "fm alarm-list" command in the system.
    2. Lock one compute and wait 30 seconds.
    3. Verify commands return list of active alarms in table with expected
    rows.
    """

    LOG.info("Execute fm alarm-list. Verify header of " +
             "a table consist of correct items")

    # Get and save the list of existing alarms present in the system
    res, out = cli.fm('alarm-list')
    alarm_list = table_parser.table(out)

    if len(alarm_list['values']) == 0:
        LOG.info("There are no alarms present in the system")

    current_alarms = []
    for alarm in alarm_list['values']:
        if re.match(".", alarm[0].strip()) is not None:
            current_alarms.append(alarm[0])
            LOG.info("The current alarms in the system are: "
                     "{0}".format(alarm[0]))

    # Get the historical list of alarms
    hist_alarm_table = system_helper.get_events_table(limit=15, show_uuid=True)
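    # show_uuid=True ensures the UUID column is included so the header
    # check below lines up with the expected field list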

    # Check that a valid alarm header is present
    alarm_header = [
        'UUID', 'Time Stamp', 'State', 'Event Log ID', 'Reason Text',
        'Entity Instance ID', 'Severity'
    ]
    if hist_alarm_table['headers'] != alarm_header:
        LOG.info("Fields in table not correct actual {0} expected {1}".format(
            hist_alarm_table['headers'], alarm_header))
        test_res = False

    # Verify the existing alarms are present in the historical list in state 'set'
    for name in current_alarms:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('alarm: %s  state: %s' % (name, alarm_state))
        if alarm_state != ['set']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Raise a new alarm by locking a compute node
    # Get the compute to lock
    LOG.info("Lock compute and wait 30 seconds")
    host = 'compute-1'
    if system_helper.is_aio_duplex():
        host = system_helper.get_standby_controller_name()

    HostsToRecover.add(host, scope='function')
    host_helper.lock_host(host)
    time.sleep(30)

    # Verify the new alarm is present in the historical alarm and active alarm lists
    LOG.info("Verify alarm-list command returns list of active alarms")
    res, out = cli.fm('alarm-list')
    new_active_alarm_table = table_parser.table(out)

    if len(new_active_alarm_table['values']) == 0:
        LOG.info("There are no active alarms present in the system")

    # Save the list of new alarms present in the list
    new_alarms = []
    for alarm in new_active_alarm_table['values']:
        if (re.match(".", alarm[0].strip()) is not None):
            new_alarms.append(alarm[0])
            LOG.info("The alarm ID in the alarm list table is: "
                     "{0}".format(alarm[0]))

    # Identify the new alarms
    new_alarm_list = list(set(new_alarms) - set(current_alarms))
    LOG.info(new_alarm_list)

    # Verify the new alarms are present in the historical list in state 'set'
    # Get the historical list of alarms
    hist_alarm_table = system_helper.get_events_table(limit=15, show_uuid=True)

    for name in new_alarm_list:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('new alarm: %s  state: %s' % (name, alarm_state))
        if alarm_state != ['set']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Clear the alarm by unlocking the compute node
    LOG.info("Unlock compute and wait 30 seconds")
    host_helper.unlock_host(host)
    time.sleep(30)

    # Verify the alarm clear is shown in the historical table
    LOG.info("Verify event-list command returns list of active alarms")
    hist_alarm_table = system_helper.get_events_table(limit=15, show_uuid=True)

    for name in new_alarm_list:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('new alarm: %s  state: %s' % (name, alarm_state))
        if alarm_state != ['clear']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Verify the alarm disappears from the active alarm table
    LOG.info("Verify alarm-list command returns list of active alarms")
    res, out = cli.fm('alarm-list')
    new_active_alarm_table = table_parser.table(out)

    active_alarms = []
    for alarm in new_active_alarm_table['values']:
        if re.match(".", alarm[0].strip()) is not None:
            active_alarms.append(alarm[0])
            LOG.info("The alarm ID in the alarm list table is: "
                     "{0}".format(alarm[0]))

    # Verify none of the new alarms remain in the active alarm table
    for name in new_alarm_list:
        if name in active_alarms:
            LOG.info("The alarm was not cleared from the active alarm table")
            test_res = False
            break

    assert test_res, "One or more alarm checks failed; see log for details"
Exemple #29
0
def ovs_dpdk_1_core():
    LOG.fixture_step("Review the ovs-dpdk vswitch be in just 1 core")
    vswitch_type = "ovs-dpdk"
    cpu_function = "vswitch"
    proc = "0"
    host_list = host_helper.get_hypervisors()
    for host in host_list:
        with host_helper.ssh_to_host(host) as node_ssh:
            cmd = "cat /proc/meminfo | grep Hugepagesize | awk '{print $2}'"
            hp = int(
                node_ssh.exec_cmd(cmd=cmd, fail_ok=False,
                                  get_exit_code=False)[1])
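        # /proc/meminfo reports Hugepagesize in kB: 1048576 means 1G pages,
        # 2048 means 2M pages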
        mem = host_helper.get_host_memories(
            host=host,
            headers=("app_hp_avail_2M", "app_hp_avail_1G", "mem_avail(MiB)",
                     "vs_hp_total"))
        if hp == 1048576:
            if int(mem[proc][3]) < 2 or int(mem[proc][1]) < 10:
                HostsToRecover.add(hostnames=host, scope="module")
                host_helper.lock_host(host=host)
                if int(mem[proc][3]) < 2:
                    args = ' -f vswitch -1G {} {} {}'.format(2, host, proc)
                    cli.system('host-memory-modify', args)
                    host_helper.modify_host_cpu(host=host,
                                                cpu_function=cpu_function,
                                                **{"p{}".format(proc): 1})
                    # TODO: maybe find a better option than sleep since we
                    # can't reliably wait for the applying state
                    # container_helper.wait_for_apps_status(apps='stx-openstack',
                    #                                       status=AppStatus.APPLYING)
                    time.sleep(60)
                    container_helper.wait_for_apps_status(
                        apps='stx-openstack',
                        status=AppStatus.APPLIED,
                        check_interval=30)
                if int(mem[proc][1]) < 10:
                    args = ' -1G {} {} {}'.format(10, host, proc)
                    cli.system('host-memory-modify', args)
                host_helper.unlock_host(host=host)
        if hp == 2048:
            if int(mem[proc][3]) < 512 or int(mem[proc][0]) < 2500:
                # Mirror the 1G branch so the host is recovered if the
                # fixture fails while the host is locked
                HostsToRecover.add(hostnames=host, scope="module")
                host_helper.lock_host(host=host)
                if int(mem[proc][3]) < 512:
                    system_helper.modify_system(
                        **{"vswitch_type": vswitch_type})
                    vswitch_args = ' -f vswitch -2M {} {} {}'.format(
                        512, host, proc)
                    cli.system('host-memory-modify', vswitch_args)
                    host_helper.modify_host_cpu(host=host,
                                                cpu_function=cpu_function,
                                                **{"p{}".format(proc): 1})
                    # TODO: maybe find a better option than sleep since we
                    # can't reliably wait for the applying state
                    # container_helper.wait_for_apps_status(apps='stx-openstack',
                    #                                       status=AppStatus.APPLIED)
                    time.sleep(60)
                    container_helper.wait_for_apps_status(
                        apps='stx-openstack',
                        status=AppStatus.APPLIED,
                        check_interval=30)
                if int(mem[proc][0]) < 2500:
                    args = ' -2M {} {} {}'.format(2500, host, proc)
                    cli.system('host-memory-modify', args)
                host_helper.unlock_host(host=host)

        test_table = host_helper.get_host_cpu_list_table(host=host)
        curr_assigned_function_list = table_parser.get_values(
            test_table, "assigned_function")
        assert "vSwitch" in curr_assigned_function_list
Exemple #30
0
def test_set_hosts_storage_backing_equal(instance_backing, number_of_hosts):
    """
    Modify hosts storage backing if needed so that system has exact number
    of hosts in given instance backing

    Args:
        instance_backing:
        number_of_hosts:

    Test Steps:
        - Calculate the hosts to be configured based on test params
        - Configure hosts to meet given criteria
        - Check number of hosts in given instance backing is as specified

    """
    host_num_mapping = {'zero': 0, 'one': 1, 'two': 2}
    number_of_hosts = host_num_mapping[number_of_hosts]
    LOG.tc_step("Calculate the hosts to be configured based on test params")
    candidate_hosts = get_candidate_hosts(number_of_hosts=number_of_hosts)

    hosts_with_backing = \
        host_helper.get_hosts_in_storage_backing(instance_backing)
    if len(hosts_with_backing) == number_of_hosts:
        LOG.info("Already have {} hosts in {} backing. Do "
                 "nothing".format(number_of_hosts, instance_backing))
        return

    elif len(hosts_with_backing) < number_of_hosts:
        backing_to_config = instance_backing
        number_to_config = number_of_hosts - len(hosts_with_backing)
        hosts_pool = list(set(candidate_hosts) - set(hosts_with_backing))
    else:
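        # More hosts than requested already use this backing; flip the
        # surplus hosts to the opposite backing to reach the exact count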
        backing_to_config = 'remote' if 'image' in instance_backing else \
            'local_image'
        number_to_config = len(hosts_with_backing) - number_of_hosts
        hosts_pool = hosts_with_backing

    LOG.tc_step("Delete vms if any to prepare for system configuration "
                "change with best effort")
    vm_helper.delete_vms(fail_ok=True)

    hosts_to_config = hosts_pool[0:number_to_config]
    LOG.tc_step("Configure following hosts to {} backing: "
                "{}".format(hosts_to_config, backing_to_config))

    for host in hosts_to_config:
        host_helper.set_host_storage_backing(host=host,
                                             inst_backing=backing_to_config,
                                             unlock=False,
                                             wait_for_configured=False)
        HostsToRecover.add(host)

    host_helper.unlock_hosts(hosts_to_config,
                             check_hypervisor_up=True,
                             fail_ok=False)

    LOG.tc_step("Waiting for hosts in {} aggregate".format(backing_to_config))
    for host in hosts_to_config:
        host_helper.wait_for_host_in_instance_backing(
            host, storage_backing=backing_to_config)

    LOG.tc_step("Check number of {} hosts is {}".format(
        instance_backing, number_of_hosts))
    assert number_of_hosts == \
        len(host_helper.get_hosts_in_storage_backing(instance_backing)), \
        "Number of {} hosts is not {} after " \
        "configuration".format(instance_backing, number_of_hosts)