Example #1
    def test_lock_with_max_vms_simplex(self, simplex_only):
        vms_num = host_helper.get_max_vms_supported(host='controller-0')
        vm_helper.ensure_vms_quotas(vms_num=vms_num)

        LOG.tc_step(
            "Boot {} vms with various storage settings".format(vms_num))
        vms = vm_helper.boot_vms_various_types(cleanup='function',
                                               vms_num=vms_num)

        LOG.tc_step("Lock vm host on simplex system")
        HostsToRecover.add('controller-0')
        host_helper.lock_host('controller-0')

        LOG.tc_step("Ensure vms are in {} state after locked host come "
                    "online".format(VMStatus.STOPPED))
        vm_helper.wait_for_vms_values(vms,
                                      value=VMStatus.STOPPED,
                                      fail_ok=False)

        LOG.tc_step("Unlock host on simplex system")
        host_helper.unlock_host(host='controller-0')

        LOG.tc_step("Ensure vms are Active and Pingable from NatBox")
        vm_helper.wait_for_vms_values(vms,
                                      value=VMStatus.ACTIVE,
                                      fail_ok=False,
                                      timeout=600)
        for vm in vms:
            vm_helper.wait_for_vm_pingable_from_natbox(
                vm, timeout=VMTimeout.DHCP_RETRY)
Example #2
def test_dc_dead_office_recovery_central(
        reserve_unreserve_all_hosts_module_central):
    """
    Test dead office recovery of the central cloud
    Args:
        reserve_unreserve_all_hosts_module_central: test fixture to
            reserve/unreserve all vlm nodes for the central cloud

    Setups:
        - Reserve all nodes for central cloud in vlm

    Test Steps:
        - Launch various types of VMs in the primary cloud.
        - Power off all nodes in vlm using multi-processing to simulate a power outage
        - Power on all nodes
        - Wait for nodes to become online/available
        - Check all the subclouds are still managed and in sync, as at the start of the test.
        - Check all the VMs launched in the subclouds are still up.
    """
    LOG.tc_step("Boot 5 vms with various boot_source, disks, etc")
    vms = vm_helper.boot_vms_various_types()
    central_auth = Tenant.get('admin_platform', dc_region='SystemController')
    hosts = system_helper.get_hosts(auth_info=central_auth)
    managed_subclouds = dc_helper.get_subclouds(mgmt='managed', avail='online')
    hosts_to_check = system_helper.get_hosts(
        availability=['available', 'online'], auth_info=central_auth)
    LOG.info("Online or Available hosts before power-off: {}".format(
        hosts_to_check))

    LOG.tc_step(
        "Powering off hosts in multi-processes to simulate power outage: {}".
        format(hosts))
    try:
        vlm_helper.power_off_hosts_simultaneously(hosts,
                                                  region='central_region')
    finally:
        LOG.tc_step("Wait for 60 seconds and power on hosts: {}".format(hosts))
        time.sleep(60)
        LOG.info("Hosts to check after power-on: {}".format(hosts_to_check))
        vlm_helper.power_on_hosts(hosts,
                                  reserve=False,
                                  reconnect_timeout=HostTimeout.REBOOT +
                                  HostTimeout.REBOOT,
                                  hosts_to_check=hosts_to_check,
                                  region='central_region')

    LOG.tc_step("Check subclouds managed")
    current_managed_subclouds = dc_helper.get_subclouds(mgmt='managed',
                                                        avail='online')
    assert managed_subclouds == current_managed_subclouds, \
        'Current managed subclouds are different from original. ' \
        'Original: {}, current: {}'.format(managed_subclouds,
                                           current_managed_subclouds)

    LOG.tc_step("Check vms are recovered after dead office recovery")
    vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)

    LOG.tc_step("Check vms are reachable after central clouds DOR test")
    for vm in vms:
        vm_helper.wait_for_vm_pingable_from_natbox(
            vm_id=vm, timeout=VMTimeout.DHCP_RETRY)
Example #3
    def test_reboot_only_host(self, get_zone):
        """
        Test reboot only hypervisor on the system

        Args:
            get_zone: fixture to create stxauto aggregate, to ensure vms can
            only be booted on one host

        Setups:
            - If more than 1 hypervisor: Create stxauto aggregate and add
            one host to the aggregate

        Test Steps:
            - Launch various vms on target host
                - vm booted from cinder volume,
                - vm booted from glance image,
                - vm booted from glance image, and have an extra cinder
                volume attached after launch,
                - vm booted from cinder volume with ephemeral and swap disks
            - sudo reboot -f only host
            - Check host is recovered
            - Check vms are recovered and reachable from NatBox

        """
        zone = get_zone

        LOG.tc_step("Launch 5 vms in {} zone".format(zone))
        vms = vm_helper.boot_vms_various_types(avail_zone=zone,
                                               cleanup='function')
        target_host = vm_helper.get_vm_host(vm_id=vms[0])
        for vm in vms[1:]:
            vm_host = vm_helper.get_vm_host(vm)
            assert target_host == vm_host, "VMs are not booted on same host"

        LOG.tc_step("Reboot -f from target host {}".format(target_host))
        HostsToRecover.add(target_host)
        host_helper.reboot_hosts(target_host)

        LOG.tc_step("Check vms are in Active state after host come back up")
        res, active_vms, inactive_vms = vm_helper.wait_for_vms_values(
            vms=vms, value=VMStatus.ACTIVE, timeout=600)

        vms_host_err = []
        for vm in vms:
            if vm_helper.get_vm_host(vm) != target_host:
                vms_host_err.append(vm)

        assert not vms_host_err, "Following VMs are not on the same host {}: " \
                                 "{}\nVMs did not reach Active state: {}". \
            format(target_host, vms_host_err, inactive_vms)

        assert not inactive_vms, "VMs did not reach Active state after " \
                                 "evacuated to other host: " \
                                 "{}".format(inactive_vms)

        LOG.tc_step("Check VMs are pingable from NatBox after evacuation")
        for vm in vms:
            vm_helper.wait_for_vm_pingable_from_natbox(
                vm, timeout=VMTimeout.DHCP_RETRY)
Example #4
def test_evacuate_vms_stress(add_hosts_to_zone):
    """
    Test evacuate vms with various vm storage configs and host instance backing configs

    Args:
        add_hosts_to_zone: test fixture that adds hosts with the storage
            backing under test to the cgcsauto zone and returns
            (storage_backing, hosts)

    Skip conditions:
        - Less than two hosts configured with storage backing under test

    Setups:
        - Add admin role to primary tenant (module)

    Test Steps:
        - Create flv_rootdisk without ephemeral or swap disks, and set storage backing extra spec
        - Create flv_ephemswap with ephemeral AND swap disks, and set storage backing extra spec
        - Boot following vms on same host and wait for them to be pingable from NatBox:
            - Boot vm1 from volume with flavor flv_rootdisk
            - Boot vm2 from volume with flavor flv_ephemswap
            - Boot vm3 from image with flavor flv_rootdisk
            - Boot vm4 from image with flavor flv_rootdisk, and attach a volume to it
            - Boot vm5 from image with flavor flv_ephemswap
        - power-off host from vlm
        - Ensure evacuation for all 5 vms are successful (vm host changed, active state, pingable from NatBox)
        - Repeat above evacuation steps

    Teardown:
        - Delete created vms, volumes, flavors
        - Remove admin role from primary tenant (module)

    """
    storage_backing, hosts = add_hosts_to_zone
    zone = 'cgcsauto'

    HostsToRecover.add(hosts)

    initial_host = hosts[0]

    vms = vm_helper.boot_vms_various_types(storage_backing=storage_backing, target_host=initial_host, avail_zone=zone)

    target_host = initial_host

    for i in range(100):
        post_host = hosts[0] if target_host != hosts[0] else hosts[1]
        LOG.info("\n===============Iteration {}============".format(i+1))
        vm_helper.evacuate_vms(target_host, vms, wait_for_host_up=True, post_host=post_host, timeout=720, vlm=True,
                               ping_vms=True)

        target_host = post_host
        LOG.info("Rest for 120 seconds before next evacuation")
        time.sleep(120)
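The host ping-pong in the loop above (post_host is whichever of the two hosts is not the current target) can also be expressed with itertools.cycle. The snippet below is only a self-contained sketch of that selection logic with placeholder host names; it is not part of the test framework.

import itertools

# Placeholder host names; the real test gets these from the add_hosts_to_zone fixture.
hosts = ['compute-0', 'compute-1']
host_cycle = itertools.cycle(hosts)

target_host = next(host_cycle)      # start on hosts[0], like initial_host above
for i in range(4):                  # shortened from 100 iterations for the sketch
    post_host = next(host_cycle)    # the other host is the expected post-evacuation host
    print("Iteration {}: evacuate vms from {} to {}".format(i + 1, target_host, post_host))
    target_host = post_host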
Example #5
def test_dead_office_recovery(reserve_unreserve_all_hosts_module):
    """
    Test dead office recovery with vms
    Args:
        reserve_unreserve_all_hosts_module: test fixture to reserve unreserve all vlm nodes for lab under test

    Setups:
        - Reserve all nodes in vlm

    Test Steps:
        - Boot 5 vms with various boot_source, disks, etc and ensure they can be reached from NatBox
        - Power off all nodes in vlm using multi-processing to simulate a power outage
        - Power on all nodes
        - Wait for nodes to become online/available
        - Check vms are recovered after hosts come back up and vms can be reached from NatBox

    """
    LOG.tc_step("Boot 5 vms with various boot_source, disks, etc")
    vms = vm_helper.boot_vms_various_types()

    hosts = system_helper.get_hosts()
    hosts_to_check = system_helper.get_hosts(availability=['available', 'online'])

    LOG.info("Online or Available hosts before power-off: {}".format(hosts_to_check))
    LOG.tc_step("Powering off hosts in multi-processes to simulate power outage: {}".format(hosts))
    region = None
    if ProjVar.get_var('IS_DC'):
        region = ProjVar.get_var('PRIMARY_SUBCLOUD')

    try:
        vlm_helper.power_off_hosts_simultaneously(hosts, region=region)
    finally:
        LOG.tc_step("Wait for 60 seconds and power on hosts: {}".format(hosts))
        time.sleep(60)
        LOG.info("Hosts to check after power-on: {}".format(hosts_to_check))
        vlm_helper.power_on_hosts(hosts, reserve=False, reconnect_timeout=HostTimeout.REBOOT+HostTimeout.REBOOT,
                                  hosts_to_check=hosts_to_check, region=region)

    LOG.tc_step("Check vms are recovered after dead office recovery")
    vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)
    for vm in vms:
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm, timeout=VMTimeout.DHCP_RETRY)
    computes = host_helper.get_hypervisors()
    if len(computes) >= 4:
        system_helper.wait_for_alarm(alarm_id=EventLogID.MULTI_NODE_RECOVERY, timeout=120)
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.MULTI_NODE_RECOVERY, check_interval=60, timeout=1200)
Example #6
def test_force_lock_with_mig_vms(get_hosts_with_backing):
    """
    Test force lock host with migrate-able vms on it

    Prerequisites:
        - Minimum of two hosts supporting the same storage backing.
    Test Setups:
        - Add admin role to primary tenant
        - Boot various VMs on host_under_test that can be live migrated
    Test Steps:
        - Get status info from VMs
        - Force lock target host
        - Verify force lock returns 0
        - Wait until VMs are active on a secondary host
        - Verify VMs can be pinged
    Test Teardown:
        - Remove admin role from primary tenant
        - Delete created vms
        - Unlock locked target host(s)
    """
    storage_backing, host_under_test = get_hosts_with_backing

    # Boot VMs on the host.
    LOG.tc_step("Boot VMs on {}".format(host_under_test))
    vm_ids = vm_helper.boot_vms_various_types(storage_backing=storage_backing,
                                              target_host=host_under_test,
                                              cleanup='function')

    # Force lock host that VMs are booted on
    LOG.tc_step("Force lock {}".format(host_under_test))
    HostsToRecover.add(host_under_test)
    lock_code, lock_output = host_helper.lock_host(host_under_test,
                                                   force=True,
                                                   check_first=False)
    assert lock_code == 0, "Failed to force lock {}. Details: {}".format(
        host_under_test, lock_output)

    # Expect VMs to migrate off force-locked host (non-gracefully)
    LOG.tc_step(
        "Wait for 'Active' status of VMs after host force lock completes")
    vm_helper.wait_for_vms_values(vm_ids, fail_ok=False)

    for vm in vm_ids:
        vm_helper.wait_for_vm_pingable_from_natbox(
            vm, timeout=VMTimeout.DHCP_RETRY)
Example #7
def test_lock_stor_check_osds_down(stx_openstack_required, host):
    """
    This test is adapted from
    us69932_tc3_ceph_mon_maintenance_operations from us69932_ceph_monitoring.odt

    The goal of this test is to check that all OSDs go down on a locked storage
    node.  There are two variants:

    1.  Lock 'storage-0' which is a ceph monitor
    2.  Lock a storage node that is not 'storage-0', i.e. not a ceph monitor

    Args:
        - None

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Lock storage node
        2.  Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs on the locked storage node are down
            - Check that the appropriate alarms are raised:
        3.  Unlock storage node
            - ensure CEPH is HEALTH_OK
            - ensure all OSDs on unlocked node are up
            - Check that alarms are cleared

    Note: If the storage node to be locked is monitor, we also expect to see
    the mon down alarm.

    What defects this addresses:
        1.  CGTS-2609 - Ceph processes fail to start after storage node reboot

    Notes:
        - Updated test to write to disk to add I/O load on system

    """

    con_ssh = ControllerClient.get_active_controller()

    if host == 'any':
        storage_nodes = system_helper.get_hosts(personality='storage')
        LOG.info('System has storage nodes: {}'.format(storage_nodes))
        storage_nodes.remove('storage-0')
        host = random.choice(storage_nodes)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")
    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event, expect_timeout=40)
            vm_threads.append(vm_thread)

        LOG.tc_step('Lock storage node {}'.format(host))
        HostsToRecover.add(host)
        host_helper.lock_host(host, check_first=False)

        LOG.tc_step('Determine the storage group for host {}'.format(host))
        storage_group, msg = storage_helper.get_storage_group(host)
        LOG.info(msg)

        LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host, strict=False)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that ceph is in health warn')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        # We're waiting 5 minutes for ceph rebalancing to be performed
        # DO NOT REMOVE.  This is part of the test.
        time.sleep(300)

        LOG.tc_step('Unlock storage node')
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        health = False
        end_time = time.time() + 40
        while time.time() < end_time:
            health = storage_helper.is_ceph_healthy(con_ssh)
            if health is True:
                break
        assert health, "Ceph did not become healthy"

        LOG.tc_step('Check that host lock alarm is cleared when {} is unlocked'.format(host))
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host, strict=False), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that the replication group alarm is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)
        LOG.tc_step('Check that the Storage Alarm Condition is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg

        LOG.tc_step('Check health of CEPH cluster')
        end_time = time.time() + 40
        while time.time() < end_time:
            ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
            if ceph_healthy is True:
                break
        assert ceph_healthy, "Ceph did not become healthy after unlocking {}".format(host)

        for vm_thread in vm_threads:
            assert vm_thread.res is True, "Writing in vm stopped unexpectedly"
    finally:
        # wait_for_thread_end needs to be called even if test failed in the middle, otherwise thread will not end
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()
Example #8
def test_ceph_reboot_storage_node(stx_openstack_required):
    """
    us69932_tc2_ceph_mon_process_kill from us69932_ceph_monitoring.odt

    Verify that ceph mon processes recover when storage nodes are rebooted.

    Args:
        - Nothing

    Setup:
        - Requires system with storage nodes

    Test Steps:
        0.  Run CEPH pre-check fixture to check:
            - system has storage nodes
            - health of the ceph cluster is okay
            - that we have OSDs provisioned
        1.  Delete existing VMs
        2.  Boot new VMs and run dd on them
        3.  Reboot storage node and ensure both:
            - mon state goes down (if storage-0)
            - OSD state goes down
        4.  Ensure mon and OSD state recover afterwards
        5.  Cleanup VMs

    Potential rework:
        1.  Add the alarms checks for raise and clear
        2.  Maybe we don't want to reboot all storage nodes

    What defects this addresses:
        1.  CGTS-2975

    Update:
        This test was updated for the Storage and Robustness feature.
    """
    con_ssh = ControllerClient.get_active_controller()

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")

    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event, expect_timeout=40)
            vm_threads.append(vm_thread)

        storage_nodes = system_helper.get_storage_nodes(con_ssh)

        for host in storage_nodes:
            LOG.tc_step('Reboot {}'.format(host))
            HostsToRecover.add(host, scope='function')
            host_helper.reboot_hosts(host, wait_for_offline=True, wait_for_reboot_finish=False)

            LOG.tc_step('Check health of CEPH cluster')
            ceph_healthy = True
            msg = None
            end_time = time.time() + 10
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if not ceph_healthy:
                    break

            assert not ceph_healthy, \
                "ceph is unexpectedly still healthy after rebooting {}".format(host)
            LOG.info('ceph cluster is in HEALTH_WARN as expected after '
                     'rebooting {}'.format(host))

            LOG.tc_step('Check that OSDs are down')
            osd_list = storage_helper.get_osds(host, con_ssh)
            all_osds_up = True
            up_list = osd_list.copy()
            end_time = time.time() + 60
            while time.time() < end_time and all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if not osd_up:
                        msg = 'OSD ID {} is down as expected'.format(osd_id)
                        LOG.info(msg)
                        up_list.remove(osd_id)
                if len(up_list) > 0:
                    osd_list = up_list.copy()
                else:
                    msg = ' All OSDs are down as expected'
                    LOG.info(msg)
                    all_osds_up = False

            assert not all_osds_up, "One or more OSDs are still up but should be down: {}".format(up_list)

            system_helper.wait_for_host_values(host, availability='available')

            LOG.tc_step('Check that OSDs are up')
            osd_list = storage_helper.get_osds(host, con_ssh)
            down_list = osd_list.copy()
            all_osds_up = False
            end_time = time.time() + 60
            while time.time() < end_time and not all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if osd_up:
                        msg = 'OSD ID {} is up as expected'.format(osd_id)
                        LOG.info(msg)
                        down_list.remove(osd_id)
                if len(down_list) > 0:
                    osd_list = down_list.copy()
                else:
                    msg = ' All OSDs are up as expected'
                    LOG.info(msg)
                    all_osds_up = True

            assert all_osds_up, "One or more OSDs are still down but should be up: {}".format(down_list)

            LOG.tc_step('Check health of CEPH cluster')
            end_time = time.time() + 40
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if ceph_healthy is True:
                    break

            assert ceph_healthy, "ceph is not healthy"

        for vm_thread in vm_threads:
            assert vm_thread.res is True, "Writing in vm stopped unexpectedly"
    finally:
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()
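Examples #7 and #8 repeat the same hand-rolled polling pattern (compute an end_time, loop until a storage_helper check flips) for Ceph health and OSD state. The helper below is a hypothetical generalization of that pattern, not part of the existing vm_helper/storage_helper modules; the commented usage shows how it would map onto the Ceph health wait above.

import time


def wait_for_condition(check, timeout=60, interval=5):
    # Poll check() until it returns a truthy value or the timeout expires.
    # Returns the last value from check(); the caller asserts on it.
    # (Hypothetical helper, not part of the test framework.)
    end_time = time.time() + timeout
    result = check()
    while not result and time.time() < end_time:
        time.sleep(interval)
        result = check()
    return result


# Usage mirroring the Ceph health wait in the examples above (commented out
# because storage_helper and con_ssh come from the test framework):
# ceph_healthy = wait_for_condition(
#     lambda: storage_helper.is_ceph_healthy(con_ssh), timeout=40)
# assert ceph_healthy, "ceph did not become healthy"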
Example #9
    def test_lock_with_vms(self, target_hosts, no_simplex,
                           add_admin_role_func):
        """
        Test lock host with vms on it.

        Args:
            target_hosts: test fixture that returns the storage backing
            under test and the targeted host(s) to lock.

        Skip Conditions:
            - Less than 2 hypervisor hosts on the system

        Prerequisites:
            - Hosts storage backing are pre-configured to storage backing
            under test
                ie., 2 or more hosts should support the storage backing under
                test.
        Test Setups:
            - Set instances quota to 10 if it was less than 8
            - Determine storage backing(s) under test. i.e.,storage backings
            supported by at least 2 hosts on the system
            - Create flavors with storage extra specs set based on storage
            backings under test
            - Create vms_to_test that can be live migrated using created flavors
            - Determine target host(s) to perform lock based on which host(s)
            have the most vms_to_test
            - Live migrate vms to target host(s)
        Test Steps:
            - Lock target host
            - Verify lock succeeded and vms status unchanged
            - Repeat above steps if more than one target host
        Test Teardown:
            - Delete created vms and volumes
            - Delete created flavors
            - Unlock locked target host(s)

        """
        storage_backing, host = target_hosts
        vms_num = 5
        vm_helper.ensure_vms_quotas(vms_num=vms_num)

        LOG.tc_step(
            "Boot {} vms with various storage settings".format(vms_num))
        vms = vm_helper.boot_vms_various_types(cleanup='function',
                                               vms_num=vms_num,
                                               storage_backing=storage_backing,
                                               target_host=host)

        LOG.tc_step("Attempt to lock target host {}...".format(host))
        HostsToRecover.add(host)
        host_helper.lock_host(host=host,
                              check_first=False,
                              fail_ok=False,
                              swact=True)

        LOG.tc_step("Verify lock succeeded and vms still in good state")
        vm_helper.wait_for_vms_values(vms=vms, fail_ok=False)
        for vm in vms:
            vm_host = vm_helper.get_vm_host(vm_id=vm)
            assert vm_host != host, "VM is still on {} after lock".format(host)

            vm_helper.wait_for_vm_pingable_from_natbox(
                vm_id=vm, timeout=VMTimeout.DHCP_RETRY)
Example #10
def test_boot_various_vms():
    vm_helper.boot_vms_various_types(cleanup=None)
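Every example above goes through vm_helper.boot_vms_various_types with some subset of the same keyword arguments. The comment block below only summarizes those arguments as they appear in the calls on this page; the descriptions are inferred from usage, not taken from the helper's source.

# vm_helper.boot_vms_various_types keyword arguments as used in the examples
# above (descriptions inferred from usage, not from the helper's source):
#
#   storage_backing  # storage backing for the boot flavors (Examples #4, #6, #9)
#   target_host      # hypervisor to boot all vms on (Examples #4, #6, #9)
#   avail_zone       # availability zone, e.g. the stxauto/cgcsauto zone (Examples #3, #4)
#   vms_num          # number of vms to boot (Examples #1, #9)
#   cleanup          # pytest cleanup scope, e.g. 'function', or None to keep the vms (Example #10)
#
# The call returns a list of vm ids, which the examples pass to
# vm_helper.wait_for_vms_values() and iterate over for
# vm_helper.wait_for_vm_pingable_from_natbox().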