def test_lock_with_max_vms_simplex(self, simplex_only):
    vms_num = host_helper.get_max_vms_supported(host='controller-0')
    vm_helper.ensure_vms_quotas(vms_num=vms_num)

    LOG.tc_step("Boot {} vms with various storage settings".format(vms_num))
    vms = vm_helper.boot_vms_various_types(cleanup='function',
                                           vms_num=vms_num)

    LOG.tc_step("Lock vm host on simplex system")
    HostsToRecover.add('controller-0')
    host_helper.lock_host('controller-0')

    LOG.tc_step("Ensure vms are in {} state after locked host comes "
                "online".format(VMStatus.STOPPED))
    vm_helper.wait_for_vms_values(vms, value=VMStatus.STOPPED, fail_ok=False)

    LOG.tc_step("Unlock host on simplex system")
    host_helper.unlock_host(host='controller-0')

    LOG.tc_step("Ensure vms are Active and pingable from NatBox")
    vm_helper.wait_for_vms_values(vms, value=VMStatus.ACTIVE, fail_ok=False,
                                  timeout=600)
    for vm in vms:
        vm_helper.wait_for_vm_pingable_from_natbox(
            vm, timeout=VMTimeout.DHCP_RETRY)


def test_dc_dead_office_recovery_central(
        reserve_unreserve_all_hosts_module_central):
    """
    Test dead office recovery on the central (main) cloud

    Args:
        reserve_unreserve_all_hosts_module_central: test fixture to reserve
            all vlm nodes of the central cloud

    Setups:
        - Reserve all nodes for central cloud in vlm

    Test Steps:
        - Launch various types of VMs in the central cloud
        - Power off all nodes in vlm using multi-processing to simulate a
          power outage
        - Power on all nodes
        - Wait for nodes to become online/available
        - Check all the subclouds are still managed and online, as at the
          start of the test
        - Check all the launched VMs are recovered and reachable

    """
    LOG.tc_step("Boot 5 vms with various boot_source, disks, etc")
    vms = vm_helper.boot_vms_various_types()

    central_auth = Tenant.get('admin_platform', dc_region='SystemController')
    hosts = system_helper.get_hosts(auth_info=central_auth)
    managed_subclouds = dc_helper.get_subclouds(mgmt='managed', avail='online')
    hosts_to_check = system_helper.get_hosts(
        availability=['available', 'online'], auth_info=central_auth)
    LOG.info("Online or Available hosts before power-off: {}".format(
        hosts_to_check))

    LOG.tc_step("Powering off hosts in multi-processes to simulate power "
                "outage: {}".format(hosts))
    try:
        vlm_helper.power_off_hosts_simultaneously(hosts,
                                                  region='central_region')
    finally:
        LOG.tc_step("Wait for 60 seconds and power on hosts: {}".format(hosts))
        time.sleep(60)
        LOG.info("Hosts to check after power-on: {}".format(hosts_to_check))
        vlm_helper.power_on_hosts(
            hosts, reserve=False,
            reconnect_timeout=HostTimeout.REBOOT + HostTimeout.REBOOT,
            hosts_to_check=hosts_to_check, region='central_region')

    LOG.tc_step("Check subclouds are still managed and online")
    current_managed_subclouds = dc_helper.get_subclouds(mgmt='managed',
                                                        avail='online')
    assert managed_subclouds == current_managed_subclouds, \
        'Managed subclouds changed after DOR. Original: {} Current: {}'.format(
            managed_subclouds, current_managed_subclouds)

    LOG.tc_step("Check vms are recovered after dead office recovery")
    vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)

    LOG.tc_step("Check vms are reachable after central cloud DOR test")
    for vm in vms:
        vm_helper.wait_for_vm_pingable_from_natbox(
            vm_id=vm, timeout=VMTimeout.DHCP_RETRY)


def test_reboot_only_host(self, get_zone):
    """
    Test reboot the only hypervisor on the system

    Args:
        get_zone: fixture to create stxauto aggregate, to ensure vms can
            only be booted on one host

    Setups:
        - If more than 1 hypervisor: Create stxauto aggregate and add one
          host to the aggregate

    Test Steps:
        - Launch various vms on target host
            - vm booted from cinder volume,
            - vm booted from glance image,
            - vm booted from glance image, with an extra cinder volume
              attached after launch,
            - vm booted from cinder volume with ephemeral and swap disks
        - sudo reboot -f only host
        - Check host is recovered
        - Check vms are recovered and reachable from NatBox

    """
    zone = get_zone
    LOG.tc_step("Launch 5 vms in {} zone".format(zone))
    vms = vm_helper.boot_vms_various_types(avail_zone=zone,
                                           cleanup='function')
    target_host = vm_helper.get_vm_host(vm_id=vms[0])
    for vm in vms[1:]:
        vm_host = vm_helper.get_vm_host(vm)
        assert target_host == vm_host, "VMs are not booted on same host"

    LOG.tc_step("Reboot -f target host {}".format(target_host))
    HostsToRecover.add(target_host)
    host_helper.reboot_hosts(target_host)

    LOG.tc_step("Check vms are in Active state after host comes back up")
    res, active_vms, inactive_vms = vm_helper.wait_for_vms_values(
        vms=vms, value=VMStatus.ACTIVE, timeout=600)

    vms_host_err = []
    for vm in vms:
        if vm_helper.get_vm_host(vm) != target_host:
            vms_host_err.append(vm)

    assert not vms_host_err, \
        "Following VMs are not on the same host {}: {}\n" \
        "VMs did not reach Active state: {}".format(
            target_host, vms_host_err, inactive_vms)

    assert not inactive_vms, \
        "VMs did not reach Active state after host reboot: {}".format(
            inactive_vms)

    LOG.tc_step("Check VMs are pingable from NatBox after host reboot")
    vm_helper.wait_for_vm_pingable_from_natbox(
        vms, timeout=VMTimeout.DHCP_RETRY)


def test_evacuate_vms_stress(add_hosts_to_zone):
    """
    Test evacuate vms with various vm storage configs and host instance
    backing configs

    Args:
        add_hosts_to_zone: test fixture to add hosts with the storage
            backing under test to the cgcsauto aggregate, returning
            (storage_backing, hosts)

    Skip conditions:
        - Less than two hosts configured with storage backing under test

    Setups:
        - Add admin role to primary tenant (module)

    Test Steps:
        - Create flv_rootdisk without ephemeral or swap disks, and set
          storage backing extra spec
        - Create flv_ephemswap with ephemeral AND swap disks, and set
          storage backing extra spec
        - Boot following vms on same host and wait for them to be pingable
          from NatBox:
            - Boot vm1 from volume with flavor flv_rootdisk
            - Boot vm2 from volume with flavor flv_ephemswap
            - Boot vm3 from image with flavor flv_rootdisk
            - Boot vm4 from image with flavor flv_rootdisk, and attach a
              volume to it
            - Boot vm5 from image with flavor flv_ephemswap
        - power-off host from vlm
        - Ensure evacuation for all 5 vms is successful (vm host changed,
          active state, pingable from NatBox)
        - Repeat the evacuation steps above, alternating between the two
          hosts

    Teardown:
        - Delete created vms, volumes, flavors
        - Remove admin role from primary tenant (module)

    """
    storage_backing, hosts = add_hosts_to_zone
    zone = 'cgcsauto'

    HostsToRecover.add(hosts)
    initial_host = hosts[0]

    vms = vm_helper.boot_vms_various_types(storage_backing=storage_backing,
                                           target_host=initial_host,
                                           avail_zone=zone)
    target_host = initial_host
    for i in range(100):
        post_host = hosts[0] if target_host != hosts[0] else hosts[1]
        LOG.info("\n===============Iteration {}============".format(i + 1))
        vm_helper.evacuate_vms(target_host, vms, wait_for_host_up=True,
                               post_host=post_host, timeout=720, vlm=True,
                               ping_vms=True)

        target_host = post_host
        LOG.info("Rest for 120 seconds before next evacuation")
        time.sleep(120)


def test_dead_office_recovery(reserve_unreserve_all_hosts_module):
    """
    Test dead office recovery with vms

    Args:
        reserve_unreserve_all_hosts_module: test fixture to reserve and
            unreserve all vlm nodes for lab under test

    Setups:
        - Reserve all nodes in vlm

    Test Steps:
        - Boot 5 vms with various boot_source, disks, etc and ensure they
          can be reached from NatBox
        - Power off all nodes in vlm using multi-processing to simulate a
          power outage
        - Power on all nodes
        - Wait for nodes to become online/available
        - Check vms are recovered after hosts come back up and vms can be
          reached from NatBox

    """
    LOG.tc_step("Boot 5 vms with various boot_source, disks, etc")
    vms = vm_helper.boot_vms_various_types()

    hosts = system_helper.get_hosts()
    hosts_to_check = system_helper.get_hosts(
        availability=['available', 'online'])
    LOG.info("Online or Available hosts before power-off: {}".format(
        hosts_to_check))

    LOG.tc_step("Powering off hosts in multi-processes to simulate power "
                "outage: {}".format(hosts))
    region = None
    if ProjVar.get_var('IS_DC'):
        region = ProjVar.get_var('PRIMARY_SUBCLOUD')

    try:
        vlm_helper.power_off_hosts_simultaneously(hosts, region=region)
    finally:
        LOG.tc_step("Wait for 60 seconds and power on hosts: {}".format(hosts))
        time.sleep(60)
        LOG.info("Hosts to check after power-on: {}".format(hosts_to_check))
        vlm_helper.power_on_hosts(
            hosts, reserve=False,
            reconnect_timeout=HostTimeout.REBOOT + HostTimeout.REBOOT,
            hosts_to_check=hosts_to_check, region=region)

    LOG.tc_step("Check vms are recovered after dead office recovery")
    vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)
    for vm in vms:
        vm_helper.wait_for_vm_pingable_from_natbox(
            vm_id=vm, timeout=VMTimeout.DHCP_RETRY)

    computes = host_helper.get_hypervisors()
    if len(computes) >= 4:
        system_helper.wait_for_alarm(
            alarm_id=EventLogID.MULTI_NODE_RECOVERY, timeout=120)
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.MULTI_NODE_RECOVERY, check_interval=60,
            timeout=1200)


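# Illustrative sketch only (assumption, not the framework's implementation):
# vlm_helper.power_off_hosts_simultaneously() above is relied on to cut power
# to all nodes at roughly the same time. Conceptually, the per-host power-off
# commands can be issued in parallel with a small worker pool, as sketched
# below; the single-host power-off here is a hypothetical placeholder.
def _power_off_hosts_in_parallel_sketch(hosts, region=None):
    """Hypothetical illustration of issuing per-host power-offs in parallel."""
    from multiprocessing.dummy import Pool  # thread-based pool from stdlib

    def _power_off_one(host):
        # Placeholder standing in for the real per-host vlm power-off call.
        LOG.info("Powering off {} (region={})".format(host, region))

    # One worker per host so the commands go out near-simultaneously.
    with Pool(processes=len(hosts)) as pool:
        pool.map(_power_off_one, hosts)

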
def test_force_lock_with_mig_vms(get_hosts_with_backing):
    """
    Test force lock host with migrate-able vms on it

    Prerequisites:
        - Minimum of two hosts supporting the same storage backing.

    Test Setups:
        - Add admin role to primary tenant
        - Boot various VMs on host_under_test that can be live migrated

    Test Steps:
        - Get status info from VMs
        - Force lock target host
        - Verify force lock returns 0
        - Wait until VMs are active on a secondary host
        - Verify VMs can be pinged

    Test Teardown:
        - Remove admin role from primary tenant
        - Delete created vms
        - Unlock locked target host(s)

    """
    storage_backing, host_under_test = get_hosts_with_backing

    # Boot VMs on the host.
    LOG.tc_step("Boot VMs on {}".format(host_under_test))
    vm_ids = vm_helper.boot_vms_various_types(storage_backing=storage_backing,
                                              target_host=host_under_test,
                                              cleanup='function')

    # Force lock host that VMs are booted on
    LOG.tc_step("Force lock {}".format(host_under_test))
    HostsToRecover.add(host_under_test)
    lock_code, lock_output = host_helper.lock_host(host_under_test,
                                                   force=True,
                                                   check_first=False)
    assert lock_code == 0, "Failed to force lock {}. Details: {}".format(
        host_under_test, lock_output)

    # Expect VMs to migrate off force-locked host (non-gracefully)
    LOG.tc_step(
        "Wait for 'Active' status of VMs after host force lock completes")
    vm_helper.wait_for_vms_values(vm_ids, fail_ok=False)

    for vm in vm_ids:
        vm_helper.wait_for_vm_pingable_from_natbox(
            vm, timeout=VMTimeout.DHCP_RETRY)


def test_lock_stor_check_osds_down(stx_openstack_required, host):
    """
    This test is adapted from us69932_tc3_ceph_mon_maintenance_operations
    from us69932_ceph_monitoring.odt

    The goal of this test is to check that all OSDs go down on a locked
    storage node. There are two variants:

    1.  Lock 'storage-0' which is a ceph monitor
    2.  Lock a storage node that is not 'storage-0', i.e. not a ceph monitor

    Args:
        - host: storage node to lock, or 'any' to pick a random storage
          node other than 'storage-0'

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Lock storage node
        2.  Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs on the locked storage node are down
            - Check that the appropriate alarms are raised
        3.  Unlock storage node
            - ensure CEPH is HEALTH_OK
            - ensure all OSDs on unlocked node are up
            - Check that alarms are cleared

    Note: If the storage node to be locked is a monitor, we also expect to
    see the mon down alarm.

    What defects this addresses:
        1.  CGTS-2609 - Ceph processes fail to start after storage node reboot

    Notes:
        - Updated test to write to disk to add I/O load on system

    """
    con_ssh = ControllerClient.get_active_controller()

    if host == 'any':
        storage_nodes = system_helper.get_hosts(personality='storage')
        LOG.info('System has storage nodes: {}'.format(storage_nodes))
        storage_nodes.remove('storage-0')
        node_id = random.randint(0, len(storage_nodes) - 1)
        host = storage_nodes[node_id]

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")

    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event,
                                              expect_timeout=40)
            vm_threads.append(vm_thread)

        LOG.tc_step('Lock storage node {}'.format(host))
        HostsToRecover.add(host)
        host_helper.lock_host(host, check_first=False)

        LOG.tc_step('Determine the storage group for host {}'.format(host))
        storage_group, msg = storage_helper.get_storage_group(host)
        LOG.info(msg)

        LOG.tc_step('Check that host lock alarm is raised when {} is '
                    'locked'.format(host))
        assert system_helper.wait_for_alarm(
            alarm_id=EventLogID.HOST_LOCK, entity_id=host, strict=False)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy, \
            "CEPH cluster is unexpectedly healthy with {} locked".format(host)

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(
            alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that ceph is in health warn')
        assert system_helper.wait_for_alarm(
            alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        # We're waiting 5 minutes for ceph rebalancing to be performed.
        # DO NOT REMOVE. This is part of the test.
        time.sleep(300)

        LOG.tc_step('Unlock storage node')
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        health = False
        end_time = time.time() + 40
        while time.time() < end_time:
            health = storage_helper.is_ceph_healthy(con_ssh)
            if health is True:
                break
        assert health, "Ceph did not become healthy"

        LOG.tc_step('Check that host lock alarm is cleared when {} is '
                    'unlocked'.format(host))
        assert system_helper.wait_for_alarm_gone(
            EventLogID.HOST_LOCK, entity_id=host, strict=False), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that the replication group alarm is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that the Storage Alarm Condition is cleared')
        assert system_helper.wait_for_alarm_gone(
            EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = False
        end_time = time.time() + 40
        while time.time() < end_time:
            ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
            if ceph_healthy is True:
                break
        assert ceph_healthy, "CEPH cluster did not become healthy after unlock"

        for vm_thread in vm_threads:
            assert vm_thread.res is True, \
                "Writing in vm stopped unexpectedly"
    finally:
        # wait_for_thread_end needs to be called even if the test failed in
        # the middle, otherwise the thread will not end.
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()


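# Illustrative sketch only (assumption, not an existing helper in this repo):
# the ceph health and OSD state checks in the storage tests above all repeat
# the same poll-until-timeout pattern; a small generic wait helper could
# factor that out.
def _wait_for_condition_sketch(check, timeout=40, interval=5):
    """Poll check() until it returns True or timeout (seconds) expires."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        if check():
            return True
        time.sleep(interval)
    return False

# Hypothetical usage, e.g. in place of the manual ceph health loop:
#     assert _wait_for_condition_sketch(
#         lambda: storage_helper.is_ceph_healthy(con_ssh), timeout=40), \
#         "Ceph did not become healthy"

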
def test_ceph_reboot_storage_node(stx_openstack_required):
    """
    us69932_tc2_ceph_mon_process_kill from us69932_ceph_monitoring.odt

    Verify that ceph mon and OSD processes recover after storage nodes are
    rebooted.

    Setup:
        - Requires system with storage nodes

    Test Steps:
        0.  Run CEPH pre-check fixture to check:
            - system has storage nodes
            - health of the ceph cluster is okay
            - that we have OSDs provisioned
        1.  Delete existing VMs
        2.  Boot new VMs and run dd on them
        3.  Reboot storage node and ensure both:
            - mon state goes down (if storage-0)
            - OSD state goes down
        4.  Ensure mon and OSD state recover afterwards
        5.  Cleanup VMs

    Potential rework:
        1.  Add the alarm checks for raise and clear
        2.  Maybe we don't want to reboot all storage nodes

    What defects this addresses:
        1.  CGTS-2975

    Update: This test was updated for the Storage and Robustness feature.
    """
    con_ssh = ControllerClient.get_active_controller()

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")

    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event,
                                              expect_timeout=40)
            vm_threads.append(vm_thread)

        storage_nodes = system_helper.get_storage_nodes(con_ssh)

        for host in storage_nodes:
            LOG.tc_step('Reboot {}'.format(host))
            HostsToRecover.add(host, scope='function')
            host_helper.reboot_hosts(host, wait_for_offline=True,
                                     wait_for_reboot_finish=False)

            LOG.tc_step('Check health of CEPH cluster')
            ceph_healthy = True
            end_time = time.time() + 10
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if not ceph_healthy:
                    break
            assert not ceph_healthy, \
                "CEPH cluster is unexpectedly healthy after rebooting " \
                "{}".format(host)
            LOG.info('CEPH cluster is unhealthy as expected')

            LOG.tc_step('Check that OSDs are down')
            osd_list = storage_helper.get_osds(host, con_ssh)
            all_osds_up = True
            up_list = osd_list.copy()
            end_time = time.time() + 60
            while time.time() < end_time and all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if not osd_up:
                        msg = 'OSD ID {} is down as expected'.format(osd_id)
                        LOG.info(msg)
                        up_list.remove(osd_id)
                if len(up_list) > 0:
                    osd_list = up_list.copy()
                else:
                    msg = 'All OSDs are down as expected'
                    LOG.info(msg)
                    all_osds_up = False

            assert not all_osds_up, \
                "One or more OSD(s) {} is(are) up but should be " \
                "down".format(up_list)

            system_helper.wait_for_host_values(host, availability='available')

            LOG.tc_step('Check that OSDs are up')
            osd_list = storage_helper.get_osds(host, con_ssh)
            down_list = osd_list.copy()
            all_osds_up = False
            end_time = time.time() + 60
            while time.time() < end_time and not all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if osd_up:
                        msg = 'OSD ID {} is up as expected'.format(osd_id)
                        LOG.info(msg)
                        down_list.remove(osd_id)
                if len(down_list) > 0:
                    osd_list = down_list.copy()
                else:
                    msg = 'All OSDs are up as expected'
                    LOG.info(msg)
                    all_osds_up = True

            assert all_osds_up, \
                "One or more OSD(s) {} is(are) down but should be " \
                "up".format(down_list)

            LOG.tc_step('Check health of CEPH cluster')
            end_time = time.time() + 40
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if ceph_healthy is True:
                    break
            assert ceph_healthy, "CEPH cluster did not become healthy"

        for vm_thread in vm_threads:
            assert vm_thread.res is True, \
                "Writing in vm stopped unexpectedly"
    finally:
        # wait_for_thread_end needs to be called even if the test failed in
        # the middle, otherwise the thread will not end.
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()


def test_lock_with_vms(self, target_hosts, no_simplex, add_admin_role_func):
    """
    Test lock host with vms on it.

    Args:
        target_hosts (list): targeted host(s) to lock that was prepared
            by the target_hosts test fixture.

    Skip Conditions:
        - Less than 2 hypervisor hosts on the system

    Prerequisites:
        - Hosts storage backing are pre-configured to storage backing
          under test, i.e., 2 or more hosts should support the storage
          backing under test.

    Test Setups:
        - Set instances quota to 10 if it was less than 8
        - Determine storage backing(s) under test, i.e., storage backings
          supported by at least 2 hosts on the system
        - Create flavors with storage extra specs set based on storage
          backings under test
        - Create vms_to_test that can be live migrated using created flavors
        - Determine target host(s) to perform lock based on which host(s)
          have the most vms_to_test
        - Live migrate vms to target host(s)

    Test Steps:
        - Lock target host
        - Verify lock succeeded and vms status unchanged
        - Repeat above steps if more than one target host

    Test Teardown:
        - Delete created vms and volumes
        - Delete created flavors
        - Unlock locked target host(s)

    """
    storage_backing, host = target_hosts
    vms_num = 5
    vm_helper.ensure_vms_quotas(vms_num=vms_num)

    LOG.tc_step("Boot {} vms with various storage settings".format(vms_num))
    vms = vm_helper.boot_vms_various_types(cleanup='function',
                                           vms_num=vms_num,
                                           storage_backing=storage_backing,
                                           target_host=host)

    LOG.tc_step("Attempt to lock target host {}...".format(host))
    HostsToRecover.add(host)
    host_helper.lock_host(host=host, check_first=False, fail_ok=False,
                          swact=True)

    LOG.tc_step("Verify lock succeeded and vms still in good state")
    vm_helper.wait_for_vms_values(vms=vms, fail_ok=False)
    for vm in vms:
        vm_host = vm_helper.get_vm_host(vm_id=vm)
        assert vm_host != host, "VM is still on {} after lock".format(host)

        vm_helper.wait_for_vm_pingable_from_natbox(
            vm_id=vm, timeout=VMTimeout.DHCP_RETRY)


def test_boot_various_vms():
    vm_helper.boot_vms_various_types(cleanup=None)


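# Illustrative sketch only (assumption): if per-configuration coverage is ever
# needed, the combined boot coverage above could be parametrized per storage
# backing, e.g. with pytest.mark.parametrize. The backing values below are
# examples, not taken from this module.
#
#     @pytest.mark.parametrize('storage_backing', ['local_image', 'remote'])
#     def test_boot_various_vms_per_backing(storage_backing):
#         vm_helper.boot_vms_various_types(storage_backing=storage_backing,
#                                          cleanup='function')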