def ceph_precheck():
    """
    Run test pre-checks before running CEPH tests.
    """
    LOG.info('Verify the health of the CEPH cluster')
    storage_helper.is_ceph_healthy()
def ceph_backend_installed():
    ceph_info = get_ceph_backend_info()
    if not ceph_info:
        skip("No ceph system installed in the lab")

    ceph_healthy = storage_helper.is_ceph_healthy()
    if not ceph_healthy:
        skip("Ceph health not OK")

    return ceph_info
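# The fixtures above, and several tests below, hand-roll "poll
# storage_helper.is_ceph_healthy() until a deadline" loops.  The function
# below is a minimal sketch of a reusable wait helper; the name
# wait_for_ceph_healthy and the default timeouts are illustrative assumptions,
# not part of the existing helper modules.
def wait_for_ceph_healthy(con_ssh=None, timeout=40, check_interval=5):
    """Poll ceph health until it reports healthy or the timeout expires.

    Returns True if ceph became healthy within <timeout> seconds, else False.
    """
    end_time = time.time() + timeout
    while time.time() < end_time:
        if storage_helper.is_ceph_healthy(con_ssh):
            return True
        time.sleep(check_interval)
    return False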
def test_apply_storage_profile_negative(create_storage_profile, personality):
    if personality == 'controller':
        host_name = system_helper.get_standby_controller_name()
        assert host_name, "No standby controller available on system"
    else:
        host_name = host_helper.get_up_hypervisors()[0]

    # For storage systems, skip the test if ceph isn't healthy
    if len(system_helper.get_storage_nodes()) > 0:
        ceph_healthy = storage_helper.is_ceph_healthy()
        if not ceph_healthy:
            skip('Skipping due to ceph not being healthy')

    profile_name = create_storage_profile['profile_name']
    origin_disk_num = create_storage_profile['disk_num']
    disks_num = len(storage_helper.get_host_disks(host_name, 'device_node'))

    expt_err_list = [
        "Please check if host's disks match profile criteria",
        "Failed to create storage function. Host personality must be 'storage'",
    ]
    if disks_num < origin_disk_num - 1:
        expt_err_list.append("profile has more disks than host does")

    positional_arg = host_name + ' ' + profile_name

    HostsToRecover.add(host_name)
    host_helper.lock_host(host_name, swact=True)
    exitcode, output = cli.system('host-apply-storprofile', positional_arg, fail_ok=True)
    host_helper.unlock_host(host_name)

    assert exitcode == 1 and any(expt in output for expt in expt_err_list), \
        "Unexpected exit code or error message: {} {}".format(exitcode, output)
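# test_apply_storage_profile_negative() receives 'personality' and the
# 'create_storage_profile' fixture from the surrounding module (not shown
# here).  The commented sketch below illustrates one way such parametrization
# could be declared with pytest; the decorator values and the fixture body are
# assumptions for illustration only and do not reproduce the module's actual
# fixtures.
#
#     @fixture(scope='module', params=['controller', 'compute'])
#     def personality(request):
#         return request.param
#
#     @fixture(scope='module')
#     def create_storage_profile(personality):
#         # create a profile on a reference host and return its description,
#         # e.g. {'profile_name': <name>, 'disk_num': <number of disks>}
#         ...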
def _test_storage_profile(personality, from_backing, to_backing):
    """
    This test creates a storage profile and then applies it to a node with
    identical hardware, assuming one exists.

    Storage profiles do not apply on controller nodes.  Storage profiles can
    be applied on controller+compute nodes, compute nodes and storage nodes.

    Arguments:
    - personality (string) - controller, compute or storage
    - from_backing (string) - image, remote or None
    - to_backing (string) - image, remote or None

    Test Steps:
    1.  Query system and determine which nodes have compatible hardware.
    2.  Create a storage profile on one of those nodes
    3.  Apply the created storage profile on a compatible node*
    4.  Ensure the storage profiles have been successfully applied.

    * If the node is a compute node or a controller+compute, we will also
      change the backend if required for additional coverage.

    Returns:
    - Nothing
    """
    global PROFILES_TO_DELETE
    PROFILES_TO_DELETE = []

    # Skip if test is not applicable to hardware under test
    if personality == 'controller' and not system_helper.is_aio_system():
        skip("Test does not apply to controller hosts without subtype compute")

    hosts = system_helper.get_hosts(personality=personality)
    if not hosts:
        skip("No hosts of type {} available".format(personality))

    if (from_backing == "remote" or to_backing == "remote") and not \
            system_helper.is_storage_system():
        skip("This test doesn't apply to systems without storage hosts")

    LOG.tc_step("Identify hardware compatible hosts")
    hash_to_hosts = get_hw_compatible_hosts(hosts)

    # Pick the hardware group that has the most compatible hosts
    current_size = 0
    candidate_hosts = []
    for value in hash_to_hosts:
        candidate_size = len(hash_to_hosts[value])
        if candidate_size > current_size:
            current_size = candidate_size
            candidate_hosts = hash_to_hosts[value]
    LOG.info("This is the total set of candidate hosts: {}".format(candidate_hosts))

    if len(candidate_hosts) < 2:
        skip("Insufficient hardware compatible hosts to run test")

    # Rsync lab setup dot files between controllers
    con_ssh = ControllerClient.get_active_controller()
    _rsync_files_to_con1(con_ssh=con_ssh, file_to_check="force.txt")

    # Take the hardware compatible hosts and check if any of them already have
    # the backend that we want.  This will save us test time.
    new_to_backing = None
    if personality == "compute":
        from_hosts = []
        to_hosts = []
        for host in candidate_hosts:
            host_backing = host_helper.get_host_instance_backing(host)
            if host_backing == from_backing:
                from_hosts.append(host)
            elif host_backing == to_backing:
                to_hosts.append(host)
            else:
                pass
        LOG.info("Candidate hosts that already have the right from backing {}: {}".format(
            from_backing, from_hosts))
        LOG.info("Candidate hosts that already have the right to backing {}: {}".format(
            to_backing, to_hosts))

        # Determine what hosts to use
        if not from_hosts and to_hosts:
            to_host = random.choice(to_hosts)
            candidate_hosts.remove(to_host)
            from_host = random.choice(candidate_hosts)
        elif not to_hosts and from_hosts:
            from_host = random.choice(from_hosts)
            candidate_hosts.remove(from_host)
            to_host = random.choice(candidate_hosts)
        elif not to_hosts and not from_hosts:
            to_host = random.choice(candidate_hosts)
            candidate_hosts.remove(to_host)
            from_host = random.choice(candidate_hosts)
        else:
            to_host = random.choice(to_hosts)
            from_host = random.choice(from_hosts)

        LOG.info("From host is: {}".format(from_host))
        LOG.info("To host is: {}".format(to_host))

        LOG.tc_step("Check from host backing and convert to {} if necessary".format(from_backing))
        host_helper.set_host_storage_backing(from_host, from_backing)
        system_helper.wait_for_host_values(from_host,
                                           availability=HostAvailState.AVAILABLE,
                                           timeout=120, fail_ok=False)

        LOG.tc_step("Check to host backing and convert to {} if necessary".format(to_backing))
        new_to_backing = host_helper.set_host_storage_backing(to_host, to_backing)
    elif personality == "controller":
        # For now, we don't want to host reinstall controller-0 since it will
        # default to pxeboot, but this could be examined as a possible
        # enhancement.
        from_host = "controller-0"
        to_host = "controller-1"

        LOG.info("From host is: {}".format(from_host))
        LOG.info("To host is: {}".format(to_host))

        LOG.tc_step("Check from host backing and convert to {} if necessary".format(from_backing))
        host_helper.set_host_storage_backing(from_host, from_backing)

        LOG.tc_step("Check to host backing and convert to {} if necessary".format(to_backing))
        new_to_backing = host_helper.set_host_storage_backing(to_host, to_backing)
    else:
        # Backing doesn't apply to storage nodes so just pick from compatible
        # hardware
        from_host = random.choice(candidate_hosts)
        candidate_hosts.remove(from_host)
        to_host = random.choice(candidate_hosts)

    LOG.tc_step("Create storage and interface profiles on the from host {}".format(from_host))
    prof_name = 'storprof_{}_{}'.format(
        from_host, time.strftime('%Y%m%d_%H%M%S', time.localtime()))
    storage_helper.create_storage_profile(from_host, profile_name=prof_name)
    PROFILES_TO_DELETE.append(prof_name)

    # Deleting VMs in case the remaining host(s) cannot handle all VMs
    # migrating on lock, particularly important in the case of AIO-DX systems.
    LOG.tc_step("Delete all VMs and lock the host before applying the storage profile")
    vm_helper.delete_vms()
    HostsToRecover.add(to_host, scope='function')
    system_helper.wait_for_host_values(from_host,
                                       availability=HostAvailState.AVAILABLE,
                                       timeout=120, fail_ok=False)
    system_helper.wait_for_host_values(to_host,
                                       availability=HostAvailState.AVAILABLE,
                                       timeout=120, fail_ok=False)

    # Negative test #1 - attempt to apply profile on unlocked host (should be rejected)
    LOG.tc_step('Apply the storage-profile {} onto unlocked host:{}'.format(prof_name, to_host))
    cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
    rc, msg = cli.system(cmd, fail_ok=True)
    assert rc != 0, msg

    host_helper.lock_host(to_host, swact=True)

    # 3 conditions to watch for: no partitions, ready partitions and in-use
    # partitions on the compute.  If in-use, delete and freshly install host.
    # If ready, delete all ready partitions to make room for potentially new
    # partitions.  If no partitions, just delete nova-local lvg.
    if personality == "compute":
        # Negative test #2 - attempt to apply profile onto host with existing
        # nova-local (should be rejected)
        LOG.tc_step('Apply the storage-profile {} onto host with existing nova-local:{}'.format(
            prof_name, to_host))
        cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
        rc, msg = cli.system(cmd, fail_ok=True)
        assert rc != 0, msg

        # If we were simply switching backing (without applying a storage
        # profile), the nova-local lvg deletion can be omitted according to
        # design
        LOG.tc_step("Delete nova-local lvg on to host {}".format(to_host))
        cli.system("host-lvg-delete {} nova-local".format(to_host))

        in_use = storage_helper.get_host_partitions(to_host, "In-Use")
        if in_use:
            # Negative test #3 - attempt to apply profile onto host with
            # existing in-use partitions (should be rejected)
            LOG.tc_step('Apply the storage-profile {} onto host with existing in-use '
                        'partitions:{}'.format(prof_name, to_host))
            cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
            rc, msg = cli.system(cmd, fail_ok=True)
            assert rc != 0, msg

            LOG.tc_step("In-use partitions found. Must delete the host and freshly install "
                        "before proceeding.")
            LOG.info("Host {} has in-use partitions {}".format(to_host, in_use))
            lab = InstallVars.get_install_var("LAB")
            lab.update(create_node_dict(lab['compute_nodes'], 'compute'))
            lab['boot_device_dict'] = create_node_boot_dict(lab['name'])
            install_helper.open_vlm_console_thread(to_host)

            LOG.tc_step("Delete the host {}".format(to_host))
            cli.system("host-bulk-export")
            cli.system("host-delete {}".format(to_host))
            assert len(system_helper.get_controllers()) > 1, "Host deletion failed"

            cli.system("host-bulk-add hosts.xml")
            system_helper.wait_for_host_values(to_host, timeout=6000,
                                               availability=HostAvailState.ONLINE)

            wait_for_disks(to_host)

        ready = storage_helper.get_host_partitions(to_host, "Ready")
        if ready:
            LOG.tc_step("Ready partitions have been found. Must delete them before profile "
                        "application")
            LOG.info("Host {} has Ready partitions {}".format(to_host, ready))
            for uuid in reversed(ready):
                storage_helper.delete_host_partition(to_host, uuid)
            # Don't bother restoring in this case since the system should be
            # functional after profile is applied.

        LOG.tc_step('Apply the storage-profile {} onto host:{}'.format(prof_name, to_host))
        cli.system('host-apply-storprofile {} {}'.format(to_host, prof_name))

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

        to_host_backing = host_helper.get_host_instance_backing(to_host)
        LOG.info("To host backing was {} and is now {}".format(new_to_backing, to_host_backing))
        assert to_host_backing == from_backing, \
            "Host backing was not changed on storage profile application"

    if personality == "storage":
        if not storage_helper.is_ceph_healthy():
            skip("Cannot run test when ceph is not healthy")

        LOG.tc_step("Delete the host {}".format(to_host))
        cli.system("host-bulk-export")
        cli.system("host-delete {}".format(to_host))
        cli.system("host-bulk-add hosts.xml")
        system_helper.wait_for_host_values(to_host, timeout=6000,
                                           availability=HostAvailState.ONLINE)

        wait_for_disks(to_host)

        LOG.tc_step('Apply the storage-profile {} onto host:{}'.format(prof_name, to_host))
        cli.system('host-apply-storprofile {} {}'.format(to_host, prof_name))

        # Re-provision interfaces through lab_setup.sh
        LOG.tc_step("Reprovision the host as necessary")
        files = ['interfaces']
        con_ssh = ControllerClient.get_active_controller()
        delete_lab_setup_files(con_ssh, to_host, files)

        rc, msg = install_helper.run_lab_setup()
        assert rc == 0, msg

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

    if personality == "controller":
        # Note, install helper doesn't work on all labs.  Some labs don't
        # display BIOS type which causes install helper to fail
        lab = InstallVars.get_install_var("LAB")
        lab.update(create_node_dict(lab['controller_nodes'], 'controller'))
        lab['boot_device_dict'] = create_node_boot_dict(lab['name'])
        install_helper.open_vlm_console_thread(to_host)

        LOG.tc_step("Delete the host {}".format(to_host))
        cli.system("host-bulk-export")
        cli.system("host-delete {}".format(to_host))
        assert len(system_helper.get_controllers()) > 1, "Host deletion failed"

        cli.system("host-bulk-add hosts.xml")
        system_helper.wait_for_host_values(to_host, timeout=6000,
                                           availability=HostAvailState.ONLINE)

        wait_for_disks(to_host)

        LOG.tc_step("Apply the storage-profile {} onto host:{}".format(prof_name, to_host))
        cli.system("host-apply-storprofile {} {}".format(to_host, prof_name))

        # Need to re-provision everything on node through lab_setup (except storage)
        LOG.tc_step("Reprovision the host as necessary")
        files = ['interfaces', 'cinder_device', 'vswitch_cpus', 'shared_cpus',
                 'extend_cgts_vg', 'addresses']
        con_ssh = ControllerClient.get_active_controller()
        delete_lab_setup_files(con_ssh, to_host, files)

        rc, msg = install_helper.run_lab_setup()
        assert rc == 0, msg

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

        to_host_backing = host_helper.get_host_instance_backing(to_host)
        LOG.info("To host backing was {} and is now {}".format(new_to_backing, to_host_backing))
        assert to_host_backing == from_backing, \
            "Host backing was not changed on storage profile application"
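# _test_storage_profile() has a leading underscore, which suggests it is
# driven by parametrized wrapper tests defined elsewhere in the module.  The
# sketch below shows one illustrative way such a wrapper could look, assuming
# `mark` is imported from pytest; the parameter combinations are examples
# drawn from the docstring (image/remote/None), not the module's actual
# parametrization.
@mark.parametrize(('personality', 'from_backing', 'to_backing'), [
    ('compute', 'image', 'remote'),
    ('compute', 'remote', 'image'),
    ('storage', None, None),
])
def test_storage_profile(personality, from_backing, to_backing):
    _test_storage_profile(personality, from_backing, to_backing)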
def test_storgroup_semantic_checks():
    """
    This test validates CEPH semantic checks as it applies to storage nodes in
    a replication group.

    Args:
    - None

    Setup:
    - Requires a system with storage nodes (minimum of 2)
    - Requires TiS Release 3 and up

    Test Steps:
    1.  Lock one storage node in a storage node pair
    2.  Check the appropriate alarms are raised
    3.  Check OSDs are down on the storage node
    4.  Check that CEPH is no longer healthy
    5.  Attempt to lock the other node and ensure it is rejected
    6.  Attempt to force lock the other node and ensure it is rejected
    7.  If the storage node is a storage monitor, attempt to lock and force
        lock the controllers
    8.  Unlock the storage node in the storage node pair
    9.  Check that the alarms are cleared
    10. Check that OSDs are up
    11. Check that CEPH is healthy

    Defects this addresses:
    1.  CGTS-4286 Unexpected allowing lock action on storage node peergroup
        when redundancy lost
    2.  CGTS-3494 Some OSDs observed to be up on locked storage node
    3.  CGTS-3643 Able to lock standby controller despite only two CEPH
        monitors being available
    4.  CGTS-2690 Storage: Force locking a controller should be rejected when
        storage is locked.
    """
    con_ssh = ControllerClient.get_active_controller()

    table_ = table_parser.table(cli.system('storage-backend-show ceph-store')[1])
    capabilities = table_parser.get_value_two_col_table(table_, 'capabilities')
    replication_factor = capabilities[1]
    LOG.info("The replication factor is: {}".format(replication_factor))

    # We want to test storage-0 since it is a ceph monitor.
    # Then we want to test another storage host in another group.  The choice
    # depends on the replication factor.
    storage_nodes = ["storage-0"]
    if replication_factor == "3":
        storage_nodes.append("storage-3")
    if replication_factor == "2" and len(storage_nodes) > 2:
        storage_nodes.append("storage-2")

    LOG.info("Storage hosts under test are: {}".format(storage_nodes))

    for host in storage_nodes:
        LOG.tc_step('Lock {}:'.format(host))
        HostsToRecover.add(host, scope='function')
        rtn_code, out = host_helper.lock_host(host)
        assert rtn_code == 0, out

        LOG.tc_step("Verify CEPH cluster health reflects the OSD being down")
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy, "ceph is unexpectedly still healthy"

        LOG.tc_step('Check that alarms are raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that the ceph health warning alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        hosts = []
        if host == 'storage-0':
            hosts.append('controller-0')
            hosts.append('controller-1')

        for node in hosts:
            LOG.tc_step('Attempt to lock the {}'.format(node))
            HostsToRecover.add(node)
            rtn_code, out = host_helper.lock_host(node, fail_ok=True)
            assert 1 == rtn_code, out

            LOG.tc_step('Attempt to force lock {}'.format(node))
            rtn_code, out = host_helper.lock_host(node, force=True, fail_ok=True)
            assert 1 == rtn_code, out

        LOG.tc_step('Unlock storage host {}'.format(host))
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        LOG.info("Check if alarms have cleared")
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert ceph_healthy, "ceph is not healthy"

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg
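# The test above reads the replication factor as capabilities[1], which only
# works if the helper returns a pre-split sequence in a fixed order.  If the
# capabilities cell instead comes back as a dict-literal string (e.g.
# "{'replication': '2', 'min_replication': '1'}"), a more explicit parse could
# look like the sketch below.  The exact return type of
# table_parser.get_value_two_col_table() is an assumption here, and
# parse_replication_factor is an illustrative name, not an existing keyword.
import ast

def parse_replication_factor(capabilities_cell):
    """Return the ceph replication factor from a storage-backend capabilities cell."""
    if isinstance(capabilities_cell, str):
        capabilities_cell = ast.literal_eval(capabilities_cell)
    return str(capabilities_cell.get('replication'))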
def test_lock_cont_check_mon_down():
    """
    This test is adapted from us69932_tc3_ceph_mon_maintenance_operations from
    us69932_ceph_monitoring.odt

    The goal of this test is to check that we alarm when a CEPH monitor goes
    down.  This test is specifically for controller hosts.

    Args:
    - None

    Setup:
    - Requires system with storage nodes

    Test Steps:
    1.  Lock controller node
    2.  Check
        - CEPH cluster is in HEALTH_WARN
        - Ensure all OSDs stay up
        - Check that the appropriate alarms are raised:
          - controller-X is locked
          - ceph mon down
    3.  Unlock controller node
        - ensure CEPH is HEALTH_OK
        - Check that alarms are cleared

    Enhancements:
    1.  Should we do both controllers?  This will require a swact.
    """
    con_ssh = ControllerClient.get_active_controller()

    host = system_helper.get_standby_controller_name()
    LOG.tc_step('Lock standby controller node {}'.format(host))
    HostsToRecover.add(host, scope='function')
    rtn_code, out = host_helper.lock_host(host)
    assert rtn_code == 0, out

    LOG.tc_step('Check that storage degrade alarm is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
        "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
        "Alarm {} not raised".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check OSDs are still up after lock')
    osd_list = storage_helper.get_osds(con_ssh=con_ssh)
    for osd_id in osd_list:
        osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
        msg = 'OSD ID {} should be up but is not'.format(osd_id)
        assert osd_up, msg
        msg = 'OSD ID {} is up'.format(osd_id)
        LOG.info(msg)

    LOG.tc_step('Unlock standby controller node {}'.format(host))
    rtn_code, out = host_helper.unlock_host(host, available_only=True)
    assert rtn_code == 0, out

    LOG.tc_step('Check that the host locked alarm is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
        "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check that the Storage Alarm Condition is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
        "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check health of CEPH cluster')
    end_time = time.time() + 40
    while time.time() < end_time:
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        if ceph_healthy:
            break
    else:
        assert 0, "ceph is not healthy"
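# With the wait_for_ceph_healthy() sketch introduced earlier in this module
# (an assumed helper, not part of the framework keywords), the closing health
# poll in the test above could be reduced to a single call:
#
#     assert wait_for_ceph_healthy(con_ssh, timeout=40), "ceph is not healthy"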
def test_lock_stor_check_osds_down(stx_openstack_required, host):
    """
    This test is adapted from us69932_tc3_ceph_mon_maintenance_operations from
    us69932_ceph_monitoring.odt

    The goal of this test is to check that all OSDs go down on a locked
    storage node.  There are two variants:

    1.  Lock 'storage-0' which is a ceph monitor
    2.  Lock a storage node that is not 'storage-0', i.e. not a ceph monitor

    Args:
    - None

    Setup:
    - Requires system with storage nodes

    Test Steps:
    1.  Lock storage node
    2.  Check
        - CEPH cluster is in HEALTH_WARN
        - Ensure all OSDs on the locked storage node are down
        - Check that the appropriate alarms are raised:
    3.  Unlock storage node
        - ensure CEPH is HEALTH_OK
        - ensure all OSDs on unlocked node are up
        - Check that alarms are cleared

    Note: If the storage node to be locked is a monitor, we also expect to see
    the mon down alarm.

    What defects this addresses:
    1.  CGTS-2609 - Ceph processes fail to start after storage node reboot

    Notes:
    - Updated test to write to disk to add I/O load on system
    """
    con_ssh = ControllerClient.get_active_controller()

    if host == 'any':
        storage_nodes = system_helper.get_hosts(personality='storage')
        LOG.info('System has {} storage nodes:'.format(storage_nodes))
        storage_nodes.remove('storage-0')
        node_id = random.randint(0, len(storage_nodes) - 1)
        host = storage_nodes[node_id]

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")

    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event, expect_timeout=40)
            vm_threads.append(vm_thread)

        LOG.tc_step('Lock storage node {}'.format(host))
        HostsToRecover.add(host)
        host_helper.lock_host(host, check_first=False)

        LOG.tc_step('Determine the storage group for host {}'.format(host))
        storage_group, msg = storage_helper.get_storage_group(host)
        LOG.info(msg)

        LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK,
                                            entity_id=host, strict=False)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that ceph is in health warn')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        # We're waiting 5 minutes for ceph rebalancing to be performed.
        # DO NOT REMOVE.  This is part of the test.
        time.sleep(300)

        LOG.tc_step('Unlock storage node')
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        health = False
        end_time = time.time() + 40
        while time.time() < end_time:
            health = storage_helper.is_ceph_healthy(con_ssh)
            if health is True:
                break
        assert health, "Ceph did not become healthy"

        LOG.tc_step('Check that host lock alarm is cleared when {} is unlocked'.format(host))
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host,
                                                 strict=False), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that the replication group alarm is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that the Storage Alarm Condition is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg

        LOG.tc_step('Check health of CEPH cluster')
        end_time = time.time() + 40
        while time.time() < end_time:
            ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
            if ceph_healthy is True:
                break

        for vm_thread in vm_threads:
            assert vm_thread.res is True, "Writing in vm stopped unexpectedly"
    finally:
        # wait_for_thread_end needs to be called even if the test failed in
        # the middle, otherwise the thread will not end
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()
def test_ceph_reboot_storage_node(stx_openstack_required):
    """
    us69932_tc2_ceph_mon_process_kill from us69932_ceph_monitoring.odt

    Verify that ceph mon processes recover when they are killed on storage
    nodes.

    Args:
    - Nothing

    Setup:
    - Requires system with storage nodes

    Test Steps:
    0.  Run CEPH pre-check fixture to check:
        - system has storage nodes
        - health of the ceph cluster is okay
        - that we have OSDs provisioned
    1.  Delete existing VMs
    2.  Boot new VMs and run dd on them
    3.  Reboot storage node and ensure both:
        - mon state goes down (if storage-0)
        - OSD state goes down
    4.  Ensure mon and OSD state recover afterwards
    5.  Cleanup VMs

    Potential rework:
    1.  Add the alarm checks for raise and clear
    2.  Maybe we don't want to reboot all storage nodes

    What defects this addresses:
    1.  CGTS-2975

    Update: This test was updated for the Storage and Robustness feature.
    """
    con_ssh = ControllerClient.get_active_controller()

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")

    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event, expect_timeout=40)
            vm_threads.append(vm_thread)

        storage_nodes = system_helper.get_storage_nodes(con_ssh)

        for host in storage_nodes:
            LOG.tc_step('Reboot {}'.format(host))
            HostsToRecover.add(host, scope='function')
            host_helper.reboot_hosts(host, wait_for_offline=True,
                                     wait_for_reboot_finish=False)

            LOG.tc_step('Check health of CEPH cluster')
            ceph_healthy = True
            end_time = time.time() + 10
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if not ceph_healthy:
                    break
            assert not ceph_healthy, "ceph is unexpectedly still healthy"

            LOG.tc_step('Check that OSDs are down')
            osd_list = storage_helper.get_osds(host, con_ssh)
            all_osds_up = True
            up_list = osd_list.copy()
            end_time = time.time() + 60
            while time.time() < end_time and all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if not osd_up:
                        msg = 'OSD ID {} is down as expected'.format(osd_id)
                        LOG.info(msg)
                        up_list.remove(osd_id)
                if len(up_list) > 0:
                    osd_list = up_list.copy()
                else:
                    msg = 'All OSDs are down as expected'
                    LOG.info(msg)
                    all_osds_up = False

            assert not all_osds_up, \
                "One or more OSD(s) {} is(are) up but should be down".format(up_list)

            system_helper.wait_for_host_values(host, availability='available')

            LOG.tc_step('Check that OSDs are up')
            osd_list = storage_helper.get_osds(host, con_ssh)
            down_list = osd_list.copy()
            all_osds_up = False
            end_time = time.time() + 60
            while time.time() < end_time and not all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if osd_up:
                        msg = 'OSD ID {} is up as expected'.format(osd_id)
                        LOG.info(msg)
                        down_list.remove(osd_id)
                if len(down_list) > 0:
                    osd_list = down_list.copy()
                else:
                    msg = 'All OSDs are up as expected'
                    LOG.info(msg)
                    all_osds_up = True

            assert all_osds_up, \
                "One or more OSD(s) {} is(are) down but should be up".format(down_list)

            LOG.tc_step('Check health of CEPH cluster')
            end_time = time.time() + 40
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if ceph_healthy is True:
                    break
            assert ceph_healthy, "ceph is not healthy"

        for vm_thread in vm_threads:
            assert vm_thread.res is True, "Writing in vm stopped unexpectedly"
    finally:
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()
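# The reboot test above polls per-OSD state with two nearly identical loops
# (wait for all OSDs down, then wait for all OSDs up).  The function below is
# a minimal consolidated sketch of that pattern; wait_for_osds_state is an
# assumed helper name, not an existing keyword, and it reuses only calls
# already present in this module.
def wait_for_osds_state(host, con_ssh, expect_up, timeout=60, check_interval=5):
    """Wait until every OSD on <host> reports the expected up/down state.

    Returns (True, []) on success, or (False, pending) where pending lists the
    OSD ids that never reached the expected state before the timeout.
    """
    pending = list(storage_helper.get_osds(host, con_ssh))
    end_time = time.time() + timeout
    while time.time() < end_time and pending:
        # Keep only the OSDs that have not yet reached the expected state
        pending = [osd_id for osd_id in pending
                   if storage_helper.is_osd_up(osd_id, con_ssh) != expect_up]
        if pending:
            time.sleep(check_interval)
    return (not pending), pending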