def check_server_group_messaging_enabled(vms, action):
    vms = list(set(vms))
    vm_sender = random.choice(vms)
    vms.remove(vm_sender)

    if action == 'message':
        msg = MSG
        timeout = 180
    elif action == 'pause':
        msg = '{}.*paused'.format(vm_sender)
        timeout = 240
    else:
        raise ValueError("Unknown action - '{}' provided".format(action))

    res_events = []
    sent_event = Events("srv msg/event triggered")
    listener_event = Events("VM started listening to server group messages")
    vm_threads = []
    sender_thread = None

    try:
        for vm in vms:
            listener_event.clear()
            new_thread = MThread(_wait_for_srv_grp_msg, vm, msg,
                                 timeout=timeout, res_events=res_events,
                                 listener_event=listener_event,
                                 sent_event=sent_event)
            new_thread.start_thread(timeout=timeout + 30)
            vm_threads.append(new_thread)
            listener_event.wait_for_event()

        time.sleep(5)
        # The 60-second timeout is hardcoded for the action == 'message'
        # scenario to send the message out
        sender_thread = MThread(trigger_srv_grp_msg, vm_sender, action,
                                timeout=60, sent_event=sent_event,
                                rcv_event=res_events)
        sender_thread.start_thread(timeout=timeout)

        sent_event.wait_for_event()
        for res_event in res_events:
            res_event.wait_for_event()

    finally:
        # Wait for the server group msg to be received
        for vm_thr in vm_threads:
            vm_thr.wait_for_thread_end(timeout=30)

        if sender_thread:
            sender_thread.wait_for_thread_end(timeout=30)
            if action == 'pause':
                vm_helper.unpause_vm(vm_sender)
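
# trigger_srv_grp_msg is called above but defined elsewhere in the repo. A
# minimal sketch of what it is assumed to do, based on how it is called here
# (the server_group_app invocation and the pause path are illustrative
# assumptions, not the repo's actual implementation):
def trigger_srv_grp_msg(vm_id, action, timeout=60, sent_event=None,
                        rcv_event=None):
    # rcv_event is accepted to mirror the call site; the real helper may use
    # it to coordinate with the receiving threads.
    if action == 'pause':
        # Pausing the vm generates a server group notification on its peers
        vm_helper.pause_vm(vm_id)
    else:
        # Broadcast the message from inside the sender vm
        with vm_helper.ssh_to_vm_from_natbox(vm_id) as vm_ssh:
            vm_ssh.exec_cmd('server_group_app "{}"'.format(MSG))
    if sent_event:
        sent_event.set()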
def check_port_forwarding_protocol(ext_gateway_ip, nat_ssh, vm_pfs,
                                   vm_ssh_pfs, protocol):
    vm_threads = []
    end_event = Events("Hello msg sent to ports")
    start_events = []
    received_events = []

    try:
        LOG.tc_step("Start listening on vms {} ports .... ".format(protocol))
        for vm_id_, v in vm_pfs.items():
            greeting = "Hello {}".format(v['public_port'])
            ssh_public_port = vm_ssh_pfs[vm_id_]['public_port']
            start_event = Events("VM {} started listening".format(vm_id_))
            start_events.append(start_event)
            received_event = Events(
                "Greeting received on vm {}".format(vm_id_))
            received_events.append(received_event)
            thread_vm = MThread(check_ssh_to_vm_and_wait_for_packets,
                                start_event, end_event, received_event,
                                vm_id_, ext_gateway_ip, ssh_public_port,
                                greeting, protocol)
            thread_vm.start_thread()
            vm_threads.append(thread_vm)

        for event_ in start_events:
            event_.wait_for_event(timeout=180, fail_ok=False)

        diff_protocol = 'udp' if protocol == 'tcp' else 'tcp'
        LOG.tc_step("Send Hello msg to vms from NATBox via {} ports, and "
                    "check they are not received via {} ports"
                    .format(diff_protocol, protocol))
        for vm_id_, v in vm_pfs.items():
            greeting = "Hello {}".format(v['public_port'])
            send_packets_to_vm_from_nat_box(nat_ssh, ext_gateway_ip,
                                            v['public_port'], greeting,
                                            diff_protocol)

        time.sleep(10)
        for event in received_events:
            assert not event.is_set(), "Event {} is set".format(event)

        LOG.tc_step("Send Hello msg to vms from NATBox via {} ports, and "
                    "check they are received".format(protocol))
        for vm_id_, v in vm_pfs.items():
            greeting = "Hello {}".format(v['public_port'])
            send_packets_to_vm_from_nat_box(nat_ssh, ext_gateway_ip,
                                            v['public_port'], greeting,
                                            protocol)

        time.sleep(10)
        for event in received_events:
            assert event.wait_for_event(timeout=40, fail_ok=False), \
                "Event {} is not set".format(event)

    finally:
        end_event.set()
        for thread in vm_threads:
            thread.wait_for_thread_end(timeout=40, fail_ok=True)
def _wait_for_srv_grp_msg(vm_id, msg, timeout, res_events, listener_event,
                          sent_event):
    with vm_helper.ssh_to_vm_from_natbox(vm_id, retry_timeout=60) as vm_ssh:
        vm_ssh.send('server_group_app')
        # vm_ssh.expect('\r\n\r\n', timeout=1, searchwindowsize=100)
        listener_event.set()
        sent_event.wait_for_event()

        received_event = Events(
            "Server group message received on VM {}".format(vm_id))
        res_events.append(received_event)
        end_time = time.time() + timeout
        while time.time() < end_time:
            code = vm_ssh.expect('\r\n\r\n', fail_ok=True, timeout=timeout)
            if code < 0:
                assert False, ("No more server group notification received. "
                               "Expected msg not found.")

            current_output = vm_ssh.cmd_output
            if re.search(msg, current_output):
                received_event.set()
                vm_ssh.send_control('c')
                vm_ssh.expect(searchwindowsize=100, timeout=5)
                break
        else:
            assert False, "Expected msg did not appear within timeout"
def test_events():
    e = Events("functions should wait here")

    LOG.tc_step("Create multiple threads")
    thread_1 = MThread(events_func, 1, 10, e)
    thread_2 = MThread(events_func, 2, 15, e)
    thread_1.start_thread(60)
    thread_2.start_thread(60)

    sleep(20)
    LOG.tc_step("Setting event")
    e.set()

    thread_1.wait_for_thread_end()
    thread_2.wait_for_thread_end()
    LOG.tc_step("Threads have finished")

    e.clear()
    e.wait_for_event(20, fail_ok=True)
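
# events_func is not shown in this module. A minimal sketch of the worker
# the test is assumed to drive, based on how it is called (thread number,
# a per-thread wait timeout in seconds, and the shared Events object); the
# names and logging here are illustrative assumptions:
def events_func(num, timeout, event):
    # Block until the main thread sets the shared event, or until the
    # per-thread timeout expires (fail_ok since the 10s/15s waits can lapse
    # before the event is set at the 20s mark; the test only checks that
    # both threads terminate)
    LOG.info("Thread {} waiting up to {}s for event".format(num, timeout))
    event.wait_for_event(timeout=timeout, fail_ok=True)
    LOG.info("Thread {} done waiting".format(num))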
def test_swact_100_times():
    """
    Skip Condition:
        - Less than two controllers on system

    Test Steps:
        - Boot a vm and ensure it's pingable
        - Start writing from pre-existed vm before swacting
        - Repeat following steps 100 times:
            - ensure system has standby controller
            - system host-swact
            - ensure all services are active in sudo sm-dump on new active controller
            - ensure pre-existed vm is still pingable from NatBox
            - ensure writing did not stop on pre-existed vm
            - ensure new vm can be launched in 2 minutes
            - ensure newly booted vm is pingable from NatBox
            - delete newly booted vm

    Teardown:
        - delete vms, volumes
    """
    if len(system_helper.get_controllers()) < 2:
        skip("Less than two controllers on system")

    if not system_helper.get_standby_controller_name():
        assert False, "No standby controller on system"

    LOG.tc_step("Boot a vm and ensure it's pingable")
    vm_base = vm_helper.boot_vm(name='pre_swact', cleanup='function')[1]

    LOG.tc_step("Start writing from pre-existed vm before swacting")
    end_event = Events("End write in base vm")
    base_vm_thread = vm_helper.write_in_vm(vm_base, end_event=end_event,
                                           expect_timeout=40,
                                           thread_timeout=60 * 100)

    try:
        for i in range(100):
            iter_str = "Swact iter{}/100 - ".format(i + 1)

            LOG.tc_step("{}Ensure system has standby controller".format(iter_str))
            standby = system_helper.get_standby_controller_name()
            assert standby

            LOG.tc_step("{}Swact active controller and ensure active "
                        "controller is changed".format(iter_str))
            host_helper.swact_host()

            LOG.tc_step("{}Check all services are up on active controller "
                        "via sudo sm-dump".format(iter_str))
            host_helper.wait_for_sm_dump_desired_states(controller=standby,
                                                        fail_ok=False)

            LOG.tc_step("{}Ensure pre-existed vm still pingable post "
                        "swact".format(iter_str))
            vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm_base,
                                                       timeout=45)

            time.sleep(5)
            LOG.tc_step("{}Ensure writing from pre-existed vm resumes after "
                        "swact".format(iter_str))
            assert base_vm_thread.res is True, \
                "Writing in pre-existed vm stopped after {}".format(
                    iter_str.lower())

            LOG.tc_step("{}Attempt to boot new vm after 2 minutes of post "
                        "swact and ensure it's pingable".format(iter_str))
            time.sleep(60)
            for j in range(3):
                code, vm_new, msg = vm_helper.boot_vm(name='post_swact',
                                                      fail_ok=True,
                                                      cleanup='function')
                if code == 0:
                    break

                LOG.warning("VM failed to boot - attempt {}".format(j + 1))
                vm_helper.delete_vms(vms=vm_new)
                assert j < 2, "No vm can be booted 2+ minutes after swact"

                LOG.tc_step("{}VM failed to boot on attempt {}, wait for 30 "
                            "seconds and retry".format(iter_str, j + 1))
                time.sleep(30)

            vm_helper.wait_for_vm_pingable_from_natbox(vm_new)

            LOG.tc_step("{}Delete the vm created".format(iter_str))
            vm_helper.delete_vms(vms=vm_new)

    finally:
        LOG.tc_step("End the base_vm_thread")
        end_event.set()
        base_vm_thread.wait_for_thread_end(timeout=20)

    post_standby = system_helper.get_standby_controller_name()
    assert post_standby, \
        "System does not have standby controller after last swact"
def check_port_forwarding_ports(ext_gateway_ip, nat_ssh, vm_id, ssh_port,
                                old_port, new_port, protocol):
    end_event = Events("Hello msg sent to ports")
    start_event = Events("VM {} started listening".format(vm_id))
    received_event = Events("Greeting received on vm {}".format(vm_id))

    LOG.tc_step("Starting VM ssh session threads .... ")
    new_greeting = "Hello {}".format(new_port)
    vm_thread = MThread(check_ssh_to_vm_and_wait_for_packets, start_event,
                        end_event, received_event, vm_id, ext_gateway_ip,
                        ssh_port, new_greeting, protocol)
    vm_thread.start_thread()

    try:
        start_event.wait_for_event(timeout=180, fail_ok=False)

        LOG.tc_step("Send Hello msg to vm from NATBox via old {} port {}, "
                    "and check it's not received".format(protocol, old_port))
        greeting = "Hello {}".format(old_port)
        send_packets_to_vm_from_nat_box(nat_ssh, ext_gateway_ip, old_port,
                                        greeting, protocol)

        time.sleep(10)
        assert not received_event.is_set(), \
            "Event {} is set".format(received_event)

        LOG.tc_step("Check greeting is received on vm via new {} port {}"
                    .format(protocol, new_port))
        send_packets_to_vm_from_nat_box(nat_ssh, ext_gateway_ip, new_port,
                                        new_greeting, protocol)
        assert received_event.wait_for_event(timeout=30), \
            "Event {} is not set".format(received_event)

    finally:
        end_event.set()
        vm_thread.wait_for_thread_end(timeout=40, fail_ok=False)
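
# check_ssh_to_vm_and_wait_for_packets is started in threads by the two
# port-forwarding checks above but is defined elsewhere in the repo. A
# minimal sketch of the listener side it is assumed to implement (the nc
# invocation and the polling loop are illustrative assumptions):
def check_ssh_to_vm_and_wait_for_packets(start_event, end_event,
                                         received_event, vm_id, ext_ip,
                                         ssh_port, greeting, protocol):
    with vm_helper.ssh_to_vm_from_natbox(vm_id) as vm_ssh:
        # Start a simple netcat listener on the vm (udp needs the -u flag),
        # then tell the main thread we are ready for traffic
        flag = '-u ' if protocol == 'udp' else ''
        vm_ssh.send('nc {}-l {}'.format(flag, ssh_port))
        start_event.set()
        # Poll the session output for the greeting until the main thread
        # signals completion; flag received_event on a match so the main
        # thread's assertions can pass
        while not end_event.is_set():
            if vm_ssh.expect(greeting, timeout=10, fail_ok=True) == 0:
                received_event.set()
        vm_ssh.send_control('c')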
def test_lock_stor_check_osds_down(stx_openstack_required, host):
    """
    This test is adapted from us69932_tc3_ceph_mon_maintenance_operations
    from us69932_ceph_monitoring.odt

    The goal of this test is to check that all OSDs go down on a locked
    storage node. There are two variants:

    1. Lock 'storage-0' which is a ceph monitor
    2. Lock a storage node that is not 'storage-0', i.e. not a ceph monitor

    Args:
        - None

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1. Lock storage node
        2. Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs on the locked storage node are down
            - Check that the appropriate alarms are raised
        3. Unlock storage node
            - ensure CEPH is HEALTH_OK
            - ensure all OSDs on unlocked node are up
            - Check that alarms are cleared

    Note: If the storage node to be locked is a monitor, we also expect to
    see the mon down alarm.

    What defects this addresses:
        1. CGTS-2609 - Ceph processes fail to start after storage node reboot

    Notes:
        - Updated test to write to disk to add I/O load on system
    """
    con_ssh = ControllerClient.get_active_controller()

    if host == 'any':
        storage_nodes = system_helper.get_hosts(personality='storage')
        LOG.info('System has {} storage nodes:'.format(storage_nodes))
        storage_nodes.remove('storage-0')
        node_id = random.randint(0, len(storage_nodes) - 1)
        host = storage_nodes[node_id]

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")

    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event,
                                              expect_timeout=40)
            vm_threads.append(vm_thread)

        LOG.tc_step('Lock storage node {}'.format(host))
        HostsToRecover.add(host)
        host_helper.lock_host(host, check_first=False)

        LOG.tc_step('Determine the storage group for host {}'.format(host))
        storage_group, msg = storage_helper.get_storage_group(host)
        LOG.info(msg)

        LOG.tc_step('Check that host lock alarm is raised when {} is '
                    'locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK,
                                            entity_id=host,
                                            strict=False)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(
            alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that ceph is in health warn')
        assert system_helper.wait_for_alarm(
            alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        # We're waiting 5 minutes for ceph rebalancing to be performed.
        # DO NOT REMOVE. This is part of the test.
        time.sleep(300)

        LOG.tc_step('Unlock storage node')
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        health = False
        end_time = time.time() + 40
        while time.time() < end_time:
            health = storage_helper.is_ceph_healthy(con_ssh)
            if health is True:
                break
        assert health, "Ceph did not become healthy"

        LOG.tc_step('Check that host lock alarm is cleared when {} is '
                    'unlocked'.format(host))
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK,
                                                 entity_id=host,
                                                 strict=False), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that the replication group alarm is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that the Storage Alarm Condition is cleared')
        assert system_helper.wait_for_alarm_gone(
            EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = False
        end_time = time.time() + 40
        while time.time() < end_time:
            ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
            if ceph_healthy is True:
                break
        assert ceph_healthy, "Ceph did not become healthy after unlock"

        for vm_thread in vm_threads:
            assert vm_thread.res is True, \
                "Writing in vm stopped unexpectedly"

    finally:
        # wait_for_thread_end needs to be called even if the test failed in
        # the middle, otherwise the thread will not end
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

        LOG.tc_step("Delete existing VMs")
        vm_helper.delete_vms()
def test_ceph_reboot_storage_node(stx_openstack_required):
    """
    us69932_tc2_ceph_mon_process_kill from us69932_ceph_monitoring.odt

    Verify that ceph mon processes recover when they are killed on
    storage nodes.

    Args:
        - Nothing

    Setup:
        - Requires system with storage nodes

    Test Steps:
        0. Run CEPH pre-check fixture to check:
            - system has storage nodes
            - health of the ceph cluster is okay
            - that we have OSDs provisioned
        1. Delete existing VMs
        2. Boot new VMs and run dd on them
        3. Reboot storage node and ensure both:
            - mon state goes down (if storage-0)
            - OSD state goes down
        4. Ensure mon and OSD state recover afterwards
        5. Cleanup VMs

    Potential rework:
        1. Add the alarms checks for raise and clear
        2. Maybe we don't want to reboot all storage nodes

    What defects this addresses:
        1. CGTS-2975

    Update: This test was updated for the Storage and Robustness feature.
    """
    con_ssh = ControllerClient.get_active_controller()

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")

    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event,
                                              expect_timeout=40)
            vm_threads.append(vm_thread)

        storage_nodes = system_helper.get_storage_nodes(con_ssh)

        for host in storage_nodes:
            LOG.tc_step('Reboot {}'.format(host))
            HostsToRecover.add(host, scope='function')
            host_helper.reboot_hosts(host, wait_for_offline=True,
                                     wait_for_reboot_finish=False)

            LOG.tc_step('Check health of CEPH cluster')
            ceph_healthy = True
            end_time = time.time() + 10
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if not ceph_healthy:
                    break
            assert not ceph_healthy, \
                "ceph is unexpectedly healthy after rebooting {}".format(host)

            LOG.tc_step('Check that OSDs are down')
            osd_list = storage_helper.get_osds(host, con_ssh)
            all_osds_up = True
            up_list = osd_list.copy()
            end_time = time.time() + 60
            while time.time() < end_time and all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if not osd_up:
                        msg = 'OSD ID {} is down as expected'.format(osd_id)
                        LOG.info(msg)
                        up_list.remove(osd_id)
                if len(up_list) > 0:
                    osd_list = up_list.copy()
                else:
                    LOG.info(' All OSDs are down as expected')
                    all_osds_up = False
            assert not all_osds_up, \
                " One or more OSD(s) {} is(are) up but should be down".format(
                    up_list)

            system_helper.wait_for_host_values(host, availability='available')

            LOG.tc_step('Check that OSDs are up')
            osd_list = storage_helper.get_osds(host, con_ssh)
            down_list = osd_list.copy()
            all_osds_up = False
            end_time = time.time() + 60
            while time.time() < end_time and not all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if osd_up:
                        msg = 'OSD ID {} is up as expected'.format(osd_id)
                        LOG.info(msg)
                        down_list.remove(osd_id)
                if len(down_list) > 0:
                    osd_list = down_list.copy()
                else:
                    LOG.info(' All OSDs are up as expected')
                    all_osds_up = True
            assert all_osds_up, \
                " One or more OSD(s) {} is(are) down but should be up".format(
                    down_list)

            LOG.tc_step('Check health of CEPH cluster')
            end_time = time.time() + 40
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if ceph_healthy is True:
                    break
            assert ceph_healthy, "ceph did not become healthy"

        for vm_thread in vm_threads:
            assert vm_thread.res is True, \
                "Writing in vm stopped unexpectedly"

    finally:
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()
def _verify_port_from_natbox(con_ssh, port, port_expected_open):
    """
    :param con_ssh: Controller ssh client
    :param port: (number) Port to test
    :param port_expected_open: (boolean)
    """
    if ProjVar.get_var('IS_DC'):
        subcloud = ProjVar.get_var('PRIMARY_SUBCLOUD')
        lab_ip = ProjVar.get_var('LAB')[subcloud]['floating ip']
    else:
        lab_ip = ProjVar.get_var('LAB')['floating ip']

    cli.system('show', source_openrc=True, force_source=True)

    LOG.info("Check if port {} is listed in iptables".format(port))
    cmd = 'iptables -nvL | grep --color=never -w {}'.format(port)
    end_time = time.time() + 90
    while time.time() < end_time:
        output = con_ssh.exec_sudo_cmd(cmd, get_exit_code=False)[1]
        if (port_expected_open and output) or \
                (not port_expected_open and not output):
            LOG.info("Port {} is {}listed in iptables as expected".format(
                port, '' if port_expected_open else 'not '))
            break
        time.sleep(5)
    else:
        assert False, "Port {} is {}listed in iptables. ".format(
            port, 'not ' if port_expected_open else '')

    end_event = Events('Packet received')
    LOG.info("Open listener on port {}".format(port))
    listener_thread = MThread(_listen_on_port, port, end_event=end_event,
                              ssh_name=ProjVar.get_var('PRIMARY_SUBCLOUD'))
    listener_thread.start_thread(timeout=300)

    extra_str = 'succeeded' if port_expected_open else 'rejected'
    LOG.info("Verify access to port {} from natbox is {}".format(
        port, extra_str))
    try:
        wait_for_port_to_listen(con_ssh, port)
        natbox_ssh = NATBoxClient.get_natbox_client()
        end_time = time.time() + 60
        while time.time() < end_time:
            output = natbox_ssh.exec_cmd(
                "nc -v -w 2 {} {}".format(lab_ip, port),
                get_exit_code=False)[1]
            if (port_expected_open and 'succeeded' in output) or \
                    (not port_expected_open and 'succeeded' not in output):
                LOG.info("Access via port {} {} as expected".format(
                    port, extra_str))
                return

        assert False, "Access via port {} is not {}".format(port, extra_str)
    finally:
        end_event.set()
        listener_thread.wait_for_thread_end(timeout=10)
        con_ssh.send_control('c')
        con_ssh.expect(con_ssh.get_prompt())
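
# _listen_on_port is started in a thread above but defined elsewhere in the
# repo. A minimal sketch of the listener it is assumed to run on the
# controller (the ssh acquisition and nc invocation are illustrative
# assumptions, not the repo's actual implementation):
def _listen_on_port(port, end_event, ssh_name=None):
    # Open a netcat listener on the controller so the natbox's connection
    # attempt has something to hit, then hold the session open until the
    # main thread signals completion via end_event
    con_ssh = ControllerClient.get_active_controller(name=ssh_name)
    con_ssh.send('nc -l {}'.format(port))
    end_event.wait_for_event(timeout=300)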
def _check_packets_forwarded_in_sfc_vm(source_vm_id, dest_vm_id, sfc_vm_ids,
                                       dest_vm_internal_net_ip, protocol,
                                       nsh_aware, symmetric,
                                       load_balancing=False):
    end_event = Events("Hello or ping sent to vm")
    start_event = Events("VM {} started listening".format(dest_vm_id))
    received_event = Events("Greeting received on vm {}".format(dest_vm_id))
    vms_events = {}
    for sfc_vm in sfc_vm_ids:
        start_event_sfc = Events("SFC vm {} started listening".format(sfc_vm))
        received_event_sfc = Events(
            "Packets received on SFC vm {}".format(sfc_vm))
        vms_events[sfc_vm] = (start_event_sfc, received_event_sfc)

    greeting = "hello"
    port = 20010

    vm_thread = None
    if protocol != 'icmp':
        func_args = (start_event, end_event, received_event, dest_vm_id,
                     dest_vm_internal_net_ip, greeting, port, protocol,
                     load_balancing)
        vm_thread = MThread(_ssh_to_dest_vm_and_wait_for_greetings,
                            *func_args)

    sfc_vm_threads = []
    for sfc_vm in sfc_vm_ids:
        start_event_sfc, received_event_sfc = vms_events[sfc_vm]
        func_args = (start_event_sfc, end_event, received_event_sfc, sfc_vm,
                     protocol, nsh_aware, symmetric)
        sfc_vm_thread = MThread(_ssh_to_sfc_vm_and_wait_for_packets,
                                *func_args)
        sfc_vm_threads.append(sfc_vm_thread)

    LOG.tc_step("Starting VM ssh session threads to ping (icmp) or send "
                "hello (tcp, udp)")
    if protocol != 'icmp':
        vm_thread.start_thread()

    for sfc_vm_thread in sfc_vm_threads:
        LOG.tc_step("Starting SFC VM thread")
        sfc_vm_thread.start_thread()

    try:
        if protocol != 'icmp':
            start_event.wait_for_event(timeout=180, fail_ok=False)
        for sfc_vm in sfc_vm_ids:
            start_event_sfc, received_event_sfc = vms_events[sfc_vm]
            start_event_sfc.wait_for_event(timeout=120, fail_ok=False)

        if protocol == 'icmp':
            LOG.tc_step("Ping from vm {} to vm {}, and check it's received"
                        .format(source_vm_id, dest_vm_id))
            _ping_from_source_to_dest_vm(source_vm_id, end_event,
                                         dest_vm_internal_net_ip)
        else:
            if load_balancing:
                LOG.tc_step("Send Hello msg from vm {} using tcp_client.py "
                            "to vm {}, and check it's received"
                            .format(source_vm_id, dest_vm_id))
                _send_hello_message_from_vm_using_tcp_client(
                    source_vm_id, end_event, dest_vm_internal_net_ip)
            else:
                LOG.tc_step("Send Hello msg from vm {} to vm {}, and check "
                            "it's received".format(source_vm_id, dest_vm_id))
                _send_hello_message_from_vm(source_vm_id, greeting, end_event,
                                            dest_vm_internal_net_ip, port,
                                            protocol)

        if protocol != 'icmp':
            assert received_event.wait_for_event(timeout=30), \
                "Received Event {} is not set".format(received_event)
        for sfc_vm in sfc_vm_ids:
            start_event_sfc, received_event_sfc = vms_events[sfc_vm]
            assert received_event_sfc.wait_for_event(timeout=10), \
                "Received Event is not set in SFC function"

    finally:
        end_event.set()
        if protocol != 'icmp':
            vm_thread.wait_for_thread_end(timeout=40, fail_ok=False)
        for sfc_vm_thread in sfc_vm_threads:
            sfc_vm_thread.wait_for_thread_end(timeout=40, fail_ok=False)
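
# _send_hello_message_from_vm is invoked above but defined elsewhere in the
# repo. A minimal sketch of the sender side it is assumed to implement (the
# nc invocation, the repeat count, and the end_event handling are
# illustrative assumptions):
def _send_hello_message_from_vm(source_vm_id, greeting, end_event,
                                dest_ip, port, protocol):
    # SSH into the source vm and push a few copies of the greeting at the
    # destination's internal ip/port, then flag end_event so the listener
    # threads know the traffic has been sent
    with vm_helper.ssh_to_vm_from_natbox(source_vm_id) as vm_ssh:
        flag = '-u ' if protocol == 'udp' else ''
        for _ in range(3):
            vm_ssh.exec_cmd("echo '{}' | nc {}-w 2 {} {}".format(
                greeting, flag, dest_ip, port))
            time.sleep(2)
    end_event.set()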