def test_swact_controller_platform(wait_for_con_drbd_sync_complete):
    """
    Verify swact active controller

    Test Steps:
        - Swact active controller
        - Verify standby controller and active controller are swapped
        - Verify nodes are ready in kubectl get nodes

    """
    if system_helper.is_aio_simplex():
        skip("Simplex system detected")

    if not wait_for_con_drbd_sync_complete:
        skip(SkipSysType.LESS_THAN_TWO_CONTROLLERS)

    LOG.tc_step('Retrieve active and standby controllers')
    pre_active_controller, pre_standby_controller = \
        system_helper.get_active_standby_controllers()
    assert pre_standby_controller, "No standby controller available"

    LOG.tc_step("Swact active controller and ensure active controller "
                "is changed")
    host_helper.swact_host(hostname=pre_active_controller)

    LOG.tc_step("Check hosts are Ready in kubectl get nodes after swact")
    kube_helper.wait_for_nodes_ready(
        hosts=(pre_active_controller, pre_standby_controller), timeout=30)
def sys_lock_unlock_standby(number_of_times=1):
    """
    Lock and unlock the standby controller, then verify all VMs recover.
    :return:
    """
    timeout = VMTimeout.DHCP_RETRY if system_helper.is_aio_system() \
        else VMTimeout.PING_VM
    for i in range(0, number_of_times):
        active, standby = system_helper.get_active_standby_controllers()
        LOG.tc_step("Iteration {} of {}".format(i + 1, number_of_times))
        LOG.tc_step("Lock standby controller {}".format(standby))
        host_helper.lock_host(host=standby)
        LOG.tc_step("Check vms status after locking standby")
        vms = get_all_vms()
        vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)
        for vm in vms:
            vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm,
                                                       timeout=timeout)
        host_helper.unlock_host(host=standby)
        vms = get_all_vms()
        vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)
        for vm in vms:
            vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm)
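# The lock/unlock loop above leans on repo helpers (host_helper, vm_helper)
# for all of its waiting. For readers unfamiliar with those wrappers, below
# is a minimal, stdlib-only sketch of the underlying poll-until-state
# pattern; the function name and signature are illustrative, not part of
# the repo API.
import time


def _wait_for(query, expected, timeout=600, interval=20):
    """Poll query() until it returns expected, or raise on timeout."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        if query() == expected:
            return True
        time.sleep(interval)
    raise TimeoutError('state {} not reached within {}s'.format(expected,
                                                                timeout))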
def remove(): LOG.fixture_step("Removing custom firewall rules") user_file_dir = ProjVar.get_var('USER_FILE_DIR') empty_path = user_file_dir + "iptables-empty.rules" client = get_cli_client(central_region=True) client.exec_cmd('touch {}'.format(empty_path)) _modify_firewall_rules(empty_path) active, standby = system_helper.get_active_standby_controllers() con_ssh = ControllerClient.get_active_controller() LOG.fixture_step("Verify custom ports on {}".format(active)) for port in custom_ports: # Verifying ports that are in the iptables file are closed _verify_port_from_natbox(con_ssh, port, port_expected_open=False) if standby: LOG.fixture_step("Swact {}".format(active)) host_helper.swact_host(active) LOG.fixture_step("Verify custom ports on {}".format(standby)) for port in custom_ports: # Verifying ports that are in the iptables file are closed after swact _verify_port_from_natbox(con_ssh, port, port_expected_open=False)
def is_controller_swacted(prev_active, prev_standby,
                          swact_start_timeout=MTCTimeout.KILL_PROCESS_SWACT_NOT_START,
                          swact_complete_timeout=MTCTimeout.KILL_PROCESS_SWACT_COMPLETE,
                          con_ssh=None):
    """
    Wait and check if the active controller was swacted within the given
    time period

    Args:
        prev_active: previous active controller
        prev_standby: previous standby controller
        swact_start_timeout: check within this time frame if the swact
            started
        swact_complete_timeout: check if the swact (if any) completed in
            this time period
        con_ssh: ssh connection/client to the current active controller

    Returns (bool): True if the controllers were swacted, otherwise False

    """
    LOG.info('Check if the controllers started to swact within:{}, and '
             'completed swacting within:{}'.format(swact_start_timeout,
                                                   swact_complete_timeout))

    code = -1
    host = prev_active
    for retry in range(1, 5):
        LOG.info('retry{:02d}: checking if swacting triggered, '
                 'prev-active-controller={}'.format(retry, prev_active))
        code = 0
        try:
            code, msg = host_helper.wait_for_swact_complete(
                host, con_ssh=con_ssh, fail_ok=True,
                swact_start_timeout=swact_start_timeout,
                swact_complete_timeout=swact_complete_timeout)

            if 0 == code:
                LOG.info('OK, host-swacted, prev-active:{}, prev-standby:{}, '
                         'code:{}, message:{}'.format(prev_active,
                                                      prev_standby,
                                                      code, msg))
                return True

            active, standby = system_helper.get_active_standby_controllers()
            if active == prev_standby and standby == prev_active:
                LOG.info('swacted?! prev-active:{} prev-standby:{}, '
                         'cur active:{}, cur standby:{}'.format(
                             prev_active, prev_standby, active, standby))
                return True
            break
        except Exception as e:
            LOG.warn('Error while waiting, probably because a swact is in '
                     'progress. previous active-controller:{}, previous '
                     'standby-controller:{}\nerror message:{}'.format(
                         prev_active, prev_standby, e))
            if retry >= 4:
                LOG.error('Fail the test after retrying {} times; the '
                          'system remains in an unstable state, probably '
                          'because a swact is in progress. previous '
                          'active-controller:{}, previous '
                          'standby-controller:{}\nerror message:{}'.format(
                              retry, prev_active, prev_standby, e))
                raise
            time.sleep(10)

    return 0 == code
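# Illustrative (hypothetical) usage of is_controller_swacted() from a
# process-kill test: record the controllers before the kill, then assert on
# whether a swact occurred. The kill step itself is elided.
def _example_check_swact_after_kill():
    prev_active, prev_standby = \
        system_helper.get_active_standby_controllers()
    # ... kill the critical process under test here ...
    assert is_controller_swacted(prev_active, prev_standby), \
        'Expected a swact after killing the critical process'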
def pre_configs(request):
    """
    Dovetail test fixture
    Args:
        request:

    - configure sshd_config on tis hosts to allow root access
    - update conf files on dovetail test node on cumulus

    """
    if not ComplianceVar.get_var('DOVETAIL_SUITE'):
        skip('--dovetail-suite unspecified.')

    try:
        import yaml
    except ImportError:
        skip('pyyaml package is not installed.')

    computes = host_helper.get_up_hypervisors()
    if len(computes) < 2:
        skip('Fewer than 2 computes in available state')

    active, standby = system_helper.get_active_standby_controllers()
    if not standby:
        skip('No standby controller on system')

    LOG.fixture_step(
        "Ensure dovetail test node mgmt nic connects to lab under test")
    compliance_helper.update_dovetail_mgmt_interface()

    controllers = [active, standby]
    storages = system_helper.get_hosts(personality='storage',
                                       availability=HostAvailState.AVAILABLE)
    hosts_dict = {
        'controller': controllers,
        'compute': computes,
        'storage': storages
    }
    all_hosts = list(set(controllers + computes + storages))

    LOG.fixture_step(
        "Enable port_security for the system and update existing networks")
    port_security = network_helper.get_network_values(
        'external-net0', 'port_security_enabled')[0]
    port_security = eval(port_security)  # CLI returns 'True'/'False' string
    if not port_security:
        system_helper.add_ml2_extension_drivers(drivers='port_security')
        networks = network_helper.get_networks(auth_info=Tenant.get('admin'))
        for net in networks:
            network_helper.set_network(net_id=net, enable_port_security=True)

    configure_tis(all_hosts, request=request)
    configure_dovetail_server(hosts_per_personality=hosts_dict)
def test_reboot_standby_controller(no_simplex): active, standby = system_helper.get_active_standby_controllers() LOG.tc_step("'sudo reboot -f' from {}".format(standby)) host_helper.reboot_hosts(standby, wait_for_offline=True, wait_for_reboot_finish=True, force_reboot=True) system_helper.wait_for_hosts_states(standby, timeout=360, check_interval=30, availability=['available']) kube_helper.wait_for_pods_healthy(check_interval=30, all_namespaces=True)
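# kube_helper.wait_for_pods_healthy() above is a repo wrapper. A rough,
# stdlib-only approximation of the node-readiness side of such checks is
# sketched below; it assumes kubectl is configured on the host running it
# and is not the repo implementation.
import subprocess


def _nodes_ready():
    out = subprocess.check_output(
        ['kubectl', 'get', 'nodes', '-o',
         'jsonpath={range .items[*]}'
         '{.status.conditions[?(@.type=="Ready")].status}{"\\n"}{end}'],
        universal_newlines=True)
    return all(line == 'True' for line in out.splitlines() if line)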
def _test_firewall_rules_custom(remove_custom_firewall): """ Verify specified ports from the custom firewall rules are open and non-specified ports are closed. Skip Condition: - N/A Test Setup: - SCP iptables.rules from test server to lab Test Steps: - Install custom firewall rules - Check ports that should be both open and closed based on the custom firewall rules - Swact and check ports that should be both open and closed based on the custom firewall rules - Remove custom firewall rules - Check ports that are in the custom firewall rules are no longer open - Swact and check ports that are in the custom firewall rules are no longer open """ # The following ports must be in the iptables.rules file or the test will fail custom_ports, firewall_rules_path = remove_custom_firewall LOG.tc_step("Installing custom firewall rules") _modify_firewall_rules(firewall_rules_path) active_controller, standby_controller = system_helper.get_active_standby_controllers( ) con_ssh = ControllerClient.get_active_controller() LOG.tc_step("Verify custom ports on {}".format(active_controller)) for port in custom_ports: # Verifying ports that are in the iptables file are open _verify_port_from_natbox(con_ssh, port, port_expected_open=True) # Verifying ports that are not in the iptables file are still closed _verify_port_from_natbox(con_ssh, port + 1, port_expected_open=False) if standby_controller: LOG.tc_step("Swact {}".format(active_controller)) host_helper.swact_host(active_controller) active_controller = system_helper.get_active_controller_name() con_ssh = ControllerClient.get_active_controller() LOG.tc_step("Verify custom ports on {}".format(active_controller)) for port in custom_ports: # Verifying ports that are in the iptables file are open after swact _verify_port_from_natbox(con_ssh, port, port_expected_open=True) # Verifying ports that are not in the iptables file are still closed after swact _verify_port_from_natbox(con_ssh, port + 1, port_expected_open=False)
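# _verify_port_from_natbox() is defined elsewhere in this module. One
# plausible stdlib-only shape for such a check is a TCP connect attempt
# from the test client toward the lab's OAM floating IP; the helper below
# is a sketch under that assumption, not the module's implementation.
import socket


def _is_tcp_port_open(host_ip, port, timeout=5):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(timeout)
    try:
        # connect_ex returns 0 when the TCP handshake succeeds
        return sock.connect_ex((host_ip, port)) == 0
    finally:
        sock.close()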
def test_swact_controller_platform(wait_for_con_drbd_sync_complete,
                                   collect_kpi):
    """
    Verify swact active controller

    Test Steps:
        - Swact active controller
        - Verify standby controller and active controller are swapped
        - Verify nodes are ready in kubectl get nodes

    """
    if system_helper.is_aio_simplex():
        skip("Simplex system detected")

    if not wait_for_con_drbd_sync_complete:
        skip(SkipSysType.LESS_THAN_TWO_CONTROLLERS)

    LOG.tc_step('Retrieve active and standby controllers')
    pre_active_controller, pre_standby_controller = \
        system_helper.get_active_standby_controllers()
    assert pre_standby_controller, "No standby controller available"

    collect_kpi = None if container_helper.is_stx_openstack_deployed() \
        else collect_kpi
    init_time = None
    if collect_kpi:
        init_time = common.get_date_in_format(date_format=KPI_DATE_FORMAT)

    LOG.tc_step(
        "Swact active controller and ensure active controller is changed")
    host_helper.swact_host(hostname=pre_active_controller)

    LOG.tc_step("Check hosts are Ready in kubectl get nodes after swact")
    kube_helper.wait_for_nodes_ready(
        hosts=(pre_active_controller, pre_standby_controller), timeout=30)

    if collect_kpi:
        kpi_name = SwactPlatform.NAME
        kpi_log_parser.record_kpi(
            local_kpi_file=collect_kpi, kpi_name=kpi_name,
            init_time=init_time, log_path=SwactPlatform.LOG_PATH,
            end_pattern=SwactPlatform.END, host=pre_standby_controller,
            start_host=pre_active_controller,
            start_pattern=SwactPlatform.START,
            start_path=SwactPlatform.START_PATH,
            uptime=1, fail_ok=False)
def clear_config_out_of_date_alarm(): active, standby = system_helper.get_active_standby_controllers() for host in (standby, active): if host and system_helper.wait_for_alarm( alarm_id=EventLogID.CONFIG_OUT_OF_DATE, timeout=5, entity_id=host, fail_ok=True)[0]: host_helper.lock_host(host, swact=True) time.sleep(60) host_helper.unlock_host(host) system_helper.wait_for_alarm_gone( alarm_id=EventLogID.CONFIG_OUT_OF_DATE, entity_id=host, fail_ok=False)
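# system_helper.wait_for_alarm()/wait_for_alarm_gone() poll the fm alarm
# list. A sketch of the single query they repeat, using the exec_cmd style
# seen elsewhere in this module; the exact fm CLI flags here are from
# memory and should be treated as an assumption.
def _alarm_raised(con_ssh, alarm_id, entity_id):
    code, out = con_ssh.exec_cmd(
        'fm alarm-list --query alarm_id={}'.format(alarm_id))[0:2]
    return code == 0 and entity_id in out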
def test_reapply_stx_openstack_no_change(stx_openstack_applied_required,
                                         check_nodes, controller):
    """
    Args:
        stx_openstack_applied_required: fixture ensuring stx-openstack is
            applied
        check_nodes: fixture
        controller: controller host to re-apply the application from

    Pre-requisite:
        - stx-openstack application in applied state

    Test Steps:
        - Re-apply stx-openstack application
        - Check openstack pods healthy

    """
    # if controller == 'controller-1':
    #     skip("CGTS-10708")

    if system_helper.is_aio_simplex() and controller != 'controller-0':
        skip('Simplex system only has controller-0')

    active, standby = system_helper.get_active_standby_controllers()
    if active != controller:
        if not standby:
            skip('{} is not ready to take over'.format(controller))

        LOG.tc_step("Swact active controller to test reapply from "
                    "{}".format(controller))
        host_helper.swact_host()
        time.sleep(60)

        LOG.info("helm list before reapply after swact")
        from utils.clients.ssh import ControllerClient
        con_ssh = ControllerClient.get_active_controller()
        end_time = time.time() + 180
        while time.time() < end_time:
            code = con_ssh.exec_cmd('helm list', expect_timeout=60)[0]
            if code == 0:
                break
            time.sleep(30)

    LOG.tc_step("Re-apply stx-openstack application")
    container_helper.apply_app(app_name='stx-openstack')

    LOG.tc_step("Check openstack pods in good state on all controllers "
                "after stx-openstack re-applied")
    for host in get_valid_controllers():
        check_openstack_pods_healthy(host=host, timeout=120)
def test_host_operations_with_custom_kubectl_app(deploy_delete_kubectl_app): """ Test create, delete custom app via kubectl run cmd Args: deploy_delete_kubectl_app: fixture Setups: - Create kubectl app via kubectl run Test Steps: - If duplex: swact and verify pod still Running - Lock/unlock controller and verify pod still Running Teardown: - Delete kubectl deployment and service - Verify pod is removed """ app_name, pod_name = deploy_delete_kubectl_app active, standby = system_helper.get_active_standby_controllers() if standby: LOG.tc_step("Swact active controller and verify {} test app is " "running ".format(pod_name)) host_helper.swact_host() kube_helper.wait_for_pods_status(pod_names=pod_name, namespace='default', fail_ok=False) LOG.tc_step("Lock/unlock {} and verify {} test app is " "running.".format(active, pod_name)) HostsToRecover.add(active) host_helper.lock_host(active, swact=False) # wait for services to stabilize before unlocking time.sleep(20) host_helper.unlock_host(active) pod_name = kube_helper.get_pods(field='NAME', namespace='default', name=app_name, strict=False)[0] kube_helper.wait_for_pods_status(pod_names=pod_name, namespace=None, fail_ok=False)
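# The deploy_delete_kubectl_app fixture is defined elsewhere; the sketch
# below shows a plausible minimal shape for such a create/cleanup fixture
# using plain kubectl. The name and image are illustrative, and the real
# fixture also resolves and returns the pod name.
import subprocess
import pytest


@pytest.fixture()
def _deploy_delete_app_sketch():
    app_name = 'client-app-sketch'
    subprocess.check_call(['kubectl', 'create', 'deployment', app_name,
                           '--image=nginx'])
    yield app_name
    subprocess.check_call(['kubectl', 'delete', 'deployment', app_name])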
def sys_uncontrolled_swact(number_of_times=1):
    """
    Force an uncontrolled swact by rebooting the active controller, then
    verify all VMs recover.
    :return:
    """
    for i in range(0, number_of_times):
        active, standby = system_helper.get_active_standby_controllers()
        LOG.tc_step("Iteration {} of {}".format(i + 1, number_of_times))
        LOG.tc_step("'sudo reboot -f' from {}".format(active))
        host_helper.reboot_hosts(hostnames=active)
        LOG.tc_step("Check vms status after controller swact")
        vms = get_all_vms()
        vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)
        for vm in vms:
            vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm)
def _test_firewall_rules_default():
    """
    Verify default ports are open.

    Test Steps:
        - Confirm iptables service is running on active controller
        - Check if lab is http(s), add corresponding port to check
        - Confirm the default ports are open
        - Swact and repeat the above steps
    """
    # Cannot test connecting to the ports as they are in use.
    default_ports = [123, 161, 199, 5000, 6080, 6385, 8000, 8003, 8004,
                     8041, 8774, 8776, 8778, 9292, 9696, 15491]

    from consts.proj_vars import ProjVar
    region = ProjVar.get_var('REGION')
    if region != 'RegionOne' and region in MULTI_REGION_MAP:
        default_ports.remove(5000)
        default_ports.remove(9292)

    # https labs serve on 8443, http labs on 8080
    default_ports.append(8443 if CliAuth.get_var('HTTPS') else 8080)

    active_controller = system_helper.get_active_controller_name()
    con_ssh = ControllerClient.get_active_controller()

    _verify_iptables_status(con_ssh, active_controller)
    _check_ports_with_netstat(con_ssh, active_controller, default_ports)

    active_controller, new_active = \
        system_helper.get_active_standby_controllers()
    if new_active:
        LOG.tc_step(
            "Swact {} and verify firewall rules".format(active_controller))
        host_helper.swact_host(active_controller)
        con_ssh = ControllerClient.get_active_controller()

        _verify_iptables_status(con_ssh, new_active)
        _check_ports_with_netstat(con_ssh, new_active, default_ports)
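# _check_ports_with_netstat() lives elsewhere in this module. Conceptually
# it reduces to one LISTEN-socket query per port on the controller, roughly
# as below; the helper name is hypothetical and the exec_cmd fail_ok flag
# is assumed from its use elsewhere in this repo.
def _port_listening(con_ssh, port):
    return con_ssh.exec_cmd('netstat -lntu | grep -w ":{}"'.format(port),
                            fail_ok=True)[0] == 0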
def _lock_unlock_controllers():
    LOG.fixture_step("Sleep for 300 seconds after admin password change")
    time.sleep(300)
    if more_than_one_controllers:
        active, standby = system_helper.get_active_standby_controllers()
        if standby:
            LOG.fixture_step(
                "(Session) Lock/unlock controllers to complete the "
                "password change action")
            host_helper.lock_host(standby)
            host_helper.unlock_host(standby)
            host_helper.lock_host(active, swact=True)
            host_helper.unlock_host(active)
        else:
            LOG.warning("Standby controller unavailable. Skip lock/unlock "
                        "of controllers post admin password change.")
    elif system_helper.is_aio_simplex():
        LOG.fixture_step(
            "(Session) Simplex lab - lock/unlock controller to complete "
            "the action")
        host_helper.lock_host('controller-0', swact=False)
        host_helper.unlock_host('controller-0')
def test_swact_uncontrolled_kpi_platform(collect_kpi):
    if not collect_kpi or container_helper.is_stx_openstack_deployed():
        skip("KPI test for platform only. Skipped because KPI collection "
             "is not enabled or the stx-openstack application is deployed.")

    start_host, end_host = system_helper.get_active_standby_controllers()
    if not end_host:
        skip("No standby host to swact to")

    init_time = common.get_date_in_format(date_format=KPI_DATE_FORMAT)
    host_helper.reboot_hosts(hostnames=start_host)
    kpi_name = SwactUncontrolledPlatform.NAME
    kpi_log_parser.record_kpi(
        local_kpi_file=collect_kpi, kpi_name=kpi_name, init_time=init_time,
        log_path=SwactUncontrolledPlatform.LOG_PATH,
        end_pattern=SwactUncontrolledPlatform.END, host=end_host,
        start_host=start_host,
        start_pattern=SwactUncontrolledPlatform.START,
        start_path=SwactUncontrolledPlatform.START_PATH,
        uptime=5, fail_ok=False)
def test_detect_failed_controller(no_simplex):
    con_ssh = ssh.ControllerClient.get_active_controller()
    # The second value returned is the standby controller; that is the host
    # force-rebooted in each iteration below.
    active_controller, standby_controller = \
        system_helper.get_active_standby_controllers()
    controller_su_prompt = r'.*controller\-([0-9]){1,}\:/home/sysadmin#'
    cmd_get_offset = ("ntpq -p | grep {} -A1 | "
                      "tail -1 | awk '{{print$8}}'".format(active_controller))
    cmd_magic_keys_enable = "echo 1 > /proc/sys/kernel/sysrq"
    cmd_get_start_date = ("python -c \"import datetime; "
                          "print str(datetime.datetime.now())[:-3]\"")
    cmd_get_end_date = ("cat /var/log/mtcAgent.log | "
                        "grep --color=never \"{} MNFA new candidate\" | "
                        "tail -1 | awk '{{print$1}}'".format(
                            standby_controller))
    cmd_get_recovered_date = ("cat /var/log/mtcAgent.log | "
                              "grep --color=never "
                              "'{} unlocked-enabled-available' | "
                              "tail -1 | awk '{{print$1}}'".format(
                                  standby_controller))
    cmd_trigger_reboot = "echo b > /proc/sysrq-trigger"
    res = list()
    rec_res = list()

    for i in range(20):
        LOG.tc_step("Start of iter {}".format(i))
        with host_helper.ssh_to_host(standby_controller) as node_ssh:
            offset = float(node_ssh.exec_cmd(cmd=cmd_get_offset,
                                             get_exit_code=False)[1]) / 1000
            node_ssh.send_sudo(cmd="su")
            node_ssh.expect(controller_su_prompt)
            node_ssh.send_sudo(cmd=cmd_magic_keys_enable)
            node_ssh.expect(controller_su_prompt)
            st = node_ssh.exec_cmd(cmd=cmd_get_start_date,
                                   get_exit_code=False,
                                   blob=controller_su_prompt)[1]
            node_ssh.exec_sudo_cmd(cmd_trigger_reboot, get_exit_code=False)

        system_helper.wait_for_hosts_states(
            standby_controller, check_interval=20,
            availability=HostAvailState.AVAILABLE)
        pods_health = kube_helper.wait_for_pods_healthy(
            check_interval=20, timeout=HostTimeout.REBOOT)
        assert pods_health is True, "Check PODs health has failed"

        st_date = datetime.datetime.fromtimestamp(
            datetime.datetime.strptime(
                st, '%Y-%m-%d %H:%M:%S.%f').timestamp() - offset)
        et = con_ssh.exec_cmd(cmd=cmd_get_end_date, get_exit_code=False)[1]
        et_date = datetime.datetime.strptime(et, '%Y-%m-%dT%H:%M:%S.%f')
        er = con_ssh.exec_cmd(cmd=cmd_get_recovered_date,
                              get_exit_code=False)[1]
        er_date = datetime.datetime.strptime(er, '%Y-%m-%dT%H:%M:%S.%f')
        diff = et_date - st_date
        rec_diff = er_date - st_date
        LOG.info(("\noffset = {}\n"
                  "start time = {}\n"
                  "end time = {}\n"
                  "recover time = {}".format(offset, st, et, er)))
        LOG.info("\ndiff = {}".format(diff))
        LOG.info("\nrecover diff = {}".format(rec_diff))
        res.append(diff)
        rec_res.append(rec_diff)

    def calc_avg(lst):
        rtrn_sum = datetime.timedelta()
        for i in lst:
            LOG.info("Iter {}: {}".format(lst.index(i), i))
            rtrn_sum += i
        return rtrn_sum / len(lst)

    final_res = calc_avg(res)
    final_rec_res = calc_avg(rec_res)
    LOG.info("Avg time is : {}".format(final_res))
    LOG.info("Avg rec time is : {}".format(final_rec_res))
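# The timestamp arithmetic above is easy to misread: ntpq reports the peer
# offset in milliseconds, so it is divided by 1000 and subtracted from the
# local timestamp before comparing against mtcAgent.log times. A worked
# example with made-up values:
import datetime

_local = datetime.datetime.strptime('2019-06-01 10:00:00.500',
                                    '%Y-%m-%d %H:%M:%S.%f')
_offset_ms = 250.0  # 'offset' column from ntpq -p, in milliseconds
_corrected = datetime.datetime.fromtimestamp(
    _local.timestamp() - _offset_ms / 1000)
# _corrected is 2019-06-01 10:00:00.250, i.e. shifted back by 250 ms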
def test_swact_controllers(stx_openstack_required,
                           wait_for_con_drbd_sync_complete):
    """
    Verify swact active controller

    Test Steps:
        - Boot a vm on system and check ping works
        - Swact active controller
        - Verify standby controller and active controller are swapped
        - Verify vm is still pingable

    """
    if not wait_for_con_drbd_sync_complete:
        skip(SkipSysType.LESS_THAN_TWO_CONTROLLERS)

    LOG.tc_step('Retrieve active and standby controllers')
    pre_active_controller, pre_standby_controller = \
        system_helper.get_active_standby_controllers()
    assert pre_standby_controller, "No standby controller available"

    pre_res_sys, pre_msg_sys = system_helper.wait_for_services_enable(
        timeout=20, fail_ok=True)
    up_hypervisors = host_helper.get_up_hypervisors()
    pre_res_neutron, pre_msg_neutron = network_helper.wait_for_agents_healthy(
        up_hypervisors, timeout=20, fail_ok=True)

    LOG.tc_step("Boot a vm from image and ping it")
    vm_id_img = vm_helper.boot_vm(name='swact_img', source='image',
                                  cleanup='function')[1]
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_img)

    LOG.tc_step("Boot a vm from volume and ping it")
    vm_id_vol = vm_helper.boot_vm(name='swact', cleanup='function')[1]
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_vol)

    LOG.tc_step(
        "Swact active controller and ensure active controller is changed")
    host_helper.swact_host(hostname=pre_active_controller)

    LOG.tc_step("Verify standby controller and active controller are swapped")
    post_active_controller = system_helper.get_active_controller_name()
    post_standby_controller = system_helper.get_standby_controller_name()

    assert pre_standby_controller == post_active_controller, \
        "Prev standby: {}; Post active: {}".format(
            pre_standby_controller, post_active_controller)
    assert pre_active_controller == post_standby_controller, \
        "Prev active: {}; Post standby: {}".format(
            pre_active_controller, post_standby_controller)

    LOG.tc_step("Check boot-from-image vm still pingable after swact")
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_img, timeout=30)
    LOG.tc_step("Check boot-from-volume vm still pingable after swact")
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_vol, timeout=30)

    LOG.tc_step(
        "Check system services and neutron agents after swact from {}".format(
            pre_active_controller))
    post_res_sys, post_msg_sys = system_helper.wait_for_services_enable(
        fail_ok=True)
    post_res_neutron, post_msg_neutron = \
        network_helper.wait_for_agents_healthy(hosts=up_hypervisors,
                                               fail_ok=True)

    assert post_res_sys, \
        "\nPost-swact system services stats: {}" \
        "\nPre-swact system services stats: {}".format(post_msg_sys,
                                                       pre_msg_sys)
    assert post_res_neutron, \
        "\nPost-swact neutron agents stats: {}" \
        "\nPre-swact neutron agents stats: {}".format(post_msg_neutron,
                                                      pre_msg_neutron)

    LOG.tc_step("Check hosts are Ready in kubectl get nodes after swact")
    kube_helper.wait_for_nodes_ready(
        hosts=(pre_active_controller, pre_standby_controller), timeout=30)
def _test_basic_swift_provisioning(pool_size, pre_swift_check):
    """
    Verifies basic swift provisioning works as expected
    Args:
        pool_size: 'default' or 'fixed_size'
        pre_swift_check: fixture result indicating whether swift is already
            provisioned

    Returns:

    """
    ceph_backend_info = get_ceph_backend_info()

    if pool_size == 'default' and pre_swift_check[0]:
        skip("Swift is already provisioned")

    if pool_size == 'fixed_size' and pre_swift_check[0]:
        skip("Swift is already provisioned and set to non-default pool value")

    object_pool_gib = None
    cinder_pool_gib = ceph_backend_info['cinder_pool_gib']

    if pool_size == 'default':
        if not ceph_backend_info['object_gateway']:
            LOG.tc_step("Enabling SWIFT object store .....")
    else:
        if not ceph_backend_info['object_gateway']:
            skip("Swift is not provisioned")

        total_gib = ceph_backend_info['ceph_total_space_gib']
        unallocated_gib = (total_gib - cinder_pool_gib -
                           ceph_backend_info['glance_pool_gib'] -
                           ceph_backend_info['ephemeral_pool_gib'])
        if unallocated_gib == 0:
            unallocated_gib = int(int(cinder_pool_gib) / 4)
            cinder_pool_gib = str(int(cinder_pool_gib) - unallocated_gib)
        elif unallocated_gib < 0:
            skip("Unallocated gib < 0. System is in unknown state.")

        object_pool_gib = str(unallocated_gib)
        LOG.tc_step("Enabling SWIFT object store and setting object pool "
                    "size to {}.....".format(object_pool_gib))

    rc, updated_backend_info = storage_helper.modify_storage_backend(
        'ceph', object_gateway=False, cinder=cinder_pool_gib,
        object_gib=object_pool_gib, services='cinder,glance,nova,swift')

    LOG.info("Verifying if swift object gateway is enabled...")
    assert str(updated_backend_info['object_gateway']).lower() == 'true', \
        "Fail to enable Swift object gateway: {}".format(updated_backend_info)
    LOG.info("Swift object gateway is enabled.")

    LOG.info("Verifying ceph task ...")
    state = storage_helper.get_storage_backends(backend='ceph',
                                                field='state')[0]
    if system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                    timeout=10, fail_ok=True,
                                    entity_id='controller-')[0]:
        LOG.info("Verifying ceph task is set to 'add-object-gateway'...")
        assert BackendState.CONFIGURING == state, \
            "Unexpected ceph state '{}' after swift object gateway " \
            "update ".format(state)

        LOG.info("Lock/Unlock controllers...")
        active_controller, standby_controller = \
            system_helper.get_active_standby_controllers()
        LOG.info("Active Controller is {}; Standby Controller is {}...".
                 format(active_controller, standby_controller))

        for controller in [standby_controller, active_controller]:
            if not controller:
                continue
            HostsToRecover.add(controller)
            host_helper.lock_host(controller, swact=True)
            storage_helper.wait_for_storage_backend_vals(
                backend='ceph-store',
                **{'task': BackendTask.RECONFIG_CONTROLLER,
                   'state': BackendState.CONFIGURING})
            host_helper.unlock_host(controller)

        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE, fail_ok=False)
    else:
        assert BackendState.CONFIGURED == state, \
            "Unexpected ceph state '{}' after swift object gateway " \
            "update ".format(state)

    LOG.info("Verifying Swift provisioning setup...")
    assert verify_swift_object_setup(), "Failure in swift setup"

    for i in range(3):
        vm_name = 'vm_swift_api_{}'.format(i)
        LOG.tc_step(
            "Boot vm {} and perform nova actions on it".format(vm_name))
        vm_id = vm_helper.boot_vm(name=vm_name, cleanup='function')[1]
        vm_helper.wait_for_vm_pingable_from_natbox(
            vm_id, timeout=VMTimeout.DHCP_RETRY)

        LOG.info("Cold migrate VM {} ....".format(vm_name))
        rc = vm_helper.cold_migrate_vm(vm_id=vm_id)[0]
        assert rc == 0, "VM {} failed to cold migrate".format(vm_name)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

        LOG.info("Live migrate VM {} ....".format(vm_name))
        rc = vm_helper.live_migrate_vm(vm_id=vm_id)[0]
        assert rc == 0, "VM {} failed to live migrate".format(vm_name)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

        LOG.info("Suspend/Resume VM {} ....".format(vm_name))
        vm_helper.suspend_vm(vm_id)
        vm_helper.resume_vm(vm_id)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

    LOG.info("Checking overall system health...")
    assert system_helper.get_system_health_query(), \
        "System health not OK after VMs"

    LOG.tc_step("Create Swift container using swift post cli command ...")
    container_names = ["test_container_1", "test_container_2",
                       "test_container_3"]

    for container in container_names:
        LOG.info("Creating swift object container {}".format(container))
        rc, out = swift_helper.create_swift_container(container)
        assert rc == 0, \
            "Fail to create swift container {}".format(container)
        LOG.info("Created swift object container {} successfully".format(
            container))

    LOG.tc_step("Verify swift list to list containers ...")
    container_list = swift_helper.get_swift_containers()[1]
    assert set(container_names) <= set(container_list), \
        "Swift containers {} not listed in {}".format(container_names,
                                                      container_list)

    LOG.tc_step("Verify swift delete a container...")
    container_to_delete = container_names[2]
    rc, out = swift_helper.delete_swift_container(container_to_delete)
    assert rc == 0, "Swift delete container rejected: {}".format(out)
    assert container_to_delete not in \
        swift_helper.get_swift_containers()[1], \
        "Unable to delete swift container {}".format(container_to_delete)

    LOG.tc_step("Verify swift stat to show info of a single container...")
    container_to_stat = container_names[0]
    out = swift_helper.get_swift_container_stat_info(container_to_stat)
    assert out["Container"] == container_to_stat, \
        "Unable to stat swift container {}".format(container_to_stat)
    assert out["Objects"] == '0', \
        "Incorrect number of objects in container {}. Expected 0 objects, " \
        "but has {} objects".format(container_to_stat, out["Objects"])
def test_modify_mtu_oam_interface(mtu_range):
    """
    Case 20 of the 2016-04-04 sysinv_test_plan.pdf: change the MTU value
    of the OAM interface using CLI.

    Verify that MTU on oam interfaces on both standby and active controller
    can be modified by cli

    Args:
        mtu_range (str): A string that contains the mtu to be tested

    Setup:
        - Nothing

    Test Steps:
        - lock standby controller
        - modify the imtu value of the controller
        - unlock the controller
        - revert the oam mtu of the controller and check system is still
          healthy
        - swact the controller
        - lock the controller
        - modify the imtu value of the controller
        - unlock the controller
        - check the controllers have expected mtu
        - revert the oam mtu of the controller and check system is still
          healthy

    Teardown:
        - Nothing

    """
    is_sx = system_helper.is_aio_simplex()
    origin_active, origin_standby = \
        system_helper.get_active_standby_controllers()
    if not origin_standby and not is_sx:
        skip("Standby controller unavailable. Cannot lock controller.")

    mtu = __get_mtu_to_mod(providernet_name='-ext', mtu_range=mtu_range)

    first_host = origin_active if is_sx else origin_standby
    max_mtu, cur_mtu, nic_name = get_max_allowed_mtus(host=first_host,
                                                      network_type='oam')
    LOG.info('OK, the max MTU for {} is {}'.format(nic_name, max_mtu))

    expecting_pass = not max_mtu or mtu <= max_mtu
    if not expecting_pass:
        LOG.warn('Expecting to fail in changing MTU: changing to:{}, '
                 'max-mtu:{}'.format(mtu, max_mtu))

    oam_attributes = host_helper.get_host_interfaces(
        host=first_host, field='attributes', name='oam', strict=False)

    # sample attributes: [MTU=9216,AE_MODE=802.3ad]
    pre_oam_mtu = int(oam_attributes[0].split(',')[0].split('=')[1])
    is_stx_openstack_applied = container_helper.is_stx_openstack_deployed(
        applied_only=True)

    if not is_sx:
        HostsToRecover.add(origin_standby)
        prev_bad_pods = kube_helper.get_unhealthy_pods(all_namespaces=True)

        LOG.tc_step("Modify {} oam interface MTU from {} to {} on standby "
                    "controller, and ensure it's applied successfully after "
                    "unlock".format(origin_standby, pre_oam_mtu, mtu))
        if mtu == cur_mtu:
            LOG.info('Setting to same MTU: from:{} to:{}'.format(mtu,
                                                                 cur_mtu))

        code, res = host_helper.modify_mtu_on_interfaces(
            origin_standby, mtu_val=mtu, network_type='oam',
            lock_unlock=True, fail_ok=True)

        LOG.tc_step("Revert OAM MTU to original value: {}".format(
            pre_oam_mtu))
        code_revert, res_revert = host_helper.modify_mtu_on_interfaces(
            origin_standby, mtu_val=pre_oam_mtu, network_type='oam',
            lock_unlock=True, fail_ok=True)

        if 0 == code:
            assert expecting_pass, \
                "OAM MTU is not modified successfully. Result: {}".format(res)
        else:
            assert not expecting_pass, \
                "OAM MTU WAS modified unexpectedly. Result: {}".format(res)

        assert 0 == code_revert, \
            "OAM MTU is not reverted successfully. Result: {}".format(
                res_revert)

        LOG.tc_step("Check openstack cli, application and pods status after "
                    "modify and revert {} oam mtu".format(origin_standby))
        check_containers(prev_bad_pods, check_app=is_stx_openstack_applied)

        LOG.tc_step("Ensure standby controller is in available state and "
                    "attempt to swact active controller to {}".format(
                        origin_standby))
        system_helper.wait_for_hosts_states(origin_active,
                                            availability=['available'])
        host_helper.swact_host(fail_ok=False)
        host_helper.wait_for_webservice_up(origin_standby)

    prev_bad_pods = kube_helper.get_unhealthy_pods(all_namespaces=True)
    HostsToRecover.add(origin_active)

    LOG.tc_step("Modify {} oam interface MTU to: {}, and ensure it's "
                "applied successfully after unlock".format(origin_active,
                                                           mtu))
    code, res = host_helper.modify_mtu_on_interfaces(
        origin_active, mtu_val=mtu, network_type='oam', lock_unlock=True,
        fail_ok=True)

    LOG.tc_step("Revert OAM MTU to original value: {}".format(pre_oam_mtu))
    code_revert, res_revert = host_helper.modify_mtu_on_interfaces(
        origin_active, mtu_val=pre_oam_mtu, network_type='oam',
        lock_unlock=True, fail_ok=True)

    if 0 == code:
        assert expecting_pass, \
            "OAM MTU is not modified successfully. Result: {}".format(res)
    else:
        assert not expecting_pass, \
            "OAM MTU WAS modified unexpectedly. Result: {}".format(res)

    assert 0 == code_revert, \
        "OAM MTU is not reverted successfully. Result: {}".format(res_revert)

    LOG.tc_step("Check openstack cli, application and pods after modify "
                "and revert {} oam mtu".format(origin_active))
    check_containers(prev_bad_pods, check_app=is_stx_openstack_applied)
def test_is_active_con(): active_con, standby_con = system_helper.get_active_standby_controllers() assert system_helper.is_active_controller(active_con) if standby_con: assert not system_helper.is_active_controller(standby_con)
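# For reference, a sketch of what such an is-active check can reduce to on
# StarlingX: 'system host-show <host>' reports the host personality as
# Controller-Active or Controller-Standby in its capabilities field. The
# helper below is illustrative, not system_helper's implementation, and the
# capabilities detail should be treated as an assumption.
def _is_active_con_sketch(con_ssh, host):
    out = con_ssh.exec_cmd('system host-show {}'.format(host))[1]
    return 'Controller-Active' in out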
def kill_sm_process_and_verify_impact(name, cmd='', pid_file='', retries=2,
                                      impact='swact', host='controller-0',
                                      interval=20, action_timeout=90,
                                      total_retries=3, process_type='sm',
                                      on_active_controller=True,
                                      con_ssh=None,
                                      auth_info=Tenant.get('admin_platform')):
    """
    Kill the process with the specified name and verify the system behaves
    as expected

    Args:
        name (str): name of the process
        cmd (str): executable of the process
        pid_file (str): file containing the process id
        retries (int): number of kills upon which the IMPACT will be
            triggered
        impact (str): expected system behavior, including:
            swact -- the active controller is swacted
            enabled-degraded -- the service status changes to
                enabled-degraded
            disabled-failed -- the service status changes to disabled-failed
            ...
        host (str): host to test on
        interval (int): least time to wait between kills
        action_timeout (int): kills and impact should happen within this
            time frame
        total_retries (int): total number of retries for the whole kill and
            wait actions
        process_type (str): valid types are: sm, pmon, other
        on_active_controller (boolean):
        con_ssh: ssh connection/client to the active controller
        auth_info: auth info used for the CLI queries

    Returns: (pid, host)
        pid:
            >0  success, the final PID of the process
            -1  fail because the impact did NOT happen after killing the
                process up to the threshold times
            -2  fail because the impact happened before the killing
                threshold was reached
            -3  fail after trying total_retries times
        host: the host tested on

    """
    active_controller, standby_controller = \
        system_helper.get_active_standby_controllers(con_ssh=con_ssh,
                                                     auth_info=auth_info)
    if on_active_controller:
        LOG.info('on active controller: {}, host:{}'.format(
            active_controller, host))
        host = active_controller
        con_ssh = con_ssh or ControllerClient.get_active_controller()
    LOG.info('on host: {}'.format(host))

    if total_retries < 1 or retries < 1:
        LOG.error('retries/total-retries < 1? retries:{}, total '
                  'retries:{}'.format(retries, total_retries))
        return None

    count = 0
    for i in range(1, total_retries + 1):
        LOG.info('retry:{:02d} kill the process:{} and verify impact:{}'.
                 format(i, name, impact))

        exec_times = []
        killed_pids = []

        timeout = time.time() + action_timeout * (retries / 2
                                                  if retries > 2 else 1)

        while time.time() < timeout:
            count += 1
            LOG.debug('retry{:02d}-{:02d}: get process info for {} on '
                      'host:{}'.format(i, count, name, host))
            try:
                pid, proc_name = get_process_info(
                    name, cmd=cmd, host=host, process_type=process_type,
                    pid_file=pid_file, con_ssh=con_ssh)[0:2]
            except pexpect.exceptions.EOF:
                LOG.warn('retry{:02d}-{:02d}: Failed to get process id for '
                         '{} on host:{}, swacted unexpectedly?'.format(
                             i, count, name, host))
                time.sleep(interval / 3.0)
                continue

            if -1 == pid:
                LOG.error('retry{:02d}-{:02d}: Failed to get PID for '
                          'process with name:{}, cmd:{}, wait and '
                          'retry'.format(i, count, name, cmd))
                time.sleep(interval / 3.0)
                continue

            if killed_pids and pid in killed_pids:
                LOG.warn('retry{:02d}-{:02d}: No new process re-created, '
                         'prev-pid={}, cur-pid={}'.format(
                             i, count, killed_pids[-1], pid))
                time.sleep(interval / 3.0)
                continue

            last_killed_pid = killed_pids[-1] if killed_pids else None
            killed_pids.append(pid)
            last_kill_time = exec_times[-1] if exec_times else None
            exec_times.append(datetime.datetime.utcnow())

            latest_events = _get_last_events_timestamps(
                event_log_id=KILL_PROC_EVENT_FORMAT[process_type][
                    'event_id'], limit=10)

            LOG.info('retry{:02d}-{:02d}: before kill CLI, proc_name={}, '
                     'pid={}, last_killed_pid={}, last_kill_time={}'.format(
                         i, count, proc_name, pid, last_killed_pid,
                         last_kill_time))
            LOG.info('\tactive-controller={}, standby-controller={}'.format(
                active_controller, standby_controller))

            kill_cmd = '{} {}'.format(KILL_CMD, pid)
            with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con:
                code, output = con.exec_sudo_cmd(kill_cmd, fail_ok=True)
                if 0 != code:
                    # it happens occasionally
                    LOG.error('Failed to kill pid:{}, cmd={}, output=<{}>, '
                              'at run:{}, already terminated?'.format(
                                  pid, kill_cmd, output, count))

            if count < retries:
                # IMPACT should not happen yet
                if not check_impact(impact, proc_name,
                                    last_events=latest_events,
                                    active_controller=active_controller,
                                    standby_controller=standby_controller,
                                    expecting_impact=False,
                                    process_type=process_type,
                                    host=host, con_ssh=con_ssh):
                    LOG.error('Impact:{} observed unexpectedly, it should '
                              'happen only after killing {} times, actual '
                              'killed times:{}'.format(impact, retries,
                                                       count))
                    return -2, host

                LOG.info('retry{:02d}-{:02d}: OK, NO impact as expected, '
                         'impact={}, will kill it another time'.format(
                             i, count, impact))
                time.sleep(max(interval * 1 / 2.0, 5))
            else:
                no_standby_controller = standby_controller is None
                expecting_impact = not no_standby_controller
                if not check_impact(impact, proc_name,
                                    last_events=latest_events,
                                    active_controller=active_controller,
                                    standby_controller=standby_controller,
                                    expecting_impact=expecting_impact,
                                    process_type=process_type, host=host,
                                    con_ssh=con_ssh):
                    LOG.error('No impact after killing process {} {} times, '
                              'while {}'.format(
                                  proc_name, count,
                                  ('expecting impact' if expecting_impact
                                   else 'not expecting impact')))
                    return -1, host

                LOG.info('OK, final retry{:02d}-{:02d}: OK, IMPACT happened '
                         '(if applicable) as expected, impact={}'.format(
                             i, count, impact))

                active_controller, standby_controller = \
                    system_helper.get_active_standby_controllers(
                        con_ssh=con_ssh)

                LOG.info('OK, after impact:{} (tried:{} times), now '
                         'active-controller={}, standby-controller={}'.
                         format(impact, count, active_controller,
                                standby_controller))

                pid, proc_name = get_process_info(
                    name, cmd=cmd, host=host, pid_file=pid_file,
                    process_type=process_type, con_ssh=con_ssh)[0:2]

                return pid, active_controller

    return -3, host
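# Illustrative (hypothetical) call of kill_sm_process_and_verify_impact():
# kill an SM-managed process 'retries' times and expect a swact on the
# final kill. The process name here is a placeholder.
def _example_kill_sm_process():
    pid, host = kill_sm_process_and_verify_impact(
        name='some-sm-managed-process', retries=2, impact='swact',
        process_type='sm', on_active_controller=True)
    assert pid > 0, 'kill/impact verification failed on {}'.format(host)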
def test_measure_swact_recover(no_simplex):
    res = list()

    try:
        for i in range(2):
            LOG.tc_step("Start of iter {}".format(i))
            con_ssh = ssh.ControllerClient.get_active_controller()

            LOG.info("Get active/standby controllers")
            active_controller, standby_controller = \
                system_helper.get_active_standby_controllers()

            cmd_get_offset = ("ntpq -p | grep {} -A1 | "
                              "tail -1 | awk '{{print$8}}'".format(
                                  active_controller))
            cmd_get_start_date = ("cat /var/log/mtcAgent.log | "
                                  "grep \"{} Action=swact\" | "
                                  "tail -1 | awk '{{print$1}}'".format(
                                      active_controller))
            cmd_get_end_date = ("cat /var/log/mtcAgent.log | "
                                "grep \"{} Task: Swact: Complete\" | "
                                "tail -1 | awk '{{print$1}}'".format(
                                    active_controller))

            LOG.info("Start swact action")
            host_helper.swact_host(hostname=active_controller)
            kube_helper.wait_for_nodes_ready(
                hosts=(active_controller, standby_controller),
                check_interval=20)

            LOG.info("Calculate swact time")
            con_ssh = ssh.ControllerClient.get_active_controller()
            with host_helper.ssh_to_host(active_controller,
                                         con_ssh=con_ssh) as con_0_ssh:
                con_0_ssh.exec_cmd(cmd="cat /var/log/mtcAgent.log",
                                   get_exit_code=False)
                st = con_0_ssh.exec_cmd(cmd=cmd_get_start_date,
                                        get_exit_code=False)[1]
                st_date = datetime.datetime.strptime(
                    st, '%Y-%m-%dT%H:%M:%S.%f')

            offset = float(con_ssh.exec_cmd(
                cmd=cmd_get_offset, get_exit_code=False)[1]) / 1000
            et = con_ssh.exec_cmd(cmd=cmd_get_end_date,
                                  get_exit_code=False)[1]
            et_date = datetime.datetime.fromtimestamp(
                datetime.datetime.strptime(
                    et, '%Y-%m-%dT%H:%M:%S.%f').timestamp() - offset)
            diff = et_date - st_date

            LOG.info("\nstart time = {}\nend time = {}".format(st, et))
            LOG.info("\ndiff = {}".format(diff))
            res.append(diff)
    finally:
        active_controller, standby_controller = \
            system_helper.get_active_standby_controllers()
        if active_controller != "controller-0":
            host_helper.swact_host(hostname=active_controller)
            kube_helper.wait_for_nodes_ready(
                hosts=(active_controller, standby_controller),
                check_interval=20)

    def calc_avg(lst):
        rtrn_sum = datetime.timedelta()
        for i in lst:
            LOG.info("Iter {}: {}".format(lst.index(i), i))
            rtrn_sum += i
        return rtrn_sum / len(lst)

    final_res = calc_avg(res)
    LOG.info("Avg time is : {}".format(final_res))