Example #1
def test_swact_controller_platform(wait_for_con_drbd_sync_complete):
    """
    Verify swact active controller

    Test Steps:
        - Swact active controller
        - Verify standby controller and active controller are swapped
        - Verify nodes are ready in kubectl get nodes

    """
    if system_helper.is_aio_simplex():
        skip("Simplex system detected")

    if not wait_for_con_drbd_sync_complete:
        skip(SkipSysType.LESS_THAN_TWO_CONTROLLERS)

    LOG.tc_step('retrieve active and available controllers')
    pre_active_controller, pre_standby_controller = \
        system_helper.get_active_standby_controllers()
    assert pre_standby_controller, "No standby controller available"

    LOG.tc_step("Swact active controller and ensure active controller "
                "is changed")
    host_helper.swact_host(hostname=pre_active_controller)

    LOG.tc_step("Check hosts are Ready in kubectl get nodes after swact")
    kube_helper.wait_for_nodes_ready(hosts=(pre_active_controller,
                                            pre_standby_controller),
                                     timeout=30)
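
For reference, a rough stdlib-only sketch of the check that kube_helper.wait_for_nodes_ready performs, assuming kubectl is on PATH where the test runs; the helper above comes from the stx-test framework, everything below is illustrative.

import subprocess

def nodes_ready(hosts):
    # Parse 'kubectl get nodes --no-headers'; each line looks like:
    #   controller-0   Ready   control-plane   12d   v1.24.4
    output = subprocess.check_output(
        ['kubectl', 'get', 'nodes', '--no-headers'], text=True)
    status = {line.split()[0]: line.split()[1]
              for line in output.splitlines() if line.strip()}
    return all(status.get(host) == 'Ready' for host in hosts)
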
Example #2
def sys_lock_unlock_standby(number_of_times=1):
    """
    This is to identify the storage nodes and turn them off and on via vlm
    :return:
    """
    timeout = VMTimeout.DHCP_RETRY if system_helper.is_aio_system() \
        else VMTimeout.PING_VM
    for i in range(0, number_of_times):
        active, standby = system_helper.get_active_standby_controllers()
        LOG.tc_step("Doing iteration of {} of total iteration {}".format(
            i, number_of_times))
        LOG.tc_step("'sudo reboot -f' from {}".format(standby))
        host_helper.lock_host(host=standby)

        LOG.tc_step("Check vms status after locking standby")
        vms = get_all_vms()
        vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)

        for vm in vms:
            vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm,
                                                       timeout=timeout)

        host_helper.unlock_host(host=standby)
        vms = get_all_vms()
        vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)

        for vm in vms:
            vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm)
Example #3
    def remove():
        LOG.fixture_step("Removing custom firewall rules")
        user_file_dir = ProjVar.get_var('USER_FILE_DIR')
        empty_path = user_file_dir + "iptables-empty.rules"
        client = get_cli_client(central_region=True)
        client.exec_cmd('touch {}'.format(empty_path))
        _modify_firewall_rules(empty_path)

        active, standby = system_helper.get_active_standby_controllers()
        con_ssh = ControllerClient.get_active_controller()
        LOG.fixture_step("Verify custom ports on {}".format(active))
        for port in custom_ports:
            # Verifying ports that are in the iptables file are closed
            _verify_port_from_natbox(con_ssh, port, port_expected_open=False)

        if standby:
            LOG.fixture_step("Swact {}".format(active))
            host_helper.swact_host(active)

            LOG.fixture_step("Verify custom ports on {}".format(standby))
            for port in custom_ports:
                # Verifying ports that are in the iptables file are closed after swact
                _verify_port_from_natbox(con_ssh,
                                         port,
                                         port_expected_open=False)
Example #4
def is_controller_swacted(prev_active, prev_standby,
                          swact_start_timeout=MTCTimeout.KILL_PROCESS_SWACT_NOT_START,
                          swact_complete_timeout=MTCTimeout.KILL_PROCESS_SWACT_COMPLETE,
                          con_ssh=None):
    """
    Wait and check whether the active controller on the system was 'swacted' within the given time period

    Args:
        prev_active:            previous active controller
        prev_standby:           previous standby controller
        swact_start_timeout:    check within this time frame if the swacting started
        swact_complete_timeout: check if the swacting (if any) completed in this time period
        con_ssh:                ssh connection/client to the current active-controller

    Returns (bool):
        True if the swact completed (or the controllers are observed
        swapped), False otherwise

    """
    LOG.info('Check if the controllers start to swact within:{}, and complete within:{}'.format(
        swact_start_timeout, swact_complete_timeout))

    code = -1
    host = prev_active
    for retry in range(1, 5):
        LOG.info('retry{:02d}: checking if swacting triggered, prev-active-controller={}'.format(retry, prev_active))
        code = 0
        try:
            code, msg = host_helper.wait_for_swact_complete(
                host, con_ssh=con_ssh, fail_ok=True,
                swact_start_timeout=swact_start_timeout, swact_complete_timeout=swact_complete_timeout)

            if 0 == code:
                LOG.info('OK, host-swacted, prev-active:{}, prev-standby:{}, code:{}, message:{}'.format(
                    prev_active, prev_standby, code, msg))
                return True

            active, standby = system_helper.get_active_standby_controllers()
            if active == prev_standby and standby == prev_active:
                LOG.info('swacted?! prev-active:{} prev-standby:{}, cur active:{}, cur standby:{}'.format(
                    prev_active, prev_standby, active, standby))
                return True
            break

        except Exception as e:
            LOG.warn('Error raised, indicating the system is unstable, probably because a swact is in '
                     'progress. previous active-controller:{}, previous standby-controller:{}\nerror message:{}'.
                     format(prev_active, prev_standby, e))

            if retry >= 4:
                LOG.error('Failing the test after {} retries; the system remains unstable, '
                          'probably because a swact is in progress. previous active-controller:{}, '
                          'previous standby-controller:{}\nerror message:{}'.
                          format(retry, prev_active, prev_standby, e))
                raise

        time.sleep(10)

    return 0 == code
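
A hypothetical call site for is_controller_swacted(), assuming the stx-test style imports used throughout these examples; the assert message is illustrative.

from keywords import system_helper, host_helper

prev_active, prev_standby = system_helper.get_active_standby_controllers()
host_helper.swact_host(hostname=prev_active)

# True once the swact is observed complete, or the controllers are already
# swapped; raises only if the system stays unstable through all retries
assert is_controller_swacted(prev_active, prev_standby), \
    'controllers did not swact within the allowed window'
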
Example #5
def pre_configs(request):
    """
    Dovetail test fixture
    Args:
        request:

    - configure sshd_config on tis hosts to allow root access
    - update conf files on dovetail test node on cumulus

    """
    if not ComplianceVar.get_var('DOVETAIL_SUITE'):
        skip('--dovetail-suite unspecified.')

    try:
        import yaml
    except ImportError:
        skip('pyyaml package is not installed.')

    computes = host_helper.get_up_hypervisors()
    if len(computes) < 2:
        skip('Less than 2 computes in available states')

    active, standby = system_helper.get_active_standby_controllers()
    if not standby:
        skip('No standby controller on system')

    LOG.fixture_step(
        "Ensure dovetail test node mgmt nic connects to lab under test")
    compliance_helper.update_dovetail_mgmt_interface()

    controllers = [active, standby]
    storages = system_helper.get_hosts(personality='storage',
                                       availability=HostAvailState.AVAILABLE)
    hosts_dict = {
        'controller': controllers,
        'compute': computes,
        'storage': storages
    }
    all_hosts = list(set(controllers + computes + storages))

    LOG.fixture_step(
        "Enable port_security for the system and update existing networks")
    port_security = network_helper.get_network_values(
        'external-net0', 'port_security_enabled')[0]
    # parse the 'True'/'False' string without eval
    port_security = str(port_security).strip().lower() == 'true'
    if not port_security:
        system_helper.add_ml2_extension_drivers(drivers='port_security')
        networks = network_helper.get_networks(auth_info=Tenant.get('admin'))
        for net in networks:
            network_helper.set_network(net_id=net, enable_port_security=True)

    configure_tis(all_hosts, request=request)
    configure_dovetail_server(hosts_per_personality=hosts_dict)
Example #6
def test_reboot_standby_controller(no_simplex):
    active, standby = system_helper.get_active_standby_controllers()
    LOG.tc_step("'sudo reboot -f' from {}".format(standby))
    host_helper.reboot_hosts(standby,
                             wait_for_offline=True,
                             wait_for_reboot_finish=True,
                             force_reboot=True)
    system_helper.wait_for_hosts_states(standby,
                                        timeout=360,
                                        check_interval=30,
                                        availability=['available'])
    kube_helper.wait_for_pods_healthy(check_interval=30, all_namespaces=True)
Example #7
def _test_firewall_rules_custom(remove_custom_firewall):
    """
    Verify specified ports from the custom firewall rules are open and non-specified ports are closed.

    Skip Condition:
        - N/A

    Test Setup:
        - SCP iptables.rules from test server to lab

    Test Steps:
        - Install custom firewall rules
        - Check ports that should be both open and closed based on the custom firewall rules
        - Swact and check ports that should be both open and closed based on the custom firewall rules
        - Remove custom firewall rules
        - Check ports that are in the custom firewall rules are no longer open
        - Swact and check ports that are in the custom firewall rules are no longer open
    """
    # The following ports must be in the iptables.rules file or the test will fail
    custom_ports, firewall_rules_path = remove_custom_firewall

    LOG.tc_step("Installing custom firewall rules")
    _modify_firewall_rules(firewall_rules_path)

    active_controller, standby_controller = \
        system_helper.get_active_standby_controllers()
    con_ssh = ControllerClient.get_active_controller()

    LOG.tc_step("Verify custom ports on {}".format(active_controller))
    for port in custom_ports:
        # Verifying ports that are in the iptables file are open
        _verify_port_from_natbox(con_ssh, port, port_expected_open=True)

        # Verifying ports that are not in the iptables file are still closed
        _verify_port_from_natbox(con_ssh, port + 1, port_expected_open=False)

    if standby_controller:
        LOG.tc_step("Swact {}".format(active_controller))
        host_helper.swact_host(active_controller)
        active_controller = system_helper.get_active_controller_name()
        con_ssh = ControllerClient.get_active_controller()

        LOG.tc_step("Verify custom ports on {}".format(active_controller))
        for port in custom_ports:
            # Verifying ports that are in the iptables file are open after swact
            _verify_port_from_natbox(con_ssh, port, port_expected_open=True)

            # Verifying ports that are not in the iptables file are still closed after swact
            _verify_port_from_natbox(con_ssh,
                                     port + 1,
                                     port_expected_open=False)
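
_verify_port_from_natbox is private to this test module; below is a minimal stdlib sketch of the same idea, assuming a plain TCP connect from a host that can reach the controller's OAM address is an acceptable probe.

import socket

def port_open(host, port, timeout=5):
    # True if a TCP connection to host:port succeeds within the timeout
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

def verify_port(host, port, port_expected_open):
    state = 'open' if port_expected_open else 'closed'
    assert port_open(host, port) == port_expected_open, \
        'port {} on {} expected to be {}'.format(port, host, state)
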
Example #8
def test_swact_controller_platform(wait_for_con_drbd_sync_complete,
                                   collect_kpi):
    """
    Verify swact active controller

    Test Steps:
        - Swact active controller
        - Verify standby controller and active controller are swapped
        - Verify nodes are ready in kubectl get nodes

    """
    if system_helper.is_aio_simplex():
        skip("Simplex system detected")

    if not wait_for_con_drbd_sync_complete:
        skip(SkipSysType.LESS_THAN_TWO_CONTROLLERS)

    LOG.tc_step('retrieve active and available controllers')
    pre_active_controller, pre_standby_controller = \
        system_helper.get_active_standby_controllers()
    assert pre_standby_controller, "No standby controller available"

    collect_kpi = None if container_helper.is_stx_openstack_deployed() \
        else collect_kpi
    init_time = None
    if collect_kpi:
        init_time = common.get_date_in_format(date_format=KPI_DATE_FORMAT)

    LOG.tc_step(
        "Swact active controller and ensure active controller is changed")
    host_helper.swact_host(hostname=pre_active_controller)

    LOG.tc_step("Check hosts are Ready in kubectl get nodes after swact")
    kube_helper.wait_for_nodes_ready(hosts=(pre_active_controller,
                                            pre_standby_controller),
                                     timeout=30)

    if collect_kpi:
        kpi_name = SwactPlatform.NAME
        kpi_log_parser.record_kpi(local_kpi_file=collect_kpi,
                                  kpi_name=kpi_name,
                                  init_time=init_time,
                                  log_path=SwactPlatform.LOG_PATH,
                                  end_pattern=SwactPlatform.END,
                                  host=pre_standby_controller,
                                  start_host=pre_active_controller,
                                  start_pattern=SwactPlatform.START,
                                  start_path=SwactPlatform.START_PATH,
                                  uptime=1,
                                  fail_ok=False)
Example #9
def clear_config_out_of_date_alarm():
    active, standby = system_helper.get_active_standby_controllers()
    for host in (standby, active):
        if host and system_helper.wait_for_alarm(
                alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                timeout=5,
                entity_id=host,
                fail_ok=True)[0]:
            host_helper.lock_host(host, swact=True)
            time.sleep(60)
            host_helper.unlock_host(host)
            system_helper.wait_for_alarm_gone(
                alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                entity_id=host,
                fail_ok=False)
Example #10
def test_reapply_stx_openstack_no_change(stx_openstack_applied_required, check_nodes, controller):
    """
    Args:
        stx_openstack_applied_required:

    Pre-requisite:
        - stx-openstack application in applied state

    Test Steps:
        - Re-apply stx-openstack application
        - Check openstack pods healthy

    """
    # if controller == 'controller-1':
    #     skip("CGTS-10708")

    if system_helper.is_aio_simplex() and controller != 'controller-0':
        skip('Simplex system only has controller-0')

    active, standby = system_helper.get_active_standby_controllers()
    if active != controller:
        if not standby:
            skip('{} is not ready to take over'.format(controller))

        LOG.tc_step("Swact active controller to test reapply from {}".format(controller))
        host_helper.swact_host()
        time.sleep(60)

    LOG.info("helm list before reapply after swact")
    from utils.clients.ssh import ControllerClient
    con_ssh = ControllerClient.get_active_controller()
    end_time = time.time() + 180
    while time.time() < end_time:
        code = con_ssh.exec_cmd('helm list', expect_timeout=60)[0]
        if code == 0:
            break
        time.sleep(30)

    LOG.tc_step("Re-apply stx-openstack application")
    container_helper.apply_app(app_name='stx-openstack')

    LOG.tc_step("Check openstack pods in good state on all controllers after stx-openstack "
                "re-applied")
    for host in get_valid_controllers():
        check_openstack_pods_healthy(host=host, timeout=120)
Example #11
def test_host_operations_with_custom_kubectl_app(deploy_delete_kubectl_app):
    """
    Test create, delete custom app via kubectl run cmd
    Args:
        deploy_delete_kubectl_app: fixture

    Setups:
        - Create kubectl app via kubectl run

    Test Steps:
        - If duplex: swact and verify pod still Running
        - Lock/unlock controller and verify pod still Running

    Teardown:
        - Delete kubectl deployment and service
        - Verify pod is removed

    """
    app_name, pod_name = deploy_delete_kubectl_app
    active, standby = system_helper.get_active_standby_controllers()

    if standby:
        LOG.tc_step("Swact active controller and verify {} test app is "
                    "running ".format(pod_name))
        host_helper.swact_host()
        kube_helper.wait_for_pods_status(pod_names=pod_name,
                                         namespace='default',
                                         fail_ok=False)

    LOG.tc_step("Lock/unlock {} and verify {} test app is "
                "running.".format(active, pod_name))
    HostsToRecover.add(active)
    host_helper.lock_host(active, swact=False)

    # wait for services to stabilize before unlocking
    time.sleep(20)

    host_helper.unlock_host(active)
    pod_name = kube_helper.get_pods(field='NAME',
                                    namespace='default',
                                    name=app_name,
                                    strict=False)[0]
    kube_helper.wait_for_pods_status(pod_names=pod_name,
                                     namespace=None,
                                     fail_ok=False)
Example #12
def sys_uncontrolled_swact(number_of_times=1):
    """
    This is to identify the storage nodes and turn them off and on via vlm
    :return:
    """
    for i in range(0, number_of_times):
        active, standby = system_helper.get_active_standby_controllers()
        LOG.tc_step("Doing iteration of {} of total iteration {}".format(
            i, number_of_times))
        LOG.tc_step("'sudo reboot -f' from {}".format(standby))
        host_helper.reboot_hosts(hostnames=active)

        LOG.tc_step("Check vms status after controller swact")
        vms = get_all_vms()
        vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)

        for vm in vms:
            vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm)
Example #13
def _test_firewall_rules_default():
    """
    Verify default ports are open.

    Test Steps:
        - Confirm iptables service is running on active controller
        - Check if lab is http(s), add corresponding port to check
        - Confirm the default ports are open
        - Swact and repeat the above steps
    """
    # Cannot test connecting to the ports as they are in use.

    default_ports = [
        123, 161, 199, 5000, 6080, 6385, 8000, 8003, 8004, 8041, 8774, 8776,
        8778, 9292, 9696, 15491
    ]

    from consts.proj_vars import ProjVar
    region = ProjVar.get_var('REGION')
    if region != 'RegionOne' and region in MULTI_REGION_MAP:
        default_ports.remove(5000)
        default_ports.remove(9292)

    default_ports.append(8443 if CliAuth.get_var('HTTPS') else 8080)

    active_controller = system_helper.get_active_controller_name()
    con_ssh = ControllerClient.get_active_controller()

    _verify_iptables_status(con_ssh, active_controller)
    _check_ports_with_netstat(con_ssh, active_controller, default_ports)

    active_controller, new_active = \
        system_helper.get_active_standby_controllers()
    if new_active:
        LOG.tc_step(
            "Swact {} and verify firewall rules".format(active_controller))
        host_helper.swact_host(active_controller)
        con_ssh = ControllerClient.get_active_controller()

        _verify_iptables_status(con_ssh, new_active)
        _check_ports_with_netstat(con_ssh, new_active, default_ports)
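
_check_ports_with_netstat is also private to this module; a rough sketch of the idea, reusing the con_ssh client the way these examples do and parsing 'netstat -lnt' output. The function name and exact matching logic are assumptions.

def check_ports_listening(con_ssh, ports):
    # 'netstat -lnt' prints LISTEN sockets; the 4th field is the local
    # address, e.g. '0.0.0.0:5000' or ':::8443'
    output = con_ssh.exec_cmd('netstat -lnt', get_exit_code=False)[1]
    listening = set()
    for line in output.splitlines():
        fields = line.split()
        if len(fields) >= 4 and ':' in fields[3]:
            listening.add(fields[3].rsplit(':', 1)[-1])
    missing = [port for port in ports if str(port) not in listening]
    assert not missing, 'ports not listening: {}'.format(missing)
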
Example #14
    def _lock_unlock_controllers():
        LOG.fixture_step("Sleep for 300 seconds after admin password change")
        time.sleep(300)
        if more_than_one_controllers:
            active, standby = system_helper.get_active_standby_controllers()
            if standby:
                LOG.fixture_step(
                    "(Session) Locking unlocking controllers to complete "
                    "action")
                host_helper.lock_host(standby)
                host_helper.unlock_host(standby)

                host_helper.lock_host(active, swact=True)
                host_helper.unlock_host(active)
            else:
                LOG.warning("Standby controller unavailable. Skip lock unlock "
                            "controllers post admin password change.")
        elif system_helper.is_aio_simplex():
            LOG.fixture_step(
                "(Session) Simplex lab - lock/unlock controller to complete "
                "action")
            host_helper.lock_host('controller-0', swact=False)
            host_helper.unlock_host('controller-0')
Example #15
def test_swact_uncontrolled_kpi_platform(collect_kpi):
    if not collect_kpi or container_helper.is_stx_openstack_deployed():
        skip("KPI test for platform only. Skipped because kpi collection is "
             "not enabled or the openstack application is deployed.")

    start_host, end_host = system_helper.get_active_standby_controllers()
    if not end_host:
        skip("No standby host to swact to")

    init_time = common.get_date_in_format(date_format=KPI_DATE_FORMAT)
    host_helper.reboot_hosts(hostnames=start_host)
    kpi_name = SwactUncontrolledPlatform.NAME
    kpi_log_parser.record_kpi(local_kpi_file=collect_kpi,
                              kpi_name=kpi_name,
                              init_time=init_time,
                              log_path=SwactUncontrolledPlatform.LOG_PATH,
                              end_pattern=SwactUncontrolledPlatform.END,
                              host=end_host,
                              start_host=start_host,
                              start_pattern=SwactUncontrolledPlatform.START,
                              start_path=SwactUncontrolledPlatform.START_PATH,
                              uptime=5,
                              fail_ok=False)
Example #16
def test_detect_failed_controller(no_simplex):
    con_ssh = ssh.ControllerClient.get_active_controller()
    active_controller, controller_host = system_helper.get_active_standby_controllers()

    controller_su_prompt = r'.*controller\-([0-9]){1,}\:/home/sysadmin#'
    cmd_get_offset = ("ntpq -p | grep {} -A1 | "
                      "tail -1 | awk '{{print$8}}'".format(active_controller))
    cmd_magic_keys_enable = ("echo 1 > /proc/sys/kernel/sysrq")
    cmd_get_start_date = ("python -c \"import datetime; "
                          "print str(datetime.datetime.now())[:-3]\"")
    cmd_get_end_date = ("cat /var/log/mtcAgent.log | "
                        "grep --color=never \"{} MNFA new candidate\" | "
                        "tail -1 | awk '{{print$1}}'".format(controller_host))
    cmd_get_recovered_date = ("cat /var/log/mtcAgent.log | "
                              "grep --color=never '{} unlocked-enabled-available' | "
                              "tail -1 | awk '{{print$1}}'".format(controller_host))
    cmd_trigger_reboot = ("echo b > /proc/sysrq-trigger")

    res = list()
    rec_res = list()

    for i in range(20):
        LOG.tc_step("Start of iter {}".format(i))
        with host_helper.ssh_to_host(controller_host) as node_ssh:
            offset = float(node_ssh.exec_cmd(cmd=cmd_get_offset, get_exit_code=False)[1])/1000
            node_ssh.send_sudo(cmd="su")
            node_ssh.expect(controller_su_prompt)
            node_ssh.send_sudo(cmd=cmd_magic_keys_enable)
            node_ssh.expect(controller_su_prompt)
            st = node_ssh.exec_cmd(cmd=cmd_get_start_date, get_exit_code=False,
                                   blob=controller_su_prompt)[1]
            node_ssh.exec_sudo_cmd(cmd_trigger_reboot, get_exit_code=False)

        system_helper.wait_for_hosts_states(controller_host, check_interval=20,
                                            availability=HostAvailState.AVAILABLE)
        pods_health = kube_helper.wait_for_pods_healthy(check_interval=20,
                                                        timeout=HostTimeout.REBOOT)
        assert pods_health is True, "Check PODs health has failed"

        st_date = datetime.datetime.fromtimestamp(
            datetime.datetime.strptime(st, '%Y-%m-%d %H:%M:%S.%f').timestamp() - offset)
        et = con_ssh.exec_cmd(cmd=cmd_get_end_date, get_exit_code=False)[1]
        et_date = datetime.datetime.strptime(et, '%Y-%m-%dT%H:%M:%S.%f')
        er = con_ssh.exec_cmd(cmd=cmd_get_recovered_date, get_exit_code=False)[1]
        er_date = datetime.datetime.strptime(er, '%Y-%m-%dT%H:%M:%S.%f')
        diff = et_date - st_date
        rec_diff = er_date - st_date
        LOG.info(("\noffset = {}\n"
                  "start time = {}\n"
                  "end time = {}\n"
                  "recover time = {}".format(offset, st, et, er)))
        LOG.info("\ndiff = {}".format(diff))
        LOG.info("\nrecover diff = {}".format(rec_diff))
        res.append(diff)
        rec_res.append(rec_diff)

    def calc_avg(lst):
        # enumerate instead of lst.index(): index() returns the first match,
        # which mislabels iterations when two durations are equal
        rtrn_sum = datetime.timedelta()
        for idx, duration in enumerate(lst):
            LOG.info("Iter {}: {}".format(idx, duration))
            rtrn_sum += duration
        return rtrn_sum / len(lst)

    final_res = calc_avg(res)
    final_rec_res = calc_avg(rec_res)
    LOG.info("Avg time is : {}".format(final_res))
    LOG.info("Avg rec time is : {}".format(final_rec_res))
Example #17
def test_swact_controllers(stx_openstack_required,
                           wait_for_con_drbd_sync_complete):
    """
    Verify swact active controller

    Test Steps:
        - Boot a vm on system and check ping works
        - Swact active controller
        - Verify standby controller and active controller are swapped
        - Verify vm is still pingable

    """
    if not wait_for_con_drbd_sync_complete:
        skip(SkipSysType.LESS_THAN_TWO_CONTROLLERS)

    LOG.tc_step('retrieve active and available controllers')
    pre_active_controller, pre_standby_controller = \
        system_helper.get_active_standby_controllers()
    assert pre_standby_controller, "No standby controller available"

    pre_res_sys, pre_msg_sys = system_helper.wait_for_services_enable(
        timeout=20, fail_ok=True)
    up_hypervisors = host_helper.get_up_hypervisors()
    pre_res_neutron, pre_msg_neutron = network_helper.wait_for_agents_healthy(
        up_hypervisors, timeout=20, fail_ok=True)

    LOG.tc_step("Boot a vm from image and ping it")
    vm_id_img = vm_helper.boot_vm(name='swact_img',
                                  source='image',
                                  cleanup='function')[1]
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_img)

    LOG.tc_step("Boot a vm from volume and ping it")
    vm_id_vol = vm_helper.boot_vm(name='swact', cleanup='function')[1]
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_vol)

    LOG.tc_step(
        "Swact active controller and ensure active controller is changed")
    host_helper.swact_host(hostname=pre_active_controller)

    LOG.tc_step("Verify standby controller and active controller are swapped")
    post_active_controller = system_helper.get_active_controller_name()
    post_standby_controller = system_helper.get_standby_controller_name()

    assert pre_standby_controller == post_active_controller, \
        "Prev standby: {}; Post active: {}".format(
            pre_standby_controller, post_active_controller)
    assert pre_active_controller == post_standby_controller, \
        "Prev active: {}; Post standby: {}".format(
            pre_active_controller, post_standby_controller)

    LOG.tc_step("Check boot-from-image vm still pingable after swact")
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_img, timeout=30)
    LOG.tc_step("Check boot-from-volume vm still pingable after swact")
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_vol, timeout=30)

    LOG.tc_step(
        "Check system services and neutron agents after swact from {}".format(
            pre_active_controller))
    post_res_sys, post_msg_sys = system_helper.wait_for_services_enable(
        fail_ok=True)
    post_res_neutron, post_msg_neutron = network_helper.wait_for_agents_healthy(
        hosts=up_hypervisors, fail_ok=True)

    assert post_res_sys, \
        "\nPost-swact system services stats: {}\nPre-swact system services stats: {}". \
        format(post_msg_sys, pre_msg_sys)
    assert post_res_neutron, \
        "\nPost-swact neutron agents stats: {}\nPre-swact neutron agents stats: {}". \
        format(post_msg_neutron, pre_msg_neutron)

    LOG.tc_step("Check hosts are Ready in kubectl get nodes after swact")
    kube_helper.wait_for_nodes_ready(hosts=(pre_active_controller,
                                            pre_standby_controller),
                                     timeout=30)
Example #18
def _test_basic_swift_provisioning(pool_size, pre_swift_check):
    """
    Verifies basic swift provisioning works as expected
    Args:
        pool_size:
        pre_swift_check:

    Returns:

    """
    ceph_backend_info = get_ceph_backend_info()

    if pool_size == 'default' and pre_swift_check[0]:
        skip("Swift is already provisioned")

    if pool_size == 'fixed_size' and pre_swift_check[0]:
        skip("Swift is already provisioned and set to non-default pool value")

    object_pool_gib = None
    cinder_pool_gib = ceph_backend_info['cinder_pool_gib']

    if pool_size == 'default':
        if not ceph_backend_info['object_gateway']:
            LOG.tc_step("Enabling SWIFT object store .....")

    else:
        if not ceph_backend_info['object_gateway']:
            skip("Swift is not provisioned")

        total_gib = ceph_backend_info['ceph_total_space_gib']
        unallocated_gib = (total_gib - cinder_pool_gib -
                           ceph_backend_info['glance_pool_gib'] -
                           ceph_backend_info['ephemeral_pool_gib'])
        if unallocated_gib == 0:
            unallocated_gib = int(int(cinder_pool_gib) / 4)
            cinder_pool_gib = str(int(cinder_pool_gib) - unallocated_gib)
        elif unallocated_gib < 0:
            skip("Unallocated gib < 0. System is in unknown state.")

        object_pool_gib = str(unallocated_gib)
        LOG.tc_step(
            "Enabling SWIFT object store and setting object pool size to {}....."
            .format(object_pool_gib))

    rc, updated_backend_info = storage_helper.modify_storage_backend(
        'ceph',
        object_gateway=False,
        cinder=cinder_pool_gib,
        object_gib=object_pool_gib,
        services='cinder,glance,nova,swift')

    LOG.info("Verifying if swift object gateway is enabled...")
    assert str(updated_backend_info['object_gateway']).lower() == 'true', "Failed to enable Swift object gateway: {}"\
        .format(updated_backend_info)
    LOG.info("Swift object gateway is enabled.")

    LOG.info("Verifying ceph task ...")
    state = storage_helper.get_storage_backends(backend='ceph',
                                                field='state')[0]
    if system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                    timeout=10,
                                    fail_ok=True,
                                    entity_id='controller-')[0]:
        LOG.info("Verifying ceph task is set to 'add-object-gateway'...")
        assert BackendState.CONFIGURING == state, \
            "Unexpected ceph state '{}' after swift object gateway update ".format(state)

        LOG.info("Lock/Unlock controllers...")
        active_controller, standby_controller = \
            system_helper.get_active_standby_controllers()
        LOG.info("Active Controller is {}; Standby Controller is {}...".format(
            active_controller, standby_controller))

        for controller in [standby_controller, active_controller]:
            if not controller:
                continue
            HostsToRecover.add(controller)
            host_helper.lock_host(controller, swact=True)
            storage_helper.wait_for_storage_backend_vals(
                backend='ceph-store',
                **{
                    'task': BackendTask.RECONFIG_CONTROLLER,
                    'state': BackendState.CONFIGURING
                })
            host_helper.unlock_host(controller)

        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE, fail_ok=False)
    else:
        assert BackendState.CONFIGURED == state, \
            "Unexpected ceph state '{}' after swift object gateway update ".format(state)

    LOG.info("Verifying Swift provisioning setups...")
    assert verify_swift_object_setup(), "Failure in swift setups"

    for i in range(3):
        vm_name = 'vm_swift_api_{}'.format(i)
        LOG.tc_step(
            "Boot vm {} and perform nova actions on it".format(vm_name))
        vm_id = vm_helper.boot_vm(name=vm_name, cleanup='function')[1]
        vm_helper.wait_for_vm_pingable_from_natbox(
            vm_id, timeout=VMTimeout.DHCP_RETRY)

        LOG.info("Cold migrate VM {} ....".format(vm_name))
        rc = vm_helper.cold_migrate_vm(vm_id=vm_id)[0]
        assert rc == 0, "VM {} failed to cold migrate".format(vm_name)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

        LOG.info("Live migrate VM {} ....".format(vm_name))
        rc = vm_helper.live_migrate_vm(vm_id=vm_id)[0]
        assert rc == 0, "VM {} failed to live migrate".format(vm_name)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

        LOG.info("Suspend/Resume VM {} ....".format(vm_name))
        vm_helper.suspend_vm(vm_id)
        vm_helper.resume_vm(vm_id)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

    LOG.info("Checking overall system health...")
    assert system_helper.get_system_health_query(), \
        "System health not OK after VMs"

    LOG.tc_step("Create Swift container using swift post cli command ...")
    container_names = [
        "test_container_1", "test_container_2", "test_container_3"
    ]

    for container in container_names:
        LOG.info("Creating swift object container {}".format(container))
        rc, out = swift_helper.create_swift_container(container)
        assert rc == 0, "Fail to create swift container {}".format(container)
        LOG.info(
            "Create swift object container {} successfully".format(container))

    LOG.tc_step("Verify swift list to list containers ...")
    container_list = swift_helper.get_swift_containers()[1]
    assert set(container_names) <= set(container_list), "Swift containers {} not listed in {}"\
        .format(container_names, container_list)

    LOG.tc_step("Verify swift delete a container...")
    container_to_delete = container_names[2]
    rc, out = swift_helper.delete_swift_container(container_to_delete)
    assert rc == 0, "Swift delete container rejected: {}".format(out)
    assert container_to_delete not in swift_helper.get_swift_containers()[1], "Unable to delete swift container {}"\
        .format(container_to_delete)

    LOG.tc_step("Verify swift stat to show info of a single container...")
    container_to_stat = container_names[0]
    out = swift_helper.get_swift_container_stat_info(container_to_stat)
    assert out["Container"] == container_to_stat, "Unable to stat swift container {}"\
        .format(container_to_stat)
    assert out["Objects"] == '0', "Incorrect number of objects container {}. Expected O objects, but has {} objects"\
        .format(container_to_stat, out["Objects"])
Example #19
def test_modify_mtu_oam_interface(mtu_range):
    """

    From the 2016-04-04 sysinv_test_plan.pdf:
    20) Change the MTU value of the OAM interface using CLI

    Verify that MTU on oam interfaces on both standby and active controller can be modified by cli

    Args:
        mtu_range (str): A string that contain the mtu want to be tested

    Setup:
        - Nothing

    Test Steps:
        - lock standby controller
        - modify the imtu value of the controller
        - unlock the controller
        - revert the oam mtu of the controller and check system is still healthy
        - swact the controller
        - lock the controller
        - modify the imtu value of the controller
        - unlock the controller
        - check the controllers have expected mtu
        - revert the oam mtu of the controller and check system is still healthy

    Teardown:
        - Nothing

    """
    is_sx = system_helper.is_aio_simplex()
    origin_active, origin_standby = system_helper.get_active_standby_controllers()
    if not origin_standby and not is_sx:
        skip("Standby controller unavailable. Cannot lock controller.")

    mtu = __get_mtu_to_mod(providernet_name='-ext', mtu_range=mtu_range)
    first_host = origin_active if is_sx else origin_standby
    max_mtu, cur_mtu, nic_name = get_max_allowed_mtus(host=first_host, network_type='oam')
    LOG.info('OK, the max MTU for {} is {}'.format(nic_name, max_mtu))

    expecting_pass = not max_mtu or mtu <= max_mtu
    if not expecting_pass:
        LOG.warn('Expecting to fail in changing MTU: changing to:{}, max-mtu:{}'.format(mtu, max_mtu))

    oam_attributes = host_helper.get_host_interfaces(host=first_host, field='attributes', name='oam', strict=False)

    # sample attributes: [MTU=9216,AE_MODE=802.3ad]
    pre_oam_mtu = int(oam_attributes[0].split(',')[0].split('=')[1])
    is_stx_openstack_applied = container_helper.is_stx_openstack_deployed(applied_only=True)

    if not is_sx:
        HostsToRecover.add(origin_standby)
        prev_bad_pods = kube_helper.get_unhealthy_pods(all_namespaces=True)

        LOG.tc_step("Modify {} oam interface MTU from {} to {} on standby controller, and "
                    "ensure it's applied successfully after unlock".format(origin_standby, pre_oam_mtu, mtu))
        if mtu == cur_mtu:
            LOG.info('Setting to same MTU: from:{} to:{}'.format(mtu, cur_mtu))

        code, res = host_helper.modify_mtu_on_interfaces(origin_standby, mtu_val=mtu, network_type='oam',
                                                         lock_unlock=True, fail_ok=True)

        LOG.tc_step("Revert OAM MTU to original value: {}".format(pre_oam_mtu))
        code_revert, res_revert = host_helper.modify_mtu_on_interfaces(origin_standby, mtu_val=pre_oam_mtu,
                                                                       network_type='oam',
                                                                       lock_unlock=True, fail_ok=True)
        if 0 == code:
            assert expecting_pass, "OAM MTU is not modified successfully. Result: {}".format(res)
        else:
            assert not expecting_pass, "OAM MTU WAS modified unexpectedly. Result: {}".format(res)

        assert 0 == code_revert, "OAM MTU is not reverted successfully. Result: {}".format(res_revert)

        LOG.tc_step("Check openstack cli, application and pods status after modify and revert {} oam mtu".
                    format(origin_standby))
        check_containers(prev_bad_pods, check_app=is_stx_openstack_applied)

        LOG.tc_step("Ensure standby controller is in available state and attempt to swact active controller to {}".
                    format(origin_standby))
        system_helper.wait_for_hosts_states(origin_active, availability=['available'])
        host_helper.swact_host(fail_ok=False)
        host_helper.wait_for_webservice_up(origin_standby)

    prev_bad_pods = kube_helper.get_unhealthy_pods(all_namespaces=True)
    HostsToRecover.add(origin_active)
    LOG.tc_step("Modify {} oam interface MTU to: {}, and "
                "ensure it's applied successfully after unlock".format(origin_active, mtu))
    code, res = host_helper.modify_mtu_on_interfaces(origin_active,
                                                     mtu_val=mtu, network_type='oam', lock_unlock=True,
                                                     fail_ok=True)
    LOG.tc_step("Revert OAM MTU to original value: {}".format(pre_oam_mtu))
    code_revert, res_revert = host_helper.modify_mtu_on_interfaces(origin_active, mtu_val=pre_oam_mtu,
                                                                   network_type='oam',
                                                                   lock_unlock=True, fail_ok=True)
    if 0 == code:
        assert expecting_pass, "OAM MTU is not modified successfully. Result: {}".format(res)
    else:
        assert not expecting_pass, "OAM MTU WAS modified unexpectedly. Result: {}".format(res)

    assert 0 == code_revert, "OAM MTU is not reverted successfully. Result: {}".format(res_revert)

    LOG.tc_step("Check openstack cli, application and pods after modify and revert {} oam mtu".format(origin_active))
    check_containers(prev_bad_pods, check_app=is_stx_openstack_applied)
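
The MTU extraction above assumes MTU is the first key in the attributes string (sample: [MTU=9216,AE_MODE=802.3ad]). A slightly more defensive, order-independent parse of the same sample format:

attributes = 'MTU=9216,AE_MODE=802.3ad'   # sample value from the comment above
attrs = dict(item.split('=', 1) for item in attributes.strip('[]').split(','))
pre_oam_mtu = int(attrs['MTU'])           # 9216
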
Example #20
def test_is_active_con():
    active_con, standby_con = system_helper.get_active_standby_controllers()
    assert system_helper.is_active_controller(active_con)

    if standby_con:
        assert not system_helper.is_active_controller(standby_con)
Example #21
def kill_sm_process_and_verify_impact(name,
                                      cmd='',
                                      pid_file='',
                                      retries=2,
                                      impact='swact',
                                      host='controller-0',
                                      interval=20,
                                      action_timeout=90,
                                      total_retries=3,
                                      process_type='sm',
                                      on_active_controller=True,
                                      con_ssh=None,
                                      auth_info=Tenant.get('admin_platform')):
    """
    Kill the process with the specified name and verify the system behaviors
    as expected

    Args:
        name (str):             name of the process
        cmd (str):              executable of the process
        pid_file (str):         file containing process id
        retries (int):          number of kills after which the IMPACT is
                                expected to be triggered
        impact (str):           expected system behavior, including:
                                    swact   -- active controller is swacted
                                    enabled-degraded    -- service status
                                        changes to enabled-degraded
                                    disabled-failed     -- service status
                                        changes to disabled-failed
                                    ...
        host (str):             host to test on
        interval (int):         minimum time to wait between kills
        action_timeout (int):   kills and impact should happen within this
                                time frame
        total_retries (int):    total number of retries for the whole
                                kill-and-wait cycle
        process_type (str):     valid types are: sm, pmon, other
        on_active_controller (bool): if True, run against the active
                                controller (overrides host)
        con_ssh:                ssh connection/client to the active controller
        auth_info:              authentication info for the platform

    Returns: (pid, host)
        pid:
            >0  success, the final PID of the process
            -1  failure: impact did NOT happen after killing the process
                the threshold number of times
            -2  failure: impact happened before the kill threshold was reached
            -3  failure: gave up after total_retries attempts
        host:
            the host tested on
    """
    active_controller, standby_controller = \
        system_helper.get_active_standby_controllers(con_ssh=con_ssh,
                                                     auth_info=auth_info)

    if on_active_controller:
        LOG.info('on active controller: {}, host:{}'.format(
            active_controller, host))

        host = active_controller
        con_ssh = con_ssh or ControllerClient.get_active_controller()

    LOG.info('on host: {}'.format(host))

    if total_retries < 1 or retries < 1:
        LOG.error(
            'retries/total-retries < 1? retires:{}, total retries:{}'.format(
                retries, total_retries))
        return None
    count = 0
    for i in range(1, total_retries + 1):
        LOG.info(
            'retry:{:02d} kill the process:{} and verify impact:{}'.format(
                i, name, impact))

        exec_times = []
        killed_pids = []

        timeout = time.time() + action_timeout * (retries /
                                                  2 if retries > 2 else 1)

        while time.time() < timeout:
            count += 1

            LOG.debug('retry{:02d}-{:02d}: looking up process id for {} on '
                      'host:{}'.format(i, count, name, host))

            try:
                pid, proc_name = get_process_info(name,
                                                  cmd=cmd,
                                                  host=host,
                                                  process_type=process_type,
                                                  pid_file=pid_file,
                                                  con_ssh=con_ssh)[0:2]

            except pexpect.exceptions.EOF:
                LOG.warn(
                    'retry{:02d}-{:02d}: Failed to get process id for {} on '
                    'host:{}, swacted unexpectedly?'.format(
                        i, count, name, host))
                time.sleep(interval / 3.0)
                continue

            if -1 == pid:
                LOG.error(
                    'retry{:02d}-{:02d}: Failed to get PID for process with '
                    'name:{}, cmd:{}, '
                    'wait and retries'.format(i, count, name, cmd))
                time.sleep(interval / 3.0)
                continue

            if killed_pids and pid in killed_pids:
                LOG.warn('retry{:02d}-{:02d}: No new process re-created, '
                         'prev-pid={}, cur-pid={}'.format(
                             i, count, killed_pids[-1], pid))
                time.sleep(interval / 3.0)
                continue

            last_killed_pid = killed_pids[-1] if killed_pids else None
            killed_pids.append(pid)
            last_kill_time = exec_times[-1] if exec_times else None
            exec_times.append(datetime.datetime.utcnow())

            latest_events = _get_last_events_timestamps(
                event_log_id=KILL_PROC_EVENT_FORMAT[process_type]['event_id'],
                limit=10)

            LOG.info(
                'retry{:02d}-{:02d}: before kill CLI, proc_name={}, pid={}, '
                'last_killed_pid={}, last_kill_time={}'.format(
                    i, count, proc_name, pid, last_killed_pid, last_kill_time))

            LOG.info('\tactive-controller={}, standby-controller={}'.format(
                active_controller, standby_controller))

            kill_cmd = '{} {}'.format(KILL_CMD, pid)

            with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con:
                code, output = con.exec_sudo_cmd(kill_cmd, fail_ok=True)
                if 0 != code:
                    # this happens occasionally
                    LOG.error('Failed to kill pid:{}, cmd={}, output=<{}>, '
                              'at run:{}, already terminated?'.format(
                                  pid, kill_cmd, output, count))

            if count < retries:
                # IMPACT should not happen yet
                if not check_impact(impact,
                                    proc_name,
                                    last_events=latest_events,
                                    active_controller=active_controller,
                                    standby_controller=standby_controller,
                                    expecting_impact=False,
                                    process_type=process_type,
                                    host=host,
                                    con_ssh=con_ssh):
                    LOG.error(
                        'Impact:{} observed unexpectedly, it should happen '
                        'only after killing {} times, '
                        'actual killed times:{}'.format(
                            impact, retries, count))
                    return -2, host

                LOG.info(
                    'retry{:02d}-{:02d}: OK, NO impact as expected, impact={}, '
                    'will kill it another time'.format(i, count, impact))

                time.sleep(max(interval * 1 / 2.0, 5))

            else:
                expecting_impact = standby_controller is not None
                if not check_impact(impact,
                                    proc_name,
                                    last_events=latest_events,
                                    active_controller=active_controller,
                                    standby_controller=standby_controller,
                                    expecting_impact=expecting_impact,
                                    process_type=process_type,
                                    host=host,
                                    con_ssh=con_ssh):
                    LOG.error(
                        'No impact after killing process {} {} times, while '
                        '{}'.format(proc_name, count,
                                    ('expecting impact' if expecting_impact
                                     else 'not expecting impact')))

                    return -1, host

                LOG.info('OK, final retry{:02d}-{:02d}: OK, IMPACT happened '
                         '(if applicable) as expected, '
                         'impact={}'.format(i, count, impact))

                active_controller, standby_controller = \
                    system_helper.get_active_standby_controllers(
                        con_ssh=con_ssh)

                LOG.info(
                    'OK, after impact:{} (tried:{} times), '
                    'now active-controller={}, standby-controller={}'.format(
                        impact, count, active_controller, standby_controller))

                pid, proc_name = get_process_info(name,
                                                  cmd=cmd,
                                                  host=host,
                                                  pid_file=pid_file,
                                                  process_type=process_type,
                                                  con_ssh=con_ssh)[0:2]

                return pid, active_controller

    return -3, host
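
A hypothetical caller mapping the documented return codes of kill_sm_process_and_verify_impact(); the process name and expected impact are purely illustrative.

from utils.tis_log import LOG   # assumed stx-test style import, as elsewhere

pid, host = kill_sm_process_and_verify_impact(
    'sm-eru', impact='enabled-degraded', retries=2, process_type='sm')

if pid > 0:
    LOG.info('process respawned with pid {} on {}'.format(pid, host))
elif pid == -1:
    LOG.error('no impact after reaching the kill threshold on {}'.format(host))
elif pid == -2:
    LOG.error('impact happened before the kill threshold on {}'.format(host))
else:   # -3: gave up after total_retries attempts
    LOG.error('system still unstable after all retries on {}'.format(host))
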
Example #22
def test_measure_swact_recover(no_simplex):
    cmd_get_start_date = ("python -c \"import datetime; "
                          "print str(datetime.datetime.now())[:-3]\"")

    res = list()

    try:
        for i in range(2):
            LOG.tc_step("Start of iter {}".format(i))
            con_ssh = ssh.ControllerClient.get_active_controller()

            LOG.info("Get active/standby controllers")
            active_controller, standby_controller = \
                system_helper.get_active_standby_controllers()

            cmd_get_offset = (
                "ntpq -p | grep {} -A1 | "
                "tail -1 | awk '{{print$8}}'".format(active_controller))
            cmd_get_start_date = (
                "cat /var/log/mtcAgent.log | "
                "grep \"{} Action=swact\" | "
                "tail -1 | awk '{{print$1}}'".format(active_controller))
            cmd_get_end_date = (
                "cat /var/log/mtcAgent.log | "
                "grep \"{} Task: Swact: Complete\" | "
                "tail -1 | awk '{{print$1}}'".format(active_controller))

            LOG.info("Start swact action")
            host_helper.swact_host(hostname=active_controller)
            kube_helper.wait_for_nodes_ready(hosts=(active_controller,
                                                    standby_controller),
                                             check_interval=20)

            LOG.info("Calculate swact time")
            con_ssh = ssh.ControllerClient.get_active_controller()
            with host_helper.ssh_to_host(active_controller,
                                         con_ssh=con_ssh) as con_0_ssh:
                con_0_ssh.exec_cmd(cmd="cat /var/log/mtcAgent.log",
                                   get_exit_code=False)
                st = con_0_ssh.exec_cmd(cmd=cmd_get_start_date,
                                        get_exit_code=False)[1]
                st_date = datetime.datetime.strptime(st,
                                                     '%Y-%m-%dT%H:%M:%S.%f')
            offset = float(
                con_ssh.exec_cmd(cmd=cmd_get_offset,
                                 get_exit_code=False)[1]) / 1000
            et = con_ssh.exec_cmd(cmd=cmd_get_end_date, get_exit_code=False)[1]
            et_date = datetime.datetime.fromtimestamp(
                datetime.datetime.strptime(
                    et, '%Y-%m-%dT%H:%M:%S.%f').timestamp() - offset)
            diff = et_date - st_date
            LOG.info("\nstart time = {}\nend time = {}".format(st, et))
            LOG.info("\ndiff = {}".format(diff))
            res.append(diff)
    finally:
        active_controller, standby_controller = \
            system_helper.get_active_standby_controllers()
        if active_controller != "controller-0":
            host_helper.swact_host(hostname=active_controller)
            kube_helper.wait_for_nodes_ready(hosts=(active_controller,
                                                    standby_controller),
                                             check_interval=20)

    def calc_avg(lst):
        # enumerate instead of lst.index(): index() returns the first match,
        # which mislabels iterations when two durations are equal
        rtrn_sum = datetime.timedelta()
        for idx, duration in enumerate(lst):
            LOG.info("Iter {}: {}".format(idx, duration))
            rtrn_sum += duration
        return rtrn_sum / len(lst)

    final_res = calc_avg(res)
    LOG.info("Avg time is : {}".format(final_res))