Example no. 1
def test_swact_controller_host():
    """
    SWACT Controller host - it must fail on simplex
    """
    active_controller_host = system_helper.get_active_controller_name()
    LOG.info(
        "Active Controller Before SWACT: {}".format(active_controller_host))
    standby_controller_host = system_helper.get_standby_controller_name()
    LOG.info(
        "Standby Controller Before SWACT: {}".format(standby_controller_host))

    # On simplex swact must fail
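    # fail_ok is True only on simplex, where the swact is expected to be
    # rejected; the helper then returns a non-zero code instead of raising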
    host_helper.swact_host(fail_ok=system_helper.is_aio_simplex())
    # host_helper.wait_for_swact_complete(before_host=active_controller_host)

    active_controller_host = system_helper.get_active_controller_name()
    LOG.info(
        "Active Controller After SWACT: {}".format(active_controller_host))
    standby_controller_host = system_helper.get_standby_controller_name()
    LOG.info(
        "Standby Controller After SWACT: {}".format(standby_controller_host))

    # Re-SWACT only if duplex
    if not system_helper.is_aio_simplex():
        host_helper.swact_host()
Example no. 2
def test_upload_charts_via_helm_upload(copy_test_apps):
    """
    Test uploading helm charts via the helm-upload cmd directly, i.e., without
    using the sysinv cmd.
    Args:
        copy_test_apps:

    Setups:
        - Copy test files from test server to tis system (module)

    Test Steps:
        - Upload helm charts from given controller via 'helm-upload <tar_file>'
        - Verify the charts appear at /www/pages/helm_charts/ on both
            controllers (if applicable)

    """
    app_dir = copy_test_apps

    LOG.tc_step(
        "Upload helm charts via helm-upload cmd from active controller "
        "and check charts are in /www/pages/")
    file_path = container_helper.upload_helm_charts(tar_file=os.path.join(
        app_dir, HELM_TAR),
                                                    delete_first=True)[1]

    if system_helper.get_standby_controller_name():
        LOG.tc_step("Swact active controller and verify uploaded charts "
                    "are synced over")
        host_helper.swact_host()
        con_ssh = ControllerClient.get_active_controller()
        charts_exist = con_ssh.file_exists(file_path)
        assert charts_exist, "{} does not exist after swact to {}".format(
            file_path, con_ssh.get_hostname())
        LOG.info("{} successfully synced after swact".format(file_path))
Example no. 3
def test_reboot_hosts(hostnames):
    LOG.tc_step("Processing hostnames provided...")
    system_hosts = system_helper.get_hosts()

    is_str = False
    if isinstance(hostnames, str):
        is_str = True
        hostnames = [hostnames]

    tmp_hosts = list(hostnames)  # iterate over a copy; hostnames is modified below
    for host in tmp_hosts:
        if host == 'active_controller':
            hostnames.remove(host)
            host = system_helper.get_active_controller_name()
            hostnames.append(host)
        elif host == 'standby_controller':
            hostnames.remove(host)
            host = system_helper.get_standby_controller_name()
            hostnames.append(host)
        if host not in system_hosts:
            skip("Host(s) not found in system. Host(s) requested: {}."
                 "Hosts in system: {}".format(hostnames, system_hosts))

    if is_str:
        hostnames = hostnames[0]

    LOG.tc_step("Rebooting following host(s): {}".format(hostnames))
    results = host_helper.reboot_hosts(hostnames)
    LOG.tc_step("Results: {}".format(results))
    assert results[0] == 0
Example no. 4
def lock_controller():
    LOG.fixture_step(
        "Ensure system has no standby controller available for swact")
    standby = system_helper.get_standby_controller_name()

    if standby:
        HostsToRecover.add(standby)
        host_helper.lock_host(standby, swact=False)
Example no. 5
def get_host():
    if system_helper.is_aio_simplex():
        hostname = 'controller-0'
    elif system_helper.is_aio_duplex():
        hostname = system_helper.get_standby_controller_name()
    else:
        hostname = system_helper.get_computes(
            availability=HostAvailState.AVAILABLE)[0]

    return hostname
Example no. 6
def test_host_status():
    """
    System overview
    """
    active_controller_host = system_helper.get_active_controller_name()
    LOG.info("Active Controller: {}".format(active_controller_host))
    standby_controller_host = system_helper.get_standby_controller_name()
    LOG.info("Standby Controller: {}".format(standby_controller_host))
    host_list = system_helper.get_hosts()
    for host in host_list:
        LOG.info("Host: {}".format(host))
Example no. 7
def get_hugepage_pod_file():
    """
    Fixture used to return the hugepage deployment file

        - Get compute-0 if it exists, else the standby controller
        - Check whether 2M hugepages are configured; if not, check 1G;
            otherwise lock the host, configure two 1G hugepages and unlock it
        - Call the modify_yaml function to modify the yaml
          file with the values
        - SCP the modified file to the active controller to deploy the
          hugepages pod
        - Delete the hugepages pod from the host after the test

    """
    if system_helper.is_aio_duplex():
        hostname = system_helper.get_standby_controller_name()
    else:
        hostname = system_helper.get_hypervisors()[0]
    LOG.fixture_step("Checking hugepage values on {}".format(hostname))
    proc_id = 0
    out = host_helper.get_host_memories(hostname,
                                        ('app_hp_avail_2M', 'app_hp_avail_1G'),
                                        proc_id)
    if out[proc_id][0] > 0:
        hugepage_val = "{}Mi".format(out[proc_id][0])
        hugepage_str = "hugepages-2Mi"
    elif out[proc_id][1] > 0:
        hugepage_val = "{}Gi".format(out[proc_id][1])
        hugepage_str = "hugepages-1Gi"
    else:
        hugepage_val = "{}Gi".format(2)
        cmd = "{} -1G {}".format(proc_id, 2)
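        # cmd expands to e.g. "0 -1G 2": allocate two 1G hugepages on this
        # processor via 'system host-memory-modify' below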
        hugepage_str = "hugepages-1Gi"
        HostsToRecover.add(hostname)
        host_helper.lock_host(hostname)
        LOG.fixture_step("Configuring hugepage values {} on {}".format(
            hugepage_val, hostname))
        cli.system('host-memory-modify {} {}'.format(hostname, cmd),
                   ssh_client=None,
                   auth_info=Tenant.get('admin_platform'))
        host_helper.unlock_host(hostname)
    LOG.fixture_step("{} {} pod will be configured on {} proc id {}".format(
        hugepage_str, hugepage_val, hostname, proc_id))
    file_dir, file_name = modify_yaml("utils/test_files/",
                                      "hugepages_pod.yaml", hugepage_str,
                                      hugepage_val)
    source_path = "{}/{}".format(file_dir, file_name)
    home_dir = HostLinuxUser.get_home()
    common.scp_from_localhost_to_active_controller(source_path,
                                                   dest_path=home_dir)
    yield file_name
    LOG.fixture_step("Delete hugepages pod")
    kube_helper.delete_resources(resource_names="hugepages-pod")
Example no. 8
def get_hosts_with_backing(add_admin_role_module):
    storage_backing, hosts = keywords.host_helper.get_storage_backing_with_max_hosts(
    )
    if len(hosts) < 2:
        skip(
            "Minimum of two hypervisors must support the same storage_backing."
        )

    if not system_helper.is_aio_system():
        host_under_test = hosts[0]
    else:
        host_under_test = system_helper.get_standby_controller_name()

    return storage_backing, host_under_test
Example no. 9
def test_admin_password(scenario, less_than_two_cons, _revert_admin_pw):
    """
    Test the admin password change

    Test Steps:
        - lock standby controller change password and unlock
        - change password and swact
        - check alarms

    """
    if 'swact' in scenario and less_than_two_cons:
        skip(SkipSysType.LESS_THAN_TWO_CONTROLLERS)

    host = system_helper.get_standby_controller_name()
    assert host, "No standby controller on system"

    if scenario == "lock_standby_change_pswd":
        # lock the standby
        LOG.tc_step("Attempting to lock {}".format(host))
        res, out = host_helper.lock_host(host=host)
        LOG.tc_step("Result of the lock was: {}".format(res))

    # change password
    prev_pswd = Tenant.get('admin')['password']
    post_pswd = '!{}9'.format(prev_pswd)

    LOG.tc_step('Changing admin password to {}'.format(post_pswd))
    code, output = keystone_helper.set_user('admin', password=post_pswd, auth_info=Tenant.get(
        'admin_platform'))

    # assert "Warning: 'admin' password changed. Please wait 5 minutes before Locking/Unlocking
    # the controllers" in output
    LOG.tc_step("Sleep for 180 seconds after admin password change")
    time.sleep(180)  # CGTS-6928

    LOG.tc_step("Check admin password is updated in keyring")
    assert post_pswd == security_helper.get_admin_password_in_keyring()

    if scenario == "change_pswd_swact":
        LOG.tc_step("Swact active controller")
        host_helper.swact_host()
    else:
        LOG.tc_step("Unlock host {}".format(host))
        res = host_helper.unlock_host(host)
        LOG.info("Unlock hosts result: {}".format(res))

    LOG.tc_step("Check admin password is updated in keyring")
    assert post_pswd == security_helper.get_admin_password_in_keyring()
Example no. 10
def test_lock_unlock_host(host_type):
    """
    Verify lock unlock host

    Test Steps:
        - Select a host per given type. If type is controller, select
            standby controller.
        - Lock selected host and ensure it is successfully locked
        - Unlock selected host and ensure it is successfully unlocked

    """
    LOG.tc_step("Select a {} node from system if any".format(host_type))
    if host_type == 'controller':
        if system_helper.is_aio_simplex():
            host = 'controller-0'
        else:
            host = system_helper.get_standby_controller_name()
            assert host, "No standby controller available"

    else:
        if host_type == 'compute' and system_helper.is_aio_system():
            skip("No compute host on AIO system")
        elif host_type == 'storage' and not system_helper.is_storage_system():
            skip("System does not have storage nodes")

        hosts = system_helper.get_hosts(personality=host_type,
                                        availability=HostAvailState.AVAILABLE,
                                        operational=HostOperState.ENABLED)

        assert hosts, "No good {} host on system".format(host_type)
        host = hosts[0]

    LOG.tc_step("Lock {} host - {} and ensure it is successfully "
                "locked".format(host_type, host))
    HostsToRecover.add(host)
    host_helper.lock_host(host, swact=False)

    # wait for services to stabilize before unlocking
    time.sleep(20)

    # unlock standby controller node and verify controller node is
    # successfully unlocked
    LOG.tc_step("Unlock {} host - {} and ensure it is successfully "
                "unlocked".format(host_type, host))
    host_helper.unlock_host(host)
Example no. 11
def test_lock_unlock_standby_controller(no_simplex):
    """
    Lock - Unlock standby controller
    """
    standby_controller_host = system_helper.get_standby_controller_name()
    LOG.info("Standby Controller Host: {}".format(standby_controller_host))
    assert standby_controller_host, "Standby controller not found"

    # Lock
    host_helper.lock_host(host=standby_controller_host, fail_ok=False)

    container_helper.wait_for_apps_status(apps="stx-openstack",
                                          status=AppStatus.APPLIED,
                                          timeout=600,
                                          check_interval=60)
    # Unlock
    host_helper.unlock_host(host=standby_controller_host, fail_ok=False)
    host_helper.wait_for_hosts_ready(hosts=standby_controller_host)
Example no. 12
    def target_hosts(self):
        """
        Test fixture for test_lock_with_vms().
        Calculate target host(s) to perform lock based on storage backing of vms_to_test, and live migrate suitable vms
        to target host before test start.
        """

        storage_backing, target_hosts = keywords.host_helper.get_storage_backing_with_max_hosts(
        )
        if len(target_hosts) < 2:
            skip(
                SkipStorageBacking.LESS_THAN_TWO_HOSTS_WITH_BACKING.format(
                    storage_backing))

        target_host = target_hosts[0]
        if SysType.AIO_DX == system_helper.get_sys_type():
            target_host = system_helper.get_standby_controller_name()

        return storage_backing, target_host
Example no. 13
def test_controllerfs_mod_when_host_locked():
    """
    This test attempts to modify controllerfs value while one of the
    controllers is locked.  All controller filesystem modification attempts
    should be rejected when any one of the controllers is not available.

    Arguments:
    - None

    Test Steps:
    1.  Lock standby controller or only controller (in the case of AIO systems)
    2.  Attempt to modify controller filesystem.  This should be rejected.

    Assumptions:
    - None

    Teardown:
    - Unlock controller
    """

    if system_helper.is_aio_simplex():
        target_host = "controller-0"
    else:
        target_host = system_helper.get_standby_controller_name()

    host_helper.lock_host(target_host)
    HostsToRecover.add(target_host, scope="function")

    drbdfs_val = {}
    fs = "database"
    LOG.tc_step("Determine the current filesystem size")
    drbdfs_val[fs] = storage_helper.get_controllerfs_values(fs)[0]
    LOG.info("Current value of {} is {}".format(fs, drbdfs_val[fs]))
    drbdfs_val[fs] = int(drbdfs_val[fs]) + 1
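    # request a 1 GiB increase; it should be rejected below while a controller is locked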
    LOG.info("Will attempt to increase the value of {} to {}".format(
        fs, drbdfs_val[fs]))

    LOG.tc_step("Increase the size of filesystems")
    code = storage_helper.modify_controllerfs(fail_ok=True, **drbdfs_val)[0]
    assert 1 == code, "Filesystem modify succeeded while failure is expected: {}".format(
        drbdfs_val)
Example no. 14
def test_swact_standby_controller_negative():
    """
    TC610_2
    Verify that trying to swact a standby controller is rejected

    Test Steps:
        - Get the standby controller
        - Attempt to swact the controller
        - Verify that the swact doesn't happen

    """
    standby = system_helper.get_standby_controller_name()
    active = system_helper.get_active_controller_name()
    LOG.tc_step(
        "Attempting to swact from standby controller {}".format(standby))
    code, out = host_helper.swact_host(standby, fail_ok=True)
    LOG.tc_step("Verifying that the swact didn't occur.")
    assert 0 != code, "FAIL: The swact wasn't rejected"
    curr_active = system_helper.get_active_controller_name()
    assert curr_active == active, "FAIL: The active controller was changed. " \
                                  "Previous: {} Current: {}".format(active, curr_active)
Example no. 15
def delete_object_file(object_path, rm_dir=False, client=None):
    def _delete_on_client(client_):
        cmd = "ls {}".format(object_path)
        rc, output = client_.exec_cmd(cmd)
        if rc == 0:
            cmd = 'rm {} {}'.format('-r' if rm_dir else '', object_path)
            client_.exec_cmd(cmd)
            LOG.info("Files deleted {}: {}".format(object_path, output))

    if not client:
        client = get_cli_client()
    _delete_on_client(client_=client)

    if not ProjVar.get_var('REMOTE_CLI'):
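        # when not using a remote CLI client, the file may also exist on the
        # standby controller, so remove it there as well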
        standby_controller = system_helper.get_standby_controller_name()
        with host_helper.ssh_to_host(
                standby_controller,
                username=HostLinuxUser.get_user(),
                password=HostLinuxUser.get_password()) as standby_ssh:
            _delete_on_client(client_=standby_ssh)

    return True
Example no. 16
def test_apply_storage_profile_negative(create_storage_profile, personality):

    if personality == 'controller':
        host_name = system_helper.get_standby_controller_name()
        assert host_name, "No standby controller available on system"
    else:
        host_name = host_helper.get_up_hypervisors()[0]

    # For storage systems, skip test if ceph isn't healthy
    if len(system_helper.get_storage_nodes()) > 0:
        ceph_healthy = storage_helper.is_ceph_healthy()
        if not ceph_healthy:
            skip('Skipping due to ceph not being healthy')

    profile_name = create_storage_profile['profile_name']
    origin_disk_num = create_storage_profile['disk_num']
    disks_num = len(storage_helper.get_host_disks(host_name, 'device_node'))

    expt_err_list = [
        "Please check if host's disks match profile criteria",
        "Failed to create storage function. Host personality must be 'storage'",
    ]
    if disks_num < origin_disk_num - 1:
        expt_err_list.append("profile has more disks than host does")

    positional_arg = host_name + ' ' + profile_name

    HostsToRecover.add(host_name)
    host_helper.lock_host(host_name, swact=True)
    exitcode, output = cli.system('host-apply-storprofile',
                                  positional_arg,
                                  fail_ok=True)
    host_helper.unlock_host(host_name)

    assert exitcode == 1 and any(expt in output for expt in expt_err_list)
Example no. 17
def test_modify_mtu_data_interface(mtu_range, revert_data_mtu):
    """
    23) Change the MTU value of the data interface using CLI
    Verify that the MTU on data interfaces on all compute nodes can be modified via CLI
    The min MTU for a data interface can be 1500, 9000 or 9216, in which case the MTU is unchangeable. Need to confirm
    Args:
        mtu_range (str): A string that contain the mtu want to be tested
        revert_data_mtu: A fixture to restore changed mtus if any to their original values

    Setup:
        - Nothing

    Test Steps:
        - lock standby controller
        - modify the imtu value of the compute node
        - unlock the controller
        - check the compute node have expected mtu

    Teardown:
        - Revert data mtu

    """

    hypervisors = host_helper.get_hypervisors(state='up')
    if len(hypervisors) < 2:
        skip("Less than two hypervisors available.")

    if system_helper.is_aio_duplex():
        standby = system_helper.get_standby_controller_name()
        if not standby:
            skip("Standby controller unavailable on CPE system. Unable to lock host")
        hypervisors = [standby]
    else:
        if len(hypervisors) > 2:
            hypervisors = random.sample(hypervisors, 2)

    LOG.tc_step("Delete vms to reduce lock time")
    vm_helper.delete_vms()

    mtu = __get_mtu_to_mod(providernet_name='-data', mtu_range=mtu_range)

    LOG.tc_step("Modify data MTU to {} for hosts: {}".format(mtu, hypervisors))

    net_type = 'data'

    active_controller = system_helper.get_active_controller_name()
    hosts = hypervisors[:]
    if active_controller in hosts:
        hosts.remove(active_controller)
        hosts.append(active_controller)

    for host in hosts:
        interfaces = get_ifs_to_mod(host, net_type, mtu)
        revert_ifs = list(interfaces)
        if not revert_ifs:
            LOG.info('Skip host:{} because there is no interface to set MTU'.format(host))
            continue

        host_helper.lock_host(host, swact=True)

        revert_ifs.reverse()
        changed_ifs = []
        for interface in revert_ifs:
            LOG.tc_step('Checking the max MTU for the IF:{} on host:{}'.format(interface, host))
            max_mtu, cur_mtu, nic_name = get_max_allowed_mtus(host=host, network_type=net_type, if_name=interface)

            LOG.info('Checking the max MTU for the IF:{}, max MTU: {}, host:{}'.format(
                interface, max_mtu or 'NOT SET', host))

            expecting_pass = not max_mtu or mtu <= max_mtu
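            # the modify should pass unless the interface reports a max MTU
            # smaller than the requested value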
            if not expecting_pass:
                LOG.warn('Expecting to fail in changing MTU: changing to:{}, max-mtu:{}'.format(mtu, max_mtu))

            pre_mtu = int(host_helper.get_host_interface_values(host, interface, 'imtu')[0])

            LOG.tc_step('Modify MTU of IF:{} on host:{} to:{}, expecting: {}'.format(
                interface, host, mtu, 'PASS' if expecting_pass else 'FAIL'))

            code, res = host_helper.modify_mtu_on_interface(host, interface, mtu_val=mtu, network_type=net_type,
                                                            lock_unlock=False, fail_ok=True)
            msg_result = "PASS" if expecting_pass else "FAIL"
            msg = "Failed to modify data MTU, expecting to {}, \nnew MTU:{}, max MTU:{}, old MTU:{}, " \
                  "Return code:{}; Details: {}".format(msg_result, mtu, max_mtu, pre_mtu, code, res)

            if 0 == code:
                if mtu != cur_mtu:
                    changed_ifs.append(interface)
                    HOSTS_IF_MODIFY_ARGS.append((host, pre_mtu, mtu, max_mtu, interface, net_type))
                assert expecting_pass, msg
            else:
                assert not expecting_pass, msg

            LOG.info('OK, modification of data interface MTU behaved as expected: {}'.format(msg_result))

        host_helper.unlock_host(host)
        for interface in revert_ifs:
            if interface in changed_ifs:
                actual_mtu = int(host_helper.get_host_interface_values(host,
                                                                       interface=interface, fields=['imtu'])[0])
                assert actual_mtu == mtu, \
                    'Actual MTU after modification did not match expected, expected:{}, actual:{}'.format(
                        mtu, actual_mtu)
        changed_ifs[:] = []

    if not HOSTS_IF_MODIFY_ARGS:
        skip('No data interface changed!')
        return

    HOSTS_IF_MODIFY_ARGS.reverse()
Example no. 18
def test_swact_controllers(stx_openstack_required,
                           wait_for_con_drbd_sync_complete):
    """
    Verify swact active controller

    Test Steps:
        - Boot a vm on system and check ping works
        - Swact active controller
        - Verify standby controller and active controller are swapped
        - Verify vm is still pingable

    """
    if not wait_for_con_drbd_sync_complete:
        skip(SkipSysType.LESS_THAN_TWO_CONTROLLERS)

    LOG.tc_step('retrieve active and available controllers')
    pre_active_controller, pre_standby_controller = system_helper.get_active_standby_controllers(
    )
    assert pre_standby_controller, "No standby controller available"

    pre_res_sys, pre_msg_sys = system_helper.wait_for_services_enable(
        timeout=20, fail_ok=True)
    up_hypervisors = host_helper.get_up_hypervisors()
    pre_res_neutron, pre_msg_neutron = network_helper.wait_for_agents_healthy(
        up_hypervisors, timeout=20, fail_ok=True)

    LOG.tc_step("Boot a vm from image and ping it")
    vm_id_img = vm_helper.boot_vm(name='swact_img',
                                  source='image',
                                  cleanup='function')[1]
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_img)

    LOG.tc_step("Boot a vm from volume and ping it")
    vm_id_vol = vm_helper.boot_vm(name='swact', cleanup='function')[1]
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_vol)

    LOG.tc_step(
        "Swact active controller and ensure active controller is changed")
    host_helper.swact_host(hostname=pre_active_controller)

    LOG.tc_step("Verify standby controller and active controller are swapped")
    post_active_controller = system_helper.get_active_controller_name()
    post_standby_controller = system_helper.get_standby_controller_name()

    assert pre_standby_controller == post_active_controller, \
        "Prev standby: {}; Post active: {}".format(
            pre_standby_controller, post_active_controller)
    assert pre_active_controller == post_standby_controller, \
        "Prev active: {}; Post standby: {}".format(
            pre_active_controller, post_standby_controller)

    LOG.tc_step("Check boot-from-image vm still pingable after swact")
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_img, timeout=30)
    LOG.tc_step("Check boot-from-volume vm still pingable after swact")
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id_vol, timeout=30)

    LOG.tc_step(
        "Check system services and neutron agents after swact from {}".format(
            pre_active_controller))
    post_res_sys, post_msg_sys = system_helper.wait_for_services_enable(
        fail_ok=True)
    post_res_neutron, post_msg_neutron = network_helper.wait_for_agents_healthy(
        hosts=up_hypervisors, fail_ok=True)

    assert post_res_sys, \
        "\nPost-swact system services stats: {}\nPre-swact system services stats: {}". \
        format(post_msg_sys, pre_msg_sys)
    assert post_res_neutron, \
        "\nPost-swact neutron agents stats: {}\nPre-swact neutron agents stats: {}". \
        format(post_msg_neutron, pre_msg_neutron)

    LOG.tc_step("Check hosts are Ready in kubectl get nodes after swact")
    kube_helper.wait_for_nodes_ready(hosts=(pre_active_controller,
                                            pre_standby_controller),
                                     timeout=30)
Example no. 19
def _test_increase_ceph_mon():
    """
    Increase the size of ceph-mon.  Only applicable to a storage system.

    Fails until CGTS-8216

    Test steps:
    1.  Determine the current size of ceph-mon
    2.  Attempt to modify ceph-mon to invalid values
    3.  Check if there is free space to increase ceph-mon
    4.  Attempt to increase ceph-mon
    5.  Wait for config out-of-date alarms to raise
    6.  Lock/unlock all affected nodes (controllers and storage)
    7.  Wait for alarms to clear
    8.  Check that ceph-mon has the correct updated value

    Enhancement:
    1.  Possibly check there is enough disk space for ceph-mon to increase.  Not sure if
    this is required since there always seems to be some space on the rootfs.

    """
    table_ = table_parser.table(cli.system("ceph-mon-list")[1])
    ceph_mon_gib = table_parser.get_values(table_, "ceph_mon_gib",
                                           **{"hostname": "controller-0"})[0]
    LOG.info("ceph_mon_gib is currently: {}".format(ceph_mon_gib))

    LOG.tc_step("Attempt to modify ceph-mon to invalid values")
    invalid_cmg = ['19', '41', 'fds']
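    # presumably below the minimum, above the maximum, and non-numeric respectively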
    for value in invalid_cmg:
        host = "controller-0"
        cli.system("ceph-mon-modify {} ceph_mon_gib={}".format(host, value),
                   fail_ok=True)

    if int(ceph_mon_gib) >= 30:
        skip("Insufficient disk space to execute test")

    ceph_mon_gib_avail = 40 - int(ceph_mon_gib)
    new_ceph_mon_gib = math.trunc(ceph_mon_gib_avail / 10) + int(ceph_mon_gib)
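    # e.g. ceph_mon_gib=20 -> ceph_mon_gib_avail=40-20=20 ->
    # new_ceph_mon_gib=trunc(20/10)+20=22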

    LOG.tc_step("Increase ceph_mon_gib to {}".format(new_ceph_mon_gib))
    hosts = system_helper.get_controllers()
    for host in hosts:
        cli.system("ceph-mon-modify {} ceph_mon_gib={}".format(
            host, new_ceph_mon_gib))
        # We only need to do this for one controller now and it applies to both
        break

    LOG.info("Wait for expected alarms to appear")
    storage_hosts = system_helper.get_storage_nodes()
    total_hosts = hosts + storage_hosts
    for host in total_hosts:
        system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                     entity_id="host={}".format(host))

    LOG.tc_step("Lock/unlock all affected nodes")
    for host in storage_hosts:
        HostsToRecover.add(host)
        host_helper.lock_host(host)
        host_helper.unlock_host(host)
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
            entity_id="host={}".format(host))
        time.sleep(10)

    standby = system_helper.get_standby_controller_name()
    active = system_helper.get_active_controller_name()
    HostsToRecover.add(standby)
    host_helper.lock_host(standby)
    host_helper.unlock_host(standby)
    system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                      entity_id="host={}".format(standby))
    time.sleep(10)
    host_helper.swact_host(active)
    HostsToRecover.add(active)
    host_helper.lock_host(active)
    host_helper.unlock_host(active)
    system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                      entity_id="host={}".format(active))

    table_ = table_parser.table(cli.system("ceph-mon-list")[1])
    ceph_mon_gib = table_parser.get_values(table_, "ceph_mon_gib",
                                           **{"hostname": "controller-0"})[0]
    assert int(ceph_mon_gib) == new_ceph_mon_gib, "ceph-mon did not change to the new value"
Example no. 20
def test_modify_drdb_swact_then_reboot():
    """
    This test modifies the size of the DRBD-based filesystems, does an
    immediate swact and then reboots the active controller.

    Arguments:
    - None

    Test Steps:
    - Determine how much free space we have available
    - Increase database
    - Increase extension
    - Initiate a controller swact
    - Initiate a controller reboot

    Assumptions:
    - None

    """

    drbdfs = DRBDFS
    con_ssh = ControllerClient.get_active_controller()

    LOG.tc_step("Determine the available free space on the system")
    cmd = "vgdisplay -C --noheadings --nosuffix -o vg_free --units g cgts-vg"
    rc, out = con_ssh.exec_sudo_cmd(cmd)
    free_space = out.lstrip()
    LOG.info("Available free space on the system is: {}".format(free_space))
    if float(free_space) <= 10:
        skip("Not enough free space to complete test.")

    drbdfs_val = {}
    LOG.tc_step("Determine the space available for each drbd fs")
    for fs in drbdfs:
        table_ = table_parser.table(
            cli.system('controllerfs-show {}'.format(fs))[1])
        drbdfs_val[fs] = table_parser.get_value_two_col_table(table_, 'size')

    LOG.info("Current fs values are: {}".format(drbdfs_val))

    LOG.tc_step("Increase the size of the extension and database filesystem")
    partition_name = "database"
    partition_value = drbdfs_val[partition_name]
    backup_freespace = math.trunc(float(free_space) / 10)
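    # e.g. free_space=50.0 -> backup_freespace=5, so database grows by 5 GiB
    # and extension by trunc(5/2)=2 GiB below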
    new_partition_value = backup_freespace + int(partition_value)
    cmd = "controllerfs-modify {}={}".format(partition_name,
                                             new_partition_value)
    cli.system(cmd)

    partition_name = "extension"
    partition_value = drbdfs_val[partition_name]
    cgcs_freespace = math.trunc(backup_freespace / 2)
    new_partition_value = cgcs_freespace + int(partition_value)
    cmd = "controllerfs-modify {}={}".format(partition_name,
                                             new_partition_value)
    cli.system(cmd)

    hosts = system_helper.get_controllers()
    for host in hosts:
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
            entity_id="host={}".format(host),
            timeout=600)
    standby_cont = system_helper.get_standby_controller_name()
    system_helper.wait_for_host_values(standby_cont,
                                       availability=HostAvailState.AVAILABLE)
    host_helper.swact_host()

    act_cont = system_helper.get_active_controller_name()
    host_helper.reboot_hosts(act_cont)

    time.sleep(5)

    system_helper.wait_for_alarm_gone(
        alarm_id=EventLogID.HOST_RECOVERY_IN_PROGRESS,
        entity_id="host={}".format(act_cont),
        timeout=600)
Example no. 21
def test_lock_cont_check_mon_down():
    """
    This test is adapted from
    us69932_tc3_ceph_mon_maintenance_operations from us69932_ceph_monitoring.odt

    The goal of this test is to check that we alarm when a CEPH monitor goes
    down.  This test is specifically for controller hosts.

    Args:
        - None

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Lock controller node
        2.  Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs stay up
            - Check that the appropriate alarms are raised:
              - controller-X is locked
              - ceph mon down
        3.  Unlock controller node
            - ensure CEPH is HEALTH_OK
            - Check that alarms are cleared

    Enhancements:
       1.  Should we do both controllers?  This will require a swact.
    """

    con_ssh = ControllerClient.get_active_controller()

    host = system_helper.get_standby_controller_name()
    LOG.tc_step('Lock standby controller node {}'.format(host))
    HostsToRecover.add(host, scope='function')
    rtn_code, out = host_helper.lock_host(host)
    assert rtn_code == 0, out

    LOG.tc_step('Check that storage degrade alarm is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
        "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
        "Alarm {} not raised".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check OSDs are still up after lock')
    osd_list = storage_helper.get_osds(con_ssh=con_ssh)
    for osd_id in osd_list:
        osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
        msg = 'OSD ID {} should be up but is not'.format(osd_id)
        assert osd_up, msg
        msg = 'OSD ID {} is up'.format(osd_id)
        LOG.info(msg)

    LOG.tc_step('Unlock standby controller node {}'.format(host))
    rtn_code, out = host_helper.unlock_host(host, available_only=True)
    assert rtn_code == 0, out

    LOG.tc_step('Check that the host locked alarm is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
        "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check that the Storage Alarm Condition is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
        "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check health of CEPH cluster')
    msg = ''
    end_time = time.time() + 40
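    # poll ceph health for up to 40 seconds; the while-else asserts if it
    # never reports healthy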
    while time.time() < end_time:
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        if ceph_healthy:
            break
    else:
        assert 0, "ceph is not healthy"
Example no. 22
def test_system_alarms_and_events_on_lock_unlock_compute(no_simplex):
    """
    Verify fm alarm-show command

    Test Steps:
    - Delete active alarms
    - Lock a host
    - Check active alarm generated for host lock
    - Check relative values are the same in fm alarm-list and fm alarm-show
    <uuid>
    - Check host lock 'set' event logged via fm event-list
    - Unlock host
    - Check active alarms cleared via fm alarm-list
    - Check host lock 'clear' event logged via fm event-list
    """

    # Remove following step because it's unnecessary and fails the test when
    # alarm is re-generated
    # # Clear the alarms currently present
    # LOG.tc_step("Clear the alarms table")
    # system_helper.delete_alarms()

    # Raise a new alarm by locking a compute node
    # Get the compute
    compute_host = host_helper.get_up_hypervisors()[0]
    if compute_host == system_helper.get_active_controller_name():
        compute_host = system_helper.get_standby_controller_name()
        if not compute_host:
            skip('Standby controller unavailable')

    LOG.tc_step("Lock a nova hypervisor host {}".format(compute_host))
    pre_lock_time = common.get_date_in_format()
    HostsToRecover.add(compute_host)
    host_helper.lock_host(compute_host)

    LOG.tc_step("Check host lock alarm is generated")
    post_lock_alarms = \
        system_helper.wait_for_alarm(field='UUID', entity_id=compute_host,
                                     reason=compute_host,
                                     alarm_id=EventLogID.HOST_LOCK,
                                     strict=False,
                                     fail_ok=False)[1]

    LOG.tc_step(
        "Check related fields in fm alarm-list and fm alarm-show are of the "
        "same values")
    post_lock_alarms_tab = system_helper.get_alarms_table(uuid=True)

    alarms_l = ['Alarm ID', 'Entity ID', 'Severity', 'Reason Text']
    alarms_s = ['alarm_id', 'entity_instance_id', 'severity', 'reason_text']

    # Only 1 alarm since we are now checking the specific alarm ID
    for post_alarm in post_lock_alarms:
        LOG.tc_step(
            "Verify {} for alarm {} in alarm-list are in sync with "
            "alarm-show".format(
                alarms_l, post_alarm))

        alarm_show_tab = table_parser.table(cli.fm('alarm-show', post_alarm)[1])
        alarm_list_tab = table_parser.filter_table(post_lock_alarms_tab,
                                                   UUID=post_alarm)

        for i in range(len(alarms_l)):
            alarm_l_val = table_parser.get_column(alarm_list_tab,
                                                  alarms_l[i])[0]
            alarm_s_val = table_parser.get_value_two_col_table(alarm_show_tab,
                                                               alarms_s[i])

            assert alarm_l_val == alarm_s_val, \
                "{} value in alarm-list: {} is different than alarm-show: " \
                "{}".format(alarms_l[i], alarm_l_val, alarm_s_val)

    LOG.tc_step("Check host lock is logged via fm event-list")
    system_helper.wait_for_events(entity_instance_id=compute_host,
                                  start=pre_lock_time, timeout=60,
                                  event_log_id=EventLogID.HOST_LOCK,
                                  fail_ok=False, **{'state': 'set'})

    pre_unlock_time = common.get_date_in_format()
    LOG.tc_step("Unlock {}".format(compute_host))
    host_helper.unlock_host(compute_host)

    LOG.tc_step("Check host lock active alarm cleared")
    alarm_sets = [(EventLogID.HOST_LOCK, compute_host)]
    system_helper.wait_for_alarms_gone(alarm_sets, fail_ok=False)

    LOG.tc_step("Check host lock clear event logged")
    system_helper.wait_for_events(event_log_id=EventLogID.HOST_LOCK,
                                  start=pre_unlock_time,
                                  entity_instance_id=compute_host,
                                  fail_ok=False, **{'state': 'clear'})
Example no. 23
def test_lock_unlock_host(host_type, collect_kpi):
    """
    Verify lock unlock host

    Test Steps:
        - Select a host per given type. If type is controller, select standby controller.
        - Lock selected host and ensure it is successfully locked
        - Unlock selected host and ensure it is successfully unlocked

    """
    init_time = None
    if collect_kpi:
        init_time = common.get_date_in_format(date_format=KPI_DATE_FORMAT)

    LOG.tc_step("Select a {} node from system if any".format(host_type))
    if host_type == 'controller':
        if system_helper.is_aio_simplex():
            host = 'controller-0'
        else:
            host = system_helper.get_standby_controller_name()
            assert host, "No standby controller available"

    else:
        if host_type == 'compute' and (system_helper.is_aio_duplex()
                                       or system_helper.is_aio_simplex()):
            skip("No compute host on AIO system")
        elif host_type == 'storage' and not system_helper.is_storage_system():
            skip("System does not have storage nodes")

        hosts = system_helper.get_hosts(personality=host_type,
                                        availability=HostAvailState.AVAILABLE,
                                        operational=HostOperState.ENABLED)

        assert hosts, "No good {} host on system".format(host_type)
        host = hosts[0]

    LOG.tc_step(
        "Lock {} host - {} and ensure it is successfully locked".format(
            host_type, host))
    HostsToRecover.add(host)
    host_helper.lock_host(host, swact=False)

    # wait for services to stabilize before unlocking
    time.sleep(20)

    # unlock standby controller node and verify controller node is successfully unlocked
    LOG.tc_step(
        "Unlock {} host - {} and ensure it is successfully unlocked".format(
            host_type, host))
    host_helper.unlock_host(host)

    LOG.tc_step("Check helm list after host unlocked")
    con_ssh = ControllerClient.get_active_controller()
    con_ssh.exec_cmd('helm list', fail_ok=False)

    if collect_kpi:
        lock_kpi_name = HostLock.NAME.format(host_type)
        unlock_kpi_name = HostUnlock.NAME.format(host_type)
        unlock_host_type = host_type
        if container_helper.is_stx_openstack_deployed():
            if system_helper.is_aio_system():
                unlock_host_type = 'compute'
        else:
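            # without stx-openstack deployed, use the '_platform' kpi variants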
            lock_kpi_name += '_platform'
            unlock_kpi_name += '_platform'
            if unlock_host_type == 'compute':
                unlock_host_type = 'compute_platform'

        LOG.info("Collect kpi for lock/unlock {}".format(host_type))
        code_lock, out_lock = kpi_log_parser.record_kpi(
            local_kpi_file=collect_kpi,
            kpi_name=lock_kpi_name,
            host=None,
            log_path=HostLock.LOG_PATH,
            end_pattern=HostLock.END.format(host),
            start_pattern=HostLock.START.format(host),
            start_path=HostLock.START_PATH,
            init_time=init_time)

        time.sleep(30)  # delay in sysinv log vs nova hypervisor list
        code_unlock, out_unlock = kpi_log_parser.record_kpi(
            local_kpi_file=collect_kpi,
            kpi_name=unlock_kpi_name,
            host=None,
            log_path=HostUnlock.LOG_PATH,
            end_pattern=HostUnlock.END[unlock_host_type].format(host),
            init_time=init_time,
            start_pattern=HostUnlock.START.format(host),
            start_path=HostUnlock.START_PATH)

        assert code_lock == 0, 'Failed to collect kpi for host-lock {}. ' \
                               'Error: {}\n'.format(host, out_lock)
        assert code_unlock == 0, 'Failed to collect kpi for host-unlock {}. ' \
                                 'Error: {}\n'.format(host, out_unlock)
Example no. 24
def test_get_active_con():
    active = system_helper.get_active_controller_name()
    standby = system_helper.get_standby_controller_name()

    LOG.tc_step("Active: {}; Standby: {}".format(active, standby))
    assert 'controller-' in active
Example no. 25
def fail_controller(request):
    standby = system_helper.get_standby_controller_name()
    HostsToRecover.add(standby, scope='function')
    host_helper.reboot_hosts(standby, wait_for_reboot_finish=False)

    return True
Example no. 26
def test_swact_100_times():
    """
    Skip Condition:
        - Less than two controllers on system

    Test Steps:
        - Boot a vm and ensure it's pingable
        - Start writing from pre-existed vm before swacting
        - Repeat following steps 100 times:
            - ensure system has standby controller
            - system host-swact
            - ensure all services are active in sudo sm-dump on new active controller
            - ensure pre-existed vm is still pingable from NatBox
            - ensure writing did not stop on pre-existed vm
            - ensure new vm can be launched in 2 minutes
            - ensure newly booted vm is pingable from NatBox
            - delete newly booted vm

    Teardown:
        - delete vms, volumes

    """
    if len(system_helper.get_controllers()) < 2:
        skip("Less than two controllers on system")

    if not system_helper.get_standby_controller_name():
        assert False, "No standby controller on system"

    LOG.tc_step("Boot a vm and ensure it's pingable")
    vm_base = vm_helper.boot_vm(name='pre_swact', cleanup='function')[1]

    LOG.tc_step("Start writing from pre-existed vm before swacting")
    end_event = Events("End write in base vm")
    base_vm_thread = vm_helper.write_in_vm(vm_base, end_event=end_event, expect_timeout=40, thread_timeout=60*100)

    try:
        for i in range(100):
            iter_str = "Swact iter{}/100 - ".format(i+1)

            LOG.tc_step("{}Ensure system has standby controller".format(iter_str))
            standby = system_helper.get_standby_controller_name()
            assert standby

            LOG.tc_step("{}Swact active controller and ensure active controller is changed".format(iter_str))
            host_helper.swact_host()

            LOG.tc_step("{}Check all services are up on active controller via sudo sm-dump".format(iter_str))
            host_helper.wait_for_sm_dump_desired_states(controller=standby, fail_ok=False)

            LOG.tc_step("{}Ensure pre-existed vm still pingable post swact".format(iter_str))
            vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm_base, timeout=45)

            time.sleep(5)
            LOG.tc_step("{}Ensure writing from pre-existed vm resumes after swact".format(iter_str))
            assert base_vm_thread.res is True, "Writing in pre-existed vm stopped after {}".format(iter_str.lower())

            LOG.tc_step("{}Attempt to boot new vm after 2 minutes of post swact and ensure it's pingable".
                        format(iter_str))
            time.sleep(60)
            for j in range(3):
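                # retry the vm boot up to 3 times before failing the iteration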
                code, vm_new, msg = vm_helper.boot_vm(name='post_swact', fail_ok=True, cleanup='function')

                if code == 0:
                    break

                LOG.warning("VM failed to boot - attempt{}".format(j+1))
                vm_helper.delete_vms(vms=vm_new)
                assert j < 2, "No vm can be booted 2+ minutes after swact"

                LOG.tc_step("{}VM{} failed to boot, wait for 30 seconds and retry".format(iter_str, j+1))
                time.sleep(30)

            vm_helper.wait_for_vm_pingable_from_natbox(vm_new)

            LOG.tc_step("{}Delete the vm created".format(iter_str))
            vm_helper.delete_vms(vms=vm_new)
    except:
        raise
    finally:
        LOG.tc_step("End the base_vm_thread")
        end_event.set()
        base_vm_thread.wait_for_thread_end(timeout=20)

    post_standby = system_helper.get_standby_controller_name()
    assert post_standby, "System does not have standby controller after last swact"
Example no. 27
def test_dc_modify_timezone(prev_check):
    """
    Test timezone modify on system controller and subcloud. Ensure timezone change is not
    propagated.
    Setups:
        - Ensure both central and subcloud regions are configured with UTC
        - Get the timestamps for host created_at before timezone modify

    Test Steps
        - Change the timezone in central region and wait until the change is applied
        - Change the timezone to a different zone in subcloud and wait until the change is applied
        - Verify host created_at timestamp updated according to the local timezone for the region
        - Swact on subcloud and ensure timezone and host created_at timestamp persists locally
        - Swact central controller and ensure timezone and host created_at timestamp persists
          in central and subcloud

    Teardown
        - Change timezone to UTC in both central and subcloud regions
        - Ensure host created_at timestamp is reverted to original

    """
    prev_central_time, prev_sub_time, central_zone, sub_zone, central_auth, subcloud_auth, \
        subcloud = prev_check

    LOG.tc_step("Modify timezone to {} in central region".format(central_zone))
    system_helper.modify_timezone(timezone=central_zone,
                                  auth_info=central_auth)

    LOG.tc_step(
        "Waiting for timestamp for host created_at to update in central region"
    )
    post_central_time = wait_for_timestamp_update(
        prev_timestamp=prev_central_time, auth_info=central_auth)
    assert post_central_time != prev_central_time, \
        "host created_at timestamp did not update after timezone changed " \
        "to {} in central region".format(central_zone)

    LOG.tc_step("Modify timezone to {} in {}".format(sub_zone, subcloud))
    system_helper.modify_timezone(timezone=sub_zone, auth_info=subcloud_auth)

    LOG.tc_step(
        "Waiting for timestamp for same host created_at to update in {}".
        format(subcloud))
    post_sub_time = wait_for_timestamp_update(prev_timestamp=prev_sub_time,
                                              auth_info=subcloud_auth)
    assert post_sub_time != prev_sub_time, \
        "host created_at timestamp did not update after timezone changed to {} " \
        "in {}".format(sub_zone, subcloud)
    assert post_sub_time != post_central_time, \
        "Host created_at timestamp is the same on central and {} when configured with different " \
        "timezones".format(subcloud)

    LOG.tc_step(
        "Ensure host created_at timestamp does not change after subcloud sync audit"
    )
    dc_helper.wait_for_sync_audit(subclouds=subcloud,
                                  fail_ok=True,
                                  timeout=660)
    post_sync_sub_time = system_helper.get_host_values(
        host='controller-0', fields='created_at', auth_info=subcloud_auth)[0]
    assert post_sub_time == post_sync_sub_time, \
        "Host created_at timestamp changed after sync audit on {}".format(subcloud)

    if not system_helper.is_aio_simplex():
        LOG.tc_step(
            "Swact in {} region and verify timezone persists locally".format(
                subcloud))
        host_helper.swact_host(auth_info=subcloud_auth)
        post_swact_sub_zone = system_helper.get_timezone(
            auth_info=subcloud_auth)
        assert post_swact_sub_zone == sub_zone

        post_swact_sub_time = system_helper.get_host_values(
            host='controller-0', fields='created_at',
            auth_info=subcloud_auth)[0]
        assert post_swact_sub_time == post_sub_time

    if system_helper.get_standby_controller_name(auth_info=central_auth):
        LOG.tc_step(
            "Swact in central region, and ensure timezone persists locally in central"
            " and subcloud")
        host_helper.swact_host(auth_info=central_auth)

        # Verify central timezone persists
        post_swact_central_zone = system_helper.get_timezone(
            auth_info=central_auth)
        assert post_swact_central_zone == central_zone
        post_swact_central_time = system_helper.get_host_values(
            host='controller-0', fields='created_at',
            auth_info=central_auth)[0]
        assert post_swact_central_time == post_central_time

        # Verify subcloud timezone persists
        post_central_swact_sub_zone = system_helper.get_timezone(
            auth_info=subcloud_auth)
        assert post_central_swact_sub_zone == sub_zone
        post_central_swact_sub_time = system_helper.get_host_values(
            host='controller-0', fields='created_at',
            auth_info=subcloud_auth)[0]
        assert post_central_swact_sub_time == post_sub_time
Example no. 28
def _test_system_alarm_on_host_lock():
    """
    Verify fm event-list command in the system upon host-lock

    Scenario:
    1. Execute "fm alarm-list" command in the system.
    2. Lock one compute and wait 30 seconds.
    3. Verify commands return list of active alarms in table with expected
    rows.
    """

    LOG.info("Execute fm alarm-list. Verify the table header " +
             "consists of the correct items")

    # Get and save the list of existing alarms present in the system
    res, out = cli.fm('alarm-list')
    alarm_list = table_parser.table(out)

    if len(alarm_list['values']) == 0:
        LOG.info("There are no alarms present in the alarm list")

    current_alarms = []
    for alarm in alarm_list['values']:
        if re.match(".", alarm[0].strip()) is not None:
            current_alarms.append(alarm[0])
            LOG.info("The current alarms in the system are: "
                     "{0}".format(alarm[0]))

    # Get the historical list of alarms
    hist_alarm_table = system_helper.get_events_table(limit=15, show_uuid=True)

    # Check that a valid alarm header is present
    alarm_header = [
        'UUID', 'Time Stamp', 'State', 'Event Log ID', 'Reason Text',
        'Entity Instance ID', 'Severity'
    ]
    if hist_alarm_table['headers'] != alarm_header:
        LOG.info("Fields in table not correct actual {0} expected {1}".format(
            hist_alarm_table['headers'], alarm_header))

    # Verify the existing alarms are present in the historical list in state 'set'
    for name in current_alarms:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('alarm: %s  state: %s' % (name, alarm_state))
        if alarm_state != ['set']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Raise a new alarm by locking a compute node
    # Get the compute
    LOG.info("Lock compute and wait 30 seconds")
    host = 'compute-1'
    if system_helper.is_aio_duplex():
        host = system_helper.get_standby_controller_name()

    HostsToRecover.add(host, scope='function')
    host_helper.lock_host(host)
    time.sleep(20)

    # Verify the new alarm is present in the historical alarm and active alarm lists
    LOG.info("Verify alarm-list command returns list of active alarms")
    res, out = cli.fm('alarm-list')
    new_active_alarm_table = table_parser.table(out)

    if len(new_active_alarm_table['values']) == 0:
        LOG.info("There are no alarms present in the alarm list")

    # Save the list of new alarms present in the list
    new_alarms = []
    for alarm in new_active_alarm_table['values']:
        if (re.match(".", alarm[0].strip()) is not None):
            new_alarms.append(alarm[0])
            LOG.info("The alarm ID in the alarm list table is: "
                     "{0}".format(alarm[0]))

    # Identify the new alarms
    new_alarm_list = list(set(new_alarms) - set(current_alarms))
    LOG.info(new_alarm_list)

    # Verify the new alarms are present in the historical list in state 'set'
    # Get the historical list of alarms
    hist_alarm_table = system_helper.get_events_table(limit=15, show_uuid=True)

    for name in new_alarm_list:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('new alarm: %s  state: %s' % (name, alarm_state))
        if alarm_state != ['set']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Clear the alarm by unlocking the compute node
    LOG.info("Unlock compute and wait 30 seconds")
    compute_ssh = host_helper.unlock_host(host)
    time.sleep(30)

    # Verify the alarm clear is shown in the historical table
    LOG.info("Verify event-list command returns list of active alarms")
    hist_alarm_table = system_helper.get_events_table(limit=15, show_uuid=True)

    for name in new_alarm_list:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('new alarm: %s  state: %s' % (name, alarm_state))
        if alarm_state != ['clear']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Verify the alarm disappears from the active alarm table
    LOG.info("Verify alarm-list command returns list of active alarms")
    res, out = cli.fm('alarm-list')
    new_active_alarm_table = table_parser.table(out)

    active_alarms = []
    for alarm in new_active_alarm_table['values']:
        if re.match(".", alarm[0].strip()) is not None:
            active_alarms.append(alarm[0])
            LOG.info("The alarm ID in the alarm list table is: "
                     "{0}".format(alarm[0]))

    # Identify the new alarms
    for name in new_alarm_list:
        if name in active_alarms:
            LOG.info("The alarm was not cleared from the active alarm table")
            test_res = False
            break
Example no. 29
def test_dc_dns_override_local_change(ensure_synced):
    """
    Verify DNS modification on subcloud will be overridden by central region config
    Args:
        ensure_synced: test fixture

    Setups:
        - Ensure primary subcloud is managed and DNS config is valid and synced

    Test Steps:
        - Un-manage primary subcloud
        - Configure DNS servers on primary subcloud to an unreachable IP address (8.4.4.4)
        - Wait for sync log for any managed subcloud with best effort
        - Ensure DNS config is not updated on unmanaged primary subcloud
        - Verify nslookup passes on central region and fails on primary subcloud
        - Re-manage primary subcloud and ensure DNS config syncs over
        - Verify nslookup in Central Region and primary subcloud are working as expected

    Teardown:
        - Manage primary subcloud if not managed (module)
        - Reset DNS servers to original value on central region (module)

    """
    primary_subcloud, managed_subclouds, sc_dns = ensure_synced
    new_dns_servers = compose_new_dns_servers(scenario='unreachable_server',
                                              prev_dns_servers=sc_dns)

    LOG.tc_step("Unmanage {}".format(primary_subcloud))
    dc_helper.unmanage_subcloud(subcloud=primary_subcloud, check_first=True)

    LOG.tc_step("Reconfigure DNS on {} from {} to {}".format(
        primary_subcloud, sc_dns, new_dns_servers))
    system_helper.set_dns_servers(new_dns_servers,
                                  auth_info=Tenant.get(
                                      'admin_platform',
                                      dc_region=primary_subcloud))

    managed_cloud = managed_subclouds[0] if managed_subclouds else ''
    LOG.tc_step(
        "Wait for sync update log for managed subcloud {} with best effort".
        format(managed_cloud))
    dc_helper.wait_for_sync_audit(subclouds=managed_cloud,
                                  fail_ok=True,
                                  timeout=660)

    LOG.tc_step(
        "Ensure DNS config is not updated on unmanaged subcloud: {}".format(
            primary_subcloud))
    code = dc_helper.wait_for_subcloud_dns_config(subcloud=primary_subcloud,
                                                  expected_dns=sc_dns,
                                                  fail_ok=True,
                                                  timeout=60)[0]
    assert 1 == code, "Actual return code: {}".format(code)

    LOG.tc_step("Verify nslookup fails on {}".format(primary_subcloud))
    central_res, local_res = verify_dns_on_central_and_subcloud(
        primary_subcloud, fail_ok=True, sc_dns=sc_dns)
    assert 0 == central_res, "nslookup failed on central region"
    assert 1 == local_res, "nslookup succeeded on {} with unreachable DNS servers configured".\
        format(primary_subcloud)

    central_auth = Tenant.get('admin_platform', dc_region='RegionOne')
    if system_helper.get_standby_controller_name(auth_info=central_auth):
        LOG.tc_step("Swact in central region")
        host_helper.swact_host(auth_info=central_auth)

    LOG.tc_step(
        'Re-manage {} and ensure local DNS config is overridden by central config'
        .format(primary_subcloud))
    dc_helper.manage_subcloud(subcloud=primary_subcloud, check_first=False)
    dc_helper.wait_for_subcloud_dns_config(subcloud=primary_subcloud,
                                           expected_dns=sc_dns)

    LOG.tc_step('Verify nslookup works in Central Region and {}'.format(
        primary_subcloud))
    verify_dns_on_central_and_subcloud(primary_subcloud, sc_dns=sc_dns)
Example no. 30
def test_horizon_storage_overview_display(storage_overview_pg):
    """
    Tests the storage overview display:

    Setups:
        - Login as Admin
        - Go to Admin > Platform > Storage Overview

    Teardown:
        - Logout

    Test Steps:
        - Test Storage cluster UUID, Health Status and Details display
        - Test host and rank table display
        - Test osd.# table and status display
    """
    con_ssh = ControllerClient.get_active_controller()
    standby = system_helper.get_standby_controller_name(con_ssh=con_ssh)
    if standby == 'controller-1':
        LOG.info('Workaround for CGTS-16739')
        host_helper.swact_host(con_ssh=con_ssh)

    LOG.tc_step('Check storage cluster UUID, ceph health and storage usage display')
    cli_storage_service_info = []

    uuid = system_helper.get_clusters(field='cluster_uuid')[0]
    cli_storage_service_info.append(uuid)

    # 'ceph health' cmd output sample:
    # HEALTH_ERR 1728 pgs are stuck inactive for more than 300 seconds; 1728 pgs stuck inactive; 1728 pgs stuck unclean;
    # 1 mons down, quorum 0,1 controller-0,controller-1
    health_details = con_ssh.exec_cmd('ceph health')[1]
    health_status = health_details.split(' ')[0]
    cli_storage_service_info.append(health_status)

    if health_status == 'HEALTH_ERR':
        health_details = health_details.split('HEALTH_ERR ')[1]
    elif health_status == 'HEALTH_WARN':
        health_details = health_details.split('HEALTH_WARN ')[1]
    cli_storage_service_info.append(health_details)

    horizon_ceph_info = storage_overview_pg.storage_service_info.get_content()
    for info in cli_storage_service_info:
        assert info in horizon_ceph_info.values(), 'Horizon storage cluster info does not match to cli info'
    LOG.tc_step('Storage service details display correct')

    LOG.info('Test host and rank table display')
    ceph_mon_status = eval(con_ssh.exec_cmd('ceph mon_status')[1])
    mon_map = ceph_mon_status.get('monmap')
    cli_ceph_monitor = {}
    # mon_map.get('mons') returns a dict list
    for mon_info_dict in mon_map.get('mons'):
        host_name = mon_info_dict.get('name')
        host_rank = mon_info_dict.get('rank')
        cli_ceph_monitor[host_name] = str(host_rank)

    for host_name in cli_ceph_monitor.keys():
        cli_rank_val = cli_ceph_monitor[host_name]
        horizon_rank_val = storage_overview_pg.get_storage_overview_monitor_info(host_name, 'Rank')
        assert horizon_rank_val == cli_rank_val, '{} rank display incorrectly'.format(host_name)

    LOG.info('Host and rank table display correct')

    LOG.tc_step('Test osd table and status display')
    osd_list = storage_helper.get_osds()
    for osd_id in osd_list:
        if osd_id is not None:
            expt_horizon = {}
            for header in storage_overview_pg.osds_table.column_names:
                host_name = storage_helper.get_osd_host(osd_id)
                osd_name = 'osd.{}'.format(osd_id)
                expt_horizon['Host'] = host_name
                expt_horizon['Name'] = osd_name
                expt_horizon['Status'] = 'up'
                if not storage_helper.is_osd_up(osd_id, con_ssh):
                    expt_horizon['Status'] = 'down'
                horizon_val = storage_overview_pg.get_storage_overview_osd_info(osd_name, header)
                assert expt_horizon[header] == horizon_val, '{}{} display incorrect'.format(osd_name, header)
    LOG.info('Osd table display correct')
    horizon.test_result = True