Example 1
def __clear_config_out_of_date_alarms(hosts):
    LOG.info("Check config out-of-date alarms are raised against the nodes, then lock/unlock them to clear the alarms")
    for node in hosts:
        system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE, entity_id="host={}".format(node))

    LOG.info("Wait 60 seconds to ensure the service parameter is applied")
    time.sleep(60)

    host_helper.lock_unlock_hosts(hosts=hosts)
    for node in hosts:
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CONFIG_OUT_OF_DATE, entity_id="host={}".format(node))
Example 2
def test_patch_orch_reject_with_alarms(patch_orchestration_setup, patch_function_check):
    """
    This test verifies that the patch orchestration operation cannot proceed in the presence of alarms that are not
    normally ignored by the orchestration. The test generates the alarm (700.002 - VM paused) before executing the
    patch orchestration.
    Args:
        patch_orchestration_setup:
        patch_function_check

    Returns:

    """
    vms = patch_function_check
    patches, controllers, computes, storages = patch_orchestration_setup

    LOG.tc_step("Generate VM paused (700.002) critical alarm")
    paused_vm, unpaused_vm = vms
    vm_helper.pause_vm(paused_vm)
    system_helper.wait_for_alarm(alarm_id='700.002')

    patch = patching_helper.parse_test_patches(patch_ids=patches, search_str='RR_ALLNODES')[0]
    patch_file = patches[patch]
    LOG.tc_step("Upload patch file {}".format(patch_file))
    uploaded_id = patching_helper.upload_patches(patch_files=patch_file)[1][0]
    assert patch == uploaded_id, "Expected patch {} and uploaded patch {} mismatch"\
        .format(patch, uploaded_id)
    LOG.info("Patch {} uploaded".format(uploaded_id))

    LOG.tc_step("Apply patch {}".format(uploaded_id))
    applied = patching_helper.apply_patches(patch_ids=[uploaded_id])[1]
    LOG.info("Patch {} applied".format(applied))

    LOG.tc_step("Attempt to create patch orchestration strategy; expected to fail")
    rc, msg = orchestration_helper.create_strategy('patch', fail_ok=True)
    assert rc != 0, "Patch orchestration strategy was created despite the critical alarm; creation was expected to fail: {}"\
        .format(msg)

    LOG.info("Delete the failed patch orchestration strategy")
    orchestration_helper.delete_strategy("patch")

    LOG.tc_step("Remove test patch {}".format(applied))
    patching_helper.remove_patches(patch_ids=applied)
    assert 0 == patching_helper.wait_for_patch_states(applied, expected_states=PatchState.AVAILABLE)[0]

    LOG.tc_step("Un-pause vm after test patch removal, and check vms are in good state.")
    vm_helper.unpause_vm(paused_vm)
    vm_helper.wait_for_vm_pingable_from_natbox(paused_vm)
    check_vms(vms)
Example 3
def test_dead_office_recovery(reserve_unreserve_all_hosts_module):
    """
    Test dead office recovery with vms
    Args:
        reserve_unreserve_all_hosts_module: test fixture to reserve unreserve all vlm nodes for lab under test

    Setups:
        - Reserve all nodes in vlm

    Test Steps:
        - Boot 5 vms with various boot_source, disks, etc and ensure they can be reached from NatBox
        - Power off all nodes in vlm using multi-processing to simulate a power outage
        - Power on all nodes
        - Wait for nodes to become online/available
        - Check vms are recovered after hosts come back up and vms can be reached from NatBox

    """
    LOG.tc_step("Boot 5 vms with various boot_source, disks, etc")
    vms = vm_helper.boot_vms_various_types()

    hosts = system_helper.get_hosts()
    hosts_to_check = system_helper.get_hosts(availability=['available', 'online'])

    LOG.info("Online or Available hosts before power-off: {}".format(hosts_to_check))
    LOG.tc_step("Powering off hosts in multi-processes to simulate power outage: {}".format(hosts))
    region = None
    if ProjVar.get_var('IS_DC'):
        region = ProjVar.get_var('PRIMARY_SUBCLOUD')

    try:
        vlm_helper.power_off_hosts_simultaneously(hosts, region=region)
    finally:
        LOG.tc_step("Wait for 60 seconds and power on hosts: {}".format(hosts))
        time.sleep(60)
        LOG.info("Hosts to check after power-on: {}".format(hosts_to_check))
        vlm_helper.power_on_hosts(hosts, reserve=False, reconnect_timeout=HostTimeout.REBOOT+HostTimeout.REBOOT,
                                  hosts_to_check=hosts_to_check, region=region)

    LOG.tc_step("Check vms are recovered after dead office recovery")
    vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)
    for vm in vms:
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm, timeout=VMTimeout.DHCP_RETRY)
    computes = host_helper.get_hypervisors()
    if len(computes) >= 4:
        system_helper.wait_for_alarm(alarm_id=EventLogID.MULTI_NODE_RECOVERY, timeout=120)
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.MULTI_NODE_RECOVERY, check_interval=60, timeout=1200)
Example 4
def wait_after_change_sysadmin_password():
    total_wait_time = MAX_WAIT_FOR_ALARM
    each_wait_time = 120
    waited_time = 0

    time.sleep(30)

    alarm_id = ALARM_ID_OUTOF_CONFIG
    while waited_time < total_wait_time:
        waited_time += each_wait_time

        found = system_helper.wait_for_alarm(alarm_id=alarm_id,
                                             fail_ok=True,
                                             timeout=each_wait_time)
        if found:
            LOG.info('OK, found alarm for password change, alarm-id:{}'.format(
                alarm_id))
            alarm_gone = system_helper.wait_for_alarm_gone(
                alarm_id, fail_ok=True, timeout=each_wait_time)
            if alarm_gone:
                LOG.info(
                    'OK, alarm for password change was cleared, alarm-id:{}'
                    .format(alarm_id))
                break
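    # The 'else' on this while loop runs only if the loop exhausts
    # total_wait_time without hitting the 'break' above, i.e. the alarm was
    # never raised or never cleared within the overall window.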
    else:
        assert False, 'Alarm for password change was not found or not cleared within {} seconds, ' \
                      'expecting alarm-id:{}'.format(waited_time, alarm_id)
    return True
Example 5
def test_reapply_openstack():
    container_helper.wait_for_apps_status(apps="stx-openstack",
                                          status=AppStatus.APPLIED,
                                          timeout=600,
                                          check_interval=60)
    container_helper.remove_app(app_name="stx-openstack", check_first=True)
    alarm_id = EventLogID.CONFIG_OUT_OF_DATE
    if system_helper.wait_for_alarm(alarm_id=alarm_id,
                                    entity_id='controller',
                                    timeout=15,
                                    fail_ok=True)[0]:
        system_helper.wait_for_alarm_gone(alarm_id=alarm_id,
                                          entity_id='controller',
                                          timeout=120,
                                          check_interval=10)
    container_helper.apply_app(app_name="stx-openstack",
                               check_first=False,
                               check_interval=300,
                               applied_timeout=5400)
    provider_network_setup(PHYSNET0, PHYSNET1)
    tenant_networking_setup(physnet0=PHYSNET0,
                            physnet1=PHYSNET1,
                            externalnet=EXTERNALNET,
                            publicnet=PUBLICNET,
                            privatenet=PRIVATENET,
                            internalnet=INTERNALNET,
                            publicsubnet=PUBLICSUBNET,
                            privatesubnet=PRIVATESUBNET,
                            internalsubnet=INTERNALSUBNET,
                            externalsubnet=EXTERNALSUBNET,
                            publicrouter=PUBLICROUTER,
                            privaterouter=PRIVATEROUTER)
Example 6
def alarm_summary_add_and_del(subcloud):
    try:
        # Test adding alarm on subcloud
        ssh_client = ControllerClient.get_active_controller(name=subcloud)
        LOG.info("Wait for alarm raised on subcloud {}".format(subcloud))
        system_helper.wait_for_alarm(
            alarm_id=EventLogID.PROVIDER_NETWORK_FAILURE, con_ssh=ssh_client)
        LOG.tc_step(
            "Ensure alarm summary on Central matches subcloud: {}".format(
                subcloud))
        check_alarm_summary_match_subcloud(subcloud)

        # Test clearing alarm on subcloud
        LOG.tc_step("Clear alarm on subcloud: {}".format(subcloud))
        ssh_client.exec_cmd('fmClientCli -D host=testhost-0', fail_ok=False)
        LOG.info("Wait for alarm clear on subcloud {}".format(subcloud))
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.PROVIDER_NETWORK_FAILURE, con_ssh=ssh_client)
        check_alarm_summary_match_subcloud(subcloud)
    finally:
        ssh_client = ControllerClient.get_active_controller(name=subcloud)
        LOG.info("Clear alarm on subcloud: {}".format(subcloud))
        ssh_client.exec_cmd('fmClientCli -D host=testhost-0')
Example 7
def clear_config_out_of_date_alarm():
    active, standby = system_helper.get_active_standby_controllers()
    for host in (standby, active):
        if host and system_helper.wait_for_alarm(
                alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                timeout=5,
                entity_id=host,
                fail_ok=True)[0]:
            host_helper.lock_host(host, swact=True)
            time.sleep(60)
            host_helper.unlock_host(host)
            system_helper.wait_for_alarm_gone(
                alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                entity_id=host,
                fail_ok=False)
Example 8
def test_transition_sensorgroup_actions(host,
                                        event_type,
                                        action_level,
                                        action,
                                        suppression,
                                        expt_alarm,
                                        expt_host_avail,
                                        new_action,
                                        new_suppression,
                                        new_expt_alarm,
                                        new_expt_host_avail,
                                        sensor_data_fit):
    """
    Verify the sensorgroup can properly transition from one action to another when
    an event remains unchanged.

    Test Steps:
        - Get a sensorgroup to test
        - Set the event level and expected action
        - trigger an out-of-scope event for that sensorgroup
        - verify that the expected action is taken
        - transition the sensorgroup action
        - verify the new action is taken
    """
    bmc_hosts = sensor_data_fit
    if host not in bmc_hosts:
        skip("{} is not configured with BMC sensor".format(host))

    global HOST
    HOST = host
    # Derive the expected alarm severity from the action level suffix
    # (e.g. '..._major' -> 'major'), only when an alarm is expected
    expt_severity = action_level.split('_')[-1] if 'yes' in expt_alarm else None
    new_expt_severity = action_level.split('_')[-1] if 'yes' in new_expt_alarm else None

    if suppression is not None:
        suppression = (suppression == 'suppressed')
    if new_suppression is not None:
        new_suppression = (new_suppression == 'suppressed')

    for sensorgroup_name in bmc_helper.get_sensorgroup_name(host):
        LOG.tc_step("Validating that sensorgroup: {} can be set to sensor action: {} for event level: {}".
                    format(sensorgroup_name, action, action_level))

        # Set the sensorgroup action, suppress state, and audit interval
        bmc_helper.modify_sensorgroup(host, sensorgroup_name, value='name', audit_interval=10, suppress=suppression,
                                      **{action_level: action})

        # Get a sensor that is part of the sensorgroup
        sensor_name = bmc_helper.get_first_sensor_from_sensorgroup(sensorgroup_name, host)
        entity_id = 'host={}.sensor={}'.format(host, sensor_name)

        LOG.tc_step("Trigger event for sensorgroup: {} and sensor name: {}".format(sensorgroup_name, sensor_name))
        bmc_helper.trigger_event(host, sensor_name, event_type)

        LOG.tc_step("Check the alarm status for sensor: {}".format(sensor_name))
        res = system_helper.wait_for_alarm(alarm_id=EventLogID.BMC_SENSOR_ACTION, timeout=60, entity_id=entity_id,
                                           severity=expt_severity, strict=False, fail_ok=True)[0]

        if expt_alarm == 'yes_alarm':
            assert res, "FAIL: Alarm expected but no alarms found for sensor on {}".format(host)
        else:
            assert not res, "FAIL: Alarm raised but no alarms were expected for sensor on {}".format(host)

        LOG.tc_step("Check the host status for sensor: {}".format(sensor_name))
        system_helper.wait_for_host_values(host, timeout=90, availability=expt_host_avail, fail_ok=False)

        start_time = common.get_date_in_format()
        # modify sensorgroup with new action/suppression level
        LOG.tc_step("Transition sensorgroup: {} from current sensor action: {} to new sensor action: {} "
                    "for event level: {}".format(sensorgroup_name, action, new_action, action_level))

        bmc_helper.modify_sensorgroup(host, sensorgroup_name, value='name', suppress=new_suppression,
                                      **{action_level: new_action})

        # Verify the new action is taken
        LOG.tc_step("Check alarm status after transition from {} to {} for {}".format(action, new_action, sensor_name))

        if new_expt_alarm == 'yes_alarm':
            system_helper.wait_for_alarm(alarm_id=EventLogID.BMC_SENSOR_ACTION, entity_id=entity_id,
                                         severity=new_expt_severity, timeout=60, strict=False, fail_ok=False)
        else:
            events = system_helper.wait_for_events(timeout=60, num=10, event_log_id=EventLogID.BMC_SENSOR_ACTION,
                                                   entity_instance_id=entity_id, start=start_time, state='log',
                                                   fail_ok=True, strict=False, severity=new_expt_severity)
            if new_expt_alarm == 'yes_log':
                assert events, "No event log found for {} {} {} event".format(host, sensorgroup_name, action_level)
            else:
                assert not events, "Event logged unexpectedly for sensor on {}".format(host)
                system_helper.wait_for_alarm_gone(EventLogID.BMC_SENSOR_ACTION, entity_id=entity_id, strict=False,
                                                  timeout=5, fail_ok=False)

        LOG.tc_step("Check the host status for sensor: {}".format(sensor_name))
        system_helper.wait_for_host_values(host, timeout=90, availability=new_expt_host_avail, fail_ok=False)

        LOG.tc_step("Check the alarm clears and host in available state after clearing events")
        bmc_helper.clear_events(host)
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.BMC_SENSOR_ACTION, entity_id=host, strict=False,
                                          timeout=60)
        system_helper.wait_for_host_values(host, fail_ok=False, availability='available')

    HOST = ''
Example 9
def test_sensorgroup_power_cycle(host,
                                 eventlevel,
                                 action,
                                 expected_host_state,
                                 expected_alarm_state,
                                 event_type,
                                 suppressionlevel, sensor_data_fit):
    """
    Verify that the sensorgroup action taken for an event is valid.

    Test Steps:
        - Get a sensorgroup to test
        - Set the event level and expected action
        - trigger an out-of-scope event for that sensorgroup
        - verify that the expected action is taken

    """
    bmc_hosts = sensor_data_fit
    if host not in bmc_hosts:
        skip("{} is not configured with BMC sensor".format(host))

    global HOST
    HOST = host

    if suppressionlevel == 'suppressed':
        # global SUPPRESSED
        # SUPPRESSED = host
        suppress = True
    else:
        suppress = False

    expt_severity = eventlevel.split('_')[-1] if 'yes' in expected_alarm_state else None

    # Get a sensor to validate
    sensorgroup_name = random.choice(bmc_helper.get_sensor_names(host, sensor_group=True))
    for i in range(4):
        LOG.info("################## iter {} #########################".format(i+1))
        LOG.tc_step("Validating that sensorgroup: {} "
                    "can be set to sensor action: {} "
                    "for event level: {}".format(sensorgroup_name, action,
                                                 eventlevel))

        # Set the event level and action
        bmc_helper.modify_sensorgroup(host, sensorgroup_name, value='name', suppress=suppress, audit_interval=10,
                                      **{eventlevel: action})

        # Get a sensor that is part of the sensorgroup
        sensor_name = bmc_helper.get_first_sensor_from_sensorgroup(sensorgroup_name, host)
        entity_id = 'host={}.sensor={}'.format(host, sensor_name)

        LOG.tc_step("Trigger event for sensorgroup: {} and sensor name: {}".
                    format(sensorgroup_name, sensor_name))
        if action in ['power-cycle', 'reset']:
            HostsToRecover.add(host)

        start_time = common.get_date_in_format()
        bmc_helper.trigger_event(host, sensor_name, event_type)

        LOG.tc_step("Check sensor status and alarm for {}".format(sensor_name))
        if expected_alarm_state == 'yes_alarm':
            system_helper.wait_for_alarm(alarm_id=EventLogID.BMC_SENSOR_ACTION, entity_id=entity_id,
                                         severity=expt_severity, timeout=60, strict=False, fail_ok=False)
        else:
            events = system_helper.wait_for_events(timeout=60, num=10, event_log_id=EventLogID.BMC_SENSOR_ACTION,
                                                   entity_instance_id=entity_id, start=start_time, state='log',
                                                   severity=expt_severity, fail_ok=True, strict=False)
            if expected_alarm_state == 'yes_log':
                assert events, "No event log found for {} {} {} event".format(host, sensorgroup_name, eventlevel)
            else:
                assert not events, "Event logged unexpectedly for sensor on {}".format(host)
                system_helper.wait_for_alarm_gone(EventLogID.BMC_SENSOR_ACTION, entity_id=entity_id, strict=False,
                                                  timeout=5, fail_ok=False)

        LOG.tc_step("Check the host status for sensor: {}".format(sensor_name))
        host_state_timeout = 120
        if action == 'reset':
            host_state_timeout = 1080  # 15 min reset interval in between two reset triggers
        system_helper.wait_for_host_values(host, timeout=host_state_timeout, fail_ok=False,
                                           availability=expected_host_state)
        if action == 'power-cycle':
            system_helper.wait_for_host_values(host, timeout=20, task=HostTask.POWER_CYCLE, strict=False)

        LOG.tc_step("Check the alarm clears and host in available state after clearing events")
        bmc_helper.clear_events(host)
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.BMC_SENSOR_ACTION, entity_id=host, strict=False,
                                          timeout=60)
        wait_time = 3000 if action == 'power-cycle' else HostTimeout.REBOOT
        expt_states = {'availability': 'available'}
        strict = True
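        # On the 4th consecutive power-cycle event, the host is expected to be
        # left powered off (unlocked/disabled with a power-down task) instead
        # of recovering to 'available'.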
        if action == 'power-cycle' and i == 3:
            wait_time = 1200
            strict = False
            expt_states = {'availability': HostAvailState.POWER_OFF,
                           'operational': HostOperState.DISABLED,
                           'administrative': HostAdminState.UNLOCKED,
                           'task': HostTask.POWER_DOWN}

        system_helper.wait_for_host_values(host, fail_ok=False, timeout=wait_time, strict=strict, **expt_states)

    LOG.tc_step("Power on {} after test ends".format(host))
    host_helper.lock_host(host=host)
    host_helper.power_on_host(host=host)
    HOST = ''
Example 10
def test_dc_ntp_modify(ntp_precheck):
    """
    Update NTP servers on central region and check it is propagated to subclouds
    Args:
        ntp_precheck (fixture for test setup and teardown)

    Setups:
        - Ensure primary subcloud is managed and NTP config is in sync with central region
        - Un-manage the rest of the subclouds except one

    Test Steps:
        - Un-manage primary subcloud
        - Configure NTP servers on above unmanaged subcloud to remove the first NTP server
        - Configure NTP servers on central region to add an invalid server 8.8.8.8
        - Lock/unlock controllers on central region to apply the config
        - Wait for new NTP config to sync over to the only managed subcloud and config out-of-date alarms appear
        - Lock/unlock controllers on managed subcloud to apply config
        - Ensure central NTP config does not sync to unmanaged primary subcloud
        - Re-manage primary subcloud and ensure NTP config syncs over
        - Lock/unlock controllers in primary subcloud to apply new NTP configuration
        - Verify fm alarm 100.114 appears for invalid/unreachable NTP server on central region and managed subcloud

    Teardown:
        - Reset NTP servers to original value
        - Lock/unlock controllers on all managed subclouds to clear the config out of date alarm
        - Re-manage subclouds that were unmanaged in setup
        - Verify no config out-of-date alarms on the re-managed subclouds

    """
    primary_subcloud, managed_subcloud, prev_central_ntp = ntp_precheck
    new_central_ntp = ['8.8.8.8'] + prev_central_ntp[:-1]
    local_subcloud_ntp = prev_central_ntp[1:]

    central_auth = Tenant.get('admin_platform', dc_region='RegionOne')
    primary_sub_auth = Tenant.get('admin_platform', dc_region=primary_subcloud)
    auth_list = [central_auth, primary_sub_auth]
    if managed_subcloud:
        managed_sub_auth = Tenant.get('admin_platform', dc_region=managed_subcloud)
        auth_list.append(managed_sub_auth)

    LOG.tc_step("Unmanage {}".format(primary_subcloud))
    dc_helper.unmanage_subcloud(subcloud=primary_subcloud, check_first=True)

    LOG.tc_step("While {} is unmanaged, modify its NTP servers locally from {} to {}".
                format(primary_subcloud, prev_central_ntp, local_subcloud_ntp))
    system_helper.modify_ntp(ntp_servers=local_subcloud_ntp, auth_info=primary_sub_auth)

    LOG.tc_step("Reconfigure NTP servers on central region from {} to {}".format(prev_central_ntp, new_central_ntp))
    system_helper.modify_ntp(ntp_servers=new_central_ntp, auth_info=central_auth)

    if managed_subcloud:
        LOG.tc_step("Wait for new NTP config to sync over to managed subcloud: {}".format(managed_subcloud))
        dc_helper.wait_for_subcloud_ntp_config(subcloud=managed_subcloud, expected_ntp=new_central_ntp)

    LOG.tc_step("Ensure NTP config is not updated on unmanaged subcloud: {}".format(primary_subcloud))
    code = dc_helper.wait_for_subcloud_ntp_config(subcloud=primary_subcloud, expected_ntp=new_central_ntp,
                                                  timeout=60, fail_ok=True, clear_alarm=False)[0]
    assert 1 == code, "Actual return code: {}".format(code)
    assert local_subcloud_ntp == system_helper.get_ntp_servers(auth_info=primary_sub_auth)

    LOG.tc_step('Re-manage {} and ensure NTP config syncs over'.format(primary_subcloud))
    dc_helper.manage_subcloud(subcloud=primary_subcloud, check_first=False)
    dc_helper.wait_for_subcloud_ntp_config(subcloud=primary_subcloud, expected_ntp=new_central_ntp)

    LOG.tc_step('Verify NTP alarm appeared for invalid server 8.8.8.8 on central and managed subclouds')
    for auth_info in auth_list:
        system_helper.wait_for_alarm(alarm_id=EventLogID.NTP_ALARM, auth_info=auth_info, timeout=660)
Example 11
def test_increase_extensionfs_with_alarm():
    """
    This test increases the size of the extension controllerfs filesystem while there is an alarm condition for
    the fs.

    Arguments:
    - None

    Test Steps:
    - Query the filesystem for their current size
    - cause an alarm condition by filling the space on that fs
    - verify controller-0 is degraded
    - Increase the size of extension filesystem.
    - Verify alarm is gone

    Assumptions:
    - There is sufficient free space to allow for an increase, otherwise skip
      test.
    """
    file_loc = "/opt/extension"
    cmd = "cd " + file_loc
    file_path = file_loc + "/" + "testFile"
    drbdfs_val = {}
    fs = "extension"

    active_controller = system_helper.get_active_controller_name()

    LOG.tc_step("Determine the space available for extension filesystem")
    drbdfs_val[fs] = storage_helper.get_controllerfs_values(fs)[0]
    LOG.info("Current value of {} is {}".format(fs, drbdfs_val[fs]))

    # Fill roughly 91% of the current size to trigger the threshold alarm
    LOG.info(
        "Will attempt to fill up the space to ~91% of fs {} with current value of {}".
        format(fs, drbdfs_val[fs]))
    file_size = int((drbdfs_val[fs] * 0.91) * 1000)
    file_size = str(file_size) + "M"
    cmd1 = "fallocate -l {} testFile".format(file_size)
    con_ssh = ControllerClient.get_active_controller()
    con_ssh.exec_cmd(cmd)
    con_ssh.exec_sudo_cmd(cmd1)
    if not con_ssh.file_exists(file_path=file_path):
        LOG.info("File {} is not created".format(file_path))
        return 0

    # fill_in_fs(size=file_size)
    LOG.tc_step(
        "Verifying that the alarm is created after filling the fs space in {}".
        format(fs))
    system_helper.wait_for_alarm(alarm_id="100.104",
                                 entity_id=active_controller,
                                 timeout=600,
                                 strict=False)

    # verify the controller is in degraded state
    LOG.tc_step(
        "Verifying controller is degraded after filling the fs space in {}".
        format(fs))
    system_helper.wait_for_host_values(active_controller,
                                       availability='degraded')

    drbdfs_val[fs] = drbdfs_val[fs] + 2

    LOG.info("Will attempt to increase the value of {} to {}".format(
        fs, drbdfs_val[fs]))

    LOG.tc_step("Increase the size of extension filesystem")
    storage_helper.modify_controllerfs(**drbdfs_val)

    # Need to wait until the change takes effect before checking the
    # filesystems
    hosts = system_helper.get_controllers()
    for host in hosts:
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
            entity_id="host={}".format(host),
            timeout=600)
        LOG.tc_step(
            "Verifying that the alarm is cleared after increasing the fs space in {}"
            .format(fs))
        system_helper.wait_for_alarm_gone(alarm_id="100.104",
                                          entity_id="host={}".format(host),
                                          timeout=600,
                                          strict=False)

    LOG.tc_step(
        "Confirm the underlying filesystem size matches what is expected")
    storage_helper.check_controllerfs(**drbdfs_val)

    # verify the controller is in available state
    LOG.tc_step(
        "Verifying that the controller is in available state after increasing the fs space in {}"
        .format(fs))
    system_helper.wait_for_host_values(active_controller,
                                       availability='available')
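The test above creates /opt/extension/testFile with fallocate but, as shown, never removes it. A minimal cleanup sketch, reusing only the ControllerClient calls already used in this test (the helper name _remove_extension_test_file is hypothetical):

def _remove_extension_test_file(file_path="/opt/extension/testFile"):
    # Hypothetical teardown helper: delete the file that was fallocate'd to
    # fill the extension filesystem, then confirm it is gone.
    con_ssh = ControllerClient.get_active_controller()
    con_ssh.exec_sudo_cmd("rm -f {}".format(file_path))
    assert not con_ssh.file_exists(file_path=file_path), \
        "Failed to remove {}".format(file_path)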
Example 12
def _test_increase_ceph_mon():
    """
    Increase the size of ceph-mon.  Only applicable to a storage system.

    Fails until CGTS-8216

    Test steps:
    1.  Determine the current size of ceph-mon
    2.  Attempt to modify ceph-mon to invalid values
    3.  Check if there is free space to increase ceph-mon
    4.  Attempt to increase ceph-mon
    5.  Wait for config out-of-date alarms to raise
    6.  Lock/unlock all affected nodes (controllers and storage)
    7.  Wait for alarms to clear
    8.  Check that ceph-mon has the correct updated value

    Enhancement:
    1.  Possibly check there is enough disk space for ceph-mon to increase.  Not sure if
    this is required since there always seems to be some space on the rootfs.

    """
    table_ = table_parser.table(cli.system("ceph-mon-list")[1])
    ceph_mon_gib = table_parser.get_values(table_, "ceph_mon_gib",
                                           **{"hostname": "controller-0"})[0]
    LOG.info("ceph_mon_gib is currently: {}".format(ceph_mon_gib))

    LOG.tc_step("Attempt to modify ceph-mon to invalid values")
    invalid_cmg = ['19', '41', 'fds']
    for value in invalid_cmg:
        host = "controller-0"
        cli.system("ceph-mon-modify {} ceph_mon_gib={}".format(host, value),
                   fail_ok=True)

    if int(ceph_mon_gib) >= 30:
        skip("Insufficient disk space to execute test")

    # Grow ceph_mon_gib by one tenth of the headroom left under an assumed 40 GiB ceiling
    ceph_mon_gib_avail = 40 - int(ceph_mon_gib)
    new_ceph_mon_gib = math.trunc(ceph_mon_gib_avail / 10) + int(ceph_mon_gib)

    LOG.tc_step("Increase ceph_mon_gib to {}".format(new_ceph_mon_gib))
    hosts = system_helper.get_controllers()
    for host in hosts:
        cli.system("ceph-mon-modify {} ceph_mon_gib={}".format(
            host, new_ceph_mon_gib))
        # We only need to do this for one controller now and it applies to both
        break

    LOG.info("Wait for expected alarms to appear")
    storage_hosts = system_helper.get_storage_nodes()
    total_hosts = hosts + storage_hosts
    for host in total_hosts:
        system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                     entity_id="host={}".format(host))

    LOG.tc_step("Lock/unlock all affected nodes")
    for host in storage_hosts:
        HostsToRecover.add(host)
        host_helper.lock_host(host)
        host_helper.unlock_host(host)
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
            entity_id="host={}".format(host))
        time.sleep(10)

    standby = system_helper.get_standby_controller_name()
    active = system_helper.get_active_controller_name()
    HostsToRecover.add(standby)
    host_helper.lock_host(standby)
    host_helper.unlock_host(standby)
    system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                      entity_id="host={}".format(standby))
    time.sleep(10)
    host_helper.swact_host(active)
    HostsToRecover.add(active)
    host_helper.lock_host(active)
    host_helper.unlock_host(active)
    system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                      entity_id="host={}".format(active))

    table_ = table_parser.table(cli.system("ceph-mon-list")[1])
    ceph_mon_gib = table_parser.get_values(table_, "ceph_mon_gib",
                                           **{"hostname": "controller-0"})[0]
    assert int(ceph_mon_gib) == new_ceph_mon_gib, "ceph-mon did not change to the expected value"
Example 13
def test_resize_drbd_filesystem_while_resize_inprogress():
    """
    This test attempts to resize a drbd filesystem while an existing drbd
    resize is in progress.  This should be rejected.

    Arguments:
    - None

    Test steps:
    1.  Increase the size of the extension filesystem to allow the test to proceed.
    2.  Wait for alarms to clear and then check the underlying filesystem is
    updated.
    3.  Attempt to resize the database filesystem.  This should be successful.
    4.  Attempt to resize the database filesystem again immediately.  This should be rejected.

    Assumptions:
    - None

    """

    start_time = common.get_date_in_format()
    drbdfs_val = {}
    fs = "extension"
    LOG.tc_step(
        "Increase the {} size before proceeding with rest of test".format(fs))
    drbdfs_val[fs] = storage_helper.get_controllerfs_values(fs)[0]
    LOG.info("Current value of {} is {}".format(fs, drbdfs_val[fs]))
    drbdfs_val[fs] = int(drbdfs_val[fs]) + 5
    LOG.info("Will attempt to increase the value of {} to {}".format(
        fs, drbdfs_val[fs]))
    LOG.tc_step("Increase the size of filesystems")
    storage_helper.modify_controllerfs(**drbdfs_val)

    hosts = system_helper.get_controllers()
    for host in hosts:
        system_helper.wait_for_events(
            event_log_id=EventLogID.CONFIG_OUT_OF_DATE,
            start=start_time,
            entity_instance_id="host={}".format(host),
            strict=False,
            **{'state': 'set'})

    for host in hosts:
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
            entity_id="host={}".format(host),
            timeout=600)

    LOG.tc_step(
        "Confirm the underlying filesystem size matches what is expected")
    storage_helper.check_controllerfs(**drbdfs_val)

    drbdfs_val = {}
    fs = "database"
    LOG.tc_step("Determine the current filesystem size")
    value = storage_helper.get_controllerfs_values(fs)[0]
    LOG.info("Current value of {} is {}".format(fs, value))
    drbdfs_val[fs] = int(value) + 1
    LOG.info("Will attempt to increase the value of {} to {}".format(
        fs, drbdfs_val[fs]))

    LOG.tc_step("Increase the size of filesystems")
    storage_helper.modify_controllerfs(**drbdfs_val)

    LOG.tc_step("Attempt to increase the size of the filesystem again")
    drbdfs_val[fs] = int(drbdfs_val[fs]) + 1
    code = storage_helper.modify_controllerfs(fail_ok=True, **drbdfs_val)[0]
    assert 1 == code, "Filesystem modify succeeded while failure is expected: {}".format(
        drbdfs_val)

    # Appearance of sync alarm is delayed so wait for it to appear and then
    # clear
    if not system_helper.is_aio_simplex():
        system_helper.wait_for_alarm(alarm_id=EventLogID.CON_DRBD_SYNC,
                                     timeout=300)
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CON_DRBD_SYNC,
                                          timeout=300)
Example 14
def modify_https(enable_https=True,
                 check_first=True,
                 con_ssh=None,
                 auth_info=Tenant.get('admin_platform'),
                 fail_ok=False):
    """
    Modify platform https via 'system modify https_enable=<bool>'

    Args:
        enable_https (bool): True/False to enable https or not
        check_first (bool): if True, first check whether the lab is already
        in the requested state and do nothing if so
        con_ssh (SSHClient):
        auth_info (dict):
        fail_ok (bool):

    Returns (tuple):
        (-1, msg)
        (0, msg)
        (1, <std_err>)

    """
    if check_first:
        is_https = keystone_helper.is_https_enabled(source_openrc=False,
                                                    auth_info=auth_info,
                                                    con_ssh=con_ssh)
        if (is_https and enable_https) or (not is_https and not enable_https):
            msg = "Https is already {}. Do nothing.".format(
                'enabled' if enable_https else 'disabled')
            LOG.info(msg)
            return -1, msg

    LOG.info("Modify system to {} https".format(
        'enable' if enable_https else 'disable'))
    res, output = system_helper.modify_system(fail_ok=fail_ok,
                                              con_ssh=con_ssh,
                                              auth_info=auth_info,
                                              https_enabled='{}'.format(
                                                  str(enable_https).lower()))
    if res == 1:
        return 1, output

    LOG.info("Wait up to 60s for config out-of-date alarm with best effort.")
    system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                 entity_id='controller-',
                                 strict=False,
                                 con_ssh=con_ssh,
                                 timeout=60,
                                 fail_ok=True,
                                 auth_info=auth_info)

    LOG.info("Wait up to 600s for config out-of-date alarm to clear.")
    system_helper.wait_for_alarm_gone(EventLogID.CONFIG_OUT_OF_DATE,
                                      con_ssh=con_ssh,
                                      timeout=600,
                                      check_interval=20,
                                      fail_ok=False,
                                      auth_info=auth_info)

    LOG.info("Wait up to 300s for public endpoints to be updated")
    expt_status = 'enabled' if enable_https else 'disabled'
    end_time = time.time() + 300
    while time.time() < end_time:
        if keystone_helper.is_https_enabled(con_ssh=con_ssh,
                                            source_openrc=False,
                                            auth_info=auth_info) == \
                enable_https:
            break
        time.sleep(10)
    else:
        raise exceptions.KeystoneError(
            "Https is not {} in 'openstack endpoint list'".format(expt_status))

    msg = 'Https is {} successfully'.format(expt_status)
    LOG.info(msg)
    # TODO: install certificate for https. There will be a warning msg if
    #  self-signed certificate is used

    if not ProjVar.get_var('IS_DC') or \
            (auth_info and auth_info.get('region', None) in (
            'RegionOne', 'SystemController')):
        # If DC, use the central region https as system https, since that is
        # the one used for external access
        CliAuth.set_vars(HTTPS=enable_https)

    return 0, msg
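A minimal usage sketch for this helper, assuming a caller that enables HTTPS and restores the original setting afterwards (the test name and the bare is_https_enabled() call with default connection arguments are assumptions):

def test_https_round_trip():
    # Record the current state, enable https (no-op if already enabled),
    # then restore the original state. Return codes follow the tuple
    # convention documented above: -1 already in state, 0 modified, 1 error.
    original_https = keystone_helper.is_https_enabled(source_openrc=False)
    code, msg = modify_https(enable_https=True, check_first=True)
    assert code in (-1, 0), msg
    try:
        pass  # exercise https endpoints here
    finally:
        modify_https(enable_https=original_https, check_first=True)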
Example 15
def test_storgroup_semantic_checks():
    """
    This test validates CEPH semantic checks as it applies to storage nodes in
    a replication group.

    Args:
        - None

    Setup:
        - Requires a system with storage nodes (minimum of 2)
        - Requires TiS Release 3 and up

    Test Steps:
        1.  Lock one storage node in a storage node pair
        2.  Check the appropriate alarms are raised
        3.  Check OSDs are down on the storage node
        4.  Check that CEPH is no longer healthy
        5.  Attempt to lock the other node and ensure it is rejected
        6.  Attempt to force lock the other node and ensure it is rejected
        7.  If the storage node is a storage monitor, attempt to lock and force
            lock the controllers
        8.  Unlock the storage node in the storage node pair
        9.  Check that the alarms are cleared
        10.  Check that OSDs are up
        11.  Check that CEPH is healthy

    Defects this addresses:
        1.  CGTS-4286 Unexpected allowing lock action on storage node peergroup
            when redundancy lost
        2.  CGTS-3494 Some OSDs observed to be up on locked storage node
        3.  CGTS-3643 Able to lock standby controller despite only two CEPH
            monitors being available
        4.  CGTS-2690 Storage: Force locking a controller should be rejected when storage
            is locked.
    """

    con_ssh = ControllerClient.get_active_controller()

    table_ = table_parser.table(cli.system('storage-backend-show ceph-store')[1])
    capabilities = table_parser.get_value_two_col_table(table_, 'capabilities')
    replication_factor = capabilities[1]
    LOG.info("The replication factor is: {}".format(replication_factor))

    # We want to test storage-0 since it is a ceph monitor
    # Then we want to test another storage host in another group.  The choice
    # depends on the replication factor.
    storage_nodes = ["storage-0"]
    if replication_factor == "3":
        storage_nodes.append("storage-3")

    if replication_factor == "2" and len(storage_nodes) > 2:
        storage_nodes.append("storage-2")

    LOG.info("Storage hosts under test are: {}".format(storage_nodes))

    for host in storage_nodes:
        LOG.tc_step('Lock {}:'.format(host))
        HostsToRecover.add(host, scope='function')
        rtn_code, out = host_helper.lock_host(host)
        assert rtn_code == 0, out

        LOG.tc_step("Verify CEPH cluster health reflects the OSD being down")
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy, "ceph is unexpectedly healthy after locking {}".format(host)

        LOG.tc_step('Check that alarms are raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that the loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that the ceph health warning alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        hosts = []
        if host == 'storage-0':
            hosts.append('controller-0')
            hosts.append('controller-1')

        for node in hosts:
            LOG.tc_step('Attempt to lock the {}'.format(node))
            HostsToRecover.add(node)
            rtn_code, out = host_helper.lock_host(node, fail_ok=True)
            assert 1 == rtn_code, out

            LOG.tc_step('Attempt to force lock {}'.format(node))
            rtn_code, out = host_helper.lock_host(node, force=True, fail_ok=True)
            assert 1 == rtn_code, out

        LOG.tc_step('Unlock storage host {}'.format(host))
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        LOG.info("Check if alarms have cleared")
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert ceph_healthy, "ceph is not healthy"

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg
Example 16
def test_dc_fault_scenario(subcloud_to_test):
    """
    Test Fault Scenario on Distributed Cloud
    Args:
        subcloud_to_test (str): module fixture

    Setup:
        - Make sure there is consistency between alarm summary on
        Central Cloud and on subclouds

    Test Steps:
        - Make subcloud offline (e.g. delete route)
        Step1:
        - Ensure subcloud shows offline
        Step2:
        - Raise alarm on subcloud
        - Ensure relative alarm raised on subcloud
        - Ensure system alarm-summary on subcloud has changed
        - Ensure dcmanager alarm summary on system controller has no change
        Step3:
        - Resume connectivity to subcloud (e.g. add route back)
        - Ensure subcloud shows online and in-sync
        - Ensure system alarm-summary on subcloud matches dcmanager alarm summary on system
        controller
        Step4:
        - Clear alarm on subcloud
        - Ensure relative alarm cleared on subcloud
        - Ensure system alarm-summary on subcloud matches dcmanager alarm summary on system
        controller
    """
    ssh_central = ControllerClient.get_active_controller(name="RegionOne")
    ssh_subcloud = ControllerClient.get_active_controller(
        name=subcloud_to_test)
    subcloud_table = {}
    try:
        code, output = cli.dcmanager(
            "subcloud show {}".format(subcloud_to_test),
            ssh_client=ssh_central)
        gateway = table_parser.get_value_two_col_table(
            table_parser.table(output), "management_gateway_ip")
        code, hosts_raw = cli.system("host-list", ssh_client=ssh_subcloud)
        hosts_id = table_parser.get_values(table_parser.table(hosts_raw), 'id')
        for host_id in hosts_id:
            code, route_raw = cli.system("host-route-list {}".format(host_id),
                                         ssh_client=ssh_subcloud)
            route_table = table_parser.filter_table(
                table_parser.table(route_raw), **{'gateway': gateway})
            subcloud_table[host_id] = route_table

        LOG.tc_step(
            "Delete route for subcloud: {} and wait for it to go offline.".
            format(subcloud_to_test))
        ssh_subcloud = ControllerClient.get_active_controller(
            name=subcloud_to_test)
        for host_id in subcloud_table:
            command = "host-route-delete {}".format(
                table_parser.get_values(subcloud_table[host_id], "uuid")[0])
            cli.system(command, ssh_client=ssh_subcloud)

        dc_helper.wait_for_subcloud_status(subcloud_to_test,
                                           avail=SubcloudStatus.AVAIL_OFFLINE,
                                           timeout=DCTimeout.SYNC,
                                           con_ssh=ssh_central)

        LOG.tc_step("Raise alarm on subcloud: {}".format(subcloud_to_test))
        ssh_subcloud = ControllerClient.get_active_controller(
            name=subcloud_to_test)
        code_sub_before, output_sub_before = cli.fm("alarm-summary",
                                                    ssh_client=ssh_subcloud)
        code_central_before, output_central_before = cli.dcmanager(
            'alarm summary')
        ssh_subcloud.exec_cmd(
            "fmClientCli -c \"### ###300.005###clear###system.vm###host="
            "testhost-0### ###critical### ###processing-error###cpu-cycles-limit-exceeded"
            "### ###True###True###'\"",
            fail_ok=False)
        LOG.info("Ensure relative alarm was raised at subcloud: {}".format(
            subcloud_to_test))
        system_helper.wait_for_alarm(
            alarm_id=EventLogID.PROVIDER_NETWORK_FAILURE, con_ssh=ssh_subcloud)
        code_sub_after, output_sub_after = cli.fm("alarm-summary",
                                                  ssh_client=ssh_subcloud)
        code_central_after, output_central_after = cli.dcmanager(
            'alarm summary')
        LOG.info(
            "Ensure fm alarm summary on subcloud: {} has changed but dcmanager alarm "
            "summary has not changed".format(subcloud_to_test))
        assert output_central_before == output_central_after and output_sub_before != \
            output_sub_after

        add_routes_to_subcloud(subcloud_to_test, subcloud_table)

        dc_helper.wait_for_subcloud_status(subcloud_to_test,
                                           avail=SubcloudStatus.AVAIL_ONLINE,
                                           sync=SubcloudStatus.SYNCED,
                                           timeout=DCTimeout.SYNC,
                                           con_ssh=ssh_central)
        alarm_summary_add_and_del(subcloud_to_test)

    finally:
        cli.dcmanager("subcloud show {}".format(subcloud_to_test),
                      ssh_client=ssh_central,
                      fail_ok=True)
        add_routes_to_subcloud(subcloud_to_test, subcloud_table, fail_ok=True)
        LOG.info("Clear alarm on subcloud: {}".format(subcloud_to_test))
        ssh_subcloud.exec_cmd('fmClientCli -D host=testhost-0')
        check_alarm_summary_match_subcloud(subcloud=subcloud_to_test)
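The raw fmClientCli commands used above to raise and clear the 300.005 test alarm can be wrapped in small helpers; a sketch that reuses the exact command strings from this example (the helper names are hypothetical):

def raise_test_alarm(ssh_client):
    # Raise the 300.005 test alarm against host=testhost-0 using the same
    # fmClientCli -c record format as in the test above.
    ssh_client.exec_cmd(
        "fmClientCli -c \"### ###300.005###clear###system.vm###host="
        "testhost-0### ###critical### ###processing-error###cpu-cycles-limit-exceeded"
        "### ###True###True###'\"",
        fail_ok=False)


def clear_test_alarm(ssh_client):
    # Delete all alarms raised against host=testhost-0.
    ssh_client.exec_cmd('fmClientCli -D host=testhost-0')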
Example 17
def test_patch_orch_with_ignored_alarms(patch_orchestration_setup, patch_function_check, ignored_alarm_texts):
    """
    This test verifies the patch orchestration operation in the presence of alarms that are normally ignored by the
    orchestration. These alarms are '200.001', '700.004', '900.001', '900.005', '900.101'. This test generates the
    alarms host lock (200.001) and VM stopped (700.004) before executing the patch orchestration.
    Args:
        patch_orchestration_setup:
        patch_function_check
        ignored_alarm_texts:

    Returns:

    """
    vms = patch_function_check
    patches, controllers, computes, storages = patch_orchestration_setup
    hosts = controllers + computes + storages
    patch_id = patching_helper.parse_test_patches(patch_ids=patches, search_str='INSVC_ALLNODES')[0]

    if 'HOST_LOCK' in ignored_alarm_texts and len(hosts) < 2:
        skip("Not enough hosts present in the system")

    if 'HOST_LOCK' in ignored_alarm_texts:
        host = hosts[-1]
        HostsToRecover.add(host)
        LOG.info("Lock host {} to generate 200.001 alarm".format(host))
        host_helper.lock_host(host)
        system_helper.wait_for_alarm(alarm_id='200.001', fail_ok=False)
        LOG.info("Host {} is locked and 200.001 alarm is generated".format(host))

    vm_id_to_stop = None
    if 'VM_STOP' in ignored_alarm_texts:
        vm_id_to_stop = vms[0]
        LOG.info("Stop VM {} to generate 700.004 alarm".format(vm_id_to_stop))
        vm_helper.stop_vms(vm_id_to_stop)
        system_helper.wait_for_alarm(alarm_id='700.004')

    patch_file = patches[patch_id]

    LOG.tc_step("Upload patch file {}".format(patch_file))
    uploaded_id = patching_helper.upload_patches(patch_files=patch_file)[1][0]
    assert patch_id == uploaded_id, "Expected patch {} and uploaded patch {} mismatch".format(patch_id, uploaded_id)
    LOG.info("Patch {} uploaded".format(uploaded_id))

    LOG.tc_step("Apply patch {}".format(uploaded_id))
    applied = patching_helper.apply_patches(patch_ids=[uploaded_id])[1]
    LOG.info("Patch {} applied".format(applied))

    LOG.tc_step("Install patch {} through orchestration".format(uploaded_id))
    patching_helper.wait_for_affecting_alarms_gone()
    run_patch_orchestration_strategy()
    LOG.info("Install patch through orchestration completed for patch {}".format(applied))
    host_helper.wait_for_hosts_ready(hosts=hosts)

    LOG.tc_step("Check vms after patch is installed.")
    if vm_id_to_stop:
        vm_helper.start_vms(vm_id_to_stop)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id_to_stop)
    check_vms(vms)

    LOG.tc_step("Remove test patch {}".format(applied))
    if vm_id_to_stop:
        vm_helper.stop_vms(vm_id_to_stop)

    patching_helper.remove_patches(patch_ids=applied)

    LOG.tc_step("Remove patch through orchestration: {}".format(applied))
    run_patch_orchestration_strategy(alarm_restrictions='relaxed')
    LOG.info("Apply/Remove through patch orchestration completed for patch {}".format(applied))

    LOG.tc_step("Check vms after patch removed: {}.".format(applied))
    if vm_id_to_stop:
        vm_helper.start_vms(vm_id_to_stop)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id_to_stop)
    check_vms(vms)
Example 18
def test_ceph_mon_process_kill(monitor, ceph_monitors):
    """
    us69932_tc2_ceph_mon_process_kill from us69932_ceph_monitoring.odt

    Verify that ceph mon processes recover when they are killed.

    Args:
        - Nothing

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Run CEPH pre-check fixture to check:
            - system has storage nodes
            - health of the ceph cluster is okay
            - that we have OSDs provisioned
        2.  Pick one ceph monitor and remove it from the quorum
        3.  Kill the monitor process
        4.  Check that the appropriate alarms are raised
        5.  Restore the monitor to the quorum
        6.  Check that the alarms clear
        7.  Ensure the ceph monitor is restarted under a different pid

    Potential flaws:
        1.  We're not checking if unexpected alarms are raised (TODO)

    Teardown:
        - None

    What defects this addresses:
        1.  CGTS-2975

    """
    if monitor not in ceph_monitors:
        skip("{} is not a ceph monitor".format(monitor))

    LOG.tc_step('Get process ID of ceph monitor')
    mon_pid = storage_helper.get_mon_pid(monitor)

    with host_helper.ssh_to_host(monitor) as host_ssh:
        with host_ssh.login_as_root() as root_ssh:
            LOG.tc_step('Remove the monitor')
            cmd = 'ceph mon remove {}'.format(monitor)
            root_ssh.exec_cmd(cmd)

            LOG.tc_step('Stop the ceph monitor')
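            # On AIO duplex systems the mon service is named 'controller'
            # rather than after the individual host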
            cmd = 'service ceph stop mon.{}'.format('controller' if system_helper.is_aio_duplex() else monitor)
            root_ssh.exec_cmd(cmd)

    LOG.tc_step('Check that ceph monitor failure alarm is raised')
    system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_DEGRADE, timeout=300)

    with host_helper.ssh_to_host(monitor) as host_ssh:
        with host_ssh.login_as_root() as root_ssh:
            LOG.tc_step('Get cluster fsid')
            cmd = 'ceph fsid'
            fsid = host_ssh.exec_cmd(cmd)[0]
            ceph_conf = '/etc/ceph/ceph.conf'

            LOG.tc_step('Remove old ceph monitor directory')
            cmd = 'rm -rf /var/lib/ceph/mon/ceph-{}'.format(monitor)
            root_ssh.exec_cmd(cmd)

            LOG.tc_step('Re-add the monitor')
            cmd = 'ceph-mon -i {} -c {} --mkfs --fsid {}'.format(monitor, ceph_conf, fsid)
            root_ssh.exec_cmd(cmd)

    LOG.tc_step('Check the ceph storage alarm condition clears')
    system_helper.wait_for_alarm_gone(alarm_id=EventLogID.STORAGE_DEGRADE, timeout=360)

    LOG.tc_step('Check the ceph-mon process is restarted with a different pid')
    mon_pid2 = None
    for i in range(0, PROC_RESTART_TIME):
        mon_pid2 = storage_helper.get_mon_pid(monitor, fail_ok=True)
        if mon_pid2 and mon_pid2 != mon_pid:
            break
        time.sleep(5)

    LOG.info('Old pid is {} and new pid is {}'.format(mon_pid, mon_pid2))
    msg = 'Process did not restart in time'
    assert mon_pid2 and mon_pid2 != mon_pid, msg
Example 19
def _test_basic_swift_provisioning(pool_size, pre_swift_check):
    """
    Verifies basic swift provisioning works as expected
    Args:
        pool_size:
        pre_swift_check:

    Returns:

    """
    ceph_backend_info = get_ceph_backend_info()

    if pool_size == 'default' and pre_swift_check[0]:
        skip("Swift is already provisioned")

    if pool_size == 'fixed_size' and pre_swift_check[0]:
        skip("Swift is already provisioned and set to non-default pool value")

    object_pool_gib = None
    cinder_pool_gib = ceph_backend_info['cinder_pool_gib']

    if pool_size == 'default':
        if not ceph_backend_info['object_gateway']:
            LOG.tc_step("Enabling SWIFT object store .....")

    else:
        if not ceph_backend_info['object_gateway']:
            skip("Swift is not provisioned")

        total_gib = ceph_backend_info['ceph_total_space_gib']
        unallocated_gib = (total_gib - cinder_pool_gib -
                           ceph_backend_info['glance_pool_gib'] -
                           ceph_backend_info['ephemeral_pool_gib'])
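        # If nothing is unallocated, carve a quarter of the cinder pool out of
        # cinder and use it for the swift object pool instead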
        if unallocated_gib == 0:
            unallocated_gib = int(int(cinder_pool_gib) / 4)
            cinder_pool_gib = str(int(cinder_pool_gib) - unallocated_gib)
        elif unallocated_gib < 0:
            skip("Unallocated gib < 0. System is in unknown state.")

        object_pool_gib = str(unallocated_gib)
        LOG.tc_step(
            "Enabling SWIFT object store and setting object pool size to {}....."
            .format(object_pool_gib))

    rc, updated_backend_info = storage_helper.modify_storage_backend(
        'ceph',
        object_gateway=False,
        cinder=cinder_pool_gib,
        object_gib=object_pool_gib,
        services='cinder,glance,nova,swift')

    LOG.info("Verifying if swift object gateway is enabled...")
    assert str(updated_backend_info['object_gateway']).lower() == 'true', "Fail to enable Swift object gateway: {}"\
        .format(updated_backend_info)
    LOG.info("Swift object gateway is enabled.")

    LOG.info("Verifying ceph task ...")
    state = storage_helper.get_storage_backends(backend='ceph',
                                                field='state')[0]
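    # A config out-of-date alarm means the swift change needs a lock/unlock of
    # both controllers to apply; otherwise the backend should apply in place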
    if system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                    timeout=10,
                                    fail_ok=True,
                                    entity_id='controller-')[0]:
        LOG.info("Verifying ceph task is set to 'add-object-gateway'...")
        assert BackendState.CONFIGURING == state, \
            "Unexpected ceph state '{}' after swift object gateway update ".format(state)

        LOG.info("Lock/Unlock controllers...")
        active_controller, standby_controller = system_helper.get_active_standby_controllers()
        LOG.info("Active Controller is {}; Standby Controller is {}...".format(
            active_controller, standby_controller))

        for controller in [standby_controller, active_controller]:
            if not controller:
                continue
            HostsToRecover.add(controller)
            host_helper.lock_host(controller, swact=True)
            storage_helper.wait_for_storage_backend_vals(
                backend='ceph-store',
                **{
                    'task': BackendTask.RECONFIG_CONTROLLER,
                    'state': BackendState.CONFIGURING
                })
            host_helper.unlock_host(controller)

        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE, fail_ok=False)
    else:
        assert BackendState.CONFIGURED == state, \
            "Unexpected ceph state '{}' after swift object gateway update ".format(state)

    LOG.info("Verifying Swift provisioning setups...")
    assert verify_swift_object_setup(), "Failure in swift setups"

    for i in range(3):
        vm_name = 'vm_swift_api_{}'.format(i)
        LOG.tc_step(
            "Boot vm {} and perform nova actions on it".format(vm_name))
        vm_id = vm_helper.boot_vm(name=vm_name, cleanup='function')[1]
        vm_helper.wait_for_vm_pingable_from_natbox(
            vm_id, timeout=VMTimeout.DHCP_RETRY)

        LOG.info("Cold migrate VM {} ....".format(vm_name))
        rc = vm_helper.cold_migrate_vm(vm_id=vm_id)[0]
        assert rc == 0, "VM {} failed to cold migrate".format(vm_name)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

        LOG.info("Live migrate VM {} ....".format(vm_name))
        rc = vm_helper.live_migrate_vm(vm_id=vm_id)[0]
        assert rc == 0, "VM {} failed to live migrate".format(vm_name)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

        LOG.info("Suspend/Resume VM {} ....".format(vm_name))
        vm_helper.suspend_vm(vm_id)
        vm_helper.resume_vm(vm_id)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

    LOG.info("Checking overall system health...")
    assert system_helper.get_system_health_query(), "System health not OK after VMs"

    LOG.tc_step("Create Swift container using swift post cli command ...")
    container_names = [
        "test_container_1", "test_container_2", "test_container_3"
    ]

    for container in container_names:
        LOG.info("Creating swift object container {}".format(container))
        rc, out = swift_helper.create_swift_container(container)
        assert rc == 0, "Fail to create swift container {}".format(container)
        LOG.info(
            "Create swift object container {} successfully".format(container))

    LOG.tc_step("Verify swift list to list containers ...")
    container_list = swift_helper.get_swift_containers()[1]
    assert set(container_names) <= set(container_list), "Swift containers {} not listed in {}"\
        .format(container_names, container_list)

    LOG.tc_step("Verify swift delete a container...")
    container_to_delete = container_names[2]
    rc, out = swift_helper.delete_swift_container(container_to_delete)
    assert rc == 0, "Swift delete container rejected: {}".format(out)
    assert container_to_delete not in swift_helper.get_swift_containers()[1], "Unable to delete swift container {}"\
        .format(container_to_delete)

    LOG.tc_step("Verify swift stat to show info of a single container...")
    container_to_stat = container_names[0]
    out = swift_helper.get_swift_container_stat_info(container_to_stat)
    assert out["Container"] == container_to_stat, "Unable to stat swift container {}"\
        .format(container_to_stat)
    assert out["Objects"] == '0', "Incorrect number of objects container {}. Expected O objects, but has {} objects"\
        .format(container_to_stat, out["Objects"])
Ejemplo n.º 20
0
def test_system_alarms_and_events_on_lock_unlock_compute(no_simplex):
    """
    Verify fm alarm-show command

    Test Steps:
    - Delete active alarms
    - Lock a host
    - Check active alarm generated for host lock
    - Check relative values are the same in fm alarm-list and fm alarm-show <uuid>
    - Check host lock 'set' event logged via fm event-list
    - Unlock host
    - Check active alarms cleared via fm alarm-list
    - Check host lock 'clear' event logged via fm event-list
    """

    # Remove following step because it's unnecessary and fails the test when
    # alarm is re-generated
    # # Clear the alarms currently present
    # LOG.tc_step("Clear the alarms table")
    # system_helper.delete_alarms()

    # Raise a new alarm by locking a compute node
    # Get the compute
    compute_host = host_helper.get_up_hypervisors()[0]
    if compute_host == system_helper.get_active_controller_name():
        compute_host = system_helper.get_standby_controller_name()
        if not compute_host:
            skip('Standby controller unavailable')

    LOG.tc_step("Lock a nova hypervisor host {}".format(compute_host))
    pre_lock_time = common.get_date_in_format()
    HostsToRecover.add(compute_host)
    host_helper.lock_host(compute_host)

    LOG.tc_step("Check host lock alarm is generated")
    post_lock_alarms = \
        system_helper.wait_for_alarm(field='UUID', entity_id=compute_host,
                                     reason=compute_host,
                                     alarm_id=EventLogID.HOST_LOCK,
                                     strict=False,
                                     fail_ok=False)[1]

    LOG.tc_step(
        "Check related fields in fm alarm-list and fm alarm-show are of the "
        "same values")
    post_lock_alarms_tab = system_helper.get_alarms_table(uuid=True)

    alarms_l = ['Alarm ID', 'Entity ID', 'Severity', 'Reason Text']
    alarms_s = ['alarm_id', 'entity_instance_id', 'severity', 'reason_text']

    # Only 1 alarm since we are now checking the specific alarm ID
    for post_alarm in post_lock_alarms:
        LOG.tc_step(
            "Verify {} for alarm {} in alarm-list are in sync with "
            "alarm-show".format(
                alarms_l, post_alarm))

        alarm_show_tab = table_parser.table(cli.fm('alarm-show', post_alarm)[1])
        alarm_list_tab = table_parser.filter_table(post_lock_alarms_tab,
                                                   UUID=post_alarm)

        for i in range(len(alarms_l)):
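            # alarm-list output is a regular table (read by column), while
            # alarm-show output is a two-column field/value table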
            alarm_l_val = table_parser.get_column(alarm_list_tab,
                                                  alarms_l[i])[0]
            alarm_s_val = table_parser.get_value_two_col_table(alarm_show_tab,
                                                               alarms_s[i])

            assert alarm_l_val == alarm_s_val, \
                "{} value in alarm-list: {} is different than alarm-show: " \
                "{}".format(alarms_l[i], alarm_l_val, alarm_s_val)

    LOG.tc_step("Check host lock is logged via fm event-list")
    system_helper.wait_for_events(entity_instance_id=compute_host,
                                  start=pre_lock_time, timeout=60,
                                  event_log_id=EventLogID.HOST_LOCK,
                                  fail_ok=False, **{'state': 'set'})

    pre_unlock_time = common.get_date_in_format()
    LOG.tc_step("Unlock {}".format(compute_host))
    host_helper.unlock_host(compute_host)

    LOG.tc_step("Check host lock active alarm cleared")
    alarm_sets = [(EventLogID.HOST_LOCK, compute_host)]
    system_helper.wait_for_alarms_gone(alarm_sets, fail_ok=False)

    LOG.tc_step("Check host lock clear event logged")
    system_helper.wait_for_events(event_log_id=EventLogID.HOST_LOCK,
                                  start=pre_unlock_time,
                                  entity_instance_id=compute_host,
                                  fail_ok=False, **{'state': 'clear'})
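
Stripped of the CLI and table-parsing plumbing, the list-vs-show consistency check above reduces to comparing two mappings; a minimal framework-free sketch (the field names follow the test, everything else is illustrative):

def check_alarm_list_show_consistent(list_row, show_fields):
    """Compare one fm alarm-list row against fm alarm-show output for the same alarm."""
    pairs = [('Alarm ID', 'alarm_id'), ('Entity ID', 'entity_instance_id'),
             ('Severity', 'severity'), ('Reason Text', 'reason_text')]
    mismatches = {list_col: (list_row[list_col], show_fields[show_key])
                  for list_col, show_key in pairs
                  if list_row[list_col] != show_fields[show_key]}
    assert not mismatches, 'alarm-list vs alarm-show mismatch: {}'.format(mismatches)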
Ejemplo n.º 21
0
def apply_app(app_name,
              check_first=False,
              fail_ok=False,
              applied_timeout=300,
              check_interval=10,
              wait_for_alarm_gone=True,
              con_ssh=None,
              auth_info=Tenant.get('admin_platform')):
    """
    Apply/Re-apply application via system application-apply. Check for status
    reaches 'applied'.
    Args:
        app_name (str):
        check_first:
        fail_ok:
        applied_timeout:
        check_interval:
        con_ssh:
        wait_for_alarm_gone (bool):
        auth_info:

    Returns (tuple):
        (-1, "<app_name> is already applied. Do nothing.")     # only returns
        if check_first=True.
        (0, "<app_name> (re)applied successfully")
        (1, <std_err>)  # cli rejected
        (2, "<app_name> failed to apply")   # did not reach applied status
        after apply.

    """
    if check_first:
        app_status = get_apps(application=app_name,
                              field='status',
                              con_ssh=con_ssh,
                              auth_info=auth_info)
        if app_status and app_status[0] == AppStatus.APPLIED:
            msg = '{} is already applied. Do nothing.'.format(app_name)
            LOG.info(msg)
            return -1, msg

    LOG.info("Apply application: {}".format(app_name))
    code, output = cli.system('application-apply',
                              app_name,
                              ssh_client=con_ssh,
                              fail_ok=fail_ok,
                              auth_info=auth_info)
    if code > 0:
        return 1, output

    res = wait_for_apps_status(apps=app_name,
                               status=AppStatus.APPLIED,
                               timeout=applied_timeout,
                               check_interval=check_interval,
                               con_ssh=con_ssh,
                               auth_info=auth_info,
                               fail_ok=fail_ok)[0]
    if not res:
        return 2, "{} failed to apply".format(app_name)

    if wait_for_alarm_gone:
        alarm_id = EventLogID.CONFIG_OUT_OF_DATE
        if system_helper.wait_for_alarm(alarm_id=alarm_id,
                                        entity_id='controller',
                                        timeout=15,
                                        fail_ok=True,
                                        auth_info=auth_info,
                                        con_ssh=con_ssh)[0]:
            system_helper.wait_for_alarm_gone(alarm_id=alarm_id,
                                              entity_id='controller',
                                              timeout=120,
                                              check_interval=10,
                                              con_ssh=con_ssh,
                                              auth_info=auth_info)

    msg = '{} (re)applied successfully'.format(app_name)
    LOG.info(msg)
    return 0, msg
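
A minimal usage sketch based on the documented return codes (the application name is only an example):

code, msg = apply_app('platform-integ-apps', check_first=True, applied_timeout=600)
assert code in (-1, 0), msg  # -1: already applied, 0: (re)applied successfully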
Ejemplo n.º 22
0
def test_lock_stor_check_osds_down(stx_openstack_required, host):
    """
    This test is adapted from
    us69932_tc3_ceph_mon_maintenance_operations from us69932_ceph_monitoring.odt

    The goal of this test is to check that all OSDs go down on a locked storage
    node.  There are two variants:

    1.  Lock 'storage-0' which is a ceph monitor
    2.  Lock a storage node that is not 'storage-0', i.e. not a ceph monitor

    Args:
        - None

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Lock storage node
        2.  Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs on the locked storage node are down
            - Check that the appropriate alarms are raised:
        3.  Unlock storage node
            - ensure CEPH is HEALTH_OK
            - ensure all OSDs on unlocked node are up
            - Check that alarms are cleared

    Note: If the storage node to be locked is monitor, we also expect to see
    the mon down alarm.

    What defects this addresses:
        1.  CGTS-2609 - Ceph processes fail to start after storage node reboot

    Notes:
        - Updated test to write to disk to add I/O load on system

    """

    con_ssh = ControllerClient.get_active_controller()

    if host == 'any':
        storage_nodes = system_helper.get_hosts(personality='storage')
        LOG.info('System storage nodes: {}'.format(storage_nodes))
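        # storage-0 hosts a ceph monitor, so exclude it when picking an
        # arbitrary non-monitor storage node for this variant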
        storage_nodes.remove('storage-0')
        host = random.choice(storage_nodes)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")
    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event, expect_timeout=40)
            vm_threads.append(vm_thread)

        LOG.tc_step('Lock storage node {}'.format(host))
        HostsToRecover.add(host)
        host_helper.lock_host(host, check_first=False)

        LOG.tc_step('Determine the storage group for host {}'.format(host))
        storage_group, msg = storage_helper.get_storage_group(host)
        LOG.info(msg)

        LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host, strict=False)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that the CEPH cluster is no longer healthy')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy, "CEPH cluster is unexpectedly healthy after locking {}".format(host)

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that ceph is in health warn')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        # We're waiting 5 minutes for ceph rebalancing to be performed
        # DO NOT REMOVE.  This is part of the test.
        time.sleep(300)

        LOG.tc_step('Unlock storage node')
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        health = False
        end_time = time.time() + 40
        while time.time() < end_time:
            health = storage_helper.is_ceph_healthy(con_ssh)
            if health is True:
                break
        assert health, "Ceph did not become healthy"

        LOG.tc_step('Check that host lock alarm is cleared when {} is unlocked'.format(host))
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host, strict=False), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that the replication group alarm is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)
        LOG.tc_step('Check that the Storage Alarm Condition is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = False
        end_time = time.time() + 40
        while time.time() < end_time:
            ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
            if ceph_healthy is True:
                break
        assert ceph_healthy, "Ceph did not become healthy after unlocking {}".format(host)

        for vm_thread in vm_threads:
            assert vm_thread.res is True, "Writing in vm stopped unexpectedly"
    finally:
        # wait_for_thread_end needs to be called even if test failed in the middle, otherwise thread will not end
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()
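
The end_event/wait_for_thread_end handling above is the usual stop-flag pattern for background writer threads; a minimal framework-free sketch (write_once stands in for the dd writes performed inside the VMs):

import threading
import time


def keep_writing(stop_event, write_once, interval=1):
    """Call write_once() repeatedly until stop_event is set."""
    while not stop_event.is_set():
        write_once()
        time.sleep(interval)


stop = threading.Event()
worker = threading.Thread(target=keep_writing, args=(stop, lambda: None))
worker.start()
try:
    pass  # run the lock/unlock scenario here
finally:
    stop.set()              # always signal the thread, even if the test fails mid-way
    worker.join(timeout=20)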
Ejemplo n.º 23
0
def test_lock_cont_check_mon_down():
    """
    This test is adapted from
    us69932_tc3_ceph_mon_maintenance_operations from us69932_ceph_monitoring.odt

    The goal of this test is to check that we alarm when a CEPH monitor goes
    down.  This test is specifically for controller hosts.

    Args:
        - None

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Lock controller node
        2.  Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs stay up
            - Check that the appropriate alarms are raised:
              - controller-X is locked
              - ceph mon down
        3.  Unlock controller node
            - ensure CEPH is HEALTH_OK
            - Check that alarms are cleared

    Enhancements:
       1.  Should we do both controllers?  This will require a swact.
    """

    con_ssh = ControllerClient.get_active_controller()

    host = system_helper.get_standby_controller_name()
    LOG.tc_step('Lock standby controller node {}'.format(host))
    HostsToRecover.add(host, scope='function')
    rtn_code, out = host_helper.lock_host(host)
    assert rtn_code == 0, out

    LOG.tc_step('Check that the storage alarm condition is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
        "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
        "Alarm {} not raised".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check OSDs are still up after lock')
    osd_list = storage_helper.get_osds(con_ssh=con_ssh)
    for osd_id in osd_list:
        osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
        msg = 'OSD ID {} should be up but is not'.format(osd_id)
        assert osd_up, msg
        msg = 'OSD ID {} is up'.format(osd_id)
        LOG.info(msg)

    LOG.tc_step('Unlock standby controller node {}'.format(host))
    rtn_code, out = host_helper.unlock_host(host, available_only=True)
    assert rtn_code == 0, out

    LOG.tc_step('Check that the host locked alarm is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
        "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check that the Storage Alarm Condition is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
        "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check health of CEPH cluster')
    ceph_healthy = False
    end_time = time.time() + 40
    while time.time() < end_time:
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        if ceph_healthy:
            break
    assert ceph_healthy, "Ceph did not become healthy after unlocking {}".format(host)