def test_alarm_overwritten():
    """
    Verify the chronological order of the alarms

    Scenario:
    1. Query the alarm table
    2. Verify the list is shown most recent alarm to oldest (based on timestamp) [REQ-14]
    """
    output = cli.fm('event-list', '--limit 10 --nowrap --nopaging --uuid')[1]
    alarm_table = table_parser.table(output, combine_multiline_entry=True)
    size = len(alarm_table['values'])

    LOG.info('Get the last two entries in the alarm table')
    last_alarm = alarm_table['values'][size - 1][0]
    secondlast_alarm = alarm_table['values'][size - 2][0]
    LOG.info("last_alarm = %s" % last_alarm)
    LOG.info("secondlast_alarm = %s" % secondlast_alarm)

    time_1 = alarm_table['values'][size - 1][1]
    time_2 = alarm_table['values'][size - 2][1]

    # The last alarm should be older than the second last; if both timestamps
    # fall within the same second, fall back to comparing the fractional part
    assert (
        common.get_timedelta_for_isotimes(time_1, time_2).total_seconds() > 0
        or time_1.split('.')[1] < time_2.split('.')[1])
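
# The assertion above assumes common.get_timedelta_for_isotimes(time_1, time_2)
# returns (time_2 - time_1) as a datetime.timedelta. A minimal sketch of that
# assumed helper, using only the standard library (the real implementation may
# accept a different timestamp format):
from datetime import datetime


def _get_timedelta_for_isotimes_sketch(time_1, time_2):
    """Return time_2 - time_1 for 'YYYY-MM-DDTHH:MM:SS[.ffffff]' strings."""
    fmt = '%Y-%m-%dT%H:%M:%S'
    # Drop the fractional seconds; the caller above compares those separately
    t1 = datetime.strptime(time_1.split('.')[0], fmt)
    t2 = datetime.strptime(time_2.split('.')[0], fmt)
    return t2 - t1
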
def verify_cli(sub_auth=None, central_auth=None):
    auths = [central_auth, sub_auth]
    auths = [auth for auth in auths if auth]

    for auth in auths:
        cli.system('host-list', fail_ok=False, auth_info=auth)
        cli.fm('alarm-list', fail_ok=False, auth_info=auth)
        cli.openstack('server list --a', fail_ok=False, auth_info=auth)
        cli.openstack('image list', fail_ok=False, auth_info=auth)
        cli.openstack('volume list --a', fail_ok=False, auth_info=auth)
        cli.openstack('user list', fail_ok=False, auth_info=auth)
        cli.openstack('router list', fail_ok=False, auth_info=auth)

    if sub_auth:
        cli.openstack('stack list', fail_ok=False, auth_info=sub_auth)
        cli.openstack('alarm list', fail_ok=False, auth_info=sub_auth)
        cli.openstack('metric status', fail_ok=False, auth_info=sub_auth)
def verify_cli(sub_auth=None, central_auth=None):
    auths = [central_auth, sub_auth]
    auths = [auth for auth in auths if auth]

    for auth in auths:
        cli.system('host-list', fail_ok=False, auth_info=auth)
        cli.fm('alarm-list', fail_ok=False, auth_info=auth)
        if container_helper.is_stx_openstack_deployed(applied_only=True, auth_info=auth):
            cli.openstack('server list --a', fail_ok=False, auth_info=auth)
            cli.openstack('image list', fail_ok=False, auth_info=auth)
            cli.openstack('volume list --a', fail_ok=False, auth_info=auth)
            cli.openstack('user list', fail_ok=False, auth_info=auth)
            cli.openstack('router list', fail_ok=False, auth_info=auth)

    if sub_auth and container_helper.is_stx_openstack_deployed(applied_only=True,
                                                               auth_info=sub_auth):
        cli.openstack('stack list', fail_ok=False, auth_info=sub_auth)
        cli.openstack('alarm list', fail_ok=False, auth_info=sub_auth)
        cli.openstack('metric status', fail_ok=False, auth_info=sub_auth)
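
# A hedged usage sketch for verify_cli. Tenant.get with dc_region is taken from
# the check_alarm_summary_match_subcloud example further below; the subcloud
# region name here is only an illustrative placeholder:
def _verify_cli_example():
    central_auth = Tenant.get('admin_platform', dc_region='RegionOne')
    sub_auth = Tenant.get('admin_platform', dc_region='subcloud1')
    verify_cli(sub_auth=sub_auth, central_auth=central_auth)
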
def _test_tc4693_verify_no_alarms():
    """Method to list alarms
    """

    # list the alarms

    alarms_found = False

    output = cli.fm('alarm-list')[1]

    LOG.tc_step(
        "Check no unexpected alarms in output for fm alarm-list: \n%s" %
        output)

    if (('warning' in output) or ('minor' in output) or ('major' in output)
            or ('critical' in output)):
        # Ignore alarms covered by the module-level allowable_alarms whitelist
        if not any(val in output for val in allowable_alarms):
            alarms_found = True

    assert not alarms_found, "Unexpected alarms found in fm alarm-list output"
def check_alarm_summary_match_subcloud(subcloud, timeout=400):
    LOG.info(
        "Ensure alarm summary on SystemController matches subcloud {}".format(
            subcloud))
    subcloud_auth = Tenant.get('admin_platform', dc_region=subcloud)
    central_auth = Tenant.get('admin_platform', dc_region='RegionOne')

    severities = [
        "critical_alarms", "major_alarms", "minor_alarms", "warnings"
    ]
    central_alarms = subcloud_alarms = None
    end_time = time.time() + timeout
    while time.time() < end_time:
        output_central = cli.dcmanager('alarm summary',
                                       auth_info=central_auth,
                                       fail_ok=False)[1]
        output_sub = cli.fm("alarm-summary",
                            auth_info=subcloud_auth,
                            fail_ok=False)[1]

        central_alarms = table_parser.get_multi_values(
            table_parser.table(output_central),
            fields=severities,
            **{"NAME": subcloud})
        subcloud_alarms = table_parser.get_multi_values(
            table_parser.table(output_sub), severities)

        if central_alarms == subcloud_alarms:
            LOG.info(
                "'dcmanager alarm summary' output for {} matches 'fm alarm-summary' on "
                "{}".format(subcloud, subcloud))
            return

        time.sleep(30)

    assert central_alarms == subcloud_alarms, \
        "'dcmanager alarm summary' did not match 'fm alarm-summary' on {} " \
        "within {}s".format(subcloud, timeout)
def _test_system_alarm_on_host_lock():
    """
    Verify fm event-list command in the system upon host-lock

    Scenario:
    1. Execute "fm alarm-list" command in the system.
    2. Lock one compute and wait 30 seconds.
    3. Verify commands return list of active alarms in table with expected
    rows.
    """

    LOG.info("Execute fm alarm-list. Verify header of " +
             "a table consist of correct items")

    # Get and save the list of existing alarms present in the system
    res, out = cli.fm('alarm-list')
    alarm_list = table_parser.table(out)
    test_res = True

    if len(alarm_list['values']) == 0:
        LOG.info("There are no alarms are not present in the alarm list")

    current_alarms = []
    for alarm in alarm_list['values']:
        if re.match(".", alarm[0].strip()) is not None:
            current_alarms.append(alarm[0])
            LOG.info("The current alarms in the system are: "
                     "{0}".format(alarm[0]))

    # Get the historical list of alarms
    hist_alarm_table = system_helper.get_events_table(limit=15, show_uuid=True)

    # Check that a valid alarm header is present
    alarm_header = [
        'UUID', 'Time Stamp', 'State', 'Event Log ID', 'Reason Text',
        'Entity Instance ID', 'Severity'
    ]
    if hist_alarm_table['headers'] != alarm_header:
        LOG.info("Fields in table not correct actual {0} expected {1}".format(
            hist_alarm_table['headers'], alarm_header))

    # Verify the existing alarms are present in the historical list in state 'set'
    for name in current_alarms:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('alarm: %s  state: %s' % (name, alarm_state))
        if alarm_state != ['set']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Raise a new alarm by locking a compute node
    # Get the host to lock
    LOG.info("Lock host and wait 20 seconds")
    host = 'compute-1'
    if system_helper.is_aio_duplex():
        host = system_helper.get_standby_controller_name()

    HostsToRecover.add(host, scope='function')
    host_helper.lock_host(host)
    time.sleep(20)

    # Verify the new alarm is present in the historical alarm and active alarm lists
    LOG.info("Verify alarm-list command returns list of active alarms")
    res, out = cli.fm('alarm-list')
    new_active_alarm_table = table_parser.table(out)

    if len(new_active_alarm_table['values']) == 0:
        LOG.info("There are no alarms present in the alarm list")

    # Save the list of new alarms present in the list
    new_alarms = []
    for alarm in new_active_alarm_table['values']:
        if re.match(".", alarm[0].strip()) is not None:
            new_alarms.append(alarm[0])
            LOG.info("The alarm ID in the alarm list table is: "
                     "{0}".format(alarm[0]))

    # Identify the new alarms
    new_alarm_list = list(set(new_alarms) - set(current_alarms))
    LOG.info(new_alarm_list)

    # Verify the new alarms are present in the historical list in state 'set'
    # Get the historical list of alarms
    hist_alarm_table = system_helper.get_events_table(limit=15, show_uuid=True)

    for name in new_alarm_list:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('new alarm: %s  state: %s' % (name, alarm_state))
        if alarm_state != ['set']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Clear the alarm by unlocking the host
    LOG.info("Unlock host and wait 30 seconds")
    compute_ssh = host_helper.unlock_host(host)
    time.sleep(30)

    # Verify the alarm clear is shown in the historical table
    LOG.info("Verify event-list shows the new alarms in 'clear' state")
    hist_alarm_table = system_helper.get_events_table(limit=15, show_uuid=True)

    for name in new_alarm_list:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('new alarm: %s  state: %s' % (name, alarm_state))
        if alarm_state != ['clear']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Verify the alarm disappears from the active alarm table
    LOG.info("Verify the cleared alarms are no longer in the active alarm list")
    res, out = cli.fm('alarm-list')
    new_active_alarm_table = table_parser.table(out)

    active_alarms = []
    for alarm in new_active_alarm_table['values']:
        if re.match(".", alarm[0].strip()) is not None:
            active_alarms.append(alarm[0])
            LOG.info("The alarm ID in the alarm list table is: "
                     "{0}".format(alarm[0]))

    # Verify the new alarms are no longer in the active alarm table
    for name in new_alarm_list:
        if name in active_alarms:
            LOG.info("The alarm was not cleared from the active alarm table")
            test_res = False
            break

    assert test_res, "One or more alarm state checks failed, see log for details"
def test_system_alarms_and_events_on_lock_unlock_compute(no_simplex):
    """
    Verify fm alarm-show command

    Test Steps:
    - Delete active alarms
    - Lock a host
    - Check active alarm generated for host lock
    - Check relative values are the same in fm alarm-list and fm alarm-show
    <uuid>
    - Check host lock 'set' event logged via fm event-list
    - Unlock host
    - Check active alarms cleared via fm alarm-list
    - Check host lock 'clear' event logged via fm event-list
    """

    # Remove following step because it's unnecessary and fails the test when
    # alarm is re-generated
    # # Clear the alarms currently present
    # LOG.tc_step("Clear the alarms table")
    # system_helper.delete_alarms()

    # Raise a new alarm by locking a compute node
    # Get the compute
    compute_host = host_helper.get_up_hypervisors()[0]
    if compute_host == system_helper.get_active_controller_name():
        compute_host = system_helper.get_standby_controller_name()
        if not compute_host:
            skip('Standby controller unavailable')

    LOG.tc_step("Lock a nova hypervisor host {}".format(compute_host))
    pre_lock_time = common.get_date_in_format()
    HostsToRecover.add(compute_host)
    host_helper.lock_host(compute_host)

    LOG.tc_step("Check host lock alarm is generated")
    post_lock_alarms = \
        system_helper.wait_for_alarm(field='UUID', entity_id=compute_host,
                                     reason=compute_host,
                                     alarm_id=EventLogID.HOST_LOCK,
                                     strict=False,
                                     fail_ok=False)[1]

    LOG.tc_step(
        "Check that related fields in fm alarm-list and fm alarm-show have "
        "the same values")
    post_lock_alarms_tab = system_helper.get_alarms_table(uuid=True)

    alarms_l = ['Alarm ID', 'Entity ID', 'Severity', 'Reason Text']
    alarms_s = ['alarm_id', 'entity_instance_id', 'severity', 'reason_text']

    # Only 1 alarm since we are now checking the specific alarm ID
    for post_alarm in post_lock_alarms:
        LOG.tc_step(
            "Verify {} for alarm {} in alarm-list are in sync with "
            "alarm-show".format(
                alarms_l, post_alarm))

        alarm_show_tab = table_parser.table(cli.fm('alarm-show', post_alarm)[1])
        alarm_list_tab = table_parser.filter_table(post_lock_alarms_tab,
                                                   UUID=post_alarm)

        for i in range(len(alarms_l)):
            alarm_l_val = table_parser.get_column(alarm_list_tab,
                                                  alarms_l[i])[0]
            alarm_s_val = table_parser.get_value_two_col_table(alarm_show_tab,
                                                               alarms_s[i])

            assert alarm_l_val == alarm_s_val, \
                "{} value in alarm-list: {} is different than alarm-show: " \
                "{}".format(alarms_l[i], alarm_l_val, alarm_s_val)

    LOG.tc_step("Check host lock is logged via fm event-list")
    system_helper.wait_for_events(entity_instance_id=compute_host,
                                  start=pre_lock_time, timeout=60,
                                  event_log_id=EventLogID.HOST_LOCK,
                                  fail_ok=False, **{'state': 'set'})

    pre_unlock_time = common.get_date_in_format()
    LOG.tc_step("Unlock {}".format(compute_host))
    host_helper.unlock_host(compute_host)

    LOG.tc_step("Check host lock active alarm cleared")
    alarm_sets = [(EventLogID.HOST_LOCK, compute_host)]
    system_helper.wait_for_alarms_gone(alarm_sets, fail_ok=False)

    LOG.tc_step("Check host lock clear event logged")
    system_helper.wait_for_events(event_log_id=EventLogID.HOST_LOCK,
                                  start=pre_unlock_time,
                                  entity_instance_id=compute_host,
                                  fail_ok=False, **{'state': 'clear'})
def get_system_health_query_upgrade_2(con_ssh=None):
    """
    Queries the upgrade health of a system in use.
    Args:
        con_ssh:

    Returns: tuple
        (0, None, None) - success
        (1, dict(error msg), None) - health query reported failures that cannot be
            worked around automatically (e.g. management affecting alarms)
        (2, dict(error msg), dict(actions)) - health query reported failures along with
            suggested recovery actions (lock_unlock, swact, force_upgrade)

    """

    output = (cli.system('health-query-upgrade', ssh_client=con_ssh)[1]).splitlines()
    failed = {}
    ok = {}

    for line in output:
        if ":" in line:
            k, v = line.split(":")
            if "[OK]" in v.strip():
                ok[k.strip()] = v.strip()
            elif "[Fail]" in v.strip():
                failed[k.strip()] = v.strip()
            elif "Hosts missing placement configuration" in k:
                failed[k.strip()] = v.strip()
            elif "Incomplete configuration" in k:
                failed[k.strip()] = v.strip()
            elif "Locked or disabled hosts" in k:
                failed[k.strip()] = v.strip()

        elif "Missing manifests" in line:
            failed[line] = line
        elif "alarms found" in line:
            if len(line.split(',')) > 1:
                failed["managment affecting"] = int(line.split(',')[1].strip()[1])

    if len(failed) == 0:
        LOG.info("system health is OK to start upgrade......")
        return 0, None,  None

    actions = {"lock_unlock": [[], ""],
               "force_upgrade": [False, ''],
               "swact": [False, ''],
               }

    for k, v in failed.items():
        if "No alarms" in k:
            table_ = table_parser.table(cli.fm('alarm-list --uuid')[1])
            alarm_severity_list = table_parser.get_column(table_, "Severity")
            if len(alarm_severity_list) > 0 \
                    and "major" not in alarm_severity_list \
                    and "critical" not in alarm_severity_list:
                # minor alarm present
                LOG.warn("System health query upgrade found minor alarms: {}".format(alarm_severity_list))
                actions["force_upgrade"] = [True, "Minor alarms present"]

        elif "managment affecting" in k:
            if v == 0:
                # non management affecting alarm present  use  foce upgrade
                LOG.warn("System health query upgrade found non managment affecting alarms: {}"
                         .format(k))
                actions["force_upgrade"] = [True, "Non managment affecting  alarms present"]

            else:
                # major/critical alarm present, management affecting
                LOG.error("System health query upgrade found major or critical alarms.")
                return 1, failed, None

        elif "Missing manifests" in k:
            # manifest = True
            if "controller-1" in k:
                if "controller-1" not in actions["lock_unlock"][0]:
                    actions["lock_unlock"][0].append("controller-1")
            if "controller-0" in k:
                if "controller-0" not in actions["lock_unlock"][0]:
                    actions["lock_unlock"][0].append("controller-0")

            actions["lock_unlock"][1] += "Missing manifests;"

        elif any(s in k for s in ("Cinder configuration", "Incomplete configuration")):
            # cinder_config = True
            actions["swact"] = [True, actions["swact"][1] + "Invalid Cinder configuration;"]

        elif "Placement Services Enabled" in k or "Hosts missing placement configuration" in k:
            # placement_services = True
            if "controller-1" in v:
                if "controller-1" not in actions["lock_unlock"][0]:
                    actions["lock_unlock"][0].append("controller-1")
            if "controller-0" in v:
                if "controller-0" not in actions["lock_unlock"][0]:
                    actions["lock_unlock"][0].append("controller-0")
            actions["lock_unlock"][1] += "Missing placement configuration;"
        else:
            err_msg = "System health query upgrade failed: {}".format(failed)
            LOG.error(err_msg)
            return 1, failed,  None

    return 2, failed, actions
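
# A hedged usage sketch for the helper above. The caller name and the handling
# of the returned actions are illustrative only; the actions dict layout
# (lock_unlock, swact, force_upgrade) follows the code above:
def _prepare_for_upgrade_example(con_ssh=None):
    rc, failed, actions = get_system_health_query_upgrade_2(con_ssh=con_ssh)
    if rc == 0:
        return True
    if rc == 1:
        LOG.error("Upgrade health check failed: {}".format(failed))
        return False
    # rc == 2: failures were reported together with suggested recovery actions
    force, reason = actions["force_upgrade"]
    if force:
        LOG.warn("Upgrade would need to be forced: {}".format(reason))
    hosts, why = actions["lock_unlock"]
    for host in hosts:
        LOG.info("Host {} would need a lock/unlock cycle: {}".format(host, why))
    if actions["swact"][0]:
        LOG.info("A swact would be needed: {}".format(actions["swact"][1]))
    return False
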
def get_system_health_query_upgrade(con_ssh=None):
    """
    Queries the upgrade health of a system in use.
    Args:
        con_ssh:

    Returns: tuple
        (0, None) - success
        (1, dict(error msg)) - health query reported 1 or more failures other than missing manifest and alarm
        (2, dict(error msg)) - health query reported missing manifest and at least one alarm
        (3, dict(error msg)) - health query reported only minor alarms
        (4, dict(error msg)) - health query reported only missing manifests

    """

    output = (cli.system('health-query-upgrade', ssh_client=con_ssh)[1]).splitlines()
    failed = {}
    ok = {}

    for line in output:
        if ":" in line:
            k, v = line.split(":")
            if "[OK]" in v.strip():
                ok[k.strip()] = v.strip()
            elif "[Fail]" in v.strip():
                failed[k.strip()] = v.strip()
        elif "Missing manifests" in line:
            failed[line] = line

    if len(failed) == 0:
        LOG.info("system health is OK to start upgrade......")
        return 0, None

    alarms = any("No alarms" in h for h in failed.keys())
    manifest = any("Missing manifests" in h for h in failed.keys())
    cinder_config = any("Cinder configuration" in h for h in failed.keys())
    err_msg = "System health query upgrade failed: {}".format(failed)
    if len(failed) > 3:
        # more than three health check failures
        LOG.error(err_msg)
        return 1, failed

    if len(failed) == 3:
        # Check whether the three failures are alarms, manifest and cinder
        # config; otherwise return error.
        if not alarms or not manifest or not cinder_config:
            LOG.error(err_msg)
            return 1, failed
    else:
        # One or two health check failures. Return error if none of them is
        # alarms, manifest or cinder config.
        if not alarms and not manifest and not cinder_config:
            LOG.error(err_msg)
            return 1, failed

    if alarms:
        # Check the severity of the reported alarms
        table_ = table_parser.table(cli.fm('alarm-list')[1])
        alarm_severity_list = table_parser.get_column(table_, "Severity")
        if len(alarm_severity_list) > 0 and \
                ("major" not in alarm_severity_list and "critical" not in alarm_severity_list):
            # minor alarm present
            LOG.warn("System health query upgrade found minor alarms: {}".format(alarm_severity_list))

        else:
            # major/critical alarm present
            LOG.error("System health query upgrade found major or critical alarms: {}".format(alarm_severity_list))
            return 1, failed

    if manifest and alarms:
        return 2, failed

    elif alarms:
        # only minor alarm
        return 3, failed
    else:
        # only missing manifests
        return 4, failed
def test_dc_fault_scenario(subcloud_to_test):
    """
    Test Fault Scenario on Distributed Cloud
    Args:
        subcloud_to_test (str): module fixture

    Setup:
        - Make sure there is consistency between alarm summary on
        Central Cloud and on subclouds

    Test Steps:
        - Make subcloud offline (e.g. delete route)
        Step1:
        - Ensure subcloud shows offline
        Step2:
        - Raise alarm on subcloud
        - Ensure relative alarm raised on subcloud
        - Ensure system alarm-summary on subcloud has changed
        - Ensure dcmanager alarm summary on system controller has no change
        Step3:
        - Resume connectivity to subcloud (e.g. add route back)
        - Ensure subcloud shows online and in-sync
        - Ensure system alarm-summary on subcloud matches dcmanager alarm summary on system
        controller
        Step4:
        - Clean alarm on subcloud
        - Ensure relative alarm cleared on subcloud
        - Ensure system alarm-summary on subcloud matches dcmanager alarm summary on system
        controller
    """
    ssh_central = ControllerClient.get_active_controller(name="RegionOne")
    ssh_subcloud = ControllerClient.get_active_controller(
        name=subcloud_to_test)
    subcloud_table = {}
    try:
        code, output = cli.dcmanager(
            "subcloud show {}".format(subcloud_to_test),
            ssh_client=ssh_central)
        gateway = table_parser.get_value_two_col_table(
            table_parser.table(output), "management_gateway_ip")
        code, hosts_raw = cli.system("host-list", ssh_client=ssh_subcloud)
        hosts_id = table_parser.get_values(table_parser.table(hosts_raw), 'id')
        for host_id in hosts_id:
            code, route_raw = cli.system("host-route-list {}".format(host_id),
                                         ssh_client=ssh_subcloud)
            route_table = table_parser.filter_table(
                table_parser.table(route_raw), **{'gateway': gateway})
            subcloud_table[host_id] = route_table

        LOG.tc_step(
            "Delete route for subcloud: {} and wait for it to go offline.".
            format(subcloud_to_test))
        ssh_subcloud = ControllerClient.get_active_controller(
            name=subcloud_to_test)
        for host_id in subcloud_table:
            command = "host-route-delete {}".format(
                table_parser.get_values(subcloud_table[host_id], "uuid")[0])
            cli.system(command, ssh_client=ssh_subcloud)

        dc_helper.wait_for_subcloud_status(subcloud_to_test,
                                           avail=SubcloudStatus.AVAIL_OFFLINE,
                                           timeout=DCTimeout.SYNC,
                                           con_ssh=ssh_central)

        LOG.tc_step("Raise alarm on subcloud: {}".format(subcloud_to_test))
        ssh_subcloud = ControllerClient.get_active_controller(
            name=subcloud_to_test)
        code_sub_before, output_sub_before = cli.fm("alarm-summary",
                                                    ssh_client=ssh_subcloud)
        code_central_before, output_central_before = cli.dcmanager(
            'alarm summary')
        ssh_subcloud.exec_cmd(
            "fmClientCli -c \"### ###300.005###clear###system.vm###host="
            "testhost-0### ###critical### ###processing-error###cpu-cycles-limit-exceeded"
            "### ###True###True###'\"",
            fail_ok=False)
        LOG.info("Ensure relative alarm was raised at subcloud: {}".format(
            subcloud_to_test))
        system_helper.wait_for_alarm(
            alarm_id=EventLogID.PROVIDER_NETWORK_FAILURE, con_ssh=ssh_subcloud)
        code_sub_after, output_sub_after = cli.fm("alarm-summary",
                                                  ssh_client=ssh_subcloud)
        code_central_after, output_central_after = cli.dcmanager(
            'alarm summary')
        LOG.info(
            "Ensure fm alarm summary on subcloud: {} has changed but dcmanager alarm "
            "summary has not changed".format(subcloud_to_test))
        assert output_central_before == output_central_after and output_sub_before != \
            output_sub_after

        add_routes_to_subcloud(subcloud_to_test, subcloud_table)

        dc_helper.wait_for_subcloud_status(subcloud_to_test,
                                           avail=SubcloudStatus.AVAIL_ONLINE,
                                           sync=SubcloudStatus.SYNCED,
                                           timeout=DCTimeout.SYNC,
                                           con_ssh=ssh_central)
        alarm_summary_add_and_del(subcloud_to_test)

    finally:
        cli.dcmanager("subcloud show {}".format(subcloud_to_test),
                      ssh_client=ssh_central,
                      fail_ok=True)
        add_routes_to_subcloud(subcloud_to_test, subcloud_table, fail_ok=True)
        LOG.info("Clear alarm on subcloud: {}".format(subcloud_to_test))
        ssh_subcloud.exec_cmd('fmClientCli -D host=testhost-0')
        check_alarm_summary_match_subcloud(subcloud=subcloud_to_test)