def test_alarm_overwritten():
    """
    Verify the chronological order of the alarms

    Scenario:
    1. Query the event table
    2. Verify the list is shown most recent alarm to oldest (based on
       timestamp) [REQ-14]
    """
    output = cli.fm('event-list', '--limit 10 --nowrap --nopaging --uuid')[1]
    alarm_table = table_parser.table(output, combine_multiline_entry=True)
    size = len(alarm_table['values'])

    LOG.info('Get the last entry in the alarm table')
    last_alarm = alarm_table['values'][size - 1][0]
    secondlast_alarm = alarm_table['values'][size - 2][0]
    LOG.info("last_alarm = %s" % last_alarm)
    LOG.info("secondlast_alarm = %s" % secondlast_alarm)

    time_1 = alarm_table['values'][size - 1][1]
    time_2 = alarm_table['values'][size - 2][1]

    # The last alarm should be older than the second last
    assert (common.get_timedelta_for_isotimes(time_1, time_2).total_seconds()
            > 0 or time_1.split('.')[1] < time_2.split('.')[1])
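
# A minimal sketch of the timestamp-delta helper relied on above, under the
# assumption that 'fm event-list' timestamps are ISO-8601 strings such as
# '2021-03-02T10:30:15.123456'. The real common.get_timedelta_for_isotimes
# may differ; this sketch is illustrative only.
from datetime import datetime


def _get_timedelta_for_isotimes_sketch(time_1, time_2):
    """Return time_2 - time_1 as a timedelta, given ISO-8601 time strings."""
    t1 = datetime.fromisoformat(time_1)
    t2 = datetime.fromisoformat(time_2)
    return t2 - t1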
def verify_cli(sub_auth=None, central_auth=None):
    auths = [central_auth, sub_auth]
    auths = [auth for auth in auths if auth]

    for auth in auths:
        cli.system('host-list', fail_ok=False, auth_info=auth)
        cli.fm('alarm-list', fail_ok=False, auth_info=auth)
        cli.openstack('server list --a', fail_ok=False, auth_info=auth)
        cli.openstack('image list', fail_ok=False, auth_info=auth)
        cli.openstack('volume list --a', fail_ok=False, auth_info=auth)
        cli.openstack('user list', fail_ok=False, auth_info=auth)
        cli.openstack('router list', fail_ok=False, auth_info=auth)

    if sub_auth:
        cli.openstack('stack list', fail_ok=False, auth_info=sub_auth)
        cli.openstack('alarm list', fail_ok=False, auth_info=sub_auth)
        cli.openstack('metric status', fail_ok=False, auth_info=sub_auth)
def verify_cli(sub_auth=None, central_auth=None):
    auths = [central_auth, sub_auth]
    auths = [auth for auth in auths if auth]

    for auth in auths:
        cli.system('host-list', fail_ok=False, auth_info=auth)
        cli.fm('alarm-list', fail_ok=False, auth_info=auth)
        if container_helper.is_stx_openstack_deployed(applied_only=True,
                                                      auth_info=auth):
            cli.openstack('server list --a', fail_ok=False, auth_info=auth)
            cli.openstack('image list', fail_ok=False, auth_info=auth)
            cli.openstack('volume list --a', fail_ok=False, auth_info=auth)
            cli.openstack('user list', fail_ok=False, auth_info=auth)
            cli.openstack('router list', fail_ok=False, auth_info=auth)

    if sub_auth and container_helper.is_stx_openstack_deployed(
            applied_only=True, auth_info=sub_auth):
        cli.openstack('stack list', fail_ok=False, auth_info=sub_auth)
        cli.openstack('alarm list', fail_ok=False, auth_info=sub_auth)
        cli.openstack('metric status', fail_ok=False, auth_info=sub_auth)
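
# Example usage (a sketch): verify_cli is driven with auth_info objects from
# Tenant.get, as done elsewhere in this module; the subcloud region name
# below is an assumption for illustration.
#
#     central_auth = Tenant.get('admin_platform', dc_region='RegionOne')
#     sub_auth = Tenant.get('admin_platform', dc_region='subcloud1')
#     verify_cli(sub_auth=sub_auth, central_auth=central_auth)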
def _test_tc4693_verify_no_alarms():
    """Verify no unexpected alarms are present in the system"""
    alarms_found = False
    output = cli.fm('alarm-list')[1]
    LOG.tc_step(
        "Check no unexpected alarms in output for fm alarm-list: \n%s" %
        output)
    if (('warning' in output) or ('minor' in output) or
            ('major' in output) or ('critical' in output)):
        # Alarms of any severity are tolerated only if they are known
        # allowable alarms
        if not any(val in output for val in allowable_alarms):
            alarms_found = True
    assert not alarms_found, "Unexpected alarms found in fm alarm-list"
def check_alarm_summary_match_subcloud(subcloud, timeout=400):
    LOG.info("Ensure alarm summary on SystemController matches subcloud "
             "{}".format(subcloud))
    subcloud_auth = Tenant.get('admin_platform', dc_region=subcloud)
    central_auth = Tenant.get('admin_platform', dc_region='RegionOne')

    severities = ["critical_alarms", "major_alarms", "minor_alarms",
                  "warnings"]
    central_alarms = subcloud_alarms = None
    end_time = time.time() + timeout
    while time.time() < end_time:
        output_central = cli.dcmanager('alarm summary',
                                       auth_info=central_auth,
                                       fail_ok=False)[1]
        output_sub = cli.fm("alarm-summary", auth_info=subcloud_auth,
                            fail_ok=False)[1]
        central_alarms = table_parser.get_multi_values(
            table_parser.table(output_central), fields=severities,
            **{"NAME": subcloud})
        subcloud_alarms = table_parser.get_multi_values(
            table_parser.table(output_sub), severities)

        if central_alarms == subcloud_alarms:
            LOG.info("'dcmanager alarm summary' output for {} matches "
                     "'fm alarm-summary' on {}".format(subcloud, subcloud))
            return

        time.sleep(30)

    assert central_alarms == subcloud_alarms, \
        "'dcmanager alarm summary' did not match 'fm alarm-summary' on {} " \
        "within {}s".format(subcloud, timeout)
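
# Example usage (a sketch; the subcloud name is an assumption):
#
#     check_alarm_summary_match_subcloud('subcloud1')
#     check_alarm_summary_match_subcloud('subcloud1', timeout=600)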
def _test_system_alarm_on_host_lock():
    """
    Verify fm event-list command in the system upon host-lock

    Scenario:
    1. Execute "fm alarm-list" command in the system.
    2. Lock one compute and wait 30 seconds.
    3. Verify commands return list of active alarms in table with
       expected rows.
    """
    test_res = True

    LOG.info("Execute fm alarm-list. Verify header of a table consists of "
             "correct items")

    # Get and save the list of existing alarms present in the system
    res, out = cli.fm('alarm-list')
    alarm_list = table_parser.table(out)

    if len(alarm_list['values']) == 0:
        LOG.info("There are no alarms present in the alarm list")

    current_alarms = []
    for alarm in alarm_list['values']:
        # Any non-empty alarm ID counts as an existing alarm
        if re.match(".", alarm[0].strip()) is not None:
            current_alarms.append(alarm[0])
            LOG.info("The current alarms in the system are: "
                     "{0}".format(alarm[0]))

    # Get the historical list of alarms
    hist_alarm_table = system_helper.get_events_table(limit=15,
                                                      show_uuid=True)

    # Check that a valid alarm header is present
    alarm_header = ['UUID', 'Time Stamp', 'State', 'Event Log ID',
                    'Reason Text', 'Entity Instance ID', 'Severity']
    if hist_alarm_table['headers'] != alarm_header:
        LOG.info("Fields in table not correct, actual {0} expected "
                 "{1}".format(hist_alarm_table['headers'], alarm_header))

    # Verify the existing alarms are present in the historical list in
    # state 'set'
    for name in current_alarms:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('alarm: %s state: %s' % (name, alarm_state))
        if alarm_state != ['set']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Raise a new alarm by locking a compute node
    LOG.info("Lock compute and wait 30 seconds")
    host = 'compute-1'
    if system_helper.is_aio_duplex():
        host = system_helper.get_standby_controller_name()

    HostsToRecover.add(host, scope='function')
    host_helper.lock_host(host)
    time.sleep(30)

    # Verify the new alarm is present in the historical alarm and active
    # alarm lists
    LOG.info("Verify alarm-list command returns list of active alarms")
    res, out = cli.fm('alarm-list')
    new_active_alarm_table = table_parser.table(out)

    if len(new_active_alarm_table['values']) == 0:
        LOG.info("There are no alarms present in the alarm list")

    # Save the list of new alarms present in the list
    new_alarms = []
    for alarm in new_active_alarm_table['values']:
        if re.match(".", alarm[0].strip()) is not None:
            new_alarms.append(alarm[0])
            LOG.info("The alarm ID in the alarm list table is: "
                     "{0}".format(alarm[0]))

    # Identify the new alarms
    new_alarm_list = list(set(new_alarms) - set(current_alarms))
    LOG.info(new_alarm_list)

    # Verify the new alarms are present in the historical list in state
    # 'set'
    hist_alarm_table = system_helper.get_events_table(limit=15,
                                                      show_uuid=True)
    for name in new_alarm_list:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('new alarm: %s state: %s' % (name, alarm_state))
        if alarm_state != ['set']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Clear the alarm by unlocking the compute node
    LOG.info("Unlock compute and wait 30 seconds")
    host_helper.unlock_host(host)
    time.sleep(30)

    # Verify the alarm clear is shown in the historical table
    LOG.info("Verify event-list command shows the alarm clear events")
    hist_alarm_table = system_helper.get_events_table(limit=15,
                                                      show_uuid=True)
    for name in new_alarm_list:
        kwargs = {"Event Log ID": name}
        alarm_state = table_parser.get_values(hist_alarm_table, 'State',
                                              **kwargs)
        LOG.info('new alarm: %s state: %s' % (name, alarm_state))
        if alarm_state != ['clear']:
            LOG.info('Alarm state is incorrect')
            test_res = False
            break

    # Verify the alarm disappears from the active alarm table
    LOG.info("Verify alarm-list no longer shows the cleared alarms")
    res, out = cli.fm('alarm-list')
    new_active_alarm_table = table_parser.table(out)

    active_alarms = []
    for alarm in new_active_alarm_table['values']:
        if re.match(".", alarm[0].strip()) is not None:
            active_alarms.append(alarm[0])
            LOG.info("The alarm ID in the alarm list table is: "
                     "{0}".format(alarm[0]))

    for name in new_alarm_list:
        if name in active_alarms:
            LOG.info("The alarm was not cleared from the active alarm table")
            test_res = False
            break

    assert test_res, "One or more alarm state checks failed"
def test_system_alarms_and_events_on_lock_unlock_compute(no_simplex):
    """
    Verify fm alarm-show command

    Test Steps:
    - Delete active alarms
    - Lock a host
    - Check active alarm generated for host lock
    - Check relative values are the same in fm alarm-list and fm alarm-show
      <uuid>
    - Check host lock 'set' event logged via fm event-list
    - Unlock host
    - Check active alarms cleared via fm alarm-list
    - Check host lock 'clear' event logged via fm event-list
    """
    # Following step is removed because it is unnecessary and fails the test
    # when the alarm is re-generated:
    # # Clear the alarms currently present
    # LOG.tc_step("Clear the alarms table")
    # system_helper.delete_alarms()

    # Raise a new alarm by locking a compute node
    compute_host = host_helper.get_up_hypervisors()[0]
    if compute_host == system_helper.get_active_controller_name():
        compute_host = system_helper.get_standby_controller_name()
        if not compute_host:
            skip('Standby controller unavailable')

    LOG.tc_step("Lock a nova hypervisor host {}".format(compute_host))
    pre_lock_time = common.get_date_in_format()
    HostsToRecover.add(compute_host)
    host_helper.lock_host(compute_host)

    LOG.tc_step("Check host lock alarm is generated")
    post_lock_alarms = system_helper.wait_for_alarm(
        field='UUID', entity_id=compute_host, reason=compute_host,
        alarm_id=EventLogID.HOST_LOCK, strict=False, fail_ok=False)[1]

    LOG.tc_step("Check related fields in fm alarm-list and fm alarm-show "
                "have the same values")
    post_lock_alarms_tab = system_helper.get_alarms_table(uuid=True)

    alarms_l = ['Alarm ID', 'Entity ID', 'Severity', 'Reason Text']
    alarms_s = ['alarm_id', 'entity_instance_id', 'severity', 'reason_text']

    # Only 1 alarm since we are now checking the specific alarm ID
    for post_alarm in post_lock_alarms:
        LOG.tc_step("Verify {} for alarm {} in alarm-list are in sync with "
                    "alarm-show".format(alarms_l, post_alarm))

        alarm_show_tab = table_parser.table(cli.fm('alarm-show',
                                                   post_alarm)[1])
        alarm_list_tab = table_parser.filter_table(post_lock_alarms_tab,
                                                   UUID=post_alarm)

        for i in range(len(alarms_l)):
            alarm_l_val = table_parser.get_column(alarm_list_tab,
                                                  alarms_l[i])[0]
            alarm_s_val = table_parser.get_value_two_col_table(
                alarm_show_tab, alarms_s[i])
            assert alarm_l_val == alarm_s_val, \
                "{} value in alarm-list: {} is different than alarm-show: " \
                "{}".format(alarms_l[i], alarm_l_val, alarm_s_val)

    LOG.tc_step("Check host lock is logged via fm event-list")
    system_helper.wait_for_events(entity_instance_id=compute_host,
                                  start=pre_lock_time, timeout=60,
                                  event_log_id=EventLogID.HOST_LOCK,
                                  fail_ok=False, **{'state': 'set'})

    pre_unlock_time = common.get_date_in_format()
    LOG.tc_step("Unlock {}".format(compute_host))
    host_helper.unlock_host(compute_host)

    LOG.tc_step("Check host lock active alarm cleared")
    alarm_sets = [(EventLogID.HOST_LOCK, compute_host)]
    system_helper.wait_for_alarms_gone(alarm_sets, fail_ok=False)

    LOG.tc_step("Check host lock clear event logged")
    system_helper.wait_for_events(event_log_id=EventLogID.HOST_LOCK,
                                  start=pre_unlock_time,
                                  entity_instance_id=compute_host,
                                  fail_ok=False, **{'state': 'clear'})
def get_system_health_query_upgrade_2(con_ssh=None):
    """
    Queries the upgrade health of a system in use.

    Args:
        con_ssh:

    Returns:
        tuple
            (0, None, None) - success
            (1, dict(error msg), None) - health query reported one or more
                failures that cannot be resolved automatically (e.g.
                management affecting alarms)
            (2, dict(error msg), dict(actions)) - health query reported
                failures with suggested recovery actions (lock/unlock hosts,
                force upgrade, swact)
    """
    output = (cli.system('health-query-upgrade',
                         ssh_client=con_ssh)[1]).splitlines()
    failed = {}
    ok = {}

    for line in output:
        if ":" in line:
            # Split only on the first ':' so values containing colons do
            # not raise
            k, v = line.split(":", 1)
            if "[OK]" in v.strip():
                ok[k.strip()] = v.strip()
            elif "[Fail]" in v.strip():
                failed[k.strip()] = v.strip()
            elif "Hosts missing placement configuration" in k:
                failed[k.strip()] = v.strip()
            elif "Incomplete configuration" in k:
                failed[k.strip()] = v.strip()
            elif "Locked or disabled hosts" in k:
                failed[k.strip()] = v.strip()
        elif "Missing manifests" in line:
            failed[line] = line
        elif "alarms found" in line:
            if len(line.split(',')) > 1:
                failed["management affecting"] = int(
                    line.split(',')[1].strip()[1])

    if len(failed) == 0:
        LOG.info("System health is OK to start upgrade")
        return 0, None, None

    actions = {"lock_unlock": [[], ""],
               "force_upgrade": [False, ''],
               "swact": [False, '']}

    for k, v in failed.items():
        if "No alarms" in k:
            table_ = table_parser.table(cli.fm('alarm-list --uuid')[1])
            alarm_severity_list = table_parser.get_column(table_, "Severity")
            if len(alarm_severity_list) > 0 \
                    and "major" not in alarm_severity_list \
                    and "critical" not in alarm_severity_list:
                # Only minor alarms present
                LOG.warn("System health query upgrade found minor alarms: "
                         "{}".format(alarm_severity_list))
                actions["force_upgrade"] = [True, "Minor alarms present"]
        elif "management affecting" in k:
            if v == 0:
                # Non management affecting alarms present; use force upgrade
                LOG.warn("System health query upgrade found non management "
                         "affecting alarms: {}".format(k))
                actions["force_upgrade"] = [
                    True, "Non management affecting alarms present"]
            else:
                # Major/critical alarms present, management affecting
                LOG.error("System health query upgrade found major or "
                          "critical alarms.")
                return 1, failed, None
        elif "Missing manifests" in k:
            if "controller-1" in k:
                if "controller-1" not in actions["lock_unlock"][0]:
                    actions["lock_unlock"][0].append("controller-1")
            if "controller-0" in k:
                if "controller-0" not in actions["lock_unlock"][0]:
                    actions["lock_unlock"][0].append("controller-0")
            actions["lock_unlock"][1] += "Missing manifests;"
        elif any(s in k for s in ("Cinder configuration",
                                  "Incomplete configuration")):
            actions["swact"] = [True, actions["swact"][1] +
                                "Invalid Cinder configuration;"]
        elif ("Placement Services Enabled" in k
              or "Hosts missing placement configuration" in k):
            if "controller-1" in v:
                if "controller-1" not in actions["lock_unlock"][0]:
                    actions["lock_unlock"][0].append("controller-1")
            if "controller-0" in v:
                if "controller-0" not in actions["lock_unlock"][0]:
                    actions["lock_unlock"][0].append("controller-0")
            actions["lock_unlock"][1] += "Missing placement configuration;"
        else:
            err_msg = "System health query upgrade failed: {}".format(failed)
            LOG.error(err_msg)
            return 1, failed, None

    return 2, failed, actions
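
# A hypothetical sketch of consuming the 'actions' dict returned above. The
# lock/unlock helpers exist in this framework (see the host lock tests);
# 'swact_host' is assumed here for illustration and may not match the real
# helper name.
def _apply_upgrade_health_actions_sketch(actions):
    hosts, reason = actions["lock_unlock"]
    for host in hosts:
        # Lock/unlock re-applies manifests and placement configuration
        LOG.info("Lock/unlock {} to resolve: {}".format(host, reason))
        host_helper.lock_host(host)
        host_helper.unlock_host(host)
    if actions["swact"][0]:
        # Swact to rebuild controller configuration (e.g. Cinder config)
        host_helper.swact_host()
    if actions["force_upgrade"][0]:
        LOG.info("Proceed with force upgrade: {}".format(
            actions["force_upgrade"][1]))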
def get_system_health_query_upgrade(con_ssh=None):
    """
    Queries the upgrade health of a system in use.

    Args:
        con_ssh:

    Returns:
        tuple
            (0, None) - success
            (1, dict(error msg)) - health query reported one or more
                failures other than missing manifest and alarms
            (2, dict(error msg)) - health query reported missing manifest
                and at least one alarm
            (3, dict(error msg)) - health query reported only minor alarms
            (4, dict(error msg)) - health query reported only missing
                manifest
    """
    output = (cli.system('health-query-upgrade',
                         ssh_client=con_ssh)[1]).splitlines()
    failed = {}
    ok = {}

    for line in output:
        if ":" in line:
            # Split only on the first ':' so values containing colons do
            # not raise
            k, v = line.split(":", 1)
            if "[OK]" in v.strip():
                ok[k.strip()] = v.strip()
            elif "[Fail]" in v.strip():
                failed[k.strip()] = v.strip()
        elif "Missing manifests" in line:
            failed[line] = line

    if len(failed) == 0:
        LOG.info("System health is OK to start upgrade")
        return 0, None

    alarms = any("No alarms" in h for h in failed.keys())
    manifest = any("Missing manifests" in h for h in failed.keys())
    cinder_config = any("Cinder configuration" in h for h in failed.keys())

    err_msg = "System health query upgrade failed: {}".format(failed)
    if len(failed) > 3:
        # More than three health check failures
        LOG.error(err_msg)
        return 1, failed

    if len(failed) == 3:
        # Check if the three failures are alarms, manifest and cinder
        # config, otherwise return error.
        if not alarms or not manifest or not cinder_config:
            LOG.error(err_msg)
            return 1, failed
    else:
        # One or two health check failures. Return error unless they are
        # alarms, manifest or cinder config.
        if not alarms and not manifest and not cinder_config:
            LOG.error(err_msg)
            return 1, failed

    if alarms:
        # Check the severity of the alarms present
        table_ = table_parser.table(cli.fm('alarm-list')[1])
        alarm_severity_list = table_parser.get_column(table_, "Severity")
        if len(alarm_severity_list) > 0 and \
                ("major" not in alarm_severity_list
                 and "critical" not in alarm_severity_list):
            # Only minor alarms present
            LOG.warn("System health query upgrade found minor alarms: "
                     "{}".format(alarm_severity_list))
        else:
            # Major/critical alarms present
            LOG.error("System health query upgrade found major or critical "
                      "alarms: {}".format(alarm_severity_list))
            return 1, failed

    if manifest and alarms:
        return 2, failed
    elif alarms:
        # Only minor alarms
        return 3, failed
    else:
        # Only missing manifests
        return 4, failed
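
# A minimal caller sketch (hypothetical) branching on the documented return
# codes of get_system_health_query_upgrade; the policy shown (treating codes
# 2, 3 and 4 as force-upgradeable) is an assumption for illustration.
def _ensure_upgrade_health_sketch(con_ssh=None):
    rc, failed = get_system_health_query_upgrade(con_ssh=con_ssh)
    if rc == 0:
        return False  # healthy; no force upgrade needed
    if rc in (2, 3, 4):
        # Missing manifests and/or minor alarms only: a forced upgrade may
        # be acceptable depending on test policy
        LOG.warn("Non-blocking health issues found: {}".format(failed))
        return True
    raise RuntimeError("Upgrade health check failed: {}".format(failed))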
def test_dc_fault_scenario(subcloud_to_test):
    """
    Test Fault Scenario on Distributed Cloud

    Args:
        subcloud_to_test (str): module fixture

    Setup:
        - Make sure there is consistency between alarm summary on Central
          Cloud and on subclouds

    Test Steps:
        Step1:
        - Make subcloud offline (e.g. delete route)
        - Ensure subcloud shows offline
        Step2:
        - Raise alarm on subcloud
        - Ensure relative alarm raised on subcloud
        - Ensure system alarm-summary on subcloud has changed
        - Ensure dcmanager alarm summary on system controller has no change
        Step3:
        - Resume connectivity to subcloud (e.g. add route back)
        - Ensure subcloud shows online and in-sync
        - Ensure system alarm-summary on subcloud matches dcmanager alarm
          summary on system controller
        Step4:
        - Clear alarm on subcloud
        - Ensure relative alarm cleared on subcloud
        - Ensure system alarm-summary on subcloud matches dcmanager alarm
          summary on system controller
    """
    ssh_central = ControllerClient.get_active_controller(name="RegionOne")
    ssh_subcloud = ControllerClient.get_active_controller(
        name=subcloud_to_test)
    subcloud_table = {}
    try:
        code, output = cli.dcmanager(
            "subcloud show {}".format(subcloud_to_test),
            ssh_client=ssh_central)
        gateway = table_parser.get_value_two_col_table(
            table_parser.table(output), "management_gateway_ip")
        code, hosts_raw = cli.system("host-list", ssh_client=ssh_subcloud)
        hosts_id = table_parser.get_values(table_parser.table(hosts_raw),
                                           'id')
        for host_id in hosts_id:
            code, route_raw = cli.system(
                "host-route-list {}".format(host_id),
                ssh_client=ssh_subcloud)
            route_table = table_parser.filter_table(
                table_parser.table(route_raw), **{'gateway': gateway})
            subcloud_table[host_id] = route_table

        LOG.tc_step("Delete route for subcloud: {} and wait for it to go "
                    "offline.".format(subcloud_to_test))
        ssh_subcloud = ControllerClient.get_active_controller(
            name=subcloud_to_test)
        for host_id in subcloud_table:
            command = "host-route-delete {}".format(
                table_parser.get_values(subcloud_table[host_id], "uuid")[0])
            cli.system(command, ssh_client=ssh_subcloud)

        dc_helper.wait_for_subcloud_status(
            subcloud_to_test, avail=SubcloudStatus.AVAIL_OFFLINE,
            timeout=DCTimeout.SYNC, con_ssh=ssh_central)

        LOG.tc_step("Raise alarm on subcloud: {}".format(subcloud_to_test))
        ssh_subcloud = ControllerClient.get_active_controller(
            name=subcloud_to_test)
        code_sub_before, output_sub_before = cli.fm(
            "alarm-summary", ssh_client=ssh_subcloud)
        code_central_before, output_central_before = cli.dcmanager(
            'alarm summary')
        ssh_subcloud.exec_cmd(
            "fmClientCli -c \"### ###300.005###clear###system.vm###host="
            "testhost-0### ###critical### ###processing-error###"
            "cpu-cycles-limit-exceeded### ###True###True###'\"",
            fail_ok=False)
        LOG.info("Ensure relative alarm was raised at subcloud: {}".format(
            subcloud_to_test))
        system_helper.wait_for_alarm(
            alarm_id=EventLogID.PROVIDER_NETWORK_FAILURE,
            con_ssh=ssh_subcloud)
        code_sub_after, output_sub_after = cli.fm("alarm-summary",
                                                  ssh_client=ssh_subcloud)
        code_central_after, output_central_after = cli.dcmanager(
            'alarm summary')
        LOG.info("Ensure fm alarm summary on subcloud: {} has changed but "
                 "dcmanager alarm summary has not changed".format(
                     subcloud_to_test))
        assert output_central_before == output_central_after and \
            output_sub_before != output_sub_after

        add_routes_to_subcloud(subcloud_to_test, subcloud_table)

        dc_helper.wait_for_subcloud_status(
            subcloud_to_test, avail=SubcloudStatus.AVAIL_ONLINE,
            sync=SubcloudStatus.SYNCED, timeout=DCTimeout.SYNC,
            con_ssh=ssh_central)
        alarm_summary_add_and_del(subcloud_to_test)

    finally:
        cli.dcmanager("subcloud show {}".format(subcloud_to_test),
                      ssh_client=ssh_central, fail_ok=True)
        add_routes_to_subcloud(subcloud_to_test, subcloud_table,
                               fail_ok=True)
        LOG.info("Clear alarm on subcloud: {}".format(subcloud_to_test))
        ssh_subcloud.exec_cmd('fmClientCli -D host=testhost-0')
        check_alarm_summary_match_subcloud(subcloud=subcloud_to_test)