Example #1
    def revert():
        reverted = False
        try:
            LOG.fixture_step("Manage primary subcloud {} if unmanaged".format(primary_subcloud))
            dc_helper.manage_subcloud(primary_subcloud)

            LOG.fixture_step("Revert NTP config if changed")
            res = system_helper.modify_ntp(ntp_servers=central_ntp, auth_info=central_auth, check_first=True,
                                           clear_alarm=False)[0]
            if res != -1:
                LOG.fixture_step("Lock unlock config out-of-date hosts in central region")
                system_helper.wait_and_clear_config_out_of_date_alarms(auth_info=central_auth,
                                                                       wait_with_best_effort=True)

                LOG.fixture_step("Lock unlock config out-of-date hosts in {}".format(primary_subcloud))
                dc_helper.wait_for_subcloud_ntp_config(subcloud=primary_subcloud, expected_ntp=central_ntp,
                                                       clear_alarm=True)

                if managed_subcloud:
                    LOG.fixture_step("Lock unlock config out-of-date hosts in {}".format(managed_subcloud))
                    dc_helper.wait_for_subcloud_ntp_config(subcloud=managed_subcloud, expected_ntp=central_ntp,
                                                           clear_alarm=True)

            if subclouds_to_revert:
                LOG.fixture_step("Manage unmanaged subclouds and check they are unaffected")
                for subcloud in subclouds_to_revert:
                    dc_helper.manage_subcloud(subcloud)
                    assert not system_helper.get_alarms(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                                        auth_info=Tenant.get('admin_platform', dc_region=subcloud))
            reverted = True

        finally:
            if not reverted:
                for subcloud in subclouds_to_revert:
                    dc_helper.manage_subcloud(subcloud)
Example #2
    def del_alarms():
        LOG.fixture_step(
            "Delete 300.005 alarms and ensure they are removed from alarm-list"
        )
        alarms_tab = system_helper.get_alarms_table(uuid=True)
        alarm_uuids = table_parser.get_values(table_=alarms_tab,
                                              target_header='UUID',
                                              **{'Alarm ID': alarm_id})
        if alarm_uuids:
            system_helper.delete_alarms(alarms=alarm_uuids)

        post_del_alarms = system_helper.get_alarms(alarm_id=alarm_id)
        assert not post_del_alarms, "300.005 alarm still exists after deletion"
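
A minimal sketch, not taken from the original sources, of how a cleanup function like del_alarms is typically wired up: a pytest fixture generates the 300.005 test alarm, registers the cleanup as a finalizer, and hands the alarm ID to the test. The fixture name is illustrative; system_helper comes from the framework's keywords package, as in the other examples on this page.

import pytest
from keywords import system_helper


@pytest.fixture(scope='function')
def alarm_cleanup_sketch(request):
    # Generate a test alarm so the test has something to suppress or delete
    # (300.005 is the test alarm ID used throughout these examples).
    alarm_id = '300.005'
    system_helper.generate_event(event_id=alarm_id)

    def del_alarms():
        ...  # cleanup body exactly as shown in Example #2 above

    # Run the cleanup even if the test fails.
    request.addfinalizer(del_alarms)
    return alarm_id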
Example #3
def test_alarms():
    output = """+------+----------+-------------+-----------+----------+------------+
| UUID | Alarm ID | Reason Text | Entity ID | Severity | Time Stamp |
+------+----------+-------------+-----------+----------+------------+
+------+----------+-------------+-----------+----------+------------+
Mon Apr  3 19:41:50 UTC 2017
controller-0:~$ """

    table_ = table_parser.table(output)
    print("empty table: {}".format(table_))
    alarms = system_helper.get_alarms()
    # system_helper.delete_alarms()
    # system_helper.get_alarms()
    system_helper.get_alarms_table()
Example #4
def check_alarms(before_alarms,
                 timeout=300,
                 auth_info=Tenant.get('admin_platform'),
                 con_ssh=None,
                 fail_ok=False):
    after_alarms = system_helper.get_alarms(auth_info=auth_info,
                                            con_ssh=con_ssh)
    new_alarms = []
    check_interval = 5
    for item in after_alarms:
        if item not in before_alarms:
            alarm_id, entity_id = item.split('::::')
            if alarm_id == EventLogID.CPU_USAGE_HIGH:
                check_interval = 45
            elif alarm_id == EventLogID.NTP_ALARM:
                # NTP alarm handling
                LOG.info("NTP alarm found, checking ntpq stats")
                host = entity_id.split('host=')[1].split('.ntp')[0]
                system_helper.wait_for_ntp_sync(host=host,
                                                fail_ok=False,
                                                auth_info=auth_info,
                                                con_ssh=con_ssh)
                continue

            new_alarms.append((alarm_id, entity_id))

    res = True
    remaining_alarms = None
    if new_alarms:
        LOG.info("New alarms detected. Waiting for new alarms to clear.")
        res, remaining_alarms = \
            system_helper.wait_for_alarms_gone(new_alarms,
                                               fail_ok=True,
                                               timeout=timeout,
                                               check_interval=check_interval,
                                               auth_info=auth_info,
                                               con_ssh=con_ssh)

    if not res:
        msg = "New alarm(s) found and did not clear within {} seconds. " \
              "Alarm IDs and Entity IDs: {}".format(timeout, remaining_alarms)
        LOG.warning(msg)
        if not fail_ok:
            assert res, msg

    return res, remaining_alarms
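
A short usage sketch, assuming the snapshot-then-verify pattern shown in Example #6 below: collect the alarm list before a disruptive operation, run the operation, then pass the snapshot to check_alarms so only newly raised alarms are waited on. The operation in the middle is a placeholder.

# Hedged usage sketch; do_disruptive_operation() is a hypothetical test action.
before_alarms = system_helper.get_alarms()   # entries combine alarm ID and entity ID (split on '::::' above)
do_disruptive_operation()
res, remaining = check_alarms(before_alarms, timeout=300, fail_ok=True)
assert res, "New alarms did not clear after the operation: {}".format(remaining)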
Example #5
def wait_for_con_drbd_sync_complete():
    if len(
            system_helper.get_controllers(
                administrative=HostAdminState.UNLOCKED)) < 2:
        LOG.info(
            "Less than two unlocked controllers on system. Do not wait for drbd sync"
        )
        return False

    host = 'controller-1'
    LOG.fixture_step(
        "Waiting for controller-1 drbd sync alarm gone if present")
    end_time = time.time() + 1200
    while time.time() < end_time:
        drbd_alarms = system_helper.get_alarms(
            alarm_id=EventLogID.CON_DRBD_SYNC,
            reason_text='drbd-',
            entity_id=host,
            strict=False)

        if not drbd_alarms:
            LOG.info("{} drbd sync alarm is cleared".format(host))
            break
        time.sleep(10)

    else:
        assert False, "drbd sync alarm {} is not cleared within timeout".format(
            EventLogID.CON_DRBD_SYNC)

    LOG.fixture_step(
        "Wait for {} to become available in system host-list".format(host))
    system_helper.wait_for_host_values(host,
                                       availability=HostAvailState.AVAILABLE,
                                       timeout=120,
                                       fail_ok=False,
                                       check_interval=10)

    LOG.fixture_step(
        "Wait for {} drbd-cinder in sm-dump to reach desired state".format(
            host))
    host_helper.wait_for_sm_dump_desired_states(host,
                                                'drbd-',
                                                strict=False,
                                                timeout=30,
                                                fail_ok=False)
    return True
Example #6
def __verify_central_alarms(request, scope):
    region = 'RegionOne'
    auth_info = Tenant.get('admin_platform', dc_region=region)
    con_ssh = ControllerClient.get_active_controller(name=region)
    LOG.fixture_step(
        "({}) Gathering fm alarms in central region before test {} begins.".
        format(scope, scope))
    before_alarms = system_helper.get_alarms(auth_info=auth_info,
                                             con_ssh=con_ssh)

    def verify_alarms():
        LOG.fixture_step(
            "({}) Verifying system alarms in central region after test {} ended..."
            .format(scope, scope))
        check_helper.check_alarms(before_alarms=before_alarms,
                                  auth_info=auth_info,
                                  con_ssh=con_ssh)
        LOG.info("({}) fm alarms verified in central region.".format(scope))

    request.addfinalizer(verify_alarms)
Example #7
def activate_upgrade(con_ssh=None, fail_ok=False):
    """
    Activates upgrade
    Args:
        con_ssh (SSHClient):
        fail_ok (bool):

    Returns (tuple):
        (0, dict/list) - success
        (1, <stderr>)   # cli returns stderr, applicable if fail_ok is true

    """
    rc, output = cli.system('upgrade-activate', ssh_client=con_ssh, fail_ok=True)
    if rc != 0:
        err_msg = "CLI system upgrade-activate failed: {}".format(output)
        LOG.warning(err_msg)
        if fail_ok:
            return rc, output
        else:
            raise exceptions.CLIRejected(err_msg)

    if not system_helper.wait_for_alarm_gone("250.001", con_ssh=con_ssh, timeout=900, check_interval=60, fail_ok=True):

        alarms = system_helper.get_alarms(alarm_id="250.001")
        err_msg = "After activating upgrade alarms are not cleared : {}".format(alarms)
        LOG.warning(err_msg)
        if fail_ok:
            return 1, err_msg
        else:
            raise exceptions.HostError(err_msg)

    if not wait_for_upgrade_activate_complete(fail_ok=True):
        err_msg = "Upgrade activate failed"
        LOG.warning(err_msg)
        if fail_ok:
            return 1, err_msg
        else:
            raise exceptions.HostError(err_msg)

    LOG.info("Upgrade activation complete")
    return 0, None
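
A brief usage sketch based only on the return contract documented in the docstring above: call with fail_ok=True and branch on the returned code instead of letting the helper raise.

# Hedged sketch: tolerate CLI rejection and act on the documented (rc, output) tuple.
rc, out = activate_upgrade(fail_ok=True)
if rc != 0:
    LOG.warning("upgrade-activate did not complete cleanly: {}".format(out))
else:
    LOG.info("Upgrade activation succeeded")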
Example #8
    def teardown():
        """
        If DNS servers are not set, set them.  Deprovision internal DNS.
        """
        global UNRESTORED_DNS_SERVERS
        global HOSTS_AFFECTED

        if UNRESTORED_DNS_SERVERS:
            LOG.fixture_step("Restoring DNS entries to: {}".format(UNRESTORED_DNS_SERVERS))
            subnet_list = network_helper.get_subnets(network=mgmt_net_id)
            set_dns_servers(subnet_list, UNRESTORED_DNS_SERVERS, fail_ok=True)
            UNRESTORED_DNS_SERVERS = []

        if system_helper.get_alarms(alarm_id=EventLogID.CONFIG_OUT_OF_DATE):
            LOG.fixture_step("Config out-of-date alarm(s) present, check {} and lock/unlock if host config out-of-date".
                             format(HOSTS_AFFECTED))
            # Iterate over a copy so that removing entries does not skip hosts
            for host in list(HOSTS_AFFECTED):
                if system_helper.get_host_values(host, 'config_status')[0] == 'Config out-of-date':
                    LOG.info("Lock/unlock {} to clear config out-of-date status".format(host))
                    host_helper.lock_unlock_hosts(hosts=host)
                HOSTS_AFFECTED.remove(host)
Example #9
def check_volumes_spaces(con_ssh):
    from keywords import cinder_helper
    LOG.info('Checking cinder volumes and space usage')
    usage_threshold = 0.70
    free_space, total_space, unit = cinder_helper.get_lvm_usage(con_ssh)

    if total_space and free_space < usage_threshold * total_space:
        LOG.info(
            'cinder LVM over-used: free:{}, total:{}, ratio:{}%'.format(
                free_space, total_space, free_space / total_space * 100))

        LOG.info('Deleting known LVM alarms')

        expected_reason = r'Cinder LVM .* Usage threshold exceeded; threshold: (\d+(\.\d+)?)%, actual: (\d+(\.\d+)?)%'
        expected_entity = 'host=controller'
        value_titles = ('UUID', 'Alarm ID', 'Reason Text', 'Entity ID')
        lvm_pool_usage = system_helper.get_alarms(fields=value_titles,
                                                  con_ssh=con_ssh)

        if not lvm_pool_usage:
            LOG.warn('Cinder LVM pool is used up to 75%, but no alarm for it')
        else:
            if len(lvm_pool_usage) > 1:
                LOG.warn(
                    'More than one alarm existing for Cinder LVM over-usage')
            elif len(lvm_pool_usage) < 1:
                LOG.warn('No LVM cinder over-used alarms, got:{}'.format(
                    lvm_pool_usage))

            for lvm_alarm in lvm_pool_usage:
                alarm_uuid, alarm_id, reason_text, entity_id = lvm_alarm.split(
                    '::::')

                if re.match(expected_reason, reason_text) and re.search(
                        expected_entity, entity_id):
                    LOG.info('Expected alarm:{}, reason:{}'.format(
                        alarm_uuid, reason_text))
                    LOG.info('Deleting it')
                    system_helper.delete_alarms(alarms=alarm_uuid)
Example #10
def test_alarm_suppression(alarm_test_prep):
    """
       Verify suppression and unsuppression of active alarm and query alarms.

       Test Setup:
           - Unsuppress all alarms
             Generate alarms
       Test Steps:

            Suppress alarms
            Verify alarm supressed
            Generate alarm again
            Verify suppressed alarms no in active
            Unsuppressed alarm
            Verify unsuppressed in active alarm list.
            Delete last active alarm
       Test Teardown:
           - Unsuppress all alarms
    """
    LOG.tc_step('Suppress generated alarm and Verify it is suppressed')
    alarm_uuid = alarm_test_prep
    query_active_alarm = system_helper.get_alarms_table(query_key='uuid',
                                                        query_value=alarm_uuid)
    alarm_id = table_parser.get_values(table_=query_active_alarm,
                                       target_header='Alarm ID',
                                       **{"UUID": alarm_uuid})[0]
    assert '300.005' == alarm_id
    # alarm_id = ''.join(alarm_id)
    system_helper.suppress_event(alarm_id=alarm_id)

    LOG.tc_step('Generate Alarm again and Verify not in the Active list')
    system_helper.generate_event(event_id=alarm_id)
    alarms = system_helper.get_alarms(alarm_id=alarm_id)
    assert not alarms, "300.005 alarm appears in the active alarms table after regenerating"

    LOG.tc_step('UnSuppress alarm and verify it is unsuppressed')
    system_helper.unsuppress_event(alarm_id=alarm_id)
Example #11
    def restore_default_parameters():
        LOG.fixture_step(
            'Check MNFA service parameter values and revert if needed')
        mnfa_threshold_current_val = system_helper.get_service_parameter_values(
            service='platform', section='maintenance', name='mnfa_threshold')
        mnfa_timeout_current_val = system_helper.get_service_parameter_values(
            service='platform', section='maintenance', name='mnfa_timeout')
        alarms = system_helper.get_alarms(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE)
        if alarms or mnfa_threshold_current_val != mnfa_threshold_default_val or \
                mnfa_timeout_current_val != mnfa_timeout_default_val:
            system_helper.modify_service_parameter(
                service='platform',
                section='maintenance',
                name='mnfa_threshold',
                apply=False,
                value=mnfa_threshold_default_val[0])
            system_helper.modify_service_parameter(
                service='platform',
                check_first=False,
                section='maintenance',
                name='mnfa_timeout',
                apply=True,
                value=mnfa_timeout_default_val[0])
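
A minimal sketch, assuming the fixture pattern used elsewhere on this page, of where the mnfa_*_default_val closure variables in Example #11 would come from: capture the current (default) service parameter values before the test changes them, then register restore_default_parameters as a finalizer. Only the helper calls are taken from the example; the surrounding fixture code is illustrative.

    # Inside a hypothetical pytest fixture that receives `request`:
    mnfa_threshold_default_val = system_helper.get_service_parameter_values(
        service='platform', section='maintenance', name='mnfa_threshold')
    mnfa_timeout_default_val = system_helper.get_service_parameter_values(
        service='platform', section='maintenance', name='mnfa_timeout')

    def restore_default_parameters():
        ...  # body as in Example #11 above

    request.addfinalizer(restore_default_parameters)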
Example #12
def ntp_precheck(request, check_alarms):

    LOG.info("Gather NTP config and subcloud management info")
    central_auth = Tenant.get('admin_platform', dc_region='RegionOne')
    central_ntp = system_helper.get_ntp_servers(auth_info=central_auth)

    primary_subcloud = ProjVar.get_var('PRIMARY_SUBCLOUD')
    subcloud_auth = Tenant.get('admin_platform', dc_region=primary_subcloud)
    subcloud_ntp = system_helper.get_ntp_servers(auth_info=subcloud_auth)

    if central_ntp != subcloud_ntp:
        dc_helper.wait_for_subcloud_ntp_config(subcloud=primary_subcloud)

    managed_subclouds = dc_helper.get_subclouds(mgmt='managed', avail='online')
    ssh_map = ControllerClient.get_active_controllers_map()
    managed_subclouds = [subcloud for subcloud in managed_subclouds if subcloud in ssh_map]

    if primary_subcloud in managed_subclouds:
        managed_subclouds.remove(primary_subcloud)

    managed_subcloud = None
    if managed_subclouds:
        managed_subcloud = managed_subclouds.pop()
        LOG.fixture_step("Leave only one subcloud besides primary subcloud to be managed: {}".format(managed_subcloud))

    subclouds_to_revert = []
    if managed_subclouds:
        LOG.info("Unmange: {}".format(managed_subclouds))
        for subcloud in managed_subclouds:
            if not system_helper.get_alarms(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                            auth_info=Tenant.get('admin_platform', subcloud)):
                subclouds_to_revert.append(subcloud)
                dc_helper.unmanage_subcloud(subcloud)

    def revert():
        reverted = False
        try:
            LOG.fixture_step("Manage primary subcloud {} if unmanaged".format(primary_subcloud))
            dc_helper.manage_subcloud(primary_subcloud)

            LOG.fixture_step("Revert NTP config if changed")
            res = system_helper.modify_ntp(ntp_servers=central_ntp, auth_info=central_auth, check_first=True,
                                           clear_alarm=False)[0]
            if res != -1:
                LOG.fixture_step("Lock unlock config out-of-date hosts in central region")
                system_helper.wait_and_clear_config_out_of_date_alarms(auth_info=central_auth,
                                                                       wait_with_best_effort=True)

                LOG.fixture_step("Lock unlock config out-of-date hosts in {}".format(primary_subcloud))
                dc_helper.wait_for_subcloud_ntp_config(subcloud=primary_subcloud, expected_ntp=central_ntp,
                                                       clear_alarm=True)

                if managed_subcloud:
                    LOG.fixture_step("Lock unlock config out-of-date hosts in {}".format(managed_subcloud))
                    dc_helper.wait_for_subcloud_ntp_config(subcloud=managed_subcloud, expected_ntp=central_ntp,
                                                           clear_alarm=True)

            if subclouds_to_revert:
                LOG.fixture_step("Manage unmanaged subclouds and check they are unaffected")
                for subcloud in subclouds_to_revert:
                    dc_helper.manage_subcloud(subcloud)
                    assert not system_helper.get_alarms(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                                        auth_info=Tenant.get('admin_platform', dc_region=subcloud))
            reverted = True

        finally:
            if not reverted:
                for subcloud in subclouds_to_revert:
                    dc_helper.manage_subcloud(subcloud)

    request.addfinalizer(revert)

    return primary_subcloud, managed_subcloud, central_ntp
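
A hedged sketch of a test consuming the three-value tuple returned by the ntp_precheck fixture above; the test body is a placeholder.

def test_ntp_modify_sketch(ntp_precheck):
    # Unpack the fixture's return value: primary subcloud name, one other managed
    # subcloud (or None), and the central region's current NTP servers.
    primary_subcloud, managed_subcloud, central_ntp = ntp_precheck
    ...  # modify NTP in the central region and verify it propagates to the subclouds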
Example #13
def test_system_patch_orchestration(patch_orchestration_setup):
    """
    This test verifies the patch orchestration operation procedures for release patches. The patch orchestration
    automatically patches all hosts on a system in the following order: controllers, storages, and computes.
    The test creates a patch  orchestration strategy or plan for automated patching operation with the following
    options to customize the test:

    --controller-apply-type : specifies how controllers are patched serially or in parallel.  By default controllers are
    patched always in serial regardless of the selection.
    --storage-apply-type : specifies how the storages are patched. Possible values are: serial, parallel or ignore. The
    default value is serial.
   --compute-apply-type : specifies how the computes are patched. Possible values are: serial, parallel or ignore. The
    default value is serial.
    --max-parallel-compute-hosts: specifies the maximum number of computes to patch in parallel. Possible values
    [2 - 100]The default is 2.
    --instance-action - For reboot-required patches,  specifies how the VM instances are moved from compute hosts being
    patched. Possible choices are:
        start-stop - VMs are stopped before compute host is patched.
        migrate - VMs are either live migrated or cold migrated off the compute before applying the patches.


    Args:
        patch_orchestration_setup:

    Returns:

    """

    lab = patch_orchestration_setup['lab']
    patching_helper.check_system_health(check_patch_ignored_alarms=False)

    LOG.info("Starting patch orchestration for lab {} .....".format(lab))

    patches = patch_orchestration_setup['patches']
    patch_ids = ' '.join(patches.keys())

    LOG.tc_step("Uploading  patches {} ... ".format(patch_ids))

    patch_dest_dir = HostLinuxUser.get_home() + '/patches'
    rc = patching_helper.run_patch_cmd('upload-dir', args=patch_dest_dir)[0]
    assert rc in [0, 1], "Failed to upload patches in dir {}".format(patch_dest_dir)

    uploaded = patching_helper.get_available_patches()
    if rc == 0:
        LOG.info("Patches uploaded: {}".format(uploaded))
    else:
        LOG.info("Patches are already in repo")

    if len(uploaded) > 0:
        LOG.tc_step("Applying patches ...")
        uploaded_patch_ids = ' '.join(uploaded)
        applied = patching_helper.apply_patches(
            patch_ids=uploaded_patch_ids)[1]

        LOG.info("Patches applied: {}".format(applied))
    else:
        LOG.info("No Patches are applied; Patches may be already applied: {}")

    partial_patches_ids = patching_helper.get_patches_in_state(
        (PatchState.PARTIAL_APPLY, PatchState.PARTIAL_REMOVE))
    if len(partial_patches_ids) > 0:

        current_alarms_ids = system_helper.get_alarms(mgmt_affecting=True,
                                                      combine_entries=False)
        affecting_alarms = [
            id_ for id_ in current_alarms_ids
            if id_[0] not in orchestration_helper.IGNORED_ALARM_IDS
        ]
        if len(affecting_alarms) > 0:
            assert system_helper.wait_for_alarms_gone(alarms=affecting_alarms, timeout=240, fail_ok=True)[0],\
                "Alarms present: {}".format(affecting_alarms)

        LOG.tc_step("Installing patches through orchestration  .....")
        patching_helper.orchestration_patch_hosts(
            controller_apply_type=patch_orchestration_setup[
                'controller_apply_strategy'],
            storage_apply_type=patch_orchestration_setup[
                'storage_apply_strategy'],
            compute_apply_type=patch_orchestration_setup[
                'compute_apply_strategy'],
            max_parallel_computes=patch_orchestration_setup[
                'max_parallel_computes'],
            instance_action=patch_orchestration_setup['instance_action'],
            alarm_restrictions=patch_orchestration_setup['alarm_restrictions'])

        LOG.info(
            "Patch orchestration strategy applied for {}".format(partial_patches_ids))

        LOG.tc_step("Deleting  patches  orchestration strategy .....")
        delete_patch_strategy()
        LOG.info("Deleted  patch orchestration strategy .....")
    else:
        pytest.skip("All patches in patch-dir are already in the system.")
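
The flags described in the docstring of this example are custom command-line options of the test framework, not built-in pytest options. A hedged sketch, in a standard conftest.py, of how such options could be registered; the defaults for the first four come from the docstring, while the --instance-action default is assumed.

# conftest.py sketch (illustrative, not from the original sources)
def pytest_addoption(parser):
    parser.addoption('--controller-apply-type', action='store', default='serial',
                     help='How controllers are patched (always applied serially in practice)')
    parser.addoption('--storage-apply-type', action='store', default='serial',
                     help='serial, parallel or ignore')
    parser.addoption('--compute-apply-type', action='store', default='serial',
                     help='serial, parallel or ignore')
    parser.addoption('--max-parallel-compute-hosts', action='store', type=int, default=2,
                     help='Maximum number of computes to patch in parallel (2 - 100)')
    parser.addoption('--instance-action', action='store', default='start-stop',  # assumed default
                     help='start-stop or migrate, for reboot-required patches')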
Example #14
def __get_alarms(scope):
    LOG.fixture_step("({}) Gathering system health info before test {} "
                     "begins.".format(scope, scope))
    alarms = system_helper.get_alarms()
    return alarms