Example 1
def install_non_active_node(node_name, lab):
    """
    Install the non-active (standby) controller node, usually controller-1, the
        second controller on a non-AIO-SX system. See the usage sketch after
        this example.

    Args:
        node_name:
            - the name of the host/node, usually 'controller-1'
        lab:
            - lab to test
    """

    boot_interfaces = lab['boot_device_dict']
    LOG.tc_step("Restoring {}".format(node_name))
    install_helper.open_vlm_console_thread(node_name,
                                           boot_interface=boot_interfaces,
                                           vlm_power_on=True)

    LOG.info(
        "Verifying {} is Locked, Disabled and Online ...".format(node_name))
    system_helper.wait_for_hosts_states(node_name,
                                        administrative=HostAdminState.LOCKED,
                                        operational=HostOperState.DISABLED,
                                        availability=HostAvailState.ONLINE)

    LOG.info("Unlocking {} ...".format(node_name))
    rc, output = host_helper.unlock_host(node_name, available_only=False)

    assert rc == 0 or rc == 4, "Host {} failed to unlock: rc = {}, msg: {}".format(
        node_name, rc, output)

    if rc == 4:
        LOG.warn('{} is now in degraded status'.format(node_name))

    LOG.info('{} is installed'.format(node_name))
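
A minimal usage sketch (not part of the original example; it assumes the same lab-dict helpers used in Example 3 and Example 8):

# Prepare the lab dict and boot-device map, then restore controller-1.
lab = InstallVars.get_install_var("LAB")
lab.update(create_node_dict(lab['controller_nodes'], 'controller'))
lab['boot_device_dict'] = create_node_boot_dict(lab['name'])
install_non_active_node('controller-1', lab)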
Example 2
def upgrade_controller0():
    """
    Upgrades controller-0
    Returns:

    """

    # upgrade  controller-0
    LOG.tc_step("Upgrading  controller-0......")

    controller0 = 'controller-0'
    LOG.info("Ensure controller-0 is provisioned before upgrade.....")
    ensure_host_provisioned(controller0)
    LOG.info("Host {} is provisioned for upgrade.....".format(controller0))

    # open vlm console for controller-0 for boot through mgmt interface
    LOG.info("Opening a vlm console for controller-0 .....")
    install_helper.open_vlm_console_thread(controller0)

    LOG.info("Starting {} upgrade.....".format(controller0))
    upgrade_host(controller0, lock=True)
    LOG.info("controller-0 is upgraded successfully.....")

    # unlock upgraded controller-0
    LOG.tc_step("Unlocking controller-0 after upgrade......")
    host_helper.unlock_host(controller0, available_only=True)
    LOG.info("Host {} unlocked after upgrade......".format(controller0))
Example 3
def _test_storage_profile(personality, from_backing, to_backing):
    """
    This test creates a storage profile and then applies it to a node with
    identical hardware, assuming one exists.

    Storage profiles do not apply to standard controller nodes.  Storage profiles
    can be applied on controller+compute (AIO) nodes, compute nodes and storage nodes.

    Arguments:
    - personality (string) - controller, compute or storage
    - from_backing (string) - image, remote or None
    - to_backing (string) - image, remote or None

    Test Steps:
    1.  Query system and determine which nodes have compatible hardware.
    2.  Create a storage profile on one of those nodes
    3.  Apply the created storage profile on a compatible node*
    4.  Ensure the storage profiles have been successfully applied.

    * If the node is a compute node or a controller+compute, we will also change
      the backend if required for additional coverage.

    Returns:
    - Nothing
    """

    global PROFILES_TO_DELETE
    PROFILES_TO_DELETE = []
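    # Assumption: PROFILES_TO_DELETE is consumed by a module-level teardown
    # (not shown here) that removes the storage profiles created by this test.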

    # Skip if test is not applicable to hardware under test
    if personality == 'controller' and not system_helper.is_aio_system():
        skip("Test does not apply to controller hosts without subtype compute")

    hosts = system_helper.get_hosts(personality=personality)
    if not hosts:
        skip("No hosts of type {} available".format(personality))

    if ((from_backing == "remote" or to_backing == "remote")
            and not system_helper.is_storage_system()):
        skip("This test doesn't apply to systems without storage hosts")

    LOG.tc_step("Identify hardware compatible hosts")
    hash_to_hosts = get_hw_compatible_hosts(hosts)

    # Pick the hardware group that has the most compatible hosts
    current_size = 0
    candidate_hosts = []
    for value in hash_to_hosts:
        candidate_size = len(hash_to_hosts[value])
        if candidate_size > current_size:
            current_size = candidate_size
            candidate_hosts = hash_to_hosts[value]
    LOG.info(
        "This is the total set of candidate hosts: {}".format(candidate_hosts))

    if len(candidate_hosts) < 2:
        skip("Insufficient hardware compatible hosts to run test")

    # Rsync lab setup dot files between controllers
    con_ssh = ControllerClient.get_active_controller()
    _rsync_files_to_con1(con_ssh=con_ssh, file_to_check="force.txt")

    # Take the hardware compatible hosts and check if any of them already have
    # the backend that we want.  This will save us test time.
    new_to_backing = None
    if personality == "compute":
        from_hosts = []
        to_hosts = []
        for host in candidate_hosts:
            host_backing = host_helper.get_host_instance_backing(host)
            if host_backing == from_backing:
                from_hosts.append(host)
            elif host_backing == to_backing:
                to_hosts.append(host)
            else:
                pass
        LOG.info(
            "Candidate hosts that already have the right from backing {}: {}".
            format(from_backing, from_hosts))
        LOG.info(
            "Candidate hosts that already have the right to backing {}: {}".
            format(to_backing, to_hosts))

        # Determine what hosts to use
        if not from_hosts and to_hosts:
            to_host = random.choice(to_hosts)
            candidate_hosts.remove(to_host)
            from_host = random.choice(candidate_hosts)
        elif not to_hosts and from_hosts:
            from_host = random.choice(from_hosts)
            candidate_hosts.remove(from_host)
            to_host = random.choice(candidate_hosts)
        elif not to_hosts and not from_hosts:
            to_host = random.choice(candidate_hosts)
            candidate_hosts.remove(to_host)
            from_host = random.choice(candidate_hosts)
        else:
            to_host = random.choice(to_hosts)
            from_host = random.choice(from_hosts)

        LOG.info("From host is: {}".format(from_host))
        LOG.info("To host is: {}".format(to_host))

        LOG.tc_step(
            "Check from host backing and convert to {} if necessary".format(
                from_backing))
        host_helper.set_host_storage_backing(from_host, from_backing)
        system_helper.wait_for_host_values(
            from_host,
            availability=HostAvailState.AVAILABLE,
            timeout=120,
            fail_ok=False)

        LOG.tc_step(
            "Check to host backing and convert to {} if necessary".format(
                to_backing))
        new_to_backing = host_helper.set_host_storage_backing(
            to_host, to_backing)
    elif personality == "controller":
        # For now, we don't want to reinstall controller-0 since it will default to
        # pxeboot, but this could be examined as a possible enhancement.
        from_host = "controller-0"
        to_host = "controller-1"

        LOG.info("From host is: {}".format(from_host))
        LOG.info("To host is: {}".format(to_host))

        LOG.tc_step(
            "Check from host backing and convert to {} if necessary".format(
                from_backing))
        host_helper.set_host_storage_backing(from_host, from_backing)

        LOG.tc_step(
            "Check to host backing and convert to {} if necessary".format(
                to_backing))
        new_to_backing = host_helper.set_host_storage_backing(
            to_host, to_backing)
    else:
        # Backing doesn't apply to storage nodes so just pick from compatible hardware
        from_host = random.choice(candidate_hosts)
        candidate_hosts.remove(from_host)
        to_host = random.choice(candidate_hosts)

    LOG.tc_step(
        "Create storage and interface profiles on the from host {}".format(
            from_host))
    prof_name = 'storprof_{}_{}'.format(
        from_host, time.strftime('%Y%m%d_%H%M%S', time.localtime()))
    storage_helper.create_storage_profile(from_host, profile_name=prof_name)
    PROFILES_TO_DELETE.append(prof_name)

    # Deleting VMs in case the remaining host(s) cannot handle all VMs
    # migrating on lock, particularly important in the case of AIO-DX systems.
    LOG.tc_step(
        "Delete all VMs and lock the host before applying the storage profile")
    vm_helper.delete_vms()
    HostsToRecover.add(to_host, scope='function')
    system_helper.wait_for_host_values(from_host,
                                       availability=HostAvailState.AVAILABLE,
                                       timeout=120,
                                       fail_ok=False)
    system_helper.wait_for_host_values(to_host,
                                       availability=HostAvailState.AVAILABLE,
                                       timeout=120,
                                       fail_ok=False)

    # Negative test #1 - attempt to apply profile on unlocked host (should be rejected)
    LOG.tc_step('Apply the storage-profile {} onto unlocked host:{}'.format(
        prof_name, to_host))
    cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
    rc, msg = cli.system(cmd, fail_ok=True)
    assert rc != 0, msg
    host_helper.lock_host(to_host, swact=True)

    # 3 conditions to watch for: no partitions, ready partitions and in-use
    # partitions on the compute.  If in-use, delete and freshly install host.
    # If ready, delete all ready partitions to make room for potentially new
    # partitions.  If no partitions, just delete nova-local lvg.
    if personality == "compute":

        # Negative test #2 - attempt to apply profile onto host with existing
        # nova-local (should be rejected)
        LOG.tc_step(
            'Apply the storage-profile {} onto host with existing nova-local:{}'
            .format(prof_name, to_host))
        cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
        rc, msg = cli.system(cmd, fail_ok=True)
        assert rc != 0, msg

        # If we were simply switching backing (without applying a storage
        # profile), the nova-local lvg deletion can be omitted according to design
        LOG.tc_step("Delete nova-local lvg on to host {}".format(to_host))
        cli.system("host-lvg-delete {} nova-local".format(to_host))

        in_use = storage_helper.get_host_partitions(to_host, "In-Use")

        if in_use:

            # Negative test #3 - attempt to apply profile onto host with existing
            # in-use partitions (should be rejected)
            LOG.tc_step('Apply the storage-profile {} onto host with existing '
                        'in-use partitions:{}'.format(prof_name, to_host))
            cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
            rc, msg = cli.system(cmd, fail_ok=True)
            assert rc != 0, msg

            LOG.tc_step(
                "In-use partitions found.  Must delete the host and freshly install before proceeding."
            )
            LOG.info("Host {} has in-use partitions {}".format(
                to_host, in_use))
            lab = InstallVars.get_install_var("LAB")
            lab.update(create_node_dict(lab['compute_nodes'], 'compute'))
            lab['boot_device_dict'] = create_node_boot_dict(lab['name'])
            install_helper.open_vlm_console_thread(to_host)

            LOG.tc_step("Delete the host {}".format(to_host))
            cli.system("host-bulk-export")
            cli.system("host-delete {}".format(to_host))
            assert len(
                system_helper.get_controllers()) > 1, "Host deletion failed"

            cli.system("host-bulk-add hosts.xml")
            system_helper.wait_for_host_values(
                to_host, timeout=6000, availability=HostAvailState.ONLINE)

            wait_for_disks(to_host)

        ready = storage_helper.get_host_partitions(to_host, "Ready")
        if ready:
            LOG.tc_step(
                "Ready partitions have been found.  Must delete them before profile application"
            )
            LOG.info("Host {} has Ready partitions {}".format(to_host, ready))
            for uuid in reversed(ready):
                storage_helper.delete_host_partition(to_host, uuid)
            # Don't bother restoring in this case since the system should be
            # functional after profile is applied.

        LOG.tc_step('Apply the storage-profile {} onto host:{}'.format(
            prof_name, to_host))
        cli.system('host-apply-storprofile {} {}'.format(to_host, prof_name))

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

        to_host_backing = host_helper.get_host_instance_backing(to_host)
        LOG.info("To host backing was {} and is now {}".format(
            new_to_backing, to_host_backing))
        assert to_host_backing == from_backing, "Host backing was not changed on storage profile application"

    if personality == "storage":
        if not storage_helper.is_ceph_healthy():
            skip("Cannot run test when ceph is not healthy")

        LOG.tc_step("Delete the host {}".format(to_host))
        cli.system("host-bulk-export")
        cli.system("host-delete {}".format(to_host))
        cli.system("host-bulk-add hosts.xml")
        system_helper.wait_for_host_values(to_host,
                                           timeout=6000,
                                           availability=HostAvailState.ONLINE)

        wait_for_disks(to_host)

        LOG.tc_step('Apply the storage-profile {} onto host:{}'.format(
            prof_name, to_host))
        cli.system('host-apply-storprofile {} {}'.format(to_host, prof_name))

        # Re-provision interfaces through lab_setup.sh
        LOG.tc_step("Reprovision the host as necessary")
        files = ['interfaces']
        con_ssh = ControllerClient.get_active_controller()
        delete_lab_setup_files(con_ssh, to_host, files)

        rc, msg = install_helper.run_lab_setup()
        assert rc == 0, msg

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

    if personality == "controller":

        # Note, install helper doesn't work on all labs.  Some labs don't
        # display BIOS type which causes install helper to fail
        lab = InstallVars.get_install_var("LAB")
        lab.update(create_node_dict(lab['controller_nodes'], 'controller'))
        lab['boot_device_dict'] = create_node_boot_dict(lab['name'])
        install_helper.open_vlm_console_thread(to_host)

        LOG.tc_step("Delete the host {}".format(to_host))
        cli.system("host-bulk-export")
        cli.system("host-delete {}".format(to_host))
        assert len(system_helper.get_controllers()) > 1, "Host deletion failed"

        cli.system("host-bulk-add hosts.xml")
        system_helper.wait_for_host_values(to_host,
                                           timeout=6000,
                                           availability=HostAvailState.ONLINE)

        wait_for_disks(to_host)

        LOG.tc_step("Apply the storage-profile {} onto host:{}".format(
            prof_name, to_host))
        cli.system("host-apply-storprofile {} {}".format(to_host, prof_name))

        # Need to re-provision everything on node through lab_setup (except storage)
        LOG.tc_step("Reprovision the host as necessary")
        files = [
            'interfaces', 'cinder_device', 'vswitch_cpus', 'shared_cpus',
            'extend_cgts_vg', 'addresses'
        ]
        con_ssh = ControllerClient.get_active_controller()
        delete_lab_setup_files(con_ssh, to_host, files)

        rc, msg = install_helper.run_lab_setup()
        assert rc == 0, msg

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

        to_host_backing = host_helper.get_host_instance_backing(to_host)
        LOG.info("To host backing was {} and is now {}".format(
            new_to_backing, to_host_backing))
        assert to_host_backing == from_backing, "Host backing was not changed on storage profile application"
Example 4
def test_install_cloned_image(install_clone_setup):

    controller1 = 'controller-1'

    lab = InstallVars.get_install_var('LAB')
    install_output_dir = ProjVar.get_var('LOG_DIR')

    controller0_node = lab['controller-0']
    hostnames = install_clone_setup['hostnames']
    system_mode = install_clone_setup['system_mode']
    lab_name = lab['name']
    LOG.info("Starting install-clone on AIO lab {} .... ".format(lab_name))
    LOG.tc_step("Booting controller-0 ... ")

    if controller0_node.telnet_conn is None:
        controller0_node.telnet_conn = install_helper.open_telnet_session(
            controller0_node, install_output_dir)
        try:
            controller0_node.telnet_conn.login()
        except Exception:
            LOG.info("Telnet Login failed. Attempting to reset password")
            try:
                controller0_node.telnet_conn.login(reset=True)
            except Exception:
                if controller0_node.telnet_conn:
                    controller0_node.telnet_conn.close()
                    controller0_node.telnet_conn = None

    if controller0_node.telnet_conn:
        install_helper.wipe_disk_hosts(hostnames)

    # power off hosts
    LOG.tc_step("Powring off system hosts ... ")
    install_helper.power_off_host(hostnames)

    install_helper.boot_controller(boot_usb=True,
                                   small_footprint=True,
                                   clone_install=True)

    # establish telnet connection with controller
    LOG.tc_step(
        "Establishing telnet connection with controller-0 after install-clone ..."
    )

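    # Strip any zero-padding from the node number in the prompt pattern
    # (e.g. 'controller-01' -> 'controller-1'); names without padding are unchanged.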
    node_name_in_ini = r'{}.*~\$ '.format(controller0_node.host_name)
    normalized_name = re.sub(r'([^\d])0*(\d+)', r'\1\2', node_name_in_ini)

    # controller_prompt = Prompt.TIS_NODE_PROMPT_BASE.format(lab['name'].split('_')[0]) \
    #                     + '|' + Prompt.CONTROLLER_0 \
    #                     + '|{}'.format(node_name_in_ini) \
    #                     + '|{}'.format(normalized_name)

    if controller0_node.telnet_conn:
        controller0_node.telnet_conn.close()

    output_dir = ProjVar.get_var('LOG_DIR')
    controller0_node.telnet_conn = install_helper.open_telnet_session(
        controller0_node, output_dir)
    controller0_node.telnet_conn.login()
    controller0_node.telnet_conn.exec_cmd("xterm")

    LOG.tc_step("Verify install-clone status ....")
    install_helper.check_clone_status(
        tel_net_session=controller0_node.telnet_conn)

    LOG.info("Source Keystone user admin environment ...")

    #controller0_node.telnet_conn.exec_cmd("cd; source /etc/platform/openrc")

    LOG.tc_step("Checking controller-0 hardware ....")
    install_helper.check_cloned_hardware_status('controller-0')

    if system_mode == 'duplex':
        LOG.tc_step("Booting controller-1 ... ")
        boot_interfaces = lab['boot_device_dict']
        install_helper.open_vlm_console_thread('controller-1',
                                               boot_interface=boot_interfaces,
                                               vlm_power_on=True,
                                               wait_for_thread=True)

        LOG.info("waiting for {} to boot ...".format(controller1))

        LOG.info("Verifying {} is Locked, Disabled and Online ...".format(
            controller1))
        system_helper.wait_for_hosts_states(
            controller1,
            check_interval=20,
            use_telnet=True,
            con_telnet=controller0_node.telnet_conn,
            administrative=HostAdminState.LOCKED,
            operational=HostOperState.DISABLED,
            availability=HostAvailState.ONLINE)

        LOG.info("Unlocking {} ...".format(controller1))

        rc, output = host_helper.unlock_host(
            controller1,
            use_telnet=True,
            con_telnet=controller0_node.telnet_conn)
        assert rc == 0, "Host {} unlock failed: {}".format(controller1, output)

        LOG.info("Host {} unlocked successfully ...".format(controller1))

        LOG.info("Host controller-1  booted successfully... ")

        LOG.tc_step("Checking controller-1 hardware ....")
        install_helper.check_cloned_hardware_status(controller1)
    #
    LOG.tc_step("Customizing the cloned system ....")
    LOG.info("Changing the OAM IP configuration ... ")
    install_helper.update_oam_for_cloned_system(system_mode=system_mode)

    LOG.tc_step("Downloading lab specific license, config and scripts ....")
    software_version = system_helper.get_sw_version()
    load_path = BuildServerPath.LATEST_HOST_BUILD_PATHS[software_version]
    install_helper.download_lab_config_files(
        lab, install_clone_setup['build_server'], load_path)

    LOG.tc_step("Running lab cleanup to removed source attributes ....")
    install_helper.run_setup_script(script='lab_cleanup')

    LOG.tc_step(
        "Running lab setup script to upadate cloned system attributes ....")
    rc, output = install_helper.run_lab_setup()
    assert rc == 0, "Lab setup run failed: {}".format(output)

    time.sleep(30)
    LOG.tc_step(
        "Checking config status of controller-0 and perform lock/unlock if necessary..."
    )
    if system_helper.get_host_values(
            'controller-0', 'config_status')[0] == 'Config out-of-date':
        host_helper.lock_unlock_controllers()

    LOG.tc_step("Verifying system health after restore ...")
    system_helper.wait_for_all_alarms_gone(timeout=300)
    rc, failed = system_helper.get_system_health_query()
    assert rc == 0, "System health not OK: {}".format(failed)
Example 5
def test_system_upgrade(vms_with_upgrade, upgrade_setup,
                        check_system_health_query_upgrade):
    LOG.info("Boot VM before upgrade ")
    vms = vms_with_upgrade
    vm_helper.ping_vms_from_natbox(vms)
    lab = upgrade_setup['lab']
    current_version = upgrade_setup['current_version']
    upgrade_version = upgrade_setup['upgrade_version']

    controller0 = lab['controller-0']
    upgrade_helper.ensure_host_provisioned(controller0.name)
    force = False
    LOG.tc_step("Checking system health for upgrade .....")
    if check_system_health_query_upgrade[0] == 0:
        LOG.info("System health OK for upgrade......")
    elif check_system_health_query_upgrade[0] == 2:
        LOG.info(
            "System health indicate minor alarms; using --force option to start upgrade......"
        )
        force = True
    else:
        assert False, "System health query upgrade failed: {}".format(
            check_system_health_query_upgrade[1])

    LOG.tc_step("Starting upgrade from release {} to target release {}".format(
        current_version, upgrade_version))
    upgrade_helper.system_upgrade_start(force=force)
    LOG.info("upgrade started successfully......")

    # upgrade standby controller
    LOG.tc_step("Upgrading controller-1")
    upgrade_helper.upgrade_host("controller-1", lock=True)
    LOG.info("Host controller-1 is upgraded successfully......")

    vm_helper.ping_vms_from_natbox(vms)
    # unlock upgraded controller-1
    LOG.tc_step("Unlocking controller-1 after upgrade......")
    host_helper.unlock_host("controller-1",
                            available_only=True,
                            check_hypervisor_up=False)
    LOG.info("Host controller-1 unlocked after upgrade......")

    # Swact to standby controller-1
    LOG.tc_step("Swacting to controller-1 .....")
    rc, output = host_helper.swact_host(hostname="controller-0")
    assert rc == 0, "Failed to swact: {}".format(output)
    LOG.info("Swacted and  controller-1 has become active......")

    # upgrade  controller-0
    LOG.tc_step("Upgrading  controller-0......")

    LOG.info("Ensure controller-0 is provisioned before upgrade.....")
    upgrade_helper.ensure_host_provisioned(controller0.name)
    LOG.info("Host {} is provisioned for upgrade.....".format(
        controller0.name))

    # open vlm console for controller-0 for boot through mgmt interface
    LOG.info("Opening a vlm console for controller-0 .....")
    install_helper.open_vlm_console_thread("controller-0")

    LOG.info("Starting {} upgrade.....".format(controller0.name))
    upgrade_helper.upgrade_host(controller0.name, lock=True)
    LOG.info("controller-0 is upgraded successfully.....")

    # unlock upgraded controller-0
    LOG.tc_step("Unlocking controller-0 after upgrade......")
    host_helper.unlock_host(controller0.name, available_only=True)
    LOG.info("Host {} unlocked after upgrade......".format(controller0.name))
    vm_helper.ping_vms_from_natbox(vms)
    upgrade_hosts = install_helper.get_non_controller_system_hosts()
    LOG.info(
        "Starting upgrade of the other system hosts: {}".format(upgrade_hosts))

    for host in upgrade_hosts:
        LOG.tc_step("Starting {} upgrade.....".format(host))
        if "storage" in host:
            # wait for replication  to be healthy
            storage_helper.wait_for_ceph_health_ok()

        upgrade_helper.upgrade_host(host, lock=True)
        LOG.info("{} is upgraded successfully.....".format(host))
        LOG.tc_step("Unlocking {} after upgrade......".format(host))
        host_helper.unlock_host(host, available_only=True)
        LOG.info("Host {} unlocked after upgrade......".format(host))
        LOG.info("Host {} upgrade complete.....".format(host))
        vm_helper.ping_vms_from_natbox(vms)

    # Activate the upgrade
    LOG.tc_step("Activating upgrade....")
    upgrade_helper.activate_upgrade()
    LOG.info("Upgrade activate complete.....")

    # Make controller-0 the active controller
    # Swact to standby controller-0
    LOG.tc_step("Making controller-0 active.....")
    rc, output = host_helper.swact_host(hostname="controller-1")
    assert rc == 0, "Failed to swact: {}".format(output)
    LOG.info("Swacted to controller-0 ......")

    # Complete upgrade
    LOG.tc_step("Completing upgrade from  {} to {}".format(
        current_version, upgrade_version))
    upgrade_helper.complete_upgrade()
    LOG.info("Upgrade is complete......")

    LOG.info("Lab: {} upgraded successfully".format(lab['name']))

    # Delete the previous load
    LOG.tc_step("Deleting  {} load... ".format(current_version))
    upgrade_helper.delete_imported_load()
    LOG.tc_step("Delete  previous load version {}".format(current_version))
Example 6
def test_system_upgrade_controllers(upgrade_setup,
                                    check_system_health_query_upgrade):
    lab = upgrade_setup['lab']
    current_version = upgrade_setup['current_version']
    upgrade_version = upgrade_setup['upgrade_version']

    # run system upgrade-start
    # must be run in controller-0
    active_controller = system_helper.get_active_controller_name()
    LOG.tc_step("Checking if active controller is controller-0......")
    assert "controller-0" in active_controller, "The active controller is not " \
                                                "controller-0. Make controller-0 " \
                                                "active before starting upgrade"

    force = False
    LOG.tc_step("Checking system health for upgrade .....")
    if check_system_health_query_upgrade[0] == 0:
        LOG.info("System health OK for upgrade......")
    elif check_system_health_query_upgrade[0] == 2:
        LOG.info(
            "System health indicate minor alarms; using --force option to start upgrade......"
        )
        force = True
    else:
        assert False, "System health query upgrade failed: {}".format(
            check_system_health_query_upgrade[1])

    LOG.info("Starting upgrade from release {} to target release {}".format(
        current_version, upgrade_version))
    upgrade_helper.system_upgrade_start(force=force)
    LOG.tc_step("upgrade started successfully......")

    # upgrade standby controller
    LOG.tc_step("Upgrading controller-1")
    upgrade_helper.upgrade_host("controller-1", lock=True)
    LOG.tc_step("Host controller-1 is upgraded successfully......")

    # unlock upgraded controller-1
    LOG.tc_step("Unlocking controller-1 after upgrade......")
    host_helper.unlock_host("controller-1",
                            available_only=True,
                            check_hypervisor_up=False)
    LOG.tc_step("Host controller-1 unlocked after upgrade......")

    time.sleep(60)
    # Before Swacting ensure the controller-1 is in available state
    if not system_helper.wait_for_host_values(
            "controller-1",
            timeout=360,
            fail_ok=True,
            operational=HostOperState.ENABLED,
            availability=HostAvailState.AVAILABLE):
        err_msg = " Swacting to controller-1 is not possible because controller-1 is not in available state " \
                  "within  the specified timeout"
        assert False, err_msg

    # Swact to standby controller-1
    LOG.tc_step("Swacting to controller-1 .....")
    rc, output = host_helper.swact_host(hostname="controller-0")
    assert rc == 0, "Failed to swact: {}".format(output)
    LOG.info("Swacted and  controller-1 has become active......")
    time.sleep(60)
    # upgrade  controller-0
    LOG.tc_step("Upgrading  controller-0......")
    controller0 = lab['controller-0']

    LOG.info("Ensure controller-0 is provisioned before upgrade.....")
    upgrade_helper.ensure_host_provisioned(controller0.name)
    LOG.info("Host {} is provisioned for upgrade.....".format(
        controller0.name))

    # open vlm console for controller-0 for boot through mgmt interface
    LOG.info("Opening a vlm console for controller-0 .....")
    install_helper.open_vlm_console_thread("controller-0")

    LOG.info("Starting {} upgrade.....".format(controller0.name))
    upgrade_helper.upgrade_host(controller0.name, lock=True)
    LOG.info("controller-0 is upgraded successfully.....")

    # unlock upgraded controller-0
    LOG.tc_step("Unlocking controller-0 after upgrade......")
    host_helper.unlock_host(controller0.name, available_only=True)
    LOG.info("Host {} unlocked after upgrade......".format(controller0.name))
Example 7
def test_system_upgrade(upgrade_setup, check_system_health_query_upgrade):
    lab = upgrade_setup['lab']
    current_version = upgrade_setup['current_version']
    upgrade_version = upgrade_setup['upgrade_version']
    bld_server = upgrade_setup['build_server']
    collect_kpi = upgrade_setup['col_kpi']
    missing_manifests = False
    cinder_configuration = False
    force = False

    controller0 = lab['controller-0']
    if not upgrade_helper.is_host_provisioned(controller0.name):
        rc, output = upgrade_helper.upgrade_host_lock_unlock(controller0.name)
        assert rc == 0, "Failed to lock/unlock host {}: {}".format(
            controller0.name, output)

    # update health query
    # system_upgrade_health = list(upgrade_helper.get_system_health_query_upgrade())
    system_upgrade_health = list(
        upgrade_helper.get_system_health_query_upgrade_2())
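    # Assumed structure (inferred from the checks below): the helper returns
    # (status_code, reason, actions), where actions is a dict that may contain
    # "lock_unlock", "swact" and "force_upgrade" entries describing any
    # pre-upgrade remediation required.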

    LOG.tc_step("Checking system health for upgrade .....")
    if system_upgrade_health[0] == 0:
        LOG.info("System health OK for upgrade......")
    elif system_upgrade_health[0] == 2:
        if system_upgrade_health[2] and "lock_unlock" in system_upgrade_health[
                2].keys():
            controller_nodes = system_upgrade_health[2]["lock_unlock"][0]
            LOG.info("Locking/Unlocking required for {} ......".format(
                controller_nodes))
            if 'controller-1' in controller_nodes:
                rc, output = upgrade_helper.upgrade_host_lock_unlock(
                    'controller-1')
                assert rc == 0, "Failed to lock/unlock host {}: {}".format(
                    'controller-1', output)
            if 'controller-0' in controller_nodes:
                rc, output = upgrade_helper.upgrade_host_lock_unlock(
                    'controller-0')
                assert rc == 0, "Failed to lock/unlock host {}: {}".format(
                    'controller-0', output)
                time.sleep(60)
                # system_upgrade_health[2]["swact"][0] = False
        if system_upgrade_health[2]["swact"][0]:
            LOG.info("Swact Required: {}".format(
                system_upgrade_health[2]["swact"][1]))
            host_helper.swact_host('controller-0')
            time.sleep(60)
            host_helper.swact_host('controller-1')
            time.sleep(60)
        if system_upgrade_health[2]["force_upgrade"][0]:
            LOG.info("{}; using --force option to start upgrade......".format(
                system_upgrade_health[2]["force_upgrade"][1]))
            force = True

    else:
        assert False, "System health query upgrade failed: {}".format(
            system_upgrade_health[1])

    # if system_upgrade_health[0] == 0:
    #     LOG.info("System health OK for upgrade......")
    # if system_upgrade_health[0] == 1:
    #     assert False, "System health query upgrade failed: {}".format(system_upgrade_health[1])
    #
    # if system_upgrade_health[0] == 4 or system_upgrade_health[0] == 2:
    #     LOG.info("System health indicate missing manifests; lock/unlock controller-0 to resolve......")
    #     missing_manifests = True
    #     if any("Cinder configuration" in k for k in system_upgrade_health[1].keys()):
    #         cinder_configuration = True
    #
    # if system_upgrade_health[0] == 3 or system_upgrade_health[0] == 2:
    #
    #     LOG.info("System health indicate minor alarms; using --force option to start upgrade......")
    #     force = True
    #
    # if missing_manifests:
    #     LOG.info("Locking/Unlocking to resolve missing manifests in controller......")
    #
    #     lock_unlock_hosts = []
    #     if any("controller-1" in k for k in system_upgrade_health[1].keys()):
    #         lock_unlock_hosts.append('controller-1')
    #     if any("controller-0" in k for k in system_upgrade_health[1].keys()):
    #         lock_unlock_hosts.append('controller-0')
    #         cinder_configuration = False
    #
    #     for host in lock_unlock_hosts:
    #         rc, output = upgrade_helper.upgrade_host_lock_unlock(host)
    #         assert rc == 0, "Failed to lock/unlock host {}: {}".format(host, output)
    #
    # if cinder_configuration:
    #     LOG.info("Invalid Cinder configuration: Swact to controller-1 and back to synchronize.......")
    #     host_helper.swact_host('controller-0')
    #     time.sleep(60)
    #     host_helper.swact_host('controller-1')

    LOG.tc_step("Starting upgrade from release {} to target release {}".format(
        current_version, upgrade_version))
    upgrade_helper.system_upgrade_start(force=force)
    upgrade_helper.wait_for_upgrade_states("started")
    LOG.info("upgrade started successfully......")
    if collect_kpi:
        upgrade_helper.collect_upgrade_start_kpi(lab, collect_kpi)

    # upgrade standby controller
    LOG.tc_step("Upgrading controller-1")
    upgrade_helper.upgrade_host("controller-1", lock=True)
    LOG.info("Host controller-1 is upgraded successfully......")

    # unlock upgraded controller-1
    LOG.tc_step("Unlocking controller-1 after upgrade......")
    host_helper.unlock_host("controller-1",
                            timeout=(HostTimeout.CONTROLLER_UNLOCK + 10),
                            available_only=True,
                            check_hypervisor_up=False)
    LOG.info("Host controller-1 unlocked after upgrade......")

    time.sleep(60)

    # Before Swacting ensure the controller-1 is in available state
    if not system_helper.wait_for_host_values(
            "controller-1",
            timeout=600,
            fail_ok=True,
            operational=HostOperState.ENABLED,
            availability=HostAvailState.AVAILABLE):
        err_msg = " Swacting to controller-1 is not possible because controller-1 is not in available state " \
                  "within  the specified timeout"
        assert False, err_msg

    # Swact to standby controller-1
    LOG.tc_step("Swacting to controller-1 .....")
    rc, output = host_helper.swact_host(hostname="controller-0")
    assert rc == 0, "Failed to swact: {}".format(output)
    LOG.info("Swacted and  controller-1 has become active......")

    time.sleep(60)

    # upgrade  controller-0
    LOG.tc_step("Upgrading  controller-0......")
    controller0 = lab['controller-0']

    # open vlm console for controller-0 for boot through mgmt interface
    if 'vbox' not in lab['name']:
        LOG.info("Opening a vlm console for controller-0 .....")
        install_helper.open_vlm_console_thread("controller-0", upgrade=True)

    LOG.info("Starting {} upgrade.....".format(controller0.name))
    upgrade_helper.upgrade_host(controller0.name, lock=True)
    LOG.info("controller-0 is upgraded successfully.....")

    # unlock upgraded controller-0
    LOG.tc_step("Unlocking controller-0 after upgrade......")
    host_helper.unlock_host(controller0.name, available_only=True)
    LOG.info("Host {} unlocked after upgrade......".format(controller0.name))

    upgrade_hosts = install_helper.get_non_controller_system_hosts()
    LOG.info(
        "Starting upgrade of the other system hosts: {}".format(upgrade_hosts))

    for host in upgrade_hosts:
        LOG.tc_step("Starting {} upgrade.....".format(host))
        if "storage" in host:
            # wait for replication  to be healthy
            ceph_health_timeout = 300
            if 'vbox' in lab['name']:
                ceph_health_timeout = 3600
            storage_helper.wait_for_ceph_health_ok(timeout=ceph_health_timeout)

        upgrade_helper.upgrade_host(host, lock=True)
        LOG.info("{} is upgraded successfully.....".format(host))
        LOG.tc_step("Unlocking {} after upgrade......".format(host))
        host_helper.unlock_host(host, available_only=True)
        LOG.info("Host {} unlocked after upgrade......".format(host))
        LOG.info("Host {} upgrade complete.....".format(host))

    # Activate the upgrade
    LOG.tc_step("Activating upgrade....")
    upgrade_helper.activate_upgrade()
    LOG.info("Upgrade activate complete.....")

    # Make controller-0 the active controller
    # Swact to standby controller-0
    LOG.tc_step("Making controller-0 active.....")
    rc, output = host_helper.swact_host(hostname="controller-1")
    assert rc == 0, "Failed to swact: {}".format(output)
    LOG.info("Swacted to controller-0 ......")

    # Complete upgrade
    LOG.tc_step("Completing upgrade from  {} to {}".format(
        current_version, upgrade_version))
    upgrade_helper.complete_upgrade()
    LOG.info("Upgrade is complete......")

    LOG.info("Lab: {} upgraded successfully".format(lab['name']))

    # Delete the previous load
    LOG.tc_step("Deleting  {} load... ".format(current_version))
    upgrade_helper.delete_imported_load()
    LOG.tc_step("Delete  previous load version {}".format(current_version))

    LOG.tc_step(
        "Downloading images to upgraded {} lab ".format(upgrade_version))
    install_helper.download_image(
        lab, bld_server, BuildServerPath.GUEST_IMAGE_PATHS[upgrade_version])

    load_path = upgrade_setup['load_path']

    LOG.tc_step(
        "Downloading heat temples to upgraded {} lab ".format(upgrade_version))
    install_helper.download_heat_templates(lab, bld_server, load_path)

    LOG.tc_step("Downloading lab config scripts to upgraded {} lab ".format(
        upgrade_version))
    install_helper.download_lab_config_files(lab, bld_server, load_path)
Example 8
def test_restore(restore_setup):
    controller1 = 'controller-1'
    controller0 = 'controller-0'

    lab = restore_setup["lab"]
    is_aio_lab = lab.get('system_type', 'Standard') == 'CPE'
    is_sx = is_aio_lab and (len(lab['controller_nodes']) < 2)

    tis_backup_files = restore_setup['tis_backup_files']
    backup_src = RestoreVars.get_restore_var('backup_src'.upper())
    backup_src_path = RestoreVars.get_restore_var('backup_src_path'.upper())

    controller_node = lab[controller0]
    con_ssh = ControllerClient.get_active_controller(name=lab['short_name'],
                                                     fail_ok=True)
    sys_prompt = Prompt.TIS_NODE_PROMPT_BASE.format('.*' +
                                                    lab['name'].split('_')[0])
    controller_prompt = '{}|{}'.format(sys_prompt, Prompt.CONTROLLER_0)
    controller_node.telnet_conn.set_prompt(controller_prompt)

    if not con_ssh:
        LOG.info("Establish ssh connection with {}".format(controller0))
        controller_node.ssh_conn = install_helper.ssh_to_controller(
            controller_node.host_ip, initial_prompt=controller_prompt)
        controller_node.ssh_conn.deploy_ssh_key()
        con_ssh = controller_node.ssh_conn
        ControllerClient.set_active_controller(con_ssh)

    LOG.info("Restore system from backup....")
    system_backup_file = [
        file for file in tis_backup_files if "system.tgz" in file
    ].pop()
    images_backup_file = [
        file for file in tis_backup_files if "images.tgz" in file
    ].pop()

    LOG.tc_step("Restoring {}".format(controller0))

    LOG.info("System config restore from backup file {} ...".format(
        system_backup_file))

    if backup_src.lower() == 'usb':
        system_backup_path = "{}/{}".format(BackupRestore.USB_BACKUP_PATH,
                                            system_backup_file)
    else:
        system_backup_path = "{}{}".format(HostLinuxUser.get_home(),
                                           system_backup_file)

    compute_configured = install_helper.restore_controller_system_config(
        system_backup=system_backup_path, is_aio=is_aio_lab)[2]

    # return

    LOG.info('re-connect to the active controller using ssh')
    con_ssh.close()
    controller_node.ssh_conn = install_helper.ssh_to_controller(
        controller_node.host_ip, initial_prompt=controller_prompt)
    LOG.info("Source Keystone user admin environment ...")
    LOG.info("set prompt to:{}, telnet_conn:{}".format(
        controller_prompt, controller_node.telnet_conn))

    controller_node.telnet_conn.exec_cmd("cd; source /etc/platform/openrc")
    con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
    controller_node.ssh_conn = con_ssh
    ControllerClient.set_active_controller(con_ssh)

    make_sure_all_hosts_locked(con_ssh)

    if backup_src.lower() == 'local':
        images_backup_path = "{}{}".format(HostLinuxUser.get_home(),
                                           images_backup_file)
        common.scp_from_test_server_to_active_controller(
            "{}/{}".format(backup_src_path, images_backup_file),
            HostLinuxUser.get_home())
    else:
        images_backup_path = "{}/{}".format(BackupRestore.USB_BACKUP_PATH,
                                            images_backup_file)

    LOG.info(
        "Images restore from backup file {} ...".format(images_backup_file))

    new_prompt = r'{}.*~.*\$ |controller\-0.*~.*\$ '.format(
        lab['name'].split('_')[0])
    LOG.info('set prompt to:{}'.format(new_prompt))
    con_ssh.set_prompt(new_prompt)

    install_helper.restore_controller_system_images(
        images_backup=images_backup_path,
        tel_net_session=controller_node.telnet_conn)
    # this is a workaround for CGTS-8190
    install_helper.update_auth_url(con_ssh)

    LOG.tc_step(
        "Verifying restore of controller-0 is complete and the host is in available state ..."
    )
    LOG.debug('Wait for system ready in 60 seconds')
    time.sleep(60)

    timeout = HostTimeout.REBOOT + 60
    availability = HostAvailState.AVAILABLE
    is_available = system_helper.wait_for_hosts_states(
        controller0,
        availability=HostAvailState.AVAILABLE,
        fail_ok=True,
        timeout=timeout)
    if not is_available:
        LOG.warn(
            'After {} seconds, the first node:{} does NOT reach {}'.format(
                timeout, controller0, availability))
        LOG.info('Check if drbd is still synchronizing data')
        con_ssh.exec_sudo_cmd('drbd-overview')
        is_degraded = system_helper.wait_for_hosts_states(
            controller0,
            availability=HostAvailState.DEGRADED,
            fail_ok=True,
            timeout=300)
        if is_degraded:
            LOG.warn('Node: {} is degraded: {}'.format(
                controller0, HostAvailState.DEGRADED))
            con_ssh.exec_sudo_cmd('drbd-overview')
        else:
            LOG.fatal('Node:{} is NOT in Available nor Degraded status'.format(controller0))
            # the customer doc does have wording regarding this situation, continue
            # assert False, 'Node:{} is NOT in Available nor Degraded status'

    # delete the system backup files from sysadmin home
    LOG.tc_step("Copying backup files to /opt/backups ... ")
    if backup_src.lower() == 'local':
        con_ssh.exec_cmd("rm -f {} {}".format(system_backup_path,
                                              images_backup_path))

        cmd_rm_known_host = r'sed -i "s/^[^#]\(.*\)"/#\1/g /etc/ssh/ssh_known_hosts; \sync'
        con_ssh.exec_sudo_cmd(cmd_rm_known_host)

        # transfer all backup files to /opt/backups from test server
        with con_ssh.login_as_root():
            con_ssh.scp_on_dest(source_user=TestFileServer.get_user(),
                                source_ip=TestFileServer.get_server(),
                                source_pswd=TestFileServer.get_password(),
                                source_path=backup_src_path + "/*",
                                dest_path=StxPath.BACKUPS + '/',
                                timeout=1200)

    else:
        # copy all backupfiles from USB to /opt/backups
        cmd = " cp  {}/* {}".format(BackupRestore.USB_BACKUP_PATH,
                                    StxPath.BACKUPS)
        con_ssh.exec_sudo_cmd(cmd, expect_timeout=600)

    LOG.tc_step("Checking if backup files are copied to /opt/backups ... ")
    assert int(con_ssh.exec_cmd("ls {} | wc -l".format(StxPath.BACKUPS))[1]) >= 2, \
        "Missing backup files in {}".format(StxPath.BACKUPS)

    if is_aio_lab:
        LOG.tc_step("Restoring Cinder Volumes ...")
        restore_volumes()

        LOG.tc_step('Run restore-complete (CGTS-9756)')
        cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.format(
            HostLinuxUser.get_password())
        controller_node.telnet_conn.login()
        controller_node.telnet_conn.exec_cmd(
            cmd, extra_expects=[' will reboot on completion'])

        LOG.info('- wait until reboot completes')
        time.sleep(120)
        LOG.info('- confirm the active controller is actually back online')
        controller_node.telnet_conn.login()

        LOG.tc_step(
            "reconnecting to the active controller after restore-complete")
        con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)

        if not compute_configured:
            LOG.tc_step(
                'Latest 18.07 EAR1 or old load on AIO/CPE lab: configure its '
                'compute functionality')
            # install_helper.run_cpe_compute_config_complete(controller_node, controller0)

            # LOG.info('closing current ssh connection')
            # con_ssh.close()

            LOG.tc_step('Run restore-complete (CGTS-9756)')
            controller_node.telnet_conn.login()

            cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.\
                format(HostLinuxUser.get_password())
            controller_node.telnet_conn.exec_cmd(cmd,
                                                 extra_expects=' will reboot ')
            controller_node.telnet_conn.close()

            LOG.info(
                'Wait until "config_controller" reboot the active controller')
            time.sleep(180)

            controller_node.telnet_conn = install_helper.open_telnet_session(
                controller_node)
            controller_node.telnet_conn.login()
            time.sleep(120)

            con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
            controller_node.ssh_conn = con_ssh

            ControllerClient.set_active_controller(con_ssh)

            host_helper.wait_for_hosts_ready(controller0)

        LOG.tc_step('Install the standby controller: {}'.format(controller1))
        if not is_sx:
            install_non_active_node(controller1, lab)

    elif len(lab['controller_nodes']) >= 2:
        LOG.tc_step('Install the standby controller: {}'.format(controller1))
        install_non_active_node(controller1, lab)

        boot_interfaces = lab['boot_device_dict']

        hostnames = system_helper.get_hosts()
        storage_hosts = [host for host in hostnames if 'storage' in host]
        compute_hosts = [
            host for host in hostnames
            if 'storage' not in host and 'controller' not in host
        ]

        if len(storage_hosts) > 0:
            # con_ssh.exec_sudo_cmd('touch /etc/ceph/ceph.client.None.keyring')
            for storage_host in storage_hosts:
                LOG.tc_step("Restoring {}".format(storage_host))
                install_helper.open_vlm_console_thread(
                    storage_host,
                    boot_interface=boot_interfaces,
                    vlm_power_on=True)

                LOG.info(
                    "Verifying {} is Locked, Diabled and Online ...".format(
                        storage_host))
                system_helper.wait_for_hosts_states(
                    storage_host,
                    administrative=HostAdminState.LOCKED,
                    operational=HostOperState.DISABLED,
                    availability=HostAvailState.ONLINE)

                LOG.info("Unlocking {} ...".format(storage_host))
                rc, output = host_helper.unlock_host(storage_host,
                                                     available_only=True)
                assert rc == 0, "Host {} failed to unlock: rc = {}, msg: {}".format(
                    storage_host, rc, output)

            LOG.info("Veryifying the Ceph cluster is healthy ...")
            storage_helper.wait_for_ceph_health_ok(timeout=600)

            LOG.info("Importing images ...")
            image_backup_files = install_helper.get_backup_files(
                IMAGE_BACKUP_FILE_PATTERN, StxPath.BACKUPS, con_ssh)
            LOG.info("Image backup found: {}".format(image_backup_files))
            imported = install_helper.import_image_from_backup(
                image_backup_files)
            LOG.info("Images successfully imported: {}".format(imported))

        LOG.tc_step("Restoring Cinder Volumes ...")
        restore_volumes()

        LOG.tc_step('Run restore-complete (CGTS-9756), regular lab')
        controller_node.telnet_conn.login()
        cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.format(
            HostLinuxUser.get_password())
        controller_node.telnet_conn.exec_cmd(
            cmd, extra_expects='controller-0 login:')

        LOG.info('rebuild ssh connection')
        con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
        controller_node.ssh_conn = con_ssh

        LOG.tc_step("Restoring Compute Nodes ...")
        if len(compute_hosts) > 0:
            for compute_host in compute_hosts:
                LOG.tc_step("Restoring {}".format(compute_host))
                install_helper.open_vlm_console_thread(
                    compute_host,
                    boot_interface=boot_interfaces,
                    vlm_power_on=True)

                LOG.info(
                    "Verifying {} is Locked, Diabled and Online ...".format(
                        compute_host))
                system_helper.wait_for_hosts_states(
                    compute_host,
                    administrative=HostAdminState.LOCKED,
                    operational=HostOperState.DISABLED,
                    availability=HostAvailState.ONLINE)
                LOG.info("Unlocking {} ...".format(compute_host))
                rc, output = host_helper.unlock_host(compute_host,
                                                     available_only=True)
                assert rc == 0, "Host {} failed to unlock: rc = {}, msg: {}".format(
                    compute_host, rc, output)

        LOG.info("All nodes {} are restored ...".format(hostnames))
    else:
        LOG.warn('Only 1 controller, but not AIO lab!!??')

    LOG.tc_step("Delete backup files from {} ....".format(StxPath.BACKUPS))
    con_ssh.exec_sudo_cmd("rm -rf {}/*".format(StxPath.BACKUPS))

    LOG.tc_step('Perform post-restore testing/checking')
    post_restore_test(con_ssh)

    LOG.tc_step("Waiting until all alarms are cleared ....")
    timeout = 300
    healthy, alarms = system_helper.wait_for_all_alarms_gone(timeout=timeout,
                                                             fail_ok=True)
    if not healthy:
        LOG.warn('Alarms exist: {}, after waiting {} seconds'.format(
            alarms, timeout))
        rc, message = con_ssh.exec_sudo_cmd('drbd-overview')

        if rc != 0 or (r'[===>' not in message
                       and "] sync'ed: " not in message):
            LOG.warn('Failed to get drbd-overview information')

        LOG.info('Wait for the system to be ready in {} seconds'.format(
            HostTimeout.REBOOT))
        system_helper.wait_for_all_alarms_gone(timeout=HostTimeout.REBOOT,
                                               fail_ok=False)

    LOG.tc_step("Verifying system health after restore ...")
    rc, failed = system_helper.get_system_health_query(con_ssh=con_ssh)
    assert rc == 0, "System health not OK: {}".format(failed)

    collect_logs()