Example #1
def osd_encryption_verification():
    """
    Verify that OSD encryption at rest is successfully deployed on OCS

    Raises:
        UnsupportedFeatureError: OCS version is smaller than 4.6
        EnvironmentError: The OSD is not encrypted
    """
    ocs_version = float(config.ENV_DATA["ocs_version"])
    if ocs_version < 4.6:
        error_message = "Encryption at REST can be enabled only on OCS >= 4.6!"
        raise UnsupportedFeatureError(error_message)

    osd_node_names = get_osd_running_nodes()
    osd_size = get_osd_size()
    lsblk_output_list = []
    for worker_node in osd_node_names:
        lsblk_cmd = "oc debug node/" + worker_node + " -- chroot /host lsblk"
        out = run_cmd(lsblk_cmd)
        log.info(f"the output from lsblk command is {out}")
        lsblk_output_list.append(out)

    for node_output_lsblk in lsblk_output_list:
        node_lsb = node_output_lsblk.split()
        # Search 'crypt' in node_lsb list
        if "crypt" not in node_lsb:
            raise EnvironmentError("OSD is not encrypted")
        index_crypt = node_lsb.index("crypt")
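        # In the default lsblk output (NAME MAJ:MIN RM SIZE RO TYPE ...), the
        # SIZE value sits two tokens before the TYPE value "crypt"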
        encrypted_component_size = int(
            (re.findall(r"\d+", node_lsb[index_crypt - 2]))[0])
        # Verify that OSD is encrypted, and not another component like sda
        if encrypted_component_size != osd_size:
            raise EnvironmentError(
                "The OSD is not encrypted; another mount is encrypted instead.")
Example #2
    def test_run_couchbase_node_reboot(self, cb_setup, nodes,
                                       pod_name_of_node):
        """
        Test couchbase workload with node reboot
        """
        if pod_name_of_node == 'couchbase':
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == 'osd':
            node_list = get_osd_running_nodes()
        elif pod_name_of_node == 'master':
            node_list = get_master_nodes()

        node_1 = get_node_objs(node_list[random.randint(0,
                                                        len(node_list) - 1)])

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type='worker',
                                                   print_table=True)
        get_node_resource_utilization_from_adm_top(node_type='master',
                                                   print_table=True)
        # Restart relevant node
        nodes.restart_nodes(node_1)
        for sample in TimeoutSampler(300, 5, self.cb.result.done):
            if sample:
                break
            else:
                logging.info(
                    "#### ....Waiting for couchbase threads to complete...")
        self.sanity_helpers.health_check()
Example #3
def select_osd_node_name():
    """
    Randomly select one of the OSD nodes

    Returns:
        str: the selected osd node name

    """
    osd_node_names = node.get_osd_running_nodes()
    osd_node_name = random.choice(osd_node_names)
    log.info(f"Selected OSD is {osd_node_name}")
    return osd_node_name
Example #4
def check_automated_recovery_from_stopped_node(nodes):
    """
    1) Stop node.
    2) The rook ceph pods associated with the node should change to a Terminating state.
    3) The node should power on automatically, or, if it was removed from the cluster,
       a new node should be created automatically.
    4) The new osd pods with the same ids should start on the stopped node after it powers on,
       or on the new osd node.

    """
    old_wnodes = get_worker_nodes()
    log.info(f"Current worker nodes: {old_wnodes}")

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    machine_name = machine.get_machine_from_node_name(osd_node_name)
    machineset = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"machineset name: {machineset}")

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")

    pod_names_expected_to_terminate = get_node_pod_names_expected_to_terminate(
        osd_node_name)

    nodes.stop_nodes([osd_node], wait=True)
    log.info(f"Successfully powered off node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    res = wait_for_pods_to_be_in_statuses([constants.STATUS_TERMINATING],
                                          pod_names_expected_to_terminate)
    assert res, "Not all the node rook ceph pods are in a Terminating state"

    try:
        log.info(f"Wait for the node: {osd_node_name} to power on")
        wait_for_nodes_status([osd_node_name])
        log.info(f"Successfully powered on node {osd_node_name}")
    except ResourceWrongStatusException as e:
        log.info(
            f"The worker node {osd_node_name} didn't start due to the exception {str(e)} "
            f"Probably it has been removed from the cluster. Waiting for a new node to come up..."
        )
        new_wnode = wait_for_new_worker_node_ipi(machineset, old_wnodes)
        osd_node_name = new_wnode.name

    assert wait_for_osd_ids_come_up_on_node(osd_node_name,
                                            old_osd_pod_ids,
                                            timeout=300)
    log.info(
        f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
    )
Example #5
    def test_run_couchbase_node_reboot(self, cb_setup, nodes,
                                       pod_name_of_node):
        """
        Test couchbase workload with node reboot
        """
        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)
        get_node_resource_utilization_from_adm_top(node_type="master",
                                                   print_table=True)

        if pod_name_of_node == "couchbase":
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == "osd":
            node_list = get_osd_running_nodes()
        elif pod_name_of_node == "master":
            master_node = get_nodes(pod_name_of_node, num_of_nodes=1)

        # Restart relevant node
        if pod_name_of_node == "master":
            nodes.restart_nodes(master_node, wait=False)
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)
        else:
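            # Pick one node object at random from the selected node list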
            restart_node = get_node_objs(node_list[random.randint(
                0,
                len(node_list) - 1)])
            nodes.restart_nodes(restart_node)

        # Validate all nodes and services are in READY state and up

        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        retry(CommandFailed, tries=60,
              delay=15)(bg_handler.wait_for_bg_operations)(bg_ops,
                                                           timeout=3600)
        self.sanity_helpers.health_check(tries=40)
Example #6
def check_automated_recovery_from_drain_node(nodes):
    """
    1) Drain one worker node.
    2) Delete the OSD pods associated with the node.
    3) The new OSD pods with the same ids that come up should be in a Pending state.
    4) Schedule the worker node.
    5) The OSD pods associated with the node should go back into a Running state and come up
        on the same node.

    """
    osd_node_name = random.choice(get_osd_running_nodes())
    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")
    node_osd_pods = get_osd_pods_having_ids(old_osd_pod_ids)

    unschedule_nodes([osd_node_name])
    log.info(f"Successfully unschedule the node: {osd_node_name}")

    log.info("Delete the node osd pods")
    delete_pods(node_osd_pods)

    new_osd_pods = wait_for_osd_pods_having_ids(osd_ids=old_osd_pod_ids)
    new_osd_pod_names = [p.name for p in new_osd_pods]

    wnodes = get_worker_nodes()
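    # With only 3 workers the re-created OSD pods have no other node to land on
    # while this node is cordoned, so they are expected to stay Pending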
    if len(wnodes) <= 3:
        expected_pods_status = constants.STATUS_PENDING
    else:
        expected_pods_status = constants.STATUS_RUNNING

    log.info(
        f"Verify the new osd pods {new_osd_pod_names} go into a {expected_pods_status} state"
    )
    res = wait_for_pods_to_be_in_statuses(
        [expected_pods_status],
        new_osd_pod_names,
        raise_pod_not_found_error=True,
    )
    assert res, f"Not all the node osd pods are in a {expected_pods_status} state"

    log.info(f"Wait for the node: {osd_node_name} to be scheduled")
    schedule_nodes([osd_node_name])
    log.info(f"Successfully scheduled the node {osd_node_name}")

    if len(wnodes) <= 3:
        assert wait_for_osd_ids_come_up_on_node(osd_node_name, old_osd_pod_ids)
        log.info(
            f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
        )
Example #7
    def test_run_pgsql_reboot_node(self, pgsql, nodes, transactions, pod_name):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Choose a node based on pod it contains
        if pod_name == 'postgres':
            node_list = pgsql.get_pgsql_nodes()
        elif pod_name == 'osd':
            node_list = get_osd_running_nodes()
        node_1 = get_node_objs(node_list[random.randint(0,
                                                        len(node_list) - 1)])

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type='worker',
                                                   print_table=True)

        # Restart relevant node
        nodes.restart_nodes(node_1)

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #8
def delete_and_create_osd_node(osd_node_name):
    """
    Delete an osd node, and create a new one to replace it

    Args:
        osd_node_name (str): The osd node name to delete

    """
    new_node_name = None
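    # Find one OSD pod running on the node and record its OSD id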
    osd_pod = node.get_node_pods(osd_node_name,
                                 pods_to_search=pod.get_osd_pods())[0]
    old_osd_id = pod.get_osd_pod_id(osd_pod)

    old_osd_node_names = node.get_osd_running_nodes()

    # error message for invalid deployment configuration
    msg_invalid = ("ocs-ci config 'deployment_type' value "
                   f"'{config.ENV_DATA['deployment_type']}' is not valid, "
                   f"results of this test run are all invalid.")
    # TODO: refactor this so that AWS is not a "special" platform
    if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM:
        if config.ENV_DATA["deployment_type"] == "ipi":
            new_node_name = node.delete_and_create_osd_node_ipi(osd_node_name)

        elif config.ENV_DATA["deployment_type"] == "upi":
            new_node_name = node.delete_and_create_osd_node_aws_upi(
                osd_node_name)
        else:
            log.error(msg_invalid)
            pytest.fail(msg_invalid)
    elif config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS:
        if config.ENV_DATA["deployment_type"] == "ipi":
            new_node_name = node.delete_and_create_osd_node_ipi(osd_node_name)
        else:
            log.error(msg_invalid)
            pytest.fail(msg_invalid)
    elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
        if is_lso_cluster():
            new_node_name = node.delete_and_create_osd_node_vsphere_upi_lso(
                osd_node_name, use_existing_node=False)

        else:
            new_node_name = node.delete_and_create_osd_node_vsphere_upi(
                osd_node_name, use_existing_node=False)

    log.info("Start node replacement verification steps...")
    check_node_replacement_verification_steps(osd_node_name, new_node_name,
                                              old_osd_node_names, old_osd_id)
Example #9
def delete_and_create_osd_node(osd_node_name):
    """
    Delete an osd node, and create a new one to replace it

    Args:
        osd_node_name (str): The osd node name to delete

    """
    new_node_name = None
    old_osd_ids = node.get_node_osd_ids(osd_node_name)

    old_osd_node_names = node.get_osd_running_nodes()

    # error message for invalid deployment configuration
    msg_invalid = ("ocs-ci config 'deployment_type' value "
                   f"'{config.ENV_DATA['deployment_type']}' is not valid, "
                   f"results of this test run are all invalid.")

    if config.ENV_DATA["deployment_type"] == "ipi":
        if is_lso_cluster():
            # TODO: Implement functionality for Internal-Attached devices mode
            # once ocs-ci issue #4545 is resolved
            # https://github.com/red-hat-storage/ocs-ci/issues/4545
            pytest.skip(
                "Functionality not implemented for this deployment mode")
        else:
            new_node_name = node.delete_and_create_osd_node_ipi(osd_node_name)

    elif config.ENV_DATA["deployment_type"] == "upi":
        if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM:
            new_node_name = node.delete_and_create_osd_node_aws_upi(
                osd_node_name)
        elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
            if is_lso_cluster():
                new_node_name = node.delete_and_create_osd_node_vsphere_upi_lso(
                    osd_node_name, use_existing_node=False)
            else:
                new_node_name = node.delete_and_create_osd_node_vsphere_upi(
                    osd_node_name, use_existing_node=False)
    else:
        log.error(msg_invalid)
        pytest.fail(msg_invalid)

    log.info("Start node replacement verification steps...")
    check_node_replacement_verification_steps(osd_node_name, new_node_name,
                                              old_osd_node_names, old_osd_ids)
Example #10
    def test_run_pgsql_reboot_node(self, pgsql, nodes, transactions, pod_name):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=1, transactions=transactions)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Select a node where pgbench is not running and reboot
        osd_nodes_list = get_osd_running_nodes()
        node_list = pgsql.filter_pgbench_nodes_from_nodeslist(osd_nodes_list)

        node_1 = get_node_objs(node_list[random.randint(0,
                                                        len(node_list) - 1)])
        log.info(f"Selected node {node_1} for reboot operation")

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)

        # Restart relevant node
        nodes.restart_nodes(node_1)

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)
Example #11
    def test_osd_node_restart_and_check_osd_pods_status(self, nodes):
        """
        1) Restart one of the osd nodes.
        2) Check that the osd pods associated with the node change to a Terminating state.
        3) Wait for the node to reach Ready state.
        4) Check that the new osd pods with the same ids start on the same node.
        5) Check the worker nodes security groups.
        """
        # This is a workaround due to the issue https://github.com/red-hat-storage/ocs-ci/issues/6162
        if is_ms_consumer_cluster():
            logger.info(
                "The test is applicable only for an MS provider cluster. "
                "Switching to the provider cluster...")
            config.switch_to_provider()

        self.create_resources()

        osd_node_name = random.choice(get_osd_running_nodes())
        osd_node = get_node_objs([osd_node_name])[0]

        old_osd_pod_ids = get_node_osd_ids(osd_node_name)
        logger.info(f"osd pod ids: {old_osd_pod_ids}")
        node_osd_pods = pod.get_osd_pods_having_ids(old_osd_pod_ids)
        node_osd_pod_names = [p.name for p in node_osd_pods]

        logger.info(f"Going to restart the node {osd_node_name}")
        nodes.restart_nodes(nodes=[osd_node], wait=False)

        logger.info("Verify the node osd pods go into a Terminating state")
        res = pod.wait_for_pods_to_be_in_statuses(
            [constants.STATUS_TERMINATING], node_osd_pod_names)
        assert res, "Not all the node osd pods are in a Terminating state"

        wait_for_nodes_status(node_names=[osd_node_name])
        assert wait_for_osd_ids_come_up_on_node(osd_node_name,
                                                old_osd_pod_ids,
                                                timeout=300)
        logger.info(
            f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
        )

        logger.info(
            "Verify the worker nodes security groups on the provider...")
        assert verify_worker_nodes_security_groups()
Example #12
def check_automated_recovery_from_terminated_node(nodes):
    """
    1) Terminate node.
    2) The rook ceph pods associated with the node should change to a Terminating state.
    3) A new node should be created automatically.
    4) The new osd pods with the same ids as those on the terminated node should start on the new osd node.

    """
    old_wnodes = get_worker_nodes()
    log.info(f"Current worker nodes: {old_wnodes}")

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    machine_name = machine.get_machine_from_node_name(osd_node_name)
    machineset = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"machineset name: {machineset}")

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")

    pod_names_expected_to_terminate = get_node_pod_names_expected_to_terminate(
        osd_node.name)

    nodes.terminate_nodes([osd_node], wait=True)
    log.info(f"Successfully terminated the node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    res = wait_for_pods_to_be_in_statuses([constants.STATUS_TERMINATING],
                                          pod_names_expected_to_terminate)
    assert res, "Not all the node rook ceph pods are in a Terminating state"

    new_wnode = wait_for_new_worker_node_ipi(machineset, old_wnodes)

    wait_for_osd_ids_come_up_on_node(new_wnode.name,
                                     old_osd_pod_ids,
                                     timeout=300)
    log.info(
        f"The osd ids {old_osd_pod_ids} successfully came up on the node {new_wnode.name}"
    )
Example #13
def osd_node_reboot():
    """
    Reboot a worker node that is running an OSD

    Raises:
        AssertionError: in case the ceph-tools pod was not recovered

    """
    nodes = PlatformNodesFactory().get_nodes_platform()
    osd_nodes_names = get_osd_running_nodes()
    osd_node_to_reboot = list()
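    # Find the platform node object that matches the first OSD node name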
    for node in get_nodes():
        node_name = get_node_name(node)
        if node_name == osd_nodes_names[0]:
            osd_node_to_reboot.append(node)
    log.info(f"Rebooting OSD node: {get_node_name(osd_node_to_reboot[0])}")
    nodes.restart_nodes(osd_node_to_reboot)

    log.info("Sleeping 5 minutes")
    time.sleep(320)
    assert (
        wait_for_ct_pod_recovery()
    ), "Ceph tools pod failed to come up on another node"
Example #14
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, interface, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # other running node. Waiting for all dc app pod to reach running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        # Check all OCS pods status, they should be in running state
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        for pod_obj in all_pod_obj:
            if '-1-deploy' not in pod_obj.name and 'ocs-deviceset' not in pod_obj.name:
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj,
                        state=constants.STATUS_RUNNING,
                        timeout=200)
                except ResourceWrongStatusException:
                    # 'rook-ceph-crashcollector' on the failed node gets stuck
                    # in Pending state. BZ 1810014 tracks it.
                    # Ignoring 'rook-ceph-crashcollector' pod health check as
                    # WA and deleting its deployment so that the pod
                    # disappears. Will revert this WA once the BZ is fixed
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                        pod_name = pod_obj.name
                        deployment_name = '-'.join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")
                    else:
                        raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #15
    def test_node_replacement_reactive_aws_ipi(
        self, nodes, pvc_factory, pod_factory, dc_pod_factory,
        failure, interface
    ):
        """
        Knip-894 Node replacement - AWS-IPI-Reactive

        """
        # Get worker nodes
        initial_nodes = get_worker_nodes()

        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}"
        )

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "power off":
            # Power off AWS worker node instance
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
        elif failure == "network failure":
            # Induce Network failure
            node_network_failure([failure_node_obj[0].name])

        # Add annotation to the failed node
        annotation = "machine.openshift.io/exclude-node-draining=''"
        machine.add_annotation_to_machine(
            annotation=annotation, machine_name=machine_name
        )

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Wait for the new machine to spin
        log.info("Waiting for the new node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)

        # Get the node name of new spun node
        nodes_after_new_spun_node = get_worker_nodes()
        new_spun_node = list(
            set(nodes_after_new_spun_node) - set(initial_nodes)
        )
        log.info(f"New spun node is {new_spun_node}")

        # Label it
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(
            resource_name=new_spun_node[0],
            label=constants.OPERATOR_NODE_LABEL
        )
        log.info(
            f"Successfully labeled {new_spun_node} with OCS storage label"
        )

        # DC app pods on the failed node will get automatically created on other
        # running node. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(
            dc_pod_obj, timeout=1200
        )
        log.info("All the dc pods reached running state")

        # Check all OCS pods status, they should be in running state
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        for pod_obj in all_pod_obj:
            if '-1-deploy' not in pod_obj.name and 'ocs-deviceset' not in pod_obj.name:
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj, state=constants.STATUS_RUNNING,
                        timeout=1800
                    )
                except ResourceWrongStatusException:
                    # 'rook-ceph-crashcollector' on the failed node gets stuck
                    # in Pending state. BZ 1810014 tracks it.
                    # Ignoring 'rook-ceph-crashcollector' pod health check as
                    # WA and deleting its deployment so that the pod
                    # disappears. Will revert this WA once the BZ is fixed
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE
                        )
                        pod_name = pod_obj.name
                        deployment_name = '-'.join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #16
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        failure,
        dc_pod_factory,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key="dc",
                          label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == "rbd":
            interface = constants.CEPHBLOCKPOOL
        elif interface == "cephfs":
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        new_ocs_node_names = add_new_node_and_label_it(machineset_name)
        failure_domain = get_failure_domain()
        log.info("Wait for the nodes racks or zones to appear...")
        wait_for_nodes_racks_or_zones(failure_domain, new_ocs_node_names)

        new_ocs_node = get_node_objs(new_ocs_node_names)[0]
        osd_node_in_same_rack_or_zone = get_another_osd_node_in_same_rack_or_zone(
            failure_domain, new_ocs_node, common_nodes)
        # Get the failure node obj
        failure_node_obj = get_node_objs([osd_node_in_same_rack_or_zone.name])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on other
            # running node. Waiting for all dc app pod to reach running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods(timeout=300)

        except ResourceWrongStatusException:
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        if config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
            tries = 200
        else:
            tries = 40

        self.sanity_helpers.health_check(tries=tries)
Example #17
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
            self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory,
            interface):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)
        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on other
            # running node. Waiting for all dc app pod to reach running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods()

        except ResourceWrongStatusException:
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #18
    def test_node_replacement_reactive_aws_ipi(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        failure,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-894 Node replacement - AWS-IPI-Reactive

        """
        # Get worker nodes
        initial_nodes = get_worker_nodes()

        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key="dc",
                          label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == "rbd":
            interface = constants.CEPHBLOCKPOOL
        elif interface == "cephfs":
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "power off":
            # Power off AWS worker node instance
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully powered off node: {failure_node_obj[0].name}")
        elif failure == "network failure":
            # Induce Network failure
            node_network_failure([failure_node_obj[0].name])

        # Add annotation to the failed node
        annotation = "machine.openshift.io/exclude-node-draining=''"
        machine.add_annotation_to_machine(annotation=annotation,
                                          machine_name=machine_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Wait for the new machine to spin
        log.info("Waiting for the new node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)

        # Get the node name of new spun node
        nodes_after_new_spun_node = get_worker_nodes()
        new_spun_node = list(
            set(nodes_after_new_spun_node) - set(initial_nodes))
        log.info(f"New spun node is {new_spun_node}")

        # Label it
        node_obj = ocp.OCP(kind="node")
        node_obj.add_label(resource_name=new_spun_node[0],
                           label=constants.OPERATOR_NODE_LABEL)
        log.info(
            f"Successfully labeled {new_spun_node} with OCS storage label")

        # DC app pods on the failed node will get automatically created on other
        # running node. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                        timeout=1200)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #19
    def test_automated_recovery_from_stopped_node_and_start(
            self, nodes, additional_node):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI

        0) A - add new node, B - don't add new node
        1) Stop node
        2) Validate result:
             A - pods should respin on the new node
             B - pods should remain in Pending state on the stopped node
        3) Start node
        4) Validate result:
             A - pods should start on the new node
             B - pods should start on the stopped node after starting it
        """
        wnode_name = get_worker_nodes()[0]
        machine_name = machine.get_machine_from_node_name(wnode_name)
        self.machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        self.start_ready_replica_count = machine.get_ready_replica_count(
            self.machineset_name)

        if additional_node:
            new_ocs_node_names = add_new_node_and_label_it(
                self.machineset_name)
            failure_domain = get_failure_domain()
            log.info("Wait for the nodes racks or zones to appear...")
            wait_for_nodes_racks_or_zones(failure_domain, new_ocs_node_names)

            new_ocs_node = get_node_objs(new_ocs_node_names)[0]
            log.info(
                f"Successfully created a new OCS node '{new_ocs_node.name}'")
            self.extra_node = True
            log.info("Get another OSD node in the same rack or zone...")
            self.osd_worker_node = get_another_osd_node_in_same_rack_or_zone(
                failure_domain, new_ocs_node)
            assert (self.osd_worker_node
                    ), "Didn't find another osd node in the same rack or zone"
        else:
            osd_node_names = get_osd_running_nodes()
            self.osd_worker_node = get_node_objs(osd_node_names)[0]

        osd_pods = get_osd_pods()
        temp_osd = get_node_pods(self.osd_worker_node.name,
                                 pods_to_search=osd_pods)[0]
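        # Strip the trailing random suffix to get the OSD pod's base name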
        osd_real_name = "-".join(temp_osd.name.split("-")[:-1])

        nodes.stop_nodes([self.osd_worker_node], wait=True)
        log.info(f"Successfully powered off node: {self.osd_worker_node.name}")

        timeout = 420
        assert pod.wait_for_pods_to_be_in_statuses(
            [constants.STATUS_TERMINATING], [temp_osd.name], timeout=timeout
        ), (f"The pod {osd_real_name} didn't reach the status {constants.STATUS_TERMINATING} "
            f"after {timeout} seconds")

        # Validate that the OSD pod in Terminating state gets a new OSD pod in Pending state
        all_pod_obj = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        new_osd = None
        for pod_obj in all_pod_obj:
            if osd_real_name == "-".join(pod_obj.name.split("-")[:-1]) and (
                    temp_osd.name != pod_obj.name):
                new_osd = pod_obj
                break

        nodes.start_nodes(nodes=[self.osd_worker_node], wait=True)
        log.info(f"Successfully powered on node: {self.osd_worker_node.name}")
        wait_for_resource_state(new_osd, constants.STATUS_RUNNING, timeout=180)
        if additional_node:
            new_osd_node = get_pod_node(new_osd)
            assert (new_osd_node.name != self.osd_worker_node.name
                    ), "New OSD is expected to run on the new additional node"
Example #20
    def test_simultaneous_drain_of_two_ocs_nodes(
        self,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-2128/OCS-2129:
        - Create PVCs and start IO on DC based app pods
        - Add one extra node in two of the AZs and label the nodes
          with OCS storage label
        - Maintenance (mark as unscheduable and drain) 2 worker nodes
          simultaneously
        - Confirm that OCS and DC pods are in running state
        - Remove unscheduled nodes
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Check cluster and Ceph health

        """
        # Get OSD running nodes
        osd_running_worker_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_worker_nodes,
                          label_key="dc",
                          label_value="fedora")
        log.info("Successfully labeled worker nodes with {dc:fedora}")

        # Create DC app pods
        log.info("Creating DC based app pods and starting IO in background")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == "rbd" else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get the machine name using the node name
        machine_names = [
            machine.get_machine_from_node_name(osd_running_worker_node)
            for osd_running_worker_node in osd_running_worker_nodes[:2]
        ]
        log.info(f"{osd_running_worker_nodes} associated "
                 f"machine are {machine_names}")

        # Get the machineset name using machine name
        machineset_names = [
            machine.get_machineset_from_machine_name(machine_name)
            for machine_name in machine_names
        ]
        log.info(f"{osd_running_worker_nodes} associated machineset "
                 f"is {machineset_names}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_names[0])
        add_new_node_and_label_it(machineset_names[1])

        # Drain 2 nodes
        drain_nodes(osd_running_worker_nodes[:2])

        # Check the pods should be in running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            if ("-1-deploy" or "ocs-deviceset") not in pod_obj.name:
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj,
                        state=constants.STATUS_RUNNING,
                        timeout=200)
                except ResourceWrongStatusException:
                    # 'rook-ceph-crashcollector' on the failed node gets stuck
                    # in Pending state. BZ 1810014 tracks it.
                    # Ignoring 'rook-ceph-crashcollector' pod health check as
                    # WA and deleting its deployment so that the pod
                    # disappears. Will revert this WA once the BZ is fixed
                    if "rook-ceph-crashcollector" in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                        pod_name = pod_obj.name
                        deployment_name = "-".join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")

        # DC app pods on the drained node will get automatically created on other
        # running node in same AZ. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                        timeout=1200)
        log.info("All the dc pods reached running state")

        # Remove unscheduled nodes
        # In scenarios where the drain is attempted on >3 worker setup,
        # post completion of drain we are removing the unscheduled nodes so
        # that we maintain 3 worker nodes.
        log.info(f"Removing scheduled nodes {osd_running_worker_nodes[:2]}")
        remove_node_objs = get_node_objs(osd_running_worker_nodes[:2])
        remove_nodes(remove_node_objs)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #21
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, interface, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # other running node. Waiting for all dc app pod to reach running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #22
def osd_encryption_verification():
    """
    Verify that OSD encryption at rest is successfully deployed on OCS

    Raises:
        UnsupportedFeatureError: OCS version is smaller than 4.6
        EnvironmentError: The OSD is not encrypted

    """
    ocs_version = version.get_semantic_ocs_version_from_config()
    if ocs_version < version.VERSION_4_6:
        error_message = "Encryption at REST can be enabled only on OCS >= 4.6!"
        raise UnsupportedFeatureError(error_message)

    log.info("Get 'lsblk' command output on nodes where osd running")
    osd_node_names = get_osds_per_node()
    for worker_node in osd_node_names:
        lsblk_cmd = f"oc debug node/{worker_node} -- chroot /host lsblk"
        lsblk_out = run_cmd(lsblk_cmd)
        log.info(
            f"the output of lsblk command on node {worker_node} is:\n {lsblk_out}"
        )
        osd_node_names[worker_node].append(lsblk_out)

    log.info("Verify 'lsblk' command results are as expected")
    for worker_node in osd_node_names:
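        # The last element of each node's list is the appended lsblk output;
        # the remaining entries are the OSDs running on that node, hence '- 1'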
        osd_number_per_node = len(osd_node_names[worker_node]) - 1
        lsblk_output = osd_node_names[worker_node][-1]
        lsblk_output_split = lsblk_output.split()
        log.info(f"lsblk split:{lsblk_output_split}")
        log.info(f"osd_node_names dictionary: {osd_node_names}")
        log.info(f"count crypt {lsblk_output_split.count('crypt')}")
        log.info(f"osd_number_per_node = {osd_number_per_node}")
        if lsblk_output_split.count("crypt") != osd_number_per_node:
            log.error(
                f"The output of lsblk command on node {worker_node} is not as expected:\n{lsblk_output}"
            )
            raise ValueError("OSD is not encrypted")

    # skip OCS 4.8 as the fix for luks header info is still not available on it
    if ocs_version > version.VERSION_4_6 and ocs_version != version.VERSION_4_8:
        log.info("Verify luks header label for encrypted devices")
        worker_nodes = get_osd_running_nodes()
        failures = 0
        failure_message = ""
        node_obj = OCP(kind="node")
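        # Dump the LUKS header of each encrypted OSD device and check that a
        # label and subsystem are present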
        for node in worker_nodes:
            luks_devices = get_encrypted_osd_devices(node_obj, node)
            for luks_device_name in luks_devices:
                luks_device_name = luks_device_name.strip()
                log.info(
                    f"Checking luks header label on Luks device {luks_device_name} for node {node}"
                )
                cmd = "cryptsetup luksDump /dev/" + str(luks_device_name)
                cmd_out = node_obj.exec_oc_debug_cmd(node=node, cmd_list=[cmd])

                if "(no label)" in str(cmd_out) or "(no subsystem)" in str(
                        cmd_out):
                    failures += 1
                    failure_message += (
                        f"\nNo label found on Luks header information for node {node}\n"
                    )

        if failures != 0:
            log.error(failure_message)
            raise ValueError("Luks header label is not found")
        log.info("Luks header info found for all the encrypted osds")