Example #1
    def get_node_info(self, node_type="master"):
        """
        Get hardware information for the given node type and update the main
        environment dictionary.

        Args:
            node_type (str): the node type to collect data about;
                can be 'master' or 'worker' (default: 'master')

        """
        if node_type == "master":
            nodes = node.get_master_nodes()
        elif node_type == "worker":
            nodes = node.get_worker_nodes()
        else:
            log.warning(f"Node type ({node_type}) is invalid")
            return

        oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        self.environment[f"{node_type}_nodes_num"] = len(nodes)
        self.environment[f"{node_type}_nodes_cpu_num"] = oc_cmd.exec_oc_debug_cmd(
            node=nodes[0],
            cmd_list=["lscpu | grep '^CPU(s):' | awk '{print $NF}'"],
        ).rstrip()
        self.environment[f"{node_type}_nodes_memory"] = oc_cmd.exec_oc_debug_cmd(
            node=nodes[0], cmd_list=["free | grep Mem | awk '{print $2}'"]
        ).rstrip()
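As a minimal, self-contained sketch (the command output here is hypothetical), this is what the two shell pipelines passed to exec_oc_debug_cmd above extract:

# Hypothetical one-line outputs of the commands run on the node
lscpu_out = "CPU(s):              16"
cpu_num = lscpu_out.split()[-1]           # awk '{print $NF}' equivalent -> "16"

free_mem_line = "Mem:       65808380     1234567    60000000"
total_mem_kib = free_mem_line.split()[1]  # awk '{print $2}' equivalent -> "65808380"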
Example #2
    def stop_baremetal_machines(self, baremetal_machine, force=True):
        """
        Stop Baremetal Machines

        Args:
            baremetal_machine (list): BM objects
            force (bool): True for BM ungraceful power off, False for
                graceful BM shutdown

        Raises:
            UnexpectedBehaviour: If baremetal machine is still up

        """
        for node in baremetal_machine:
            if force:
                if self.mgmt_details[node.name]:
                    ipmi_ctx = self.get_ipmi_ctx(
                        host=self.mgmt_details[node.name]["mgmt_console"],
                        user=self.mgmt_details[node.name]["mgmt_username"],
                        password=self.mgmt_details[node.name]["mgmt_password"],
                    )
                    logger.info(f"Powering Off {node.name}")
                    ipmi_ctx.chassis_control_power_down()
            else:
                ocp = OCP(kind="node")
                ocp.exec_oc_debug_cmd(
                    node=node.name, cmd_list=["shutdown now"], timeout=60
                )
                if self.mgmt_details[node.name]:
                    ipmi_ctx = self.get_ipmi_ctx(
                        host=self.mgmt_details[node.name]["mgmt_console"],
                        user=self.mgmt_details[node.name]["mgmt_username"],
                        password=self.mgmt_details[node.name]["mgmt_password"],
                    )
                    for status in TimeoutSampler(
                        600, 5, self.get_power_status, ipmi_ctx
                    ):
                        logger.info(
                            f"Waiting for Baremetal Machine {node.name} to power off"
                            f"Current Baremetal status: {status}"
                        )
                        if status == VM_POWERED_OFF:
                            logger.info(
                                f"Baremetal Machine {node.name} reached poweredOff status"
                            )
                            break
        logger.info("Verifing machine is down")
        ret = TimeoutSampler(
            timeout=300,
            sleep=3,
            func=self.verify_machine_is_down,
            node=node,
        )
        logger.info(ret)
        if not ret.wait_for_func_status(result=True):
            raise UnexpectedBehaviour(f"Machine {node.name} is still running")
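Both waits above go through ocs-ci's TimeoutSampler. As a rough, self-contained sketch of the polling pattern it provides (names and defaults here are illustrative, not the library API):

import time

def poll_until(func, expected, timeout=600, sleep=5):
    """Call func() every `sleep` seconds until it returns `expected`
    or `timeout` seconds elapse. Returns True on success, False on timeout."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if func() == expected:
            return True
        time.sleep(sleep)
    return False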
Example #3
def get_node_logs(node_name):
    """
    Get logs from a given node

    Args:
        node_name (str): Name of the node

    Returns:
        str: Output of 'dmesg' run on node
    """
    node = OCP(kind="node")
    return node.exec_oc_debug_cmd(node_name, ["dmesg"])
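A hypothetical usage of this helper (assuming get_worker_nodes() from the same framework, as in Example #1) is to dump dmesg from every worker node to a local file:

for node_name in get_worker_nodes():
    with open(f"dmesg_{node_name}.log", "w") as log_file:
        log_file.write(get_node_logs(node_name))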
Example #4
def osd_encryption_verification():
    """
    Verify that OSD encryption at rest is successfully deployed on OCS

    Raises:
        UnsupportedFeatureError: OCS version is lower than 4.6
        ValueError: The OSD is not encrypted or the LUKS header label is missing

    """
    ocs_version = version.get_semantic_ocs_version_from_config()
    if ocs_version < version.VERSION_4_6:
        error_message = "Encryption at REST can be enabled only on OCS >= 4.6!"
        raise UnsupportedFeatureError(error_message)

    log.info("Get 'lsblk' command output on nodes where osd running")
    osd_node_names = get_osds_per_node()
    for worker_node in osd_node_names:
        lsblk_cmd = f"oc debug node/{worker_node} -- chroot /host lsblk"
        lsblk_out = run_cmd(lsblk_cmd)
        log.info(
            f"the output of lsblk command on node {worker_node} is:\n {lsblk_out}"
        )
        osd_node_names[worker_node].append(lsblk_out)

    log.info("Verify 'lsblk' command results are as expected")
    for worker_node in osd_node_names:
        osd_number_per_node = len(osd_node_names[worker_node]) - 1
        lsblk_output = osd_node_names[worker_node][-1]
        lsblk_output_split = lsblk_output.split()
        log.info(f"lsblk split:{lsblk_output_split}")
        log.info(f"osd_node_names dictionary: {osd_node_names}")
        log.info(f"count crypt {lsblk_output_split.count('crypt')}")
        log.info(f"osd_number_per_node = {osd_number_per_node}")
        if lsblk_output_split.count("crypt") != osd_number_per_node:
            log.error(
                f"The output of lsblk command on node {worker_node} is not as expected:\n{lsblk_output}"
            )
            raise ValueError("OSD is not encrypted")

    # skip OCS 4.8 as the fix for luks header info is still not available on it
    if ocs_version > version.VERSION_4_6 and ocs_version != version.VERSION_4_8:
        log.info("Verify luks header label for encrypted devices")
        worker_nodes = get_osd_running_nodes()
        failures = 0
        failure_message = ""
        node_obj = OCP(kind="node")
        for node in worker_nodes:
            luks_devices = get_encrypted_osd_devices(node_obj, node)
            for luks_device_name in luks_devices:
                luks_device_name = luks_device_name.strip()
                log.info(
                    f"Checking luks header label on Luks device {luks_device_name} for node {node}"
                )
                cmd = "cryptsetup luksDump /dev/" + str(luks_device_name)
                cmd_out = node_obj.exec_oc_debug_cmd(node=node, cmd_list=[cmd])

                if "(no label)" in str(cmd_out) or "(no subsystem)" in str(
                        cmd_out):
                    failures += 1
                    failure_message += (
                        f"\nNo label found on Luks header information for node {node}\n"
                    )

        if failures != 0:
            log.error(failure_message)
            raise ValueError("Luks header label is not found")
        log.info("Luks header info found for all the encrypted osds")
Example #5
    def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
        """
        1. Delete one mon service
        2. Edit the configmap rook-ceph-endpoints
           remove all the deleted mon services entries
        3. Delete deployment, pvc of deleted mon service
        4. Restart rook-ceph-operator
        5. Make sure all mon pods are running
        6. Make sure ceph health Ok and storage pods are running
        7. Sleep for 300 seconds before deleting another mon
        8. Repeat above steps for all mons and at the
           end each mon should contain different endpoints
        9. Create PVC; creation should succeed.

        """

        pod_obj = pod_factory(interface=interface)
        run_io_in_bg(pod_obj)

        # Get all mon services
        mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()
        mon_count = len(mon_pods)

        list_old_svc = []
        for svc in mon_svc:

            # Get rook-ceph-operator pod obj
            operator_pod_obj = get_operator_pods()
            operator_name = operator_pod_obj[0].name

            # Scale down rook-ceph-operator
            log.info("Scale down rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=0
            ), "Failed to scale down rook-ceph-operator to 0"
            log.info("Successfully scaled down rook-ceph-operator to 0")

            # Validate rook-ceph-operator pod not running
            POD_OBJ.wait_for_delete(resource_name=operator_name)

            svc_name = svc["metadata"]["name"]
            cluster_ip = svc["spec"]["clusterIP"]
            port = svc["spec"]["ports"][0]["port"]
            mon_endpoint = f"{cluster_ip}:{port}"
            mon_id = svc["spec"]["selector"]["mon"]
            list_old_svc.append(cluster_ip)

            # Delete deployment
            log.info("Delete mon deployments")
            del_obj = OCP(
                kind=constants.DEPLOYMENT,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            mon_info = del_obj.get(resource_name=svc_name)
            del_obj.delete(resource_name=svc_name)

            # Delete pvc
            if is_lso_cluster():
                mon_data_path = f"/var/lib/rook/mon-{mon_id}"
                mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                    "kubernetes.io/hostname"
                ]
                log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
                cmd = f"rm -rf {mon_data_path}"
                ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
                ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
            else:
                log.info("Delete mon PVC")
                pvc_name = svc["metadata"]["labels"]["pvc_name"]
                pvc_obj = OCP(
                    kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
                )
                pvc_obj.delete(resource_name=pvc_name)

            # Delete the mon service
            log.info("Delete mon service")
            svc_obj = OCP(
                kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            svc_obj.delete(resource_name=svc_name)

            # Edit the cm
            log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
            configmap_obj = OCP(
                kind=constants.CONFIGMAP,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            output_get = configmap_obj.get(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
            )
            new_data = output_get["data"]
            new_data["csi-cluster-config-json"] = (
                new_data["csi-cluster-config-json"].replace(f'"{mon_endpoint}",', "")
                if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != 1
                else new_data["csi-cluster-config-json"].replace(
                    f',"{mon_endpoint}"', ""
                )
            )
            new_data["data"] = ",".join(
                [
                    value
                    for value in new_data["data"].split(",")
                    if f"{mon_id}=" not in value
                ]
            )
            new_data["mapping"] = (
                new_data["mapping"].replace(f'"{mon_id}":null,', "")
                if new_data["mapping"].find(f'"{mon_id}":null,') != -1
                else new_data["mapping"].replace(f',"{mon_id}":null', "")
            )
            params = f'{{"data": {json.dumps(new_data)}}}'
            log.info(f"Removing {mon_id} entries from configmap")
            configmap_obj.patch(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
                params=params,
                format_type="strategic",
            )
            log.info(
                f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
            )

            # Scale up rook-ceph-operator
            log.info("Scale up rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=1
            ), "Failed to scale up rook-ceph-operator to 1"
            log.info("Successfully scaled up rook-ceph-operator to 1")
            log.info("Validate rook-ceph-operator pod is running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.OPERATOR_LABEL,
                resource_count=1,
                timeout=600,
                sleep=5,
            )

            # Validate all mons are running
            log.info("Validate all mons are up and running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MON_APP_LABEL,
                resource_count=mon_count,
                timeout=1200,
                sleep=5,
            )
            log.info("All mons are up and running")

            # Check the ceph health OK
            ceph_health_check(tries=90, delay=15)

            # Validate all storage pods are running
            wait_for_storage_pods()

            # Sleep for some seconds before deleting another mon
            sleep_time = 300
            log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
            time.sleep(sleep_time)

        # Check the endpoints are different
        log.info("Validate the mon endpoints are changed")
        new_mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        list_new_svc = []
        for new_svc in new_mon_svc:
            cluster_ip = new_svc["spec"]["clusterIP"]
            list_new_svc.append(cluster_ip)
        diff = set(list_new_svc) ^ set(list_old_svc)
        assert len(diff) == len(list_old_svc + list_new_svc), (
            f"Not all endpoints are changed. Set of old "
            f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
        )
        log.info(f"All new mon endpoints are created {list_new_svc}")

        # Create PVC and pods
        log.info(f"Create {interface} PVC")
        pod_obj = pod_factory(interface=interface)
        pod_obj.run_io(storage_type="fs", size="500M")
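The endpoint-change assertion above relies on a set symmetric difference: if every old mon ClusterIP was replaced, the symmetric difference of the two sets contains every element of both lists. A minimal illustration with made-up IPs:

list_old_svc = ["172.30.0.1", "172.30.0.2", "172.30.0.3"]  # hypothetical old mon ClusterIPs
list_new_svc = ["172.30.0.4", "172.30.0.5", "172.30.0.6"]  # hypothetical new mon ClusterIPs
diff = set(list_new_svc) ^ set(list_old_svc)
assert len(diff) == len(list_old_svc + list_new_svc)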
Example #6
class TestMonDataAvailWarn(E2ETest):
    """
    Testing MON disk low threshold.
    Ceph health enters 'HEALTH_WARN' state once mon disk usage reaches >= 85%

    """

    mon_pod = None
    worker_node = None
    oc_cmd = None
    mon_suffix = None
    workloads_dir = None
    dd_seek_count = 0

    @pytest.fixture()
    def workloads_dir_setup(self, request):
        """
        Setting up the environment for the test

        """
        if config.DEPLOYMENT.get("local_storage"):
            self.worker_node = node.get_worker_nodes()[0]
            self.oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
            mon_pod_name = self.oc_cmd.exec_oc_debug_cmd(
                node=self.worker_node,
                cmd_list=["ls /var/lib/rook/ | grep mon"],
            )
            mon_pod_id = mon_pod_name.split("-")[1].replace("\n", "")

            mon_pods_info = pod.get_pods_having_label(
                label=f"ceph_daemon_id={mon_pod_id}",
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            )
            self.mon_pod = pod.get_pod_obj(
                name=mon_pods_info[0]["metadata"]["name"],
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            )
        else:
            self.mon_pod = random.choice(pod.get_mon_pods())
        self.mon_suffix = self.mon_pod.get().get("metadata").get("labels").get("mon")

        self.workloads_dir = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}/workloads"
        log.info(f"Selected mon '{self.mon_pod.name}'")
        self.mon_pod.exec_cmd_on_pod(f"mkdir {self.workloads_dir}")
        self.mon_pod.exec_cmd_on_pod(f"touch {self.workloads_dir}/{TEMP_FILE}")

        def finalizer():
            self.mon_pod.exec_cmd_on_pod(f"rm -rf {self.workloads_dir}")
            time.sleep(SLEEP_TIMEOUT)
            utils.ceph_health_check()

        request.addfinalizer(finalizer)

    def get_used_percentage(self):
        """
        Get used percentage on /var/lib/ceph/mon/ceph-[a/b/c]

        Returns:
            int: Used space percentage

        """
        path = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}"
        if config.DEPLOYMENT.get("local_storage"):
            path = "/etc/hosts"
        cmd = f"df -Th | grep {path}"
        mount_details = self.mon_pod.exec_sh_cmd_on_pod(command=cmd, sh="sh")
        used_percent = mount_details.split()[5].replace("%", "")
        return int(used_percent)

    def exec_dd_cmd(self):
        """
        Append 1G to tmp file using dd command
        """
        of_path = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}/workloads"
        if config.DEPLOYMENT.get("local_storage"):
            of_path = f"/var/lib/rook/mon-{self.mon_suffix}/data/workloads"

        write_cmd = f"dd if=/dev/urandom of={of_path}/{TEMP_FILE} "

        write_cmd += f"bs={DD_BLOCK_SIZE}M count={DD_COUNT} "
        write_cmd += f"seek={self.dd_seek_count * DD_BLOCK_SIZE * DD_COUNT}"

        if config.DEPLOYMENT.get("local_storage"):
            self.oc_cmd.exec_oc_debug_cmd(
                node=self.worker_node,
                cmd_list=[write_cmd],
            )
        else:
            self.mon_pod.exec_sh_cmd_on_pod(command=write_cmd, sh="sh")
        self.dd_seek_count += 1

    @pytest.mark.usefixtures(workloads_dir_setup.__name__)
    def test_mon_data_avail_warn(self):
        """
        Test mon disk threshold

        Steps:
          - Write 1G at a time to the temp file using dd until usage reaches >= 85%
          - Check ceph health once usage is at 80% or above
          - From 85% and above, ceph health status should be
            'HEALTH_WARN' with a warning message about low available space
        """

        used_percent = self.get_used_percentage()
        log.info(f"Used percentage on {self.workloads_dir}: {used_percent}%")

        should_keep_writing = True
        while should_keep_writing:
            self.exec_dd_cmd()
            used_percent = self.get_used_percentage()
            log.info(f"Used percentage on {self.workloads_dir}: {used_percent}%")
            if used_percent >= 80:
                time.sleep(SLEEP_TIMEOUT)
                if used_percent >= 85:
                    time.sleep(SLEEP_TIMEOUT)
                    ceph_status = CephCluster().get_ceph_health()
                    log.info(f"Ceph status is: {ceph_status}")
                    assert run_cmd_verify_cli_output(
                        cmd="ceph health detail",
                        expected_output_lst={
                            "HEALTH_WARN", "low on available space"
                        },
                        cephtool_cmd=True,
                    ), "Ceph status should be HEALTH_WARN containing 'low on available space'"
                    should_keep_writing = False
                else:
                    utils.ceph_health_check()
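For reference, the "df -Th" parsing in get_used_percentage() amounts to splitting the matched line on whitespace and reading the sixth column; a self-contained sketch with a hypothetical df line:

df_line = "/dev/sdb2  xfs  50G  43G  7.0G  86%  /var/lib/ceph/mon/ceph-a"
used_percent = int(df_line.split()[5].replace("%", ""))
assert used_percent == 86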