def get_node_info(self, node_type="master"):
    """
    Get hardware information for nodes of the given type and update the
    main environment dictionary.

    Args:
        node_type (str): The node type to collect data about, can be:
            master / worker - the default is master

    """
    if node_type == "master":
        nodes = node.get_master_nodes()
    elif node_type == "worker":
        nodes = node.get_worker_nodes()
    else:
        log.warning(f"Node type ({node_type}) is invalid")
        return

    oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    self.environment[f"{node_type}_nodes_num"] = len(nodes)
    self.environment[f"{node_type}_nodes_cpu_num"] = oc_cmd.exec_oc_debug_cmd(
        node=nodes[0],
        cmd_list=["lscpu | grep '^CPU(s):' | awk '{print $NF}'"],
    ).rstrip()
    self.environment[f"{node_type}_nodes_memory"] = oc_cmd.exec_oc_debug_cmd(
        node=nodes[0], cmd_list=["free | grep Mem | awk '{print $2}'"]
    ).rstrip()
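# For reference, a minimal local sketch of the two pipelines the debug commands
# run on the node. The subprocess wrapper below is purely illustrative (an
# assumption for this example); the code above executes the same pipelines on a
# cluster node through `oc debug`.
import subprocess


def local_node_hw_info():
    """Illustrative only: run the same lscpu/free pipelines on the local host."""
    cpu_num = subprocess.run(
        "lscpu | grep '^CPU(s):' | awk '{print $NF}'",
        shell=True, capture_output=True, text=True,
    ).stdout.rstrip()
    # `free` prints KiB by default; the second field of the 'Mem:' row is total memory.
    memory = subprocess.run(
        "free | grep Mem | awk '{print $2}'",
        shell=True, capture_output=True, text=True,
    ).stdout.rstrip()
    return {"cpu_num": cpu_num, "memory": memory}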
def stop_baremetal_machines(self, baremetal_machine, force=True):
    """
    Stop Baremetal Machines

    Args:
        baremetal_machine (list): BM objects
        force (bool): True for BM ungraceful power off, False for
            graceful BM shutdown

    Raises:
        UnexpectedBehaviour: If baremetal machine is still up

    """
    for node in baremetal_machine:
        if force:
            if self.mgmt_details[node.name]:
                ipmi_ctx = self.get_ipmi_ctx(
                    host=self.mgmt_details[node.name]["mgmt_console"],
                    user=self.mgmt_details[node.name]["mgmt_username"],
                    password=self.mgmt_details[node.name]["mgmt_password"],
                )
                logger.info(f"Powering Off {node.name}")
                ipmi_ctx.chassis_control_power_down()
        else:
            ocp = OCP(kind="node")
            ocp.exec_oc_debug_cmd(
                node=node.name, cmd_list=["shutdown now"], timeout=60
            )
            if self.mgmt_details[node.name]:
                ipmi_ctx = self.get_ipmi_ctx(
                    host=self.mgmt_details[node.name]["mgmt_console"],
                    user=self.mgmt_details[node.name]["mgmt_username"],
                    password=self.mgmt_details[node.name]["mgmt_password"],
                )
                for status in TimeoutSampler(
                    600, 5, self.get_power_status, ipmi_ctx
                ):
                    logger.info(
                        f"Waiting for Baremetal Machine {node.name} to power off. "
                        f"Current Baremetal status: {status}"
                    )
                    if status == VM_POWERED_OFF:
                        logger.info(
                            f"Baremetal Machine {node.name} reached poweredOff status"
                        )
                        break

        logger.info("Verifying machine is down")
        ret = TimeoutSampler(
            timeout=300,
            sleep=3,
            func=self.verify_machine_is_down,
            node=node,
        )
        logger.info(ret)
        if not ret.wait_for_func_status(result=True):
            raise UnexpectedBehaviour(f"Machine {node.name} is still Running")
def get_node_logs(node_name):
    """
    Get logs from a given node

    Args:
        node_name (str): Name of the node

    Returns:
        str: Output of 'dmesg' run on the node

    """
    node = OCP(kind="node")
    return node.exec_oc_debug_cmd(node_name, ["dmesg"])
def osd_encryption_verification():
    """
    Verify that OSD encryption at rest is successfully deployed on OCS

    Raises:
        UnsupportedFeatureError: OCS version is smaller than 4.6
        EnvironmentError: The OSD is not encrypted

    """
    ocs_version = version.get_semantic_ocs_version_from_config()
    if ocs_version < version.VERSION_4_6:
        error_message = "Encryption at REST can be enabled only on OCS >= 4.6!"
        raise UnsupportedFeatureError(error_message)

    log.info("Get 'lsblk' command output on nodes where osd running")
    osd_node_names = get_osds_per_node()
    for worker_node in osd_node_names:
        lsblk_cmd = f"oc debug node/{worker_node} -- chroot /host lsblk"
        lsblk_out = run_cmd(lsblk_cmd)
        log.info(
            f"the output of lsblk command on node {worker_node} is:\n {lsblk_out}"
        )
        osd_node_names[worker_node].append(lsblk_out)

    log.info("Verify 'lsblk' command results are as expected")
    for worker_node in osd_node_names:
        osd_number_per_node = len(osd_node_names[worker_node]) - 1
        lsblk_output = osd_node_names[worker_node][-1]
        lsblk_output_split = lsblk_output.split()
        log.info(f"lsblk split: {lsblk_output_split}")
        log.info(f"osd_node_names dictionary: {osd_node_names}")
        log.info(f"count crypt: {lsblk_output_split.count('crypt')}")
        log.info(f"osd_number_per_node = {osd_number_per_node}")
        if lsblk_output_split.count("crypt") != osd_number_per_node:
            log.error(
                f"The output of lsblk command on node {worker_node} is not as "
                f"expected:\n{lsblk_output}"
            )
            raise ValueError("OSD is not encrypted")

    # Skip OCS 4.8 as the fix for luks header info is still not available on it
    if ocs_version > version.VERSION_4_6 and ocs_version != version.VERSION_4_8:
        log.info("Verify luks header label for encrypted devices")
        worker_nodes = get_osd_running_nodes()
        failures = 0
        failure_message = ""
        node_obj = OCP(kind="node")
        for node in worker_nodes:
            luks_devices = get_encrypted_osd_devices(node_obj, node)
            for luks_device_name in luks_devices:
                luks_device_name = luks_device_name.strip()
                log.info(
                    f"Checking luks header label on Luks device {luks_device_name} "
                    f"for node {node}"
                )
                cmd = "cryptsetup luksDump /dev/" + str(luks_device_name)
                cmd_out = node_obj.exec_oc_debug_cmd(node=node, cmd_list=[cmd])

                if "(no label)" in str(cmd_out) or "(no subsystem)" in str(cmd_out):
                    failures += 1
                    failure_message += (
                        f"\nNo label found on Luks header information for node {node}\n"
                    )
        if failures != 0:
            log.error(failure_message)
            raise ValueError("Luks header label is not found")
        log.info("Luks header info found for all the encrypted osds")
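# To illustrate what the lsblk check above looks for, here is a self-contained
# sketch with a made-up `lsblk` output for a node running two encrypted OSDs
# (device names and tree layout are hypothetical).
sample_lsblk = """
NAME                      TYPE   SIZE  MOUNTPOINT
sdb                       disk   512G
`-ocs-deviceset-0-block   crypt  512G
sdc                       disk   512G
`-ocs-deviceset-1-block   crypt  512G
"""

osd_number_per_node = 2
# Each encrypted OSD appears as a device of TYPE 'crypt', so the number of
# 'crypt' tokens in the whitespace-split output should equal the OSD count.
assert sample_lsblk.split().count("crypt") == osd_number_per_node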
def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
    """
    1. Delete one mon service
    2. Edit the configmap rook-ceph-mon-endpoints and remove the entries
       of the deleted mon service
    3. Delete deployment, pvc of deleted mon service
    4. Restart rook-ceph-operator
    5. Make sure all mon pods are running
    6. Make sure ceph health Ok and storage pods are running
    7. Sleep for 300 seconds before deleting another mon
    8. Repeat above steps for all mons and at the end each mon should
       contain different endpoints
    9. Create PVC, which should succeed

    """
    pod_obj = pod_factory(interface=interface)
    run_io_in_bg(pod_obj)

    # Get all mon services
    mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )

    # Get all mon pods
    mon_pods = get_mon_pods()
    mon_count = len(mon_pods)

    list_old_svc = []
    for svc in mon_svc:

        # Get rook-ceph-operator pod obj
        operator_pod_obj = get_operator_pods()
        operator_name = operator_pod_obj[0].name

        # Scale down rook-ceph-operator
        log.info("Scale down rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=0
        ), "Failed to scale down rook-ceph-operator to 0"
        log.info("Successfully scaled down rook-ceph-operator to 0")

        # Validate rook-ceph-operator pod not running
        POD_OBJ.wait_for_delete(resource_name=operator_name)

        svc_name = svc["metadata"]["name"]
        cluster_ip = svc["spec"]["clusterIP"]
        port = svc["spec"]["ports"][0]["port"]
        mon_endpoint = f"{cluster_ip}:{port}"
        mon_id = svc["spec"]["selector"]["mon"]
        list_old_svc.append(cluster_ip)

        # Delete deployment
        log.info("Delete mon deployments")
        del_obj = OCP(
            kind=constants.DEPLOYMENT,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_info = del_obj.get(resource_name=svc_name)
        del_obj.delete(resource_name=svc_name)

        # Delete pvc
        if is_lso_cluster():
            mon_data_path = f"/var/lib/rook/mon-{mon_id}"
            mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                "kubernetes.io/hostname"
            ]
            log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
            cmd = f"rm -rf {mon_data_path}"
            ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
            ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
        else:
            log.info("Delete mon PVC")
            pvc_name = svc["metadata"]["labels"]["pvc_name"]
            pvc_obj = OCP(
                kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            pvc_obj.delete(resource_name=pvc_name)

        # Delete the mon service
        log.info("Delete mon service")
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        svc_obj.delete(resource_name=svc_name)

        # Edit the cm
        log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
        configmap_obj = OCP(
            kind=constants.CONFIGMAP,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        output_get = configmap_obj.get(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
        )
        new_data = output_get["data"]
        new_data["csi-cluster-config-json"] = (
            new_data["csi-cluster-config-json"].replace(f'"{mon_endpoint}",', "")
            if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != -1
            else new_data["csi-cluster-config-json"].replace(
                f',"{mon_endpoint}"', ""
            )
        )
        new_data["data"] = ",".join(
            [
                value
                for value in new_data["data"].split(",")
                if f"{mon_id}=" not in value
            ]
        )
        new_data["mapping"] = (
            new_data["mapping"].replace(f'"{mon_id}":null,', "")
            if new_data["mapping"].find(f'"{mon_id}":null,') != -1
            else new_data["mapping"].replace(f',"{mon_id}":null', "")
        )
        params = f'{{"data": {json.dumps(new_data)}}}'
        log.info(f"Removing {mon_id} entries from configmap")
        configmap_obj.patch(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
            params=params,
            format_type="strategic",
        )
        log.info(
            f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
        )

        # Scale up rook-ceph-operator
        log.info("Scale up rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=1
        ), "Failed to scale up rook-ceph-operator to 1"
        log.info("Successfully scaled up rook-ceph-operator to 1")
        log.info("Validate rook-ceph-operator pod is running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.OPERATOR_LABEL,
            resource_count=1,
            timeout=600,
            sleep=5,
        )

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=mon_count,
            timeout=1200,
            sleep=5,
        )
        log.info("All mons are up and running")

        # Check the ceph health OK
        ceph_health_check(tries=90, delay=15)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Sleep for some seconds before deleting another mon
        sleep_time = 300
        log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
        time.sleep(sleep_time)

    # Check the endpoints are different
    log.info("Validate the mon endpoints are changed")
    new_mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    list_new_svc = []
    for new_svc in new_mon_svc:
        cluster_ip = new_svc["spec"]["clusterIP"]
        list_new_svc.append(cluster_ip)
    diff = set(list_new_svc) ^ set(list_old_svc)
    assert len(diff) == len(list_old_svc + list_new_svc), (
        f"Not all endpoints are changed. Set of old "
        f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
    )
    log.info(f"All new mon endpoints are created {list_new_svc}")

    # Create PVC and pods
    log.info(f"Create {interface} PVC")
    pod_obj = pod_factory(interface=interface)
    pod_obj.run_io(storage_type="fs", size="500M")
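# The configmap edit above uses plain string surgery rather than JSON parsing:
# the endpoint is removed together with one adjacent comma, and the branch picks
# the trailing or leading comma depending on whether the endpoint is the last
# element. A standalone sketch of the same idea, with made-up endpoints:
def drop_mon_endpoint(csi_config, mon_endpoint):
    """Remove a mon endpoint (plus one adjacent comma) from the csi config string."""
    if csi_config.find(f'"{mon_endpoint}",') != -1:
        # Not the last element: drop it together with the trailing comma.
        return csi_config.replace(f'"{mon_endpoint}",', "")
    # Last element: drop it together with the leading comma.
    return csi_config.replace(f',"{mon_endpoint}"', "")


sample = '["10.0.0.1:6789","10.0.0.2:6789","10.0.0.3:6789"]'
assert drop_mon_endpoint(sample, "10.0.0.2:6789") == '["10.0.0.1:6789","10.0.0.3:6789"]'
assert drop_mon_endpoint(sample, "10.0.0.3:6789") == '["10.0.0.1:6789","10.0.0.2:6789"]'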
class TestMonDataAvailWarn(E2ETest):
    """
    Testing MON disk low threshold.
    Ceph health enters 'HEALTH_WARN' state once mon disk usage reaches >= 85%
    """

    mon_pod = None
    worker_node = None
    oc_cmd = None
    mon_suffix = None
    workloads_dir = None
    dd_seek_count = 0

    @pytest.fixture()
    def workloads_dir_setup(self, request):
        """
        Setting up the environment for the test
        """
        if config.DEPLOYMENT.get("local_storage"):
            self.worker_node = node.get_worker_nodes()[0]
            self.oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
            mon_pod_name = self.oc_cmd.exec_oc_debug_cmd(
                node=self.worker_node,
                cmd_list=["ls /var/lib/rook/ | grep mon"],
            )
            mon_pod_id = mon_pod_name.split("-")[1].replace("\n", "")

            mon_pods_info = pod.get_pods_having_label(
                label=f"ceph_daemon_id={mon_pod_id}",
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            )
            self.mon_pod = pod.get_pod_obj(
                name=mon_pods_info[0]["metadata"]["name"],
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            )
        else:
            self.mon_pod = random.choice(pod.get_mon_pods())
        self.mon_suffix = self.mon_pod.get().get("metadata").get("labels").get("mon")

        self.workloads_dir = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}/workloads"
        log.info(f"Selected mon '{self.mon_pod.name}'")
        self.mon_pod.exec_cmd_on_pod(f"mkdir {self.workloads_dir}")
        self.mon_pod.exec_cmd_on_pod(f"touch {self.workloads_dir}/{TEMP_FILE}")

        def finalizer():
            self.mon_pod.exec_cmd_on_pod(f"rm -rf {self.workloads_dir}")
            time.sleep(SLEEP_TIMEOUT)
            utils.ceph_health_check()

        request.addfinalizer(finalizer)

    def get_used_percentage(self):
        """
        Get used percentage on /var/lib/ceph/mon/ceph-[a/b/c]

        Returns:
            int: Used space percentage
        """
        path = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}"
        if config.DEPLOYMENT.get("local_storage"):
            path = "/etc/hosts"
        cmd = f"df -Th | grep {path}"
        mount_details = self.mon_pod.exec_sh_cmd_on_pod(command=cmd, sh="sh")
        used_percent = mount_details.split()[5].replace("%", "")
        return int(used_percent)

    def exec_dd_cmd(self):
        """
        Append 1G to tmp file using dd command
        """
        of_path = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}/workloads"
        if config.DEPLOYMENT.get("local_storage"):
            of_path = f"/var/lib/rook/mon-{self.mon_suffix}/data/workloads"

        write_cmd = f"dd if=/dev/urandom of={of_path}/{TEMP_FILE} "
        write_cmd += f"bs={DD_BLOCK_SIZE}M count={DD_COUNT} "
        write_cmd += f"seek={self.dd_seek_count * DD_BLOCK_SIZE * DD_COUNT}"

        if config.DEPLOYMENT.get("local_storage"):
            self.oc_cmd.exec_oc_debug_cmd(
                node=self.worker_node,
                cmd_list=[write_cmd],
            )
        else:
            self.mon_pod.exec_sh_cmd_on_pod(command=write_cmd, sh="sh")
        self.dd_seek_count += 1

    @pytest.mark.usefixtures(workloads_dir_setup.__name__)
    def test_mon_data_avail_warn(self):
        """
        Test mon disk threshold

        Steps:
          - Write to temp file using dd until usage reaches >= 85% (1G each)
          - Check ceph health from 80% and above
          - From 85% and above, ceph health status should be 'HEALTH_WARN'
            with warning message regarding low space
        """
        used_percent = self.get_used_percentage()
        log.info(f"Used percentage on {self.workloads_dir}: {used_percent}%")

        should_keep_writing = True
        while should_keep_writing:
            self.exec_dd_cmd()
            used_percent = self.get_used_percentage()
            log.info(f"Used percentage on {self.workloads_dir}: {used_percent}%")
            if used_percent >= 80:
                time.sleep(SLEEP_TIMEOUT)
                if used_percent >= 85:
                    time.sleep(SLEEP_TIMEOUT)
                    ceph_status = CephCluster().get_ceph_health()
                    log.info(f"Ceph status is: {ceph_status}")
                    assert run_cmd_verify_cli_output(
                        cmd="ceph health detail",
                        expected_output_lst={"HEALTH_WARN", "low on available space"},
                        cephtool_cmd=True,
                    ), "Ceph status should be HEALTH_WARN containing 'low on available space'"
                    should_keep_writing = False
                else:
                    utils.ceph_health_check()
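# For reference, get_used_percentage() above parses the Use% column of `df -Th`.
# A minimal sketch with a made-up df line showing why field index 5 is used:
# Columns: Filesystem  Type  Size  Used  Avail  Use%  Mounted on
sample_df_line = "/dev/rbd0  ext4  10G  8.5G  1.5G  85%  /var/lib/ceph/mon/ceph-a"

# After a whitespace split, index 5 is the Use% column.
used_percent = int(sample_df_line.split()[5].replace("%", ""))
assert used_percent == 85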