def finalizer():
    """
    Teardown: remove the "dc" label from the worker nodes, wait for the
    background threads to finish and check the ceph health.

    ``self`` is not a parameter here; it is resolved from the enclosing scope.
    """
    # Removing created label on all worker nodes
    remove_label_from_worker_node(get_worker_nodes(), label_key="dc")

    # Block until every thread stored on the test instance has finished
    for running_thread in self.threads:
        running_thread.join()

    ceph_health_check()
def finalizer():
    """
    Teardown: remove the "dc" label from the worker nodes and, when
    encryption at rest is enabled in ENV_DATA, verify the OSD encryption.
    """
    # Removing created label on all worker nodes
    remove_label_from_worker_node(get_worker_nodes(), label_key="dc")

    # Verify OSD encrypted
    encryption_enabled = config.ENV_DATA.get("encryption_at_rest")
    if encryption_enabled:
        osd_encryption_verification()
def get_node_info(self, node_type="master"):
    """
    Collect hardware information for one node type and store it in the
    main environment dictionary (``self.environment``).

    The number of nodes is taken from the node list; the CPU count and the
    memory size are read from the first node of the list only, through an
    ``oc debug`` command.

    Args:
        node_type (str): the node type to collect data about, can be :
            master / worker - the default is master

    """
    # Anything other than master / worker is reported and ignored
    if node_type not in ("master", "worker"):
        log.warning(f"Node type ({node_type}) is invalid")
        return

    if node_type == "master":
        nodes = node.get_master_nodes()
    else:
        nodes = node.get_worker_nodes()

    oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    self.environment[f"{node_type}_nodes_num"] = len(nodes)

    cpu_output = oc_cmd.exec_oc_debug_cmd(
        node=nodes[0],
        cmd_list=["lscpu | grep '^CPU(s):' | awk '{print $NF}'"],
    )
    self.environment[f"{node_type}_nodes_cpu_num"] = cpu_output.rstrip()

    memory_output = oc_cmd.exec_oc_debug_cmd(
        node=nodes[0], cmd_list=["free | grep Mem | awk '{print $2}'"]
    )
    self.environment[f"{node_type}_nodes_memory"] = memory_output.rstrip()
def create_dummy_zone_labels():
    """
    Create dummy zone labels on cluster nodes: label all master and worker
    nodes based on values of the ``master_availability_zones`` and
    ``worker_availability_zones`` options, but only if there are no zone
    labels already defined.

    Raises:
        UnexpectedDeploymentConfiguration: when either cluster or ocs-ci
            config file are in conflict with dummy zone labels.

    """
    logger.info("trying to setup dummy_zone_node_labels")

    if not are_zone_labels_missing():
        # don't use dummy zone labeling on a cluster with actual zones
        msg = (
            "Cluster in unexpected state before dummy zone labeling: "
            "at least one node already have a zone label."
        )
        logger.error(msg)
        raise exceptions.UnexpectedDeploymentConfiguration(msg)

    # pairs of (ENV_DATA option holding the zones, nodes to be labeled)
    to_label = [
        ("master_availability_zones", get_master_nodes()),
        ("worker_availability_zones", get_worker_nodes()),
    ]
    for zone_opt, zone_nodes in to_label:
        zones = config.ENV_DATA.get(zone_opt)
        if zones is None:
            msg = f"{zone_opt} is not defined in ENV_DATA conf"
            logger.error(msg)
            raise exceptions.UnexpectedDeploymentConfiguration(msg)
        assign_dummy_zones(zones, zone_nodes)
def start_baremetal_machines_with_ipmi_ctx(self, ipmi_ctxs, wait=True):
    """
    Power on Baremetal Machines through their IPMI contexts, then wait for
    the cluster to be reachable and for all master and worker nodes to be
    Ready.

    Args:
        ipmi_ctxs (list): List of BM ipmi_ctx
        wait (bool): Wait for BMs to report the powered-on status

    """
    # Send the power-up request to every machine before waiting on any
    for ctx in ipmi_ctxs:
        ctx.chassis_control_power_up()

    if wait:
        for ctx in ipmi_ctxs:
            power_samples = TimeoutSampler(600, 5, self.get_power_status, ctx)
            for power_state in power_samples:
                logger.info(
                    f"Waiting for Baremetal Machine to power on. "
                    f"Current Baremetal status: {power_state}"
                )
                if power_state == VM_POWERED_ON:
                    logger.info("Baremetal Machine reached poweredOn status")
                    break

    wait_for_cluster_connectivity(tries=400)
    # masters first, then workers
    for list_nodes in (get_master_nodes, get_worker_nodes):
        wait_for_nodes_status(
            node_names=list_nodes(), status=constants.NODE_READY, timeout=800
        )
def setup(self, project_factory, pvc_factory, pod_factory):
    """
    Create one RWX raw block PVC on the ceph block pool and one pod per
    worker node that uses this PVC.

    The PVC is stored in ``self.pvc_obj`` and the pods in ``self.pod_objs``.
    """
    self.pvc_size = 10
    self.pvc_obj = pvc_factory(
        interface=constants.CEPHBLOCKPOOL,
        size=self.pvc_size,
        access_mode=constants.ACCESS_MODE_RWX,
        status=constants.STATUS_BOUND,
        volume_mode=constants.VOLUME_MODE_BLOCK,
        size_unit="Mi",
    )

    workers = node.get_worker_nodes()
    self.pod_objs = []
    for worker in workers:
        # every pod is pinned to a different worker node
        self.pod_objs.append(
            pod_factory(
                interface=constants.CEPHBLOCKPOOL,
                pvc=self.pvc_obj,
                status=constants.STATUS_RUNNING,
                node_name=worker,
                pod_dict_path=constants.CSI_RBD_RAW_BLOCK_POD_YAML,
                raw_block_pv=True,
            )
        )
def setup(self, add_nodes):
    """
    Check that we have the right configurations before we start the test.

    The test is skipped when the cluster already has the maximum number of
    OSDs. On vSphere with 9 or more OSDs, worker nodes are added up to 6,
    and the worker nodes that are not in OCS are labeled.
    """
    osd_count = len(pod_helpers.get_osd_pods())
    if osd_count >= constants.MAX_OSDS:
        pytest.skip("We have maximum of OSDs in the cluster")

    # If we use vSphere we may need to add more worker nodes
    # to the cluster before starting the test
    on_vsphere = config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM
    if not (on_vsphere and osd_count >= 9):
        return

    expected_workers = 6
    current_workers = len(node.get_worker_nodes())
    logging.info(
        f"We have {osd_count} OSDs in the cluster, "
        f"and {current_workers} worker nodes in the cluster"
    )
    if current_workers < expected_workers:
        missing_workers = expected_workers - current_workers
        logging.info(f"Adding more {missing_workers} worker nodes to the cluster")
        add_nodes(ocs_nodes=False, node_count=missing_workers)

    unlabeled_workers = node.get_worker_nodes_not_in_ocs()
    if unlabeled_workers:
        logging.info("Label the worker nodes that are not in OCS")
        node.label_nodes(unlabeled_workers)
def start_powernodes_machines(self, powernode_machines, timeout=900, wait=True, force=True):
    """
    Start PowerNode Machines with ``virsh start`` and wait for the cluster
    to be reachable and for all master and worker nodes to be Ready.

    Args:
        powernode_machines (list): List of PowerNode machines
        timeout (int): time in seconds to wait for the master nodes, and
            then for the worker nodes, to reach the 'ready' state.
        wait (bool): Wait for PowerNodes to start - for future use
            (currently not read by this method)
        force (bool): True for PowerNode ungraceful power off, False for
            graceful PowerNode shutdown - for future use (currently not
            read by this method)

    """
    ocpversion = get_ocp_version("-")
    for powernode in powernode_machines:
        result = exec_cmd(f"sudo virsh start test-ocp{ocpversion}-{powernode.name}")
        # this method starts the machines; the log used to say "shutdown"
        logger.info(f"Result of start {result}")

    wait_for_cluster_connectivity(tries=900)
    wait_for_nodes_status(
        node_names=get_master_nodes(), status=constants.NODE_READY, timeout=timeout
    )
    wait_for_nodes_status(
        node_names=get_worker_nodes(), status=constants.NODE_READY, timeout=timeout
    )
def verify_disks_lso_attached(self, timeout=600, sleep=20):
    """
    Verify Disks Attached

    The expected capacity is the configured device size multiplied by the
    number of worker nodes. It is rendered as "<n> GiB", or as "<n> TiB"
    (trailing zeros removed) from 1024 upwards, and the page is sampled
    until ``self.check_element_text`` finds this text.

    Args:
        timeout (int): Time in seconds to wait
        sleep (int): Sampling time in seconds

    Raises:
        TimeoutExpiredError: when the expected capacity text is not found
            within ``timeout`` seconds

    """
    osd_size = config.ENV_DATA.get("device_size", defaults.DEVICE_SIZE)
    worker_nodes = get_worker_nodes()
    capacity = int(osd_size) * len(worker_nodes)
    if capacity >= 1024:
        # e.g. 1536 -> "1.5 TiB", 2048 -> "2 TiB"
        capacity_str = str(capacity / 1024).rstrip("0").rstrip(".") + " TiB"
    else:
        capacity_str = str(capacity) + " GiB"

    sample = TimeoutSampler(
        timeout=timeout,
        sleep=sleep,
        func=self.check_element_text,
        expected_text=capacity_str,
    )
    if not sample.wait_for_func_status(result=True):
        # say what was expected; the old message was only " after N seconds"
        logger.error(
            f"Expected capacity text '{capacity_str}' was not found "
            f"after {timeout} seconds"
        )
        raise TimeoutExpiredError
def get_node_name_where_jenkins_pod_not_hosted(
    self, node_type=constants.WORKER_MACHINE, num_of_nodes=1
):
    """
    Get names of nodes of the given type. For worker nodes, only the nodes
    that do not run any pod matching "jenkins" in ``self.projects`` are
    considered.

    Args:
        node_type (str): The node type (e.g. worker, master)
        num_of_nodes (int): The number of nodes to be returned

    Returns:
        list: List of compute node names

    Raises:
        ValueError: when node_type is neither worker nor master

    """
    if node_type == constants.MASTER_MACHINE:
        candidates = [
            node_obj.name
            for node_obj in get_nodes(node_type=node_type, num_of_nodes=num_of_nodes)
        ]
    elif node_type == constants.WORKER_MACHINE:
        # all jenkins pods, project by project
        jenkins_pods = [
            get_pod_obj(name=pod_name, namespace=project)
            for project in self.projects
            for pod_name in get_pod_name_by_pattern(
                pattern="jenkins", namespace=project
            )
        ]
        occupied_nodes = set(get_app_pod_running_nodes(jenkins_pods))
        all_workers = set(get_worker_nodes())
        candidates = all_workers - occupied_nodes
    else:
        raise ValueError("The node type is worker or master")

    return list(candidates)[:num_of_nodes]
def test_automated_recovery_from_stopped_node_and_start(
    self, nodes, additional_node
):
    """
    Knip-678 Automated recovery from failed nodes
    Reactive case - IPI

    0) A - add new node, B - don't add new node
    1) Stop node
    2) Validate result:
         A - pods should respin on the new node
         B - pods should remain in Pending state on the stopped node
    3) Start node
    4) Validate result:
         A - pods should start on the new node
         B - pods should start on the stopped node after starting it

    """
    # Remember the machineset and its ready replica count on the instance
    wnode_name = get_worker_nodes()[0]
    machine_name = machine.get_machine_from_node_name(wnode_name)
    self.machineset_name = machine.get_machineset_from_machine_name(machine_name)
    self.start_ready_replica_count = machine.get_ready_replica_count(
        self.machineset_name
    )

    temp_osd = get_osd_pods()[0]
    # pod name without its last "-<suffix>" part; the replacement pod of the
    # same OSD is found by this prefix
    osd_real_name = "-".join(temp_osd.name.split("-")[:-1])
    self.osd_worker_node = [get_pod_node(temp_osd)]
    if additional_node:
        self.add_new_storage_node(self.osd_worker_node[0].name)
        self.extra_node = True

    nodes.stop_nodes(self.osd_worker_node, wait=True)
    log.info(f"Successfully powered off node: {self.osd_worker_node[0].name}")

    timeout = 420
    assert wait_for_rook_ceph_pod_status(
        temp_osd, constants.STATUS_TERMINATING, timeout
    ), (
        f"The pod {osd_real_name} didn't reach the status {constants.STATUS_TERMINATING} "
        f"after {timeout} seconds"
    )

    # Validate that the OSD in terminate state has a new OSD in Pending
    all_pod_obj = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    new_osd = next(
        (
            pod_obj
            for pod_obj in all_pod_obj
            if osd_real_name == "-".join(pod_obj.name.split("-")[:-1])
            and temp_osd.name != pod_obj.name
        ),
        None,
    )

    # Power the node back on before any further assertion, so a failure
    # below does not leave the node stopped
    nodes.start_nodes(nodes=self.osd_worker_node, wait=True)
    log.info(f"Successfully powered on node: {self.osd_worker_node[0].name}")

    # Without this check a missing replacement pod was passed on as None
    assert new_osd is not None, (
        f"No new OSD pod with the name prefix {osd_real_name} was found "
        f"to replace the pod {temp_osd.name}"
    )
    wait_for_resource_state(new_osd, constants.STATUS_RUNNING, timeout=180)
    if additional_node:
        new_osd_node = get_pod_node(new_osd)
        assert (
            new_osd_node.name != self.osd_worker_node[0].name
        ), "New OSD is expected to run on the new additional node"
def test_nodereplacement_proactive_with_io_running(
    self,
    pvc_factory,
    pod_factory,
    dc_pod_factory,
    bucket_factory,
    rgw_bucket_factory,
):
    """
    Knip-894 Node Replacement proactive when IO running in the background

    IO is started from dc pods (rbd and cephfs) on every worker node except
    the selected OSD node, then the OSD node is deleted and re-created and
    the cluster is verified with the sanity helpers.
    """
    # Get worker nodes
    worker_node_list = node.get_worker_nodes()
    log.info(f"Current available worker nodes are {worker_node_list}")

    osd_node_name = select_osd_node_name()
    # IO pods go to every worker except the node that is going to be replaced
    io_nodes = [
        worker for worker in worker_node_list if worker != osd_node_name
    ]

    log.info("Creating dc pod backed with rbd pvc and running io in bg")
    for io_node in io_nodes:
        rbd_dc_pod = dc_pod_factory(
            interface=constants.CEPHBLOCKPOOL, node_name=io_node, size=20
        )
        pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True)

    log.info("Creating dc pod backed with cephfs pvc and running io in bg")
    for io_node in io_nodes:
        cephfs_dc_pod = dc_pod_factory(
            interface=constants.CEPHFILESYSTEM, node_name=io_node, size=20
        )
        pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True)

    delete_and_create_osd_node(osd_node_name)

    # Creating Resources
    log.info("Creating Resources using sanity helpers")
    self.sanity_helpers.create_resources(
        pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    )
    # Deleting Resources
    self.sanity_helpers.delete_resources()

    # Verify everything running fine
    log.info("Verifying All resources are Running and matches expected result")
    self.sanity_helpers.health_check(tries=120)

    # Verify OSD is encrypted
    encryption_enabled = config.ENV_DATA.get("encryption_at_rest")
    if encryption_enabled:
        osd_encryption_verification()
def test_node_kernel_crash_ceph_fsync(self, pvc_factory, teardown_factory,
                                      dc_pod_factory, interface_type):
    """
    1. Create 1GiB PVC
    2. Attach PVC to an application pod
    3. Copy file fsync.py to pod
    4. Execute create delete file operation parallely with fsync.py
    5. Check Node gets Panic or not

    The test fails when the selected node reaches the Not Ready state
    within 60 seconds after the workload has been started.
    """
    worker_nodes_list = get_worker_nodes()

    # Create a Cephfs, rbd PVC
    pvc_obj = pvc_factory(interface=interface_type)

    # Create a pod on a particular node
    selected_node = random.choice(worker_nodes_list)
    log.info(
        f"Creating a pod on node: {selected_node} with pvc {pvc_obj.name}")
    # Pin the pod to the node that is monitored below; without node_name the
    # pod could be scheduled on any node
    pod_obj = dc_pod_factory(
        interface=interface_type,
        pvc=pvc_obj,
        node_name=selected_node,
    )

    file = constants.FSYNC
    cmd = f"oc cp {file} {pvc_obj.namespace}/{pod_obj.name}:/"
    helpers.run_cmd(cmd=cmd)
    log.info("Files copied successfully ")

    command = f"mkdir {self.result_dir}"
    pod_obj.exec_cmd_on_pod(command=command)
    log.info("Starting creation and deletion of files on volume")

    # Create and delete files on mount point
    # The futures are kept on the instance; the executors are not shut down
    # here
    create_executor = ThreadPoolExecutor(max_workers=1)
    self.create_thread = create_executor.submit(self.creates_files, pod_obj)
    sleep(3)

    log.info("Started deletion of files on volume")
    delete_executor = ThreadPoolExecutor(max_workers=1)
    self.delete_thread = delete_executor.submit(self.remove_files, pod_obj)

    fsync_executor = ThreadPoolExecutor(max_workers=1)
    self.fsync_thread = fsync_executor.submit(pod_obj.exec_sh_cmd_on_pod,
                                              command="python3 fsync.py")

    # Check Node gets Panic or not
    try:
        # node names are passed as a list, like the other callers do
        node.wait_for_nodes_status([selected_node],
                                   status=constants.NODE_NOT_READY,
                                   timeout=60)
    except ResourceWrongStatusException:
        log.info(f"(No kernel panic observed on {selected_node})")
    else:
        # "assert <non-empty string>" always passes; fail explicitly instead
        raise AssertionError(f"({selected_node} is in Not Ready state)")
def __init__(self, **kwargs):
    """
    Initializer function.

    Initialize object variables, clone the benchmark operator repo.
    and label the worker nodes.

    Args:
        kwargs (dict):
            Following kwargs are valid
            repo: benchmark-operator repo to used - a github link
            branch: branch to use from the repo

    Example Usage:
        r1 = BenchmarkOperator()
        r1.deploy()
        # use oc apply to apply custom modified bench
        my_custom_bench = my_custom_bench.yaml
        run_cmd('oc apply -f my_custom_bench')

    """
    log.info("Initialize the benchmark-operator object")
    self.args = kwargs
    self.repo = self.args.get("repo", BMO_REPO)
    self.branch = self.args.get("branch", "master")
    # the namespace is a constant for the benchmark-operator
    self.namespace = BMO_NAME
    self.pgsql_is_setup = False
    self.ocp = OCP()
    self.ns_obj = OCP(kind="namespace")
    self.pod_obj = OCP(namespace=BMO_NAME, kind="pod")
    # list of worker nodes to label
    self.worker_nodes = get_worker_nodes()
    self._clone_operator()
    # NOTE(review): self.dir is not assigned in this method; presumably
    # _clone_operator() sets it - confirm
    self.dir += f"/{BMO_NAME}"

    # to use the cache dropping pod, worker nodes need to be labeled.
    log.info("Labeling the worker nodes for cache-dropping enable.")
    try:
        helpers.label_worker_node(self.worker_nodes,
                                  label_key=BMO_LABEL,
                                  label_value="yes")
    except CommandFailed:
        # this is probably because of the nodes are already labeled, so,
        # checking if nodes are labeled and continue anyway.
        result = self.pod_obj.exec_oc_cmd(f"get node -l {BMO_LABEL}")
        # node names are matched literally: without escaping, a "." in a
        # host name is a regex wildcard and may match another node
        found = [
            node_name for node_name in self.worker_nodes
            if re.search(re.escape(node_name), result)
        ]
        if len(found) == len(self.worker_nodes):
            log.info("All worker nodes are labeled")
        else:
            log.warning(
                "Labeling nodes failed, Not all workers node are labeled !"
            )
def check_automated_recovery_from_stopped_node(nodes):
    """
    1) Stop node.
    2) The rook ceph pods associated with the node should change to a
       Terminating state.
    3) The node should power on automatically, or if removed from the
       cluster, a new node should create automatically.
    4) The new osd pods with the same ids should start on the stopped node
       after it powered on, or to start on the new osd node.

    """
    workers_before = get_worker_nodes()
    log.info(f"Current worker nodes: {workers_before}")

    # Pick a random node that runs OSDs and collect what is needed later
    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    machineset = machine.get_machineset_from_machine_name(
        machine.get_machine_from_node_name(osd_node_name)
    )
    log.info(f"machineset name: {machineset}")

    osd_ids_before = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {osd_ids_before}")

    terminating_pod_names = get_node_pod_names_expected_to_terminate(osd_node_name)

    nodes.stop_nodes([osd_node], wait=True)
    log.info(f"Successfully powered off node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    all_terminating = wait_for_pods_to_be_in_statuses(
        [constants.STATUS_TERMINATING], terminating_pod_names
    )
    assert all_terminating, "Not all the node rook ceph pods are in a Terminating state"

    try:
        log.info(f"Wait for the node: {osd_node_name} to power on")
        wait_for_nodes_status([osd_node_name])
        log.info(f"Successfully powered on node {osd_node_name}")
    except ResourceWrongStatusException as e:
        log.info(
            f"The worker node {osd_node_name} didn't start due to the exception {str(e)} "
            f"Probably it has been removed from the cluster. Waiting for a new node to come up..."
        )
        # from now on the OSDs are expected on the replacement node
        replacement_node = wait_for_new_worker_node_ipi(machineset, workers_before)
        osd_node_name = replacement_node.name

    assert wait_for_osd_ids_come_up_on_node(
        osd_node_name, osd_ids_before, timeout=300
    )
    log.info(
        f"the osd ids {osd_ids_before} Successfully come up on the node {osd_node_name}"
    )
def check_automated_recovery_from_drain_node(nodes):
    """
    1) Drain one worker node.
    2) Delete the OSD pods associated with the node.
    3) The new OSD pods with the same ids that come up, should be in a
       Pending state (with more than 3 worker nodes they are expected to be
       Running).
    4) Schedule the worker node.
    5) The OSD pods associated with the node, should back into a Running
       state, and come up on the same node.

    """
    osd_node_name = random.choice(get_osd_running_nodes())
    osd_ids_before = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {osd_ids_before}")
    osd_pods_to_delete = get_osd_pods_having_ids(osd_ids_before)

    unschedule_nodes([osd_node_name])
    log.info(f"Successfully unschedule the node: {osd_node_name}")

    log.info("Delete the node osd pods")
    delete_pods(osd_pods_to_delete)

    respawned_pods = wait_for_osd_pods_having_ids(osd_ids=osd_ids_before)
    respawned_pod_names = [osd_pod.name for osd_pod in respawned_pods]

    # With at most 3 workers the new pods cannot be scheduled anywhere else
    at_most_three_workers = len(get_worker_nodes()) <= 3
    if at_most_three_workers:
        expected_status = constants.STATUS_PENDING
    else:
        expected_status = constants.STATUS_RUNNING

    log.info(
        f"Verify the new osd pods {respawned_pod_names} go into a {expected_status} state"
    )
    in_expected_status = wait_for_pods_to_be_in_statuses(
        [expected_status],
        respawned_pod_names,
        raise_pod_not_found_error=True,
    )
    assert in_expected_status, f"Not all the node osd pods are in a {expected_status} state"

    log.info(f"Wait for the node: {osd_node_name} to be scheduled")
    schedule_nodes([osd_node_name])
    log.info(f"Successfully scheduled the node {osd_node_name}")

    if at_most_three_workers:
        assert wait_for_osd_ids_come_up_on_node(osd_node_name, osd_ids_before)
        log.info(
            f"the osd ids {osd_ids_before} Successfully come up on the node {osd_node_name}"
        )
def finalizer():
    """
    Teardown: remove the "dc" label from the worker nodes, wait for the
    background threads, bring the machineset replica count in line with the
    ready replica count and check the ceph health.

    ``self`` is not a parameter here; it is resolved from the enclosing scope.
    """
    workers = get_worker_nodes()

    # Removing created label on all worker nodes
    remove_label_from_worker_node(workers, label_key="dc")

    for running_thread in self.threads:
        running_thread.join()

    log.info("Get the machine set name from one of the worker node names")
    first_machine = machine.get_machine_from_node_name(workers[0])
    machineset_name = machine.get_machineset_from_machine_name(first_machine)

    log.info(
        "Verify that the current replica count is equal to the ready replica count"
    )
    machine.change_current_replica_count_to_ready_replica_count(machineset_name)

    ceph_health_check()
def start_baremetal_machines(self, baremetal_machine, wait=True):
    """
    Power on Baremetal Machines through IPMI, then wait for the cluster to
    be reachable and for all master and worker nodes to be Ready.

    Machines without an entry value in ``self.mgmt_details`` are not powered
    on by this method.

    Args:
        baremetal_machine (list): BM objects
        wait (bool): Wait for BMs to start

    """
    for bm_node in baremetal_machine:
        mgmt = self.mgmt_details[bm_node.name]

        if mgmt:
            ipmi_ctx = self.get_ipmi_ctx(
                host=mgmt["mgmt_console"],
                user=mgmt["mgmt_username"],
                password=mgmt["mgmt_password"],
            )
            logger.info(f"Powering On {bm_node.name}")
            ipmi_ctx.chassis_control_power_up()

        if wait and mgmt:
            # a fresh IPMI context is opened for polling the power status
            ipmi_ctx = self.get_ipmi_ctx(
                host=mgmt["mgmt_console"],
                user=mgmt["mgmt_username"],
                password=mgmt["mgmt_password"],
            )
            power_samples = TimeoutSampler(600, 5, self.get_power_status, ipmi_ctx)
            for power_state in power_samples:
                logger.info(
                    f"Waiting for Baremetal Machine {bm_node.name} to power on. "
                    f"Current Baremetal status: {power_state}"
                )
                if power_state == VM_POWERED_ON:
                    logger.info(
                        f"Baremetal Machine {bm_node.name} reached poweredOn status"
                    )
                    ipmi_ctx.session.close()
                    break

    wait_for_cluster_connectivity(tries=400)
    # masters first, then workers
    for list_nodes in (get_master_nodes, get_worker_nodes):
        wait_for_nodes_status(
            node_names=list_nodes(), status=constants.NODE_READY, timeout=800
        )
def test_rwo_pvc_assign_pod_node(self, interface, pvc_factory, teardown_factory):
    """
    Test assign nodeName to a pod using RWO pvc

    A pod is created with nodeName set to a randomly chosen worker node;
    the test checks that it runs on this node and then runs IO on it.
    """
    workers = get_worker_nodes()

    # Create a RWO PVC
    pvc_obj = pvc_factory(
        interface=interface,
        access_mode=constants.ACCESS_MODE_RWO,
        status=constants.STATUS_BOUND,
    )

    # Create a pod on a particular node
    target_node = random.choice(workers)
    logger.info(
        f"Creating a pod on node: {target_node} with pvc {pvc_obj.name}")
    pod_obj = helpers.create_pod(
        interface_type=interface,
        pvc_name=pvc_obj.name,
        namespace=pvc_obj.namespace,
        node_name=target_node,
        pod_dict_path=constants.NGINX_POD_YAML,
    )
    # the pod was not created by a factory, register it for teardown
    teardown_factory(pod_obj)

    # Confirm that the pod is running on the selected_node
    helpers.wait_for_resource_state(
        resource=pod_obj, state=constants.STATUS_RUNNING, timeout=120
    )
    pod_obj.reload()
    runs_on_target = pod.verify_node_name(pod_obj, target_node)
    assert runs_on_target, "Pod is running on a different node than the selected node"

    # Run IO
    logger.info(f"Running IO on pod {pod_obj.name}")
    pod_obj.run_io(storage_type="fs", size="512M", runtime=30, invalidate=0)
    pod.get_fio_rw_iops(pod_obj)
def test_scale_node_and_capacity(self):
    """
    Test for scaling 12 OCS worker nodes to the cluster
    Scale 12*3 = 36 OSDs

    Any failure is logged and recorded in ``TestAddNode.skip_all``; it is
    not re-raised.
    """
    expected_worker_count = 12
    osds_per_node = 3

    try:
        # Gather existing deviceset, OSD and node count in setup
        current_workers = get_worker_nodes()
        current_devicesets = storage_cluster.get_deviceset_count()
        osd_replica_count = storage_cluster.get_osd_replica_count()
        expected_devicesets = (
            expected_worker_count / osds_per_node
        ) * osd_replica_count

        # Check existing OCS worker node count and add nodes if required
        if len(current_workers) < expected_worker_count:
            workers_to_add = expected_worker_count - len(current_workers)
            workers_scaled = scale_lib.scale_ocs_node(node_count=workers_to_add)
            if not workers_scaled:
                raise OCSWorkerScaleFailed("OCS worker nodes scaling Failed")

        # Check existing OSD count and add OSDs if required
        if current_devicesets < expected_devicesets:
            devicesets_to_add = int(expected_devicesets - current_devicesets)
            osds_scaled = scale_lib.scale_capacity_with_deviceset(
                add_deviceset_count=devicesets_to_add, timeout=600
            )
            if not osds_scaled:
                raise OSDScaleFailed("Scaling OSDs Failed")

        # Check ceph health status
        utils.ceph_health_check(tries=30)

    except (OCSWorkerScaleFailed, OSDScaleFailed, Exception) as ex:
        TestAddNode.skip_all = True
        logging.warning(
            f"Due to Exception set TestAddNode.skip_all to {TestAddNode.skip_all}"
        )
        logging.error(f"Cluster not in expected state. {ex}")
def test_scale_node_and_capacity(self):
    """
    Test for scaling 12 OCS worker nodes to the cluster
    Scale 12*3 = 36 OSDs

    On UnexpectedBehaviour ``TestAddNode.skip_all`` is set and the
    exception is re-raised.
    """
    expected_worker_count = 12
    osds_per_node = 3

    try:
        # Gather existing deviceset, OSD and node count in setup
        existing_ocs_worker_list = get_worker_nodes()
        existing_deviceset_count = storage_cluster.get_deviceset_count()
        osd_replication_count = storage_cluster.get_osd_replica_count()
        # "/" makes this a float; it is only used for the comparison and is
        # converted to int before it is passed on
        expected_deviceset_count = (
            expected_worker_count / osds_per_node
        ) * osd_replication_count

        # Check existing OCS worker node count and add nodes if required
        if len(existing_ocs_worker_list) < expected_worker_count:
            scale_worker_count = expected_worker_count - len(
                existing_ocs_worker_list
            )
            assert scale_lib.scale_ocs_node(
                node_count=scale_worker_count
            ), "OCS worker nodes scaling Failed"

        # Check existing OSD count and add OSDs if required
        if existing_deviceset_count < expected_deviceset_count:
            # a count of devicesets has to be an integer, not e.g. 9.0
            add_deviceset_count = int(
                expected_deviceset_count - existing_deviceset_count
            )
            assert scale_lib.scale_capacity_with_deviceset(
                add_deviceset_count=add_deviceset_count
            ), "Scaling OSDs Failed"

        # Check ceph health status
        utils.ceph_health_check(tries=30)

    except UnexpectedBehaviour:
        TestAddNode.skip_all = True
        logging.info("Cluster is not in expected state, unexpected behaviour")
        raise
def check_automated_recovery_from_terminated_node(nodes):
    """
    1) Terminate node.
    2) The rook ceph pods associated with the node should change to a
       Terminating state.
    3) A new node should be created automatically
    4) The new osd pods with the same ids of the terminated node should
       start on the new osd node.

    """
    old_wnodes = get_worker_nodes()
    log.info(f"Current worker nodes: {old_wnodes}")

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    machine_name = machine.get_machine_from_node_name(osd_node_name)
    machineset = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"machineset name: {machineset}")

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")

    pod_names_expected_to_terminate = get_node_pod_names_expected_to_terminate(
        osd_node.name
    )

    nodes.terminate_nodes([osd_node], wait=True)
    log.info(f"Successfully terminated the node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    res = wait_for_pods_to_be_in_statuses(
        [constants.STATUS_TERMINATING], pod_names_expected_to_terminate
    )
    assert res, "Not all the node rook ceph pods are in a Terminating state"

    new_wnode = wait_for_new_worker_node_ipi(machineset, old_wnodes)

    # The result has to be checked; otherwise the success message below is
    # logged even when the osd ids never came up on the new node
    assert wait_for_osd_ids_come_up_on_node(
        new_wnode.name, old_osd_pod_ids, timeout=300
    ), (
        f"the osd ids {old_osd_pod_ids} didn't come up "
        f"on the node {new_wnode.name}"
    )
    log.info(
        f"the osd ids {old_osd_pod_ids} Successfully come up on the node {new_wnode.name}"
    )
def setup(self, interface_type, reclaim_policy, storageclass_factory):
    """
    Creates storage class with specified interface and reclaim policy.
    Fetches all worker nodes

    Args:
        interface_type (str): The type of the interface
            (e.g. CephBlockPool, CephFileSystem)
        reclaim_policy (str): The type of reclaim policy
            (eg., 'Delete', 'Retain')
        storageclass_factory: A fixture to create new storage class

    Returns:
        tuple: containing the storage class instance and list of worker nodes

    """
    # The storage class is created first, then the worker nodes are fetched
    return (
        storageclass_factory(
            interface=interface_type, reclaim_policy=reclaim_policy
        ),
        node.get_worker_nodes(),
    )
def workloads_dir_setup(self, request):
    """
    Setting up the environment for the test

    Selects a mon pod, creates a workloads directory with a temporary file
    inside of it, and registers a finalizer that removes the directory.
    """
    if not config.DEPLOYMENT.get("local_storage"):
        self.mon_pod = random.choice(pod.get_mon_pods())
    else:
        # With local storage: take the mon whose directory is found under
        # /var/lib/rook on the first worker node
        self.worker_node = node.get_worker_nodes()[0]
        self.oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        mon_dir_listing = self.oc_cmd.exec_oc_debug_cmd(
            node=self.worker_node,
            cmd_list=["ls /var/lib/rook/ | grep mon"],
        )
        mon_id = mon_dir_listing.split("-")[1].replace("\n", "")
        matching_mons = pod.get_pods_having_label(
            label=f"ceph_daemon_id={mon_id}",
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        )
        self.mon_pod = pod.get_pod_obj(
            name=matching_mons[0]["metadata"]["name"],
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        )

    mon_labels = self.mon_pod.get().get("metadata").get("labels")
    self.mon_suffix = mon_labels.get("mon")

    self.workloads_dir = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}/workloads"
    log.info(f"Selected mon '{self.mon_pod.name}'")
    self.mon_pod.exec_cmd_on_pod(f"mkdir {self.workloads_dir}")
    self.mon_pod.exec_cmd_on_pod(f"touch {self.workloads_dir}/{TEMP_FILE}")

    def finalizer():
        # remove the directory, wait, and check the cluster health
        self.mon_pod.exec_cmd_on_pod(f"rm -rf {self.workloads_dir}")
        time.sleep(SLEEP_TIMEOUT)
        utils.ceph_health_check()

    request.addfinalizer(finalizer)
def check_and_add_enough_worker(worker_count):
    """
    Function to check if there is enough workers available to scale pods.
    IF there is no enough worker then worker will be added based on
    supported platforms.
    Function also adds scale label to the respective worker nodes.

    Args:
        worker_count (int): Expected worker count to be present in the setup

    Returns:
        bool: True is there is enough worker count else raise exception.

    Raises:
        UnsupportedPlatformError: when workers are missing on a UPI
            vsphere / baremetal / azure deployment
        UnavailableResourceException: when workers are missing on any other
            deployment which is not AWS IPI

    """
    # Check either to use OCS workers for scaling app pods
    # Further continue to label the worker with scale label else not
    worker_list = node.get_worker_nodes()
    ocs_worker_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    scale_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if config.RUN.get("use_ocs_worker_for_scale"):
        if not scale_worker:
            helpers.label_worker_node(
                node_list=worker_list,
                label_key="scale-label",
                label_value="app-scale",
            )
    else:
        if not scale_worker:
            for node_item in ocs_worker_list:
                # A labeled node which is not in the worker list must not
                # raise ValueError (same guard as used further below)
                if node_item in worker_list:
                    worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(
                    node_list=worker_list,
                    label_key="scale-label",
                    label_value="app-scale",
                )
    scale_worker_list = machine.get_labeled_nodes(constants.SCALE_LABEL)
    logging.info(f"Print existing scale worker {scale_worker_list}")

    # Check if there is enough nodes to continue scaling of app pods
    if len(scale_worker_list) >= worker_count:
        logging.info(
            f"Setup has expected worker count {worker_count} "
            "to continue scale of pods"
        )
        return True
    else:
        logging.info(
            "There is no enough worker in the setup, will add enough worker "
            "for the automation supported platforms"
        )
        # Add enough worker for AWS
        if (
            config.ENV_DATA["deployment_type"] == "ipi"
            and config.ENV_DATA["platform"].lower() == "aws"
        ):
            # Create machineset for app worker nodes on each aws zone
            # Each zone will have one app worker node
            ms_name = list()
            labels = [("node-role.kubernetes.io/app", "app-scale")]
            # Reuse machinesets which have "app" in their name
            for obj in machine.get_machineset_objs():
                if "app" in obj.name:
                    ms_name.append(obj.name)
            if not ms_name:
                if len(machine.get_machineset_objs()) == 3:
                    for zone in ["a", "b", "c"]:
                        ms_name.append(
                            machine.create_custom_machineset(
                                instance_type="m5.4xlarge",
                                labels=labels,
                                zone=zone,
                            )
                        )
                else:
                    ms_name.append(
                        machine.create_custom_machineset(
                            instance_type="m5.4xlarge",
                            labels=labels,
                            zone="a",
                        )
                    )
                for ms in ms_name:
                    machine.wait_for_new_node_to_be_ready(ms)
            # With 3 machinesets the workers are split between them
            if len(ms_name) == 3:
                exp_count = int(worker_count / 3)
            else:
                exp_count = worker_count
            for name in ms_name:
                machine.add_node(machine_set=name, count=exp_count)
            for ms in ms_name:
                machine.wait_for_new_node_to_be_ready(ms)
            # Label every worker which is neither an OCS node nor already
            # a scale node
            worker_list = node.get_worker_nodes()
            ocs_worker_list = machine.get_labeled_nodes(
                constants.OPERATOR_NODE_LABEL
            )
            scale_label_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
            ocs_worker_list.extend(scale_label_worker)
            # drop duplicates, keep the order
            final_list = list(dict.fromkeys(ocs_worker_list))
            for node_item in final_list:
                if node_item in worker_list:
                    worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(
                    node_list=worker_list,
                    label_key="scale-label",
                    label_value="app-scale",
                )
            return True
        elif (
            config.ENV_DATA["deployment_type"] == "upi"
            and config.ENV_DATA["platform"].lower() == "vsphere"
        ):
            raise UnsupportedPlatformError("Unsupported Platform to add worker")
        elif (
            config.ENV_DATA["deployment_type"] == "upi"
            and config.ENV_DATA["platform"].lower() == "baremetal"
        ):
            raise UnsupportedPlatformError("Unsupported Platform to add worker")
        elif (
            config.ENV_DATA["deployment_type"] == "upi"
            and config.ENV_DATA["platform"].lower() == "azure"
        ):
            raise UnsupportedPlatformError("Unsupported Platform to add worker")
        else:
            raise UnavailableResourceException(
                "There is no enough worker nodes to continue app pod scaling"
            )
def log_reader_writer_parallel(self):
    """
    Write and read logfile stored on cephfs volume, from all worker nodes of
    a cluster via k8s Deployment.

    Raise:
        NotFoundError: When given volume is not found in given spec
        UnexpectedBehaviour: When an unexpected problem with starting
            the workload occurred

    """
    # get deployment dict for the reproducer logwriter workload
    with open(constants.LOGWRITER_CEPHFS_REPRODUCER, "r") as reproducer_yaml:
        self.deploy_dict = yaml.safe_load(reproducer_yaml.read())

    # if we are running in disconnected environment, we need to mirror the
    # container image first, and then use the mirror instead of the original
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(self.deploy_dict["spec"]["template"])

    # we need to match deployment replicas with number of worker nodes
    self.deploy_dict["spec"]["replicas"] = len(get_worker_nodes())

    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        self.deploy_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )

    # and link the deployment with the pvc
    try:
        link_spec_volume(
            self.deploy_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            self.pvc_dict["metadata"]["name"],
        )
    except (exceptions.NotFoundError, KeyError) as ex:
        logger.warning(
            "Failed to link the deployment with the pvc. "
            "We may need to check if the "
            "LOGWRITER_CEPHFS_REPRODUCER still matches the code of this test"
        )
        raise ex

    # prepare k8s yaml file for deployment
    workload_objects = [self.pvc_dict, self.deploy_dict]
    self.workload_file = ObjectConfFile(
        "log_reader_writer_parallel",
        workload_objects,
        self.project,
        self.tmp_path,
    )

    # deploy the workload, starting the log reader/writer pods
    logger.info(
        "starting log reader/writer workload via Deployment, one pod per worker"
    )
    self.workload_file.create()

    logger.info("waiting for all pods of the workload Deployment to run")
    self.ocp_pod = ocp.OCP(kind="Pod", namespace=self.project.namespace)
    try:
        self.ocp_pod.wait_for_resource(
            resource_count=self.deploy_dict["spec"]["replicas"],
            condition=constants.STATUS_RUNNING,
            error_condition=constants.STATUS_ERROR,
            timeout=300,
            sleep=30,
        )
    except Exception as ex:
        # this is not a problem with feature under test, but with infra,
        # cluster configuration or unrelated bug which must have happened
        # before this test case
        error_msg = "unexpected problem with start of the workload, cluster is either misconfigured or broken"
        logger.exception(error_msg)
        logger.debug(self.workload_file.describe())
        raise exceptions.UnexpectedBehaviour(error_msg) from ex
def test_pod_reattach_time_performance(
    self, storageclass_factory, copies, timeout, total_time_limit
):
    """
    Test assign nodeName to a pod using RWX pvc

    Each kernel (unzipped) is 892M and 61694 files.
    The test creates samples_num pvcs and pods, writes kernel files multiplied
    by number of copies and calculates average total and csi reattach times
    and standard deviation.

    For every sample a PVC is created, a pod is started on the first worker
    node, the kernel tree is copied to the volume ``copies`` times, the pod
    is deleted and a second pod using the same PVC is started on the second
    worker node. The time the second pod needs to reach Running state is the
    measured reattach time.

    Args:
        storageclass_factory: callable invoked with ``self.interface`` to get
            the storage class object used for the PVCs
        copies (int): how many copies of the unpacked kernel tree are
            written to each PVC
        timeout: timeout passed to the waits for a pod to become Running
        total_time_limit: maximal accepted reattach time in seconds

    Raises:
        PodNotCreated: when a pod could not be created on a PVC
        PerformanceException: when the reattach time of a sample is greater
            than ``total_time_limit``

    """
    kernel_url = "https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.19.5.tar.gz"
    download_path = "tmp"
    pod_path = "/mnt"

    # fewer samples in dev mode to shorten the run
    samples_num = 3 if self.dev_mode else 7

    test_start_time = PASTest.get_time()
    helpers.pull_images(constants.PERF_IMAGE)

    # Download a linux Kernel
    dir_path = os.path.join(os.getcwd(), download_path)
    file_path = os.path.join(dir_path, "file.gz")
    os.makedirs(dir_path, exist_ok=True)
    urllib.request.urlretrieve(kernel_url, file_path)

    def _create_pod_on_node(pvc_obj, node_name):
        # Create a perf pod using pvc_obj pinned to node_name.
        try:
            return helpers.create_pod(
                interface_type=self.interface,
                pvc_name=pvc_obj.name,
                namespace=pvc_obj.namespace,
                node_name=node_name,
                pod_dict_path=constants.PERF_POD_YAML,
            )
        except Exception as e:
            logger.error(
                f"Pod on PVC {pvc_obj.name} was not created, exception {str(e)}"
            )
            raise PodNotCreated("Pod on PVC was not created.") from e

    time_measures = []
    csi_time_measures = []
    files_written_list = []
    data_written_list = []

    # The downloaded archive is removed in ``finally`` so that a failing
    # sample does not leave the kernel tarball behind in the working dir.
    try:
        worker_nodes_list = node.get_worker_nodes()
        assert (
            len(worker_nodes_list) > 1
        ), "At least two worker nodes are needed to reattach a pod on another node"
        node_one = worker_nodes_list[0]
        node_two = worker_nodes_list[1]

        self.sc_obj = storageclass_factory(self.interface)
        for sample_index in range(1, samples_num + 1):
            csi_start_time = self.get_time("csi")

            logger.info(f"Start creating PVC number {sample_index}.")
            pvc_obj = helpers.create_pvc(
                sc_name=self.sc_obj.name, size="100Gi", namespace=self.namespace
            )
            helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)

            # Create a pod on one node
            logger.info(f"Creating Pod with pvc {pvc_obj.name} on node {node_one}")
            pvc_obj.reload()
            self.pvc_list.append(pvc_obj)
            pod_obj1 = _create_pod_on_node(pvc_obj, node_one)

            # Confirm that pod is running on the selected_nodes
            logger.info("Checking whether pods are running on the selected nodes")
            helpers.wait_for_resource_state(
                resource=pod_obj1, state=constants.STATUS_RUNNING, timeout=timeout
            )

            pod_name = pod_obj1.name
            _ocp = OCP(namespace=pvc_obj.namespace)

            # copy the archive into the pod and unpack it there
            _ocp.exec_oc_cmd(f"rsync {dir_path} {pod_name}:{pod_path}")
            _ocp.exec_oc_cmd(
                f"exec {pod_name} -- tar xvf {pod_path}/tmp/file.gz -C {pod_path}/tmp"
            )

            # multiply the data set and flush it after each copy
            for x in range(copies):
                _ocp.exec_oc_cmd(f"exec {pod_name} -- mkdir -p {pod_path}/folder{x}")
                _ocp.exec_oc_cmd(
                    f"exec {pod_name} -- cp -r {pod_path}/tmp {pod_path}/folder{x}"
                )
                _ocp.exec_oc_cmd(f"exec {pod_name} -- sync")

            logger.info("Getting the amount of data written to the PVC")
            # fourth token from the end of the `df -h` output; presumably the
            # "Used" column of the last row - TODO confirm
            data_written_str = _ocp.exec_oc_cmd(
                f"exec {pod_name} -- df -h {pod_path}"
            ).split()[-4]
            logger.info(f"The amount of written data is {data_written_str}")
            # drop the trailing unit character before converting
            data_written = float(data_written_str[:-1])

            files_written = len(
                _ocp.exec_oc_cmd(
                    f"exec {pod_name} -- find {pod_path} -type f"
                ).split()
            )
            logger.info(
                f"For {self.interface} - The number of files written to the pod is {files_written}"
            )
            files_written_list.append(files_written)
            data_written_list.append(data_written)

            logger.info("Deleting the pod")
            _ocp.exec_oc_cmd(f"delete pod {pod_name}")

            logger.info(f"Creating Pod with pvc {pvc_obj.name} on node {node_two}")
            pod_obj2 = _create_pod_on_node(pvc_obj, node_two)

            # measured interval: from the create call returning until the
            # pod reaches Running state
            start_time = time.time()
            pod_name = pod_obj2.name
            helpers.wait_for_resource_state(
                resource=pod_obj2, state=constants.STATUS_RUNNING, timeout=timeout
            )
            end_time = time.time()
            total_time = end_time - start_time
            if total_time > total_time_limit:
                limit_msg = f"Pod creation time is {total_time} and greater than {total_time_limit} seconds"
                logger.error(limit_msg)
                raise ex.PerformanceException(limit_msg)

            csi_time = performance_lib.pod_attach_csi_time(
                self.interface, pvc_obj.backed_pv, csi_start_time, pvc_obj.namespace
            )[0]
            csi_time_measures.append(csi_time)
            logger.info(
                f"PVC #{pvc_obj.name} pod {pod_name} creation time took {total_time} seconds, "
                f"csi time is {csi_time} seconds"
            )
            time_measures.append(total_time)

            logger.info("Deleting the pod")
            _ocp.exec_oc_cmd(f"delete pod {pod_name}")
    finally:
        os.remove(file_path)
        os.rmdir(dir_path)

    average = statistics.mean(time_measures)
    logger.info(
        f"The average time of {self.interface} pod creation on {samples_num} PVCs is {average} seconds"
    )

    st_deviation = statistics.stdev(time_measures)
    logger.info(
        f"The standard deviation of {self.interface} pod creation time on {samples_num} PVCs is {st_deviation}"
    )

    csi_average = statistics.mean(csi_time_measures)
    logger.info(
        f"The average csi time of {self.interface} pod creation on {samples_num} PVCs is {csi_average} seconds"
    )

    csi_st_deviation = statistics.stdev(csi_time_measures)
    logger.info(
        f"The standard deviation of {self.interface} csi pod creation time on {samples_num} PVCs "
        f"is {csi_st_deviation}"
    )

    files_written_average = statistics.mean(files_written_list)
    data_written_average = statistics.mean(data_written_list)

    # Produce ES report

    # Collecting environment information
    self.get_env_info()

    # Initialize the results doc file.
    full_results = self.init_full_results(
        ResultsAnalyse(
            self.uuid,
            self.crd_data,
            self.full_log_path,
            "pod_reattach_time_fullres",
        )
    )

    full_results.add_key("storageclass", self.sc)
    full_results.add_key("pod_reattach_time", time_measures)
    full_results.add_key("copies_number", copies)
    full_results.add_key("files_number_average", files_written_average)
    full_results.add_key("data_average", data_written_average)
    full_results.add_key("pod_reattach_time_average", average)
    full_results.add_key("pod_reattach_standard_deviation", st_deviation)
    full_results.add_key("pod_csi_reattach_time_average", csi_average)
    full_results.add_key("pod_csi_reattach_standard_deviation", csi_st_deviation)

    test_end_time = PASTest.get_time()

    # Add the test time to the ES report
    full_results.add_key(
        "test_time", {"start": test_start_time, "end": test_end_time}
    )

    # Write the test results into the ES server
    if full_results.es_write():
        res_link = full_results.results_link()
        logger.info(f"The Result can be found at : {res_link}")

        # Create text file with results of all subtest (4 - according to the parameters)
        # (kept inside the branch: res_link exists only after a successful write)
        self.results_path = get_full_test_logs_path(
            cname=self, fname="test_pod_reattach_time_performance"
        )
        self.write_result_to_file(res_link)
def test_pvc_disruptive(
    self,
    interface,
    operation_to_disrupt,
    resource_to_delete,
    multi_pvc_factory,
    pod_factory,
):
    """
    Base function for PVC disruptive tests.

    A pod of type 'resource_to_delete' is deleted while the operation named
    by 'operation_to_disrupt' ("create_pvc", "create_pod" or "run_io") is in
    progress. Afterwards the PVCs are reused by a fresh set of pods, IO is
    verified again, the number of 'resource_to_delete' pods is compared with
    the initial count and the ceph health is checked.
    """

    def _io_target(pod_obj):
        # "block" for raw block volumes, "fs" for everything else
        volume_mode = pod_obj.pvc.get()["spec"]["volumeMode"]
        return "block" if volume_mode == "Block" else "fs"

    def _wait_until_running(pods):
        # Wait for every pod to reach Running state and refresh its data
        for pod_obj in pods:
            helpers.wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING, timeout=90
            )
            pod_obj.reload()

    def _start_fio(pods, file_tag):
        # Kick off a short FIO run on each pod
        for pod_obj in pods:
            target = _io_target(pod_obj)
            pod_obj.run_io(
                storage_type=target,
                size="1G",
                runtime=10,
                fio_filename=f"{pod_obj.name}_{file_tag}",
            )

    def _assert_fio_clean(pods):
        # Collect FIO results and fail on the first reported error
        for pod_obj in pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"

    # Getter for the pods of each resource type that can be deleted
    pod_functions = {
        "mds": pod.get_mds_pods,
        "mon": pod.get_mon_pods,
        "mgr": pod.get_mgr_pods,
        "osd": pod.get_osd_pods,
        "rbdplugin": partial(pod.get_plugin_pods, interface=interface),
        "cephfsplugin": partial(pod.get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": pod.get_cephfsplugin_provisioner_pods,
        "rbdplugin_provisioner": pod.get_rbdfsplugin_provisioner_pods,
        "operator": pod.get_operator_pods,
    }
    list_target_pods = pod_functions[resource_to_delete]

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(list_target_pods())

    namespace = self.proj_obj.namespace

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

    DISRUPTION_OPS.set_resource(resource=resource_to_delete)

    access_modes = [constants.ACCESS_MODE_RWO]
    if interface == constants.CEPHFILESYSTEM:
        access_modes.append(constants.ACCESS_MODE_RWX)
        num_of_pvc = 8
        access_mode_dist_ratio = [6, 2]

    # Modify access_modes list to create rbd `block` type volume with
    # RWX access mode. RWX is not supported in non-block type rbd
    if interface == constants.CEPHBLOCKPOOL:
        access_modes += [
            f"{constants.ACCESS_MODE_RWO}-Block",
            f"{constants.ACCESS_MODE_RWX}-Block",
        ]
        num_of_pvc = 9
        access_mode_dist_ratio = [4, 3, 2]

    executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

    # Start creation of PVCs
    bulk_pvc_create = executor.submit(
        multi_pvc_factory,
        interface=interface,
        project=self.proj_obj,
        size=5,
        access_modes=access_modes,
        access_modes_selection="distribute_random",
        access_mode_dist_ratio=access_mode_dist_ratio,
        status=constants.STATUS_BOUND,
        num_of_pvc=num_of_pvc,
        wait_each=False,
        timeout=90,
    )

    if operation_to_disrupt == "create_pvc":
        # Ensure PVCs are being created before deleting the resource
        pvcs_increasing = helpers.wait_for_resource_count_change(
            get_all_pvcs, initial_num_of_pvc, namespace, "increase"
        )
        assert pvcs_increasing, "Wait timeout: PVCs are not being created."
        logger.info("PVCs creation has started.")
        DISRUPTION_OPS.delete_resource()

    pvc_objs = bulk_pvc_create.result()

    # Confirm that PVCs are Bound
    for pvc_obj in pvc_objs:
        helpers.wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=120
        )
        pvc_obj.reload()
    logger.info("Verified: PVCs are Bound.")

    # Start creating pods
    bulk_pod_create = executor.submit(
        helpers.create_pods,
        pvc_objs,
        pod_factory,
        interface,
        2,
        nodes=node.get_worker_nodes(),
    )

    if operation_to_disrupt == "create_pod":
        # Ensure that pods are being created before deleting the resource
        pods_increasing = helpers.wait_for_resource_count_change(
            pod.get_all_pods, initial_num_of_pods, namespace, "increase"
        )
        assert pods_increasing, "Wait timeout: Pods are not being created."
        logger.info("Pods creation has started.")
        DISRUPTION_OPS.delete_resource()

    pod_objs = bulk_pod_create.result()

    # Verify pods are Running
    _wait_until_running(pod_objs)
    logger.info("Verified: All pods are Running.")

    # Do setup on pods for running IO
    logger.info("Setting up pods for running IO.")
    for pod_obj in pod_objs:
        target = _io_target(pod_obj)
        executor.submit(pod_obj.workload_setup, storage_type=target)

    # Wait for setup on pods to complete
    for pod_obj in pod_objs:
        logger.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for setup_done in TimeoutSampler(360, 2, getattr, pod_obj, "wl_setup_done"):
            if not setup_done:
                continue
            logger.info(
                f"Setup for running IO is completed on pod " f"{pod_obj.name}."
            )
            break
    logger.info("Setup for running IO is completed on all pods.")

    # Start IO on each pod
    _start_fio(pod_objs, "io_file1")
    logger.info("FIO started on all pods.")

    if operation_to_disrupt == "run_io":
        DISRUPTION_OPS.delete_resource()

    logger.info("Fetching FIO results.")
    _assert_fio_clean(pod_objs)
    logger.info("Verified FIO result on pods.")

    # Delete pods
    for pod_obj in pod_objs:
        pod_obj.delete(wait=True)
    for pod_obj in pod_objs:
        pod_obj.ocp.wait_for_delete(pod_obj.name)

    # Verify that PVCs are reusable by creating new pods
    pod_objs = helpers.create_pods(
        pvc_objs,
        pod_factory,
        interface,
        2,
        nodes=node.get_worker_nodes(),
    )

    # Verify new pods are Running
    _wait_until_running(pod_objs)
    logger.info("Verified: All new pods are Running.")

    # Run IO on each of the new pods
    _start_fio(pod_objs, "io_file2")

    logger.info("Fetching FIO results from new pods")
    _assert_fio_clean(pod_objs)
    logger.info("Verified FIO result on new pods.")

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(list_target_pods())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
    logger.info("Ceph cluster health is OK")
def setup_base(self, interface, multi_pvc_factory, pod_factory):
    """
    Create PVCs and pods

    One pod is created for every PVC; PVCs with RWX access mode get an
    additional pod. Pods are spread over the worker nodes in round-robin
    order and the method waits until all of them are Running.

    Args:
        interface: interface constant, compared against
            ``constants.CEPHFILESYSTEM`` and ``constants.CEPHBLOCKPOOL``
        multi_pvc_factory: factory used to create the PVCs
        pod_factory: factory used to create the pods

    Returns:
        tuple: (pvc_objs, pod_objs, rwx_pod_objs) where rwx_pod_objs holds
            the additional pods created on RWX PVCs

    """
    access_modes = [constants.ACCESS_MODE_RWO]
    if interface == constants.CEPHFILESYSTEM:
        access_modes.append(constants.ACCESS_MODE_RWX)
        self.num_of_pvcs = 10
        access_mode_dist_ratio = [8, 2]

    # Modify access_modes list to create rbd `block` type volume with
    # RWX access mode. RWX is not supported in filesystem type rbd
    if interface == constants.CEPHBLOCKPOOL:
        access_modes += [
            f"{constants.ACCESS_MODE_RWO}-Block",
            f"{constants.ACCESS_MODE_RWX}-Block",
        ]
        self.num_of_pvcs = 12
        access_mode_dist_ratio = [5, 5, 2]

    pvc_objs = multi_pvc_factory(
        interface=interface,
        project=None,
        storageclass=None,
        size=self.pvc_size,
        access_modes=access_modes,
        access_modes_selection="distribute_random",
        access_mode_dist_ratio=access_mode_dist_ratio,
        status=constants.STATUS_BOUND,
        num_of_pvc=self.num_of_pvcs,
        wait_each=False,
    )

    pod_objs = []
    rwx_pod_objs = []

    worker_cycle = cycle(node.get_worker_nodes())

    def _spawn_pod(pvc_obj, manifest, is_raw_block):
        # Create a pod on the next worker node without waiting for its state
        return pod_factory(
            interface=interface,
            pvc=pvc_obj,
            status="",
            node_name=next(worker_cycle),
            pod_dict_path=manifest,
            raw_block_pv=is_raw_block,
        )

    # Create one pod using each RWO PVC and two pods using each RWX PVC
    for pvc_obj in pvc_objs:
        is_raw_block = pvc_obj.get()["spec"]["volumeMode"] == "Block"
        manifest = constants.CSI_RBD_RAW_BLOCK_POD_YAML if is_raw_block else ""
        if pvc_obj.access_mode == constants.ACCESS_MODE_RWX:
            rwx_pod_objs.append(_spawn_pod(pvc_obj, manifest, is_raw_block))
        pod_objs.append(_spawn_pod(pvc_obj, manifest, is_raw_block))

    # Wait for pods to be in Running state
    all_pods = pod_objs + rwx_pod_objs
    for created_pod in all_pods:
        wait_for_resource_state(resource=created_pod, state=constants.STATUS_RUNNING)
        created_pod.reload()
    log.info(f"Created {len(all_pods)} pods.")

    return pvc_objs, pod_objs, rwx_pod_objs
def test_non_ocs_taint_and_tolerations(self):
    """
    Test runs the following steps

    1. Taint ocs nodes with non-ocs taint
    2. Set tolerations on storagecluster, subscription, configmap and ocsinit
    3. Respin all ocs pods and check if it runs on ocs nodes with tolerations
    4. Add Capacity

    """
    # Taint all nodes with non-ocs taint
    ocs_nodes = get_worker_nodes()
    taint_nodes(nodes=ocs_nodes, taint_label="xyz=true:NoSchedule")

    # Add tolerations to the storagecluster
    storagecluster_obj = ocp.OCP(
        resource_name=constants.DEFAULT_CLUSTERNAME,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        kind=constants.STORAGECLUSTER,
    )
    tolerations = (
        '{"tolerations": [{"effect": "NoSchedule", "key": "xyz",'
        '"operator": "Equal", "value": "true"}, '
        '{"effect": "NoSchedule", "key": "node.ocs.openshift.io/storage", '
        '"operator": "Equal", "value": "true"}]}'
    )
    param = (
        f'{{"spec": {{"placement": {{"all": {tolerations}, "mds": {tolerations}, '
        f'"noobaa-core": {tolerations}, "rgw": {tolerations}}}}}}}'
    )
    storagecluster_obj.patch(params=param, format_type="merge")

    # Add tolerations to the subscription
    sub_list = ocp.get_all_resource_names_of_a_kind(kind=constants.SUBSCRIPTION)
    param = (
        '{"spec": {"config": {"tolerations": '
        '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
        '"value": "true"}]}}}'
    )
    for sub in sub_list:
        sub_obj = ocp.OCP(
            resource_name=sub,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.SUBSCRIPTION,
        )
        sub_obj.patch(params=param, format_type="merge")

    # Add tolerations to the ocsinitializations.ocs.openshift.io
    param = (
        '{"spec": {"tolerations": '
        '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
        '"value": "true"}]}}'
    )
    ocsini_obj = ocp.OCP(
        resource_name=constants.OCSINIT,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        kind=constants.OCSINITIALIZATION,
    )
    ocsini_obj.patch(params=param, format_type="merge")

    # Add tolerations to the configmap rook-ceph-operator-config
    configmap_obj = ocp.OCP(
        kind=constants.CONFIGMAP,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        resource_name=constants.ROOK_OPERATOR_CONFIGMAP,
    )
    toleration = configmap_obj.get().get("data").get("CSI_PLUGIN_TOLERATIONS")
    # The value is a YAML list stored as a string. Keys following the first
    # one of a list item have to line up with it, i.e. be indented by two
    # spaces (the width of "- "), otherwise the appended entry is not valid
    # YAML.
    toleration += (
        '\n- key: xyz\n  operator: Equal\n  value: "true"\n  effect: NoSchedule'
    )
    # escape quotes and newlines so the YAML fits into a JSON string value
    toleration = toleration.replace('"', '\\"').replace("\n", "\\n")
    param_cmd = (
        f'[{{"op": "replace", "path": "/data/CSI_PLUGIN_TOLERATIONS", "value": "{toleration}" }}, '
        f'{{"op": "replace", "path": "/data/CSI_PROVISIONER_TOLERATIONS", "value": "{toleration}" }}]'
    )
    configmap_obj.patch(params=param_cmd, format_type="json")

    # After edit noticed few pod respins as expected
    assert wait_for_pods_to_be_running(
        timeout=600, sleep=15
    ), "Pods did not reach Running state after tolerations were added"

    # Respin all pods and check it if is still running
    pod_list = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    for pod_obj in pod_list:
        pod_obj.delete(wait=False)

    assert wait_for_pods_to_be_running(
        timeout=600, sleep=15
    ), "Pods did not reach Running state after respin"
    self.sanity_helpers.health_check()

    # Add capacity to check if new osds has toleration
    osd_size = storage_cluster.get_osd_size()
    count = storage_cluster.add_capacity(osd_size)
    pod_ocp = ocp.OCP(
        kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"]
    )
    # expected OSD pod count is the add_capacity() result times this factor
    replica_count = 1 if is_flexible_scaling_enabled() else 3
    assert pod_ocp.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=count * replica_count,
    ), "New OSDs failed to reach running state"
    check_ceph_health_after_add_capacity(ceph_rebalance_timeout=2500)