def test_add_capacity(self, node_multiplier, capacity):
    """
    Test to add variable capacity to the OSD cluster while IOs are running

    Args:
        node_multiplier: the number of OSDs to add per worker node
        capacity: the storage capacity of each OSD

    """
    dt = config.ENV_DATA['deployment_type']
    if dt == 'ipi':
        storage_cluster = machine_utils.get_storage_cluster(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        worker_nodes = len(get_typed_nodes())
        machine_utils.add_capacity(
            storagecluster_name=storage_cluster,
            count=worker_nodes * node_multiplier
        )
        machine_utils.add_storage_capacity(
            storagecluster_name=storage_cluster, capacity=capacity
        )
        pod_obj = ocp.OCP(
            kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        assert pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=worker_nodes * node_multiplier,
            timeout=600
        ), "OSD pods failed to reach RUNNING state"
    else:
        pytest.skip("UPI not yet supported")
def initialize_data():
    """
    Initialize the data dictionary with cluster data

    Returns:
        dict: A dictionary containing the data to push to the dashboard

    """
    worker_type = get_typed_nodes(
        num_of_nodes=1
    )[0].data['metadata']['labels']['beta.kubernetes.io/instance-type']
    (ocs_ver_info, _) = get_ocs_version()
    ocs_ver_full = ocs_ver_info['status']['desired']['version']
    m = re.match(r"(\d.\d).(\d)-", ocs_ver_full)
    if m and m.group(1) is not None:
        ocs_ver = m.group(1)
    platform = config.ENV_DATA['platform']
    if platform.lower() == 'aws':
        platform = platform.upper() + " " + worker_type
    data_template['commitid'] = ocs_ver_full
    data_template['project'] = f"OCS{ocs_ver}"
    data_template['branch'] = ocs_ver_info['spec']['channel']
    data_template['executable'] = ocs_ver
    data_template['environment'] = platform
    return data_template
def initialize_data():
    """
    Initialize the data dictionary with cluster data

    Returns:
        dict: A dictionary containing the data to push to the dashboard

    """
    # The worker type is relevant only for cloud instances.
    log.info('Initializing the dashboard data')
    worker_lbl = get_typed_nodes(num_of_nodes=1)[0].data['metadata']['labels']
    if 'beta.kubernetes.io/instance-type' in worker_lbl:
        worker_type = worker_lbl['beta.kubernetes.io/instance-type']
    else:
        # TODO: Maybe for non-cloud platforms we can add the arch?
        # worker_type = worker_lbl['kubernetes.io/arch']
        worker_type = ""
    log.info(f'The worker type is {worker_type}')

    (ocs_ver_info, _) = get_ocs_version()
    ocs_ver_full = ocs_ver_info['status']['desired']['version']
    m = re.match(r"(\d.\d).(\d)", ocs_ver_full)
    if m and m.group(1) is not None:
        ocs_ver = m.group(1)
    log.info(f'ocs_ver is {ocs_ver_full}')
    platform = config.ENV_DATA['platform']
    if platform.lower() not in ['vsphere', 'baremetal']:
        platform = f'{platform.upper()} {worker_type}'
    data_template['commitid'] = ocs_ver_full
    data_template['project'] = f"OCS{ocs_ver}"
    data_template['branch'] = ocs_ver_info['spec']['channel']
    data_template['executable'] = ocs_ver
    data_template['environment'] = platform
    return data_template
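# A minimal usage sketch for initialize_data() above (hedged: the
# push_to_dashboard() sender is hypothetical and the sample values are
# illustrative; only data_template and initialize_data() come from this
# module):
def _example_push_dashboard_data():
    data = initialize_data()
    # Expect keys such as 'commitid' (e.g. '4.5.0-521.ci'),
    # 'project' (e.g. 'OCS4.5'), 'branch', 'executable' and 'environment'
    # (e.g. 'AWS m5.4xlarge'); exact values depend on the cluster under test.
    log.info(f'Dashboard payload: {data}')
    # push_to_dashboard(data)  # hypothetical sender, not part of this module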
def test_2_nodes_maintenance_same_type(
    self, pvc_factory, pod_factory, nodes_type
):
    """
    OCS-1273/OCS-1271:
    - Maintenance (mark as unschedulable and drain) 2 worker/master nodes
    - Mark the nodes as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)

    """
    # Get 2 nodes
    typed_nodes = node.get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
    assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

    typed_node_names = [typed_node.name for typed_node in typed_nodes]

    # Maintenance the nodes (unschedule and drain)
    node.drain_nodes(typed_node_names)

    # Mark the nodes back to schedulable
    node.schedule_nodes(typed_node_names)

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()
def node_operations_entry_criteria(
    self, node_type, number_of_nodes,
    operation_name="Node Operation", network_fail_time=None
):
    """
    Entry criteria function for node related operations

    Args:
        node_type (str): Type of node
        number_of_nodes (int): Number of nodes
        operation_name (str): Name of the node operation
        network_fail_time (int): Total time to fail the network in a node

    Returns:
        tuple: containing the params used in node operations

    """
    self.validate_cluster(node_status=True, operation_name=operation_name)

    logger.info(f"Getting parameters related to: {operation_name}")
    typed_nodes = node.get_typed_nodes(
        node_type=node_type, num_of_nodes=number_of_nodes
    )
    if network_fail_time:
        return typed_nodes, network_fail_time
    else:
        return typed_nodes
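# Usage sketch for node_operations_entry_criteria() (hedged: the caller
# and the 300-second fail time are illustrative; constants.WORKER_MACHINE
# is assumed to resolve to 'worker' as elsewhere in this repo):
def _example_entry_criteria(self):
    # With network_fail_time, the function returns a (nodes, time) tuple
    typed_nodes, fail_time = self.node_operations_entry_criteria(
        node_type=constants.WORKER_MACHINE,
        number_of_nodes=2,
        operation_name="Network Failure",
        network_fail_time=300,
    )
    # Without it, only the node list is returned:
    # typed_nodes = self.node_operations_entry_criteria('worker', 1)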
def test_run_couchbase_node_drain(self, cb_setup, node_type='master'):
    """
    Test couchbase workload with node drain
    """
    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(
        node_type='worker', print_table=True
    )
    # Node drain with specific node type
    typed_nodes = node.get_typed_nodes(
        node_type=node_type, num_of_nodes=1
    )
    typed_node_name = typed_nodes[0].name

    # Node maintenance - to gracefully terminate all pods on the node
    node.drain_nodes([typed_node_name])

    # Make the node schedulable again
    node.schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()

    for sample in TimeoutSampler(300, 5, self.cb.result.done):
        if sample:
            break
        else:
            logging.info(
                "#### ....Waiting for couchbase threads to complete..."
            )
    utils.ceph_health_check()
def test_2_nodes_different_types(self, pvc_factory, pod_factory):
    """
    OCS-1274:
    - Maintenance (mark as unschedulable and drain) 1 worker node and
      1 master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Mark the nodes as schedulable
    - Check cluster and Ceph health

    """
    # Get 1 node from each type
    nodes = [
        node.get_typed_nodes(
            node_type=node_type, num_of_nodes=1
        )[0] for node_type in ['worker', 'master']
    ]
    assert nodes, "Failed to find nodes for the test"

    node_names = [typed_node.name for typed_node in nodes]

    # Maintenance the nodes (unschedule and drain)
    node.drain_nodes(node_names)

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Mark the nodes back to schedulable
    node.schedule_nodes(node_names)

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
    """
    OCS-1269/OCS-1272:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Mark the node as schedulable
    - Check cluster and Ceph health

    """
    # Get 1 node
    typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
    typed_node_name = typed_nodes[0].name

    # Maintenance the node (unschedule and drain)
    node.drain_nodes([typed_node_name])

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Mark the node back to schedulable
    node.schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
    """
    OCS-1269/OCS-1272:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Mark the node as schedulable
    - Check cluster and Ceph health

    """
    # Get a list of 2 nodes. Pick one of them after checking
    # which one doesn't have the rook operator running on it
    typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
    typed_node_name = typed_nodes[0].name

    # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
    rook_operator_pod = pod.get_operator_pods()[0]
    operator_node = pod.get_pod_node(rook_operator_pod)
    if operator_node.get().get('metadata').get('name') == typed_node_name:
        typed_node_name = typed_nodes[1].name
    # End of workaround for BZ 1778488

    # Maintenance the node (unschedule and drain)
    drain_nodes([typed_node_name])

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def test_scale_osds_reboot_nodes(
    self, interface, project_factory, multi_pvc_factory, dc_pod_factory
):
    """
    Check storage utilization; if it is below the threshold, run IO,
    scale OSDs from 3 to 6, check for rebalance and reboot workers
    """
    current_osd_count = count_cluster_osd()
    proj_obj = project_factory()
    if current_osd_count == 3:
        while not validate_osd_utilization(osd_used=50):
            # Create pvc
            pvc_objs = multi_pvc_factory(
                project=proj_obj,
                interface=interface,
                size=self.pvc_size,
                num_of_pvc=self.num_of_pvcs
            )
            dc_pod_objs = list()
            for pvc_obj in pvc_objs:
                dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj))
            wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs)
            for pod_obj in dc_pod_objs:
                pod_obj.run_io(
                    storage_type='fs', size='3G', runtime='60',
                    fio_filename=f'{pod_obj.name}_io'
                )

    # Add capacity
    osd_size = storage_cluster.get_osd_size()
    count = storage_cluster.add_capacity(osd_size)
    pod = OCP(
        kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']
    )
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=count * 3
    )
    assert ceph_health_check(), "New OSDs failed to reach running state"

    cluster = CephCluster()

    # Get rebalance status
    rebalance_status = cluster.get_rebalance_status()
    logger.info(rebalance_status)
    if rebalance_status:
        time_taken = cluster.time_taken_to_complete_rebalance()
        logger.info(f"The time taken to complete rebalance {time_taken}")

    # Rolling reboot on worker nodes
    worker_nodes = get_typed_nodes(node_type='worker')

    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()

    for node in worker_nodes:
        nodes.restart_nodes(nodes=[node])
        wait_for_nodes_status()

    assert ceph_health_check(
        delay=180
    ), "Failed, Ceph health bad after nodes reboot"
def get_node_name_where_jenkins_pod_not_hosted(
    self, node_type=constants.WORKER_MACHINE, num_of_nodes=1
):
    """
    Get the names of nodes that do not host a Jenkins pod

    Args:
        node_type (str): The node type (e.g. worker, master)
        num_of_nodes (int): The number of nodes to be returned

    Returns:
        list: List of compute node names

    """
    if node_type == constants.MASTER_MACHINE:
        nodes_drain = [
            node.name for node in get_typed_nodes(
                node_type=node_type, num_of_nodes=num_of_nodes
            )
        ]
    elif node_type == constants.WORKER_MACHINE:
        pod_objs = []
        for project in self.projects:
            pod_names = get_pod_name_by_pattern(
                pattern='jenkins', namespace=project
            )
            pod_obj = [
                get_pod_obj(name=pod_name, namespace=project)
                for pod_name in pod_names
            ]
            pod_objs += pod_obj
        nodes_app_name = set(get_app_pod_running_nodes(pod_objs))
        nodes_worker_name = set(get_worker_nodes())
        nodes_drain = nodes_worker_name - nodes_app_name
    else:
        raise ValueError("The node type must be 'worker' or 'master'")
    return list(nodes_drain)[:num_of_nodes]
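# Usage sketch (hedged, hypothetical caller): drain the first node that
# hosts no Jenkins pod, then mark it schedulable again; drain_nodes() and
# schedule_nodes() are the node helpers used throughout this repo.
def _example_drain_non_jenkins_node(self):
    candidates = self.get_node_name_where_jenkins_pod_not_hosted(
        node_type=constants.WORKER_MACHINE, num_of_nodes=1
    )
    if candidates:
        node.drain_nodes(candidates)
        node.schedule_nodes(candidates)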
def test_node_maintenance_restart_activate(
    self, nodes, pvc_factory, pod_factory, node_type
):
    """
    OCS-1292/OCS-1293:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Restart the node
    - Mark the node as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating and deleting resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)

    """
    # Get 1 node of the type needed for the test iteration
    typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
    assert typed_nodes, f"Failed to find a {node_type} node for the test"
    typed_node_name = typed_nodes[0].name

    # Maintenance the node (unschedule and drain). The function contains logging
    drain_nodes([typed_node_name])

    # Restarting the node
    nodes.restart_nodes(nodes=typed_nodes, wait=True)

    wait_for_nodes_status(
        node_names=[typed_node_name],
        status=constants.NODE_READY_SCHEDULING_DISABLED
    )
    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Check cluster and Ceph health and check basic cluster
    # functionality by creating resources (pools, storageclasses,
    # PVCs, pods - both CephFS and RBD), run IO and delete the resources
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()
def test_node_maintenance(
    self, reduce_cluster_load, node_type, pvc_factory, pod_factory
):
    """
    OCS-1269/OCS-1272:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Mark the node as schedulable
    - Check cluster and Ceph health

    """
    # Get 1 node of the type needed for the test iteration
    typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
    assert typed_nodes, f"Failed to find a {node_type} node for the test"
    typed_node_name = typed_nodes[0].name

    # Maintenance the node (unschedule and drain)
    drain_nodes([typed_node_name])

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check(tries=90)
def verify_image_versions(old_images, upgrade_version):
    """
    Verify if all the images of OCS objects got upgraded

    Args:
        old_images (set): set with old images
        upgrade_version (packaging.version.Version): version of OCS

    """
    number_of_worker_nodes = len(get_typed_nodes())
    osd_count = get_osd_count()
    verify_pods_upgraded(old_images, selector=constants.OCS_OPERATOR_LABEL)
    verify_pods_upgraded(old_images, selector=constants.OPERATOR_LABEL)
    # In 4.3 the noobaa app selector covers 3 pods: noobaa-core-ID,
    # noobaa-db-ID and noobaa-operator-ID; in 4.2 only 2:
    # noobaa-core-ID and noobaa-operator-ID
    nooba_pods = 2 if upgrade_version < parse_version('4.3') else 3
    verify_pods_upgraded(
        old_images, selector=constants.NOOBAA_APP_LABEL, count=nooba_pods
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
        count=2
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
        count=2
    )
    verify_pods_upgraded(old_images, selector=constants.MON_APP_LABEL, count=3)
    verify_pods_upgraded(old_images, selector=constants.MGR_APP_LABEL)
    # The OSD upgrade has a 10 minute timeout per new attempt if the
    # cluster is not healthy
    # (https://bugzilla.redhat.com/show_bug.cgi?id=1840729), so set the
    # timeout to 12.5 minutes per OSD
    verify_pods_upgraded(
        old_images,
        selector=constants.OSD_APP_LABEL,
        count=osd_count,
        timeout=750 * osd_count,
    )
    verify_pods_upgraded(old_images, selector=constants.MDS_APP_LABEL, count=2)
    if config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS or (
            config.ENV_DATA.get('platform') == constants.AZURE_PLATFORM):
        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1857802
        # - RGW count is 1 post upgrade to OCS 4.5. Tracked with
        # https://github.com/red-hat-storage/ocs-ci/issues/2532
        # TODO: uncomment the below 1 line:
        # rgw_count = 2 if float(config.ENV_DATA['ocs_version']) >= 4.5 else 1
        # TODO: Delete the below 1 line
        rgw_count = 1
        verify_pods_upgraded(
            old_images, selector=constants.RGW_APP_LABEL, count=rgw_count
        )
def get_new_device_paths(device_sets_required, osd_size_capacity_requested):
    """
    Get new device paths to add capacity over a baremetal cluster

    Args:
        device_sets_required (int): Count of device sets to be added
        osd_size_capacity_requested (int): Requested OSD size capacity

    Returns:
        list: List containing added device paths

    """
    ocp_obj = OCP(
        kind='localvolume',
        namespace=config.ENV_DATA['local_storage_namespace']
    )
    workers = get_typed_nodes(node_type="worker")
    worker_names = [worker.name for worker in workers]
    config.ENV_DATA['worker_replicas'] = len(worker_names)
    output = ocp_obj.get(resource_name='local-block')
    # Fetch device paths present in the current LVCR
    cur_device_list = output["spec"]["storageClassDevices"][0]["devicePaths"]
    # Clone repo and run playbook to fetch all device paths from each node
    path = os.path.join(constants.EXTERNAL_DIR, "device-by-id-ocp")
    clone_repo(constants.OCP_QE_DEVICEPATH_REPO, path)
    os.chdir(path)
    run_cmd("ansible-playbook devices_by_id.yml")
    # Filter unused/unallocated device paths
    with open("local-storage-block.yaml", "r") as cloned_file:
        with open("local-block.yaml", "w") as our_file:
            device_from_worker = [1] * config.ENV_DATA['worker_replicas']
            cur_line = cloned_file.readline()
            while "devicePaths:" not in cur_line:
                our_file.write(cur_line)
                cur_line = cloned_file.readline()
            our_file.write(cur_line)
            cur_line = cloned_file.readline()
            # Add the required number of device paths from each worker node
            while cur_line:
                if str(osd_size_capacity_requested) in cur_line:
                    for i in range(len(worker_names)):
                        if device_from_worker[i] and (
                                str(worker_names[i]) in cur_line):
                            if not any(s in cur_line for s in cur_device_list):
                                our_file.write(cur_line)
                                device_from_worker[i] = (
                                    device_from_worker[i] - 1
                                )
                cur_line = cloned_file.readline()
    local_block_yaml = open("local-block.yaml")
    lvcr = yaml.load(local_block_yaml, Loader=yaml.FullLoader)
    new_dev_paths = lvcr["spec"]["storageClassDevices"][0]["devicePaths"]
    logger.info(f"Newly added devices are: {new_dev_paths}")
    if new_dev_paths:
        assert len(new_dev_paths) == (
            len(worker_names) * device_sets_required
        ), f"Current devices available = {len(new_dev_paths)}"
    os.chdir(constants.TOP_DIR)
    shutil.rmtree(path)
    # Return the old device paths plus the newly added device paths
    cur_device_list.extend(new_dev_paths)
    return cur_device_list
def get_new_device_paths(device_sets_required, osd_size_capacity_requested):
    """
    Get new device paths to add capacity over a baremetal cluster

    Args:
        device_sets_required (int): Count of device sets to be added
        osd_size_capacity_requested (int): Requested OSD size capacity

    Returns:
        cur_device_list (list): List containing added device paths

    """
    ocp_obj = OCP()
    workers = get_typed_nodes(node_type="worker")
    worker_names = [worker.name for worker in workers]
    output = ocp_obj.exec_oc_cmd(
        "get localvolume local-block -n local-storage -o yaml"
    )
    cur_device_list = output["spec"]["storageClassDevices"][0]["devicePaths"]
    path = os.path.join(constants.EXTERNAL_DIR, "device-by-id-ocp")
    utils.clone_repo(constants.OCP_QE_DEVICEPATH_REPO, path)
    os.chdir(path)
    utils.run_cmd("ansible-playbook devices_by_id.yml")
    with open("local-storage-block.yaml", "r") as cloned_file:
        with open("local-block.yaml", "w") as our_file:
            # This variant assumes exactly 3 worker nodes
            device_from_worker1 = device_sets_required
            device_from_worker2 = device_sets_required
            device_from_worker3 = device_sets_required
            cur_line = cloned_file.readline()
            while "devicePaths:" not in cur_line:
                our_file.write(cur_line)
                cur_line = cloned_file.readline()
            our_file.write(cur_line)
            cur_line = cloned_file.readline()
            # Add the required number of device paths from each node
            while cur_line:
                if str(osd_size_capacity_requested) in cur_line:
                    if device_from_worker1 and (str(worker_names[0]) in cur_line):
                        if not any(s in cur_line for s in cur_device_list):
                            our_file.write(cur_line)
                            device_from_worker1 = device_from_worker1 - 1
                    if device_from_worker2 and (str(worker_names[1]) in cur_line):
                        if not any(s in cur_line for s in cur_device_list):
                            our_file.write(cur_line)
                            device_from_worker2 = device_from_worker2 - 1
                    if device_from_worker3 and (str(worker_names[2]) in cur_line):
                        if not any(s in cur_line for s in cur_device_list):
                            our_file.write(cur_line)
                            device_from_worker3 = device_from_worker3 - 1
                cur_line = cloned_file.readline()
    local_block_yaml = open("local-block.yaml")
    lvcr = yaml.load(local_block_yaml, Loader=yaml.FullLoader)
    new_dev_paths = lvcr["spec"]["storageClassDevices"][0]["devicePaths"]
    log.info(f"Newly added devices are: {new_dev_paths}")
    assert len(new_dev_paths) == (len(worker_names) * device_sets_required), (
        f"Current devices available = {len(new_dev_paths)}"
    )
    os.chdir(constants.TOP_DIR)
    shutil.rmtree(path)
    cur_device_list.extend(new_dev_paths)
    return cur_device_list
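# Usage sketch for get_new_device_paths() (hedged: the 2048 GiB OSD size
# and the JSON-patch apply step are illustrative assumptions; the real
# caller is expected to patch the 'local-block' LocalVolume CR with the
# merged device list).
def _example_extend_local_block():
    import json
    device_paths = get_new_device_paths(
        device_sets_required=1, osd_size_capacity_requested=2048
    )
    patch = json.dumps([{
        "op": "replace",
        "path": "/spec/storageClassDevices/0/devicePaths",
        "value": device_paths,
    }])
    # Hypothetical apply step:
    # OCP(kind='localvolume', namespace='local-storage').patch(
    #     resource_name='local-block', params=patch, format_type='json')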
def get_environment_info():
    """
    Get the environment information. Information that will be collected:

    Versions:
        OCP - version / build / channel
        OCS - version / build
        Ceph - version
        Rook - version

    Platform:
        BM / VMware / Cloud provider etc.
        Instance type / architecture
        Cluster name
        User name that ran the test

    Return:
        dict: dictionary that contains the environment information

    """
    results = {}
    # Get the name and email of the user running the test
    try:
        user = utils.run_cmd('git config --get user.name').strip()
        email = utils.run_cmd('git config --get user.email').strip()
        results['user'] = f'{user} <{email}>'
    except CommandFailed:
        # If no git user is defined, the default user is none
        results['user'] = ''

    results['clustername'] = ocp.get_clustername()
    results['platform'] = node.get_provider()
    if results['platform'].lower() not in constants.ON_PREM_PLATFORMS:
        results['platform'] = results['platform'].upper()
    results['ocp_build'] = ocp.get_build()
    results['ocp_channel'] = ocp.get_ocp_channel()
    results['ocp_version'] = utils.get_ocp_version()
    results['ceph_version'] = utils.get_ceph_version()
    results['rook_version'] = utils.get_rook_version()
    results['ocs_build'] = ocp.get_ocs_version()
    # Extract the version number x.y.z from the full build name
    m = re.match(r"(\d.\d).(\d)", results['ocs_build'])
    if m and m.group(1) is not None:
        results['ocs_version'] = m.group(1)

    # Get the instance type for cloud, or the arch type for non-cloud
    worker_lbl = node.get_typed_nodes(
        num_of_nodes=1
    )[0].data['metadata']['labels']
    if 'beta.kubernetes.io/instance-type' in worker_lbl:
        results['worker_type'] = worker_lbl['beta.kubernetes.io/instance-type']
    else:
        results['worker_type'] = worker_lbl['kubernetes.io/arch']

    return results
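# Usage sketch (hedged, hypothetical caller): dump the collected
# environment information at the start of a test session; the `log`
# handle is assumed to be this module's logger.
def _example_log_environment():
    env = get_environment_info()
    for key, value in env.items():
        log.info(f"{key}: {value}")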
def verify_image_versions(old_images, upgrade_version):
    """
    Verify if all the images of OCS objects got upgraded

    Args:
        old_images (set): set with old images
        upgrade_version (packaging.version.Version): version of OCS

    """
    namespace = config.ENV_DATA['cluster_namespace']
    number_of_worker_nodes = len(get_typed_nodes())
    storage_cluster = StorageCluster(
        resource_name=config.ENV_DATA['storage_cluster_name'],
        namespace=namespace
    )
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count'])
        * int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica'])
    )
    verify_pods_upgraded(old_images, selector=constants.OCS_OPERATOR_LABEL)
    verify_pods_upgraded(old_images, selector=constants.OPERATOR_LABEL)
    # In 4.3 the noobaa app selector covers 3 pods: noobaa-core-ID,
    # noobaa-db-ID and noobaa-operator-ID; in 4.2 only 2:
    # noobaa-core-ID and noobaa-operator-ID
    nooba_pods = 2 if upgrade_version < parse_version('4.3') else 3
    verify_pods_upgraded(
        old_images, selector=constants.NOOBAA_APP_LABEL, count=nooba_pods
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
        count=2
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
        count=2
    )
    verify_pods_upgraded(old_images, selector=constants.MGR_APP_LABEL)
    verify_pods_upgraded(old_images, selector=constants.MON_APP_LABEL, count=3)
    verify_pods_upgraded(
        old_images,
        selector=constants.OSD_APP_LABEL,
        count=osd_count,
        timeout=1800,
    )
    verify_pods_upgraded(old_images, selector=constants.MDS_APP_LABEL, count=2)
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        verify_pods_upgraded(
            old_images, selector=constants.RGW_APP_LABEL, count=1
        )
def verify_image_versions(old_images, upgrade_version):
    """
    Verify if all the images of OCS objects got upgraded

    Args:
        old_images (set): set with old images
        upgrade_version (packaging.version.Version): version of OCS

    """
    number_of_worker_nodes = len(get_typed_nodes())
    osd_count = get_osd_count()
    verify_pods_upgraded(old_images, selector=constants.OCS_OPERATOR_LABEL)
    verify_pods_upgraded(old_images, selector=constants.OPERATOR_LABEL)
    # In 4.3 the noobaa app selector covers 3 pods: noobaa-core-ID,
    # noobaa-db-ID and noobaa-operator-ID; in 4.2 only 2:
    # noobaa-core-ID and noobaa-operator-ID
    nooba_pods = 2 if upgrade_version < parse_version('4.3') else 3
    verify_pods_upgraded(
        old_images, selector=constants.NOOBAA_APP_LABEL, count=nooba_pods
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
        count=2
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
        count=2
    )
    verify_pods_upgraded(old_images, selector=constants.MON_APP_LABEL, count=3)
    verify_pods_upgraded(old_images, selector=constants.MGR_APP_LABEL)
    # The OSD upgrade has a 10 minute timeout per new attempt if the
    # cluster is not healthy
    # (https://bugzilla.redhat.com/show_bug.cgi?id=1840729), so set the
    # timeout to 12.5 minutes per OSD
    verify_pods_upgraded(
        old_images,
        selector=constants.OSD_APP_LABEL,
        count=osd_count,
        timeout=750 * osd_count,
    )
    verify_pods_upgraded(old_images, selector=constants.MDS_APP_LABEL, count=2)
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        verify_pods_upgraded(
            old_images, selector=constants.RGW_APP_LABEL, count=1
        )
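# Usage sketch for verify_image_versions() (hedged: collect_old_images()
# and run_ocs_upgrade() are hypothetical stand-ins for the pre-upgrade
# image collection and the upgrade driver; only verify_image_versions()
# and parse_version() come from this module's context).
def _example_verify_upgrade():
    old_images = collect_old_images()  # hypothetical: set of pre-upgrade images
    run_ocs_upgrade()                  # hypothetical upgrade driver
    verify_image_versions(old_images, upgrade_version=parse_version('4.5'))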
def test_run_pgsql_node_drain(
    self, pgsql, transactions=900, node_type='master'
):
    """
    Test pgsql workload with node drain
    """
    # Create pgbench benchmark
    pgsql.create_pgbench_benchmark(
        replicas=3, transactions=transactions, clients=3
    )

    # Start measuring time
    start_time = datetime.now()

    # Wait for pgbench pod to reach running state
    pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(
        node_type='worker', print_table=True
    )

    # Node drain with specific node type
    typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
    typed_node_name = typed_nodes[0].name

    # Node maintenance - to gracefully terminate all pods on the node
    node.drain_nodes([typed_node_name])

    # Make the node schedulable again
    node.schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()

    # Wait for pgbench pod to complete
    pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

    # Calculate the time from running state to completed state
    end_time = datetime.now()
    diff_time = end_time - start_time
    log.info(
        f"\npgbench pod reached completed state after "
        f"{diff_time.seconds} seconds\n"
    )

    # Get pgbench pods
    pgbench_pods = pgsql.get_pgbench_pods()

    # Validate pgbench run and parse logs
    pgsql.validate_pgbench_run(pgbench_pods)
def add_worker_based_on_pods_count_per_node(
    node_count, expected_count, role_type=None, machineset_name=None
):
    """
    Evaluate the number of pods running on each node and add new nodes
    accordingly.

    Args:
        machineset_name (list): Machineset names to add more nodes if required
        node_count (int): Additional nodes to be added
        expected_count (int): Expected pod count on one node
        role_type (str): Type to add to the nodes getting added

    Returns:
        bool: True if nodes get added, else False

    """
    # Check the running pod count on each node
    if (
        config.ENV_DATA['deployment_type'] == 'ipi'
        and config.ENV_DATA['platform'].lower() == 'aws'
    ):
        app_nodes = node.get_typed_nodes(node_type=role_type)
        pod_count_dict = node.get_running_pod_count_from_node(
            node_type=role_type
        )
        high_count_nodes, less_count_nodes = ([] for i in range(2))
        for node_obj in app_nodes:
            count = pod_count_dict[f"{node_obj.name}"]
            if count >= expected_count:
                high_count_nodes.append(node_obj.name)
            else:
                less_count_nodes.append(node_obj.name)
        if len(less_count_nodes) <= 1:
            for name in machineset_name:
                count = machine.get_replica_count(machine_set=name)
                machine.add_node(machine_set=name, count=(count + node_count))
                machine.wait_for_new_node_to_be_ready(name)
            return True
        else:
            logging.info(
                f"Enough pods can be created with available nodes {pod_count_dict}"
            )
            return False
    elif (
        config.ENV_DATA['deployment_type'] == 'upi'
        and config.ENV_DATA['platform'].lower() in (
            'vsphere', 'baremetal', 'azure'
        )
    ):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
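# Usage sketch (hedged: AWS IPI only, per the platform checks above; the
# machineset name, pod threshold and role type are illustrative).
def _example_scale_for_pods():
    added = add_worker_based_on_pods_count_per_node(
        node_count=1,
        expected_count=120,
        role_type='app',
        machineset_name=['cluster-abc1-worker-us-east-2a'],
    )
    if not added:
        logging.info("Existing workers can absorb the planned pods")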
def add_worker_based_on_cpu_utilization(
    node_count, expected_percent, role_type=None, machineset_name=None
):
    """
    Evaluate the CPU utilization of nodes and add nodes if required.

    Args:
        machineset_name (list): Machineset names to add more nodes if required
        node_count (int): Additional nodes to be added
        expected_percent (int): Expected utilization percent
        role_type (str): Type to add to the nodes getting added

    Returns:
        bool: True if nodes get added, else False

    """
    # Check the CPU utilization on each node
    if (
        config.ENV_DATA['deployment_type'] == 'ipi'
        and config.ENV_DATA['platform'].lower() == 'aws'
    ):
        app_nodes = node.get_typed_nodes(node_type=role_type)
        uti_dict = node.get_node_resource_utilization_from_oc_describe(
            node_type=role_type
        )
        uti_high_nodes, uti_less_nodes = ([] for i in range(2))
        for node_obj in app_nodes:
            utilization_percent = uti_dict[f"{node_obj.name}"]['cpu']
            if utilization_percent > expected_percent:
                uti_high_nodes.append(node_obj.name)
            else:
                uti_less_nodes.append(node_obj.name)
        if len(uti_less_nodes) <= 1:
            for name in machineset_name:
                count = machine.get_replica_count(machine_set=name)
                machine.add_node(machine_set=name, count=(count + node_count))
                machine.wait_for_new_node_to_be_ready(name)
            return True
        else:
            logging.info(
                f"Enough resource available for more pod creation {uti_dict}"
            )
            return False
    elif (
        config.ENV_DATA['deployment_type'] == 'upi'
        and config.ENV_DATA['platform'].lower() in (
            'vsphere', 'baremetal', 'azure'
        )
    ):
        raise UnsupportedPlatformError("Unsupported Platform to add worker")
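# Usage sketch (hedged: same AWS IPI assumption; the 70% CPU threshold
# and machineset name are illustrative).
def _example_scale_for_cpu():
    added = add_worker_based_on_cpu_utilization(
        node_count=1,
        expected_percent=70,
        role_type='app',
        machineset_name=['cluster-abc1-worker-us-east-2a'],
    )
    if not added:
        logging.info("CPU headroom is sufficient; no worker added")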
def verify_image_versions(old_images):
    """
    Verify if all the images of OCS objects got upgraded

    Args:
        old_images (set): set with old images

    """
    namespace = config.ENV_DATA['cluster_namespace']
    number_of_worker_nodes = len(get_typed_nodes())
    storage_cluster = StorageCluster(
        resource_name=config.ENV_DATA['storage_cluster_name'],
        namespace=namespace
    )
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count'])
        * int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica'])
    )
    verify_pods_upgraded(old_images, selector=constants.OCS_OPERATOR_LABEL)
    verify_pods_upgraded(old_images, selector=constants.OPERATOR_LABEL)
    verify_pods_upgraded(
        old_images, selector=constants.NOOBAA_APP_LABEL, count=2
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
        count=2
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        count=number_of_worker_nodes,
    )
    verify_pods_upgraded(
        old_images,
        selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
        count=2
    )
    verify_pods_upgraded(old_images, selector=constants.MGR_APP_LABEL)
    verify_pods_upgraded(old_images, selector=constants.MON_APP_LABEL, count=3)
    verify_pods_upgraded(
        old_images, selector=constants.OSD_APP_LABEL, count=osd_count
    )
    verify_pods_upgraded(old_images, selector=constants.MDS_APP_LABEL, count=2)
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        verify_pods_upgraded(
            old_images, selector=constants.RGW_APP_LABEL, count=1
        )
def test_run_couchbase_node_reboot(self, cb_setup, nodes, pod_name_of_node):
    """
    Test couchbase workload with node reboot
    """
    # Check worker and master node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(
        node_type='worker', print_table=True
    )
    get_node_resource_utilization_from_adm_top(
        node_type='master', print_table=True
    )

    if pod_name_of_node == 'couchbase':
        node_list = self.cb.get_couchbase_nodes()
    elif pod_name_of_node == 'osd':
        node_list = get_osd_running_nodes()
    elif pod_name_of_node == 'master':
        master_node = get_typed_nodes(pod_name_of_node, num_of_nodes=1)

    # Restart relevant node
    if pod_name_of_node == 'master':
        nodes.restart_nodes(master_node, wait=False)
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)
    else:
        restart_node = get_node_objs(
            node_list[random.randint(0, len(node_list) - 1)]
        )
        nodes.restart_nodes(restart_node)

    # Validate all nodes and services are in READY state and up
    retry(
        (CommandFailed, TimeoutError, AssertionError,
         ResourceWrongStatusException),
        tries=60, delay=15
    )(ocp.wait_for_cluster_connectivity)(tries=400)
    retry(
        (CommandFailed, TimeoutError, AssertionError,
         ResourceWrongStatusException),
        tries=60, delay=15
    )(wait_for_nodes_status)(timeout=1800)
    bg_handler = flowtest.BackgroundOps()
    bg_ops = [self.cb.result]
    retry(CommandFailed, tries=60, delay=15)(
        bg_handler.wait_for_bg_operations
    )(bg_ops, timeout=3600)
    self.sanity_helpers.health_check()
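# Note on the retry() calls above: retry(...) returns a decorator, so the
# target callable is passed in undecorated and its arguments are applied
# to the wrapped result -- retry(Exc, tries=N, delay=D)(fn)(args) --
# otherwise fn would run exactly once before retry() ever sees it.
# A minimal sketch of the pattern:
def _example_retry_pattern():
    wrapped = retry(
        (CommandFailed, TimeoutError), tries=60, delay=15
    )(wait_for_nodes_status)
    wrapped(timeout=1800)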
def test_monitoring_after_rebooting_master_node(self, nodes, pods):
    """
    Test case to validate that rebooting a master node shouldn't delete
    the data collected on the prometheus pod
    """
    # Get the master node list
    master_nodes = get_typed_nodes(node_type='master')

    # Reboot the master nodes one after another
    for node in master_nodes:
        nodes.restart_nodes([node])
        wait_for_nodes_status_and_prometheus_health_check(pods)

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()
def test_node_maintenance_restart_activate(
    self, nodes, pvc_factory, pod_factory, node_type
):
    """
    OCS-1292/OCS-1293:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Restart the node
    - Mark the node as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating and deleting resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)

    """
    # Get a list of 2 nodes. Pick one of them after checking
    # which one doesn't have the rook operator running on it
    typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
    assert typed_nodes, f"Failed to find a {node_type} node for the test"
    typed_node_name = typed_nodes[0].name

    # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
    rook_operator_pod = pod.get_operator_pods()[0]
    operator_node = pod.get_pod_node(rook_operator_pod)
    if operator_node.get().get('metadata').get('name') == typed_node_name:
        typed_node_name = typed_nodes[1].name
    # End of workaround for BZ 1778488

    # Maintenance the node (unschedule and drain). The function contains logging
    drain_nodes([typed_node_name])

    # Restarting the node
    nodes.restart_nodes(nodes=typed_nodes, wait=True)

    wait_for_nodes_status(
        node_names=[typed_node_name],
        status=constants.NODE_READY_SCHEDULING_DISABLED
    )
    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Check cluster and Ceph health and check basic cluster
    # functionality by creating resources (pools, storageclasses,
    # PVCs, pods - both CephFS and RBD), run IO and delete the resources
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()
def test_amq_after_rebooting_node(self, node_type, nodes, amq_setup):
    """
    Test case to validate that rebooting a node shouldn't affect
    the amq workloads running in the background
    """
    # Get all amq pods
    pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

    # Get the node list
    node = get_typed_nodes(node_type, num_of_nodes=1)

    # Reboot one node
    nodes.restart_nodes(node, wait=False)

    # Wait some time after rebooting the node
    waiting_time = 40
    log.info(f"Waiting {waiting_time} seconds...")
    time.sleep(waiting_time)

    # Validate all nodes and services are in READY state and up
    retry(
        (CommandFailed, TimeoutError, AssertionError,
         ResourceWrongStatusException),
        tries=60, delay=15
    )(ocp.wait_for_cluster_connectivity)(tries=400)
    retry(
        (CommandFailed, TimeoutError, AssertionError,
         ResourceWrongStatusException),
        tries=60, delay=15
    )(wait_for_nodes_status)(timeout=1800)

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()

    # Check all amq pods are up and running
    assert POD.wait_for_resource(
        condition='Running',
        resource_count=len(pod_obj_list),
        timeout=300
    )

    # Validate the results
    log.info("Validate message run completely")
    for thread in self.threads:
        thread.result(timeout=1800)
def test_detach_attach_worker_volume(self, aws_obj, pvc_factory, pod_factory):
    """
    Detach and attach worker volume

    - Detach the data volume from one of the worker nodes
    - Validate cluster functionality, without checking cluster and Ceph
      health (as one node volume is detached, the cluster will be
      unhealthy) by creating resources and running IO
    - Attach back the volume to the node
    - Restart the node so the volume will get re-mounted

    """
    # Requesting 1 worker node for the test as this case includes detach
    # and attach of the data volume of 1 worker node
    worker = node.get_typed_nodes(num_of_nodes=1)
    assert worker, "Failed to find a worker node for the test"
    worker = worker[0]

    # Get the worker node's ec2 instance ID and name
    instance = aws.get_instances_ids_and_names([worker])
    assert instance, f"Failed to get ec2 instances for node {worker.name}"
    instance_id = [*instance][0]

    # Get the ec2 instance's data volume
    ec2_volume = aws.get_data_volumes(instance_id)[0]

    # Detach volume (logging is done inside the function)
    aws_obj.detach_volume(ec2_volume)

    # Validate cluster is still functional
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    # Attach volume (logging is done inside the function)
    aws_obj.attach_volume(ec2_volume, instance_id)

    # Restart the instance so the volume will get re-mounted
    aws_obj.restart_ec2_instances(instances=instance, wait=True)

    # Cluster health check
    self.sanity_helpers.health_check()
def test_detach_attach_2_workers_volumes(self, aws_obj, pvc_factory, pod_factory):
    """
    Detach and attach disks from 2 worker nodes

    - Detach the data volume from 2 of the worker nodes
    - Attach back the volume to the worker nodes
    - Restart the nodes so the volume will get re-mounted in each node
    - Check cluster health and functionality to make sure detach,
      attach and restart did not affect the cluster

    """
    # Requesting 2 worker nodes for the test as this case includes
    # detach and attach of the data volumes of 2 worker nodes
    workers = node.get_typed_nodes(num_of_nodes=2)
    assert workers, "Failed to find worker nodes for the test"

    # Get the worker nodes' ec2 instance IDs and names
    instances = aws.get_instances_ids_and_names(workers)
    assert instances, (
        f"Failed to get ec2 instances for nodes {[w.name for w in workers]}"
    )

    for instance in instances.items():
        # Each item is an (instance_id, instance_name) pair
        instance_id = [*instance][0]

        # Get the ec2 instance's data volume
        ec2_volume = aws.get_data_volumes(instance_id)[0]

        # Detach volume (logging is done inside the function)
        aws_obj.detach_volume(ec2_volume)

        # Attach volume (logging is done inside the function)
        aws_obj.attach_volume(ec2_volume, instance_id)

    # Restart the instances so the volume will get re-mounted
    aws_obj.restart_ec2_instances(instances=instances, wait=True)

    # Validate cluster is still functional
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
def test_detach_attach_worker_volume(self, aws_obj, resources):
    """
    Detach and attach worker volume

    - Detach the data volume from one of the worker nodes
    - Validate cluster functionality, without checking cluster and Ceph
      health (as one node volume is detached, the cluster will be
      unhealthy) by creating resources and running IO
    - Attach back the volume to the node
    - Restart the node so the volume will get re-mounted

    """
    # Requesting 1 worker node for the test as this case includes detach
    # and attach of the data volume of 1 worker node
    worker = node.get_typed_nodes(num_of_nodes=1)[0]

    # Get the worker node's ec2 instance ID and name
    instance = aws.get_instances_ids_and_names([worker])
    instance_id = [*instance][0]

    # Get the ec2 instance's data volume
    ec2_volume = aws.get_data_volumes(instance_id)[0]

    # Detach volume (logging is done inside the function)
    aws_obj.detach_volume(ec2_volume)

    # Validate cluster is still functional
    self.validate_cluster(
        resources=resources,
        nodes=list(instance.values()),
        health_check=False
    )

    # Attach volume (logging is done inside the function)
    aws_obj.attach_volume(ec2_volume, instance_id)

    # Restart the instance so the volume will get re-mounted
    aws_obj.restart_ec2_instances(instances=instance, wait=True)

    # Cluster health check
    self.health_check()