def add_capacity_test():
    """Add capacity to the storage cluster and verify the new OSDs come up."""
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    # add_capacity returns the new storage device set count; each set maps to 3 OSDs
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * 3,
    )

    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # These lines are commented out as a workaround for bug 1842500

    # Verify OSDs are encrypted.
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=80)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=5400
    ), "Data re-balance failed to complete"

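# NOTE: These snippets are lifted from ocs-ci tests, so the helpers they call
# (storage_cluster, OCP, constants, config, osd_encryption_verification,
# ceph_health_check, CephCluster, get_osd_pods, and others) are not defined
# here. A plausible import block for the capacity snippets, assuming the usual
# ocs-ci module layout (the exact paths are an assumption, not shown in the
# snippets themselves), would be:
from ocs_ci.framework import config
from ocs_ci.ocs import constants
from ocs_ci.ocs.ocp import OCP
from ocs_ci.ocs.cluster import CephCluster
from ocs_ci.ocs.resources import storage_cluster
from ocs_ci.ocs.resources.pod import get_osd_pods
from ocs_ci.ocs.resources.storage_cluster import osd_encryption_verification
from ocs_ci.utility.utils import ceph_health_check
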
def test_add_ocs_node(self, add_nodes):
    """
    Test to add OCS nodes and wait till rebalance is completed
    """
    add_nodes(ocs_nodes=True)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=3600
    ), "Data re-balance failed to complete"

def test_nodereplacement_proactive(self):
    """
    Knip-894 Node Replacement proactive (without IO running)
    """
    osd_node_name = select_osd_node_name()
    delete_and_create_osd_node(osd_node_name)

    # Verify everything is running fine
    log.info("Verifying all resources are Running and match the expected result")
    self.sanity_helpers.health_check(tries=90)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=1800
    ), "Data re-balance failed to complete"

def add_capacity_test():
    """
    Add capacity to the storage cluster, verify that the existing OSD pods
    were not restarted (bug 1931601), and wait for the new OSDs to come up.
    """
    osd_size = storage_cluster.get_osd_size()
    existing_osd_pods = get_osd_pods()
    existing_osd_pod_names = [pod.name for pod in existing_osd_pods]
    result = storage_cluster.add_capacity(osd_size)
    osd_pods_post_expansion = get_osd_pods()
    osd_pod_names_post_expansion = [pod.name for pod in osd_pods_post_expansion]
    restarted_osds = list()

    logger.info(
        "Checking if existing OSD pods were restarted (deleted) post add capacity (bug 1931601)"
    )
    for pod in existing_osd_pod_names:
        if pod not in osd_pod_names_post_expansion:
            restarted_osds.append(pod)
    assert (
        len(restarted_osds) == 0
    ), f"The following OSD pods were restarted (deleted) post add capacity: {restarted_osds}"

    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * 3,
    )

    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # These lines are commented out as a workaround for bug 1842500

    # Verify OSDs are encrypted.
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=80)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=5400
    ), "Data re-balance failed to complete"

def test_add_node(self):
    """
    Test for adding worker nodes to the cluster while IOs are running
    """
    new_nodes = 3
    if config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS:
        dt = config.ENV_DATA["deployment_type"]
        if dt == "ipi":
            machines = machine_utils.get_machinesets()
            logger.info(
                f"The worker nodes number before expansion {len(node.get_worker_nodes())}"
            )
            for machine in machines:
                add_new_node_and_label_it(machine)
            logger.info(
                f"The worker nodes number after expansion {len(node.get_worker_nodes())}"
            )
        else:
            logger.info(
                f"The worker nodes number before expansion {len(node.get_worker_nodes())}"
            )
            if config.ENV_DATA.get("rhel_workers"):
                node_type = constants.RHEL_OS
            else:
                node_type = constants.RHCOS
            assert add_new_node_and_label_upi(node_type, new_nodes), "Add node failed"
            logger.info(
                f"The worker nodes number after expansion {len(node.get_worker_nodes())}"
            )
    elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
        pytest.skip(
            "Skipping add node in VMware platform due to "
            "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
        )
        # Issue to remove the skip code: https://github.com/red-hat-storage/ocs-ci/issues/2403
        # logger.info(f'The worker nodes number before expansion {len(node.get_worker_nodes())}')
        # if config.ENV_DATA.get('rhel_user'):
        #     pytest.skip("Skipping add RHEL node, code unavailable")
        # node_type = constants.RHCOS
        # assert add_new_node_and_label_upi(node_type, new_nodes), "Add node failed"
        # logger.info(f'The worker nodes number after expansion {len(node.get_worker_nodes())}')

    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=3600
    ), "Data re-balance failed to complete"

def test_add_capacity_internal(self, setup_ui):
    """
    Test Add Capacity on Internal cluster via UI
    """
    logger.info("Get osd pods before add capacity")
    osd_pods_before_add_capacity = get_osd_pods()
    osd_count = len(osd_pods_before_add_capacity)

    logger.info("Add capacity via UI")
    infra_ui_obj = AddReplaceDeviceUI(setup_ui)
    infra_ui_obj.add_capacity_ui()

    logger.info("Wait for osd pods to be in Running state")
    for osd_pods in TimeoutSampler(
        timeout=600,
        sleep=10,
        func=get_osd_pods,
    ):
        if len(osd_pods) == (osd_count + 3):
            break

    osd_pod_names = list()
    for osd_pod in osd_pods:
        wait_for_resource_state(
            resource=osd_pod, state=constants.STATUS_RUNNING, timeout=300
        )
        osd_pod_names.append(osd_pod.name)

    logger.info("Verify via UI that all osd pods are in Running state")
    infra_ui_obj.verify_pod_status(pod_names=osd_pod_names)

    logger.info("Wait for data re-balance to complete")
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=5400
    ), "Data re-balance failed to complete"

    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

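# The TimeoutSampler loop above only breaks once the OSD count reaches
# osd_count + 3; if that never happens, the sampler raises on timeout. A
# minimal sketch of wrapping that wait so the timeout becomes an explicit
# test failure -- it assumes TimeoutSampler raises TimeoutExpiredError
# (e.g. from ocs_ci.ocs.exceptions import TimeoutExpiredError):
def wait_for_osd_count(expected_count, timeout=600):
    try:
        for osd_pods in TimeoutSampler(timeout=timeout, sleep=10, func=get_osd_pods):
            if len(osd_pods) == expected_count:
                return osd_pods
    except TimeoutExpiredError:
        pytest.fail(
            f"Expected {expected_count} OSD pods within {timeout}s, "
            f"found {len(get_osd_pods())} instead"
        )
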
def test_upgrade_ocp(self, reduce_and_resume_cluster_load):
    """
    Tests OCS stability when upgrading OCP
    """
    ceph_cluster = CephCluster()
    with CephHealthMonitor(ceph_cluster):
        ocp_channel = config.UPGRADE.get("ocp_channel", ocp.get_ocp_upgrade_channel())
        ocp_upgrade_version = config.UPGRADE.get("ocp_upgrade_version")
        if not ocp_upgrade_version:
            ocp_upgrade_version = get_latest_ocp_version(channel=ocp_channel)
            ocp_arch = config.UPGRADE["ocp_arch"]
            target_image = f"{ocp_upgrade_version}-{ocp_arch}"
        elif ocp_upgrade_version.endswith(".nightly"):
            target_image = expose_ocp_version(ocp_upgrade_version)
        logger.info(f"Target image: {target_image}")

        image_path = config.UPGRADE["ocp_upgrade_path"]
        cluster_operators = ocp.get_all_cluster_operators()
        logger.info(f"oc version: {ocp.get_current_oc_version()}")

        # Verify upgrade subscription channel:
        ocp.patch_ocp_upgrade_channel(ocp_channel)
        for sampler in TimeoutSampler(
            timeout=250,
            sleep=15,
            func=ocp.verify_ocp_upgrade_channel,
            channel_variable=ocp_channel,
        ):
            if sampler:
                logger.info(f"OCP Channel: {ocp_channel}")
                break

        # Upgrade OCP
        logger.info(f"Full upgrade path: {image_path}:{target_image}")
        ocp.upgrade_ocp(image=target_image, image_path=image_path)

        # Wait for the upgrade of each cluster operator
        for ocp_operator in cluster_operators:
            logger.info(f"Checking upgrade status of {ocp_operator}:")
            # ############ Workaround for issue 2624 #######
            name_changed_between_versions = (
                "service-catalog-apiserver",
                "service-catalog-controller-manager",
            )
            if ocp_operator in name_changed_between_versions:
                logger.info(f"{ocp_operator} upgrade will not be verified")
                continue
            # ############ End of Workaround ###############
            ver = ocp.get_cluster_operator_version(ocp_operator)
            logger.info(f"current {ocp_operator} version: {ver}")
            for sampler in TimeoutSampler(
                timeout=2700,
                sleep=60,
                func=ocp.confirm_cluster_operator_version,
                target_version=target_image,
                cluster_operator=ocp_operator,
            ):
                if sampler:
                    logger.info(f"{ocp_operator} upgrade completed!")
                    break
                else:
                    logger.info(f"{ocp_operator} upgrade has not completed yet!")

        # Post upgrade validation: check cluster operator status
        cluster_operators = ocp.get_all_cluster_operators()
        for ocp_operator in cluster_operators:
            logger.info(f"Checking cluster status of {ocp_operator}")
            for sampler in TimeoutSampler(
                timeout=2700,
                sleep=60,
                func=ocp.verify_cluster_operator_status,
                cluster_operator=ocp_operator,
            ):
                if sampler:
                    break
                else:
                    logger.info(f"{ocp_operator} status is not valid")

        # Post upgrade validation: check cluster version status
        logger.info("Checking clusterversion status")
        for sampler in TimeoutSampler(
            timeout=900, sleep=15, func=ocp.validate_cluster_version_status
        ):
            if sampler:
                logger.info("Upgrade Completed Successfully!")
                break

        new_ceph_cluster = CephCluster()
        new_ceph_cluster.wait_for_rebalance(timeout=1800)
        ceph_health_check(tries=90, delay=30)

def test_add_capacity_with_resource_delete(
    self,
    workload_storageutilization_rbd,
    resource_name,
    resource_id,
    is_kill_resource_repeatedly,
):
    """
    Add capacity to the cluster, then delete the given resource while the
    storage capacity is getting increased.

    Args:
        resource_name (str): the name of the resource to delete
        resource_id (int): the id of the resource to delete
        is_kill_resource_repeatedly (bool): If True, kill the resource
            repeatedly. If False, delete the resource only once.

    """
    used_percentage = get_percent_used_capacity()
    logging.info(
        f"storageutilization is completed. used capacity = {used_percentage}"
    )

    osd_pods_before = pod_helpers.get_osd_pods()
    number_of_osd_pods_before = len(osd_pods_before)
    if number_of_osd_pods_before >= constants.MAX_OSDS:
        pytest.skip("We have the maximum number of OSDs in the cluster")

    d = Disruptions()
    d.set_resource(resource_name)

    self.new_pods_in_status_running = False
    osd_size = storage_cluster.get_osd_size()
    logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
    storagedeviceset_count = storage_cluster.add_capacity(osd_size)
    logging.info("Adding one new set of OSDs was issued without problems")

    # Wait for the new OSDs to come up. After the first new OSD is in status
    # Init, delete the resource. After deleting the resource we expect all the
    # new OSDs to be in status Running, and the deleted resource to be back in
    # status Running as well.
    pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
    logging.info(
        f"Delete a {resource_name} pod while storage capacity is getting increased"
    )
    if is_kill_resource_repeatedly:
        with ThreadPoolExecutor() as executor:
            executor.submit(self.kill_resource_repeatedly, resource_name, resource_id)
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)
    else:
        d.delete_resource(resource_id)
        self.wait_for_osd_pods_to_be_running(storagedeviceset_count)

    self.new_pods_in_status_running = True
    logging.info("Finished verifying add capacity when one of the pods gets deleted")
    logging.info("Waiting for ceph health check to finish...")
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=90)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=1800
    ), "Data re-balance failed to complete"

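# Every snippet above ends with the same tail: build a CephCluster object,
# assert wait_for_rebalance, and usually run ceph_health_check. A minimal
# sketch of a shared helper that could consolidate that tail, using only the
# names already seen above; the default timeouts and the combined
# health-check step are assumptions, not existing ocs-ci API:
def verify_rebalance_and_health(rebalance_timeout=3600, health_tries=90):
    ceph_health_check(
        namespace=config.ENV_DATA["cluster_namespace"], tries=health_tries
    )
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=rebalance_timeout
    ), "Data re-balance failed to complete"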