def add_new_node_to_lvd_and_lvs(old_node_name, new_node_name):
    """
    Replace the old node with the new node in the localVolumeDiscovery and
    localVolumeSet resources, as described in the node replacement with LSO
    documentation.

    Args:
        old_node_name (str): The old node name to remove from the local volume
        new_node_name (str): The new node name to add to the local volume

    Returns:
        bool: True if the changes were applied, False otherwise

    """
    old_node_index = get_node_index_in_local_block(old_node_name)
    path_to_old_node = f"/spec/nodeSelector/nodeSelectorTerms/0/matchExpressions/0/values/{old_node_index}"
    params = f"""[{{"op": "replace", "path": "{path_to_old_node}", "value": "{new_node_name}"}}]"""

    ocp_lvd_obj = OCP(
        kind=constants.LOCAL_VOLUME_DISCOVERY,
        namespace=defaults.LOCAL_STORAGE_NAMESPACE,
    )
    ocp_lvs_obj = OCP(
        kind=constants.LOCAL_VOLUME_SET,
        namespace=defaults.LOCAL_STORAGE_NAMESPACE,
        resource_name=constants.LOCAL_BLOCK_RESOURCE,
    )

    lvd_result = ocp_lvd_obj.patch(params=params, format_type="json")
    lvs_result = ocp_lvs_obj.patch(params=params, format_type="json")
    return lvd_result and lvs_result
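

# Hedged usage sketch (not part of the original source): shows how the helper above might
# be invoked during an LSO node replacement and what the resulting JSON patch looks like.
# The node names are placeholders, not values taken from the source.
def example_replace_node_in_local_volume_resources():
    old_node, new_node = "compute-0", "compute-0-new"
    # The helper builds one JSON patch and applies it to both localVolumeDiscovery and
    # the localblock localVolumeSet, e.g.:
    #   [{"op": "replace",
    #     "path": "/spec/nodeSelector/nodeSelectorTerms/0/matchExpressions/0/values/<idx>",
    #     "value": "compute-0-new"}]
    assert add_new_node_to_lvd_and_lvs(old_node, new_node), (
        f"Failed to replace {old_node} with {new_node} in LVD/LVS"
    )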
def test_pv_scale_out(self, backingstore_factory):
    """
    Test to check the scale out functionality of pv pool backing store.

    """
    pv_backingstore = backingstore_factory(
        "OC",
        {"pv": [(1, MIN_PV_BACKINGSTORE_SIZE_IN_GB, "ocs-storagecluster-ceph-rbd")]},
    )[0]

    logger.info(f"Scaling out PV Pool {pv_backingstore.name}")
    pv_backingstore.vol_num += 1
    edit_pv_backingstore = OCP(
        kind="BackingStore", namespace=config.ENV_DATA["cluster_namespace"]
    )
    params = f'{{"spec":{{"pvPool":{{"numVolumes":{pv_backingstore.vol_num}}}}}}}'
    edit_pv_backingstore.patch(
        resource_name=pv_backingstore.name, params=params, format_type="merge"
    )

    logger.info("Waiting for backingstore to return to OPTIMAL state")
    wait_for_pv_backingstore(
        pv_backingstore.name, config.ENV_DATA["cluster_namespace"]
    )

    logger.info("Check if PV Pool scale out was successful")
    backingstore_dict = edit_pv_backingstore.get(pv_backingstore.name)
    assert (
        backingstore_dict["spec"]["pvPool"]["numVolumes"] == pv_backingstore.vol_num
    ), "Scale out PV Pool failed."
    logger.info("Scale out was successful")
def change_noobaa_endpoints_count(nb_eps):
    """
    Scale up or down the maximum number of NooBaa endpoints

    Args:
        nb_eps (int): The required number of NooBaa endpoints

    """
    log.info(f"Scaling up Noobaa endpoints to a maximum of {nb_eps}")
    params = f'{{"spec":{{"endpoints":{{"maxCount":{nb_eps},"minCount":1}}}}}}'
    noobaa = OCP(kind='noobaa', namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    noobaa.patch(resource_name='noobaa', params=params, format_type='merge')
def change_noobaa_endpoints_count(min_nb_eps=None, max_nb_eps=None):
    """
    Scale up or down the minimum and/or maximum number of NooBaa endpoints

    Args:
        min_nb_eps (int): The required minimum number of NooBaa endpoints
        max_nb_eps (int): The required maximum number of NooBaa endpoints

    """
    noobaa = OCP(kind='noobaa', namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    if min_nb_eps:
        log.info(f"Changing minimum Noobaa endpoints to {min_nb_eps}")
        params = f'{{"spec":{{"endpoints":{{"minCount":{min_nb_eps}}}}}}}'
        noobaa.patch(resource_name='noobaa', params=params, format_type='merge')
    if max_nb_eps:
        log.info(f"Changing maximum Noobaa endpoints to {max_nb_eps}")
        params = f'{{"spec":{{"endpoints":{{"maxCount":{max_nb_eps}}}}}}}'
        noobaa.patch(resource_name='noobaa', params=params, format_type='merge')
def test_pv_scale_out(self, backingstore_factory):
    """
    Test to check the scale out functionality of pv pool backing store.

    """
    pv_backingstore = backingstore_factory(
        "OC",
        {"pv": [(1, MIN_PV_BACKINGSTORE_SIZE_IN_GB, "ocs-storagecluster-ceph-rbd")]},
    )[0]

    logger.info(f"Scaling out PV Pool {pv_backingstore.name}")
    pv_backingstore.vol_num += 1
    edit_pv_backingstore = OCP(
        kind="BackingStore", namespace=config.ENV_DATA["cluster_namespace"]
    )
    params = f'{{"spec":{{"pvPool":{{"numVolumes":{pv_backingstore.vol_num}}}}}}}'
    edit_pv_backingstore.patch(
        resource_name=pv_backingstore.name, params=params, format_type="merge"
    )

    logger.info("Checking if backingstore went to SCALING state")
    sample = TimeoutSampler(
        timeout=60,
        sleep=5,
        func=check_pv_backingstore_status,
        backingstore_name=pv_backingstore.name,
        namespace=config.ENV_DATA["cluster_namespace"],
        desired_status="`SCALING`",
    )
    assert sample.wait_for_func_status(
        result=True
    ), f"Backing Store {pv_backingstore.name} never reached SCALING state"

    logger.info("Waiting for backingstore to return to OPTIMAL state")
    wait_for_pv_backingstore(
        pv_backingstore.name, config.ENV_DATA["cluster_namespace"]
    )

    logger.info("Check if PV Pool scale out was successful")
    backingstore_dict = edit_pv_backingstore.get(pv_backingstore.name)
    assert (
        backingstore_dict["spec"]["pvPool"]["numVolumes"] == pv_backingstore.vol_num
    ), "Scale out PV Pool failed."
    logger.info("Scale out was successful")
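

# Hedged sketch (not from the source): the merge patch sent by the scale-out tests above,
# pulled out into a standalone helper so the pv-pool resize can be reused outside a test.
# The helper name itself is an assumption; the patch body mirrors the tests exactly.
def scale_pv_pool_backingstore(backingstore_name, num_volumes, namespace):
    """Patch a pv-pool BackingStore to the requested numVolumes via a merge patch."""
    bs_ocp = OCP(kind="BackingStore", namespace=namespace)
    params = f'{{"spec":{{"pvPool":{{"numVolumes":{num_volumes}}}}}}}'
    return bs_ocp.patch(
        resource_name=backingstore_name, params=params, format_type="merge"
    )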
def change_noobaa_endpoints_count(min_nb_eps=None, max_nb_eps=None):
    """
    Scale up or down the minimum and/or maximum number of NooBaa endpoints

    Args:
        min_nb_eps (int): The required minimum number of NooBaa endpoints
        max_nb_eps (int): The required maximum number of NooBaa endpoints

    """
    if float(config.ENV_DATA['ocs_version']) < 4.6:
        noobaa = OCP(kind='noobaa', namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        if min_nb_eps:
            log.info(f"Changing minimum Noobaa endpoints to {min_nb_eps}")
            params = f'{{"spec":{{"endpoints":{{"minCount":{min_nb_eps}}}}}}}'
            noobaa.patch(resource_name='noobaa', params=params, format_type='merge')
        if max_nb_eps:
            log.info(f"Changing maximum Noobaa endpoints to {max_nb_eps}")
            params = f'{{"spec":{{"endpoints":{{"maxCount":{max_nb_eps}}}}}}}'
            noobaa.patch(resource_name='noobaa', params=params, format_type='merge')
    else:
        sc = get_storage_cluster()
        if min_nb_eps:
            log.info(f"Changing minimum Noobaa endpoints to {min_nb_eps}")
            params = f'{{"spec":{{"multiCloudGateway":{{"endpoints":{{"minCount":{min_nb_eps}}}}}}}}}'
            sc.patch(
                resource_name=sc.get()['items'][0]['metadata']['name'],
                params=params,
                format_type='merge',
            )
        if max_nb_eps:
            log.info(f"Changing maximum Noobaa endpoints to {max_nb_eps}")
            params = f'{{"spec":{{"multiCloudGateway":{{"endpoints":{{"maxCount":{max_nb_eps}}}}}}}}}'
            sc.patch(
                resource_name=sc.get()['items'][0]['metadata']['name'],
                params=params,
                format_type='merge',
            )
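

# Hedged usage sketch (not from the source): widening the NooBaa endpoint autoscaler range
# with the helper above. On OCS 4.6 and later the patch lands on the StorageCluster's
# multiCloudGateway section; on older versions it goes straight to the noobaa CR.
# The min/max values below are placeholders.
def example_scale_noobaa_endpoints():
    # Allow NooBaa to autoscale between 2 and 4 endpoint pods
    change_noobaa_endpoints_count(min_nb_eps=2, max_nb_eps=4)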
class MonitorRecovery(object):
    """
    Monitor recovery class

    """

    def __init__(self):
        """
        Initializer

        """
        self.backup_dir = tempfile.mkdtemp(prefix="mon-backup-")
        self.keyring_dir = tempfile.mkdtemp(dir=self.backup_dir, prefix="keyring-")
        self.keyring_files = []
        self.dep_ocp = OCP(
            kind=constants.DEPLOYMENT, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        self.ocp_obj = ocp.OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)

    def scale_rook_ocs_operators(self, replica=1):
        """
        Scales the rook and ocs operators based on the replica count

        Args:
            replica (int): replica count

        """
        logger.info(f"Scaling rook-ceph operator to replica: {replica}")
        self.dep_ocp.exec_oc_cmd(
            f"scale deployment {constants.ROOK_CEPH_OPERATOR} --replicas={replica}"
        )
        logger.info(f"Scaling ocs-operator to replica: {replica}")
        self.dep_ocp.exec_oc_cmd(
            f"scale deployment {defaults.OCS_OPERATOR_NAME} --replicas={replica}"
        )
        if replica == 1:
            logger.info("Sleeping for 150 secs for cluster to stabilize")
            time.sleep(150)

    def patch_sleep_on_osds(self):
        """
        Patch the OSD deployments to remove the `livenessProbe` parameter
        and run the sleep command instead of the OSD process.

        """
        osd_dep = get_deployments_having_label(
            label=constants.OSD_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        osd_deployments = [OCS(**osd) for osd in osd_dep]
        for osd in osd_deployments:
            logger.info(
                f"Patching OSD: {osd.name} to remove livenessProbe and run sleep infinity"
            )
            params = '[{"op":"remove", "path":"/spec/template/spec/containers/0/livenessProbe"}]'
            self.dep_ocp.patch(
                resource_name=osd.name,
                params=params,
                format_type="json",
            )
            params = (
                '{"spec": {"template": {"spec": {"containers": [{"name": "osd", "command":'
                ' ["sleep", "infinity"], "args": []}]}}}}'
            )
            self.dep_ocp.patch(
                resource_name=osd.name,
                params=params,
            )
        logger.info(
            "Sleeping for 15 seconds and waiting for OSDs to reach running state"
        )
        time.sleep(15)
        for osd in get_osd_pods():
            wait_for_resource_state(resource=osd, state=constants.STATUS_RUNNING)

    def prepare_monstore_script(self):
        """
        Prepares the script to retrieve the `monstore` cluster map from the OSDs

        """
        recover_mon = """
#!/bin/bash
ms=/tmp/monstore

rm -rf $ms
mkdir $ms

for osd_pod in $(oc get po -l app=rook-ceph-osd -oname -n openshift-storage); do

    echo "Starting with pod: $osd_pod"

    podname=$(echo $osd_pod| cut -c5-)
    oc exec $osd_pod -- rm -rf $ms
    oc cp $ms $podname:$ms

    rm -rf $ms
    mkdir $ms

    dp=/var/lib/ceph/osd/ceph-$(oc get $osd_pod -ojsonpath='{ .metadata.labels.ceph_daemon_id }')
    op=update-mon-db
    ot=ceph-objectstore-tool

    echo "pod in loop: $osd_pod ; done deleting local dirs"

    oc exec $osd_pod -- $ot --type bluestore --data-path $dp --op $op --no-mon-config --mon-store-path $ms
    echo "Done with COT on pod: $osd_pod"

    oc cp $podname:$ms $ms

    echo "Finished pulling COT data from pod: $osd_pod"
done
"""
        with open(f"{self.backup_dir}/recover_mon.sh", "w") as file:
            file.write(recover_mon)
        exec_cmd(cmd=f"chmod +x {self.backup_dir}/recover_mon.sh")

    @retry(CommandFailed, tries=15, delay=5, backoff=1)
    def run_mon_store(self):
        """
        Runs the script that retrieves the mon store from the OSDs

        Raises:
            CommandFailed: If the mon store could not be retrieved from the OSDs

        """
        logger.info("Running mon-store script..")
        result = exec_cmd(cmd=f"sh {self.backup_dir}/recover_mon.sh")
        result.stdout = result.stdout.decode()
        logger.info(f"OSD mon store retrieval stdout {result.stdout}")
        result.stderr = result.stderr.decode()
        logger.info(f"OSD mon store retrieval stderr {result.stderr}")
        search_pattern = re.search(
            pattern="error|unable to open mon store", string=result.stderr
        )
        if search_pattern:
            logger.info(f"Error found: {search_pattern}")
            raise CommandFailed
        logger.info("Successfully collected mon store from OSDs")

    def patch_sleep_on_mon(self):
        """
        Patches the monitor deployments to run sleep infinity

        """
        mon_dep = get_deployments_having_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_deployments = [OCS(**mon) for mon in mon_dep]
        for mon in mon_deployments:
            params = (
                '{"spec": {"template": {"spec": {"containers":'
                ' [{"name": "mon", "command": ["sleep", "infinity"], "args": []}]}}}}'
            )
            logger.info(f"Patching monitor: {mon.name} to sleep infinitely")
            self.dep_ocp.patch(
                resource_name=mon.name,
                params=params,
            )

    def monitor_rebuild(self, mon_map_cmd):
        """
        Rebuilds the monitor

        Args:
            mon_map_cmd (str): mon-store tool command

        """
        logger.info("Re-spinning the mon pods")
        for mon in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
            mon.delete()
        mon_pods = get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        for mon in mon_pods:
            wait_for_resource_state(resource=mon, state=constants.STATUS_RUNNING)
        mon_a = mon_pods[0]
        logger.info(f"Working on monitor: {mon_a.name}")

        logger.info(f"Copying mon-store into monitor: {mon_a.name}")
        self._exec_oc_cmd(f"cp /tmp/monstore {mon_a.name}:/tmp/")

        logger.info("Changing ownership of monstore to ceph")
        _exec_cmd_on_pod(cmd="chown -R ceph:ceph /tmp/monstore", pod_obj=mon_a)
        self.copy_and_import_keys(mon_obj=mon_a)
        logger.info("Creating monitor map")
        _exec_cmd_on_pod(cmd=mon_map_cmd, pod_obj=mon_a)

        rebuild_mon_cmd = "ceph-monstore-tool /tmp/monstore rebuild -- --keyring /tmp/keyring --monmap /tmp/monmap"
        logger.info("Running command to rebuild monitor")
        mon_a.exec_cmd_on_pod(command=rebuild_mon_cmd, out_yaml_format=False)

        logger.info(f"Copying store.db directory from monitor: {mon_a.name}")
        self._exec_oc_cmd(
            f"cp {mon_a.name}:/tmp/monstore/store.db {self.backup_dir}/store.db"
        )

        logger.info("Copying store.db to rest of the monitors")
        for mon in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
            cmd = (
                f"cp {self.backup_dir}/store.db {mon.name}:/var/lib/ceph/mon/ceph-"
                f"{mon.get().get('metadata').get('labels').get('ceph_daemon_id')}/ "
            )
            logger.info(f"Copying store.db to monitor: {mon.name}")
            self._exec_oc_cmd(cmd)
            logger.info("Changing ownership of store.db to ceph:ceph")
            _exec_cmd_on_pod(
                cmd=f"chown -R ceph:ceph /var/lib/ceph/mon/ceph-"
                f"{mon.get().get('metadata').get('labels').get('ceph_daemon_id')}/store.db",
                pod_obj=mon,
            )

    def copy_and_import_keys(self, mon_obj):
        """
        Copies the keyring files to the monitor and imports them with ceph-authtool

        Args:
            mon_obj (obj): Monitor object

        """
        logger.info(f"Copying keyring files to monitor: {mon_obj.name}")
        for k_file in self.keyring_files:
            cmd = f"cp {k_file} {mon_obj.name}:/tmp/"
            logger.info(f"Copying keyring: {k_file} into mon {mon_obj.name}")
            self._exec_oc_cmd(cmd)

        logger.info(f"Importing ceph keyrings to a temporary file on: {mon_obj.name}")
        _exec_cmd_on_pod(
            cmd="cp /etc/ceph/keyring-store/keyring /tmp/keyring", pod_obj=mon_obj
        )
        for k_file in self.keyring_files:
            k_file = k_file.split("/")
            logger.info(f"Importing keyring {k_file[-1]}")
            _exec_cmd_on_pod(
                cmd=f"ceph-authtool /tmp/keyring --import-keyring /tmp/{k_file[-1]}",
                pod_obj=mon_obj,
            )

    def revert_patches(self, deployment_paths):
        """
        Reverts the patches done on the monitors, osds and mgr by replacing their deployments

        Args:
            deployment_paths (list): List of paths to deployment yamls

        """
        logger.info("Reverting patches on monitors, mgr and osd")
        for dep in deployment_paths:
            logger.info(f"Reverting {dep}")
            revert_patch = f"replace --force -f {dep}"
            self.ocp_obj.exec_oc_cmd(revert_patch)

    def backup_deployments(self):
        """
        Creates a backup of all deployments in the `openshift-storage` namespace

        """
        deployment_names = []
        deployments = self.dep_ocp.get("-o name", out_yaml_format=False)
        deployments_full_name = str(deployments).split()

        for name in deployments_full_name:
            deployment_names.append(name.lstrip("deployment.apps").lstrip("/"))

        for deployment in deployment_names:
            deployment_get = self.dep_ocp.get(resource_name=deployment)
            deployment_yaml = join(self.backup_dir, deployment + ".yaml")
            templating.dump_data_to_temp_yaml(deployment_get, deployment_yaml)

    def deployments_to_revert(self):
        """
        Gets the mon, osd and mgr deployments to revert

        Returns:
            tuple: deployment paths to be reverted

        """
        to_revert_patches = (
            get_deployments_having_label(
                label=constants.OSD_APP_LABEL,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            + get_deployments_having_label(
                label=constants.MON_APP_LABEL,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            + get_deployments_having_label(
                label=constants.MGR_APP_LABEL,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
        )
        to_revert_mds = get_deployments_having_label(
            label=constants.MDS_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        to_revert_patches_path = []
        to_revert_mds_path = []
        for dep in to_revert_patches:
            to_revert_patches_path.append(
                join(self.backup_dir, dep["metadata"]["name"] + ".yaml")
            )
        for dep in to_revert_mds:
            logger.info(dep)
            to_revert_mds_path.append(
                join(self.backup_dir, dep["metadata"]["name"] + ".yaml")
            )
        return to_revert_patches_path, to_revert_mds_path

    def get_ceph_keyrings(self):
        """
        Gets all the ceph and csi related keyrings from the OCS secrets

        """
        mon_k = get_ceph_caps(["rook-ceph-mons-keyring"])
        if config.ENV_DATA["platform"].lower() in constants.ON_PREM_PLATFORMS:
            rgw_k = get_ceph_caps(
                ["rook-ceph-rgw-ocs-storagecluster-cephobjectstore-a-keyring"]
            )
        else:
            rgw_k = None
        mgr_k = get_ceph_caps(["rook-ceph-mgr-a-keyring"])
        mds_k = get_ceph_caps(
            [
                "rook-ceph-mds-ocs-storagecluster-cephfilesystem-a-keyring",
                "rook-ceph-mds-ocs-storagecluster-cephfilesystem-b-keyring",
            ]
        )
        crash_k = get_ceph_caps(["rook-ceph-crash-collector-keyring"])
        fs_node_k = get_ceph_caps([constants.CEPHFS_NODE_SECRET])
        rbd_node_k = get_ceph_caps([constants.RBD_NODE_SECRET])
        fs_provisinor_k = get_ceph_caps([constants.CEPHFS_PROVISIONER_SECRET])
        rbd_provisinor_k = get_ceph_caps([constants.RBD_PROVISIONER_SECRET])

        keyring_caps = {
            "mons": mon_k,
            "rgws": rgw_k,
            "mgrs": mgr_k,
            "mdss": mds_k,
            "crash": crash_k,
            "fs_node": fs_node_k,
            "rbd_node": rbd_node_k,
            "fs_provisinor": fs_provisinor_k,
            "rbd_provisinor": rbd_provisinor_k,
        }

        for secret, caps in keyring_caps.items():
            if caps:
                with open(f"{self.keyring_dir}/{secret}.keyring", "w") as fd:
                    fd.write(caps)
                self.keyring_files.append(f"{self.keyring_dir}/{secret}.keyring")

    def patch_sleep_on_mds(self):
        """
        Patch the MDS deployments to remove the `livenessProbe` parameter
        and run the sleep command instead of the MDS process.

        """
        mds_dep = get_deployments_having_label(
            label=constants.MDS_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mds_deployments = [OCS(**mds) for mds in mds_dep]
        for mds in mds_deployments:
            logger.info(
                f"Patching MDS: {mds.name} to remove livenessProbe and set sleep infinity"
            )
            params = '[{"op":"remove", "path":"/spec/template/spec/containers/0/livenessProbe"}]'
            self.dep_ocp.patch(
                resource_name=mds.name,
                params=params,
                format_type="json",
            )
            params = (
                '{"spec": {"template": {"spec": {"containers": '
                '[{"name": "mds", "command": ["sleep", "infinity"], "args": []}]}}}}'
            )
            self.dep_ocp.patch(
                resource_name=mds.name,
                params=params,
            )
        logger.info("Sleeping for 60s and waiting for MDS pods to reach running state")
        time.sleep(60)
        for mds in get_mds_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
            wait_for_resource_state(resource=mds, state=constants.STATUS_RUNNING)

    @retry(CommandFailed, tries=10, delay=10, backoff=1)
    def _exec_oc_cmd(self, cmd):
        """
        Executes an oc command with retries

        Args:
            cmd (str): Command

        """
        self.ocp_obj.exec_oc_cmd(cmd)
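

# Hedged orchestration sketch (assumption, not from the source): one plausible order in
# which the MonitorRecovery methods above could be driven, following the usual mon-store
# rebuild flow from the class docstrings. mon_map_cmd is a placeholder for the monmaptool
# command, which depends on the cluster's mon IDs and IPs.
def example_monitor_recovery_flow(mon_map_cmd):
    recovery = MonitorRecovery()
    # Stop the operators so they do not undo the patches, and back up the deployments
    recovery.scale_rook_ocs_operators(replica=0)
    recovery.backup_deployments()
    to_revert, to_revert_mds = recovery.deployments_to_revert()
    # Put the OSDs to sleep and pull the mon store out of them
    recovery.patch_sleep_on_osds()
    recovery.prepare_monstore_script()
    recovery.run_mon_store()
    # Put the mons to sleep, collect keyrings and rebuild the mon store
    recovery.patch_sleep_on_mon()
    recovery.get_ceph_keyrings()
    recovery.monitor_rebuild(mon_map_cmd)
    # Restore the original deployments and bring the operators back
    recovery.revert_patches(to_revert)
    recovery.revert_patches(to_revert_mds)
    recovery.scale_rook_ocs_operators(replica=1)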
def test_multiple_mon_pod_stays_on_same_node(self):
    """
    A test case to verify that multiple mon pods stay on the same node

    1. Edit the rook-ceph-mon-endpoints configmap, e.g. assign mon-a to another node
       so that it lands on the same node as another mon (compute-1 instead of compute-0)
    2. Delete the mon-a deployment
    3. Edit the mon-b deployment to remove the required mon anti-affinity
    4. Restart the operator
    5. Edit the mon-a deployment to remove the required mon anti-affinity
    6. See mon-a start on compute-1 with mon-b
    7. Soon after, see the operator fail over one of these mons onto the node that
       doesn't currently have a mon (compute-0) and start mon-d

    """
    ocs_version = config.ENV_DATA["ocs_version"]
    # Check that we have an LSO cluster and the OCS version is 4.8 or below.
    # This is a workaround due to issue https://github.com/red-hat-storage/ocs-ci/issues/4937
    if not (is_lso_cluster() and Version.coerce(ocs_version) <= Version.coerce("4.8")):
        pytest.skip(
            "Skip the test because mons are not node assignment from Rook, if cluster is not "
            "LSO based. And also currently, we want to run the test only with OCS 4.8 and "
            "below. This is a workaround due to issue "
            "https://github.com/red-hat-storage/ocs-ci/issues/4937"
        )
    # Initialize
    rook_ceph_mon = "rook-ceph-mon"

    # Get the running mon pods
    mon_pods = get_mon_pods()
    mon_name_to_del = mon_pods[0].get().get("metadata").get("labels").get("mon")
    mon_name_to_edit = mon_pods[1].get().get("metadata").get("labels").get("mon")
    mon_node = get_pod_node(mon_pods[1])

    # Edit the rook-ceph-mon-endpoints configmap
    log.info(f"Edit the configmap {ROOK_CEPH_MON_ENDPOINTS}")
    configmap_obj = OCP(kind=CONFIGMAP, namespace=OPENSHIFT_STORAGE_NAMESPACE)
    rook_ceph_mon_configmap = configmap_obj.get(resource_name=ROOK_CEPH_MON_ENDPOINTS)
    json_val = json.loads(rook_ceph_mon_configmap["data"]["mapping"])
    json_val["node"][mon_name_to_del].update(json_val["node"][mon_name_to_edit])
    rook_ceph_mon_configmap["data"]["mapping"] = json.dumps(json_val)
    new_data = rook_ceph_mon_configmap["data"]
    params = f'{{"data": {json.dumps(new_data)}}}'
    configmap_obj.patch(
        resource_name=ROOK_CEPH_MON_ENDPOINTS,
        params=params,
        format_type="strategic",
    )
    log.info(f"Configmap {ROOK_CEPH_MON_ENDPOINTS} edited successfully")
    log.info(
        f"Rook-ceph-mon-endpoints updated configmap: {rook_ceph_mon_configmap}"
    )

    # Delete the mon deployment whose mapping was edited
    dep_obj = OCP(kind=DEPLOYMENT, namespace=OPENSHIFT_STORAGE_NAMESPACE)
    mon_deployment_name_to_del = f"{rook_ceph_mon}-{mon_name_to_del}"
    log.info(f"Deleting mon {mon_deployment_name_to_del} deployment")
    dep_obj.delete(resource_name=mon_deployment_name_to_del)

    # Edit the other mon deployment to remove the mon anti-affinity
    mon_deployment_name_to_edit = f"{rook_ceph_mon}-{mon_name_to_edit}"
    log.info(
        f"Edit mon {mon_deployment_name_to_edit} deployment "
        "to remove the required mon anti-affinity"
    )
    params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
    dep_obj.patch(
        resource_name=mon_deployment_name_to_edit, params=params, format_type="json"
    )
    log.info(
        f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_edit}"
    )

    # Restart the operator
    operator_pod_obj = get_operator_pods()
    delete_pods(pod_objs=operator_pod_obj)
    POD_OBJ.wait_for_resource(condition=STATUS_RUNNING, selector=OPERATOR_LABEL)

    # Validate that the deleted mon deployment came up and is in Pending state.
    # Initially the mon gets stuck in Pending state until the defined anti-affinity is removed.
    POD_OBJ.wait_for_resource(
        condition=STATUS_PENDING,
        resource_count=1,
        selector=MON_APP_LABEL,
        timeout=1200,
    )

    # Edit the re-created mon deployment to remove the mon anti-affinity
    log.info(
        f"Edit mon {mon_deployment_name_to_del} deployment "
        "to remove the required mon anti-affinity"
    )
    params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
    dep_obj.patch(
        resource_name=mon_deployment_name_to_del, params=params, format_type="json"
    )
    log.info(
        f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_del}"
    )

    # Validate that the mon pod moved to another node such that 2 mons run on the same node
    log.info("Waiting for 5 seconds for mon recovery")
    time.sleep(5)
    new_mon_pods = get_mon_pods()
    new_node = [
        get_pod_node(mon)
        for mon in new_mon_pods
        if mon.get().get("metadata").get("labels").get("mon") == mon_name_to_del
    ]
    assert new_node[0].name == mon_node.name, (
        f"Mon {mon_name_to_del} did not move to node {mon_node.name}, "
        "so 2 mons are not running on the same node"
    )

    # Verify rook deletes one of the mons and moves it to another node
    timeout = 60
    log.info(f"Waiting for {timeout} seconds for mon recovery")
    time.sleep(timeout)

    POD_OBJ.wait_for_resource(
        condition=STATUS_RUNNING,
        resource_count=len(mon_pods),
        selector=MON_APP_LABEL,
        timeout=3600,
        sleep=5,
    )
    log.info(
        "Mons are up and running; validate that they are running on different nodes"
    )
    mon_pods_running_on_same_node()
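

# Hedged helper sketch (not from the source): the JSON patch used twice in the test above,
# isolated so the anti-affinity removal can be applied to any mon deployment. The helper
# name is an assumption; the patch body mirrors the test exactly.
def remove_mon_anti_affinity(dep_obj, mon_deployment_name):
    """Drop the pod affinity/anti-affinity block from a mon deployment."""
    params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
    return dep_obj.patch(
        resource_name=mon_deployment_name, params=params, format_type="json"
    )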
def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
    """
    1. Delete one mon service
    2. Edit the rook-ceph-mon-endpoints configmap and remove the deleted mon service's entries
    3. Delete the deployment and PVC of the deleted mon service
    4. Restart rook-ceph-operator
    5. Make sure all mon pods are running
    6. Make sure ceph health is OK and the storage pods are running
    7. Sleep for 300 seconds before deleting another mon
    8. Repeat the above steps for all mons; at the end, each mon should have a different endpoint
    9. Create a PVC, which should succeed.

    """
    pod_obj = pod_factory(interface=interface)
    run_io_in_bg(pod_obj)

    # Get all mon services
    mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )

    # Get all mon pods
    mon_pods = get_mon_pods()
    mon_count = len(mon_pods)

    list_old_svc = []
    for svc in mon_svc:

        # Get the rook-ceph-operator pod obj
        operator_pod_obj = get_operator_pods()
        operator_name = operator_pod_obj[0].name

        # Scale down rook-ceph-operator
        log.info("Scale down rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=0
        ), "Failed to scale down rook-ceph-operator to 0"
        log.info("Successfully scaled down rook-ceph-operator to 0")

        # Validate the rook-ceph-operator pod is not running
        POD_OBJ.wait_for_delete(resource_name=operator_name)

        svc_name = svc["metadata"]["name"]
        cluster_ip = svc["spec"]["clusterIP"]
        port = svc["spec"]["ports"][0]["port"]
        mon_endpoint = f"{cluster_ip}:{port}"
        mon_id = svc["spec"]["selector"]["mon"]
        list_old_svc.append(cluster_ip)

        # Delete the mon deployment
        log.info("Delete mon deployments")
        del_obj = OCP(
            kind=constants.DEPLOYMENT,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_info = del_obj.get(resource_name=svc_name)
        del_obj.delete(resource_name=svc_name)

        # Delete the mon PVC (or the mon data directory on LSO clusters)
        if is_lso_cluster():
            mon_data_path = f"/var/lib/rook/mon-{mon_id}"
            mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                "kubernetes.io/hostname"
            ]
            log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
            cmd = f"rm -rf {mon_data_path}"
            ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
            ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
        else:
            log.info("Delete mon PVC")
            pvc_name = svc["metadata"]["labels"]["pvc_name"]
            pvc_obj = OCP(
                kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            pvc_obj.delete(resource_name=pvc_name)

        # Delete the mon service
        log.info("Delete mon service")
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        svc_obj.delete(resource_name=svc_name)

        # Edit the configmap
        log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
        configmap_obj = OCP(
            kind=constants.CONFIGMAP,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        output_get = configmap_obj.get(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
        )
        new_data = output_get["data"]
        new_data["csi-cluster-config-json"] = (
            new_data["csi-cluster-config-json"].replace(f'"{mon_endpoint}",', "")
            if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != -1
            else new_data["csi-cluster-config-json"].replace(
                f',"{mon_endpoint}"', ""
            )
        )
        new_data["data"] = ",".join(
            [
                value
                for value in new_data["data"].split(",")
                if f"{mon_id}=" not in value
            ]
        )
        new_data["mapping"] = (
            new_data["mapping"].replace(f'"{mon_id}":null,', "")
            if new_data["mapping"].find(f'"{mon_id}":null,') != -1
            else new_data["mapping"].replace(f',"{mon_id}":null', "")
        )
        params = f'{{"data": {json.dumps(new_data)}}}'
        log.info(f"Removing {mon_id} entries from configmap")
        configmap_obj.patch(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
            params=params,
            format_type="strategic",
        )
        log.info(
            f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
        )

        # Scale up rook-ceph-operator
        log.info("Scale up rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=1
        ), "Failed to scale up rook-ceph-operator to 1"
        log.info("Successfully scaled up rook-ceph-operator to 1")
        log.info("Validate rook-ceph-operator pod is running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.OPERATOR_LABEL,
            resource_count=1,
            timeout=600,
            sleep=5,
        )

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=mon_count,
            timeout=1200,
            sleep=5,
        )
        log.info("All mons are up and running")

        # Check that the ceph health is OK
        ceph_health_check(tries=90, delay=15)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Sleep for some seconds before deleting another mon
        sleep_time = 300
        log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
        time.sleep(sleep_time)

    # Check that the endpoints are different
    log.info("Validate the mon endpoints are changed")
    new_mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    list_new_svc = []
    for new_svc in new_mon_svc:
        cluster_ip = new_svc["spec"]["clusterIP"]
        list_new_svc.append(cluster_ip)
    diff = set(list_new_svc) ^ set(list_old_svc)
    assert len(diff) == len(list_old_svc + list_new_svc), (
        f"Not all endpoints are changed. Set of old "
        f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
    )
    log.info(f"All new mon endpoints are created {list_new_svc}")

    # Create a PVC and pod
    log.info(f"Create {interface} PVC")
    pod_obj = pod_factory(interface=interface)
    pod_obj.run_io(storage_type="fs", size="500M")
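

# Hedged helper sketch (assumption, not from the source): the strategic-merge patch applied
# to rook-ceph-mon-endpoints in the test above, isolated so the configmap update reads as a
# single step. The helper name is an assumption; the patch mirrors the test exactly.
def patch_mon_endpoints_configmap(configmap_obj, new_data):
    """Replace the data section of the rook-ceph-mon-endpoints configmap."""
    params = f'{{"data": {json.dumps(new_data)}}}'
    return configmap_obj.patch(
        resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
        params=params,
        format_type="strategic",
    )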
def test_noobaa_rebuild(self, bucket_factory):
    """
    Test case to verify noobaa rebuild. Verifies KCS: https://access.redhat.com/solutions/5948631

    1. Stop the noobaa-operator by setting the replicas of the noobaa-operator deployment to 0.
    2. Delete the noobaa deployments/statefulsets.
    3. Delete the PVC db-noobaa-db-0.
    4. Patch the existing backingstores and bucketclasses to remove the finalizer.
    5. Delete the backingstores/bucketclasses.
    6. Delete the noobaa secrets.
    7. Restart the noobaa-operator by setting the replicas back to 1.
    8. Monitor the pods in openshift-storage for the noobaa pods to be Running.

    """
    dep_ocp = OCP(
        kind=constants.DEPLOYMENT, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    state_ocp = OCP(
        kind=constants.STATEFULSET, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    noobaa_pvc_obj = get_pvc_objs(pvc_names=["db-noobaa-db-pg-0"])

    # Scale down the noobaa operator
    logger.info(
        f"Scaling down {constants.NOOBAA_OPERATOR_DEPLOYMENT} deployment to replica: 0"
    )
    dep_ocp.exec_oc_cmd(
        f"scale deployment {constants.NOOBAA_OPERATOR_DEPLOYMENT} --replicas=0"
    )

    # Delete the noobaa deployments and statefulsets
    logger.info("Deleting noobaa deployments and statefulsets")
    dep_ocp.delete(resource_name=constants.NOOBAA_ENDPOINT_DEPLOYMENT)
    state_ocp.delete(resource_name=constants.NOOBAA_DB_STATEFULSET)
    state_ocp.delete(resource_name=constants.NOOBAA_CORE_STATEFULSET)

    # Delete the noobaa-db pvc
    pvc_obj = OCP(
        kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    logger.info("Deleting noobaa-db pvc")
    pvc_obj.delete(resource_name=noobaa_pvc_obj[0].name, wait=True)
    pvc_obj.wait_for_delete(resource_name=noobaa_pvc_obj[0].name, timeout=300)

    # Patch and delete the existing backingstores
    params = '{"metadata": {"finalizers":null}}'
    bs_obj = OCP(
        kind=constants.BACKINGSTORE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    for bs in bs_obj.get()["items"]:
        assert bs_obj.patch(
            resource_name=bs["metadata"]["name"],
            params=params,
            format_type="merge",
        ), "Failed to change the parameter in backingstore"
        logger.info(f"Deleting backingstore: {bs['metadata']['name']}")
        bs_obj.delete(resource_name=bs["metadata"]["name"])

    # Patch and delete the existing bucketclasses
    bc_obj = OCP(
        kind=constants.BUCKETCLASS, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    for bc in bc_obj.get()["items"]:
        assert bc_obj.patch(
            resource_name=bc["metadata"]["name"],
            params=params,
            format_type="merge",
        ), "Failed to change the parameter in bucketclass"
        logger.info(f"Deleting bucketclass: {bc['metadata']['name']}")
        bc_obj.delete(resource_name=bc["metadata"]["name"])

    # Delete the noobaa secrets
    logger.info("Deleting noobaa related secrets")
    dep_ocp.exec_oc_cmd(
        "delete secrets noobaa-admin noobaa-endpoints noobaa-operator noobaa-server"
    )

    # Scale the noobaa-operator deployment back up
    logger.info(
        f"Scaling back {constants.NOOBAA_OPERATOR_DEPLOYMENT} deployment to replica: 1"
    )
    dep_ocp.exec_oc_cmd(
        f"scale deployment {constants.NOOBAA_OPERATOR_DEPLOYMENT} --replicas=1"
    )

    # Wait and validate that the noobaa PVC is in Bound state
    pvc_obj.wait_for_resource(
        condition=constants.STATUS_BOUND,
        resource_name=noobaa_pvc_obj[0].name,
        timeout=600,
        sleep=120,
    )

    # Validate the noobaa pods are up and running
    pod_obj = OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    noobaa_pods = get_noobaa_pods()
    pod_obj.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_count=len(noobaa_pods),
        selector=constants.NOOBAA_APP_LABEL,
        timeout=900,
    )

    # Verify everything is running fine
    logger.info("Verifying that all resources are Running and match the expected result")
    self.sanity_helpers.health_check(tries=120)

    # Verify the default backingstore/bucketclass
    default_bs = OCP(
        kind=constants.BACKINGSTORE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    ).get(resource_name=DEFAULT_NOOBAA_BACKINGSTORE)
    default_bc = OCP(
        kind=constants.BUCKETCLASS, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    ).get(resource_name=DEFAULT_NOOBAA_BUCKETCLASS)
    assert (
        default_bs["status"]["phase"]
        == default_bc["status"]["phase"]
        == constants.STATUS_READY
    ), "Failed: Default bs/bc are not in ready state"

    # Create OBCs
    logger.info("Creating OBCs after noobaa rebuild")
    bucket_factory(amount=3, interface="OC", verify_health=True)
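

# Hedged helper sketch (not from the source): the finalizer-stripping merge patch used in
# the rebuild test above, generalized to any backingstore or bucketclass OCP object. The
# helper name is an assumption; the patch body mirrors the test exactly.
def strip_finalizers(ocp_obj, resource_name):
    """Clear metadata.finalizers so the resource can be deleted during the rebuild."""
    params = '{"metadata": {"finalizers":null}}'
    return ocp_obj.patch(
        resource_name=resource_name, params=params, format_type="merge"
    )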
def test_noobaa_db_backup_and_recovery(
    self,
    pvc_factory,
    pod_factory,
    snapshot_factory,
    bucket_factory,
    rgw_bucket_factory,
):
    """
    Test case to verify noobaa backup and recovery

    1. Take a snapshot of the db-noobaa-db-0 PVC and restore it to a new PVC
    2. Scale down the noobaa-db statefulset
    3. Get the yaml of the current PVC, db-noobaa-db-0, and change the parameter
       persistentVolumeReclaimPolicy to Retain for the restored PVC
    4. Delete both PVCs; the PV for the original claim db-noobaa-db-0 will be removed.
       The PV for the claim db-noobaa-db-0-snapshot-restore will move to 'Released'
    5. Edit the restored PV again and remove the claimRef section.
       The volume will transition to Available.
    6. Edit the yaml db-noobaa-db-0.yaml and change the setting volumeName to the restored PV.
    7. Scale up the statefulset again and the pod should be running

    """
    # Initialize the variable
    self.noobaa_db_sst_name = "noobaa-db-pg"

    # Get the noobaa pods before execution
    noobaa_pods = get_noobaa_pods()

    # Get the noobaa PVC before execution
    noobaa_pvc_obj = get_pvc_objs(pvc_names=["db-noobaa-db-pg-0"])
    noobaa_pv_name = noobaa_pvc_obj[0].get("spec").get("spec").get("volumeName")

    # Take a snapshot of the db-noobaa-db-0 PVC
    log.info(f"Creating snapshot of the {noobaa_pvc_obj[0].name} PVC")
    snap_obj = snapshot_factory(
        pvc_obj=noobaa_pvc_obj[0],
        wait=True,
        snapshot_name=f"{noobaa_pvc_obj[0].name}-snapshot",
    )
    log.info(f"Successfully created snapshot {snap_obj.name} and it is in Ready state")

    # Restore it to a PVC
    log.info(f"Restoring snapshot {snap_obj.name} to create new PVC")
    sc_name = noobaa_pvc_obj[0].get().get("spec").get("storageClassName")
    pvc_size = (
        noobaa_pvc_obj[0].get().get("spec").get("resources").get("requests").get("storage")
    )
    self.restore_pvc_obj = create_restore_pvc(
        sc_name=sc_name,
        snap_name=snap_obj.name,
        namespace=snap_obj.namespace,
        size=pvc_size,
        pvc_name=f"{snap_obj.name}-restore",
        volume_mode=snap_obj.parent_volume_mode,
        access_mode=snap_obj.parent_access_mode,
    )
    wait_for_resource_state(self.restore_pvc_obj, constants.STATUS_BOUND)
    self.restore_pvc_obj.reload()
    log.info(
        f"Successfully created PVC {self.restore_pvc_obj.name} "
        f"from snapshot {snap_obj.name}"
    )

    # Scale down the noobaa-db statefulset
    assert modify_statefulset_replica_count(
        statefulset_name=self.noobaa_db_sst_name, replica_count=0
    ), f"Failed to scale down the statefulset {self.noobaa_db_sst_name}"

    # Get the noobaa-db PVC
    pvc_obj = OCP(
        kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    noobaa_pvc_yaml = pvc_obj.get(resource_name=noobaa_pvc_obj[0].name)

    # Get the restored noobaa PVC and
    # change the parameter persistentVolumeReclaimPolicy to Retain
    restored_noobaa_pvc_obj = get_pvc_objs(pvc_names=[f"{snap_obj.name}-restore"])
    restored_noobaa_pv_name = (
        restored_noobaa_pvc_obj[0].get("spec").get("spec").get("volumeName")
    )
    pv_obj = OCP(kind=constants.PV, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
    params = '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}'
    assert pv_obj.patch(resource_name=restored_noobaa_pv_name, params=params), (
        "Failed to change the parameter persistentVolumeReclaimPolicy"
        f" to Retain {restored_noobaa_pv_name}"
    )

    # Delete both PVCs
    delete_pvcs(pvc_objs=[noobaa_pvc_obj[0], restored_noobaa_pvc_obj[0]])

    # Validate that the original claim db-noobaa-db-0 was removed
    assert validate_pv_delete(
        pv_name=noobaa_pv_name
    ), f"PV not deleted, still exist {noobaa_pv_name}"

    # Validate that the PV for claim db-noobaa-db-0-snapshot-restore is in Released state
    pv_obj.wait_for_resource(
        condition=constants.STATUS_RELEASED, resource_name=restored_noobaa_pv_name
    )

    # Edit the restored PV again and remove the claimRef section
    log.info(f"Remove the claimRef section from PV {restored_noobaa_pv_name}")
    params = '[{"op": "remove", "path": "/spec/claimRef"}]'
    pv_obj.patch(
        resource_name=restored_noobaa_pv_name, params=params, format_type="json"
    )
    log.info(
        f"Successfully removed claimRef section from PV {restored_noobaa_pv_name}"
    )

    # Validate that the PV is in Available state
    pv_obj.wait_for_resource(
        condition=constants.STATUS_AVAILABLE, resource_name=restored_noobaa_pv_name
    )

    # Edit the yaml db-noobaa-db-0.yaml and change the
    # setting volumeName to the restored PV
    noobaa_pvc_yaml["spec"]["volumeName"] = restored_noobaa_pv_name
    noobaa_pvc_yaml = OCS(**noobaa_pvc_yaml)
    noobaa_pvc_yaml.create()

    # Validate that the noobaa PVC is in Bound state
    pvc_obj.wait_for_resource(
        condition=constants.STATUS_BOUND,
        resource_name=noobaa_pvc_obj[0].name,
        timeout=120,
    )

    # Scale up the statefulset again
    assert modify_statefulset_replica_count(
        statefulset_name=self.noobaa_db_sst_name, replica_count=1
    ), f"Failed to scale up the statefulset {self.noobaa_db_sst_name}"

    # Validate that the noobaa pods are up and running
    pod_obj = OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    pod_obj.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_count=len(noobaa_pods),
        selector=constants.NOOBAA_APP_LABEL,
    )

    # Change the parameter persistentVolumeReclaimPolicy back to Delete
    params = '{"spec":{"persistentVolumeReclaimPolicy":"Delete"}}'
    assert pv_obj.patch(resource_name=restored_noobaa_pv_name, params=params), (
        "Failed to change the parameter persistentVolumeReclaimPolicy"
        f" to Delete {restored_noobaa_pv_name}"
    )
    log.info("Changed the parameter persistentVolumeReclaimPolicy to Delete again")

    # Verify all storage pods are running
    wait_for_storage_pods()

    # Create resources
    log.info("Creating resources using sanity helpers")
    self.sanity_helpers.create_resources(
        pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    )

    # Delete resources
    self.sanity_helpers.delete_resources()

    # Verify everything is running fine
    log.info("Verifying that all resources are Running and match the expected result")
    self.sanity_helpers.health_check(tries=120)
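

# Hedged helper sketch (assumption, not from the source): the two PV patches used by the
# backup/recovery flow above, grouped into one helper. Note that in the actual test the
# PVC deletion and the Released-state wait happen between the Retain patch and the
# claimRef removal; this sketch only bundles the patch calls themselves.
def prepare_restored_pv_for_reuse(pv_obj, pv_name):
    """Retain the restored PV, then detach it from its released claim."""
    # Keep the volume when its claim is deleted
    pv_obj.patch(
        resource_name=pv_name,
        params='{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}',
    )
    # Drop the stale claimRef so the PV transitions from Released to Available
    pv_obj.patch(
        resource_name=pv_name,
        params='[{"op": "remove", "path": "/spec/claimRef"}]',
        format_type="json",
    )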
def factory(snapshot_factory=snapshot_factory):
    # Get the noobaa pods before execution
    noobaa_pods = pod.get_noobaa_pods()

    # Get the noobaa PVC before execution
    noobaa_pvc_obj = pvc.get_pvc_objs(pvc_names=["db-noobaa-db-pg-0"])
    noobaa_pv_name = noobaa_pvc_obj[0].get("spec").get("spec").get("volumeName")

    # Take a snapshot of the db-noobaa-db-0 PVC
    logger.info(f"Creating snapshot of the {noobaa_pvc_obj[0].name} PVC")
    snap_obj = snapshot_factory(
        pvc_obj=noobaa_pvc_obj[0],
        wait=True,
        snapshot_name=f"{noobaa_pvc_obj[0].name}-snapshot",
    )
    logger.info(f"Successfully created snapshot {snap_obj.name} and it is in Ready state")

    # Restore it to a PVC
    logger.info(f"Restoring snapshot {snap_obj.name} to create new PVC")
    sc_name = noobaa_pvc_obj[0].get().get("spec").get("storageClassName")
    pvc_size = (
        noobaa_pvc_obj[0].get().get("spec").get("resources").get("requests").get("storage")
    )
    restore_pvc_obj = pvc.create_restore_pvc(
        sc_name=sc_name,
        snap_name=snap_obj.name,
        namespace=snap_obj.namespace,
        size=pvc_size,
        pvc_name=f"{snap_obj.name}-restore",
        volume_mode=snap_obj.parent_volume_mode,
        access_mode=snap_obj.parent_access_mode,
    )
    restore_pvc_objs.append(restore_pvc_obj)
    wait_for_resource_state(restore_pvc_obj, constants.STATUS_BOUND)
    restore_pvc_obj.reload()
    logger.info(
        f"Successfully created PVC {restore_pvc_obj.name} "
        f"from snapshot {snap_obj.name}"
    )

    # Scale down the noobaa-db statefulset
    assert modify_statefulset_replica_count(
        statefulset_name=constants.NOOBAA_DB_STATEFULSET, replica_count=0
    ), f"Failed to scale down the statefulset {constants.NOOBAA_DB_STATEFULSET}"

    # Get the noobaa-db PVC
    pvc_obj = OCP(
        kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    noobaa_pvc_yaml = pvc_obj.get(resource_name=noobaa_pvc_obj[0].name)

    # Get the restored noobaa PVC and
    # change the parameter persistentVolumeReclaimPolicy to Retain
    restored_noobaa_pvc_obj = pvc.get_pvc_objs(pvc_names=[f"{snap_obj.name}-restore"])
    restored_noobaa_pv_name = (
        restored_noobaa_pvc_obj[0].get("spec").get("spec").get("volumeName")
    )
    pv_obj = OCP(kind=constants.PV, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
    params = '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}'
    assert pv_obj.patch(resource_name=restored_noobaa_pv_name, params=params), (
        "Failed to change the parameter persistentVolumeReclaimPolicy"
        f" to Retain {restored_noobaa_pv_name}"
    )

    # Delete both PVCs
    pvc.delete_pvcs(pvc_objs=[noobaa_pvc_obj[0], restored_noobaa_pvc_obj[0]])

    # Validate that the original claim db-noobaa-db-0 was removed
    assert validate_pv_delete(
        pv_name=noobaa_pv_name
    ), f"PV not deleted, still exist {noobaa_pv_name}"

    # Validate that the PV for claim db-noobaa-db-0-snapshot-restore is in Released state
    pv_obj.wait_for_resource(
        condition=constants.STATUS_RELEASED, resource_name=restored_noobaa_pv_name
    )

    # Edit the restored PV again and remove the claimRef section
    logger.info(f"Remove the claimRef section from PV {restored_noobaa_pv_name}")
    params = '[{"op": "remove", "path": "/spec/claimRef"}]'
    pv_obj.patch(
        resource_name=restored_noobaa_pv_name, params=params, format_type="json"
    )
    logger.info(
        f"Successfully removed claimRef section from PV {restored_noobaa_pv_name}"
    )

    # Validate that the PV is in Available state
    pv_obj.wait_for_resource(
        condition=constants.STATUS_AVAILABLE, resource_name=restored_noobaa_pv_name
    )

    # Edit the yaml db-noobaa-db-0.yaml and change the
    # setting volumeName to the restored PV
    noobaa_pvc_yaml["spec"]["volumeName"] = restored_noobaa_pv_name
    noobaa_pvc_yaml = OCS(**noobaa_pvc_yaml)
    noobaa_pvc_yaml.create()

    # Validate that the noobaa PVC is in Bound state
    pvc_obj.wait_for_resource(
        condition=constants.STATUS_BOUND,
        resource_name=noobaa_pvc_obj[0].name,
        timeout=120,
    )

    # Scale up the statefulset again
    assert modify_statefulset_replica_count(
        statefulset_name=constants.NOOBAA_DB_STATEFULSET, replica_count=1
    ), f"Failed to scale up the statefulset {constants.NOOBAA_DB_STATEFULSET}"

    # Validate that the noobaa pods are up and running
    pod_obj = OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    pod_obj.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_count=len(noobaa_pods),
        selector=constants.NOOBAA_APP_LABEL,
    )

    # Change the parameter persistentVolumeReclaimPolicy back to Delete
    params = '{"spec":{"persistentVolumeReclaimPolicy":"Delete"}}'
    assert pv_obj.patch(resource_name=restored_noobaa_pv_name, params=params), (
        "Failed to change the parameter persistentVolumeReclaimPolicy"
        f" to Delete {restored_noobaa_pv_name}"
    )
    logger.info("Changed the parameter persistentVolumeReclaimPolicy to Delete again")