def corrupt_ceph_monitors():
    """
    Corrupts ceph monitors by deleting store.db file
    """
    for mon_pod in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
        logger.info(f"Corrupting monitor: {mon_pod.name}")
        daemon_id = mon_pod.get().get("metadata").get("labels").get("ceph_daemon_id")
        _exec_cmd_on_pod(
            cmd=f"rm -rf /var/lib/ceph/mon/ceph-{daemon_id}/store.db",
            pod_obj=mon_pod,
        )
        try:
            wait_for_resource_state(resource=mon_pod, state=constants.STATUS_CLBO)
        except ResourceWrongStatusException:
            # The pod may have been recreated before reaching CLBO; if it is
            # still not in CLBO, delete it so the replacement hits the
            # corrupted store and crashes into CLBO.
            current_status = mon_pod.ocp.get_resource(
                resource_name=mon_pod.name, column="STATUS"
            )
            if current_status != constants.STATUS_CLBO:
                logger.info(
                    f"Re-spinning monitor: {mon_pod.name} since it did not reach CLBO state"
                )
                mon_pod.delete()

    logger.info("Validating all the monitors are in CLBO state")
    for mon_pod in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
        wait_for_resource_state(resource=mon_pod, state=constants.STATUS_CLBO)
def remove_global_id_reclaim():
    """
    Removes global id warning by re-spinning client and mon pods
    """
    # Gather every CSI pod: plugin daemonset pods plus both provisioners
    csi_pods = []
    for interface in (constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM):
        csi_pods.extend(get_plugin_pods(interface))
    csi_pods.extend(get_cephfsplugin_provisioner_pods())
    csi_pods.extend(get_rbdfsplugin_provisioner_pods())
    for csi_pod in csi_pods:
        csi_pod.delete()

    # Re-spin MDS pods, then wait for the replacements to run
    for mds_pod in get_mds_pods():
        mds_pod.delete()
    for mds_pod in get_mds_pods():
        wait_for_resource_state(resource=mds_pod, state=constants.STATUS_RUNNING)

    # Re-spin mon pods, then wait for the replacements to run
    for mon in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
        mon.delete()
    for mon in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
        wait_for_resource_state(resource=mon, state=constants.STATUS_RUNNING)
def monitor_rebuild(self, mon_map_cmd):
    """
    Rebuilds the monitor

    Args:
        mon_map_cmd (str): mon-store tool command

    """
    logger.info("Re-spinning the mon pods")
    # Delete every mon pod, then wait until the replacements are Running
    for mon in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
        mon.delete()
    mon_pods = get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
    for mon in mon_pods:
        wait_for_resource_state(resource=mon, state=constants.STATUS_RUNNING)
    # The first mon in the list is used as the working pod for the rebuild
    mon_a = mon_pods[0]
    logger.info(f"Working on monitor: {mon_a.name}")

    # Copy the locally collected mon-store into the working mon and fix ownership
    logger.info(f"Copying mon-store into monitor: {mon_a.name}")
    self._exec_oc_cmd(f"cp /tmp/monstore {mon_a.name}:/tmp/")
    logger.info("Changing ownership of monstore to ceph")
    _exec_cmd_on_pod(cmd="chown -R ceph:ceph /tmp/monstore", pod_obj=mon_a)
    self.copy_and_import_keys(mon_obj=mon_a)
    logger.info("Creating monitor map")
    _exec_cmd_on_pod(cmd=mon_map_cmd, pod_obj=mon_a)

    rebuild_mon_cmd = "ceph-monstore-tool /tmp/monstore rebuild -- --keyring /tmp/keyring --monmap /tmp/monmap"
    logger.info("Running command to rebuild monitor")
    mon_a.exec_cmd_on_pod(command=rebuild_mon_cmd, out_yaml_format=False)

    # Pull the rebuilt store.db to the local backup dir, then push it
    # into every mon's data directory and fix ownership there.
    logger.info(f"Copying store.db directory from monitor: {mon_a.name}")
    self._exec_oc_cmd(
        f"cp {mon_a.name}:/tmp/monstore/store.db {self.backup_dir}/store.db"
    )
    logger.info("Copying store.db to rest of the monitors")
    for mon in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
        # NOTE(review): the destination ends with "/ " (trailing space) --
        # presumably intentional for the oc cp invocation, but confirm.
        cmd = (
            f"cp {self.backup_dir}/store.db {mon.name}:/var/lib/ceph/mon/ceph-"
            f"{mon.get().get('metadata').get('labels').get('ceph_daemon_id')}/ "
        )
        logger.info(f"Copying store.db to monitor: {mon.name}")
        self._exec_oc_cmd(cmd)
        logger.info("Changing ownership of store.db to ceph:ceph")
        _exec_cmd_on_pod(
            cmd=f"chown -R ceph:ceph /var/lib/ceph/mon/ceph-"
            f"{mon.get().get('metadata').get('labels').get('ceph_daemon_id')}/store.db",
            pod_obj=mon,
        )
def set_resource(self, resource):
    """
    Set the target resource for the disruption.

    Resolves the pods backing ``resource``, records the resource type
    bucket ('rook-ceph' for core ceph daemons, 'csi' for CSI plugin
    pods) and caches the pod count.

    Args:
        resource (str): one of 'mgr', 'mon', 'osd', 'mds',
            'cephfsplugin', 'rbdplugin'

    """
    self.resource = resource
    # elif chain: the branches are mutually exclusive, so there is no
    # point re-testing every value once one has matched
    if self.resource == 'mgr':
        self.resource_obj = pod.get_mgr_pods()
        self.type = 'rook-ceph'
    elif self.resource == 'mon':
        self.resource_obj = pod.get_mon_pods()
        self.type = 'rook-ceph'
    elif self.resource == 'osd':
        self.resource_obj = pod.get_osd_pods()
        self.type = 'rook-ceph'
    elif self.resource == 'mds':
        self.resource_obj = pod.get_mds_pods()
        self.type = 'rook-ceph'
    elif self.resource == 'cephfsplugin':
        self.resource_obj = pod.get_plugin_pods(
            interface=constants.CEPHFILESYSTEM
        )
        self.type = 'csi'
    elif self.resource == 'rbdplugin':
        self.resource_obj = pod.get_plugin_pods(
            interface=constants.CEPHBLOCKPOOL
        )
        self.type = 'csi'
    self.resource_count = len(self.resource_obj)
def mon_health_check(self, count):
    """
    Mon health check based on pod count

    Args:
        count (int): Expected number of mon pods

    Raises:
        MonCountException: if mon pod count doesn't match

    """
    # Scale the wait with cluster size: 10 seconds per known pod
    timeout = 10 * len(self.pods)
    logger.info(f"Expected MONs = {count}")
    try:
        assert self.POD.wait_for_resource(
            condition='Running',
            selector=self.mon_selector,
            resource_count=count,
            timeout=timeout,
            sleep=3,
        )
        # TODO: Workaround for BZ1748325:
        # wait_for_resource can over-count; recount only mons whose pod
        # status is actually Running.
        actual_mons = pod.get_mon_pods()
        actual_running_mons = list()
        for mon in actual_mons:
            # NOTE(review): `constant` (singular) -- confirm this alias is
            # imported; other copies of this check spell it `constants`.
            if mon.ocp.get_resource_status(mon.name) == constant.STATUS_RUNNING:
                actual_running_mons.append(mon)
        actual = len(actual_running_mons)
        # TODO: End of workaround for BZ1748325
        assert count == actual, f"Expected {count}, Got {actual}"
    except exceptions.TimeoutExpiredError as e:
        # Only the wait timeout is translated to MonCountException; a
        # count-mismatch AssertionError propagates unchanged to the caller.
        logger.error(e)
        raise exceptions.MonCountException(
            f"Failed to achieve desired Mon count" f" {count}"
        )
def scan_cluster(self):
    """
    Get accurate info on current state of pods
    """
    # Refresh all cached pod lists from the cluster
    self._ceph_pods = pod.get_all_pods(self._namespace)
    self.mons = pod.get_mon_pods(self.mon_selector, self.namespace)
    self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
    self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
    self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
    self.toolbox = pod.get_ceph_tools_pod()
    # set port attrib on mon pods
    self.mons = list(map(self.set_port, self.mons))
    self.cluster.reload()
    if self.cephfs:
        self.cephfs.reload()
    else:
        # No CephFS known yet: try to discover one; an empty 'items'
        # list raises IndexError, which we treat as "no CephFS".
        try:
            self.cephfs_config = self.CEPHFS.get().get('items')[0]
            self.cephfs = ocs.OCS(**self.cephfs_config)
            self.cephfs.reload()
        except IndexError as e:
            logging.warning(e)
            logging.warning("No CephFS found")
    # Cache counts for cheap access by the health checks
    self.mon_count = len(self.mons)
    self.mds_count = len(self.mdss)
    self.mgr_count = len(self.mgrs)
    self.osd_count = len(self.osds)
def mon_health_check(self, count):
    """
    Mon health check based on pod count

    Args:
        count (int): Expected number of mon pods

    Raises:
        MonCountException: if mon pod count doesn't match

    """
    # Scale the wait with cluster size: 10 seconds per known pod
    timeout = 10 * len(self.pods)
    logger.info(f"Expected MONs = {count}")
    try:
        assert self.POD.wait_for_resource(
            condition='Running',
            selector=self.mon_selector,
            resource_count=count,
            timeout=timeout,
            sleep=3,
        )
        actual = len(pod.get_mon_pods())
        assert count == actual, f"Expected {count}, Got {actual}"
    except exceptions.TimeoutExpiredError as e:
        logger.error(e)
        # Chain the timeout so the traceback keeps the root cause.
        # (A count-mismatch AssertionError still propagates unchanged.)
        raise exceptions.MonCountException(
            f"Failed to achieve desired Mon count {count}"
        ) from e
def delete_pods(self):
    """
    Try to delete pods:
        - Rook operator
        - OSD
        - MGR
        - MON
    """
    # Build the deletion list starting with the operator pod, then the
    # ceph daemon pods, and delete them all in one call.
    pods_to_delete = [
        pod.get_ocs_operator_pod(
            ocs_label=constants.OPERATOR_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
    ]
    pods_to_delete += pod.get_osd_pods()
    pods_to_delete += pod.get_mgr_pods()
    pods_to_delete += pod.get_mon_pods()
    logger.info(f"Deleting pods: {[p.name for p in pods_to_delete]}")
    pod.delete_pods(pod_objs=pods_to_delete)
def scan_cluster(self):
    """
    Get accurate info on current state of pods
    """
    self._ceph_pods = pod.get_all_pods(self._namespace)
    # TODO: Workaround for BZ1748325:
    # Only keep mons whose pod status is actually Running. Reset the list
    # first -- the original appended to self.mons without clearing it, so
    # every rescan accumulated duplicate entries and inflated mon_count.
    self.mons = []
    for mon in pod.get_mon_pods(self.mon_selector, self.namespace):
        # NOTE(review): `constant` (singular) alias kept as-is -- confirm
        # it is imported; sibling code spells it `constants`.
        if mon.ocp.get_resource_status(mon.name) == constant.STATUS_RUNNING:
            self.mons.append(mon)
    # TODO: End of workaround for BZ1748325
    self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
    self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
    self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
    self.toolbox = pod.get_ceph_tools_pod()
    # set port attrib on mon pods
    self.mons = list(map(self.set_port, self.mons))
    self.cluster.reload()
    if self.cephfs:
        self.cephfs.reload()
    else:
        # No CephFS known yet: try to discover one; an empty 'items'
        # list raises IndexError, which we treat as "no CephFS".
        try:
            self.cephfs_config = self.CEPHFS.get().get('items')[0]
            self.cephfs = ocs.OCS(**self.cephfs_config)
            self.cephfs.reload()
        except IndexError as e:
            logging.warning(e)
            logging.warning("No CephFS found")
    self.mon_count = len(self.mons)
    self.mds_count = len(self.mdss)
    self.mgr_count = len(self.mgrs)
    self.osd_count = len(self.osds)
def verify_multus_network():
    """
    Verify Multus network(s) created successfully and are present on
    relevant pods.

    Checks, in order:
        1. The public NetworkAttachmentDefinition exists
        2. OSD pods carry the public network annotation
        3. mon/mds/mgr/rgw pods carry the public network annotation
        4. CSI plugin and provisioner pods carry the annotation
        5. The StorageCluster network spec selects the multus provider
    """
    with open(constants.MULTUS_YAML, mode="r") as f:
        # BUG FIX: bare yaml.load(stream) without a Loader is deprecated and
        # removed in PyYAML >= 6; the NAD yaml is plain data, so safe_load
        # is both sufficient and safe.
        multus_public_data = yaml.safe_load(f)
    multus_namespace = multus_public_data["metadata"]["namespace"]
    multus_name = multus_public_data["metadata"]["name"]
    multus_public_network_name = f"{multus_namespace}/{multus_name}"

    log.info("Verifying multus NetworkAttachmentDefinitions")
    # Constructing the OCP object fails if the NAD does not exist
    ocp.OCP(
        resource_name=multus_public_network_name,
        kind="network-attachment-definitions",
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    # TODO: also check if private NAD exists

    log.info("Verifying multus public network exists on ceph pods")
    osd_pods = get_osd_pods()
    for _pod in osd_pods:
        assert (
            _pod.data["metadata"]["annotations"]["k8s.v1.cni.cncf.io/networks"]
            == multus_public_network_name
        )
    # TODO: also check private network if it exists on OSD pods

    mon_pods = get_mon_pods()
    mds_pods = get_mds_pods()
    mgr_pods = get_mgr_pods()
    rgw_pods = get_rgw_pods()
    ceph_pods = [*mon_pods, *mds_pods, *mgr_pods, *rgw_pods]
    for _pod in ceph_pods:
        assert (
            _pod.data["metadata"]["annotations"]["k8s.v1.cni.cncf.io/networks"]
            == multus_public_network_name
        )

    log.info("Verifying multus public network exists on CSI pods")
    csi_pods = []
    interfaces = [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]
    for interface in interfaces:
        plugin_pods = get_plugin_pods(interface)
        csi_pods += plugin_pods
    cephfs_provisioner_pods = get_cephfsplugin_provisioner_pods()
    rbd_provisioner_pods = get_rbdfsplugin_provisioner_pods()
    csi_pods += cephfs_provisioner_pods
    csi_pods += rbd_provisioner_pods
    for _pod in csi_pods:
        assert (
            _pod.data["metadata"]["annotations"]["k8s.v1.cni.cncf.io/networks"]
            == multus_public_network_name
        )

    log.info("Verifying StorageCluster multus network data")
    sc = get_storage_cluster()
    sc_data = sc.get().get("items")[0]
    network_data = sc_data["spec"]["network"]
    assert network_data["provider"] == "multus"
    selectors = network_data["selectors"]
    assert selectors["public"] == f"{defaults.ROOK_CLUSTER_NAMESPACE}/ocs-public"
def validate_mon_pods():
    """
    Checks mon pods are running with retries
    """
    # Iterate the live mon list directly; wait_for_resource_state retries
    # internally until each pod reaches Running.
    for mon_pod in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
        wait_for_resource_state(resource=mon_pod, state=constants.STATUS_RUNNING)
def test_connection_time_out(self):
    """
    Test that connection from mon pod to external domain is blocked
    and gets timeout
    """
    # Any mon pod works for this check; take the first one
    first_mon = pod.get_mon_pods()[0]
    # The curl must fail (network egress blocked); capture the failure
    # and confirm it is a timeout rather than some other error.
    with pytest.raises(CommandFailed) as cmdfailed:
        first_mon.exec_cmd_on_pod("curl google.com")
    assert "Connection timed out" in str(cmdfailed)
def setup(self, request, pod_factory):
    """
    Set values for:
        paxos_service_trim_min=10
        paxos_service_trim_max=100
        osd_op_complaint_time=0.000001
    """
    self.fio_pod_obj = pod_factory(constants.CEPHFILESYSTEM)
    mon_pods = get_mon_pods()
    self.selected_mon_pod_obj = random.choice(mon_pods)
    # The 'mon' label value (a/b/c...) names the daemon for `ceph tell`
    self.selected_mon_pod = (
        self.selected_mon_pod_obj.get().get("metadata").get("labels").get("mon")
    )
    log.info(f"Selected mon pod is: {self.selected_mon_pod_obj.name}")
    log.info(
        "Setting values: paxos_service_trim_min=10, paxos_service_trim_max=100 "
        "and osd_op_complaint_time=0.000001"
    )
    self.ct_pod = pod.get_ceph_tools_pod()
    # mon in the "tell" command should be mon.a / mon.b / mon.c
    self.ct_pod.exec_ceph_cmd(
        ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs --paxos_service_trim_min=10"
    )
    self.ct_pod.exec_ceph_cmd(
        ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs --paxos_service_trim_max=100"
    )
    self.ct_pod.exec_ceph_cmd(
        ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs --osd_op_complaint_time=0.000001"
    )

    def finalizer():
        """
        Set default values for:
            paxos_service_trim_min=250
            paxos_service_trim_max=500
            osd_op_complaint_time=30.000000
        """
        # Signal the mon-db watcher (if still active) to stop before
        # restoring the defaults.
        if not self.stop_checking_mon_db:
            self.stop_checking_mon_db = True
        log.info(
            f"Setting default values for paxos_service_trim_min({constants.DEFAULT_PAXOS_SERVICE_TRIM_MIN}), "
            f"paxos_service_trim_max({constants.DEFAULT_PAXOS_SERVICE_TRIM_MAX}) "
            f"and osd_op_complaint_time({constants.DEFAULT_OSD_OP_COMPLAINT_TIME})"
        )
        self.ct_pod.exec_ceph_cmd(
            ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs "
            f"--paxos_service_trim_min={constants.DEFAULT_PAXOS_SERVICE_TRIM_MIN}"
        )
        self.ct_pod.exec_ceph_cmd(
            ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs "
            f"--paxos_service_trim_max={constants.DEFAULT_PAXOS_SERVICE_TRIM_MAX}"
        )
        self.ct_pod.exec_ceph_cmd(
            ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs "
            f"--osd_op_complaint_time={constants.DEFAULT_OSD_OP_COMPLAINT_TIME}"
        )

    request.addfinalizer(finalizer)
def set_resource(self, resource, leader_type="provisioner"):
    """
    Record the pods backing ``resource`` plus the matching label selector
    and pod count, for later disruption.

    Args:
        resource (str): resource name ('mgr', 'mon', 'osd', 'mds',
            'cephfsplugin', 'rbdplugin', 'cephfsplugin_provisioner',
            'rbdplugin_provisioner', 'operator')
        leader_type (str): passed through to
            pod.get_plugin_provisioner_leader for provisioner resources

    """
    self.resource = resource
    if (config.ENV_DATA["platform"] in constants.MANAGED_SERVICE_PLATFORMS) and (
        resource in CEPH_PODS
    ):
        # If the platform is Managed Services, then the ceph pods will be present in the provider cluster.
        # Consumer cluster will be the primary cluster context in a multicluster run. Setting 'cluster_kubeconfig'
        # attribute to use as the value of the parameter '--kubeconfig' in the 'oc' commands to get ceph pods.
        provider_kubeconfig = os.path.join(
            config.clusters[config.get_provider_index()].ENV_DATA["cluster_path"],
            config.clusters[config.get_provider_index()].RUN.get(
                "kubeconfig_location"
            ),
        )
        self.cluster_kubeconfig = provider_kubeconfig
    # Non-zero only for provisioner resources, where resource_obj holds
    # just the leader but the count must cover all provisioner pods.
    resource_count = 0
    if self.resource == "mgr":
        self.resource_obj = pod.get_mgr_pods()
        self.selector = constants.MGR_APP_LABEL
    if self.resource == "mon":
        self.resource_obj = pod.get_mon_pods()
        self.selector = constants.MON_APP_LABEL
    if self.resource == "osd":
        self.resource_obj = pod.get_osd_pods()
        self.selector = constants.OSD_APP_LABEL
    if self.resource == "mds":
        self.resource_obj = pod.get_mds_pods()
        self.selector = constants.MDS_APP_LABEL
    if self.resource == "cephfsplugin":
        self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHFILESYSTEM)
        self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
    if self.resource == "rbdplugin":
        self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHBLOCKPOOL)
        self.selector = constants.CSI_RBDPLUGIN_LABEL
    if self.resource == "cephfsplugin_provisioner":
        self.resource_obj = [
            pod.get_plugin_provisioner_leader(
                interface=constants.CEPHFILESYSTEM, leader_type=leader_type
            )
        ]
        self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_cephfsplugin_provisioner_pods())
    if self.resource == "rbdplugin_provisioner":
        self.resource_obj = [
            pod.get_plugin_provisioner_leader(
                interface=constants.CEPHBLOCKPOOL, leader_type=leader_type
            )
        ]
        self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
    if self.resource == "operator":
        self.resource_obj = pod.get_operator_pods()
        self.selector = constants.OPERATOR_LABEL
    self.resource_count = resource_count or len(self.resource_obj)
def set_resource(self, resource):
    """
    Resolve the pods backing the given ceph resource and cache the
    resulting pod list and its count on the instance.
    """
    self.resource = resource
    # The values are mutually exclusive, so an elif chain is equivalent
    # to the original independent checks.
    if self.resource == 'mgr':
        self.resource_obj = pod.get_mgr_pods()
    elif self.resource == 'mon':
        self.resource_obj = pod.get_mon_pods()
    elif self.resource == 'osd':
        self.resource_obj = pod.get_osd_pods()
    elif self.resource == 'mds':
        self.resource_obj = pod.get_mds_pods()
    self.resource_count = len(self.resource_obj)
def check_mon_pods_eq_3(self):
    """
    Get number of monitoring pods

    Returns:
        bool: False if number of mon pods is 3, True otherwise

    """
    mon_pod_list = get_mon_pods()
    # Happy path: exactly three mons -> nothing to report
    if len(mon_pod_list) == 3:
        return False
    # Unexpected count: log it together with the pod names
    log.info(f"There are {len(mon_pod_list)} mon pods")
    for mon_pod in mon_pod_list:
        log.info(f"{mon_pod.name}")
    return True
def generate_monmap_cmd():
    """
    Generates monmap-tool command used to rebuild monitors

    Returns:
        str: Monitor map command

    """
    mon_ids = []
    mon_ips = []
    logger.info("Getting monitor pods public IP")
    mon_pods = get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
    for mon in mon_pods:
        mon_ids.append(mon.get().get("metadata").get("labels").get("ceph_daemon_id"))
        logger.info(f"getting public ip of {mon.name}")
        logger.info(mon_ids)
        # BUG FIX: the original appended the whole re.findall() list, so the
        # generated command contained a python list repr ("['10.0.0.1']")
        # instead of a bare IP. Take the first IPv4 match instead.
        ip_matches = re.findall(
            r"[0-9]+(?:\.[0-9]+){3}",
            mon.get().get("spec").get("initContainers")[1].get("args")[-2],
        )
        mon_ips.append(ip_matches[0])
    mon_a = mon_pods[0]
    logger.info(f"Working on monitor: {mon_a.name} to get FSID")
    # The fsid is passed to the mon init container as "--fsid=<uuid>"
    fsid = (
        mon_a.get()
        .get("spec")
        .get("initContainers")[1]
        .get("args")[0]
        .replace("--fsid=", "")
    )
    # Build "--add <id> <ip> " fragments; each fragment keeps the original
    # trailing space so the final command layout is unchanged.
    mon_ip_ids = "".join(
        f"--add {mon_id} {mon_ip} " for mon_id, mon_ip in zip(mon_ids, mon_ips)
    )
    mon_map_cmd = f"monmaptool --create {mon_ip_ids} --enable-all-features --clobber /tmp/monmap --fsid {fsid}"
    logger.info(f"Generated monitor map creation command: {mon_map_cmd}")
    return mon_map_cmd
def test_pod_log_after_upgrade():
    """
    Check OSD/MON/MGR pod logs after upgrade and verify the expected log exist
    """
    pod_objs = get_osd_pods() + get_mon_pods() + get_mgr_pods()
    expected_log_after_upgrade = "set uid:gid to 167:167 (ceph:ceph)"
    logging.info(
        f"Check that the log '{expected_log_after_upgrade}' "
        f"appears after the osd/mon/mg pod is initialized"
    )
    # Iterate the pod objects directly instead of extracting names first
    for pod_obj in pod_objs:
        pod_logs = get_pod_logs(pod_name=pod_obj.name, all_containers=True)
        assert expected_log_after_upgrade in pod_logs, (
            f"The expected log after upgrade '{expected_log_after_upgrade}' does not exist"
            f" on pod {pod_obj.name}"
        )
    logging.info(
        f"The log '{expected_log_after_upgrade}' appears in all relevant pods."
    )
def set_resource(self, resource, leader_type="provisioner"):
    """
    Record the pods backing ``resource`` plus the matching label selector
    and pod count, for later disruption.

    Args:
        resource (str): resource name ('mgr', 'mon', 'osd', 'mds',
            'cephfsplugin', 'rbdplugin', 'cephfsplugin_provisioner',
            'rbdplugin_provisioner', 'operator')
        leader_type (str): passed through to
            pod.get_plugin_provisioner_leader for provisioner resources

    """
    self.resource = resource
    # Stays 0 except for provisioner resources, where resource_obj holds
    # only the leader but the count must cover all provisioner pods.
    resource_count = 0
    if resource == "mgr":
        self.resource_obj = pod.get_mgr_pods()
        self.selector = constants.MGR_APP_LABEL
    elif resource == "mon":
        self.resource_obj = pod.get_mon_pods()
        self.selector = constants.MON_APP_LABEL
    elif resource == "osd":
        self.resource_obj = pod.get_osd_pods()
        self.selector = constants.OSD_APP_LABEL
    elif resource == "mds":
        self.resource_obj = pod.get_mds_pods()
        self.selector = constants.MDS_APP_LABEL
    elif resource == "cephfsplugin":
        self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHFILESYSTEM)
        self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
    elif resource == "rbdplugin":
        self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHBLOCKPOOL)
        self.selector = constants.CSI_RBDPLUGIN_LABEL
    elif resource == "cephfsplugin_provisioner":
        self.resource_obj = [
            pod.get_plugin_provisioner_leader(
                interface=constants.CEPHFILESYSTEM, leader_type=leader_type
            )
        ]
        self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_cephfsplugin_provisioner_pods())
    elif resource == "rbdplugin_provisioner":
        self.resource_obj = [
            pod.get_plugin_provisioner_leader(
                interface=constants.CEPHBLOCKPOOL, leader_type=leader_type
            )
        ]
        self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
    elif resource == "operator":
        self.resource_obj = pod.get_operator_pods()
        self.selector = constants.OPERATOR_LABEL
    self.resource_count = resource_count or len(self.resource_obj)
def get_node_pods_to_scale_down(node_name):
    """
    Get the pods of a node to scale down as
    described in the documents of node replacement with LSO

    Args:
        node_name (str): The node name

    Returns:
        list: The node's pods to scale down

    """
    # Candidate daemons: mons, osds and mgrs; filter them to the given node
    candidate_pods = pod.get_mon_pods() + pod.get_osd_pods() + pod.get_mgr_pods()
    return get_node_pods(node_name, candidate_pods)
def set_resource(self, resource):
    """
    Record the pods backing ``resource`` plus the matching label selector
    and pod count, for later disruption.
    """
    self.resource = resource
    # Stays 0 except for provisioner resources, where resource_obj holds
    # only the leader but the count must cover all provisioner pods.
    resource_count = 0
    if resource == 'mgr':
        self.resource_obj = pod.get_mgr_pods()
        self.selector = constants.MGR_APP_LABEL
    elif resource == 'mon':
        self.resource_obj = pod.get_mon_pods()
        self.selector = constants.MON_APP_LABEL
    elif resource == 'osd':
        self.resource_obj = pod.get_osd_pods()
        self.selector = constants.OSD_APP_LABEL
    elif resource == 'mds':
        self.resource_obj = pod.get_mds_pods()
        self.selector = constants.MDS_APP_LABEL
    elif resource == 'cephfsplugin':
        self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHFILESYSTEM)
        self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
    elif resource == 'rbdplugin':
        self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHBLOCKPOOL)
        self.selector = constants.CSI_RBDPLUGIN_LABEL
    elif resource == 'cephfsplugin_provisioner':
        self.resource_obj = [
            pod.plugin_provisioner_leader(interface=constants.CEPHFILESYSTEM)
        ]
        self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_cephfsplugin_provisioner_pods())
    elif resource == 'rbdplugin_provisioner':
        self.resource_obj = [
            pod.plugin_provisioner_leader(interface=constants.CEPHBLOCKPOOL)
        ]
        self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
    elif resource == 'operator':
        self.resource_obj = pod.get_operator_pods()
        self.selector = constants.OPERATOR_LABEL
    self.resource_count = resource_count or len(self.resource_obj)
def scan_cluster(self):
    """
    Get accurate info on current state of pods
    """
    # Refresh all cached pod lists from the cluster
    self._ceph_pods = pod.get_all_pods(self._namespace)
    self.mons = pod.get_mon_pods(self.mon_selector, self.namespace)
    self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
    self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
    self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
    self.toolbox = pod.get_ceph_tools_pod()
    # set port attrib on mon pods
    self.mons = list(map(self.set_port, self.mons))
    self.cluster.reload()
    # cephfs_config is only populated once a CephFS has been discovered
    if self.cephfs_config:
        self.cephfs.reload()
    # Cache counts for cheap access by the health checks
    self.mon_count = len(self.mons)
    self.mds_count = len(self.mdss)
    self.mgr_count = len(self.mgrs)
    self.osd_count = len(self.osds)
def teardown(self, request):
    """
    Verifies cluster is healthy
    """
    # Snapshot of current mon pods; only its length (the expected mon
    # count) is used in the finalizer below.
    mon_pod = get_mon_pods()

    def finalizer():
        try:
            # Validate all mon pods are running
            log.info("Validate all mons are up and running")
            POD_OBJ.wait_for_resource(
                condition=STATUS_RUNNING,
                selector=MON_APP_LABEL,
                resource_count=len(mon_pod),
            )
            log.info("All mons are up and running")
        except (TimeoutExpiredError, ResourceWrongStatusException) as ex:
            log.error(f"{ex}")
            # Restart operator so it reconciles the mons back to Running
            operator_pod_obj = get_operator_pods()
            delete_pods(pod_objs=operator_pod_obj)
            # Wait untill mon pod recovery
            POD_OBJ.wait_for_resource(
                condition=STATUS_RUNNING,
                selector=MON_APP_LABEL,
                resource_count=len(mon_pod),
                timeout=3600,
                sleep=5,
            )
            log.info("All mons are up and running")
        # Check the ceph health OK
        ceph_health_check(tries=90, delay=15)

    request.addfinalizer(finalizer)
def workloads_dir_setup(self, request):
    """
    Setting up the environment for the test
    """
    if config.DEPLOYMENT.get("local_storage"):
        # LSO deployment: mon data lives under /var/lib/rook on the host.
        # Find the mon directory on the first worker node via `oc debug`
        # and resolve it back to the matching mon pod by ceph_daemon_id.
        self.worker_node = node.get_worker_nodes()[0]
        self.oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        mon_pod_name = self.oc_cmd.exec_oc_debug_cmd(
            node=self.worker_node,
            cmd_list=["ls /var/lib/rook/ | grep mon"],
        )
        # "mon-<id>" -> "<id>" (strip the trailing newline from the shell)
        mon_pod_id = mon_pod_name.split("-")[1].replace("\n", "")
        mon_pods_info = pod.get_pods_having_label(
            label=f"ceph_daemon_id={mon_pod_id}",
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        )
        self.mon_pod = pod.get_pod_obj(
            name=mon_pods_info[0]["metadata"]["name"],
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        )
    else:
        # Non-LSO: any mon will do
        self.mon_pod = random.choice(pod.get_mon_pods())
    self.mon_suffix = self.mon_pod.get().get("metadata").get("labels").get("mon")

    # Create a scratch workloads dir inside the selected mon's data dir
    self.workloads_dir = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}/workloads"
    log.info(f"Selected mon '{self.mon_pod.name}'")
    self.mon_pod.exec_cmd_on_pod(f"mkdir {self.workloads_dir}")
    self.mon_pod.exec_cmd_on_pod(f"touch {self.workloads_dir}/{TEMP_FILE}")

    def finalizer():
        # Remove the scratch dir and give ceph time to settle before the
        # health check
        self.mon_pod.exec_cmd_on_pod(f"rm -rf {self.workloads_dir}")
        time.sleep(SLEEP_TIMEOUT)
        utils.ceph_health_check()

    request.addfinalizer(finalizer)
def finalizer():
    """
    Restore rook-ceph-operator to one replica and wait until all three
    mon pods are Running again, re-spinning the operator deployment and
    the mon pods if they do not recover on their own.
    """
    op_obj = OCP(
        kind=constants.DEPLOYMENT,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    pod_obj = OCP(
        kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    operator_obj = op_obj.get(resource_name=constants.ROOK_CEPH_OPERATOR)
    if operator_obj.get("spec").get("replicas") != 1:
        # BUG FIX: the original evaluated
        #   modify_deployment_replica_count(...), "Failed to scale up ..."
        # as a bare tuple expression -- the call result was discarded and
        # the message was dead code. Assert the result so a failed
        # scale-up is actually reported.
        assert modify_deployment_replica_count(
            deployment_name=constants.ROOK_CEPH_OPERATOR, replica_count=1
        ), "Failed to scale up rook-ceph-operator to 1"
    log.info("Validate all mons are up and running")
    try:
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=3,
            timeout=60,
            sleep=5,
        )
    except (TimeoutExpiredError, ResourceWrongStatusException) as ex:
        log.warning(ex)
        # Mons did not recover in time: force a fresh operator reconcile,
        # re-spin the mon pods, and wait longer.
        op_obj.delete(resource_name=constants.ROOK_CEPH_OPERATOR)
        # Loop var renamed from `pod` to avoid shadowing the pod helper
        # module name used elsewhere in this codebase.
        for mon_pod in get_mon_pods():
            mon_pod.delete()
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=3,
            timeout=360,
            sleep=5,
        )
    log.info("All mons are up and running")
def test_rook_operator_restart_during_mon_failover(self, node_drain_teardown):
    """
    Verify the number of monitoring pod is three when drain node

    """
    sample = TimeoutSampler(
        timeout=100,
        sleep=10,
        func=verify_pdb_mon,
        disruptions_allowed=1,
        max_unavailable_mon=1,
    )
    # BUG FIX: the original wrote `if not ...: assert "<message>"` -- an
    # assert on a non-empty string literal, which can never fail. Assert
    # the sampler result directly so a wrong PDB state fails the test.
    assert sample.wait_for_func_status(
        result=True
    ), "the expected pdb state is not equal to actual pdb state"

    log.info("Get worker node name where monitoring pod run")
    mon_pod_objs = get_mon_pods()
    node_name = mon_pod_objs[0].data["spec"]["nodeName"]

    drain_nodes([node_name])

    sample = TimeoutSampler(
        timeout=100,
        sleep=10,
        func=verify_pdb_mon,
        disruptions_allowed=0,
        max_unavailable_mon=1,
    )
    assert sample.wait_for_func_status(
        result=True
    ), "the expected pdb state is not equal to actual pdb state"

    timeout = 1400
    log.info(f"Verify the number of mon pods is 3 for {timeout} seconds")
    sample = TimeoutSampler(timeout=timeout, sleep=10, func=check_number_of_mon_pods)
    # Fail if the mon count ever deviated from 3 during the window
    assert not sample.wait_for_func_status(
        result=False
    ), "There are more than 3 mon pods."

    log.info("Respin pod rook-ceph operator pod")
    rook_ceph_operator_pod_obj = get_operator_pods()
    rook_ceph_operator_pod_obj[0].delete()

    schedule_nodes([node_name])

    log.info("Wait for all the pods in openshift-storage to be running.")
    assert wait_for_pods_to_be_running(timeout=300)

    sample = TimeoutSampler(
        timeout=100,
        sleep=10,
        func=verify_pdb_mon,
        disruptions_allowed=1,
        max_unavailable_mon=1,
    )
    assert sample.wait_for_func_status(
        result=True
    ), "the expected pdb state is not equal to actual pdb state"

    ceph_health_check()
    assert check_number_of_mon_pods(), "The number of mon pods not equal to 3"
def test_del_mon_svc(
    self, multi_pvc_factory, validate_all_mon_svc_are_up_at_teardown
):
    """
    Test to verify same mon comes up and running
    after deleting mon services manually and joins the quorum

    1. Delete the mon services
    2. Restart the rook operator
    3. Make sure all mon pods are running,
       and same service or endpoints are running
    4. Make sure ceph health Ok and storage pods are running
    5. Create PVC, should succeeded.

    """
    self.sanity_helpers = Sanity()

    # Get all mon services
    mon_svc_before = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )

    # Get all mon pods (only the count is used later)
    mon_pods = get_mon_pods()

    # Delete the mon services one by one
    svc_obj = OCP(
        kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    mon_svc_ip_before = []
    for svc in mon_svc_before:
        svc_name = svc["metadata"]["name"]
        # Remember the clusterIP so we can compare after recreation
        mon_svc_ip_before.append(svc["spec"]["clusterIP"])
        log.info(f"Delete mon service {svc_name}")
        svc_obj.delete(resource_name=svc_name)
        # Verify mon services deleted
        svc_obj.wait_for_delete(resource_name=svc_name)

    # Restart the rook-operator pod so it recreates the services
    operator_pod_obj = get_operator_pods()
    delete_pods(pod_objs=operator_pod_obj)
    POD_OBJ.wait_for_resource(
        condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL
    )

    # Verify same mon services are created again
    for svc in mon_svc_before:
        svc_name = svc["metadata"]["name"]
        svc_obj.check_resource_existence(
            should_exist=True, timeout=300, resource_name=svc_name
        )
    log.info("Same old mon services are recreated")

    # Validate all mons are running
    log.info("Validate all mons are up and running")
    POD_OBJ.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.MON_APP_LABEL,
        resource_count=len(mon_pods),
        timeout=600,
        sleep=3,
    )

    # Validate same mon services are running
    log.info("Validate same mon services are running")
    mon_svc_after = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    mon_svc_ip_after = [svc["spec"]["clusterIP"] for svc in mon_svc_after]
    # Symmetric difference empty <=> exactly the same clusterIPs survived
    assert len(set(mon_svc_ip_after) ^ set(mon_svc_ip_before)) == 0, (
        "Different mon services are running. "
        f"Before mon services list: {mon_svc_ip_before}, "
        f"After mon services list: {mon_svc_ip_after}"
    )
    log.info("Same old mon services are running and all mons are in running state")

    # Verify everything running fine
    log.info("Verifying All resources are Running and matches expected result")
    self.sanity_helpers.health_check(tries=120)

    # Validate all storage pods are running
    wait_for_storage_pods()

    # Create and delete resources
    self.sanity_helpers.create_pvc_delete(multi_pvc_factory=multi_pvc_factory)
def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
    """
    1. Delete one mon service
    2. Edit the configmap rook-ceph-endpoints
       remove all the deleted mon services entries
    3. Delete deployment, pvc of deleted mon service
    4. Restart rook-ceph-operator
    5. Make sure all mon pods are running
    6. Make sure ceph health Ok and storage pods are running
    7. Sleep for 300 seconds before deleting another mon
    8. Repeat above steps for all mons and at the end each mon should
       contain different endpoints
    9. Create PVC, should succeeded.

    """
    pod_obj = pod_factory(interface=interface)
    run_io_in_bg(pod_obj)

    # Get all mon services
    mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )

    # Get all mon pods
    mon_pods = get_mon_pods()
    mon_count = len(mon_pods)

    list_old_svc = []
    for svc in mon_svc:
        # Get rook-ceph-operator pod obj
        operator_pod_obj = get_operator_pods()
        operator_name = operator_pod_obj[0].name

        # Scale down rook-ceph-operator so it does not fight the cleanup
        log.info("Scale down rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=0
        ), "Failed to scale down rook-ceph-operator to 0"
        log.info("Successfully scaled down rook-ceph-operator to 0")

        # Validate rook-ceph-operator pod not running
        POD_OBJ.wait_for_delete(resource_name=operator_name)

        svc_name = svc["metadata"]["name"]
        cluster_ip = svc["spec"]["clusterIP"]
        port = svc["spec"]["ports"][0]["port"]
        mon_endpoint = f"{cluster_ip}:{port}"
        mon_id = svc["spec"]["selector"]["mon"]
        list_old_svc.append(cluster_ip)

        # Delete deployment
        log.info("Delete mon deployments")
        del_obj = OCP(
            kind=constants.DEPLOYMENT,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_info = del_obj.get(resource_name=svc_name)
        del_obj.delete(resource_name=svc_name)

        # Delete pvc (or, on LSO, the mon's host data directory)
        if is_lso_cluster():
            mon_data_path = f"/var/lib/rook/mon-{mon_id}"
            mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                "kubernetes.io/hostname"
            ]
            log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
            cmd = f"rm -rf {mon_data_path}"
            ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
            ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
        else:
            log.info("Delete mon PVC")
            pvc_name = svc["metadata"]["labels"]["pvc_name"]
            pvc_obj = OCP(
                kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            pvc_obj.delete(resource_name=pvc_name)

        # Delete the mon service
        log.info("Delete mon service")
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        svc_obj.delete(resource_name=svc_name)

        # Edit the cm
        log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
        configmap_obj = OCP(
            kind=constants.CONFIGMAP,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        output_get = configmap_obj.get(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
        )
        new_data = output_get["data"]
        # BUG FIX: the original tested `.find(...) != 1`; str.find returns
        # -1 when the substring is absent, so that condition was virtually
        # always true and the trailing-comma fallback was unreachable.
        # Use `!= -1`, matching the parallel "mapping" edit below.
        new_data["csi-cluster-config-json"] = (
            new_data["csi-cluster-config-json"].replace(f'"{mon_endpoint}",', "")
            if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != -1
            else new_data["csi-cluster-config-json"].replace(
                f',"{mon_endpoint}"', ""
            )
        )
        new_data["data"] = ",".join(
            [
                value
                for value in new_data["data"].split(",")
                if f"{mon_id}=" not in value
            ]
        )
        new_data["mapping"] = (
            new_data["mapping"].replace(f'"{mon_id}":null,', "")
            if new_data["mapping"].find(f'"{mon_id}":null,') != -1
            else new_data["mapping"].replace(f',"{mon_id}":null', "")
        )
        params = f'{{"data": {json.dumps(new_data)}}}'
        log.info(f"Removing {mon_id} entries from configmap")
        configmap_obj.patch(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
            params=params,
            format_type="strategic",
        )
        log.info(
            f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
        )

        # Scale up rook-ceph-operator
        log.info("Scale up rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=1
        ), "Failed to scale up rook-ceph-operator to 1"
        log.info("Successfully scaled up rook-ceph-operator to 1")
        log.info("Validate rook-ceph-operator pod is running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.OPERATOR_LABEL,
            resource_count=1,
            timeout=600,
            sleep=5,
        )

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=mon_count,
            timeout=1200,
            sleep=5,
        )
        log.info("All mons are up and running")

        # Check the ceph health OK
        ceph_health_check(tries=90, delay=15)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Sleep for some seconds before deleting another mon
        sleep_time = 300
        log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
        time.sleep(sleep_time)

    # Check the endpoints are different
    log.info("Validate the mon endpoints are changed")
    new_mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    list_new_svc = []
    for new_svc in new_mon_svc:
        cluster_ip = new_svc["spec"]["clusterIP"]
        list_new_svc.append(cluster_ip)
    # Symmetric difference equals the union size only when the old and new
    # endpoint sets are completely disjoint (every endpoint changed).
    diff = set(list_new_svc) ^ set(list_old_svc)
    assert len(diff) == len(list_old_svc + list_new_svc), (
        f"Not all endpoints are changed. Set of old "
        f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
    )
    log.info(f"All new mon endpoints are created {list_new_svc}")

    # Create PVC and pods
    log.info(f"Create {interface} PVC")
    pod_obj = pod_factory(interface=interface)
    pod_obj.run_io(storage_type="fs", size="500M")
def test_check_pods_status_after_node_failure(self, nodes, node_restart_teardown):
    """
    Test check pods status after a node failure event.
    All the rook ceph pods should be in "Running" or "Completed"
    state after a node failure event.

    Flow:
        1. Pick a random OCS node and record the rook-ceph pods, osd ids and
           mon ids that live on it.
        2. Stop the node and wait for it to reach NotReady.
        3. Wait for the rook-ceph pod statuses to react, then verify the pods
           that were NOT on the failed node are Running/Completed.
        4. Verify the replacement pods (excluding osd/mon pods that kept the
           old node's ids) are Running/Completed.
        5. Start the node again and verify cluster health.

    Args:
        nodes: Platform nodes fixture used to stop/start cluster nodes
        node_restart_teardown: Teardown fixture that restores stopped nodes
    """
    ocs_nodes = get_ocs_nodes()
    if not ocs_nodes:
        pytest.skip("We don't have ocs nodes in the cluster")

    ocs_node = random.choice(ocs_nodes)
    node_name = ocs_node.name
    log.info(f"Selected node is '{node_name}'")

    # Save the rook ceph pods, the osd ids, and the mon ids before shutting down the node
    rook_ceph_pod_names_not_in_node = get_rook_ceph_pod_names_not_in_node(node_name)
    node_osd_ids = get_node_osd_ids(node_name)
    node_mon_ids = get_node_mon_ids(node_name)

    log.info(f"Shutting down node '{node_name}'")
    nodes.stop_nodes([ocs_node])
    wait_for_nodes_status(node_names=[node_name], status=constants.NODE_NOT_READY)
    log.info(f"The node '{node_name}' reached '{constants.NODE_NOT_READY}' status")

    log.info("Wait for a change in the rook ceph pod statuses...")
    timeout = 480
    is_rook_ceph_pods_status_changed = wait_for_change_in_rook_ceph_pods(
        node_name, timeout=timeout
    )
    assert (
        is_rook_ceph_pods_status_changed
    ), f"Rook Ceph pods status didn't change after {timeout} seconds"

    # The pods that were not scheduled on the failed node must stay healthy
    log.info("Check the rook ceph pods are in 'Running' or 'Completed' state")
    timeout = 480
    are_pods_running = wait_for_pods_to_be_running(
        pod_names=rook_ceph_pod_names_not_in_node, timeout=timeout, sleep=30
    )
    assert are_pods_running, f"The pods are not 'Running' after {timeout} seconds"

    # Get the rook ceph pods without the osd, and mon pods have the old node ids.
    # Those osd/mon pods are pinned to the failed node's ids and are expected
    # to stay down until the node returns, so exclude them from the check.
    osd_pods = get_osd_pods()
    new_node_osd_id_names_set = {
        p.name for p in osd_pods if get_osd_pod_id(p) in node_osd_ids
    }
    mon_pods = get_mon_pods()
    new_node_mon_id_names_set = {
        p.name for p in mon_pods if get_mon_pod_id(p) in node_mon_ids
    }
    new_node_osd_mon_id_names_set = new_node_osd_id_names_set.union(
        new_node_mon_id_names_set
    )
    rook_ceph_pod_names_set = set(get_rook_ceph_pod_names())
    new_rook_ceph_pod_names = list(
        rook_ceph_pod_names_set - new_node_osd_mon_id_names_set
    )

    log.info(
        "Verify that the new rook ceph pods are in 'Running' or 'Completed' state"
    )
    timeout = 300
    are_new_pods_running = wait_for_pods_to_be_running(
        pod_names=new_rook_ceph_pod_names, timeout=timeout, sleep=20
    )
    assert (
        are_new_pods_running
    ), f"The new pods are not 'Running' after {timeout} seconds"

    log.info("All the pods are in 'Running' or 'Completed' state")

    log.info(f"Starting the node '{node_name}' again...")
    nodes.start_nodes(nodes=[ocs_node])
    wait_for_nodes_status(node_names=[node_name])

    log.info(
        "Waiting for all the pods to be running and cluster health to be OK..."
    )
    # Fix: the original discarded this return value, so a timeout here was
    # silently ignored — assert it like every other wait in this test.
    are_pods_running = wait_for_pods_to_be_running(timeout=600)
    assert are_pods_running, "Not all the pods reached 'Running' state after 600 seconds"
    self.sanity_helpers.health_check(tries=40)
def test_multiple_mon_pod_stays_on_same_node(self):
    """
    A testcase to verify multiple mon pods stays on same node

    1. Edit the rook-ceph-mon-endpoints configmap say, assign mon-a to
       another node that would be on the same node as another mon
       (compute-1 instead of compute-0)
    2. Delete the mon-a deployment
    3. Edit the mon-b deployment to remove the required mon anti-affinity
    4. Restart the operator
    5. Edit the mon-a deployment to remove the required mon anti-affinity
    6. See mon-a start on compute-1 with mon-b
    7. Soon after, see the operator failover one of these mons onto the
       node that doesn't currently have a mon (compute-0) and start mon-d
    """
    ocs_version = config.ENV_DATA["ocs_version"]
    # Check that we have LSO cluster and OCS version is 4.8 and below
    # This is a workaround due to issue https://github.com/red-hat-storage/ocs-ci/issues/4937
    if not (
        is_lso_cluster() and Version.coerce(ocs_version) <= Version.coerce("4.8")
    ):
        pytest.skip(
            "Skip the test because mons are not node assignment from Rook, if cluster is not "
            "LSO based. And also currently, we want to run the test only with OCS 4.8 and "
            "below. This is a workaround due to issue "
            "https://github.com/red-hat-storage/ocs-ci/issues/4937"
        )

    # Initialize
    rook_ceph_mon = "rook-ceph-mon"

    # Get mons running on pod; mon[0] is the one to delete, mon[1] the one
    # whose node assignment mon[0] will be given.
    mon_pods = get_mon_pods()
    mon_name_to_del = mon_pods[0].get().get("metadata").get("labels").get("mon")
    mon_name_to_edit = mon_pods[1].get().get("metadata").get("labels").get("mon")
    mon_node = get_pod_node(mon_pods[1])

    # Edit the rook-ceph-mon-endpoints: point the deleted mon's node mapping
    # at the other mon's node so both are scheduled together.
    log.info(f"Edit the configmap {ROOK_CEPH_MON_ENDPOINTS}")
    configmap_obj = OCP(kind=CONFIGMAP, namespace=OPENSHIFT_STORAGE_NAMESPACE)
    rook_ceph_mon_configmap = configmap_obj.get(
        resource_name=ROOK_CEPH_MON_ENDPOINTS
    )
    json_val = json.loads(rook_ceph_mon_configmap["data"]["mapping"])
    json_val["node"][mon_name_to_del].update(json_val["node"][mon_name_to_edit])
    rook_ceph_mon_configmap["data"]["mapping"] = json.dumps(json_val)
    new_data = rook_ceph_mon_configmap["data"]
    params = f'{{"data": {json.dumps(new_data)}}}'
    configmap_obj.patch(
        resource_name=ROOK_CEPH_MON_ENDPOINTS,
        params=params,
        format_type="strategic",
    )
    log.info(f"Configmap {ROOK_CEPH_MON_ENDPOINTS} edited successfully")
    log.info(
        f"Rook-ceph-mon-endpoints updated configmap: {rook_ceph_mon_configmap}"
    )

    # Delete one mon deployment which had been edited
    dep_obj = OCP(kind=DEPLOYMENT, namespace=OPENSHIFT_STORAGE_NAMESPACE)
    mon_deployment_name_to_del = f"{rook_ceph_mon}-{mon_name_to_del}"
    log.info(f"Deleting mon {mon_deployment_name_to_del} deployments")
    dep_obj.delete(resource_name=mon_deployment_name_to_del)

    # Edit other mon deployment to remove mon anti-affinity
    mon_deployment_name_to_edit = f"{rook_ceph_mon}-{mon_name_to_edit}"
    log.info(
        f"Edit mon {mon_deployment_name_to_edit} deployment "
        "to remove the required mon anti-affinity"
    )
    params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
    dep_obj.patch(
        resource_name=mon_deployment_name_to_edit, params=params, format_type="json"
    )
    log.info(
        f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_edit}"
    )

    # Restart operator so it picks up the edited configmap/deployments
    operator_pod_obj = get_operator_pods()
    delete_pods(pod_objs=operator_pod_obj)
    POD_OBJ.wait_for_resource(condition=STATUS_RUNNING, selector=OPERATOR_LABEL)

    # Validate deleted deployment mon came up and in pending state
    # Initially mon stucks in pending state, remove defined anti-affinity
    POD_OBJ.wait_for_resource(
        condition=STATUS_PENDING,
        resource_count=1,
        selector=MON_APP_LABEL,
        timeout=1200,
    )

    # Edit mon deployment to remove mon anti-affinity
    log.info(
        f"Edit mon {mon_deployment_name_to_del} deployment "
        "to remove the required mon anti-affinity"
    )
    params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
    dep_obj.patch(
        resource_name=mon_deployment_name_to_del, params=params, format_type="json"
    )
    log.info(
        f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_del}"
    )

    # Validate mon pod moved to another node such that 2 mons are running on same node
    log.info("Waiting for 5 seconds for mon recovery")
    time.sleep(5)
    new_mon_pods = get_mon_pods()
    new_node = [
        get_pod_node(mon)
        for mon in new_mon_pods
        if mon.get().get("metadata").get("labels").get("mon") == mon_name_to_del
    ]
    # Fix: guard against the recreated mon pod not being found — the original
    # raised a bare IndexError here instead of a meaningful failure.
    assert new_node, f"Mon pod {mon_name_to_del} was not found after recovery"
    # Fix: the original failure message stated the opposite of the failure
    # condition; it now reports that the mon did NOT land on the expected node.
    assert new_node[0].name == mon_node.name, (
        f"Mon {mon_name_to_del} did not move to node {mon_node.name}, "
        f"so 2 mons are not running on the same node"
    )

    # Verify rook deletes one of the mon and move to another node
    timeout = 60
    log.info(f"Waiting for {timeout} seconds for mon recovery")
    time.sleep(timeout)

    POD_OBJ.wait_for_resource(
        condition=STATUS_RUNNING,
        resource_count=len(mon_pods),
        selector=MON_APP_LABEL,
        timeout=3600,
        sleep=5,
    )
    log.info(
        "Mons are up and running state and validate are running on different nodes"
    )
    mon_pods_running_on_same_node()