Code Example #1
def corrupt_ceph_monitors():
    """
    Corrupts ceph monitors by deleting store.db file

    """
    mon_pods = get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
    for mon in mon_pods:
        logger.info(f"Corrupting monitor: {mon.name}")
        mon_id = mon.get().get("metadata").get("labels").get("ceph_daemon_id")
        _exec_cmd_on_pod(
            cmd=f"rm -rf /var/lib/ceph/mon/ceph-{mon_id}/store.db", pod_obj=mon
        )
        try:
            wait_for_resource_state(resource=mon, state=constants.STATUS_CLBO)
        except ResourceWrongStatusException:
            if (
                mon.ocp.get_resource(resource_name=mon.name, column="STATUS")
                != constants.STATUS_CLBO
            ):
                logger.info(
                    f"Re-spinning monitor: {mon.name} since it did not reach CLBO state"
                )
                mon.delete()
    logger.info("Validating all the monitors are in CLBO state")
    for mon in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
        wait_for_resource_state(resource=mon, state=constants.STATUS_CLBO)
Code Example #2
def remove_global_id_reclaim():
    """
    Removes the global_id reclaim health warning by re-spinning CSI client, MDS and mon pods

    """
    csi_pods = []
    interfaces = [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]
    for interface in interfaces:
        plugin_pods = get_plugin_pods(interface)
        csi_pods += plugin_pods

    cephfs_provisioner_pods = get_cephfsplugin_provisioner_pods()
    rbd_provisioner_pods = get_rbdfsplugin_provisioner_pods()

    csi_pods += cephfs_provisioner_pods
    csi_pods += rbd_provisioner_pods
    for csi_pod in csi_pods:
        csi_pod.delete()
    for mds_pod in get_mds_pods():
        mds_pod.delete()
    for mds_pod in get_mds_pods():
        wait_for_resource_state(resource=mds_pod, state=constants.STATUS_RUNNING)
    for mon in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
        mon.delete()
    mon_pods = get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
    for mon in mon_pods:
        wait_for_resource_state(resource=mon, state=constants.STATUS_RUNNING)
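After the client-facing and mon pods are re-spun, one way to confirm that the warning actually cleared is to query cluster health from the toolbox pod. A minimal sketch, not part of the original helper; the import path and the exact health-warning string are assumptions:

from ocs_ci.ocs.resources import pod  # assumed import path for the pod helpers used above

# Hedged verification step: once every client has reconnected through the re-spun
# pods, the insecure global_id reclaim warning should no longer appear in health output.
ct_pod = pod.get_ceph_tools_pod()
health = ct_pod.exec_ceph_cmd(ceph_cmd="ceph health detail")
assert "AUTH_INSECURE_GLOBAL_ID_RECLAIM" not in str(health), (
    "The global_id reclaim health warning is still present"
)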
Code Example #3
    def monitor_rebuild(self, mon_map_cmd):
        """
        Rebuilds the monitor

        Args:
            mon_map_cmd (str): mon-store tool command

        """
        logger.info("Re-spinning the mon pods")
        for mon in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
            mon.delete()
        mon_pods = get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        for mon in mon_pods:
            wait_for_resource_state(resource=mon, state=constants.STATUS_RUNNING)
        mon_a = mon_pods[0]
        logger.info(f"Working on monitor: {mon_a.name}")

        logger.info(f"Copying mon-store into monitor: {mon_a.name}")
        self._exec_oc_cmd(f"cp /tmp/monstore {mon_a.name}:/tmp/")

        logger.info("Changing ownership of monstore to ceph")
        _exec_cmd_on_pod(cmd="chown -R ceph:ceph /tmp/monstore", pod_obj=mon_a)
        self.copy_and_import_keys(mon_obj=mon_a)
        logger.info("Creating monitor map")
        _exec_cmd_on_pod(cmd=mon_map_cmd, pod_obj=mon_a)

        rebuild_mon_cmd = "ceph-monstore-tool /tmp/monstore rebuild -- --keyring /tmp/keyring --monmap /tmp/monmap"
        logger.info("Running command to rebuild monitor")
        mon_a.exec_cmd_on_pod(command=rebuild_mon_cmd, out_yaml_format=False)

        logger.info(f"Copying store.db directory from monitor: {mon_a.name}")
        self._exec_oc_cmd(
            f"cp {mon_a.name}:/tmp/monstore/store.db {self.backup_dir}/store.db"
        )

        logger.info("Copying store.db to rest of the monitors")
        for mon in get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE):
            cmd = (
                f"cp {self.backup_dir}/store.db {mon.name}:/var/lib/ceph/mon/ceph-"
                f"{mon.get().get('metadata').get('labels').get('ceph_daemon_id')}/ "
            )
            logger.info(f"Copying store.db to monitor: {mon.name}")
            self._exec_oc_cmd(cmd)
            logger.info("Changing ownership of store.db to ceph:ceph")
            _exec_cmd_on_pod(
                cmd=f"chown -R ceph:ceph /var/lib/ceph/mon/ceph-"
                f"{mon.get().get('metadata').get('labels').get('ceph_daemon_id')}/store.db",
                pod_obj=mon,
            )
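Together with Examples #1 and #17 further down, the rebuild method above can be sketched into an end-to-end recovery flow. This wiring is illustrative only: the MonitorRecovery class name and the exact ordering are assumptions; only the three helpers shown on this page are taken from the examples.

# Hedged end-to-end sketch (class name and ordering are illustrative assumptions):
mon_map_cmd = generate_monmap_cmd()    # Example #17: build the monmaptool command from live mon metadata
corrupt_ceph_monitors()                # Example #1: push every mon into CLBO by removing store.db
recovery = MonitorRecovery()           # assumed helper class exposing monitor_rebuild()
recovery.monitor_rebuild(mon_map_cmd)  # Example #3: rebuild the mon store and copy it to all mons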
Code Example #4
 def set_resource(self, resource):
     self.resource = resource
     if self.resource == 'mgr':
         self.resource_obj = pod.get_mgr_pods()
         self.type = 'rook-ceph'
     if self.resource == 'mon':
         self.resource_obj = pod.get_mon_pods()
         self.type = 'rook-ceph'
     if self.resource == 'osd':
         self.resource_obj = pod.get_osd_pods()
         self.type = 'rook-ceph'
     if self.resource == 'mds':
         self.resource_obj = pod.get_mds_pods()
         self.type = 'rook-ceph'
     if self.resource == 'cephfsplugin':
         self.resource_obj = pod.get_plugin_pods(
             interface=constants.CEPHFILESYSTEM
         )
         self.type = 'csi'
     if self.resource == 'rbdplugin':
         self.resource_obj = pod.get_plugin_pods(
             interface=constants.CEPHBLOCKPOOL
         )
         self.type = 'csi'
     self.resource_count = len(self.resource_obj)
Code Example #5
    def mon_health_check(self, count):
        """
        Mon health check based on pod count

        Args:
            count (int): Expected number of mon pods

        Raises:
            MonCountException: if mon pod count doesn't match
        """
        timeout = 10 * len(self.pods)
        logger.info(f"Expected MONs = {count}")
        try:
            assert self.POD.wait_for_resource(
                condition='Running', selector=self.mon_selector,
                resource_count=count, timeout=timeout, sleep=3,
            )

            # TODO: Workaround for BZ1748325:
            actual_mons = pod.get_mon_pods()
            actual_running_mons = list()
            for mon in actual_mons:
                if mon.ocp.get_resource_status(mon.name) == constant.STATUS_RUNNING:
                    actual_running_mons.append(mon)
            actual = len(actual_running_mons)
            # TODO: End of workaround for BZ1748325

            assert count == actual, f"Expected {count},  Got {actual}"
        except exceptions.TimeoutExpiredError as e:
            logger.error(e)
            raise exceptions.MonCountException(
                f"Failed to achieve desired Mon count"
                f" {count}"
            )
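A hedged usage sketch for the health check above, assuming it is defined on the cluster helper class in cluster.py (referred to here as CephCluster; the class name and import path are assumptions):

from ocs_ci.ocs.cluster import CephCluster  # assumed import path

ceph_cluster = CephCluster()
ceph_cluster.scan_cluster()             # refresh the pod inventory (see Example #6)
ceph_cluster.mon_health_check(count=3)  # raises if three Running mons are not reached in time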
Code Example #6
File: cluster.py  Project: nimrod-becker/ocs-ci
    def scan_cluster(self):
        """
        Get accurate info on current state of pods
        """
        self._ceph_pods = pod.get_all_pods(self._namespace)
        self.mons = pod.get_mon_pods(self.mon_selector, self.namespace)
        self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
        self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
        self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
        self.toolbox = pod.get_ceph_tools_pod()

        # set port attrib on mon pods
        self.mons = list(map(self.set_port, self.mons))
        self.cluster.reload()
        if self.cephfs:
            self.cephfs.reload()
        else:
            try:
                self.cephfs_config = self.CEPHFS.get().get('items')[0]
                self.cephfs = ocs.OCS(**self.cephfs_config)
                self.cephfs.reload()
            except IndexError as e:
                logging.warning(e)
                logging.warning("No CephFS found")

        self.mon_count = len(self.mons)
        self.mds_count = len(self.mdss)
        self.mgr_count = len(self.mgrs)
        self.osd_count = len(self.osds)
Code Example #7
File: cluster.py  Project: nimrod-becker/ocs-ci
    def mon_health_check(self, count):
        """
        Mon health check based on pod count

        Args:
            count (int): Expected number of mon pods

        Raises:
            MonCountException: if mon pod count doesn't match
        """
        timeout = 10 * len(self.pods)
        logger.info(f"Expected MONs = {count}")
        try:
            assert self.POD.wait_for_resource(
                condition='Running',
                selector=self.mon_selector,
                resource_count=count,
                timeout=timeout,
                sleep=3,
            )
            actual = len(pod.get_mon_pods())
            assert count == actual, f"Expected {count},  Got {actual}"
        except exceptions.TimeoutExpiredError as e:
            logger.error(e)
            raise exceptions.MonCountException(
                f"Failed to achieve desired Mon count"
                f" {count}")
Code Example #8
    def delete_pods(self):
        """
        Try to delete pods:
            - Rook operator
            - OSD
            - MGR
            - MON
        """
        pod_list = []
        rook_operator_pod = pod.get_ocs_operator_pod(
            ocs_label=constants.OPERATOR_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        pod_list.append(rook_operator_pod)

        osd_pods = pod.get_osd_pods()
        pod_list.extend(osd_pods)

        mgr_pods = pod.get_mgr_pods()
        pod_list.extend(mgr_pods)

        mon_pods = pod.get_mon_pods()
        pod_list.extend(mon_pods)

        logger.info(f"Deleting pods: {[p.name for p in pod_list]}")
        pod.delete_pods(pod_objs=pod_list)
Code Example #9
    def scan_cluster(self):
        """
        Get accurate info on current state of pods
        """
        self._ceph_pods = pod.get_all_pods(self._namespace)
        # TODO: Workaround for BZ1748325:
        mons = pod.get_mon_pods(self.mon_selector, self.namespace)
        for mon in mons:
            if mon.ocp.get_resource_status(
                    mon.name) == constant.STATUS_RUNNING:
                self.mons.append(mon)
        # TODO: End of workaround for BZ1748325
        self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
        self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
        self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
        self.toolbox = pod.get_ceph_tools_pod()

        # set port attrib on mon pods
        self.mons = list(map(self.set_port, self.mons))
        self.cluster.reload()
        if self.cephfs:
            self.cephfs.reload()
        else:
            try:
                self.cephfs_config = self.CEPHFS.get().get('items')[0]
                self.cephfs = ocs.OCS(**self.cephfs_config)
                self.cephfs.reload()
            except IndexError as e:
                logging.warning(e)
                logging.warning("No CephFS found")

        self.mon_count = len(self.mons)
        self.mds_count = len(self.mdss)
        self.mgr_count = len(self.mgrs)
        self.osd_count = len(self.osds)
Code Example #10
def verify_multus_network():
    """
    Verify Multus network(s) created successfully and are present on relevant pods.
    """
    with open(constants.MULTUS_YAML, mode="r") as f:
        multus_public_data = yaml.safe_load(f)
        multus_namespace = multus_public_data["metadata"]["namespace"]
        multus_name = multus_public_data["metadata"]["name"]
        multus_public_network_name = f"{multus_namespace}/{multus_name}"

    log.info("Verifying multus NetworkAttachmentDefinitions")
    ocp.OCP(
        resource_name=multus_public_network_name,
        kind="network-attachment-definitions",
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    # TODO: also check if private NAD exists

    log.info("Verifying multus public network exists on ceph pods")
    osd_pods = get_osd_pods()
    for _pod in osd_pods:
        assert (_pod.data["metadata"]["annotations"]
                ["k8s.v1.cni.cncf.io/networks"] == multus_public_network_name)
    # TODO: also check private network if it exists on OSD pods

    mon_pods = get_mon_pods()
    mds_pods = get_mds_pods()
    mgr_pods = get_mgr_pods()
    rgw_pods = get_rgw_pods()
    ceph_pods = [*mon_pods, *mds_pods, *mgr_pods, *rgw_pods]
    for _pod in ceph_pods:
        assert (_pod.data["metadata"]["annotations"]
                ["k8s.v1.cni.cncf.io/networks"] == multus_public_network_name)

    log.info("Verifying multus public network exists on CSI pods")
    csi_pods = []
    interfaces = [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]
    for interface in interfaces:
        plugin_pods = get_plugin_pods(interface)
        csi_pods += plugin_pods

    cephfs_provisioner_pods = get_cephfsplugin_provisioner_pods()
    rbd_provisioner_pods = get_rbdfsplugin_provisioner_pods()

    csi_pods += cephfs_provisioner_pods
    csi_pods += rbd_provisioner_pods

    for _pod in csi_pods:
        assert (_pod.data["metadata"]["annotations"]
                ["k8s.v1.cni.cncf.io/networks"] == multus_public_network_name)

    log.info("Verifying StorageCluster multus network data")
    sc = get_storage_cluster()
    sc_data = sc.get().get("items")[0]
    network_data = sc_data["spec"]["network"]
    assert network_data["provider"] == "multus"
    selectors = network_data["selectors"]
    assert selectors[
        "public"] == f"{defaults.ROOK_CLUSTER_NAMESPACE}/ocs-public"
Code Example #11
def validate_mon_pods():
    """
    Checks mon pods are running with retries

    """
    mon_pods = get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
    for mon in mon_pods:
        wait_for_resource_state(resource=mon, state=constants.STATUS_RUNNING)
Code Example #12
 def test_connection_time_out(self):
     """
     Test that a connection from a mon pod to an external domain is blocked and times out
     """
     mon_pod = pod.get_mon_pods()[0]
     with pytest.raises(CommandFailed) as cmdfailed:
         mon_pod.exec_cmd_on_pod("curl google.com")
     assert "Connection timed out" in str(cmdfailed)
Code Example #13
    def setup(self, request, pod_factory):
        """
        Set values for:
          paxos_service_trim_min=10
          paxos_service_trim_max=100
          osd_op_complaint_time=0.000001
        """
        self.fio_pod_obj = pod_factory(constants.CEPHFILESYSTEM)
        mon_pods = get_mon_pods()
        self.selected_mon_pod_obj = random.choice(mon_pods)
        self.selected_mon_pod = (
            self.selected_mon_pod_obj.get().get("metadata").get("labels").get("mon")
        )
        log.info(f"Selected mon pod is: {self.selected_mon_pod_obj.name}")
        log.info(
            "Setting values: paxos_service_trim_min=10, paxos_service_trim_max=100 "
            "and osd_op_complaint_time=0.000001"
        )
        self.ct_pod = pod.get_ceph_tools_pod()
        # mon in the "tell" command should be mon.a / mon.b / mon.c
        self.ct_pod.exec_ceph_cmd(
            ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs --paxos_service_trim_min=10"
        )
        self.ct_pod.exec_ceph_cmd(
            ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs --paxos_service_trim_max=100"
        )
        self.ct_pod.exec_ceph_cmd(
            ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs --osd_op_complaint_time=0.000001"
        )

        def finalizer():
            """
            Set default values for:
              paxos_service_trim_min=250
              paxos_service_trim_max=500
              osd_op_complaint_time=30.000000
            """
            if not self.stop_checking_mon_db:
                self.stop_checking_mon_db = True
            log.info(
                f"Setting default values for paxos_service_trim_min({constants.DEFAULT_PAXOS_SERVICE_TRIM_MIN}), "
                f"paxos_service_trim_max({constants.DEFAULT_PAXOS_SERVICE_TRIM_MAX}) "
                f"and osd_op_complaint_time({constants.DEFAULT_OSD_OP_COMPLAINT_TIME})"
            )
            self.ct_pod.exec_ceph_cmd(
                ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs "
                f"--paxos_service_trim_min={constants.DEFAULT_PAXOS_SERVICE_TRIM_MIN}"
            )
            self.ct_pod.exec_ceph_cmd(
                ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs "
                f"--paxos_service_trim_max={constants.DEFAULT_PAXOS_SERVICE_TRIM_MAX}"
            )
            self.ct_pod.exec_ceph_cmd(
                ceph_cmd=f"ceph tell mon.{self.selected_mon_pod} injectargs "
                f"--osd_op_complaint_time={constants.DEFAULT_OSD_OP_COMPLAINT_TIME}"
            )

        request.addfinalizer(finalizer)
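If desired, the injected values could be read back from the selected mon right after the tell commands, using the same toolbox helper. A hedged sketch that could be appended inside the fixture above; it assumes a Ceph release where `ceph config show` is available and is not part of the original setup:

        # Hedged verification step (assumption, not in the original fixture):
        trim_min = self.ct_pod.exec_cmd_on_pod(
            command=f"ceph config show mon.{self.selected_mon_pod} paxos_service_trim_min",
            out_yaml_format=False,
        )
        log.info(f"paxos_service_trim_min on mon.{self.selected_mon_pod}: {trim_min}")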
Code Example #14
    def set_resource(self, resource, leader_type="provisioner"):
        self.resource = resource
        if (config.ENV_DATA["platform"]
                in constants.MANAGED_SERVICE_PLATFORMS) and (resource
                                                             in CEPH_PODS):
            # If the platform is Managed Services, then the ceph pods will be present in the provider cluster.
            # Consumer cluster will be the primary cluster context in a multicluster run. Setting 'cluster_kubeconfig'
            # attribute to use as the value of the parameter '--kubeconfig' in the 'oc' commands to get ceph pods.
            provider_kubeconfig = os.path.join(
                config.clusters[
                    config.get_provider_index()].ENV_DATA["cluster_path"],
                config.clusters[config.get_provider_index()].RUN.get(
                    "kubeconfig_location"),
            )
            self.cluster_kubeconfig = provider_kubeconfig
        resource_count = 0
        if self.resource == "mgr":
            self.resource_obj = pod.get_mgr_pods()
            self.selector = constants.MGR_APP_LABEL
        if self.resource == "mon":
            self.resource_obj = pod.get_mon_pods()
            self.selector = constants.MON_APP_LABEL
        if self.resource == "osd":
            self.resource_obj = pod.get_osd_pods()
            self.selector = constants.OSD_APP_LABEL
        if self.resource == "mds":
            self.resource_obj = pod.get_mds_pods()
            self.selector = constants.MDS_APP_LABEL
        if self.resource == "cephfsplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHFILESYSTEM)
            self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
        if self.resource == "rbdplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHBLOCKPOOL)
            self.selector = constants.CSI_RBDPLUGIN_LABEL
        if self.resource == "cephfsplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHFILESYSTEM,
                    leader_type=leader_type)
            ]
            self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_cephfsplugin_provisioner_pods())
        if self.resource == "rbdplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHBLOCKPOOL, leader_type=leader_type)
            ]
            self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
        if self.resource == "operator":
            self.resource_obj = pod.get_operator_pods()
            self.selector = constants.OPERATOR_LABEL

        self.resource_count = resource_count or len(self.resource_obj)
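A hedged usage sketch for the helper above. The module path, the Disruptions class name, and the delete_resource() companion method are assumptions based on how such disruption helpers are typically driven; only set_resource() itself is shown in this example:

from ocs_ci.ocs import disruption_helpers  # assumed module path

disruption = disruption_helpers.Disruptions()  # assumed class that defines set_resource()
disruption.set_resource(resource="mon")        # selects the mon pods and the matching app label
disruption.delete_resource()                   # assumed companion method: delete one pod and wait for recovery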
Code Example #15
 def set_resource(self, resource):
     self.resource = resource
     if self.resource == 'mgr':
         self.resource_obj = pod.get_mgr_pods()
     if self.resource == 'mon':
         self.resource_obj = pod.get_mon_pods()
     if self.resource == 'osd':
         self.resource_obj = pod.get_osd_pods()
     if self.resource == 'mds':
         self.resource_obj = pod.get_mds_pods()
     self.resource_count = len(self.resource_obj)
Code Example #16
    def check_mon_pods_eq_3(self):
        """
        Check whether the number of mon pods is different from three

        Returns:
            bool: False if number of mon pods is 3, True otherwise

        """
        mon_pod_list = get_mon_pods()
        if len(mon_pod_list) == 3:
            return False
        else:
            log.info(f"There are {len(mon_pod_list)} mon pods")
            for mon_pod in mon_pod_list:
                log.info(f"{mon_pod.name}")
            return True
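Because the helper returns True while the mon count deviates from three, it pairs naturally with the TimeoutSampler polling pattern used in Example #26 below. A minimal sketch, written as if called from another method of the same class; the TimeoutSampler import path and timeout values are assumptions:

        from ocs_ci.utility.utils import TimeoutSampler  # assumed import path

        sample = TimeoutSampler(timeout=600, sleep=10, func=self.check_mon_pods_eq_3)
        # wait_for_func_status(result=False) returns True once the helper reports
        # False, i.e. exactly three mon pods are present again.
        assert sample.wait_for_func_status(result=False), (
            "Mon pod count did not return to 3 within the timeout"
        )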
Code Example #17
def generate_monmap_cmd():
    """
    Generates monmap-tool command used to rebuild monitors

    Returns:
        str: Monitor map command

    """
    mon_ips_dict = {}
    mon_ids = []
    mon_ips = []

    logger.info("Getting monitor pods public IP")
    mon_pods = get_mon_pods(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
    for mon in mon_pods:
        mon_ids.append(mon.get().get("metadata").get("labels").get("ceph_daemon_id"))
        logger.info(f"getting public ip of {mon.name}")
        logger.info(mon_ids)
        mon_ips.append(
            re.findall(
                r"[0-9]+(?:\.[0-9]+){3}",
                mon.get().get("spec").get("initContainers")[1].get("args")[-2],
            )
        )

    mon_a = mon_pods[0]
    logger.info(f"Working on monitor: {mon_a.name} to get FSID")
    fsid = (
        mon_a.get()
        .get("spec")
        .get("initContainers")[1]
        .get("args")[0]
        .replace("--fsid=", "")
    )

    for ids, ip in zip(mon_ids, mon_ips):
        mon_ips_dict.update({ids: f"{ip}"})

    mon_ip_ids = ""
    for key, val in mon_ips_dict.items():
        mon_ip_ids = mon_ip_ids + f"--add {key} {val}" + " "

    mon_map_cmd = f"monmaptool --create {mon_ip_ids} --enable-all-features --clobber /tmp/monmap --fsid {fsid}"
    logger.info(f"Generated monitor map creation command: {mon_map_cmd}")
    return mon_map_cmd
Code Example #18
def test_pod_log_after_upgrade():
    """
    Check OSD/MON/MGR pod logs after upgrade and verify the expected log exist

    """
    pod_objs = get_osd_pods() + get_mon_pods() + get_mgr_pods()
    pod_names = [osd_pod_obj.name for osd_pod_obj in pod_objs]
    expected_log_after_upgrade = "set uid:gid to 167:167 (ceph:ceph)"
    logging.info(f"Check that the log '{expected_log_after_upgrade}' "
                 f"appears after the osd/mon/mg pod is initialized")
    for pod_name in pod_names:
        pod_logs = get_pod_logs(pod_name=pod_name, all_containers=True)
        assert expected_log_after_upgrade in pod_logs, (
            f"The expected log after upgrade '{expected_log_after_upgrade}' does not exist"
            f" on pod {pod_name}")
    logging.info(
        f"The log '{expected_log_after_upgrade}' appears in all relevant pods."
    )
Code Example #19
File: disruption_helpers.py  Project: yosibsh/ocs-ci
    def set_resource(self, resource, leader_type="provisioner"):
        self.resource = resource
        resource_count = 0
        if self.resource == "mgr":
            self.resource_obj = pod.get_mgr_pods()
            self.selector = constants.MGR_APP_LABEL
        if self.resource == "mon":
            self.resource_obj = pod.get_mon_pods()
            self.selector = constants.MON_APP_LABEL
        if self.resource == "osd":
            self.resource_obj = pod.get_osd_pods()
            self.selector = constants.OSD_APP_LABEL
        if self.resource == "mds":
            self.resource_obj = pod.get_mds_pods()
            self.selector = constants.MDS_APP_LABEL
        if self.resource == "cephfsplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHFILESYSTEM)
            self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
        if self.resource == "rbdplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHBLOCKPOOL)
            self.selector = constants.CSI_RBDPLUGIN_LABEL
        if self.resource == "cephfsplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHFILESYSTEM,
                    leader_type=leader_type)
            ]
            self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_cephfsplugin_provisioner_pods())
        if self.resource == "rbdplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHBLOCKPOOL, leader_type=leader_type)
            ]
            self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
        if self.resource == "operator":
            self.resource_obj = pod.get_operator_pods()
            self.selector = constants.OPERATOR_LABEL

        self.resource_count = resource_count or len(self.resource_obj)
Code Example #20
def get_node_pods_to_scale_down(node_name):
    """
    Get the pods of a node to scale down as described in the documents
    of node replacement with LSO

    Args:
        node_name (str): The node name

    Returns:
        list: The node's pods to scale down

    """
    pods_to_scale_down = [
        *pod.get_mon_pods(),
        *pod.get_osd_pods(),
        *pod.get_mgr_pods(),
    ]

    return get_node_pods(node_name, pods_to_scale_down)
Code Example #21
File: disruption_helpers.py  Project: ekuric/ocs-ci
    def set_resource(self, resource):
        self.resource = resource
        resource_count = 0
        if self.resource == 'mgr':
            self.resource_obj = pod.get_mgr_pods()
            self.selector = constants.MGR_APP_LABEL
        if self.resource == 'mon':
            self.resource_obj = pod.get_mon_pods()
            self.selector = constants.MON_APP_LABEL
        if self.resource == 'osd':
            self.resource_obj = pod.get_osd_pods()
            self.selector = constants.OSD_APP_LABEL
        if self.resource == 'mds':
            self.resource_obj = pod.get_mds_pods()
            self.selector = constants.MDS_APP_LABEL
        if self.resource == 'cephfsplugin':
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHFILESYSTEM)
            self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
        if self.resource == 'rbdplugin':
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHBLOCKPOOL)
            self.selector = constants.CSI_RBDPLUGIN_LABEL
        if self.resource == 'cephfsplugin_provisioner':
            self.resource_obj = [
                pod.plugin_provisioner_leader(
                    interface=constants.CEPHFILESYSTEM)
            ]
            self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_cephfsplugin_provisioner_pods())
        if self.resource == 'rbdplugin_provisioner':
            self.resource_obj = [
                pod.plugin_provisioner_leader(
                    interface=constants.CEPHBLOCKPOOL)
            ]
            self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
        if self.resource == 'operator':
            self.resource_obj = pod.get_operator_pods()
            self.selector = constants.OPERATOR_LABEL

        self.resource_count = resource_count or len(self.resource_obj)
Code Example #22
File: cluster.py  Project: xhtheking/ocs-ci
    def scan_cluster(self):
        """
        Get accurate info on current state of pods
        """
        self._ceph_pods = pod.get_all_pods(self._namespace)
        self.mons = pod.get_mon_pods(self.mon_selector, self.namespace)
        self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
        self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
        self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
        self.toolbox = pod.get_ceph_tools_pod()

        # set port attrib on mon pods
        self.mons = list(map(self.set_port, self.mons))
        self.cluster.reload()
        if self.cephfs_config:
            self.cephfs.reload()

        self.mon_count = len(self.mons)
        self.mds_count = len(self.mdss)
        self.mgr_count = len(self.mgrs)
        self.osd_count = len(self.osds)
Code Example #23
    def teardown(self, request):
        """
        Verifies cluster is healthy
        """
        mon_pod = get_mon_pods()

        def finalizer():

            try:

                # Validate all mon pods are running
                log.info("Validate all mons are up and running")
                POD_OBJ.wait_for_resource(
                    condition=STATUS_RUNNING,
                    selector=MON_APP_LABEL,
                    resource_count=len(mon_pod),
                )
                log.info("All mons are up and running")

            except (TimeoutExpiredError, ResourceWrongStatusException) as ex:
                log.error(f"{ex}")
                # Restart operator
                operator_pod_obj = get_operator_pods()
                delete_pods(pod_objs=operator_pod_obj)

                # Wait until the mon pods recover
                POD_OBJ.wait_for_resource(
                    condition=STATUS_RUNNING,
                    selector=MON_APP_LABEL,
                    resource_count=len(mon_pod),
                    timeout=3600,
                    sleep=5,
                )
                log.info("All mons are up and running")

                # Check the ceph health OK
                ceph_health_check(tries=90, delay=15)

        request.addfinalizer(finalizer)
Code Example #24
    def workloads_dir_setup(self, request):
        """
        Setting up the environment for the test

        """
        if config.DEPLOYMENT.get("local_storage"):
            self.worker_node = node.get_worker_nodes()[0]
            self.oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
            mon_pod_name = self.oc_cmd.exec_oc_debug_cmd(
                node=self.worker_node,
                cmd_list=["ls /var/lib/rook/ | grep mon"],
            )
            mon_pod_id = mon_pod_name.split("-")[1].replace("\n", "")

            mon_pods_info = pod.get_pods_having_label(
                label=f"ceph_daemon_id={mon_pod_id}",
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            )
            self.mon_pod = pod.get_pod_obj(
                name=mon_pods_info[0]["metadata"]["name"],
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            )
        else:
            self.mon_pod = random.choice(pod.get_mon_pods())
        self.mon_suffix = self.mon_pod.get().get("metadata").get("labels").get(
            "mon")

        self.workloads_dir = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}/workloads"
        log.info(f"Selected mon '{self.mon_pod.name}'")
        self.mon_pod.exec_cmd_on_pod(f"mkdir {self.workloads_dir}")
        self.mon_pod.exec_cmd_on_pod(f"touch {self.workloads_dir}/{TEMP_FILE}")

        def finalizer():
            self.mon_pod.exec_cmd_on_pod(f"rm -rf {self.workloads_dir}")
            time.sleep(SLEEP_TIMEOUT)
            utils.ceph_health_check()

        request.addfinalizer(finalizer)
Code Example #25
        def finalizer():
            op_obj = OCP(
                kind=constants.DEPLOYMENT,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            pod_obj = OCP(
                kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            operator_obj = op_obj.get(resource_name=constants.ROOK_CEPH_OPERATOR)
            if operator_obj.get("spec").get("replicas") != 1:
                assert modify_deployment_replica_count(
                    deployment_name=constants.ROOK_CEPH_OPERATOR, replica_count=1
                ), "Failed to scale up rook-ceph-operator to 1"

            log.info("Validate all mons are up and running")
            try:
                pod_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.MON_APP_LABEL,
                    resource_count=3,
                    timeout=60,
                    sleep=5,
                )
            except (TimeoutExpiredError, ResourceWrongStatusException) as ex:
                log.warning(ex)
                op_obj.delete(resource_name=constants.ROOK_CEPH_OPERATOR)
                for pod in get_mon_pods():
                    pod.delete()
                pod_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.MON_APP_LABEL,
                    resource_count=3,
                    timeout=360,
                    sleep=5,
                )
            log.info("All mons are up and running")
Code Example #26
    def test_rook_operator_restart_during_mon_failover(self,
                                                       node_drain_teardown):
        """
        Verify that the number of mon pods stays at three while a node is drained

        """
        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=1,
            max_unavailable_mon=1,
        )
        assert sample.wait_for_func_status(
            result=True
        ), "The expected pdb state is not equal to the actual pdb state"

        log.info("Get worker node name where monitoring pod run")
        mon_pod_objs = get_mon_pods()
        node_name = mon_pod_objs[0].data["spec"]["nodeName"]

        drain_nodes([node_name])

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=0,
            max_unavailable_mon=1,
        )
        assert sample.wait_for_func_status(
            result=True
        ), "The expected pdb state is not equal to the actual pdb state"

        timeout = 1400
        log.info(f"Verify the number of mon pods is 3 for {timeout} seconds")
        sample = TimeoutSampler(timeout=timeout,
                                sleep=10,
                                func=check_number_of_mon_pods)
        assert not sample.wait_for_func_status(
            result=False
        ), "There are more than 3 mon pods."

        log.info("Respin pod rook-ceph operator pod")
        rook_ceph_operator_pod_obj = get_operator_pods()
        rook_ceph_operator_pod_obj[0].delete()

        schedule_nodes([node_name])

        log.info("Wait for all the pods in openshift-storage to be running.")
        assert wait_for_pods_to_be_running(timeout=300)

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=1,
            max_unavailable_mon=1,
        )
        assert sample.wait_for_func_status(
            result=True
        ), "The expected pdb state is not equal to the actual pdb state"

        ceph_health_check()

        assert check_number_of_mon_pods(
        ), "The number of mon pods not equal to 3"
Code Example #27
    def test_del_mon_svc(
        self, multi_pvc_factory, validate_all_mon_svc_are_up_at_teardown
    ):
        """
        Test to verify that the same mons come back up, rejoin the quorum,
        and keep their services after the mon services are deleted manually

        1. Delete the mon services
        2. Restart the rook operator
        3. Make sure all mon pods are running
           and the same services/endpoints are recreated
        4. Make sure ceph health is OK and storage pods are running
        5. Create PVC, which should succeed.

        """

        self.sanity_helpers = Sanity()

        # Get all mon services
        mon_svc_before = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()

        # Delete the mon services one by one
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        mon_svc_ip_before = []
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            mon_svc_ip_before.append(svc["spec"]["clusterIP"])
            log.info(f"Delete mon service {svc_name}")
            svc_obj.delete(resource_name=svc_name)
            # Verify mon services deleted
            svc_obj.wait_for_delete(resource_name=svc_name)

        # Restart the rook-operator pod
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL
        )

        # Verify same mon services are created again
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            svc_obj.check_resource_existence(
                should_exist=True, timeout=300, resource_name=svc_name
            )
        log.info("Same old mon services are recreated")

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=len(mon_pods),
            timeout=600,
            sleep=3,
        )

        # Validate same mon services are running
        log.info("Validate same mon services are running")
        mon_svc_after = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_svc_ip_after = [svc["spec"]["clusterIP"] for svc in mon_svc_after]
        assert len(set(mon_svc_ip_after) ^ set(mon_svc_ip_before)) == 0, (
            "Different mon services are running. "
            f"Before mon services list: {mon_svc_ip_before}, "
            f"After mon services list: {mon_svc_ip_after}"
        )
        log.info("Same old mon services are running and all mons are in running state")

        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Create and delete resources
        self.sanity_helpers.create_pvc_delete(multi_pvc_factory=multi_pvc_factory)
Code Example #28
    def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
        """
        1. Delete one mon service
        2. Edit the configmap rook-ceph-mon-endpoints to
           remove the deleted mon service entries
        3. Delete the deployment and PVC of the deleted mon service
        4. Restart rook-ceph-operator
        5. Make sure all mon pods are running
        6. Make sure ceph health is OK and storage pods are running
        7. Sleep for 300 seconds before deleting another mon
        8. Repeat the above steps for all mons; at the
           end each mon should have a different endpoint
        9. Create PVC, which should succeed.

        """

        pod_obj = pod_factory(interface=interface)
        run_io_in_bg(pod_obj)

        # Get all mon services
        mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()
        mon_count = len(mon_pods)

        list_old_svc = []
        for svc in mon_svc:

            # Get rook-ceph-operator pod obj
            operator_pod_obj = get_operator_pods()
            operator_name = operator_pod_obj[0].name

            # Scale down rook-ceph-operator
            log.info("Scale down rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=0
            ), "Failed to scale down rook-ceph-operator to 0"
            log.info("Successfully scaled down rook-ceph-operator to 0")

            # Validate rook-ceph-operator pod not running
            POD_OBJ.wait_for_delete(resource_name=operator_name)

            svc_name = svc["metadata"]["name"]
            cluster_ip = svc["spec"]["clusterIP"]
            port = svc["spec"]["ports"][0]["port"]
            mon_endpoint = f"{cluster_ip}:{port}"
            mon_id = svc["spec"]["selector"]["mon"]
            list_old_svc.append(cluster_ip)

            # Delete deployment
            log.info("Delete mon deployments")
            del_obj = OCP(
                kind=constants.DEPLOYMENT,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            mon_info = del_obj.get(resource_name=svc_name)
            del_obj.delete(resource_name=svc_name)

            # Delete pvc
            if is_lso_cluster():
                mon_data_path = f"/var/lib/rook/mon-{mon_id}"
                mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                    "kubernetes.io/hostname"
                ]
                log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
                cmd = f"rm -rf {mon_data_path}"
                ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
                ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
            else:
                log.info("Delete mon PVC")
                pvc_name = svc["metadata"]["labels"]["pvc_name"]
                pvc_obj = OCP(
                    kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
                )
                pvc_obj.delete(resource_name=pvc_name)

            # Delete the mon service
            log.info("Delete mon service")
            svc_obj = OCP(
                kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            svc_obj.delete(resource_name=svc_name)

            # Edit the cm
            log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
            configmap_obj = OCP(
                kind=constants.CONFIGMAP,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            output_get = configmap_obj.get(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
            )
            new_data = output_get["data"]
            new_data["csi-cluster-config-json"] = (
                new_data["csi-cluster-config-json"].replace(f'"{mon_endpoint}",', "")
                if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != 1
                else new_data["csi-cluster-config-json"].replace(
                    f',"{mon_endpoint}"', ""
                )
            )
            new_data["data"] = ",".join(
                [
                    value
                    for value in new_data["data"].split(",")
                    if f"{mon_id}=" not in value
                ]
            )
            new_data["mapping"] = (
                new_data["mapping"].replace(f'"{mon_id}":null,', "")
                if new_data["mapping"].find(f'"{mon_id}":null,') != -1
                else new_data["mapping"].replace(f',"{mon_id}":null', "")
            )
            params = f'{{"data": {json.dumps(new_data)}}}'
            log.info(f"Removing {mon_id} entries from configmap")
            configmap_obj.patch(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
                params=params,
                format_type="strategic",
            )
            log.info(
                f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
            )

            # Scale up rook-ceph-operator
            log.info("Scale up rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=1
            ), "Failed to scale up rook-ceph-operator to 1"
            log.info("Successfully scaled up rook-ceph-operator to 1")
            log.info("Validate rook-ceph-operator pod is running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.OPERATOR_LABEL,
                resource_count=1,
                timeout=600,
                sleep=5,
            )

            # Validate all mons are running
            log.info("Validate all mons are up and running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MON_APP_LABEL,
                resource_count=mon_count,
                timeout=1200,
                sleep=5,
            )
            log.info("All mons are up and running")

            # Check the ceph health OK
            ceph_health_check(tries=90, delay=15)

            # Validate all storage pods are running
            wait_for_storage_pods()

            # Sleep for some seconds before deleting another mon
            sleep_time = 300
            log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
            time.sleep(sleep_time)

        # Check the endpoints are different
        log.info("Validate the mon endpoints are changed")
        new_mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        list_new_svc = []
        for new_svc in new_mon_svc:
            cluster_ip = new_svc["spec"]["clusterIP"]
            list_new_svc.append(cluster_ip)
        diff = set(list_new_svc) ^ set(list_old_svc)
        assert len(diff) == len(list_old_svc + list_new_svc), (
            f"Not all endpoints are changed. Set of old "
            f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
        )
        log.info(f"All new mon endpoints are created {list_new_svc}")

        # Create PVC and pods
        log.info(f"Create {interface} PVC")
        pod_obj = pod_factory(interface=interface)
        pod_obj.run_io(storage_type="fs", size="500M")
Code Example #29
    def test_check_pods_status_after_node_failure(self, nodes,
                                                  node_restart_teardown):
        """
        Test that checks pod statuses after a node failure event.
        All the rook ceph pods should be in "Running" or "Completed"
        state after a node failure event.

        """
        ocs_nodes = get_ocs_nodes()
        if not ocs_nodes:
            pytest.skip("We don't have ocs nodes in the cluster")

        ocs_node = random.choice(ocs_nodes)
        node_name = ocs_node.name
        log.info(f"Selected node is '{node_name}'")
        # Save the rook ceph pods, the osd ids, and the mon ids before shutting down the node
        rook_ceph_pod_names_not_in_node = get_rook_ceph_pod_names_not_in_node(
            node_name)
        node_osd_ids = get_node_osd_ids(node_name)
        node_mon_ids = get_node_mon_ids(node_name)

        log.info(f"Shutting down node '{node_name}'")
        nodes.stop_nodes([ocs_node])
        wait_for_nodes_status(node_names=[node_name],
                              status=constants.NODE_NOT_READY)
        log.info(
            f"The node '{node_name}' reached '{constants.NODE_NOT_READY}' status"
        )

        log.info("Wait for a change in the rook ceph pod statuses...")
        timeout = 480
        is_rook_ceph_pods_status_changed = wait_for_change_in_rook_ceph_pods(
            node_name, timeout=timeout)
        assert (
            is_rook_ceph_pods_status_changed
        ), f"Rook Ceph pods status didn't change after {timeout} seconds"

        log.info(
            "Check the rook ceph pods are in 'Running' or 'Completed' state")
        timeout = 480
        are_pods_running = wait_for_pods_to_be_running(
            pod_names=rook_ceph_pod_names_not_in_node,
            timeout=timeout,
            sleep=30)
        assert are_pods_running, f"The pods are not 'Running' after {timeout} seconds"

        # Get the rook ceph pods, excluding the osd and mon pods that have the old node ids
        osd_pods = get_osd_pods()
        new_node_osd_id_names_set = {
            p.name
            for p in osd_pods if get_osd_pod_id(p) in node_osd_ids
        }
        mon_pods = get_mon_pods()
        new_node_mon_id_names_set = {
            p.name
            for p in mon_pods if get_mon_pod_id(p) in node_mon_ids
        }

        new_node_osd_mon_id_names_set = new_node_osd_id_names_set.union(
            new_node_mon_id_names_set)
        rook_ceph_pod_names_set = set(get_rook_ceph_pod_names())
        new_rook_ceph_pod_names = list(rook_ceph_pod_names_set -
                                       new_node_osd_mon_id_names_set)

        log.info(
            "Verify that the new rook ceph pods are in 'Running' or 'Completed' state"
        )
        timeout = 300
        are_new_pods_running = wait_for_pods_to_be_running(
            pod_names=new_rook_ceph_pod_names, timeout=timeout, sleep=20)
        assert (are_new_pods_running
                ), f"The new pods are not 'Running' after {timeout} seconds"

        log.info("All the pods are in 'Running' or 'Completed' state")
        log.info(f"Starting the node '{node_name}' again...")
        nodes.start_nodes(nodes=[ocs_node])
        wait_for_nodes_status(node_names=[node_name])

        log.info(
            "Waiting for all the pods to be running and cluster health to be OK..."
        )
        wait_for_pods_to_be_running(timeout=600)
        self.sanity_helpers.health_check(tries=40)
Code Example #30
    def test_multiple_mon_pod_stays_on_same_node(self):
        """
        A test case to verify that multiple mon pods stay on the same node

        1. Edit the rook-ceph-mon-endpoints configmap
           say, assign mon-a to another node that would be on
           the same node as another mon (compute-1 instead of compute-0)
        2. Delete the mon-a deployment
        3. Edit the mon-b deployment to remove the required mon anti-affinity
        4. Restart the operator
        5. Edit the mon-a deployment to remove the required mon anti-affinity
        6. See mon-a start on compute-1 with mon-b
        7. Soon after, see the operator failover one of these mons onto the
        node that doesn't currently have a mon (compute-0) and start mon-d

        """
        ocs_version = config.ENV_DATA["ocs_version"]
        # Check that we have LSO cluster and OCS version is 4.8 and below
        # This is a workaround due to issue https://github.com/red-hat-storage/ocs-ci/issues/4937
        if not (is_lso_cluster()
                and Version.coerce(ocs_version) <= Version.coerce("4.8")):
            pytest.skip(
                "Skip the test because Rook only assigns mons to specific nodes on "
                "LSO based clusters. Also, currently we want to run the test only with OCS 4.8 and "
                "below. This is a workaround due to issue "
                "https://github.com/red-hat-storage/ocs-ci/issues/4937")
        # Initialize
        rook_ceph_mon = "rook-ceph-mon"

        # Get mons running on pod
        mon_pods = get_mon_pods()
        mon_name_to_del = mon_pods[0].get().get("metadata").get("labels").get(
            "mon")
        mon_name_to_edit = mon_pods[1].get().get("metadata").get("labels").get(
            "mon")
        mon_node = get_pod_node(mon_pods[1])

        # Edit the rook-ceph-mon-endpoints
        log.info(f"Edit the configmap {ROOK_CEPH_MON_ENDPOINTS}")
        configmap_obj = OCP(kind=CONFIGMAP,
                            namespace=OPENSHIFT_STORAGE_NAMESPACE)
        rook_ceph_mon_configmap = configmap_obj.get(
            resource_name=ROOK_CEPH_MON_ENDPOINTS)
        json_val = json.loads(rook_ceph_mon_configmap["data"]["mapping"])
        json_val["node"][mon_name_to_del].update(
            json_val["node"][mon_name_to_edit])
        rook_ceph_mon_configmap["data"]["mapping"] = json.dumps(json_val)
        new_data = rook_ceph_mon_configmap["data"]
        params = f'{{"data": {json.dumps(new_data)}}}'
        configmap_obj.patch(
            resource_name=ROOK_CEPH_MON_ENDPOINTS,
            params=params,
            format_type="strategic",
        )
        log.info(f"Configmap {ROOK_CEPH_MON_ENDPOINTS} edited successfully")
        log.info(
            f"Rook-ceph-mon-endpoints updated configmap: {rook_ceph_mon_configmap}"
        )

        # Delete one mon deployment which had been edited
        dep_obj = OCP(kind=DEPLOYMENT, namespace=OPENSHIFT_STORAGE_NAMESPACE)
        mon_deployment_name_to_del = f"{rook_ceph_mon}-{mon_name_to_del}"
        log.info(f"Deleting mon {mon_deployment_name_to_del} deployments")
        dep_obj.delete(resource_name=mon_deployment_name_to_del)

        # Edit other mon deployment to remove mon anti-affinity
        mon_deployment_name_to_edit = f"{rook_ceph_mon}-{mon_name_to_edit}"
        log.info(f"Edit mon {mon_deployment_name_to_edit} deployment "
                 "to remove the required mon anti-affinity")
        params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
        dep_obj.patch(resource_name=mon_deployment_name_to_edit,
                      params=params,
                      format_type="json")
        log.info(
            f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_edit}"
        )

        # Restart operator
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)
        POD_OBJ.wait_for_resource(condition=STATUS_RUNNING,
                                  selector=OPERATOR_LABEL)

        # Validate that the mon from the deleted deployment came up and is in Pending state
        # Initially the mon is stuck in Pending state until the defined anti-affinity is removed
        POD_OBJ.wait_for_resource(
            condition=STATUS_PENDING,
            resource_count=1,
            selector=MON_APP_LABEL,
            timeout=1200,
        )
        # Edit mon deployment to remove mon anti-affinity
        log.info(f"Edit mon {mon_deployment_name_to_del} deployment "
                 "to remove the required mon anti-affinity")
        params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
        dep_obj.patch(resource_name=mon_deployment_name_to_del,
                      params=params,
                      format_type="json")
        log.info(
            f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_del}"
        )

        # Validate mon pod moved to another node such that 2 mons are running on same node
        log.info("Waiting for 5 seconds for mon recovery")
        time.sleep(5)
        new_mon_pods = get_mon_pods()
        new_node = [
            get_pod_node(mon) for mon in new_mon_pods if mon.get().get(
                "metadata").get("labels").get("mon") == mon_name_to_del
        ]
        assert (
            new_node[0].name == mon_node.name
        ), f"Mon moved to node {mon_node} such that 2 mons are running on same node"

        # Verify rook deletes one of the mon and move to another node
        timeout = 60
        log.info(f"Waiting for {timeout} seconds for mon recovery")
        time.sleep(timeout)

        POD_OBJ.wait_for_resource(
            condition=STATUS_RUNNING,
            resource_count=len(mon_pods),
            selector=MON_APP_LABEL,
            timeout=3600,
            sleep=5,
        )
        log.info(
            "Mons are up and running state and validate are running on different nodes"
        )
        mon_pods_running_on_same_node()