Example #1
    def _guster_pod_delete_cleanup(self, g_pod_list_before):
        """Cleanup for deletion of gluster pod using force delete"""
        # Switch to gluster project
        openshift_ops.switch_oc_project(self._master,
                                        self._registry_project_name)
        try:
            # Fetch gluster pod after delete
            pod_name = self._get_newly_deployed_gluster_pod(g_pod_list_before)

            # If pod name is empty (no new pod came up), use the old pod
            openshift_ops.wait_for_pod_be_ready(
                self._master,
                pod_name[0] if pod_name else g_pod_list_before[0],
                timeout=1)
        except exceptions.ExecutionError:
            # Force delete and wait for new pod to come up
            openshift_ops.oc_delete(self._master,
                                    'pod',
                                    g_pod_list_before[0],
                                    is_force=True)
            openshift_ops.wait_for_resource_absence(self._master, 'pod',
                                                    g_pod_list_before[0])

            # Fetch gluster pod after force delete
            g_new_pod = self._get_newly_deployed_gluster_pod(g_pod_list_before)
            openshift_ops.wait_for_pod_be_ready(self._master, g_new_pod[0])
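
The helper _get_newly_deployed_gluster_pod used above is not shown in these examples. A minimal sketch of what it might look like, assuming it simply diffs the current gluster pod list against the snapshot taken before the delete (get_ocp_gluster_pod_details and its "pod_name" key are taken from the other examples on this page):

    def _get_newly_deployed_gluster_pod(self, g_pod_list_before):
        """Return gluster pods not present before the delete (sketch)."""
        # Fetch the current gluster pod names in the registry project
        g_pod_list_after = [
            pod["pod_name"]
            for pod in openshift_ops.get_ocp_gluster_pod_details(self._master)]

        # Newly deployed pods are the ones absent from the "before" snapshot
        return list(set(g_pod_list_after) - set(g_pod_list_before))
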
    def _perform_io_and_fetch_metrics(self, pod_name, pvc_name, filename,
                                      dirname, metric_data, operation):
        """Create 1000 files and dirs and validate with old metrics"""
        openshift_ops.switch_oc_project(self._master,
                                        self.storage_project_name)
        if operation == "create":
            cmds = ("touch /mnt/{}{{1..1000}}".format(filename),
                    "mkdir /mnt/{}{{1..1000}}".format(dirname))
        else:
            cmds = ("rm -rf /mnt/large_file",
                    "rm -rf /mnt/{}{{1..1000}}".format(filename),
                    "rm -rf /mnt/{}{{1..1000}}".format(dirname))
        for cmd in cmds:
            self.cmd_run("oc rsh {} {}".format(pod_name, cmd))

        # Fetch the new metrics and compare the inodes used and bytes used
        for w in waiter.Waiter(120, 10):
            after_io_metrics = self._get_and_manipulate_metric_data(
                self.metrics, pvc_name)
            if operation == "create":
                if (int(after_io_metrics['kubelet_volume_stats_inodes_used']) >
                        int(metric_data['kubelet_volume_stats_inodes_used'])
                        and
                        int(after_io_metrics['kubelet_volume_stats_used_bytes']
                            ) >
                        int(metric_data['kubelet_volume_stats_used_bytes'])):
                    break
            else:
                if int(metric_data['kubelet_volume_stats_used_bytes']) > int(
                        after_io_metrics['kubelet_volume_stats_used_bytes']):
                    break
        if w.expired:
            raise AssertionError(
                "After data is modified metrics like bytes used and inodes "
                "used are not reflected in prometheus")
    def _check_heketi_and_gluster_pod_after_node_reboot(self, heketi_node):
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        heketi_pod = openshift_ops.get_pod_names_from_dc(
            self._master, self.heketi_dc_name)[0]

        # Wait for heketi pod to become ready and running
        openshift_ops.wait_for_pod_be_ready(self._master, heketi_pod)
        heketi_ops.hello_heketi(self._master, self.heketi_server_url)

        # Wait for glusterfs pods to become ready if hosted on same node
        heketi_node_ip = openshift_ops.oc_get_custom_resource(
            self._master, 'pod', '.:status.hostIP', heketi_pod)[0]
        if heketi_node_ip in self.gluster_servers:
            gluster_pod = openshift_ops.get_gluster_pod_name_for_specific_node(
                self._master, heketi_node)

            # Wait for glusterfs pod to become ready
            openshift_ops.wait_for_pod_be_ready(self._master, gluster_pod)
            services = (
                ("glusterd", "running"), ("gluster-blockd", "running"),
                ("tcmu-runner", "running"), ("gluster-block-target", "exited"))
            for service, state in services:
                openshift_ops.check_service_status_on_pod(
                    self._master, gluster_pod, service, "active", state)
Example #4
    def setUp(self):
        """Initialize all the variables necessary for test cases."""
        super(TestMetricsAndGlusterRegistryValidation, self).setUp()

        try:
            metrics_config = g.config['openshift']['metrics']
            self.metrics_project_name = metrics_config['metrics_project_name']
            self.metrics_rc_hawkular_cassandra = (
                metrics_config['metrics_rc_hawkular_cassandra'])
            self.metrics_rc_hawkular_metrics = (
                metrics_config['metrics_rc_hawkular_metrics'])
            self.metrics_rc_heapster = metrics_config['metrics_rc_heapster']
            self.registry_heketi_server_url = (
                g.config['openshift']['registry_heketi_config']
                ['heketi_server_url'])
            self.registry_project_name = (
                g.config['openshift']['registry_project_name'])
            self.registry_servers_info = g.config['gluster_registry_servers']
        except KeyError as err:
            msg = "Config file doesn't have key {}".format(err)
            g.log.error(msg)
            self.skipTest(msg)

        self.master = self.ocp_master_node[0]
        cmd = "oc project --short=true"
        current_project = command.cmd_run(cmd, self.master)
        switch_oc_project(self.master, self.metrics_project_name)
        self.addCleanup(switch_oc_project, self.master, current_project)
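
For reference, the g.config keys consumed by this setUp correspond to a configuration layout roughly like the following. The key names come from the code above; every value is a placeholder assumption and will differ per deployment:

# Assumed shape of the relevant g.config entries; all values are placeholders
example_config = {
    'openshift': {
        'metrics': {
            'metrics_project_name': 'openshift-infra',
            'metrics_rc_hawkular_cassandra': 'hawkular-cassandra-1',
            'metrics_rc_hawkular_metrics': 'hawkular-metrics',
            'metrics_rc_heapster': 'heapster',
        },
        'registry_heketi_config': {
            'heketi_server_url': 'http://heketi-registry.example.com:8080',
        },
        'registry_project_name': 'default',
    },
    'gluster_registry_servers': {
        # gluster node name -> connection details
    },
}
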
Example #5
    def test_kill_bhv_fsd_while_es_pod_running(self):
        """Validate killing of bhv fsd won't effect es pod io's"""

        # Fetch pod and PVC names and validate iscsi and multipath
        es_pod, pvc_name = self._get_es_pod_and_verify_iscsi_sessions()

        # Get the bhv name
        gluster_node = list(self._registry_servers_info.keys())[0]
        openshift_ops.switch_oc_project(self._master,
                                        self._registry_project_name)
        bhv_name = self.get_block_hosting_volume_by_pvc_name(
            pvc_name,
            heketi_server_url=self._registry_heketi_server_url,
            gluster_node=gluster_node)

        # Get the pid of one of the bricks of the bhv
        gluster_volume_status = gluster_ops.get_gluster_vol_status(bhv_name)
        pid = None
        for g_node, g_node_data in gluster_volume_status.items():
            if g_node != gluster_node:
                continue
            for process_name, process_data in g_node_data.items():
                if not process_name.startswith("/var"):
                    continue
                pid = process_data["pid"]
                # When a brick is down, the pid of the brick is returned as
                # -1, which is unexpected. So, add an appropriate assertion.
                self.assertNotEqual(
                    pid, "-1", "Got unexpected PID (-1) for '{}' gluster vol "
                    "on '{}' node.".format(bhv_name, gluster_node))
                break
            self.assertTrue(
                pid, "Could not find 'pid' in Gluster vol data for '{}' "
                "Gluster node. Data: {}".format(gluster_node,
                                                gluster_volume_status))
            break

        # Kill gluster vol brick process using found pid
        cmd_kill = "kill -9 {}".format(pid)
        cmd_start_vol = "gluster v start {} force".format(bhv_name)
        openshift_ops.cmd_run_on_gluster_pod_or_node(self._master, cmd_kill,
                                                     gluster_node)
        self.addCleanup(openshift_ops.cmd_run_on_gluster_pod_or_node,
                        self._master, cmd_start_vol, gluster_node)
        self.addCleanup(openshift_ops.switch_oc_project, self._master,
                        self._registry_project_name)

        # Run I/O on ES pod
        openshift_ops.switch_oc_project(self._master,
                                        self._logging_project_name)
        file_name = '/elasticsearch/persistent/file1'
        cmd_run_io = 'dd if=/dev/urandom of={} bs=4k count=10000'.format(
            file_name)
        cmd_remove_file = 'rm {}'.format(file_name)
        openshift_ops.oc_rsh(self._master, es_pod, cmd_run_io)
        self.addCleanup(openshift_ops.oc_rsh, self._master, es_pod,
                        cmd_remove_file)
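
The brick PID lookup above assumes that gluster_ops.get_gluster_vol_status returns a nested dict keyed first by gluster node and then by process name, where brick entries are keyed by their brick path (starting with "/var") and carry a "pid" field. An illustrative, assumed example of that structure; the address, path and PIDs below are placeholders:

# Assumed shape of get_gluster_vol_status(bhv_name); values are placeholders
example_vol_status = {
    '10.70.46.11': {
        '/var/lib/heketi/mounts/vg_x/brick_y/brick': {'pid': '12345'},
        'Self-heal Daemon on localhost': {'pid': '6789'},
    },
}
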
Example #6
    def test_prometheus_volume_metrics_on_pod_restart(self):
        """Validate volume metrics using prometheus before and after pod
        restart"""

        # Create PVC and wait for it to be in 'Bound' state
        pvc_name = self.create_and_wait_for_pvc()
        pod_name = openshift_ops.oc_create_tiny_pod_with_volume(
            self._master, pvc_name, "autotest-volume",
            image=self.io_container_image_cirros)
        self.addCleanup(openshift_ops.oc_delete, self._master, 'pod', pod_name,
                        raise_on_absence=False)

        # Wait for the pod to be up and running
        openshift_ops.wait_for_pod_be_ready(
            self._master, pod_name, timeout=60, wait_step=2)

        # Write data on the volume, then wait for 2 minutes; the sleep is
        # required for prometheus to report the exact values of the metrics
        self._run_io_on_the_pod(pod_name, 30)
        time.sleep(120)

        # Fetch the metrics and store them in the initial_metrics dict
        initial_metrics = self._get_and_manipulate_metric_data(
            self.metrics, pvc_name)

        # Mark the node on which the app pod is running as unschedulable
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        pod_info = openshift_ops.oc_get_pods(self._master, name=pod_name)
        openshift_ops.oc_adm_manage_node(
            self._master, '--schedulable=false',
            nodes=[pod_info[pod_name]["node"]])
        self.addCleanup(
            openshift_ops.oc_adm_manage_node, self._master,
            '--schedulable=true', nodes=[pod_info[pod_name]["node"]])

        # Delete the existing pod and create a new pod
        openshift_ops.oc_delete(self._master, 'pod', pod_name)
        pod_name = openshift_ops.oc_create_tiny_pod_with_volume(
            self._master, pvc_name, "autotest-volume")
        self.addCleanup(openshift_ops.oc_delete, self._master, 'pod', pod_name)

        # Wait for the pod to be running and prometheus to refresh the data
        openshift_ops.wait_for_pod_be_ready(
            self._master, pod_name, timeout=60, wait_step=2)
        time.sleep(120)

        # Fetch the metrics, store them in the final_metrics dict and
        # validate against initial_metrics
        final_metrics = self._get_and_manipulate_metric_data(
            self.metrics, pvc_name)
        self.assertEqual(dict(initial_metrics), dict(final_metrics),
                         "Metrics are different post pod restart")
    def _guster_volume_cleanup(self, vol_name):
        # Check brick status. Restart vol if bricks are offline
        openshift_ops.switch_oc_project(self._master,
                                        self._registry_project_name)
        brick_list = brick_libs.get_all_bricks("auto_get_gluster_endpoint",
                                               vol_name)
        self.assertIsNotNone(brick_list, "Failed to get brick list")
        check_bricks = brick_libs.are_bricks_online(
            "auto_get_gluster_endpoint", vol_name, brick_list)
        if not check_bricks:
            start_vol, _, _ = volume_ops.volume_start(
                "auto_get_gluster_endpoint", vol_name, force=True)
            self.assertFalse(start_vol, "Failed to start volume using force")
    def _delete_and_wait_for_new_es_pod_to_come_up(self):

        # Force delete and wait for es pod to come up
        openshift_ops.switch_oc_project(
            self._master, self._logging_project_name)
        pod_name = openshift_ops.get_pod_name_from_dc(
            self._master, self._logging_es_dc)
        openshift_ops.oc_delete(self._master, 'pod', pod_name, is_force=True)
        openshift_ops.wait_for_resource_absence(self._master, 'pod', pod_name)
        new_pod_name = openshift_ops.get_pod_name_from_dc(
            self._master, self._logging_es_dc)
        openshift_ops.wait_for_pod_be_ready(
            self._master, new_pod_name, timeout=1800)
    def _wait_for_gluster_pod_be_ready(self, g_pod_list_before):
        """Wait for the gluster pods to be in ready state"""
        openshift_ops.switch_oc_project(
            self._master, self._registry_project_name)

        # Check if the gluster pods are in ready state
        try:
            pod_count = len(self._registry_servers_info.keys())
            openshift_ops.wait_for_pods_be_ready(
                self._master, pod_count, "glusterfs-node=pod",
                timeout=120, wait_step=6)
        except exceptions.ExecutionError:
            self._guster_pod_delete(g_pod_list_before)
    def test_block_provisioner_on_multiple_clusters(self):
        """Check block provisioner and verify on multiple clusters
        """
        # Skip test if registry project is not present
        self.registry_sc = self.storage_classes.get(
            'registry_block_storage_class')
        if not self.registry_sc:
            self.skipTest("Config file doesn't have key "
                          "openshift.dynamic_provisioning.storage_classes")

        self._registry_heketi_server_url = self.registry_sc.get('resturl')
        self._registry_project_name = self.registry_sc.get(
            'restsecretnamespace')
        if not (self._registry_heketi_server_url
                and self._registry_project_name):
            self.skipTest(
                "Config file doesn't have key"
                "'storage_classes.registry_block_storage_class.resturl' or "
                "'storage_classes.registry_block_storage_class"
                ".restsecretnamespace'")

        size = 1
        prefix = 'autotest-pvc-{}'.format(utils.get_random_str(size=5))

        # Create PVC in default namespace and verify multipath
        pvc_name = self.create_and_wait_for_pvc(pvc_size=size,
                                                pvc_name_prefix=prefix)
        _, pod_name = self.create_dc_with_pvc(pvc_name)
        match_pvc_and_pv(self.node, prefix)
        self.verify_iscsi_sessions_and_multipath(pvc_name,
                                                 pod_name,
                                                 rtype='pod')

        # Change project namespace
        self.addCleanup(switch_oc_project, self.node,
                        self.storage_project_name)
        switch_oc_project(self.node, self._registry_project_name)

        # Create PVC in registry namespace and verify multipath
        self.sc_name = self.create_storage_class(glusterfs_registry=True)
        pvc_name = self.create_and_wait_for_pvc(sc_name=self.sc_name,
                                                pvc_size=size,
                                                pvc_name_prefix=prefix)
        _, pod_name = self.create_dc_with_pvc(pvc_name)
        self.verify_iscsi_sessions_and_multipath(
            pvc_name,
            pod_name,
            rtype='pod',
            heketi_server_url=self._registry_heketi_server_url,
            is_registry_gluster=True)
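
The registry storage class lookup at the start of this test expects a config entry with at least resturl and restsecretnamespace. An illustrative, assumed example of that entry; both values are placeholders:

# Assumed shape of storage_classes['registry_block_storage_class'];
# both values are placeholders
example_registry_sc = {
    'resturl': 'http://heketi-registry.example.com:8080',
    'restsecretnamespace': 'default',
}
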
    def setUpClass(cls):
        """Initialize all the variables necessary for test cases."""
        super(BaseClass, cls).setUpClass()

        # Initializes OCP config variables
        cls.ocp_servers_info = g.config['ocp_servers']
        cls.ocp_master_node = list(g.config['ocp_servers']['master'].keys())
        cls.ocp_master_node_info = g.config['ocp_servers']['master']
        cls.ocp_client = list(g.config['ocp_servers']['client'].keys())
        cls.ocp_client_info = g.config['ocp_servers']['client']
        cls.ocp_nodes = list(g.config['ocp_servers']['nodes'].keys())
        cls.ocp_nodes_info = g.config['ocp_servers']['nodes']

        # Initializes storage project config variables
        openshift_config = g.config.get("cns", g.config.get("openshift"))
        cls.storage_project_name = openshift_config.get(
            'storage_project_name',
            openshift_config.get('setup', {}).get('cns_project_name'))

        # Initializes heketi config variables
        heketi_config = openshift_config['heketi_config']
        cls.heketi_dc_name = heketi_config['heketi_dc_name']
        cls.heketi_service_name = heketi_config['heketi_service_name']
        cls.heketi_client_node = heketi_config['heketi_client_node']
        cls.heketi_server_url = heketi_config['heketi_server_url']
        cls.heketi_cli_user = heketi_config['heketi_cli_user']
        cls.heketi_cli_key = heketi_config['heketi_cli_key']

        cls.gluster_servers = list(g.config['gluster_servers'].keys())
        cls.gluster_servers_info = g.config['gluster_servers']

        cls.storage_classes = openshift_config['dynamic_provisioning'][
            'storage_classes']
        cls.sc = cls.storage_classes.get(
            'storage_class1', cls.storage_classes.get('file_storage_class'))
        cmd = "echo -n %s | base64" % cls.heketi_cli_key
        ret, out, err = g.run(cls.ocp_master_node[0], cmd, "root")
        if ret != 0:
            raise ExecutionError("failed to execute cmd %s on %s out: %s "
                                 "err: %s" % (
                                     cmd, cls.ocp_master_node[0], out, err))
        cls.secret_data_key = out.strip()

        # Checks if heketi server is alive
        if not hello_heketi(cls.heketi_client_node, cls.heketi_server_url):
            raise ConfigError("Heketi server %s is not alive"
                              % cls.heketi_server_url)

        # Switch to the storage project
        if not switch_oc_project(
                cls.ocp_master_node[0], cls.storage_project_name):
            raise ExecutionError("Failed to switch oc project on node %s"
                                 % cls.ocp_master_node[0])

        if 'glustotest_run_id' not in g.config:
            g.config['glustotest_run_id'] = (
                datetime.datetime.now().strftime('%H_%M_%d_%m_%Y'))
        cls.glustotest_run_id = g.config['glustotest_run_id']
        msg = "Setupclass: %s : %s" % (cls.__name__, cls.glustotest_run_id)
        g.log.info(msg)
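
The secret data key above is produced by running "echo -n <key> | base64" on the master node. A purely local equivalent using the standard library, shown only as a sketch of what that remote command computes:

import base64

def encode_heketi_cli_key(heketi_cli_key):
    """Return the base64-encoded secret data key (local equivalent, sketch)."""
    return base64.b64encode(heketi_cli_key.encode('utf-8')).decode('ascii')
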
Example #13
    def test_metrics_cassandra_pod_with_bhv_brick_process_down(self):
        """Validate metrics during restart of brick process of  bhv"""

        # Validate iscsi and multipath
        gluster_node = list(self.registry_servers_info.keys())[0]
        hawkular_cassandra, pvc_name, _, _, _ = (
            self.verify_cassandra_pod_multipath_and_iscsi())
        switch_oc_project(self.master, self.registry_project_name)

        # Kill the brick process and force restart the volume
        bhv_name = self.get_block_hosting_volume_by_pvc_name(
            pvc_name, heketi_server_url=self.registry_heketi_server_url,
            gluster_node=gluster_node, ocp_client_node=self.master)
        restart_gluster_vol_brick_processes(
            self.master, bhv_name, list(self.registry_servers_info.keys()))
        self.addCleanup(self.cassandra_pod_delete_cleanup, raise_on_error=True)
    def test_prometheus_basic_validation(self):
        """ Validate basic volume metrics using prometheus """

        # Fetch the metrics and store them in the initial_metrics dict
        pvc_name, pod_name, initial_metrics = self._fetch_initial_metrics(
            volume_expansion=False)

        # Create 1000 files and fetch metrics to confirm the data is updated
        self._perform_io_and_fetch_metrics(pod_name=pod_name,
                                           pvc_name=pvc_name,
                                           filename="filename1",
                                           dirname="dirname1",
                                           metric_data=initial_metrics,
                                           operation="create")

        # Write I/O of half the size of the volume and validate from the
        # prometheus pod that the size change is reflected
        size_to_write = int(
            initial_metrics['kubelet_volume_stats_capacity_bytes']) // 2
        openshift_ops.switch_oc_project(self._master,
                                        self.storage_project_name)
        cmd = ("dd if=/dev/urandom of=/mnt/large_file bs={} count=1024".format(
            size_to_write // 1024))
        ret, _, err = openshift_ops.oc_rsh(self._master, pod_name, cmd)
        self.assertFalse(ret, 'Failed to write file due to err {}'.format(err))

        # Fetch the metrics and validate that the data change is reflected
        for w in waiter.Waiter(120, 10):
            half_io_metrics = self._get_and_manipulate_metric_data(
                ['kubelet_volume_stats_used_bytes'], pvc_name)
            if bool(half_io_metrics) and (int(
                    half_io_metrics['kubelet_volume_stats_used_bytes']) >
                                          size_to_write):
                break
        if w.expired:
            raise AssertionError(
                "After Data is written on the pvc, metrics like inodes used "
                "and bytes used are not reflected in the prometheus")

        # Delete the files from the volume and wait for the
        # updated details reflected in prometheus
        self._perform_io_and_fetch_metrics(pod_name=pod_name,
                                           pvc_name=pvc_name,
                                           filename="filename1",
                                           dirname="dirname1",
                                           metric_data=half_io_metrics,
                                           operation="delete")
Example #15
    def test_resping_gluster_pod(self):
        """Validate gluster pod restart with no disruption to elasticsearch pod
        """
        restart_custom = ":status.containerStatuses[0].restartCount"

        # Fetch pod and validate iscsi and multipath
        es_pod, _ = self._get_es_pod_and_verify_iscsi_sessions()

        # Fetch the restart count for the es pod
        restart_count_before = openshift_ops.oc_get_custom_resource(
            self._master, "pod", restart_custom, es_pod)[0]

        # Switch to gluster project
        openshift_ops.switch_oc_project(self._master,
                                        self._registry_project_name)

        # Fetch the gluster pod list before
        g_pod_list_before = [
            pod["pod_name"]
            for pod in openshift_ops.get_ocp_gluster_pod_details(self._master)
        ]

        # Respin a gluster pod
        openshift_ops.oc_delete(self._master, "pod", g_pod_list_before[0])
        self.addCleanup(self._guster_pod_delete_cleanup, g_pod_list_before)

        # Wait for pod to get absent
        openshift_ops.wait_for_resource_absence(self._master, "pod",
                                                g_pod_list_before[0])

        # Fetch gluster pod after delete
        g_new_pod = self._get_newly_deployed_gluster_pod(g_pod_list_before)
        openshift_ops.wait_for_pod_be_ready(self._master, g_new_pod[0])

        # Switch to logging project
        openshift_ops.switch_oc_project(self._master,
                                        self._logging_project_name)

        # Fetch the restart count for the es pod
        restart_count_after = openshift_ops.oc_get_custom_resource(
            self._master, "pod", restart_custom, es_pod)[0]
        self.assertEqual(
            restart_count_before, restart_count_after,
            "Failed disruption to es pod found expecting restart count before"
            " {} and after {} for es pod to be equal after gluster pod"
            " respin".format(restart_count_before, restart_count_after))
Example #16
    def cassandra_pod_delete_cleanup(self, raise_on_error=False):
        """Cleanup for deletion of cassandra pod using force delete"""
        switch_oc_project(self.master, self.metrics_project_name)
        try:
            # Check if pod is up or ready
            pod_name = get_pod_name_from_rc(
                self.master, self.metrics_rc_hawkular_cassandra)
            wait_for_pod_be_ready(self.master, pod_name, timeout=1)
        except exceptions.ExecutionError as err:
            # Force delete and wait for new pod to come up
            oc_delete(self.master, 'pod', pod_name, is_force=True)
            wait_for_resource_absence(self.master, 'pod', pod_name)
            new_pod_name = get_pod_name_from_rc(
                self.master, self.metrics_rc_hawkular_cassandra)
            wait_for_pod_be_ready(self.master, new_pod_name)
            if raise_on_error:
                raise err
Example #17
    def _get_and_manipulate_metric_data(self, metrics, pvc):
        """Create a dict of metric names and total values"""

        # Switch to namespace containing prometheus pods
        openshift_ops.switch_oc_project(self._master,
                                        self._prometheus_project_name)
        self.addCleanup(openshift_ops.switch_oc_project,
                        self._master, self.storage_project_name)

        metric_data = dict()
        for metric in metrics:
            out = self._fetch_metric_from_promtheus_pod(metric)
            for metric_result in out:
                if metric_result["metric"]["persistentvolumeclaim"] == pvc:
                    metric_data[metric_result["metric"][
                        "__name__"]] = metric_result["value"][1]
        return metric_data
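
_fetch_metric_from_promtheus_pod is used throughout these examples but not shown. Judging by how its output is indexed (result["metric"]["persistentvolumeclaim"], result["value"][1]), it returns the "result" list of a Prometheus instant-query response. A hedged sketch of one way such a helper could be written, assuming curl is available inside the prometheus pod and that the Prometheus HTTP API listens on localhost:9090 (both are assumptions; the oc_get_pods and oc_rsh calls follow the other examples here):

    def _fetch_metric_from_promtheus_pod(self, metric):
        """Query prometheus and return the instant-query result list (sketch)."""
        # json is a standard-library module, assumed imported at module level
        import json

        prometheus_pod = list(openshift_ops.oc_get_pods(
            self._master,
            selector=self._prometheus_resources_selector).keys())[0]
        # Assumption: the in-pod API endpoint is http://localhost:9090
        cmd = "curl -s 'http://localhost:9090/api/v1/query?query={}'".format(
            metric)
        ret, out, err = openshift_ops.oc_rsh(self._master, prometheus_pod, cmd)
        if ret:
            raise exceptions.ExecutionError(
                "Failed to fetch metric {}: {}".format(metric, err))
        return json.loads(out)["data"]["result"]
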
    def setUp(self):
        """Initialize all the variables which are necessary for test cases"""
        super(TestPrometheusAndGlusterRegistryValidation, self).setUp()

        try:
            prometheus_config = g.config['openshift']['prometheus']
            self._prometheus_project_name = prometheus_config[
                'prometheus_project_name']
            self._prometheus_resources_selector = prometheus_config[
                'prometheus_resources_selector']
            self._alertmanager_resources_selector = prometheus_config[
                'alertmanager_resources_selector']
            self._registry_heketi_server_url = (
                g.config['openshift']['registry_heketi_config'][
                    'heketi_server_url'])
            self._registry_project_name = (
                g.config['openshift']['registry_project_name'])
            self._registry_servers_info = (
                g.config['gluster_registry_servers'])
        except KeyError as err:
            self.skipTest("Config file doesn't have key {}".format(err))

        # Skip the test if the iscsi-initiator-utils version is not the expected one
        cmd = ("rpm -q iscsi-initiator-utils "
               "--queryformat '%{version}-%{release}\n'"
               "| cut -d '.' -f 1,2,3,4")
        e_pkg_version = "6.2.0.874-17"
        for g_server in self.gluster_servers:
            out = self.cmd_run(cmd, g_server)
            if parse_version(out) < parse_version(e_pkg_version):
                self.skipTest(
                    "Skip the test as iscsi-initiator-utils package version {}"
                    "is less than version {} found on the node {}, for more "
                    "info refer to BZ-1624670".format(
                        out, e_pkg_version, g_server))

        self._master = self.ocp_master_node[0]

        # Switch to namespace containing prometheus pods
        cmd = "oc project --short=true"
        current_project = command.cmd_run(cmd, self._master)
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)
        self.addCleanup(
            openshift_ops.switch_oc_project, self._master, current_project)
Example #19
    def setUp(self):
        """Initialize all the variables necessary for test cases."""
        super(TestLoggingAndGlusterRegistryValidation, self).setUp()

        try:
            logging_config = g.config['openshift']['logging']
            self._logging_project_name = logging_config['logging_project_name']
            self._logging_fluentd_ds = logging_config['logging_fluentd_ds']
            self._logging_es_dc = logging_config['logging_es_dc']
            self._logging_kibana_dc = logging_config['logging_kibana_dc']
            self._registry_heketi_server_url = (
                g.config['openshift']['registry_heketi_config']
                ['heketi_server_url'])
            self._registry_project_name = (
                g.config['openshift']['registry_project_name'])
            self._registry_servers_info = g.config['gluster_registry_servers']
        except KeyError as err:
            msg = "Config file doesn't have key {}".format(err)
            g.log.error(msg)
            self.skipTest(msg)

        # Skip the test if the iscsi-initiator-utils version is not the expected one
        cmd = ("rpm -q iscsi-initiator-utils "
               "--queryformat '%{version}-%{release}\n'"
               "| cut -d '.' -f 1,2,3,4")
        e_pkg_version = "6.2.0.874-17"
        for g_server in self.gluster_servers:
            out = self.cmd_run(cmd, g_server)
            if parse_version(out) < parse_version(e_pkg_version):
                msg = ("Skip test since isci initiator utils version actual: "
                       "{out} is less than expected: {ver} on node {server},"
                       " for more info refer to BZ-1624670".format(
                           out=out, ver=e_pkg_version, server=g_server))
                g.log.error(msg)
                self.skipTest(msg)

        self._master = self.ocp_master_node[0]
        cmd = "oc project --short=true"
        current_project = command.cmd_run(cmd, self._master)
        openshift_ops.switch_oc_project(self._master,
                                        self._logging_project_name)
        self.addCleanup(openshift_ops.switch_oc_project, self._master,
                        current_project)
    def _guster_pod_delete(self, g_pod_list_before):
        """Delete the gluster pod using force delete"""
        openshift_ops.switch_oc_project(
            self._master, self._registry_project_name)

        # Fetch newly deployed gluster pod after delete
        try:
            pod_name = self._get_newly_deployed_gluster_pod(g_pod_list_before)
            openshift_ops.wait_for_pod_be_ready(
                self._master,
                pod_name[0] if pod_name else g_pod_list_before[0],
                timeout=120, wait_step=6)
        except exceptions.ExecutionError:
            openshift_ops.oc_delete(
                self._master, 'pod', g_pod_list_before[0], is_force=True)
            openshift_ops.wait_for_resource_absence(
                self._master, 'pod', g_pod_list_before[0])
            g_new_pod = self._get_newly_deployed_gluster_pod(g_pod_list_before)
            openshift_ops.wait_for_pod_be_ready(self._master, g_new_pod[0])
Example #21
    def test_run_workload_with_logging(self):
        """Validate logs are being generated aifter running workload"""

        # Get the size of used space of logs
        es_pod = openshift_ops.get_pod_name_from_dc(
            self._master, self._logging_es_dc)
        mount_point = "/elasticsearch/persistent"
        cmd_space_check = ('df -kh --output=used {} | sed "/Used/d" |'
                           'sed "s/G//"'.format(mount_point))
        ret, initial_used_percent, err = openshift_ops.oc_rsh(
            self._master, es_pod, cmd_space_check)
        err_msg = "Failed to fetch the size of used space, error {}"
        self.assertFalse(ret, err_msg.format(err))

        # Create 20 PVCs and app pods with I/O
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        pvc_count, batch_count = 5, 4
        for _ in range(batch_count):
            pvcs = self.create_and_wait_for_pvcs(pvc_amount=pvc_count)
            self.create_dcs_with_pvc(pvcs)
        self.addCleanup(
            openshift_ops.switch_oc_project,
            self._master, self.storage_project_name)

        # Get and verify the final used space of logs
        openshift_ops.switch_oc_project(
            self._master, self._logging_project_name)
        for w in waiter.Waiter(600, 30):
            ret, final_used_percent, err = openshift_ops.oc_rsh(
                self._master, es_pod, cmd_space_check)
            self.assertFalse(ret, err_msg.format(err))
            if int(initial_used_percent) < int(final_used_percent):
                break
        if w.expired:
            raise AssertionError(
                "Initial used space {} for logs is not less than final "
                "used space {}".format(
                    initial_used_percent, final_used_percent))
Example #22
    def test_verify_metrics_data_during_gluster_pod_respin(self):
        # Add check for CRS version
        switch_oc_project(self.master, self.registry_project_name)
        if not self.is_containerized_gluster():
            self.skipTest("Skipping this test case as CRS version check "
                          "can not be implemented")

        # Verify multipath and iscsi for cassandra pod
        switch_oc_project(self.master, self.metrics_project_name)
        hawkular_cassandra, pvc_name, iqn, _, node = (
            self.verify_cassandra_pod_multipath_and_iscsi())

        # Get the ip of active path
        device_and_ip = get_iscsi_block_devices_by_path(node, iqn)
        mpath = get_mpath_name_from_device_name(node,
                                                list(device_and_ip.keys())[0])
        active_passive_dict = get_active_and_enabled_devices_from_mpath(
            node, mpath)
        node_ip = device_and_ip[active_passive_dict['active'][0]]

        # Get the name of gluster pod from the ip
        switch_oc_project(self.master, self.registry_project_name)
        gluster_pods = get_ocp_gluster_pod_details(self.master)
        pod_name = list(
            filter(lambda pod: (pod["pod_host_ip"] == node_ip),
                   gluster_pods))[0]["pod_name"]
        err_msg = "Failed to get the gluster pod name {} with active path"
        self.assertTrue(pod_name, err_msg.format(pod_name))

        # Delete the pod
        oc_delete(self.master, 'pod', pod_name)
        wait_for_resource_absence(self.master, 'pod', pod_name)

        # Wait for new pod to come up
        pod_count = len(self.registry_servers_info.keys())
        selector = "glusterfs-node=pod"
        wait_for_pods_be_ready(self.master, pod_count, selector)

        # Validate cassandra pod state, multipath and iscsi
        switch_oc_project(self.master, self.metrics_project_name)
        wait_for_pod_be_ready(self.master, hawkular_cassandra, timeout=2)
        self.verify_iscsi_sessions_and_multipath(
            pvc_name,
            self.metrics_rc_hawkular_cassandra,
            rtype='rc',
            heketi_server_url=self.registry_heketi_server_url,
            is_registry_gluster=True)
Example #23
    def test_run_workload_with_metrics(self):
        """Validate if logs are being generated after running workload"""

        # Get the size of used space of the cassandra data
        cassandra_pod = get_pod_name_from_rc(
            self.master, self.metrics_rc_hawkular_cassandra)
        mount_point = "/cassandra_data"
        cmd_space_check = ('df -k --output=used {} | sed "/Used/d" |'
                           'sed "s/G//"'.format(mount_point))
        ret, initial_used_percent, err = oc_rsh(self.master, cassandra_pod,
                                                cmd_space_check)
        err_msg = "Failed to fetch the size of used space, error {}"
        self.assertFalse(ret, err_msg.format(err))

        # Create 20 PVCs and app pods with IO
        switch_oc_project(self.master, self.storage_project_name)
        pvc_count, batch_count = 5, 4
        for _ in range(batch_count):
            pvcs = self.create_and_wait_for_pvcs(pvc_amount=pvc_count)
            self.create_dcs_with_pvc(pvcs)
        self.addCleanup(switch_oc_project, self.master,
                        self.storage_project_name)

        # Get and verify the final size of used space of the cassandra data
        switch_oc_project(self.master, self.metrics_project_name)
        for w in waiter.Waiter(600, 30):
            ret, final_used_percent, err = oc_rsh(self.master, cassandra_pod,
                                                  cmd_space_check)
            self.assertFalse(ret, err_msg.format(err))
            if int(initial_used_percent) < int(final_used_percent):
                break
        if w.expired:
            raise AssertionError(
                "Initial used space {} for logs is not less than final "
                "used space {}".format(initial_used_percent,
                                       final_used_percent))
    def test_heketi_prometheus_usedbytes_brickcount_on_device_delete(
            self, operation):
        """Validate used bytes,device count on heketi and prometheus"""
        h_node, h_server = self.heketi_client_node, self.heketi_server_url

        # Get list of additional devices for one of the Gluster nodes
        gluster_server_0 = list(self.gluster_servers_info.values())[0]
        manage_hostname = gluster_server_0.get("manage")
        self.assertTrue(
            manage_hostname, "IP Address is not specified for "
                             "node {}".format(gluster_server_0))
        device_name = gluster_server_0.get("additional_devices")[0]
        self.assertTrue(
            device_name, "Additional devices are not specified for "
                         "node {}".format(gluster_server_0))

        # Get node ID of the Gluster hostname
        node_list = heketi_ops.heketi_topology_info(
            h_node, h_server, json=True).get("clusters")[0].get("nodes")
        self.assertTrue(
            node_list, "Cluster info command returned empty list of nodes")
        node_id = [
            node.get("id")
            for node in node_list
            if manage_hostname == node.get("hostnames").get("manage")[0]]
        self.assertTrue(
            node_id, "Failed to get node_id for {}".format(manage_hostname))
        node_id = node_id[0]

        # Adding heketi device
        heketi_ops.heketi_device_add(h_node, h_server, device_name, node_id)
        node_info_after_addition = heketi_ops.heketi_node_info(
            h_node, h_server, node_id, json=True)
        device_id, bricks = None, None
        for device in node_info_after_addition.get("devices"):
            if device.get("name") == device_name:
                device_id, bricks = (
                    device.get("id"), len(device.get("bricks")))
                break

        # Verify zero bricks on the device
        msg = (
            "Number of bricks on the device {} of the nodes should be"
            "zero".format(device_name))
        self.assertFalse(bricks, msg)
        self.addCleanup(
            heketi_ops.heketi_device_delete, h_node, h_server, device_id,
            raise_on_error=False)
        self.addCleanup(
            heketi_ops.heketi_device_remove, h_node, h_server, device_id,
            raise_on_error=False)
        self.addCleanup(
            heketi_ops.heketi_device_disable, h_node, h_server, device_id,
            raise_on_error=False)

        # Disable, remove and delete the heketi device
        heketi_ops.heketi_device_disable(h_node, h_server, device_id)
        heketi_ops.heketi_device_remove(h_node, h_server, device_id)
        heketi_ops.heketi_device_delete(h_node, h_server, device_id)

        # Verify device deletion
        node_info_after_deletion = (
            heketi_ops.heketi_node_info(h_node, h_server, node_id))
        msg = ("Device {} should not be shown in node info of the node {}"
               "after the device deletion".format(device_id, node_id))
        self.assertNotIn(device_id, node_info_after_deletion, msg)

        if operation == "usedbytes":
            # Validate heketi and prometheus device used bytes
            for w in waiter.Waiter(timeout=60, interval=10):
                device_used_bytes_prometheus = 0
                device_used_bytes_metrics = 0
                openshift_ops.switch_oc_project(
                    self.ocp_master_node[0], 'openshift-monitoring')
                metric_result = self._fetch_metric_from_promtheus_pod(
                    metric='heketi_device_used_bytes')
                for result in metric_result:
                    if (node_id == result.get('cluster')
                            and device_name == result.get('device')):
                        device_used_bytes_prometheus += (
                            int(result.get('value')[1]))
                openshift_ops.switch_oc_project(
                    self.ocp_master_node[0], 'glusterfs')
                metrics = heketi_ops.get_heketi_metrics(h_node, h_server)
                heketi_device_count_metric = (
                    metrics.get('heketi_device_used_bytes'))
                for result in heketi_device_count_metric:
                    if (node_id == result.get('cluster')
                            and device_name == result.get('device')):
                        device_used_bytes_metrics = int(result.get('value'))
                if device_used_bytes_prometheus == device_used_bytes_metrics:
                    break
            if w.expired:
                raise exceptions.ExecutionError(
                    "Failed to update device details in prometheus")

        elif operation == "brickcount":
            # Validate heketi and prometheus device brick count
            for w in waiter.Waiter(timeout=60, interval=10):
                device_brick_count_prometheus = 0
                device_brick_count_metrics = 0
                metrics = heketi_ops.get_heketi_metrics(h_node, h_server)
                heketi_device_count_metric = metrics.get(
                    'heketi_device_brick_count')
                for result in heketi_device_count_metric:
                    device_brick_count_metrics += int(result.get('value'))
                openshift_ops.switch_oc_project(
                    self.ocp_master_node[0], 'openshift-monitoring')
                metric_result = self._fetch_metric_from_promtheus_pod(
                    metric='heketi_device_brick_count')
                for result in metric_result:
                    device_brick_count_prometheus += (
                        int(result.get('value')[1]))
                if device_brick_count_prometheus == device_brick_count_metrics:
                    break
            if w.expired:
                raise exceptions.ExecutionError(
                    "Failed to update device details in prometheus")
    def test_prometheus_volume_metrics_on_node_reboot(self):
        """Validate volume metrics using prometheus before and after node
        reboot"""

        # Pod name prefix for the entire test
        prefix = "autotest-{}".format(utils.get_random_str())

        # Create I/O pod with PVC
        pvc_name = self.create_and_wait_for_pvc()
        pod_name = openshift_ops.oc_create_tiny_pod_with_volume(
            self._master, pvc_name, prefix,
            image=self.io_container_image_cirros)
        self.addCleanup(openshift_ops.oc_delete, self._master, 'pod', pod_name,
                        raise_on_absence=False)
        openshift_ops.wait_for_pod_be_ready(
            self._master, pod_name, timeout=60, wait_step=5)

        # Write data on the volume, then wait for 2 minutes; the sleep is
        # required for prometheus to report the exact values of the metrics
        ret, _, err = openshift_ops.oc_rsh(
            self._master, pod_name, "touch /mnt/file{1..1000}")
        self.assertEqual(
            ret, 0, "Failed to create files in the app pod "
                    "with {}".format(err))
        time.sleep(120)

        # Fetch the metrics and store in initial_metrics as dictionary
        initial_metrics = self._get_and_manipulate_metric_data(
            self.metrics, pvc_name)
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)

        # Get the hostname to reboot where the pod is running
        pod_info = openshift_ops.oc_get_pods(self._master, name=pod_name)
        node_for_reboot = pod_info[pod_name]['node']

        # Get the vm name by the hostname
        vm_name = node_ops.find_vm_name_by_ip_or_hostname(node_for_reboot)

        # Power the vm off and on, based on the vm type (gluster node or not)
        if node_for_reboot in self.gluster_servers:
            self.power_off_gluster_node_vm(vm_name, node_for_reboot)
            self.power_on_gluster_node_vm(vm_name, node_for_reboot)
        else:
            self.power_off_vm(vm_name)
            self.power_on_vm(vm_name)
            openshift_ops.wait_for_ocp_node_be_ready(
                self._master, node_for_reboot)

        # Create the new pod and validate the prometheus metrics
        pod_name = openshift_ops.oc_create_tiny_pod_with_volume(
            self._master, pvc_name, prefix)
        self.addCleanup(openshift_ops.oc_delete, self._master, 'pod', pod_name)

        # Wait for the pod to be running and prometheus to refresh the data
        openshift_ops.wait_for_pod_be_ready(
            self._master, pod_name, timeout=60, wait_step=5)
        time.sleep(120)

        # Fetch the metrics, store them in the final_metrics dict and
        # validate against initial_metrics
        final_metrics = self._get_and_manipulate_metric_data(
            self.metrics, pvc_name)
        self.assertEqual(dict(initial_metrics), dict(final_metrics),
                         "Metrics are different post node reboot")
Example #26
    def test_prometheus_pv_resize(self):
        """ Validate prometheus metrics with pv resize"""

        # Fetch the metrics and store them in the initial_metrics dict
        pvc_name, pod_name, initial_metrics = self._fetch_initial_metrics(
            vol_name_prefix="for-pv-resize", volume_expansion=True)

        # Write data on the pvc and confirm it is reflected in the prometheus
        self._perform_io_and_fetch_metrics(
            pod_name=pod_name, pvc_name=pvc_name,
            filename="filename1", dirname="dirname1",
            metric_data=initial_metrics, operation="create")

        # Resize the pvc to 2GiB
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        pvc_size = 2
        openshift_ops.resize_pvc(self._master, pvc_name, pvc_size)
        openshift_ops.wait_for_events(self._master, obj_name=pvc_name,
                                      event_reason='VolumeResizeSuccessful')
        openshift_ops.verify_pvc_size(self._master, pvc_name, pvc_size)
        pv_name = openshift_ops.get_pv_name_from_pvc(
            self._master, pvc_name)
        openshift_ops.verify_pv_size(self._master, pv_name, pvc_size)

        heketi_volume_name = heketi_ops.heketi_volume_list_by_name_prefix(
            self.heketi_client_node, self.heketi_server_url,
            "for-pv-resize", json=True)[0][2]
        self.assertIsNotNone(
            heketi_volume_name, "Failed to fetch volume with prefix {}".
            format("for-pv-resize"))

        openshift_ops.oc_delete(self._master, 'pod', pod_name)
        openshift_ops.wait_for_resource_absence(self._master, 'pod', pod_name)
        pod_name = openshift_ops.get_pod_name_from_dc(
            self._master, self.dc_name)
        openshift_ops.wait_for_pod_be_ready(self._master, pod_name)

        # Check whether the metrics are updated or not
        for w in waiter.Waiter(120, 10):
            resize_metrics = self._get_and_manipulate_metric_data(
                self.metrics, pvc_name)
            if bool(resize_metrics) and int(resize_metrics[
                'kubelet_volume_stats_capacity_bytes']) > int(
                    initial_metrics['kubelet_volume_stats_capacity_bytes']):
                break
        if w.expired:
            raise AssertionError("Failed to reflect PVC Size after resizing")
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        time.sleep(240)

        # Look up and trigger rebalance and wait for its completion
        for _ in range(100):
            self.cmd_run("oc rsh {} ls /mnt/".format(pod_name))
        self._rebalance_completion(heketi_volume_name)

        # Write data on the resized pvc and compare with resize_metrics
        self._perform_io_and_fetch_metrics(
            pod_name=pod_name, pvc_name=pvc_name,
            filename="secondfilename", dirname="seconddirname",
            metric_data=resize_metrics, operation="create")
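
_rebalance_completion is called above but not shown. A sketch of one way to wait for the rebalance to finish, assuming the standard "gluster volume rebalance <vol> status" CLI is available on a gluster node and that cmd_run_on_gluster_pod_or_node returns the command's output (both assumptions); the Waiter pattern is reused from the other examples:

    def _rebalance_completion(self, volume_name):
        """Wait for rebalance of the given volume to complete (sketch)."""
        g_node = self.gluster_servers[0]
        for w in waiter.Waiter(timeout=300, interval=10):
            # Assumption: the helper returns the command's stdout
            out = openshift_ops.cmd_run_on_gluster_pod_or_node(
                self._master,
                "gluster volume rebalance {} status".format(volume_name),
                g_node)
            if "completed" in out:
                break
        if w.expired:
            raise AssertionError(
                "Rebalance of volume {} did not complete in "
                "time".format(volume_name))
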
    def test_prometheous_kill_bhv_brick_process(self):
        """Validate kill brick process of block hosting
        volume with prometheus workload running"""

        # Add check for CRS version
        openshift_ops.switch_oc_project(
            self._master, self._registry_project_name)
        if not self.is_containerized_gluster():
            self.skipTest("Skipping this test case as CRS"
                          " version check can not be implemented")

        # Get one of the prometheus pod name and respective pvc name
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)
        prometheus_pods = openshift_ops.oc_get_pods(
            self._master, selector=self._prometheus_resources_selector)
        if not prometheus_pods:
            self.skipTest(
                "Skipping the test as the prometheus pod is not present")

        # Validate iscsi and multipath
        prometheus_pod = list(prometheus_pods.keys())[0]
        pvc_name = openshift_ops.oc_get_custom_resource(
            self._master, "pod",
            ":.spec.volumes[*].persistentVolumeClaim.claimName",
            prometheus_pod)
        self.assertTrue(pvc_name, "Failed to get PVC name")
        pvc_name = pvc_name[0]
        self.verify_iscsi_sessions_and_multipath(
            pvc_name, prometheus_pod, rtype='pod',
            heketi_server_url=self._registry_heketi_server_url,
            is_registry_gluster=True)

        # Try to fetch metric from prometheus pod
        self._fetch_metric_from_promtheus_pod(
            metric='heketi_device_brick_count')

        # Kill the brick process of a BHV
        gluster_node = list(self._registry_servers_info.keys())[0]
        openshift_ops.switch_oc_project(
            self._master, self._registry_project_name)
        bhv_name = self.get_block_hosting_volume_by_pvc_name(
            pvc_name, heketi_server_url=self._registry_heketi_server_url,
            gluster_node=gluster_node, ocp_client_node=self._master)
        vol_status = gluster_ops.get_gluster_vol_status(bhv_name)
        gluster_node_ip, brick_pid = None, None
        for g_node, g_node_data in vol_status.items():
            for process_name, process_data in g_node_data.items():
                if process_name.startswith("/var"):
                    gluster_node_ip = g_node
                    brick_pid = process_data["pid"]
                    break
            if gluster_node_ip and brick_pid:
                break
        self.assertIsNotNone(brick_pid, "Could not find pid for brick")
        cmd = "kill -9 {}".format(brick_pid)
        openshift_ops.cmd_run_on_gluster_pod_or_node(
            self._master, cmd, gluster_node_ip)
        self.addCleanup(self._guster_volume_cleanup, bhv_name)

        # Check if the brick-process has been killed
        killed_pid_cmd = (
            "ps -p {} -o pid --no-headers".format(brick_pid))
        try:
            openshift_ops.cmd_run_on_gluster_pod_or_node(
                self._master, killed_pid_cmd, gluster_node_ip)
        except exceptions.ExecutionError:
            g.log.info("Brick process {} was killed"
                       "successfully".format(brick_pid))

        # Try to fetch metric from prometheus pod
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)
        self._fetch_metric_from_promtheus_pod(
            metric='heketi_device_brick_count')

        # Start the bhv using force
        openshift_ops.switch_oc_project(
            self._master, self._registry_project_name)
        start_vol, _, _ = volume_ops.volume_start(
            gluster_node_ip, bhv_name, force=True)
        self.assertFalse(
            start_vol, "Failed to start volume {}"
            " using force".format(bhv_name))

        # Validate iscsi and multipath
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)
        self.verify_iscsi_sessions_and_multipath(
            pvc_name, prometheus_pod, rtype='pod',
            heketi_server_url=self._registry_heketi_server_url,
            is_registry_gluster=True)

        # Try to fetch metric from prometheus pod
        self._fetch_metric_from_promtheus_pod(
            metric='heketi_device_brick_count')
    def test_heketi_metrics_validation_with_node_reboot(self):
        """Validate heketi metrics after node reboot using prometheus"""

        initial_metrics, final_metrics = {}, {}

        # Use storage project
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)

        # Get initial metrics result
        h_node, h_server = self.heketi_client_node, self.heketi_server_url
        initial_metrics = tuple(
            heketi_ops.get_heketi_metrics(h_node, h_server).get(metric)[0]
            for metric in self.metrics)

        # Use prometheus project
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)

        # Get initial prometheus result
        initial_prometheus = self._get_and_manipulate_metric_data(
            self.metrics)

        # Get hosted node IP of heketi pod
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        heketi_pod = openshift_ops.get_pod_name_from_dc(
            self._master, self.heketi_dc_name)
        heketi_node = openshift_ops.oc_get_custom_resource(
            self._master, 'pod', ':.spec.nodeName', heketi_pod)[0]

        # Reboot the node on which heketi pod is scheduled
        self.addCleanup(
            self._check_heketi_and_gluster_pod_after_node_reboot, heketi_node)
        node_ops.node_reboot_by_command(heketi_node)

        # Wait for the node to become NotReady
        custom = r'":.status.conditions[?(@.type==\"Ready\")]".status'
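        # The custom column extracts the status of the node's "Ready"
        # condition ("True"/"False"/"Unknown")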
        for w in waiter.Waiter(300, 10):
            status = openshift_ops.oc_get_custom_resource(
                self._master, 'node', custom, heketi_node)
            if status[0] == 'False':
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to bring down node {}".format(heketi_node))

        # Wait for node to become ready
        openshift_ops.wait_for_ocp_node_be_ready(self._master, heketi_node)

        # Wait for heketi and glusterfs pod to become ready
        self._check_heketi_and_gluster_pod_after_node_reboot(heketi_node)

        # Use prometheus project
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)

        # Get final metrics result
        final_metrics = tuple(
            heketi_ops.get_heketi_metrics(h_node, h_server).get(metric)[0]
            for metric in self.metrics)

        # Get final prometheus result
        final_prometheus = self._get_and_manipulate_metric_data(
            self.metrics)

        err_msg = "Initial value {} is not the same as final value {}"
        self.assertEqual(
            initial_metrics, final_metrics, err_msg.format(
                initial_metrics, final_metrics))
        self.assertEqual(
            initial_prometheus, final_prometheus, err_msg.format(
                initial_prometheus, final_prometheus))
    def test_heketi_metrics_validation_after_node(self, condition):
        """Validate heketi metrics after adding and removing a node"""

        # Get additional node
        additional_host_info = g.config.get("additional_gluster_servers")
        if not additional_host_info:
            self.skipTest(
                "Skipping this test case as an additional gluster server is "
                "not provided in the config file")

        additional_host_info = list(additional_host_info.values())[0]
        storage_hostname = additional_host_info.get("manage")
        storage_ip = additional_host_info.get("storage")
        if not (storage_hostname and storage_ip):
            self.skipTest(
                "Config options 'additional_gluster_servers.manage' "
                "and 'additional_gluster_servers.storage' must be set.")

        h_client, h_server = self.heketi_client_node, self.heketi_server_url
        initial_node_count, final_node_count = 0, 0

        # Get initial node count from prometheus metrics
        metric_result = self._fetch_metric_from_promtheus_pod(
            metric='heketi_nodes_count')
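        # heketi_nodes_count may be reported as several series (e.g. one per
        # cluster), so combine the returned sample values into one figure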
        initial_node_count = reduce(
            lambda x, y: x + y,
            [result.get('value')[1] for result in metric_result])

        # Switch to storage project
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)

        # Configure node before adding node
        self.configure_node_to_run_gluster(storage_hostname)

        # Get cluster list
        cluster_info = heketi_ops.heketi_cluster_list(
            h_client, h_server, json=True)

        # Add node to the cluster
        heketi_node_info = heketi_ops.heketi_node_add(
            h_client, h_server,
            len(self.gluster_servers), cluster_info.get('clusters')[0],
            storage_hostname, storage_ip, json=True)
        heketi_node_id = heketi_node_info.get("id")
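        # Cleanups run in LIFO order: switch back to the storage project
        # first, then disable, remove and finally delete the added node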
        self.addCleanup(
            heketi_ops.heketi_node_delete,
            h_client, h_server, heketi_node_id, raise_on_error=False)
        self.addCleanup(
            heketi_ops.heketi_node_remove,
            h_client, h_server, heketi_node_id, raise_on_error=False)
        self.addCleanup(
            heketi_ops.heketi_node_disable,
            h_client, h_server, heketi_node_id, raise_on_error=False)
        self.addCleanup(
            openshift_ops.switch_oc_project,
            self._master, self.storage_project_name)

        if condition == 'delete':
            # Switch to openshift-monitoring project
            openshift_ops.switch_oc_project(
                self.ocp_master_node[0], self._prometheus_project_name)

            # Wait for prometheus to report the newly added node
            for w in waiter.Waiter(timeout=60, interval=10):
                metric_result = self._fetch_metric_from_promtheus_pod(
                    metric='heketi_nodes_count')
                node_count = reduce(
                    lambda x, y: x + y,
                    [result.get('value')[1] for result in metric_result])
                if node_count != initial_node_count:
                    break

            if w.expired:
                raise exceptions.ExecutionError(
                    "Failed to get updated node details from prometheus")

            # Remove node from cluster
            heketi_ops.heketi_node_disable(h_client, h_server, heketi_node_id)
            heketi_ops.heketi_node_remove(h_client, h_server, heketi_node_id)
            for device in heketi_node_info.get('devices'):
                heketi_ops.heketi_device_delete(
                    h_client, h_server, device.get('id'))
            heketi_ops.heketi_node_delete(h_client, h_server, heketi_node_id)

        # Switch to openshift-monitoring project
        openshift_ops.switch_oc_project(
            self.ocp_master_node[0], self._prometheus_project_name)

        # Get final node count from prometheus metrics
        for w in waiter.Waiter(timeout=60, interval=10):
            metric_result = self._fetch_metric_from_promtheus_pod(
                metric='heketi_nodes_count')
            final_node_count = reduce(
                lambda x, y: x + y,
                [result.get('value')[1] for result in metric_result])

            if condition == 'delete':
                if final_node_count < node_count:
                    break
            else:
                if final_node_count > initial_node_count:
                    break

        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to update node details in prometheus")
    def test_metrics_workload_on_prometheus(self):
        """Validate metrics workload on prometheus"""

        # Skip test if the prometheus pods are not present
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)
        prometheus_pods = openshift_ops.oc_get_pods(
            self._master, selector=self._prometheus_resources_selector)
        if not prometheus_pods:
            self.skipTest(
                "Skipping test as prometheus pod is not present")

        if not self.registry_sc:
            self.skipTest(
                "Skipping test as registry storage details are not provided")
        self._registry_project = self.registry_sc.get(
            'restsecretnamespace')
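        # Random prefix used to tag the storage class and PVCs created by
        # this test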
        self.prefix = "autotest-{}".format(utils.get_random_str())

        # Get one of the prometheus pod name and respective pvc name
        prometheus_pod = list(prometheus_pods.keys())[0]
        pvc_custom = ":.spec.volumes[*].persistentVolumeClaim.claimName"
        pvc_name = openshift_ops.oc_get_custom_resource(
            self._master, "pod", pvc_custom, prometheus_pod)[0]
        self.assertTrue(
            pvc_name, "Failed to get PVC name for prometheus"
            " pod {}".format(prometheus_pod))
        self.verify_iscsi_sessions_and_multipath(
            pvc_name, prometheus_pod, rtype='pod',
            heketi_server_url=self._registry_heketi_server_url,
            is_registry_gluster=True)

        # Try to fetch metric from the prometheus pod
        self._fetch_metric_from_promtheus_pod(
            metric='kube_persistentvolumeclaim_info')

        # Create storage class
        openshift_ops.switch_oc_project(
            self._master, self._registry_project)
        self.sc_name = self.create_storage_class(
            vol_name_prefix=self.prefix, glusterfs_registry=True)
        self.addCleanup(openshift_ops.switch_oc_project,
                        self._master, self._registry_project)

        # Create PVCs and app pods
        pvc_size, pvc_count, batch_count = 1, 5, 5
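        # Create the PVCs in batches and attach an app pod (DC) to each one
        # so that some data is written to every volume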
        for _ in range(batch_count):
            test_pvc_names = self.create_and_wait_for_pvcs(
                pvc_size, pvc_name_prefix=self.prefix,
                pvc_amount=pvc_count, sc_name=self.sc_name, timeout=600,
                wait_step=10)
            self.create_dcs_with_pvc(
                test_pvc_names, timeout=600, wait_step=5,
                dc_name_prefix="autotests-dc-with-app-io",
                space_to_use=1048576)

        # Check from the prometheus pod for the PVC space usage
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)
        mount_path = "/prometheus"
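        # df -PT on the prometheus data mount reports the filesystem type
        # and usage as seen from inside the prometheus pod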
        cmd = "oc exec {0} -- df -PT {1} | grep {1}".format(
            prometheus_pod, mount_path)
        out = self.cmd_run(cmd)
        self.assertTrue(out, "Failed to get info about mounted volume. "
                             "Output is empty.")

        # Try to fetch metric from prometheus pod
        self._fetch_metric_from_promtheus_pod(
            metric='kube_persistentvolumeclaim_info')
        self._fetch_metric_from_promtheus_pod(
            metric='kube_pod_spec_volumes_persistentvolumeclaims_info')
        self.addCleanup(openshift_ops.switch_oc_project,
                        self._master, self._registry_project)
    def test_restart_prometheus_glusterfs_pod(self):
        """Validate restarting glusterfs pod"""

        # Skip the test on non-containerized (CRS) gluster setups
        openshift_ops.switch_oc_project(
            self._master, self._registry_project_name)
        if not self.is_containerized_gluster():
            self.skipTest(
                "Skipping this test case as it requires containerized "
                "gluster and is not applicable to CRS setups")

        # Get one of the prometheus pod name and respective pvc name
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)
        prometheus_pods = openshift_ops.oc_get_pods(
            self._master, selector=self._prometheus_resources_selector)
        if not prometheus_pods:
            self.skipTest(
                "Skipping test as prometheus pod is not present")
        prometheus_pod = list(prometheus_pods.keys())[0]
        pvc_name = openshift_ops.oc_get_custom_resource(
            self._master, "pod",
            ":.spec.volumes[*].persistentVolumeClaim.claimName",
            prometheus_pod)[0]
        self.assertTrue(
            pvc_name,
            "Failed to get pvc name from {} pod".format(prometheus_pod))
        iqn, _, node = self.verify_iscsi_sessions_and_multipath(
            pvc_name, prometheus_pod, rtype='pod',
            heketi_server_url=self._registry_heketi_server_url,
            is_registry_gluster=True)

        # Get the ip of active path
        devices = openshift_storage_libs.get_iscsi_block_devices_by_path(
            node, iqn)
        mpath = openshift_storage_libs.get_mpath_name_from_device_name(
            node, list(devices.keys())[0])
        mpath_dev = (
            openshift_storage_libs.get_active_and_enabled_devices_from_mpath(
                node, mpath))
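        # Map the active mpath device back to the devices dict to get the IP
        # of the gluster node serving the active path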
        node_ip = devices[mpath_dev['active'][0]]

        # Get the name of gluster pod from the ip
        openshift_ops.switch_oc_project(
            self._master, self._registry_project_name)
        gluster_pods = openshift_ops.get_ocp_gluster_pod_details(
            self._master)
        active_pod_name = list(
            filter(lambda pod: (pod["pod_host_ip"] == node_ip), gluster_pods)
        )[0]["pod_name"]
        err_msg = "Failed to get the gluster pod name {} with active path"
        self.assertTrue(active_pod_name, err_msg.format(active_pod_name))
        g_pods = [pod['pod_name'] for pod in gluster_pods]
        g_pods.remove(active_pod_name)
        pod_list = [active_pod_name, g_pods[0]]
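        # Restart the gluster pod serving the active path first, then one of
        # the remaining pods, validating metrics and multipath after each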
        for pod_name in pod_list:

            # Delete the glusterfs pods
            openshift_ops.switch_oc_project(
                self._master, self._prometheus_project_name)
            self._fetch_metric_from_promtheus_pod(
                metric='heketi_device_brick_count')

            openshift_ops.switch_oc_project(
                self._master, self._registry_project_name)
            g_pod_list_before = [
                pod["pod_name"]
                for pod in openshift_ops.get_ocp_gluster_pod_details(
                    self._master)]

            openshift_ops.oc_delete(self._master, 'pod', pod_name)
            self.addCleanup(
                self._guster_pod_delete_cleanup, g_pod_list_before)

            # Wait for gluster pod to be absent
            openshift_ops.wait_for_resource_absence(
                self._master, 'pod', pod_name)

            # Try to fetch metric from prometheus pod
            openshift_ops.switch_oc_project(
                self._master, self._prometheus_project_name)
            self._fetch_metric_from_promtheus_pod(
                metric='heketi_device_brick_count')

            # Wait for new pod to come up
            openshift_ops.switch_oc_project(
                self._master, self._registry_project_name)
            self.assertTrue(self._get_newly_deployed_gluster_pod(
                g_pod_list_before), "Failed to get new pod")
            self._wait_for_gluster_pod_be_ready(g_pod_list_before)

            # Validate iscsi and multipath
            openshift_ops.switch_oc_project(
                self._master, self._prometheus_project_name)
            self.verify_iscsi_sessions_and_multipath(
                pvc_name, prometheus_pod, rtype='pod',
                heketi_server_url=self._registry_heketi_server_url,
                is_registry_gluster=True)

            # Try to fetch metric from prometheus pod
            self._fetch_metric_from_promtheus_pod(
                metric='heketi_device_brick_count')