def _fetch_initial_metrics(self,
                               vol_name_prefix=None,
                               volume_expansion=False):

        # Create storage class and PVC, and wait for the PVC to be 'Bound'
        sc_name = self.create_storage_class(
            vol_name_prefix=vol_name_prefix,
            allow_volume_expansion=volume_expansion)

        if vol_name_prefix:
            pvc_name = self.create_and_wait_for_pvc(
                pvc_name_prefix=vol_name_prefix, sc_name=sc_name)
        else:
            pvc_name = self.create_and_wait_for_pvc(sc_name=sc_name)

        # Create DC and attach it to the PVC
        self.dc_name, pod_name = self.create_dc_with_pvc(pvc_name)
        for w in waiter.Waiter(120, 10):
            initial_metrics = self._get_and_manipulate_metric_data(
                self.metrics, pvc_name)
            if bool(initial_metrics) and len(initial_metrics) == 6:
                break
        if w.expired:
            raise AssertionError("Unable to fetch metrics for the pvc")
        return pvc_name, pod_name, initial_metrics
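
Every example on this page follows the same polling pattern: iterate over a waiter.Waiter(timeout, interval), break out of the loop as soon as the condition is met, and treat w.expired as the signal that the timeout was reached. The snippet below is a minimal sketch of such a helper, written from the behaviour visible in these examples rather than copied from the openshiftstoragelibs source, so the names and details are assumptions.

import time


class Waiter(object):
    """Minimal polling helper: iterating yields the waiter itself until
    'timeout' seconds have elapsed, sleeping 'interval' seconds between
    attempts; 'expired' stays False when the caller breaks out early."""

    def __init__(self, timeout=60, interval=1):
        self.timeout = timeout
        self.interval = interval
        self.expired = False
        self._attempt = 0
        self._start_time = time.time()

    def __iter__(self):
        return self

    def __next__(self):
        if time.time() - self._start_time > self.timeout:
            self.expired = True
            raise StopIteration()
        if self._attempt:
            # Sleep only between attempts, not before the first one.
            time.sleep(self.interval)
        self._attempt += 1
        return self

    next = __next__  # Python 2 compatibility

The NOTE(vponomar) comments in later examples reset _attempt to 0 when a single shared Waiter instance is reused for the next item; with a sketch like the one above, that reset skips the leading 'interval' sleep that would otherwise happen on the first iteration of the next inner loop.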
Example #2
    def test_block_vol_offline_expand(self):
        """Test blockvol expansion while PVC is not in use"""
        node = self.ocp_master_node[0]

        pvc_name, dc_name, bvol_info = (
            self._block_vol_expand_common_offline_vs_online(False))

        # Create the block volume expand job and wait for it to complete
        jobname = oc_create_offline_block_volume_expand_job(node, pvc_name)
        self.addCleanup(oc_delete, node, 'job', jobname)
        for w in waiter.Waiter(300, 5):
            if is_job_complete(node, jobname):
                break
        if w.expired:
            raise AssertionError(
                "block expand job {} is not completed".format(jobname))

        # Verify the expanded size at the mount point
        scale_dc_pod_amount_and_wait(node, dc_name[0], pod_amount=1)
        pod_name = get_pod_name_from_dc(node, dc_name[0])
        ret, size, _ = oc_rsh(
            node, pod_name,
            'df -kh /mnt | sed "/Filesystem/d" | awk \'{print $2}\' '
            '| sed "s/G//"')
        self.assertFalse(ret, "Failed to get size from client side")
        self.assertEqual(
            int(float(size)), bvol_info["size"], "new size is not "
            "reflected at mount point after block volume expand")
    def _perform_io_and_fetch_metrics(self, pod_name, pvc_name, filename,
                                      dirname, metric_data, operation):
        """Create 1000 files and dirs and validate with old metrics"""
        openshift_ops.switch_oc_project(self._master,
                                        self.storage_project_name)
        if operation == "create":
            cmds = ("touch /mnt/{}{{1..1000}}".format(filename),
                    "mkdir /mnt/{}{{1..1000}}".format(dirname))
        else:
            cmds = ("rm -rf /mnt/large_file",
                    "rm -rf /mnt/{}{{1..1000}}".format(filename),
                    "rm -rf /mnt/{}{{1..1000}}".format(dirname))
        for cmd in cmds:
            self.cmd_run("oc rsh {} {}".format(pod_name, cmd))

        # Fetch the new metrics and compare the inodes used and bytes used
        for w in waiter.Waiter(120, 10):
            after_io_metrics = self._get_and_manipulate_metric_data(
                self.metrics, pvc_name)
            if operation == "create":
                if (int(after_io_metrics['kubelet_volume_stats_inodes_used']) >
                        int(metric_data['kubelet_volume_stats_inodes_used'])
                        and
                        int(after_io_metrics['kubelet_volume_stats_used_bytes']
                            ) >
                        int(metric_data['kubelet_volume_stats_used_bytes'])):
                    break
            else:
                if int(metric_data['kubelet_volume_stats_used_bytes']) > int(
                        after_io_metrics['kubelet_volume_stats_used_bytes']):
                    break
        if w.expired:
            raise AssertionError(
                "After data is modified metrics like bytes used and inodes "
                "used are not reflected in prometheus")
    def _set_log_level(self, node, level, msg, exec_time):
        delete_log_level = r'sed -i "/\(^log_level.*=.*[0-9]\)/d" {}'
        set_log_level = r'sed -i "\$alog_level = {}" {}'
        check_log_msg = r'sed -n "/.*\({}\).*/{{p;}}" {} | tail -1'

        # Set log level
        openshift_ops.cmd_run_on_gluster_pod_or_node(
            self.node, set_log_level.format(level, TCMU_CONF),
            gluster_node=node)
        self.addCleanup(
            openshift_ops.cmd_run_on_gluster_pod_or_node,
            self.node, delete_log_level.format(TCMU_CONF), gluster_node=node)

        # Validate log level
        log_msg = "log level now is {}".format(msg)
        for w in waiter.Waiter(120, 3):
            out = openshift_ops.cmd_run_on_gluster_pod_or_node(
                self.node, check_log_msg.format(log_msg, TCMU_RUNNER_LOG),
                gluster_node=node)
            match = re.match(LOG_REGEX, out)
            if (match
                    and exec_time < datetime.datetime.strptime(
                        match.group(1), self.timeformat)):
                break

        if w.expired:
            raise exceptions.ExecutionError(
                "Log level '{}:{}' of tcmu did not get changed on node"
                " {}".format(level, msg, node))

        openshift_ops.cmd_run_on_gluster_pod_or_node(
            self.node, delete_log_level.format(TCMU_CONF), gluster_node=node)
Example #5
    def _wait_for_docker_service_status(self, pod_host_ip, status, state):
        for w in waiter.Waiter(30, 3):
            out = command.cmd_run(
                DOCKER_SERVICE.format("status"), pod_host_ip)
            for line in out.splitlines():
                status_match = re.search(SERVICE_STATUS_REGEX, line)
                if (status_match and status_match.group(1) == status
                        and status_match.group(2) == state):
                    return True
Example #6
    def _check_for_pending_operations(self, h_node, h_url):
        # Check for pending operations
        for w in waiter.Waiter(timeout=120, interval=10):
            h_db_check = heketi_db_check(h_node, h_url)
            h_db_check_vol = h_db_check.get("blockvolumes")
            if h_db_check_vol.get("pending"):
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "No pending operations found during blockvolumes creation "
                "{}".format(h_db_check_vol.get("pending")))
Example #7
    def _rebalance_completion(self, volume_name):
        """Rebalance start and completion after expansion."""
        ret, _, err = rebalance_ops.rebalance_start(
            'auto_get_gluster_endpoint', volume_name)
        self.assertFalse(
            ret, "Rebalance for {} volume not started with error {}".format(
                volume_name, err))

        for w in waiter.Waiter(240, 10):
            reb_status = rebalance_ops.get_rebalance_status(
                'auto_get_gluster_endpoint', volume_name)
            if reb_status["aggregate"]["statusStr"] == "completed":
                break
        if w.expired:
            raise AssertionError(
                "Failed to complete the rebalance in 240 seconds")
    def test_prometheus_basic_validation(self):
        """ Validate basic volume metrics using prometheus """

        # Fetch the metrics and store initial_metrics as a dictionary
        pvc_name, pod_name, initial_metrics = self._fetch_initial_metrics(
            volume_expansion=False)

        # Create 1000 files and verify the metrics reflect the new data
        self._perform_io_and_fetch_metrics(pod_name=pod_name,
                                           pvc_name=pvc_name,
                                           filename="filename1",
                                           dirname="dirname1",
                                           metric_data=initial_metrics,
                                           operation="create")

        # Write IO of half the volume size and validate from the
        # prometheus pod that the size change is reflected
        size_to_write = int(
            initial_metrics['kubelet_volume_stats_capacity_bytes']) // 2
        openshift_ops.switch_oc_project(self._master,
                                        self.storage_project_name)
        cmd = ("dd if=/dev/urandom of=/mnt/large_file bs={} count=1024".format(
            size_to_write // 1024))
        ret, _, err = openshift_ops.oc_rsh(self._master, pod_name, cmd)
        self.assertFalse(ret, 'Failed to write file due to err {}'.format(err))

        # Fetch the metrics and validate that the data change is reflected
        for w in waiter.Waiter(120, 10):
            half_io_metrics = self._get_and_manipulate_metric_data(
                ['kubelet_volume_stats_used_bytes'], pvc_name)
            if bool(half_io_metrics) and (int(
                    half_io_metrics['kubelet_volume_stats_used_bytes']) >
                                          size_to_write):
                break
        if w.expired:
            raise AssertionError(
                "After Data is written on the pvc, metrics like inodes used "
                "and bytes used are not reflected in the prometheus")

        # Delete the files from the volume and wait for the
        # updated details to be reflected in prometheus
        self._perform_io_and_fetch_metrics(pod_name=pod_name,
                                           pvc_name=pvc_name,
                                           filename="filename1",
                                           dirname="dirname1",
                                           metric_data=half_io_metrics,
                                           operation="delete")
    def _power_off_node_and_wait_node_to_be_not_ready(self, hostname):
        # Bring down the glusterfs node
        vm_name = node_ops.find_vm_name_by_ip_or_hostname(hostname)
        self.addCleanup(self._wait_for_gluster_pod_after_node_reboot, hostname)
        self.addCleanup(node_ops.power_on_vm_by_name, vm_name)
        node_ops.power_off_vm_by_name(vm_name)

        # Wait for the glusterfs node to become NotReady
        custom = r'":.status.conditions[?(@.type==\"Ready\")]".status'
        for w in waiter.Waiter(300, 20):
            status = openshift_ops.oc_get_custom_resource(
                self.ocp_client, 'node', custom, hostname)
            if status[0] in ['False', 'Unknown']:
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to bring down node {}".format(hostname))
Example #10
def wait_to_heal_complete(timeout=300, wait_step=5):
    """Monitors heal for volumes on gluster"""
    gluster_vol_list = get_volume_list("auto_get_gluster_endpoint")
    if not gluster_vol_list:
        raise AssertionError("failed to get gluster volume list")

    _waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for gluster_vol in gluster_vol_list:
        for w in _waiter:
            if is_heal_complete("auto_get_gluster_endpoint", gluster_vol):
                break

    if w.expired:
        err_msg = ("reached timeout waiting for all the gluster volumes "
                   "to reach the 'healed' state.")
        g.log.error(err_msg)
        raise AssertionError(err_msg)
Example #11
def power_on_vm_by_name(name, timeout=600, interval=10):
    """Power on the virtual machine and wait for SSH ready within given
    timeout.

    Args:
        name (str): name of the VM which needs to be powered on.
        timeout (int): seconds to wait for the hostname and SSH connection.
        interval (int): polling interval in seconds.
    Returns:
        None
    Raises:
        CloudProviderError: In case of any failures.
    """
    cloudProvider = _get_cloud_provider()
    g.log.info('powering on the VM "%s"' % name)
    cloudProvider.power_on_vm_by_name(name)
    g.log.info('Powered on the VM "%s" successfully' % name)

    # Wait for hostname to get assigned
    _waiter = waiter.Waiter(timeout, interval)
    err = ""
    for w in _waiter:
        try:
            hostname = cloudProvider.wait_for_hostname(name, 1, 1)
            # NOTE(vponomar): Reset attempts for waiter to avoid redundant
            # sleep equal to 'interval' on the next usage.
            _waiter._attempt = 0
            break
        except Exception as e:
            err = e
            g.log.info(e)
    if w.expired:
        raise exceptions.CloudProviderError(err)

    # Wait for the SSH connection to the hostname to be ready
    for w in _waiter:
        try:
            wait_for_ssh_connection(hostname, 1, 1)
            break
        except Exception as e:
            g.log.info(e)
            err = e
    if w.expired:
        raise exceptions.CloudProviderError(err)
def wait_to_heal_complete(
        timeout=300, wait_step=5, g_node="auto_get_gluster_endpoint"):
    """Monitors heal for volumes on gluster"""
    gluster_vol_list = get_volume_list(g_node)
    if not gluster_vol_list:
        raise AssertionError("failed to get gluster volume list")

    _waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for gluster_vol in gluster_vol_list:
        for w in _waiter:
            if is_heal_complete(g_node, gluster_vol):
                # NOTE(vponomar): Reset attempts for waiter to avoid redundant
                # sleep equal to 'interval' on the next usage.
                _waiter._attempt = 0
                break

    if w.expired:
        err_msg = ("reached timeout waiting for all the gluster volumes "
                   "to reach the 'healed' state.")
        g.log.error(err_msg)
        raise AssertionError(err_msg)
Example #13
def wait_for_ssh_connection(hostname, timeout=600, interval=10):
    """Wait for ssh conection to be ready within given timeout.

    Args:
        hostname (str): hostname of a machine.
    Returns:
        None
    Raises:
        CloudProviderError: In case of any failures.
    """
    for w in waiter.Waiter(timeout, interval):
        try:
            # Run random command to verify ssh connection
            g.run(hostname, 'ls')
            return
        except (exceptions.ExecutionError, ExecutionError):
            g.log.info("Waiting for ssh connection on host '%s'" % hostname)

    msg = 'Not able to connect with the %s' % hostname
    g.log.error(msg)
    raise exceptions.CloudProviderError(msg)
Example #14
def node_reboot_by_command(node, timeout=600, wait_step=10):
    """Reboot node and wait to start for given timeout.

    Args:
        node (str)     : Node which needs to be rebooted.
        timeout (int)  : Seconds to wait before node to be started.
        wait_step (int): Interval in seconds to wait before checking
                         status of node again.
    """
    cmd = "sleep 3; /sbin/shutdown -r now 'Reboot triggered by Glusto'"
    ret, out, err = g.run(node, cmd)
    if ret != 255:
        err_msg = "failed to reboot host '%s' error %s" % (node, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    try:
        g.ssh_close_connection(node)
    except Exception as e:
        g.log.error("failed to close connection with host %s "
                    "with error: %s" % (node, e))
        raise

    # added sleep as node will restart after 3 sec
    time.sleep(3)

    for w in waiter.Waiter(timeout=timeout, interval=wait_step):
        try:
            if g.rpyc_get_connection(node, user="******"):
                g.rpyc_close_connection(node, user="******")
                return
        except Exception as err:
            g.log.info("exception while getting connection: '%s'" % err)

    if w.expired:
        error_msg = ("exceeded timeout %s sec, node '%s' is "
                     "not reachable" % (timeout, node))
        g.log.error(error_msg)
        raise exceptions.ExecutionError(error_msg)
Example #15
    def test_run_workload_with_logging(self):
        """Validate logs are being generated aifter running workload"""

        # Get the size of used space of logs
        es_pod = openshift_ops.get_pod_name_from_dc(
            self._master, self._logging_es_dc)
        mount_point = "/elasticsearch/persistent"
        cmd_space_check = ('df -kh --output=used {} | sed "/Used/d" |'
                           'sed "s/G//"'.format(mount_point))
        ret, initial_used_percent, err = openshift_ops.oc_rsh(
            self._master, es_pod, cmd_space_check)
        err_msg = "Failed to fetch the size of used space, error {}"
        self.assertFalse(ret, err_msg.format(err))

        # Create 20 PVCs and app pods with IO
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        pvc_count, batch_count = 5, 4
        for _ in range(batch_count):
            pvcs = self.create_and_wait_for_pvcs(pvc_amount=pvc_count)
            self.create_dcs_with_pvc(pvcs)
        self.addCleanup(
            openshift_ops.switch_oc_project,
            self._master, self.storage_project_name)

        # Get and verify the final used space of logs
        openshift_ops.switch_oc_project(
            self._master, self._logging_project_name)
        for w in waiter.Waiter(600, 30):
            ret, final_used_percent, err = openshift_ops.oc_rsh(
                self._master, es_pod, cmd_space_check)
            self.assertFalse(ret, err_msg.format(err))
            if int(initial_used_percent) < int(final_used_percent):
                break
        if w.expired:
            raise AssertionError(
                "Initial used space {} for logs is not less than final "
                "used space {}".format(
                    initial_used_percent, final_used_percent))
def wait_to_heal_complete(vol_name=None,
                          g_node="auto_get_gluster_endpoint",
                          timeout=300,
                          wait_step=5):
    """Monitors heal for volumes on gluster

    Args:
        vol_name (str): Name of the gluster volume; defaults to None, in
            which case all volumes are checked.
        g_node (str): Name of the gluster node; defaults to
            auto_get_gluster_endpoint.
        timeout (int): Time in seconds to wait for heal to complete;
            default is 300.
        wait_step (int): Interval in seconds between heal status checks.
    Raises:
        AssertionError: In case heal is not complete
    """
    if not vol_name:
        gluster_vol_list = get_volume_list(g_node)
        if not gluster_vol_list:
            raise AssertionError("failed to get gluster volume list")
    else:
        gluster_vol_list = [vol_name]

    _waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for gluster_vol in gluster_vol_list:
        for w in _waiter:
            if is_heal_complete(g_node, gluster_vol):
                # NOTE(vponomar): Reset attempts for waiter to avoid redundant
                # sleep equal to 'interval' on the next usage.
                _waiter._attempt = 0
                break

    if w.expired:
        err_msg = ("reached timeout waiting for all the gluster volumes "
                   "to reach the 'healed' state.")
        g.log.error(err_msg)
        raise AssertionError(err_msg)
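
A short usage sketch of the variant above; the volume name and node address are placeholders, and the calls assume wait_to_heal_complete is importable from the module that defines it.

# Wait up to 10 minutes for one (hypothetical) volume to finish healing,
# polling every 10 seconds through the auto-resolved gluster endpoint.
wait_to_heal_complete(
    vol_name="vol_0123456789abcdef", timeout=600, wait_step=10)

# Or check every volume returned by get_volume_list() on a specific node.
wait_to_heal_complete(g_node="10.70.46.11", timeout=300, wait_step=5)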
Example #17
    def test_run_workload_with_metrics(self):
        """Validate if logs are being generated after running workload"""

        # Get the size of used space for metrics data
        cassandra_pod = get_pod_name_from_rc(
            self.master, self.metrics_rc_hawkular_cassandra)
        mount_point = "/cassandra_data"
        cmd_space_check = ('df -k --output=used {} | sed "/Used/d" |'
                           'sed "s/G//"'.format(mount_point))
        ret, initial_used_percent, err = oc_rsh(self.master, cassandra_pod,
                                                cmd_space_check)
        err_msg = "Failed to fetch the size of used space, error {}"
        self.assertFalse(ret, err_msg.format(err))

        # Create 20 PVCs and app pods with IO
        switch_oc_project(self.master, self.storage_project_name)
        pvc_count, batch_count = 5, 4
        for _ in range(batch_count):
            pvcs = self.create_and_wait_for_pvcs(pvc_amount=pvc_count)
            self.create_dcs_with_pvc(pvcs)
        self.addCleanup(switch_oc_project, self.master,
                        self.storage_project_name)

        # Get and verify the final size of used space for metrics data
        switch_oc_project(self.master, self.metrics_project_name)
        for w in waiter.Waiter(600, 30):
            ret, final_used_percent, err = oc_rsh(self.master, cassandra_pod,
                                                  cmd_space_check)
            self.assertFalse(ret, err_msg.format(err))
            if int(initial_used_percent) < int(final_used_percent):
                break
        if w.expired:
            raise AssertionError(
                "Initial used space {} for logs is not less than final "
                "used space {}".format(initial_used_percent,
                                       final_used_percent))
Example #18
    def test_prometheus_pv_resize(self):
        """ Validate prometheus metrics with pv resize"""

        # Fetch the metrics and store initial_metrics as a dictionary
        pvc_name, pod_name, initial_metrics = self._fetch_initial_metrics(
            vol_name_prefix="for-pv-resize", volume_expansion=True)

        # Write data on the pvc and confirm it is reflected in the prometheus
        self._perform_io_and_fetch_metrics(
            pod_name=pod_name, pvc_name=pvc_name,
            filename="filename1", dirname="dirname1",
            metric_data=initial_metrics, operation="create")

        # Resize the pvc to 2GiB
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        pvc_size = 2
        openshift_ops.resize_pvc(self._master, pvc_name, pvc_size)
        openshift_ops.wait_for_events(self._master, obj_name=pvc_name,
                                      event_reason='VolumeResizeSuccessful')
        openshift_ops.verify_pvc_size(self._master, pvc_name, pvc_size)
        pv_name = openshift_ops.get_pv_name_from_pvc(
            self._master, pvc_name)
        openshift_ops.verify_pv_size(self._master, pv_name, pvc_size)

        heketi_volume_name = heketi_ops.heketi_volume_list_by_name_prefix(
            self.heketi_client_node, self.heketi_server_url,
            "for-pv-resize", json=True)[0][2]
        self.assertIsNotNone(
            heketi_volume_name, "Failed to fetch volume with prefix {}".
            format("for-pv-resize"))

        openshift_ops.oc_delete(self._master, 'pod', pod_name)
        openshift_ops.wait_for_resource_absence(self._master, 'pod', pod_name)
        pod_name = openshift_ops.get_pod_name_from_dc(
            self._master, self.dc_name)
        openshift_ops.wait_for_pod_be_ready(self._master, pod_name)

        # Check whether the metrics are updated or not
        for w in waiter.Waiter(120, 10):
            resize_metrics = self._get_and_manipulate_metric_data(
                self.metrics, pvc_name)
            if bool(resize_metrics) and int(resize_metrics[
                'kubelet_volume_stats_capacity_bytes']) > int(
                    initial_metrics['kubelet_volume_stats_capacity_bytes']):
                break
        if w.expired:
            raise AssertionError("Failed to reflect PVC Size after resizing")
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        time.sleep(240)

        # Perform lookups, then trigger rebalance and wait for its completion
        for _ in range(100):
            self.cmd_run("oc rsh {} ls /mnt/".format(pod_name))
        self._rebalance_completion(heketi_volume_name)

        # Write data on the resized pvc and compare with resize_metrics
        self._perform_io_and_fetch_metrics(
            pod_name=pod_name, pvc_name=pvc_name,
            filename="secondfilename", dirname="seconddirname",
            metric_data=resize_metrics, operation="create")
def restart_gluster_vol_brick_processes(ocp_client_node, file_vol,
                                        gluster_nodes):
    """Restarts brick process of a file volume.

    Args:
        ocp_client_node (str): Node to execute OCP commands on.
        file_vol (str): file volume name.
        gluster_nodes (str/list): One or several IPv4 addresses of Gluster
            nodes, where 'file_vol' brick processes must be recreated.
    """
    if not isinstance(gluster_nodes, (list, set, tuple)):
        gluster_nodes = [gluster_nodes]

    # Get Gluster vol brick PIDs
    gluster_volume_status = get_gluster_vol_status(file_vol)
    pids = []
    for gluster_node in gluster_nodes:
        pid = None
        for g_node, g_node_data in gluster_volume_status.items():
            if g_node != gluster_node:
                continue
            for process_name, process_data in g_node_data.items():
                if not process_name.startswith("/var"):
                    continue
                pid = process_data["pid"]
                # When a brick is down, its PID is returned as -1, which is
                # an unexpected situation, so add an appropriate assertion.
                assert pid != "-1", (
                    "Got unexpected PID (-1) for '%s' gluster vol on '%s' "
                    "node." % (file_vol, gluster_node))
        assert pid, ("Could not find 'pid' in Gluster vol data for '%s' "
                     "Gluster node. Data: %s" % (
                         gluster_node, gluster_volume_status))
        pids.append((gluster_node, pid))

    # Restart Gluster vol brick processes using found PIDs
    for gluster_node, pid in pids:
        cmd = "kill -9 %s" % pid
        cmd_run_on_gluster_pod_or_node(ocp_client_node, cmd, gluster_node)

    # Wait for Gluster vol brick processes to be recreated
    for gluster_node, pid in pids:
        killed_pid_cmd = "ps -eaf | grep %s | grep -v grep | awk '{print $2}'"
        _waiter = waiter.Waiter(timeout=60, interval=2)
        for w in _waiter:
            result = cmd_run_on_gluster_pod_or_node(
                ocp_client_node, killed_pid_cmd % pid, gluster_node)
            if result.strip() == pid:
                continue
            g.log.info("Brick process '%s' was killed successfully on '%s'" % (
                pid, gluster_node))
            break
        if w.expired:
            error_msg = ("Process ID '%s' still exists on '%s' after waiting "
                         "for it 60 seconds to get killed." % (
                             pid, gluster_node))
            g.log.error(error_msg)
            raise exceptions.ExecutionError(error_msg)

    # Start volume after gluster vol brick processes recreation
    ret, out, err = volume_start(
        "auto_get_gluster_endpoint", file_vol, force=True)
    if ret != 0:
        err_msg = "Failed to start gluster volume %s on %s. error: %s" % (
            file_vol, gluster_node, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)
Example #20
    def test_heketi_server_stale_operations_during_heketi_pod_reboot(self):
        """
        Validate failed/stale entries in db and performs a cleanup
        of those entries
        """
        volume_id_list, async_obj, ocp_node = [], [], self.ocp_master_node[0]
        h_node, h_server = self.heketi_client_node, self.heketi_server_url
        for i in range(0, 8):
            volume_info = heketi_ops.heketi_volume_create(h_node,
                                                          h_server,
                                                          1,
                                                          json=True)
            volume_id_list.append(volume_info["id"])
            self.addCleanup(heketi_ops.heketi_volume_delete,
                            h_node,
                            h_server,
                            volume_info["id"],
                            raise_on_error=False)

        def run_async(cmd, hostname, raise_on_error=True):
            async_op = g.run_async(host=hostname, command=cmd)
            async_obj.append(async_op)
            return async_op

        # Temporarily replace g.run with g.run_async in heketi_volume_delete
        # to be able to run it in the background.
        for vol_id in volume_id_list:
            with mock.patch.object(command, 'cmd_run', side_effect=run_async):
                heketi_ops.heketi_volume_delete(h_node, h_server, vol_id)

        # Restart heketi pod and check pod is running
        heketi_pod_name = openshift_ops.get_pod_name_from_dc(
            ocp_node, self.heketi_dc_name)
        openshift_ops.oc_delete(ocp_node,
                                'pod',
                                heketi_pod_name,
                                collect_logs=self.heketi_logs_before_delete)
        self.addCleanup(self._heketi_pod_delete_cleanup, ocp_node)
        openshift_ops.wait_for_resource_absence(ocp_node, 'pod',
                                                heketi_pod_name)
        heketi_pod_name = openshift_ops.get_pod_name_from_dc(
            ocp_node, self.heketi_dc_name)
        openshift_ops.wait_for_pod_be_ready(ocp_node, heketi_pod_name)
        self.assertTrue(heketi_ops.hello_heketi(h_node, h_server),
                        "Heketi server {} is not alive".format(h_server))

        # Wait for pending operations to be generated
        for w in waiter.Waiter(timeout=30, interval=3):
            h_db_check = heketi_ops.heketi_db_check(h_node, h_server)
            h_db_check_vol = h_db_check.get("volumes")
            h_db_check_bricks = h_db_check.get("bricks")
            if ((h_db_check_vol.get("pending"))
                    and (h_db_check_bricks.get("pending"))):
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "No any pending operations found during volumes deletion "
                "volumes:{}, Bricks:{} ".format(
                    h_db_check_vol.get("pending"),
                    h_db_check_bricks.get("pending")))

        # Verify pending bricks are multiples of 3
        self.assertFalse(
            h_db_check_bricks.get("pending") % 3,
            "Expecting bricks pending count to be multiple of 3 but "
            "found {}".format(h_db_check_bricks.get("pending")))

        # Verify and wait for pending operations to complete
        for w in waiter.Waiter(timeout=120, interval=10):
            h_db_check = heketi_ops.heketi_db_check(h_node, h_server)
            h_db_check_vol = h_db_check.get("volumes")
            h_db_check_bricks = h_db_check.get("bricks")
            if ((not h_db_check_bricks.get("pending"))
                    and (not h_db_check_vol.get("pending"))):
                break
        if w.expired:
            raise AssertionError("Failed to delete volumes after 120 secs")
    def test_heketi_metrics_validation_after_node(self, condition):
        """Validate heketi metrics after adding and remove node"""

        # Get additional node
        additional_host_info = g.config.get("additional_gluster_servers")
        if not additional_host_info:
            self.skipTest(
                "Skipping this test case as additional gluster server is "
                "not provied in config file")

        additional_host_info = list(additional_host_info.values())[0]
        storage_hostname = additional_host_info.get("manage")
        storage_ip = additional_host_info.get("storage")
        if not (storage_hostname and storage_ip):
            self.skipTest(
                "Config options 'additional_gluster_servers.manage' "
                "and 'additional_gluster_servers.storage' must be set.")

        h_client, h_server = self.heketi_client_node, self.heketi_server_url
        initial_node_count, final_node_count = 0, 0

        # Get initial node count from prometheus metrics
        metric_result = self._fetch_metric_from_promtheus_pod(
            metric='heketi_nodes_count')
        initial_node_count = reduce(
            lambda x, y: x + y,
            [result.get('value')[1] for result in metric_result])

        # Switch to storage project
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)

        # Configure node before adding node
        self.configure_node_to_run_gluster(storage_hostname)

        # Get cluster list
        cluster_info = heketi_ops.heketi_cluster_list(
            h_client, h_server, json=True)

        # Add node to the cluster
        heketi_node_info = heketi_ops.heketi_node_add(
            h_client, h_server,
            len(self.gluster_servers), cluster_info.get('clusters')[0],
            storage_hostname, storage_ip, json=True)
        heketi_node_id = heketi_node_info.get("id")
        self.addCleanup(
            heketi_ops.heketi_node_delete,
            h_client, h_server, heketi_node_id, raise_on_error=False)
        self.addCleanup(
            heketi_ops.heketi_node_remove,
            h_client, h_server, heketi_node_id, raise_on_error=False)
        self.addCleanup(
            heketi_ops.heketi_node_disable,
            h_client, h_server, heketi_node_id, raise_on_error=False)
        self.addCleanup(
            openshift_ops.switch_oc_project,
            self._master, self.storage_project_name)

        if condition == 'delete':
            # Switch to openshift-monitoring project
            openshift_ops.switch_oc_project(
                self.ocp_master_node[0], self._prometheus_project_name)

            # Wait for the added node to be reflected in prometheus metrics
            for w in waiter.Waiter(timeout=60, interval=10):
                metric_result = self._fetch_metric_from_promtheus_pod(
                    metric='heketi_nodes_count')
                node_count = reduce(
                    lambda x, y: x + y,
                    [result.get('value')[1] for result in metric_result])
                if node_count != initial_node_count:
                    break

            if w.expired:
                raise exceptions.ExecutionError(
                    "Failed to get updated node details from prometheus")

            # Remove node from cluster
            heketi_ops.heketi_node_disable(h_client, h_server, heketi_node_id)
            heketi_ops.heketi_node_remove(h_client, h_server, heketi_node_id)
            for device in heketi_node_info.get('devices'):
                heketi_ops.heketi_device_delete(
                    h_client, h_server, device.get('id'))
            heketi_ops.heketi_node_delete(h_client, h_server, heketi_node_id)

        # Switch to openshift-monitoring project
        openshift_ops.switch_oc_project(
            self.ocp_master_node[0], self._prometheus_project_name)

        # Get final node count from prometheus metrics
        for w in waiter.Waiter(timeout=60, interval=10):
            metric_result = self._fetch_metric_from_promtheus_pod(
                metric='heketi_nodes_count')
            final_node_count = reduce(
                lambda x, y: x + y,
                [result.get('value')[1] for result in metric_result])

            if condition == 'delete':
                if final_node_count < node_count:
                    break
            else:
                if final_node_count > initial_node_count:
                    break

        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to update node details in prometheus")
Example #22
    def test_verify_delete_heketi_volumes_pending_entries_in_db(
            self, vol_type):
        """Verify pending entries of blockvolumes/volumes and bricks in heketi
           db during blockvolume/volume delete operation.
        """
        # Create large volumes to observe the pending operations
        vol_count, volume_ids, async_obj = 10, [], []
        h_node, h_url = self.heketi_client_node, self.heketi_server_url

        # Verify file/block volume pending operations before creation
        h_db_check_before = heketi_db_check(h_node, h_url)
        h_db_check_bricks_before = h_db_check_before.get("bricks")
        h_db_check_vol_before = (h_db_check_before.get(
            "{}volumes".format(vol_type)))

        # Get existing heketi volume list
        existing_volumes = heketi_volume_list(h_node, h_url, json=True)

        # Add cleanup function to clean stale volumes created during test
        self.addCleanup(self._cleanup_heketi_volumes,
                        existing_volumes.get("volumes"))

        # Delete heketi pod to clean db operations
        if (h_db_check_bricks_before.get("pending")
                or h_db_check_vol_before.get("pending")):
            self._respin_heketi_pod()

        # Calculate heketi volume size
        free_space, nodenum = get_total_free_space(h_node, h_url)
        free_space_available = int(free_space / nodenum)
        if free_space_available > vol_count:
            h_volume_size = int(free_space_available / vol_count)
            if h_volume_size > 50:
                h_volume_size = 50
        else:
            h_volume_size, vol_count = 1, free_space_available

        # Create BHV in case blockvolume size is greater than default BHV size
        if vol_type:
            default_bhv_size = get_default_block_hosting_volume_size(
                h_node, self.heketi_dc_name)
            if default_bhv_size < h_volume_size:
                h_volume_name = "autotest-{}".format(utils.get_random_str())
                bhv_info = self.create_heketi_volume_with_name_and_wait(
                    h_volume_name,
                    free_space_available,
                    raise_on_cleanup_error=False,
                    block=True,
                    json=True)
                free_space_available -= (
                    int(bhv_info.get("blockinfo").get("reservedsize")) + 1)
                h_volume_size = int(free_space_available / vol_count)

        # Create file/block volumes
        for _ in range(vol_count):
            vol_id = eval("heketi_{}volume_create".format(vol_type))(
                h_node, h_url, h_volume_size, json=True).get("id")
            volume_ids.append(vol_id)
            self.addCleanup(eval("heketi_{}volume_delete".format(vol_type)),
                            h_node,
                            h_url,
                            vol_id,
                            raise_on_error=False)

        def run_async(cmd, hostname, raise_on_error=True):
            async_op = g.run_async(host=hostname, command=cmd)
            async_obj.append(async_op)
            return async_op

        bhv_list = []
        for vol_id in volume_ids:
            # Get BHV ids to delete in case of block volumes
            if vol_type:
                vol_info = (heketi_blockvolume_info(h_node,
                                                    h_url,
                                                    vol_id,
                                                    json=True))
                if not vol_info.get("blockhostingvolume") in bhv_list:
                    bhv_list.append(vol_info.get("blockhostingvolume"))

            # Temporary replace g.run with g.async_run in heketi_volume_delete
            # and heketi_blockvolume_delete func to be able to run it in
            # background.
            with mock.patch.object(command, 'cmd_run', side_effect=run_async):
                eval("heketi_{}volume_delete".format(vol_type))(h_node, h_url,
                                                                vol_id)

        # Wait for pending operations to be generated
        for w in waiter.Waiter(timeout=30, interval=3):
            h_db_check = heketi_db_check(h_node, h_url)
            h_db_check_vol = h_db_check.get("{}volumes".format(vol_type))
            if h_db_check_vol.get("pending"):
                h_db_check_bricks = h_db_check.get("bricks")
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "No any pending operations found during {}volumes deletion "
                "{}".format(vol_type, h_db_check_vol.get("pending")))

        # Verify bricks pending operations during deletion
        if not vol_type:
            self.assertTrue(h_db_check_bricks.get("pending"),
                            "Expecting at least one bricks pending count")
            self.assertFalse(
                h_db_check_bricks.get("pending") % 3,
                "Expecting bricks pending count to be multiple of 3 but "
                "found {}".format(h_db_check_bricks.get("pending")))

        # Verify file/block volume pending operation during delete
        for w in waiter.Waiter(timeout=120, interval=10):
            h_db_check = heketi_db_check(h_node, h_url)
            h_db_check_vol = h_db_check.get("{}volumes".format(vol_type))
            h_db_check_bricks = h_db_check.get("bricks")
            if ((not h_db_check_bricks.get("pending"))
                    and (not h_db_check_vol.get("pending"))):
                break
        if w.expired:
            raise AssertionError(
                "Failed to delete {}volumes after 120 secs".format(vol_type))

        # Check that all background processes have exited
        for obj in async_obj:
            ret, out, err = obj.async_communicate()
            self.assertFalse(
                ret, "Failed to delete {}volume due to error: {}".format(
                    vol_type, err))

        # Delete BHV created during block volume creation
        if vol_type:
            for bhv_id in bhv_list:
                heketi_volume_delete(h_node, h_url, bhv_id)

        # Verify bricks and volume pending operations
        h_db_check_after = heketi_db_check(h_node, h_url)
        h_db_check_bricks_after = h_db_check_after.get("bricks")
        h_db_check_vol_after = (h_db_check_after.get(
            "{}volumes".format(vol_type)))
        act_brick_count = h_db_check_bricks_after.get("pending")
        act_vol_count = h_db_check_vol_after.get("pending")

        # Verify bricks pending operation after delete
        err_msg = "{} operations are pending for {} after {}volume deletion"
        if not vol_type:
            self.assertFalse(
                act_brick_count,
                err_msg.format(act_brick_count, "brick", vol_type))

        # Verify file/block volume pending operations after delete
        self.assertFalse(act_vol_count,
                         err_msg.format(act_vol_count, "volume", vol_type))

        act_brick_count = h_db_check_bricks_after.get("total")
        act_vol_count = h_db_check_vol_after.get("total")
        exp_brick_count = h_db_check_bricks_before.get("total")
        exp_vol_count = h_db_check_vol_before.get("total")
        err_msg = "Actual {} and expected {} {} counts are not matched"

        # Verify if initial and final file/block volumes are same
        self.assertEqual(
            act_vol_count, exp_vol_count,
            err_msg.format(act_vol_count, exp_vol_count, "volume"))

        # Verify if initial and final bricks are same
        self.assertEqual(
            act_brick_count, exp_brick_count,
            err_msg.format(act_brick_count, exp_brick_count, "brick"))
Example #23
    def test_verify_create_heketi_volumes_pending_entries_in_db(
            self, vol_type):
        """Verify pending entries of file/block volumes in db during
           volumes creation from heketi side
        """
        # Create large volumes to observe the pending operations
        vol_count, h_vol_creation_async_op = 3, []
        h_node, h_url = self.heketi_client_node, self.heketi_server_url

        # Verify file/block volume pending operations before creation
        h_db_check_before = heketi_db_check(h_node, h_url)
        h_db_check_vol_before = (h_db_check_before.get(
            "{}volumes".format(vol_type)))

        # Delete heketi pod to clean db operations
        if (h_db_check_vol_before.get("pending")
                or h_db_check_before.get("bricks").get("pending")):
            self._respin_heketi_pod()

        # Calculate heketi volume size
        free_space, nodenum = get_total_free_space(h_node, h_url)
        free_space_available = int(free_space / nodenum)
        if free_space_available > vol_count:
            h_volume_size = int(free_space_available / vol_count)
            if h_volume_size > 30:
                h_volume_size = 30
        else:
            h_volume_size, vol_count = 1, free_space_available

        # Get existing heketi volume list
        existing_volumes = heketi_volume_list(h_node, h_url, json=True)

        # Add cleanup function to clean stale volumes created during test
        self.addCleanup(self._cleanup_heketi_volumes,
                        existing_volumes.get("volumes"))

        # Create BHV in case blockvolume size is greater than default BHV size
        if vol_type:
            default_bhv_size = get_default_block_hosting_volume_size(
                h_node, self.heketi_dc_name)
            if default_bhv_size < h_volume_size:
                h_volume_name = "autotest-{}".format(utils.get_random_str())
                bhv_info = self.create_heketi_volume_with_name_and_wait(
                    h_volume_name,
                    free_space_available,
                    raise_on_cleanup_error=False,
                    block=True,
                    json=True)
                free_space_available -= (
                    int(bhv_info.get("blockinfo").get("reservedsize")) + 1)
                h_volume_size = int(free_space_available / vol_count)

        # Temporarily replace g.run with g.run_async in
        # heketi_blockvolume_create func to be able to run it in the
        # background. Also, avoid parsing the output as it won't be json at
        # that moment. Parse it after reading the async operation results.
        def run_async(cmd, hostname, raise_on_error=True):
            return g.run_async(host=hostname, command=cmd)

        for count in range(vol_count):
            with mock.patch.object(json, 'loads', side_effect=(lambda j: j)):
                with mock.patch.object(command,
                                       'cmd_run',
                                       side_effect=run_async):
                    h_vol_creation_async_op.append(
                        eval("heketi_{}volume_create".format(vol_type))(
                            h_node, h_url, h_volume_size, json=True))

        # Check for pending operations
        for w in waiter.Waiter(timeout=120, interval=10):
            h_db_check = heketi_db_check(h_node, h_url)
            h_db_check_vol = h_db_check.get("{}volumes".format(vol_type))
            if h_db_check_vol.get("pending"):
                h_db_check_bricks = h_db_check.get("bricks")
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "No any pending operations found during {}volumes creation "
                "{}".format(vol_type, h_db_check_vol.get("pending")))

        # Verify bricks pending operation during creation
        if not vol_type:
            self.assertTrue(h_db_check_bricks.get("pending"),
                            "Expecting at least one bricks pending count")
            self.assertFalse(
                h_db_check_bricks.get("pending") % 3,
                "Expecting bricks pending count to be multiple of 3 but "
                "found {}".format(h_db_check_bricks.get("pending")))

        # Wait for all counts of pending operations to be zero
        for w in waiter.Waiter(timeout=300, interval=10):
            h_db_check = heketi_db_check(h_node, h_url)
            h_db_check_vol = h_db_check.get("{}volumes".format(vol_type))
            if not h_db_check_vol.get("pending"):
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "Expecting no pending operations after 300 sec but "
                "found {} operation".format(h_db_check_vol.get("pending")))

        # Get heketi server DB details
        h_db_check_after = heketi_db_check(h_node, h_url)
        h_db_check_vol_after = (h_db_check_after.get(
            "{}volumes".format(vol_type)))
        h_db_check_bricks_after = h_db_check_after.get("bricks")

        # Verify if initial and final file/block volumes are same
        act_vol_count = h_db_check_vol_after.get("total")
        exp_vol_count = h_db_check_vol_before.get("total") + vol_count
        err_msg = (
            "Actual {} and expected {} {}volume counts are not matched".format(
                act_vol_count, exp_vol_count, vol_type))
        self.assertEqual(act_vol_count, exp_vol_count, err_msg)

        # Verify if initial and final bricks are same for file volume
        volumes = heketi_volume_list(h_node, h_url, json=True).get("volumes")
        new_volumes = list(set(volumes) - set(existing_volumes))
        exp_brick_count = 0
        for volume in new_volumes:
            vol_info = heketi_volume_info(h_node, h_url, volume, json=True)
            exp_brick_count += len(vol_info.get("bricks"))

        err_msg = "Actual {} and expected {} bricks counts are not matched"
        act_brick_count = h_db_check_bricks_after.get("total")
        self.assertEqual(act_brick_count, exp_brick_count,
                         err_msg.format(act_brick_count, exp_brick_count))
Example #24
    def test_verify_pending_entries_in_db(self):
        """Verify pending entries of volumes and bricks in db during
        volume creation from heketi side
        """
        h_volume_size = 100
        h_db_chk_bfr_v_creation = heketi_db_check(self.heketi_client_node,
                                                  self.heketi_server_url)

        if (h_db_chk_bfr_v_creation["bricks"]["pending"] != 0
                or h_db_chk_bfr_v_creation["volumes"]["pending"] != 0):
            self.skipTest(
                "Skip TC due to unexpected bricks/volumes pending operations")

        # Verify bricks and volume pending operation before creation
        self.assertEqual(h_db_chk_bfr_v_creation["bricks"]["pending"], 0)
        self.assertEqual(h_db_chk_bfr_v_creation["volumes"]["pending"], 0)

        # Temporarily replace g.run with g.run_async in heketi_volume_create
        # func to be able to run it in the background. Also, avoid parsing the
        # output as it won't be json at that moment. Parse it after reading
        # the async operation results.

        def run_async(cmd, hostname, raise_on_error=True):
            return g.run_async(host=hostname, command=cmd)

        with mock.patch.object(json, 'loads', side_effect=(lambda j: j)):
            with mock.patch.object(command, 'cmd_run', side_effect=run_async):
                h_vol_creation_async_op = heketi_volume_create(
                    self.heketi_client_node,
                    self.heketi_server_url,
                    h_volume_size,
                    json=True)

        for w in waiter.Waiter(timeout=5, interval=1):
            h_db_chk_during_v_creation = heketi_db_check(
                self.heketi_client_node, self.heketi_server_url)
            if h_db_chk_during_v_creation["bricks"]["pending"] != 0:
                break
        if w.expired:
            err_msg = "No pending operation in Heketi db"
            g.log.error(err_msg)
            raise exceptions.ExecutionError(err_msg)

        retcode, stdout, stderr = h_vol_creation_async_op.async_communicate()
        heketi_vol = json.loads(stdout)
        volume_id = heketi_vol["id"]
        self.addCleanup(heketi_volume_delete,
                        self.heketi_client_node,
                        self.heketi_server_url,
                        volume_id,
                        raise_on_error=True)

        # Verify volume pending operation during creation
        self.assertFalse(h_db_chk_during_v_creation["bricks"]["pending"] % 3)
        self.assertEqual(h_db_chk_bfr_v_creation["volumes"]["pending"] + 1,
                         h_db_chk_during_v_creation["volumes"]["pending"])

        h_db_chk_after_v_creation = heketi_db_check(self.heketi_client_node,
                                                    self.heketi_server_url)

        # Verify bricks and volume pending operation after creation
        self.assertEqual(h_db_chk_after_v_creation["bricks"]["pending"], 0)
        self.assertEqual(h_db_chk_after_v_creation["volumes"]["pending"], 0)
Example #25
    def test_heketi_manual_cleanup_operation_in_bhv(self):
        """Validate heketi db cleanup will resolve the mismatch
           in the free size of the block hosting volume with failed
           block device create operations.
        """
        bhv_size_before, bhv_size_after, vol_count = [], [], 5
        ocp_node, g_node = self.ocp_master_node[0], self.gluster_servers[0]
        h_node, h_url = self.heketi_client_node, self.heketi_server_url

        # Get existing heketi volume list
        existing_volumes = heketi_volume_list(h_node, h_url, json=True)

        # Add function to clean stale volumes created during test
        self.addCleanup(self._cleanup_heketi_volumes,
                        existing_volumes.get("volumes"))

        # Get nodes id list
        node_id_list = heketi_node_list(h_node, h_url)

        # Disable the 4th and subsequent nodes
        for node_id in node_id_list[3:]:
            heketi_node_disable(h_node, h_url, node_id)
            self.addCleanup(heketi_node_enable, h_node, h_url, node_id)

        # Calculate heketi volume size
        free_space, nodenum = get_total_free_space(h_node, h_url)
        free_space_available = int(free_space / nodenum)
        if free_space_available > vol_count:
            h_volume_size = int(free_space_available / vol_count)
            if h_volume_size > 50:
                h_volume_size = 50
        else:
            h_volume_size, vol_count = 1, free_space_available

        # Create BHV in case blockvolume size is greater than default BHV size
        default_bhv_size = get_default_block_hosting_volume_size(
            h_node, self.heketi_dc_name)
        if default_bhv_size < h_volume_size:
            h_volume_name = "autotest-{}".format(utils.get_random_str())
            bhv_info = self.create_heketi_volume_with_name_and_wait(
                h_volume_name,
                free_space_available,
                raise_on_cleanup_error=False,
                block=True,
                json=True)
            free_space_available -= (
                int(bhv_info.get("blockinfo").get("reservedsize")) + 1)
            h_volume_size = int(free_space_available / vol_count)

        # Get BHV list
        h_bhv_list = get_block_hosting_volume_list(h_node, h_url).keys()
        self.assertTrue(h_bhv_list, "Failed to get the BHV list")

        # Get BHV size
        for bhv in h_bhv_list:
            vol_info = heketi_volume_info(h_node, h_url, bhv, json=True)
            bhv_vol_size_before = vol_info.get("freesize")
            bhv_size_before.append(bhv_vol_size_before)

        # Kill Tcmu-runner service
        services = ("tcmu-runner", "gluster-block-target", "gluster-blockd")
        kill_service_on_gluster_pod_or_node(ocp_node, "tcmu-runner", g_node)

        # Add cleanups to restart the services and wait for their status
        for service in services:
            state = ('exited'
                     if service == 'gluster-block-target' else 'running')
            self.addCleanup(wait_for_service_status_on_gluster_pod_or_node,
                            ocp_node, service, 'active', state, g_node)
            self.addCleanup(restart_service_on_gluster_pod_or_node, ocp_node,
                            service, g_node)

        def run_async(cmd, hostname, raise_on_error=True):
            return g.run_async(host=hostname, command=cmd)

        # Create stale block volumes asynchronously
        for count in range(vol_count):
            with mock.patch.object(json, 'loads', side_effect=(lambda j: j)):
                with mock.patch.object(command,
                                       'cmd_run',
                                       side_effect=run_async):
                    heketi_blockvolume_create(h_node,
                                              h_url,
                                              h_volume_size,
                                              json=True)

        # Wait for pending operation to get generated
        self._check_for_pending_operations(h_node, h_url)

        # Restart the services
        for service in services:
            state = ('exited'
                     if service == 'gluster-block-target' else 'running')
            restart_service_on_gluster_pod_or_node(ocp_node, service, g_node)
            wait_for_service_status_on_gluster_pod_or_node(
                ocp_node, service, 'active', state, g_node)

        # Cleanup pending operation
        heketi_server_operation_cleanup(h_node, h_url)

        # wait for pending operation to get cleaned up
        for w in waiter.Waiter(timeout=120, interval=10):
            # Get BHV size afresh on every attempt
            bhv_size_after = []
            for bhv in h_bhv_list:
                vol_info = heketi_volume_info(h_node, h_url, bhv, json=True)
                bhv_vol_size_after = vol_info.get("freesize")
                bhv_size_after.append(bhv_vol_size_after)

            if (set(bhv_size_before) == set(bhv_size_after)):
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to validate BHV free size, expected: {},"
                " actual: {}".format(set(bhv_size_before),
                                     set(bhv_size_after)))
Exemple #26
    def test_pv_resize_when_heketi_down(self):
        """Create a PVC and try to expand it when heketi is down, It should
        fail. After heketi is up, expand PVC should work.
        """
        self.create_storage_class(allow_volume_expansion=True)
        pvc_name = self.create_and_wait_for_pvc()
        dc_name, pod_name = self.create_dc_with_pvc(pvc_name)

        pv_name = get_pv_name_from_pvc(self.node, pvc_name)
        custom = (r':metadata.annotations.'
                  r'"gluster\.kubernetes\.io\/heketi-volume-id"')
        vol_id = oc_get_custom_resource(self.node, 'pv', custom, pv_name)[0]

        h_vol_info = heketi_ops.heketi_volume_info(self.heketi_client_node,
                                                   self.heketi_server_url,
                                                   vol_id,
                                                   json=True)

        # Bring the heketi POD down
        scale_dc_pod_amount_and_wait(self.node,
                                     self.heketi_dc_name,
                                     pod_amount=0)
        self.addCleanup(scale_dc_pod_amount_and_wait,
                        self.node,
                        self.heketi_dc_name,
                        pod_amount=1)

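        # Write a ~600MiB file to the mounted volume while heketi is down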
        cmd = 'dd if=/dev/urandom of=/mnt/%s bs=614400k count=1'
        ret, out, err = oc_rsh(self.node, pod_name, cmd % 'file1')
        self.assertFalse(ret, 'Not able to write file with err: %s' % err)
        wait_for_pod_be_ready(self.node, pod_name, 10, 5)

        resize_pvc(self.node, pvc_name, 2)
        wait_for_events(self.node,
                        pvc_name,
                        obj_type='PersistentVolumeClaim',
                        event_type='Warning',
                        event_reason='VolumeResizeFailed')

        # Verify volume was not expanded
        vol_info = get_gluster_vol_info_by_pvc_name(self.node, pvc_name)
        self.assertEqual(vol_info['gluster_vol_id'], h_vol_info['name'])
        self.assertEqual(len(vol_info['bricks']['brick']),
                         len(h_vol_info['bricks']))

        # Bring the heketi POD up
        scale_dc_pod_amount_and_wait(self.node,
                                     self.heketi_dc_name,
                                     pod_amount=1)

        # Verify volume expansion
        verify_pvc_size(self.node, pvc_name, 2)
        vol_info = get_gluster_vol_info_by_pvc_name(self.node, pvc_name)
        self.assertFalse(len(vol_info['bricks']['brick']) % 3)
        self.assertLess(len(h_vol_info['bricks']),
                        len(vol_info['bricks']['brick']))

        # Wait for remount after expansion
        for w in waiter.Waiter(timeout=30, interval=5):
            ret, out, err = oc_rsh(self.node, pod_name,
                                   "df -Ph /mnt | awk '{print $2}' | tail -1")
            self.assertFalse(ret,
                             'Failed with err: %s and Output: %s' % (err, out))
            if out.strip() == '2.0G':
                break
        if w.expired:
            raise AssertionError(
                "Expanded size 2.0G was not reflected at /mnt within 30s")

        # Write data making sure we have more space than it was
        ret, out, err = oc_rsh(self.node, pod_name, cmd % 'file2')
        self.assertFalse(ret, 'Not able to write file with err: %s' % err)

        # Verify pod is running
        wait_for_pod_be_ready(self.node, pod_name, 10, 5)
Exemple #27
    def test_heketi_server_db_pending_entries_for_volume_operations(
            self, vol_type):
        """Verify pending entries of blockvolumes/volumes and bricks in db
        during heketi blockvolume/volume delete operation.
        """

        # Create a large volumes to observe the pending operation
        h_volume_size, volume_ids, async_obj = 95, [], []
        h_node, h_url = self.heketi_client_node, self.heketi_server_url

        h_db_check_before = heketi_db_check(h_node, h_url)
        h_db_check_bricks_before = h_db_check_before["bricks"]
        h_db_check_vol_before = h_db_check_before["{}volumes".format(vol_type)]

        # Check file/block volume pending operations before creation.
        if h_db_check_vol_before["pending"]:
            self.skipTest(
                "Skip TC due to unexpected {}volumes pending operations".
                format(vol_type))

        # Check bricks pending operations before creation.
        if h_db_check_bricks_before["pending"]:
            self.skipTest(
                "Skip TC due to unexpected bricks pending operations for"
                " {}volume".format(vol_type))

        # Create 5 file/block volumes to find out pending operations
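        # vol_type is '' for file volumes or 'block' for block volumes, so
        # eval() resolves to heketi_volume_create or heketi_blockvolume_create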
        for count in range(5):
            vol_info = eval("heketi_{}volume_create".format(vol_type))(
                h_node, h_url, h_volume_size, json=True)
            volume_ids.append(vol_info["id"])
            self.addCleanup(eval("heketi_{}volume_delete".format(vol_type)),
                            h_node,
                            h_url,
                            vol_info["id"],
                            raise_on_error=False)

            h_db_check_after = heketi_db_check(h_node, h_url)
            h_db_check_bricks_after = h_db_check_after["bricks"]
            h_db_check_vol_after = h_db_check_after["{}volumes".format(
                vol_type)]

            # Verify file/block volumes pending operation after creation
            err_msg = ("Expecting heketi db {}volume pending operation to be "
                       "0 but found {}")
            self.assertFalse(
                h_db_check_vol_after["pending"],
                err_msg.format(vol_type, h_db_check_vol_after["pending"]))

            # Verify bricks pending operation after volume creation
            err_msg = ("Expecting heketi db bricks pending operation to be "
                       "0 but found {} after {}volume creation")
            self.assertFalse(
                h_db_check_bricks_after["pending"],
                err_msg.format(h_db_check_bricks_after["pending"], vol_type))

        def run_async(cmd, hostname, raise_on_error=True):
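            # Keep a reference to every async handle so the background
            # processes can later be checked for a clean exit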
            async_op = g.run_async(host=hostname, command=cmd)
            async_obj.append(async_op)
            return async_op

        for vol_id in volume_ids:
            # Temporary replace g.run with g.async_run in heketi_volume_delete
            # and heketi_blockvolume_delete func to be able to run it in
            # background.
            with mock.patch.object(command, 'cmd_run', side_effect=run_async):
                eval("heketi_{}volume_delete".format(vol_type))(h_node, h_url,
                                                                vol_id)

        for w in waiter.Waiter(timeout=10, interval=1):
            h_db_check = heketi_db_check(h_node, h_url)
            h_db_check_bricks = h_db_check["bricks"]
            h_db_check_vol = h_db_check["{}volumes".format(vol_type)]

            if h_db_check_vol["pending"] != 0:
                break

        if w.expired:
            err_msg = ("Expected some pending operations found {} operation"
                       " for {}volume in Heketi db")
            g.log.error(err_msg.format(h_db_check_vol["pending"], vol_type))
            raise exceptions.ExecutionError(
                err_msg.format(h_db_check_vol["pending"], vol_type))

        # Verify pending operation during file/block volumes delete
        err_msg = ("Expecting pending operations for {}volume during"
                   " deletion")
        self.assertTrue(h_db_check_vol["pending"], err_msg.format(vol_type))

        # Verify brick pending operation during delete
        err_msg = ("Expecting bricks pending in multiple of 3 but found {}")
        if vol_type == '':
            self.assertFalse(h_db_check_bricks["pending"] % 3,
                             err_msg.format(h_db_check_bricks["pending"]))

        # Verify volume/blockvolume pending operation during delete
        for w in waiter.Waiter(timeout=100, interval=5):
            h_db_check_vol_after = heketi_db_check(h_node, h_url)
            h_db_check_bricks_after = h_db_check_vol_after["bricks"]
            h_db_check_vol_after = h_db_check_vol_after["{}volumes".format(
                vol_type)]

            # verify if file/block volumes and bricks are properly deleted
            if (((not vol_type) and (not h_db_check_bricks_after["pending"]))
                    or (not h_db_check_vol_after["pending"])):
                break
        if w.expired:
            err_msg = ("Failed to delete {}volumes after waiting for 100 secs")
            raise exceptions.AssertionError(err_msg.format(vol_type))

        # Check that all background processes got exited
        for obj in async_obj:
            ret, out, err = obj.async_communicate()
            self.assertFalse(ret, err)

        # Verify bricks pending operation after delete
        if vol_type == "":
            err_msg = ("Expecting 0 bricks pending operations after deletion"
                       " but found {} after {}volume deletion")
            self.assertFalse(
                h_db_check_bricks_after["pending"],
                err_msg.format(h_db_check_bricks_after["pending"], vol_type))

        # Verify volumes/blockvolumes pending operations after delete
        err_msg = ("Expecting 0 {}volume pending operations after deletion"
                   " but found {}")
        self.assertFalse(
            h_db_check_vol_after["pending"],
            err_msg.format(vol_type, h_db_check_vol_after["pending"]))

        # Verify that initial and final volumes/blockvolumes counts match
        err_msg = (
            "Total {}volumes before creation ({}) and after deletion ({}) do"
            " not match".format(
                vol_type, h_db_check_vol_before["total"],
                h_db_check_vol_after["total"]))
        self.assertEqual(h_db_check_vol_after["total"],
                         h_db_check_vol_before["total"], err_msg)

        # Verify that initial and final brick counts match
        err_msg = (
            "Total bricks before creation ({}) and after deletion ({}) do"
            " not match".format(
                h_db_check_bricks_before["total"],
                h_db_check_bricks_after["total"]))
        self.assertEqual(h_db_check_bricks_after["total"],
                         h_db_check_bricks_before["total"], err_msg)
    def test_heketi_prometheus_usedbytes_brickcount_on_device_delete(
            self, operation):
        """Validate used bytes,device count on heketi and prometheus"""
        h_node, h_server = self.heketi_client_node, self.heketi_server_url

        # Get list of additional devices for one of the Gluster nodes
        gluster_server_0 = list(self.gluster_servers_info.values())[0]
        manage_hostname = gluster_server_0.get("manage")
        self.assertTrue(
            manage_hostname, "IP Address is not specified for "
                             "node {}".format(gluster_server_0))
        device_name = gluster_server_0.get("additional_devices")[0]
        self.assertTrue(
            device_name, "Additional devices are not specified for "
                         "node {}".format(gluster_server_0))

        # Get node ID of the Gluster hostname
        node_list = heketi_ops.heketi_topology_info(
            h_node, h_server, json=True).get("clusters")[0].get("nodes")
        self.assertTrue(
            node_list, "Cluster info command returned empty list of nodes")
        node_id = [
            node.get("id")
            for node in node_list
            if manage_hostname == node.get("hostnames").get("manage")[0]]
        self.assertTrue(
            node_id, "Failed to get node_id for {}".format(manage_hostname))
        node_id = node_id[0]

        # Adding heketi device
        heketi_ops.heketi_device_add(h_node, h_server, device_name, node_id)
        node_info_after_addition = heketi_ops.heketi_node_info(
            h_node, h_server, node_id, json=True)
        device_id, bricks = None, None
        for device in node_info_after_addition.get("devices"):
            if device.get("name") == device_name:
                device_id, bricks = (
                    device.get("id"), len(device.get("bricks")))
                break

        # Verify zero bricks on the device
        msg = (
            "Number of bricks on the device {} of the node should be "
            "zero".format(device_name))
        self.assertFalse(bricks, msg)
        self.addCleanup(
            heketi_ops.heketi_device_delete, h_node, h_server, device_id,
            raise_on_error=False)
        self.addCleanup(
            heketi_ops.heketi_device_remove, h_node, h_server, device_id,
            raise_on_error=False)
        self.addCleanup(
            heketi_ops.heketi_device_disable, h_node, h_server, device_id,
            raise_on_error=False)

        # Disable,Remove and Delete heketi device
        heketi_ops.heketi_device_disable(h_node, h_server, device_id)
        heketi_ops.heketi_device_remove(h_node, h_server, device_id)
        heketi_ops.heketi_device_delete(h_node, h_server, device_id)

        # Verify device deletion
        node_info_after_deletion = (
            heketi_ops.heketi_node_info(h_node, h_server, node_id))
        msg = ("Device {} should not be shown in node info of the node {}"
               "after the device deletion".format(device_id, node_id))
        self.assertNotIn(device_id, node_info_after_deletion, msg)

        if operation == "usedbytes":
            # Validate heketi and prometheus device used bytes
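            # Prometheus scrapes heketi periodically, so poll until both
            # endpoints report the same used-bytes value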
            for w in waiter.Waiter(timeout=60, interval=10):
                device_used_bytes_prometheus = 0
                device_used_bytes_metrics = 0
                openshift_ops.switch_oc_project(
                    self.ocp_master_node[0], 'openshift-monitoring')
                metric_result = self._fetch_metric_from_promtheus_pod(
                    metric='heketi_device_used_bytes')
                for result in metric_result:
                    if (node_id == result.get('cluster')
                            and device_name == result.get('device')):
                        device_used_bytes_prometheus += (
                            int(result.get('value')[1]))
                openshift_ops.switch_oc_project(
                    self.ocp_master_node[0], 'glusterfs')
                metrics = heketi_ops.get_heketi_metrics(h_node, h_server)
                heketi_device_count_metric = (
                    metrics.get('heketi_device_used_bytes'))
                for result in heketi_device_count_metric:
                    if (node_id == result.get('cluster')
                            and device_name == result.get('device')):
                        device_used_bytes_metrics = int(result.get('value'))
                if device_used_bytes_prometheus == device_used_bytes_metrics:
                    break
            if w.expired:
                raise exceptions.ExecutionError(
                    "Failed to update device details in prometheus")

        elif operation == "brickcount":
            # Validate heketi and prometheus device brick count
            for w in waiter.Waiter(timeout=60, interval=10):
                device_brick_count_prometheus = 0
                device_brick_count_metrics = 0
                metrics = heketi_ops.get_heketi_metrics(h_node, h_server)
                heketi_device_count_metric = metrics.get(
                    'heketi_device_brick_count')
                for result in heketi_device_count_metric:
                    device_brick_count_metrics += int(result.get('value'))
                openshift_ops.switch_oc_project(
                    self.ocp_master_node[0], 'openshift-monitoring')
                metric_result = self._fetch_metric_from_promtheus_pod(
                    metric='heketi_device_brick_count')
                for result in metric_result:
                    device_brick_count_prometheus += (
                        int(result.get('value')[1]))
                if device_brick_count_prometheus == device_brick_count_metrics:
                    break
            if w.expired:
                raise exceptions.ExecutionError(
                    "Failed to update device details in prometheus")
def enable_pvc_resize(master_node):
    '''Enable the pv_resize feature.

    Edits the /etc/origin/master/master-config.yaml file on the master
    node and restarts the OpenShift master services.

    Args:
        master_node (str): hostname of the master node on which the
            master-config.yaml file should be edited.
    Returns:
        bool: True if successful, otherwise an exception is raised.
    '''
    version = get_openshift_version()
    if version < "3.9":
        msg = ("pv resize is not available in openshift "
               "version %s " % version)
        g.log.error(msg)
        raise NotSupportedException(msg)

    with tempfile.NamedTemporaryFile(delete=False) as temp:
        temp_filename = temp.name

    try:
        g.download(master_node, MASTER_CONFIG_FILEPATH, temp_filename)
    except Exception as e:
        err_msg = (
            "Failed to download '{}' from master node '{}' due to "
            "exception\n{}".format(
                MASTER_CONFIG_FILEPATH, master_node, six.text_type(e)))
        raise ExecutionError(err_msg)

    with open(temp_filename, 'r') as f:
        data = yaml.load(f, Loader=yaml.FullLoader)
        dict_add = data['admissionConfig']['pluginConfig']
        if "PersistentVolumeClaimResize" in dict_add:
            g.log.info("master-config.yaml file is already edited")
            return True
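        # Register the PersistentVolumeClaimResize admission plugin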
        dict_add['PersistentVolumeClaimResize'] = {
            'configuration': {
                'apiVersion': 'v1',
                'disable': 'false',
                'kind': 'DefaultAdmissionConfig'}}
        data['admissionConfig']['pluginConfig'] = dict_add
        kube_config = data['kubernetesMasterConfig']
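        # Enable the ExpandPersistentVolumes feature gate for both the API
        # server and the controller manager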
        for key in ('apiServerArguments', 'controllerArguments'):
            kube_config[key] = (
                kube_config.get(key)
                if isinstance(kube_config.get(key), dict) else {})
            value = ['ExpandPersistentVolumes=true']
            kube_config[key]['feature-gates'] = value

    with open(temp_filename, 'w+') as f:
        yaml.dump(data, f, default_flow_style=False)

    try:
        g.upload(master_node, temp_filename, MASTER_CONFIG_FILEPATH)
    except Exception as e:
        err_msg = (
            "Failed to upload '{}' to master node '{}' due to "
            "exception\n{}".format(
                MASTER_CONFIG_FILEPATH, master_node, six.text_type(e)))
        raise ExecutionError(err_msg)
    os.unlink(temp_filename)

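    # OCP 3.9 runs the master as systemd services; later releases restart
    # the static API and controller pods via the master-restart helper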
    if version == "3.9":
        cmd = ("systemctl restart atomic-openshift-master-api "
               "atomic-openshift-master-controllers")
    else:
        cmd = ("/usr/local/bin/master-restart api && "
               "/usr/local/bin/master-restart controllers")
    ret, out, err = g.run(master_node, cmd, "root")
    if ret != 0:
        err_msg = "Failed to execute cmd %s on %s\nout: %s\nerr: %s" % (
            cmd, master_node, out, err)
        g.log.error(err_msg)
        raise ExecutionError(err_msg)

    # Wait for API service to be ready after the restart
    for w in waiter.Waiter(timeout=120, interval=1):
        try:
            cmd_run("oc get nodes", master_node)
            return True
        except AssertionError:
            continue
    err_msg = "Exceeded 120s timeout waiting for OCP API to start responding."
    g.log.error(err_msg)
    raise ExecutionError(err_msg)
    def test_heketi_metrics_validation_with_node_reboot(self):
        """Validate heketi metrics after node reboot using prometheus"""

        initial_metrics, final_metrics = {}, {}

        # Use storage project
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)

        # Get initial metrics result
        h_node, h_server = self.heketi_client_node, self.heketi_server_url
        initial_metrics = tuple(
            heketi_ops.get_heketi_metrics(h_node, h_server).get(metric)[0]
            for metric in self.metrics)

        # Use prometheus project
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)

        # Get initial prometheus result
        initial_prometheus = self._get_and_manipulate_metric_data(
            self.metrics)

        # Get hosted node IP of heketi pod
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        heketi_pod = openshift_ops.get_pod_name_from_dc(
            self._master, self.heketi_dc_name)
        heketi_node = openshift_ops.oc_get_custom_resource(
            self._master, 'pod', '.:spec.nodeName', heketi_pod)[0]

        # Reboot the node on which heketi pod is scheduled
        self.addCleanup(
            self._check_heketi_and_gluster_pod_after_node_reboot, heketi_node)
        node_ops.node_reboot_by_command(heketi_node)

        # Wait node to become NotReady
        custom = r'":.status.conditions[?(@.type==\"Ready\")]".status'
        for w in waiter.Waiter(300, 10):
            status = openshift_ops.oc_get_custom_resource(
                self._master, 'node', custom, heketi_node)
            if status[0] == 'False':
                break
        if w.expired:
            raise exceptions.ExecutionError(
                "Failed to bring down node {}".format(heketi_node))

        # Wait for node to become ready
        openshift_ops.wait_for_ocp_node_be_ready(self._master, heketi_node)

        # Wait for heketi and glusterfs pod to become ready
        self._check_heketi_and_gluster_pod_after_node_reboot(heketi_node)

        # Use prometheus project
        openshift_ops.switch_oc_project(
            self._master, self._prometheus_project_name)

        # Get final metrics result
        final_metrics = tuple(
            heketi_ops.get_heketi_metrics(h_node, h_server).get(metric)[0]
            for metric in self.metrics)

        # Get final prometheus result
        final_prometheus = self._get_and_manipulate_metric_data(
            self.metrics)

        err_msg = "Initial value {} is not same as final value {}"
        self.assertEqual(
            initial_metrics, final_metrics, err_msg.format(
                initial_metrics, final_metrics))
        self.assertEqual(
            initial_prometheus, final_prometheus, err_msg.format(
                initial_prometheus, final_prometheus))