def test_volume_status_xml(self):

        # create a two node cluster
        ret = peer_probe_servers(self.servers[0], self.servers[1])
        self.assertTrue(
            ret,
            "Peer probe failed to %s from %s" % (self.servers[1],
                                                 self.mnode))

        # create a distributed volume with single node
        number_of_bricks = 1
        servers_info_from_single_node = {}
        servers_info_from_single_node[self.servers[0]] = self.all_servers_info[
            self.servers[0]]

        bricks_list = form_bricks_list(self.mnode, self.volname,
                                       number_of_bricks, self.servers[0],
                                       servers_info_from_single_node)
        ret, _, _ = volume_create(self.servers[0], self.volname, bricks_list)
        self.assertEqual(ret, 0, "Volume creation failed")
        g.log.info("Volume %s created successfully", self.volname)

        # Get volume status
        ret, _, err = volume_status(self.servers[1], self.volname)
        self.assertNotEqual(ret, 0, ("Unexpected: volume status is success for"
                                     " %s, even though volume is not started "
                                     "yet" % self.volname))
        self.assertIn("is not started", err, ("volume status exited with"
                                              " incorrect error message"))

        # Get volume status with --xml
        vol_status = get_volume_status(self.servers[1], self.volname)
        self.assertIsNone(vol_status, ("Unexpected: volume status --xml for %s"
                                       " is success even though the volume is"
                                       " not stared yet" % self.volname))

        # start the volume
        ret, _, _ = volume_start(self.servers[1], self.volname)
        self.assertEqual(ret, 0, "Failed to start volume %s" % self.volname)

        # Get volume status
        ret, _, _ = volume_status(self.servers[1], self.volname)
        self.assertEqual(ret, 0,
                         ("Failed to get volume status for %s" % self.volname))

        # Get volume status with --xml
        vol_status = get_volume_status(self.servers[1], self.volname)
        self.assertIsNotNone(vol_status,
                             ("Failed to get volume "
                              "status --xml for %s" % self.volname))

        # Verify there are no crashes while executing gluster volume status
        status = True
        glusterd_log = (self._get_test_specific_glusterd_log(
            self.mnode).split("\n"))
        for line in glusterd_log:
            if ' E ' in line:
                status = False
                g.log.error("Unexpected! Error found: %s", line)

        self.assertTrue(status, "Error found in glusterd logs")
Example #3
    def get_brick_and_volume_status(self, volume_name):
        """Status of each brick in a volume for background validation."""

        volume_info = volume_ops.get_volume_info(
            'auto_get_gluster_endpoint', volume_name)
        self.assertIsNotNone(
            volume_info, "'%s' volume info is empty" % volume_name)

        volume_status = volume_ops.get_volume_status(
            'auto_get_gluster_endpoint', volume_name)
        self.assertIsNotNone(
            volume_status, "'%s' volume status is empty" % volume_name)

        self.assertEqual(int(volume_info[volume_name]["status"]), 1,
                         "Volume not up")

        brick_info = []
        for brick_details in volume_info[volume_name]["bricks"]["brick"]:
            brick_info.append(brick_details["name"])
        self.assertTrue(
            brick_info, "Brick details are empty for %s" % volume_name)

        for brick in brick_info:
            brick_data = brick.strip().split(":")
            brick_ip = brick_data[0]
            brick_name = brick_data[1]
            self.assertEqual(int(volume_status[volume_name][brick_ip]
                             [brick_name]["status"]), 1,
                             "Brick %s not up" % brick_name)
Example #4
def are_bricks_online(mnode, volname, bricks_list):
    """Verify all the specified list of bricks are online.

    Args:
        mnode (str): Node on which commands will be executed.
        volname (str): Name of the volume.
        bricks_list (list): List of bricks to verify online status.

    Returns:
        bool : True if all bricks online. False otherwise.
        NoneType: None on failure in getting volume status
    """
    _rc = True
    offline_bricks_list = []
    volume_status = get_volume_status(mnode, volname)
    if not volume_status:
        g.log.error("Unable to check if bricks are online for the volume %s",
                    volname)
        return None
    for brick in bricks_list:
        brick_node, brick_path = brick.split(":")
        status = int(volume_status[volname][brick_node][brick_path]['status'])
        if status != 1:
            g.log.error("BRICK : %s is not online", brick)
            offline_bricks_list.append(brick)
            _rc = False

    if not _rc:
        g.log.error("Some of the bricks %s are not online",
                    offline_bricks_list)
        return False

    g.log.info("All the bricks %s are online", bricks_list)
    return True
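
A minimal usage sketch for the helper above, assuming the definition and the glusto `g` object are in scope; the node, volume, and brick values are placeholders, not names taken from the source:

    # Hedged usage example -- "gfs-node1"/"testvol" are hypothetical names.
    bricks = ["gfs-node1:/bricks/brick0/testvol",
              "gfs-node2:/bricks/brick1/testvol"]
    ret = are_bricks_online("gfs-node1", "testvol", bricks)
    if ret is None:
        g.log.error("Could not fetch volume status for testvol")
    elif not ret:
        g.log.error("One or more bricks of testvol are offline")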
Example #5
def get_online_bricks_list(mnode, volname):
    """Get list of bricks which are online.

    Args:
        mnode (str): Node on which commands will be executed.
        volname (str): Name of the volume.

    Returns:
        list : List of bricks in the volume which are online.
        NoneType: None on failure in getting volume status
    """
    online_bricks_list = []
    volume_status = get_volume_status(mnode, volname)
    if not volume_status:
        g.log.error("Unable to get online bricks_list for the volume %s",
                    volname)
        return None

    bricks_list = get_all_bricks(mnode, volname)
    for brick in bricks_list:
        brick_node, brick_path = brick.split(":")
        try:
            status = int(
                volume_status[volname][brick_node][brick_path]['status'])
        except KeyError:
            continue
        if status == 1:
            online_bricks_list.append(brick)

    return online_bricks_list
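
A short usage sketch, assuming the function above is in scope; the host and volume names are placeholders:

    # Hedged usage example -- handles the None-on-failure return.
    online = get_online_bricks_list("gfs-node1", "testvol")
    if online is None:
        g.log.error("Unable to get volume status for testvol")
    else:
        g.log.info("%d brick(s) online: %s", len(online), online)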
Example #6
def is_snapd_running(mnode, volname):
    """Checks if snapd is running on the given node

        Args:
            mnode (str): Node on which cmd has to be executed.
            volname (str): volume name

        Returns:
            True on success, False otherwise

        Example:
            is_snapd_running("abc.com", "testvol")
            """
    vol_status = get_volume_status(mnode, volname=volname)

    if vol_status is None:
        g.log.error("Failed to get volume status in is_snapd_running()")
        return False

    if 'Snapshot Daemon' not in vol_status[volname][mnode]:
        g.log.error("uss is not enabled in volume %s" % volname)
        return False

    snapd_status = vol_status[volname][mnode]['Snapshot Daemon']['status']
    if snapd_status != '1':
        g.log.error("Snapshot Daemon is not running in node %s" % mnode)
        return False
    return True
Example #8
def is_shd_daemon_running(mnode, node, volname):
    """
    Verifies whether the shd daemon is up and running on a particular node by
    checking the existence of shd pid and parsing the get volume status output.

    Args:
        mnode (str): The first node in servers list
        node (str): The node to be checked for whether the glustershd
                    process is up or not
        volname (str): Name of the volume created

    Returns:
        boolean: True if shd is running on the node, False, otherwise
    """

    # Get glustershd pid from node.
    ret, glustershd_pids = get_self_heal_daemon_pid(node)
    if not ret and glustershd_pids[node] != -1:
        return False
    # Verifying glustershd process is no longer running from get status.
    vol_status = get_volume_status(mnode, volname)
    if vol_status is None:
        return False
    try:
        _ = vol_status[volname][node]['Self-heal Daemon']
        return True
    except KeyError:
        return False
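
A brief usage sketch, assuming the function above is in scope; the server and volume names are placeholders:

    # Hedged usage example -- check shd on "gfs-node2" via "gfs-node1".
    if not is_shd_daemon_running("gfs-node1", "gfs-node2", "testvol"):
        g.log.error("glustershd is not running on gfs-node2")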
Example #9
    def is_daemon_process_running(self):
        """
            function for checking daemon process.
        """
        vol_status_shd_pid_list = []
        vol_status_quotad_pid_list = []
        g.log.info(
            "Total self-heal and quota daemon process should be %d for "
            "%d nodes",
            len(self.servers) * 2, len(self.servers))

        # Getting vol status in dictionary format
        vol_status = get_volume_status(self.mnode, self.volname)

        # Getting self-heal daemon and quota daemon pids of every host from
        # gluster volume status
        for server in self.servers:
            vol_status_quotad_pid_list.append(
                vol_status[self.volname][server]['Quota Daemon']['pid'])
            vol_status_shd_pid_list.append(
                vol_status[self.volname][server]['Self-heal Daemon']['pid'])

        g.log.info("shd list from get volume status: %s",
                   vol_status_shd_pid_list)
        g.log.info("quotad list from get volume status: %s",
                   vol_status_quotad_pid_list)

        sh_daemon_list = []
        quotad_list = []

        # Collect every host's self-heal daemon PIDs into sh_daemon_list and
        # quota daemon PIDs into quotad_list using the ps command
        for daemon_name, daemon_list in (('glustershd', sh_daemon_list),
                                         ('quotad', quotad_list)):
            for host in self.servers:
                cmd = "ps -eaf |grep %s |grep -v grep | awk '{ print $2 }'" % (
                    daemon_name)
                ret, out, err = g.run(host, cmd)
                err_msg = ("Failed to find '%s' daemon on the '%s' host using "
                           "'ps -eaf' command.\nret: %s\nout: %s\nerr: %s" %
                           (daemon_name, host, ret, out, err))
                self.assertEqual(ret, 0, err_msg)
                daemon_list.append(out.strip())

        g.log.info("shd list :%s", sh_daemon_list)
        g.log.info("quotad list :%s", quotad_list)

        # Check that the quota and self-heal daemons are running on all
        # hosts by comparing the PID lists from ps with the PID lists from
        # 'gluster volume status'; the two sets should match exactly
        if sorted(sh_daemon_list +
                  quotad_list) == sorted(vol_status_shd_pid_list +
                                         vol_status_quotad_pid_list):
            return len(sh_daemon_list + quotad_list) == len(self.servers) * 2

        return False
Example #10
    def test_gluster_volume_status_xml_dump(self):
        """
        Steps:
        1. Stop one of the volumes
            (i.e) gluster volume stop <vol-name>
        2. Get the status of the volumes with --xml dump
            XML dump should be consistent
        """
        ret, _, _ = volume_stop(self.mnode, volname=self.volname_2,
                                force=True)
        self.assertEqual(
            ret, 0, "Failed to stop volume '{}'".format(self.volname_2))
        out = get_volume_status(self.mnode)
        self.assertIsNotNone(
            out, "Failed to get volume status on {}".format(self.mnode))
        for _ in range(4):
            sleep(2)
            out1 = get_volume_status(self.mnode)
            self.assertIsNotNone(
                out1, "Failed to get volume status on {}".format(self.mnode))
            self.assertEqual(out1, out)
Example #11
def are_all_self_heal_daemons_are_online(mnode, volname):
    """Verifies whether all the self-heal-daemons are online for the specified
        volume.

    Args:
        mnode (str): Node on which cmd has to be executed.
        volname (str): volume name

    Returns:
        bool : True if all the self-heal-daemons are online for the volume.
            False otherwise.
        NoneType: None if unable to get the volume status
    """
    from glustolibs.gluster.volume_libs import is_distribute_volume
    if is_distribute_volume(mnode, volname):
        g.log.info(
            "Volume %s is a distribute volume. "
            "Hence not checking for self-heal daemons "
            "to be online", volname)
        return True

    service = 'shd'
    failure_msg = ("Verifying all self-heal-daemons are online failed for "
                   "volume %s" % volname)
    # Get volume status
    vol_status = get_volume_status(mnode=mnode,
                                   volname=volname,
                                   service=service)
    if vol_status is None:
        g.log.error(failure_msg)
        return None

    # Get all nodes from pool list
    from glustolibs.gluster.peer_ops import nodes_from_pool_list
    all_nodes = nodes_from_pool_list(mnode)
    if not all_nodes:
        g.log.error(failure_msg)
        return False

    online_status = True
    for node in all_nodes:
        node_shd_status_value = (
            vol_status[volname][node]['Self-heal Daemon']['status'])
        if node_shd_status_value != '1':
            online_status = False
    g.run(mnode, ("gluster volume status %s shd" % volname))
    if online_status is True:
        g.log.info("All self-heal Daemons are online")
        return True
    else:
        g.log.error("Some of the self-heal Daemons are offline")
        return False
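
A minimal usage sketch showing how the tri-state return (True/False/None) of the helper above might be handled; the names are placeholders:

    # Hedged usage example.
    ret = are_all_self_heal_daemons_are_online("gfs-node1", "testvol")
    if ret is None:
        g.log.error("Unable to fetch shd status for testvol")
    elif not ret:
        g.log.error("Some self-heal daemons of testvol are offline")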
def get_gluster_vol_status(file_vol):
    """Get Gluster vol status.

    Args:
        file_vol (str): file volume name.
    """
    # Get Gluster vol info
    gluster_volume_status = get_volume_status(
        "auto_get_gluster_endpoint", file_vol)
    if not gluster_volume_status:
        raise AssertionError("Failed to get volume status for gluster "
                             "volume '%s'" % file_vol)
    if file_vol in gluster_volume_status:
        gluster_volume_status = gluster_volume_status.get(file_vol)
    return gluster_volume_status
def get_gluster_vol_status(file_vol, is_detail=False):
    """Get Gluster vol status.

    Args:
        file_vol (str): file volume name.
        is_detail (bool): True for detailed output else False
    """
    # Get Gluster vol info
    options = 'detail' if is_detail else ''
    gluster_volume_status = get_volume_status("auto_get_gluster_endpoint",
                                              file_vol,
                                              options=options)
    if not gluster_volume_status:
        raise AssertionError("Failed to get volume status for gluster "
                             "volume '%s'" % file_vol)
    if file_vol in gluster_volume_status:
        gluster_volume_status = gluster_volume_status.get(file_vol)
    return gluster_volume_status
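
A short usage sketch for the helper above; "myvol" is a placeholder volume name, and the nested dict layout (node -> brick/process -> details) follows the get_volume_status output used throughout these examples:

    # Hedged usage example.
    status = get_gluster_vol_status("myvol", is_detail=True)
    for node, processes in status.items():
        g.log.info("Node %s runs: %s", node, list(processes.keys()))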
def restart_block_hosting_volume(gluster_pod,
                                 block_hosting_vol,
                                 sleep_time=120,
                                 hostname=None):
    """restars block hosting volume service

    Args:
        hostname (str): hostname on which gluster pod exists
        gluster_pod (podcmd | str): gluster pod class object has gluster
                                    pod and ocp master node or gluster
                                    pod name
        block_hosting_vol (str): name of block hosting volume
    """
    gluster_pod = _get_gluster_pod(gluster_pod, hostname)

    gluster_volume_status = get_volume_status(gluster_pod, block_hosting_vol)
    if not gluster_volume_status:
        raise AssertionError("failed to get gluster volume status")

    g.log.info("Gluster volume %s status\n%s : " %
               (block_hosting_vol, gluster_volume_status))

    ret, out, err = volume_stop(gluster_pod, block_hosting_vol)
    if ret != 0:
        err_msg = "failed to stop gluster volume %s on pod %s error: %s" % (
            block_hosting_vol, gluster_pod, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    # Explicit wait to stop ios and pvc creation for 2 mins
    time.sleep(sleep_time)
    ret, out, err = volume_start(gluster_pod, block_hosting_vol, force=True)
    if ret != 0:
        err_msg = "failed to start gluster volume %s on pod %s error: %s" % (
            block_hosting_vol, gluster_pod, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    ret, out, err = volume_status(gluster_pod, block_hosting_vol)
    if ret != 0:
        err_msg = ("failed to get status for gluster volume %s on pod %s "
                   "error: %s" % (block_hosting_vol, gluster_pod, err))
        g.log.error(err_msg)
        raise AssertionError(err_msg)
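
A hypothetical invocation of the helper above; the pod, node, and volume names are placeholders:

    # Hedged usage example -- restart with a shorter 60s settle time.
    restart_block_hosting_volume(
        "glusterfs-pod-0", "block_hosting_volume_1",
        sleep_time=60, hostname="ocp-master.example.com")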
def check_for_cpu_usage_spikes_on_glusterfsd(nodes, test_name, threshold=3):
    """Check for CPU usage spikes in glusterfsd

    Args:
     nodes(list): Servers on which memory leaks have to be checked
     test_name(str): Name of testcase for which memory leaks has to be checked

    Kwargs:
     threshold(int): Accepted amount of instances of 100% CPU usage
                    (Default:3)

    Returns:
      bool: True if CPU spikes are more than threshold else False

    NOTE:
     This function should be executed with the volumes present on the cluster.
    """
    is_there_a_spike = []
    for node in nodes:
        # Get the volume status on the node
        volume_status = get_volume_status(node)
        dataframe = create_dataframe_from_csv(node, 'glusterfsd', test_name)
        if dataframe.empty:
            return False

        for volume in volume_status.keys():
            for process in volume_status[volume][node].keys():
                # Skipping if process isn't a brick process
                if process in ('Self-heal Daemon', 'Quota Daemon'):
                    continue

                # Call function to check for cpu spikes
                cpu_spikes = _check_for_cpu_usage_spikes(
                    dataframe, node, 'glusterfsd', threshold, volume_status,
                    volume, process)
                if cpu_spikes:
                    g.log.error(
                        "CPU usage spikes observed more than "
                        "threshold %d on node %s on volume %s for "
                        "brick process %s", threshold, node, volume, process)
                is_there_a_spike.append(cpu_spikes)

    return any(is_there_a_spike)
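
A minimal usage sketch, assuming the function above is in scope; the server list and test name are placeholders:

    # Hedged usage example -- allow up to 5 instances of 100% CPU usage.
    if check_for_cpu_usage_spikes_on_glusterfsd(
            ["gfs-node1", "gfs-node2"], "test_volume_io", threshold=5):
        g.log.error("CPU usage spikes observed in glusterfsd")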
def check_for_memory_leaks_in_glusterfsd(nodes, test_name, gain=30.0):
    """Check for memory leaks in glusterfsd

    Args:
     nodes(list): Servers on which memory leaks have to be checked
     test_name(str): Name of testcase for which memory leaks has to be checked

    Kwargs:
     gain(float): Accepted amount of leak for a given testcase in MB
                  (Default:30)

    Returns:
      bool: True if a memory leak was observed else False

    NOTE:
     This function should be executed with the volumes present on the cluster.
    """
    is_there_a_leak = []
    for node in nodes:
        # Get the volume status on the node
        volume_status = get_volume_status(node)
        dataframe = create_dataframe_from_csv(node, 'glusterfsd', test_name)
        if dataframe.empty:
            return False

        for volume in volume_status.keys():
            for process in volume_status[volume][node].keys():
                # Skipping if process isn't a brick process
                if not process.count('/'):
                    continue

                # Call 3 point check function
                three_point_check = _perform_three_point_check_for_memory_leak(
                    dataframe, node, 'glusterfsd', gain, volume_status, volume,
                    process)
                if three_point_check:
                    g.log.error(
                        "Memory leak observed on node %s in brick "
                        " process for brick %s on volume %s", node, process,
                        volume)
                is_there_a_leak.append(three_point_check)

    return any(is_there_a_leak)
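
A minimal usage sketch, assuming the function above is in scope; the server list and test name are placeholders:

    # Hedged usage example -- accept up to 50 MB of growth per brick process.
    if check_for_memory_leaks_in_glusterfsd(
            ["gfs-node1", "gfs-node2"], "test_volume_io", gain=50.0):
        g.log.error("Memory leak observed in glusterfsd")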
Example #18
def check_brick_pid_matches_glusterfsd_pid(mnode, volname):
    # pylint: disable=invalid-name
    """Checks for brick process(es) both volume status
       and 'ps -eaf | grep glusterfsd' matches for
       the given volume

    Args:
        mnode (str): Node on which cmd has to be executed.
        volname (str): Name of the volume.

    Returns:
        bool : True if pid's matches. False otherwise.
    """
    from glustolibs.gluster.brick_libs import get_all_bricks
    _rc = True
    bricks_list = get_all_bricks(mnode, volname)
    for brick in bricks_list:
        brick_node, brick_path = brick.split(":")
        ret = get_volume_status(mnode, volname)
        brick_pid = ret[volname][brick_node][brick_path]["pid"]
        if brick_pid == "None":
            g.log.error(
                "Failed to get brick pid on node %s "
                "of brick path %s", brick_node, brick_path)
            _rc = False

        cmd = "pgrep -x glusterfsd"
        ret, pid, _ = g.run(brick_node, cmd)
        if ret != 0:
            g.log.error("Failed to run the command %s on "
                        "node %s", cmd, brick_node)
            _rc = False

        else:
            glusterfsd_pid = pid.split('\n')[:-1]

        if brick_pid not in glusterfsd_pid:
            g.log.error(
                "Brick pid %s doesn't match glusterfsd "
                "pid %s of the node %s", brick_pid, glusterfsd_pid, brick_node)
            _rc = False

    return _rc
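
A brief usage sketch; the node and volume names are placeholders:

    # Hedged usage example.
    if not check_brick_pid_matches_glusterfsd_pid("gfs-node1", "testvol"):
        g.log.error("Brick pid(s) do not match glusterfsd pid(s) for testvol")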
Example #19
def is_heal_enabled(mnode, volname):
    """Check if heal is enabled for a volume.

    Args:
        mnode : Node on which commands are executed
        volname : Name of the volume

    Returns:
        bool : True if heal is enabled on volume. False otherwise.
        NoneType: None if unable to get the volume status.
    """
    enabled = True
    vol_status_dict = get_volume_status(mnode, volname, service='shd')
    if vol_status_dict is None:
        g.log.error("Failed to check if heal is enabled on volume %s or not" %
                    volname)
        return None
    for node in vol_status_dict[volname].keys():
        if 'Self-heal Daemon' not in vol_status_dict[volname][node]:
            enabled = False
    return enabled
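
A minimal usage sketch handling the tri-state return of the helper above; the names are placeholders:

    # Hedged usage example.
    ret = is_heal_enabled("gfs-node1", "testvol")
    if ret is None:
        g.log.error("Unable to get volume status for testvol")
    elif not ret:
        g.log.info("Heal is disabled on testvol")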
def restart_file_volume(file_vol, sleep_time=120):
    """Restart file volume (stop and start volume).

    Args:
        file_vol (str): name of a file volume
    """
    gluster_volume_status = get_volume_status(
        "auto_get_gluster_endpoint", file_vol)
    if not gluster_volume_status:
        raise AssertionError("failed to get gluster volume status")

    g.log.info("Gluster volume %s status\n%s : " % (
        file_vol, gluster_volume_status)
    )

    ret, out, err = volume_stop("auto_get_gluster_endpoint", file_vol)
    if ret != 0:
        err_msg = "Failed to stop gluster volume %s. error: %s" % (
            file_vol, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    # Explicit wait to stop ios and pvc creation for 2 mins
    time.sleep(sleep_time)

    ret, out, err = volume_start(
        "auto_get_gluster_endpoint", file_vol, force=True)
    if ret != 0:
        err_msg = "failed to start gluster volume %s error: %s" % (
            file_vol, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    ret, out, err = volume_status("auto_get_gluster_endpoint", file_vol)
    if ret != 0:
        err_msg = ("Failed to get status for gluster volume %s error: %s" % (
            file_vol, err))
        g.log.error(err_msg)
        raise AssertionError(err_msg)
def get_brick_pids(gluster_pod, block_hosting_vol, hostname=None):
    """gets brick pids from gluster pods

    Args:
        hostname (str): hostname on which gluster pod exists
        gluster_pod (podcmd | str): gluster pod class object has gluster
                                    pod and ocp master node or gluster
                                    pod name
        block_hosting_vol (str): Block hosting volume id
    """
    gluster_pod = _get_gluster_pod(gluster_pod, hostname)

    gluster_volume_status = get_volume_status(gluster_pod, block_hosting_vol)
    if not gluster_volume_status:
        raise AssertionError("failed to get volume status for gluster "
                             "volume '%s' on pod '%s'" %
                             (gluster_pod, block_hosting_vol))

    gluster_volume_status = gluster_volume_status.get(block_hosting_vol)
    assert gluster_volume_status, ("gluster volume %s not present" %
                                   (block_hosting_vol))

    pids = {}
    for parent_key, parent_val in gluster_volume_status.items():
        for child_key, child_val in parent_val.items():
            if not child_key.startswith("/var"):
                continue

            pid = child_val["pid"]
            # When a brick is down, its pid is returned as -1.
            # That is an unexpected situation, hence raising an error.
            if pid == "-1":
                raise AssertionError("Something went wrong brick pid is -1")

            pids[parent_key] = pid

    return pids
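
A hypothetical invocation of the helper above; the pod and volume names are placeholders:

    # Hedged usage example -- map brick host to brick process pid.
    pids = get_brick_pids("glusterfs-pod-0", "block_hosting_volume_1",
                          hostname="ocp-master.example.com")
    for host, pid in pids.items():
        g.log.info("Host %s brick pid: %s", host, pid)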
Example #23
    def test_replace_brick_quorum(self):
        '''
        -> Create volume
        -> Set quorum type
        -> Set quorum ratio to 95%
        -> Start the volume
        -> Stop the glusterd on one node
        -> Now quorum is in not met condition
        -> Check all bricks went to offline or not
        -> Perform replace brick operation
        -> Start glusterd on same node which is already stopped
        -> Check all bricks are in online or not
        -> Verify in vol info that old brick not replaced with new brick
        '''

        # Forming brick list, 6 bricks for creating volume, 7th brick for
        # performing replace brick operation
        brick_list = form_bricks_list(self.mnode, self.volname, 7,
                                      self.servers, self.all_servers_info)

        # Create Volume
        ret, _, _ = volume_create(self.mnode,
                                  self.volname,
                                  brick_list[0:6],
                                  replica_count=3)
        self.assertEqual(ret, 0, "Failed to create volume %s" % self.volname)
        g.log.info("Volume created successfully %s", self.volname)

        # Enabling server quorum
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.server-quorum-type': 'server'})
        self.assertTrue(
            ret, "Failed to set server quorum on volume %s" % self.volname)
        g.log.info("Able to set server quorum successfully on volume %s",
                   self.volname)

        # Setting Quorum ratio in percentage
        ret = set_volume_options(self.mnode, 'all',
                                 {'cluster.server-quorum-ratio': '95%'})
        self.assertTrue(
            ret, "Failed to set server quorum ratio on %s" % self.servers)
        g.log.info("Able to set server quorum ratio successfully on %s",
                   self.servers)

        # Start the volume
        ret, _, _ = volume_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start volume %s" % self.volname)
        g.log.info("Volume started successfully %s", self.volname)

        # Stop glusterd on one of the node
        random_server = random.choice(self.servers[1:])
        ret = stop_glusterd(random_server)
        self.assertTrue(ret, "Failed to stop glusterd for %s" % random_server)
        g.log.info("Glusterd stopped successfully on server %s", random_server)

        # Checking whether glusterd is running or not
        ret = is_glusterd_running(random_server)
        self.assertEqual(
            ret, 1, "Glusterd is still running on the node %s "
            "where glusterd stopped" % random_server)
        g.log.info("Glusterd is not running on the server %s", random_server)

        # Verifying node count in volume status after glusterd stopped
        # on one of the servers; it's not possible to check the brick status
        # immediately in volume status after glusterd stops
        count = 0
        while count < 100:
            vol_status = get_volume_status(self.mnode, self.volname)
            servers_count = len(vol_status[self.volname].keys())
            if servers_count == 5:
                break
            sleep(2)
            count += 1

        # creating brick list from volume status
        offline_bricks = []
        vol_status = get_volume_status(self.mnode, self.volname)
        for node in vol_status[self.volname]:
            for brick_path in vol_status[self.volname][node]:
                if brick_path != 'Self-heal Daemon':
                    offline_bricks.append(':'.join([node, brick_path]))

        # Checking bricks are offline or not with quorum ratio(95%)
        ret = are_bricks_offline(self.mnode, self.volname, offline_bricks)
        self.assertTrue(
            ret, "Bricks are online when quorum is in not met "
            "condition for %s" % self.volname)
        g.log.info(
            "Bricks are offline when quorum is in not met "
            "condition for %s", self.volname)

        # Getting random brick from offline brick list
        self.random_brick = random.choice(offline_bricks)

        # Performing replace brick commit force when quorum not met
        self.replace_brick_failed = False
        ret, _, _ = replace_brick(self.mnode, self.volname, self.random_brick,
                                  brick_list[6])
        self.assertNotEqual(
            ret, 0, "Replace brick should fail when quorum is "
            "in not met condition but replace brick "
            "success on %s" % self.volname)
        g.log.info(
            "Failed to replace brick when quorum is in not met "
            "condition %s", self.volname)
        self.replace_brick_failed = True

        # Start glusterd on one of the node
        ret = start_glusterd(random_server)
        self.assertTrue(
            ret, "Failed to start glusterd on server %s" % random_server)
        g.log.info("Glusterd started successfully on server %s", random_server)

        # Verifying node count in volume status after glusterd started
        # on one of the servers; it's not possible to check the brick status
        # immediately in volume status after glusterd starts
        count = 0
        while count < 100:
            vol_status = get_volume_status(self.mnode, self.volname)
            servers_count = len(vol_status[self.volname].keys())
            if servers_count == 6:
                break
            sleep(2)
            count += 1

        # Checking bricks are online or not
        count = 0
        while count < 100:
            ret = are_bricks_online(self.mnode, self.volname, brick_list[0:6])
            if ret:
                break
            sleep(2)
            count += 1
        self.assertTrue(ret, "All bricks are not online for %s" % self.volname)
        g.log.info("All bricks are online for volume %s", self.volname)

        # Comparing brick lists of before and after performing replace brick
        # operation
        after_brick_list = get_all_bricks(self.mnode, self.volname)
        self.assertListEqual(
            after_brick_list, brick_list[0:6],
            "Bricks are not same before and after performing "
            "replace brick operation for volume %s" % self.volname)
        g.log.info(
            "Bricks are same before and after performing replace "
            "brick operation for volume %s", self.volname)
Example #24
    def test_create_heketi_volume(self):
        """Test heketi volume creation and background gluster validation"""

        hosts = []
        gluster_servers = []
        brick_info = []

        output_dict = heketi_ops.heketi_volume_create(self.heketi_client_node,
                                                      self.heketi_server_url,
                                                      10,
                                                      json=True)

        self.assertNotEqual(output_dict, False, "Volume could not be created")

        volume_name = output_dict["name"]
        volume_id = output_dict["id"]

        self.addCleanup(heketi_ops.heketi_volume_delete,
                        self.heketi_client_node, self.heketi_server_url,
                        volume_id)

        self.assertEqual(output_dict["durability"]["replicate"]["replica"], 3,
                         "Volume %s is not replica 3" % volume_id)

        self.assertEqual(output_dict["size"], 10,
                         "Volume %s is not of intended size" % volume_id)

        mount_node = (
            output_dict["mount"]["glusterfs"]["device"].strip().split(":")[0])
        hosts.append(mount_node)

        for backup_volfile_server in (
                output_dict["mount"]["glusterfs"]["options"]
            ["backup-volfile-servers"].strip().split(",")):
            hosts.append(backup_volfile_server)

        for gluster_server in self.gluster_servers:
            gluster_servers.append(
                g.config["gluster_servers"][gluster_server]["storage"])

        self.assertEqual(
            set(hosts), set(gluster_servers),
            "Hosts and gluster servers not matching for %s" % volume_id)

        volume_info = volume_ops.get_volume_info('auto_get_gluster_endpoint',
                                                 volume_name)
        self.assertIsNotNone(volume_info, "get_volume_info returned None")

        volume_status = volume_ops.get_volume_status(
            'auto_get_gluster_endpoint', volume_name)
        self.assertIsNotNone(volume_status, "get_volume_status returned None")

        self.assertEqual(int(volume_info[volume_name]["status"]), 1,
                         "Volume %s status down" % volume_id)
        for brick_details in volume_info[volume_name]["bricks"]["brick"]:
            brick_info.append(brick_details["name"])

        self.assertNotEqual(brick_info, [],
                            "Brick details are empty for %s" % volume_name)

        for brick in brick_info:
            brick_data = brick.strip().split(":")
            brick_ip = brick_data[0]
            brick_name = brick_data[1]
            self.assertEqual(
                int(volume_status[volume_name][brick_ip][brick_name]
                    ["status"]), 1, "Brick %s is not up" % brick_name)
Example #25
    def test_remove_brick_status(self):
        '''
        -> Create volume
        -> Enable server quorum on volume
        -> Stop glusterd on all nodes except first node
        -> Verify brick status of nodes where glusterd is running with
        default quorum ratio(51%)
        -> Change the cluster.server-quorum-ratio from default to 95%
        -> Start glusterd on all servers except last node
        -> Verify the brick status again
        '''

        # Enabling server quorum
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.server-quorum-type': 'server'})
        self.assertTrue(
            ret, "Failed to set server quorum on volume %s" % self.volname)
        g.log.info("Able to set server quorum on volume %s successfully",
                   self.volname)

        # Getting brick list
        brick_list = get_all_bricks(self.mnode, self.volname)

        # Stopping glusterd on remaining servers except first node
        ret = stop_glusterd(self.servers[1:])
        self.assertTrue(
            ret, "Failed to stop gluterd on some of the servers "
            "%s" % self.servers[1:])
        g.log.info("Glusterd stopped successfully on servers %s",
                   self.servers[1:])

        # Checking brick status for glusterd running nodes with
        # default quorum ratio(51%)
        ret = are_bricks_offline(self.mnode, self.volname, brick_list[0:1])
        self.assertTrue(
            ret, "Bricks are online when quorum is in not "
            "met condition for %s" % self.volname)
        g.log.info(
            "Bricks are offline when quorum is in not met "
            "condition for %s", self.volname)

        # Setting quorum ratio to 95%
        ret = set_volume_options(self.mnode, 'all',
                                 {'cluster.server-quorum-ratio': '95%'})
        self.assertTrue(
            ret, "Failed to set quorum ratio to 95 percentage on "
            "servers %s" % self.servers)
        g.log.info(
            "Able to set server quorum ratio to 95 percentage "
            "on servers %s", self.servers)

        # Starting glusterd on remaining servers except last node
        ret = start_glusterd(self.servers[1:5])
        self.assertTrue(
            ret, "Failed to start glusterd on some of the servers"
            " %s" % self.servers[1:5])
        g.log.info(
            "Glusterd started successfully on all servers except "
            "last node %s", self.servers[1:5])

        # Verifying node count in volume status after glusterd
        # started on the servers; it's not possible to check the brick status
        # immediately after glusterd starts, so instead verify that all
        # nodes with glusterd started show up in gluster volume status
        count = 0
        while count < 50:
            vol_status = get_volume_status(self.mnode, self.volname)
            servers_count = len(vol_status[self.volname].keys())
            if servers_count == 5:
                break
            sleep(2)
            count += 1

        # Checking brick status with quorum ratio(95%)
        ret = are_bricks_offline(self.mnode, self.volname, brick_list[0:5])
        self.assertTrue(
            ret, "Bricks are online when quorum is in not "
            "met condition for %s" % self.volname)
        g.log.info(
            "Bricks are offline when quorum is in not met "
            "condition for %s", self.volname)
    def test_add_identical_brick(self):
        """
        In this test case:
        1. Create Dist Volume on Node 1
        2. Down brick on Node 1
        3. Peer Probe N2 from N1
        4. Add identical brick on newly added node
        5. Check volume status
        """

        # pylint: disable=too-many-statements
        # Create a distributed volume on Node1
        number_of_brick = 1
        servers_info_from_single_node = {
            self.servers[0]: self.all_servers_info[self.servers[0]]
        }
        self.volname = "testvol"
        bricks_list = form_bricks_list(self.servers[0], self.volname,
                                       number_of_brick, self.servers[0],
                                       servers_info_from_single_node)
        ret, _, _ = volume_create(self.servers[0],
                                  self.volname,
                                  bricks_list,
                                  force=False)
        self.assertEqual(ret, 0, "Volume create failed")
        g.log.info("Volume %s created successfully", self.volname)

        ret, _, _ = volume_start(self.servers[0], self.volname, True)
        self.assertEqual(ret, 0, ("Failed to start the "
                                  "volume %s", self.volname))
        g.log.info("Get all the bricks of the volume")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")
        g.log.info("Successfully got the list of bricks of volume")

        ret = bring_bricks_offline(self.volname, bricks_list[0])
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Successfully brought the bricks down")

        ret, _, _ = peer_probe(self.servers[0], self.servers[1])
        self.assertEqual(ret, 0, ("peer probe from %s to %s is failed",
                                  self.servers[0], self.servers[1]))
        g.log.info("peer probe is success from %s to "
                   "%s", self.servers[0], self.servers[1])

        # wait for some time before add-brick
        time.sleep(2)

        # Replace just the host IP to create an identical brick
        add_bricks = []
        add_bricks.append(
            bricks_list[0].replace(self.servers[0], self.servers[1]))
        ret, _, _ = add_brick(self.mnode, self.volname, add_bricks)
        self.assertEqual(ret, 0, "Failed to add the bricks to the volume")
        g.log.info("Successfully added bricks to volume %s", add_bricks[0])

        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Volume start with force failed")

        vol_status = get_volume_status(self.mnode, self.volname)
        self.assertIsNotNone(
            vol_status, "Failed to get volume "
            "status for %s" % self.volname)
    def test_status_string(self):
        '''
        -> Create Volume
        -> Start rebalance
        -> Check task type in volume status
        -> Check task status string in volume status
        -> Check task type in volume status xml
        -> Check task status string in volume status xml
        -> Start Remove brick operation
        -> Check task type in volume status
        -> Check task status string in volume status
        -> Check task type in volume status xml
        -> Check task status string in volume status xml
        '''

        # Start rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance for volume %s"
                         % self.volname)
        g.log.info("Rebalance started successfully on volume %s",
                   self.volname)

        # Wait for rebalance to complete
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, "Rebalance failed for volume %s" % self.volname)
        g.log.info("Rebalance completed successfully on volume %s",
                   self.volname)

        # Getting volume status after rebalance start
        ret, out, _ = volume_status(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to get volume status for volume %s"
                         % self.volname)
        g.log.info("Volume status successful on volume %s", self.volname)
        status_list = out.splitlines()

        # Verifying task type from volume status for rebalance
        self.assertIn('Rebalance', status_list[-4],
                      "Incorrect task type found in volume status for %s"
                      % self.volname)
        g.log.info("Correct task type found in volume status for %s",
                   self.volname)

        # Verifying task status string in volume status for rebalance
        self.assertIn('completed', status_list[-2],
                      "Incorrect task status found in volume status for %s"
                      % self.volname)
        g.log.info("Correct task status found in volume status for %s",
                   self.volname)

        # Getting volume status --xml after rebalance start
        vol_status = get_volume_status(self.mnode, self.volname,
                                       options='tasks')

        # Verifying task type  from volume status --xml for rebalance
        self.assertEqual('Rebalance',
                         vol_status[self.volname]['task_status'][0]['type'],
                         "Incorrect task type found in volume status xml "
                         "for %s" % self.volname)
        g.log.info("Correct task type found in volume status xml for %s",
                   self.volname)

        # Verifying task status string from volume status --xml for rebalance
        self.assertEqual(
            'completed',
            vol_status[self.volname]['task_status'][0]['statusStr'],
            "Incorrect task status found in volume status "
            "xml for %s" % self.volname)
        g.log.info("Correct task status found in volume status xml %s",
                   self.volname)

        # Getting sub vols
        subvol_dict = get_subvols(self.mnode, self.volname)
        subvol = subvol_dict['volume_subvols'][1]

        # Perform remove brick start
        ret, _, _ = remove_brick(self.mnode, self.volname, subvol,
                                 'start', replica_count=3)
        self.assertEqual(ret, 0, "Failed to start remove brick operation "
                                 "for %s" % self.volname)
        g.log.info("Remove brick operation started successfully on volume %s",
                   self.volname)

        # Getting volume status after remove brick start
        ret, out, _ = volume_status(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to get volume status for volume %s"
                         % self.volname)
        g.log.info("Volume status successful on volume %s", self.volname)
        status_list = out.splitlines()

        # Verifying task type from volume status after remove brick start
        self.assertIn('Remove brick', status_list[-8],
                      "Incorrect task type found in volume status for "
                      "%s" % self.volname)
        g.log.info("Correct task type found in volume status task for %s",
                   self.volname)

        # Verifying task status string in volume status after remove
        # brick start
        remove_status = ['completed', 'in progress']
        ret = status_list[-2].split(':')[1].strip() in remove_status
        self.assertTrue(ret, "Incorrect task status found in volume status "
                             "task for %s" % self.volname)
        g.log.info("Correct task status found in volume status task for %s",
                   self.volname)

        # Getting volume status --xml after remove brick start
        vol_status = get_volume_status(self.mnode, self.volname,
                                       options='tasks')

        # Verifying task type  from volume status --xml after
        # remove brick start
        self.assertEqual('Remove brick',
                         vol_status[self.volname]['task_status'][0]['type'],
                         "Incorrect task type found in volume status xml for "
                         "%s" % self.volname)
        g.log.info("Correct task type found in volume status xml for %s",
                   self.volname)

        # Verifying task status string from volume status --xml
        # after remove brick start
        ret = (vol_status[self.volname]['task_status'][0]['statusStr'] in
               remove_status)
        self.assertTrue(ret, "Incorrect task status found in volume status "
                             "xml for %s" % self.volname)
        g.log.info("Correct task status found in volume status xml %s",
                   self.volname)
Example #28
    def test_peer_probe_validation(self):
        # pylint: disable=too-many-statements
        '''
        -> Create trusted storage pool by probing with network short names
        -> Create volume using IP of host
        -> perform basic operations like
            -> gluster volume start <vol>
            -> gluster volume info <vol>
            -> gluster volume status <vol>
            -> gluster volume stop <vol>
        -> Create a volume using the FQDN of the host
        -> perform basic operations like
            -> gluster volume start <vol>
            -> gluster volume info <vol>
            -> gluster volume status <vol>
            -> gluster volume stop <vol>
        '''
        # Peer probing using short name
        for server in self.servers[1:]:
            ret, hostname, _ = g.run(server, "hostname -s")
            self.assertEqual(ret, 0, ("Unable to get short name "
                                      "for server % s" % server))
            ret, _, _ = peer_probe(self.mnode, hostname)

            if ret == 1:
                ret, hostname, _ = g.run(server, "hostname")
                self.assertEqual(ret, 0, ("Unable to get short name "
                                          "for server % s" % server))

                hostname = hostname.split(".")[0]+"."+hostname.split(".")[1]
                ret, _, _ = peer_probe(self.mnode, hostname)

            self.assertEqual(ret, 0, "Unable to peer"
                             "probe to the server % s" % hostname)
            g.log.info("Peer probe succeeded for server %s", hostname)

        # Create a volume
        self.volname = "test-vol"
        self.brick_list = form_bricks_list(self.mnode, self.volname, 3,
                                           self.servers,
                                           self.all_servers_info)
        g.log.info("Creating a volume")
        ret, _, _ = volume_create(self.mnode, self.volname,
                                  self.brick_list, force=False)
        self.assertEqual(ret, 0, "Unable"
                         "to create volume % s" % self.volname)
        g.log.info("Volume created successfully % s", self.volname)

        # Start a volume
        g.log.info("Start a volume")
        ret, _, _ = volume_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Unable"
                         "to start volume % s" % self.volname)
        g.log.info("Volume started successfully % s", self.volname)

        # Get volume info
        g.log.info("get volume info")
        volinfo = get_volume_info(self.mnode, self.volname)
        self.assertIsNotNone(volinfo, "Failed to get the volume "
                                      "info for %s" % self.volname)

        # Get volume status
        vol_status = get_volume_status(self.mnode, self.volname)
        self.assertIsNotNone(vol_status, "Failed to get volume "
                                         "status for %s" % self.volname)

        # stop volume
        ret, _, _ = volume_stop(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Unable"
                         "to stop volume % s" % self.volname)
        g.log.info("Volume stopped successfully % s", self.volname)

        # Create a volume
        self.volname = "test-vol-fqdn"

        self.brick_list = form_bricks_list(self.mnode, self.volname, 3,
                                           self.servers,
                                           self.all_servers_info)

        # Getting FQDN (Full qualified domain name) of each host and
        # replacing ip with FQDN name for each brick for example
        # 10.70.37.219:/bricks/brick0/vol1 is a brick, here ip is replaced
        # with FQDN name now brick looks like
        # dhcp35-219.lab.eng.blr.redhat.com:/bricks/brick0/vol1

        my_brick_list = []
        for brick in self.brick_list:
            fqdn_list = brick.split(":")
            fqdn = socket.getfqdn(fqdn_list[0])
            fqdn = fqdn + ":" + fqdn_list[1]
            my_brick_list.append(fqdn)

        g.log.info("Creating a volume")
        ret, _, _ = volume_create(self.mnode, self.volname,
                                  my_brick_list, force=False)
        self.assertEqual(ret, 0, "Unable"
                         "to create volume % s" % self.volname)
        g.log.info("Volume created successfully % s", self.volname)

        # Start a volume
        g.log.info("Start a volume")
        ret, _, _ = volume_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Unable"
                         "to start volume % s" % self.volname)
        g.log.info("Volume started successfully % s", self.volname)

        # Get volume info
        g.log.info("get volume info")
        volinfo = get_volume_info(self.mnode, self.volname)
        self.assertIsNotNone(volinfo, "Failed to get the volume "
                                      "info for %s" % self.volname)

        # Get volume status
        vol_status = get_volume_status(self.mnode, self.volname)
        self.assertIsNotNone(vol_status, "Failed to get volume "
                                         "status for %s" % self.volname)

        # stop volume
        ret, _, _ = volume_stop(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Unable"
                         "to stop volume % s" % self.volname)
        g.log.info("Volume stopped successfully % s", self.volname)
    def test_brick_port(self):
        # pylint: disable=too-many-statements, too-many-branches
        """
        In this test case:
        1. Create a trusted storage pool of 2 nodes
        2. Create a distributed volume with 2 bricks
        3. Start the volume
        4. Stop glusterd on node 2
        5. Modify a volume option on node 1
        6. Start glusterd on node 2
        7. Check volume status; the brick should get a port
        """
        my_server_info = {
            self.servers[0]: self.all_servers_info[self.servers[0]]
        }
        my_servers = self.servers[0:2]
        index = 1
        ret, _, _ = peer_probe(self.servers[0], self.servers[index])
        self.assertEqual(ret, 0, ("peer probe from %s to %s is failed",
                                  self.servers[0], self.servers[index]))
        g.log.info("peer probe is success from %s to "
                   "%s", self.servers[0], self.servers[index])
        key = self.servers[index]
        my_server_info[key] = self.all_servers_info[key]

        self.volname = "testvol"
        bricks_list = form_bricks_list(self.mnode, self.volname, 2,
                                       my_servers,
                                       my_server_info)
        g.log.info("Creating a volume %s ", self.volname)
        ret = volume_create(self.mnode, self.volname,
                            bricks_list, force=False)
        self.assertEqual(ret[0], 0, ("Unable"
                                     "to create volume %s" % self.volname))
        g.log.info("Volume created successfully %s", self.volname)

        ret, _, _ = volume_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start the "
                                  "volume %s", self.volname))
        g.log.info("Get all the bricks of the volume")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")

        g.log.info("Successfully got the list of bricks of volume")

        vol_status = get_volume_status(self.mnode, self.volname)
        self.assertIsNotNone(vol_status, "Failed to get volume "
                             "status for %s" % self.volname)
        totport = 0
        for volume in vol_status.values():
            for node in volume.values():
                for process in node.values():
                    if int(process["port"]) > 0:
                        totport += 1

        self.assertEqual(totport, 2, "Volume %s is not started successfully "
                         "because the number of brick ports is not equal "
                         "to 2" % self.volname)

        ret = stop_glusterd(self.servers[1])
        self.assertTrue(ret, "Failed to stop glusterd on one of the nodes")
        # After the stop, wait_for_glusterd_to_start is expected to time
        # out and return False
        ret = wait_for_glusterd_to_start(self.servers[1])
        self.assertFalse(ret, "glusterd is still running on %s"
                         % self.servers[1])
        g.log.info("Glusterd stop on node %s succeeded", self.servers[1])

        option = {'performance.readdir-ahead': 'on'}
        ret = set_volume_options(self.servers[0], self.volname, option)
        self.assertTrue(ret, "gluster volume set %s performance.readdir-ahead"
                             "on is failed on server %s"
                        % (self.volname, self.servers[0]))
        g.log.info("gluster volume set %s performance.readdir-ahead on"
                   "successfully on :%s", self.volname, self.servers[0])

        ret = start_glusterd(self.servers[1])
        self.assertTrue(ret, "Failed to start glusterd on one of the nodes")
        g.log.info("Glusterd start on node %s succeeded", self.servers[1])
        ret = wait_for_glusterd_to_start(self.servers[1])
        self.assertTrue(ret, "glusterd is not running on %s"
                        % self.servers[1])
        g.log.info("Glusterd is up and running on node %s", self.servers[1])

        ret = wait_for_peers_to_connect(self.servers[0], self.servers[1])
        self.assertTrue(ret, "glusterd is not connected %s with peer %s"
                        % (self.servers[0], self.servers[1]))

        vol_status = get_volume_status(self.mnode, self.volname)
        self.assertIsNotNone(vol_status, "Failed to get volume "
                             "status for %s" % self.volname)
        totport = 0
        for volume in vol_status.values():
            for node in volume.values():
                for process in node.values():
                    if int(process["port"]) > 0:
                        totport += 1

        self.assertEqual(totport, 2, "Volume %s is not started successfully "
                         "because the number of brick ports is not equal "
                         "to 2" % self.volname)
    def test_rebalance_hang(self):
        """
        In this test case:
        1. Create a trusted storage pool of 2 nodes
        2. Create a distributed volume with 2 bricks
        3. Start the volume
        4. Mount the volume
        5. Add some data files on the mount
        6. Start rebalance with force
        7. Kill glusterd on the 2nd node
        8. Issue a volume-related command
        """

        # pylint: disable=too-many-statements
        my_server_info = {
            self.servers[0]: self.all_servers_info[self.servers[0]]
        }
        my_servers = self.servers[0:2]
        index = 1
        ret, _, _ = peer_probe(self.servers[0], self.servers[index])
        self.assertEqual(ret, 0, ("peer probe from %s to %s is failed",
                                  self.servers[0], self.servers[index]))
        g.log.info("peer probe is success from %s to "
                   "%s", self.servers[0], self.servers[index])
        key = self.servers[index]
        my_server_info[key] = self.all_servers_info[key]

        self.volname = "testvol"
        bricks_list = form_bricks_list(self.mnode, self.volname, 2, my_servers,
                                       my_server_info)
        g.log.info("Creating a volume %s ", self.volname)
        ret, _, _ = volume_create(self.mnode,
                                  self.volname,
                                  bricks_list,
                                  force=False)
        self.assertEqual(ret, 0, ("Unable"
                                  "to create volume %s" % self.volname))
        g.log.info("Volume created successfully %s", self.volname)

        ret, _, _ = volume_start(self.mnode, self.volname, force=False)
        self.assertEqual(ret, 0, "Failed to start the "
                                 "volume %s" % self.volname)
        g.log.info("Get all the bricks of the volume")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")
        g.log.info("Successfully got the list of bricks of volume")

        # Mounting a volume
        ret, _, _ = mount_volume(self.volname,
                                 mtype=self.mount_type,
                                 mpoint=self.mounts[0].mountpoint,
                                 mserver=self.mnode,
                                 mclient=self.mounts[0].client_system)
        self.assertEqual(ret, 0, ("Volume %s is not mounted") % self.volname)
        g.log.info("Volume mounted successfully : %s", self.volname)

        self.all_mounts_procs = []
        # Creating files
        command = ("cd %s/ ; "
                   "for i in `seq 1 10` ; "
                   "do mkdir l1_dir.$i ; "
                   "for j in `seq 1 5` ; "
                   "do mkdir l1_dir.$i/l2_dir.$j ; "
                   "for k in `seq 1 10` ; "
                   "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
                   "bs=128k count=$k ; "
                   "done ; "
                   "done ; "
                   "done ; " % (self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False
        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")

        g.log.info("Starting rebalance with force on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname, False, True)
        self.assertEqual(
            ret, 0, ("Failed to start rebalance for volume %s", self.volname))
        g.log.info("Successfully rebalance on the volume %s", self.volname)

        ret = stop_glusterd(self.servers[1])
        self.assertTrue(ret, "Failed to stop glusterd on one of the node")
        ret = is_glusterd_running(self.servers[1])
        self.assertNotEqual(
            ret, 0, "Glusterd is not stopped on server %s" % self.servers[1])
        g.log.info("Glusterd stop on node %s succeeded",
                   self.servers[1])

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        vol_status = get_volume_status(self.mnode, self.volname)
        self.assertIsNotNone(
            vol_status, "Failed to get volume "
            "status for %s" % self.volname)

        # Start glusterd on the node where it is stopped
        ret = start_glusterd(self.servers[1])
        self.assertTrue(ret, "glusterd start on the node failed")
        # Poll for up to two minutes for glusterd to come back up
        count = 0
        while count < 60:
            ret = is_glusterd_running(self.servers[1])
            if not ret:
                break
            sleep(2)
            count += 1
        self.assertEqual(ret, 0,
                         "glusterd is not running on %s" % self.servers[1])
        g.log.info("Glusterd start on node %s succeeded", self.servers[1])
def compute_data_usage_stats_on_servers(nodes, test_name):
    """Compute min, max, mean and median for servers

    Args:
     nodes(list): Servers from which data is to be used to compute min,
                  max, mean and median
     test_name(str): Name of the testcase for which data has to be processed

    Returns:
     dict: dict of min, max, mean and median for a given process

    NOTE:
     This function has to be always run before cleanup.
    """
    data_dict = {}
    for node in nodes:
        # Get the volume status on the node
        volume_status = get_volume_status(node)
        data_dict[node] = {}
        for process in ('glusterd', 'glusterfs', 'glusterfsd'):

            # Generate a dataframe from the csv file
            dataframe = create_dataframe_from_csv(node, process, test_name)
            if dataframe.empty:
                return {}

            data_dict[node][process] = {}
            if process == 'glusterd':
                # Checking if glusterd is restarted.
                if len(set(dataframe['Process ID'])) > 1:
                    data_dict[node][process]['is_restarted'] = True
                else:
                    data_dict[node][process]['is_restarted'] = False

                # Call function to compute min, max, mean and median
                _compute_min_max_mean_median(dataframe, data_dict, process,
                                             node)
                continue

            # Map volumes to volume process
            for volume in volume_status:
                for proc in volume_status[volume][node]:
                    if (proc == 'Self-heal Daemon' and process == 'glusterfs'):
                        # Fetching pid from volume status output and create a
                        # dataframe with the entries of only that pid
                        pid = volume_status[volume][node][proc]['pid']
                        proc_dataframe = dataframe[dataframe['Process ID'] ==
                                                   pid]

                        # Call function to compute min, max, mean
                        # and median
                        _compute_min_max_mean_median(proc_dataframe, data_dict,
                                                     process, node, volume)

                    if (proc.count('/') >= 2 and process == 'glusterfsd'):
                        # Fetching pid from volume status output and create a
                        # dataframe with the entries of only that pid
                        pid = volume_status[volume][node][proc]['pid']
                        proc_dataframe = dataframe[dataframe['Process ID'] ==
                                                   pid]

                        # Call function to compute min, max, mean and median
                        _compute_min_max_mean_median(proc_dataframe, data_dict,
                                                     process, node, volume,
                                                     proc)

    return data_dict
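
A hypothetical call site for the helper above (the server names and test name
are placeholders; the statistics keys come from _compute_min_max_mean_median,
which is not shown here):

stats = compute_data_usage_stats_on_servers(
    ["server0.example.com", "server1.example.com"],
    "test_volume_status_xml")
if stats:
    # Whether glusterd was restarted on server0 during the test
    print(stats["server0.example.com"]["glusterd"]["is_restarted"])
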
    def test_add_brick_when_quorum_not_met(self):

        # pylint: disable=too-many-statements
        # create and start a volume
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertTrue(ret, ("Failed to create "
                              "and start volume %s" % self.volname))
        g.log.info("Volume is created and started successfully")

        # set cluster.server-quorum-type as server
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.server-quorum-type': 'server'})
        self.assertTrue(ret, ("Failed to set the quorum type as a server"
                              " on volume %s", self.volname))
        g.log.info("Able to set server quorum successfully on volume %s",
                   self.volname)

        # Setting quorum ratio to 95%
        ret = set_volume_options(self.mnode, 'all',
                                 {'cluster.server-quorum-ratio': '95%'})
        self.assertTrue(
            ret, "Failed to set the server quorum ratio on the cluster")
        g.log.info("Able to set server quorum ratio successfully on %s",
                   self.servers)

        # bring down glusterd of half nodes
        num_of_servers = len(self.servers)
        num_of_nodes_to_bring_down = num_of_servers // 2

        for node in range(num_of_nodes_to_bring_down, num_of_servers):
            ret = stop_glusterd(self.servers[node])
            self.assertTrue(
                ret, ("Failed to stop glusterd on %s" % self.servers[node]))
            g.log.info("Glusterd stopped successfully on server %s",
                       self.servers[node])

        for node in range(num_of_nodes_to_bring_down, num_of_servers):
            count = 0
            while count < 80:
                ret = is_glusterd_running(self.servers[node])
                if ret:
                    break
                sleep(2)
                count += 1
            self.assertNotEqual(
                ret, 0, "glusterd is still running on %s" % self.servers[node])

        # Verify the node count in volume status after glusterd has been
        # stopped on half of the servers. The brick status is not reflected
        # immediately after glusterd stops, hence the polling below.
        count = 0
        while count < 100:
            vol_status = get_volume_status(self.mnode, self.volname)
            servers_count = len(vol_status[self.volname])
            if servers_count == (num_of_servers - num_of_nodes_to_bring_down):
                break
            sleep(2)
            count += 1

        # confirm that quorum is not met, brick process should be down
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")
        bricks_to_check = bricks_list[0:num_of_nodes_to_bring_down]
        ret = are_bricks_offline(self.mnode, self.volname, bricks_to_check)
        self.assertTrue(
            ret, "Unexpected: Server quorum is not met, "
            "Bricks are up")
        g.log.info("Server quorum is not met, bricks are down as expected")

        # try add brick operation, which should fail
        num_bricks_to_add = 1
        brick = form_bricks_list(self.mnode, self.volname, num_bricks_to_add,
                                 self.servers, self.all_servers_info)
        ret, _, _ = add_brick(self.mnode, self.volname, brick)
        self.assertNotEqual(ret, 0, ("Unexpected: add brick is success, "
                                     "when quorum is not met"))
        g.log.info("Add brick is failed as expected, when quorum is not met")

        # confirm that, newly added brick is not part of volume
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")
        self.assertNotIn(brick[0], bricks_list,
                         ("Unexpected: the newly added brick is part of the "
                          "volume even though quorum is not met"))
        g.log.info("Add brick failed as expected when quorum is not met")

        # set cluster.server-quorum-type as none
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.server-quorum-type': 'none'})
        self.assertTrue(ret, ("Failed to set the quorum type as a server"
                              " on volume %s", self.volname))
        g.log.info("Able to set server quorum successfully on volume %s",
                   self.volname)
Example #33
    def test_xml_dump_of_gluster_volume_status_during_rebalance(self):
        """
        1. Create a trusted storage pool by peer probing the nodes
        2. Create a distributed-replicated volume
        3. Start the volume, fuse mount it and start IO
        4. Create another replicated volume, start it and then stop it
        5. Start rebalance on the volume
        6. While rebalance is in progress, stop glusterd on one of the
           nodes in the trusted storage pool
        7. Get the status of the volumes with an --xml dump
        """
        self.volname_2 = "test_volume_2"

        # create volume
        # Fetching all the parameters for volume_create
        list_of_three_servers = []
        server_info_for_three_nodes = {}
        for server in self.servers[:3]:
            list_of_three_servers.append(server)
            server_info_for_three_nodes[server] = self.all_servers_info[server]

        bricks_list = form_bricks_list(self.mnode, self.volname, 3,
                                       list_of_three_servers,
                                       server_info_for_three_nodes)
        # Creating volumes using 3 servers
        ret, _, _ = volume_create(self.mnode,
                                  self.volname_2,
                                  bricks_list,
                                  force=True)
        self.assertFalse(ret, "Volume creation failed")
        g.log.info("Volume %s created successfully", self.volname_2)
        ret, _, _ = volume_start(self.mnode, self.volname_2)
        self.assertFalse(ret,
                         "Failed to start volume {}".format(self.volname_2))
        ret, _, _ = volume_stop(self.mnode, self.volname_2)
        self.assertFalse(ret,
                         "Failed to stop volume {}".format(self.volname_2))

        # Start Rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))

        # Get rebalance status
        status_info = get_rebalance_status(self.mnode, self.volname)
        status = status_info['aggregate']['statusStr']

        self.assertIn('in progress', status,
                      "Rebalance process is not running")
        g.log.info("Rebalance process is running")

        # Stop glusterd
        ret = stop_glusterd(self.servers[2])
        self.assertTrue(ret, "Failed to stop glusterd")

        # Fetch the rebalance status string from the plain CLI output
        ret, out, _ = g.run(
            self.mnode,
            "gluster v status | grep -A 4 'Rebalance' | awk 'NR==3{print "
            "$3,$4}'")

        # Cross-check the CLI output against the status string from the
        # --xml dump of volume status tasks
        ret = get_volume_status(self.mnode, self.volname, options="tasks")
        rebalance_status = ret[self.volname]['task_status'][0]['statusStr']
        self.assertIn(rebalance_status, out.replace("\n", ""))