Example #1
    def test_rebalance_start_when_glusterd_down(self):

        # Expanding volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Volume %s: Expand failed", self.volname))
        g.log.info("Volume %s: Expand success", self.volname)

        # Form a new list of servers without mnode in it to prevent mnode
        # from glusterd failure
        nodes = self.servers[:]
        nodes.remove(self.mnode)

        # Stop glusterd on a server
        self.random_server = random.choice(nodes)
        g.log.info("Stop glusterd on server %s", self.random_server)
        ret = stop_glusterd(self.random_server)
        self.assertTrue(ret, ("Server %s: Failed to stop glusterd",
                              self.random_server))
        g.log.info("Server %s: Stopped glusterd", self.random_server)

        # Start Rebalance
        g.log.info("Starting rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Volume %s: Failed to start rebalance",
                                  self.volname))
        g.log.info("Volume %s: Rebalance start success", self.volname)
        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertFalse(ret, ("Volume %s: Rebalance is completed",
                               self.volname))
        g.log.info("Rebalance failed on one or more nodes. Check rebalance "
                   "status for more details")
Example #2
    def test_offline_brick_status_when_quorum_not_met(self):
        """
        Test Brick status when Quorum is not met after glusterd restart.
        1. Create a volume and mount it.
        2. Set the quorum type to 'server'.
        3. Bring some nodes down such that quorum won't be met.
        4. Brick status should be offline in the node which is up.
        5. Restart glusterd in this node.
        6. The brick status still should be offline as quorum isn't met.
        """
        # Set the quorum type to server and validate it.
        vol_option = {'cluster.server-quorum-type': 'server'}
        ret = set_volume_options(self.mnode, self.volname, vol_option)
        self.assertTrue(ret, "gluster volume option set of %s to %s failed"
                        % ('cluster.server-quorum-type', 'server'))
        g.log.info("Cluster quorum set to type server.")

        # Get the brick list.
        brick_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(brick_list, "Failed to get the brick list")

        # Stop glusterd processes.
        ret = stop_glusterd(self.servers[1:])
        self.assertTrue(ret, "Failed to stop glusterd on specified nodes.")
        g.log.info("Glusterd processes stopped in the desired servers.")

        # Get the brick status in a node where glusterd is up.
        ret = are_bricks_offline(self.mnode, self.volname, brick_list[0:1])
        self.assertTrue(ret, "Bricks are online")
        g.log.info("Bricks are offline as expected.")

        # Restart one of the node which is up.
        ret = restart_glusterd(self.servers[0])
        self.assertTrue(ret, ("Failed to restart glusterd on desired node."))
        g.log.info("Glusterd restarted on the desired node.")

        # Wait for glusterd to be online and validate it's running.
        self.assertTrue(wait_for_glusterd_to_start(self.servers[0]),
                        "Glusterd not up on the desired server.")
        g.log.info("Glusterd is up in the desired server.")

        # Get the brick status from the restarted node.
        ret = are_bricks_offline(self.mnode, self.volname, brick_list[0:1])
        self.assertTrue(ret, "Bricks are online")
        g.log.info("Bricks are offline as expected.")

        # Start glusterd on all servers.
        ret = start_glusterd(self.servers)
        self.assertTrue(ret, "Failed to start glusterd on the specified nodes")
        g.log.info("Initiated start of glusterd on all nodes.")

        # Wait for glusterd to start.
        ret = wait_for_glusterd_to_start(self.servers)
        self.assertTrue(ret, "Glusterd not up on all nodes.")
        g.log.info("Glusterd is up and running on all nodes.")

        # Wait for all volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   timeout=600)
        self.assertTrue(ret, ("All volume processes not up."))
        g.log.info("All volume processes are up.")
Example #3
    def _stop_gluster(self, server):
        """
        Stop glusterd on a server.
        """
        ret = stop_glusterd(server)
        self.assertTrue(ret, "Failed to stop glusterd on node:%s" % server)
        g.log.info("Successfully stopped glusterd process on node:%s", server)
Example #4
    def test_ops_when_one_node_is_down(self):

        # pylint: disable=too-many-statements
        """
        Test Case:
        1) Create an N node gluster cluster.
        2) Stop gluster on one node.
        3) Execute gluster peer status on other node.
        4) Execute gluster v list on other node.
        5) Execute gluster v info on other node.
        """

        # Pick a random server index (mnode at index 0 is excluded).
        self.random_server = randint(1, len(self.servers) - 1)

        # Stopping glusterd on one node.
        ret = stop_glusterd(self.servers[self.random_server])
        self.assertTrue(ret, "Failed to stop glusterd on one node.")
        g.log.info("Successfully stopped glusterd on one node.")

        # Running peer status on another node.
        ret, _, err = peer_status(self.mnode)
        self.assertEqual(ret, 0, ("Failed to get peer status from %s with "
                                  "error message %s" % (self.mnode, err)))
        g.log.info("Successfully got peer status from %s.", self.mnode)

        # Running volume list on another node.
        ret, _, _ = volume_list(self.mnode)
        self.assertEqual(ret, 0, "Failed to get volume list.")
        g.log.info("Successfully got volume list from %s.", self.mnode)

        # Running volume info on another node.
        ret, _, _ = volume_info(self.mnode)
        self.assertEqual(ret, 0, "Failed to get volume info.")
        g.log.info("Successfully got volume info from %s.", self.mnode)
Example #5
    def scratch_cleanup(cls, error_or_failure_exists):
        """
        This scratch_cleanup script will run only when the code
        currently running goes into execution or assertion error.

        Args:
            error_or_failure_exists (bool): If set True will cleanup setup
                atlast of testcase only if exectution or assertion error in
                teststeps. False will skip this scratch cleanup step.

        Returns (bool): True if setup cleanup is successful.
            False otherwise.
        """
        if error_or_failure_exists:
            ret = stop_glusterd(cls.servers)
            if not ret:
                g.log.error("Failed to stop glusterd")
                cmd_list = ("pkill pidof glusterd",
                            "rm /var/run/glusterd.socket")
                for server in cls.servers:
                    for cmd in cmd_list:
                        ret, _, _ = g.run(server, cmd, "root")
                        if ret:
                            g.log.error("Failed to stop glusterd")
                            return False
            for server in cls.servers:
                cmd_list = ("rm -rf /var/lib/glusterd/vols/*",
                            "rm -rf /var/lib/glusterd/snaps/*",
                            "rm -rf /var/lib/glusterd/peers/*",
                            "rm -rf {}/*/*".format(
                                cls.all_servers_info[server]['brick_root']))
                for cmd in cmd_list:
                    ret, _, _ = g.run(server, cmd, "root")
                    if ret:
                        g.log.error(
                            "failed to cleanup server {}".format(server))
                        return False
            ret = restart_glusterd(cls.servers)
            if not ret:
                g.log.error("Failed to start glusterd")
                return False
            sleep(2)
            ret = wait_for_glusterd_to_start(cls.servers)
            if not ret:
                g.log.error("Failed to bring glusterd up")
                return False
            ret = peer_probe_servers(cls.mnode, cls.servers)
            if not ret:
                g.log.error("Failed to peer probe servers")
                return False
            for client in cls.clients:
                cmd_list = ("umount /mnt/*", "rm -rf /mnt/*")
                for cmd in cmd_list:
                    ret = g.run(client, cmd, "root")
                    if ret:
                        g.log.error(
                            "failed to unmount/already unmounted {}".format(
                                client))
            return True
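A hedged usage sketch of how the error_or_failure_exists flag might be wired from a teardown hook (the wiring below is an illustration, not the actual glusto-tests base class code):

    @classmethod
    def tearDownClass(cls):
        # Assumption for illustration: a class attribute recorded during
        # the test run says whether an error or failure occurred.
        failed = getattr(cls, 'error_or_failure_exists', False)
        if not cls.scratch_cleanup(failed):
            g.log.error("Scratch cleanup failed; the cluster may need a "
                        "manual reset before the next test")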
Example #6
    def test_glusterd_services(self):
        """Test restart, stop, start of glusterd
        """
        # restart glusterd on all servers
        g.log.info("Restart glusterd on all servers %s", self.servers)
        ret = restart_glusterd(self.servers)
        self.assertTrue(
            ret,
            ("Failed to restart glusterd on all servers %s", self.servers))
        g.log.info("Successfully restarted glusterd on all servers %s",
                   self.servers)

        # Check if glusterd is running on all servers (expected: active)
        g.log.info(
            "Check if glusterd is running on all servers %s "
            "(expected: active)", self.servers)
        ret = is_glusterd_running(self.servers)
        self.assertEqual(
            ret, 0,
            ("Glusterd is not running on all servers %s", self.servers))
        g.log.info("Glusterd is running on all the servers %s", self.servers)

        # Stop glusterd on all servers
        g.log.info("Stop glusterd on all servers %s", self.servers)
        ret = stop_glusterd(self.servers)
        self.assertTrue(
            ret, ("Failed to stop glusterd on all servers %s", self.servers))
        g.log.info("Successfully stopped glusterd on all servers %s",
                   self.servers)

        # Check if glusterd is running on all servers (expected: not running)
        g.log.info(
            "Check if glusterd is running on all servers %s "
            "(expected: not running)", self.servers)
        ret = is_glusterd_running(self.servers)
        self.assertNotEqual(ret, 0, ("Glusterd is still running on some "
                                     "servers %s", self.servers))
        g.log.info("Glusterd not running on any servers %s as expected.",
                   self.servers)

        # Start glusterd on all servers
        g.log.info("Start glusterd on all servers %s", self.servers)
        ret = start_glusterd(self.servers)
        self.assertTrue(
            ret, ("Failed to start glusterd on all servers %s", self.servers))
        g.log.info("Successfully started glusterd on all servers %s",
                   self.servers)

        # Check if glusterd is running on all servers (expected: active)
        g.log.info(
            "Check if glusterd is running on all servers %s "
            "(expected: active)", self.servers)
        ret = is_glusterd_running(self.servers)
        self.assertEqual(
            ret, 0,
            ("Glusterd is not running on all servers %s", self.servers))
        g.log.info("Glusterd is running on all the servers %s", self.servers)
Example #7
    def test_rebalance_start_when_glusterd_down(self):

        # Expanding volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Volume %s: Expand failed", self.volname))
        g.log.info("Volume %s: Expand success", self.volname)

        # Get all server IP addresses which are part of the volume
        ret = get_all_bricks(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to get server IP list for volume %s",
                              self.volname))
        list_of_servers_used = []
        for brick in ret:
            list_of_servers_used.append(brick.split(":")[0])
        g.log.info("Successfully got server IP list for volume %s",
                   self.volname)

        # Form a new list of servers without mnode in it to prevent mnode
        # from glusterd failure (don't mutate the list while iterating)
        list_of_servers_used = [server for server in list_of_servers_used
                                if server != self.mnode]

        # Stop glusterd on a server
        self.random_server = choice(list_of_servers_used)
        g.log.info("Stop glusterd on server %s", self.random_server)
        ret = stop_glusterd(self.random_server)
        self.assertTrue(ret, ("Server %s: Failed to stop glusterd",
                              self.random_server))
        g.log.info("Server %s: Stopped glusterd", self.random_server)

        # Start Rebalance
        g.log.info("Starting rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Volume %s: Failed to start rebalance",
                                  self.volname))
        g.log.info("Volume %s: Rebalance start success", self.volname)
        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertFalse(ret, ("Volume %s: Rebalance is completed",
                               self.volname))
        g.log.info("Expected: Rebalance failed on one or more nodes."
                   " Check rebalance status for more details")
        error_msg1 = "\"fix layout on / failed\""
        error_msg2 = "\"Transport endpoint is not connected\""
        ret, _, _ = g.run(self.mnode, "grep -w %s /var/log/glusterfs/"
                          "%s-rebalance.log | grep -w %s"
                          % (error_msg1, self.volname, error_msg2))
        self.assertEqual(ret, 0, ("Unexpected: Rebalance on volume %s failed,"
                                  " but not because of glusterd being down"
                                  " on a node", self.volname))
        g.log.info("Rebalance failed on volume %s due to glusterd being down"
                   " on one of the nodes", self.volname)
Example #8
    def test_glusterd_restart_stop_start(self):
        """Tests glusterd stop, start, restart and validate if all
        peers are in connected state after glusterd restarts.
        """
        # restart glusterd on all servers
        ret = restart_glusterd(self.servers)
        self.assertTrue(
            ret,
            ("Failed to restart glusterd on all servers %s", self.servers))
        g.log.info("Successfully restarted glusterd on all servers %s",
                   self.servers)

        # Check if glusterd is running on all servers (expected: active)
        ret = is_glusterd_running(self.servers)
        self.assertEqual(
            ret, 0,
            ("Glusterd is not running on all servers %s", self.servers))
        g.log.info("Glusterd is running on all the servers %s", self.servers)

        # Stop glusterd on all servers
        ret = stop_glusterd(self.servers)
        self.assertTrue(
            ret, ("Failed to stop glusterd on all servers %s", self.servers))
        g.log.info("Successfully stopped glusterd on all servers %s",
                   self.servers)

        # Check if glusterd is running on all servers (expected: not running)
        ret = is_glusterd_running(self.servers)
        self.assertNotEqual(ret, 0, ("Glusterd is still running on some "
                                     "servers %s", self.servers))
        g.log.info("Glusterd not running on any servers %s as expected.",
                   self.servers)

        # Start glusterd on all servers
        ret = start_glusterd(self.servers)
        self.assertTrue(
            ret, ("Failed to start glusterd on all servers %s", self.servers))
        g.log.info("Successfully started glusterd on all servers %s",
                   self.servers)

        # Check if glusterd is running on all servers (expected: active)
        ret = is_glusterd_running(self.servers)
        self.assertEqual(
            ret, 0,
            ("Glusterd is not running on all servers %s", self.servers))
        g.log.info("Glusterd is running on all the servers %s", self.servers)

        # Give all glusterd instances time to establish communication.
        time.sleep(30)

        # Validate all the peers are in connected state
        ret = self.validate_peers_are_connected()
        self.assertTrue(ret, "Validating Peers to be in Cluster Failed")

        self.test_method_complete = True
Example #9
    def test_glusterd_default_vol_behavior_and_quorum_options(self):
        """
        Test default volume behavior and quorum options
        1. Create a volume and start it.
        2. Check that no quorum options are found in vol info.
        3. Kill two glusterd processes.
        4. There shouldn't be any effect on the running glusterfsd
        processes.
        """
        # Check the default quorum options are correct.
        self._validate_vol_options('cluster.server-quorum-type', 'off')
        self._validate_vol_options('cluster.server-quorum-ratio', '51', True)

        # Get the count of number of glusterfsd processes running.
        count_before_glusterd_kill = self._get_total_brick_processes_count()

        # Kill two glusterd processes.
        server_list = [self.servers[1], self.servers[2]]
        ret = stop_glusterd(server_list)
        self.assertTrue(ret, "Failed to stop glusterd on the specified nodes.")
        ret = is_glusterd_running(server_list)
        self.assertNotEqual(ret, 0, ("Glusterd is not stopped on the servers"
                                     " where it was desired to be stopped."))
        g.log.info("Glusterd processes stopped in the desired servers.")

        # Get the count of number of glusterfsd processes running.
        count_after_glusterd_kill = self._get_total_brick_processes_count()

        # The count of glusterfsd processes should match
        self.assertEqual(count_before_glusterd_kill, count_after_glusterd_kill,
                         ("Glusterfsd processes are affected."))
        g.log.info("Glusterd processes are not affected.")

        # Start glusterd on all servers.
        ret = start_glusterd(self.servers)
        self.assertTrue(ret, "Failed to Start glusterd on the specified"
                        " nodes")
        g.log.info("Started glusterd on all nodes.")

        # Wait for glusterd to restart.
        ret = wait_for_glusterd_to_start(self.servers)
        self.assertTrue(ret, "Glusterd not up on all nodes.")
        g.log.info("Glusterd is up and running on all nodes.")
Example #10
    def test_vol_delete_when_one_of_nodes_is_down(self):

        # create a volume and start it
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertTrue(ret, "Failed to create and start the volume")
        g.log.info("Successfully created and started the volume")

        # get the bricks list
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the bricks list")

        # get a random node other than self.mnode
        if len(bricks_list) >= len(self.servers):
            random_index = random.randint(1, len(self.servers) - 1)
        else:
            random_index = random.randint(1, len(bricks_list) - 1)

        # stop glusterd on the random node

        node_to_stop_glusterd = self.servers[random_index]
        ret = stop_glusterd(node_to_stop_glusterd)
        self.assertTrue(ret, "Failed to stop glusterd")

        # stop the volume, it should succeed
        ret, _, _ = volume_stop(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Volume stop failed")

        # try to delete the volume, it should fail
        ret, out, err = g.run(
            self.mnode, "gluster volume delete %s "
            "--mode=script" % self.volname)
        self.assertNotEqual(
            ret, 0, "Volume delete succeeded when one of the"
            " brick nodes is down")
        if re.search(r'Some of the peers are down', err):
            g.log.info("Volume delete failed with expected error message")
        else:
            g.log.info("Volume delete failed with unexpected error message")
Example #11
    def test_remove_brick_status(self):
        '''
        -> Create volume
        -> Enable server quorum on volume
        -> Stop glusterd on all nodes except first node
        -> Verify brick status of nodes where glusterd is running with
        default quorum ratio(51%)
        -> Change the cluster.server-quorum-ratio from default to 95%
        -> Start glusterd on all servers except last node
        -> Verify the brick status again
        '''

        # Enabling server quorum
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.server-quorum-type': 'server'})
        self.assertTrue(
            ret, "Failed to set server quorum on volume %s" % self.volname)
        g.log.info("Able to set server quorum on volume %s successfully",
                   self.volname)

        # Getting brick list
        brick_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(brick_list, "Failed to get the brick list")

        # Stopping glusterd on remaining servers except first node
        ret = stop_glusterd(self.servers[1:])
        self.assertTrue(
            ret, "Failed to stop gluterd on some of the servers "
            "%s" % self.servers[1:])
        g.log.info("Glusterd stopped successfully on servers %s",
                   self.servers[1:])

        # Checking brick status for glusterd running nodes with
        # default quorum ratio(51%)
        ret = are_bricks_offline(self.mnode, self.volname, brick_list[0:1])
        self.assertTrue(
            ret, "Bricks are online even though quorum is not met "
            "for %s" % self.volname)
        g.log.info(
            "Bricks are offline when quorum is not met "
            "for %s", self.volname)

        # Setting quorum ratio to 95%
        ret = set_volume_options(self.mnode, 'all',
                                 {'cluster.server-quorum-ratio': '95%'})
        self.assertTrue(
            ret, "Failed to set quorum ratio to 95 percentage on "
            "servers %s" % self.servers)
        g.log.info(
            "Able to set server quorum ratio to 95 percentage "
            "on servers %s", self.servers)

        # Starting glusterd on remaining servers except last node
        ret = start_glusterd(self.servers[1:5])
        self.assertTrue(
            ret, "Failed to start glusterd on some of the servers"
            " %s" % self.servers[1:5])
        g.log.info(
            "Glusterd started successfully on all servers except "
            "last node %s", self.servers[1:5])

        # Verifying node count in volume status after glusterd has been
        # started on the servers. It's not possible to check the brick
        # status immediately after glusterd starts, so verify that all
        # nodes where glusterd was started appear in gluster volume status.
        count = 0
        while count < 50:
            vol_status = get_volume_status(self.mnode, self.volname)
            servers_count = len(vol_status[self.volname].keys())
            if servers_count == 5:
                break
            sleep(2)
            count += 1

        # Checking brick status with quorum ratio(95%)
        ret = are_bricks_offline(self.mnode, self.volname, brick_list[0:5])
        self.assertTrue(
            ret, "Bricks are online even though quorum is not met "
            "for %s" % self.volname)
        g.log.info(
            "Bricks are offline when quorum is not met "
            "for %s", self.volname)
Example #12
    def test_gfind_when_node_down(self):
        """
        Verifying the glusterfind functionality when node is down.

        1. Create a volume
        2. Create a session on the volume
        3. Create various files from mount point
        4. Bring down glusterd on one of the node
        5. Perform glusterfind pre
        6. Perform glusterfind post
        7. Check the contents of outfile
        8. Create more files from mountpoint
        9. Reboot one of the nodes
        10. Perform glusterfind pre
        11. Perform glusterfind post
        12. Check the contents of outfile
        """

        # pylint: disable=too-many-statements
        # Create a session for the volume
        ret, _, _ = gfind_create(self.mnode, self.volname, self.session)
        self.assertEqual(ret, 0, ("Unexpected: Creation of a session for the "
                                  "volume %s failed" % self.volname))
        g.log.info("Successfully created a session for the volume %s",
                   self.volname)

        # Perform glusterfind list to check if session exists
        _, out, _ = gfind_list(self.mnode, volname=self.volname,
                               sessname=self.session)
        self.assertNotEqual(out, "No sessions found.",
                            "Failed to list the glusterfind session")
        g.log.info("Successfully listed the glusterfind session")

        self._perform_io_and_validate_presence_of_files()

        # Wait for changelog to get updated
        sleep(2)

        # Bring one of the node down.
        self.random_server = choice(self.servers[1:])
        ret = stop_glusterd(self.random_server)
        self.assertTrue(ret, "Failed to stop glusterd on one node.")
        g.log.info("Succesfully stopped glusterd on one node.")

        # Wait till glusterd is completely down.
        while is_glusterd_running(self.random_server) != 1:
            sleep(2)

        self._perform_glusterfind_pre_and_validate_outfile()

        # Perform glusterfind post for the session
        ret, _, _ = gfind_post(self.mnode, self.volname, self.session)
        self.assertEqual(ret, 0, ("Failed to perform glusterfind post"))
        g.log.info("Successfully performed glusterfind post")

        # Bring glusterd which was downed on a random node, up.
        ret = start_glusterd(self.random_server)
        self.assertTrue(ret, "Failed to start glusterd on %s"
                        % self.random_server)
        g.log.info("Successfully started glusterd on node : %s",
                   self.random_server)

        # Waiting for glusterd to start completely.
        ret = wait_for_glusterd_to_start(self.random_server)
        self.assertTrue(ret, "glusterd is not running on %s"
                        % self.random_server)
        g.log.info("glusterd is started and running on %s",
                   self.random_server)

        self._perform_io_and_validate_presence_of_files()

        # Perform IO
        self._perform_io_and_validate_presence_of_files()

        # Wait for changelog to get updated
        sleep(2)

        # Reboot one of the nodes.
        self.random_server = choice(self.servers[1:])
        ret = reboot_nodes(self.random_server)
        self.assertTrue(ret, "Failed to reboot the said node.")
        g.log.info("Successfully started reboot process on one node.")

        self._perform_glusterfind_pre_and_validate_outfile()

        # Perform glusterfind post for the session
        ret, _, _ = gfind_post(self.mnode, self.volname, self.session)
        self.assertEqual(ret, 0, ("Failed to perform glusterfind post"))
        g.log.info("Successfully performed glusterfind post")

        # Gradual sleep backoff till the node has rebooted.
        counter = 0
        timeout = 300
        ret = False
        while counter < timeout:
            ret, _ = are_nodes_online(self.random_server)
            if not ret:
                g.log.info("Node's offline, Retrying after 5 seconds ...")
                sleep(5)
                counter += 5
            else:
                ret = True
                break
        self.assertTrue(ret, "Node is still offline.")
        g.log.info("Rebooted node is online")

        # Wait for glusterd to start completely
        ret = wait_for_glusterd_to_start(self.random_server)
        self.assertTrue(ret, "glusterd is not running on %s"
                        % self.random_server)
        g.log.info("glusterd is started and running on %s",
                   self.random_server)
Example #13
    def test_glusterd_split_brain_with_quorum(self):
        """
        - On a 6 node cluster
        - Create a volume using first four nodes
        - Set the volumes options
        - Stop two gluster nodes
        - Perform gluster vol reset
        - Start the glusterd on the nodes where it stopped
        - Check the peer status, all the nodes should be in connected state

        """
        # Before starting the testcase, proceed only if there are at
        # least 6 nodes
        self.assertGreaterEqual(len(self.servers), 6,
                                "Not enough servers to run this test")

        # Volume options to set on the volume
        volume_options = {
            'nfs.disable': 'off',
            'auth.allow': '1.1.1.1',
            'nfs.rpc-auth-allow': '1.1.1.1',
            'nfs.addr-namelookup': 'on',
            'cluster.server-quorum-type': 'server',
            'network.ping-timeout': '20',
            'nfs.port': '2049',
            'performance.nfs.write-behind': 'on',
        }

        # Set the volume options
        ret = set_volume_options(self.mnode, self.volname, volume_options)
        self.assertTrue(ret, "Unable to set the volume options")
        g.log.info("All the volume_options set succeeded")

        # Stop glusterd on two gluster nodes where bricks aren't present
        ret = stop_glusterd(self.servers[-2:])
        self.assertTrue(ret, "Failed to stop glusterd on one of the node")
        g.log.info("Glusterd stop on the nodes : %s "
                   "succeeded", self.servers[-2:])

        # Check glusterd is stopped
        ret = is_glusterd_running(self.servers[-2:])
        self.assertEqual(ret, 1, "Glusterd is running on nodes")
        g.log.info("Expected: Glusterd stopped on nodes %s", self.servers[-2:])

        # Performing volume reset on the volume to remove all the volume
        # options set earlier
        ret, _, err = volume_reset(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Volume reset failed with below error "
                         "%s" % err)
        g.log.info("Volume reset on the volume %s succeeded", self.volname)

        # Bring back glusterd online on the nodes where it stopped earlier
        ret = start_glusterd(self.servers[-2:])
        self.assertTrue(ret, "Failed to start glusterd on the nodes")
        g.log.info("Glusterd start on the nodes : %s "
                   "succeeded", self.servers[-2:])

        # Check peer status: all peers should be in connected state and
        # none of the nodes should be in peer rejected state
        halt = 20
        counter = 0
        _rc = False
        g.log.info("Wait for some seconds, right after glusterd start it "
                   "will create two daemon process it need few seconds "
                   "(like 3-5) to initialize the glusterd")
        while counter < halt:
            ret = is_peer_connected(self.mnode, self.servers)
            if not ret:
                g.log.info("Peers are not connected state,"
                           " Retry after 2 seconds .......")
                sleep(2)
                counter = counter + 2
            else:
                _rc = True
                g.log.info("Peers are in connected state in the cluster")
                break
        if not _rc:
            raise ExecutionError("Peers are not in connected state after "
                                 "bringing glusterd back online on the "
                                 "nodes where glusterd had previously "
                                 "been stopped")
Example #14
    def test_quorum_remove_brick(self):
        '''
        -> Create volume
        -> Enabling server quorum
        -> Set server quorum ratio to 95%
        -> Stop the glusterd on any one of the node
        -> Perform remove brick operation
        -> start glusterd
        -> Check gluster vol info, bricks should be same before and after
        performing remove brick operation.
        '''
        # Enabling server quorum
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.server-quorum-type': 'server'})
        self.assertTrue(ret, "Failed to set server quorum for volume %s"
                        % self.volname)
        g.log.info("Able to set server quorum successfully for %s",
                   self.volname)

        # Setting server quorum ratio in percentage
        ret = set_volume_options(self.mnode, 'all',
                                 {'cluster.server-quorum-ratio': '95%'})
        self.assertTrue(ret, "Failed to set server quorum ratio for %s"
                        % self.servers)
        g.log.info("Able to set server quorum ratio successfully for %s",
                   self.servers)

        # Getting brick list from volume
        brick_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(brick_list, "Failed to get brick list of %s"
                             % self.volname)
        g.log.info("Successful in getting brick list of %s", self.volname)

        # Stopping glusterd
        self.random_server = random.choice(self.servers[1:])
        ret = stop_glusterd(self.random_server)
        self.assertTrue(ret, "Failed to stop glusterd on %s"
                        % self.random_server)
        g.log.info("Glusterd stopped successfully on %s", self.random_server)

        # Forming brick list for performing remove brick operation
        remove_brick_list = form_bricks_list_to_remove_brick(self.mnode,
                                                             self.volname)
        self.assertIsNotNone(remove_brick_list, "Failed to get brick list for "
                                                "performing remove brick "
                                                "operation")
        g.log.info("Successful in getting brick list for performing remove "
                   "brick operation")

        # Performing remove brick operation
        ret, _, err = remove_brick(self.mnode, self.volname,
                                   remove_brick_list, 'force')
        self.assertNotEqual(ret, 0, "Remove brick should fail when quorum is "
                                    "in not met condition, but brick removed "
                                    "successfully for %s" % self.volname)
        g.log.info("Failed to remove brick when quorum is in not met condition"
                   " as expected for %s", self.volname)

        # Expected error message for remove brick operation
        msg = ("volume remove-brick commit force: failed: "
               "Quorum not met. Volume operation not allowed")

        # Checking error message for remove brick operation
        self.assertIn(msg, err, "Error message is not correct for "
                                "remove brick operation when quorum not met")
        g.log.info("Error message is correct for remove brick operation when "
                   "quorum not met")

        # Starting glusterd
        ret = start_glusterd(self.random_server)
        self.assertTrue(ret, "Failed to start glusterd on %s"
                        % self.random_server)
        g.log.info("Glusted started successfully on %s", self.random_server)

        # Checking glusterd status
        count = 0
        while count < 60:
            ret = is_glusterd_running(self.random_server)
            if not ret:
                break
            sleep(2)
            count += 1
        self.assertEqual(ret, 0, "Glusterd is not running on %s"
                         % self.random_server)
        g.log.info("Glusterd is running on %s", self.random_server)

        # Getting brick list of volume after performing remove brick operation
        new_brick_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(new_brick_list, "Failed to get brick list of %s"
                             % self.volname)
        g.log.info("Successful in getting brick list of %s", self.volname)

        # Comparing bricks info before and after performing
        # remove brick operation
        self.assertListEqual(brick_list, new_brick_list,
                             "Bricks are not same before and after performing"
                             " remove brick operation")
        g.log.info("Bricks are same before and after "
                   "performing remove brick operation")
Example #15
    def test_profile_start_with_quorum_not_met(self):
        # pylint: disable=too-many-statements
        """
        1. Create a volume
        2. Set the quorum type to server and ratio to 90
        3. Stop glusterd randomly on one of the node
        4. Start profile on the volume
        5. Start glusterd on the node where it is stopped
        6. Start profile on the volume
        7. Stop profile on the volume where it is started
        """

        # Enabling server quorum
        self.quorum_options = {'cluster.server-quorum-type': 'server'}
        ret = set_volume_options(self.mnode, self.volname, self.quorum_options)
        self.assertTrue(ret, "gluster volume set %s cluster.server-quorum-type"
                             " server Failed" % self.volname)
        g.log.info("gluster volume set %s cluster.server-quorum-type server "
                   "enabled successfully", self.volname)

        # Setting Quorum ratio to 90%
        self.quorum_perecent = {'cluster.server-quorum-ratio': '90%'}
        ret = set_volume_options(self.mnode, 'all', self.quorum_perecent)
        self.assertTrue(ret, "gluster volume set all cluster.server-quorum-rat"
                             "io percentage Failed :%s" % self.servers)
        g.log.info("gluster volume set all cluster.server-quorum-ratio 90 "
                   "percentage enabled successfully on :%s", self.servers)

        # Stop glusterd on one of the node randomly
        self.node_on_glusterd_to_stop = choice(self.servers)
        ret = stop_glusterd(self.node_on_glusterd_to_stop)
        self.assertTrue(ret, "glusterd stop on the node failed")
        g.log.info("glusterd stop on the node: % "
                   "succeeded", self.node_on_glusterd_to_stop)

        # checking whether peers are connected or not
        count = 0
        while count < 5:
            ret = self.validate_peers_are_connected()
            if not ret:
                break
            sleep(2)
            count += 1
        self.assertFalse(ret, "Peers are in connected state even after "
                              "stopping glusterd on one node")

        # Starting volume profile when quorum is not met
        self.new_servers = self.servers[:]
        self.new_servers.remove(self.node_on_glusterd_to_stop)
        ret, _, _ = profile_start(choice(self.new_servers),
                                  self.volname)
        self.assertNotEqual(ret, 0, "Expected: Should not be able to start "
                                    "volume profile. Acutal: Able to start "
                                    "the volume profile start")
        g.log.info("gluster vol profile start is failed as expected")

        # Start glusterd on the node where it is stopped
        ret = start_glusterd(self.node_on_glusterd_to_stop)
        self.assertTrue(ret, "glusterd start on the node failed")
        g.log.info("glusterd start succeeded")

        # checking whether peers are connected or not
        count = 0
        while count < 5:
            ret = self.validate_peers_are_connected()
            if ret:
                break
            sleep(5)
            count += 1
        self.assertTrue(ret, "Peer are not in connected state ")

        # Starting profile when volume quorum is met
        ret, _, _ = profile_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Expected: Should be able to start the"
                                 "volume profile start. Acutal: Not able"
                                 " to start the volume profile start")
        g.log.info("gluster vol profile start is successful")

        # Stop the profile
        ret, _, _ = profile_stop(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Expected: Should be able to stop the "
                                 "profile stop. Acutal: Not able to stop"
                                 " the profile stop")
        g.log.info("gluster volume profile stop is successful")
Example #16
    def test_peer_probe_when_glusterd_down(self):
        # pylint: disable=too-many-statements
        '''
        Test script to verify the behavior when we try to peer
        probe a valid node whose glusterd is down
        Also post validate to make sure no core files are created
        under "/", "/var/log/core" and "/tmp" directories

        Ref: BZ#1257394 Provide meaningful error on peer probe and peer detach
        Test Steps:
        1 check the current peer status
        2 detach one of the valid nodes which is already part of cluster
        3 stop glusterd on that node
        4 try to attach above node to cluster, which must fail with
          Transport End point error
        5 Recheck the test using hostname, expected to see same result
        6 start glusterd on that node
        7 halt/reboot the node
        8 try to peer probe the halted node, which must fail again.
        9 The only error accepted is
          "peer probe: failed: Probe returned with Transport endpoint is not
          connected"
        10 Check peer status and make sure no other nodes in peer reject state
        '''

        ret, test_timestamp, _ = g.run_local('date +%s')
        test_timestamp = test_timestamp.strip()

        # Detach one of the nodes which is part of the cluster
        g.log.info("detaching server %s ", self.servers[1])
        ret, _, err = peer_detach(self.mnode, self.servers[1])
        msg = 'peer detach: failed: %s is not part of cluster\n' \
              % self.servers[1]
        if ret:
            self.assertEqual(err, msg, "Failed to detach %s "
                             % (self.servers[1]))

        # Bring down glusterd of the server which has been detached
        g.log.info("Stopping glusterd on %s ", self.servers[1])
        ret = stop_glusterd(self.servers[1])
        self.assertTrue(ret, "Fail to stop glusterd on %s " % self.servers[1])

        # Trying to peer probe the node whose glusterd was stopped using IP
        g.log.info("Peer probing %s when glusterd down ", self.servers[1])
        ret, _, err = peer_probe(self.mnode, self.servers[1])
        self.assertNotEqual(ret, 0, "Peer probe should not pass when "
                                    "glusterd is down")
        self.assertEqual(err, "peer probe: failed: Probe returned with "
                              "Transport endpoint is not connected\n")

        # Trying to peer probe the same node with hostname
        g.log.info("Peer probing node %s using hostname with glusterd down ",
                   self.servers[1])
        hostname = g.run(self.servers[1], "hostname")
        ret, _, err = peer_probe(self.mnode, hostname[1].strip())
        self.assertNotEqual(ret, 0, "Peer probe should not pass when "
                                    "glusterd is down")
        self.assertEqual(err, "peer probe: failed: Probe returned with"
                              " Transport endpoint is not connected\n")

        # Start glusterd again for the next set of test steps
        g.log.info("starting glusterd on %s ", self.servers[1])
        ret = start_glusterd(self.servers[1])
        self.assertTrue(ret, "glusterd couldn't start successfully on %s"
                        % self.servers[1])

        # Bring down the network for some time
        network_status = bring_down_network_interface(self.servers[1], 150)

        # Peer probing the node using IP when it is still not online
        g.log.info("Peer probing node %s when network is down",
                   self.servers[1])
        ret, _, err = peer_probe(self.mnode, self.servers[1])
        self.assertNotEqual(ret, 0, "Peer probe passed when it was expected to"
                                    " fail")
        self.assertEqual(err.split("\n")[0], "peer probe: failed: Probe "
                                             "returned with Transport endpoint"
                                             " is not connected")

        # Peer probing the node using hostname when it is still not online
        g.log.info("Peer probing node %s using hostname which is still "
                   "not online ",
                   self.servers[1])
        ret, _, err = peer_probe(self.mnode, hostname.strip())
        self.assertNotEqual(ret, 0, "Peer probe should not pass when node "
                                    "has not come online")
        self.assertEqual(err.split("\n")[0], "peer probe: failed: Probe "
                                             "returned with Transport endpoint"
                                             " is not connected")

        ret, _, _ = network_status.async_communicate()
        if ret != 0:
            g.log.error("Failed to perform network interface ops")

        # Peer probe the node must pass
        g.log.info("peer probing node %s", self.servers[1])
        ret, _, err = peer_probe(self.mnode, self.servers[1])
        self.assertEqual(ret, 0, "Peer probe has failed unexpectedly with "
                                 "%s " % err)

        # Checking if core file created in "/", "/tmp" and "/var/log/core"
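        # Note: despite its name, is_core_file_created() is used across
        # these tests so that True means no new core files were found
        # since the timestamp, hence assertTrue asserts the absence of
        # cores (treat this reading as an assumption).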
        ret = is_core_file_created(self.servers, test_timestamp)
        self.assertTrue(ret, "core file found")
Example #17
    def test_volume_set_wit_quorum_enabled(self):
        # pylint: disable=too-many-statements
        """
        Create a volume
        Set the quorum type to server and ratio to 90
        Stop glusterd randomly on one of the node
        Set the volume option on the volume
        Start glusterd on the node where it is stopped
        Set the volume option on the volume
        """

        # Enabling server quorum
        self.quorum_options = {'cluster.server-quorum-type': 'server'}
        ret = set_volume_options(self.mnode, self.volname, self.quorum_options)
        self.assertTrue(
            ret, "gluster volume set %s cluster.server-quorum-type"
            " server Failed" % self.volname)
        g.log.info(
            "gluster volume set %s cluster.server-quorum-type server "
            "enabled successfully", self.volname)

        # Setting Quorum ratio to 90%
        self.quorum_perecent = {'cluster.server-quorum-ratio': '90%'}
        ret = set_volume_options(self.mnode, 'all', self.quorum_perecent)
        self.assertTrue(
            ret, "gluster volume set all cluster.server-quorum-rat"
            "io percentage Failed :%s" % self.servers)
        g.log.info(
            "gluster volume set all cluster.server-quorum-ratio 90 "
            "percentage enabled successfully on :%s", self.servers)

        # Stop glusterd on one of the node randomly
        self.node_on_glusterd_to_stop = random.choice(self.servers)
        ret = stop_glusterd(self.node_on_glusterd_to_stop)
        self.assertTrue(ret, "glusterd stop on the node failed")
        g.log.info("glusterd stop on the node: % "
                   "succeeded", self.node_on_glusterd_to_stop)

        # checking whether peers are connected or not
        count = 0
        while count < 5:
            sleep(2)
            ret = self.validate_peers_are_connected()
            if not ret:
                break
            count += 1
        self.assertFalse(
            ret, "Peers are in connected state even after "
            "stopping glusterd on one node")

        # Setting volume option when quorum is not met
        self.new_servers = self.servers[:]
        self.new_servers.remove(self.node_on_glusterd_to_stop)
        self.nfs_options = {"nfs.disable": "off"}
        ret = set_volume_options(random.choice(self.new_servers), self.volname,
                                 self.nfs_options)
        self.assertFalse(
            ret, "gluster volume set %s nfs.disable off "
            "succeeded" % self.volname)
        g.log.info("gluster volume set %s nfs.disable off failed "
                   "as expected while quorum is not met", self.volname)

        # Start glusterd on the node where it is stopped
        ret = start_glusterd(self.node_on_glusterd_to_stop)
        self.assertTrue(ret, "glusterd start on the node failed")
        g.log.info("glusterd start succeeded")

        # checking whether peers are connected or not
        count = 0
        while count < 5:
            sleep(5)
            ret = self.validate_peers_are_connected()
            if ret:
                break
            count += 1
        self.assertTrue(ret, "Peer are not in connected state ")

        # Setting the volume option when quorum is met
        self.nfs_options['nfs.disable'] = 'off'
        ret = set_volume_options(self.mnode, self.volname, self.nfs_options)
        self.assertTrue(
            ret, "gluster volume set %s nfs.disable "
            "off failed" % self.volname)
        g.log.info("gluster volume set %s nfs.disable "
                   "off successfully", self.volname)
Example #18
    def test_replace_brick_quorum(self):
        '''
        -> Create volume
        -> Set quorum type
        -> Set quorum ratio to 95%
        -> Start the volume
        -> Stop the glusterd on one node
        -> Now quorum is in not met condition
        -> Check all bricks went to offline or not
        -> Perform replace brick operation
        -> Start glusterd on same node which is already stopped
        -> Check all bricks are in online or not
        -> Verify in vol info that old brick not replaced with new brick
        '''

        # Forming brick list, 6 bricks for creating volume, 7th brick for
        # performing replace brick operation
        brick_list = form_bricks_list(self.mnode, self.volname, 7,
                                      self.servers, self.all_servers_info)

        # Create Volume
        ret, _, _ = volume_create(self.mnode,
                                  self.volname,
                                  brick_list[0:6],
                                  replica_count=3)
        self.assertEqual(ret, 0, "Failed to create volume %s" % self.volname)
        g.log.info("Volume created successfully %s", self.volname)

        # Enabling server quorum
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.server-quorum-type': 'server'})
        self.assertTrue(
            ret, "Failed to set server quorum on volume %s" % self.volname)
        g.log.info("Able to set server quorum successfully on volume %s",
                   self.volname)

        # Setting Quorum ratio in percentage
        ret = set_volume_options(self.mnode, 'all',
                                 {'cluster.server-quorum-ratio': '95%'})
        self.assertTrue(
            ret, "Failed to set server quorum ratio on %s" % self.servers)
        g.log.info("Able to set server quorum ratio successfully on %s",
                   self.servers)

        # Start the volume
        ret, _, _ = volume_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start volume %s" % self.volname)
        g.log.info("Volume started successfully %s", self.volname)

        # Stop glusterd on one of the node
        random_server = random.choice(self.servers[1:])
        ret = stop_glusterd(random_server)
        self.assertTrue(ret, "Failed to stop glusterd for %s" % random_server)
        g.log.info("Glusterd stopped successfully on server %s", random_server)

        # Checking whether glusterd is running or not
        ret = is_glusterd_running(random_server)
        self.assertEqual(
            ret, 1, "Glusterd is still running on the node %s "
            "where glusterd stopped" % random_server)
        g.log.info("Glusterd is not running on the server %s", random_server)

        # Verifying node count in volume status after glusterd stopped
        # on one of the servers. It's not possible to check the brick
        # status in volume status immediately after glusterd stops
        count = 0
        while count < 100:
            vol_status = get_volume_status(self.mnode, self.volname)
            servers_count = len(vol_status[self.volname].keys())
            if servers_count == 5:
                break
            sleep(2)
            count += 1

        # creating brick list from volume status
        offline_bricks = []
        vol_status = get_volume_status(self.mnode, self.volname)
        for node in vol_status[self.volname]:
            for brick_path in vol_status[self.volname][node]:
                if brick_path != 'Self-heal Daemon':
                    offline_bricks.append(':'.join([node, brick_path]))

        # Checking bricks are offline or not with quorum ratio(95%)
        ret = are_bricks_offline(self.mnode, self.volname, offline_bricks)
        self.assertTrue(
            ret, "Bricks are online even though quorum is not met "
            "for %s" % self.volname)
        g.log.info(
            "Bricks are offline when quorum is not met "
            "for %s", self.volname)

        # Getting random brick from offline brick list
        self.random_brick = random.choice(offline_bricks)

        # Performing replace brick commit force when quorum not met
        self.replace_brick_failed = False
        ret, _, _ = replace_brick(self.mnode, self.volname, self.random_brick,
                                  brick_list[6])
        self.assertNotEqual(
            ret, 0, "Replace brick should fail when quorum is "
            "not met, but replace brick succeeded on %s" % self.volname)
        g.log.info(
            "Failed to replace brick when quorum is not met, "
            "as expected for %s", self.volname)
        self.replace_brick_failed = True

        # Start glusterd on one of the node
        ret = start_glusterd(random_server)
        self.assertTrue(
            ret, "Failed to start glusterd on server %s" % random_server)
        g.log.info("Glusterd started successfully on server %s", random_server)

        # Verifying node count in volume status after glusterd started
        # on one of the servers. It's not possible to check the brick
        # status in volume status immediately after glusterd starts
        count = 0
        while count < 100:
            vol_status = get_volume_status(self.mnode, self.volname)
            servers_count = len(vol_status[self.volname].keys())
            if servers_count == 6:
                break
            sleep(2)
            count += 1

        # Checking bricks are online or not
        count = 0
        while count < 100:
            ret = are_bricks_online(self.mnode, self.volname, brick_list[0:6])
            if ret:
                break
            sleep(2)
            count += 1
        self.assertTrue(ret, "All bricks are not online for %s" % self.volname)
        g.log.info("All bricks are online for volume %s", self.volname)

        # Comparing brick lists of before and after performing replace brick
        # operation
        after_brick_list = get_all_bricks(self.mnode, self.volname)
        self.assertListEqual(
            after_brick_list, brick_list[0:6],
            "Bricks are not same before and after performing "
            "replace brick operation for volume %s" % self.volname)
        g.log.info(
            "Bricks are same before and after performing replace "
            "brick operation for volume %s", self.volname)
Example #19
    def test_glusterd_quorum_validation(self):
        """
        -> Create two volumes and start them, stop the second volume
        -> Set the server quorum and set the ratio to 90
        -> Stop glusterd on one of the nodes, so that quorum is not met
        -> Peer probing a new node should fail
        -> Volume stop will fail
        -> Volume delete will fail
        -> Volume reset will fail
        -> Start glusterd on the node where it was stopped
        -> Volume stop, start, delete will succeed once quorum is met
        """
        # pylint: disable=too-many-statements, too-many-branches

        # Peer probe first 3 servers
        servers_info_from_three_nodes = {}
        for server in self.servers[0:3]:
            servers_info_from_three_nodes[server] = self.all_servers_info[
                server]

            # Peer probe the first 3 servers
            ret, _, _ = peer_probe(self.mnode, server)
            self.assertEqual(ret, 0,
                             ("Peer probe failed to one of the server"))
        g.log.info("Peer probe to first 3 nodes succeeded")

        self.volume['servers'] = self.servers[0:3]
        # Create a volume using the first 3 nodes
        ret = setup_volume(self.mnode,
                           servers_info_from_three_nodes,
                           self.volume,
                           force=True)
        self.assertTrue(ret, ("Failed to create and start volume"))
        g.log.info("Volume created and started successfully")

        # Creating another volume and stopping it
        second_volume = "second_volume"
        self.volume['name'] = second_volume
        ret = setup_volume(self.mnode,
                           servers_info_from_three_nodes,
                           self.volume,
                           force=True)
        self.assertTrue(ret, ("Failed to create and start volume"))
        g.log.info("Volume created and started succssfully")

        # stopping the second volume
        g.log.info("Stopping the second volume %s", second_volume)
        ret, _, _ = volume_stop(self.mnode, second_volume)
        self.assertEqual(ret, 0, ("Failed to stop the volume"))
        g.log.info("Successfully stopped second volume %s", second_volume)

        # Setting the server-quorum-type as server
        self.options = {"cluster.server-quorum-type": "server"}
        vol_list = get_volume_list(self.mnode)
        self.assertIsNotNone(vol_list, "Failed to get the volume list")
        g.log.info("Fetched the volume list")
        for volume in vol_list:
            g.log.info(
                "Setting the server-quorum-type as server"
                " on volume %s", volume)
            ret = set_volume_options(self.mnode, volume, self.options)
            self.assertTrue(ret, ("Failed to set the quorum type as a server"
                                  " on volume %s", volume))
        g.log.info("Server Quorum type is set as a server")

        # Setting the server quorum ratio to 90
        self.quorum_perecent = {'cluster.server-quorum-ratio': '90%'}
        ret = set_volume_options(self.mnode, 'all', self.quorum_perecent)
        self.assertTrue(ret, ("Failed to set the server quorum ratio "
                              "to 90 on servers"))
        g.log.info("Successfully set server quorum ratio to 90% on servers")

        # Stop glusterd on one of the node
        ret = stop_glusterd(self.servers[2])
        self.assertTrue(ret, ("Failed to stop glusterd on "
                              "node %s", self.servers[2]))
        g.log.info("Glusterd stop on the nodes : %s"
                   " succeeded", self.servers[2])

        # Check glusterd is stopped
        ret = is_glusterd_running(self.servers[2])
        self.assertEqual(ret, 1, "Unexpected: Glusterd is running on node")
        g.log.info("Expected: Glusterd stopped on node %s", self.servers[2])

        # Adding a new peer will fail as quorum not met
        ret, _, _ = peer_probe(self.mnode, self.servers[3])
        self.assertNotEqual(ret, 0,
                            ("Unexpected:"
                             "Succeeded to peer probe new node %s when quorum "
                             "is not met", self.servers[3]))
        g.log.info("Failed to peer probe new node as expected"
                   " when quorum not met")

        # Starting the stopped second volume should fail as quorum is not met
        ret, _, _ = volume_start(self.mnode, second_volume)
        self.assertNotEqual(
            ret, 0, "Unexpected: Successfully started "
            "volume even when quorum not met.")
        g.log.info(
            "Volume start %s failed as expected when quorum "
            "is not met", second_volume)

        # Stopping the first volume should fail as quorum is not met
        ret, _, _ = volume_stop(self.mnode, self.volname)
        self.assertEqual(
            ret, 1, "Unexpected: Successfully stopped"
            " volume even when quorum is not met")
        g.log.info(
            "volume stop %s failed as expected when quorum "
            "is not met", self.volname)

        # Stopping a volume with force option should fail
        ret, _, _ = volume_stop(self.mnode, self.volname, force=True)
        self.assertNotEqual(
            ret, 0, "Unexpected: Successfully "
            "stopped volume with force. Expected: "
            "Volume stop should fail when quourm is not met")
        g.log.info("volume stop failed as expected when quorum is not met")

        # Deleting a volume should fail. Deleting the second volume.
        ret = volume_delete(self.mnode, second_volume)
        self.assertFalse(
            ret, "Unexpected: Volume delete was "
            "successful even when quourm is not met")
        g.log.info("volume delete failed as expected when quorum is not met")

        # Volume reset should fail when quorum is not met
        ret, _, _ = volume_reset(self.mnode, self.volname)
        self.assertNotEqual(
            ret, 0, "Unexpected: Volume reset was "
            "successful even when quorum is not met")
        g.log.info("volume reset failed as expected when quorum is not met")

        # Volume reset should fail even with force when quorum is not met
        ret, _, _ = volume_reset(self.mnode, self.volname, force=True)
        self.assertNotEqual(
            ret, 0, "Unexpected: Volume reset was "
            "successful with force even "
            "when quorum is not met")
        g.log.info("volume reset failed as expected when quorum is not met")

        # Start glusterd on the node where glusterd is stopped
        ret = start_glusterd(self.servers[2])
        self.assertTrue(ret, "Failed to start glusterd on one node")
        g.log.info("Started glusterd on server"
                   " %s successfully", self.servers[2])

        ret = is_glusterd_running(self.servers[2])
        self.assertEqual(ret, 0, ("glusterd is not running on "
                                  "node %s", self.servers[2]))
        g.log.info("glusterd is running on node" " %s ", self.servers[2])

        # Check peer status: all peers should be in connected state and
        # none of the nodes should be in peer rejected state
        halt, counter, _rc = 30, 0, False
        g.log.info("Wait a few seconds; right after glusterd starts it "
                   "spawns its daemon processes and needs a few seconds "
                   "(around 3-5) to initialize")
        while counter < halt:
            ret = is_peer_connected(self.mnode, self.servers[0:3])
            if not ret:
                g.log.info("Peers are not connected state,"
                           " Retry after 2 seconds .......")
                sleep(2)
                counter = counter + 2
            else:
                _rc = True
                g.log.info("Peers are in connected state in the cluster")
                break

        self.assertTrue(_rc, ("Peers are not connected state after "
                              "bringing back glusterd online on the "
                              "nodes in which previously glusterd "
                              "had been stopped"))

        # Check all bricks are online or wait for the bricks to be online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "All bricks are not online")
        g.log.info("All bricks of the volume %s are online", self.volname)

        # Once quorum is met should be able to cleanup the volume
        ret = volume_delete(self.mnode, second_volume)
        self.assertTrue(ret, "Volume delete failed even when quorum is met")
        g.log.info("volume delete succeed without any issues")

        # Volume stop should succeed
        ret, _, _ = volume_stop(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Volume stop failed")
        g.log.info("succeeded stopping the volume as expected")

        # volume reset should succeed
        ret, _, _ = volume_reset(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Volume reset failed ")
        g.log.info("volume reset succeeded as expected when quorum is not met")

        # Peer probe new node should succeed
        ret, _, _ = peer_probe(self.mnode, self.servers[3])
        self.assertEqual(
            ret, 0, ("Failed to peer probe new node even when quorum is met"))
        g.log.info("Succeeded to peer probe new node when quorum met")

        # Check peer status: all peers should be in connected state and
        # none of the nodes should be in peer rejected state
        halt, counter, _rc = 30, 0, False
        g.log.info("Wait for some seconds, right after peer probe")
        while counter < halt:
            ret = is_peer_connected(self.mnode, self.servers[0:3])
            if not ret:
                g.log.info("Peers are not connected state,"
                           " Retry after 2 seconds .......")
                sleep(2)
                counter = counter + 2
            else:
                _rc = True
                g.log.info("Peers are in connected state in the cluster")
                break

        self.assertTrue(_rc, ("Peers are not connected state"))
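
Hedged arithmetic behind the 90% ratio used above: with 3 peers, quorum needs roughly ceil(0.9 * 3) = 3 live glusterds, so a single node going down blocks every state-changing operation. glusterd's exact comparison (strict inequality vs. rounding) is an implementation detail; the helper below only illustrates the magnitude:

import math

def peers_needed_for_quorum(total_peers, ratio_percent):
    # Approximation: assume quorum needs at least ceil(ratio * total)
    # live peers; glusterd's precise rounding may differ.
    return math.ceil(total_peers * ratio_percent / 100.0)

assert peers_needed_for_quorum(3, 90) == 3  # one node down -> quorum lost
assert peers_needed_for_quorum(3, 50) == 2  # a 50% ratio would tolerate one node down
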
    def test_snap_glusterd_down(self):
        # pylint: disable=too-many-statements
        """
        Steps:

        1. Create a volume
        2. Mount the volume
        3. Create a snapshot of that volume
        4. Validate using snapshot info
        5. Activate the snapshot
        6. List all snapshots present
        7. Validate using snapshot info
        8. Stop glusterd on one node
        9. Check glusterd status
        10. Deactivate the created snapshot
        11. Start glusterd on that node
        12. Check glusterd status
        13. Validate using snapshot info
        14. Check all peers are connected

        """
        # Creating snapshot:
        g.log.info("Starting to Create snapshot")
        ret, _, _ = snap_create(self.mnode, self.volname, self.snap)
        self.assertEqual(ret, 0, ("Failed to create snapshot %s for volume %s"
                                  % (self.snap, self.volname)))
        g.log.info("Snapshot %s created successfully "
                   "for volume %s", self.snap, self.volname)

        # Check snapshot info
        g.log.info("Checking snapshot info")
        snap_info = get_snap_info_by_snapname(self.mnode, self.snap)
        self.assertIsNotNone(snap_info, "Failed to get snap information"
                             "for snapshot %s" % self.snap)
        status = snap_info['snapVolume']['status']
        self.assertNotEqual(status, 'Started', "snapshot %s "
                            "not started" % self.snap)
        g.log.info("Successfully checked snapshot info")

        # Activating snapshot
        g.log.info("Starting to Activate Snapshot")
        ret, _, _ = snap_activate(self.mnode, self.snap)
        self.assertEqual(ret, 0, ("Failed to Activate snapshot %s"
                                  % self.snap))
        g.log.info("Snapshot %s activated successfully", self.snap)

        # snapshot list
        g.log.info("Starting to validate list of snapshots")
        snap_list1 = get_snap_list(self.mnode)
        self.assertIsNotNone(snap_list1, "Failed to list all the snapshot")
        self.assertEqual(len(snap_list1), 1, "Failed to validate snap list")
        g.log.info("Snapshot list successfully validated")

        # Check snapshot info
        g.log.info("Checking snapshot info")
        snap_info = get_snap_info_by_snapname(self.mnode, self.snap)
        status = snap_info['snapVolume']['status']
        self.assertEqual(status, 'Started', "Failed to"
                         "start snapshot info")
        g.log.info("Successfully checked snapshot info")

        # Stop glusterd on one node
        g.log.info("Stopping glusterd on one node")
        ret = stop_glusterd(self.servers[1])
        self.assertTrue(ret, "Failed to stop glusterd on %s"
                        % self.servers[1])

        # Check Glusterd status
        g.log.info("Check glusterd running or not")
        count = 0
        while count < 80:
            ret = is_glusterd_running(self.servers[1])
            if ret == 1:
                break
            time.sleep(2)
            count += 2
        self.assertEqual(ret, 1, "Unexpected: glusterd running on node %s" %
                         self.servers[1])
        g.log.info("Expected: Glusterd not running on node %s",
                   self.servers[1])

        # de-activating snapshot
        g.log.info("Starting to de-activate Snapshot")
        ret, _, _ = snap_deactivate(self.mnode, self.snap)
        self.assertEqual(ret, 0, ("Failed to deactivate snapshot %s"
                                  % self.snap))
        g.log.info("Snapshot %s deactivated successfully", self.snap)

        # validate snapshot info
        g.log.info("Checking snapshot info")
        snap_info = get_snap_info_by_snapname(self.mnode, self.snap)
        status = snap_info['snapVolume']['status']
        self.assertNotEqual(status, 'Started', "snapshot %s "
                            "not started" % self.snap)
        g.log.info("Successfully validated snapshot info")

        # Start Glusterd on node
        g.log.info("Starting Glusterd on node %s", self.servers[1])
        ret = start_glusterd(self.servers[1])
        self.assertTrue(ret, "Failed to start glusterd on %s node"
                        % self.servers[1])
        g.log.info("Successfully started glusterd on "
                   "%s node", self.servers[1])

        # Check Glusterd status
        g.log.info("Check glusterd running or not")
        count = 0
        while count < 80:
            ret = is_glusterd_running(self.servers[1])
            if ret:
                break
            time.sleep(2)
            count += 2
        self.assertEqual(ret, 0, "glusterd not running on node %s "
                         % self.servers[1])
        g.log.info("glusterd is running on %s node",
                   self.servers[1])

        # validate snapshot info
        g.log.info("Checking snapshot info")
        snap_info = get_snap_info_by_snapname(self.mnode, self.snap)
        self.assertIsNotNone(snap_info, "Failed to get snap info for"
                             " snapshot %s" % self.snap)
        status = snap_info['snapVolume']['status']
        self.assertNotEqual(status, 'Started', "snapshot"
                            " %s failed to validate with snap info"
                            % self.snap)
        g.log.info("Successfully validated snapshot info")

        # Check all the peers are in connected state
        g.log.info("Validating all the peers are in connected state")
        for server in self.servers:
            count = 0
            while count < 80:
                ret = is_peer_connected(self.mnode, server)
                if ret:
                    break
                time.sleep(2)
                count += 2
            self.assertTrue(ret, "Peer %s is not connected" % server)
        g.log.info("Successfully validated all the peers")
Example No. 21
    def test_peer_probe_when_glusterd_down(self):
        # pylint: disable=too-many-statements
        '''
        Test script to verify the behavior when we try to peer
        probe a valid node whose glusterd is down
        Also post-validate to make sure no core files are created
        under "/", "/var/log/core" and "/tmp" directories

        Ref: BZ#1257394 Provide meaningful error on peer probe and peer detach
        Test Steps:
        1 check the current peer status
        2 detach one of the valid nodes which is already part of the cluster
        3 stop glusterd on that node
        4 try to attach the above node to the cluster, which must fail with
          a Transport endpoint error
        5 recheck using the hostname, expecting the same result
        6 start glusterd on that node
        7 halt/reboot the node
        8 try to peer probe the halted node, which must fail again
        9 The only error accepted is
          "peer probe: failed: Probe returned with Transport endpoint is not
          connected"
        10 Check peer status and make sure no other nodes in peer reject state
        '''

        ret, test_timestamp, _ = g.run_local('date +%s')
        test_timestamp = test_timestamp.strip()

        # detach one of the nodes which is part of the cluster
        g.log.info("detaching server %s ", self.servers[1])
        ret, _, err = peer_detach(self.mnode, self.servers[1])
        msg = 'peer detach: failed: %s is not part of cluster\n' \
              % self.servers[1]
        if ret:
            self.assertEqual(err, msg, "Failed to detach %s "
                             % (self.servers[1]))

        # bring down glusterd of the server which has been detached
        g.log.info("Stopping glusterd on %s ", self.servers[1])
        ret = stop_glusterd(self.servers[1])
        self.assertTrue(ret, "Fail to stop glusterd on %s " % self.servers[1])

        # trying to peer probe the node whose glusterd was stopped using its IP
        g.log.info("Peer probing %s when glusterd down ", self.servers[1])
        ret, _, err = peer_probe(self.mnode, self.servers[1])
        self.assertNotEqual(ret, 0, "Peer probe should not pass when "
                                    "glusterd is down")
        self.assertEqual(err, "peer probe: failed: Probe returned with "
                              "Transport endpoint is not connected\n")

        # trying to peer probe the same node with hostname
        g.log.info("Peer probing node %s using hostname with glusterd down ",
                   self.servers[1])
        hostname = g.run(self.servers[1], "hostname")
        ret, _, err = peer_probe(self.mnode, hostname[1].strip())
        self.assertNotEqual(ret, 0, "Peer probe should not pass when "
                                    "glusterd is down")
        self.assertEqual(err, "peer probe: failed: Probe returned with"
                              " Transport endpoint is not connected\n")

        # start glusterd again for the next set of test steps
        g.log.info("starting glusterd on %s ", self.servers[1])
        ret = start_glusterd(self.servers[1])
        self.assertTrue(ret, "glusterd couldn't start successfully on %s"
                        % self.servers[1])

        # reboot a server and then trying to peer probe at the time of reboot
        g.log.info("Rebooting %s and checking peer probe", self.servers[1])
        reboot = g.run_async(self.servers[1], "reboot")

        # Mandatory sleep for 3 seconds to make sure node is in halted state
        sleep(3)

        # Peer probing the node using IP when it is still not online
        g.log.info("Peer probing node %s which has been issued a reboot ",
                   self.servers[1])
        ret, _, err = peer_probe(self.mnode, self.servers[1])
        self.assertNotEqual(ret, 0, "Peer probe passed when it was expected to"
                                    " fail")
        self.assertEqual(err, "peer probe: failed: Probe returned with "
                              "Transport endpoint is not connected\n")

        # Peer probing the node using hostname when it is still not online
        g.log.info("Peer probing node %s using hostname which is still "
                   "not online ",
                   self.servers[1])
        ret, _, err = peer_probe(self.mnode, hostname[1].strip())
        self.assertNotEqual(ret, 0, "Peer probe should not pass when node "
                                    "has not come online")
        self.assertEqual(err, "peer probe: failed: Probe returned with "
                              "Transport endpoint is not connected\n")

        # The reboot kills the ssh session, so the async command is expected
        # to exit with 255
        ret, _, _ = reboot.async_communicate()
        self.assertEqual(ret, 255, "reboot failed")

        # Validate if rebooted node is online or not
        count = 0
        while count < 40:
            sleep(15)
            ret, _ = are_nodes_online(self.servers[1])
            if ret:
                g.log.info("Node %s is online", self.servers[1])
                break
            count += 1
        self.assertTrue(ret, "Node in test not yet online")

        # check if glusterd is running post reboot
        ret = wait_for_glusterd_to_start(self.servers[1],
                                         glusterd_start_wait_timeout=120)
        self.assertTrue(ret, "Glusterd service is not running post reboot")

        # peer probe the node must pass
        g.log.info("peer probing node %s", self.servers[1])
        ret, _, err = peer_probe(self.mnode, self.servers[1])
        self.assertEqual(ret, 0, "Peer probe has failed unexpectedly with "
                                 "%s " % err)

        # checking if core file created in "/", "/tmp" and "/var/log/core"
        ret = is_core_file_created(self.servers, test_timestamp)
        self.assertTrue(ret, "core file found")
Example No. 22
    def test_rebalance_hang(self):
        """
        In this test case:
        1. Trusted storage Pool of 2 nodes
        2. Create a distributed volume with 2 bricks
        3. Start the volume
        4. Mount the volume
        5. Add some data file on mount
        6. Start rebalance with force
        7. kill glusterd on 2nd node
        8. Issue volume related command
        """

        # pylint: disable=too-many-statements
        my_server_info = {
            self.servers[0]: self.all_servers_info[self.servers[0]]
        }
        my_servers = self.servers[0:2]
        index = 1
        ret, _, _ = peer_probe(self.servers[0], self.servers[index])
        self.assertEqual(ret, 0, ("peer probe from %s to %s is failed",
                                  self.servers[0], self.servers[index]))
        g.log.info("peer probe is success from %s to "
                   "%s", self.servers[0], self.servers[index])
        key = self.servers[index]
        my_server_info[key] = self.all_servers_info[key]

        self.volname = "testvol"
        bricks_list = form_bricks_list(self.mnode, self.volname, 2, my_servers,
                                       my_server_info)
        g.log.info("Creating a volume %s ", self.volname)
        ret, _, _ = volume_create(self.mnode,
                                  self.volname,
                                  bricks_list,
                                  force=False)
        self.assertEqual(ret, 0, ("Unable"
                                  "to create volume %s" % self.volname))
        g.log.info("Volume created successfully %s", self.volname)

        ret, _, _ = volume_start(self.mnode, self.volname, False)
        self.assertEqual(ret, 0, ("Failed to start the "
                                  "volume %s", self.volname))
        g.log.info("Get all the bricks of the volume")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")
        g.log.info("Successfully got the list of bricks of volume")

        # Mounting a volume
        ret, _, _ = mount_volume(self.volname,
                                 mtype=self.mount_type,
                                 mpoint=self.mounts[0].mountpoint,
                                 mserver=self.mnode,
                                 mclient=self.mounts[0].client_system)
        self.assertEqual(ret, 0, ("Volume %s is not mounted") % self.volname)
        g.log.info("Volume mounted successfully : %s", self.volname)

        self.all_mounts_procs = []
        # Creating files
        command = ("cd %s/ ; "
                   "for i in `seq 1 10` ; "
                   "do mkdir l1_dir.$i ; "
                   "for j in `seq 1 5` ; "
                   "do mkdir l1_dir.$i/l2_dir.$j ; "
                   "for k in `seq 1 10` ; "
                   "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
                   "bs=128k count=$k ; "
                   "done ; "
                   "done ; "
                   "done ; " % (self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False
        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")

        g.log.info("Starting rebalance with force on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname, False, True)
        self.assertEqual(
            ret, 0, ("Failed to start rebalance for volume %s", self.volname))
        g.log.info("Successfully rebalance on the volume %s", self.volname)

        ret = stop_glusterd(self.servers[1])
        self.assertTrue(ret, "Failed to stop glusterd on one of the node")
        ret = is_glusterd_running(self.servers[1])
        self.assertNotEqual(
            ret, 0, ("Glusterd is not stopped on server %s", self.servers[1]))
        g.log.info("Glusterd stop on the node %s succeeded",
                   self.servers[1])

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        vol_status = get_volume_status(self.mnode, self.volname)
        self.assertIsNotNone(
            vol_status, "Failed to get volume "
            "status for %s" % self.volname)

        # Start glusterd on the node where it is stopped
        ret = start_glusterd(self.servers[1])
        self.assertTrue(ret, "glusterd start on the node failed")
        count = 0
        while count < 60:
            ret = is_glusterd_running(self.servers[1])
            if not ret:
                break
            sleep(2)
            count += 1
        self.assertEqual(ret, 0,
                         "glusterd is not running on %s" % self.servers[1])
        g.log.info("Glusterd start on the nodes : %s "
                   "succeeded", self.servers[1])
    def test_brick_port(self):
        # pylint: disable=too-many-statements, too-many-branches
        """
        In this test case:
        1. Trusted storage Pool of 2 nodes
        2. Create a distributed volume with 2 bricks
        3. Start the volume
        4. Stop glusterd on node 2
        5. Modify any of the volume option on node 1
        6. Start glusterd on node 2
        7. Check volume status, brick should get port
        """
        my_server_info = {
            self.servers[0]: self.all_servers_info[self.servers[0]]
        }
        my_servers = self.servers[0:2]
        index = 1
        ret, _, _ = peer_probe(self.servers[0], self.servers[index])
        self.assertEqual(ret, 0, ("peer probe from %s to %s is failed",
                                  self.servers[0], self.servers[index]))
        g.log.info("peer probe is success from %s to "
                   "%s", self.servers[0], self.servers[index])
        key = self.servers[index]
        my_server_info[key] = self.all_servers_info[key]

        self.volname = "testvol"
        bricks_list = form_bricks_list(self.mnode, self.volname, 2,
                                       my_servers,
                                       my_server_info)
        g.log.info("Creating a volume %s ", self.volname)
        ret = volume_create(self.mnode, self.volname,
                            bricks_list, force=False)
        self.assertEqual(ret[0], 0, ("Unable"
                                     "to create volume %s" % self.volname))
        g.log.info("Volume created successfully %s", self.volname)

        ret, _, _ = volume_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start the "
                                  "volume %s", self.volname))
        g.log.info("Get all the bricks of the volume")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")

        g.log.info("Successfully got the list of bricks of volume")

        vol_status = get_volume_status(self.mnode, self.volname)
        self.assertIsNotNone(vol_status, "Failed to get volume "
                             "status for %s" % self.volname)
        totport = 0
        for _, value in vol_status.items():
            for _, val in value.items():
                for _, value1 in val.items():
                    if int(value1["port"]) > 0:
                        totport += 1

        self.assertEqual(totport, 2, ("Volume %s is not started successfully"
                                      "because no. of brick port is not equal"
                                      " to 2", self.volname))

        ret = stop_glusterd(self.servers[1])
        self.assertTrue(ret, "Failed to stop glusterd on one of the node")
        ret = wait_for_glusterd_to_start(self.servers[1])
        self.assertFalse(ret, "glusterd is still running on %s"
                         % self.servers[1])
        g.log.info("Glusterd stop on the nodes : %s "
                   "succeeded", self.servers[1])

        option = {'performance.readdir-ahead': 'on'}
        ret = set_volume_options(self.servers[0], self.volname, option)
        self.assertTrue(ret, "gluster volume set %s performance.readdir-ahead"
                             "on is failed on server %s"
                        % (self.volname, self.servers[0]))
        g.log.info("gluster volume set %s performance.readdir-ahead on"
                   "successfully on :%s", self.volname, self.servers[0])

        ret = start_glusterd(self.servers[1])
        self.assertTrue(ret, "Failed to start glusterd on one of the nodes")
        ret = wait_for_glusterd_to_start(self.servers[1])
        self.assertTrue(ret, "glusterd is not running on %s"
                        % self.servers[1])
        g.log.info("Glusterd start on the node %s succeeded",
                   self.servers[1])

        ret = wait_for_peers_to_connect(self.servers[0], self.servers[1])
        self.assertTrue(ret, "glusterd is not connected %s with peer %s"
                        % (self.servers[0], self.servers[1]))

        vol_status = get_volume_status(self.mnode, self.volname)
        self.assertIsNotNone(vol_status, "Failed to get volume "
                             "status for %s" % self.volname)
        totport = 0
        for _, value in vol_status.items():
            for _, val in value.items():
                for _, value1 in val.items():
                    if int(value1["port"]) > 0:
                        totport += 1

        self.assertEqual(totport, 2, ("Volume %s is not started successfully"
                                      "because no. of brick port is not equal"
                                      " to 2", self.volname))
Example No. 24
    def test_volume_create(self):

        # create and start a volume
        self.volume['name'] = "first_volume"
        self.volname = "first_volume"
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertTrue(ret, "Failed to create and start volume")

        # bring a brick down and volume start force should bring it to online

        g.log.info("Get all the bricks of the volume")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")
        g.log.info("Successfully got the list of bricks of volume")

        ret = bring_bricks_offline(self.volname, bricks_list[0:2])
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Successfully brought the bricks down")

        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Failed to start the volume")
        g.log.info("Volume start with force is success")

        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to bring the bricks online")
        g.log.info("Volume start with force successfully brought all the "
                   "bricks online")

        # create volume with previously used bricks and different volume name
        self.volname = "second_volume"
        ret, _, _ = volume_create(self.mnode, self.volname, bricks_list)
        self.assertNotEqual(
            ret, 0, "Expected: It should fail to create a "
            "volume with previously used bricks. Actual: "
            "Successfully created the volume with previously "
            "used bricks")
        g.log.info("Failed to create the volume with previously used bricks")

        # create a volume with already existing volume name
        self.volume['name'] = "first_volume"
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertTrue(
            ret, "Expected: It should fail to create a volume"
            " with already existing volume name. Actual: "
            "Successfully created the volume with "
            "already existing volname")
        g.log.info("Failed to create the volume with already existing volname")

        # creating a volume with non existing brick path should fail

        self.volname = "second_volume"
        bricks_list = form_bricks_list(self.mnode, self.volname,
                                       len(self.servers), self.servers,
                                       self.all_servers_info)
        nonexisting_brick_index = random.randint(0, len(bricks_list) - 1)
        non_existing_brick = bricks_list[nonexisting_brick_index].split(":")[0]
        non_existing_path = ":/brick/non_existing_path"
        non_existing_brick = non_existing_brick + non_existing_path
        bricks_list[nonexisting_brick_index] = non_existing_brick

        ret, _, _ = volume_create(self.mnode, self.volname, bricks_list)
        self.assertNotEqual(
            ret, 0, "Expected: Creating a volume with non "
            "existing brick path should fail. Actual: "
            "Successfully created the volume with "
            "non existing brick path")
        g.log.info("Failed to create the volume with non existing brick path")

        # cleanup the volume and peer detach all servers. form two clusters,try
        # to create a volume with bricks whose nodes are in different clusters

        # cleanup volumes
        vol_list = get_volume_list(self.mnode)
        self.assertIsNotNone(vol_list, "Failed to get the volume list")

        for volume in vol_list:
            ret = cleanup_volume(self.mnode, volume)
            self.assertTrue(ret, "Unable to delete volume % s" % volume)

        # peer detach all servers
        ret = peer_detach_servers(self.mnode, self.servers)
        self.assertTrue(ret, "Peer detach to all servers is failed")
        g.log.info("Peer detach to all the servers is success")

        # form cluster 1
        ret, _, _ = peer_probe(self.servers[0], self.servers[1])
        self.assertEqual(
            ret, 0, "Peer probe from %s to %s is failed" %
            (self.servers[0], self.servers[1]))
        g.log.info("Peer probe is success from %s to %s" %
                   (self.servers[0], self.servers[1]))

        # form cluster 2
        ret, _, _ = peer_probe(self.servers[2], self.servers[3])
        self.assertEqual(
            ret, 0, "Peer probe from %s to %s is failed" %
            (self.servers[2], self.servers[3]))
        g.log.info("Peer probe is success from %s to %s" %
                   (self.servers[2], self.servers[3]))

        # Creating a volume with bricks which are part of another
        # cluster should fail
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertFalse(
            ret, "Expected: Creating a volume with bricks"
            " which are part of another cluster should fail."
            " Actual: Successfully created the volume with "
            "bricks which are part of another cluster")
        g.log.info("Failed to create the volume with bricks which are "
                   "part of another cluster")

        # form a cluster, bring a node down. try to create a volume when one of
        # the brick node is down
        ret, _, _ = peer_detach(self.servers[2], self.servers[3])
        self.assertEqual(ret, 0, "Peer detach is failed")
        g.log.info("Peer detach is success")

        ret = peer_probe_servers(self.mnode, self.servers)
        self.assertTrue(ret, "Peer probe is failed")
        g.log.info("Peer probe to all the servers is success")

        random_server = self.servers[random.randint(1, len(self.servers) - 1)]
        ret = stop_glusterd(random_server)
        self.assertTrue(ret, "Glusterd is stopped successfully")

        self.volume['name'] = "third_volume"
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertFalse(
            ret, "Expected: It should fail to create a volume "
            "when one of the nodes is down. Actual: Successfully "
            "created the volume with a brick whose node is down")

        g.log.info("Failed to create the volume with brick whose node is down")

    def tearDown(self):

        # Stop glusterd
        ret = stop_glusterd(self.mnode)
        if not ret:
            raise ExecutionError("Failed to stop glusterd on %s" % self.mnode)
        g.log.info("Successfully stopped glusterd.")

        # Reverting log level in /usr/lib/systemd/system/glusterd.service
        # to INFO
        glusterd_file = "/usr/lib/systemd/system/glusterd.service"
        ret = find_and_replace_in_file(self.mnode, 'LOG_LEVEL=DEBUG',
                                       'LOG_LEVEL=INFO', glusterd_file)
        if not ret:
            raise ExecutionError("Failed to revert the log level changes "
                                 "in the glusterd unit file")
        g.log.info("Changes to glusterd.services reverted.")

        # Archiving the glusterd log file of test case.
        ret = move_file(self.mnode, '/var/log/glusterfs/glusterd.log',
                        '/var/log/glusterfs/EnableDebugMode-glusterd.log')
        if not ret:
            raise ExecutionError("Archiving present log file failed.")
        g.log.info("Archiving present log file successful.")

        # Reverting back to old glusterd log file.
        ret = move_file(self.mnode, '/var/log/glusterfs/old.log',
                        '/var/log/glusterfs/glusterd.log')
        if not ret:
            raise ExecutionError("Reverting glusterd log failed.")
        g.log.info("Reverting of glusterd log successful.")

        # Daemon should be reloaded as unit file is changed
        ret = daemon_reload(self.mnode)
        if not ret:
            raise ExecutionError("Unable to reload the daemon")
        g.log.info("Daemon reloaded successfully")

        # Restart glusterd
        ret = start_glusterd(self.mnode)
        if not ret:
            raise ExecutionError("Failed to start glusterd on %s" % self.mnode)
        g.log.info("Successfully restarted glusterd.")

        # Checking if glusterd is running on all nodes or not.
        count = 0
        while count < 60:
            ret = is_glusterd_running(self.mnode)
            if not ret:
                break
            sleep(2)
            count += 1
        if ret:
            raise ExecutionError("glusterd is not running on %s" % self.mnode)
        g.log.info("glusterd running with log level INFO.")

        # Checking if peers are in connected state or not.
        count = 0
        while count < 60:
            ret = self.validate_peers_are_connected()
            if ret:
                break
            sleep(3)
            count += 1
        if not ret:
            raise ExecutionError("Peers are not in connected state.")
        g.log.info("Peers are in connected state.")

        self.get_super_method(self, 'tearDown')()
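
The find_and_replace_in_file call in this tearDown presumably rewrites the unit file in place; an equivalent ad-hoc edit from the test, assuming GNU sed is available on the node, might look like the following (a sketch, not the library's implementation):

# Hypothetical stand-in for find_and_replace_in_file, assuming GNU sed
cmd = ("sed -i 's/LOG_LEVEL=DEBUG/LOG_LEVEL=INFO/' "
       "/usr/lib/systemd/system/glusterd.service")
ret, _, _ = g.run(self.mnode, cmd)
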
Example No. 26
    def test_volume_set_when_glusterd_stopped_on_one_node(self):
        """
        Test Case:
        1) Setup and mount a volume on client.
        2) Stop glusterd on a random server.
        3) Start IO on mount points
        4) Set an option on the volume
        5) Start glusterd on the stopped node.
        6) Verify all the bricks are online after starting glusterd.
        7) Check if the volume info is synced across the cluster.
        """
        # Fetching the bricks list and storing it for later use
        list1 = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(
            list1, "Failed to get the list of online bricks "
            "for volume: %s" % self.volname)

        # Fetching a random server from list.
        self.random_server = choice(self.servers[1:])

        # Stopping glusterd on one node.
        ret = stop_glusterd(self.random_server)
        self.assertTrue(ret, "Failed to stop glusterd on one node.")
        g.log.info("Successfully stopped glusterd on one node.")

        self.glusterd_is_stopped = True

        # Start IO on mount points.
        self.all_mounts_procs = []
        counter = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dir-depth 4 "
                   "--dir-length 6 "
                   "--dirname-start-num %d "
                   "--max-num-of-dirs 3 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, counter, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            counter += 1

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        g.log.info("IO validation complete.")

        # Set an option on the volume: stat-prefetch on
        self.options = {"stat-prefetch": "on"}
        ret = set_volume_options(self.mnode, self.volname, self.options)
        self.assertTrue(ret, ("Failed to set option stat-prefetch to on "
                              "for the volume %s" % self.volname))
        g.log.info(
            "Succeeded in setting stat-prefetch option to on "
            "for the volume %s", self.volname)

        # start glusterd on the node where glusterd is stopped
        ret = start_glusterd(self.random_server)
        self.assertTrue(ret,
                        "Failed to start glusterd on %s" % self.random_server)
        g.log.info("Successfully started glusterd on node: %s",
                   self.random_server)

        # Waiting for glusterd to start completely
        ret = wait_for_glusterd_to_start(self.random_server)
        self.assertTrue(ret,
                        "glusterd is not running on %s" % self.random_server)
        g.log.info("glusterd is started and running on %s", self.random_server)

        self.glusterd_is_stopped = False

        # Confirm if all the bricks are online or not
        count = 0
        while count < 10:
            list2 = get_online_bricks_list(self.mnode, self.volname)
            if list1 == list2:
                break
            sleep(2)
            count += 1

        self.assertListEqual(
            list1, list2, "Unexpected: All the bricks in the "
            "volume are not online")
        g.log.info("All the bricks in the volume are back online")

        # volume info should be synced across the cluster
        out1 = get_volume_info(self.mnode, self.volname)
        self.assertIsNotNone(
            out1, "Failed to get the volume info from %s" % self.mnode)
        g.log.info("Getting volume info from %s is success", self.mnode)

        count = 0
        while count < 60:
            out2 = get_volume_info(self.random_server, self.volname)
            self.assertIsNotNone(
                out2,
                "Failed to get the volume info from %s" % self.random_server)
            if out1 == out2:
                break
            sleep(2)
            count += 1

        self.assertDictEqual(
            out1, out2, "Volume info is not synced in the "
            "restarted node")
        g.log.info("Volume info is successfully synced across the cluster")

    def test_add_brick_when_quorum_not_met(self):

        # pylint: disable=too-many-statements
        # create and start a volume
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertTrue(ret, ("Failed to create "
                              "and start volume %s" % self.volname))
        g.log.info("Volume is created and started successfully")

        # set cluster.server-quorum-type as server
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.server-quorum-type': 'server'})
        self.assertTrue(ret, ("Failed to set the quorum type as a server"
                              " on volume %s", self.volname))
        g.log.info("Able to set server quorum successfully on volume %s",
                   self.volname)

        # Setting quorum ratio to 95%
        ret = set_volume_options(self.mnode, 'all',
                                 {'cluster.server-quorum-ratio': '95%'})
        self.assertTrue(
            ret, "Failed to set server quorum ratio on %s" % self.volname)
        g.log.info("Able to set server quorum ratio successfully on %s",
                   self.servers)

        # bring down glusterd of half nodes
        num_of_servers = len(self.servers)
        # Floor division: "/" would yield a float and break range() below
        num_of_nodes_to_bring_down = num_of_servers // 2

        for node in range(num_of_nodes_to_bring_down, num_of_servers):
            ret = stop_glusterd(self.servers[node])
            self.assertTrue(
                ret, ("Failed to stop glusterd on %s" % self.servers[node]))
            g.log.info("Glusterd stopped successfully on server %s",
                       self.servers[node])

        for node in range(num_of_nodes_to_bring_down, num_of_servers):
            count = 0
            while count < 80:
                ret = is_glusterd_running(self.servers[node])
                if ret:
                    break
                sleep(2)
                count += 1
            self.assertNotEqual(
                ret, 0, "glusterd is still running on %s" % self.servers[node])

        # Verify the node count in volume status after glusterd is stopped
        # on half of the servers; the brick status is not reflected in
        # volume status immediately after glusterd stops
        count = 0
        while count < 100:
            vol_status = get_volume_status(self.mnode, self.volname)
            servers_count = len(vol_status[self.volname])
            if servers_count == (num_of_servers - num_of_nodes_to_bring_down):
                break
            sleep(2)
            count += 1

        # confirm that quorum is not met, brick process should be down
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")
        bricks_to_check = bricks_list[0:num_of_nodes_to_bring_down]
        ret = are_bricks_offline(self.mnode, self.volname, bricks_to_check)
        self.assertTrue(
            ret, "Unexpected: Server quorum is not met, "
            "Bricks are up")
        g.log.info("Server quorum is not met, bricks are down as expected")

        # try add brick operation, which should fail
        num_bricks_to_add = 1
        brick = form_bricks_list(self.mnode, self.volname, num_bricks_to_add,
                                 self.servers, self.all_servers_info)
        ret, _, _ = add_brick(self.mnode, self.volname, brick)
        self.assertNotEqual(ret, 0, ("Unexpected: add brick is success, "
                                     "when quorum is not met"))
        g.log.info("Add brick is failed as expected, when quorum is not met")

        # confirm that the newly added brick is not part of the volume;
        # form_bricks_list returns a list, so check its single element
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")
        self.assertNotIn(brick[0], bricks_list,
                         "Unexpected: add brick is success, "
                         "when quorum is not met")
        g.log.info("Newly added brick is not part of the volume, as expected "
                   "when quorum is not met")

        # set cluster.server-quorum-type as none
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.server-quorum-type': 'none'})
        self.assertTrue(ret, ("Failed to set the quorum type as a server"
                              " on volume %s", self.volname))
        g.log.info("Able to set server quorum successfully on volume %s",
                   self.volname)
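
Reusing the peers_needed_for_quorum sketch from earlier, and assuming the usual six-node pool these tests run against: ceil(0.95 * 6) = 6, so with the 95% ratio every glusterd must be up, which is why stopping half of them leaves quorum unmet and add-brick rejected.

assert peers_needed_for_quorum(6, 95) == 6  # 3 of 6 down -> quorum lost
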
Example No. 28
    def test_rebalance_quorum(self):
        '''
        -> Create volume
        -> Stop the volume
        -> Enable server quorum
        -> Start the volume
        -> Set server quorum ratio to 95%
        -> Stop glusterd on any one of the nodes
        -> Perform rebalance operation
        -> Check gluster volume status
        -> Start glusterd
        '''
        # Stop the Volume
        ret, _, _ = volume_stop(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to stop the volume %s" % self.volname)
        g.log.info("Volume stopped successfully %s", self.volname)

        # Enabling server quorum
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.server-quorum-type': 'server'})
        self.assertTrue(ret, "Failed to set quorum type for volume %s"
                        % self.volname)
        g.log.info("Able to set quorum type successfully for %s", self.volname)

        # Start the volume
        ret, _, _ = volume_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start the volume %s"
                         % self.volname)
        g.log.info("Volume started successfully %s", self.volname)

        # Setting Quorum ratio in percentage
        ret = set_volume_options(self.mnode, 'all',
                                 {'cluster.server-quorum-ratio': '95%'})
        self.assertTrue(ret, "Failed to set server quorum ratio on %s"
                        % self.servers)
        g.log.info("Able to set server quorum ratio successfully on %s",
                   self.servers)

        # Stopping glusterd
        self.random_server = random.choice(self.servers[1:])
        ret = stop_glusterd(self.random_server)
        self.assertTrue(ret, "Failed to stop glusterd on %s"
                        % self.random_server)
        g.log.info("Glusterd stopped successfully on %s", self.random_server)

        msg = ("volume rebalance: " + self.volname + ": failed: Quorum not "
                                                     "met. Volume operation "
                                                     "not allowed")

        # Start Rebalance
        ret, _, err = rebalance_start(self.mnode, self.volname)
        self.assertNotEqual(ret, 0, "Unexpected: Rebalance should fail when "
                                    "quorum is in not met condition but "
                                    "Rebalance succeeded %s" % self.volname)
        g.log.info("Expected: Rebalance failed when quorum is in not met "
                   "condition %s", self.volname)

        # Checking Rebalance failed message
        self.assertIn(msg, err, "Error message is not correct for rebalance "
                                "operation when quorum not met")
        g.log.info("Error message is correct for rebalance operation "
                   "when quorum not met")

        # Volume Status
        ret, out, _ = volume_status(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to get volume status for %s"
                         % self.volname)
        g.log.info("Successful in getting volume status for %s", self.volname)

        # Checking volume status message
        self.assertNotIn('rebalance', out, "Unexpected: Found rebalance task "
                                           "in vol status of %s"
                         % self.volname)
        g.log.info("Expected: Not Found rebalance task in vol status of %s",
                   self.volname)

        # Starting glusterd
        ret = start_glusterd(self.random_server)
        self.assertTrue(ret, "Failed to start glusterd on %s"
                        % self.random_server)
        g.log.info("Glusted started successfully on %s", self.random_server)

    def test_profile_operations_with_one_node_down(self):

        # pylint: disable=too-many-statements
        """
        Test Case:
        1) Create a volume and start it.
        2) Mount volume on client and start IO.
        3) Start profile info on the volume.
        4) Stop glusterd on one node.
        5) Run profile info with different parameters
           and see if all bricks are present or not.
        6) Stop profile on the volume.
        """

        # Start IO on mount points.
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        counter = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dir-depth 4 "
                   "--dirname-start-num %d "
                   "--dir-length 6 "
                   "--max-num-of-dirs 3 "
                   "--num-of-files 5 %s" % (
                       self.script_upload_path,
                       counter, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            counter += 1

        # Start profile on volume.
        ret, _, _ = profile_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start profile on volume: %s"
                         % self.volname)
        g.log.info("Successfully started profile on volume: %s",
                   self.volname)

        # Picking a random server index from the list (excluding the mnode).
        self.random_server = randint(1, len(self.servers) - 1)

        # Stopping glusterd on one node.
        ret = stop_glusterd(self.servers[self.random_server])
        self.assertTrue(ret, "Failed to stop glusterd on one node.")
        g.log.info("Successfully stopped glusterd on one node.")
        ret = wait_for_glusterd_to_start(self.servers[self.random_server])
        self.assertFalse(ret, "glusterd is still running on %s"
                         % self.servers[self.random_server])
        g.log.info("Glusterd stop on the nodes : %s "
                   "succeeded", self.servers[self.random_server])

        # Getting and checking output of profile info.
        ret, out, _ = profile_info(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to run profile info on volume: %s"
                         % self.volname)
        g.log.info("Successfully executed profile info on volume: %s",
                   self.volname)

        # Checking if all bricks are present in profile info.
        brick_list = get_online_bricks_list(self.mnode, self.volname)
        for brick in brick_list:
            self.assertIn(brick, out,
                          "Brick %s not a part of profile info output."
                          % brick)
            g.log.info("Brick %s showing in profile info output.",
                       brick)

        # Running profile info with different profile options.
        profile_options = ['peek', 'incremental', 'clear', 'incremental peek',
                           'cumulative']
        for option in profile_options:

            # Getting and checking output of profile info.
            ret, out, _ = profile_info(self.mnode, self.volname,
                                       options=option)
            self.assertEqual(ret, 0,
                             "Failed to run profile info %s on volume: %s"
                             % (option, self.volname))
            g.log.info("Successfully executed profile info %s on volume: %s",
                       option, self.volname)

            # Checking if all bricks are present in this profile info output.
            for brick in brick_list:
                self.assertTrue(brick in out,
                                "Brick %s not a part of profile"
                                " info %s output."
                                % (brick, option))
                g.log.info("Brick %s showing in profile info %s output.",
                           brick, option)

        # Starting glusterd on node where stopped.
        ret = start_glusterd(self.servers[self.random_server])
        self.assertTrue(ret, "Failed to start glusterd.")
        g.log.info("Successfully started glusterd.")

        # Checking if peer is connected
        ret = wait_for_peers_to_connect(self.mnode, self.servers)
        self.assertTrue(ret, "Peers are not in connected state.")
        g.log.info("Peers are in connected state.")

        # Stop profile on volume.
        ret, _, _ = profile_stop(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to stop profile on volume: %s"
                         % self.volname)
        g.log.info("Successfully stopped profile on volume: %s", self.volname)

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        g.log.info("IO validation complete.")
Example No. 30
    def test_snap_del_gd_down(self):
        """
        Steps:
        1. Create volumes
        2. Create 5 snapshots
        3. Bring one node down
        4. Delete one snapshot
        5. list snapshot and validate delete
        6. Bring up the downed node
        7. Validate number of snaps after handshake on the
           brought down node.
        """
        # Create 5 snapshots
        g.log.info("Creating 5 snapshots for volume %s", self.volname)
        for i in range(5):
            ret, _, _ = snap_create(self.mnode, self.volname, "snapy%s" % i)
            self.assertEqual(
                ret, 0, ("Failed to create snapshot for %s" % self.volname))
            g.log.info("Snapshot %s created successfully for volume %s",
                       "snapy%s" % i, self.volname)

        # Check the number of snaps using snap_list; it should be 5 now
        snap_list = get_snap_list(self.mnode)
        self.assertEqual(
            5, len(snap_list), "Number of snapshots not consistent "
            "for volume %s" % self.volname)
        g.log.info("Successfully validated number of snaps.")

        # Stopping glusterd service on server[1]
        ret = stop_glusterd(self.servers[1])
        self.assertTrue(
            ret,
            "Failed to stop glusterd service on node : %s" % self.servers[1])
        g.log.info("Stopped glusterd services successfully on: %s",
                   self.servers[1])

        # Delete one snapshot snapy1
        ret, _, _ = snap_delete(self.servers[0], "snapy1")
        self.assertEqual(ret, 0, "Failed to delete snapshot snapy1")
        g.log.info("Successfully deleted snapshot of snapy1")

        # Check the number of snaps using snap_list; it should be 4 now
        snap_list = get_snap_list(self.mnode)
        self.assertEqual(
            4, len(snap_list), "Number of snapshots not consistent "
            "for volume %s" % self.volname)
        g.log.info("Successfully validated number of snaps.")

        # Starting glusterd services on server[1]
        ret = start_glusterd(self.servers[1])
        self.assertTrue(
            ret, "Failed to start glusterd on node "
            ": %s" % self.servers[1])
        g.log.info("Started glusterd services successfully on: %s",
                   self.servers[1])

        # Check the number of snaps on server[1]; once the glusterd
        # handshake completes it should sync to 4, so poll for up to
        # 60 seconds.
        count = 0
        while count < 60:
            snap_list = get_snap_list(self.servers[1])
            if len(snap_list) == 4:
                break
            time.sleep(2)
            count += 2
        self.assertEqual(
            4, len(snap_list), "Number of snapshots not consistent "
            "for volume %s" % self.volname)
        g.log.info("Successfully validated number of snaps.")