def test_replicated_to_arbiter_volume(self):
        """
        Description:
        Reduce the replica count from replica 3 to arbiter
        """
        # pylint: disable=too-many-statements
        # Remove brick to reduce the replica count from replica 3
        g.log.info("Removing bricks to form replica 2 volume")
        ret = shrink_volume(self.mnode, self.volname, replica_num=0)
        self.assertTrue(ret,
                        "Failed to remove brick on volume %s" % self.volname)
        g.log.info("Successfully removed brick on volume %s", self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s process not online despite waiting "
            "for 300 seconds" % self.volname)
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verifying all bricks online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)

        # Adding bricks to convert the volume to an arbiter volume
        g.log.info("Adding bricks to convert to Arbiter Volume")
        replica_arbiter = {'replica_count': 1, 'arbiter_count': 1}
        ret = expand_volume(self.mnode,
                            self.volname,
                            self.servers,
                            self.all_servers_info,
                            add_to_hot_tier=False,
                            **replica_arbiter)
        self.assertTrue(ret, "Failed to expand the volume  %s" % self.volname)
        g.log.info("Changing volume to arbiter volume is successful %s",
                   self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Failed to wait for volume %s processes "
            "to be online" % self.volname)
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)
Example #2
    def _validate_brick_down_scenario(self,
                                      validate_heal=False,
                                      monitor_heal=False):
        """
        Common steps shared across volume types for validating the brick
        down scenario
        """
        if validate_heal:
            # Wait for ample amount of IO to be written to file
            sleep(180)

            # Validate heal info shows output and exits in under 8 seconds
            self._validate_heal()

        # Force start volume and verify all process are online
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, 'Unable to force start volume')

        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, 'Not able to confirm all process of volume are online')

        if monitor_heal:
            # Wait for IO to be written to file
            sleep(30)

            # Monitor heal and validate data was appended successfully to file
            ret = monitor_heal_completion(self.mnode, self.volname)
            self.assertTrue(ret,
                            'Self heal is not completed post brick online')
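
A plausible caller of this helper, sketched with hedged assumptions (the brick-kill step uses helpers that appear elsewhere on this page; this is not the original caller):

        # Hypothetical caller inside the same test class:
        brick = choice(get_all_bricks(self.mnode, self.volname))
        self.assertTrue(bring_bricks_offline(self.volname, [brick]),
                        'Failed to bring brick %s offline' % brick)
        self._validate_brick_down_scenario(validate_heal=True,
                                           monitor_heal=True)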
Example #3
    def _bring_bricks_online_heal(self, mnode, volname, bricks_list):
        """
        Bring bricks online and monitor heal completion
        """
        # Bring bricks online
        ret = bring_bricks_online(
            mnode,
            volname,
            bricks_list,
            bring_bricks_online_methods=['volume_start_force'])
        self.assertTrue(ret, 'Failed to bring bricks online')

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(mnode, volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(volname)))

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(mnode, volname)
        self.assertTrue(
            ret, ("Volume {} : All process are not online".format(volname)))
        g.log.info("Volume %s : All process are online", volname)

        # Monitor heal completion
        ret = monitor_heal_completion(mnode, volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(mnode, volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
Example #4
    def test_offline_brick_status_when_quorum_not_met(self):
        """
        Test Brick status when Quorum is not met after glusterd restart.
        1. Create a volume and mount it.
        2. Set the quorum type to 'server'.
        3. Bring some nodes down such that quorum won't be met.
        4. Brick status should be offline in the node which is up.
        5. Restart glusterd in this node.
        6. The brick status still should be offline as quorum isn't met.
        """
        # Set the quorum type to server and validate it.
        vol_option = {'cluster.server-quorum-type': 'server'}
        ret = set_volume_options(self.mnode, self.volname, vol_option)
        self.assertTrue(ret, "gluster volume option set of %s to %s failed"
                        % ('cluster.server-quorum-type', 'server'))
        g.log.info("Cluster quorum set to type server.")

        # Get the brick list.
        brick_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(brick_list, "Failed to get the brick list")

        # Stop glusterd processes.
        ret = stop_glusterd(self.servers[1:])
        self.assertTrue(ret, "Failed to stop glusterd on specified nodes.")
        g.log.info("Glusterd processes stopped in the desired servers.")

        # Get the brick status in a node where glusterd is up.
        ret = are_bricks_offline(self.mnode, self.volname, brick_list[0:1])
        self.assertTrue(ret, "Bricks are online")
        g.log.info("Bricks are offline as expected.")

        # Restart one of the node which is up.
        ret = restart_glusterd(self.servers[0])
        self.assertTrue(ret, ("Failed to restart glusterd on desired node."))
        g.log.info("Glusterd restarted on the desired node.")

        # Wait for glusterd to be online and validate it's running.
        self.assertTrue(wait_for_glusterd_to_start(self.servers[0]),
                        "Glusterd not up on the desired server.")
        g.log.info("Glusterd is up in the desired server.")

        # Get the brick status from the restarted node.
        ret = are_bricks_offline(self.mnode, self.volname, brick_list[0:1])
        self.assertTrue(ret, "Bricks are online")
        g.log.info("Bricks are offline as expected.")

        # Start glusterd on all servers.
        ret = start_glusterd(self.servers)
        self.assertTrue(ret, "Failed to start glusterd on the specified nodes")
        g.log.info("Initiated start of glusterd on all nodes.")

        # Wait for glusterd to start.
        ret = wait_for_glusterd_to_start(self.servers)
        self.assertTrue(ret, "Glusterd not up on all nodes.")
        g.log.info("Glusterd is up and running on all nodes.")

        # Wait for all volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   timeout=600)
        self.assertTrue(ret, ("All volume processes not up."))
        g.log.info("All volume processes are up.")
Example #5
    def setup_samba_ctdb_cluster(cls):
        """
        Create a ctdb-samba cluster if one doesn't exist

        Returns:
            bool: True if successfully setup samba else false
        """
        # Check if ctdb setup is up and running
        if is_ctdb_status_healthy(cls.primary_node):
            g.log.info("ctdb setup already up skipping " "ctdb setup creation")
            return True
        g.log.info("Proceeding with ctdb setup creation")
        for mnode in cls.servers:
            ret = edit_hook_script(mnode, cls.ctdb_volname)
            if not ret:
                return False
            ret = enable_ctdb_cluster(mnode)
            if not ret:
                return False
            ret = create_nodes_file(mnode, cls.ctdb_nodes)
            if not ret:
                return False
            ret = create_public_address_file(mnode, cls.ctdb_vips)
            if not ret:
                return False
        server_info = cls.all_servers_info
        ctdb_config = cls.ctdb_volume_config
        g.log.info("Setting up ctdb volume %s", cls.ctdb_volname)
        ret = setup_volume(mnode=cls.primary_node,
                           all_servers_info=server_info,
                           volume_config=ctdb_config)
        if not ret:
            g.log.error("Failed to setup ctdb volume %s", cls.ctdb_volname)
            return False
        g.log.info("Successful in setting up volume %s", cls.ctdb_volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume %s processes to be online",
                   cls.ctdb_volname)
        ret = wait_for_volume_process_to_be_online(cls.mnode, cls.ctdb_volname)
        if not ret:
            g.log.error(
                "Failed to wait for volume %s processes to "
                "be online", cls.ctdb_volname)
            return False
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", cls.ctdb_volname)

        # start ctdb services
        ret = start_ctdb_service(cls.servers)
        if not ret:
            return False

        ret = is_ctdb_status_healthy(cls.primary_node)
        if not ret:
            g.log.error("CTDB setup creation failed - exiting")
            return False
        g.log.info("CTDB setup creation successfull")
        return True
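
Because the helper returns a bool instead of asserting, callers typically raise from their class setup; a hedged sketch in the same idiom as the setUpClass methods elsewhere on this page:

    @classmethod
    def setUpClass(cls):
        # Calling GlusterBaseClass setUpClass
        cls.get_super_method(cls, 'setUpClass')()
        if not cls.setup_samba_ctdb_cluster():
            raise ExecutionError("Failed to set up the CTDB-Samba cluster")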
Example #6
    def test_nfs_ganesha_remove_brick(self):
        """
        Verify remove brick operation while IO is running
        Steps:
        1. Start IO on mount points
        2. Perform remove brick operation
        3. Validate IOs
        """
        # pylint: disable=too-many-statements
        # Start IO on all mount points
        all_mounts_procs, count = [], 1
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" % (self.script_upload_path, count,
                                            mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count += 10

        # Get stat of all the files/dirs created.
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")

        # Perform remove brick operation
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, ("Remove brick operation failed on "
                              "%s", self.volname))
        g.log.info("Remove brick operation is successful on "
                   "volume %s", self.volname)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("All volume %s processes failed to come up "
                              "online", self.volname))
        g.log.info("All volume %s processes came up "
                   "online successfully after remove brick operation",
                   self.volname)

        # Log volume info and status after performing remove brick
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Get stat of all the files/dirs created.
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")
Example #7
    def test_shrinking_volume_when_io_in_progress(self):
        """Test shrinking volume (Decrease distribute count) using existing
        servers bricks when IO is in progress.

        Description:
            - remove brick (start, status, commit)
            - validate IO
        """
        # Log Volume Info and Status before shrinking the volume.
        g.log.info("Logging volume info and Status before shrinking volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Shrinking volume by removing bricks from volume when IO in progress
        g.log.info("Start removing bricks from volume when IO in progress")
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to shrink the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info("Shrinking volume when IO in progress is successful on "
                   "volume %s", self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Log Volume Info and Status after shrinking the volume
        g.log.info("Logging volume info and Status after shrinking volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online after "
                   "shrinking volume")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online",
                              self.volname))
        g.log.info("Volume %s : All process are online after shrinking volume",
                   self.volname)

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
Example #8
    def test_replace_brick_when_io_in_progress(self):
        """Test replacing brick using existing servers bricks when IO is
            in progress.

        Description:
            - replace_brick
            - wait for heal to complete
            - validate IO
        """
        # Log Volume Info and Status before replacing brick from the volume.
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Replace brick from a sub-volume
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Failed to replace faulty brick from the volume")
        g.log.info("Successfully replaced faulty brick from the volume")

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))

        # Log Volume Info and Status after replacing the brick
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))

        # Wait for self-heal to complete
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=1800)
        self.assertTrue(
            ret, "Self heal didn't complete even after waiting "
            "for 30 minutes. 30 minutes is too much a time for "
            "current test workload")

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")

        # List all files and dirs created
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
Example #9
    def test_volume_create_start_stop_start(self):
        """Tests volume create, start, status, stop, start.
        Also Validates whether all the brick process are running after the
        start of the volume.
        """
        # Verify volume processes are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online" %
                              self.volname))
        g.log.info("Successfully Verified volume %s processes are online",
                   self.volname)

        # Stop Volume
        ret, _, _ = volume_stop(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Failed to stop volume %s" % self.volname)
        g.log.info("Successfully stopped volume %s", self.volname)

        # Start Volume
        ret, _, _ = volume_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start volume %s" % self.volname)
        g.log.info("Successfully started volume %s", self.volname)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))

        # Log Volume Info and Status
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to Log volume %s info and status",
                              self.volname))
        g.log.info("Successfully logged Volume %s Info and Status",
                   self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online" %
                              self.volname))
        g.log.info("Successfully verified volume %s processes are online",
                   self.volname)

        # Log Volume Info and Status
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to Log volume %s info and status",
                              self.volname))
        g.log.info("Successfully logged Volume %s Info and Status",
                   self.volname)

        # Check if glusterd is running on all servers (expected: active)
        ret = is_glusterd_running(self.servers)
        self.assertEqual(ret, 0, "Glusterd is not running on all servers")
        g.log.info("Glusterd is running on all the servers")
Example #10
    def setUpClass(cls):
        """
        setup volume and initialize necessary variables
        which is used in tests
        """
        # calling GlusterBaseClass setUpClass
        cls.get_super_method(cls, 'setUpClass')()

        list_of_vol = [
            'distributed-dispersed', 'replicated', 'dispersed', 'distributed',
            'distributed-replicated'
        ]
        cls.volume_configs = []
        if cls.default_volume_type_config['distributed']['dist_count'] > 3:
            cls.default_volume_type_config['distributed']['dist_count'] = 3

        for volume_type in list_of_vol:
            cls.volume_configs.append({
                'name':
                'testvol_%s' % (volume_type),
                'servers':
                cls.servers,
                'voltype':
                cls.default_volume_type_config[volume_type]
            })
        for volume_config in cls.volume_configs:
            ret = setup_volume(mnode=cls.mnode,
                               all_servers_info=cls.all_servers_info,
                               volume_config=volume_config,
                               multi_vol=True)
            volname = volume_config['name']
            if not ret:
                raise ExecutionError("Failed to setup Volume" " %s" % volname)
            g.log.info("Successful in setting volume %s", volname)

            # Verify volume's all process are online for 60 sec
            g.log.info("Verifying volume's all process are online")
            ret = wait_for_volume_process_to_be_online(cls.mnode, volname, 60)
            if not ret:
                raise ExecutionError("Volume %s : All process are not online" %
                                     volname)
            g.log.info("Successfully Verified volume %s processes are online",
                       volname)

        # Verify the glustershd process detaches from its parent process
        g.log.info("Verifying Self Heal Daemon process is daemonized")
        ret = is_shd_daemonized(cls.servers)
        if not ret:
            raise ExecutionError("Self Heal Daemon process was still"
                                 " holding parent process.")
        g.log.info("Self Heal Daemon processes are online")
Example #11
    def setUp(self):
        """
        setup volume and initialize necessary variables
        which is used in tests
        """
        # Calling GlusterBaseClass setUp
        self.get_super_method(self, 'setUp')()

        # Setup Volume for all the volume types
        self.volume_configs = []
        for volume_type in self.default_volume_type_config:
            self.volume_configs.append({
                'name':
                'testvol_%s' % volume_type,
                'servers':
                self.servers,
                'voltype':
                self.default_volume_type_config[volume_type]
            })

        for volume_config in self.volume_configs[1:]:
            ret = setup_volume(mnode=self.mnode,
                               all_servers_info=self.all_servers_info,
                               volume_config=volume_config,
                               multi_vol=True)
            volname = volume_config['name']
            if not ret:
                raise ExecutionError("Failed to setup Volume" " %s" % volname)
            g.log.info("Successful in setting volume %s", volname)

            # Verify volume's all process are online for 60 sec
            ret = wait_for_volume_process_to_be_online(self.mnode, volname, 60)
            if not ret:
                raise ExecutionError("Volume %s : All process are not online" %
                                     volname)
            g.log.info("Successfully Verified volume %s processes are online",
                       volname)

        # Verify the glustershd process detaches from its parent process
        ret = is_shd_daemonized(self.servers)
        if not ret:
            raise ExecutionError("Self Heal Daemon process was still"
                                 " holding parent process.")
        g.log.info("Self Heal Daemon processes are online")

        self.glustershd = "/var/lib/glusterd/glustershd/glustershd-server.vol"
Example #12
    def _enable_xlator(self, xlator, parent, xtype, xsfail=False):
        self.verified_bricks = []
        option = '{0}{1}.{2}'.format(xtype,
                                     '.xlator' if xtype == 'user' else '',
                                     xlator)
        self._set_and_assert_volume_option(option, parent)
        ret, _, _ = volume_stop(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Unable to stop volume')
        sleep(self.timeout)
        ret, _, _ = volume_start(self.mnode, self.volname)
        if xsfail:
            self.assertNotEqual(ret, 0, 'Expected volume start to fail')
            return
        self.assertEqual(ret, 0, 'Unable to start a stopped volume')
        self._verify_position(xlator, parent, xtype)
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, 'Not all volume processes are online after '
            'starting a stopped volume')
        sleep(self.timeout)
Example #13
    def _wait_for_gluster_process_online_state(self):
        """
        Function which waits for the glusterfs processes to come up
        """
        # Wait for glusterd to be online and validate it's running.
        self.assertTrue(wait_for_glusterd_to_start(self.servers),
                        "glusterd not up on the desired nodes.")
        g.log.info("Glusterd is up and running on desired nodes.")

        # Wait for peers to connect
        ret = wait_for_peers_to_connect(self.mnode, self.servers, 50)
        self.assertTrue(ret, "Peers not in connected state.")
        g.log.info("Peers in connected state.")

        # Wait for all volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname,
                                                   timeout=600)
        self.assertTrue(ret, ("All volume processes not up."))
        g.log.info("All volume processes are up.")
Example #14
    def test_rebalance_with_hidden_files(self):
        # pylint: disable=too-many-statements
        # Start IO on mounts
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("python %s create_files "
                   "--base-file-name . "
                   "-f 99 %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        # validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")

        # Verify DHT values across mount points
        for mount_obj in self.mounts:
            g.log.debug("Verifying hash layout values %s:%s",
                        mount_obj.client_system, mount_obj.mountpoint)
            ret = validate_files_in_dir(mount_obj.client_system,
                                        mount_obj.mountpoint,
                                        test_type=FILE_ON_HASHED_BRICKS,
                                        file_type=FILETYPE_FILES)
            self.assertTrue(
                ret, "Expected - Files are created on only "
                "sub-volume according to its hashed value")
            g.log.info("Hash layout values are verified %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)

        # Getting areequal checksum before rebalance
        g.log.info("Getting areequal checksum before rebalance")
        arequal_checksum_before_rebalance = collect_mounts_arequal(self.mounts)

        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Expanding volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume %s", self.volname))
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online ", self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Checking if there are any migration failures
        status = get_rebalance_status(self.mnode, self.volname)
        for each_node in status['node']:
            failed_files_count = int(each_node['failures'])
            self.assertEqual(
                failed_files_count, 0,
                "Rebalance failed to migrate few files on %s" %
                each_node['nodeName'])
            g.log.info("There are no migration failures")

        # Getting areequal checksum after rebalance
        g.log.info("Getting areequal checksum after rebalance")
        arequal_checksum_after_rebalance = collect_mounts_arequal(self.mounts)

        # Comparing arequals checksum before and after rebalance
        g.log.info("Comparing arequals checksum before and after rebalance")
        self.assertEqual(arequal_checksum_before_rebalance,
                         arequal_checksum_after_rebalance,
                         "arequal checksum is NOT MATCHNG")
        g.log.info("arequal checksum is SAME")
Example #15
    def test_self_heal_differing_in_file_type(self):
        """
        testing self heal of files with different file types
        with default configuration

        Description:
        - create IO
        - calculate arequal
        - bring down all bricks processes from selected set
        - calculate arequal and compare with arequal before
        getting bricks offline
        - modify the data
        - arequal before getting bricks online
        - bring bricks online
        - check daemons and healing completion
        - start healing
        - calculate arequal and compare with arequal before bringing bricks
        online and after bringing bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Creating files on client side
        all_mounts_procs = []
        test_file_type_differs_self_heal_folder = \
            'test_file_type_differs_self_heal'
        g.log.info("Generating data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)

        # Creating files
        command = ("cd %s/ ; "
                   "mkdir %s ;"
                   "cd %s/ ;"
                   "for i in `seq 1 10` ; "
                   "do mkdir l1_dir.$i ; "
                   "for j in `seq 1 5` ; "
                   "do mkdir l1_dir.$i/l2_dir.$j ; "
                   "for k in `seq 1 10` ; "
                   "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
                   "bs=1k count=$k ; "
                   "done ; "
                   "done ; "
                   "done ; "
                   % (self.mounts[0].mountpoint,
                      test_file_type_differs_self_heal_folder,
                      test_file_type_differs_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # wait for io to complete
        self.assertTrue(
            wait_for_io_to_complete(all_mounts_procs, self.mounts),
            "Io failed to complete on some of the clients")

        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks offline
        self.assertEqual(sorted(result_before_offline),
                         sorted(result_after_offline),
                         'Checksums before and after bringing bricks'
                         ' offline are not equal')
        g.log.info('Checksums before and after '
                   'bringing bricks offline are equal')

        # Modify the data
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)
        command = ("cd %s/%s/ ; "
                   "for i in `seq 1 10` ; "
                   "do for j in `seq 1 5` ; "
                   "do for k in `seq 1 10` ; "
                   "do rm -f l1_dir.$i/l2_dir.$j/test.$k ; "
                   "mkdir l1_dir.$i/l2_dir.$j/test.$k ; "
                   "done ; "
                   "done ; "
                   "done ;"
                   % (self.mounts[0].mountpoint,
                      test_file_type_differs_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertEqual(sorted(result_before_online),
                         sorted(result_after_online),
                         'Checksums before and after bringing bricks'
                         ' online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')
Example #16
    def test_restart_glusterd_after_rebalance(self):

        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s" % self.volname))
        g.log.info("Successful in logging volume info and status of "
                   "volume %s", self.volname)

        # Expanding volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Volume %s: Expand failed", self.volname))
        g.log.info("Volume %s: Expand success", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   timeout=600)
        self.assertTrue(ret, ("Volume %s: one or more volume process are "
                              "not up", self.volname))
        g.log.info("All volume %s processes are online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s" % self.volname))

        # Start Rebalance
        g.log.info("Starting rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on %s ",
                                  self.volname))
        g.log.info("Successfully started rebalance on %s ",
                   self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # restart glusterd on all servers
        g.log.info("Restart glusterd on all servers %s", self.servers)
        ret = restart_glusterd(self.servers)
        self.assertTrue(ret, ("Failed to restart glusterd on all servers %s",
                              self.servers))
        g.log.info("Successfully restarted glusterd on all servers %s",
                   self.servers)

        # Check if glusterd is running on all servers(expected: active)
        g.log.info("Check if glusterd is running on all servers %s"
                   "(expected: active)", self.servers)
        ret = is_glusterd_running(self.servers)
        self.assertEqual(ret, 0, ("Glusterd is not running on all servers %s",
                                  self.servers))
        g.log.info("Glusterd is running on all the servers %s", self.servers)

        # Check if rebalance process has started after glusterd restart
        g.log.info("Checking if rebalance process has started after "
                   "glusterd restart")
        for server in self.servers:
            ret, _, _ = g.run(server, "pgrep rebalance")
            self.assertNotEqual(ret, 0, ("Rebalance process is triggered on "
                                         "%s after glusterd restart", server))
            g.log.info("Rebalance is NOT triggered on %s after glusterd "
                       "restart", server)
Example #17
    def test_nfs_ganesha_export_with_multiple_volumes(self):
        """
        Test case to verify multiple volumes gets exported when IO is in
        progress.
        """
        # Starting IO on the mounts
        all_mounts_procs = []
        count = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Create and export five new volumes
        for i in range(5):
            # Check availability of bricks to create new volume
            num_of_unused_bricks = 0

            servers_unused_bricks_dict = get_servers_unused_bricks_dict(
                self.mnode, self.all_servers, self.all_servers_info)
            for each_server_unused_bricks_list in list(
                    servers_unused_bricks_dict.values()):
                num_of_unused_bricks = (num_of_unused_bricks +
                                        len(each_server_unused_bricks_list))

            if num_of_unused_bricks < 2:
                self.assertNotEqual(
                    i, 0, "New volume cannot be created due "
                    "to unavailability of bricks.")
                g.log.warning(
                    "Tried to create five new volumes. But could "
                    "create only %s volume due to unavailability "
                    "of bricks.", str(i))
                break

            self.volume['name'] = "nfsvol" + str(i)
            self.volume['voltype']['type'] = 'distributed'
            self.volume['voltype']['replica_count'] = 1
            self.volume['voltype']['dist_count'] = 2

            new_vol = self.volume['name']

            # Create volume
            ret = setup_volume(mnode=self.mnode,
                               all_servers_info=self.all_servers_info,
                               volume_config=self.volume,
                               force=True)
            self.assertTrue(ret, "Setup volume [%s] failed" % new_vol)

            g.log.info("Wait for volume processes to be online")
            ret = wait_for_volume_process_to_be_online(self.mnode, new_vol)
            self.assertTrue(
                ret, "Volume %s process not online despite "
                "waiting for 300 seconds" % new_vol)

            # Export volume with nfs ganesha
            ret, _, _ = export_nfs_ganesha_volume(mnode=self.mnode,
                                                  volname=new_vol)
            self.assertEqual(ret, 0, ("Failed to export volume %s "
                                      "using nfs-ganesha" % new_vol))

            # Wait for volume to get exported
            ret = wait_for_nfs_ganesha_volume_to_get_exported(
                self.mnode, new_vol)
            self.assertTrue(
                ret, "Volume %s is not exported after setting "
                "ganesha.enable 'on'" % new_vol)
            g.log.info("Exported nfs-ganesha volume %s", new_vol)

            # Log Volume Info and Status
            ret = log_volume_info_and_status(self.mnode, new_vol)
            self.assertTrue(
                ret, "Logging volume %s info and status failed" % new_vol)

        # Validate IO
        g.log.info("Validating IO")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all IO")
Example #18
    def test_expanding_volume_when_io_in_progress(self):
        # pylint: disable=too-many-statements
        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Expanding volume by adding bricks to the volume when IO in progress
        g.log.info("Start adding bricks to volume when IO in progress")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume while IO in "
                              "progress on volume %s", self.volname))
        g.log.info(
            "Expanding volume while IO in progress on "
            "volume %s : Success", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Waiting for volume %s process to be online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Started rebalance on the volume %s: Success", self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=1800)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance status on volume %s: Complete", self.volname)

        # Check Rebalance status after rebalance is complete
        g.log.info("Checking Rebalance status")
        ret, _, _ = rebalance_status(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to get rebalance status for the "
                                  "volume %s", self.volname))
        g.log.info("Rebalance status on volume %s: Complete", self.volname)

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO on all mounts: Complete")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("List all files and directories: Success")

        # DHT Layout validation
        g.log.debug("Verifying hash layout values %s:%s", self.clients[0],
                    self.mounts[0].mountpoint)
        ret = validate_files_in_dir(self.clients[0],
                                    self.mounts[0].mountpoint,
                                    test_type=LAYOUT_IS_COMPLETE,
                                    file_type=FILETYPE_DIRS)
        self.assertTrue(ret, "LAYOUT_IS_COMPLETE: FAILED")
        g.log.info("LAYOUT_IS_COMPLETE: PASS")

        # Checking if there are any migration failures
        status = get_rebalance_status(self.mnode, self.volname)
        for each_node in status['node']:
            self.assertEqual(
                0, int(each_node['failures']),
                "Rebalance failed to migrate few files on %s" %
                each_node['nodeName'])
            g.log.info("No migration failures on %s", each_node['nodeName'])
Example #19
    def test_afr_reset_brick_heal_full(self):
        """
         1. Create files/dirs from mount point
         2. With IO in progress execute reset-brick start
         3. Now format the disk from back-end, using rm -rf <brick path>
         4. Execute reset brick commit and check for the brick is online.
         5. Issue volume heal using "gluster vol heal <volname> full"
         6. Check arequal for all bricks to verify all backend bricks
            including the reset brick have the same data
        """
        self.all_mounts_procs = []
        for count, mount_obj in enumerate(self.mounts):
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d --dir-depth 3 --dir-length 5 "
                   "--max-num-of-dirs 5 --num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "Unable to fetch bricks of volume")
        brick_to_reset = choice(all_bricks)

        # Start reset brick
        ret, _, err = reset_brick(self.mnode,
                                  self.volname,
                                  src_brick=brick_to_reset,
                                  option="start")
        self.assertEqual(ret, 0, err)
        g.log.info("Reset brick: %s started", brick_to_reset)

        # Validate the brick is offline
        ret = are_bricks_offline(self.mnode, self.volname, [brick_to_reset])
        self.assertTrue(ret, "Brick:{} is still online".format(brick_to_reset))

        # rm -rf of the brick directory
        node, brick_path = brick_to_reset.split(":")
        ret = rmdir(node, brick_path, force=True)
        self.assertTrue(
            ret, "Unable to delete the brick {} on "
            "node {}".format(brick_path, node))

        # Reset brick commit
        ret, _, err = reset_brick(self.mnode,
                                  self.volname,
                                  src_brick=brick_to_reset,
                                  option="commit")
        self.assertEqual(ret, 0, err)
        g.log.info("Reset brick committed successfully")

        # Check the brick is online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Few volume processess are offline for the "
            "volume: {}".format(self.volname))

        # Trigger full heal
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, "Unable  to trigger the heal full command")

        # Wait for the heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Heal didn't complete in 20 mins time")

        # Validate io on the clients
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on the mounts")
        self.all_mounts_procs = []

        # Check arequal of the back-end bricks after heal completion
        all_subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        for subvol in all_subvols:
            ret, arequal_from_subvol = collect_bricks_arequal(subvol)
            self.assertTrue(
                ret, "Failed to collect arequal on the bricks in the"
                " subvol {}".format(subvol))
            self.assertEqual(
                len(set(arequal_from_subvol)), 1, "Arequal is not the "
                "same on all the bricks in the subvol {}".format(subvol))
    def test_fix_layout_start(self):
        # pylint: disable=too-many-statements
        # Get arequal checksum before starting fix-layout
        g.log.info("Getting arequal checksum before fix-layout")
        arequal_checksum_before_fix_layout = collect_mounts_arequal(
            self.mounts)

        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(
            ret, "Logging volume info and status failed on "
            "volume %s" % self.volname)
        g.log.info(
            "Successful in logging volume info and status of volume "
            "%s", self.volname)

        # Form brick list for expanding volume
        add_brick_list = form_bricks_list_to_add_brick(self.mnode,
                                                       self.volname,
                                                       self.servers,
                                                       self.all_servers_info,
                                                       distribute_count=1)
        self.assertIsNotNone(add_brick_list,
                             ("Volume %s: Failed to form "
                              "bricks list to expand", self.volname))
        g.log.info("Volume %s: Formed bricks list to expand", self.volname)

        # Expanding volume by adding bricks to the volume
        g.log.info("Volume %s: Expand start")
        ret, _, _ = add_brick(self.mnode, self.volname, add_brick_list)
        self.assertEqual(ret, 0, ("Volume %s: Expand failed", self.volname))
        g.log.info("Volume %s: Expand successful", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s: one or more volume process are "
                              "not up", self.volname))
        g.log.info("All volume %s processes are online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(
            ret, "Logging volume info and status failed on "
            "volume %s" % self.volname)
        g.log.info(
            "Successful in logging volume info and status of volume "
            "%s", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Start Rebalance fix-layout
        g.log.info("Starting fix-layout on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname, fix_layout=True)
        self.assertEqual(ret, 0, ("Volume %s: fix-layout start failed"
                                  "%s", self.volname))
        g.log.info("Volume %s: fix-layout start success", self.volname)

        # Wait for fix-layout to complete
        g.log.info("Waiting for fix-layout to complete")
        ret = wait_for_fix_layout_to_complete(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s: Fix-layout is still in-progress", self.volname))
        g.log.info("Volume %s: Fix-layout completed successfully",
                   self.volname)

        # Check Rebalance status after fix-layout is complete
        g.log.info("Checking Rebalance status")
        ret, _, _ = rebalance_status(self.mnode, self.volname)
        self.assertEqual(
            ret, 0,
            ("Volume %s: Failed to get rebalance status", self.volname))
        g.log.info("Volume %s: Successfully got rebalance status",
                   self.volname)

        # Get arequal checksum after fix-layout is complete
        g.log.info("arequal after fix-layout is complete")
        arequal_checksum_after_fix_layout = collect_mounts_arequal(self.mounts)

        # Compare arequals checksum before and after fix-layout
        g.log.info("Comparing checksum before and after fix-layout")
        self.assertEqual(arequal_checksum_before_fix_layout,
                         arequal_checksum_after_fix_layout,
                         "arequal checksum is NOT MATCHNG")
        g.log.info("arequal checksum is SAME")

        # Check if there are any file migrations after fix-layout
        status_info = get_rebalance_status(self.mnode, self.volname)
        for node in range(len(status_info['node'])):
            file_migration_count = status_info['node'][node]['files']
            self.assertEqual(
                int(file_migration_count), 0,
                ("Server %s: Some files were migrated", self.servers[node]))
            g.log.info("Server %s: No files are migrated", self.servers[node])

        # Check if the new bricks contain any files
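        # (DHT linkto files carry only the sticky bit, mode 1000, which is
        # why '! -perm 1000' filters them out of the find below.)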
        for brick in add_brick_list:
            brick_node, brick_path = brick.split(":")
            cmd = ('find %s -type f ! -perm 1000 | grep -ve .glusterfs' %
                   brick_path)
            _, out, _ = g.run(brick_node, cmd)
            self.assertEqual(
                len(out), 0,
                "Files (excluding linkto files) are present on %s:%s" %
                (brick_node, brick_path))
            g.log.info("No files (excluding linkto files) are present on "
                       "%s:%s", brick_node, brick_path)
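
For reference, a minimal sketch of the invariant this test enforces: a
fix-layout-only rebalance recomputes directory layouts but must not migrate
any files (the helper name is illustrative, not part of glusto):

def assert_no_files_migrated(mnode, volname):
    # Every node's 'files' counter in rebalance status must stay 0
    # after a fix-layout run.
    status = get_rebalance_status(mnode, volname)
    for node_status in status['node']:
        assert int(node_status['files']) == 0, (
            "Files were migrated on %s" % node_status['nodeName'])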
Example #21
    def test_metadata_self_heal(self):
        """
        Test MetaData Self-Heal (heal command)

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        - create IO
        - set the volume option
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - Change the permissions, ownership and the group
        of the files under "test_meta_data_self_heal" folder
        - get arequal before getting bricks online
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check if heal is completed
        - check for split-brain
        - get arequal after getting bricks online and compare with
        arequal before getting bricks online
        - check group and user are 'qa'
        """
        # pylint: disable=too-many-locals,too-many-statements
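        # For reference, set_volume_options() below maps to one gluster CLI
        # call per option (a sketch):
        #   gluster volume set <VOLNAME> metadata-self-heal off
        #   gluster volume set <VOLNAME> entry-self-heal off
        #   gluster volume set <VOLNAME> data-self-heal off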
        # Setting options
        g.log.info('Setting options...')
        options = {"metadata-self-heal": "off",
                   "entry-self-heal": "off",
                   "data-self-heal": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal', "
                   "are set to 'off' successfully")

        # Creating files on client side
        all_mounts_procs = []
        test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
        g.log.info("Generating data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)

        # Create files
        g.log.info('Creating files...')
        command = ("cd %s/ ; "
                   "mkdir %s ;"
                   "cd %s/ ;"
                   "for i in `seq 1 50` ; "
                   "do dd if=/dev/urandom of=test.$i bs=10k count=1 ; "
                   "done ;"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder,
                      test_meta_data_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # wait for io to complete
        self.assertTrue(
            wait_for_io_to_complete(all_mounts_procs, self.mounts),
            "Io failed to complete on some of the clients")

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Changing the permissions, ownership and the group
        # of the files under "test_meta_data_self_heal" folder
        g.log.info("Modifying data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)

        # Change permissions to 444
        g.log.info('Changing permissions...')
        command = ("cd %s/%s/ ; "
                   "chmod -R 444 *"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder))
        ret, out, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Permissions are changed successfully')

        # Change the ownership to qa
        g.log.info('Changing the ownership...')
        command = ("cd %s/%s/ ; "
                   "chown -R qa *"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder))
        ret, out, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Ownership is changed successfully')

        # Change the group to qa
        g.log.info('Changing the group...')
        command = ("cd %s/%s/ ; "
                   "chgrp -R qa *"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder))
        ret, out, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Group is changed successfully')

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume process %s not online "
                              "despite waiting for 5 minutes", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertItemsEqual(result_before_online, result_after_online,
                              'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')

        # Adding servers and client in single dict to check permissions
        nodes_to_check = {}
        all_bricks = get_all_bricks(self.mnode, self.volname)
        for brick in all_bricks:
            node, brick_path = brick.split(':')
            nodes_to_check[node] = brick_path
        nodes_to_check[self.mounts[0].client_system] = \
            self.mounts[0].mountpoint

        # Checking for user and group
        for node in nodes_to_check:
            # Get file list
            command = ("cd %s/%s/ ; "
                       "ls"
                       % (nodes_to_check[node],
                          test_meta_data_self_heal_folder))
            ret, out, err = g.run(node, command)
            file_list = out.split()

            for file_name in file_list:
                file_to_check = '%s/%s/%s' % (nodes_to_check[node],
                                              test_meta_data_self_heal_folder,
                                              file_name)

                g.log.info('Checking for permissions, user and group for %s',
                           file_name)

                # Check for permissions
                cmd = ("stat -c '%a %n' {} | awk '{{print $1}}'"
                       .format(file_to_check))
                ret, permissions, _ = g.run(node, cmd)
                self.assertEqual(permissions.split('\n')[0], '444',
                                 'Permissions %s are not equal to 444'
                                 % permissions)
                g.log.info("Permissions are '444' for %s", file_name)

                # Check for user
                cmd = ("ls -ld {} | awk '{{print $3}}'"
                       .format(file_to_check))
                ret, username, _ = g.run(node, cmd)
                self.assertEqual(username.split('\n')[0],
                                 'qa', 'User %s is not equal to qa'
                                 % username)
                g.log.info("User is 'qa' for %s", file_name)

                # Check for group
                cmd = ("ls -ld {} | awk '{{print $4}}'"
                       .format(file_to_check))
                ret, groupname, _ = g.run(node, cmd)
                self.assertEqual(groupname.split('\n')[0],
                                 'qa', 'Group %s is not equal to qa'
                                 % groupname)
                g.log.info("Group is 'qa' for %s", file_name)
    def test_subdir_with_addbrick(self):

        # pylint: disable=too-many-statements
        """
        Mount the volume
        Create 2 subdir on mount point, subdir1 and subdir2
        Auth allow - Client1(subdir1,subdir2),Client2(subdir1,subdir2)
        Mount the subdir1 on client 1 and subdir2 on client2
        Start IO's on both subdirs
        Perform add-brick and rebalance
        """

        # Create  directories subdir1 and subdir2 on mount point
        ret = mkdir(self.mounts[0].client_system,
                    "%s/subdir1" % self.mounts[0].mountpoint)
        self.assertTrue(
            ret, ("Failed to create directory 'subdir1' on"
                  "volume %s from client %s" %
                  (self.mounts[0].volname, self.mounts[0].client_system)))
        ret = mkdir(self.mounts[0].client_system,
                    "%s/subdir2" % self.mounts[0].mountpoint)
        self.assertTrue(
            ret, ("Failed to create directory 'subdir2' on"
                  "volume %s from client %s" %
                  (self.mounts[0].volname, self.mounts[0].client_system)))
        # unmount volume
        ret = self.unmount_volume(self.mounts)
        self.assertTrue(ret, "Volumes Unmount failed")
        g.log.info("Volumes Unmounted successfully")

        # Set authentication on the subdirectory subdir1
        # and subdir2 to access by 2 clients
        g.log.info(
            'Setting authentication on subdir1 and subdir2 '
            'for clients %s and %s', self.clients[0], self.clients[1])
        ret = set_auth_allow(
            self.volname, self.mnode, {
                '/subdir1': [self.clients[0], self.clients[1]],
                '/subdir2': [self.clients[0], self.clients[1]]
            })
        self.assertTrue(
            ret, 'Failed to set authentication on volume %s' % self.volname)
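        # For reference, set_auth_allow() is expected to translate the dict
        # above into an auth.allow option of roughly this shape (a sketch;
        # exact quoting depends on the glusto helper):
        #   gluster volume set <VOLNAME> \
        #     auth.allow "/subdir1(<client1>|<client2>),/subdir2(<client1>|<client2>)"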

        # Creating mount list for subdirectories
        self.subdir_mounts = [
            copy.deepcopy(self.mounts[0]),
            copy.deepcopy(self.mounts[1])
        ]
        self.subdir_mounts[0].volname = "%s/subdir1" % self.volname
        self.subdir_mounts[1].volname = "%s/subdir2" % self.volname

        # Mount Subdirectory "subdir1" on client 1 and "subdir2" on client 2
        for mount_obj in self.subdir_mounts:
            ret = mount_obj.mount()
            self.assertTrue(
                ret, ("Failed to mount  %s on client"
                      " %s" % (mount_obj.volname, mount_obj.client_system)))
            g.log.info("Successfully mounted %s on client %s",
                       mount_obj.volname, mount_obj.client_system)
        g.log.info("Successfully mounted subdirectories on client1"
                   "and clients 2")

        # Start IO on all mounts.
        all_mounts_procs = []
        count = 1
        for mount_obj in self.subdir_mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Validate IO
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.subdir_mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Get stat of all the files/dirs created.
        g.log.info("Get stat of all the files/dirs created.")
        ret = get_mounts_stat(self.subdir_mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")

        # Start add-brick (subvolume-increase)
        g.log.info("Start adding bricks to volume when IO in progress")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info(
            "Expanding volume when IO in progress is successful on "
            "volume %s", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("All process  for volume %s are not"
                              "online", self.volname))
        g.log.info("All volume %s processes are now online", self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname, 600)
        self.assertTrue(
            ret, "Rebalance did not complete "
            "despite waiting for 10 minutes")
        g.log.info("Rebalance successfully completed on the volume %s",
                   self.volname)

        # Again validate if subdirectories are still mounted post add-brick

        for mount_obj in self.subdir_mounts:
            ret = mount_obj.is_mounted()
            self.assertTrue(
                ret, ("Subdirectory %s is not mounted on client"
                      " %s" % (mount_obj.volname, mount_obj.client_system)))
            g.log.info("Subdirectory %s is mounted on client %s",
                       mount_obj.volname, mount_obj.client_system)
        g.log.info("Successfully validated that subdirectories are mounted"
                   "on client1 and clients 2 post add-brick operation")
    def test_rebalance_with_force(self):

        # Getting arequal checksum before rebalance
        g.log.info("Getting arequal checksum before rebalance")
        arequal_checksum_before_rebalance = collect_mounts_arequal(self.mounts)

        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(
            ret, "Logging volume info and status failed on "
            "volume %s" % self.volname)
        g.log.info(
            "Successful in logging volume info and"
            "status of volume %s", self.volname)

        # Expanding volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Volume %s: Expand failed", self.volname))
        g.log.info("Volume %s: Expand successful", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s: one or more volume process are "
                              "not up", self.volname))
        g.log.info("All volume %s processes are online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info(
            "Successful in logging volume info and"
            "status of volume %s", self.volname)

        # Start Rebalance with force
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, ("Volume %s: Failed to start rebalance with "
                                  "force", self.volname))
        g.log.info("Volume %s: Started rebalance with force option",
                   self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=600)
        self.assertTrue(
            ret, ("Volume %s: Rebalance is still in-progress ", self.volname))
        g.log.info("Volume %s: Rebalance completed", self.volname)

        # Getting arequal checksum after rebalance
        g.log.info("Getting arequal checksum after rebalance with force "
                   "option")
        arequal_checksum_after_rebalance = collect_mounts_arequal(self.mounts)

        # Comparing arequals checksum before and after rebalance with force
        # option
        g.log.info("Comparing arequals checksum before and after rebalance"
                   "with force option")
        self.assertEqual(arequal_checksum_before_rebalance,
                         arequal_checksum_after_rebalance,
                         "arequal checksum is NOT MATCHNG")
        g.log.info("arequal checksum is SAME")

        # Checking if rebalance skipped any files
        status = get_rebalance_status(self.mnode, self.volname)
        for each_node in status['node']:
            self.assertEqual(
                int(each_node['skipped']), 0,
                "Few files are skipped on node %s" % each_node['nodeName'])
            g.log.info("No files are skipped on %s", each_node['nodeName'])
    def test_write_io_mount_point_resumed_quorum_restored_x3(self):
        """
        - set cluster.quorum-type to auto
        - start I/O from the mount point
        - Do IO and check on subvols with two nodes to reboot
        (do for each subvol)
        - get files to delete/create for nodes to be offline
        - delete files from mountpoint
        - reboot nodes
        - creating files on nodes while rebooting
        - validate for rofs
        - wait for volume processes to be online
        - creating files on nodes after rebooting
        - validate IO
        - Do IO and check on subvols without nodes to reboot
        (do for each subvol)
        - get files to delete/create for nodes to be online
        - delete files from mountpoint
        - reboot nodes
        - creating files on online nodes while rebooting other nodes
        - validate IO
        - Do IO and check and reboot two nodes on all subvols
        - get files to delete/create for nodes to be offline
        - delete files from mountpoint
        - reboot nodes
        - creating files on nodes while rebooting
        - validate for rofs
        - wait for volume processes to be online
        - creating files on nodes after rebooting
        - validate IO
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-branches
        # set cluster.quorum-type to auto
        options = {"cluster.quorum-type": "auto"}
        g.log.info("setting cluster.quorum-type to auto on volume %s",
                   self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, ("Unable to set volume option %s for"
                              "volume %s" % (options, self.volname)))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)

            # Creating files
            cmd = "/usr/bin/env python %s create_files -f 30 %s" % (
                self.script_upload_path, mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        # Validate IO
        self.io_validation_complete = False
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # Do IO and check on subvols with nodes to reboot
        subvols_dict = get_subvols(self.mnode, self.volname)
        for subvol in subvols_dict['volume_subvols']:
            # define nodes to reboot
            brick_list = subvol[0:2]
            nodes_to_reboot = []
            for brick in brick_list:
                node, brick_path = brick.split(':')
                nodes_to_reboot.append(node)

            # get files to delete/create for nodes to be offline
            node, brick_path = brick_list[0].split(':')
            ret, brick_file_list, _ = g.run(node, 'ls %s' % brick_path)
            self.assertFalse(ret, 'Failed to ls files on %s' % node)
            file_list = brick_file_list.splitlines()

            # delete files from mountpoint
            for mount_obj in self.mounts:
                g.log.info("Deleting data for %s:%s", mount_obj.client_system,
                           mount_obj.mountpoint)
                cmd = ('cd %s/ ; rm -rf %s' %
                       (mount_obj.mountpoint, ' '.join(file_list)))
                ret, _, _ = g.run(mount_obj.client_system, cmd)
                self.assertFalse(
                    ret, 'Failed to rm file on %s' % mount_obj.client_system)
            g.log.info('Files %s are deleted', file_list)

            # reboot nodes on subvol and wait while rebooting
            g.log.info("Rebooting the nodes %s", nodes_to_reboot)
            ret = reboot_nodes(nodes_to_reboot)
            self.assertTrue(ret,
                            'Failed to reboot nodes %s ' % nodes_to_reboot)

            # Creating files on nodes while rebooting
            self.all_mounts_procs = []
            for mount_obj in self.mounts:
                g.log.info("Creating data for %s:%s", mount_obj.client_system,
                           mount_obj.mountpoint)

                # Creating files
                cmd = ("cd %s/ ;"
                       "touch %s" %
                       (mount_obj.mountpoint, ' '.join(file_list)))

                proc = g.run_async(mount_obj.client_system,
                                   cmd,
                                   user=mount_obj.user)
                self.all_mounts_procs.append(proc)

            # Validate IO once all create procs are started
            self.io_validation_complete = False
            g.log.info("Validating if IO failed with read-only filesystem")
            ret = is_io_procs_fail_with_rofs(self, self.all_mounts_procs,
                                             self.mounts)
            self.assertTrue(ret, ("Unexpected error and IO successful"
                                  " on read-only filesystem"))
            self.io_validation_complete = True
            g.log.info("EXPECTED: "
                       "Read-only file system in IO while creating file")

            # check if nodes are online
            counter = 0
            timeout = 300
            _rc = False
            while counter < timeout:
                ret, reboot_results = are_nodes_online(nodes_to_reboot)
                if not ret:
                    g.log.info("Nodes are offline, Retry after 5 seconds ... ")
                    time.sleep(5)
                    counter = counter + 5
                else:
                    _rc = True
                    break

            if not _rc:
                for node in reboot_results:
                    if reboot_results[node]:
                        g.log.info("Node %s is online", node)
                    else:
                        g.log.error(
                            "Node %s is offline even after "
                            "%d minutes", node, timeout / 60.0)
            else:
                g.log.info("All nodes %s are up and running", nodes_to_reboot)

            # Wait for volume processes to be online
            g.log.info("Wait for volume processes to be online")
            ret = wait_for_volume_process_to_be_online(self.mnode,
                                                       self.volname)
            self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                                  "be online", self.volname))
            g.log.info(
                "Successful in waiting for volume %s processes to be "
                "online", self.volname)

            # Verify volume's all process are online
            g.log.info("Verifying volume's all process are online")
            ret = verify_all_process_of_volume_are_online(
                self.mnode, self.volname)
            self.assertTrue(
                ret, ("Volume %s : All process are not online" % self.volname))
            g.log.info("Volume %s : All process are online", self.volname)

            # Creating files on nodes after rebooting
            self.all_mounts_procs = []
            for mount_obj in self.mounts:
                g.log.info("Creating data for %s:%s", mount_obj.client_system,
                           mount_obj.mountpoint)

                # Creating files
                cmd = ("cd %s/ ;"
                       "touch %s" %
                       (mount_obj.mountpoint, ' '.join(file_list)))

                proc = g.run_async(mount_obj.client_system,
                                   cmd,
                                   user=mount_obj.user)
                self.all_mounts_procs.append(proc)

            # Validate IO
            self.io_validation_complete = False
            self.assertTrue(
                validate_io_procs(self.all_mounts_procs, self.mounts),
                "IO failed on some of the clients")
            self.io_validation_complete = True

        # Do IO and check on subvols without nodes to reboot
        subvols_dict = get_subvols(self.mnode, self.volname)
        for subvol in subvols_dict['volume_subvols']:
            # define nodes to reboot
            brick_list = subvol[0:2]
            nodes_to_reboot = []
            for brick in brick_list:
                node, brick_path = brick.split(':')
                nodes_to_reboot.append(node)

            # get files to delete/create for nodes to be online
            new_subvols_dict = get_subvols(self.mnode, self.volname)
            subvol_to_operate = new_subvols_dict['volume_subvols']
            subvol_to_operate.remove(subvol)
            brick_list_subvol_online = subvol_to_operate[0]

            node, brick_path_vol_online = \
                brick_list_subvol_online[0].split(':')
            ret, brick_file_list, _ = g.run(node,
                                            'ls %s' % brick_path_vol_online)
            self.assertFalse(ret, 'Failed to ls files on %s' % node)
            file_list = brick_file_list.splitlines()

            # delete files from mountpoint
            for mount_obj in self.mounts:
                g.log.info("Deleting data for %s:%s", mount_obj.client_system,
                           mount_obj.mountpoint)
                cmd = ('cd %s/ ; rm -rf %s' %
                       (mount_obj.mountpoint, ' '.join(file_list)))
                ret, _, _ = g.run(mount_obj.client_system, cmd)
                self.assertFalse(
                    ret, 'Failed to rm file on %s' % mount_obj.client_system)
            g.log.info('Files %s are deleted', file_list)

            # reboot nodes on subvol and wait while rebooting
            g.log.info("Rebooting the nodes %s", nodes_to_reboot)
            ret = reboot_nodes(nodes_to_reboot)
            self.assertTrue(ret,
                            'Failed to reboot nodes %s ' % nodes_to_reboot)

            # Creating files on nodes while rebooting
            self.all_mounts_procs = []
            for mount_obj in self.mounts:
                g.log.info("Creating data for %s:%s", mount_obj.client_system,
                           mount_obj.mountpoint)

                # Creating files
                cmd = ("cd %s/ ;"
                       "touch %s" %
                       (mount_obj.mountpoint, ' '.join(file_list)))

                proc = g.run_async(mount_obj.client_system,
                                   cmd,
                                   user=mount_obj.user)
                self.all_mounts_procs.append(proc)

            # Validate IO once all create procs are started
            self.io_validation_complete = False
            self.assertTrue(
                validate_io_procs(self.all_mounts_procs, self.mounts),
                "IO failed on some of the clients")
            self.io_validation_complete = True

            # check if nodes are online
            counter = 0
            timeout = 300
            _rc = False
            while counter < timeout:
                ret, reboot_results = are_nodes_online(nodes_to_reboot)
                if not ret:
                    g.log.info("Nodes are offline, Retry after 5 seconds ... ")
                    time.sleep(5)
                    counter = counter + 5
                else:
                    _rc = True
                    break

            if not _rc:
                for node in reboot_results:
                    if reboot_results[node]:
                        g.log.info("Node %s is online", node)
                    else:
                        g.log.error(
                            "Node %s is offline even after "
                            "%d minutes", node, timeout / 60.0)
            else:
                g.log.info("All nodes %s are up and running", nodes_to_reboot)

            # Wait for volume processes to be online
            g.log.info("Wait for volume processes to be online")
            ret = wait_for_volume_process_to_be_online(self.mnode,
                                                       self.volname)
            self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                                  "be online", self.volname))
            g.log.info(
                "Successful in waiting for volume %s processes to be "
                "online", self.volname)

            # Verify volume's all process are online
            g.log.info("Verifying volume's all process are online")
            ret = verify_all_process_of_volume_are_online(
                self.mnode, self.volname)
            self.assertTrue(
                ret, ("Volume %s : All process are not online" % self.volname))
            g.log.info("Volume %s : All process are online", self.volname)

        # Do IO and check and reboot nodes on all subvols
        subvols_dict = get_subvols(self.mnode, self.volname)
        nodes_to_reboot = []
        file_list_for_all_subvols = []
        for subvol in subvols_dict['volume_subvols']:
            # define nodes to reboot
            brick_list = subvol[0:2]
            for brick in brick_list:
                node, brick_path = brick.split(':')
                nodes_to_reboot.append(node)

            # get files to delete/create for nodes to be offline
            node, brick_path = brick_list[0].split(':')
            ret, brick_file_list, _ = g.run(node, 'ls %s' % brick_path)
            self.assertFalse(ret, 'Failed to ls files on %s' % node)
            file_list = brick_file_list.splitlines()
            file_list_for_all_subvols.append(file_list)

            # delete files from mountpoint
            for mount_obj in self.mounts:
                g.log.info("Deleting data for %s:%s", mount_obj.client_system,
                           mount_obj.mountpoint)
                cmd = ('cd %s/ ; rm -rf %s' %
                       (mount_obj.mountpoint, ' '.join(file_list)))
                ret, _, _ = g.run(mount_obj.client_system, cmd)
                self.assertFalse(ret, 'Failed to rm file on %s' %
                                 mount_obj.client_system)
            g.log.info('Files %s are deleted', file_list)

        # reboot nodes on subvol and wait while rebooting
        g.log.info("Rebooting the nodes %s", nodes_to_reboot)
        ret = reboot_nodes(nodes_to_reboot)
        self.assertTrue(ret, 'Failed to reboot nodes %s ' % nodes_to_reboot)

        # Creating files on nodes while rebooting
        all_mounts_procs, all_mounts_procs_1, all_mounts_procs_2 = [], [], []
        # Create files for the first subvol and get all_mounts_procs_1
        for mount_obj in self.mounts:
            g.log.info("Creating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)

            # Creating files
            cmd = (
                "cd %s/ ;"
                "touch %s" %
                (mount_obj.mountpoint, ' '.join(file_list_for_all_subvols[0])))

            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs_1.append(proc)
        all_mounts_procs.append(all_mounts_procs_1)

        # Create files for the second subvol and get all_mounts_procs_2
        for mount_obj in self.mounts:
            g.log.info("Creating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)

            # Creating files
            cmd = (
                "cd %s/ ;"
                "touch %s" %
                (mount_obj.mountpoint, ' '.join(file_list_for_all_subvols[1])))

            proc2 = g.run_async(mount_obj.client_system,
                                cmd,
                                user=mount_obj.user)
            all_mounts_procs_2.append(proc2)
        all_mounts_procs.append(all_mounts_procs_2)

        for mounts_procs in all_mounts_procs:
            # Validate IO
            self.io_validation_complete = False
            g.log.info("Validating if IO failed with read-only filesystem")
            ret = is_io_procs_fail_with_rofs(self, mounts_procs, self.mounts)
            self.assertTrue(ret, ("Unexpected error and IO successful"
                                  " on read-only filesystem"))
            self.io_validation_complete = True
            g.log.info("EXPECTED: "
                       "Read-only file system in IO while creating file")

        # check if nodes are online
        counter = 0
        timeout = 300
        _rc = False
        while counter < timeout:
            ret, reboot_results = are_nodes_online(nodes_to_reboot)
            if not ret:
                g.log.info("Nodes are offline, Retry after 5 seconds ... ")
                time.sleep(5)
                counter = counter + 5
            else:
                _rc = True
                break

        if not _rc:
            for node in reboot_results:
                if reboot_results[node]:
                    g.log.info("Node %s is online", node)
                else:
                    g.log.error("Node %s is offline even after "
                                "%d minutes", node, timeout / 60.0)
        else:
            g.log.info("All nodes %s are up and running", nodes_to_reboot)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Creating files on nodes after rebooting
        all_mounts_procs, all_mounts_procs_1, all_mounts_procs_2 = [], [], []
        # Create files for the first subvol and get all_mounts_procs_1
        for mount_obj in self.mounts:
            g.log.info("Creating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)

            # Creating files
            cmd = (
                "cd %s/ ;"
                "touch %s" %
                (mount_obj.mountpoint, ' '.join(file_list_for_all_subvols[0])))

            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs_1.append(proc)
        all_mounts_procs.append(all_mounts_procs_1)

        # Create files for the second subvol and get all_mounts_procs_2
        for mount_obj in self.mounts:
            g.log.info("Creating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)

            # Creating files
            cmd = (
                "cd %s/ ;"
                "touch %s" %
                (mount_obj.mountpoint, ' '.join(file_list_for_all_subvols[1])))

            proc2 = g.run_async(mount_obj.client_system,
                                cmd,
                                user=mount_obj.user)
            all_mounts_procs_2.append(proc2)
        all_mounts_procs.append(all_mounts_procs_2)

        for mounts_procs in all_mounts_procs:
            # Validate IO
            self.io_validation_complete = False
            self.assertTrue(
                validate_io_procs(mounts_procs, self.mounts),
                "IO failed on some of the clients")
            self.io_validation_complete = True

    def setup_volume(cls, volume_create_force=False):
        """Setup the volume:
            - Create the volume, Start volume, Set volume
            options, enable snapshot/quota/tier if specified in the config
            file.
            - Wait for volume processes to be online
            - Export volume as NFS/SMB share if mount_type is NFS or SMB
            - Log volume info and status

        Args:
            volume_create_force(bool): True if create_volume should be
                executed with 'force' option.

        Returns (bool): True if all the steps mentioned in the description
            pass. False otherwise.
        """
        force_volume_create = False
        if volume_create_force or cls.volume_create_force:
            force_volume_create = True

        # Validate peers before setting up volume
        g.log.info("Validate peers before setting up volume ")
        ret = cls.validate_peers_are_connected()
        if not ret:
            g.log.error("Failed to validate peers are in connected state "
                        "before setting up volume")
            return False
        g.log.info("Successfully validated peers are in connected state "
                   "before setting up volume")

        # Setup Volume
        g.log.info("Setting up volume %s", cls.volname)
        ret = setup_volume(mnode=cls.mnode,
                           all_servers_info=cls.all_servers_info,
                           volume_config=cls.volume,
                           force=force_volume_create)
        if not ret:
            g.log.error("Failed to Setup volume %s", cls.volname)
            return False
        g.log.info("Successful in setting up volume %s", cls.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume %s processes to be online", cls.volname)
        ret = wait_for_volume_process_to_be_online(cls.mnode, cls.volname)
        if not ret:
            g.log.error(
                "Failed to wait for volume %s processes to "
                "be online", cls.volname)
            return False
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", cls.volname)

        # Export/Share the volume based on mount_type
        if cls.mount_type != "glusterfs":
            g.log.info("Export/Sharing the volume %s", cls.volname)
            if "nfs" in cls.mount_type:
                ret = export_volume_through_nfs(
                    mnode=cls.mnode,
                    volname=cls.volname,
                    enable_ganesha=cls.enable_nfs_ganesha)
                if not ret:
                    g.log.error("Failed to export volume %s "
                                "as NFS export", cls.volname)
                    return False
                g.log.info(
                    "Successful in exporting the volume %s "
                    "as NFS export", cls.volname)

                # Set NFS-Ganesha specific volume options
                if cls.enable_nfs_ganesha and cls.nfs_ganesha_export_options:
                    g.log.info(
                        "Setting NFS-Ganesha export specific "
                        "volume options on volume %s", cls.volname)
                    ret = set_volume_options(
                        mnode=cls.mnode,
                        volname=cls.volname,
                        options=cls.nfs_ganesha_export_options)
                    if not ret:
                        g.log.error(
                            "Failed to set NFS-Ganesha "
                            "export specific options on "
                            "volume %s", cls.volname)
                        return False
                    g.log.info(
                        "Successful in setting NFS-Ganesha export "
                        "specific volume options on volume %s", cls.volname)

            if "smb" in cls.mount_type or "cifs" in cls.mount_type:
                ret = share_volume_over_smb(mnode=cls.mnode,
                                            volname=cls.volname,
                                            smb_users_info=cls.smb_users_info)
                if not ret:
                    g.log.error("Failed to export volume %s "
                                "as SMB Share", cls.volname)
                    return False
                g.log.info("Successful in exporting volume %s as SMB Share",
                           cls.volname)

                # Set SMB share specific volume options
                if cls.smb_share_options:
                    g.log.info(
                        "Setting SMB share specific volume options "
                        "on volume %s", cls.volname)
                    ret = set_volume_options(mnode=cls.mnode,
                                             volname=cls.volname,
                                             options=cls.smb_share_options)
                    if not ret:
                        g.log.error(
                            "Failed to set SMB share "
                            "specific options "
                            "on volume %s", cls.volname)
                        return False
                    g.log.info(
                        "Successful in setting SMB share specific "
                        "volume options on volume %s", cls.volname)

        # Log Volume Info and Status
        g.log.info("Log Volume %s Info and Status", cls.volname)
        ret = log_volume_info_and_status(cls.mnode, cls.volname)
        if not ret:
            g.log.error("Logging volume %s info and status failed",
                        cls.volname)
            return False
        g.log.info("Successful in logging volume %s info and status",
                   cls.volname)

        return True
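
For reference, a hedged usage sketch of calling this helper from a test
class's setUpClass (the class name and the ExecutionError exception are
illustrative; the class attributes used above are assumed to be populated by
the framework):

    @classmethod
    def setUpClass(cls):
        super(SampleGlusterTest, cls).setUpClass()
        if not cls.setup_volume(volume_create_force=True):
            raise ExecutionError("Volume setup failed for %s" % cls.volname)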
Example #26
    def test_snapshot_while_rebalance(self):
        # pylint: disable=too-many-statements, missing-docstring
        # Start IO on all mounts.
        all_mounts_procs = []
        count = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Validate IO
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Get stat of all the files/dirs created.
        g.log.info("Get stat of all the files/dirs created.")
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")

        # Create one snapshot of volume using no-timestamp option
        cmd_str = ("gluster snapshot create %s %s %s" %
                   ("snapy", self.volname, "no-timestamp"))
        ret, _, _ = g.run(self.mnode, cmd_str)
        self.assertEqual(ret, 0,
                         ("Failed to create snapshot for %s" % self.volname))
        g.log.info("Snapshot snapy created successfully "
                   "for volume %s", self.volname)

        # Check for no of snaps using snap_list it should be 1
        snap_list = get_snap_list(self.mnode)
        self.assertEqual(
            1, len(snap_list), "Expected 1 snapshot, "
            "found %s snapshots" % len(snap_list))
        g.log.info("Successfully validated number of snaps.")

        # validate snap name
        self.assertIn("snapy", snap_list, "Snapshot 'snapy' not found "
                      "in snap list")
        g.log.info("Successfully validated snapshot name")

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # expanding volume
        g.log.info("Start adding bricks to volume %s", self.volname)
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to add bricks to "
                              "volume %s " % self.volname))
        g.log.info("Add brick successful")

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed "
                              "on volume %s", self.volname))
        g.log.info(
            "Successful in logging volume info and status "
            "of volume %s", self.volname)

        # Verify volume's all process are online for 60 sec
        g.log.info("Verifying volume's all process are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   60)
        self.assertTrue(ret, ("Volume %s : All process are not "
                              "online", self.volname))
        g.log.info("Successfully Verified volume %s "
                   "processes are online", self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, err = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0,
                         ("Failed to start rebalance on "
                          "the volume %s with error %s" % (self.volname, err)))
        g.log.info("Successfully started rebalance on the "
                   "volume %s", self.volname)

        # Log Rebalance status
        g.log.info("Log Rebalance status")
        ret, _, _ = rebalance_status(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to log rebalance status")
        g.log.info("successfully logged rebalance status")

        # Create one snapshot of volume during rebalance
        cmd_str = ("gluster snapshot create %s %s %s" %
                   ("snapy_rebal", self.volname, "no-timestamp"))
        ret, _, _ = g.run(self.mnode, cmd_str)
        self.assertNotEqual(ret, 0, ("Snapshot 'snapy_rebal' was created "
                                     "for %s while rebalance was in "
                                     "progress" % self.volname))
        g.log.info("Snapshot 'snapy_rebal' not created as rebalance is in "
                   "progress; check the log for details")
        # Check for no of snaps using snap_list it should be 1
        snap_list = get_snap_list(self.mnode)
        self.assertEqual(
            1, len(snap_list), "Expected 1 snapshot, "
            "found %s snapshots" % len(snap_list))
        g.log.info("Successfully validated number of snaps.")

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete "
                              "on the volume %s", self.volname))
        g.log.info("Rebalance is successfully complete on "
                   "the volume %s", self.volname)

        # Check Rebalance status after rebalance is complete
        g.log.info("Checking Rebalance status")
        ret, _, _ = rebalance_status(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to get rebalance status for "
                                  "the volume %s", self.volname))
        g.log.info("Successfully got rebalance status of the "
                   "volume %s", self.volname)

        # Create one snapshot of volume post rebalance with same name
        cmd_str = ("gluster snapshot create %s %s %s" %
                   ("snapy_rebal", self.volname, "no-timestamp"))
        ret, _, _ = g.run(self.mnode, cmd_str)
        self.assertEqual(ret, 0,
                         ("Failed to create snapshot for %s" % self.volname))
        g.log.info(
            "Snapshot snapy_rebal created successfully "
            "for volume %s", self.volname)

        # Check for no of snaps using snap_list it should be 2
        snap_list = get_snap_list(self.mnode)
        self.assertEqual(
            2, len(snap_list), "Expected 2 snapshots, "
            "found %s snapshots" % len(snap_list))
        g.log.info("Successfully validated number of snaps.")

        # validate snap name
        self.assertIn("snapy_rebal", snap_list, " snap not found")
        g.log.info("Successfully validated names of snap")
    def test_self_heal_daemon(self):
        """
        Test Data-Self-Heal(heal command)
        Description:
        - Create directory test_hardlink_self_heal
        - Create directory test_data_self_heal
        - Creating files for hardlinks and data files
        - Get arequal before getting bricks offline
        - Select bricks to bring offline
        - Bring brick offline
        - Create hardlinks and append data to data files
        - Bring brick online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal after getting bricks online
        - Select bricks to bring offline
        - Bring brick offline
        - Truncate data to data files and verify hardlinks
        - Bring brick online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal again

        """
        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        # Creating directory test_hardlink_self_heal
        ret = mkdir(
            self.mounts[0].client_system,
            "{}/test_hardlink_self_heal".format(self.mounts[0].mountpoint))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info(
            "Directory 'test_hardlink_self_heal' on %s created "
            "successfully", self.mounts[0])

        # Creating directory test_data_self_heal
        ret = mkdir(self.mounts[0].client_system,
                    "{}/test_data_self_heal".format(self.mounts[0].mountpoint))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info(
            "Directory 'test_data_self_heal' on %s created "
            "successfully", self.mounts[0])

        # Creating files for hardlinks and data files
        cmd = ('cd %s/test_hardlink_self_heal;for i in `seq 1 5`;'
               'do mkdir dir.$i ; for j in `seq 1 10` ; do dd if='
               '/dev/urandom of=dir.$i/file.$j bs=1k count=$j;done; done;'
               'cd ..' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to create file on mountpoint")
        g.log.info("Successfully created files on mountpoint")

        cmd = ('cd %s/test_data_self_heal;for i in `seq 1 100`;'
               'do dd if=/dev/urandom of=file.$i bs=128K count=$i;done;'
               'cd ..' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to create file on mountpoint")
        g.log.info("Successfully created files on mountpoint")

        # Get arequal before getting bricks offline
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Arequal before getting bricks offline - %s',
                   result_before_offline)

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline,
                             "Failed to select bricks to bring offline")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(
                bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Append data to data files and create hardlinks
        cmd = ('cd %s/test_data_self_heal;for i in `seq 1 100`;'
               'do dd if=/dev/urandom of=file.$i bs=512K count=$i ; done ;'
               'cd .. ' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to modify data files.")
        g.log.info("Successfully modified data files")

        cmd = ('cd %s/test_hardlink_self_heal;for i in `seq 1 5` ;do '
               'for j in `seq 1 10`;do ln dir.$i/file.$j dir.$i/link_file.$j;'
               'done ; done ; cd .. ' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Hardlinks creation failed")
        g.log.info("Successfully created hardlinks of files")

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks {} online'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret,
            ("Volume {} : All process are not online".format(self.volname)))
        g.log.info("Volume %s : All process are online", self.volname)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Arequal after getting bricks online '
                   'is %s', result_after_online)

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline,
                             "Failed to select bricks to bring offline")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(
                bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Truncate data to data files and verify hardlinks
        cmd = ('cd %s/test_data_self_heal ; for i in `seq 1 100` ;'
               'do truncate -s $(( $i * 128)) file.$i ; done ; cd ..' %
               self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to truncate files")
        g.log.info("Successfully truncated files on mountpoint")

        file_path = ('%s/test_hardlink_self_heal/dir{1..5}/file{1..10}' %
                     (self.mounts[0].mountpoint))
        link_path = ('%s/test_hardlink_self_heal/dir{1..5}/link_file{1..10}' %
                     (self.mounts[0].mountpoint))
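        # Hardlinks share their inode with the original file, so the stat
        # output of each file.* and its matching link_file.* should be
        # identical even while one brick is offline.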
        file_stat = get_file_stat(self.mounts[0].client_system, file_path)
        link_stat = get_file_stat(self.mounts[0].client_system, link_path)
        self.assertEqual(file_stat, link_stat, "Verification of hardlinks "
                         "failed")
        g.log.info("Successfully verified hardlinks")

        # Bring brick online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks {} online'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret,
            ("Volume {} : All process are not online".format(self.volname)))
        g.log.info("Volume %s : All process are online", self.volname)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')
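
        # The docstring also lists a final split-brain check and arequal
        # collection after the truncate/heal cycle; a minimal sketch of
        # those remaining steps, mirroring the earlier ones:
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal again after heal of the truncated files
        ret, result_after_truncate = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Arequal after truncating files and healing is %s',
                   result_after_truncate)
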
    def test_glustershd_on_all_volume_types(self):
        """
        Test Script to verify the glustershd server vol file
        has only entries for replicate volumes

        * Create multiple volumes and start all volumes
        * Check the glustershd processes - Only One glustershd should be listed
        * Check the glustershd server vol file - should contain entries only
                                             for replicated involved volumes
        * Add bricks to the replicate volume - it should convert to
                                               distributed-replicate
        * Check the glustershd server vol file - newly added bricks
                                                 should present
        * Check the glustershd processes - Only 1 glustershd should be listed

        """
        # pylint: disable=too-many-statements
        nodes = self.servers

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, glustershd_pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % glustershd_pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)

        # For all the volumes, check whether bricks present in
        # glustershd server vol file
        volume_list = get_volume_list(self.mnode)
        for volume in volume_list:
            g.log.info("Volume Name: %s", volume)
            volume_type_info = get_volume_type_info(self.mnode, volume)
            volume_type = (volume_type_info['volume_type_info']['typeStr'])

            # get the bricks for the volume
            g.log.info("Fetching bricks for the volume : %s", volume)
            bricks_list = get_all_bricks(self.mnode, volume)
            g.log.info("Brick List : %s", bricks_list)

            # validate the bricks present in volume info with
            # glustershd server volume file
            g.log.info("Start parsing file %s on "
                       "node %s", self.GLUSTERSHD, self.mnode)
            ret = do_bricks_exist_in_shd_volfile(self.mnode, volume,
                                                 bricks_list)
            if volume_type == 'Distribute':
                self.assertFalse(ret,
                                 ("Bricks exist in glustershd server "
                                  "volume file for %s Volume" % volume_type))
                g.log.info(
                    "EXPECTED : Bricks don't exist in glustershd "
                    "server volume file for %s Volume", volume_type)
            else:
                self.assertTrue(ret, ("Brick List from volume info is "
                                      "different from glustershd server "
                                      "volume file. Please check log "
                                      "file for details"))
                g.log.info(
                    "Bricks exist in glustershd server volume file "
                    "for %s Volume", volume_type)

        # expanding volume for Replicate
        for volume in volume_list:
            volume_type_info = get_volume_type_info(self.mnode, volume)
            volume_type = (volume_type_info['volume_type_info']['typeStr'])
            if volume_type == 'Replicate':
                g.log.info("Start adding bricks to volume %s", volume)
                ret = expand_volume(self.mnode, volume, self.servers,
                                    self.all_servers_info)
                self.assertTrue(ret, ("Failed to add bricks to "
                                      "volume %s " % volume))
                g.log.info("Add brick successful")

                # Log Volume Info and Status after expanding the volume
                g.log.info("Logging volume info and Status after "
                           "expanding volume")
                ret = log_volume_info_and_status(self.mnode, volume)
                self.assertTrue(ret, ("Logging volume info and status failed "
                                      "on volume %s", volume))
                g.log.info(
                    "Successful in logging volume info and status "
                    "of volume %s", volume)

                # Verify volume's all process are online for 60 sec
                g.log.info("Verifying volume's all process are online")
                ret = wait_for_volume_process_to_be_online(
                    self.mnode, volume, 60)
                self.assertTrue(ret, ("Volume %s : All process are not "
                                      "online", volume))
                g.log.info(
                    "Successfully verified volume %s processes "
                    "are online", volume)

                # check the type for the replicate volume
                volume_type_info_for_replicate_after_adding_bricks = \
                    get_volume_type_info(self.mnode, volume)
                volume_type_for_replicate_after_adding_bricks = \
                    (volume_type_info_for_replicate_after_adding_bricks
                     ['volume_type_info']['typeStr'])

                self.assertEqual(volume_type_for_replicate_after_adding_bricks,
                                 'Distributed-Replicate',
                                 ("Replicate volume type is not converted to "
                                  "Distributed-Replicate after adding bricks"))
                g.log.info("Replicate Volume is successfully converted to"
                           " Distributed-Replicate after adding bricks")

                # get the bricks for the volume after expanding
                bricks_list_after_expanding = get_all_bricks(
                    self.mnode, volume)
                g.log.info("Brick List after expanding "
                           "volume: %s", bricks_list_after_expanding)

                # validate the bricks present in volume info
                # with glustershd server volume file after adding bricks
                g.log.info("Starting parsing file %s", self.GLUSTERSHD)
                ret = do_bricks_exist_in_shd_volfile(
                    self.mnode, volume, bricks_list_after_expanding)

                self.assertTrue(ret, ("Brick List from volume info is "
                                      "different from glustershd server "
                                      "volume file after expanding bricks. "
                                      "Please check log file for details"))
                g.log.info("Brick List from volume info is same as from "
                           "glustershd server volume file after "
                           "expanding bricks.")

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, glustershd_pids_after_adding_bricks = \
            get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret,
                        ("Either no self heal daemon process found or "
                         "more than one self heal daemon process "
                         "found: %s" % glustershd_pids_after_adding_bricks))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)

        self.assertNotEqual(
            glustershd_pids, glustershd_pids_after_adding_bricks,
            "Self Heal Daemon process is same before and"
            " after adding bricks")
        g.log.info("Self Heal Daemon Process is different before and "
                   "after adding bricks")
    def test_data_self_heal_algorithm_full_default(self):
        """
        Test Volume Option - 'cluster.data-self-heal-algorithm' : 'full'

        Description:
        - set the volume option "data-self-heal-algorithm" to value "full"
        - create IO
        - bring down all bricks processes from selected set
        - modify the data
        - calculate arequal
        - bring bricks online
        - start healing
        - calculate arequal and compare with arequal before bringing bricks
        offline and after bringing bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Setting options
        g.log.info('Setting options "data-self-heal-algorithm": "full"...')
        options = {"data-self-heal-algorithm": "full"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'data-self-heal-algorithm' is set to 'full' "
                   "successfully")

        # Creating files on client side
        all_mounts_procs = []
        g.log.info("Generating data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        # Creating files
        command = "/usr/bin/env python %s create_files -f 100 %s" % (
            self.script_upload_path, self.mounts[0].mountpoint)

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))
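        # For non-tiered volumes the hot/cold tier lists are empty, so the
        # combined list reduces to the plain volume bricks; filter(None, ...)
        # drops any empty entries.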

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        command = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 1M %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
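        # Note: 'assertItemsEqual' exists only on Python 2; on Python 3
        # the equivalent unittest assertion is 'assertCountEqual'.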
        self.assertItemsEqual(result_before_online, result_after_online,
                              'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')
    def setUpClass(cls):
        """
        setup volume and initialize necessary variables
        which is used in tests
        """
        # calling GlusterBaseClass setUpClass
        cls.get_super_method(cls, 'setUpClass')()

        cls.default_volume_type_config = {
            'replicated': {
                'type': 'replicated',
                'replica_count': 2,
                'transport': 'tcp'
            },
            'dispersed': {
                'type': 'dispersed',
                'disperse_count': 6,
                'redundancy_count': 2,
                'transport': 'tcp'
            },
            'distributed': {
                'type': 'distributed',
                'dist_count': 2,
                'transport': 'tcp'
            },
            'distributed-replicated': {
                'type': 'distributed-replicated',
                'dist_count': 2,
                'replica_count': 3,
                'transport': 'tcp'
            }
        }

        # Setup Volume for all the volume types
        cls.volume_configs = []
        for volume_type in cls.default_volume_type_config:
            cls.volume_configs.append({
                'name': 'testvol_%s' % volume_type,
                'servers': cls.servers,
                'voltype': cls.default_volume_type_config[volume_type]
            })

        for volume_config in cls.volume_configs:
            ret = setup_volume(mnode=cls.mnode,
                               all_servers_info=cls.all_servers_info,
                               volume_config=volume_config)
            volname = volume_config['name']
            if not ret:
                raise ExecutionError("Failed to setup Volume %s" % volname)
            g.log.info("Successful in setting up volume %s", volname)

            # Verify volume's all process are online for 60 sec
            g.log.info("Verifying volume's all process are online")
            ret = wait_for_volume_process_to_be_online(cls.mnode, volname, 60)
            if not ret:
                raise ExecutionError("Volume %s : All process are not online" %
                                     volname)
            g.log.info("Successfully Verified volume %s processes are online",
                       volname)

        # Verify glustershd process releases its parent process
        g.log.info("Verifying Self Heal Daemon process is daemonized")
        ret = is_shd_daemonized(cls.servers)
        if not ret:
            raise ExecutionError("Self Heal Daemon process was still"
                                 " holding parent process.")
        g.log.info("Self Heal Daemon processes are online")

        cls.GLUSTERSHD = "/var/lib/glusterd/glustershd/glustershd-server.vol"
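
    @classmethod
    def tearDownClass(cls):
        """
        Clean up the volumes created in setUpClass
        """
        # Hedged sketch, not part of the original example: tear down each
        # volume created above, assuming glustolibs'
        # volume_libs.cleanup_volume() is imported in this test module.
        for volume_config in cls.volume_configs:
            ret = cleanup_volume(cls.mnode, volume_config['name'])
            if not ret:
                raise ExecutionError("Failed to cleanup volume %s" %
                                     volume_config['name'])
            g.log.info("Successfully cleaned up volume %s",
                       volume_config['name'])

        cls.get_super_method(cls, 'tearDownClass')()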