def test_self_heal_daemon(self):
        """
        Test Data Self-Heal (heal command)
        Description:
        - Create directory test_hardlink_self_heal
        - Create directory test_data_self_heal
        - Creating files for hardlinks and data files
        - Get arequal before getting bricks offline
        - Select bricks to bring offline
        - Bring brick offline
        - Create hardlinks and append data to data files
        - Bring brick online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal after getting bricks online
        - Select bricks to bring offline
        - Bring brick offline
        - Truncate data to data files and verify hardlinks
        - Bring brick online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal again

        """
        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        # Creating directory test_hardlink_self_heal
        ret = mkdir(
            self.mounts[0].client_system,
            "{}/test_hardlink_self_heal".format(self.mounts[0].mountpoint))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info(
            "Directory 'test_hardlink_self_heal' on %s created "
            "successfully", self.mounts[0])

        # Creating directory test_data_self_heal
        ret = mkdir(self.mounts[0].client_system,
                    "{}/test_data_self_heal".format(self.mounts[0].mountpoint))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info(
            "Directory 'test_data_self_heal' on %s created "
            "successfully", self.mounts[0])

        # Creating files for hardlinks and data files
        cmd = ('cd %s/test_hardlink_self_heal;for i in `seq 1 5`;'
               'do mkdir dir.$i ; for j in `seq 1 10` ; do dd if='
               '/dev/urandom of=dir.$i/file.$j bs=1k count=$j;done; done;'
               'cd ..' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to create file on mountpoint")
        g.log.info("Successfully created files on mountpoint")

        cmd = ('cd %s/test_data_self_heal;for i in `seq 1 100`;'
               'do dd if=/dev/urandom of=file.$i bs=128K count=$i;done;'
               'cd ..' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to create file on mountpoint")
        g.log.info("Successfully created files on mountpoint")

        # Get arequal before getting bricks offline
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Arequal before getting bricks offline is %s',
                   result_before_offline)

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(
                bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)
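        # While these bricks are down, writes from the mount still succeed
        # (client quorum permitting) and land only on the remaining replica
        # bricks; the modifications below are what self-heal must later copy
        # to the bricks that were offline.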

        # Append data to data files and create hardlinks
        cmd = ('cd %s/test_data_self_heal;for i in `seq 1 100`;'
               'do dd if=/dev/urandom of=file.$i bs=512K count=$i ; done ;'
               'cd .. ' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to modify data files.")
        g.log.info("Successfully modified data files")

        cmd = ('cd %s/test_hardlink_self_heal;for i in `seq 1 5` ;do '
               'for j in `seq 1 10`;do ln dir.$i/file.$j dir.$i/link_file.$j;'
               'done ; done ; cd .. ' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Hardlinks creation failed")
        g.log.info("Successfully created hardlinks of files")

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks {} online'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret,
            ("Volume {} : All process are not online".format(self.volname)))
        g.log.info("Volume %s : All process are online", self.volname)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')
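        # The two checks above map roughly to the CLI commands
        #   gluster volume heal <volname> info
        #   gluster volume heal <volname> info split-brain
        # monitor_heal_completion keeps polling until no entries are pending
        # or its timeout expires.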

        # Get arequal after getting bricks online
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Arequal after getting bricks online '
                   'is %s', result_after_online)

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(
                bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Truncate data to data files and verify hardlinks
        cmd = ('cd %s/test_data_self_heal ; for i in `seq 1 100` ;'
               'do truncate -s $(( $i * 128)) file.$i ; done ; cd ..' %
               self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to truncate files")
        g.log.info("Successfully truncated files on mountpoint")

        file_path = ('%s/test_hardlink_self_heal/dir.{1..5}/file.{1..10}' %
                     (self.mounts[0].mountpoint))
        link_path = ('%s/test_hardlink_self_heal/dir.{1..5}/link_file.{1..10}'
                     % (self.mounts[0].mountpoint))
        file_stat = get_file_stat(self.mounts[0], file_path)
        link_stat = get_file_stat(self.mounts[0], link_path)
        self.assertEqual(file_stat, link_stat,
                         "Verification of hardlinks failed")
        g.log.info("Successfully verified hardlinks")

        # Bring brick online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks {} online'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret,
            ("Volume {} : All process are not online".format(self.volname)))
        g.log.info("Volume %s : All process are online", self.volname)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

    def test_server_side_healing_happens_only_when_glustershd_running(self):
        """
        Test script which verifies that server-side healing happens only if
        the heal daemon is running on the node where the source brick
        resides.

         * Create and start the Replicate volume
         * Check the glustershd processes - Only 1 glustershd should be listed
         * Bring down the bricks without affecting the cluster
         * Create files on volume
         * Kill glustershd on the nodes where the source bricks are running
         * Bring up the bricks which were brought down in the previous step
         * Check the heal info - it must show pending heals; healing should
           not happen since glustershd is down on the source nodes
         * Issue heal
         * Trigger client-side heal
         * Heal should complete successfully
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-lines
        # Setting Volume options
        options = {
            "metadata-self-heal": "on",
            "entry-self-heal": "on",
            "data-self-heal": "on"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Check the self-heal daemon process
        ret, pids = get_self_heal_daemon_pid(self.servers)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in verifying self heal daemon process"
            " on all nodes %s", self.servers)

        # Select the bricks to bring offline
        bricks_to_bring_offline = (select_volume_bricks_to_bring_offline(
            self.mnode, self.volname))
        g.log.info("Brick List to bring offline : %s", bricks_to_bring_offline)

        # Bring down the selected bricks
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Brought down the brick process "
                   "for %s", bricks_to_bring_offline)

        # Write files on all mounts
        all_mounts_procs, num_files_to_write = [], 100
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_files "
                   "-f %s --base-file-name file %s" %
                   (self.script_upload_path, num_files_to_write,
                    mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # Get online bricks list
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        g.log.info("Online Bricks for volume %s : %s", self.volname,
                   online_bricks)

        # Get the nodes where bricks are running
        bring_offline_glustershd_nodes = []
        for brick in online_bricks:
            bring_offline_glustershd_nodes.append(brick.split(":")[0])
        g.log.info("self heal deamon on nodes %s to be killed",
                   bring_offline_glustershd_nodes)

        # Kill the self heal daemon process on nodes
        ret = bring_self_heal_daemon_process_offline(
            bring_offline_glustershd_nodes)
        self.assertTrue(
            ret, ("Unable to bring self heal daemon process"
                  " offline for nodes %s" % bring_offline_glustershd_nodes))
        g.log.info(
            "Successfully brought down self heal process for "
            "nodes %s", bring_offline_glustershd_nodes)

        # Check the heal info
        heal_info = get_heal_info_summary(self.mnode, self.volname)
        g.log.info("Successfully got heal info %s for the volume %s",
                   heal_info, self.volname)

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))

        # Issue heal
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertFalse(ret,
                         ("Able to trigger heal on volume %s where "
                          "self heal daemon is not running" % self.volname))
        g.log.info(
            "Expected : Unable to trigger heal on volume %s where "
            "self heal daemon is not running", self.volname)

        # Wait for 130 sec to heal
        ret = monitor_heal_completion(self.mnode, self.volname, 130)
        self.assertFalse(ret, ("Heal Completed on volume %s" % self.volname))
        g.log.info("Expected : Heal pending on volume %s", self.volname)

        # Check the heal info
        heal_info_after_triggering_heal = get_heal_info_summary(
            self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)

        # Compare the pending heal entries with the number of files written
        for node in online_bricks:
            self.assertGreaterEqual(
                int(heal_info_after_triggering_heal[node]['numberOfEntries']),
                num_files_to_write,
                ("Some of the files are healed from source bricks %s where "
                 "self heal daemon is not running" % node))
        g.log.info("EXPECTED: No files are healed from source bricks where "
                   "self heal daemon is not running")

        # Unmount and Mount volume again as volume options were set
        # after mounting the volume
        for mount_obj in self.mounts:
            ret, _, _ = umount_volume(mount_obj.client_system,
                                      mount_obj.mountpoint)
            self.assertEqual(ret, 0,
                             "Failed to unmount %s" % mount_obj.client_system)
            ret, _, _ = mount_volume(self.volname,
                                     mtype='glusterfs',
                                     mpoint=mount_obj.mountpoint,
                                     mserver=self.mnode,
                                     mclient=mount_obj.client_system)
            self.assertEqual(ret, 0,
                             "Failed to mount %s" % mount_obj.client_system)

        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s read %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "Reads failed on some of the clients")
        g.log.info("Reads successful on all mounts")

        # Wait for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Unable to heal the pending entries")
        g.log.info("Successfully healed the pending entries for volume %s",
                   self.volname)

    def test_gluster_clone_heal(self):
        """
        Test gluster compilation on mount point(Heal command)
        - Creating directory test_compilation
        - Compile gluster on mountpoint
        - Select bricks to bring offline
        - Bring brick offline
        - Validate IO
        - Bring bricks online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal after getting bricks online
        - Compile gluster on mountpoint again
        - Select bricks to bring offline
        - Bring brick offline
        - Validate IO
        - Bring bricks online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal after getting bricks online
        """
        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        # Creating directory test_compilation
        ret = mkdir(self.mounts[0].client_system,
                    "{}/test_compilation".format(self.mounts[0].mountpoint))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info(
            "Directory 'test_compilation' on %s created "
            "successfully", self.mounts[0])

        # Compile gluster on mountpoint
        cmd = ("cd %s/test_compilation ; rm -rf glusterfs; git clone"
               " git://github.com/gluster/glusterfs.git ; cd glusterfs ;"
               " ./autogen.sh ;./configure CFLAGS='-g3 -O0 -DDEBUG'; make ;"
               " cd ../..;" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system, cmd)
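        # The clone and compile run asynchronously so bricks can be taken
        # offline while the build is still generating a steady stream of
        # creates, writes and renames on the mount; the heal later has to
        # reconcile all of it.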

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(
                bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Validate IO
        self.assertTrue(validate_io_procs([proc], self.mounts[0]),
                        "IO failed on some of the clients")

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks {} online'.format(bricks_to_bring_offline))

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret,
            ("Volume {} : All process are not online".format(self.volname)))

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info("Arequal of mountpoint %s", result_after_online)

        # Compile gluster on mountpoint again
        proc1 = g.run_async(self.mounts[0].client_system, cmd)

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(
                bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))

        # Validate IO
        self.assertTrue(validate_io_procs([proc1], self.mounts[0]),
                        "IO failed on some of the clients")

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks {} online'.format(bricks_to_bring_offline))

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret,
            ("Volume {} : All process are not online".format(self.volname)))

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')

        # Get arequal after getting bricks online
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info("Arequal of mountpoint %s", result_after_online)

    def test_existing_glustershd_should_take_care_of_self_healing(self):
        """
        Test Script which verifies that the existing glustershd should take
        care of self healing

        * Create and start the Replicate volume
        * Check the glustershd processes - Note the pids
        * Bring down one brick (say brick1) without affecting the cluster
        * Create 1000 files on volume
        * Bring up brick1 which was brought down in the previous step
        * Check the heal info - proactive self-healing should start
        * Bring down brick1 again
        * Wait for 60 sec and bring up brick1 again
        * Check the glustershd processes - pids should be different
        * Monitor the heal till it is complete

        """
        # pylint: disable=too-many-locals,too-many-lines,too-many-statements
        nodes = self.servers

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)
        glustershd_pids = pids

        # select the bricks to bring offline
        g.log.info("Selecting bricks to brought offline for volume %s",
                   self.volname)
        bricks_to_bring_offline = \
            select_volume_bricks_to_bring_offline(self.mnode,
                                                  self.volname)
        g.log.info("Brick List to bring offline : %s", bricks_to_bring_offline)

        # Bring down the selected bricks
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)

        # get the bricks which are running
        g.log.info("getting the brick list which are online")
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        g.log.info("Online Bricks for volume %s : %s", self.volname,
                   online_bricks)

        # write 1MB files to the mounts
        g.log.info("Starting IO on all mounts...")
        g.log.info("mounts: %s", self.mounts)
        all_mounts_procs = []
        cmd = ("for i in `seq 1 1000`; "
               "do dd if=/dev/urandom of=%s/file_$i "
               "bs=1M count=1; "
               "done" % self.mounts[0].mountpoint)
        g.log.info(cmd)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # check the heal info
        g.log.info("Get the pending heal info for the volume %s", self.volname)
        heal_info = get_heal_info_summary(self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)
        g.log.info("Heal Info for volume %s : %s", self.volname, heal_info)

        # Bring bricks online
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))
        g.log.info("Successfully brought all bricks: %s online",
                   bricks_to_bring_offline)

        # Wait for 90 sec to start self healing
        g.log.info('Waiting for 90 sec to start self healing')
        time.sleep(90)

        # check the heal info
        g.log.info("Get the pending heal info for the volume %s", self.volname)
        heal_info_after_brick_online = get_heal_info_summary(
            self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)
        g.log.info("Heal Info for volume %s : %s", self.volname,
                   heal_info_after_brick_online)

        # check heal pending is decreased
        flag = False
        for brick in online_bricks:
            if int(heal_info_after_brick_online[brick]['numberOfEntries'])\
                    < int(heal_info[brick]['numberOfEntries']):
                flag = True
                break

        self.assertTrue(flag, "Pro-active self heal is not started")
        g.log.info("Pro-active self heal is started")

        # bring down bricks again
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)

        # wait for 60 sec and bring up the brick again
        g.log.info('Waiting for 60 sec before bringing up the brick again')
        time.sleep(60)
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))
        g.log.info("Successfully brought all bricks: %s online",
                   bricks_to_bring_offline)

        # Verify that the glustershd process has daemonized (released its
        # parent process)
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)
        shd_pids_after_bricks_online = pids

        # compare the glustershd pids
        self.assertNotEqual(glustershd_pids, shd_pids_after_bricks_online,
                            ("self heal daemon processes are the same before"
                             " and after bringing up bricks online"))
        g.log.info("EXPECTED : self heal daemon processes are different "
                   "before and after bringing up bricks online")

        # wait for heal to complete
        g.log.info("Monitoring the heal.....")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret,
                        ("Heal is not completed on volume %s" % self.volname))
        g.log.info("Heal Completed on volume %s", self.volname)

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

    def perform_test(self, ctime):
        """
        Testcase steps:
        1. Enable/disable features.ctime based on the function argument.
        2. Create a directory on the mount point.
        3. Kill a brick and create a file inside the directory.
        4. Bring the brick online.
        5. Trigger heal and wait for its completion.
        6. Verify that the atime, mtime and ctime of the directory are the
           same on all bricks of the replica.
        """
        if ctime:
            option = {'features.ctime': 'on'}
        else:
            option = {'features.ctime': 'off'}
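        # With features.ctime enabled, gluster stores consistent a/m/ctime
        # values per inode in the trusted.glusterfs.mdata xattr on the
        # bricks; that is why the ctime case below compares this xattr while
        # the non-ctime case falls back to comparing plain stat timestamps.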
        ret = set_volume_options(self.mnode, self.volname, option)
        self.assertTrue(
            ret, 'Failed to set option %s on %s' % (option, self.volname))

        client, m_point = (self.mounts[0].client_system,
                           self.mounts[0].mountpoint)

        dirpath = '{}/dir1'.format(m_point)
        ret = mkdir(client, dirpath)
        self.assertTrue(ret, 'Unable to create a directory from mount point')

        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline, "List is empty")
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(
                bricks_to_bring_offline))
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))

        cmd = 'touch {}/file1'.format(dirpath)
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, 'Unable to create file from mount point')

        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            bricks_to_bring_offline,
            bring_bricks_online_methods=['volume_start_force'])
        self.assertTrue(
            ret,
            'Failed to bring bricks {} online'.format(bricks_to_bring_offline))
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Starting heal failed')
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        if ctime:
            ret = self.are_mdata_xattrs_equal()
            self.assertTrue(ret,
                            "glusterfs.mdata mismatch for {}".format(dirpath))
        else:
            ret = self.are_stat_timestamps_equal()
            self.assertTrue(ret, "stat mismatch for {}".format(dirpath))

        ret = rmdir(client, dirpath, force=True)
        self.assertTrue(ret, 'Unable to delete directory from mount point')

    def test_gfid_self_heal(self):
        """
        Test GFID self heal
        Description:
        - Creating directory test_gfid_self_heal
        - Write Deep directories and files
        - Get arequal before getting bricks offline
        - Select bricks to bring offline
        - Bring brick offline
        - Delete directory on mountpoint where data is written
        - Create the same directory and write same data
        - Bring bricks online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal after getting bricks online
        """
        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        # Creating directory test_gfid_self_heal
        ret = mkdir(self.mounts[0].client_system,
                    "{}/test_gfid_self_heal".format(self.mounts[0].mountpoint))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info(
            "Directory 'test_gfid_self_heal' on %s created "
            "successfully", self.mounts[0])

        # Write Deep directories and files
        count = 1
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d --dir-depth 2 "
                   "--dir-length 10 --max-num-of-dirs 5 "
                   "--num-of-files 5 %s/dir1" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertEqual(ret, 0, "Failed to create files on mountpoint")
            g.log.info("Successfully created files on mountpoint")
            count += 10

        # Get arequal before getting bricks offline
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Arequal before getting bricks offline '
                   'is %s', result_before_offline)

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(
                bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Delete directory on mountpoint where data is written
        cmd = ('rm -rf -v %s/test_gfid_self_heal' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to delete directory")
        g.log.info("Directory deleted successfully for %s", self.mounts[0])

        # Create the same directory and write same data
        ret = mkdir(self.mounts[0].client_system,
                    "{}/test_gfid_self_heal".format(self.mounts[0].mountpoint))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info(
            "Directory 'test_gfid_self_heal' on %s created "
            "successfully", self.mounts[0])

        # Write the same files again
        count = 1
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d --dir-depth 2 "
                   "--dir-length 10 --max-num-of-dirs 5 "
                   "--num-of-files 5 %s/dir1" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertEqual(ret, 0, "Failed to create files on mountpoint")
            g.log.info("Successfully created files on mountpoint")
            count += 10
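        # Entries recreated while a brick is down are assigned GFIDs that
        # the offline brick has never seen; once that brick comes back, the
        # entry/GFID self-heal has to reconcile the namespace, which is the
        # scenario this test exercises.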

        # Bring bricks online
        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            bricks_to_bring_offline,
            bring_bricks_online_methods=['volume_start_force'])
        self.assertTrue(
            ret,
            'Failed to bring bricks {} online'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)
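        # The 'volume_start_force' method used above maps to
        # `gluster volume start <volname> force`, which starts only the
        # brick processes that are down without disturbing the running ones.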

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret,
            ("Volume {} : All process are not online".format(self.volname)))
        g.log.info("Volume %s : All process are online", self.volname)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Arequal after getting bricks online '
                   'is %s', result_after_online)