def _restart_volume_and_bring_all_offline_bricks_online(self):
        """Force-start the volume and verify the offline bricks are back up.

        Asserts that heal is still pending, brings the bricks listed in
        ``self.bricks_to_bring_offline`` online using the
        ``volume_start_force`` method and asserts that they report online.
        """
        # Heal is expected to be pending here; a completed heal fails the test
        heal_done = is_heal_complete(self.mnode, self.volname)
        self.assertFalse(heal_done, 'Heal is completed')
        g.log.info('Heal is pending')

        started = bring_bricks_online(
            self.mnode, self.volname, self.bricks_to_bring_offline,
            bring_bricks_online_methods=['volume_start_force'])
        self.assertTrue(
            started,
            'Failed to bring bricks %s online' % self.bricks_to_bring_offline)

        # Confirm that the previously offline bricks now report online
        online = are_bricks_online(
            self.mnode, self.volname, self.bricks_to_bring_offline)
        self.assertTrue(
            online,
            'Bricks not online %s even after restart'
            % self.bricks_to_bring_offline)

        g.log.info('Bringing bricks %s online is successful',
                   self.bricks_to_bring_offline)
def wait_to_heal_complete(gluster_pod,
                          hostname=None,
                          timeout=300,
                          wait_step=5):
    """Wait until heal is complete for every volume on gluster.

    Args:
        gluster_pod (podcmd | str): gluster pod class object has gluster
                                    pod and ocp master node or gluster
                                    pod name
        hostname (str): master node on which gluster pod exists
        timeout (int): overall time budget given to the waiter; a single
            waiter is shared by all the volumes. Default is 300.
        wait_step (int): interval given to the waiter between heal checks.
            Default is 5.
    Raises:
        AssertionError: if the volume list cannot be fetched or the waiter
            expires before all the volumes are healed.
    """
    gluster_pod = _get_gluster_pod(gluster_pod, hostname)

    gluster_vol_list = get_volume_list(gluster_pod)
    if not gluster_vol_list:
        raise AssertionError("failed to get gluster volume list")

    _waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for gluster_vol in gluster_vol_list:
        for w in _waiter:
            if is_heal_complete(gluster_pod, gluster_vol):
                # Reset the attempt counter so that the first heal check of
                # the next volume is not preceded by a redundant sleep of
                # 'wait_step'.
                # NOTE(review): assumes Waiter gates its sleep on
                # '_attempt' -- confirm against the waiter module.
                _waiter._attempt = 0
                break

    if w.expired:
        err_msg = ("reached timeout waiting for all the gluster volumes "
                   "to reach the 'healed' state.")
        g.log.error(err_msg)
        raise AssertionError(err_msg)
Example #3
0
    def test_replica_to_arbiter_volume_with_io(self):
        """
        Description: Convert replica 3 to arbiter while IO is in progress

        Steps :
        1) Create a replica 3 volume and start volume.
        2) Set client side self heal off.
        3) Fuse mount the volume.
        4) Create a directory and write data into it
           (linux tarball is untarred from the client).
        5) While the IO is running, convert the volume to an arbiter
           volume (remove-brick to replica 2, then add-brick with the
           new arbiter brick).
        6) Wait for the IO to finish.
        7) Issue gluster volume heal.
        8) Heal should be completed with no files in split-brain.
        """

        # pylint: disable=too-many-statements
        # Directory on the first mount which receives the untarred data
        mountpoint = self.mounts[0].mountpoint
        self.linux_untar_dir = "{}/linuxuntar".format(mountpoint)
        client = self.clients[0]
        dir_created = mkdir(client, self.linux_untar_dir)
        self.assertTrue(dir_created,
                        "Failed to create dir linuxuntar for untar")

        # Kick off the linux untar into that directory
        self.io_process = run_linux_untar(
            client, mountpoint, dirs=('linuxuntar',))
        self.is_io_running = True

        # Convert replicated to arbiter volume
        self._convert_replicated_to_arbiter_volume()

        # Block until the untar has finished
        io_status = self._wait_for_untar_completion()
        self.assertFalse(io_status, "IO didn't complete or failed on client")
        self.is_io_running = False

        # Launch heal
        heal_started = trigger_heal(self.mnode, self.volname)
        self.assertTrue(heal_started, 'Heal is not started')
        g.log.info('Healing is started')

        # Follow the heal until it finishes
        heal_monitored = monitor_heal_completion(
            self.mnode, self.volname, timeout_period=3600)
        self.assertTrue(heal_monitored, 'Heal has not yet completed')

        # Double-check that nothing is left to heal
        heal_done = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(heal_done, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # No file may be in split-brain
        split_brain = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(split_brain, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')
Example #4
0
    def test_gfid_assignment_on_lookup(self):
        """Check gfid assignment for directories created on the backend.

        - Create a distinct directory (dir1, dir2, ...) directly on each
          brick of the volume.
        - Do named lookups from the mount by trying to write to "dir1",
          touch "dir2" and mkdir "dir3".
        - Assert that heal is complete.
        - Run self.verify_gfid on dir1, dir2 and dir3.
        - Assert that exactly 3 entries are listed on the mount.
        """
        g.log.info("Creating directories on the backend.")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        # Each brick gets its own directory name: dir1 on the first brick,
        # dir2 on the second and so on.
        for i, brick in enumerate(bricks_list, start=1):
            brick_node, brick_path = brick.split(":")
            ret, _, _ = g.run(brick_node, "mkdir %s/dir%d" % (brick_path, i))
            self.assertEqual(ret, 0, "Dir creation failed on %s" % brick_path)
        g.log.info("Created directories on the backend.")

        # To circumvent is_fresh_file() check in glusterfs code.
        time.sleep(2)

        # Do named lookup on directories from mount
        _, _, err = g.run(self.clients[0],
                          "echo Hi >  %s/dir1" % self.mounts[0].mountpoint)
        # The redirect must be rejected by the shell because dir1 is a
        # directory; the exact stderr text is compared.
        errmsg = ("bash: %s/dir1: Is a directory\n" %
                  self.mounts[0].mountpoint)
        msg = "expected %s, but returned %s" % (errmsg, err)
        self.assertEqual(err, errmsg, msg)
        g.log.info("Writing a file with same name as directory \"dir1\" failed"
                   " as expected on mount point.")

        ret, _, _ = g.run(self.clients[0],
                          "touch %s/dir2" % self.mounts[0].mountpoint)
        self.assertEqual(
            ret, 0, "Touch of file with same name as directory "
            "\"dir2\" failed.")
        g.log.info("Touch of file with same name as directory \"dir2\" passed"
                   " but it will not create the file since a directory is "
                   "already present with the same name.")

        ret, _, _ = g.run(self.clients[0],
                          "mkdir %s/dir3" % self.mounts[0].mountpoint)
        self.assertNotEqual(
            ret, 0, "Creation of directory with same name as "
            "directory \"dir3\" succeeded, which is not "
            "supposed to.")
        g.log.info("Creation of directory \"dir3\" failed as expected")

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Verify directories are present on the backend and gfids are assigned
        self.verify_gfid("dir1")
        self.verify_gfid("dir2")
        self.verify_gfid("dir3")

        # Check whether all the directories are listed on the mount
        _, count, _ = g.run(self.clients[0],
                            "ls %s | wc -l" % self.mounts[0].mountpoint)
        self.assertEqual(int(count), 3, "Not all the directories are listed "
                         "on the mount")
        g.log.info("All the directories are listed on the mount.")
Example #5
0
    def _check_heal_is_completed_and_not_in_split_brain(self):
        """Assert that heal has finished and the volume has no split-brain"""
        # Nothing may be left to heal
        heal_done = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(heal_done, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # The volume must not be in split-brain
        split_brain = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(split_brain, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')
    def test_gfid_heal(self):
        """
        Steps:
        - Create a 1x3 volume and fuse mount it.
        - On every brick, create one directory holding one file directly
          on the backend.
        - Access those files from the mount.
        - Launch heal and verify that it completes.
        - Verify gfid and link count for the files and directories.
        """
        # pylint: disable=too-many-statements

        # Populate the bricks directly on the backend.
        g.log.info("Creating directories and files on the backend.")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        for index, brick in enumerate(bricks_list, start=1):
            brick_node, brick_path = brick.split(":")
            dir_path = "%s/dir%d" % (brick_path, index)

            rcode, _, _ = g.run(brick_node, "mkdir " + dir_path)
            self.assertEqual(rcode, 0,
                             "Dir creation failed on %s" % brick_path)

            rcode, _, _ = g.run(brick_node,
                                "touch %s/file%d" % (dir_path, index))
            self.assertEqual(rcode, 0,
                             "file creation failed on %s" % brick_path)
        g.log.info("Created directories and files on the backend.")

        # To circumvent is_fresh_file() check in glusterfs code.
        time.sleep(2)

        # Look the files up through the mount
        mountpoint = self.mounts[0].mountpoint
        for num in (1, 2, 3):
            rcode, _, _ = g.run(
                self.clients[0],
                "ls %s/dir%d/file%d" % (mountpoint, num, num))
            self.assertEqual(
                rcode, 0, "Failed to access dir%d/file%d on %s" %
                (num, num, self.mounts[0].mountpoint))

        # Launch heal
        heal_started = trigger_heal(self.mnode, self.volname)
        self.assertTrue(heal_started, 'Starting heal failed')
        g.log.info('Index heal launched')

        # Follow the heal until it finishes
        heal_monitored = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(heal_monitored, 'Heal has not yet completed')

        # Double-check that nothing is left to heal
        heal_done = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(heal_done, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Verify gfid and links at the backend.
        for dir_name, file_name in (("dir1", "file1"),
                                    ("dir2", "file2"),
                                    ("dir3", "file3")):
            self.verify_gfid_and_link_count(dir_name, file_name)
def wait_to_heal_complete(timeout=300, wait_step=5):
    """Wait until heal is complete for every volume on gluster.

    Args:
        timeout (int): overall time budget given to the waiter; a single
            waiter is shared by all the volumes. Default is 300.
        wait_step (int): interval given to the waiter between heal checks.
            Default is 5.
    Raises:
        AssertionError: if the volume list cannot be fetched or the waiter
            expires before all the volumes are healed.
    """
    gluster_vol_list = get_volume_list("auto_get_gluster_endpoint")
    if not gluster_vol_list:
        raise AssertionError("failed to get gluster volume list")

    _waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for gluster_vol in gluster_vol_list:
        for w in _waiter:
            if is_heal_complete("auto_get_gluster_endpoint", gluster_vol):
                # Reset the attempt counter so that the first heal check of
                # the next volume is not preceded by a redundant sleep of
                # 'wait_step'.
                # NOTE(review): assumes Waiter gates its sleep on
                # '_attempt' -- confirm against the waiter module.
                _waiter._attempt = 0
                break

    if w.expired:
        err_msg = ("reached timeout waiting for all the gluster volumes "
                   "to reach the 'healed' state.")
        g.log.error(err_msg)
        raise AssertionError(err_msg)
Example #8
0
def wait_to_heal_complete(timeout=300, wait_step=5):
    """Wait until heal is complete for every volume on gluster.

    Args:
        timeout (int): overall time budget given to the waiter; a single
            waiter is shared by all the volumes. Default is 300.
        wait_step (int): interval given to the waiter between heal checks.
            Default is 5.
    Raises:
        AssertionError: if the volume list cannot be fetched or the waiter
            expires before all the volumes are healed.
    """
    gluster_vol_list = get_volume_list("auto_get_gluster_endpoint")
    if not gluster_vol_list:
        raise AssertionError("failed to get gluster volume list")

    _waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for gluster_vol in gluster_vol_list:
        for w in _waiter:
            if is_heal_complete("auto_get_gluster_endpoint", gluster_vol):
                # Reset the attempt counter so that the first heal check of
                # the next volume is not preceded by a redundant sleep of
                # 'wait_step'.
                # NOTE(review): assumes Waiter gates its sleep on
                # '_attempt' -- confirm against the waiter module.
                _waiter._attempt = 0
                break

    if w.expired:
        err_msg = ("reached timeout waiting for all the gluster volumes "
                   "to reach the 'healed' state.")
        g.log.error(err_msg)
        raise AssertionError(err_msg)
def wait_to_heal_complete(
        timeout=300, wait_step=5, g_node="auto_get_gluster_endpoint"):
    """Wait until heal is complete for every gluster volume.

    Args:
        timeout (int): overall time budget given to the waiter, which is
            shared by all the volumes.
        wait_step (int): interval given to the waiter between heal checks.
        g_node (str): gluster node used to list the volumes and to check
            the heal state.
    Raises:
        AssertionError: if the volume list cannot be fetched or the waiter
            expires before all the volumes are healed.
    """
    volumes = get_volume_list(g_node)
    if not volumes:
        raise AssertionError("failed to get gluster volume list")

    heal_waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for volume in volumes:
        for attempt in heal_waiter:
            if not is_heal_complete(g_node, volume):
                continue
            # NOTE(vponomar): Reset attempts for waiter to avoid redundant
            # sleep equal to 'interval' on the next usage.
            heal_waiter._attempt = 0
            break

    if attempt.expired:
        err_msg = ("reached timeout waiting for all the gluster volumes "
                   "to reach the 'healed' state.")
        g.log.error(err_msg)
        raise AssertionError(err_msg)
def wait_to_heal_complete(vol_name=None,
                          g_node="auto_get_gluster_endpoint",
                          timeout=300,
                          wait_step=5):
    """Wait until heal is complete for one or for all gluster volumes.

    Args:
        vol_name (str): Gluster volume to check; when it is not given
            (default None) every volume reported by the node is checked
        g_node (str): Gluster node to query, default is
            auto_get_gluster_endpoint
        timeout (int): Time budget given to the waiter, default is 300
        wait_step (int): Interval given to the waiter between heal checks
    Raises:
        AssertionError: In case the volume list cannot be fetched or heal
            is not complete when the waiter expires
    """
    # A single named volume short-circuits the volume listing
    volumes = [vol_name] if vol_name else get_volume_list(g_node)
    if not volumes:
        raise AssertionError("failed to get gluster volume list")

    heal_waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for volume in volumes:
        for attempt in heal_waiter:
            if not is_heal_complete(g_node, volume):
                continue
            # NOTE(vponomar): Reset attempts for waiter to avoid redundant
            # sleep equal to 'interval' on the next usage.
            heal_waiter._attempt = 0
            break

    if attempt.expired:
        err_msg = ("reached timeout waiting for all the gluster volumes "
                   "to reach the 'healed' state.")
        g.log.error(err_msg)
        raise AssertionError(err_msg)
    def test_self_heal_differing_in_file_type(self):
        """
        testing self heal of files with different file types
        with default configuration

        Description:
        - create IO (directory tree with files) from the first mount
        - calculate arequal
        - bring down all bricks processes from selected set
        - calculate arequal and compare with arequal before
        getting bricks offline
        - modify the data: replace every file with a directory of the
        same name
        - calculate arequal before getting bricks online
        - bring bricks online
        - check volume processes and self-heal daemons
        - monitor healing completion and check for split-brain
        - calculate arequal and compare with arequal before bringing bricks
        online and after bringing bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Creating files on client side
        all_mounts_procs = []
        test_file_type_differs_self_heal_folder = \
            'test_file_type_differs_self_heal'
        g.log.info("Generating data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)

        # Creating files
        command = ("cd %s/ ; "
                   "mkdir %s ;"
                   "cd %s/ ;"
                   "for i in `seq 1 10` ; "
                   "do mkdir l1_dir.$i ; "
                   "for j in `seq 1 5` ; "
                   "do mkdir l1_dir.$i/l2_dir.$j ; "
                   "for k in `seq 1 10` ; "
                   "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
                   "bs=1k count=$k ; "
                   "done ; "
                   "done ; "
                   "done ; "
                   % (self.mounts[0].mountpoint,
                      test_file_type_differs_self_heal_folder,
                      test_file_type_differs_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # wait for io to complete
        self.assertTrue(
            wait_for_io_to_complete(all_mounts_procs, self.mounts),
            "Io failed to complete on some of the clients")

        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks offline
        self.assertEqual(sorted(result_before_offline),
                         sorted(result_after_offline),
                         'Checksums before and after bringing bricks'
                         ' offline are not equal')
        g.log.info('Checksums before and after '
                   'bringing bricks offline are equal')

        # Modify the data: every regular file is removed and a directory
        # with the same name is created, so the file type differs between
        # the online and the offline bricks.
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)
        command = ("cd %s/%s/ ; "
                   "for i in `seq 1 10` ; "
                   "do for j in `seq 1 5` ; "
                   "do for k in `seq 1 10` ; "
                   "do rm -f l1_dir.$i/l2_dir.$j/test.$k ; "
                   "mkdir l1_dir.$i/l2_dir.$j/test.$k ; "
                   "done ; "
                   "done ; "
                   "done ;"
                   % (self.mounts[0].mountpoint,
                      test_file_type_differs_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        # The failure message has to be a formatted string; passing a
        # (format, arg) tuple would print the raw tuple on failure.
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertEqual(sorted(result_before_online),
                         sorted(result_after_online),
                         'Checksums before and after bringing bricks'
                         ' online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')
Example #12
0
    def test_data_self_heal_algorithm_diff_heal_command(self):
        """
        Test Volume Option - 'cluster.data-self-heal-algorithm' : 'diff'

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        "data-self-heal-algorithm": "diff"
        - create IO
        - set the volume option "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - modify the data
        - get arequal before getting bricks online
        - bring bricks online
        - expand volume by adding bricks to the volume
        - do rebalance
        - set the volume option "self-heal-daemon": "on" and check for daemons
        - start healing
        - check if heal is completed
        - check for split-brain
        - calculate arequal and compare with arequal before bringing bricks
        online and after bringing bricks online
        """
        # pylint: disable=too-many-branches,too-many-statements
        # Setting options
        g.log.info('Setting options...')
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off",
            "data-self-heal-algorithm": "diff"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        # Only the options actually set above are reported here;
        # 'self-heal-daemon' is switched off later, after the initial IO.
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal' "
                   "are set to 'off', "
                   "'data-self-heal-algorithm' "
                   "is set to 'diff' successfully")

        # Creating files on client side
        all_mounts_procs = []
        g.log.info("Generating data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        # Creating files
        command = "/usr/bin/env python %s create_files -f 100 %s" % (
            self.script_upload_path, self.mounts[0].mountpoint)

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        # Merge the brick lists of all categories and drop empty entries
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        command = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 1M %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Expand volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume...")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        # The failure message has to be a formatted string; passing a
        # (format, arg) tuple would print the raw tuple on failure.
        self.assertTrue(ret, ("Failed to expand the volume when IO in "
                              "progress on volume %s" % self.volname))
        g.log.info("Expanding volume is successful on volume %s", self.volname)

        # Do rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Failed to start rebalance')
        g.log.info('Rebalance is started')

        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Rebalance is not completed')
        g.log.info('Rebalance is completed successfully')

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online. The order-insensitive comparison
        # is done on sorted copies because assertItemsEqual exists only in
        # the Python 2 unittest module.
        self.assertEqual(sorted(result_before_online),
                         sorted(result_after_online),
                         'Checksums are not equal')
        g.log.info('Checksums are equal')
Example #13
0
 def _check_if_there_are_files_and_dirs_to_be_healed(self):
     """Assert that the volume still has entries pending heal"""
     # A completed heal means there is nothing left to be healed, which
     # fails this check.
     heal_done = is_heal_complete(self.mnode, self.volname)
     self.assertFalse(heal_done, 'Heal is completed')
     g.log.info('Heal is pending')
Example #14
0
    def test_heal_gfid_1x3(self):
        """
        Description: This test case verifies the gfid self-heal on a 1x3
                 replicate volume.
                 1. file created at mount point
                 2. 2 bricks brought down
                 3. file deleted
                 4. created a new file from the mount point
                 5. all bricks brought online
                 6. check if gfid worked correctly
        """
        # Name of the single file the create_files runs below produce; it is
        # the name the stat checks further down read back from the mount and
        # from every brick.
        test_file = 'test_file0.txt'

        g.log.info("setting the quorum type to fixed")
        options = {"cluster.quorum-type": "fixed"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, "unable to set the quorum type to fixed")
        g.log.info("Successfully set the quorum type to fixed")

        g.log.info("creating a file from mount point")
        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("python %s create_files "
                   "-f 1 --base-file-name test_file --fixed-file-size 10k %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
        # Validate I/O
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        g.log.info("Successfully created a file from mount point")

        # getting list of all bricks
        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "unable to get list of bricks")
        bricks_to_toggle = all_bricks[:2]
        g.log.info("bringing down brick1 and brick2")
        ret = bring_bricks_offline(self.volname, bricks_to_toggle)
        self.assertTrue(ret, "unable to bring bricks offline")
        g.log.info("Successfully brought the following bricks offline "
                   ": %s", str(bricks_to_toggle))

        g.log.info("deleting the file from mount point")
        # Remove the file that was actually created above. `rm -f` exits 0
        # for a missing path, so a wrong name here would silently skip the
        # delete step and the recreated file would never get a new gfid.
        command = "rm -f %s/%s" % (self.mounts[0].mountpoint, test_file)
        ret, _, _ = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, "unable to remove file from mount point")
        g.log.info("Successfully deleted file from mountpoint")

        g.log.info("creating a new file of same name and different size "
                   "from mount point")
        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("python %s create_files "
                   "-f 1 --base-file-name test_file --fixed-file-size 1M %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
        # Validate I/O
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        g.log.info("Successfully created a new file of same name "
                   "from mount point")

        g.log.info("bringing bricks 1 and 2 back online")
        ret = bring_bricks_online(self.mnode, self.volname, bricks_to_toggle)
        # The result is used as a boolean elsewhere in this suite, so a
        # not-None check would let a False (failed) result slip through.
        self.assertTrue(ret, "unable to bring bricks online")
        g.log.info("Successfully brought the following bricks online "
                   ": %s", str(bricks_to_toggle))

        g.log.info("checking if stat structure of the file is returned")
        ret = get_file_stat(self.mounts[0].client_system,
                            '%s/%s' % (self.mounts[0].mountpoint, test_file))
        self.assertTrue(ret, "unable to get file stats")
        g.log.info("file stat structure returned successfully")

        g.log.info("checking if the heal has completed")
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, "heal not completed")
        g.log.info("Self heal was completed successfully")

        g.log.info("checking if the areequal checksum of all the bricks in "
                   "the subvol match")
        checksum_list = []
        for brick in all_bricks:
            node, brick_path = brick.split(':')
            command = "arequal-checksum -p " + brick_path + \
                      " -i .glusterfs -i .landfill"
            ret, out, _ = g.run(node, command)
            self.assertEqual(
                ret, 0, "unable to get the arequal checksum "
                "of the brick")
            checksum_list.append(out)
            # checking file size of healed file on each brick to verify
            # correctness of choice for sink and source
            stat_dict = get_file_stat(node, '%s/%s' % (brick_path, test_file))
            # Fail with a readable message instead of a TypeError when the
            # stat lookup yields nothing for this brick.
            self.assertIsNotNone(
                stat_dict,
                "unable to get file stats of healed file on brick %s" % brick)
            self.assertEqual(
                stat_dict['size'], '1048576',
                "file size of healed file is different "
                "than expected")
        flag = all(val == checksum_list[0] for val in checksum_list)
        self.assertTrue(flag, "the arequal checksum of all bricks is "
                        "not same")
        g.log.info("the arequal checksum of all the bricks in the subvol "
                   "is same")
Example #15
0
    def test_metadata_self_heal(self):
        """
        Test MetaData Self-Heal (heal command)

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        - create IO
        - set the volume option
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - Change the permissions, ownership and the group
        of the files under "test_meta_data_self_heal" folder
        - get arequal before getting bricks online
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check is heal is completed
        - check for split-brain
        - get arequal after getting bricks online and compare with
        arequal before getting bricks online
        - check permissions are '444' and group and user are 'qa'
        on every brick and on the mount point
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Setting options
        g.log.info('Setting options...')
        options = {"metadata-self-heal": "off",
                   "entry-self-heal": "off",
                   "data-self-heal": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal', "
                   "are set to 'off' successfully")

        # Creating files on client side
        all_mounts_procs = []
        test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
        g.log.info("Generating data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)

        # Create files
        g.log.info('Creating files...')
        command = ("cd %s/ ; "
                   "mkdir %s ;"
                   "cd %s/ ;"
                   "for i in `seq 1 50` ; "
                   "do dd if=/dev/urandom of=test.$i bs=10k count=1 ; "
                   "done ;"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder,
                      test_meta_data_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # wait for io to complete
        self.assertTrue(
            wait_for_io_to_complete(all_mounts_procs, self.mounts),
            "Io failed to complete on some of the clients")

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Changing the permissions, ownership and the group
        # of the files under "test_meta_data_self_heal" folder
        g.log.info("Modifying data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)

        # Change permissions to 444
        g.log.info('Changing permissions...')
        command = ("cd %s/%s/ ; "
                   "chmod -R 444 *"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder))
        ret, _, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Permissions are changed successfully')

        # Change the ownership to qa
        g.log.info('Changing the ownership...')
        command = ("cd %s/%s/ ; "
                   "chown -R qa *"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder))
        ret, _, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Ownership is changed successfully')

        # Change the group to qa
        g.log.info('Changing the group...')
        command = ("cd %s/%s/ ; "
                   "chgrp -R qa *"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder))
        ret, _, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Group is changed successfully')

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        # The message must be a formatted string; passing a (template, value)
        # tuple would print the raw tuple on failure.
        self.assertTrue(ret, ("Volume process %s not online "
                              "despite waiting for 5 minutes" % self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertItemsEqual(result_before_online, result_after_online,
                              'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')

        # Collect every (node, path) pair to check: each brick plus the
        # client mount point. A list of pairs is used rather than a dict
        # keyed by node so that several bricks hosted on the same node (or a
        # client that is also a server) are all checked instead of
        # overwriting one another.
        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, 'Brick list is None')
        nodes_to_check = []
        for brick in all_bricks:
            node, brick_path = brick.split(':')
            nodes_to_check.append((node, brick_path))
        nodes_to_check.append((self.mounts[0].client_system,
                               self.mounts[0].mountpoint))

        # Checking for permissions, user and group
        for node, base_path in nodes_to_check:
            # Get file list
            command = ("cd %s/%s/ ; "
                       "ls"
                       % (base_path,
                          test_meta_data_self_heal_folder))
            ret, out, err = g.run(node, command)
            # Without this check a failed listing yields an empty file list
            # and the loop below would pass without verifying anything.
            self.assertEqual(ret, 0, err)
            file_list = out.split()

            for file_name in file_list:
                file_to_check = '%s/%s/%s' % (base_path,
                                              test_meta_data_self_heal_folder,
                                              file_name)

                g.log.info('Checking for permissions, user and group for %s',
                           file_name)

                # Check for permissions
                cmd = ("stat -c '%a %n' {} | awk '{{print $1}}'"
                       .format(file_to_check))
                _, permissions, _ = g.run(node, cmd)
                self.assertEqual(permissions.split('\n')[0], '444',
                                 'Permissions %s is not equal to 444'
                                 % permissions)
                g.log.info("Permissions are '444' for %s", file_name)

                # Check for user
                cmd = ("ls -ld {} | awk '{{print $3}}'"
                       .format(file_to_check))
                _, username, _ = g.run(node, cmd)
                self.assertEqual(username.split('\n')[0],
                                 'qa', 'User %s is not equal qa'
                                 % username)
                g.log.info("User is 'qa' for %s", file_name)

                # Check for group
                cmd = ("ls -ld {} | awk '{{print $4}}'"
                       .format(file_to_check))
                _, groupname, _ = g.run(node, cmd)
                self.assertEqual(groupname.split('\n')[0],
                                 'qa', 'Group %s is not equal qa'
                                 % groupname)
                g.log.info("Group is 'qa' for %s", file_name)
    def test_heal_info_should_have_fixed_fields(self):
        """
        - Create IO
        - While IO is creating - bring down a couple of bricks
        - Wait for IO to complete
        - Bring up the down bricks
        - Wait for heal to complete
        - Check for fields 'Brick', 'Status', 'Number of entries' in heal info
        """
        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                       "-d 2 -l 2 -f 50 %s" %
                       (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get heal info
        g.log.info('Getting heal info...')
        heal_info_dicts = get_heal_info_summary(self.mnode, self.volname)
        # Validate the value just fetched; checking `ret` here would only
        # re-test the stale split-brain result from the step above.
        self.assertIsNotNone(heal_info_dicts, 'Failed to get heal info')
        g.log.info(heal_info_dicts)

        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')

        # Check all fields in heal info dict
        g.log.info('Checking for all the fields in heal info...')
        for brick in bricks_list:
            g.log.info('Checking fields for %s', brick)
            # Report a missing brick as a test failure, not a KeyError.
            self.assertIn(brick, heal_info_dicts,
                          'Brick %s is missing in heal info' % brick)
            self.assertEqual(heal_info_dicts[brick]['status'], 'Connected',
                             'Status is not Connected for brick %s' % brick)
            self.assertEqual(heal_info_dicts[brick]['numberOfEntries'], '0',
                             'numberOfEntries is not 0 for brick %s' % brick)

        g.log.info('Successfully checked for all the fields in heal info')
Example #17
0
    def test_multiple_clients_dd_on_same_file_default(self):
        """
        - Create 2GB file
        - While creating file, start reading file
        - Bring down brick1
        - Bring back the brick brick1
        - Start healing
        - Bring down brick1
        - Wait for IO to complete
        - Wait for reading to complete
        - Bring back the brick brick1
        - Start healing
        - Wait for heal to complete
        - Check for split-brain
        - Calculate arequals on all the bricks and compare with mountpoint
        """
        # pylint: disable=too-many-statements,too-many-locals
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("cd %s ; "
                       "dd if=/dev/urandom of=test_file bs=1M count=2020"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Reading files on client side
        all_mounts_procs_read = []
        for mount_obj in self.mounts:
            g.log.info("Reading data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Read files while the writers started above are still running
            g.log.info('Reading files...')
            command = ("python %s read %s"
                       % (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            all_mounts_procs_read.append(proc)

        # Bring brick1 offline
        g.log.info('Bringing bricks %s offline...', bricks_list[1])
        ret = bring_bricks_offline(self.volname, [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_list[1])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[1]])
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_list[1])
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_list[1])

        # Bring brick1 online
        g.log.info('Bringing bricks %s online...', bricks_list[1])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_list[1])
        g.log.info('Bringing bricks %s online is successful',
                   bricks_list[1])

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Bring brick1 offline
        g.log.info('Bringing bricks %s offline...', bricks_list[1])
        ret = bring_bricks_offline(self.volname, [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_list[1])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[1]])
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_list[1])
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_list[1])

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )

        # Validate reading
        self.assertTrue(
            validate_io_procs(all_mounts_procs_read, self.mounts),
            "Reading failed on some of the clients"
        )
        self.io_validation_complete = True

        # Bring brick1 online
        g.log.info('Bringing bricks %s online...', bricks_list[1])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_list[1])
        g.log.info('Bringing bricks %s online is successful',
                   bricks_list[1])

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal for mount
        g.log.info('Getting arequal...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after healing is successful')
        # The value after the last ':' on the last output line is taken as
        # the total checksum to compare against.
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Get arequal on bricks and compare with mount_point_total
        # It should be the same
        g.log.info('Getting arequal on bricks...')
        for brick in bricks_list:
            g.log.info('Getting arequal on bricks %s...', brick)
            node, brick_path = brick.split(':')
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan'
                       % brick_path)
            ret, arequal, _ = g.run(node, command)
            # ret is a command exit status: compare it to 0 explicitly.
            self.assertEqual(ret, 0, 'Failed to get arequal on brick %s'
                             % brick)
            g.log.info('Getting arequal for %s is successful', brick)
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(mount_point_total, brick_total,
                             'Arequals for mountpoint and %s are not equal'
                             % brick)
            g.log.info('Arequals for mountpoint and %s are equal', brick)
        g.log.info('All arequals are equal')
Example #18
0
    def test_entry_transaction_crash_consistency_rename(self):
        """
        Entry transaction crash consistency check for renames.

        Steps:
        - Run create_files (-f 25) from every mount, each mount using its
          own numeric base file name
        - Run the first rename pass (FirstRename) from the first mount
        - Collect arequal of the first mount
        - Create a snapshot of the volume
        - Run the second rename pass (SecondRename) from the first mount
        - Restore the snapshot through snap_restore_complete
          (NOTE(review): presumably this also stops and starts the volume
          - confirm against the helper)
        - Check that heal is complete
        - Collect arequal again and compare it with the pre-snapshot one
        """
        first_mount = self.mounts[0]

        def _run_on_first_mount_and_wait(io_cmd):
            """Start io_cmd on the first mount and block until it is done."""
            self.all_mounts_procs = []
            self.io_validation_complete = False
            self.all_mounts_procs.append(
                g.run_async(first_mount.client_system, io_cmd,
                            user=first_mount.user))

            # Wait for IO to complete
            io_done = wait_for_io_to_complete(self.all_mounts_procs,
                                              first_mount)
            self.assertTrue(io_done,
                            "IO failed to complete on some of the clients")
            self.io_validation_complete = True
            g.log.info("IO is successful on all mounts")

        # Create files from every mount; base names are 1, 11, 21, ...
        for position, mount_obj in enumerate(self.mounts):
            base_name = 1 + 10 * position
            create_cmd = ("/usr/bin/env python %s create_files "
                          "--base-file-name %d -f 25 %s"
                          % (self.script_upload_path,
                             base_name, mount_obj.mountpoint))
            self.all_mounts_procs.append(
                g.run_async(mount_obj.client_system, create_cmd,
                            user=mount_obj.user))

        # Block until the file creation has finished on all mounts
        io_done = wait_for_io_to_complete(self.all_mounts_procs, self.mounts)
        self.assertTrue(io_done,
                        "IO failed to complete on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # First rename pass
        _run_on_first_mount_and_wait(
            "/usr/bin/env python %s mv -s FirstRename %s"
            % (self.script_upload_path, first_mount.mountpoint))

        # Checksum of the data as it is right before the snapshot
        ret, result_before_snapshot = collect_mounts_arequal(first_mount)
        self.assertTrue(ret, "Collecting arequal-checksum failed")

        # Take the snapshot
        snapshot_name = ('entry_transaction_crash_consistency_rename-%s-%s'
                         % (self.volname, self.mount_type))
        ret, _, err = snap_create(self.mnode, self.volname, snapshot_name)
        self.assertEqual(ret, 0, err)
        g.log.info("Snapshot %s created successfully", snapshot_name)

        # Second rename pass, done after the snapshot was taken
        _run_on_first_mount_and_wait(
            "/usr/bin/env python %s mv -s SecondRename %s"
            % (self.script_upload_path, first_mount.mountpoint))

        # Roll the volume back to the snapshot
        restored = snap_restore_complete(self.mnode, self.volname,
                                         snapshot_name)
        self.assertTrue(restored, 'Failed to restore snapshot %s'
                        % snapshot_name)
        g.log.info("Snapshot %s restored successfully", snapshot_name)

        # Nothing should be pending heal after the restore
        heal_done = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(heal_done, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Give the volume graph time to get loaded.
        sleep(10)

        # Checksum of the data after the restore
        ret, result_after_restoring = collect_mounts_arequal(first_mount)
        self.assertTrue(ret, "Collecting arequal-checksum failed")

        # The restored data must match the pre-snapshot data
        self.assertEqual(result_before_snapshot, result_after_restoring,
                         'Checksums are not equal')
        g.log.info('Checksums are equal')
    def _perform_brick_ops_and_enable_self_heal(self, op_type):
        """Run IO while bricks are down one at a time, then enable self heal.

        Steps common to all tests: every online brick except the first is
        brought offline in turn, the shell command registered in
        ``self.op_cmd`` for ``op_type`` and that brick's index is run on
        ``self.client``, and the brick is brought back online. Afterwards
        the volume must report pending heals; the self heal daemon is then
        enabled and waited for.

        In every command ``{0}`` is replaced by ``self.fqpath`` and ``{1}``
        by ``self.io_cmd``.

        Args:
            op_type (str): Key of ``self.op_cmd`` selecting the operation
                set: 'metadata', 'data', 'gfid', 'file_type' or 'symlink'.
        """
        # First brick in the subvol will always be online and used for self
        # heal, so make keys match brick index
        self.op_cmd = {
            # The operation with key `4` in every op_type will be used for
            # final data consistency check
            # Metadata Operations (owner and permission changes)
            'metadata': {
                2:
                '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
                dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
                3:
                '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
                4:
                '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
            },
            # Data Operations (append data to the files)
            'data': {
                2:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 2K >> dir.$i/file.$j; done;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 3K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 3K >> dir.$i/file.$j; done;
                    done;''',
                4:
                '''cd {0}; for i in `seq 1 6`;
                    do {1} 4K >> file.$i;
                    for j in `seq 1 6`;
                    do {1} 4K >> dir.$i/file.$j; done;
                    done;''',
            },
            # Create files and directories when brick is down with no
            # initial IO
            'gfid': {
                2:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K > file.2.$i; mkdir dir.2.$i;
                    for j in `seq 1 3`;
                    do {1} 2K > dir.2.$i/file.2.$j; done;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K > file.3.$i; mkdir dir.3.$i;
                    for j in `seq 1 3`;
                    do {1} 2K > dir.3.$i/file.3.$j; done;
                    done;''',
                4:
                '''cd {0}; for i in `seq 4 6`;
                    do {1} 2K > file.$i; mkdir dir.$i;
                    for j in `seq 4 6`;
                    do {1} 2K > dir.$i/file.$j; done;
                    done;''',
            },
            # Create different file type with same name while a brick was down
            # with no initial IO and validate failure
            'file_type': {
                2:
                'cd {0}; for i in `seq 1 6`; do {1} 2K > notype.$i; done;',
                3:
                'cd {0}; for i in `seq 1 6`; do mkdir -p notype.$i; done;',
                4:
                '''cd {0}; for i in `seq 1 6`;
                    do {1} 2K > file.$i;
                    for j in `seq 1 6`;
                    do mkdir -p dir.$i; {1} 2K > dir.$i/file.$j; done;
                    done;''',
            },
            # Create symlinks for files and directories while a brick was down
            # Out of 6 files, 6 dirs and 6 files in each dir, symlink
            # outer 2 files, inner 2 files in each dir, 2 dirs and
            # verify it's a symlink(-L) and linking file exists(-e)
            'symlink': {
                2:
                '''cd {0}; for i in `seq 1 2`;
                    do ln -sr file.$i sl_file.2.$i;
                    [ -L sl_file.2.$i ] && [ -e sl_file.2.$i ] || exit -1;
                    for j in `seq 1 2`;
                    do ln -sr dir.$i/file.$j dir.$i/sl_file.2.$j; done;
                    [ -L dir.$i/sl_file.2.$j ] && [ -e dir.$i/sl_file.2.$j ] \
                    || exit -1;
                    done; for k in `seq 3 4`; do ln -sr dir.$k sl_dir.2.$k;
                    [ -L sl_dir.2.$k ] && [ -e sl_dir.2.$k ] || exit -1;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 2`;
                    do ln -sr file.$i sl_file.3.$i;
                    [ -L sl_file.3.$i ] && [ -e sl_file.3.$i ] || exit -1;
                    for j in `seq 1 2`;
                    do ln -sr dir.$i/file.$j dir.$i/sl_file.3.$j; done;
                    [ -L dir.$i/sl_file.3.$j ] && [ -e dir.$i/sl_file.3.$j ] \
                    || exit -1;
                    done; for k in `seq 3 4`; do ln -sr dir.$k sl_dir.3.$k;
                    [ -L sl_dir.3.$k ] && [ -e sl_dir.3.$k ] || exit -1;
                    done;''',
                4:
                '''cd {0}; ln -sr dir.4 sl_dir_new.4; mkdir sl_dir_new.4/dir.1;
                    {1} 4K >> sl_dir_new.4/dir.1/test_file;
                    {1} 4K >> sl_dir_new.4/test_file;
                    ''',
            },
        }
        bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(bricks,
                             'Not able to get list of bricks in the volume')

        # Make first brick always online and start operations from second brick
        for index, brick in enumerate(bricks[1:], start=2):

            # Bring brick offline
            ret = bring_bricks_offline(self.volname, brick)
            self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))
            self.assertTrue(
                are_bricks_offline(self.mnode, self.volname, [brick]),
                'Brick {} is not offline'.format(brick))

            # Perform file/dir operation
            cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
            ret, _, err = g.run(self.client, cmd)
            if op_type == 'file_type' and index == 3:
                # Should fail with ENOTCONN as one brick is down, lookup can't
                # happen and quorum is not met
                self.assertNotEqual(
                    ret, 0, '{0} should fail as lookup fails, quorum is not '
                    'met'.format(cmd))
                self.assertIn(
                    'Transport', err, '{0} should fail with ENOTCONN '
                    'error'.format(cmd))
            else:
                self.assertEqual(ret, 0,
                                 '{0} failed with {1}'.format(cmd, err))
                self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

            # Bring brick online and fail early if the request itself failed,
            # instead of silently dropping the return value
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                brick,
                bring_bricks_online_methods='volume_start_force')
            self.assertTrue(ret, 'Unable to bring {} online'.format(brick))
            self.assertTrue(
                are_bricks_online(self.mnode, self.volname, [brick]),
                'Brick {} is not online'.format(brick))

        # Assert metadata/data operations resulted in pending heals
        self.assertFalse(is_heal_complete(self.mnode, self.volname),
                         'Heal is completed, expected pending heals')

        # Enable and wait self heal daemon to be online
        self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
                        'Enabling self heal daemon failed')
        self.assertTrue(
            wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
            'Not all self heal daemons are online')
Example #20
0
    def _perform_brick_ops_and_enable_self_heal(self, op_type):
        """Run IO while bricks are down one at a time, then enable self heal.

        Steps common to all tests: every online brick except the first is
        brought offline in turn, the shell command registered in
        ``self.op_cmd`` for ``op_type`` and that brick's index is run on
        ``self.client``, and the brick is brought back online. Afterwards
        the volume must report pending heals; the self heal daemon is then
        enabled and waited for.

        In every command ``{0}`` is replaced by ``self.fqpath`` and ``{1}``
        by ``self.io_cmd``.

        Args:
            op_type (str): Key of ``self.op_cmd`` selecting the operation
                set: 'metadata' or 'data'.
        """
        # First brick in the subvol will always be online and used for self
        # heal, so make keys match brick index
        self.op_cmd = {
            # Metadata Operations (owner and permission changes)
            'metadata': {
                2:
                '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
                dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
                3:
                '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
                # 4 - Will be used for final data consistency check
                4:
                '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
            },
            # Data Operations (append data to the files)
            'data': {
                2:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 2K >> dir.$i/file.$j; done;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 3K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 3K >> dir.$i/file.$j; done;
                    done;''',
                # 4 - Will be used for final data consistency check
                4:
                '''cd {0}; for i in `seq 1 6`;
                    do {1} 4K >> file.$i;
                    for j in `seq 1 6`;
                    do {1} 4K >> dir.$i/file.$j; done;
                    done;''',
            },
        }
        bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(bricks,
                             'Not able to get list of bricks in the volume')

        # Make first brick always online and start operations from second brick
        for index, brick in enumerate(bricks[1:], start=2):

            # Bring brick offline; report the single brick that failed rather
            # than the whole brick list
            ret = bring_bricks_offline(self.volname, brick)
            self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))

            # Perform metadata/data operation
            cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
            ret, _, err = g.run(self.client, cmd)
            self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
            self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

            # Bring brick online and fail early if the request itself failed,
            # instead of silently dropping the return value
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                brick,
                bring_bricks_online_methods='volume_start_force')
            self.assertTrue(ret, 'Unable to bring {} online'.format(brick))

        # Assert metadata/data operations resulted in pending heals
        self.assertFalse(is_heal_complete(self.mnode, self.volname),
                         'Heal is completed, expected pending heals')

        # Enable and wait self heal daemon to be online
        self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
                        'Enabling self heal daemon failed')
        self.assertTrue(
            wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
            'Not all self heal daemons are online')
    def test_heal_command_unsuccessful_as_bricks_down(self):
        """Verify 'gluster volume heal' is rejected while a brick is down.

        Test steps:
        - start writing a 2 Gb file on every mount
        - while the write is in progress, kill brick b0
        - start heal on the volume (must fail with the expected message)
        - bring b0 back up
        - repeat the kill / failed heal / bring up cycle for brick b1
        - trigger heal and wait for it to complete
        - check the volume is not in split-brain and validate the IO
        """
        # pylint: disable=too-many-statements
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')

        # Kick off the 2 Gb file write on each client in the background
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            g.log.info('Creating files...')
            dd_cmd = ("cd %s ; dd if=/dev/zero of=file1  bs=10M  count=200"
                      % mount_obj.mountpoint)
            self.all_mounts_procs.append(
                g.run_async(mount_obj.client_system, dd_cmd,
                            user=mount_obj.user))
        self.io_validation_complete = False

        # The raw 'gluster volume heal' command is used (not a library
        # wrapper) so that its error output can be inspected after g.run
        heal_cmd = "gluster volume heal %s" % self.volname
        expected_err = ("Launching heal operation to perform index self heal "
                        "on volume %s has been unsuccessful" % self.volname)

        # Same cycle for brick0 first, then brick1
        for brick_index in (0, 1):
            brick = bricks_list[brick_index]

            # Take the brick down and confirm it is really offline
            g.log.info('Bringing bricks %s offline...', brick)
            went_offline = bring_bricks_offline(self.volname, [brick])
            self.assertTrue(went_offline,
                            'Failed to bring bricks %s offline' % brick)

            is_offline = are_bricks_offline(self.mnode, self.volname,
                                            [brick])
            self.assertTrue(is_offline, 'Bricks %s are not offline' % brick)
            g.log.info('Bringing bricks %s offline is successful', brick)

            # Heal must be refused: non-zero exit status plus error message
            exit_code, _, heal_err = g.run(self.mnode, heal_cmd)
            self.assertTrue(exit_code, 'Heal is started')
            self.assertIn(expected_err, heal_err,
                          "Error message is not present or not valid")
            g.log.info('Expected: Healing is not started')

            # Restore the brick before moving on
            g.log.info("Bring bricks: %s online", brick)
            came_online = bring_bricks_online(self.mnode, self.volname,
                                              [brick])
            self.assertTrue(came_online,
                            "Failed to bring bricks: %s online" % brick)
            g.log.info("Successfully brought all bricks:%s online", brick)

        # With every brick up again the heal has to start ...
        self.assertTrue(trigger_heal(self.mnode, self.volname),
                        'Heal is not started')
        g.log.info('Healing is started')

        # ... run to completion ...
        self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                        'Heal has not yet completed')

        # ... and leave nothing pending
        self.assertTrue(is_heal_complete(self.mnode, self.volname),
                        'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # No file may have ended up in split-brain
        self.assertFalse(is_volume_in_split_brain(self.mnode, self.volname),
                         'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # The background writes must have succeeded on all clients
        io_ok = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(io_ok, "IO failed on some of the clients")
        self.io_validation_complete = True
    def test_handling_data_split_brain(self):
        """Modify the same files with different bricks down and verify heal.

        Test steps:
        - create IO
        - calculate arequal from mountpoint
        - set volume option 'self-heal-daemon' to value "off"
        - kill the first data brick
        - calculate arequal checksum and compare it
        - modify files and directories
        - bring back all bricks processes online
        - kill the last data brick
        - modify files and directories
        - calculate arequal from mountpoint
        - bring back all bricks processes online
        - unmount and mount the volume again
        - read the files from the mountpoint to trigger heal
        - enable the self heal daemon
        - check if heal is completed
        - check for split-brain
        - read files
        - calculate arequal checksum and compare it
        """
        # pylint: disable=too-many-locals,too-many-statements

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("cd %s ; "
                       "for i in `seq 1 10` ; "
                       "do mkdir dir.$i ; "
                       "for j in `seq 1 5` ; "
                       "do dd if=/dev/urandom of=dir.$i/file.$j "
                       "bs=1K count=1 ; "
                       "done ; "
                       "dd if=/dev/urandom of=file.$i bs=1k count=1 ; "
                       "done"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Setting options
        options = {"self-heal-daemon": "off"}
        g.log.info('Setting options %s for volume %s',
                   options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Get the bricks for the volume; fail with a clear message instead
        # of a TypeError on indexing if the list could not be fetched
        g.log.info("Fetching bricks for the volume: %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')
        g.log.info("Brick list: %s", bricks_list)

        # Bring the first brick offline
        bricks_to_bring_offline = [bricks_list[0]]
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Comparing arequals before getting bricks offline
        # and after getting bricks offline
        self.assertEqual(result_before_offline, result_after_offline,
                         'Arequals before getting bricks offline '
                         'and after getting bricks offline are not equal')
        g.log.info('Arequals before getting bricks offline '
                   'and after getting bricks offline are equal')

        # Modify the data while the first brick is down
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Modify files
            g.log.info('Modifying files...')
            command = ("cd %s ; "
                       "for i in `seq 1 10` ; "
                       "do for j in `seq 1 5` ; "
                       "do dd if=/dev/urandom of=dir.$i/file.$j "
                       "bs=1M count=1 ; "
                       "done ; "
                       "dd if=/dev/urandom of=file.$i bs=1M count=1 ; "
                       "done"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Bring the first brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Bring the last brick offline
        bricks_to_bring_offline = [bricks_list[-1]]
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the same data again while the last brick is down
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Modify files
            g.log.info('Modifying files...')
            command = ("cd %s ; "
                       "for i in `seq 1 10` ; "
                       "do for j in `seq 1 5` ; "
                       "do dd if=/dev/urandom of=dir.$i/file.$j "
                       "bs=1M count=1 ; "
                       "done ; "
                       "dd if=/dev/urandom of=file.$i bs=1M count=1 ; "
                       "done"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring the last brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Unmount the volume and mount it again on the clients
        ret = self.unmount_volume(self.mounts)
        self.assertTrue(ret, 'Failed to unmount %s' % self.volname)

        ret = self.mount_volume(self.mounts)
        self.assertTrue(ret, 'Unable to mount %s' % self.volname)

        # Start heal from mount point
        g.log.info('Starting heal from mount point...')
        for mount_obj in self.mounts:
            g.log.info("Start heal for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Read through this client's own mountpoint; the path of the
            # first mount is not guaranteed to exist on every client
            command = "/usr/bin/env python %s read %s" % (
                self.script_upload_path,
                mount_obj.mountpoint)
            ret, _, err = g.run(mount_obj.client_system, command)
            self.assertFalse(ret, err)
            g.log.info("Heal triggered for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
        g.log.info('Heal triggered for all mountpoints')

        # Enable self-heal daemon
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, 'Failed to enable self heal daemon')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Reading files
        g.log.info('Reading files...')
        for mount_obj in self.mounts:
            g.log.info("Start reading files for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            command = ('cd %s/ ; '
                       'for i in `seq 1 10` ; '
                       'do cat file.$i > /dev/null ; '
                       'for j in `seq 1 5` ; '
                       'do cat dir.$i/file.$j > /dev/null ; '
                       'done ; done'
                       % mount_obj.mountpoint)
            ret, _, err = g.run(mount_obj.client_system, command)
            self.assertFalse(ret, err)
            g.log.info("Reading files successfully for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
        g.log.info('Reading files successfully for all mountpoints')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Comparing arequals before getting bricks online
        # and after getting bricks online
        self.assertEqual(result_before_online, result_after_online,
                         'Arequals before getting bricks online '
                         'and after getting bricks online are not equal')
        g.log.info('Arequals before getting bricks online '
                   'and after getting bricks online are equal')
Example #23
0
    def test_metadata_self_heal_client_side_heal(self):
        """
        Testcase steps:
        1.Turn off the options self heal daemon
        2.Create IO
        3.Calculate arequal of the bricks and mount point
        4.Bring down "brick1" process
        5.Change the permissions of the directories and files
        6.Change the ownership of the directories and files
        7.Change the group of the directories and files
        8.Bring back the brick "brick1" process
        9.Execute "find . | xargs stat" from the mount point to trigger heal
        10.Verify the changes in permissions are not self healed on brick1
        11.Verify the changes in permissions on all bricks but brick1
        12.Verify the changes in ownership are not self healed on brick1
        13.Verify the changes in ownership on all the bricks but brick1
        14.Verify the changes in group are not successfully self-healed
           on brick1
        15.Verify the changes in group on all the bricks but brick1
        16.Turn on the option metadata-self-heal
        17.Execute "find . | xargs md5sum" from the mount point to trgger heal
        18.Wait for heal to complete
        19.Verify the changes in permissions are self-healed on brick1
        20.Verify the changes in ownership are successfully self-healed
           on brick1
        21.Verify the changes in group are successfully self-healed on brick1
        22.Calculate arequal check on all the bricks and mount point
        """
        # Setting options
        ret = set_volume_options(self.mnode, self.volname,
                                 {"self-heal-daemon": "off"})
        self.assertTrue(ret, 'Failed to set options self-heal-daemon '
                        'and metadata-self-heal to OFF')
        g.log.info("Options are set successfully")

        # Creating files on client side
        self.test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
        for mount_object in self.mounts:
            command = ("cd {0}/ ; mkdir {1} ; cd {1}/ ;"
                       "for i in `seq 1 100` ; "
                       "do mkdir dir.$i ; "
                       "for j in `seq 1 5` ; "
                       "do dd if=/dev/urandom of=dir.$i/file.$j "
                       "bs=1K count=$j ; done ; done ;".format
                       (mount_object.mountpoint,
                        self.test_meta_data_self_heal_folder))
            proc = g.run_async(mount_object.client_system, command,
                               user=mount_object.user)
            self.all_mounts_procs.append(proc)

        # Validate IO
        self.validate_io_on_clients()

        # Calculate and check arequal of the bricks and mount point
        self.check_arequal_from_mount_point_and_bricks()

        # Select bricks to bring offline from a replica set
        subvols_dict = get_subvols(self.mnode, self.volname)
        subvols = subvols_dict['volume_subvols']
        bricks_to_bring_offline = []
        bricks_to_be_online = []
        for subvol in subvols:
            bricks_to_bring_offline.append(subvol[0])
            for brick in subvol[1:]:
                bricks_to_be_online.append(brick)

        # Bring bricks offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Change the permissions of the directories and files
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            command = ('cd {}/{}; '
                       'for i in `seq 1 100` ; '
                       'do chmod 555 dir.$i ; done ; '
                       'for i in `seq 1 50` ; '
                       'do for j in `seq 1 5` ; '
                       'do chmod 666 dir.$i/file.$j ; done ; done ; '
                       'for i in `seq 51 100` ; '
                       'do for j in `seq 1 5` ; '
                       'do chmod 444 dir.$i/file.$j ; done ; done ;'
                       .format(mount_obj.mountpoint,
                               self.test_meta_data_self_heal_folder))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.validate_io_on_clients()

        # Change the ownership of the directories and files
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            command = ('cd {}/{} ; '
                       'for i in `seq 1 35` ; '
                       'do chown -R qa_func dir.$i ; done ; '
                       'for i in `seq 36 70` ; '
                       'do chown -R qa_system dir.$i ; done ; '
                       'for i in `seq 71 100` ; '
                       'do chown -R qa_perf dir.$i ; done ;'
                       .format(mount_obj.mountpoint,
                               self.test_meta_data_self_heal_folder))
            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.validate_io_on_clients()

        # Change the group of the directories and files
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            command = ('cd {}/{}; '
                       'for i in `seq 1 100` ; '
                       'do chgrp -R qa_all dir.$i ; done ;'
                       .format(mount_obj.mountpoint,
                               self.test_meta_data_self_heal_folder))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.validate_io_on_clients()

        # Bring brick online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Trigger heal from mount point
        self.trigger_heal_from_mount_point()

        # Verify the changes are not self healed on brick1 for each subvol
        for brick in bricks_to_bring_offline:
            node, brick_path = brick.split(':')

            dir_list = get_dir_contents(node, "{}/{}".format(
                brick_path, self.test_meta_data_self_heal_folder))
            self.assertIsNotNone(dir_list, "Dir list from "
                                 "brick is empty")
            g.log.info("Successfully got dir list from bick")

            # Verify changes for dirs
            for folder in dir_list:

                ret = get_file_stat(node, "{}/{}/{}".format(
                    brick_path, self.test_meta_data_self_heal_folder,
                    folder))

                self.assertEqual('755', ret['access'],
                                 "Permissions mismatch on node {}"
                                 .format(node))

                self.assertEqual('root', ret['username'],
                                 "User id mismatch on node {}"
                                 .format(node))

                self.assertEqual('root', ret['groupname'],
                                 "Group id mismatch on node {}"
                                 .format(node))

                # Get list of files for each dir
                file_list = get_dir_contents(node, "{}/{}/{}".format(
                    brick_path, self.test_meta_data_self_heal_folder,
                    folder))
                self.assertIsNotNone(file_list, "File list from "
                                     "brick is empty.")
                g.log.info("Successfully got file list from bick.")

                if file_list:
                    for file_name in file_list:

                        ret = get_file_stat(node, "{}/{}/{}/{}".format(
                            brick_path, self.test_meta_data_self_heal_folder,
                            folder, file_name))

                        self.assertEqual('644', ret['access'],
                                         "Permissions mismatch on node"
                                         " {} for file {}".format(node,
                                                                  file_name))

                        self.assertEqual('root', ret['username'],
                                         "User id mismatch on node"
                                         " {} for file {}".format(node,
                                                                  file_name))

                        self.assertEqual('root', ret['groupname'],
                                         "Group id mismatch on node"
                                         " {} for file {}".format(node,
                                                                  file_name))

        # Verify the changes are self healed on all bricks except brick1
        # for each subvol
        self.check_permssions_on_bricks(bricks_to_be_online)

        # Setting options
        ret = set_volume_options(self.mnode, self.volname,
                                 {"metadata-self-heal": "on"})
        self.assertTrue(ret, 'Failed to set options to ON.')
        g.log.info("Options are set successfully")

        # Trigger heal from mount point
        self.trigger_heal_from_mount_point()

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Verify the changes are self healed on brick1 for each subvol
        self.check_permssions_on_bricks(bricks_to_bring_offline)

        # Calculate and check arequal of the bricks and mount point
        self.check_arequal_from_mount_point_and_bricks()
Example #24
0
    def test_data_self_heal_daemon_off(self):
        """
        Test Data-Self-Heal (heal command)

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        - create IO
        - Get arequal before getting bricks offline
        - set the volume option
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - Get arequal after getting bricks offline and compare with
        arequal before getting bricks offline
        - modify the data
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check if heal is completed
        - check for split-brain
        - add bricks
        - do rebalance
        - create 1k files
        - while creating files - kill bricks and bring bricks online one by one
        in cycle
        - validate IO
        """
        # pylint: disable=too-many-statements

        # Setting options
        g.log.info('Setting options...')
        options = {"metadata-self-heal": "off",
                   "entry-self-heal": "off",
                   "data-self-heal": "off",
                   }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s",
                   options, self.volname)

        # Creating files on client side
        # NOTE(review): procs are appended to self.all_mounts_procs without
        # resetting it first; presumably the fixture initialises it to an
        # empty list -- confirm against setUp.
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("python %s create_files -f 100 --fixed-file-size 1k %s"
                       % (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal before getting bricks offline
        g.log.info('Getting areequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting areequal before getting bricks offline '
                   'is successful')

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline.
        # The selection is materialised into a list because it is consumed
        # several times below (offline, status check, online, messages); a
        # bare filter() is a one-shot iterator on Python 3.
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting areequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting areequal after getting bricks offline '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks offline
        self.assertEqual(result_before_offline, result_after_offline,
                         'Checksums before and '
                         'after bringing bricks offline are not equal')
        g.log.info('Checksums before and after bringing bricks offline '
                   'are equal')

        # Modify the data
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("python %s create_files -f 100 --fixed-file-size 10k %s"
                       % (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found or "
                             "more than one self heal daemon process "
                             "found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Add bricks
        g.log.info("Start adding bricks to volume...")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, "Failed to expand the volume %s" % self.volname)
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Do rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Failed to start rebalance')
        g.log.info('Rebalance is started')

        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Rebalance is not completed')
        g.log.info('Rebalance is completed successfully')

        # Create 1k files (left running in the background while the bricks
        # are cycled below; validated only after the cycle finishes)
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("python %s create_files -f 1000 %s"
                       % (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Kill all bricks in cycle: one brick at a time is taken offline and
        # brought back before moving on to the next one
        bricks_list = get_all_bricks(self.mnode, self.volname)
        for brick in bricks_list:
            # Bring brick offline
            g.log.info('Bringing bricks %s offline', brick)
            ret = bring_bricks_offline(self.volname, [brick])
            self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick)

            ret = are_bricks_offline(self.mnode, self.volname,
                                     [brick])
            self.assertTrue(ret, 'Bricks %s are not offline'
                            % brick)
            g.log.info('Bringing bricks %s offline is successful', brick)

            # Bring brick online
            g.log.info('Bringing bricks %s online...', brick)
            ret = bring_bricks_online(self.mnode, self.volname,
                                      [brick])
            self.assertTrue(ret, 'Failed to bring bricks %s online' % brick)
            g.log.info('Bringing bricks %s online is successful', brick)

            # Wait for volume processes to be online
            g.log.info("Wait for volume processes to be online")
            ret = wait_for_volume_process_to_be_online(self.mnode,
                                                       self.volname)
            self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                                  "be online" % self.volname))
            g.log.info("Successful in waiting for volume %s processes to be "
                       "online", self.volname)

            # Verify volume's all process are online
            g.log.info("Verifying volume's all process are online")
            ret = verify_all_process_of_volume_are_online(self.mnode,
                                                          self.volname)
            self.assertTrue(ret, ("Volume %s : All process are not online"
                                  % self.volname))
            g.log.info("Volume %s : All process are online", self.volname)

            # Wait for self-heal-daemons to be online
            g.log.info("Waiting for self-heal-daemons to be online")
            ret = is_shd_daemonized(self.all_servers)
            self.assertTrue(ret, "Either No self heal daemon process found or "
                                 "more than one self heal daemon process "
                                 "found")
            g.log.info("All self-heal-daemons are online")

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")
Example #25
0
    def test_self_heal_50k_files_heal_command_by_add_brick(self):
        """
        Test self-heal of 50k files (heal command)
        Description:
        - Set the volume option
          "metadata-self-heal": "off"
          "entry-self-heal": "off"
          "data-self-heal": "off"
          "self-heal-daemon": "off"
        - Bring down all bricks processes from selected set
        - Create IO (50k files)
        - Get arequal before getting bricks online
        - Bring bricks online
        - Set the volume option
          "self-heal-daemon": "on"
        - Check for daemons
        - Start healing
        - Check if heal is completed
        - Check for split-brain
        - Get arequal after getting bricks online and compare with
          arequal before getting bricks online
        - Add bricks
        - Do rebalance
        - Get arequal after adding bricks and compare with
          arequal after getting bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Setting options
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off",
            "self-heal-daemon": "off"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Creating files on client side (only the first mount is used)
        all_mounts_procs = []

        # Create 50k files
        g.log.info('Creating files...')
        command = ("cd %s ; "
                   "for i in `seq 1 50000` ; "
                   "do dd if=/dev/urandom of=test.$i "
                   "bs=100k count=1 ;  "
                   "done ;" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts[0]),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        ret, result_before_online = collect_mounts_arequal(self.mounts[0])
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        ret = set_volume_options(self.mnode, self.volname,
                                 {"self-heal-daemon": "on"})
        self.assertTrue(ret, 'Failed to set option self-heal-daemon to ON.')
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found or "
                             "more than one self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion (extended timeout for the 50k-file heal)
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=3600)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        ret, result_after_online = collect_mounts_arequal(self.mounts[0])
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online.
        # Order-insensitive comparison via sorted(): assertItemsEqual only
        # exists on Python 2, while this form works on Python 2 and 3.
        # NOTE(review): assumes the arequal results are sortable (e.g. a
        # list of checksum strings) -- confirm against
        # collect_mounts_arequal.
        self.assertEqual(
            sorted(result_before_online), sorted(result_after_online),
            'Checksums before and '
            'after bringing bricks online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')

        # Add bricks
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(
            ret, "Failed to expand the volume %s" % self.volname)
        g.log.info("Expanding volume is successful on volume %s", self.volname)

        # Do rebalance and wait for it to complete
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Failed to start rebalance')
        g.log.info('Rebalance is started')
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=3600)
        self.assertTrue(ret, 'Rebalance is not completed')
        g.log.info('Rebalance is completed successfully')

        # Get arequal after adding bricks
        ret, result_after_adding_bricks = collect_mounts_arequal(
            self.mounts[0])
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after adding bricks is successful')

        # Checking arequals after bringing bricks online
        # and after adding bricks (order-insensitive, see above)
        self.assertEqual(
            sorted(result_after_online), sorted(result_after_adding_bricks),
            'Checksums after bringing bricks online and '
            'after adding bricks are not equal')
        g.log.info('Checksums after bringing bricks online and '
                   'after adding bricks are equal')
    def test_afr_heal_with_brickdown_hardlink(self):
        """
        Steps:
        1. On a 2 * 3 distribute replicate volume (presumably created by
           the test fixture), disable all heals
        2. Create a file and 3 hardlinks to it from fuse mount.
        3. Kill brick4, rename HLINK1 to a new name (intended to get
           hashed to replicate-1) and bring the brick back online
        4. Likewise rename HLINK2 and HLINK3 as well, killing brick5 and
           brick6 respectively each time. i.e. a different brick of the 2nd
           replica is down each time.
        5. Now enable shd and let selfheals complete.
        6. Heal should complete without split-brains.
        """
        bricks_list = get_all_bricks(self.mnode, self.volname)
        # The renames below take bricks_list[3], [4] and [5] offline, so
        # fail with a clear message instead of an IndexError/TypeError when
        # the volume does not expose at least six bricks.
        self.assertTrue(
            bricks_list is not None and len(bricks_list) >= 6,
            "Expected at least 6 bricks in volume %s, got %s"
            % (self.volname, bricks_list))

        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off",
            "self-heal-daemon": "off"
        }
        g.log.info("setting options %s", options)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, ("Unable to set volume option %s for "
                              "volume %s" % (options, self.volname)))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        cmd = ("touch %s/FILE" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "file creation failed")

        # Creating hardlinks HLINK1..HLINK3 for the file created
        for i in range(1, 4):
            ret = create_link_file(
                self.clients[0], '{}/FILE'.format(self.mounts[0].mountpoint),
                '{}/HLINK{}'.format(self.mounts[0].mountpoint, i))
            self.assertTrue(ret, "Unable to create hard link file ")

        # Bring brick4 (bricks_list[3]) offline, rename file HLINK1,
        # and bring brick4 back online
        self._test_brick_down_with_file_rename("HLINK1", "NEW-HLINK1",
                                               bricks_list[3])

        # Bring brick5 (bricks_list[4]) offline, rename file HLINK2,
        # and bring brick5 back online
        self._test_brick_down_with_file_rename("HLINK2", "NEW-HLINK2",
                                               bricks_list[4])

        # Bring brick6 (bricks_list[5]) offline, rename file HLINK3,
        # and bring brick6 back online
        self._test_brick_down_with_file_rename("HLINK3", "NEW-HLINK3",
                                               bricks_list[5])

        # Setting options
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Check data on mount point (listing must succeed after the heal)
        cmd = ("ls %s" % (self.mounts[0].mountpoint))
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "failed to fetch data from mount point")
    def test_resolving_meta_data(self):
        """Check that metadata changes made during brick outages heal cleanly.

        - Create a file test_file.txt
        - Find out which bricks the file resides on and kill the last of
          them (treated as the arbiter brick of the replica pair)
        - Modify the permissions of the file
        - Bring back the killed brick
        - Kill the first brick holding the file
        - Modify the permissions of the file
        - Bring back the killed brick
        - Trigger heal
        - Check if heal is completed
        - Check for split-brain
        """
        # pylint: disable=too-many-locals,too-many-statements
        file_to_create = 'test_file.txt'

        def take_bricks_offline(bricks):
            """Bring the given bricks offline and verify they are down."""
            g.log.info('Bringing bricks %s offline...', bricks)
            ret = bring_bricks_offline(self.volname, bricks)
            self.assertTrue(
                ret, 'Failed to bring bricks %s offline' % bricks)

            ret = are_bricks_offline(self.mnode, self.volname, bricks)
            self.assertTrue(ret, 'Bricks %s are not offline' % bricks)
            g.log.info('Bringing bricks %s offline is successful', bricks)

        def take_bricks_online(bricks):
            """Bring the given bricks back online."""
            g.log.info('Bringing bricks %s online...', bricks)
            ret = bring_bricks_online(self.mnode, self.volname, bricks)
            self.assertTrue(
                ret, 'Failed to bring bricks %s online' % bricks)
            g.log.info('Bringing bricks %s online is successful', bricks)

        def chmod_file_on_all_mounts(mode):
            """Run 'chmod <mode>' on the test file from every mount and
            validate that all the commands succeeded."""
            self.all_mounts_procs = []
            for mount_obj in self.mounts:
                g.log.info("Modifying data for %s:%s",
                           mount_obj.client_system, mount_obj.mountpoint)
                g.log.info('Modifying the permissions of the file...')
                command = ("cd %s ; "
                           "chmod %s %s" % (mount_obj.mountpoint, mode,
                                            file_to_create))

                proc = g.run_async(mount_obj.client_system,
                                   command,
                                   user=mount_obj.user)
                self.all_mounts_procs.append(proc)
            self.io_validation_complete = False

            self.assertTrue(
                validate_io_procs(self.all_mounts_procs, self.mounts),
                "IO failed on some of the clients")
            self.io_validation_complete = True

        # Creating files on client side
        # NOTE: self.all_mounts_procs is appended to without being reset
        # here, so it must already exist when the test starts.
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Create file
            g.log.info('Creating file...')
            command = ("cd %s ; "
                       "touch %s" % (mount_obj.mountpoint, file_to_create))

            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # Get bricks with file: list every brick directory and keep the
        # bricks whose listing mentions the file.
        g.log.info('Getting bricks with file...')
        subvols_dict = get_subvols(self.mnode, self.volname)
        brick_list_with_file = []
        for subvol in subvols_dict['volume_subvols']:
            for brick in subvol:
                node, brick_path = brick.split(':')
                _, brick_file_list, _ = g.run(node, 'ls %s' % brick_path)
                if file_to_create in brick_file_list:
                    brick_list_with_file.append(brick)
        g.log.info('Bricks with file: %s', brick_list_with_file)

        # Fail with a clear message instead of an IndexError below when the
        # file could not be located on any brick.
        self.assertTrue(
            brick_list_with_file,
            'File %s was not found on any brick' % file_to_create)

        # Bring arbiter brick offline
        # NOTE(review): assumes the arbiter is the last brick of the list
        # - confirm against the volume layout.
        bricks_to_bring_offline = [brick_list_with_file[-1]]
        take_bricks_offline(bricks_to_bring_offline)

        # Modify the permissions while the arbiter brick is down
        chmod_file_on_all_mounts('600')

        # Bring arbiter brick online
        take_bricks_online(bricks_to_bring_offline)

        # Bring 1-st data brick offline
        bricks_to_bring_offline = [brick_list_with_file[0]]
        take_bricks_offline(bricks_to_bring_offline)

        # Modify the permissions while the first data brick is down
        chmod_file_on_all_mounts('644')

        # Bring 1-st data brick online
        take_bricks_online(bricks_to_bring_offline)

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')
Example #28
0
    def test_manual_heal_should_trigger_heal(self):
        """Check that a manually triggered heal populates a newly added brick.

        - create a single brick volume
        - add some files and directories
        - get arequal from mountpoint
        - add-brick such that this brick makes the volume a replica vol 1x2
        - start heal
        - make sure heal is completed
        - get arequals from all bricks and compare with arequal from mountpoint
        """
        # pylint: disable=too-many-statements,too-many-locals
        # Start IO on mounts
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("python %s create_deep_dirs_with_files "
                   "--dir-length 1 "
                   "--dir-depth 1 "
                   "--max-num-of-dirs 1 "
                   "--num-of-files 10 %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            g.log.info("IO on %s:%s is started successfully",
                       mount_obj.client_system, mount_obj.mountpoint)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # Get arequal for mount before adding bricks
        g.log.info('Getting arequal before adding bricks...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before adding bricks is successful')
        # The reference checksum is the text after the last ':' on the last
        # line of the first mount's arequal output.
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Form brick list to add
        g.log.info('Forming brick list to add...')
        bricks_to_add = form_bricks_list(self.mnode, self.volname, 1,
                                         self.servers, self.all_servers_info)
        g.log.info('Brick list to add: %s', bricks_to_add)

        # Add bricks
        g.log.info("Start adding bricks to volume...")
        ret, _, _ = add_brick(self.mnode,
                              self.volname,
                              bricks_to_add,
                              force=True,
                              replica_count=2)
        self.assertEqual(ret, 0, "Failed to add bricks %s" % bricks_to_add)
        g.log.info("Adding bricks is successful on volume %s", self.volname)

        # Make sure the newly added bricks are available in the volume
        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume: %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick list: %s", bricks_list)
        # Guard against a missing/empty listing so the membership checks
        # below fail with a meaningful message.
        self.assertTrue(
            bricks_list,
            'Failed to get the brick list of volume %s' % self.volname)
        for brick in bricks_to_add:
            self.assertIn(brick, bricks_list,
                          'Brick %s is not in brick list' % brick)
        g.log.info('New bricks are present in the volume')

        # Make sure volume change from distribute to replicate volume
        vol_info_dict = get_volume_type_info(self.mnode, self.volname)
        vol_type = vol_info_dict['volume_type_info']['typeStr']
        self.assertEqual(
            'Replicate', vol_type, 'Volume type is not converted to Replicate '
            'after adding bricks')
        g.log.info('Volume type is successfully converted to Replicate '
                   'after adding bricks')

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal on bricks and compare with mount_point_total
        # It should be the same
        g.log.info('Getting arequal on bricks...')
        for brick in bricks_list:
            g.log.info('Getting arequal on bricks %s...', brick)
            node, brick_path = brick.split(':')
            # Gluster-internal directories are excluded from the checksum.
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan' % brick_path)
            ret, arequal, _ = g.run(node, command)
            self.assertEqual(ret, 0,
                             'Failed to get arequal on brick %s' % brick)
            g.log.info('Getting arequal for %s is successful', brick)
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(
                mount_point_total, brick_total,
                'Arequals for mountpoint and %s are not equal' % brick)
            g.log.info('Arequals for mountpoint and %s are equal', brick)
        g.log.info('All arequals are equal for replicated')
Example #29
0
    def test_heal_when_quota_object_limit_exceeded(self):
        """Check that heal completes after the quota object limit was hit.

        - Create the directory /dir from every mount
        - Enable quota and set soft and hard timeouts to 0
        - Set a quota object limit of 5 on /dir
        - Create 3 files inside /dir from every mount
        - Bring the third brick offline
        - Try to create 5 more files, which must fail because of the limit
        - Bring the brick back online
        - Trigger heal and verify that it completes
        """
        # Create a directory to set the quota_limit_objects
        path = "/dir"
        g.log.info("Creating a directory")
        self.all_mounts_procs = []
        for mount_object in self.mounts:
            cmd = "/usr/bin/env python %s create_deep_dir -d 0 -l 0 %s%s" % (
                self.script_upload_path, mount_object.mountpoint, path)
            # g.run returns (return code, stdout, stderr); asserting on the
            # whole tuple would always pass, so check the return code.
            ret, _, _ = g.run(mount_object.client_system, cmd)
            self.assertEqual(ret, 0,
                             "Failed to create directory on mountpoint")
            g.log.info("Directory created successfully on mountpoint")

        # Enable Quota
        g.log.info("Enabling quota on the volume %s", self.volname)
        ret, _, _ = quota_enable(self.mnode, self.volname)
        self.assertEqual(
            ret, 0,
            "Failed to enable quota on the volume %s" % self.volname)
        g.log.info("Successfully enabled quota on the volume %s", self.volname)

        # Set quota-soft-timeout to 0
        g.log.info("Setting up soft timeout to 0")
        ret, _, _ = quota_set_soft_timeout(self.mnode, self.volname, "0")
        self.assertEqual(ret, 0, "Failed to set quota-soft-timeout")
        g.log.info("Successfully set the quota-soft-timeout")

        # Set quota-hard-timeout to 0
        g.log.info("Setting up hard timeout with 0")
        ret, _, _ = quota_set_hard_timeout(self.mnode, self.volname, "0")
        self.assertEqual(ret, 0, "Failed to set quota-hard-timeout")
        g.log.info("successfully set the quota-hard-timeout")

        # Set Quota limit on the newly created directory
        g.log.info("Set Quota Limit on the path %s of the volume %s", path,
                   self.volname)
        ret, _, _ = quota_limit_objects(self.mnode,
                                        self.volname,
                                        path=path,
                                        limit="5")
        self.assertEqual(
            ret, 0,
            "Failed to set quota limit on path %s of the volume %s"
            % (path, self.volname))
        g.log.info(
            "Successfully set the quota limit on %s of the volume "
            "%s", path, self.volname)

        # Create 3 files inside the directory
        for mount_object in self.mounts:
            g.log.info("Creating Files on %s:%s", mount_object.client_system,
                       path)
            cmd = ("/usr/bin/env python %s create_files -f 3 "
                   "--base-file-name file-0 %s%s" %
                   (self.script_upload_path, mount_object.mountpoint, path))
            ret, _, _ = g.run(mount_object.client_system, cmd)
            self.assertEqual(ret, 0, "Failed to create files on %s" % path)
            g.log.info("Files created successfully on mountpoint")

        bricks_list = get_all_bricks(self.mnode, self.volname)
        # The test operates on the third brick, so at least three bricks
        # must be reported for the volume.
        self.assertTrue(
            bricks_list is not None and len(bricks_list) > 2,
            "Expected at least 3 bricks in volume %s, got %s"
            % (self.volname, bricks_list))
        brick3 = bricks_list[2]

        # Bring brick3 offline
        g.log.info('Bringing brick %s offline', brick3)
        ret = bring_bricks_offline(self.volname, [brick3])
        self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick3)

        ret = are_bricks_offline(self.mnode, self.volname, [brick3])
        self.assertTrue(ret, 'Brick %s is not offline' % brick3)
        g.log.info('Bringing brick %s offline is successful', brick3)

        # Try creating 5 more files, which should fail as the quota limit
        # exceeds. The last mount is used, named explicitly instead of
        # relying on the loop variable leaking out of the loop above.
        last_mount = self.mounts[-1]
        cmd = ("/usr/bin/env python %s create_files -f 5 --base-file-name "
               "file-1 %s%s" %
               (self.script_upload_path, last_mount.mountpoint, path))
        ret, _, _ = g.run(last_mount.client_system, cmd)
        self.assertNotEqual(ret, 0, ("Creating 5 files succeeded while it was "
                                     "not supposed to."))
        g.log.info("Creating 5 files failed as expected due to quota object "
                   "limit on the directory.")

        # Bring brick3 online and check status
        g.log.info('Bringing brick %s online', brick3)
        ret = bring_bricks_online(self.mnode, self.volname, [brick3])
        self.assertTrue(ret, 'Failed to bring brick %s online' % brick3)
        g.log.info('Bringing brick %s online is successful', brick3)

        # All bricks of the volume are checked, not only brick3.
        g.log.info("Verifying if brick %s is online", brick3)
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, "Brick %s did not come up" % brick3)
        g.log.info("Brick %s has come online.", brick3)

        # Trigger heal
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Starting heal failed')
        g.log.info('Index heal launched')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')
    def test_data_self_heal_algorithm_full_default(self):
        """
        Test Volume Option - 'cluster.data-self-heal-algorithm' : 'full'

        Description:
        - set the volume option "data-self-heal-algorithm" to value "full"
        - create IO
        - bring down all bricks processes from selected set
        - modify the data
        - calculate arequal
        - bring bricks online
        - start healing
        - calculate arequal and compare with arequal before bringing bricks
        offline and after bringing bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Setting options
        g.log.info('Setting options "data-self-heal-algorithm": "full"...')
        options = {"data-self-heal-algorithm": "full"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'data-self-heal-algorithm' is set to 'full' "
                   "successfully")

        # Creating files on client side (first mount only)
        all_mounts_procs = []
        g.log.info("Generating data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        # Creating files
        command = "/usr/bin/env python %s create_files -f 100 %s" % (
            self.script_upload_path, self.mounts[0].mountpoint)

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Select bricks to bring offline: merge the three brick groups of
        # the selection and drop empty entries.
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        command = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 1M %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret,
            "Failed to wait for volume %s processes to be online"
            % self.volname)
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online.
        # unittest names the order-insensitive comparison assertItemsEqual
        # on Python 2 and assertCountEqual on Python 3; use whichever this
        # interpreter provides.
        assert_same_items = (getattr(self, 'assertCountEqual', None)
                             or self.assertItemsEqual)
        assert_same_items(result_before_online, result_after_online,
                          'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')
    def test_entry_heal_with_quota(self):
        """
        - Create a 1x3 volume
        - Set quota object limit
        - Create files less than the limit
        - Bring down a brick and create more files until limit is hit
        - Delete one file so that we are below the limit, and create one more
          file
        - Bring the brick back up and launch heal
        - Verify that after heal is complete, the deleted file does not
          re-appear in any of the bricks.
        """
        # pylint: disable=too-many-statements
        # Enable Quota
        g.log.info("Enabling quota on the volume %s", self.volname)
        ret, _, _ = quota_enable(self.mnode, self.volname)
        self.assertEqual(
            ret, 0, "Failed to enable quota on the volume %s" % self.volname)
        g.log.info("Successfully enabled quota on the volume %s", self.volname)

        # Check if quota is enabled
        g.log.info("Validate Quota is enabled on the volume %s", self.volname)
        ret = is_quota_enabled(self.mnode, self.volname)
        self.assertTrue(
            ret, "Quota is not enabled on the volume %s" % self.volname)
        g.log.info("Successfully Validated quota is enabled on volume %s",
                   self.volname)

        # Set quota related options
        options = {
            "quota-deem-statfs": "on",
            "soft-timeout": "0",
            "hard-timeout": "0"
        }
        g.log.info("setting quota volume options %s", options)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, ("Unable to set volume option %s for "
                              "volume %s" % (options, self.volname)))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Create directory on mount
        ret = mkdir(self.mounts[0].client_system,
                    "%s/dir" % self.mounts[0].mountpoint)
        self.assertTrue(ret, "mkdir failed")

        # Set Quota limit on the directory
        path = "/dir"
        g.log.info(
            "Setting Quota Limit object on the path %s of the "
            "volume %s", path, self.volname)
        ret, _, _ = quota_limit_objects(self.mnode,
                                        self.volname,
                                        path=path,
                                        limit="10")
        self.assertEqual(
            ret, 0,
            "Failed to set quota limit object on path %s of the volume %s"
            % (path, self.volname))
        g.log.info(
            "Successfully set the Quota limit object on %s of the "
            "volume %s", path, self.volname)

        # Create file1..file5 while all bricks are up
        cmd = ("touch %s/dir/file{1..5}" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "file creation failed")

        # Bring brick3 offline
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info('Bringing brick %s offline', bricks_list[2])
        ret = bring_bricks_offline(self.volname, [bricks_list[2]])
        self.assertTrue(ret,
                        'Failed to bring brick %s offline' % bricks_list[2])

        ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[2]])
        self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[2])
        g.log.info('Bringing brick %s offline was successful', bricks_list[2])

        # Create files until quota object limit
        cmd = ("touch %s/dir/file{6..9}" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "file creation failed")

        # The next create must fail
        cmd = ("touch %s/dir/file10" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(
            ret, 1, ("Creation of %s/dir/file10 succeeded while "
                     "it was not supposed to." % self.mounts[0].mountpoint))
        g.log.info(
            "Creation of %s/dir/file10 failed as expected due to "
            "quota object limit.", self.mounts[0].mountpoint)

        # Delete one file and re-try the create to succeed.
        cmd = ("rm %s/dir/file1" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "File deletion failed")
        cmd = ("touch %s/dir/file10" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "File creation failed")

        # Bring brick3 online and check status
        g.log.info('Bringing brick %s online...', bricks_list[2])
        ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[2]])
        self.assertTrue(ret,
                        'Failed to bring brick %s online' % bricks_list[2])
        g.log.info('Bringing brick %s online is successful', bricks_list[2])

        # All bricks of the volume are checked, not only brick3.
        g.log.info("Verifying if brick3 is online....")
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, ("brick3 did not come up"))
        g.log.info("brick3 has come online.")

        # Trigger heal
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Starting heal failed')
        g.log.info('Index heal launched')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check file10 on every brick after the heal.
        # NOTE(review): 'stat' exits with 0 when the file exists, so
        # assertFalse(ret) passes only when file10 IS present on the brick.
        # The failure message and the docstring instead describe a file
        # that must NOT be present (the file deleted above is file1).
        # Confirm the intended check before changing it; the logic is
        # deliberately left as it was.
        for brick in bricks_list:
            node, brick_path = brick.split(':')
            ret, _, _ = g.run(node, 'stat %s/dir/file10' % brick_path)
            self.assertFalse(ret, 'File present!')