Example #1
    def test_nfs_ganesha_remove_brick(self):
        """
        Verify remove brick operation while IO is running
        Steps:
        1. Start IO on mount points
        2. Perform remove brick operation
        3. Validate IOs
        """
        # pylint: disable=too-many-statements
        # Start IO on all mount points
        all_mounts_procs, count = [], 1
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" % (self.script_upload_path, count,
                                            mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count += 10

        # Get stat of all the files/dirs created.
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")

        # Perform remove brick operation
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, ("Remove brick operation failed on "
                              "%s", self.volname))
        g.log.info("Remove brick operation is successful on "
                   "volume %s", self.volname)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("All volume %s processes failed to come up "
                              "online", self.volname))
        g.log.info("All volume %s processes came up "
                   "online successfully after remove brick operation",
                   self.volname)

        # Log volume info and status after performing remove brick
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Get stat of all the files/dirs created.
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")
Example #2
    def test_shrinking_volume_when_io_in_progress(self):
        """Test shrinking volume (Decrease distribute count) using existing
        servers bricks when IO is in progress.

        Description:
            - remove brick (start, status, commit)
            - validate IO
        """
        # Log Volume Info and Status before shrinking the volume.
        g.log.info("Logging volume info and Status before shrinking volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Shrinking volume by removing bricks from volume when IO in progress
        g.log.info("Start removing bricks from volume when IO in progress")
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to shrink the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info("Shrinking volume when IO in progress is successful on "
                   "volume %s", self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Log Volume Info and Status after shrinking the volume
        g.log.info("Logging volume info and Status after shrinking volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online after "
                   "shrinking volume")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online",
                              self.volname))
        g.log.info("Volume %s : All process are online after shrinking volume",
                   self.volname)

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
    def test_replicated_to_arbiter_volume(self):
        """
        Description:-
        Reduce the replica count from replica 3 to arbiter
        """
        # pylint: disable=too-many-statements
        # Remove brick to reduce the replica count from replica 3
        g.log.info("Removing bricks to form replica 2 volume")
        ret = shrink_volume(self.mnode, self.volname, replica_num=0)
        self.assertTrue(ret,
                        "Failed to remove brick on volume %s" % self.volname)
        g.log.info("Successfully removed brick on volume %s", self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s process not online despite waiting "
            "for 300 seconds" % self.volname)
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verifying all bricks online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)

        # Adding the bricks to make arbiter brick
        g.log.info("Adding bricks to convert to Arbiter Volume")
        replica_arbiter = {'replica_count': 1, 'arbiter_count': 1}
        ret = expand_volume(self.mnode,
                            self.volname,
                            self.servers,
                            self.all_servers_info,
                            add_to_hot_tier=False,
                            **replica_arbiter)
        self.assertTrue(ret, "Failed to expand the volume  %s" % self.volname)
        g.log.info("Changing volume to arbiter volume is successful %s",
                   self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Failed to wait for volume %s processes "
            "to be online" % self.volname)
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)
    def test_verify_df_output_when_brick_replaced(self):
        """
        - Take the output of df -h.
        - Replace any one brick for the volumes.
        - Wait till the heal is completed
        - Repeat steps 1, 2 and 3 for all bricks for all volumes.
        - Check if there are any inconsistencies in the output of df -h
        - Remove bricks from volume and check output of df -h
        - Add bricks to volume and check output of df -h
        """

        # Perform some IO on the mount point
        self._perform_io_and_validate()

        # Get the mount size from df -h output
        initial_mount_size = self._get_mount_size_from_df_h_output()

        # Replace all the bricks and wait till the heal completes
        self._replace_bricks_and_wait_for_heal_completion()

        # Get df -h output after brick replace
        mount_size_after_replace = self._get_mount_size_from_df_h_output()

        # Verify the mount point size remains the same after brick replace
        self.assertEqual(
            initial_mount_size, mount_size_after_replace,
            "The mount sizes before and after replace bricks "
            "are not same")

        # Add bricks
        ret = expand_volume(self.mnode,
                            self.volname,
                            self.servers,
                            self.all_servers_info,
                            force=True)
        self.assertTrue(ret, "Failed to add-brick to volume")

        # Get df -h output after volume expand
        mount_size_after_expand = self._get_mount_size_from_df_h_output()

        # Verify df -h output returns greater value
        self.assertGreater(mount_size_after_expand, initial_mount_size,
                           "The mount size has not increased after expanding")

        # Remove bricks
        ret = shrink_volume(self.mnode, self.volname, force=True)
        self.assertTrue(ret, ("Remove brick operation failed on "
                              "%s", self.volname))
        g.log.info("Remove brick operation is successful on "
                   "volume %s", self.volname)

        # Get df -h output after volume shrink
        mount_size_after_shrink = self._get_mount_size_from_df_h_output()

        # Verify the df -h output returns smaller value
        self.assertGreater(mount_size_after_expand, mount_size_after_shrink,
                           "The mount size has not reduced after shrinking")
    def test_copy_huge_file_with_remove_brick_in_progress(self):
        """
        Test case:
        1. Create a volume, start it and mount it.
        2. Create files and dirs on the mount point.
        3. Start remove-brick and copy huge file when remove-brick is
           in progress.
        4. Commit remove-brick and check checksum of original and copied file.
        """
        # Create a directory with some files inside
        cmd = ("cd %s; for i in {1..10}; do mkdir dir$i; for j in {1..5};"
               " do dd if=/dev/urandom of=dir$i/file$j bs=1M count=1; done;"
               " done" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.first_client, cmd)
        self.assertFalse(ret,
                         "Failed to create dirs and files.")

        # Create a huge file under /mnt (outside the gluster mount point)
        ret, _, _ = g.run(self.first_client,
                          "fallocate -l 10G /mnt/huge_file.txt")
        self.assertFalse(ret, "Failed to create hug file at /mnt")

        # Copy a huge file when remove-brick is in progress
        self.cp_running = False
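        # The leading 'sleep 60' delays the copy so that it overlaps the
        # remove-brick started below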
        cmd = ("sleep 60; cd %s;cp ../huge_file.txt ."
               % self.mounts[0].mountpoint)
        self.io_proc = [g.run_async(self.first_client, cmd)]
        self.cp_running = True

        # Start remove-brick on volume and wait for it to complete
        ret = shrink_volume(self.mnode, self.volname, rebalance_timeout=1000)
        self.assertTrue(ret, "Failed to remove-brick from volume")
        g.log.info("Remove-brick rebalance successful")

        # Validate if copy was successful or not
        ret = validate_io_procs(self.io_proc, [self.mounts[0]])
        self.assertTrue(ret, "dir rename failed on mount point")
        self.cp_running = False

        # Check checksum of original and copied file
        original_file_checksum = get_md5sum(self.first_client,
                                            "/mnt/huge_file.txt")
        copied_file_checksum = get_md5sum(self.first_client,
                                          "{}/huge_file.txt"
                                          .format(self.mounts[0].mountpoint))
        self.assertEqual(original_file_checksum.split(" ")[0],
                         copied_file_checksum.split(" ")[0],
                         "md5 checksum of original and copied file are"
                         " different")
        g.log.info("md5 checksum of original and copied file are same.")

        # Remove original huge file
        ret, _, _ = g.run(self.first_client, "rm -rf /mnt/huge_file.txt")
        self.assertFalse(ret, "Failed to remove huge_file from mount point")
    def test_brick_full_add_brick_remove_brick(self):
        """
        Test case:
        1. Create a volume, start it and mount it.
        2. Fill a few bricks till min-free-limit is reached.
        3. Add brick to the volume.(This should pass.)
        4. Set cluster.min-free-disk to 30%.
        5. Remove bricks from the volume.(This should pass.)
        6. Check for data loss by comparing arequal before and after.
        """
        # Fill a few bricks till the min-free-limit is reached
        bricks = get_all_bricks(self.mnode, self.volname)

        # Calculate the usable size and fill till it reaches
        # min free limit
        usable_size = get_usable_size_per_disk(bricks[0])
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        filename = "abc"
        for _ in range(0, usable_size):
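            # Keep picking a new random name while the current one hashes
            # to the first subvol (see the while condition below)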
            while (subvols[find_hashed_subvol(subvols, "/",
                                              filename)[1]] == subvols[0]):
                filename = self._get_random_string()
            ret, _, _ = g.run(
                self.mounts[0].client_system,
                "fallocate -l 1G {}/{}".format(self.mounts[0].mountpoint,
                                               filename))
            self.assertFalse(ret, "Failed to fill disk to min free limit")
            filename = self._get_random_string()
        g.log.info("Disk filled up to min free limit")

        # Collect arequal checksum before ops
        arequal_checksum_before = collect_mounts_arequal(self.mounts[0])

        # Add brick to volume
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, "Failed to add brick on volume %s" % self.volname)

        # Set cluster.min-free-disk to 30%
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.min-free-disk': '30%'})
        self.assertTrue(ret, "Failed to set cluster.min-free-disk to 30%")

        # Remove bricks from the volume
        ret = shrink_volume(self.mnode, self.volname, rebalance_timeout=1800)
        self.assertTrue(ret, "Failed to remove-brick from volume")
        g.log.info("Remove-brick rebalance successful")

        # Check for data loss by comparing arequal before and after ops
        arequal_checksum_after = collect_mounts_arequal(self.mounts[0])
        self.assertEqual(arequal_checksum_before, arequal_checksum_after,
                         "arequal checksum is NOT MATCHNG")
        g.log.info("arequal checksum is SAME")
    def test_remove_brick_with_open_fd(self):
        """
        Test case:
        1. Create volume, start it and mount it.
        2. Open file datafile on mount point and start copying /etc/passwd
           line by line(Make sure that the copy is slow).
        3. Start remove-brick of the subvol to which datafile is hashed.
        4. Once remove-brick is complete compare the checksum of /etc/passwd
           and datafile.
        """
        # Open file datafile on mount point and start copying /etc/passwd
        # line by line
        ret, out, _ = g.run(self.mounts[0].client_system,
                            "cat /etc/passwd | wc -l")
        self.assertFalse(ret, "Failed to get number of lines of /etc/passwd")
        cmd = ("cd {}; exec 30<> datafile ;for i in `seq 1 {}`; do "
               "head -n $i /etc/passwd | tail -n 1 >> datafile; sleep 10; done"
               .format(self.mounts[0].mountpoint, out.strip()))

        self.list_of_io_processes = [
            g.run_async(self.mounts[0].client_system, cmd)
        ]
        self.is_copy_running = True

        # Start remove-brick of the subvol to which datafile is hashed
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        number = find_hashed_subvol(subvols, "/", 'datafile')[1]

        ret = shrink_volume(self.mnode, self.volname, subvol_num=number)
        self.assertTrue(ret, "Failed to remove-brick from volume")
        g.log.info("Remove-brick rebalance successful")

        # Validate if I/O was successful or not.
        ret = validate_io_procs(self.list_of_io_processes, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.is_copy_running = False

        # Compare md5checksum of /etc/passwd and datafile
        md5_of_original_file = get_md5sum(self.mounts[0].client_system,
                                          '/etc/passwd')
        self.assertIsNotNone(md5_of_original_file,
                             'Unable to get md5 checksum of original file')
        md5_of_copied_file = get_md5sum(
            self.mounts[0].client_system,
            '{}/datafile'.format(self.mounts[0].mountpoint))
        self.assertIsNotNone(md5_of_copied_file,
                             'Unable to get md5 checksum of copied file')
        self.assertEqual(
            md5_of_original_file.split(" ")[0],
            md5_of_copied_file.split(" ")[0],
            "md5 checksum of original and copied file didn't"
            " match")
        g.log.info("md5 checksum of original and copied files are same")
    def test_remove_brick_rebalance_files_with_holes(self):
        """
        Test case:
        1. Create a volume, start it and mount it using fuse.
        2. On the volume root, create files with holes.
        3. After the file creation is complete, remove-brick from volume.
        4. Wait for remove-brick to complete.
        """
        # On the volume root, create files with holes
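        # (dd writes a single 1M block after a large seek, so each file is
        # mostly a hole with the data at the end)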
        cmd = ("cd %s;for i in {1..2000}; do dd if=/dev/urandom"
               " of=file_with_holes$i bs=1M count=1 seek=100M; done" %
               self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.first_client, cmd)
        self.assertFalse(ret, "Failed to create files with holes")

        # After the file creation is complete, remove-brick from volume
        # Wait for remove-brick to complete
        ret = shrink_volume(self.mnode, self.volname, rebalance_timeout=16000)
        self.assertTrue(ret, "Failed to remove-brick from volume")
        g.log.info("Remove-brick rebalance successful")
    def test_remove_brick_command_basic(self):
        """
        Test case:
        1. Create a volume, start it and mount it.
        2. Create some data on the volume.
        3. Run remove-brick start, status and finally commit.
        4. Check if there is any data loss or not.
        """
        # Create some data on the volume
        self._run_io_on_mount_point()

        # Collect arequal checksum before ops
        arequal_checksum_before = collect_mounts_arequal(self.mounts[0])

        # Run remove-brick start, status and finally commit
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to remove-brick from volume")
        g.log.info("Remove-brick rebalance successful")

        # Check for data loss by comparing arequal before and after ops
        arequal_checksum_after = collect_mounts_arequal(self.mounts[0])
        self.assertEqual(arequal_checksum_before, arequal_checksum_after,
                         "arequal checksum is NOT MATCHNG")
        g.log.info("arequal checksum is SAME")
    def test_brick_removal_with_quota(self):
        """
        Test Brick removal with quota in place
        1. Create Volume of type distribute
        2. Set Quota limit on the directory
        3. Do some IO to reach the Hard limit
        4. After IO ends, remove bricks
        5. Quota validation should succeed.
        """
        # Enable Quota
        ret, _, _ = quota_enable(self.mnode, self.volname)
        self.assertEqual(
            ret, 0, ("Failed to enable quota on the volume 5s", self.volname))
        g.log.info("Successfully enabled quota on volume %s", self.volname)

        # Set the Quota timeouts to 0 for strict accounting
        ret, _, _ = quota_set_hard_timeout(self.mnode, self.volname, 0)
        self.assertEqual(
            ret, 0, ("Failed to set hard-timeout to 0 for %s", self.volname))
        ret, _, _ = quota_set_soft_timeout(self.mnode, self.volname, 0)
        self.assertEqual(
            ret, 0, ("Failed to set soft-timeout to 0 for %s", self.volname))
        g.log.info("Quota soft and hard timeout has been set to 0 for %s",
                   self.volname)

        # Set the quota limit of 100 MB on root dir of the volume
        ret, _, _ = quota_limit_usage(self.mnode, self.volname, "/", "100MB")
        self.assertEqual(ret, 0, "Failed to set Quota for dir root")
        g.log.info("Successfully set quota limit for dir root")

        # Do some IO until hard limit is reached.
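        # (100 files of 1M each ~= the 100MB hard limit set above)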
        cmd = ("/usr/bin/env python %s create_files "
               "-f 100 --fixed-file-size 1M --base-file-name file %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)

        # Wait for IO to complete and validate IO
        self.assertTrue(
            wait_for_io_to_complete(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients")
        g.log.info("IO completed on the clients")

        # Validate quota
        ret = quota_validate(self.mnode,
                             self.volname,
                             path='/',
                             hard_limit=104857600,
                             sl_exceeded=True,
                             hl_exceeded=True)
        self.assertTrue(ret, "Quota validate Failed for '/'")
        g.log.info("Quota Validated for path '/'")

        # Log Volume info and status before shrinking volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Shrink the volume.
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to shrink volume on "
                              "volume %s", self.volname))
        g.log.info("Shrinking volume is successful on "
                   "volume %s", self.volname)

        # Log volume info and status after shrinking volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Perform rebalance start operation.
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to  start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Rebalance started.")

        # Wait till rebalance ends.
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Validate quota
        ret = quota_validate(self.mnode,
                             self.volname,
                             path='/',
                             hard_limit=104857600,
                             sl_exceeded=True,
                             hl_exceeded=True)
        self.assertTrue(ret, "Quota validate Failed for '/'")
        g.log.info("Quota Validated for path '/'")
Example #11
    def test_add_brick_remove_brick_with_lookups_and_kernal_untar(self):
        """
        Test case:
        1. Enable brickmux on cluster, create a volume, start it and mount it.
        2. Start the below I/O from 4 clients:
           From client-1 : run script to create folders and files continuously
           From client-2 : start linux kernel untar
           From client-3 : while true;do find;done
           From client-4 : while true;do ls -lRt;done
        3. Kill brick process on one of the nodes.
        4. Add brick to the volume.
        5. Remove bricks from the volume.
        6. Validate if I/O was successful or not.
        """
        # Get the list of bricks (a brick process from one of these is killed later)
        bricks = get_all_bricks(self.mnode, self.volname)

        # Create a dir to start untar
        self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint,
                                              "linuxuntar")
        ret = mkdir(self.clients[0], self.linux_untar_dir)
        self.assertTrue(ret, "Failed to create dir linuxuntar for untar")

        # Start linux untar on dir linuxuntar
        ret = run_linux_untar(self.clients[0],
                              self.mounts[0].mountpoint,
                              dirs=tuple(['linuxuntar']))
        self.list_of_io_processes += ret
        self.is_io_running = True

        # Run script to create folders and files continuously
        cmd = ("/usr/bin/env python {} create_deep_dirs_with_files "
               "--dirname-start-num 758 --dir-depth 2 "
               "--dir-length 100 --max-num-of-dirs 10 --num-of-files 105 {}".
               format(self.script_upload_path, self.mounts[1].mountpoint))
        ret = g.run_async(self.mounts[1].client_system, cmd)
        self.list_of_io_processes += [ret]

        # Run lookup operations from 2 clients
        cmd = ("cd {}; for i in `seq 1 1000000`;do find .; done".format(
            self.mounts[2].mountpoint))
        ret = g.run_async(self.mounts[2].client_system, cmd)
        self.list_of_io_processes += [ret]

        cmd = ("cd {}; for i in `seq 1 1000000`;do ls -lRt; done".format(
            self.mounts[3].mountpoint))
        ret = g.run_async(self.mounts[3].client_system, cmd)
        self.list_of_io_processes += [ret]

        # Kill brick process of one of the nodes.
        brick = choice(bricks)
        node, _ = brick.split(":")
        ret = kill_process(node, process_names="glusterfsd")
        self.assertTrue(ret,
                        "Failed to kill brick process of brick %s" % brick)

        # Add brick to volume
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, "Failed to add brick on volume %s" % self.volname)
        g.log.info("Add brick to volume successful")

        # Remove bricks from the volume
        ret = shrink_volume(self.mnode, self.volname, rebalance_timeout=2400)
        self.assertTrue(ret, "Failed to remove-brick from volume")
        g.log.info("Remove-brick rebalance successful")

        # Validate if I/O was successful or not.
        ret = validate_io_procs(self.list_of_io_processes, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.is_io_running = False
    def _remove_brick_from_volume(self):
        """Remove bricks from volume"""
        # Remove bricks from the volume
        ret = shrink_volume(self.mnode, self.volname, rebalance_timeout=2000)
        self.assertTrue(ret, "Failed to remove-brick from volume")
        g.log.info("Remove-brick rebalance successful")
Example #13
    def test_subdir_with_removebrick(self):

        # pylint: disable=too-many-statements
        """
        Mount the volume
        Create 2 subdirs on the client: subdir1 and subdir2
        Auth allow - Client1(subdir1,subdir2), Client2(subdir1,subdir2)
        Mount the subdirs on their respective clients
        Start IO on both subdirs
        Perform remove-brick
        Validate on the clients that the subdirs are still mounted after
        the remove-brick operation is performed
        """
        # Create directories subdir1 and subdir2 on the mount point
        ret = mkdir(self.mounts[0].client_system,
                    "%s/subdir1" % self.mounts[0].mountpoint)
        self.assertTrue(
            ret, ("Failed to create directory 'subdir1' in"
                  "volume %s from client %s" %
                  (self.mounts[0].volname, self.mounts[0].client_system)))
        ret = mkdir(self.mounts[0].client_system,
                    "%s/subdir2" % self.mounts[0].mountpoint)
        self.assertTrue(
            ret, ("Failed to create directory 'subdir2' in"
                  "volume %s from client %s" %
                  (self.mounts[0].volname, self.mounts[0].client_system)))
        # Unmount the volume
        ret = self.unmount_volume(self.mounts)
        self.assertTrue(ret, "Failed to unmount volumes")
        g.log.info("Volumes unmounted successfully")

        # Set authentication on the subdirectory subdir1
        # and subdir2 to access by 2 clients
        g.log.info(
            'Setting authentication on subdir1 and subdir2 '
            'for clients %s and %s', self.clients[0], self.clients[1])
        ret = set_auth_allow(
            self.volname, self.mnode, {
                '/subdir1': [self.clients[0], self.clients[1]],
                '/subdir2': [self.clients[0], self.clients[1]]
            })
        self.assertTrue(
            ret, 'Failed to set authentication on volume %s' % self.volname)

        self.mpoint = "/mnt/Mount_Point1"

        # Mount subdir1 on client 1
        _, _, _ = mount_volume("%s/subdir1" % self.volname, self.mount_type,
                               self.mpoint, self.mnode, self.clients[0])

        # Checking subdir1 is mounted or not
        ret = is_mounted("%s/subdir1" % self.volname, self.mpoint, self.mnode,
                         self.clients[0], self.mount_type)
        self.assertTrue(ret,
                        "Volume not mounted on mount point: %s" % self.mpoint)
        g.log.info("Volume %s mounted on %s/subdir1", self.volname,
                   self.mpoint)

        # Mount subdir2 on client 2
        _, _, _ = mount_volume("%s/subdir2" % self.volname, self.mount_type,
                               self.mpoint, self.mnode, self.clients[1])

        # Checking subdir2 is mounted or not
        ret = is_mounted("%s/subdir2" % self.volname, self.mpoint, self.mnode,
                         self.clients[1], self.mount_type)
        self.assertTrue(ret,
                        "Volume not mounted on mount point: %s" % self.mpoint)
        g.log.info("Volume %s mounted on %s/subdir2", self.volname,
                   self.mpoint)

        # Start IO on all the subdir mounts.
        self.subdir_mounts = [
            copy.deepcopy(self.mounts[0]),
            copy.deepcopy(self.mounts[1])
        ]
        self.subdir_mounts[0].volname = "%s/subdir1" % self.volname
        self.subdir_mounts[1].volname = "%s/subdir2" % self.volname
        all_mounts_procs = []
        count = 1
        for mount_obj in self.subdir_mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       self.mpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, self.mpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Validate IO
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.subdir_mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Get stat of all the files/dirs created.
        g.log.info("Get stat of all the files/dirs created.")
        ret = get_mounts_stat(self.subdir_mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")

        # Perform remove brick operation when subdir is mounted on client
        g.log.info("Start removing bricks from volume")
        ret = shrink_volume(self.mnode, self.volname, rebalance_timeout=600)
        self.assertTrue(ret, ("Remove brick operation failed on "
                              "%s", self.volname))
        g.log.info("Remove brick operation is successful on "
                   "volume %s", self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("All volume %s processes failed to come up "
                              "online", self.volname))
        g.log.info("All volume %s processes came up "
                   "online successfully", self.volname)

        # Log Volume Info and Status after performing remove brick
        g.log.info("Logging volume info and Status after shrinking volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Again Checking subdir1 is mounted or not on Client 1
        ret = is_mounted("%s/subdir1" % self.volname, self.mpoint, self.mnode,
                         self.clients[0], self.mount_type)
        self.assertTrue(ret,
                        "Volume not mounted on mount point: %s" % self.mpoint)
        g.log.info("Volume %s mounted on %s/subdir1", self.volname,
                   self.mpoint)

        # Again Checking subdir2 is mounted or not on Client 2
        ret = is_mounted("%s/subdir2" % self.volname, self.mpoint, self.mnode,
                         self.clients[1], self.mount_type)
        self.assertTrue(ret,
                        "Volume not mounted on mount point: %s" % self.mpoint)
        g.log.info("Volume %s mounted on %s/subdir2", self.volname,
                   self.mpoint)
Example #14
    def test_validate_snaps_restore(self):
        # pylint: disable=too-many-statements
        # Start IO on all mounts.
        all_mounts_procs = []
        count = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Validate IO
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Get stat of all the files/dirs created.
        g.log.info("Get stat of all the files/dirs created.")
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")

        # Setting some volume option related to snapshot
        option_before_restore = {
            'volumeConfig': [{
                'softLimit': '100',
                'effectiveHardLimit': '200',
                'hardLimit': '256'
            }],
            'systemConfig': {
                'softLimit': '90%',
                'activateOnCreate': 'disable',
                'hardLimit': '256',
                'autoDelete': 'disable'
            }
        }
        ret = set_snap_config(self.mnode, option_before_restore)
        self.assertTrue(ret,
                        ("Failed to set vol option on %s" % self.volname))
        g.log.info("Volume options for %s set successfully", self.volname)

        # Get brick list before taking snap_restore
        bricks_before_snap_restore = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List before snap restore "
                   "volume: %s", bricks_before_snap_restore)

        # Creating snapshot
        ret = snap_create(self.mnode, self.volname, "snap1")
        self.assertTrue(ret,
                        ("Failed to create snapshot for %s" % self.volname))
        g.log.info("Snapshot snap1 created successfully for volume  %s",
                   self.volname)

        # Again start IO on all mounts.
        all_mounts_procs = []
        count = 1000
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Validate IO
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Get stat of all the files/dirs created.
        g.log.info("Get stat of all the files/dirs created.")
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")

        # Reset volume to make sure volume options will reset
        ret = volume_reset(self.mnode, self.volname, force=False)
        self.assertTrue(ret, ("Failed to reset %s" % self.volname))
        g.log.info("Reset Volume %s is Successful", self.volname)

        # Removing one brick
        g.log.info("Starting volume shrink")
        ret = shrink_volume(self.mnode, self.volname, force=True)
        self.assertTrue(ret, ("Failed to shrink the volume on "
                              "volume %s", self.volname))
        g.log.info("Shrinking volume is successful on "
                   "volume %s", self.volname)

        # Restore snapshot
        ret = snap_restore_complete(self.mnode, self.volname, "snap1")
        self.assertTrue(ret, ("Failed to restore snap snap1 on the "
                              "volume %s", self.volname))
        g.log.info(
            "Restore of volume is successful from snap1 on "
            "volume  %s", self.volname)

        # Validate volume is up and running
        g.log.info("Verifying volume is up and process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Get volume options post restore
        option_after_restore = get_snap_config(self.mnode)
        # Compare volume options
        self.assertNotEqual(option_before_restore, option_after_restore,
                            "Volume options are same after snap restore")

        # Get brick list post restore
        bricks_after_snap_restore = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List after snap restore "
                   "volume: %s", bricks_after_snap_restore)
        # Compare brick_list
        self.assertNotEqual(bricks_before_snap_restore,
                            bricks_after_snap_restore,
                            "Bricks are not same after snap restore")

        # Creating snapshot
        ret = snap_create(self.mnode, self.volname, "snap2")
        self.assertTrue(ret,
                        ("Failed to create snapshot for %s" % self.volname))
        g.log.info("Snapshot snap2 created successfully for volume  %s",
                   self.volname)

        # Again start IO on all mounts after restore
        all_mounts_procs = []
        count = 1000
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Validate IO
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Get stat of all the files/dirs created.
        g.log.info("Get stat of all the files/dirs created.")
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")
Example #15
    def test_remove_faulty_subvol_and_add_new_subvol(self):
        """
        - Create a distributed-replicate arbiter volume
        - Start IO
        - Calculate arequal
        - Remove a subvolume
        - Calculate arequal and compare with previous
        - Expand the volume
        - Do rebalance and check status
        - Calculate arequal and compare with previous
        """
        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create dirs with file
            g.log.info('Creating dirs with file...')
            command = ("python %s create_deep_dirs_with_files "
                       "-d 2 "
                       "-l 2 "
                       "-n 2 "
                       "-f 20 "
                       "%s"
                       % (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Get arequal before removing subvol
        g.log.info('Getting arequal before removing subvol...')
        ret, result_before_remove_subvol = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before removing subvol is successful')

        # Remove subvol
        g.log.info('Removing subvol...')
        ret = shrink_volume(self.mnode, self.volname, subvol_num=1)
        self.assertTrue(ret, 'Failed to remove subvolume')
        g.log.info('Removed subvolume successfully')

        # Get arequal after removing subvol
        g.log.info('Getting arequal after removing subvol...')
        ret, result_after_remove_subvol = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after removing subvol is successful')

        # Comparing arequal before removing subvol
        # and after removing subvol
        self.assertEqual(result_before_remove_subvol,
                         result_after_remove_subvol,
                         'Arequals before removing subvol and '
                         'after removing subvol are not equal')
        g.log.info('Arequals before removing subvol and '
                   'after removing subvol are equal')

        # Expand volume
        g.log.info('Expanding volume %s...', self.volname)
        ret = expand_volume(self.mnode, self.volname,
                            self.servers, self.all_servers_info)
        self.assertTrue(ret, 'Failed to expand volume %s' % self.volname)
        g.log.info('Expanded volume %s successfully', self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed "
                              "on volume %s", self.volname))
        g.log.info("Successful in logging volume info and status "
                   "of volume %s", self.volname)

        # Do rebalance
        g.log.info('Starting rebalance...')
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Failed to start rebalance')
        g.log.info('Rebalance is started')

        g.log.info('Waiting for rebalance to complete...')
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Rebalance is not completed')
        g.log.info('Rebalance is completed successfully')

        # Get arequal after expanding volume
        g.log.info('Getting arequal after expanding volume...')
        ret, result_after_expand_volume = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after expanding volume is successful')

        # Comparing arequal before removing subvol
        # and after expanding volume
        self.assertEqual(result_before_remove_subvol,
                         result_after_expand_volume,
                         'Arequals before removing subvol and '
                         'after expanding volume are not equal')
        g.log.info('Arequals before removing subvol and '
                   'after expanding volume are equal')
Example #16
    def test_glustershd_with_add_remove_brick(self):
        """
        Test script to verify glustershd process with adding and
        removing bricks

        * check glustershd process - only 1 glustershd process should
          be running
        * bricks must be present in glustershd-server.vol file for
          the replicated involved volumes
        * Add bricks
        * check glustershd process - only 1 glustershd process should
          be running and it should be different from the previous one
        * bricks which are added must be present in glustershd-server.vol file
        * remove bricks
        * check glustershd process - only 1 glustershd process should
          be running and it should be different from the previous one
        * bricks which are removed should not be present
          in glustershd-server.vol file

        """
        # pylint: disable=too-many-statements
        nodes = self.volume['servers']
        bricks_list = []
        glustershd_pids = {}

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s", pids))
        g.log.info("Successful in getting Single self heal daemon process"
                   " on all nodes %s", nodes)
        glustershd_pids = pids

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # validate the bricks present in volume info with
        # glustershd server volume file
        g.log.info("Starting parsing file %s on "
                   "node %s", self.glustershd, self.mnode)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list)
        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file. "
                              "Please check log file for details"))
        g.log.info("Successfully parsed %s file", self.glustershd)

        # expanding volume
        g.log.info("Start adding bricks to volume %s", self.volname)
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to add bricks to "
                              "volume %s " % self.volname))
        g.log.info("Add brick successful")

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed "
                              "on volume %s", self.volname))
        g.log.info("Successful in logging volume info and status "
                   "of volume %s", self.volname)

        # Verify volume's all process are online for 60 sec
        g.log.info("Verifying volume's all process are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   60)
        self.assertTrue(ret, ("Volume %s : All process are not "
                              "online", self.volname))
        g.log.info("Successfully Verified volume %s processes are online",
                   self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, err = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on "
                                  "the volume %s with error %s" %
                                  (self.volname, err)))
        g.log.info("Successfully started rebalance on the "
                   "volume %s", self.volname)

        # Log Rebalance status
        g.log.info("Log Rebalance status")
        _, _, _ = rebalance_status(self.mnode, self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete "
                              "on the volume %s", self.volname))
        g.log.info("Rebalance is successfully complete on "
                   "the volume %s", self.volname)

        # Check Rebalance status after rebalance is complete
        g.log.info("Checking Rebalance status")
        ret, _, _ = rebalance_status(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to get rebalance status for "
                                  "the volume %s", self.volname))
        g.log.info("Successfully got rebalance status of the "
                   "volume %s", self.volname)

        # Check the self-heal daemon process after adding bricks
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        glustershd_pids_after_expanding = {}
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        g.log.info("Successful in getting self-heal daemon process "
                   "on nodes %s", nodes)

        glustershd_pids_after_expanding = pids
        g.log.info("Self Heal Daemon Process ID's after expanding "
                   "volume: %s", glustershd_pids_after_expanding)

        self.assertNotEqual(glustershd_pids,
                            glustershd_pids_after_expanding,
                            "Self Daemon process is same before and"
                            " after adding bricks")
        g.log.info("Self Heal Daemon Process is different before and "
                   "after adding bricks")

        # get the bricks for the volume after expanding
        bricks_list_after_expanding = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List after expanding "
                   "volume: %s", bricks_list_after_expanding)

        # validate the bricks present in volume info
        # with glustershd server volume file after adding bricks
        g.log.info("Starting parsing file %s", self.glustershd)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list_after_expanding)

        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file after "
                              "expanding bricks. Please check log file "
                              "for details"))
        g.log.info("Successfully parsed %s file", self.glustershd)

        # shrink the volume
        g.log.info("Starting volume shrink")
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to shrink the volume on "
                              "volume %s", self.volname))
        g.log.info("Shrinking volume is successful on "
                   "volume %s", self.volname)

        # Log Volume Info and Status after shrinking the volume
        g.log.info("Logging volume info and Status after shrinking volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status "
                   "of volume %s", self.volname)

        # get the bricks after shrinking the volume
        bricks_list_after_shrinking = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List after shrinking "
                   "volume: %s", bricks_list_after_shrinking)

        self.assertEqual(len(bricks_list_after_shrinking), len(bricks_list),
                         "Brick Count is mismatched after "
                         "shrinking the volume %s" % self.volname)
        g.log.info("Brick Count matched before before expanding "
                   "and after shrinking volume")

        # Verify glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))

        # check the self-heal daemon process after removing bricks
        g.log.info("Starting to get self-heal daemon process "
                   "on nodes %s", nodes)
        glustershd_pids_after_shrinking = {}
        ret, pids = get_self_heal_daemon_pid(nodes)
        glustershd_pids_after_shrinking = pids
        self.assertNotEqual(glustershd_pids_after_expanding,
                            glustershd_pids_after_shrinking,
                            "Self Heal Daemon process is same "
                            "after adding bricks and shrinking volume")
        g.log.info("Self Heal Daemon Process is different after adding bricks "
                   "and shrinking volume")

        # validate bricks present in volume info
        # with glustershd server volume file after removing bricks
        g.log.info("Starting parsing file %s", self.glustershd)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list_after_shrinking)
        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file after "
                              "removing bricks. Please check log file "
                              "for details"))
        g.log.info("Successfully parsed %s file", self.glustershd)
    def test_client_io_threads_on_replicate_volumes(self):
        """
        Test case 1:
        1. Create distrubuted volume and start it.
        2. Check the value of performance.client-io-threads it should be ON.
        3. io-threads should be loaded in trusted-.tcp-fuse.vol.
        4. Add brick to convert to replicate volume.
        5. Check the value of performance.client-io-threads it should be OFF.
        6. io-threads shouldn't be loaded in trusted-.tcp-fuse.vol.
        7. Remove brick so thate volume type is back to distributed.
        8. Check the value of performance.client-io-threads it should be ON.
        9. performance.client-io-threads should be loaded in
           trusted-.tcp-fuse.vol.

        Test case 2:
        1. Create a replicate volume and start it.
        2. Set performance.client-io-threads to ON.
        3. Check the value of performance.client-io-threads it should be ON.
        4. io-threads should be loaded in trusted-.tcp-fuse.vol.
        5. Add bricks to convert to make the volume 2x3.
        6. Check the value of performance.client-io-threads it should be ON.
        7. io-threads should be loaded in trusted-.tcp-fuse.vol.
        8. Remove brick to make the volume 1x3 again.
        9. Check the value of performance.client-io-threads it should be ON.
        10. performance.client-io-threads should be loaded in
            trusted-.tcp-fuse.vol.
        """
        # If volume type is distributed then run test case 1.
        if self.volume_type == "distributed":

            # Check the value of performance.client-io-threads it should be ON
            # and io-threads should be loaded in trusted-.tcp-fuse.vol
            self._check_value_of_performance_client_io_threads()

            # Add brick to convert to replicate volume
            brick = form_bricks_list_to_add_brick(self.mnode, self.volname,
                                                  self.servers,
                                                  self.all_servers_info)
            self.assertIsNotNone(brick,
                                 "Failed to form brick list to add brick")

            ret, _, _ = add_brick(self.mnode, self.volname, brick,
                                  force=True, replica_count=2)
            self.assertFalse(ret, "Failed to add brick on volume %s"
                             % self.volname)
            g.log.info("Add-brick successful on volume")

            # Check the value of performance.client-io-threads it should be ON
            # and io-threads should be loaded in trusted-.tcp-fuse.vol
            self._check_value_of_performance_client_io_threads(enabled=False)

            # Remove brick so that the volume type is back to distributed
            ret = shrink_volume(self.mnode, self.volname, replica_num=1)
            self.assertTrue(ret, "Failed to remove-brick from volume")
            g.log.info("Remove-brick successful on volume")

            # Check the value of performance.client-io-threads it should be ON
            # and io-threads should be loaded in trusted-.tcp-fuse.vol
            self._check_value_of_performance_client_io_threads()

        # If volume type is replicated then run test case 2.
        else:
            # Set performance.client-io-threads to ON
            options = {"performance.client-io-threads": "on"}
            ret = set_volume_options(self.mnode, self.volname, options)
            self.assertTrue(ret, "Unable to set volume option %s for"
                            "volume %s" % (options, self.volname))
            g.log.info("Successfully set %s for volume %s",
                       options, self.volname)

            # Check the value of performance.client-io-threads it should be ON
            # and io-threads should be loaded in trusted-.tcp-fuse.vol
            self._check_value_of_performance_client_io_threads()

            # Add bricks to convert to make the volume 2x3
            ret = expand_volume(self.mnode, self.volname, self.servers,
                                self.all_servers_info)
            self.assertTrue(ret, "Failed to add brick on volume %s"
                            % self.volname)
            g.log.info("Add-brick successful on volume")

            # Check the value of performance.client-io-threads it should be ON
            # and io-threads should be loaded in trusted-.tcp-fuse.vol
            self._check_value_of_performance_client_io_threads()

            # Remove brick to make the volume 1x3 again
            ret = shrink_volume(self.mnode, self.volname)
            self.assertTrue(ret, "Failed to remove-brick from volume")
            g.log.info("Remove-brick successful on volume")

            # Check the value of performance.client-io-threads it should be ON
            # and io-threads should be loaded in trusted-.tcp-fuse.vol
            self._check_value_of_performance_client_io_threads()
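The helper _check_value_of_performance_client_io_threads referenced above is not shown in this snippet. Below is a rough sketch of what such a check could look like, assuming get_volume_options() from glustolibs.gluster.volume_ops and the conventional client volfile path under /var/lib/glusterd; both the path and the exact option string returned are assumptions, not values taken from the original class.

    # Hypothetical sketch only -- the real helper may differ.
    def _check_value_of_performance_client_io_threads(self, enabled=True):
        """Check the option value and whether io-threads is in the volfile."""
        expected = 'on' if enabled else 'off'

        # Assumes get_volume_options() returns a plain 'on'/'off' string here
        opts = get_volume_options(self.mnode, self.volname,
                                  'performance.client-io-threads')
        self.assertIsNotNone(opts, "Failed to get volume options")
        self.assertEqual(opts['performance.client-io-threads'], expected,
                         "performance.client-io-threads is not %s" % expected)

        # Assumed volfile location; grep exits 0 only when io-threads is loaded
        volfile = ("/var/lib/glusterd/vols/{0}/trusted-{0}.tcp-fuse.vol"
                   .format(self.volname))
        ret, _, _ = g.run(self.mnode, "grep -q io-threads %s" % volfile)
        self.assertEqual(ret, 0 if enabled else 1,
                         "io-threads presence in %s does not match the "
                         "expected state" % volfile)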
    def test_replicated_to_arbiter_volume(self):
        """
        Description:-
        Reduce the replica count from replica 3 to arbiter
        """
        # Log Volume Info and Status
        g.log.info("Logging volume info and Status")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Remove brick to reduce the replica count from replica 3
        g.log.info("Removing bricks to form replica 2 volume")
        subvol_num = 1
        replica_num = 1
        reduce_count = {'replica_count': 2, 'distribute_count': 1}
        ret = shrink_volume(self.mnode, self.volname, subvol_num,
                            replica_num, force=True, **reduce_count)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Log Volume Info and Status after changing
        # replica 3 to replica 2
        g.log.info("Logging volume info and Status after reducing the "
                   "volume to replica 2")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s process not online despite waiting "
                              "for 300 seconds", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verifying all bricks online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online",
                              self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Adding the bricks to make it an arbiter volume
        g.log.info("Adding bricks to convert to Arbiter Volume")
        replica_arbiter = {'replica_count': 3, 'arbiter_count': 1}
        ret = expand_volume(self.mnode, self.volname, self.servers[2:],
                            self.all_servers_info, add_to_hot_tier=False,
                            **replica_arbiter)
        self.assertTrue(ret, ("Failed to expand the volume  %s", self.volname))
        g.log.info("Changing volume to arbiter volume is successfull %s",
                   self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online",
                              self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Log Rebalance status
        g.log.info("Log Rebalance status")
        _, _, _ = rebalance_status(self.mnode, self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)
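For reference, shrink_volume() and expand_volume() with the replica/arbiter keyword arguments ultimately drive the plain gluster CLI. The sketch below shows rough, hypothetical equivalents issued directly with g.run(); the brick paths are placeholders supplied by the caller, not values from the test above, and real tests should keep using the helpers.

    # Hypothetical sketch only -- not part of the original test class.
    def _convert_replica3_to_arbiter_via_cli(self, data_brick, arbiter_brick):
        """Reduce replica 3 to 2, then add an arbiter brick per subvolume."""
        # Drop one data brick per subvolume (replica 3 -> replica 2)
        cmd = ("gluster --mode=script volume remove-brick %s replica 2 %s "
               "force" % (self.volname, data_brick))
        ret, _, err = g.run(self.mnode, cmd)
        self.assertEqual(ret, 0, "remove-brick failed: %s" % err)

        # Add an arbiter brick (replica 2 -> replica 3 arbiter 1)
        cmd = ("gluster --mode=script volume add-brick %s replica 3 "
               "arbiter 1 %s" % (self.volname, arbiter_brick))
        ret, _, err = g.run(self.mnode, cmd)
        self.assertEqual(ret, 0, "add-brick failed: %s" % err)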
    def test_disperse_removebrick(self):

        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        """
        - Write IO's
        - Start remove brick
        - Validate IOs
        - Start rebalance
        - Wait for rebalance to complete
        - Start IO's and validate IO's
        """

        # Write IO
        all_mounts_procs = []
        count = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Start remove-brick (subvolume-decrease)
        g.log.info("Start removing bricks from volume")
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, ("Remove brick operation failed on "
                              "%s", self.volname))
        g.log.info("Remove brick operation is successful on "
                   "volume %s", self.volname)

        # Log Volume Info and Status after shrinking the volume
        g.log.info("Logging volume info and Status after shrinking volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("All process  for volume %s are not"
                              "online", self.volname))
        g.log.info("All volume %s processes are now online", self.volname)

        # Validating IO's and waiting to complete
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Start IO on all mounts after rebalance completes
        all_mounts_procs = []
        count = 21
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Validate IO
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")
Ejemplo n.º 20
0
    def test_replicated_to_arbiter_volume_change_with_volume_ops(self):
        """
        - Change the volume type from replicated to arbiter
        - Perform add-brick, rebalance on arbitered volume
        - Perform replace-brick on arbitered volume
        - Perform remove-brick on arbitered volume
        """
        # pylint: disable=too-many-statements

        # Start IO on mounts
        self.all_mounts_procs = []
        g.log.info("Starting IO on %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dirname-start-num 10 --dir-depth 1 --dir-length 1 "
               "--max-num-of-dirs 1 --num-of-files 5 %s" % (
                   self.script_upload_path,
                   self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system, cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Adding bricks to make an Arbiter Volume
        g.log.info("Adding bricks to convert to Arbiter Volume")
        ret = expand_volume(self.mnode, self.volname, self.servers[2:],
                            self.all_servers_info, replica_count=1,
                            arbiter_count=1)
        self.assertTrue(ret, ("Failed to expand the volume  %s", self.volname))
        g.log.info("Changing volume to arbiter volume is successful %s",
                   self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verifying all bricks online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online",
                              self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Checking for heals to finish after changing the volume type
        # from replicated to arbitered volume
        g.log.info("Wait for self-heal to complete after changing the "
                   "volume type from replicated to arbitered volume")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, ("Self heal didn't complete even after waiting "
                              "for 20 minutes."))
        g.log.info("self-heal is successful after changing the volume type "
                   "from replicated to arbitered volume")

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Start IO on mounts
        self.all_mounts_procs = []
        g.log.info("Starting IO on %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dirname-start-num 10 --dir-depth 1 --dir-length 1 "
               "--max-num-of-dirs 1 --num-of-files 5 %s" % (
                   self.script_upload_path,
                   self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system, cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Start add-brick (subvolume-increase)
        g.log.info("Start adding bricks to volume when IO in progress")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info("Expanding volume when IO in progress is successful on "
                   "volume %s", self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online",
                              self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Log Rebalance status
        g.log.info("Log Rebalance status")
        _, _, _ = rebalance_status(self.mnode, self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname, 600)
        self.assertTrue(ret, "Rebalance did not start "
                             "despite waiting for 5 mins")
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Replace brick from a sub-volume
        g.log.info("Replace a faulty brick from the volume")
        ret = replace_brick_from_volume(self.mnode, self.volname,
                                        self.servers, self.all_servers_info)
        self.assertTrue(ret, "Failed to replace faulty brick from the volume")
        g.log.info("Successfully replaced faulty brick from the volume")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online",
                              self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Self heal didn't complete even after waiting "
                        "for 20 minutes. 20 minutes is too much a time for "
                        "current test workload")
        g.log.info("self-heal is successful after replace-brick operation")

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Shrinking volume by removing bricks from volume when IO in progress
        g.log.info("Start removing bricks from volume when IO in progress")
        ret = shrink_volume(self.mnode, self.volname, rebalance_timeout=900)
        self.assertTrue(ret, ("Failed to shrink the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info("Shrinking volume when IO in progress is successful on "
                   "volume %s", self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online after "
                   "shrinking volume")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online",
                              self.volname))
        g.log.info("Volume %s : All process are online after shrinking volume",
                   self.volname)

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True
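The wait-for-processes / verify-all-online pair recurs after every volume operation in this test. A minimal, hypothetical wrapper using the same glustolibs helpers already used above is sketched below; it is not part of the original test class.

    # Hypothetical sketch only -- not part of the original test class.
    def _wait_and_verify_volume_processes(self):
        """Wait for all volume processes and assert every one is online."""
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Volume %s processes did not come online in time"
                        % self.volname)
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, "Volume %s : all processes are not online"
                        % self.volname)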
Ejemplo n.º 21
0
    def test_shrinking_volume_when_io_in_progress(self):
        """Test shrinking volume (Decrease distribute count) using existing
        servers bricks when IO is in progress.

        Description:
            - remove brick (start, status, commit)
            - validate IO
        """
        # Log Volume Info and Status before shrinking the volume.
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Temporary code:
        # Additional checks to gather information from all
        # servers for Bug 1810901 and setting log level to debug.
        if self.volume_type == 'distributed-dispersed':
            for brick_path in get_all_bricks(self.mnode, self.volname):
                node, path = brick_path.split(':')
                ret, out, _ = g.run(node, 'find {}/'.format(path))
                g.log.info(out)
                for filedir in out.split('\n'):
                    ret, out, _ = g.run(node, 'ls -l {}'.format(filedir))
                    g.log.info("Return value for ls -l command: %s", ret)
                    g.log.info(out)
                    ret = get_fattr_list(node, filedir, encode_hex=True)
                    g.log.info(ret)

        # Shrinking volume by removing bricks from volume when IO in progress
        ret = shrink_volume(self.mnode, self.volname)

        # Temporary code:
        # Additional checks to gather information from all
        # servers for Bug 1810901.
        # Use local names below so that the shrink_volume() result stored in
        # 'ret' is not clobbered before it is asserted further down.
        if not ret and self.volume_type == 'distributed-dispersed':
            for brick_path in get_all_bricks(self.mnode, self.volname):
                node, path = brick_path.split(':')
                _, find_out, _ = g.run(node, 'find {}/'.format(path))
                g.log.info(find_out)
                for filedir in find_out.split('\n'):
                    ls_ret, ls_out, _ = g.run(node, 'ls -l {}'.format(filedir))
                    g.log.info("Return value for ls -l command: %s", ls_ret)
                    g.log.info(ls_out)
                    fattrs = get_fattr_list(node, filedir, encode_hex=True)
                    g.log.info(fattrs)

        self.assertTrue(ret, ("Failed to shrink the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info(
            "Shrinking volume when IO in progress is successful on "
            "volume %s", self.volname)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))

        # Log Volume Info and Status after shrinking the volume
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")

        # List all files and dirs created
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")