Example #1
    def test_replace_brick_when_io_in_progress(self):
        """Test replacing brick using existing servers bricks when IO is
            in progress.

        Description:
            - replace_brick
            - wait for heal to complete
            - validate IO
        """
        # Log Volume Info and Status before replacing brick from the volume.
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Replace brick from a sub-volume
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Failed to replace faulty brick from the volume")
        g.log.info("Successfully replaced faulty brick from the volume")

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))

        # Log Volume Info and Status after replacing the brick
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify all volume processes are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All processes are not online", self.volname))

        # Wait for self-heal to complete
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=1800)
        self.assertTrue(
            ret, "Self-heal didn't complete within 30 minutes, which is "
            "ample time for the current test workload")

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")

        # List all files and dirs created
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
Example #2
    def _replace_one_random_brick(self):
        """Replace one random brick from the volume"""
        brick = choice(get_all_bricks(self.mnode, self.volname))
        ret = replace_brick_from_volume(self.mnode,
                                        self.volname,
                                        self.servers,
                                        self.all_servers_info,
                                        src_brick=brick)
        self.assertTrue(ret, "Failed to replace brick %s" % brick)
        g.log.info("Successfully replaced brick %s", brick)

    def _replace_a_old_added_brick(self, brick_to_be_replaced):
        """Replace an old brick from the volume"""
        ret = replace_brick_from_volume(self.mnode,
                                        self.volname,
                                        self.servers,
                                        self.all_servers_info,
                                        src_brick=brick_to_be_replaced)
        self.assertTrue(ret,
                        "Failed to replace brick %s" % brick_to_be_replaced)
        g.log.info("Successfully replaced brick %s", brick_to_be_replaced)

    def _replace_bricks_and_wait_for_heal_completion(self):
        """ Replaces all the bricks and waits for the heal to complete"""
        existing_bricks = get_all_bricks(self.mnode, self.volname)
        for brick_to_replace in existing_bricks:
            ret = replace_brick_from_volume(self.mnode,
                                            self.volname,
                                            self.servers,
                                            self.all_servers_info,
                                            src_brick=brick_to_replace)
            self.assertTrue(ret, "Replace of %s failed" % brick_to_replace)
            g.log.info("Replace of brick %s successful for volume %s",
                       brick_to_replace, self.volname)

            # Monitor heal completion
            ret = monitor_heal_completion(self.mnode, self.volname)
            self.assertTrue(ret, 'Heal has not yet completed')
            g.log.info('Heal has completed successfully')
    def test_glusterd_replace_brick(self):
        """
        Create a volume and start it.
        - Get list of all the bricks which are online
        - Select a brick randomly from the bricks which are online
        - Form a non-existing brick path on node where the brick has to replace
        - Perform replace brick and it should fail
        - Form a new brick which valid brick path replace brick should succeed
        """
        # pylint: disable=too-many-function-args
        # Getting all the bricks which are online
        bricks_online = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(bricks_online, "Unable to get the online bricks")
        g.log.info("got the brick list from the volume")

        # Getting one random brick from the online bricks to be replaced
        brick_to_replace = random.choice(bricks_online)
        g.log.info("Brick to replace %s", brick_to_replace)
        node_for_brick_replace = brick_to_replace.split(':')[0]
        new_brick_to_replace = form_bricks_list(self.mnode, self.volname, 1,
                                                node_for_brick_replace,
                                                self.all_servers_info)

        # Perform replace brick with a non-existing brick path
        path = ":/brick/non_existing_path"
        non_existing_path = node_for_brick_replace + path

        # Replace brick for non-existing path
        ret, _, _ = replace_brick(self.mnode, self.volname, brick_to_replace,
                                  non_existing_path)
        self.assertNotEqual(ret, 0, ("Replace brick with commit force"
                                     " on a non-existing brick passed"))
        g.log.info("Replace brick with non-existing brick with commit"
                   "force failed as expected")

        # Call replace brick by passing brick_to_replace and
        # new_brick_to_replace with a valid brick path
        ret = replace_brick_from_volume(self.mnode,
                                        self.volname,
                                        self.servers,
                                        self.all_servers_info,
                                        brick_to_replace,
                                        new_brick_to_replace[0],
                                        delete_brick=True)
        self.assertTrue(ret, ("Replace brick with commit force failed"))

        # Validate whether the replaced brick is online
        halt = 20
        counter = 0
        _rc = False
        g.log.info("Wait a few seconds for the replaced brick "
                   "to come online")
        while counter < halt:
            ret = are_bricks_online(self.mnode, self.volname,
                                    new_brick_to_replace)
            if not ret:
                g.log.info("The replaced brick isn't online, "
                           "Retry after 2 seconds .......")
                time.sleep(2)
                counter = counter + 2
            else:
                _rc = True
                g.log.info("The replaced brick is online after being replaced")
                break
        if not _rc:
            raise ExecutionError("The replaced brick isn't online")
    def test_replace_brick_when_io_in_progress(self):
        """Test replacing brick using existing servers bricks when IO is
            in progress.

        Description:
            - replace_brick
            - wait for heal to complete
            - validate IO
        """
        # Log Volume Info and Status before replacing brick from the volume.
        g.log.info(
            "Logging volume info and Status before replacing brick "
            "from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Replace brick from a sub-volume
        g.log.info("Replace a faulty brick from the volume")
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Failed to replace faulty brick from the volume")
        g.log.info("Successfully replaced faulty brick from the volume")

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))

        # Log Volume Info and Status after replacing the brick
        g.log.info(
            "Logging volume info and Status after replacing brick "
            "from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify all volume processes are online
        g.log.info("Verifying all volume processes are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All processes are not online", self.volname))
        g.log.info("Volume %s : All processes are online", self.volname)

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, "Self-heal didn't complete within 20 minutes, which is "
            "ample time for the current test workload")
        g.log.info("self-heal is successful after replace-brick operation")

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
Example #7
    def test_ec_replace_brick(self):
        """
        - Start resource consumption tool
        - Create directory dir1
        - Create 5 directory and 5 files in dir of mountpoint
        - Rename all files inside dir1 at mountpoint
        - Create softlink and hardlink of files in dir1 of mountpoint
        - Delete op for deleting all file in one of the dirs inside dir1
        - Change chmod, chown, chgrp
        - Create tiny, small, medium and large file
        - Get arequal before replacing brick
        - Replace brick
        - Get arequal after replacing brick
        - Compare Arequal's
        - Create IO's
        - Replace brick while IO's are going on
        - Validating IO's and waiting for it to complete
        """
        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        # Starting resource consumption using top
        log_file_mem_monitor = '/var/log/glusterfs/mem_usage.log'
        cmd = ("for i in {1..20};do top -n 1 -b|egrep "
               "'RES|gluster' & free -h 2>&1 >> %s ;"
               "sleep 10;done" % (log_file_mem_monitor))
        g.log.info(cmd)
        cmd_list_procs = []
        for server in self.servers:
            proc = g.run_async(server, cmd)
            cmd_list_procs.append(proc)

        # Creating dir1
        ret = mkdir(self.mounts[0].client_system,
                    "%s/dir1" % self.mounts[0].mountpoint)
        self.assertTrue(ret, "Failed to create dir1")
        g.log.info("Directory dir1 on %s created successfully", self.mounts[0])

        # Create 5 dirs and 5 files in each dir under dir1 at the mountpoint
        start, end = 1, 5
        for mount_obj in self.mounts:
            # Number of dir and files to be created.
            dir_range = ("%s..%s" % (str(start), str(end)))
            file_range = ("%s..%s" % (str(start), str(end)))
            # Create dir 1-5 at mountpoint.
            ret = mkdir(mount_obj.client_system,
                        "%s/dir1/dir{%s}" % (mount_obj.mountpoint, dir_range))
            self.assertTrue(ret, "Failed to create directory")
            g.log.info("Directory created successfully")

            # Create files inside each dir.
            cmd = ('touch %s/dir1/dir{%s}/file{%s};' %
                   (mount_obj.mountpoint, dir_range, file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "File creation failed")
            g.log.info("File created successfull")

            # Increment counter so that at next client dir and files are made
            # with diff offset. Like at next client dir will be named
            # dir6, dir7...dir10. Same with files.
            start += 5
            end += 5

        # Rename all files inside dir1/dir1 at the mountpoint
        cmd = ('cd %s/dir1/dir1/; '
               'for FILENAME in *; '
               'do mv $FILENAME Unix_$FILENAME; '
               'done;' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to rename files on client")
        g.log.info("Successfully renamed files on client")

        # Truncate files in one dir per client inside dir1.
        # start is an offset added to the dirname so that different
        # clients act on different files.
        start = 1
        for mount_obj in self.mounts:
            cmd = ('cd %s/dir1/dir%s/; '
                   'for FILENAME in *; '
                   'do echo > $FILENAME; '
                   'done;' % (mount_obj.mountpoint, str(start)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Truncate failed")
            g.log.info("Truncate of files successful")
            start += 5

        # Create softlinks and hardlinks of files in the mountpoint. start
        # is an offset added to the dirname so that different clients act
        # on different files.
        start = 1
        for mount_obj in self.mounts:
            cmd = ('cd %s/dir1/dir%s; '
                   'for FILENAME in *; '
                   'do ln -s $FILENAME softlink_$FILENAME; '
                   'done;' % (mount_obj.mountpoint, str(start)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Creating softlinks failed")
            g.log.info("Softlinks of files created successfully")

            cmd = ('cd %s/dir1/dir%s; '
                   'for FILENAME in *; '
                   'do ln $FILENAME hardlink_$FILENAME; '
                   'done;' % (mount_obj.mountpoint, str(start + 1)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Creating hardlinks failed")
            g.log.info("Hardlinks of files created successfully")
            start += 5

        # chmod, chown, chgrp inside dir1
        # start and end used as offset to access diff files
        # at diff clients.
        start, end = 2, 5
        for mount_obj in self.mounts:
            dir_file_range = '%s..%s' % (str(start), str(end))
            cmd = ('chmod 777 %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing mode of files has failed")
            g.log.info("Mode of files have been changed successfully")

            cmd = ('chown root %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing owner of files has failed")
            g.log.info("Owner of files have been changed successfully")

            cmd = ('chgrp root %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing group of files has failed")
            g.log.info("Group of files have been changed successfully")
            start += 5
            end += 5

        # Create tiny, small, medium and large files at the mountpoint.
        # offset is used to differ filenames at different clients.
        offset = 1
        for mount_obj in self.mounts:
            cmd = ('cd %s; fallocate -l 100 tiny_file%s.txt' %
                   (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for tiny files failed")
            g.log.info("Fallocate for tiny files successful")

            cmd = ('cd %s; fallocate -l 20M small_file%s.txt' %
                   (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for small files failed")
            g.log.info("Fallocate for small files successful")

            cmd = ('cd %s; fallocate -l 200M medium_file%s.txt' %
                   (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for medium files failed")
            g.log.info("Fallocate for medium files successful")

            cmd = ('cd %s; fallocate -l 1G large_file%s.txt' %
                   (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for large files failed")
            g.log.info("Fallocate for large files successful")
            offset += 1

        # Get arequal before replacing brick
        ret, result_before_replacing_brick = (collect_mounts_arequal(
            self.mounts[0]))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before replacing the brick '
                   'is successful')

        # Replace a brick chosen at random
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Unexpected: Replace brick is not successful")
        g.log.info("Expected: Replace brick is successful")

        # Wait for bricks to come online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Unexpected: Bricks are not online")
        g.log.info("Expected: Bricks are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Unexpected: Heal has not yet completed')
        g.log.info('Heal has completed successfully')

        # Check if bricks are online
        all_bricks = get_all_bricks(self.mnode, self.volname)
        ret = are_bricks_online(self.mnode, self.volname, all_bricks)
        self.assertTrue(ret, 'Unexpected: All bricks are not online')
        g.log.info('All bricks are online')

        # Get arequal after replacing brick
        ret, result_after_replacing_brick = (collect_mounts_arequal(
            self.mounts[0]))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after replacing the brick '
                   'is successful')

        # Compare arequals
        self.assertEqual(
            result_before_replacing_brick, result_after_replacing_brick,
            'Arequals are not equal before and after replacing the brick')
        g.log.info('Arequals are equal before and after replacing the brick')

        # Creating files on client side for dir1
        # Write IO
        all_mounts_procs, count = [], 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s/dir1" %
                   (self.script_upload_path1, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count += 10

        # Replace a brick while IO is in progress
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Unexpected: Replace brick is not successful")
        g.log.info("Expected: Replace brick is successful")

        # Wait for bricks to come online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Unexpected: Bricks are not online")
        g.log.info("Expected: Bricks are online")

        # Validate IO and wait for it to complete
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all IO")

        # Create 2 directories and start IO that keeps FDs open
        ret = mkdir(self.mounts[0].client_system,
                    "%s/count{1..2}" % self.mounts[0].mountpoint)
        self.assertTrue(ret, "Failed to create directories")
        g.log.info("Directories created on %s successfully", self.mounts[0])

        all_fd_procs, count = [], 1
        for mount_obj in self.mounts:
            cmd = ("cd %s ;/usr/bin/env python %s -n 10 -t 120 "
                   "-d 5 -c 16 --dir count%s" %
                   (mount_obj.mountpoint, self.script_upload_path2, count))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_fd_procs.append(proc)
            count += 1

        # Replace a brick while open-FD IO is in progress
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Unexpected: Replace brick is not successful")
        g.log.info("Expected: Replace brick is successful")

        # Wait for bricks to come online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Unexpected: Bricks are not online")
        g.log.info("Expected: Bricks are online")

        # Validate IO and wait for it to complete
        ret = validate_io_procs(all_fd_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all IO")

        # Reap the monitoring processes and check the memory log file exists
        ret = file_exists(self.mnode, '/var/log/glusterfs/mem_usage.log')
        self.assertTrue(ret, "Unexpected: Memory log file does not exist")
        g.log.info("Memory log file exists")
        for proc in cmd_list_procs:
            ret, _, _ = proc.async_communicate()
            self.assertEqual(ret, 0, "Memory logging failed")
            g.log.info("Memory logging is successful")
    def test_replicated_to_arbiter_volume_change_with_volume_ops(self):
        """
        - Change the volume type from replicated to arbiter
        - Perform add-brick, rebalance on arbitered volume
        - Perform replace-brick on arbitered volume
        - Perform remove-brick on arbitered volume
        """
        # pylint: disable=too-many-statements

        # Start IO on mounts
        self.all_mounts_procs = []
        g.log.info("Starting IO on %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dirname-start-num 10 --dir-depth 1 --dir-length 1 "
               "--max-num-of-dirs 1 --num-of-files 5 %s" % (
                   self.script_upload_path,
                   self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system, cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Adding bricks to make an Arbiter Volume
        g.log.info("Adding bricks to convert to Arbiter Volume")
        ret = expand_volume(self.mnode, self.volname, self.servers[2:],
                            self.all_servers_info, replica_count=1,
                            arbiter_count=1)
        self.assertTrue(ret, ("Failed to expand the volume  %s", self.volname))
        g.log.info("Changing volume to arbiter volume is successful %s",
                   self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify all volume processes are online
        g.log.info("Verifying all volume processes are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All processes are not online",
                              self.volname))
        g.log.info("Volume %s : All processes are online", self.volname)

        # Check for heal to finish after changing the volume type
        # from replicated to arbiter
        g.log.info("Wait for self-heal to complete after changing the "
                   "volume type from replicated to arbiter")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, ("Self heal didn't complete even after waiting "
                              "for 20 minutes."))
        g.log.info("Self-heal is successful after changing the volume type "
                   "from replicated to arbiter")

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Start IO on mounts
        self.all_mounts_procs = []
        g.log.info("Starting IO on %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dirname-start-num 10 --dir-depth 1 --dir-length 1 "
               "--max-num-of-dirs 1 --num-of-files 5 %s" % (
                   self.script_upload_path,
                   self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system, cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Start add-brick (subvolume-increase)
        g.log.info("Start adding bricks to volume when IO in progress")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info("Expanding volume when IO in progress is successful on "
                   "volume %s", self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify all volume processes are online
        g.log.info("Verifying all volume processes are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All processes are not online",
                              self.volname))
        g.log.info("Volume %s : All processes are online", self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Log Rebalance status
        g.log.info("Log Rebalance status")
        _, _, _ = rebalance_status(self.mnode, self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname, 600)
        self.assertTrue(ret, "Rebalance did not start "
                             "despite waiting for 5 mins")
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Replace brick from a sub-volume
        g.log.info("Replace a faulty brick from the volume")
        ret = replace_brick_from_volume(self.mnode, self.volname,
                                        self.servers, self.all_servers_info)
        self.assertTrue(ret, "Failed to replace faulty brick from the volume")
        g.log.info("Successfully replaced faulty brick from the volume")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify all volume processes are online
        g.log.info("Verifying all volume processes are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All processes are not online",
                              self.volname))
        g.log.info("Volume %s : All processes are online", self.volname)

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Self-heal didn't complete within 20 minutes, "
                        "which is ample time for the current test workload")
        g.log.info("Self-heal is successful after replace-brick operation")

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Shrinking volume by removing bricks from volume when IO in progress
        g.log.info("Start removing bricks from volume when IO in progress")
        ret = shrink_volume(self.mnode, self.volname, rebalance_timeout=900)
        self.assertTrue(ret, ("Failed to shrink the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info("Shrinking volume when IO in progress is successful on "
                   "volume %s", self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify all volume processes are online after shrinking the volume
        g.log.info("Verifying all volume processes are online after "
                   "shrinking volume")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All processes are not online",
                              self.volname))
        g.log.info("Volume %s : All processes are online after shrinking "
                   "volume", self.volname)

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True
    def test_subdir_with_replacebrick(self):

        # pylint: disable=too-many-statements
        """
        Mount the volume
        Create 50 directories on mount point
        Unmount volume
        Auth allow - Client1(subdir25),Client2(subdir15)
        Mount the subdir to their authorized respective clients
        Start IO's on both subdirs
        Perform replace-brick
        Validate on client if subdir's are mounted post replace-brick
        operation is performed
        Stat data on subdirs
        """
        # Create 50 directories on the mount point
        for i in range(0, 50):
            ret = mkdir(self.mounts[0].client_system,
                        "%s/subdir%s" % (self.mounts[0].mountpoint, i))
            self.assertTrue(
                ret,
                ("Failed to create directory %s/subdir%s on "
                 "volume from client %s" %
                 (self.mounts[0].mountpoint, i, self.mounts[0].client_system)))
        g.log.info("Successfully created directories on mount point")

        # Unmount volume
        ret = self.unmount_volume(self.mounts)
        self.assertTrue(ret, "Volume unmount failed")
        g.log.info("Volume unmounted successfully")

        # Set authentication on subdir25 to be accessed by client1 and
        # on subdir15 to be accessed by client2
        g.log.info(
            'Setting authentication on subdir25 and subdir15 '
            'for clients %s and %s', self.clients[0], self.clients[1])
        ret = set_auth_allow(
            self.volname, self.mnode, {
                '/subdir25': [self.mounts[0].client_system],
                '/subdir15': [self.mounts[1].client_system]
            })
        self.assertTrue(
            ret, 'Failed to set authentication on volume %s' % self.volname)

        # Creating mount list for mounting selected subdirs on authorized
        # clients
        self.subdir_mounts = [
            copy.deepcopy(self.mounts[0]),
            copy.deepcopy(self.mounts[1])
        ]
        self.subdir_mounts[0].volname = "%s/subdir25" % self.volname
        self.subdir_mounts[1].volname = "%s/subdir15" % self.volname

        # Mount Subdirectory subdir25 on client 1 and subdir15 on client 2
        for mount_obj in self.subdir_mounts:
            ret = mount_obj.mount()
            self.assertTrue(
                ret, ("Failed to mount %s on client"
                      " %s" % (mount_obj.volname, mount_obj.client_system)))
            g.log.info("Successfully mounted %s on client %s",
                       mount_obj.volname, mount_obj.client_system)
        g.log.info("Successfully mounted sub directories on"
                   "authenticated clients")

        # Start IO on all the subdir mounts.
        all_mounts_procs = []
        count = 1
        for mount_obj in self.subdir_mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Get stat of all the files/dirs created.
        g.log.info("Get stat of all the files/dirs created.")
        ret = get_mounts_stat(self.subdir_mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")

        # Log Volume Info and Status before replacing brick from the volume.
        g.log.info(
            "Logging volume info and Status before replacing brick "
            "from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Replace brick from a sub-volume
        g.log.info("Replace a brick from the volume")
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Failed to replace  brick from the volume")
        g.log.info("Successfully replaced brick from the volume")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("All volume %s processes failed to come up "
                              "online", self.volname))
        g.log.info("All volume %s processes came up "
                   "online successfully", self.volname)

        # Log Volume Info and Status after replacing the brick
        g.log.info(
            "Logging volume info and Status after replacing brick "
            "from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed Logging volume info and status on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')
        g.log.info("self-heal is successful after replace-brick operation")

        # Again validate if subdirectories are still mounted post replace-brick
        for mount_obj in self.subdir_mounts:
            ret = mount_obj.is_mounted()
            self.assertTrue(
                ret, ("Subdirectory %s is not mounted on client"
                      " %s" % (mount_obj.volname, mount_obj.client_system)))
            g.log.info("Subdirectory %s is mounted on client %s",
                       mount_obj.volname, mount_obj.client_system)
        g.log.info("Successfully validated that subdirectories are mounted"
                   "on client1 and clients 2 post replace-brick operation")

        # Validate IO
        g.log.info("Validating IO")
        ret = validate_io_procs(all_mounts_procs, self.subdir_mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all IO")
    def test_nfs_ganesha_replace_brick(self):
        """
        Verify replace brick operation while IO is running
        Steps:
        1. Start IO on mount points
        2. Perform replace brick operation
        3. Validate IOs
        4. Get stat of files and dirs
        """
        # pylint: disable=too-many-statements
        # Start IO on all mount points
        all_mounts_procs = []
        count = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Get stat of all the files/dirs created.
        g.log.info("Get stat of all the files/dirs created.")
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")

        # Perform replace brick operation
        g.log.info("Replace a brick from the volume")
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Failed to replace  brick from the volume")
        g.log.info("Replace brick operation successful")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online after replace "
                   "brick operation")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("All volume %s processes failed to come up "
                              "online", self.volname))
        g.log.info(
            "All volume %s processes came up "
            "online successfully after replace brick operation", self.volname)

        # Log volume info and status
        g.log.info(
            "Logging volume info and status after replacing brick "
            "from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to log volume info and status of "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')
        g.log.info("Self-heal is successful after replace-brick operation")

        # Validate IO
        g.log.info("Validating IO")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all IO")

        # Get stat of all the files/dirs created.
        g.log.info("Get stat of all the files/dirs created.")
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")
Example #11
    def test_ec_replace_brick_after_add_brick(self):
        """
        Test Steps:
        1. Create a pure-ec volume (say 1x(4+2))
        2. Mount volume on two clients
        3. Create some files and dirs from both mnts
        4. Add bricks in this case the (4+2) ie 6 bricks
        5. Create a new dir(common_dir) and in that directory create a distinct
           directory(using hostname as dirname) for each client and pump IOs
           from the clients(dd)
        6. While IOs are in progress replace any of the bricks
        7. Check for errors if any collected after step 6
        """
        # pylint: disable=unsubscriptable-object,too-many-locals
        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(
            all_bricks, "Unable to get the bricks from the {}"
            " volume".format(self.volname))

        self.all_mounts_procs = []
        for count, mount_obj in enumerate(self.mounts):
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d --dir-depth 3 --dir-length 5 "
                   "--max-num-of-dirs 5 --num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on the mounts")
        self.all_mounts_procs = []

        # Expand the volume
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, "Expanding volume failed")

        # Create a new dir(common_dir) on mountpoint
        common_dir = self.mounts[0].mountpoint + "/common_dir"
        ret = mkdir(self.mounts[0].client_system, common_dir)
        self.assertTrue(ret, "Directory creation failed")

        # Create distinct directory for each client under common_dir
        distinct_dir = common_dir + "/$HOSTNAME"
        for each_client in self.clients:
            ret = mkdir(each_client, distinct_dir)
            self.assertTrue(ret, "Directory creation failed")

        # Run dd in the background and stdout,stderr to error.txt for
        # validating any errors after io completion.
        run_dd_cmd = ("cd {}; for i in `seq 1 1000`; do dd if=/dev/urandom "
                      "of=file$i bs=4096 count=10 &>> error.txt; done".format(
                          distinct_dir))
        for each_client in self.clients:
            proc = g.run_async(each_client, run_dd_cmd)
            self.all_mounts_procs.append(proc)

        # Get random brick from the bricks
        brick_to_replace = choice(all_bricks)
        node_from_brick_replace, _ = brick_to_replace.split(":")

        # Replace the brick from the same node
        servers_info_of_replaced_node = {
            node_from_brick_replace:
                self.all_servers_info[node_from_brick_replace]}

        ret = replace_brick_from_volume(self.mnode,
                                        self.volname,
                                        node_from_brick_replace,
                                        servers_info_of_replaced_node,
                                        src_brick=brick_to_replace)
        self.assertTrue(ret, "Replace brick failed")

        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on the mounts")
        self.all_mounts_procs = []

        err_msg = "Too many levels of symbolic links"
        dd_log_file = distinct_dir + "/error.txt"
        for each_client in self.clients:
            ret = occurences_of_pattern_in_file(each_client, err_msg,
                                                dd_log_file)
            self.assertEqual(
                ret, 0, "Either file {} doesn't exist or '{}' messages were "
                "seen while the replace brick operation was "
                "in progress".format(dd_log_file, err_msg))

        self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                        "Heal failed on the volume {}".format(self.volname))
Example #12
    def test_custom_xlator_ops(self):
        '''
        Steps:
        - Perform minimal IO on the mount
        - Enable the custom xlator and verify its position in the volfile
        - After performing any operation on the custom xlator, set options
          using 'storage.reserve' to validate other xlators aren't affected
        - Add a brick to the volume and verify the xlator position in the
          volfile of the new brick
        - Replace a brick and verify the xlator position in the new brick's
          volfile
        - Verify the debug xlator is reflected correctly in the volfile
          when set
        - Validate that a non-existent xlator position fails
        - Reset the volume and verify all the options set above are reset

        For more details, refer to the inline comments
        '''

        # Write IO on the mount
        self._simple_io()

        # Set the storage.reserve option as a baseline check that setting
        # options works
        self._set_and_assert_volume_option('storage.reserve', '2%')

        # Test mount is accessible in RW
        self._simple_io()

        # Position custom xlator in the graph
        xlator, parent, xtype = 'ro', 'worm', 'user'
        self._enable_xlator(xlator, parent, xtype)

        # Verify mount is accessible as we didn't set any options yet
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, 'Failed to list all files and dirs')

        # Set 'read-only' to 'on'
        self._set_and_assert_volume_option('user.xlator.ro.read-only', 'on')

        # Functional verification that mount should be RO
        self._simple_io(xfail=True)
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, 'Failed to list all files and dirs')

        # Shouldn't affect other xlator options
        self._set_and_assert_volume_option('storage.reserve', '3%')

        # Functional validation that mount should be RW
        self._set_and_assert_volume_option('user.xlator.ro.read-only', 'off')
        self._simple_io()

        # Shouldn't affect other xlator options
        self._set_and_assert_volume_option('storage.reserve', '4%')

        # Add brick to the volume and new brick volfile should have custom
        # xlator
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, 'Unable to expand volume')
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to log volume info and status')
        self._verify_position(xlator, parent, xtype)
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(
            ret, 0, 'Unable to start rebalance operation post '
            'expanding volume')
        sleep(.5)
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Rebalance on the volume is not completed')

        # Replace on 'pure distribute' isn't recommended
        if self.volume['voltype']['type'] != 'distributed':

            # Replace brick and new brick volfile should have custom xlator
            ret = replace_brick_from_volume(self.mnode, self.volname,
                                            self.servers,
                                            self.all_servers_info)
            self.assertTrue(ret, 'Unable to perform replace brick operation')
            self._verify_position(xlator, parent, xtype)
            ret = monitor_heal_completion(self.mnode, self.volname)
            self.assertTrue(
                ret, 'Heal is not yet completed after performing '
                'replace brick operation')

        # Regression cases
        # Framework should fail when non existing xlator position is supplied
        self._set_and_assert_volume_option('user.xlator.ro',
                                           'unknown',
                                           xfail=True)

        # Any failure in setting xlator option shouldn't result in degraded
        # volume
        self._simple_io()
        self._set_and_assert_volume_option('storage.reserve', '5%')

        # The custom xlator framework touches existing 'debug' xlators;
        # minimal steps to verify there is no regression
        xlator, parent, xtype = 'delay-gen', 'posix', 'debug'
        self._enable_xlator(xlator, parent, xtype)

        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, 'Failed to list all files and dirs')

        # Volume shouldn't be able to start when the same name is used for
        # a custom xlator and an existing xlator
        if self.mount_type != 'nfs':
            xlator, parent, xtype = 'posix', 'posix', 'user'
            self._enable_xlator(xlator, parent, xtype, xsfail=True)

        # Volume reset should remove all the options set up to now
        ret, _, _ = volume_reset(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Unable to reset volume')

        # Volume start here is due to earlier failure starting the volume and
        # isn't related to 'volume_reset'
        if self.mount_type != 'nfs':
            ret, _, _ = volume_start(self.mnode, self.volname)
            self.assertEqual(ret, 0, 'Unable to start a stopped volume')
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, 'Not all volume processes are online after '
            'starting a stopped volume')
        sleep(self.timeout)
        self._simple_io()

        # Verify options are reset
        vol_info = get_volume_info(self.mnode, self.volname)
        options = vol_info[self.volname]['options']
        negate = ['user.xlator.ro', 'debug.delay-gen', 'storage.reserve']
        for option in negate:
            self.assertNotIn(
                option, options, 'Found {0} in volume info even '
                'after volume reset'.format(option))

        g.log.info(
            'Pass: Validating custom xlator framework for volume %s '
            'is successful', self.volname)
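
Example #12 calls private helpers (_simple_io, _enable_xlator, _verify_position, _set_and_assert_volume_option) that this listing does not include. For orientation, a hypothetical sketch of _set_and_assert_volume_option, assuming glustolibs' set_volume_options and get_volume_options; the real helper may well differ:

    def _set_and_assert_volume_option(self, option, value, xfail=False):
        """Hypothetical sketch: set a volume option and verify it took
        effect, optionally expecting the set itself to fail."""
        ret = set_volume_options(self.mnode, self.volname, {option: value})
        if xfail:
            self.assertFalse(
                ret, 'Unexpected: setting %s succeeded' % option)
            return
        self.assertTrue(ret, 'Failed to set %s to %s' % (option, value))

        # Read the option back to confirm it is active
        ret = get_volume_options(self.mnode, self.volname, option)
        self.assertIsNotNone(ret, 'Unable to fetch option %s' % option)
        self.assertEqual(ret.get(option), value,
                         '%s is not set to %s' % (option, value))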