Example 1
    def get_rebalance_status(self, volume_name):
        """Rebalance status after expansion."""
        wait_reb = rebalance_ops.wait_for_rebalance_to_complete(
            'auto_get_gluster_endpoint', volume_name)
        self.assertTrue(
            wait_reb,
            "Rebalance for '%s' volume was not completed." % volume_name)

        reb_status = rebalance_ops.get_rebalance_status(
            'auto_get_gluster_endpoint', volume_name)
        self.assertEqual(
            reb_status["aggregate"]["statusStr"], "completed",
            "Rebalance of '%s' volume is not in the completed state." %
            volume_name)
Example 3
    def _rebalance_completion(self, volume_name):
        """Rebalance start and completion after expansion."""
        ret, _, err = rebalance_ops.rebalance_start(
            'auto_get_gluster_endpoint', volume_name)
        self.assertFalse(
            ret, "Rebalance for {} volume not started with error {}".format(
                volume_name, err))

        for w in waiter.Waiter(240, 10):
            reb_status = rebalance_ops.get_rebalance_status(
                'auto_get_gluster_endpoint', volume_name)
            if reb_status["aggregate"]["statusStr"] == "completed":
                break
        if w.expired:
            raise AssertionError(
                "Failed to complete the rebalance in 240 seconds")
Example 4
    def tearDown(self):

        status_info = get_rebalance_status(self.mnode, self.volname)
        status = status_info['aggregate']['statusStr']
        if 'in progress' in status:
            # Stop rebalance on the volume
            g.log.info("Stop Rebalance on volume %s", self.volname)
            ret, _, _ = rebalance_stop(self.mnode, self.volname)
            if ret != 0:
                raise ExecutionError("Volume %s: Rebalance stop failed" %
                                     self.volname)
            g.log.info("Volume %s: Rebalance stop success", self.volname)

        # Unmount Volume and Cleanup Volume
        g.log.info("Starting to Unmount Volume and Cleanup Volume")
        ret = self.unmount_volume_and_cleanup_volume(mounts=self.mounts)
        if not ret:
            raise ExecutionError("Failed to Unmount Volume and Cleanup Volume")
        g.log.info("Successful in Unmount Volume and Cleanup Volume")

        # Calling GlusterBaseClass tearDown
        self.get_super_method(self, 'tearDown')()
Example 5
    def test_rebalance_status_from_newly_probed_node(self):

        # Peer probe first 3 servers
        servers_info_from_three_nodes = {}
        for server in self.servers[0:3]:
            servers_info_from_three_nodes[server] = self.all_servers_info[
                server]
            # Peer probe the first 3 servers
            ret, _, _ = peer_probe(self.mnode, server)
            self.assertEqual(ret, 0, "Peer probe failed to %s" % server)

        self.volume['servers'] = self.servers[0:3]
        # create a volume using the first 3 nodes
        ret = setup_volume(self.mnode,
                           servers_info_from_three_nodes,
                           self.volume,
                           force=True)
        self.assertTrue(
            ret, "Failed to create"
            "and start volume %s" % self.volname)

        # Mounting a volume
        ret = self.mount_volume(self.mounts)
        self.assertTrue(ret, "Volume mount failed for %s" % self.volname)

        # Checking volume mounted or not
        ret = is_mounted(self.volname, self.mounts[0].mountpoint, self.mnode,
                         self.mounts[0].client_system, self.mount_type)
        self.assertTrue(
            ret, "Volume not mounted on mount point: %s" %
            self.mounts[0].mountpoint)
        g.log.info("Volume %s mounted on %s", self.volname,
                   self.mounts[0].mountpoint)

        # run IOs
        g.log.info("Starting IO on all mounts...")
        self.counter = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = (
                "python %s create_deep_dirs_with_files "
                "--dirname-start-num %d "
                "--dir-depth 10 "
                "--dir-length 5 "
                "--max-num-of-dirs 3 "
                "--num-of-files 100 %s" %
                (self.script_upload_path, self.counter, mount_obj.mountpoint))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertEqual(ret, 0,
                             "IO failed on %s" % mount_obj.client_system)
            self.counter = self.counter + 10

        # add a brick to the volume and start rebalance
        brick_to_add = form_bricks_list(self.mnode, self.volname, 1,
                                        self.servers[0:3],
                                        servers_info_from_three_nodes)
        ret, _, _ = add_brick(self.mnode, self.volname, brick_to_add)
        self.assertEqual(ret, 0, "Failed to add a brick to %s" % self.volname)

        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance")

        # peer probe a new node from existing cluster
        ret, _, _ = peer_probe(self.mnode, self.servers[3])
        self.assertEqual(ret, 0, "Peer probe failed")

        ret = get_rebalance_status(self.servers[3], self.volname)
        self.assertIsNone(ret, "Unexpected: Got rebalance status from the "
                          "newly probed node %s" % self.servers[3])
Example 6
    def test_quota_rebalance(self):
        """
        * Enable quota on the volume
        * set hard and soft time out to zero.
        * Create some files and directories from mount point
          so that the limits are reached.
        * Perform add-brick operation on the volume.
        * Start rebalance on the volume.
        * While rebalance is in progress, create some more files
          and directories from the mount point until limit is hit
        """

        # pylint: disable=too-many-statements
        # Enable Quota on volume
        ret, _, _ = quota_enable(self.mnode, self.volname)
        self.assertEqual(
            ret, 0, ("Failed to enable quota on the volume %s", self.volname))
        g.log.info("Successfully enabled quota on the volume %s", self.volname)

        # Set the Quota timeouts to 0 for strict accounting
        ret, _, _ = quota_set_hard_timeout(self.mnode, self.volname, 0)
        self.assertEqual(
            ret, 0, ("Failed to set hard-timeout to 0 for %s", self.volname))
        ret, _, _ = quota_set_soft_timeout(self.mnode, self.volname, 0)
        self.assertEqual(
            ret, 0, ("Failed to set soft-timeout to 0 for %s", self.volname))
        g.log.info("Quota soft and hard timeout has been set to 0 for %s",
                   self.volname)

        # Set limit of 100 MB on root dir of the volume
        ret, _, _ = quota_limit_usage(self.mnode, self.volname, "/", "100MB")
        self.assertEqual(ret, 0, "Failed to set Quota for dir '/'")
        g.log.info("Successfully set quota limit for dir '/'")

        # Do some IO until the hard limit is reached
        cmd = ("/usr/bin/env python %s create_files "
               "-f 100 --fixed-file-size 1M --base-file-name file %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)

        # Wait for IO to complete and validate IO
        self.assertTrue(
            wait_for_io_to_complete(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # Add bricks to the volume
        if "replica_count" in self.volume["voltype"]:
            new_bricks_count = self.volume["voltype"]["replica_count"]
        elif "disperse_count" in self.volume["voltype"]:
            new_bricks_count = self.volume["voltype"]["disperse_count"]
        else:
            new_bricks_count = 3
        bricks_list = form_bricks_list(self.mnode, self.volname,
                                       new_bricks_count, self.servers,
                                       self.all_servers_info)
        g.log.info("new brick list: %s", bricks_list)
        ret, _, _ = add_brick(self.mnode, self.volname, bricks_list, False)
        self.assertEqual(ret, 0, "Failed to add the bricks to the volume")
        g.log.info("Successfully added bricks to volume")

        # Perform rebalance start operation
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Rebalance Start Failed")

        # Wait for at least one file to be lookedup/scanned on the nodes
        status_info = get_rebalance_status(self.mnode, self.volname)
        count = 0
        while count < 20:
            lookups_start_count = 0
            for node in range(len(status_info['node'])):
                status_info = get_rebalance_status(self.mnode, self.volname)
                lookups_file_count = status_info['node'][node]['lookups']
                if int(lookups_file_count) > 0:
                    lookups_start_count += 1
                    sleep(2)
            if lookups_start_count == len(self.servers):
                g.log.info(
                    "Volume %s: At least one file is lookedup/scanned "
                    "on all nodes", self.volname)
                break
            count += 1

        # Perform some more IO and check if hard limit is honoured
        self.all_mounts_procs = []
        cmd = ("/usr/bin/env python %s create_files "
               "-f 100 --fixed-file-size 1M --base-file-name newfile %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)

        # Wait for IO to complete and validate IO
        # This should fail as the quotas were already reached
        self.assertFalse(
            validate_io_procs(self.all_mounts_procs, self.mounts[0]),
            "Unexpected: IO passed on the client even after quota is reached")
        g.log.info("Expected: IO failed as quota is reached")

        # Wait for rebalance to finish
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=180)
        self.assertTrue(ret, "Unexpected: Rebalance did not complete")
        g.log.info("Rebalance completed as expected")
Example 7
    def test_volume_start_stop_while_rebalance_is_in_progress(self):
        # DHT Layout and hash validation
        for mount_obj in self.mounts:
            g.log.debug("Verifying hash layout values %s:%s",
                        mount_obj.client_system, mount_obj.mountpoint)
            ret = validate_files_in_dir(mount_obj.client_system,
                                        mount_obj.mountpoint,
                                        test_type=FILE_ON_HASHED_BRICKS,
                                        file_type=FILETYPE_FILES |
                                        FILETYPE_DIRS)
            self.assertTrue(ret, "Hash Layout Values: Fail")
            g.log.info("Hash layout values are verified %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)

        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Logging volume info and status was successful for volume "
                   "%s", self.volname)

        # Expanding volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info,)
        self.assertTrue(ret, ("Failed to expand the volume on volume %s ",
                              self.volname))
        g.log.info("Expanding volume is successful on volume %s", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Error: Volume processes failed to come up for "
                              "%s", self.volname))
        g.log.info("All processes are up for volume %s", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Error: Volume processes failed to come up for "
                              "%s", self.volname))
        g.log.info("All processes are up for volume %s", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online ",
                              self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Start Rebalance
        g.log.info("Starting rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Successfully started rebalance on the volume %s ",
                   self.volname)

        # Logging rebalance status
        g.log.info("Logging rebalance status")
        status_info = get_rebalance_status(self.mnode, self.volname)
        status = status_info['aggregate']['statusStr']

        self.assertIn('in progress', status,
                      "Rebalance process is not running")
        g.log.info("Rebalance process is running")

        ret, out, err = volume_stop(self.mnode, self.volname)
        g.log.debug("Volume stop output: %s", out)

        self.assertIn("rebalance session is in progress", err,
                      "Volume stopped successfully while rebalance session "
                      "is in progress")
        g.log.info("Volume stop failed as rebalance session is in "
                   "progress")

        # Check volume info to check the status of volume
        g.log.info("Checking volume info for the volume status")
        status_info = get_volume_info(self.mnode, self.volname)
        status = status_info[self.volname]['statusStr']
        self.assertIn('Started', status, ("Volume %s state is \"Stopped\"",
                                          self.volname))
        g.log.info("Volume %s state is \"Started\"", self.volname)
Example 8
    def test_rebalance_with_hidden_files(self):
        # pylint: disable=too-many-statements
        # Start IO on mounts
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("python %s create_files "
                   "--base-file-name . "
                   "-f 99 %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        # validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")

        # Verify DHT values across mount points
        for mount_obj in self.mounts:
            g.log.debug("Verifying hash layout values %s:%s",
                        mount_obj.client_system, mount_obj.mountpoint)
            ret = validate_files_in_dir(mount_obj.client_system,
                                        mount_obj.mountpoint,
                                        test_type=FILE_ON_HASHED_BRICKS,
                                        file_type=FILETYPE_FILES)
            self.assertTrue(
                ret, "Expected: files are created only on the sub-volume "
                "matching their hashed value")
            g.log.info("Hash layout values are verified %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)

        # Getting arequal checksum before rebalance
        g.log.info("Getting arequal checksum before rebalance")
        arequal_checksum_before_rebalance = collect_mounts_arequal(self.mounts)

        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Expanding volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume %s", self.volname))
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online ", self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Checking if there are any migration failures
        status = get_rebalance_status(self.mnode, self.volname)
        for each_node in status['node']:
            failed_files_count = int(each_node['failures'])
            self.assertEqual(
                failed_files_count, 0,
                "Rebalance failed to migrate few files on %s" %
                each_node['nodeName'])
            g.log.info("There are no migration failures")

        # Getting arequal checksum after rebalance
        g.log.info("Getting arequal checksum after rebalance")
        arequal_checksum_after_rebalance = collect_mounts_arequal(self.mounts)

        # Comparing arequals checksum before and after rebalance
        g.log.info("Comparing arequals checksum before and after rebalance")
        self.assertEqual(arequal_checksum_before_rebalance,
                         arequal_checksum_after_rebalance,
                         "arequal checksum is NOT MATCHNG")
        g.log.info("arequal checksum is SAME")
Example 9
    def test_expanding_volume_when_io_in_progress(self):
        # pylint: disable=too-many-statements
        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Expanding volume by adding bricks to the volume when IO in progress
        g.log.info("Start adding bricks to volume when IO in progress")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume while IO in "
                              "progress on volume %s", self.volname))
        g.log.info(
            "Expanding volume while IO in progress on "
            "volume %s : Success", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Waiting for volume %s process to be online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Started rebalance on the volume %s: Success", self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=1800)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance status on volume %s: Complete", self.volname)

        # Check Rebalance status after rebalance is complete
        g.log.info("Checking Rebalance status")
        ret, _, _ = rebalance_status(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to get rebalance status for the "
                                  "volume %s", self.volname))
        g.log.info("Rebalance status on volume %s: Complete", self.volname)

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO on all mounts: Complete")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("List all files and directories: Success")

        # DHT Layout validation
        g.log.debug("Verifying hash layout values %s:%s", self.clients[0],
                    self.mounts[0].mountpoint)
        ret = validate_files_in_dir(self.clients[0],
                                    self.mounts[0].mountpoint,
                                    test_type=LAYOUT_IS_COMPLETE,
                                    file_type=FILETYPE_DIRS)
        self.assertTrue(ret, "LAYOUT_IS_COMPLETE: FAILED")
        g.log.info("LAYOUT_IS_COMPLETE: PASS")

        # Checking if there are any migration failures
        status = get_rebalance_status(self.mnode, self.volname)
        for each_node in status['node']:
            self.assertEqual(
                0, int(each_node['failures']),
                "Rebalance failed to migrate few files on %s" %
                each_node['nodeName'])
            g.log.info("No migration failures on %s", each_node['nodeName'])
Example 10
    def test_open_file_migration(self):
        """
        Description: Checks that files with open fd are migrated successfully.

        Steps :
        1) Create a volume.
        2) Mount the volume using FUSE.
        3) Create files on volume mount.
        4) Open fd for the files and keep on doing read write operations on
           these files.
        5) While fds are open, add bricks to the volume and trigger rebalance.
        6) Wait for rebalance to complete.
        7) Wait for write on open fd to complete.
        8) Check for any data loss during rebalance.
        9) Check if rebalance has any failures.
        """
        # Create files and open fd for the files on mount point
        m_point = self.mounts[0].mountpoint
        cmd = ('cd {}; for i in `seq 261 1261`;do touch testfile$i;'
               'done'.format(m_point))
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "Failed to create files")
        g.log.info("Successfully created files")
        proc = open_file_fd(m_point,
                            2,
                            self.clients[0],
                            start_range=301,
                            end_range=400)

        # Calculate file count for the mount-point
        cmd = ("ls -lR {}/testfile* | wc -l".format(m_point))
        ret, count_before, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "Failed to get file count")
        g.log.info("File count before rebalance is:%s", count_before)

        # Add bricks to the volume
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume %s", self.volname))
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Trigger rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance")
        g.log.info("Rebalance is started")

        # Wait for rebalance to complete
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=300)
        self.assertTrue(ret, ("Rebalance failed on volume %s", self.volname))
        g.log.info("Rebalance is successful on " "volume %s", self.volname)

        # Close connection and check if write on open fd has completed
        ret, _, _ = proc.async_communicate()
        self.assertEqual(ret, 0, "Write on open fd" " has not completed yet")
        g.log.info("Write completed on open fd")

        # Calculate file count for the mount-point
        cmd = ("ls -lR {}/testfile* | wc -l".format(m_point))
        ret, count_after, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "Failed to get file count")
        g.log.info("File count after rebalance is:%s", count_after)

        # Check if there is any data loss
        self.assertEqual(
            int(count_before), int(count_after),
            "The file count before and after"
            " rebalance is not same."
            " There is data loss.")
        g.log.info("The file count before and after rebalance is same."
                   " No data loss occurred.")

        # Check if rebalance has any failures
        ret = get_rebalance_status(self.mnode, self.volname)
        no_of_failures = ret['aggregate']['failures']
        self.assertEqual(int(no_of_failures), 0, "Failures in rebalance")
        g.log.info("No failures in rebalance")
Example 11
    def test_rename_file_rebalance(self):
        """
        Test file renames during rebalance
        - Create a volume
        - Create directories or files
        - Calculate the checksum using arequal
        - Add brick and start rebalance
        - While rebalance is running, rename files or directories.
        - After rebalancing calculate checksum.
        """
        # Get the mount point path.
        mount_point = self.mounts[0].mountpoint

        # Creating main directory.
        ret = mkdir(self.mounts[0].client_system,
                    "{}/main".format(mount_point))
        self.assertTrue(ret, "mkdir of dir main failed")

        # Creating Files.
        self.all_mounts_procs = []
        command = ("/usr/bin/env python {} create_files"
                   " {}/main/ -f 4000"
                   " --fixed-file-size 1k".format(self.script_upload_path,
                                                  mount_point))
        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        g.log.info("IO on %s:%s is started successfully",
                   self.mounts[0].client_system, mount_point)

        # Wait for IO completion.
        self.assertTrue(
            wait_for_io_to_complete(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients")
        g.log.info("IO completed on the clients")

        # Getting the arequal checksum.
        arequal_checksum_before_rebalance = collect_mounts_arequal(self.mounts)

        # Log Volume Info and Status before expanding the volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Expanding volume by adding bricks to the volume
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume %s", self.volname))
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Log Volume Info and Status after expanding the volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Start Rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Check that rebalance status is "in progress"
        status_info = get_rebalance_status(self.mnode, self.volname)
        ret = status_info['aggregate']['statusStr']
        self.assertEqual(ret, "in progress",
                         ("Rebalance is not in 'in progress' state, either "
                          "rebalance is in completed state or failed to get "
                          "rebalance status"))
        g.log.info("Rebalance is in 'in progress' state")

        # Renaming the files during rebalance.
        self.all_mounts_procs = []
        command = ("/usr/bin/env python {} mv"
                   " {}/main/ --postfix re ".format(self.script_upload_path,
                                                    mount_point))
        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        g.log.info("IO on %s:%s is started successfully",
                   self.mounts[0].client_system, mount_point)
        self.all_mounts_procs.append(proc)

        # Wait for rebalance to complete
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalace is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Wait for IO completion.
        self.assertTrue(
            wait_for_io_to_complete(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # Getting arequal checksum after rebalance
        arequal_checksum_after_rebalance = collect_mounts_arequal(self.mounts)

        # Comparing arequals checksum before and after rebalance.
        self.assertEqual(arequal_checksum_before_rebalance,
                         arequal_checksum_after_rebalance,
                         "arequal checksum is NOT MATCHING")
        g.log.info("arequal checksum is SAME")
Example 12
    def test_xml_dump_of_gluster_volume_status_during_rebalance(self):
        """
        1. Create a trusted storage pool by peer probing the node
        2. Create a distributed-replicated volume
        3. Start the volume and fuse mount the volume and start IO
        4. Create another replicated volume and start it and stop it
        5. Start rebalance on the volume
        6. While rebalance in progress, stop glusterd on one of the nodes
            in the Trusted Storage pool.
        7. Get the status of the volumes with --xml dump
        """
        self.volname_2 = "test_volume_2"

        # create volume
        # Fetching all the parameters for volume_create
        list_of_three_servers = []
        server_info_for_three_nodes = {}
        for server in self.servers[:3]:
            list_of_three_servers.append(server)
            server_info_for_three_nodes[server] = self.all_servers_info[server]

        bricks_list = form_bricks_list(self.mnode, self.volname, 3,
                                       list_of_three_servers,
                                       server_info_for_three_nodes)
        # Creating volumes using 3 servers
        ret, _, _ = volume_create(self.mnode,
                                  self.volname_2,
                                  bricks_list,
                                  force=True)
        self.assertFalse(ret, "Volume creation failed")
        g.log.info("Volume %s created successfully", self.volname_2)
        ret, _, _ = volume_start(self.mnode, self.volname_2)
        self.assertFalse(ret,
                         "Failed to start volume {}".format(self.volname_2))
        ret, _, _ = volume_stop(self.mnode, self.volname_2)
        self.assertFalse(ret,
                         "Failed to stop volume {}".format(self.volname_2))

        # Start Rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))

        # Get rebalance status
        status_info = get_rebalance_status(self.mnode, self.volname)
        status = status_info['aggregate']['statusStr']

        self.assertIn('in progress', status,
                      "Rebalance process is not running")
        g.log.info("Rebalance process is running")

        # Stop glusterd
        ret = stop_glusterd(self.servers[2])
        self.assertTrue(ret, "Failed to stop glusterd")

        ret, out, _ = g.run(
            self.mnode,
            "gluster v status  | grep -A 4 'Rebalance' | awk 'NR==3{print "
            "$3,$4}'")

        ret = get_volume_status(self.mnode, self.volname, options="tasks")
        rebalance_status = ret[self.volname]['task_status'][0]['statusStr']
        self.assertIn(rebalance_status, out.replace("\n", ""))
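
Example 12 cross-checks the awk-scraped CLI line against the status parsed from the --xml dump. A minimal sketch of the nesting get_volume_status(..., options="tasks") is assumed to return here, keeping only the keys the snippet actually touches (volume name and values are illustrative):

    ret = {
        'testvol': {
            'task_status': [
                {'statusStr': 'in progress'},  # one entry per task
            ],
        },
    }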
Example 13
    def test_stop_glusterd_while_rebalance_in_progress(self):

        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)
        g.log.info("Successful in logging volume info and status of "
                   "volume %s", self.volname)

        # Expanding volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Volume %s: Expand failed", self.volname))
        g.log.info("Volume %s: Expand success", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s: one or more volume process are "
                              "not up", self.volname))
        g.log.info("All volume %s processes are online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)
        g.log.info("Successful in logging volume info and status of "
                   "volume %s", self.volname)

        # Start Rebalance
        g.log.info("Starting rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Volume %s: Rebalance start failed",
                                  self.volname))
        g.log.info("Volume %s: Started rebalance", self.volname)

        # Wait for at least one file to be lookedup/scanned on the nodes
        status_info = get_rebalance_status(self.mnode, self.volname)
        count = 0
        while count < 100:
            lookups_start_count = 0
            for node in range(len(status_info['node'])):
                status_info = get_rebalance_status(self.mnode, self.volname)
                lookups_file_count = status_info['node'][node]['lookups']
                if int(lookups_file_count) > 0:
                    lookups_start_count += 1
                    sleep(5)
            if lookups_start_count == len(self.servers):
                g.log.info("Volume %s: At least one file is lookedup/scanned "
                           "on all nodes", self.volname)
                break
            count += 1

        # Form a new list of servers without mnode in it to prevent mnode
        # from glusterd failure
        nodes = self.servers[:]
        nodes.remove(self.mnode)

        # Stop glusterd on a server
        random_server = random.choice(nodes)
        g.log.info("Stop glusterd on server %s", random_server)
        ret = stop_glusterd(random_server)
        self.assertTrue(ret, ("Server %s: Failed to stop glusterd",
                              random_server))
        g.log.info("Server %s: Stopped glusterd", random_server)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname,
                                             timeout=600)
        self.assertTrue(ret, ("Rebalance is either timed out or failed"
                              "%s", self.volname))
        g.log.info("Volume %s: Rebalance completed successfully",
                   self.volname)
Example 14
    def test_delete_file_in_migration(self):
        """
        Verify that if a file is picked for migration and then deleted, the
        file should be removed successfully.
        * First create a big data file of 10GB.
        * Rename that file, such that after rename a linkto file is created
          (we are doing this to make sure that file is picked for migration.)
        * Add bricks to the volume and trigger rebalance using force option.
        * When the file has been picked for migration, delete that file from
          the mount point.
        * Check whether the file has been deleted or not on the mount-point
          as well as the back-end bricks.
        """

        # pylint: disable=too-many-statements
        # pylint: disable=too-many-locals
        # pylint: disable=protected-access

        mountpoint = self.mounts[0].mountpoint

        # Location of source file
        src_file = mountpoint + '/file1'

        # Finding a file name such that renaming source file to it will form a
        # linkto file
        subvols = (get_subvols(self.mnode, self.volname))['volume_subvols']
        newhash = find_new_hashed(subvols, "/", "file1")
        new_name = str(newhash.newname)
        new_host = str(newhash.hashedbrickobject._host)
        new_name_path = str(newhash.hashedbrickobject._fqpath)[:-2]

        # Location of destination file to which source file will be renamed
        dst_file = '{}/{}'.format(mountpoint, new_name)
        # Create a 10GB source file
        cmd = (
            "dd if=/dev/urandom of={} bs=1024K count=10000".format(src_file))
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, ("File {} creation failed".format(src_file)))

        # Move file such that it hashes to some other subvol and forms linkto
        # file
        ret = move_file(self.clients[0], src_file, dst_file)
        self.assertTrue(ret, "Rename failed")
        g.log.info('Renamed file %s to %s', src_file, dst_file)

        # Check if "file_two" is linkto file
        ret = is_linkto_file(new_host, '{}/{}'.format(new_name_path, new_name))
        self.assertTrue(ret, "File is not a linkto file")
        g.log.info("File is linkto file")

        # Expanding volume by adding bricks to the volume
        ret, _, _ = add_brick(self.mnode,
                              self.volname,
                              self.add_brick_list,
                              force=True)
        self.assertEqual(ret, 0,
                         ("Volume {}: Add-brick failed".format(self.volname)))
        g.log.info("Volume %s: add-brick successful", self.volname)

        # Log Volume Info and Status after expanding the volume
        log_volume_info_and_status(self.mnode, self.volname)

        # Start Rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname, force=True)
        self.assertEqual(
            ret, 0,
            ("Volume {}: Failed to start rebalance".format(self.volname)))
        g.log.info("Volume %s : Rebalance started ", self.volname)

        # Check if rebalance is running and delete the file
        status_info = get_rebalance_status(self.mnode, self.volname)
        status = status_info['aggregate']['statusStr']
        self.assertEqual(status, 'in progress', "Rebalance is not running")
        ret, _, _ = g.run(self.clients[0], (" rm -rf {}".format(dst_file)))
        self.assertEqual(ret, 0, ("Cannot delete file {}".format(dst_file)))
        g.log.info("File is deleted")

        # Check if the file is present on the mount point
        ret, _, _ = g.run(self.clients[0], ("ls -l {}".format(dst_file)))
        self.assertEqual(ret, 2, ("Failed to delete file {}".format(dst_file)))

        # Check if the file is present on the backend bricks
        bricks = get_all_bricks(self.mnode, self.volname)
        for brick in bricks:
            node, brick_path = brick.split(':')
            ret, _, _ = g.run(node, "ls -l {}/{}".format(brick_path, new_name))
            self.assertEqual(
                ret, 2, "File is still present on"
                " back-end brick: {}".format(brick_path))
            g.log.info("File is deleted from back-end brick: %s", brick_path)

        # Check if rebalance process is still running
        for server in self.servers:
            ret, _, _ = g.run(server, "pgrep rebalance")
            self.assertEqual(ret, 1, ("Rebalance process is still"
                                      " running on server {}".format(server)))
            g.log.info("Rebalance process is not running")
Example 15
    def test_readdirp_with_rebalance(self):
        """
        Description: Tests to check that all directories are read
                     and listed while rebalance is still in progress.

        Steps :
        1) Create a volume.
        2) Mount the volume using FUSE.
        3) Create a dir "master" on mount-point.
        4) Create 8000 empty dirs (dir1 to dir8000) inside dir "master".
        5) Now inside a few dirs (e.g. dir1 to dir10), create deep dirs
           and inside every dir, create 50 files.
        6) Collect the number of dirs present on /mnt/<volname>/master
        7) Change the rebalance throttle to lazy.
        8) Add-brick to the volume (at least 3 replica sets.)
        9) Start rebalance using "force" option on the volume.
        10) List the directories on dir "master".
        """
        # pylint: disable=too-many-statements
        # Start IO on mounts
        m_point = self.mounts[0].mountpoint
        ret = mkdir(self.mounts[0].client_system, "{}/master".format(m_point))
        self.assertTrue(ret, "mkdir of dir master failed")

        # Create 8000 empty dirs
        cmd = ("ulimit -n 64000; /usr/bin/env python {} create_deep_dir"
               " --dir-length 8000 --dir-depth 0"
               " {}/master/".format(self.script_upload_path, m_point))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        g.log.info("IO on %s:%s is started successfully",
                   self.mounts[0].client_system, m_point)

        # Validate 8000 empty dirs are created successfully
        ret = validate_io_procs(self.all_mounts_procs, self.mounts[0])
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # Create deep dirs and files
        self.all_mounts_procs = []
        cmd = ("/usr/bin/env python {} create_deep_dirs_with_files"
               " --dir-length 10 --dir-depth 1 --max-num-of-dirs 50 "
               " --num-of-files 50 --file-type empty-file"
               " {}/master/".format(self.script_upload_path, m_point))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        g.log.info("IO on %s:%s is started successfully",
                   self.mounts[0].client_system, m_point)

        # Validate deep dirs and files are created successfully
        ret = validate_io_procs(self.all_mounts_procs, self.mounts[0])
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # Check the dir count before rebalance
        cmd = ('cd {}/master; ls -l | wc -l'.format(m_point))
        ret, dir_count_before, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "Failed to " "get directory count")
        g.log.info("Dir count before %s", dir_count_before)

        # Change the rebalance throttle to lazy
        ret, _, _ = set_rebalance_throttle(self.mnode,
                                           self.volname,
                                           throttle_type='lazy')
        self.assertEqual(ret, 0, "Failed to set rebal-throttle to lazy")
        g.log.info("Rebal-throttle set to 'lazy' successfully")

        # Add-bricks to the volume
        ret, _, _ = add_brick(self.mnode, self.volname, self.add_brick_list)
        self.assertEqual(ret, 0, "Failed to add-brick to the volume")
        g.log.info("Added bricks to the volume successfully")

        # Start rebalance using force
        ret, _, _ = rebalance_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Failed to start rebalance")
        g.log.info("Rebalance started successfully")

        # Check if rebalance is in progress
        rebalance_status = get_rebalance_status(self.mnode, self.volname)
        status = rebalance_status['aggregate']['statusStr']
        self.assertEqual(status, "in progress",
                         ("Rebalance is not in 'in progress' state,"
                          " either rebalance is in compeleted state"
                          " or failed to get rebalance status"))

        # Check the dir count after rebalance
        cmd = ('cd {}/master; ls -l | wc -l'.format(m_point))
        ret, dir_count_after, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "Failed to do lookup and"
                         " get directory count")
        g.log.info("Dir count after %s", dir_count_after)

        # Check if there is any data loss
        self.assertEqual(int(dir_count_before), int(dir_count_after),
                         "There is data loss")
        g.log.info("The directory count before and after rebalance is the "
                   "same. There is no data loss.")
Example 16
    def test_fix_layout_start(self):
        # pylint: disable=too-many-statements
        # Get arequal checksum before starting fix-layout
        g.log.info("Getting arequal checksum before fix-layout")
        arequal_checksum_before_fix_layout = collect_mounts_arequal(
            self.mounts)

        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(
            ret, "Logging volume info and status failed on "
            "volume %s" % self.volname)
        g.log.info(
            "Successful in logging volume info and status of volume "
            "%s", self.volname)

        # Form brick list for expanding volume
        add_brick_list = form_bricks_list_to_add_brick(self.mnode,
                                                       self.volname,
                                                       self.servers,
                                                       self.all_servers_info,
                                                       distribute_count=1)
        self.assertIsNotNone(add_brick_list,
                             ("Volume %s: Failed to form "
                              "bricks list to expand", self.volname))
        g.log.info("Volume %s: Formed bricks list to expand", self.volname)

        # Expanding volume by adding bricks to the volume
        g.log.info("Volume %s: Expand start")
        ret, _, _ = add_brick(self.mnode, self.volname, add_brick_list)
        self.assertEqual(ret, 0, ("Volume %s: Expand failed", self.volname))
        g.log.info("Volume %s: Expand successful", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s: one or more volume process are "
                              "not up", self.volname))
        g.log.info("All volume %s processes are online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(
            ret, "Logging volume info and status failed on "
            "volume %s" % self.volname)
        g.log.info(
            "Successful in logging volume info and status of volume "
            "%s", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Start Rebalance fix-layout
        g.log.info("Starting fix-layout on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname, fix_layout=True)
        self.assertEqual(ret, 0, ("Volume %s: fix-layout start failed"
                                  "%s", self.volname))
        g.log.info("Volume %s: fix-layout start success", self.volname)

        # Wait for fix-layout to complete
        g.log.info("Waiting for fix-layout to complete")
        ret = wait_for_fix_layout_to_complete(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s: Fix-layout is still in-progress", self.volname))
        g.log.info("Volume %s: Fix-layout completed successfully",
                   self.volname)

        # Check Rebalance status after fix-layout is complete
        g.log.info("Checking Rebalance status")
        ret, _, _ = rebalance_status(self.mnode, self.volname)
        self.assertEqual(
            ret, 0,
            ("Volume %s: Failed to get rebalance status", self.volname))
        g.log.info("Volume %s: Successfully got rebalance status",
                   self.volname)

        # Get arequal checksum after fix-layout is complete
        g.log.info("arequal after fix-layout is complete")
        arequal_checksum_after_fix_layout = collect_mounts_arequal(self.mounts)

        # Compare arequals checksum before and after fix-layout
        g.log.info("Comparing checksum before and after fix-layout")
        self.assertEqual(arequal_checksum_before_fix_layout,
                         arequal_checksum_after_fix_layout,
                         "arequal checksum is NOT MATCHNG")
        g.log.info("arequal checksum is SAME")

        # Check if there are any file migrations after fix-layout
        status_info = get_rebalance_status(self.mnode, self.volname)
        for node in range(len(status_info['node'])):
            status_info = get_rebalance_status(self.mnode, self.volname)
            file_migration_count = status_info['node'][node]['files']
            self.assertEqual(
                int(file_migration_count), 0,
                ("Server %s: Some files were migrated", self.servers[node]))
            g.log.info("Server %s: No files are migrated", self.servers[node])

        # Check if new bricks contains any files
        for brick in add_brick_list:
            brick_node, brick_path = brick.split(":")
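            # DHT linkto files carry the sticky bit (mode 1000), so
            # "! -perm 1000" skips them; .glusterfs is gluster's internal
            # metadata directory and is filtered out as well.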
            cmd = ('find %s -type f ! -perm 1000 | grep -ve .glusterfs' %
                   brick_path)
            _, out, _ = g.run(brick_node, cmd)
            self.assertEqual(
                len(out), 0,
                (("Files(excluded linkto files) are present on %s:%s"),
                 (brick_node, brick_path)))
            g.log.info("No files (excluded linkto files) are present on %s:%s",
                       brick_node, brick_path)
Example 17
    def test_rebalance_with_force(self):

        # Getting arequal checksum before rebalance
        g.log.info("Getting arequal checksum before rebalance")
        arequal_checksum_before_rebalance = collect_mounts_arequal(self.mounts)

        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(
            ret, "Logging volume info and status failed on "
            "volume %s" % self.volname)
        g.log.info(
            "Successful in logging volume info and"
            "status of volume %s", self.volname)

        # Expanding volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Volume %s: Expand failed", self.volname))
        g.log.info("Volume %s: Expand successful", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s: one or more volume process are "
                              "not up", self.volname))
        g.log.info("All volume %s processes are online", self.volname)

        # Verify all processes of the volume are online
        g.log.info("Verifying all volume processes are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s: All processes are not online", self.volname))
        g.log.info("Volume %s: All processes are online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info(
            "Successful in logging volume info and"
            "status of volume %s", self.volname)

        # Start Rebalance with force
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, ("Volume %s: Failed to start rebalance with "
                                  "force", self.volname))
        g.log.info("Volume %s: Started rebalance with force option",
                   self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=600)
        self.assertTrue(
            ret, ("Volume %s: Rebalance is still in-progress ", self.volname))
        g.log.info("Volume %s: Rebalance completed", self.volname)

        # Getting arequal checksum after rebalance
        g.log.info("Getting arequal checksum after rebalance with force "
                   "option")
        arequal_checksum_after_rebalance = collect_mounts_arequal(self.mounts)

        # Comparing arequals checksum before and after rebalance with force
        # option
        g.log.info("Comparing arequals checksum before and after rebalance"
                   "with force option")
        self.assertEqual(arequal_checksum_before_rebalance,
                         arequal_checksum_after_rebalance,
                         "arequal checksum is NOT MATCHNG")
        g.log.info("arequal checksum is SAME")

        # Checking if rebalance skipped any files
        status = get_rebalance_status(self.mnode, self.volname)
        for each_node in status['node']:
            self.assertEqual(
                int(each_node['skipped']), 0,
                "Few files are skipped on node %s" % each_node['nodeName'])
            g.log.info("No files are skipped on %s", each_node['nodeName'])

    def test_remove_brick_while_rebalance_is_running(self):

        # DHT Layout validation
        g.log.debug("Verifying hash layout values %s:%s",
                    self.clients[0], self.mounts[0].mountpoint)
        ret = validate_files_in_dir(self.clients[0], self.mounts[0].mountpoint,
                                    test_type=LAYOUT_IS_COMPLETE,
                                    file_type=FILETYPE_DIRS)
        self.assertTrue(ret, "LAYOUT_IS_COMPLETE: FAILED")
        g.log.info("LAYOUT_IS_COMPLETE: PASS")

        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Expanding volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Volume %s: Expand failed", self.volname))
        g.log.info("Volume %s: Expand successful", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s: one or more volume process are "
                              "not up", self.volname))
        g.log.info("All volume %s processes are online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Verify all processes of the volume are online
        g.log.info("Volume %s: Verifying that all processes are online",
                   self.volname)
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume %s: All processes are not online",
                              self.volname))
        g.log.info("Volume %s: All processes are online", self.volname)

        # Start Rebalance
        g.log.info("Starting rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Volume %s: Failed to start rebalance",
                                  self.volname))
        g.log.info("Volume %s: Rebalance started ", self.volname)

        # Check if rebalance is running
        status_info = get_rebalance_status(self.mnode, self.volname)
        status = status_info['aggregate']['statusStr']
        if 'in progress' in status:
            # Shrinking volume by removing bricks
            g.log.info("Start removing bricks from volume")
            _, _, err = remove_brick(self.mnode, self.volname,
                                     self.remove_brick_list, "start")
            self.assertIn("Rebalance is in progress", err, "Successfully "
                          "removed bricks while volume rebalance is "
                          "in-progress")
            g.log.info("Failed to start remove-brick as rebalance is "
                       "in-progress")
        else:
            g.log.error("Rebalance process is not running")
            raise ExecutionError("Rebalance process is not running")
Example no. 19
    def test_glusterd_rebalance(self):
        '''
        -> Create Volume
        -> Fuse mount the volume
        -> Perform I/O on fuse mount
        -> Add bricks to the volume
        -> Perform rebalance on the volume
        -> While rebalance is in progress,
        -> restart glusterd on all the nodes in the cluster
        '''

        # run IOs
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = (
                "python %s create_deep_dirs_with_files "
                "--dirname-start-num %d "
                "--dir-depth 4 "
                "--dir-length 6 "
                "--max-num-of-dirs 3 "
                "--num-of-files 25 %s" %
                (self.script_upload_path, self.counter, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            self.counter = self.counter + 10

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Forming brick list
        brick_list = form_bricks_list_to_add_brick(self.mnode, self.volname,
                                                   self.servers,
                                                   self.all_servers_info)

        # Adding Bricks
        ret, _, _ = add_brick(self.mnode, self.volname, brick_list)
        self.assertEqual(ret, 0,
                         "Failed to add brick to the volume %s" % self.volname)
        g.log.info("Brick added successfully to the volume %s", self.volname)

        # Performing rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(
            ret, 0, 'Failed to start rebalance on volume %s' % self.volname)
        g.log.info("Rebalance started successfully on volume %s", self.volname)

        # Checking Rebalance is in progress or not
        rebalance_status = get_rebalance_status(self.mnode, self.volname)
        if rebalance_status['aggregate']['statusStr'] != 'in progress':
            raise ExecutionError("Rebalance is not in 'in progress' state, "
                                 "either rebalance is in compeleted state or"
                                 " failed to get rebalance status")

        # Restart glusterd
        ret = restart_glusterd(self.servers)
        self.assertTrue(ret, "Failed to restart glusterd on servers")
        g.log.info("Glusterd restarted successfully on %s", self.servers)

        # Checking glusterd status
        count = 0
        while count < 60:
            ret = is_glusterd_running(self.servers)
            if not ret:
                break
            sleep(2)
            count += 1
        self.assertEqual(ret, 0, "Glusterd is not running on some of the "
                         "servers")
        g.log.info("Glusterd is running on all servers %s", self.servers)
Example no. 20
    def test_rebalance_with_special_files(self):
        """
        Rebalance with special files
        - Create Volume and start it.
        - Create some special files on mount point.
        - Once it is complete, start some IO.
        - Add brick into the volume and start rebalance
        - All IO should be successful.
        """
        # Create pipe files at mountpoint.
        cmd = ("for i in {1..500};do mkfifo %s/fifo${i}; done" %
               (self.mounts[0].mountpoint))
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "Failed to create pipe files")
        g.log.info("Pipe files created successfully")

        # Create block device files at mountpoint.
        cmd = ("for i in {1..500};do mknod %s/blk${i} blockfile 1 5;done" %
               (self.mounts[0].mountpoint))
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "Failed to create block files")
        g.log.info("Block files created successfully")

        # Create character device files at mountpoint.
        cmd = (
            "for i in {1..500};do mknod %s/charc${i} characterfile 1 5;done" %
            (self.mounts[0].mountpoint))
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "Failed to create character files")
        g.log.info("Character files created successfully")

        # Create files at mountpoint.
        cmd = ("/usr/bin/env python %s create_files "
               "-f 1000 --fixed-file-size 1M --base-file-name file %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)

        # Log the volume info and status before expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Expand the volume.
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume %s", self.volname))
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Log the volume info after expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Start Rebalance.
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Check rebalance is in progress
        rebalance_status = get_rebalance_status(self.mnode, self.volname)
        ret = rebalance_status['aggregate']['statusStr']
        self.assertEqual(ret, "in progress", ("Rebalance is not in "
                                              "'in progress' state, either "
                                              "rebalance is in completed state"
                                              " or failed to get rebalance "
                                              "status"))
        g.log.info("Rebalance is in 'in progress' state")

        # Wait for rebalance to complete
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Wait for IO to complete.
        self.assertTrue(
            wait_for_io_to_complete(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients")
        g.log.info("IO completed on the clients")

    def test_rebalance_with_quota_enabled(self):
        """
        Test rebalance with quota enabled on root.
        1. Create Volume of type distribute
        2. Set Quota limit on the root directory
        3. Do some IO to reach the Hard limit
        4. After IO ends, compute arequal checksum
        5. Add bricks to the volume.
        6. Start rebalance
        7. After rebalance is completed, check arequal checksum
        """
        # Enable Quota
        ret, _, _ = quota_enable(self.mnode, self.volname)
        self.assertEqual(
            ret, 0, ("Failed to enable quota on the volume %s", self.volname))
        g.log.info("Successfully enabled quota on volume %s", self.volname)

        # Set the Quota timeouts to 0 for strict accounting
        ret, _, _ = quota_set_hard_timeout(self.mnode, self.volname, 0)
        self.assertEqual(
            ret, 0, ("Failed to set hard-timeout to 0 for %s", self.volname))
        ret, _, _ = quota_set_soft_timeout(self.mnode, self.volname, 0)
        self.assertEqual(
            ret, 0, ("Failed to set soft-timeout to 0 for %s", self.volname))
        g.log.info("Quota soft and hard timeout has been set to 0 for %s",
                   self.volname)

        # Set the quota limit of 1 GB on root dir of the volume
        ret, _, _ = quota_limit_usage(self.mnode, self.volname, "/", "1GB")
        self.assertEqual(ret, 0, "Failed to set Quota for dir root")
        g.log.info("Successfully set quota limit for dir root")

        # Do some IO until hard limit is reached.
        cmd = ("/usr/bin/env python %s create_files "
               "-f 1024 --fixed-file-size 1M --base-file-name file %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)

        # Wait for IO to complete and validate IO
        self.assertTrue(
            wait_for_io_to_complete(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients")
        g.log.info("IO completed on the clients")

        # Validate quota
        ret = quota_validate(self.mnode,
                             self.volname,
                             path='/',
                             hard_limit=1073741824,
                             sl_exceeded=True,
                             hl_exceeded=True)
        self.assertTrue(ret, "Quota validate Failed for '/'")
        g.log.info("Quota Validated for path '/'")

        # Compute arequal checksum.
        arequal_checksum_before_rebalance = collect_mounts_arequal(self.mounts)

        # Log Volume info and status before expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Expand the volume.
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume %s", self.volname))
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Log volume info and status after expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Perform rebalance start operation.
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to  start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Rebalance started.")

        # Check rebalance is in progress
        rebalance_status = get_rebalance_status(self.mnode, self.volname)
        ret = rebalance_status['aggregate']['statusStr']
        self.assertEqual(ret, "in progress", ("Rebalance is not in "
                                              "'in progress' state, either "
                                              "rebalance is in completed state"
                                              "  or failed to get rebalance "
                                              "status"))
        g.log.info("Rebalance is 'in progress' state")

        # Wait till rebalance ends.
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Validate quota
        ret = quota_validate(self.mnode,
                             self.volname,
                             path='/',
                             hard_limit=1073741824,
                             sl_exceeded=True,
                             hl_exceeded=True)
        self.assertTrue(ret, "Quota validate Failed for '/'")
        g.log.info("Quota Validated for path '/'")

        # Compute arequal checksum.
        arequal_checksum_after_rebalance = collect_mounts_arequal(self.mounts)

        # Comparing arequals checksum before and after rebalance.
        self.assertEqual(arequal_checksum_before_rebalance,
                         arequal_checksum_after_rebalance,
                         "arequal checksum is NOT MATCHING")
        g.log.info("arequal checksum is SAME")

    def _check_if_files_are_skipped_or_not(self):
        """Check if files are skipped or not"""
        rebalance_status = get_rebalance_status(self.mnode, self.volname)
        ret = int(rebalance_status['aggregate']['skipped'])
        self.assertNotEqual(ret, 0, "Rebalance did not skip any files")