def test_mount_point_not_go_to_rofs(self):
        """
        - create two volumes with arbiter1 and mount it on same client
        - create IO
        - start deleting files from both mountpoints
        - kill brick from one of the node
        - Check if all the files are deleted from the mount point
        from both the servers
        """
        # create files on all mounts
        g.log.info("Starting IO on all mounts...")
        all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("python %s create_files "
                       "-f 100 "
                       "--fixed-file-size 1M "
                       "%s" % (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # select bricks to bring offline
        volume_list = get_volume_list(self.mnode)
        for volname in volume_list:
            bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
                self.mnode, volname))
            bricks_to_bring_offline = filter(
                None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                       bricks_to_bring_offline_dict['cold_tier_bricks'] +
                       bricks_to_bring_offline_dict['volume_bricks']))

            # bring bricks offline
            g.log.info("Going to bring down the brick process for %s",
                       bricks_to_bring_offline)
            ret = bring_bricks_offline(volname, bricks_to_bring_offline)
            self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                                  "check the log file for more details."))
            g.log.info("Brought down the brick process for %s successfully",
                       bricks_to_bring_offline)

        # delete files on all mounts
        g.log.info("Deleting IO on all mounts...")
        all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Deleting data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Delete files
            g.log.info('Deleting files...')
            command = ("python %s delete %s" %
                       (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
Ejemplo n.º 2
0
    def test_data_self_heal_daemon_off(self):
        """
        Test Data-Self-Heal (heal command)

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        - create IO
        - Get arequal before getting bricks offline
        - set the volume option
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - Get areeual after getting bricks offline and compare with
        arequal before getting bricks offline
        - modify the data
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check if heal is completed
        - check for split-brain
        - add bricks
        - do rebalance
        - create 1k files
        - while creating files - kill bricks and bring bricks online one by one
        in cycle
        - validate IO
        """
        # pylint: disable=too-many-statements,too-many-locals

        # Setting options
        g.log.info('Setting options...')
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Creating files on client side
        g.log.info("Starting IO on %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        cmd = ("/usr/bin/env python %s create_files -f 100"
               " --fixed-file-size 1k %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        ret, _, err = g.run(self.mounts[0].client_system,
                            cmd,
                            user=self.mounts[0].user)
        self.assertFalse(
            ret, 'Failed to create the data for %s: %s' %
            (self.mounts[0].mountpoint, err))
        g.log.info('Created IO for %s is successfully',
                   self.mounts[0].mountpoint)

        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        result_before_offline = arequals[0].splitlines()[-1].split(':')[-1]
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        result_after_offline = arequals[0].splitlines()[-1].split(':')[-1]
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks offline
        self.assertEqual(
            result_before_offline, result_after_offline,
            'Checksums before and '
            'after bringing bricks online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')

        # Modify the data
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        cmd = ("/usr/bin/env python %s create_files -f 100"
               " --fixed-file-size 10k %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        ret, _, err = g.run(self.mounts[0].client_system,
                            cmd,
                            user=self.mounts[0].user)
        self.assertFalse(
            ret, 'Failed to midify the data for %s: %s' %
            (self.mounts[0].mountpoint, err))
        g.log.info('Modified IO for %s is successfully',
                   self.mounts[0].mountpoint)

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Add bricks
        g.log.info("Start adding bricks to volume...")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume %s", self.volname))
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Do rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Failed to start rebalance')
        g.log.info('Rebalance is started')

        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Rebalance is not completed')
        g.log.info('Rebalance is completed successfully')

        # Create 1k files
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        command = ("/usr/bin/env python %s create_files -f 1000"
                   " --base-file-name newfile %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Kill all bricks in cycle
        bricks_list = get_all_bricks(self.mnode, self.volname)
        for brick in bricks_list:
            # Bring brick offline
            g.log.info('Bringing bricks %s offline', brick)
            ret = bring_bricks_offline(self.volname, [brick])
            self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick)

            ret = are_bricks_offline(self.mnode, self.volname, [brick])
            self.assertTrue(ret, 'Bricks %s are not offline' % brick)
            g.log.info('Bringing bricks %s offline is successful', brick)

            # Introducing 30 second sleep when brick is down
            g.log.info(
                "Waiting for 30 seconds, with ongoing IO while "
                "brick %s is offline", brick)
            sleep(30)

            # Bring brick online
            g.log.info('Bringing bricks %s online...', brick)
            ret = bring_bricks_online(self.mnode, self.volname, [brick])
            self.assertTrue(
                ret,
                'Failed to bring bricks %s online' % bricks_to_bring_offline)
            g.log.info('Bringing bricks %s online is successful', brick)

            # Wait for volume processes to be online
            g.log.info("Wait for volume processes to be online")
            ret = wait_for_volume_process_to_be_online(self.mnode,
                                                       self.volname)
            self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                                  "be online", self.volname))
            g.log.info(
                "Successful in waiting for volume %s processes to be "
                "online", self.volname)

            # Verify volume's all process are online
            g.log.info("Verifying volume's all process are online")
            ret = verify_all_process_of_volume_are_online(
                self.mnode, self.volname)
            self.assertTrue(
                ret, ("Volume %s : All process are not online" % self.volname))
            g.log.info("Volume %s : All process are online", self.volname)

            # Wait for self-heal-daemons to be online
            g.log.info("Waiting for self-heal-daemons to be online")
            ret = is_shd_daemonized(self.all_servers)
            self.assertTrue(
                ret, "Either No self heal daemon process found or"
                "more than one self heal daemon process"
                "found")
            g.log.info("All self-heal-daemons are online")

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
    def test_snap_self_heal(self):
        """
        Steps:

        1. create a volume
        2. mount volume
        3. create snapshot of that volume
        4. Activate snapshot
        5. Clone snapshot and Mount
        6. Perform I/O
        7. Bring Down Few bricks from volume without
           affecting the volume or cluster.
        8. Perform I/O
        9. Bring back down bricks to online
        10. Validate heal is complete with areequal

        """
        # pylint: disable=too-many-statements, too-many-locals
        # Creating snapshot:
        g.log.info("Starting to Create snapshot")
        ret, _, _ = snap_create(self.mnode, self.volname, self.snap)
        self.assertEqual(
            ret, 0, ("Failed to create snapshot for volume %s" % self.volname))
        g.log.info("Snapshot %s created successfully for volume %s", self.snap,
                   self.volname)

        # Activating snapshot
        g.log.info("Starting to Activate Snapshot")
        ret, _, _ = snap_activate(self.mnode, self.snap)
        self.assertEqual(ret, 0,
                         ("Failed to Activate snapshot %s" % self.snap))
        g.log.info("Snapshot %s activated successfully", self.snap)

        # snapshot list
        ret, _, _ = snap_list(self.mnode)
        self.assertEqual(ret, 0, ("Failed to list all the snapshot"))
        g.log.info("Snapshot list command was successful")

        # Creating a Clone volume from snapshot:
        g.log.info("Starting to Clone volume from Snapshot")
        ret, _, _ = snap_clone(self.mnode, self.snap, self.clone)
        self.assertEqual(ret, 0, ("Failed to clone %s from snapshot %s" %
                                  (self.clone, self.snap)))
        g.log.info("%s created successfully", self.clone)

        #  start clone volumes
        g.log.info("start to created clone volumes")
        ret, _, _ = volume_start(self.mnode, self.clone)
        self.assertEqual(ret, 0, "Failed to start clone %s" % self.clone)
        g.log.info("clone volume %s started successfully", self.clone)

        # Mounting a clone volume
        g.log.info("Mounting a clone volume")
        ret, _, _ = mount_volume(self.clone, self.mount_type, self.mount1,
                                 self.mnode, self.clients[0])
        self.assertEqual(ret, 0,
                         "Failed to mount clone Volume %s" % self.clone)
        g.log.info("Clone volume %s mounted Successfully", self.clone)

        # Checking cloned volume mounted or not
        ret = is_mounted(self.clone, self.mount1, self.mnode, self.clients[0],
                         self.mount_type)
        self.assertTrue(
            ret,
            "Failed to mount clone volume on mount point: %s" % self.mount1)
        g.log.info("clone Volume %s mounted on %s", self.clone, self.mount1)

        # write files on all mounts
        g.log.info("Starting IO on all mounts...")
        g.log.info("mounts: %s", self.mount1)
        all_mounts_procs = []
        cmd = ("python %s create_files "
               "-f 10 --base-file-name file %s" %
               (self.script_upload_path, self.mount1))
        proc = g.run(self.clients[0], cmd)
        all_mounts_procs.append(proc)
        g.log.info("Successful in creating I/O on mounts")

        # get the bricks from the volume
        g.log.info("Fetching bricks for the volume : %s", self.clone)
        bricks_list = get_all_bricks(self.mnode, self.clone)
        g.log.info("Brick List : %s", bricks_list)

        # Select bricks to bring offline
        g.log.info("Starting to bring bricks to offline")
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = filter(
            None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                   bricks_to_bring_offline_dict['cold_tier_bricks'] +
                   bricks_to_bring_offline_dict['volume_bricks']))
        g.log.info("Brick to bring offline: %s ", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.clone, bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring the bricks offline")
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Offline Bricks list
        offline_bricks = get_offline_bricks_list(self.mnode, self.clone)
        self.assertIsNotNone(
            offline_bricks, "Failed to get offline bricklist"
            "for volume %s" % self.clone)
        for bricks in offline_bricks:
            self.assertIn(bricks, bricks_to_bring_offline,
                          "Failed to validate "
                          "Bricks offline")
        g.log.info("Bricks Offline: %s", offline_bricks)

        # Online Bricks list
        online_bricks = get_online_bricks_list(self.mnode, self.clone)
        self.assertIsNotNone(
            online_bricks, "Failed to get online bricks"
            " for volume %s" % self.clone)
        g.log.info("Bricks Online: %s", online_bricks)

        # write files mountpoint
        g.log.info("Starting IO on all mounts...")
        g.log.info("mounts: %s", self.mount1)
        all_mounts_procs = []
        cmd = ("python %s create_files "
               "-f 10 --base-file-name file %s" %
               (self.script_upload_path, self.mount1))
        proc = g.run(self.clients[0], cmd)
        all_mounts_procs.append(proc)
        g.log.info("Successful in creating I/O on mounts")

        # Bring all bricks online
        g.log.info("bring all bricks online")
        ret = bring_bricks_online(self.mnode, self.clone,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring bricks online")
        g.log.info("Successful in bringing all bricks online")

        # Validate Bricks are online
        g.log.info("Validating all bricks are online")
        ret = are_bricks_online(self.mnode, self.clone, bricks_list)
        self.assertTrue(ret, "Failed to bring all the bricks online")
        g.log.info("bricks online: %s", bricks_list)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.clone)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.clone))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.clone)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.clone)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.clone))
        g.log.info("Volume %s : All process are online", self.clone)

        # wait for the heal process to complete
        g.log.info("waiting for heal process to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to complete the heal process")
        g.log.info("Successfully completed heal process")

        # Check areequal
        # get the subvolumes
        g.log.info("Starting to get sub-volumes for volume %s", self.clone)
        subvols = get_subvols(self.mnode, self.clone)
        num_subvols = len(subvols['volume_subvols'])
        g.log.info("Number of subvolumes in volume %s:", num_subvols)

        # Get arequals and compare
        g.log.info("Starting to Compare areequals")
        for i in range(0, num_subvols):
            # Get arequal for first brick
            subvol_brick_list = subvols['volume_subvols'][i]
            node, brick_path = subvol_brick_list[0].split(':')
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan' % brick_path)
            ret, arequal, _ = g.run(node, command)
            first_brick_total = arequal.splitlines()[-1].split(':')[-1]

        # Get arequal for every brick and compare with first brick
        for brick in subvol_brick_list:
            node, brick_path = brick.split(':')
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan' % brick_path)
            ret, brick_arequal, _ = g.run(node, command)
            self.assertFalse(ret, 'Failed to get arequal on brick %s' % brick)
            g.log.info('Getting arequal for %s is successful', brick)
            brick_total = brick_arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(
                first_brick_total, brick_total,
                'Arequals for subvol and %s are not equal' % brick)
            g.log.info('Arequals for subvol and %s are equal', brick)
        g.log.info('All arequals are equal for distributed-replicated')
Ejemplo n.º 4
0
    def test_data_self_heal_daemon_off(self):
        """
        Test Data-Self-Heal (heal command)

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        - create IO
        - Get areequal before getting bricks offline
        - set the volume option
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - Get areequal after getting bricks offline and compare with
        areequal before getting bricks offline
        - modify the data
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check if heal is completed
        - check for split-brain
        - add bricks
        - do rebalance
        - create 5k files
        - while creating files - kill bricks and bring bricks online one by one
        in cycle
        - validate IO
        """

        # Setting options
        g.log.info('Setting options...')
        options = {"metadata-self-heal": "off",
                   "entry-self-heal": "off",
                   "data-self-heal": "off",
                   }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s"
                   % (options, self.volname))

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s"
                       % (mount_obj.client_system, mount_obj.mountpoint))
            # Create files
            g.log.info('Creating files...')
            command = ("python %s create_files -f 100 --fixed-file-size 1k %s"
                       % (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get areequal before getting bricks offline
        g.log.info('Getting areequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting areequal before getting bricks offline '
                   'is successful')

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = filter(None, (
                bricks_to_bring_offline_dict['hot_tier_bricks'] +
                bricks_to_bring_offline_dict['cold_tier_bricks'] +
                bricks_to_bring_offline_dict['volume_bricks']))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...' % bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful'
                   % bricks_to_bring_offline)

        # Get areequal after getting bricks offline
        g.log.info('Getting areequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting areequal after getting bricks offline '
                   'is successful')

        # Checking areequals before bringing bricks offline
        # and after bringing bricks offline
        self.assertEqual(result_before_offline, result_after_offline,
                         'Checksums before and '
                         'after bringing bricks online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')

        # Modify the data
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s" %
                       (mount_obj.client_system, mount_obj.mountpoint))
            # Create files
            g.log.info('Creating files...')
            command = ("python %s create_files -f 100 --fixed-file-size 10k %s"
                       % (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Bring brick online
        g.log.info('Bringing bricks %s online...' % bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful'
                   % bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online" % self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Add bricks
        g.log.info("Start adding bricks to volume...")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume %s", self.volname))
        g.log.info("Expanding volume is successful on "
                   "volume %s" % self.volname)

        # Do rebalance
        ret, out, err = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Failed to start rebalance')
        g.log.info('Rebalance is started')

        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Rebalance is not completed')
        g.log.info('Rebalance is completed successfully')

        # Create 1k files
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s" %
                       (mount_obj.client_system, mount_obj.mountpoint))
            # Create files
            g.log.info('Creating files...')
            command = ("python %s create_files -f 1000 %s"
                       % (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Kill all bricks in cycle
        bricks_list = get_all_bricks(self.mnode, self.volname)
        for brick in bricks_list:
            # Bring brick offline
            g.log.info('Bringing bricks %s offline' % brick)
            ret = bring_bricks_offline(self.volname, [brick])
            self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick)

            ret = are_bricks_offline(self.mnode, self.volname,
                                     [brick])
            self.assertTrue(ret, 'Bricks %s are not offline'
                            % brick)
            g.log.info('Bringing bricks %s offline is successful'
                       % bricks_to_bring_offline)

            # Bring brick online
            g.log.info('Bringing bricks %s online...' % brick)
            ret = bring_bricks_online(self.mnode, self.volname,
                                      [brick])
            self.assertTrue(ret, 'Failed to bring bricks %s online' %
                            bricks_to_bring_offline)
            g.log.info('Bringing bricks %s online is successful'
                       % bricks_to_bring_offline)

            # Wait for volume processes to be online
            g.log.info("Wait for volume processes to be online")
            ret = wait_for_volume_process_to_be_online(self.mnode,
                                                       self.volname)
            self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                                  "be online", self.volname))
            g.log.info("Successful in waiting for volume %s processes to be "
                       "online", self.volname)

            # Verify volume's all process are online
            g.log.info("Verifying volume's all process are online")
            ret = verify_all_process_of_volume_are_online(self.mnode,
                                                          self.volname)
            self.assertTrue(ret, ("Volume %s : All process are not online"
                                  % self.volname))
            g.log.info("Volume %s : All process are online" % self.volname)

            # Wait for self-heal-daemons to be online
            g.log.info("Waiting for self-heal-daemons to be online")
            ret = is_shd_daemonized(self.all_servers)
            self.assertTrue(ret, "Either No self heal daemon process found or"
                                 "more than one self heal daemon process"
                                 "found")
            g.log.info("All self-heal-daemons are online")

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")
Ejemplo n.º 5
0
    def test_entry_self_heal_heal_command(self):
        """
        Test Entry-Self-Heal (heal command)

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        - create IO
        - get areequal before getting bricks offline
        - set the volume option
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - get areequal after getting bricks offline and compare with
        arequal after bringing bricks offline
        - modify the data
        - get areequal before getting bricks online
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check if heal is completed
        - check for split-brain
        - get areequal after getting bricks online and compare with
        arequal before bringing bricks online
        """

        # Setting options
        g.log.info('Setting options...')
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off",
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal', "
                   "are set to 'off'")

        # Start IO on mounts
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s" %
                       (mount_obj.client_system, mount_obj.mountpoint))
            cmd = (
                "python %s create_deep_dirs_with_files "
                "--dirname-start-num %d "
                "--dir-length 2 "
                "--dir-depth 2 "
                "--max-num-of-dirs 2 "
                "--num-of-files 20 %s" %
                (self.script_upload_path, self.counter, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            self.counter = self.counter + 10
            g.log.info("IO on %s:%s is started successfully" %
                       (mount_obj.client_system, mount_obj.mountpoint))
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Command list to do different operations with data -
        # create, rename, copy and delete
        cmd_list = [
            "python %s create_files -f 20 %s",
            "python %s mv -i '.trashcan' %s",
            "python %s copy --dest-dir new_dir %s",
            "python %s delete %s",
        ]

        for cmd in cmd_list:
            # Get areequal before getting bricks offline
            g.log.info('Getting areequal before getting bricks offline...')
            ret, result_before_offline = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            g.log.info('Getting areequal before getting bricks offline '
                       'is successful')

            # Setting options
            g.log.info('Setting options...')
            options = {
                "self-heal-daemon": "off",
            }
            ret = set_volume_options(self.mnode, self.volname, options)
            self.assertTrue(ret, 'Failed to set options %s' % options)
            g.log.info("Option 'self-heal-daemon' "
                       "is set to 'off' successfully")

            # Select bricks to bring offline
            bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
                self.mnode, self.volname))
            bricks_to_bring_offline = filter(
                None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                       bricks_to_bring_offline_dict['cold_tier_bricks'] +
                       bricks_to_bring_offline_dict['volume_bricks']))

            # Bring brick offline
            g.log.info('Bringing bricks %s offline...' %
                       bricks_to_bring_offline)
            ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
            self.assertTrue(
                ret,
                'Failed to bring bricks %s offline' % bricks_to_bring_offline)

            ret = are_bricks_offline(self.mnode, self.volname,
                                     bricks_to_bring_offline)
            self.assertTrue(
                ret, 'Bricks %s are not offline' % bricks_to_bring_offline)
            g.log.info('Bringing bricks %s offline is successful' %
                       bricks_to_bring_offline)

            # Get areequal after getting bricks offline
            g.log.info('Getting areequal after getting bricks offline...')
            ret, result_after_offline = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            g.log.info('Getting areequal after getting bricks offline '
                       'is successful')

            # Checking areequals before bringing bricks offline
            # and after bringing bricks offline
            self.assertEqual(result_before_offline, result_after_offline,
                             'Checksums are not equal')
            g.log.info('Checksums before bringing bricks offline '
                       'and after bringing bricks offline are equal')

            # Modify the data
            g.log.info("Start modifying IO on all mounts...")
            self.all_mounts_procs = []
            for mount_obj in self.mounts:
                g.log.info("Modifying IO on %s:%s", mount_obj.client_system,
                           mount_obj.mountpoint)
                cmd = cmd % (self.script_upload_path, mount_obj.mountpoint)
                proc = g.run_async(mount_obj.client_system,
                                   cmd,
                                   user=mount_obj.user)
                self.all_mounts_procs.append(proc)
                g.log.info("IO on %s:%s is modified successfully" %
                           (mount_obj.client_system, mount_obj.mountpoint))
            self.io_validation_complete = False

            # Validate IO
            g.log.info("Wait for IO to complete and validate IO ...")
            ret = validate_io_procs(self.all_mounts_procs, self.mounts)
            self.assertTrue(ret, "IO failed on some of the clients")
            self.io_validation_complete = True
            g.log.info("IO is successful on all mounts")

            # Get areequal before getting bricks online
            g.log.info('Getting areequal before getting bricks online...')
            ret, result_before_online = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            g.log.info('Getting areequal before getting bricks online '
                       'is successful')

            # List all files and dirs created
            g.log.info("List all files and directories:")
            ret = list_all_files_and_dirs_mounts(self.mounts)
            if not ret:
                raise ExecutionError("Failed to list all files and dirs")
            g.log.info("Listing all files and directories is successful")

            # Bring brick online
            g.log.info('Bringing bricks %s online...' %
                       bricks_to_bring_offline)
            ret = bring_bricks_online(self.mnode, self.volname,
                                      bricks_to_bring_offline)
            self.assertTrue(
                ret,
                'Failed to bring bricks %s online' % bricks_to_bring_offline)
            g.log.info('Bringing bricks %s online is successful' %
                       bricks_to_bring_offline)

            # Setting options
            g.log.info('Setting options...')
            options = {
                "self-heal-daemon": "on",
            }
            ret = set_volume_options(self.mnode, self.volname, options)
            self.assertTrue(ret, 'Failed to set options %s' % options)
            g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

            # Wait for volume processes to be online
            g.log.info("Wait for volume processes to be online")
            ret = wait_for_volume_process_to_be_online(self.mnode,
                                                       self.volname)
            self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                                  "be online", self.volname))
            g.log.info(
                "Successful in waiting for volume %s processes to be "
                "online", self.volname)

            # Verify volume's all process are online
            g.log.info("Verifying volume's all process are online")
            ret = verify_all_process_of_volume_are_online(
                self.mnode, self.volname)
            self.assertTrue(
                ret, ("Volume %s : All process are not online" % self.volname))
            g.log.info("Volume %s : All process are online" % self.volname)

            # Wait for self-heal-daemons to be online
            g.log.info("Waiting for self-heal-daemons to be online")
            ret = is_shd_daemonized(self.all_servers)
            self.assertTrue(ret, "Either No self heal daemon process found")
            g.log.info("All self-heal-daemons are online")

            # Start healing
            ret = trigger_heal(self.mnode, self.volname)
            self.assertTrue(ret, 'Heal is not started')
            g.log.info('Healing is started')

            # Monitor heal completion
            ret = monitor_heal_completion(self.mnode, self.volname)
            self.assertTrue(ret, 'Heal has not yet completed')

            # Check if heal is completed
            ret = is_heal_complete(self.mnode, self.volname)
            self.assertTrue(ret, 'Heal is not complete')
            g.log.info('Heal is completed successfully')

            # Check for split-brain
            ret = is_volume_in_split_brain(self.mnode, self.volname)
            self.assertFalse(ret, 'Volume is in split-brain state')
            g.log.info('Volume is not in split-brain state')

            # Get areequal after getting bricks online
            g.log.info('Getting areequal after getting bricks online...')
            ret, result_after_online = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            g.log.info('Getting areequal after getting bricks online '
                       'is successful')

            # List all files and dirs created
            g.log.info("List all files and directories:")
            ret = list_all_files_and_dirs_mounts(self.mounts)
            if not ret:
                raise ExecutionError("Failed to list all files and dirs")
            g.log.info("Listing all files and directories is successful")

            # Checking areequals before bringing bricks online
            # and after bringing bricks online
            self.assertEqual(result_before_online, result_after_online,
                             'Checksums are not equal')
            g.log.info('Checksums before bringing bricks online '
                       'and after bringing bricks online are equal')
    def test_replace_brick_self_heal_io_in_progress(self):
        """
        - Create directory on mount point and write files/dirs
        - Create another set of files (1K files)
        - While creation of files/dirs are in progress Kill one brick
        - Remove the contents of the killed brick(simulating disk replacement)
        - When the IO's are still in progress, restart glusterd on the nodes
          where we simulated disk replacement to bring back bricks online
        - Start volume heal
        - Wait for IO's to complete
        - Verify whether the files are self-healed
        - Calculate arequals of the mount point and all the bricks
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-branches
        # Create dirs with files
        g.log.info('Creating dirs with file...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -n 2 -f 10 %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))
        ret, _, err = g.run(self.mounts[0].client_system,
                            command,
                            user=self.mounts[0].user)
        self.assertFalse(ret, err)
        g.log.info("IO is successful")

        # Creating another set of files (1K files)
        self.all_mounts_procs = []

        # Create dirs with files
        g.log.info('Creating 1K files...')
        command = ("/usr/bin/env python %s create_files "
                   "-f 1500 --fixed-file-size 10k %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts[0])
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Remove the content of the killed bricks
        for brick in bricks_to_bring_offline:
            brick_node, brick_path = brick.split(':')

            # Removing files
            command = ('cd %s ; rm -rf *' % brick_path)
            ret, _, err = g.run(brick_node, command)
            self.assertFalse(ret, err)
            g.log.info('Files are deleted on brick %s', brick)

        # Bring brick online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal daemons are online")

        # Start healing
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Check arequals for "replicated"
        all_bricks = get_all_bricks(self.mnode, self.volname)
        if self.volume_type == "replicated":

            # Get arequal after bricks are online
            ret, arequals = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            g.log.info('Getting arequal after successfully bringing'
                       'bricks online.')
            mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

            # Get arequal on bricks and compare with mount_point_total
            ret, arequals = collect_bricks_arequal(all_bricks)
            self.assertTrue(ret, 'Failed to get arequal on bricks')
            for arequal in arequals:
                brick_total = arequal.splitlines()[-1].split(':')[-1]
                self.assertEqual(
                    mount_point_total, brick_total,
                    'Arequals for mountpoint and brick '
                    'are not equal')
                g.log.info('Arequals for mountpoint and brick are equal')

        # Check arequals for "distributed-replicated"
        if self.volume_type == "distributed-replicated":

            # Get the subvolumes
            subvols_dict = get_subvols(self.mnode, self.volname)
            num_subvols = len(subvols_dict['volume_subvols'])
            g.log.info("Number of subvolumes in volume %s:", num_subvols)

            # Get arequals and compare
            for i in range(0, num_subvols):

                # Get arequal for first brick
                subvol_brick_list = subvols_dict['volume_subvols'][i]
                ret, arequal = collect_bricks_arequal(subvol_brick_list[0])
                self.assertTrue(ret, 'Failed to get arequal on first brick')
                first_brick_total = arequal[0].splitlines()[-1].split(':')[-1]

                # Get arequal for every brick and compare with first brick
                ret, arequals = collect_bricks_arequal(subvol_brick_list)
                self.assertTrue(ret, 'Failed to get arequal on bricks')
                for arequal in arequals:
                    brick_total = arequal.splitlines()[-1].split(':')[-1]
                    self.assertEqual(
                        first_brick_total, brick_total,
                        'Arequals for subvol and brick are '
                        'not equal')
                    g.log.info('Arequals for subvol and brick are equal')
    def test_self_heal_differing_in_file_type(self):
        """
        testing self heal of files with different file types
        with default configuration

        Description:
        - create IO
        - calculate arequal
        - bring down all bricks processes from selected set
        - calculate arequal and compare with arequal before
        getting bricks offline
        - modify the data
        - arequal before getting bricks online
        - bring bricks online
        - check daemons and healing completion
        - start healing
        - calculate arequal and compare with arequal before bringing bricks
        online and after bringing bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Creating files on client side
        all_mounts_procs = []
        test_file_type_differs_self_heal_folder = \
            'test_file_type_differs_self_heal'
        g.log.info("Generating data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)

        # Creating files
        command = ("cd %s/ ; "
                   "mkdir %s ;"
                   "cd %s/ ;"
                   "for i in `seq 1 10` ; "
                   "do mkdir l1_dir.$i ; "
                   "for j in `seq 1 5` ; "
                   "do mkdir l1_dir.$i/l2_dir.$j ; "
                   "for k in `seq 1 10` ; "
                   "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
                   "bs=1k count=$k ; "
                   "done ; "
                   "done ; "
                   "done ; " % (self.mounts[0].mountpoint,
                                test_file_type_differs_self_heal_folder,
                                test_file_type_differs_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # wait for io to complete
        self.assertTrue(wait_for_io_to_complete(all_mounts_procs, self.mounts),
                        "Io failed to complete on some of the clients")

        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks offline
        self.assertEqual(
            sorted(result_before_offline), sorted(result_after_offline),
            'Checksums before and after bringing bricks'
            ' offline are not equal')
        g.log.info('Checksums before and after '
                   'bringing bricks offline are equal')

        # Modify the data
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        command = ("cd %s/%s/ ; "
                   "for i in `seq 1 10` ; "
                   "do for j in `seq 1 5` ; "
                   "do for k in `seq 1 10` ; "
                   "do rm -f l1_dir.$i/l2_dir.$j/test.$k ; "
                   "mkdir l1_dir.$i/l2_dir.$j/test.$k ; "
                   "done ; "
                   "done ; "
                   "done ;" % (self.mounts[0].mountpoint,
                               test_file_type_differs_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertEqual(
            sorted(result_before_online), sorted(result_after_online),
            'Checksums before and after bringing bricks'
            ' online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')
    def test_self_heal_symbolic_links(self):
        """
        Test Self-Heal of Symbolic Links (heal command)

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        "data-self-heal-algorithm": "diff"
        "self-heal-daemon": "off"
        - create IO
        - calculate arequal
        - bring down all bricks processes from selected set
        - calculate arequals and compare with arequal
        before bringing bricks offline
        - modify the data and verify whether the links are properly created
        - calculate arequal before getting bricks online
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check is heal is complited
        - check for split-brain
        - calculate arequal after getting bricks online and compare with
        arequal before getting bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Setting options
        g.log.info('Setting options...')
        options = {"metadata-self-heal": "off",
                   "entry-self-heal": "off",
                   "data-self-heal": "off",
                   "self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal', "
                   "'self-heal-daemon' "
                   "are set to 'off' successfully")

        # Creating files on client side
        all_mounts_procs = []
        test_sym_link_self_heal_folder = 'test_sym_link_self_heal'
        g.log.info("Generating data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)
        # Creating files
        command = ("cd %s/ ; "
                   "mkdir %s ; "
                   "cd %s/ ;"
                   "for i in `seq 1 5` ; "
                   "do mkdir dir.$i ; "
                   "for j in `seq 1 10` ; "
                   "do dd if=/dev/urandom of=dir.$i/file.$j "
                   "bs=1k count=$j ; "
                   "done ; "
                   "done ;"
                   % (self.mounts[0].mountpoint,
                      test_sym_link_self_heal_folder,
                      test_sym_link_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # wait for io to complete
        self.assertTrue(
            wait_for_io_to_complete(all_mounts_procs, self.mounts),
            "Io failed to complete on some of the clients")

        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks offline
        self.assertEqual(sorted(result_before_offline),
                         sorted(result_after_offline),
                         'Checksums before and after bringing bricks '
                         'online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')

        # Modify the data
        g.log.info("Modifying data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)
        # Create symlinks
        g.log.info('Creating symlinks...')
        command = ("cd %s/%s/ ; "
                   "for i in `seq 1 5` ; "
                   "do ln -s dir.$i sym_link_dir.$i ; "
                   "done ;"
                   % (self.mounts[0].mountpoint,
                      test_sym_link_self_heal_folder))
        ret, _, _ = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, 'Failed to modify the data for %s...'
                         % self.mounts[0].mountpoint)
        g.log.info('Modifying the data for %s is successful',
                   self.mounts[0].mountpoint)

        # Verify whether the links are properly created
        # Get symlink list
        command = ("cd %s/%s/ ; "
                   "ls |grep 'sym'"
                   % (self.mounts[0].mountpoint,
                      test_sym_link_self_heal_folder))
        _, out, _ = g.run(self.mounts[0].client_system, command)
        symlink_list = out.strip().split('\n')

        # Get folder list
        command = ("cd %s/%s/ ; "
                   "ls |grep -v 'sym'"
                   % (self.mounts[0].mountpoint,
                      test_sym_link_self_heal_folder))
        _, out, _ = g.run(self.mounts[0].client_system, command)
        folder_list = out.strip().split('\n')

        # Compare symlinks and folders
        for symlink in symlink_list:
            symlink_index = symlink_list.index(symlink)
            command = ("cd %s/%s/ ; "
                       "readlink %s"
                       % (self.mounts[0].mountpoint,
                          test_sym_link_self_heal_folder,
                          symlink))
            _, out, _ = g.run(self.mounts[0].client_system, command)
            symlink_to_folder = out.strip()
            self.assertEqual(symlink_to_folder, folder_list[symlink_index],
                             'Links are not properly created')
            g.log.info('Links for %s are properly created',
                       self.mounts[0].mountpoint)

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertEqual(sorted(result_before_online),
                         sorted(result_after_online),
                         'Checksums before and after bringing bricks '
                         'online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')
Ejemplo n.º 9
0
    def test_metadata_self_heal(self):
        """
        Test MetaData Self-Heal (heal command)

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        - create IO
        - set the volume option
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - Change the permissions, ownership and the group
        of the files under "test_meta_data_self_heal" folder
        - get arequal before getting bricks online
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check is heal is completed
        - check for split-brain
        - get arequal after getting bricks online and compare with
        arequal before getting bricks online
        - check group and user are 'qa'
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Setting options
        g.log.info('Setting options...')
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal', "
                   "are set to 'off' successfully")

        # Creating files on client side
        all_mounts_procs = []
        test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
        g.log.info("Generating data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)

        # Create files
        g.log.info('Creating files...')
        command = ("cd %s/ ; "
                   "mkdir %s ;"
                   "cd %s/ ;"
                   "for i in `seq 1 50` ; "
                   "do dd if=/dev/urandom of=test.$i bs=10k count=1 ; "
                   "done ;" %
                   (self.mounts[0].mountpoint, test_meta_data_self_heal_folder,
                    test_meta_data_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # wait for io to complete
        self.assertTrue(wait_for_io_to_complete(all_mounts_procs, self.mounts),
                        "Io failed to complete on some of the clients")

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Changing the permissions, ownership and the group
        # of the files under "test_meta_data_self_heal" folder
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)

        # Change permissions to 444
        g.log.info('Changing permissions...')
        command = (
            "cd %s/%s/ ; "
            "chmod -R 444 *" %
            (self.mounts[0].mountpoint, test_meta_data_self_heal_folder))
        ret, out, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Permissions are changed successfully')

        # Change the ownership to qa
        g.log.info('Changing the ownership...')
        command = (
            "cd %s/%s/ ; "
            "chown -R qa *" %
            (self.mounts[0].mountpoint, test_meta_data_self_heal_folder))
        ret, out, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Ownership is changed successfully')

        # Change the group to qa
        g.log.info('Changing the group...')
        command = (
            "cd %s/%s/ ; "
            "chgrp -R qa *" %
            (self.mounts[0].mountpoint, test_meta_data_self_heal_folder))
        ret, out, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Group is changed successfully')

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume process %s not online "
                              "despite waiting for 5 minutes", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertEqual(sorted(result_before_online),
                         sorted(result_after_online),
                         'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')

        # Adding servers and client in single dict to check permissions
        nodes_to_check = {}
        all_bricks = get_all_bricks(self.mnode, self.volname)
        for brick in all_bricks:
            node, brick_path = brick.split(':')
            nodes_to_check[node] = brick_path
        nodes_to_check[self.mounts[0].client_system] = \
            self.mounts[0].mountpoint

        # Checking for user and group
        for node in nodes_to_check:
            # Get file list
            command = ("cd %s/%s/ ; "
                       "ls" %
                       (nodes_to_check[node], test_meta_data_self_heal_folder))
            ret, out, err = g.run(node, command)
            file_list = out.split()

            for file_name in file_list:
                file_to_check = '%s/%s/%s' % (nodes_to_check[node],
                                              test_meta_data_self_heal_folder,
                                              file_name)

                g.log.info('Checking for permissions, user and group for %s',
                           file_name)

                # Check for permissions
                cmd = ("stat -c '%a %n' {} | awk '{{print $1}}'".format(
                    file_to_check))
                ret, permissions, _ = g.run(node, cmd)
                self.assertEqual(
                    permissions.split('\n')[0], '444',
                    'Permissions %s is not equal to 444' % permissions)
                g.log.info("Permissions are '444' for %s", file_name)

                # Check for user
                cmd = ("ls -ld {} | awk '{{print $3}}'".format(file_to_check))
                ret, username, _ = g.run(node, cmd)
                self.assertEqual(
                    username.split('\n')[0], 'qa',
                    'User %s is not equal qa' % username)
                g.log.info("User is 'qa' for %s", file_name)

                # Check for group
                cmd = ("ls -ld {} | awk '{{print $4}}'".format(file_to_check))
                ret, groupname, _ = g.run(node, cmd)
                self.assertEqual(
                    groupname.split('\n')[0], 'qa',
                    'Group %s is not equal qa' % groupname)
                g.log.info("Group is 'qa' for %s", file_name)
    def test_entry_self_heal_heal_command(self):
        """
        Test Entry-Self-Heal (heal command)

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        - create IO
        - get arequal before getting bricks offline
        - set the volume option
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - get arequal after getting bricks offline and compare with
        arequal after bringing bricks offline
        - modify the data
        - get arequal before getting bricks online
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check if heal is completed
        - check for split-brain
        - get arequal after getting bricks online and compare with
        arequal before bringing bricks online
        """
        # pylint: disable=too-many-statements

        # Setting options
        g.log.info('Setting options...')
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal', "
                   "are set to 'off'")

        # Start IO on mounts
        g.log.info("Starting IO on all mounts...")
        g.log.info("Starting IO on %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        cmd = ("python %s create_deep_dirs_with_files "
               "--dir-length 2 "
               "--dir-depth 2 "
               "--max-num-of-dirs 2 "
               "--num-of-files 20 %s/files" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        ret, _, err = g.run(self.mounts[0].client_system,
                            cmd,
                            user=self.mounts[0].user)
        self.assertFalse(
            ret, 'Failed to create the data for %s: %s' %
            (self.mounts[0].mountpoint, err))
        g.log.info('Created IO for %s is successfully',
                   self.mounts[0].mountpoint)

        # Command list to do different operations with data -
        # create, rename, copy and delete
        cmd_list = [
            "python %s create_files -f 20 %s/files",
            "python %s mv %s/files",
            # 'copy' command works incorrect. disable until fixed
            # "python %s copy --dest-dir %s/new_dir %s/files",
            "python %s delete %s"
        ]

        for cmd in cmd_list:
            # Get arequal before getting bricks offline
            g.log.info('Getting arequal before getting bricks offline...')
            ret, arequals = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            result_before_offline = arequals[0].splitlines()[-1].split(':')[-1]
            g.log.info('Getting arequal before getting bricks offline '
                       'is successful')

            # Setting options
            g.log.info('Setting options...')
            options = {"self-heal-daemon": "off"}
            ret = set_volume_options(self.mnode, self.volname, options)
            self.assertTrue(ret, 'Failed to set options %s' % options)
            g.log.info("Option 'self-heal-daemon' "
                       "is set to 'off' successfully")

            # Select bricks to bring offline
            bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
                self.mnode, self.volname))
            bricks_to_bring_offline = filter(
                None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                       bricks_to_bring_offline_dict['cold_tier_bricks'] +
                       bricks_to_bring_offline_dict['volume_bricks']))

            # Bring brick offline
            g.log.info('Bringing bricks %s offline...',
                       bricks_to_bring_offline)
            ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
            self.assertTrue(
                ret,
                'Failed to bring bricks %s offline' % bricks_to_bring_offline)

            ret = are_bricks_offline(self.mnode, self.volname,
                                     bricks_to_bring_offline)
            self.assertTrue(
                ret, 'Bricks %s are not offline' % bricks_to_bring_offline)
            g.log.info('Bringing bricks %s offline is successful',
                       bricks_to_bring_offline)

            # Get arequal after getting bricks offline
            g.log.info('Getting arequal after getting bricks offline...')
            ret, arequals = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            result_after_offline = arequals[0].splitlines()[-1].split(':')[-1]
            g.log.info('Getting arequal after getting bricks offline '
                       'is successful')

            # Checking arequals before bringing bricks offline
            # and after bringing bricks offline
            self.assertEqual(result_before_offline, result_after_offline,
                             'Checksums are not equal')
            g.log.info('Checksums before bringing bricks offline '
                       'and after bringing bricks offline are equal')

            # Modify the data
            g.log.info("Start modifying IO on all mounts...")
            g.log.info("Modifying IO on %s:%s", self.mounts[0].client_system,
                       self.mounts[0].mountpoint)
            if 'copy --dest-dir' in cmd:
                parsed_cmd = cmd % (self.script_upload_path,
                                    self.mounts[0].mountpoint,
                                    self.mounts[0].mountpoint)
            else:
                parsed_cmd = cmd % (self.script_upload_path,
                                    self.mounts[0].mountpoint)
            ret, _, err = g.run(self.mounts[0].client_system,
                                parsed_cmd,
                                user=self.mounts[0].user)
            self.assertFalse(
                ret, 'Failed to modify the data for %s: %s' %
                (self.mounts[0].mountpoint, err))
            g.log.info('Modified IO for %s is successfully',
                       self.mounts[0].mountpoint)

            # Get arequal before getting bricks online
            g.log.info('Getting arequal before getting bricks online...')
            ret, arequals = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            result_before_online = arequals[0].splitlines()[-1].split(':')[-1]
            g.log.info('Getting arequal before getting bricks online '
                       'is successful')

            # List all files and dirs created
            g.log.info("List all files and directories:")
            ret = list_all_files_and_dirs_mounts(self.mounts)
            if not ret:
                raise ExecutionError("Failed to list all files and dirs")
            g.log.info("Listing all files and directories is successful")

            # Bring brick online
            g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
            ret = bring_bricks_online(self.mnode, self.volname,
                                      bricks_to_bring_offline)
            self.assertTrue(
                ret,
                'Failed to bring bricks %s online' % bricks_to_bring_offline)
            g.log.info('Bringing bricks %s online is successful',
                       bricks_to_bring_offline)

            # Setting options
            g.log.info('Setting options...')
            options = {"self-heal-daemon": "on"}
            ret = set_volume_options(self.mnode, self.volname, options)
            self.assertTrue(ret, 'Failed to set options %s' % options)
            g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

            # Wait for volume processes to be online
            g.log.info("Wait for volume processes to be online")
            ret = wait_for_volume_process_to_be_online(self.mnode,
                                                       self.volname)
            self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                                  "be online", self.volname))
            g.log.info(
                "Successful in waiting for volume %s processes to be "
                "online", self.volname)

            # Verify volume's all process are online
            g.log.info("Verifying volume's all process are online")
            ret = verify_all_process_of_volume_are_online(
                self.mnode, self.volname)
            self.assertTrue(
                ret, ("Volume %s : All process are not online" % self.volname))
            g.log.info("Volume %s : All process are online", self.volname)

            # Wait for self-heal-daemons to be online
            g.log.info("Waiting for self-heal-daemons to be online")
            ret = is_shd_daemonized(self.all_servers)
            self.assertTrue(ret, "Either No self heal daemon process found")
            g.log.info("All self-heal-daemons are online")

            # Start healing
            ret = trigger_heal(self.mnode, self.volname)
            self.assertTrue(ret, 'Heal is not started')
            g.log.info('Healing is started')

            # Monitor heal completion
            ret = monitor_heal_completion(self.mnode, self.volname)
            self.assertTrue(ret, 'Heal has not yet completed')

            # Check if heal is completed
            ret = is_heal_complete(self.mnode, self.volname)
            self.assertTrue(ret, 'Heal is not complete')
            g.log.info('Heal is completed successfully')

            # Check for split-brain
            ret = is_volume_in_split_brain(self.mnode, self.volname)
            self.assertFalse(ret, 'Volume is in split-brain state')
            g.log.info('Volume is not in split-brain state')

            # Get arequal after getting bricks online
            g.log.info('Getting arequal after getting bricks online...')
            ret, arequals = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            result_after_online = arequals[0].splitlines()[-1].split(':')[-1]
            g.log.info('Getting arequal after getting bricks online '
                       'is successful')

            # List all files and dirs created
            g.log.info("List all files and directories:")
            ret = list_all_files_and_dirs_mounts(self.mounts)
            if not ret:
                raise ExecutionError("Failed to list all files and dirs")
            g.log.info("Listing all files and directories is successful")

            # Checking arequals before bringing bricks online
            # and after bringing bricks online
            self.assertEqual(result_before_online, result_after_online,
                             'Checksums are not equal')
            g.log.info('Checksums before bringing bricks online '
                       'and after bringing bricks online are equal')
Ejemplo n.º 11
0
    def test_self_heal_when_io_in_progress(self):
        """Test self-heal is successful when IO is in progress.

        Description:
            - simulate brick down.
            - bring bricks online
            - wait for heal to complete
            - validate IO
        """
        # Log Volume Info and Status before simulating brick failure
        g.log.info("Logging volume info and Status before bringing bricks "
                   "offlien from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks']))

        # Bring bricks offline
        g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring bricks: %s offline",
                              bricks_to_bring_offline))
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s processes not online despite waiting"
                              "for 5 mins", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Log Volume Info and Status
        g.log.info("Logging volume info and Status after bringing bricks "
                   "offline from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Validate if bricks are offline
        g.log.info("Validating if bricks: %s are offline",
                   bricks_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, ("Not all the bricks in list: %s are offline",
                              bricks_to_bring_offline))
        g.log.info("Successfully validated that bricks: %s are all offline",
                   bricks_to_bring_offline)

        # Add delay before bringing bricks online
        time.sleep(40)

        # Bring bricks online
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring bricks: %s online",
                              bricks_to_bring_offline))
        g.log.info("Successfully brought all bricks:%s online",
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Log Volume Info and Status
        g.log.info("Logging volume info and Status after bringing bricks "
                   "online from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online",
                              self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Self heal didn't complete even after waiting "
                        "for 20 minutes. 20 minutes is too much a time for "
                        "current test workload")
        g.log.info("self-heal is successful after replace-brick operation")

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
Ejemplo n.º 12
0
    def test_self_heal_50k_files_heal_default(self):
        """
        Test self-heal of 50k files by heal default
        Description:
        - bring down all bricks processes from selected set
        - create IO (50k files)
        - Get arequal before getting bricks online
        - check for daemons to come online
        - heal daemon should pick  up entries to heal automatically
        - check if heal is completed
        - check for split-brain
        - get arequal after getting bricks online and compare with
        arequal before getting bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create 50k files
            g.log.info('Creating files...')
            command = ("/usr/bin/env python %s create_files -f 50000 %s" % (
                self.script_upload_path,
                mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Default Heal testing, wait for shd to pick up healing
        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname,
                                      timeout_period=3600)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertEqual(result_before_online, result_after_online,
                         'Checksums before and after bringing bricks online '
                         'are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')
Ejemplo n.º 13
0
    def test_data_self_heal_command(self):
        """
        Test Data-Self-Heal (heal command)

        Description:
        - get the client side healing volume options and check
        if they have already been disabled by default
        NOTE: Client side healing has been disabled by default
        since GlusterFS 6.0
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        - create IO
        - Get arequal before getting bricks offline
        - set the volume option
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - Get arequal after getting bricks offline and compare with
        arequal before getting bricks offline
        - modify the data
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check if heal is completed
        - check for split-brain
        - create 5k files
        - while creating files - kill bricks and bring bricks online one by one
        in cycle
        - validate IO
        """
        # pylint: disable=too-many-statements

        # Checking if Client side healing options are disabled by default
        g.log.info('Checking Client side healing is disabled by default')
        options = ('cluster.metadata-self-heal', 'cluster.data-self-heal',
                   'cluster.entry-self-heal')
        for option in options:
            ret = get_volume_options(self.mnode, self.volname, option)[option]
            self.assertTrue(bool(ret == 'off' or ret == 'off (DEFAULT)'),
                            "{} option is not disabled by default"
                            .format(option))
            g.log.info("Client side healing options are disabled by default")

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("/usr/bin/env python %s create_files -f 100 "
                       "--fixed-file-size 1k %s" % (
                           self.script_upload_path,
                           mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks offline
        self.assertEqual(result_before_offline, result_after_offline,
                         'Checksums before and '
                         'after bringing bricks online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')

        # Modify the data
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("/usr/bin/env python %s create_files -f 100 "
                       "--fixed-file-size 10k %s" % (
                           self.script_upload_path,
                           mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all processes are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Create 1k files
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("/usr/bin/env python %s create_files -f 1000 %s" % (
                self.script_upload_path,
                mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Kill all bricks in cycle
        bricks_list = get_all_bricks(self.mnode, self.volname)
        for brick in bricks_list:
            # Bring brick offline
            g.log.info('Bringing bricks %s offline', brick)
            ret = bring_bricks_offline(self.volname, [brick])
            self.assertTrue(ret, 'Failed to bring bricks %s offline' % brick)

            ret = are_bricks_offline(self.mnode, self.volname,
                                     [brick])
            self.assertTrue(ret, 'Bricks %s are not offline'
                            % brick)
            g.log.info('Bringing bricks %s offline is successful',
                       bricks_to_bring_offline)

            # Bring brick online
            g.log.info('Bringing bricks %s online...', brick)
            ret = bring_bricks_online(self.mnode, self.volname,
                                      [brick])
            self.assertTrue(ret, 'Failed to bring bricks %s online' %
                            bricks_to_bring_offline)
            g.log.info('Bringing bricks %s online is successful',
                       bricks_to_bring_offline)

            # Wait for volume processes to be online
            g.log.info("Wait for volume processes to be online")
            ret = wait_for_volume_process_to_be_online(self.mnode,
                                                       self.volname)
            self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                                  "be online", self.volname))
            g.log.info("Successful in waiting for volume %s processes to be "
                       "online", self.volname)

            # Verify volume's all process are online
            g.log.info("Verifying volume's all process are online")
            ret = verify_all_process_of_volume_are_online(self.mnode,
                                                          self.volname)
            self.assertTrue(ret, ("Volume %s : All process are not online"
                                  % self.volname))
            g.log.info("Volume %s : All process are online", self.volname)

            # Wait for self-heal-daemons to be online
            g.log.info("Waiting for self-heal-daemons to be online")
            ret = is_shd_daemonized(self.all_servers)
            self.assertTrue(ret, "Either No self heal daemon process found or"
                                 "more than one self heal daemon process"
                                 "found")
            g.log.info("All self-heal-daemons are online")

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True
    def test_data_self_heal_algorithm_diff_heal_command(self):
        """
        Test Volume Option - 'cluster.data-self-heal-algorithm' : 'diff'

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        "data-self-heal-algorithm": "diff"
        "self-heal-daemon": "off"
        - create IO
        - calculate arequal
        - bring down all bricks processes from selected set
        - modify the data
        - get arequal before getting bricks online
        - bring bricks online
        - expand volume by adding bricks to the volume
        - do rebalance
        - set the volume option "self-heal-daemon": "on" and check for daemons
        - start healing
        - check if heal is completed
        - check for split-brain
        - calculate arequal and compare with arequal before bringing bricks
        offline and after bringing bricks online
        """
        # pylint: disable=too-many-branches,too-many-statements
        # Setting options
        g.log.info('Setting options...')
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off",
            "data-self-heal-algorithm": "diff"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal', "
                   "'self-heal-daemon' "
                   "are set to 'off',"
                   "'data-self-heal-algorithm' "
                   "is set to 'diff' successfully")

        # Creating files on client side
        all_mounts_procs = []
        g.log.info("Generating data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        # Creating files
        command = "/usr/bin/env python %s create_files -f 100 %s" % (
            self.script_upload_path, self.mounts[0].mountpoint)

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        command = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 1M %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Expand volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume...")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info("Expanding volume is successful on volume %s", self.volname)

        # Do rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Failed to start rebalance')
        g.log.info('Rebalance is started')

        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Rebalance is not completed')
        g.log.info('Rebalance is completed successfully')

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks online
        self.assertEqual(sorted(result_before_online),
                         sorted(result_after_online),
                         'Checksums are not equal')
        g.log.info('Checksums are equal')
Ejemplo n.º 15
0
    def test_self_heal_50k_files(self):
        """
        Description:
        - Select bricks to bring offline
        - Bring brick offline
        - Create 50k files
        - Validate IO
        - Bring bricks online
        - Monitor heal
        - Check for split-brain
        - Validate IO
        """
        # pylint: disable=too-many-statements,too-many-locals
        # Select bricks to bring offline
        bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
            self.mnode, self.volname)
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)
        self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Create 50k files
        command = ("cd %s ; "
                   "for i in `seq 1 50000` ; "
                   "do dd if=/dev/urandom of=test.$i "
                   "bs=100k count=1 ;  "
                   "done ;" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)

        # Validate IO
        self.assertTrue(validate_io_procs([proc], self.mounts[0]),
                        "IO failed on some of the clients")

        # Bring brick online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=3000)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')
Ejemplo n.º 16
0
    def test_self_heal_50k_files_heal_command_by_add_brick(self):
        """
        Test self-heal of 50k files (heal command
        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - create IO (50k files)
        - Get arequal before getting bricks online
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check for daemons
        - start healing
        - check if heal is completed
        - check for split-brain
        - get arequal after getting bricks online and compare with
        arequal before getting bricks online
        - add bricks
        - do rebalance
        - get arequal after adding bricks and compare with
        arequal after getting bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Setting options
        g.log.info('Setting options...')
        options = {"metadata-self-heal": "off",
                   "entry-self-heal": "off",
                   "data-self-heal": "off",
                   "self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create 50k files
            g.log.info('Creating files...')
            command = ("/usr/bin/env python %s create_files -f 50000 %s" % (
                self.script_upload_path,
                mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname,
                                      timeout_period=3600)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertItemsEqual(result_before_online, result_after_online,
                              'Checksums before and '
                              'after bringing bricks online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')

        # Add bricks
        g.log.info("Start adding bricks to volume...")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info("Expanding volume is successful on volume %s", self.volname)

        # Do rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Failed to start rebalance')
        g.log.info('Rebalance is started')

        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Rebalance is not completed')
        g.log.info('Rebalance is completed successfully')

        # Get arequal after adding bricks
        g.log.info('Getting arequal after adding bricks...')
        ret, result_after_adding_bricks = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks '
                   'is successful')

        # Checking arequals after bringing bricks online
        # and after adding bricks
        self.assertItemsEqual(result_after_online, result_after_adding_bricks,
                              'Checksums after bringing bricks online and '
                              'after adding bricks are not equal')
        g.log.info('Checksums after bringing bricks online and '
                   'after adding bricks are equal')
    def test_heal_info_should_have_fixed_fields(self):
        """
        - Create IO
        - While IO is creating - bring down a couple of bricks
        - Wait for IO to complete
        - Bring up the down bricks
        - Wait for heal to complete
        - Check for fields 'Brick', 'Status', 'Number of entries' in heal info
        """
        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                       "-d 2 -l 2 -f 50 %s" %
                       (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get heal info
        g.log.info('Getting heal info...')
        heal_info_dicts = get_heal_info_summary(self.mnode, self.volname)
        self.assertFalse(ret, 'Failed to get heal info')
        g.log.info(heal_info_dicts)

        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')

        # Check all fields in heal info dict
        g.log.info('Checking for all the fields in heal info...')
        for brick in bricks_list:
            g.log.info('Checking fields for %s', brick)
            self.assertEqual(heal_info_dicts[brick]['status'], 'Connected',
                             'Status is not Connected for brick %s' % brick)
            self.assertEqual(heal_info_dicts[brick]['numberOfEntries'], '0',
                             'numberOfEntries is not 0 for brick %s' % brick)

        g.log.info('Successfully checked for all the fields in heal info')
Ejemplo n.º 18
0
    def test_self_heal_algorithm_full_daemon_off(self):
        """""
        Description:-
        Checking healing when algorithm is set to "full" and self heal daemon
        is "off".
        """ ""
        # pylint: disable=too-many-statements

        # Setting volume option of self heal & algorithm
        options = {
            "metadata-self-heal": "disable",
            "entry-self-heal": "disable",
            "data-self-heal": "disable",
            "data-self-heal-algorithm": "full",
            "self-heal-daemon": "off"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, "Failed to set the volume options %s" % options)
        g.log.info(" Volume set options success")

        # Select bricks to bring down
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']
        g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)

        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret,
            "Failed to bring bricks: %s offline" % bricks_to_bring_offline)
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Validate if bricks are offline
        g.log.info("Validating if bricks: %s are offline",
                   bricks_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, "Not all the bricks in list:%s are offline" %
            bricks_to_bring_offline)
        g.log.info("Successfully validated that bricks %s are all offline",
                   bricks_to_bring_offline)

        # IO on the mount point
        all_mounts_procs = []
        g.log.info("Creating Files on %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        cmd = ("cd %s ;for i in `seq 1 100` ;"
               "do dd if=/dev/urandom of=file$i bs=1M "
               "count=1;done" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Collecting Arequal before bring the bricks up
        g.log.info("Collecting Arequal before the bring of bricks down")
        result_before = collect_mounts_arequal(self.mounts)

        # Turning self heal daemon ON
        optionstwo = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, optionstwo)
        self.assertTrue(ret, "Failed to turn self-heal ON")
        g.log.info("Volume set options %s: success", optionstwo)

        # Bring bricks online
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, "Failed to bring bricks: %s online" % bricks_to_bring_offline)
        g.log.info("Successfully brought all bricks:%s online",
                   bricks_to_bring_offline)

        # Waiting for bricks to come online
        g.log.info("Waiting for brick process to come online")
        ret = wait_for_bricks_to_be_online(self.mnode,
                                           self.volname,
                                           timeout=30)
        self.assertTrue(ret, "bricks didn't come online after adding bricks")
        g.log.info("Bricks are online")

        # Verifying all bricks online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self heal processes to come online
        g.log.info("Wait for selfheal process to come online")
        ret = wait_for_self_heal_daemons_to_be_online(self.mnode,
                                                      self.volname,
                                                      timeout=300)
        self.assertTrue(ret, "Self-heal process are not online")
        g.log.info("All self heal process are online")

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, "Self heal didn't complete even after waiting "
            "for 20 minutes. 20 minutes is too much a time for "
            "current test workload")
        g.log.info("self-heal is successful after replace-brick operation")

        # arequal after healing
        g.log.info("Collecting Arequal before the bring of bricks down")
        result_after = collect_mounts_arequal(self.mounts)

        # Comparing the results
        g.log.info("comparing both the results")
        self.assertEqual(result_before, result_after, "Arequals are not equal")
Ejemplo n.º 19
0
    def test_self_heal_50k_files_heal_command_by_add_brick(self):
        """
        Test self-heal of 50k files (heal command)
        Description:
        - Set the volume option
          "metadata-self-heal": "off"
          "entry-self-heal": "off"
          "data-self-heal": "off"
          "self-heal-daemon": "off"
        - Bring down all bricks processes from selected set
        - Create IO (50k files)
        - Get arequal before getting bricks online
        - Bring bricks online
        - Set the volume option
          "self-heal-daemon": "on"
        - Check for daemons
        - Start healing
        - Check if heal is completed
        - Check for split-brain
        - Get arequal after getting bricks online and compare with
          arequal before getting bricks online
        - Add bricks
        - Do rebalance
        - Get arequal after adding bricks and compare with
          arequal after getting bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Setting options
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off",
            "self-heal-daemon": "off"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Creating files on client side
        all_mounts_procs = []

        # Create 50k files
        g.log.info('Creating files...')
        command = ("cd %s ; "
                   "for i in `seq 1 50000` ; "
                   "do dd if=/dev/urandom of=test.$i "
                   "bs=100k count=1 ;  "
                   "done ;" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts[0]),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        ret, result_before_online = collect_mounts_arequal(self.mounts[0])
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        ret = set_volume_options(self.mnode, self.volname,
                                 {"self-heal-daemon": "on"})
        self.assertTrue(ret, 'Failed to set option self-heal-daemon to ON.')
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=3600)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        ret, result_after_online = collect_mounts_arequal(self.mounts[0])
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertItemsEqual(
            result_before_online, result_after_online, 'Checksums before and '
            'after bringing bricks online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')

        # Add bricks
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info("Expanding volume is successful on volume %s", self.volname)

        # Do rebalance and wait for it to complete
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Failed to start rebalance')
        g.log.info('Rebalance is started')
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=3600)
        self.assertTrue(ret, 'Rebalance is not completed')
        g.log.info('Rebalance is completed successfully')

        # Get arequal after adding bricks
        ret, result_after_adding_bricks = collect_mounts_arequal(
            self.mounts[0])
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks ' 'is successful')

        # Checking arequals after bringing bricks online
        # and after adding bricks
        self.assertItemsEqual(
            result_after_online, result_after_adding_bricks,
            'Checksums after bringing bricks online and '
            'after adding bricks are not equal')
        g.log.info('Checksums after bringing bricks online and '
                   'after adding bricks are equal')
Ejemplo n.º 20
0
    def test_self_heal_when_io_in_progress(self):
        """Test self-heal is successful when IO is in progress.

        Description:
            - simulate brick down.
            - bring bricks online
            - wait for heal to complete
            - validate IO
        """
        # pylint: disable=too-many-statements
        # Check if volume type is dispersed. If the volume type is
        # dispersed, set the volume option 'disperse.optimistic-change-log'
        # to 'off'
        # Refer to: https://bugzilla.redhat.com/show_bug.cgi?id=1470938
        # pylint: disable=unsupported-membership-test
        if 'dispersed' in self.volume_type and 'nfs' in self.mount_type:
            g.log.info("Set volume option 'disperse.optimistic-change-log' "
                       "to 'off' on a dispersed volume . "
                       "Refer to bug: "
                       "https://bugzilla.redhat.com/show_bug.cgi?id=1470938")
            ret = set_volume_options(self.mnode, self.volname,
                                     {'disperse.optimistic-change-log': 'off'})
            self.assertTrue(ret, ("Failed to set the volume option %s to "
                                  "off on volume %s",
                                  'disperse.optimistic-change-log',
                                  self.volname))
            g.log.info("Successfully set the volume option "
                       "'disperse.optimistic-change-log' to 'off'")

        # Log Volume Info and Status before simulating brick failure
        g.log.info("Logging volume info and Status before bringing bricks "
                   "offlien from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks']))

        # Bring bricks offline
        g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring bricks: %s offline",
                              bricks_to_bring_offline))
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Log Volume Info and Status
        g.log.info("Logging volume info and Status after bringing bricks "
                   "offline from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Validate if bricks are offline
        g.log.info("Validating if bricks: %s are offline",
                   bricks_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, ("Not all the bricks in list: %s are offline",
                              bricks_to_bring_offline))
        g.log.info("Successfully validated that bricks: %s are all offline",
                   bricks_to_bring_offline)

        # Add delay before bringing bricks online
        time.sleep(40)

        # Bring bricks online
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring bricks: %s online",
                              bricks_to_bring_offline))
        g.log.info("Successfully brought all bricks:%s online",
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Log Volume Info and Status
        g.log.info("Logging volume info and Status after bringing bricks "
                   "online from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online",
                              self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Self heal didn't complete even after waiting "
                        "for 20 minutes. 20 minutes is too much a time for "
                        "current test workload")
        g.log.info("self-heal is successful after replace-brick operation")

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
    def test_brick_process_not_started_on_read_only_node_disks(self):
        """
        * create volume and start
        * kill one brick
        * start IO
        * unmount the brick directory from node
        * remount the brick directory with read-only option
        * start the volume with "force" option
        * check for error 'posix: initializing translator failed' in log file
        * remount the brick directory with read-write option
        * start the volume with "force" option
        * validate IO
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Creating files for all volumes
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_files -f 100 "
                   "%s/test_dir" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        # umount brick
        brick_node, volume_brick = bricks_to_bring_offline[0].split(':')
        node_brick = '/'.join(volume_brick.split('/')[0:3])
        g.log.info('Start umount brick %s...', node_brick)
        ret, _, _ = g.run(brick_node, 'umount %s' % node_brick)
        self.assertFalse(ret, 'Failed to umount brick %s' % node_brick)
        g.log.info('Successfully umounted %s', node_brick)

        # get time before remount the directory and checking logs for error
        g.log.info('Getting time before remount the directory and '
                   'checking logs for error...')
        _, time_before_checking_logs, _ = g.run(brick_node, 'date -u +%s')
        g.log.info('Time before remount the directory and checking logs - %s',
                   time_before_checking_logs)

        # remount the directory with read-only option
        g.log.info('Start remount brick %s with read-only option...',
                   node_brick)
        ret, _, _ = g.run(brick_node, 'mount -o ro %s' % node_brick)
        self.assertFalse(ret, 'Failed to remount brick %s' % node_brick)
        g.log.info('Successfully remounted %s with read-only option',
                   node_brick)

        # start volume with "force" option
        g.log.info('starting volume with "force" option...')
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertFalse(
            ret,
            'Failed to start volume %s with "force" option' % self.volname)
        g.log.info('Successfully started volume %s with "force" option',
                   self.volname)

        # check logs for an 'initializing translator failed' error
        g.log.info(
            "Checking logs for an 'initializing translator failed' "
            "error for %s brick...", node_brick)
        error_msg = 'posix: initializing translator failed'
        cmd = ("cat /var/log/glusterfs/bricks/bricks-%s-%s.log | "
               "grep '%s'" % (volume_brick.split('/')[-2],
                              volume_brick.split('/')[-1], error_msg))
        ret, log_msgs, _ = g.run(brick_node, cmd)
        log_msg = log_msgs.rstrip().split('\n')[-1]

        self.assertTrue(error_msg in log_msg, 'No errors in logs')
        g.log.info('EXPECTED: %s', error_msg)

        # get time from log message
        log_time_msg = log_msg.split('E')[0][1:-2].split('.')[0]
        log_time_msg_converted = calendar.timegm(
            time.strptime(log_time_msg, '%Y-%m-%d %H:%M:%S'))
        g.log.info('Time_msg from logs - %s ', log_time_msg)
        g.log.info('Time from logs - %s ', log_time_msg_converted)

        # get time after remount the directory checking logs for error
        g.log.info('Getting time after remount the directory and '
                   'checking logs for error...')
        _, time_after_checking_logs, _ = g.run(brick_node, 'date -u +%s')
        g.log.info('Time after remount the directory and checking logs - %s',
                   time_after_checking_logs)

        # check time periods
        g.log.info('Checking if an error is in right time period...')
        self.assertTrue(
            int(time_before_checking_logs) <= int(log_time_msg_converted) <=
            int(time_after_checking_logs),
            'Expected error is not in right time period')
        g.log.info('Expected error is in right time period')

        # umount brick
        g.log.info('Start umount brick %s...', node_brick)
        ret, _, _ = g.run(brick_node, 'umount %s' % node_brick)
        self.assertFalse(ret, 'Failed to umount brick %s' % node_brick)
        g.log.info('Successfully umounted %s', node_brick)

        # remount the directory with read-write option
        g.log.info('Start remount brick %s with read-write option...',
                   node_brick)
        ret, _, _ = g.run(brick_node, 'mount %s' % node_brick)
        self.assertFalse(ret, 'Failed to remount brick %s' % node_brick)
        g.log.info('Successfully remounted %s with read-write option',
                   node_brick)

        # start volume with "force" option
        g.log.info('starting volume with "force" option...')
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertFalse(
            ret,
            'Failed to start volume %s with "force" option' % self.volname)
        g.log.info('Successfully started volume %s with "force" option',
                   self.volname)

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True
Ejemplo n.º 22
0
    def test_data_self_heal_algorithm_full_default(self):
        """
        Test Volume Option - 'cluster.data-self-heal-algorithm' : 'full'

        Description:
        - set the volume option "data-self-heal-algorithm" to value "full"
        - create IO
        - bring down all bricks processes from selected set
        - modify the data
        - calculate arequal
        - bring bricks online
        - start healing
        - calculate arequal and compare with arequal before bringing bricks
        offline and after bringing bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Setting options
        g.log.info('Setting options "data-self-heal-algorithm": "full"...')
        options = {"data-self-heal-algorithm": "full"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'data-self-heal-algorithm' is set to 'full' "
                   "successfully")

        # Creating files on client side
        all_mounts_procs = []
        g.log.info("Generating data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        # Creating files
        command = ("python %s create_files -f 100 %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = filter(
            None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                   bricks_to_bring_offline_dict['cold_tier_bricks'] +
                   bricks_to_bring_offline_dict['volume_bricks']))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        command = ("python %s create_files -f 100 --fixed-file-size 1M %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertItemsEqual(result_before_online, result_after_online,
                              'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')
Ejemplo n.º 23
0
    def test_glustershd_with_restarting_glusterd(self):
        """
        Test Script to verify the self heal daemon process with restarting
        glusterd and rebooting the server

        * stop all volumes
        * restart glusterd - should not run self heal daemon process
        * start replicated involved volumes
        * single self heal daemon process running
        * restart glusterd
        * self heal daemon pid will change
        * bring down brick and restart glusterd
        * self heal daemon pid will change and its different from previous
        * brought up the brick

        """
        # pylint: disable=too-many-statements
        nodes = self.volume['servers']

        # stop the volume
        g.log.info("Stopping the volume %s", self.volname)
        ret = volume_stop(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to stop volume %s" % self.volname))
        g.log.info("Successfully stopped volume %s", self.volname)

        # check the self heal daemon process after stopping the volume
        g.log.info("Verifying the self heal daemon process for "
                   "volume %s", self.volname)
        ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
        self.assertFalse(ret, ("Self Heal Daemon process is still running "
                               "even after stopping volume %s" % self.volname))
        g.log.info("Self Heal Daemon is not running after stopping  "
                   "volume %s", self.volname)

        # restart glusterd service on all the servers
        g.log.info("Restarting glusterd on all servers %s", nodes)
        ret = restart_glusterd(nodes)
        self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                              nodes))
        g.log.info("Successfully restarted glusterd on all nodes %s",
                   nodes)

        self.assertTrue(
            wait_for_glusterd_to_start(self.servers),
            "Failed to start glusterd on %s" % self.servers)

        # check the self heal daemon process after restarting glusterd process
        g.log.info("Starting to get self-heal daemon process on"
                   " nodes %s", nodes)
        ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
        self.assertFalse(ret, ("Self Heal Daemon process is running after "
                               "glusterd restart with volume %s in "
                               "stop state" % self.volname))
        g.log.info("Self Heal Daemon is not running after stopping  "
                   "volume and restarting glusterd %s", self.volname)

        # start the volume
        g.log.info("Starting the volume %s", self.volname)
        ret = volume_start(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to start volume %s" % self.volname))
        g.log.info("Volume %s started successfully", self.volname)

        # Verfiy glustershd process releases its parent process
        g.log.info("Checking whether glustershd process is daemonized or not")
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        g.log.info("Single self heal daemon process on all nodes %s", nodes)

        # get the self heal daemon pids after starting volume
        g.log.info("Starting to get self-heal daemon process "
                   "on nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        g.log.info("Successful in getting self heal daemon pids")
        glustershd_pids = pids

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # validate the bricks present in volume info
        # with glustershd server volume file
        g.log.info("Starting parsing file %s on "
                   "node %s", self.glustershd, self.mnode)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list)
        self.assertTrue(ret, ("Brick List from volume info is different from "
                              "glustershd server volume file. "
                              "Please check log file for details."))
        g.log.info("Successfully parsed %s file", self.glustershd)

        # restart glusterd service on all the servers
        g.log.info("Restarting glusterd on all servers %s", nodes)
        ret = restart_glusterd(nodes)
        self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                              nodes))
        g.log.info("Successfully restarted glusterd on all nodes %s",
                   nodes)

        # Verify volume's all process are online for 60 sec
        g.log.info("Verifying volume's all process are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   60)
        self.assertTrue(ret, ("Volume %s : All process are not "
                              "online", self.volname))
        g.log.info("Successfully Verified volume %s processes are online",
                   self.volname)

        # Verfiy glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))

        # check the self heal daemon process after starting volume and
        # restarting glusterd process
        g.log.info("Starting to get self-heal daemon process "
                   "on nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        glustershd_pids_after_glusterd_restart = pids

        self.assertNotEqual(glustershd_pids,
                            glustershd_pids_after_glusterd_restart,
                            ("Self Heal Daemon pids are same after "
                             "restarting glusterd process"))
        g.log.info("Self Heal Daemon process are different before and "
                   "after restarting glusterd process")

        # select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # bring bricks offline
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)

        # restart glusterd after brought down the brick
        g.log.info("Restart glusterd on all servers %s", nodes)
        ret = restart_glusterd(nodes)
        self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                              nodes))
        g.log.info("Successfully restarted glusterd on all nodes %s",
                   nodes)

        # Verify volume's all process are online for 60 sec
        g.log.info("Verifying volume's all process are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   60)
        self.assertTrue(ret, ("Volume %s : All process are not "
                              "online", self.volname))
        g.log.info("Successfully Verified volume %s processes are online",
                   self.volname)

        # Verfiy glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))

        # check the self heal daemon process after killing brick and
        # restarting glusterd process
        g.log.info("Starting to get self-heal daemon process "
                   "on nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        glustershd_pids_after_killing_brick = pids

        self.assertNotEqual(glustershd_pids_after_glusterd_restart,
                            glustershd_pids_after_killing_brick,
                            ("Self Heal Daemon process are same from before "
                             "killing the brick,restarting glusterd process"))
        g.log.info("Self Heal Daemon process are different after killing the "
                   "brick, restarting the glusterd process")

        # brought the brick online
        g.log.info("bringing up the bricks : %s online",
                   bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to brought the bricks online"))
        g.log.info("Successfully brought the bricks online")

        # check all bricks are online
        g.log.info("Verifying all bricka are online or not.....")
        ret = are_bricks_online(self.mnode, self.volname,
                                bricks_to_bring_offline)
        self.assertTrue(ret, ("Not all bricks are online"))
        g.log.info("All bricks are online.")
Ejemplo n.º 24
0
    def test_self_heal(self):
        """
        Description:-
        - Create files on mount point
        - Kill one brick from volume
        - rm -rfv on mount point
        - bring bricks online
        - wait for heals
        - list
        """
        # pylint: disable=too-many-statements

        # IO on the mount point
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = (
                "/usr/bin/env python %s create_deep_dirs_with_files "
                "--dirname-start-num %d "
                "--dir-depth 2 "
                "--dir-length 35 "
                "--max-num-of-dirs 5 "
                "--num-of-files 5 %s" %
                (self.script_upload_path, self.counter, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            self.counter = self.counter + 10

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Killing one brick from the volume set
        g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s offline", bricks_to_bring_offline))
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Validate if bricks are offline
        g.log.info("Validating if bricks: %s are offline",
                   bricks_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, "Not all the bricks in list: %s are offline" %
            bricks_to_bring_offline)
        g.log.info("Successfully validated that bricks: %s are all offline",
                   bricks_to_bring_offline)

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # Checking volume status
        g.log.info(
            "Logging volume info and Status after bringing bricks "
            "offline from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Removing files from the mount point when one brick is down
        g.log.info("Removing files from the mount point")
        mountpoint = self.mounts[0].mountpoint
        client = self.mounts[0].client_system
        cmd = "rm -rfv %s/*" % mountpoint
        ret, _, _ = g.run(client, cmd)
        if ret != 0:
            raise ExecutionError("failed to delete the files")

        # Bringing bricks online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bricks %s are online', bricks_to_bring_offline)

        # Check if bricks are online
        g.log.info("Checking bricks are online or not")
        ret = are_bricks_online(self.mnode, self.volname,
                                bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not online' % bricks_to_bring_offline)
        g.log.info('Bricks %s are online', bricks_to_bring_offline)

        # Monitoring heals on the volume
        g.log.info("Wait for heal completion...")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, "Self heal didn't complete even after waiting "
            "for 20 minutes.")
        g.log.info("self-heal is successful after changing the volume type "
                   "from replicated to arbitered volume")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")