    def _bring_bricks_online_heal(self, mnode, volname, bricks_list):
        """
        Bring bricks online and monitor heal completion
        """
        # Bring bricks online
        ret = bring_bricks_online(
            mnode,
            volname,
            bricks_list,
            bring_bricks_online_methods=['volume_start_force'])
        self.assertTrue(ret, 'Failed to bring bricks online')

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(mnode, volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(volname)))

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(mnode, volname)
        self.assertTrue(
            ret, ("Volume {} : All process are not online".format(volname)))
        g.log.info("Volume %s : All process are online", volname)

        # Monitor heal completion
        ret = monitor_heal_completion(mnode, volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(mnode, volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
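A brief usage sketch for the helper above, assuming the surrounding test class and the brick_libs helpers (get_all_bricks, bring_bricks_offline, are_bricks_offline) already used throughout these examples; the test name is hypothetical:

    def test_brick_down_heal_example(self):
        """Sketch (hypothetical test): take one brick down, then reuse the
        helper above to bring it back and wait for self-heal"""
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')

        # Kill one brick and confirm it is down
        ret = bring_bricks_offline(self.volname, [bricks_list[0]])
        self.assertTrue(ret,
                        'Failed to bring brick %s offline' % bricks_list[0])
        ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[0]])
        self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[0])

        # Force-start the volume, wait for all processes, monitor heal
        # completion and check for split-brain via the helper above
        self._bring_bricks_online_heal(self.mnode, self.volname, bricks_list)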
Example 2
    def _validate_brick_down_scenario(self,
                                      validate_heal=False,
                                      monitor_heal=False):
        """
        Refactor of common steps across volume type for validating brick down
        scenario
        """
        if validate_heal:
            # Wait for ample amount of IO to be written to file
            sleep(180)

            # Validate heal info shows output and exits in under 8s
            self._validate_heal()

        # Force start volume and verify all process are online
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, 'Unable to force start volume')

        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, 'Not able to confirm all process of volume are online')

        if monitor_heal:
            # Wait for IO to be written to file
            sleep(30)

            # Monitor heal and validate data was appended successfully to file
            ret = monitor_heal_completion(self.mnode, self.volname)
            self.assertTrue(ret,
                            'Self heal is not completed post brick online')
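The _validate_heal helper called above is not part of this excerpt. A minimal sketch of what it could look like, assuming the intent stated in the comment (heal info must produce output and return within 8 seconds), the real `gluster volume heal <vol> info` command, and `from time import time` alongside the `sleep` already in use; the actual helper in the suite may differ:

    def _validate_heal(self, timeout=8):
        """Sketch (hypothetical reconstruction): heal info should produce
        output and exit within `timeout` seconds"""
        start_time = time()
        ret, out, _ = g.run(self.mnode,
                            'gluster volume heal %s info' % self.volname)
        elapsed = time() - start_time
        self.assertEqual(ret, 0,
                         'Failed to run heal info on %s' % self.volname)
        self.assertTrue(out, 'Heal info returned no output')
        self.assertLess(elapsed, timeout,
                        'Heal info took %.2fs, expected under %ds'
                        % (elapsed, timeout))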
    def _validate_heal_completion_and_arequal(self, op_type):
        '''Refactor of steps common to all tests: Validate heal from heal
        commands, verify arequal, perform IO and verify arequal after IO'''

        # Validate heal completion
        self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                        'Self heal is not completed within timeout')
        self.assertFalse(
            is_volume_in_split_brain(self.mnode, self.volname),
            'Volume is in split brain even after heal completion')

        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        self.assertTrue(subvols, 'Not able to get list of subvols')
        arbiter = self.volume_type.find('arbiter') >= 0
        stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])

        # Validate arequal
        self._validate_arequal_and_perform_lookup(subvols, stop)

        # Perform some additional metadata/data operations
        cmd = self.op_cmd[op_type][4].format(self.fqpath, self.io_cmd)
        ret, _, err = g.run(self.client, cmd)
        self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
        self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

        # Validate arequal after additional operations
        self._validate_arequal_and_perform_lookup(subvols, stop)
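The _validate_arequal_and_perform_lookup helper is referenced but not shown. A sketch under the assumption that it compares arequal checksums across the first `stop` (data) bricks of every subvolume, skipping the arbiter brick, and then performs a recursive lookup from the mount; the arequal-checksum invocation is the one that appears verbatim in a later example, and self.client/self.fqpath are the attributes used above:

    def _validate_arequal_and_perform_lookup(self, subvols, stop):
        """Sketch (hypothetical reconstruction): arequal must match across
        the first `stop` bricks of every subvol, then lookup from mount"""
        for subvol in subvols:
            checksums = set()
            for brick in subvol[0:stop]:
                node, brick_path = brick.split(':')
                cmd = ('arequal-checksum -p %s -i .glusterfs -i .landfill '
                       '-i .trashcan' % brick_path)
                ret, arequal, _ = g.run(node, cmd)
                self.assertEqual(ret, 0,
                                 'Failed to get arequal on brick %s' % brick)
                checksums.add(arequal.splitlines()[-1].split(':')[-1])
            self.assertEqual(len(checksums), 1,
                             'Arequal mismatch across bricks of subvol %s'
                             % subvol)

        # Recursive lookup from the mount after validating checksums
        ret, _, _ = g.run(self.client, 'ls -R %s > /dev/null' % self.fqpath)
        self.assertEqual(ret, 0, 'Lookup failed on %s' % self.fqpath)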
Example 4
    def test_replica_to_arbiter_volume_with_io(self):
        """
        Description: Replica 3 to arbiter conversion with ongoing IO

        Steps :
        1) Create a replica 3 volume and start volume.
        2) Set client side self heal off.
        3) Fuse mount the volume.
        4) Create directory dir1 and write data.
           Example: untar linux tar from the client into the dir1
        5) While IO is running, execute the remove-brick command
           to convert the replica 3 volume to replica 2.
        6) Execute add-brick command and convert to arbiter volume,
           provide the path of new arbiter brick.
        7) Issue gluster volume heal.
        8) Heal should be completed with no files in split-brain.
        """

        # pylint: disable=too-many-statements
        # Create a dir to start untar
        self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint,
                                              "linuxuntar")
        ret = mkdir(self.clients[0], self.linux_untar_dir)
        self.assertTrue(ret, "Failed to create dir linuxuntar for untar")

        # Start linux untar on dir linuxuntar
        self.io_process = run_linux_untar(self.clients[0],
                                          self.mounts[0].mountpoint,
                                          dirs=tuple(['linuxuntar']))
        self.is_io_running = True

        # Convert replicated volume to arbiter volume
        self._convert_replicated_to_arbiter_volume()

        # Wait for IO to complete.
        ret = self._wait_for_untar_completion()
        self.assertFalse(ret, "IO didn't complete or failed on client")
        self.is_io_running = False

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=3600)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')
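_wait_for_untar_completion is referenced above but not included. A sketch, assuming the async process handles returned by run_linux_untar expose async_communicate() (the same handle type g.run_async produces in other examples) and the falsy-on-success convention implied by the assertFalse above:

    def _wait_for_untar_completion(self):
        """Sketch (hypothetical reconstruction): wait for the untar procs
        and return a truthy value if any of them failed"""
        failed = 0
        for proc in self.io_process:
            ret, _, _ = proc.async_communicate()
            if ret:
                failed += 1
        return failed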
Example 5
    def _bring_bricks_online_and_monitor_heal(self, bricks):
        """Bring the bricks online and monitor heal until completion"""
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, 'Not able to force start volume')
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      bricks=list(bricks))
        self.assertTrue(ret, 'Heal is not complete for {}'.format(bricks))
Example 6
def ec_check_heal_comp(self):
    g.log.info("Get the pending heal info for the volume %s", self.volname)
    heal_info = get_heal_info(self.mnode, self.volname)
    g.log.info("Successfully got heal info for the volume %s", self.volname)
    g.log.info("Heal Entries %s : %s", self.volname, heal_info)

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')
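If a stricter check than logging is wanted, the heal-info entries can be inspected directly. A sketch, assuming get_heal_info returns per-brick dicts carrying 'name' and 'numberOfEntries' fields as in `gluster volume heal <vol> info --xml` output; the field names may differ across glustolibs versions:

def ec_assert_no_pending_heals(self):
    """Sketch (hypothetical helper): fail if any brick still reports
    pending heal entries"""
    heal_info = get_heal_info(self.mnode, self.volname)
    self.assertIsNotNone(heal_info,
                         'Failed to get heal info for %s' % self.volname)
    for brick in heal_info:
        self.assertEqual(brick.get('numberOfEntries'), '0',
                         'Brick %s still has pending heal entries'
                         % brick.get('name'))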
Example 7
    def test_replace_brick_when_io_in_progress(self):
        """Test replacing brick using existing servers bricks when IO is
            in progress.

        Description:
            - replace_brick
            - wait for heal to complete
            - validate IO
        """
        # Log Volume Info and Status before replacing brick from the volume.
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Replace brick from a sub-volume
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Failed to replace faulty brick from the volume")
        g.log.info("Successfully replaced faulty brick from the volume")

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))

        # Log Volume Info and Status after replacing the brick
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))

        # Wait for self-heal to complete
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=1800)
        self.assertTrue(
            ret, "Self heal didn't complete even after waiting "
            "for 30 minutes, which is more than enough time for "
            "the current test workload")

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")

        # List all files and dirs created
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
Example 8
    def test_gfid_heal(self):
        """
        - Create a 1x3 volume and fuse mount it.
        - Create 1 directory with 1 file inside it directly on each brick.
        - Access the directories from the mount.
        - Launch heals and verify that the heals are over.
        - Verify that the files and directories have gfid assigned.
        """
        # pylint: disable=too-many-statements

        # Create data on the bricks.
        g.log.info("Creating directories and files on the backend.")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        i = 0
        for brick in bricks_list:
            i += 1
            brick_node, brick_path = brick.split(":")
            ret, _, _ = g.run(brick_node, "mkdir %s/dir%d" % (brick_path, i))
            self.assertEqual(ret, 0, "Dir creation failed on %s" % brick_path)
            ret, _, _ = g.run(brick_node,
                              "touch %s/dir%d/file%d" % (brick_path, i, i))
            self.assertEqual(ret, 0, "file creation failed on %s" % brick_path)
        g.log.info("Created directories and files on the backend.")

        # To circumvent is_fresh_file() check in glusterfs code.
        time.sleep(2)

        # Access files from mount
        for i in range(1, 4):
            cmd = ("ls %s/dir%d/file%d" % (self.mounts[0].mountpoint, i, i))
            ret, _, _ = g.run(self.clients[0], cmd)
            self.assertEqual(
                ret, 0, "Failed to access dir%d/file%d on %s" %
                (i, i, self.mounts[0].mountpoint))

        # Trigger heal
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Starting heal failed')
        g.log.info('Index heal launched')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Verify gfid and links at the backend.
        self.verify_gfid_and_link_count("dir1", "file1")
        self.verify_gfid_and_link_count("dir2", "file2")
        self.verify_gfid_and_link_count("dir3", "file3")
    def test_compile_linux_kernel_with_self_heal(self):
        """
        Test Case:
        1. Create a volume of any type, start it and mount it.
        2. Perform linux untar on the mount point and wait for it to complete.
        3. Bring down brick processes of volume.
        4. Compile linux kernel on mount point.
        5. Bring offline bricks back online.
        6. Wait for heal to complete.
        7. Check arequal checksum on bricks and make sure there is
           no data loss.

        Note:
        Run the below command before this test:
        yum install elfutils-libelf-devel openssl-devel git fakeroot
        ncurses-dev xz-utils libssl-dev bc flex libelf-dev bison

        OR

        dnf install make flex bison elfutils-libelf-devel openssl-devel

        The below test takes anywhere between 27586.94 sec and 35396.90 sec,
        as kernel compilation is a very slow process in general.
        """
        # Perform linux untar on the mount point and wait for it to complete
        self._untar_linux_kernel_in_a_specific_dir()

        # Bring down brick processes of volume
        self._bring_bricks_offline()

        # Compile linux kernel on mount point
        cmd = ('cd {}/{}/linux-5.4.54/; make defconfig; make'.format(
            self.mountpoint, 'test_self_heal'))
        ret, _, _ = g.run(self.first_client, cmd)
        self.assertFalse(ret, "Failed to compile linux kernel")
        g.log.info("linux kernel compliation successful")

        # Bring offline bricks back online
        self._restart_volume_and_bring_all_offline_bricks_online()

        # Wait for heal to complete
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=3600)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check arequal checksum on bricks and make sure there is
        # no data loss
        self._check_arequal_checksum_for_the_volume()
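_check_arequal_checksum_for_the_volume is referenced but not shown. A sketch for a pure replicate volume, assuming the base class also provides self.mounts as in the neighbouring examples: the checksum collected from the mount must match every brick (same arequal-checksum flags as in the manual-heal example later in this page):

    def _check_arequal_checksum_for_the_volume(self):
        """Sketch (hypothetical reconstruction) for a replicate volume:
        the mount checksum must equal each brick's checksum"""
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal from mounts')
        mount_total = arequals[0].splitlines()[-1].split(':')[-1]

        for brick in get_all_bricks(self.mnode, self.volname):
            node, brick_path = brick.split(':')
            cmd = ('arequal-checksum -p %s -i .glusterfs -i .landfill '
                   '-i .trashcan' % brick_path)
            ret, arequal, _ = g.run(node, cmd)
            self.assertFalse(ret,
                             'Failed to get arequal on brick %s' % brick)
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(mount_total, brick_total,
                             'Arequal mismatch between mount and %s' % brick)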
    def _replace_bricks_and_wait_for_heal_completion(self):
        """ Replaces all the bricks and waits for the heal to complete"""
        existing_bricks = get_all_bricks(self.mnode, self.volname)
        for brick_to_replace in existing_bricks:
            ret = replace_brick_from_volume(self.mnode,
                                            self.volname,
                                            self.servers,
                                            self.all_servers_info,
                                            src_brick=brick_to_replace)
            self.assertTrue(ret, "Replace of %s failed" % brick_to_replace)
            g.log.info("Replace of brick %s successful for volume %s",
                       brick_to_replace, self.volname)

            # Monitor heal completion
            ret = monitor_heal_completion(self.mnode, self.volname)
            self.assertTrue(ret, 'Heal has not yet completed')
            g.log.info('Heal has completed successfully')
    def test_replace_brick_when_io_in_progress(self):
        """Test replacing brick using existing servers bricks when IO is
            in progress.

        Description:
            - replace_brick
            - wait for heal to complete
            - validate IO
        """
        # Log Volume Info and Status before replacing brick from the volume.
        g.log.info(
            "Logging volume info and Status before replacing brick "
            "from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Replace brick from a sub-volume
        g.log.info("Replace a faulty brick from the volume")
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Failed to replace faulty brick from the volume")
        g.log.info("Successfully replaced faulty brick from the volume")

        # Wait for gluster processes to come online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))

        # Log Volume Info and Status after replacing the brick
        g.log.info(
            "Logging volume info and Status after replacing brick "
            "from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, "Self heal didn't complete even after waiting "
            "for 20 minutes, which is more than enough time for "
            "the current test workload")
        g.log.info("self-heal is successful after replace-brick operation")

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
    def test_heal_command_unsuccessful_as_bricks_down(self):
        """
        - write 2 Gb file on mount
        - while write is in progress, kill brick b0
        - start heal on the volume (should fail and have error message)
        - bring up the brick which was down (b0)
        - bring down another brick (b1)
        - start heal on the volume (should fail and have error message)
        - bring bricks up
        - wait for heal to complete
        """
        # pylint: disable=too-many-statements
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create 2 Gb file
            g.log.info('Creating files...')
            command = ("cd %s ; dd if=/dev/zero of=file1  bs=10M  count=200"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Bring brick0 offline
        g.log.info('Bringing bricks %s offline...', bricks_list[0])
        ret = bring_bricks_offline(self.volname, [bricks_list[0]])
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_list[0])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[0]])
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_list[0])
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_list[0])

        # Start healing
        # Need to use 'gluster volume heal' command to check error message
        # after g.run
        cmd = "gluster volume heal %s" % self.volname
        ret, _, err = g.run(self.mnode, cmd)
        self.assertTrue(ret, 'Heal command unexpectedly succeeded')
        # Check for error message
        self.assertIn("Launching heal operation to perform index self heal on "
                      "volume %s has been unsuccessful" % self.volname,
                      err,
                      "Error message is not present or not valid")
        g.log.info('Expected: Healing is not started')

        # Bring brick0 online
        g.log.info("Bring bricks: %s online", bricks_list[0])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[0]])
        self.assertTrue(ret, "Failed to bring bricks: %s online"
                        % bricks_list[0])
        g.log.info("Successfully brought all bricks:%s online",
                   bricks_list[0])

        # Bring brick1 offline
        g.log.info('Bringing bricks %s offline...', bricks_list[1])
        ret = bring_bricks_offline(self.volname, [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_list[1])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[1]])
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_list[1])
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_list[1])

        # Start healing
        # Need to use 'gluster volume heal' command to check error message
        # after g.run
        cmd = "gluster volume heal %s" % self.volname
        ret, _, err = g.run(self.mnode, cmd)
        self.assertTrue(ret, 'Heal command unexpectedly succeeded')
        # Check for error message
        self.assertIn("Launching heal operation to perform index self heal on "
                      "volume %s has been unsuccessful" % self.volname,
                      err,
                      "Error message is not present or not valid")
        g.log.info('Expected: Healing is not started')

        # Bring brick 1 online
        g.log.info("Bring bricks: %s online", bricks_list[1])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[1]])
        self.assertTrue(ret, "Failed to bring bricks: %s online"
                        % bricks_list[1])
        g.log.info("Successfully brought all bricks:%s online",
                   bricks_list[1])

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True
Example 13
    def test_manual_heal_should_trigger_heal(self):
        """
        - create a single brick volume
        - add some files and directories
        - get arequal from mountpoint
        - add-brick such that this brick makes the volume a replica vol 1x2
        - start heal
        - make sure heal is completed
        - get arequals from all bricks and compare with arequal from mountpoint
        """
        # pylint: disable=too-many-statements,too-many-locals
        # Start IO on mounts
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("python %s create_deep_dirs_with_files "
                   "--dir-length 1 "
                   "--dir-depth 1 "
                   "--max-num-of-dirs 1 "
                   "--num-of-files 10 %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            g.log.info("IO on %s:%s is started successfully",
                       mount_obj.client_system, mount_obj.mountpoint)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # Get arequal for mount before adding bricks
        g.log.info('Getting arequal before adding bricks...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before adding bricks is successful')
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Form brick list to add
        g.log.info('Forming brick list to add...')
        bricks_to_add = form_bricks_list(self.mnode, self.volname, 1,
                                         self.servers, self.all_servers_info)
        g.log.info('Brick list to add: %s', bricks_to_add)

        # Add bricks
        g.log.info("Start adding bricks to volume...")
        ret, _, _ = add_brick(self.mnode,
                              self.volname,
                              bricks_to_add,
                              force=True,
                              replica_count=2)
        self.assertFalse(ret, "Failed to add bricks %s" % bricks_to_add)
        g.log.info("Adding bricks is successful on volume %s", self.volname)

        # Make sure the newly added bricks are available in the volume
        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume: %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick list: %s", bricks_list)
        for brick in bricks_to_add:
            self.assertIn(brick, bricks_list,
                          'Brick %s is not in brick list' % brick)
        g.log.info('New bricks are present in the volume')

        # Make sure volume change from distribute to replicate volume
        vol_info_dict = get_volume_type_info(self.mnode, self.volname)
        vol_type = vol_info_dict['volume_type_info']['typeStr']
        self.assertEqual(
            'Replicate', vol_type, 'Volume type is not converted to Replicate '
            'after adding bricks')
        g.log.info('Volume type is successfully converted to Replicate '
                   'after adding bricks')

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal on bricks and compare with mount_point_total
        # It should be the same
        g.log.info('Getting arequal on bricks...')
        arequals_after_heal = {}
        for brick in bricks_list:
            g.log.info('Getting arequal on bricks %s...', brick)
            node, brick_path = brick.split(':')
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan' % brick_path)
            ret, arequal, _ = g.run(node, command)
            self.assertFalse(ret, 'Failed to get arequal on brick %s' % brick)
            g.log.info('Getting arequal for %s is successful', brick)
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            arequals_after_heal[brick] = brick_total
            self.assertEqual(
                mount_point_total, brick_total,
                'Arequals for mountpoint and %s are not equal' % brick)
            g.log.info('Arequals for mountpoint and %s are equal', brick)
        g.log.info('All arequals are equal for replicated')
    def test_data_split_brain_resolution(self):
        # Setting options
        g.log.info('Setting options...')
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Creating files and directories on client side
        g.log.info('Creating files and directories...')
        cmd = ("for i in `seq 1 10`; do mkdir %s/dir.$i; for j in `seq 1 5`;"
               "do dd if=/dev/urandom of=%s/dir.$i/file.$j bs=1K count=1;"
               "done; dd if=/dev/urandom of=%s/file.$i bs=1K count=1; done" %
               (self.mounts[0].mountpoint, self.mounts[0].mountpoint,
                self.mounts[0].mountpoint))

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Creating files and directories failed")
        g.log.info("Files & directories created successfully")

        # Check arequals for all the bricks
        g.log.info('Getting arequal before getting bricks offline...')
        self.verify_brick_arequals()
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Set option self-heal-daemon to OFF
        g.log.info('Setting option self-heal-daemon to off...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        bricks_list = get_all_bricks(self.mnode, self.volname)

        # Bring brick1 offline
        g.log.info('Bringing brick %s offline', bricks_list[0])
        ret = bring_bricks_offline(self.volname, bricks_list[0])
        self.assertTrue(ret,
                        'Failed to bring bricks %s offline' % bricks_list[0])

        ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[0]])
        self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[0])
        g.log.info('Bringing brick %s offline is successful', bricks_list[0])

        # Modify the contents of the files
        cmd = ("for i in `seq 1 10`; do for j in `seq 1 5`;"
               "do dd if=/dev/urandom of=%s/dir.$i/file.$j bs=1M count=1;"
               "done; dd if=/dev/urandom of=%s/file.$i bs=1K count=1; done" %
               (self.mounts[0].mountpoint, self.mounts[0].mountpoint))

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Updating file contents failed")
        g.log.info("File contents updated successfully")

        # Bring brick1 online and check the status
        g.log.info('Bringing brick %s online', bricks_list[0])
        ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[0]])
        self.assertTrue(ret,
                        'Failed to bring brick %s online' % bricks_list[0])
        g.log.info('Bringing brick %s online is successful', bricks_list[0])

        g.log.info("Verifying if brick %s is online", bricks_list[0])
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, ("Brick %s did not come up", bricks_list[0]))
        g.log.info("Brick %s has come online.", bricks_list[0])

        # Bring brick2 offline
        g.log.info('Bringing brick %s offline', bricks_list[1])
        ret = bring_bricks_offline(self.volname, bricks_list[1])
        self.assertTrue(ret,
                        'Failed to bring bricks %s offline' % bricks_list[1])

        ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[1]])
        self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[1])
        g.log.info('Bringing brick %s offline is successful', bricks_list[1])

        # Modify the contents of the files
        cmd = ("for i in `seq 1 10`; do for j in `seq 1 5`;"
               "do dd if=/dev/urandom of=%s/dir.$i/file.$j bs=1M count=2;"
               "done; dd if=/dev/urandom of=%s/file.$i bs=1K count=2; done" %
               (self.mounts[0].mountpoint, self.mounts[0].mountpoint))

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Updating file contents failed")
        g.log.info("File contents updated successfully")

        # Bring brick2 online and check the status
        g.log.info('Bringing brick %s online', bricks_list[1])
        ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[1]])
        self.assertTrue(ret,
                        'Failed to bring brick %s online' % bricks_list[1])
        g.log.info('Bringing brick %s online is successful', bricks_list[1])

        g.log.info("Verifying if brick %s is online", bricks_list[1])
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, ("Brick %s did not come up", bricks_list[1]))
        g.log.info("Brick %s has come online.", bricks_list[1])

        # Set option self-heal-daemon to ON
        g.log.info('Setting option self-heal-daemon to on...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        g.log.info("Checking if files are in split-brain")
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, "Unable to create split-brain scenario")
        g.log.info("Successfully created split brain scenario")

        g.log.info("Resolving split-brain by using the source-brick option "
                   "by choosing second brick as source for all the files")
        node, _ = bricks_list[1].split(':')
        command = ("gluster v heal " + self.volname + " split-brain "
                   "source-brick " + bricks_list[1])
        ret, _, _ = g.run(node, command)
        self.assertEqual(ret, 0, "Command execution not successful")

        # triggering heal
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, "Heal not triggered")

        # waiting for heal to complete
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=120)
        self.assertTrue(ret, "Heal not completed")

        # Try accessing the file content from the mount
        cmd = ("for i in `seq 1 10`; do cat %s/file.$i > /dev/null;"
               "for j in `seq 1 5` ; do cat %s/dir.$i/file.$j > /dev/null;"
               "done ; done" %
               (self.mounts[0].mountpoint, self.mounts[0].mountpoint))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Unable to access the file contents")
        g.log.info("File contents are accessible")

        # checking if file is in split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, "File still in split-brain")
        g.log.info("Successfully resolved split brain situation using "
                   "CLI based resolution")

        # Check arequals for all the bricks
        g.log.info('Getting arequal for all the bricks after heal...')
        self.verify_brick_arequals()
        g.log.info('Getting arequal after heal is successful')
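Besides source-brick, the gluster CLI can resolve split-brain per file by size or by latest modification time. A sketch of a small helper wrapping these policies; the helper name is hypothetical, and file paths are given relative to the volume root (e.g. '/dir.1/file.1' for the files created in this test):

    def _resolve_split_brain_by_policy(self, policy, path):
        """Sketch (hypothetical helper): resolve split-brain for one file
        using a CLI policy such as 'bigger-file' or 'latest-mtime'"""
        cmd = ('gluster volume heal %s split-brain %s %s'
               % (self.volname, policy, path))
        ret, _, _ = g.run(self.mnode, cmd)
        self.assertEqual(ret, 0, 'Failed to resolve split-brain of %s '
                         'using %s' % (path, policy))

For example, self._resolve_split_brain_by_policy('latest-mtime', '/dir.1/file.1') keeps the most recently modified copy of that file.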
    def test_afr_heal_with_brickdown_hardlink(self):
        """
        Steps:
        1. Create a 2x3 distributed-replicate volume and disable all heals
        2. Create a file and 3 hardlinks to it from fuse mount.
        3. Kill brick4, rename HLINK1 to an appropriate name so that
           it gets hashed to replicate-1
        4. Likewise rename HLINK2 and HLINK3 as well, killing brick5 and
           brick6 respectively each time, i.e. a different brick of the 2nd
           replica is down each time.
        5. Now enable shd and let selfheals complete.
        6. Heal should complete without split-brains.
        """
        bricks_list = get_all_bricks(self.mnode, self.volname)
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off",
            "self-heal-daemon": "off"
        }
        g.log.info("setting options %s", options)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, ("Unable to set volume option %s for"
                              "volume %s" % (options, self.volname)))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        cmd = ("touch %s/FILE" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "file creation failed")

        # Creating a hardlink for the file created
        for i in range(1, 4):
            ret = create_link_file(
                self.clients[0], '{}/FILE'.format(self.mounts[0].mountpoint),
                '{}/HLINK{}'.format(self.mounts[0].mountpoint, i))
            self.assertTrue(ret, "Unable to create hard link file ")

        # Bring brick4 offline, rename HLINK1, and bring brick4 back online
        self._test_brick_down_with_file_rename("HLINK1", "NEW-HLINK1",
                                               bricks_list[3])

        # Bring brick5 offline, rename HLINK2, and bring brick5 back online
        self._test_brick_down_with_file_rename("HLINK2", "NEW-HLINK2",
                                               bricks_list[4])

        # Bring brick6 offline, rename HLINK3, and bring brick6 back online
        self._test_brick_down_with_file_rename("HLINK3", "NEW-HLINK3",
                                               bricks_list[5])

        # Setting options
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Check data on mount point
        cmd = ("ls %s" % (self.mounts[0].mountpoint))
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "failed to fetch data from mount point")
    def test_self_heal_differing_in_file_type(self):
        """
        testing self heal of files with different file types
        with default configuration

        Description:
        - create IO
        - calculate arequal
        - bring down all bricks processes from selected set
        - calculate arequal and compare with arequal before
        getting bricks offline
        - modify the data
        - arequal before getting bricks online
        - bring bricks online
        - check daemons and healing completion
        - start healing
        - calculate arequal and compare with arequal before bringing bricks
        online and after bringing bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Creating files on client side
        all_mounts_procs = []
        test_file_type_differs_self_heal_folder = \
            'test_file_type_differs_self_heal'
        g.log.info("Generating data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)

        # Creating files
        command = ("cd %s/ ; "
                   "mkdir %s ;"
                   "cd %s/ ;"
                   "for i in `seq 1 10` ; "
                   "do mkdir l1_dir.$i ; "
                   "for j in `seq 1 5` ; "
                   "do mkdir l1_dir.$i/l2_dir.$j ; "
                   "for k in `seq 1 10` ; "
                   "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
                   "bs=1k count=$k ; "
                   "done ; "
                   "done ; "
                   "done ; "
                   % (self.mounts[0].mountpoint,
                      test_file_type_differs_self_heal_folder,
                      test_file_type_differs_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # wait for io to complete
        self.assertTrue(
            wait_for_io_to_complete(all_mounts_procs, self.mounts),
            "Io failed to complete on some of the clients")

        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks offline
        self.assertEqual(sorted(result_before_offline),
                         sorted(result_after_offline),
                         'Checksums before and after bringing bricks'
                         ' offline are not equal')
        g.log.info('Checksums before and after '
                   'bringing bricks offline are equal')

        # Modify the data
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)
        command = ("cd %s/%s/ ; "
                   "for i in `seq 1 10` ; "
                   "do for j in `seq 1 5` ; "
                   "do for k in `seq 1 10` ; "
                   "do rm -f l1_dir.$i/l2_dir.$j/test.$k ; "
                   "mkdir l1_dir.$i/l2_dir.$j/test.$k ; "
                   "done ; "
                   "done ; "
                   "done ;"
                   % (self.mounts[0].mountpoint,
                      test_file_type_differs_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertEqual(sorted(result_before_online),
                         sorted(result_after_online),
                         'Checksums before and after bringing bricks'
                         ' online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')
    def test_handling_data_split_brain(self):
        """
        - create IO
        - calculate arequal from mountpoint
        - set volume option 'self-heal-daemon' to value "off"
        - kill data brick1
        - calculate arequal checksum and compare it
        - modify files and directories
        - bring back all bricks processes online
        - kill data brick3
        - modify files and directories
        - calculate arequal from mountpoint
        - bring back all bricks processes online
        - run the find command to trigger heal from mountpoint
        - set volume option 'self-heal-daemon' to value "on"
        - check if heal is completed
        - check for split-brain
        - read files
        - calculate arequal checksum and compare it
        """
        # pylint: disable=too-many-locals,too-many-statements

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("cd %s ; "
                       "for i in `seq 1 10` ; "
                       "do mkdir dir.$i ; "
                       "for j in `seq 1 5` ; "
                       "do dd if=/dev/urandom of=dir.$i/file.$j "
                       "bs=1K count=1 ; "
                       "done ; "
                       "dd if=/dev/urandom of=file.$i bs=1k count=1 ; "
                       "done"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Setting options
        options = {"self-heal-daemon": "off"}
        g.log.info('Setting options %s for volume %s',
                   options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume: %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick list: %s", bricks_list)

        # Bring brick 1 offline
        bricks_to_bring_offline = [bricks_list[0]]
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Comparing arequals before getting bricks offline
        # and after getting bricks offline
        self.assertEqual(result_before_offline, result_after_offline,
                         'Arequals before getting bricks offline '
                         'and after getting bricks offline are not equal')
        g.log.info('Arequals before getting bricks offline '
                   'and after getting bricks offline are equal')

        # Modify the data
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Modify files
            g.log.info('Modifying files...')
            command = ("cd %s ; "
                       "for i in `seq 1 10` ; "
                       "do for j in `seq 1 5` ; "
                       "do dd if=/dev/urandom of=dir.$i/file.$j "
                       "bs=1M count=1 ; "
                       "done ; "
                       "dd if=/dev/urandom of=file.$i bs=1M count=1 ; "
                       "done"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Bring 1st brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Bring 3rd brick offline
        bricks_to_bring_offline = [bricks_list[-1]]
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Modifying files...')
            command = ("cd %s ; "
                       "for i in `seq 1 10` ; "
                       "do for j in `seq 1 5` ; "
                       "do dd if=/dev/urandom of=dir.$i/file.$j "
                       "bs=1M count=1 ; "
                       "done ; "
                       "dd if=/dev/urandom of=file.$i bs=1M count=1 ; "
                       "done"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring the third brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Remount the volume (unmount and mount back)
        ret = self.unmount_volume(self.mounts)
        self.assertTrue(ret, 'Failed to unmount %s' % self.volname)

        ret = self.mount_volume(self.mounts)
        self.assertTrue(ret, 'Unable to mount %s' % self.volname)

        # Start heal from mount point
        g.log.info('Starting heal from mount point...')
        for mount_obj in self.mounts:
            g.log.info("Start heal for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            command = "/usr/bin/env python %s read %s" % (
                self.script_upload_path,
                mount_obj.mountpoint)
            ret, _, err = g.run(mount_obj.client_system, command)
            self.assertFalse(ret, err)
            g.log.info("Heal triggered for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
        g.log.info('Heal triggered for all mountpoints')

        # Enable self-heal daemon
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, 'Failed to enable self-heal daemon')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Reading files
        g.log.info('Reading files...')
        for mount_obj in self.mounts:
            g.log.info("Start reading files for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            command = ('cd %s/ ; '
                       'for i in `seq 1 10` ; '
                       'do cat file.$i > /dev/null ; '
                       'for j in `seq 1 5` ; '
                       'do cat dir.$i/file.$j > /dev/null ; '
                       'done ; done'
                       % mount_obj.mountpoint)
            ret, _, err = g.run(mount_obj.client_system, command)
            self.assertFalse(ret, err)
            g.log.info("Reading files successfully for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
        g.log.info('Reading files successfully for all mountpoints')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Comparing arequals before getting bricks online
        # and after getting bricks online
        self.assertEqual(result_before_online, result_after_online,
                         'Arequals before getting bricks online '
                         'and after getting bricks online are not equal')
        g.log.info('Arequals before getting bricks online '
                   'and after getting bricks online are equal')
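
    # The collect-and-compare arequal pattern above recurs throughout these
    # examples. A minimal helper sketch (not part of the original tests),
    # assuming the same `collect_mounts_arequal` semantics used above:
    def _collect_arequal_checked(self, when):
        """Collect arequal from all mounts, asserting the collection worked."""
        ret, result = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal {}'.format(when))
        g.log.info('Getting arequal %s is successful', when)
        return result
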
    def test_gfid_split_brain_resolution(self):
        """
        Description: Simulates gfid split brain on multiple files in a dir and
        resolve them via `bigger-file`, `mtime` and `source-brick` methods

        Steps:
        - Create and mount a replicated volume, create a dir and ~10 data files
        - Simulate gfid splits in 9 of the files
        - Resolve each set of 3 files using the `bigger-file`, `latest-mtime`
          and `source-brick` split-brain resolution methods
        - Trigger and monitor for heal completion
        - Validate all the files are healed and arequal matches for bricks in
          subvols
        """
        io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
        client, m_point = (self.mounts[0].client_system,
                           self.mounts[0].mountpoint)
        arbiter = self.volume_type.find('arbiter') >= 0

        # Disable self-heal daemon and set `quorum-type` option to `none`
        ret = set_volume_options(self.mnode, self.volname, {
            'self-heal-daemon': 'off',
            'cluster.quorum-type': 'none'
        })
        self.assertTrue(
            ret, 'Not able to disable `quorum-type` and '
            '`self-heal` daemon volume options')

        # Create required dir and files from the mount
        split_dir = 'gfid_split_dir'
        file_io = ('cd %s; for i in {1..10}; do ' + io_cmd +
                   ' 1M > %s/file$i; done;')
        ret = mkdir(client, '{}/{}'.format(m_point, split_dir))
        self.assertTrue(ret, 'Unable to create a directory from mount point')
        ret, _, _ = g.run(client, file_io % (m_point, split_dir))
        self.assertEqual(ret, 0, 'Unable to create files from mount point')

        # `file{4,5,6}` are re-created every time to be used in `bigger-file`
        # resolution method
        cmd = 'rm -rf {0}/file{1} && {2} {3}M > {0}/file{1}'
        split_cmds = {
            1:
            ';'.join(cmd.format(split_dir, i, io_cmd, 2) for i in range(1, 7)),
            2:
            ';'.join(cmd.format(split_dir, i, io_cmd, 3) for i in range(4, 7)),
            3: ';'.join(
                cmd.format(split_dir, i, io_cmd, 1) for i in range(4, 10)),
            4: ';'.join(
                cmd.format(split_dir, i, io_cmd, 1) for i in range(7, 10)),
        }
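
        # For clarity, split_cmds[index] is run while the index-th set of
        # bricks is down: [1] recreates file1..file6 at 2M, [2] recreates
        # file4..file6 at 3M, [3] recreates file4..file9 at 1M and [4]
        # recreates file7..file9 at 1M. E.g. cmd.format(split_dir, 4,
        # io_cmd, 2) expands (a sketch, not captured output) to:
        #   rm -rf gfid_split_dir/file4 && cat /dev/urandom | \
        #       tr -dc [:space:][:print:] | head -c 2M > gfid_split_dir/file4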

        # Get subvols and simulate entry split brain
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        self.assertTrue(subvols, 'Not able to get list of subvols')
        msg = ('Unable to bring files under {} dir to gfid split brain while '
               '{} are down')
        for index, bricks in enumerate(self._get_two_bricks(subvols, arbiter),
                                       1):
            # Bring down two bricks from each subvol
            ret = bring_bricks_offline(self.volname, list(bricks))
            self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks))

            ret, _, _ = g.run(client,
                              'cd {}; {}'.format(m_point, split_cmds[index]))
            self.assertEqual(ret, 0, msg.format(split_dir, bricks))

            # Bricks are brought down only twice for arbiter volumes, so bring
            # the remaining files into split brain for `latest-mtime` heal here
            if arbiter and index == 2:
                ret, _, _ = g.run(client,
                                  'cd {}; {}'.format(m_point, split_cmds[4]))
                self.assertEqual(ret, 0, msg.format(split_dir, bricks))

            # Bring offline bricks online
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                bricks,
                bring_bricks_online_methods='volume_start_force')
            self.assertTrue(ret, 'Unable to bring {} online'.format(bricks))

        # Enable self-heal daemon, trigger heal and assert volume is in split
        # brain condition
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, 'Failed to enable self heal daemon')

        ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, 'Not all self heal daemons are online')

        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger index heal on the volume')

        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, 'Volume should be in split brain condition')

        # Select source brick and take note of files in source brick
        stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
        source_bricks = [choice(subvol[0:stop]) for subvol in subvols]
        files = [
            self._get_files_in_brick(path, split_dir) for path in source_bricks
        ]

        # Resolve `file1, file2, file3` gfid split files using `source-brick`
        cmd = ('gluster volume heal ' + self.volname + ' split-brain '
               'source-brick {} /' + split_dir + '/{}')
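        # E.g., with a hypothetical source brick, the formatted command reads:
        #   gluster volume heal <volname> split-brain \
        #       source-brick server1:/bricks/brick0 /gfid_split_dir/file1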
        for index, source_brick in enumerate(source_bricks):
            for each_file in files[index]:
                run_cmd = cmd.format(source_brick, each_file)
                self._run_cmd_and_assert(run_cmd)

        # Resolve `file4, file5, file6` gfid split files using `bigger-file`
        cmd = ('gluster volume heal ' + self.volname +
               ' split-brain bigger-file /' + split_dir + '/{}')
        for each_file in ('file4', 'file5', 'file6'):
            run_cmd = cmd.format(each_file)
            self._run_cmd_and_assert(run_cmd)

        # Resolve `file7, file8, file9` gfid split files using `latest-mtime`
        cmd = ('gluster volume heal ' + self.volname +
               ' split-brain latest-mtime /' + split_dir + '/{}')
        for each_file in ('file7', 'file8', 'file9'):
            run_cmd = cmd.format(each_file)
            self._run_cmd_and_assert(run_cmd)

        # Unless `shd` is triggered manually/automatically files will still
        # appear in `heal info`
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger full self heal')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, 'All files in volume should be healed after healing files via'
            ' `source-brick`, `bigger-file`, `latest-mtime` methods manually')

        # Validate normal file `file10` and healed files don't differ in
        # subvols via an `arequal`
        for subvol in subvols:
            # Disregard last brick if volume is of arbiter type
            ret, arequal = collect_bricks_arequal(subvol[0:stop])
            self.assertTrue(
                ret, 'Unable to get `arequal` checksum on '
                '{}'.format(subvol[0:stop]))
            self.assertEqual(
                len(set(arequal)), 1, 'Mismatch of `arequal` '
                'checksum among {} is identified'.format(subvol[0:stop]))

        g.log.info('Pass: Resolution of gfid split-brain via `source-brick`, '
                   '`bigger-file` and `latest-mtime` methods is complete')
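
    # `_run_cmd_and_assert` and `_get_files_in_brick` are referenced above but
    # not part of this snippet. A minimal sketch of what they might look like,
    # assuming glusto's `g.run` semantics and the `host:/path` brick format
    # (the original helper may sample only a few of the listed files):
    def _run_cmd_and_assert(self, cmd):
        """Run a gluster CLI command on the cluster node and assert success."""
        ret, _, err = g.run(self.mnode, cmd)
        self.assertEqual(ret, 0, '`{}` failed with {}'.format(cmd, err))

    def _get_files_in_brick(self, brick_path, dir_path):
        """Return names of files under `dir_path` on the given brick."""
        node, path = brick_path.split(':')
        ret, out, _ = g.run(node, 'ls {}/{}'.format(path, dir_path))
        self.assertEqual(ret, 0,
                         'Unable to list files on {}'.format(brick_path))
        return out.split()
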
    def test_ec_lookup_and_move_operations_few_bricks_are_offline(self):
        """
        Test Steps:
        1. Mount this volume on 3 mount points, c1, c2, and c3
        2. Bring down two bricks offline in each subvol.
        3. On client1: under dir1 create files f{1..10000} run in background
        4. On client2: under root dir of mountpoint touch x{1..1000}
        5. On client3: after step 4 action completed, start creating
           x{1001..10000}
        6. Bring the offline bricks back online (all the bricks which were
           brought down, 2 in each of the two subvols)
        7. While IO on Client1 and Client3 were happening, On client2 move all
           the x* files into dir1
        8. Perform lookup from client 3
        """
        # List two bricks in each subvol
        all_subvols_dict = get_subvols(self.mnode, self.volname)
        subvols = all_subvols_dict['volume_subvols']
        bricks_to_bring_offline = []
        for subvol in subvols:
            self.assertTrue(subvol, "List is empty")
            bricks_to_bring_offline.extend(sample(subvol, 2))

        # Bring two bricks of each subvol offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, "Bricks are still online")
        g.log.info("Bricks are offline %s", bricks_to_bring_offline)

        # Validating the bricks are offline or not
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, "Few of the bricks are still online in"
                             " {} in".format(bricks_to_bring_offline))
        g.log.info("%s bricks are offline as expected",
                   bricks_to_bring_offline)

        # Create directory on client1
        dir_on_mount = self.mounts[0].mountpoint + '/dir1'
        ret = mkdir(self.mounts[0].client_system, dir_on_mount)
        self.assertTrue(ret, "unable to create directory on client"
                             " 1 {}".format(self.mounts[0].client_system))
        g.log.info("Dir1 created on %s successfully",
                   self.mounts[0].client_system)

        # The next IO operations run in the background, so collect the procs
        # via mount_procs and run_async.
        self.mount_procs = []

        # On client1: under dir1 create files f{1..10000} run in background
        self._run_create_files(file_count=10000, base_name="f_",
                               mpoint=dir_on_mount,
                               client=self.mounts[0].client_system)

        # On client2: under root dir of the mountpoint touch x{1..1000}
        cmd = ("/usr/bin/env python {} create_files -f 1000 --fixed-file-size"
               " 10k --base-file-name x {}".format(self.script_upload_path,
                                                   self.mounts[1].mountpoint))
        ret, _, err = g.run(self.mounts[1].client_system, cmd)
        self.assertEqual(ret, 0, "File creation failed on {} with {}".
                         format(self.mounts[1].client_system, err))
        g.log.info("File creation successful on %s",
                   self.mounts[1].client_system)

        # On client3: start creating x{1001..10000}
        cmd = ("cd {}; for i in `seq 1000 10000`; do touch x$i; done; "
               "cd -".format(self.mounts[2].mountpoint))
        proc = g.run_async(self.mounts[2].client_system, cmd)
        self.mount_procs.append(proc)

        # Bring bricks online with volume start force
        ret, _, err = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, err)
        g.log.info("Volume: %s started successfully", self.volname)

        # Check whether bricks are online or not
        ret = are_bricks_online(self.mnode, self.volname,
                                bricks_to_bring_offline)
        self.assertTrue(ret, "Bricks {} are still offline".
                        format(bricks_to_bring_offline))
        g.log.info("Bricks %s are online now", bricks_to_bring_offline)

        # From client2 move all the files with name starting with x into dir1
        cmd = ("for i in `seq 0 999`; do mv {}/x$i.txt {}; "
               "done".format(self.mounts[1].mountpoint, dir_on_mount))
        proc = g.run_async(self.mounts[1].client_system, cmd)
        self.mount_procs.append(proc)

        # Perform a lookup in loop from client3 for 20 iterations
        cmd = ("ls -R {}".format(self.mounts[2].mountpoint))
        counter = 20
        while counter:
            ret, _, err = g.run(self.mounts[2].client_system, cmd)
            self.assertEqual(ret, 0, "ls while mv operation being carried"
                                     " failed with {}".format(err))
            g.log.debug("ls successful for the %s time", 21-counter)
            counter -= 1

        self.assertTrue(validate_io_procs(self.mount_procs, self.mounts),
                        "IO failed on the clients")
        # Empty mount_procs so that tearDown does not re-validate IO
        self.mount_procs = []

        # Wait for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname,)
        self.assertTrue(ret, "Heal didn't completed in the expected time")
        g.log.info("Heal completed successfully on %s volume", self.volname)
    def test_self_heal_when_io_in_progress(self):
        """Test self-heal is successful when IO is in progress.

        Description:
            - simulate brick down.
            - bring bricks online
            - wait for heal to complete
            - validate IO
        """
        # Log Volume Info and Status before simulating brick failure
        g.log.info(
            "Logging volume info and Status before bringing bricks "
            "offlien from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(
            None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                   bricks_to_bring_offline_dict['cold_tier_bricks'] +
                   bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring bricks offline
        g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s offline", bricks_to_bring_offline))
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Wait for gluster processes to be offline
        time.sleep(10)

        # Log Volume Info and Status
        g.log.info(
            "Logging volume info and Status after bringing bricks "
            "offline from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Validate if bricks are offline
        g.log.info("Validating if bricks: %s are offline",
                   bricks_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, "Not all the bricks in list:%s are offline")
        g.log.info("Successfully validated that bricks: %s are all offline")

        # Add delay before bringing bricks online
        time.sleep(40)

        # Bring bricks online
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online", bricks_to_bring_offline))
        g.log.info("Successfully brought all bricks:%s online",
                   bricks_to_bring_offline)

        # Wait for gluster processes to be online
        time.sleep(10)

        # Log Volume Info and Status
        g.log.info(
            "Logging volume info and Status after bringing bricks "
            "online from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, "Self heal didn't complete even after waiting "
            "for 20 minutes. 20 minutes is too much a time for "
            "current test workload")
        g.log.info("self-heal is successful after replace-brick operation")

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
Example n. 21
    def test_self_heal(self):
        """
        Description:
        - Create files on mount point
        - Kill one brick from volume
        - rm -rfv on mount point
        - Bring bricks online
        - Wait for heal to complete
        - List all files and directories
        """
        # pylint: disable=too-many-statements

        # IO on the mount point
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 35 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" % (
                       self.script_upload_path,
                       self.counter, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            self.counter = self.counter + 10

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # Killing one brick from the volume set
        g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring bricks: %s offline",
                              bricks_to_bring_offline))
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Validate if bricks are offline
        g.log.info("Validating if bricks: %s are offline",
                   bricks_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, "Not all the bricks in list: %s are offline" %
                        bricks_to_bring_offline)
        g.log.info("Successfully validated that bricks: %s are all offline",
                   bricks_to_bring_offline)

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Checking volume status
        g.log.info("Logging volume info and Status after bringing bricks "
                   "offline from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Removing files from the mount point when one brick is down
        g.log.info("Removing files from the mount point")
        mountpoint = self.mounts[0].mountpoint
        client = self.mounts[0].client_system
        cmd = "rm -rfv %s/*" % mountpoint
        ret, _, _ = g.run(client, cmd)
        if ret != 0:
            raise ExecutionError("failed to delete the files")

        # Bringing bricks online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bricks %s are online', bricks_to_bring_offline)

        # Check if bricks are online
        g.log.info("Checking bricks are online or not")
        ret = are_bricks_online(self.mnode, self.volname,
                                bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not online' %
                        bricks_to_bring_offline)
        g.log.info('Bricks %s are online', bricks_to_bring_offline)

        # Monitoring heals on the volume
        g.log.info("Wait for heal completion...")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Self heal didn't complete even after waiting "
                             "for 20 minutes.")
        g.log.info("self-heal is successful after changing the volume type "
                   "from replicated to arbitered volume")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
    def test_self_heal_daemon(self):
        """
        Test Data-Self-Heal(heal command)
        Description:
        - Create directory test_hardlink_self_heal
        - Create directory test_data_self_heal
        - Creating files for hardlinks and data files
        - Get arequal before getting bricks offline
        - Select bricks to bring offline
        - Bring brick offline
        - Create hardlinks and append data to data files
        - Bring brick online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal after getting bricks online
        - Select bricks to bring offline
        - Bring brick offline
        - Truncate data to data files and verify hardlinks
        - Bring brick online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal again

        """
        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        # Creating directory test_hardlink_self_heal
        ret = mkdir(
            self.mounts[0].client_system,
            "{}/test_hardlink_self_heal".format(self.mounts[0].mountpoint))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info(
            "Directory 'test_hardlink_self_heal' on %s created "
            "successfully", self.mounts[0])

        # Creating directory test_data_self_heal
        ret = mkdir(self.mounts[0].client_system,
                    "{}/test_data_self_heal".format(self.mounts[0].mountpoint))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info(
            "Directory test_hardlink_self_heal on %s created "
            "successfully", self.mounts[0])

        # Creating files for hardlinks and data files
        cmd = ('cd %s/test_hardlink_self_heal;for i in `seq 1 5`;'
               'do mkdir dir.$i ; for j in `seq 1 10` ; do dd if='
               '/dev/urandom of=dir.$i/file.$j bs=1k count=$j;done; done;'
               'cd ..' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to create file on mountpoint")
        g.log.info("Successfully created files on mountpoint")

        cmd = ('cd %s/test_data_self_heal;for i in `seq 1 100`;'
               'do dd if=/dev/urandom of=file.$i bs=128K count=$i;done;'
               'cd ..' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to create file on mountpoint")
        g.log.info("Successfully created files on mountpoint")

        # Get arequal before getting bricks offline
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Arequal before bringing bricks offline: %s',
                   result_before_online)

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(
                bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Overwrite data files with larger content and create hardlinks
        cmd = ('cd %s/test_data_self_heal;for i in `seq 1 100`;'
               'do dd if=/dev/urandom of=file.$i bs=512K count=$i ; done ;'
               'cd .. ' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to modify data files.")
        g.log.info("Successfully modified data files")

        cmd = ('cd %s/test_hardlink_self_heal;for i in `seq 1 5` ;do '
               'for j in `seq 1 10`;do ln dir.$i/file.$j dir.$i/link_file.$j;'
               'done ; done ; cd .. ' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Hardlinks creation failed")
        g.log.info("Successfully created hardlinks of files")

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks {} online'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret,
            ("Volume {} : All process are not online".format(self.volname)))
        g.log.info("Volume %s : All process are online", self.volname)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Arequal after getting bricks online '
                   'is %s', result_after_online)

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(
                bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Truncate data to data files and verify hardlinks
        cmd = ('cd %s/test_data_self_heal ; for i in `seq 1 100` ;'
               'do truncate -s $(( $i * 128)) file.$i ; done ; cd ..' %
               self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to truncate files")
        g.log.info("Successfully truncated files on mountpoint")

        file_path = ('%s/test_hardlink_self_heal/dir.{1..5}/file.{1..10}' %
                     (self.mounts[0].mountpoint))
        link_path = ('%s/test_hardlink_self_heal/dir.{1..5}/link_file.{1..10}'
                     % (self.mounts[0].mountpoint))
        file_stat = get_file_stat(self.mounts[0].client_system, file_path)
        link_stat = get_file_stat(self.mounts[0].client_system, link_path)
        self.assertEqual(file_stat, link_stat, "Verification of hardlinks "
                         "failed")
        g.log.info("Successfully verified hardlinks")

        # Bring brick online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks {} online'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret,
            ("Volume {} : All process are not online".format(self.volname)))
        g.log.info("Volume %s : All process are online", self.volname)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')
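
    # A compact way to double-check hardlinks after heal is to compare link
    # counts rather than full stat output -- a sketch, assuming GNU `stat`
    # run on a brick node or from the mount:
    def _assert_link_count(self, node, fqpath, expected=2):
        """Assert `fqpath` has the expected number of hard links."""
        ret, out, _ = g.run(node, "stat -c %h {}".format(fqpath))
        self.assertEqual(ret, 0, 'stat failed on {}'.format(fqpath))
        self.assertEqual(int(out.strip()), expected,
                         'Unexpected link count for {}'.format(fqpath))
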
Example n. 23
    def test_rebalance_with_brick_down(self):
        """
        Rebalance with brick down in replica
        - Create a Replica volume.
        - Bring down one of the bricks in the replica pair
        - Do some IO and create files on the mount point
        - Add a pair of bricks to the volume
        - Initiate rebalance
        - Bring back the brick which was down.
        - After self heal happens, all the files should be present.
        """
        # Log the volume info and status before brick is down.
        log_volume_info_and_status(self.mnode, self.volname)

        # Bring one of the bricks offline
        brick_list = get_all_bricks(self.mnode, self.volname)
        ret = bring_bricks_offline(self.volname, choice(brick_list))
        self.assertTrue(ret, "Failed to bring a brick offline")

        # Log the volume info and status after brick is down.
        log_volume_info_and_status(self.mnode, self.volname)

        # Create files at mountpoint.
        cmd = (
            "/usr/bin/env python %s create_files "
            "-f 2000 --fixed-file-size 1k --base-file-name file %s"
            % (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(
            self.mounts[0].client_system, cmd, user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)

        # Wait for IO to complete.
        self.assertTrue(wait_for_io_to_complete(self.all_mounts_procs,
                                                self.mounts[0]),
                        "IO failed on some of the clients")
        g.log.info("IO completed on the clients")

        # Compute the arequal checksum before bringing all bricks online
        ret, arequal_before_all_bricks_online = collect_mounts_arequal(
            self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')

        # Log the volume info and status before expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Expand the volume.
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume %s", self.volname))
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Log the volume info after expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Start Rebalance.
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Wait for rebalance to complete
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Log the volume info and status before bringing all bricks online
        log_volume_info_and_status(self.mnode, self.volname)

        # Bring all bricks online.
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Not able to start volume with force option")
        g.log.info("Volume start with force option successful.")

        # Log the volume info and status after bringing all bricks online
        log_volume_info_and_status(self.mnode, self.volname)

        # Monitor heal completion.
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "heal has not yet completed")
        g.log.info("Self heal completed")

        # Compute the arequal checksum after all bricks online.
        ret, arequal_after_all_bricks_online = collect_mounts_arequal(
            self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')

        # Comparing arequal checksum before and after the operations.
        self.assertEqual(arequal_before_all_bricks_online,
                         arequal_after_all_bricks_online,
                         "arequal checksum is NOT MATCHING")
        g.log.info("arequal checksum is SAME")
    def test_heal_info_should_have_fixed_fields(self):
        """
        - Create IO
        - While IO is in progress, bring down a couple of bricks
        - Wait for IO to complete
        - Bring up the down bricks
        - Wait for heal to complete
        - Check for fields 'Brick', 'Status', 'Number of entries' in heal info
        """
        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                       "-d 2 -l 2 -f 50 %s" %
                       (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get heal info
        g.log.info('Getting heal info...')
        heal_info_dicts = get_heal_info_summary(self.mnode, self.volname)
        self.assertIsNotNone(heal_info_dicts, 'Failed to get heal info')
        g.log.info(heal_info_dicts)

        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')

        # Check all fields in heal info dict
        g.log.info('Checking for all the fields in heal info...')
        for brick in bricks_list:
            g.log.info('Checking fields for %s', brick)
            self.assertEqual(heal_info_dicts[brick]['status'], 'Connected',
                             'Status is not Connected for brick %s' % brick)
            self.assertEqual(heal_info_dicts[brick]['numberOfEntries'], '0',
                             'numberOfEntries is not 0 for brick %s' % brick)

        g.log.info('Successfully checked for all the fields in heal info')
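
        # For reference, `get_heal_info_summary` returns (roughly) a dict
        # keyed by brick -- a sketch of the shape, not captured output:
        #   {'server1:/bricks/brick0': {'status': 'Connected',
        #                               'numberOfEntries': '0'}, ...}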
Example n. 25
    def test_metadata_self_heal(self):
        """
        Test MetaData Self-Heal (heal command)

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        - create IO
        - set the volume option
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - Change the permissions, ownership and the group
        of the files under "test_meta_data_self_heal" folder
        - get arequal before getting bricks online
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check if heal is completed
        - check for split-brain
        - get arequal after getting bricks online and compare with
        arequal before getting bricks online
        - check group and user are 'qa'
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Setting options
        g.log.info('Setting options...')
        options = {"metadata-self-heal": "off",
                   "entry-self-heal": "off",
                   "data-self-heal": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal', "
                   "are set to 'off' successfully")

        # Creating files on client side
        all_mounts_procs = []
        test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
        g.log.info("Generating data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)

        # Create files
        g.log.info('Creating files...')
        command = ("cd %s/ ; "
                   "mkdir %s ;"
                   "cd %s/ ;"
                   "for i in `seq 1 50` ; "
                   "do dd if=/dev/urandom of=test.$i bs=10k count=1 ; "
                   "done ;"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder,
                      test_meta_data_self_heal_folder))

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # wait for io to complete
        self.assertTrue(
            wait_for_io_to_complete(all_mounts_procs, self.mounts),
            "Io failed to complete on some of the clients")

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Changing the permissions, ownership and the group
        # of the files under "test_meta_data_self_heal" folder
        g.log.info("Modifying data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)

        # Change permissions to 444
        g.log.info('Changing permissions...')
        command = ("cd %s/%s/ ; "
                   "chmod -R 444 *"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder))
        ret, out, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Permissions are changed successfully')

        # Change the ownership to qa
        g.log.info('Changing the ownership...')
        command = ("cd %s/%s/ ; "
                   "chown -R qa *"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder))
        ret, out, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Ownership is changed successfully')

        # Change the group to qa
        g.log.info('Changing the group...')
        command = ("cd %s/%s/ ; "
                   "chgrp -R qa *"
                   % (self.mounts[0].mountpoint,
                      test_meta_data_self_heal_folder))
        ret, out, err = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Group is changed successfully')

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume process %s not online "
                              "despite waiting for 5 minutes", self.volname))
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertItemsEqual(result_before_online, result_after_online,
                              'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')

        # Adding servers and client in single dict to check permissions
        nodes_to_check = {}
        all_bricks = get_all_bricks(self.mnode, self.volname)
        for brick in all_bricks:
            node, brick_path = brick.split(':')
            nodes_to_check[node] = brick_path
        nodes_to_check[self.mounts[0].client_system] = \
            self.mounts[0].mountpoint

        # Checking for user and group
        for node in nodes_to_check:
            # Get file list
            command = ("cd %s/%s/ ; "
                       "ls"
                       % (nodes_to_check[node],
                          test_meta_data_self_heal_folder))
            ret, out, err = g.run(node, command)
            self.assertEqual(ret, 0, err)
            file_list = out.split()

            for file_name in file_list:
                file_to_check = '%s/%s/%s' % (nodes_to_check[node],
                                              test_meta_data_self_heal_folder,
                                              file_name)

                g.log.info('Checking for permissions, user and group for %s',
                           file_name)

                # Check for permissions
                cmd = ("stat -c '%a %n' {} | awk '{{print $1}}'"
                       .format(file_to_check))
                ret, permissions, _ = g.run(node, cmd)
                self.assertEqual(permissions.split('\n')[0], '444',
                                 'Permissions %s are not equal to 444'
                                 % permissions)
                g.log.info("Permissions are '444' for %s", file_name)

                # Check for user
                cmd = ("ls -ld {} | awk '{{print $3}}'"
                       .format(file_to_check))
                ret, username, _ = g.run(node, cmd)
                self.assertEqual(username.split('\n')[0],
                                 'qa', 'User %s is not equal to qa'
                                 % username)
                g.log.info("User is 'qa' for %s", file_name)

                # Check for group
                cmd = ("ls -ld {} | awk '{{print $4}}'"
                       .format(file_to_check))
                ret, groupname, _ = g.run(node, cmd)
                self.assertEqual(groupname.split('\n')[0],
                                 'qa', 'Group %s is not equal to qa'
                                 % groupname)
                g.log.info("Group is 'qa' for %s", file_name)
Example n. 26
    def test_server_side_healing_happens_only_when_glustershd_running(self):
        """
        Test script which verifies that server-side healing happens only if
        the heal daemon is running on the node where the source brick
        resides.

         * Create and start the Replicate volume
         * Check the glustershd processes - Only 1 glustershd should be listed
         * Bring down the bricks without affecting the cluster
         * Create files on volume
         * Kill the glustershd on the nodes where the bricks are running
         * Bring up the bricks which were brought down in the previous steps
         * Check the heal info - heal info must show pending heals; heal
           shouldn't happen since glustershd is down on the source node
         * Issue heal
         * Trigger client side heal
         * Heal should complete successfully
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-lines
        # Setting Volume options
        options = {
            "metadata-self-heal": "on",
            "entry-self-heal": "on",
            "data-self-heal": "on"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Check the self-heal daemon process
        ret, pids = get_self_heal_daemon_pid(self.servers)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in verifying self heal daemon process"
            " on all nodes %s", self.servers)

        # Select the bricks to bring offline
        bricks_to_bring_offline = (select_volume_bricks_to_bring_offline(
            self.mnode, self.volname))
        g.log.info("Brick List to bring offline : %s", bricks_to_bring_offline)

        # Bring down the selected bricks
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Brought down the brick process "
                   "for %s", bricks_to_bring_offline)

        # Write files on all mounts
        all_mounts_procs, num_files_to_write = [], 100
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_files "
                   "-f %s --base-file-name file %s" %
                   (self.script_upload_path, num_files_to_write,
                    mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # Get online bricks list
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        g.log.info("Online Bricks for volume %s : %s", self.volname,
                   online_bricks)

        # Get the nodes where bricks are running
        bring_offline_glustershd_nodes = []
        for brick in online_bricks:
            bring_offline_glustershd_nodes.append(brick.split(":")[0])
        g.log.info("self heal deamon on nodes %s to be killed",
                   bring_offline_glustershd_nodes)
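
        # Healing is driven from the node hosting the source (online)
        # brick, so glustershd is killed there to verify that no
        # server-side heal takes place while it is down.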

        # Kill the self heal daemon process on nodes
        ret = bring_self_heal_daemon_process_offline(
            bring_offline_glustershd_nodes)
        self.assertTrue(
            ret, ("Unable to bring self heal daemon process"
                  " offline for nodes %s" % bring_offline_glustershd_nodes))
        g.log.info(
            "Successfully brought down the self heal daemon for "
            "nodes %s", bring_offline_glustershd_nodes)

        # Check the heal info
        heal_info = get_heal_info_summary(self.mnode, self.volname)
        g.log.info("Successfully got heal info %s for the volume %s",
                   heal_info, self.volname)

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))

        # Issue heal
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertFalse(ret,
                         ("Able to trigger heal on volume %s where "
                          "self heal daemon is not running" % self.volname))
        g.log.info(
            "Expected : Unable to trigger heal on volume %s where "
            "self heal daemon is not running", self.volname)

        # Monitor heal for 130 sec; heal must stay pending as shd is down
        ret = monitor_heal_completion(self.mnode, self.volname, 130)
        self.assertFalse(ret, ("Heal Completed on volume %s" % self.volname))
        g.log.info("Expected : Heal pending on volume %s", self.volname)

        # Check the heal info
        heal_info_after_triggering_heal = get_heal_info_summary(
            self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)

        # Compare the pending heal entries with the number of files written
        for brick in online_bricks:
            self.assertGreaterEqual(
                int(heal_info_after_triggering_heal[brick]['numberOfEntries']),
                num_files_to_write,
                ("Some of the files were healed from source brick %s where "
                 "self heal daemon is not running" % brick))
        g.log.info("EXPECTED: No files are healed from source bricks where "
                   "self heal daemon is not running")

        # Unmount and Mount volume again as volume options were set
        # after mounting the volume
        for mount_obj in self.mounts:
            ret, _, _ = umount_volume(mount_obj.client_system,
                                      mount_obj.mountpoint)
            self.assertEqual(ret, 0,
                             "Failed to unmount %s" % mount_obj.client_system)
            ret, _, _ = mount_volume(self.volname,
                                     mtype='glusterfs',
                                     mpoint=mount_obj.mountpoint,
                                     mserver=self.mnode,
                                     mclient=mount_obj.client_system)
            self.assertEqual(ret, 0,
                             "Failed to mount %s" % mount_obj.client_system)

        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s read %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "Reads failed on some of the clients")
        g.log.info("Reads successful on all mounts")

        # Wait for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Unable to heal the pending entries")
        g.log.info("Successfully healed the pending entries for volume %s",
                   self.volname)
    def test_afr_reset_brick_heal_full(self):
        """
         1. Create files/dirs from mount point
         2. With IO in progress execute reset-brick start
         3. Now format the disk from back-end, using rm -rf <brick path>
         4. Execute reset-brick commit and check that the brick is online.
         5. Issue volume heal using "gluster vol heal <volname> full"
         6. Check arequal for all bricks to verify that all backend bricks,
            including the reset brick, have the same data
        """
        self.all_mounts_procs = []
        for count, mount_obj in enumerate(self.mounts):
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d --dir-depth 3 --dir-length 5 "
                   "--max-num-of-dirs 5 --num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "Unable to fetch bricks of volume")
        brick_to_reset = choice(all_bricks)

        # Start reset brick
        ret, _, err = reset_brick(self.mnode,
                                  self.volname,
                                  src_brick=brick_to_reset,
                                  option="start")
        self.assertEqual(ret, 0, err)
        g.log.info("Reset brick: %s started", brick_to_reset)

        # Validate the brick is offline
        ret = are_bricks_offline(self.mnode, self.volname, [brick_to_reset])
        self.assertTrue(ret, "Brick:{} is still online".format(brick_to_reset))

        # rm -rf of the brick directory
        node, brick_path = brick_to_reset.split(":")
        ret = rmdir(node, brick_path, force=True)
        self.assertTrue(
            ret, "Unable to delete the brick {} on "
            "node {}".format(brick_path, node))

        # Reset brick commit
        ret, _, err = reset_brick(self.mnode,
                                  self.volname,
                                  src_brick=brick_to_reset,
                                  option="commit")
        self.assertEqual(ret, 0, err)
        g.log.info("Reset brick committed successfully")

        # Check the brick is online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Some volume processes are offline for the "
            "volume: {}".format(self.volname))

        # Trigger full heal
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, "Unable  to trigger the heal full command")

        # Wait for the heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Heal didn't complete in 20 mins time")

        # Validate io on the clients
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on the mounts")
        self.all_mounts_procs = []

        # Check arequal of the back-end bricks after heal completion
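        # After a successful full heal, every brick in a replica subvol
        # must report an identical arequal checksum.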
        all_subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        for subvol in all_subvols:
            ret, arequal_from_subvol = collect_bricks_arequal(subvol)
            self.assertTrue(
                ret, "Failed to collect arequal on the bricks in the "
                "subvol {}".format(subvol))
            self.assertEqual(
                len(set(arequal_from_subvol)), 1, "Arequal is not the "
                "same on all the bricks in the subvol {}".format(subvol))
    def test_data_self_heal_algorithm_diff_heal_command(self):
        """
        Test Volume Option - 'cluster.data-self-heal-algorithm' : 'diff'

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        "data-self-heal-algorithm": "diff"
        "self-heal-daemon": "off"
        - create IO
        - calculate arequal
        - bring down all brick processes from the selected set
        - modify the data
        - get arequal before bringing bricks online
        - bring bricks online
        - expand volume by adding bricks to the volume
        - do rebalance
        - set the volume option "self-heal-daemon": "on" and check for daemons
        - start healing
        - check if heal is completed
        - check for split-brain
        - calculate arequal and compare with the arequal taken before
          bringing bricks online
        """
        # pylint: disable=too-many-branches,too-many-statements
        # Setting options
        g.log.info('Setting options...')
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off",
            "data-self-heal-algorithm": "diff"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal', "
                   "'self-heal-daemon' "
                   "are set to 'off',"
                   "'data-self-heal-algorithm' "
                   "is set to 'diff' successfully")

        # Creating files on client side
        all_mounts_procs = []
        g.log.info("Generating data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        # Creating files
        command = "/usr/bin/env python %s create_files -f 100 %s" % (
            self.script_upload_path, self.mounts[0].mountpoint)

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))
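
        # The hot/cold tier keys are legacy tiering fields and are empty
        # for non-tiered volumes; filter(None, ...) strips any empty
        # entries from the combined list.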

        # Bring bricks offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        command = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 1M %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring bricks online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Expand volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume...")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume when IO in "
                              "progress on volume %s", self.volname))
        g.log.info("Expanding volume is successful on volume %s", self.volname)

        # Do rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Failed to start rebalance')
        g.log.info('Rebalance is started')

        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Rebalance is not completed')
        g.log.info('Rebalance is completed successfully')

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertItemsEqual(result_before_online, result_after_online,
                              'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')

    def test_data_self_heal_algorithm_full_default(self):
        """
        Test Volume Option - 'cluster.data-self-heal-algorithm' : 'full'

        Description:
        - set the volume option "data-self-heal-algorithm" to value "full"
        - create IO
        - bring down all brick processes from the selected set
        - modify the data
        - calculate arequal
        - bring bricks online
        - wait for heal to complete
        - calculate arequal and compare with the arequal taken before
          bringing bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Setting options
        g.log.info('Setting options "data-self-heal-algorithm": "full"...')
        options = {"data-self-heal-algorithm": "full"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'data-self-heal-algorithm' is set to 'full' "
                   "successfully")

        # Creating files on client side
        all_mounts_procs = []
        g.log.info("Generating data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        # Creating files
        command = "/usr/bin/env python %s create_files -f 100 %s" % (
            self.script_upload_path, self.mounts[0].mountpoint)

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring bricks offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        command = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 1M %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring bricks online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertItemsEqual(result_before_online, result_after_online,
                              'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')
    def _wait_for_heal_is_completed(self):
        """Check if heal is completed"""
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=3600)
        self.assertTrue(ret, 'Heal has not yet completed')