def test_rename_files_with_brick_down(self):
        """
        Description: Tests to check that there is no data loss when rename is
                     performed with a brick of volume down.

        Steps :
        1) Create a volume.
        2) Mount the volume using FUSE.
        3) Create 1000 files on the mount point.
        4) Create the soft-link for file{1..100}
        5) Create the hard-link for file{101..200}
        6) Check for the file count on the mount point.
        7) Begin renaming the files, in multiple iterations.
        8) Let few iterations of the rename complete successfully.
        9) Then while rename is still in progress, kill a brick part of the
           volume.
        10) Let the brick be down for some time, such that a couple
            of rename iterations are completed.
        11) Bring the brick back online.
        12) Wait for the IO to complete.
        13) Check if there is any data loss.
        14) Check if all the files are renamed properly.
        """
        # Creating 1000 files on volume root
        m_point = self.mounts[0].mountpoint
        command = 'touch ' + m_point + '/file{1..1000}_0'
        ret, _, _ = g.run(self.clients[0], command)
        self.assertEqual(ret, 0, "File creation failed on %s" % m_point)
        g.log.info("Files successfully created on the mount point")

        # Create soft links for file{1..100}; range() excludes its upper
        # bound, so 101 is needed to include file100
        for i in range(1, 101):
            ret = create_link_file(self.clients[0],
                                   '{}/file{}_0'.format(m_point, i),
                                   '{}/soft_link_file{}_0'.format(m_point, i),
                                   soft=True)
            self.assertTrue(ret, "Failed to create soft links for files")
        g.log.info("Created soft links for files successfully")

        # Create hard links for file{101..200}
        for i in range(101, 201):
            ret = create_link_file(self.clients[0],
                                   '{}/file{}_0'.format(m_point, i),
                                   '{}/hard_link_file{}_0'.format(m_point, i),
                                   soft=False)
            self.assertTrue(ret, "Failed to create hard links for files")
        g.log.info("Created hard links for files successfully")

        # Calculate file count for the mount-point; the same command is
        # re-run after the rename so that both counts are comparable
        count_cmd = ("ls -lR %s/ | wc -l" % m_point)
        ret, count_before, _ = g.run(self.clients[0], count_cmd)
        self.assertEqual(ret, 0, "Failed to get file count")
        g.log.info("File count before rename is:%s", count_before)

        # Start renaming the files in multiple iterations: every file is
        # moved from suffix _0 through to _6, one step at a time
        g.log.info("Starting to rename the files")
        cmd = ('for i in `seq 1 1000`; do for j in `seq 0 5`;do mv -f '
               '%s/file$i\\_$j %s/file$i\\_$(expr $j + 1); done; done'
               % (m_point, m_point))
        proc = g.run_async(self.mounts[0].client_system, cmd,
                           user=self.mounts[0].user)

        # Waiting for some time for a iteration of rename to complete
        g.log.info("Waiting for few rename iterations to complete")
        sleep(120)

        # Get the information about the bricks part of the volume
        brick_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(brick_list, "Failed to get the brick list")

        # Kill a randomly chosen brick part of the volume
        ret = bring_bricks_offline(self.volname, choice(brick_list))
        self.assertTrue(ret, "Failed to bring brick offline")
        g.log.info("Successfully brought brick offline")

        # Let the brick be down for some time
        g.log.info("Keeping brick down for few minutes")
        sleep(60)

        # Bring the brick online using gluster v start force
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Volume start with force failed")
        g.log.info("Volume start with force successful")

        # Close connection and check if rename has completed
        ret, _, _ = proc.async_communicate()
        self.assertEqual(ret, 0, "Rename is not completed")
        g.log.info("Rename is completed")

        # Do lookup on the files
        # Calculate file count from mount
        ret, count_after, _ = g.run(self.clients[0], count_cmd)
        self.assertEqual(ret, 0, "Failed to do lookup and "
                         "get file count")
        g.log.info("Lookup successful. File count after"
                   " rename is:%s", count_after)

        # Check if there is any data loss
        self.assertEqual(int(count_before), int(count_after),
                         "The file count before and after"
                         " rename is not same. There is data loss.")
        g.log.info("The file count before and after rename is same."
                   " No data loss occurred.")

        # Checking if all files were renamed Successfully
        # NOTE(review): get_volume_type is presumably given a brick
        # directory path and returns the volume type name - confirm.
        vol_type = get_volume_type(brick_list[0] + "/")
        if vol_type in ("Replicate", "Disperse", "Arbiter",
                        "Distributed-Replicate", "Distribute-Disperse",
                        "Distribute-Arbiter"):
            cmd = ("ls -lR %s/file*_6 | wc -l" % m_point)
            ret, out, _ = g.run(self.clients[0], cmd)
            self.assertEqual(ret, 0, "Failed to list the renamed files")
            self.assertEqual(int(out), 1000, "Rename failed on some files")
            g.log.info("All the files are renamed successfully")
    def test_self_heal_differing_in_file_type(self):
        """
        Test self heal of entries whose file type differs between bricks,
        with default configuration.

        Description:
        - create IO
        - calculate arequal
        - bring down all bricks processes from selected set
        - calculate arequal and compare with arequal before
        getting bricks offline
        - modify the data (replace every regular file with a directory of
        the same name)
        - arequal before getting bricks online
        - bring bricks online
        - check daemons and healing completion
        - check for split-brain
        - calculate arequal and compare with arequal before bringing bricks
        online and after bringing bricks online
        """
        # pylint: disable=too-many-locals,too-many-statements
        client = self.mounts[0].client_system
        mountpoint = self.mounts[0].mountpoint
        user = self.mounts[0].user

        # Creating files on client side
        all_mounts_procs = []
        test_file_type_differs_self_heal_folder = \
            'test_file_type_differs_self_heal'
        g.log.info("Generating data for %s:%s", client, mountpoint)

        # Creating files: 10 level-1 dirs x 5 level-2 dirs x 10 files each
        command = ("cd %s/ ; "
                   "mkdir %s ;"
                   "cd %s/ ;"
                   "for i in `seq 1 10` ; "
                   "do mkdir l1_dir.$i ; "
                   "for j in `seq 1 5` ; "
                   "do mkdir l1_dir.$i/l2_dir.$j ; "
                   "for k in `seq 1 10` ; "
                   "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
                   "bs=1k count=$k ; "
                   "done ; "
                   "done ; "
                   "done ; " % (mountpoint,
                                test_file_type_differs_self_heal_folder,
                                test_file_type_differs_self_heal_folder))

        proc = g.run_async(client, command, user=user)
        all_mounts_procs.append(proc)

        # wait for io to complete
        self.assertTrue(wait_for_io_to_complete(all_mounts_procs, self.mounts),
                        "Io failed to complete on some of the clients")

        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Select bricks to bring offline; empty entries are filtered out
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Checking arequals before bringing bricks offline
        # and after bringing bricks offline.
        # assertEqual is used because assertItemsEqual only exists in the
        # Python 2 unittest module.
        self.assertEqual(
            result_before_offline, result_after_offline,
            'Checksums before and after '
            'bringing bricks offline are not equal')
        g.log.info('Checksums before and after '
                   'bringing bricks offline are equal')

        # Modify the data: every regular file is removed and a directory
        # with the same name is created in its place
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s", client, mountpoint)
        command = ("cd %s/%s/ ; "
                   "for i in `seq 1 10` ; "
                   "do for j in `seq 1 5` ; "
                   "do for k in `seq 1 10` ; "
                   "do rm -f l1_dir.$i/l2_dir.$j/test.$k ; "
                   "mkdir l1_dir.$i/l2_dir.$j/test.$k ; "
                   "done ; "
                   "done ; "
                   "done ;" % (mountpoint,
                               test_file_type_differs_self_heal_folder))

        proc = g.run_async(client, command, user=user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        # The message must be a formatted string; a (fmt, arg) tuple would
        # be printed verbatim on failure
        self.assertTrue(ret, "Failed to wait for volume %s processes to "
                             "be online" % self.volname)
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online
        self.assertEqual(
            result_before_online, result_after_online, 'Checksums before and '
            'after bringing bricks online are not equal')
        g.log.info('Checksums before and after bringing bricks online '
                   'are equal')
    def test_volume_create(self):
        """
        Test volume creation in a series of positive and negative scenarios.

        Description:
        - create and start a volume
        - bring two bricks down; volume start force should bring them online
        - creating a volume on previously used bricks should fail
        - set up a volume with an already existing volume name
        - creating a volume with a non existing brick path should fail
        - clean up all volumes, detach all peers and form two clusters;
        creating a volume across both clusters should fail
        - re-form a single cluster, stop glusterd on one node; creating a
        volume should fail
        """
        # pylint: disable=too-many-statements
        # create and start a volume
        self.volume['name'] = "first_volume"
        self.volname = "first_volume"
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertTrue(ret, "Failed to create and start volume")

        # bring a brick down and volume start force should bring it to online

        g.log.info("Get all the bricks of the volume")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")
        g.log.info("Successfully got the list of bricks of volume")

        ret = bring_bricks_offline(self.volname, bricks_list[0:2])
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Successfully brought the bricks down")

        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Failed to start the volume")
        g.log.info("Volume start with force is success")

        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to bring the bricks online")
        g.log.info("Volume start with force successfully brought all the "
                   "bricks online")

        # create volume with previously used bricks and different volume name
        self.volname = "second_volume"
        ret, _, _ = volume_create(self.mnode, self.volname, bricks_list)
        self.assertNotEqual(
            ret, 0, "Expected: It should fail to create a "
            "volume with previously used bricks. Actual: "
            "Successfully created the volume with previously"
            " used bricks")
        g.log.info("Failed to create the volume with previously used bricks")

        # create a volume with already existing volume name
        # NOTE(review): the assertion expects setup_volume to return True
        # here, while the message and the log line talk about a failure -
        # presumably setup_volume reports success when the volume already
        # exists; confirm against setup_volume before changing either.
        self.volume['name'] = "first_volume"
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertTrue(
            ret, "Expected: It should fail to create a volume"
            " with already existing volume name. Actual: "
            "Successfully created the volume with "
            "already existing volname")
        g.log.info("Failed to create the volume with already existing volname")

        # creating a volume with non existing brick path should fail

        self.volname = "second_volume"
        bricks_list = form_bricks_list(self.mnode, self.volname,
                                       len(self.servers), self.servers,
                                       self.all_servers_info)
        # Guard against a missing/empty list before indexing into it
        self.assertTrue(bricks_list, "Failed to form the bricks list")

        # Replace the path of one randomly chosen brick, keeping its host
        nonexisting_brick_index = random.randint(0, len(bricks_list) - 1)
        non_existing_brick = bricks_list[nonexisting_brick_index].split(":")[0]
        non_existing_path = ":/brick/non_existing_path"
        non_existing_brick = non_existing_brick + non_existing_path
        bricks_list[nonexisting_brick_index] = non_existing_brick

        ret, _, _ = volume_create(self.mnode, self.volname, bricks_list)
        self.assertNotEqual(
            ret, 0, "Expected: Creating a volume with non "
            "existing brick path should fail. Actual: "
            "Successfully created the volume with "
            "non existing brick path")
        g.log.info("Failed to create the volume with non existing brick path")

        # cleanup the volume and peer detach all servers. form two clusters,try
        # to create a volume with bricks whose nodes are in different clusters

        # cleanup volumes
        vol_list = get_volume_list(self.mnode)
        self.assertIsNotNone(vol_list, "Failed to get the volume list")

        for volume in vol_list:
            ret = cleanup_volume(self.mnode, volume)
            self.assertTrue(ret, "Unable to delete volume %s" % volume)

        # peer detach all servers
        ret = peer_detach_servers(self.mnode, self.servers)
        self.assertTrue(ret, "Peer detach to all servers is failed")
        g.log.info("Peer detach to all the servers is success")

        # form cluster 1
        ret, _, _ = peer_probe(self.servers[0], self.servers[1])
        self.assertEqual(
            ret, 0, "Peer probe from %s to %s is failed" %
            (self.servers[0], self.servers[1]))
        g.log.info("Peer probe is success from %s to %s",
                   self.servers[0], self.servers[1])

        # form cluster 2 (this needs at least four servers)
        ret, _, _ = peer_probe(self.servers[2], self.servers[3])
        self.assertEqual(
            ret, 0, "Peer probe from %s to %s is failed" %
            (self.servers[2], self.servers[3]))
        g.log.info("Peer probe is success from %s to %s",
                   self.servers[2], self.servers[3])

        # Creating a volume with bricks which are part of another
        # cluster should fail
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertFalse(
            ret, "Expected: Creating a volume with bricks"
            " which are part of another cluster should fail."
            " Actual: Successfully created the volume with "
            "bricks which are part of another cluster")
        g.log.info("Failed to create the volume with bricks which are "
                   "part of another cluster")

        # form a cluster, bring a node down. try to create a volume when one of
        # the brick node is down
        ret, _, _ = peer_detach(self.servers[2], self.servers[3])
        self.assertEqual(ret, 0, "Peer detach is failed")
        g.log.info("Peer detach is success")

        ret = peer_probe_servers(self.mnode, self.servers)
        self.assertTrue(ret, "Peer probe is failed")
        g.log.info("Peer probe to all the servers is success")

        # Pick any server except the first one and stop glusterd on it
        random_server = self.servers[random.randint(1, len(self.servers) - 1)]
        ret = stop_glusterd(random_server)
        self.assertTrue(ret, "Failed to stop glusterd on %s" % random_server)
        g.log.info("Glusterd is stopped successfully on %s", random_server)

        self.volume['name'] = "third_volume"
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertFalse(
            ret, "Expected: It should fail to create a volume "
            "when one of the node is down. Actual: Successfully "
            "created the volume with brick whose node is down")

        g.log.info("Failed to create the volume with brick whose node is down")
# Example #4
# 0
    def test_metadata_self_heal(self):
        """
        Test MetaData Self-Heal (heal command)

        Description:
        - set the volume option
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        - create IO
        - set the volume option
        "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - Change the permissions, ownership and the group
        of the files under "test_meta_data_self_heal" folder
        - get arequal before getting bricks online
        - bring bricks online
        - set the volume option
        "self-heal-daemon": "on"
        - check daemons and start healing
        - check if heal is completed
        - check for split-brain
        - get arequal after getting bricks online and compare with
        arequal before getting bricks online
        - check permissions are '444' and group and user are 'qa' on every
        brick and on the mount point
        """
        # pylint: disable=too-many-locals,too-many-statements
        client = self.mounts[0].client_system
        mountpoint = self.mounts[0].mountpoint

        # Setting options
        g.log.info('Setting options...')
        options = {"metadata-self-heal": "off",
                   "entry-self-heal": "off",
                   "data-self-heal": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal', "
                   "are set to 'off' successfully")

        # Creating files on client side
        all_mounts_procs = []
        test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
        g.log.info("Generating data for %s:%s", client, mountpoint)

        # Create files
        g.log.info('Creating files...')
        command = ("cd %s/ ; "
                   "mkdir %s ;"
                   "cd %s/ ;"
                   "for i in `seq 1 50` ; "
                   "do dd if=/dev/urandom of=test.$i bs=10k count=1 ; "
                   "done ;"
                   % (mountpoint,
                      test_meta_data_self_heal_folder,
                      test_meta_data_self_heal_folder))

        proc = g.run_async(client, command, user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # wait for io to complete
        self.assertTrue(
            wait_for_io_to_complete(all_mounts_procs, self.mounts),
            "Io failed to complete on some of the clients")

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline; empty entries are filtered out
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Changing the permissions, ownership and the group
        # of the files under "test_meta_data_self_heal" folder
        g.log.info("Modifying data for %s:%s", client, mountpoint)

        # Change permissions to 444
        g.log.info('Changing permissions...')
        command = ("cd %s/%s/ ; "
                   "chmod -R 444 *"
                   % (mountpoint, test_meta_data_self_heal_folder))
        ret, _, err = g.run(client, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Permissions are changed successfully')

        # Change the ownership to qa
        g.log.info('Changing the ownership...')
        command = ("cd %s/%s/ ; "
                   "chown -R qa *"
                   % (mountpoint, test_meta_data_self_heal_folder))
        ret, _, err = g.run(client, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Ownership is changed successfully')

        # Change the group to qa
        g.log.info('Changing the group...')
        command = ("cd %s/%s/ ; "
                   "chgrp -R qa *"
                   % (mountpoint, test_meta_data_self_heal_folder))
        ret, _, err = g.run(client, command)
        self.assertEqual(ret, 0, err)
        g.log.info('Group is changed successfully')

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Setting options
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        # The message must be a formatted string; a (fmt, arg) tuple would
        # be printed verbatim on failure
        self.assertTrue(ret, "Volume process %s not online "
                             "despite waiting for 5 minutes" % self.volname)
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online"
                              % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Checking arequals before bringing bricks online
        # and after bringing bricks online.
        # assertEqual is used because assertItemsEqual only exists in the
        # Python 2 unittest module.
        self.assertEqual(result_before_online, result_after_online,
                         'Checksums are not equal')
        g.log.info('Checksums before bringing bricks online '
                   'and after bringing bricks online are equal')

        # Collect every (node, path) pair to check: all bricks plus the
        # client mount. A list of pairs is used instead of a dict keyed by
        # node so that several bricks on one host, or a client that is also
        # a server, do not overwrite each other.
        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "Failed to get the brick list")
        paths_to_check = []
        for brick in all_bricks:
            node, brick_path = brick.split(':')
            paths_to_check.append((node, brick_path))
        paths_to_check.append((client, mountpoint))

        # Checking for permissions, user and group
        for node, base_path in paths_to_check:
            # Get file list; listing the directory directly (rather than
            # "cd <dir> ; ls") makes a missing directory fail the command
            # instead of silently listing some other directory
            command = ("ls %s/%s/"
                       % (base_path, test_meta_data_self_heal_folder))
            ret, out, err = g.run(node, command)
            self.assertEqual(ret, 0, err)
            file_list = out.split()

            for file_name in file_list:
                file_to_check = '%s/%s/%s' % (base_path,
                                              test_meta_data_self_heal_folder,
                                              file_name)

                g.log.info('Checking for permissions, user and group for %s',
                           file_name)

                # Check for permissions
                cmd = ("stat -c '%a %n' {} | awk '{{print $1}}'"
                       .format(file_to_check))
                ret, permissions, _ = g.run(node, cmd)
                self.assertEqual(permissions.split('\n')[0], '444',
                                 'Permissions %s is not equal to 444'
                                 % permissions)
                g.log.info("Permissions are '444' for %s", file_name)

                # Check for user
                cmd = ("ls -ld {} | awk '{{print $3}}'"
                       .format(file_to_check))
                ret, username, _ = g.run(node, cmd)
                self.assertEqual(username.split('\n')[0],
                                 'qa', 'User %s is not equal qa'
                                 % username)
                g.log.info("User is 'qa' for %s", file_name)

                # Check for group
                cmd = ("ls -ld {} | awk '{{print $4}}'"
                       .format(file_to_check))
                ret, groupname, _ = g.run(node, cmd)
                self.assertEqual(groupname.split('\n')[0],
                                 'qa', 'Group %s is not equal qa'
                                 % groupname)
                g.log.info("Group is 'qa' for %s", file_name)
# Example #5
# 0
    def test_heal_gfid_1x3(self):
        """
        Description: This test case verifies the gfid self-heal on a 1x3
                 replicate volume.
                 1. file created at mount point
                 2. 2 bricks brought down
                 3. file deleted
                 4. created a new file from the mount point
                 5. all bricks brought online
                 6. check if gfid worked correctly
        """
        # The stat and size checks further down look the file up under this
        # name, so the delete step has to target the very same name;
        # otherwise 'rm -f' silently succeeds without removing anything.
        file_name = 'test_file0.txt'

        g.log.info("setting the quorum type to fixed")
        options = {"cluster.quorum-type": "fixed"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, "unable to set the quorum type to fixed")
        g.log.info("Successfully set the quorum type to fixed")

        g.log.info("creating a file from mount point")
        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("python %s create_files "
                   "-f 1 --base-file-name test_file --fixed-file-size 10k %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
        # Validate I/O
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        g.log.info("Successfully created a file from mount point")

        # getting list of all bricks
        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "unable to get list of bricks")
        g.log.info("bringing down brick1 and brick2")
        ret = bring_bricks_offline(self.volname, all_bricks[:2])
        self.assertTrue(ret, "unable to bring bricks offline")
        g.log.info("Successfully brought the following bricks offline "
                   ": %s", str(all_bricks[:2]))

        g.log.info("deleting the file from mount point")
        command = "rm -f " + self.mounts[0].mountpoint + "/" + file_name
        ret, _, _ = g.run(self.mounts[0].client_system, command)
        self.assertEqual(ret, 0, "unable to remove file from mount point")
        g.log.info("Successfully deleted file from mountpoint")

        g.log.info("creating a new file of same name and different size "
                   "from mount point")
        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("python %s create_files "
                   "-f 1 --base-file-name test_file --fixed-file-size 1M %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
        # Validate I/O
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        g.log.info("Successfully created a new file of same name "
                   "from mount point")

        g.log.info("bringing bricks 1 and 2 back online")
        ret = bring_bricks_online(self.mnode, self.volname, all_bricks[:2])
        # assertTrue (not assertIsNotNone) so that a False return is caught.
        self.assertTrue(ret, "unable to bring bricks online")
        g.log.info("Successfully brought the following bricks online "
                   ": %s", str(all_bricks[:2]))

        g.log.info("checking if stat structure of the file is returned")
        ret = get_file_stat(self.mounts[0].client_system,
                            self.mounts[0].mountpoint + '/' + file_name)
        self.assertTrue(ret, "unable to get file stats")
        g.log.info("file stat structure returned successfully")

        g.log.info("checking if the heal has completed")
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, "heal not completed")
        g.log.info("Self heal was completed successfully")

        g.log.info("checking if the areequal checksum of all the bricks in "
                   "the subvol match")
        checksum_list = []
        for brick in all_bricks:
            node, brick_path = brick.split(':')
            command = "arequal-checksum -p " + brick_path + \
                      " -i .glusterfs -i .landfill"
            ret, out, _ = g.run(node, command)
            self.assertEqual(
                ret, 0, "unable to get the arequal checksum "
                "of the brick")
            checksum_list.append(out)
            # checking file size of healed file on each brick to verify
            # correctness of choice for sink and source
            stat_dict = get_file_stat(node, brick_path + '/' + file_name)
            # Fail with a clear message instead of a TypeError on indexing
            # when the stat could not be obtained.
            self.assertIsNotNone(
                stat_dict, "unable to get file stats from brick %s" % brick)
            self.assertEqual(
                stat_dict['size'], '1048576',
                "file size of healed file is different "
                "than expected")
        flag = all(val == checksum_list[0] for val in checksum_list)
        self.assertTrue(flag, "the arequal checksum of all bricks is "
                        "not same")
        g.log.info("the arequal checksum of all the bricks in the subvol "
                   "is same")
Beispiel #6
0
    def test_server_side_healing_happens_only_when_glustershd_running(self):
        """
        Test Script which verifies that the server side healing must happen
        only if the heal daemon is running on the node where source brick
        resides.

         * Create and start the Replicate volume
         * Check the glustershd processes - Only 1 glustershd should be listed
         * Bring down the bricks without affecting the cluster
         * Create files on volume
         * kill the glustershd on node where bricks is running
         * bring the bricks up which was killed in previous steps
         * check the heal info - heal info must show pending heal info, heal
           shouldn't happen since glustershd is down on source node
         * issue heal
         * trigger client side heal
         * heal should complete successfully
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-lines
        # Setting Volume options
        options = {
            "metadata-self-heal": "on",
            "entry-self-heal": "on",
            "data-self-heal": "on"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Check the self-heal daemon process
        ret, pids = get_self_heal_daemon_pid(self.servers)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in verifying self heal daemon process"
            " on all nodes %s", self.servers)

        # Select the bricks to bring offline
        bricks_to_bring_offline = (select_volume_bricks_to_bring_offline(
            self.mnode, self.volname))
        g.log.info("Brick List to bring offline : %s", bricks_to_bring_offline)

        # Bring down the selected bricks
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Brought down the brick process "
                   "for %s", bricks_to_bring_offline)

        # Write files on all mounts
        all_mounts_procs, num_files_to_write = [], 100
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_files "
                   "-f %s --base-file-name file %s" %
                   (self.script_upload_path, num_files_to_write,
                    mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # Get online bricks list
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        # The list is iterated below; fail clearly if it could not be fetched
        self.assertIsNotNone(
            online_bricks,
            "Unable to get online bricks for volume %s" % self.volname)
        g.log.info("Online Bricks for volume %s : %s", self.volname,
                   online_bricks)

        # Get the nodes where bricks are running
        # (bricks are "<node>:<path>" strings; keep the node part)
        bring_offline_glustershd_nodes = [
            brick.split(":")[0] for brick in online_bricks]
        g.log.info("self heal daemon on nodes %s to be killed",
                   bring_offline_glustershd_nodes)

        # Kill the self heal daemon process on nodes
        ret = bring_self_heal_daemon_process_offline(
            bring_offline_glustershd_nodes)
        self.assertTrue(
            ret, ("Unable to bring self heal daemon process"
                  " offline for nodes %s" % bring_offline_glustershd_nodes))
        g.log.info(
            "Successfully brought down self heal process for "
            "nodes %s", bring_offline_glustershd_nodes)

        # Check the heal info
        heal_info = get_heal_info_summary(self.mnode, self.volname)
        g.log.info("Successfully got heal info %s for the volume %s",
                   heal_info, self.volname)

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))

        # Issue heal
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertFalse(ret,
                         ("Able to trigger heal on volume %s where "
                          "self heal daemon is not running" % self.volname))
        g.log.info(
            "Expected : Unable to trigger heal on volume %s where "
            "self heal daemon is not running", self.volname)

        # Wait for 130 sec to heal
        ret = monitor_heal_completion(self.mnode, self.volname, 130)
        self.assertFalse(ret, ("Heal Completed on volume %s" % self.volname))
        g.log.info("Expected : Heal pending on volume %s", self.volname)

        # Check the heal info
        heal_info_after_triggering_heal = get_heal_info_summary(
            self.mnode, self.volname)
        # The summary is indexed per brick below; without this check a failed
        # query would surface as a TypeError rather than a test failure.
        self.assertIsNotNone(
            heal_info_after_triggering_heal,
            "Unable to get heal info for the volume %s" % self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)

        # Compare the pending heal count with the number of files written
        for node in online_bricks:
            self.assertGreaterEqual(
                int(heal_info_after_triggering_heal[node]['numberOfEntries']),
                num_files_to_write,
                ("Some of the files are healed from source bricks %s where "
                 "self heal daemon is not running" % node))
        g.log.info("EXPECTED: No files are healed from source bricks where "
                   "self heal daemon is not running")

        # Unmount and Mount volume again as volume options were set
        # after mounting the volume
        for mount_obj in self.mounts:
            ret, _, _ = umount_volume(mount_obj.client_system,
                                      mount_obj.mountpoint)
            self.assertEqual(ret, 0,
                             "Failed to unmount %s" % mount_obj.client_system)
            ret, _, _ = mount_volume(self.volname,
                                     mtype='glusterfs',
                                     mpoint=mount_obj.mountpoint,
                                     mserver=self.mnode,
                                     mclient=mount_obj.client_system)
            self.assertEqual(ret, 0,
                             "Failed to mount %s" % mount_obj.client_system)

        # Read from every mount (the client-side heal step of the docstring)
        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s read %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "Reads failed on some of the clients")
        g.log.info("Reads successful on all mounts")

        # Wait for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Unable to heal the pending entries")
        g.log.info("Successfully healed the pending entries for volume %s",
                   self.volname)
    def test_ec_version(self):
        """
        Create a directory on the mountpoint
        Create files on the mountpoint
        Bring down a brick say b1
        Create more files on the mountpoint
        Bring down another brick b2
        Bring up brick b1
        Wait for healing to complete
        Check if EC version is updated
        Check is EC size is updated
        """
        # pylint: disable=too-many-statements,too-many-branches,too-many-locals

        # Creating dir1 on the mountpoint
        ret = mkdir(self.mounts[0].client_system, "%s/dir1"
                    % self.mounts[0].mountpoint)
        self.assertTrue(ret, "Failed to create dir1")
        g.log.info("Directory dir1 on %s created successfully", self.mounts[0])

        # Creating files on client side for dir1
        g.log.info("Generating data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)

        # Create dirs with file
        command = ("cd %s/dir1; for i in {1..10};do"
                   " dd if=/dev/urandom of=file.$i "
                   "bs=1024 count=10000; done" % self.mounts[0].mountpoint)

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validating IO's and waiting to complete
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Bringing brick b1 offline
        sub_vols = get_subvols(self.mnode, self.volname)
        self.bricks_list1 = list(choice(sub_vols['volume_subvols']))
        brick_b1_down = choice(self.bricks_list1)
        ret = bring_bricks_offline(self.volname,
                                   brick_b1_down)
        self.assertTrue(ret, 'Brick %s is not offline' % brick_b1_down)
        g.log.info('Brick %s is offline successfully', brick_b1_down)

        # Drop the already validated procs before starting the next I/O run
        del self.all_mounts_procs[:]
        # Creating files on client side for dir1
        g.log.info("Generating data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)

        # Create dirs with file
        command = ("cd %s/dir1; for i in {11..20};do"
                   " dd if=/dev/urandom of=file.$i "
                   "bs=1024 count=10000; done" % self.mounts[0].mountpoint)

        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validating IO's and waiting to complete
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Changing mode owner and group of files
        dir_file_range = '2..5'
        cmd = ('chmod 777 %s/dir1/file.{%s}'
               % (self.mounts[0].mountpoint, dir_file_range))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Changing mode of files has failed")

        g.log.info("Mode of files have been changed successfully")

        cmd = ('chown root %s/dir1/file.{%s}'
               % (self.mounts[0].mountpoint, dir_file_range))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Changing owner of files has failed")
        g.log.info("Owner of files have been changed successfully")

        cmd = ('chgrp root %s/dir1/file.{%s}'
               % (self.mounts[0].mountpoint, dir_file_range))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Changing group of files has failed")
        g.log.info("Group of files have been changed successfully")

        # Create softlink and hardlink of files in mountpoint.
        cmd = ('cd %s/dir1/; '
               'for FILENAME in *; '
               'do ln -s $FILENAME softlink_$FILENAME; '
               'done;'
               % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Creating Softlinks have failed")
        g.log.info("Softlinks of files have been created successfully")

        cmd = ('cd %s/dir1/; '
               'for FILENAME in *; '
               'do ln $FILENAME hardlink_$FILENAME; '
               'done;'
               % (self.mounts[0].mountpoint))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Creating Hardlinks have failed")
        g.log.info("Hardlinks of files have been created successfully")

        # Bringing brick b2 offline
        # b2 is picked from the same subvolume, excluding the brick that is
        # already down
        bricks_list2 = deepcopy(self.bricks_list1)
        bricks_list2.remove(brick_b1_down)
        brick_b2_down = choice(bricks_list2)
        ret = bring_bricks_offline(self.volname,
                                   brick_b2_down)
        self.assertTrue(ret, 'Brick %s is not offline' % brick_b2_down)
        g.log.info('Brick %s is offline successfully', brick_b2_down)

        # Bring brick b1 online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [brick_b1_down],
                                  'glusterd_restart')
        self.assertTrue(ret, 'Brick %s is not brought '
                             'online' % brick_b1_down)
        g.log.info('Brick %s is online successfully', brick_b1_down)

        # Delete brick2 from brick list as we are not checking for heal
        # completion in brick 2 as it is offline

        self.bricks_list1.remove(brick_b2_down)

        # Check if EC version is same on all bricks which are up
        ret = self.get_xattr("ec.version")
        self.assertTrue(ret, "Healing not completed and EC version is "
                        "not updated")
        g.log.info("Healing is completed and EC version is updated")

        # Check if EC size is same on all bricks which are up
        ret = self.get_xattr("ec.size")
        self.assertTrue(ret, "Healing not completed and EC size is "
                        "not updated")
        g.log.info("Healing is completed and EC size is updated")
Beispiel #8
0
    def test_heal_when_quota_object_limit_exceeded(self):
        """
        Verify that heal completes after a quota object limit was hit while
        a brick was offline.

        - Create a directory on the mounts
        - Enable quota and set the soft and hard timeouts to 0
        - Set a quota object limit of 5 on the directory
        - Create 3 files inside the directory from every mount
        - Bring the third brick of the volume offline
        - Try to create 5 more files, which must fail
        - Bring the brick back online
        - Trigger heal and check that it completes
        """
        # Create a directory to set the quota_limit_objects
        path = "/dir"
        g.log.info("Creating a directory")
        self.all_mounts_procs = []
        for mount_object in self.mounts:
            cmd = "/usr/bin/env python %s create_deep_dir -d 0 -l 0 %s%s" % (
                self.script_upload_path, mount_object.mountpoint, path)
            # g.run returns (rc, stdout, stderr); asserting on the whole
            # tuple would always pass, so check the return code itself.
            ret, _, _ = g.run(mount_object.client_system, cmd)
            self.assertEqual(ret, 0,
                             "Failed to create directory on mountpoint")
            g.log.info("Directory created successfully on mountpoint")

        # Enable Quota
        g.log.info("Enabling quota on the volume %s", self.volname)
        ret, _, _ = quota_enable(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to enable quota on the volume "
                                  "%s" % self.volname))
        g.log.info("Successfully enabled quota on the volume %s", self.volname)

        # Set quota-soft-timeout to 0
        g.log.info("Setting up soft timeout to 0")
        ret, _, _ = quota_set_soft_timeout(self.mnode, self.volname, "0")
        self.assertEqual(ret, 0, ("Failed to set quota-soft-timeout"))
        g.log.info("Successfully set the quota-soft-timeout")

        # Set quota-hard-timeout to 0
        g.log.info("Setting up hard timeout with 0")
        ret, _, _ = quota_set_hard_timeout(self.mnode, self.volname, "0")
        self.assertEqual(ret, 0, ("Failed to set quota-hard-timeout"))
        g.log.info("successfully set the quota-hard-timeout")

        # Set Quota limit on the newly created directory
        g.log.info("Set Quota Limit on the path %s of the volume %s", path,
                   self.volname)
        ret, _, _ = quota_limit_objects(self.mnode,
                                        self.volname,
                                        path=path,
                                        limit="5")
        self.assertEqual(ret, 0, ("Failed to set quota limit on path %s of "
                                  "the volume %s" % (path, self.volname)))
        g.log.info(
            "Successfully set the quota limit on %s of the volume "
            "%s", path, self.volname)

        # Create 3 files inside the directory
        for mount_object in self.mounts:
            g.log.info("Creating Files on %s:%s", mount_object.client_system,
                       path)
            cmd = ("/usr/bin/env python %s create_files -f 3 "
                   "--base-file-name file-0 %s%s" %
                   (self.script_upload_path, mount_object.mountpoint, path))
            ret, _, _ = g.run(mount_object.client_system, cmd)
            self.assertEqual(ret, 0, "Failed to create files on %s" % path)
            g.log.info("Files created successfully on mountpoint")

        bricks_list = get_all_bricks(self.mnode, self.volname)

        # Bring brick3 offline
        g.log.info('Bringing brick %s offline', bricks_list[2])
        ret = bring_bricks_offline(self.volname, bricks_list[2])
        self.assertTrue(ret,
                        'Failed to bring bricks %s offline' % bricks_list[2])

        ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[2]])
        self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[2])
        g.log.info('Bringing brick %s offline is successful', bricks_list[2])

        # Try creating 5 more files, which should fail as the quota limit
        # exceeds
        # The original relied on the loop variable leaking out of the loop
        # above, i.e. on the last mount; name that mount explicitly.
        last_mount = self.mounts[-1]
        cmd = ("/usr/bin/env python %s create_files -f 5 --base-file-name "
               "file-1 %s%s" %
               (self.script_upload_path, last_mount.mountpoint, path))
        ret, _, _ = g.run(last_mount.client_system, cmd)
        self.assertNotEqual(ret, 0, ("Creating 5 files succeeded while it was "
                                     "not supposed to."))
        g.log.info("Creating 5 files failed as expected due to quota object "
                   "limit on the directory.")

        # Bring brick3 online and check status
        g.log.info('Bringing brick %s online', bricks_list[2])
        ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[2]])
        self.assertTrue(ret,
                        'Failed to bring brick %s online' % bricks_list[2])
        g.log.info('Bringing brick %s online is successful', bricks_list[2])

        g.log.info("Verifying if brick %s is online", bricks_list[2])
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, "Brick %s did not come up" % bricks_list[2])
        g.log.info("Brick %s has come online.", bricks_list[2])

        # Trigger heal
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Starting heal failed')
        g.log.info('Index heal launched')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')
    def test_metadata_split_brain_resolution(self):
        """
        Verify CLI based resolution of a metadata split-brain.

        - Turn off metadata, entry and data self-heal on the volume
        - Create files and directories under test_metadata_sb on the mount
        - Verify the arequals of the bricks
        - Turn off the self-heal daemon
        - Bring the first brick offline, chmod some files and directories
          to 0555 and bring the brick back online
        - Bring the second brick offline, chmod the same files and
          directories to 0777 and bring the brick back online
        - Turn the self-heal daemon back on
        - Check that the volume is in split-brain
        - Resolve it with the source-brick option, using the second brick
        - Wait for heal, do a lookup, check that the split-brain is gone
          and verify the arequals
        - chmod the same files to 0555 again, do a lookup on the mount and
          verify the arequals once more
        """
        # Setting options
        g.log.info('Setting options...')
        options = {"metadata-self-heal": "off",
                   "entry-self-heal": "off",
                   "data-self-heal": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s",
                   options, self.volname)

        # Creating files and directories on client side
        g.log.info('Creating files and directories...')
        cmd = ("mkdir %s/test_metadata_sb && cd %s/test_metadata_sb &&"
               "for i in `seq 1 3`; do mkdir dir.$i; for j in `seq 1 5`;"
               "do dd if=/dev/urandom of=dir.$i/file.$j bs=1K count=1;"
               "done; dd if=/dev/urandom of=file.$i bs=1K count=1; done"
               % (self.mounts[0].mountpoint, self.mounts[0].mountpoint))

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Creating files and directories failed")
        g.log.info("Files & directories created successfully")

        # Check arequals for all the bricks
        g.log.info('Getting arequal before getting bricks offline...')
        self.verify_brick_arequals()
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Set option self-heal-daemon to OFF
        g.log.info('Setting option self-heal-daemon to off...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        bricks_list = get_all_bricks(self.mnode, self.volname)

        # Bring brick1 offline
        g.log.info('Bringing brick %s offline', bricks_list[0])
        ret = bring_bricks_offline(self.volname, bricks_list[0])
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_list[0])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[0]])
        self.assertTrue(ret, 'Brick %s is not offline'
                        % bricks_list[0])
        g.log.info('Bringing brick %s offline is successful',
                   bricks_list[0])

        # Change metadata of some files & directories
        cmd = ("cd %s/test_metadata_sb &&"
               "for i in `seq 1 2`; do chmod -R 0555 dir.$i file.$i ; done"
               % self.mounts[0].mountpoint)

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Updating file permissions failed")
        g.log.info("File permissions updated successfully")

        # Bring brick1 online and check the status
        g.log.info('Bringing brick %s online', bricks_list[0])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[0]])
        self.assertTrue(ret, 'Failed to bring brick %s online' %
                        bricks_list[0])
        g.log.info('Bringing brick %s online is successful', bricks_list[0])

        g.log.info("Verifying if brick %s is online", bricks_list[0])
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, "Brick %s did not come up" % bricks_list[0])
        g.log.info("Brick %s has come online.", bricks_list[0])

        # Bring brick2 offline
        g.log.info('Bringing brick %s offline', bricks_list[1])
        ret = bring_bricks_offline(self.volname, bricks_list[1])
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_list[1])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[1]])
        self.assertTrue(ret, 'Brick %s is not offline'
                        % bricks_list[1])
        g.log.info('Bringing brick %s offline is successful',
                   bricks_list[1])

        # Change metadata of same files & directories as before
        # (a different mode this time, so the two bricks end up disagreeing)
        cmd = ("cd %s/test_metadata_sb &&"
               "for i in `seq 1 2` ; do chmod -R 0777 dir.$i file.$i ; done"
               % self.mounts[0].mountpoint)

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Updating file permissions failed")
        g.log.info("File permissions updated successfully")

        # Bring brick2 online and check the status
        g.log.info('Bringing brick %s online', bricks_list[1])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring brick %s online' %
                        bricks_list[1])
        g.log.info('Bringing brick %s online is successful', bricks_list[1])

        g.log.info("Verifying if brick %s is online", bricks_list[1])
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, "Brick %s did not come up" % bricks_list[1])
        g.log.info("Brick %s has come online.", bricks_list[1])

        # Set option self-heal-daemon to ON
        g.log.info('Setting option self-heal-daemon to on...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        g.log.info("Checking if files are in split-brain")
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, "Unable to create split-brain scenario")
        g.log.info("Successfully created split brain scenario")

        g.log.info("Resolving split-brain by using the source-brick option "
                   "by choosing second brick as source for all the files")
        node, _ = bricks_list[1].split(':')
        command = ("gluster v heal " + self.volname + " split-brain "
                   "source-brick " + bricks_list[1])
        ret, _, _ = g.run(node, command)
        self.assertEqual(ret, 0, "Command execution not successful")

        # waiting for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Heal not completed")

        # Do lookup on the files from mount
        cmd = ("ls -lR %s/test_metadata_sb"
               % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to lookup")
        g.log.info("Lookup successful")

        # Checking if files are still in split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, "File still in split-brain")
        g.log.info("Successfully resolved split brain situation using "
                   "CLI based resolution")

        # Check arequals for all the bricks
        g.log.info('Getting arequal for all the bricks after heal...')
        self.verify_brick_arequals()
        g.log.info('Getting arequal after heal is successful')

        # Change metadata of same files & directories as before
        cmd = ("cd %s/test_metadata_sb &&"
               "for i in `seq 1 2` ; do chmod -R 0555 dir.$i file.$i ; done"
               % self.mounts[0].mountpoint)

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Updating file permissions failed")
        g.log.info("File permissions updated successfully")

        # Do lookup on the mount
        cmd = ("find %s | xargs stat" % self.mounts[0].mountpoint)

        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Lookup on the mount failed")
        g.log.info("Lookup on the mount is successful")

        # Check arequals for all the bricks
        g.log.info('Getting arequal for all the bricks...')
        self.verify_brick_arequals()
        g.log.info('Getting arequal is successful')
    def test_resolving_meta_data(self):
        """
        Check that alternating metadata changes heal without split-brain.

        - Create a file test_file.txt
        - Find out which bricks the file resides on and kill the last of
          them (expected to be the arbiter brick of the replica set)
        - Modify the permissions of the file
        - Bring back the killed brick
        - Kill the first brick holding the file
        - Modify the permissions of the file
        - Bring back the killed brick
        - Trigger heal
        - Check if heal is completed
        - Check for split-brain
        """
        # pylint: disable=too-many-locals,too-many-statements
        file_to_create = 'test_file.txt'

        # Creating files on client side. self.all_mounts_procs is not reset
        # here; it is presumably initialised by the test fixture.
        self._meta_data_run_on_mounts(
            "Generating data for %s:%s", 'Creating file...',
            "cd %s ; touch %s", file_to_create)

        # get bricks with file
        g.log.info('Getting bricks with file...')
        subvols_dict = get_subvols(self.mnode, self.volname)
        brick_list_with_file = []
        for subvol in subvols_dict['volume_subvols']:
            for brick in subvol:
                node, brick_path = brick.split(':')
                _, brick_file_list, _ = g.run(node, 'ls %s' % brick_path)
                # Compare whole directory entries so that a longer name
                # merely containing the file name is not counted.
                if file_to_create in brick_file_list.split():
                    brick_list_with_file.append(brick)
        g.log.info('Bricks with file: %s', brick_list_with_file)
        # Fail with a clear message instead of an IndexError below.
        self.assertTrue(
            brick_list_with_file,
            'File %s was not found on any brick' % file_to_create)

        # Bring arbiter brick offline
        bricks_to_bring_offline = [brick_list_with_file[-1]]
        self._meta_data_bring_offline(bricks_to_bring_offline)

        # Modify the data
        self.all_mounts_procs = []
        self._meta_data_run_on_mounts(
            "Modifying data for %s:%s",
            'Modifying the permissions of the file...',
            "cd %s ; chmod 600 %s", file_to_create)

        # Bring arbiter brick online
        self._meta_data_bring_online(bricks_to_bring_offline)

        # Bring 1-st data brick offline
        bricks_to_bring_offline = [brick_list_with_file[0]]
        self._meta_data_bring_offline(bricks_to_bring_offline)

        # Modify the data
        self.all_mounts_procs = []
        self._meta_data_run_on_mounts(
            "Modifying data for %s:%s",
            'Modifying the permissions of the file...',
            "cd %s ; chmod 644 %s", file_to_create)

        # Bring 1-st data brick online
        self._meta_data_bring_online(bricks_to_bring_offline)

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

    def _meta_data_run_on_mounts(self, mount_msg, step_msg, cmd_template,
                                 file_name):
        """Run a shell command on every mount and validate the IO.

        Args:
            mount_msg (str): log format taking client system and mountpoint.
            step_msg (str): log line emitted before each command is started.
            cmd_template (str): command format taking mountpoint and file.
            file_name (str): file the command operates on.
        """
        for mount_obj in self.mounts:
            g.log.info(mount_msg, mount_obj.client_system,
                       mount_obj.mountpoint)
            g.log.info(step_msg)
            command = cmd_template % (mount_obj.mountpoint, file_name)
            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

    def _meta_data_bring_offline(self, bricks):
        """Bring ``bricks`` offline and assert that they are offline."""
        g.log.info('Bringing bricks %s offline...', bricks)
        ret = bring_bricks_offline(self.volname, bricks)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' % bricks)

        ret = are_bricks_offline(self.mnode, self.volname, bricks)
        self.assertTrue(ret, 'Bricks %s are not offline' % bricks)
        g.log.info('Bringing bricks %s offline is successful', bricks)

    def _meta_data_bring_online(self, bricks):
        """Bring ``bricks`` back online and assert the call succeeded."""
        g.log.info('Bringing bricks %s online...', bricks)
        ret = bring_bricks_online(self.mnode, self.volname, bricks)
        self.assertTrue(ret, 'Failed to bring bricks %s online' % bricks)
        g.log.info('Bringing bricks %s online is successful', bricks)
    def test_afr_gfid_heal(self):
        """
        Description: This test case creates 5 files that end up in
                     split-brain on a 1x2 volume and resolves them from
                     the CLI by choosing a source brick.
                     After resolving split-brain, it makes sure that
                     split-brain resolution is refused for a file that
                     is no longer in split-brain.
        """

        g.log.info("disabling the self heal daemon")
        ret = disable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, "unable to disable self heal daemon")
        g.log.info("Successfully disabled the self heal daemon")

        # getting list of all bricks
        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "failed to get list of bricks")
        g.log.info("bringing down brick1")
        ret = bring_bricks_offline(self.volname, all_bricks[0:1])
        self.assertTrue(ret, "unable to bring brick1 offline")
        g.log.info("Successfully brought the following brick offline "
                   ": %s", str(all_bricks[0]))
        g.log.info("verifying if brick1 is offline")
        ret = are_bricks_offline(self.mnode, self.volname, all_bricks[0:1])
        self.assertTrue(ret, "brick1 is still online")
        g.log.info("verified: brick1 is offline")

        # First set of files lands only on brick2 because brick1 is down
        g.log.info("creating 5 files from mount point")
        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("python %s create_files "
                   "-f 5 --base-file-name test_file --fixed-file-size 1k %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
        # Validate I/O
        g.log.info("Wait for IO to complete and validate IO.....")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")
        g.log.info("Successfully created a file from mount point")

        g.log.info("bringing brick 1 back online")
        ret = bring_bricks_online(self.mnode, self.volname, all_bricks[0:1])
        # assertTrue, as the other tests here do: a False result must fail
        self.assertTrue(ret, "unable to bring brick 1 online")
        g.log.info("Successfully brought the following brick online "
                   ": %s", str(all_bricks[0]))
        g.log.info("verifying if brick1 is online")
        ret = are_bricks_online(self.mnode, self.volname, all_bricks[0:1])
        self.assertTrue(ret, "brick1 is not online")
        g.log.info("verified: brick1 is online")

        g.log.info("bringing down brick2")
        ret = bring_bricks_offline(self.volname, all_bricks[1:2])
        self.assertTrue(ret, "unable to bring brick2 offline")
        g.log.info("Successfully brought the following brick offline "
                   ": %s", str(all_bricks[1]))
        g.log.info("verifying if brick2 is offline")
        ret = are_bricks_offline(self.mnode, self.volname, all_bricks[1:2])
        self.assertTrue(ret, "brick2 is still online")
        g.log.info("verified: brick2 is offline")

        # Same file names, different size, written while brick2 is down:
        # the two bricks now hold conflicting copies.
        g.log.info("creating 5 new files of same name from mount point")
        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("python %s create_files "
                   "-f 5 --base-file-name test_file --fixed-file-size 10k %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
        # Validate I/O
        g.log.info("Wait for IO to complete and validate IO.....")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")
        g.log.info("Successfully created a new file of same name "
                   "from mount point")

        g.log.info("bringing brick2 back online")
        ret = bring_bricks_online(self.mnode, self.volname, all_bricks[1:2])
        self.assertTrue(ret, "unable to bring brick2 online")
        g.log.info("Successfully brought the following brick online "
                   ": %s", str(all_bricks[1]))
        g.log.info("verifying if brick2 is online")
        ret = are_bricks_online(self.mnode, self.volname, all_bricks[1:2])
        self.assertTrue(ret, "brick2 is not online")
        g.log.info("verified: brick2 is online")

        g.log.info("enabling the self heal daemon")
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, "failed to enable self heal daemon")
        g.log.info("Successfully enabled the self heal daemon")

        g.log.info("checking if volume is in split-brain")
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, "unable to create split-brain scenario")
        g.log.info("Successfully created split brain scenario")

        g.log.info("resolving split-brain by choosing first brick as "
                   "the source brick")
        # The heal commands are executed on the host of the first brick
        node = all_bricks[0].split(':')[0]
        for fcount in range(5):
            command = ("gluster v heal " + self.volname + " split-brain "
                       "source-brick " + all_bricks[0] + ' /test_file' +
                       str(fcount) + '.txt')
            ret, _, _ = g.run(node, command)
            self.assertEqual(ret, 0, "command execution not successful")
        # triggering heal
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, "heal not triggered")
        g.log.info("Successfully triggered heal")
        # waiting for heal to complete
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=240)
        self.assertTrue(ret, "heal not completed")
        g.log.info("Heal completed successfully")
        # checking if any file is in split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, "file still in split-brain")
        g.log.info("Successfully resolved split brain situation using "
                   "CLI based resolution")

        # Negative check: the command must fail for an already healed file
        g.log.info("resolving split-brain on a file not in split-brain")
        command = ("gluster v heal " + self.volname + " split-brain "
                   "source-brick " + all_bricks[1] + " /test_file0.txt")
        ret, _, _ = g.run(node, command)
        self.assertNotEqual(
            ret, 0, "Unexpected: split-brain resolution "
            "command is successful on a file which"
            " is not in split-brain")
        g.log.info("Expected: split-brian resolution command failed on "
                   "a file which is not in split-brain")

        g.log.info("checking the split-brain status of each file")
        compare_string = ("The file is not under data or metadata "
                          "split-brain")
        for fcount in range(5):
            fpath = (self.mounts[0].mountpoint + '/test_file' + str(fcount) +
                     '.txt')
            status = get_fattr(self.mounts[0].client_system, fpath,
                               'replica.split-brain-status')
            # Fail with a readable message rather than an AttributeError
            # if the extended attribute could not be read.
            self.assertIsNotNone(
                status, "unable to get split-brain status of %s" % fpath)
            self.assertEqual(
                status.rstrip('\x00'), compare_string,
                "file test_file%s is under"
                " split-brain" % str(fcount))
        g.log.info("none of the files are under split-brain")
    def test_gluster_clone_heal(self):
        """
        Test gluster compilation on mount point(Heal command)
        - Creating directory test_compilation
        - Compile gluster on mountpoint
        - Select bricks to bring offline
        - Bring brick offline
        - Validate IO
        - Bring bricks online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal after getting bricks online
        - Repeat the whole compile/offline/online/heal cycle once more
        """
        # Creating directory test_compilation
        ret = mkdir(self.mounts[0].client_system,
                    "{}/test_compilation".format(self.mounts[0].mountpoint))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info(
            "Directory 'test_compilation' on %s created "
            "successfully", self.mounts[0])

        # Compile gluster on mountpoint. The clone goes over https because
        # GitHub no longer serves the unauthenticated git:// protocol.
        cmd = ("cd %s/test_compilation ; rm -rf glusterfs; git clone"
               " https://github.com/gluster/glusterfs.git ; cd glusterfs ;"
               " ./autogen.sh ;./configure CFLAGS='-g3 -O0 -DDEBUG'; make ;"
               " cd ../..;" % self.mounts[0].mountpoint)

        # First compilation cycle
        self._clone_heal_compile_with_bricks_down(cmd)

        # Compile gluster on mountpoint again
        self._clone_heal_compile_with_bricks_down(cmd)

    def _clone_heal_compile_with_bricks_down(self, cmd):
        """Run ``cmd`` on the first mount while bricks are taken offline.

        Starts ``cmd`` asynchronously, brings a selected set of bricks
        offline, waits for the command to finish, brings the bricks back,
        waits for heal to complete, checks for split-brain and logs the
        arequal of the mounts.

        Args:
            cmd (str): shell command to run on the first client.
        """
        proc = g.run_async(self.mounts[0].client_system, cmd)

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        # assertTrue rejects both None and an empty selection
        self.assertTrue(bricks_to_bring_offline, "List is empty")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(
                bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Bricks {} are not offline'.format(bricks_to_bring_offline))
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Validate IO
        self.assertTrue(validate_io_procs([proc], self.mounts[0]),
                        "IO failed on some of the clients")

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks {} online'.format(bricks_to_bring_offline))

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret,
            ("Volume {} : All process are not online".format(self.volname)))

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info("Arequal of mountpoint %s", result_after_online)
# Example #13
# 0
    def test_self_heal_algorithm_full_daemon_off(self):
        """
        Description:-
        Checking healing when algorithm is set to "full" and self heal daemon
        is "off".

        Files are written while some bricks are offline and the self heal
        daemon is disabled; the daemon is then enabled, the bricks are
        brought back and the arequal of the mounts taken before and after
        the heal must match.
        """
        # pylint: disable=too-many-statements

        # Setting volume option of self heal & algorithm
        options = {"metadata-self-heal": "disable",
                   "entry-self-heal": "disable",
                   "data-self-heal": "disable",
                   "data-self-heal-algorithm": "full",
                   "self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, "Failed to set the volume options %s" % options)
        g.log.info(" Volume set options success")

        # Select bricks to bring down
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']
        g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)

        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring bricks: %s offline"
                        % bricks_to_bring_offline)
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Validate if bricks are offline
        g.log.info("Validating if bricks: %s are offline",
                   bricks_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, "Not all the bricks in list:%s are offline"
                        % bricks_to_bring_offline)
        g.log.info("Successfully validated that bricks %s are all offline",
                   bricks_to_bring_offline)

        # IO on the mount point
        all_mounts_procs = []
        g.log.info("Creating Files on %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        cmd = ("cd %s ;for i in `seq 1 100` ;"
               "do dd if=/dev/urandom of=file$i bs=1M "
               "count=1;done"
               % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system, cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )

        # Collecting Arequal before bring the bricks up
        g.log.info("Collecting arequal before bringing the bricks online")
        ret, result_before = collect_mounts_arequal(self.mounts)
        # Without this check two failed collections would compare as equal
        self.assertTrue(ret, "Failed to get arequal before healing")

        # Turning self heal daemon ON
        optionstwo = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, optionstwo)
        self.assertTrue(ret, "Failed to turn self-heal ON")
        g.log.info("Volume set options %s: success", optionstwo)

        # Bring bricks online
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring bricks: %s online"
                        % bricks_to_bring_offline)
        g.log.info("Successfully brought all bricks:%s online",
                   bricks_to_bring_offline)

        # Waiting for bricks to come online
        g.log.info("Waiting for brick process to come online")
        ret = wait_for_bricks_to_be_online(self.mnode,
                                           self.volname,
                                           timeout=30)
        self.assertTrue(ret, "Bricks didn't come online after being "
                        "brought back up")
        g.log.info("Bricks are online")

        # Verifying all bricks online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, "Volume %s : All process are not online"
                        % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self heal processes to come online
        g.log.info("Wait for selfheal process to come online")
        ret = wait_for_self_heal_daemons_to_be_online(self.mnode,
                                                      self.volname,
                                                      timeout=300)
        self.assertTrue(ret, "Self-heal process are not online")
        g.log.info("All self heal process are online")

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Self heal didn't complete even after waiting "
                        "for 20 minutes. 20 minutes is too much a time for "
                        "current test workload")
        g.log.info("Self-heal is successful after bringing the bricks online")

        # arequal after healing
        g.log.info("Collecting arequal after healing")
        ret, result_after = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, "Failed to get arequal after healing")

        # Comparing the results
        g.log.info("comparing both the results")
        self.assertEqual(result_before, result_after, "Arequals are not equal")
# Example #14
# 0
    def test_glustershd_with_restarting_glusterd(self):
        """
        Test Script to verify the self heal daemon process with restarting
        glusterd and rebooting the server

        * stop all volumes
        * restart glusterd - should not run self heal daemon process
        * start replicated involved volumes
        * single self heal daemon process running
        * restart glusterd
        * self heal daemon pid will change
        * bring down brick and restart glusterd
        * self heal daemon pid will change and its different from previous
        * brought up the brick

        """
        # pylint: disable=too-many-statements
        nodes = self.volume['servers']

        # stop the volume
        g.log.info("Stopping the volume %s", self.volname)
        ret = volume_stop(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to stop volume %s" % self.volname))
        g.log.info("Successfully stopped volume %s", self.volname)

        # check the self heal daemon process after stopping the volume
        g.log.info("Verifying the self heal daemon process for "
                   "volume %s", self.volname)
        ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
        self.assertFalse(ret, ("Self Heal Daemon process is still running "
                               "even after stopping volume %s" % self.volname))
        g.log.info("Self Heal Daemon is not running after stopping  "
                   "volume %s", self.volname)

        # restart glusterd service on all the servers
        g.log.info("Restarting glusterd on all servers %s", nodes)
        ret = restart_glusterd(nodes)
        self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                              nodes))
        g.log.info("Successfully restarted glusterd on all nodes %s",
                   nodes)

        self.assertTrue(
            wait_for_glusterd_to_start(self.servers),
            "Failed to start glusterd on %s" % self.servers)

        # check the self heal daemon process after restarting glusterd process
        g.log.info("Starting to get self-heal daemon process on"
                   " nodes %s", nodes)
        ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
        self.assertFalse(ret, ("Self Heal Daemon process is running after "
                               "glusterd restart with volume %s in "
                               "stop state" % self.volname))
        g.log.info("Self Heal Daemon is not running after stopping  "
                   "volume and restarting glusterd %s", self.volname)

        # start the volume
        g.log.info("Starting the volume %s", self.volname)
        ret = volume_start(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to start volume %s" % self.volname))
        g.log.info("Volume %s started successfully", self.volname)

        # Verfiy glustershd process releases its parent process
        g.log.info("Checking whether glustershd process is daemonized or not")
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        g.log.info("Single self heal daemon process on all nodes %s", nodes)

        # get the self heal daemon pids after starting volume
        g.log.info("Starting to get self-heal daemon process "
                   "on nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        g.log.info("Successful in getting self heal daemon pids")
        glustershd_pids = pids

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # validate the bricks present in volume info
        # with glustershd server volume file
        g.log.info("Starting parsing file %s on "
                   "node %s", self.glustershd, self.mnode)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list)
        self.assertTrue(ret, ("Brick List from volume info is different from "
                              "glustershd server volume file. "
                              "Please check log file for details."))
        g.log.info("Successfully parsed %s file", self.glustershd)

        # restart glusterd service on all the servers
        g.log.info("Restarting glusterd on all servers %s", nodes)
        ret = restart_glusterd(nodes)
        self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                              nodes))
        g.log.info("Successfully restarted glusterd on all nodes %s",
                   nodes)

        # Verify volume's all process are online for 60 sec
        g.log.info("Verifying volume's all process are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   60)
        self.assertTrue(ret, ("Volume %s : All process are not "
                              "online", self.volname))
        g.log.info("Successfully Verified volume %s processes are online",
                   self.volname)

        # Verfiy glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))

        # check the self heal daemon process after starting volume and
        # restarting glusterd process
        g.log.info("Starting to get self-heal daemon process "
                   "on nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        glustershd_pids_after_glusterd_restart = pids

        self.assertNotEqual(glustershd_pids,
                            glustershd_pids_after_glusterd_restart,
                            ("Self Heal Daemon pids are same after "
                             "restarting glusterd process"))
        g.log.info("Self Heal Daemon process are different before and "
                   "after restarting glusterd process")

        # select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # bring bricks offline
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)

        # restart glusterd after brought down the brick
        g.log.info("Restart glusterd on all servers %s", nodes)
        ret = restart_glusterd(nodes)
        self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                              nodes))
        g.log.info("Successfully restarted glusterd on all nodes %s",
                   nodes)

        # Verify volume's all process are online for 60 sec
        g.log.info("Verifying volume's all process are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   60)
        self.assertTrue(ret, ("Volume %s : All process are not "
                              "online", self.volname))
        g.log.info("Successfully Verified volume %s processes are online",
                   self.volname)

        # Verfiy glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))

        # check the self heal daemon process after killing brick and
        # restarting glusterd process
        g.log.info("Starting to get self-heal daemon process "
                   "on nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        glustershd_pids_after_killing_brick = pids

        self.assertNotEqual(glustershd_pids_after_glusterd_restart,
                            glustershd_pids_after_killing_brick,
                            ("Self Heal Daemon process are same from before "
                             "killing the brick,restarting glusterd process"))
        g.log.info("Self Heal Daemon process are different after killing the "
                   "brick, restarting the glusterd process")

        # brought the brick online
        g.log.info("bringing up the bricks : %s online",
                   bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to brought the bricks online"))
        g.log.info("Successfully brought the bricks online")

        # check all bricks are online
        g.log.info("Verifying all bricka are online or not.....")
        ret = are_bricks_online(self.mnode, self.volname,
                                bricks_to_bring_offline)
        self.assertTrue(ret, ("Not all bricks are online"))
        g.log.info("All bricks are online.")
    def test_self_heal(self):
        """
        Verify self-heal after deleting data while a brick is down.

        Description:-
        - Create files on mount point
        - Kill one brick from volume
        - rm -rfv on mount point
        - bring bricks online
        - wait for heals
        - list

        Raises:
            ExecutionError: if removing the files from the mount point fails.
        """
        # pylint: disable=too-many-statements

        # IO on the mount point
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 35 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" % (
                       self.script_upload_path,
                       self.counter, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            # Advance the start number so each mount gets its own dir names
            self.counter = self.counter + 10

        # Select bricks to bring offline (empty entries are filtered out)
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # Killing one brick from the volume set
        g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        # The message must be a formatted string: a (fmt, arg) tuple would be
        # printed verbatim by unittest instead of being interpolated.
        self.assertTrue(ret, "Failed to bring bricks: %s offline" %
                        bricks_to_bring_offline)
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Validate if bricks are offline
        g.log.info("Validating if bricks: %s are offline",
                   bricks_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, "Not all the bricks in list: %s are offline" %
                        bricks_to_bring_offline)
        g.log.info("Successfully validated that bricks: %s are all offline",
                   bricks_to_bring_offline)

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Checking volume status
        g.log.info("Logging volume info and Status after bringing bricks "
                   "offline from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Removing files from the mount point when one brick is down
        g.log.info("Removing files from the mount point")
        mountpoint = self.mounts[0].mountpoint
        client = self.mounts[0].client_system
        cmd = "rm -rfv %s/*" % mountpoint
        ret, _, _ = g.run(client, cmd)
        if ret != 0:
            raise ExecutionError("failed to delete the files")

        # Bringing bricks online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bricks %s are online', bricks_to_bring_offline)

        # Check if bricks are online
        g.log.info("Checking bricks are online or not")
        ret = are_bricks_online(self.mnode, self.volname,
                                bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not online' %
                        bricks_to_bring_offline)
        g.log.info('Bricks %s are online', bricks_to_bring_offline)

        # Monitoring heals on the volume
        g.log.info("Wait for heal completion...")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Self heal didn't complete even after waiting "
                             "for 20 minutes.")
        g.log.info("Self-heal completed successfully on volume %s",
                   self.volname)

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
    def test_dist_to_repl_automatic_heal_should_be_triggered(self):
        """
        Convert a single-brick volume to replica 2 and verify healing.

        - create a single brick volume
        - add some files and directories
        - get arequal from mountpoint
        - add-brick such that this brick makes the volume a replica vol 1x2
        - make sure heal is completed
        - get arequals from all bricks and compare with arequal from mountpoint
        - bring down brick 0
        - create new files and validate IO
        - bring brick 0 up
        - make sure heal is completed
        """
        # pylint: disable=too-many-statements,too-many-locals
        # Start IO on mounts
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dir-length 1 "
                   "--dir-depth 1 "
                   "--max-num-of-dirs 1 "
                   "--num-of-files 10 %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            g.log.info("IO on %s:%s is started successfully",
                       mount_obj.client_system, mount_obj.mountpoint)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # Get arequal for mount before adding bricks
        g.log.info('Getting arequal before adding bricks...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before adding bricks is successful')
        # The checksum total is the text after the last ':' on the last line
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Form brick list to add
        g.log.info('Forming brick list to add...')
        bricks_to_add = form_bricks_list(self.mnode, self.volname, 1,
                                         self.servers, self.all_servers_info)
        g.log.info('Brick list to add: %s', bricks_to_add)

        # Add bricks
        g.log.info("Start adding bricks to volume...")
        ret, _, _ = add_brick(self.mnode,
                              self.volname,
                              bricks_to_add,
                              force=True,
                              replica_count=2)
        self.assertFalse(ret, "Failed to add bricks %s" % bricks_to_add)
        g.log.info("Adding bricks is successful on volume %s", self.volname)

        # Make sure the newly added bricks are available in the volume
        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume: %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick list: %s", bricks_list)
        for brick in bricks_to_add:
            self.assertIn(brick, bricks_list,
                          'Brick %s is not in brick list' % brick)
        g.log.info('New bricks are present in the volume')

        # Make sure volume change from distribute to replicate volume
        vol_info_dict = get_volume_type_info(self.mnode, self.volname)
        vol_type = vol_info_dict['volume_type_info']['typeStr']
        self.assertEqual(
            'Replicate', vol_type, 'Volume type is not converted to Replicate '
            'after adding bricks')
        g.log.info('Volume type is successfully converted to Replicate '
                   'after adding bricks')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal on bricks and compare with mount_point_total
        # It should be the same
        g.log.info('Getting arequal on bricks...')
        for brick in bricks_list:
            g.log.info('Getting arequal on bricks %s...', brick)
            node, brick_path = brick.split(':')
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan' % brick_path)
            ret, arequal, _ = g.run(node, command)
            self.assertFalse(ret, 'Failed to get arequal on brick %s' % brick)
            g.log.info('Getting arequal for %s is successful', brick)
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(
                mount_point_total, brick_total,
                'Arequals for mountpoint and %s are not equal' % brick)
            g.log.info('Arequals for mountpoint and %s are equal', brick)
        g.log.info('All arequals are equal for replicated')

        # Bring brick 0 offline
        g.log.info('Bringing bricks %s offline...', bricks_list[0])
        ret = bring_bricks_offline(self.volname, [bricks_list[0]])
        self.assertTrue(ret,
                        'Failed to bring bricks %s offline' % bricks_list[0])

        ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[0]])
        self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[0])
        g.log.info('Bringing bricks %s offline is successful', bricks_list[0])

        # Start IO on mounts while brick 0 is down
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 1k %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            g.log.info("IO on %s:%s is started successfully",
                       mount_obj.client_system, mount_obj.mountpoint)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # Bring brick 0 online
        g.log.info('Bringing bricks %s online...', bricks_list[0])
        ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[0]])
        self.assertTrue(ret,
                        'Failed to bring bricks %s online' % bricks_list[0])
        g.log.info('Bringing bricks %s online is successful', bricks_list[0])

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')
Beispiel #17
0
    def test_brick_process_not_started_on_read_only_node_disks(self):
        """
        Remount a brick directory read-only and expect the posix
        translator initialisation error in the brick log.

        * create volume and start
        * kill one brick
        * start IO
        * unmount the brick directory from node
        * remount the brick directory with read-only option
        * start the volume with "force" option
        * check for error 'posix: initializing translator failed' in log file
        * remount the brick directory with read-write option
        * start the volume with "force" option
        * validate IO
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Pick the bricks that will be taken down
        offline_bricks = select_bricks_to_bring_offline(
            self.mnode, self.volname)['volume_bricks']

        # Take them down and confirm they really are offline
        g.log.info('Bringing bricks %s offline...', offline_bricks)
        went_down = bring_bricks_offline(self.volname, offline_bricks)
        self.assertTrue(
            went_down, 'Failed to bring bricks %s offline' % offline_bricks)

        are_down = are_bricks_offline(self.mnode, self.volname,
                                      offline_bricks)
        self.assertTrue(
            are_down, 'Bricks %s are not offline' % offline_bricks)
        g.log.info('Bringing bricks %s offline is successful',
                   offline_bricks)

        # Kick off file creation on every mount
        for mount in self.mounts:
            g.log.info("Starting IO on %s:%s", mount.client_system,
                       mount.mountpoint)
            io_cmd = (
                "/usr/bin/env python %s create_files -f 100 "
                "%s/%s/test_dir" % (self.script_upload_path,
                                    mount.mountpoint,
                                    mount.client_system))
            self.all_mounts_procs.append(
                g.run_async(mount.client_system, io_cmd, user=mount.user))

        # Unmount the directory that backs the first offline brick.
        # The first three '/'-separated components of the brick path are
        # used as the mount directory.
        brick_host, brick_path = offline_bricks[0].split(':')
        path_parts = brick_path.split('/')
        brick_mount_dir = '/'.join(path_parts[0:3])
        g.log.info('Start umount brick %s...', brick_mount_dir)
        rc, _, _ = g.run(brick_host, 'umount -l %s' % brick_mount_dir)
        self.assertFalse(rc, 'Failed to umount brick %s' % brick_mount_dir)
        g.log.info('Successfully umounted %s', brick_mount_dir)

        # Lower bound of the window in which the error must be logged
        g.log.info('Getting time before remount the directory and '
                   'checking logs for error...')
        _, epoch_before, _ = g.run(brick_host, 'date -u +%s')
        g.log.info('Time before remount the directory and checking logs - %s',
                   epoch_before)

        # Mount the directory back, read-only this time
        g.log.info('Start remount brick %s with read-only option...',
                   brick_mount_dir)
        rc, _, _ = g.run(brick_host, 'mount -o ro %s' % brick_mount_dir)
        self.assertFalse(rc, 'Failed to remount brick %s' % brick_mount_dir)
        g.log.info('Successfully remounted %s with read-only option',
                   brick_mount_dir)

        # Force-start the volume
        g.log.info('starting volume with "force" option...')
        rc, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertFalse(
            rc,
            'Failed to start volume %s with "force" option' % self.volname)
        g.log.info('Successfully started volume %s with "force" option',
                   self.volname)

        # Look for the translator failure in the brick log; the log file
        # name is built from the last three components of the brick path
        g.log.info(
            "Checking logs for an 'initializing translator failed' "
            "error for %s brick...", brick_mount_dir)
        error_msg = 'posix: initializing translator failed'
        grep_cmd = ("cat /var/log/glusterfs/bricks/%s-%s-%s.log | "
                    "grep '%s'" %
                    (path_parts[-3], path_parts[-2], path_parts[-1],
                     error_msg))
        _, grep_out, _ = g.run(brick_host, grep_cmd)
        # Only the most recent matching line is examined
        last_match = grep_out.rstrip().split('\n')[-1]

        self.assertTrue(error_msg in last_match, 'No errors in logs')
        g.log.info('EXPECTED: %s', error_msg)

        # Extract the timestamp: text before the first 'E', minus the
        # first character and the last two, cut at the fractional seconds
        line_prefix = last_match.split('E')[0]
        stamp_text = line_prefix[1:-2].split('.')[0]
        stamp_epoch = calendar.timegm(
            time.strptime(stamp_text, '%Y-%m-%d %H:%M:%S'))
        g.log.info('Time_msg from logs - %s ', stamp_text)
        g.log.info('Time from logs - %s ', stamp_epoch)

        # Upper bound of the window
        g.log.info('Getting time after remount the directory and '
                   'checking logs for error...')
        _, epoch_after, _ = g.run(brick_host, 'date -u +%s')
        g.log.info('Time after remount the directory and checking logs - %s',
                   epoch_after)

        # The logged error must fall inside [before, after]
        g.log.info('Checking if an error is in right time period...')
        self.assertTrue(
            int(epoch_before) <= int(stamp_epoch) <= int(epoch_after),
            'Expected error is not in right time period')
        g.log.info('Expected error is in right time period')

        # Unmount the read-only directory again
        g.log.info('Start umount brick %s...', brick_mount_dir)
        rc, _, _ = g.run(brick_host, 'umount -l %s' % brick_mount_dir)
        self.assertFalse(rc, 'Failed to umount brick %s' % brick_mount_dir)
        g.log.info('Successfully umounted %s', brick_mount_dir)

        # Mount it back without the read-only option
        g.log.info('Start remount brick %s with read-write option...',
                   brick_mount_dir)
        rc, _, _ = g.run(brick_host, 'mount %s' % brick_mount_dir)
        self.assertFalse(rc, 'Failed to remount brick %s' % brick_mount_dir)
        g.log.info('Successfully remounted %s with read-write option',
                   brick_mount_dir)

        # Force-start the volume once more
        g.log.info('starting volume with "force" option...')
        rc, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertFalse(
            rc,
            'Failed to start volume %s with "force" option' % self.volname)
        g.log.info('Successfully started volume %s with "force" option',
                   self.volname)

        # Finally make sure the IO started earlier finished cleanly
        g.log.info('Validating IO on all mounts')
        io_ok = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(io_ok, "IO failed on some of the clients")
        g.log.info('Successfully Validated IO on all mounts')
        self.io_validation_complete = True
Beispiel #18
0
    def test_heal_info_no_hang(self):
        """
        Check that heal info finishes in time while heals and IO are running.

        Testcase steps:
        1. Start kernel untar on the mount
        2. While untar is going on, kill a brick of the replica.
        3. Wait for the untar to be over, resulting in pending heals.
        4. Get the approx. number of pending heals and save it
        5. Bring the brick back online.
        6. Trigger heal
        7. Run more I/Os with dd command
        8. Run heal info command and check that it completes successfully under
           a timeout that is based on the no. of heals in step 4.
        """
        self.list_of_io_processes = []
        first_mount = self.mounts[0]
        self.linux_untar_dir = "{}/{}".format(first_mount.mountpoint,
                                              "linuxuntar")
        created = mkdir(self.clients[0], self.linux_untar_dir)
        self.assertTrue(created, "Failed to create dir linuxuntar for untar")

        # Launch the kernel untar inside the linuxuntar directory
        untar_procs = run_linux_untar(self.clients[0],
                                      first_mount.mountpoint,
                                      dirs=('linuxuntar',))
        self.list_of_io_processes += untar_procs
        self.is_io_running = True

        # Take one randomly chosen brick down so that heals pile up
        victim_brick = random.choice(self.bricks_list)
        went_down = bring_bricks_offline(self.volname, victim_brick)
        self.assertTrue(
            went_down, 'Failed to bring bricks %s offline' % victim_brick)
        is_down = are_bricks_offline(self.mnode, self.volname,
                                     [victim_brick])
        self.assertTrue(is_down, 'Bricks %s are not offline' % victim_brick)
        g.log.info('Bringing bricks %s offline is successful',
                   victim_brick)

        # Block until the untar is finished
        untar_status = self._wait_for_untar_completion()
        self.assertFalse(untar_status,
                         "IO didn't complete or failed on client")
        self.is_io_running = False

        # Approximate pending-heal count: sum of the per-brick numbers,
        # halved by the awk script
        heal_count_cmd = (
            "gluster volume heal %s statistics heal-count | grep Number "
            "| awk '{sum+=$4} END {print sum/2}'" % self.volname)
        rc, self.num_entries, _ = g.run(self.mnode, heal_count_cmd)
        self.assertEqual(rc, 0, "Failed to get heal-count statistics")

        # Bring the brick back
        came_up = bring_bricks_online(self.mnode, self.volname,
                                      [victim_brick])
        self.assertTrue(
            came_up, 'Failed to bring brick %s online' % victim_brick)
        g.log.info('Bringing brick %s online is successful',
                   victim_brick)

        # Start the heal
        heal_started = trigger_heal(self.mnode, self.volname)
        self.assertTrue(heal_started, 'Starting heal failed')
        g.log.info('Index heal launched')

        # Extra write load in the background; the handle is not waited on
        dd_cmd = ("for i in `seq 1 10`; do dd if=/dev/urandom of=%s/file_$i "
                  "bs=1M count=100; done" % first_mount.mountpoint)
        _dd_proc = g.run_async(first_mount.client_system, dd_cmd,
                               user=first_mount.user)

        # heal info has to return within the timeout
        finished_in_time = self._does_heal_info_complete_within_timeout()
        self.assertTrue(finished_in_time, 'Heal info timed out')
        g.log.info('Heal info completed succesfully')
Beispiel #19
0
    def test_create_snap_bricks(self):
        """
        Create a snapshot while one brick of the volume is offline.

        1. get brick list
        2. check all bricks are online
        3. Selecting one brick randomly to bring it offline
        4. get brick list
        5. check all bricks are online
        6. Offline Bricks list
        7. Online Bricks list
        8. Create snapshot of volume
        9. snapshot create should fail
        """
        # get the bricks from the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # check all bricks are online
        g.log.info("Verifying all bricks are online or not.....")
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, ("Not all bricks are online"))
        g.log.info("All bricks are online.")

        # Selecting one brick randomly to bring it offline
        g.log.info("Selecting one brick randomly to bring it offline")
        brick_to_bring_offline = random.choice(bricks_list)
        g.log.info("Brick to bring offline:%s ", brick_to_bring_offline)
        ret = bring_bricks_offline(self.volname, brick_to_bring_offline, None)
        self.assertTrue(ret, "Failed to bring the bricks offline")
        g.log.info("Randomly Selected brick: %s", brick_to_bring_offline)

        # get brick list
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # One brick was just taken down, so the "all bricks online" check
        # is expected to be False here.
        g.log.info("Verifying all bricks are online or not.....")
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertFalse(ret, "All bricks are still online after bringing "
                              "brick %s offline" % brick_to_bring_offline)
        g.log.info("Not all bricks are online, as expected.")

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # Offline Bricks list
        offbricks = get_offline_bricks_list(self.mnode, self.volname)
        g.log.info("Bricks Offline: %s", offbricks)

        # Online Bricks list
        onbricks = get_online_bricks_list(self.mnode, self.volname)
        g.log.info("Bricks Online: %s", onbricks)

        # Create snapshot of volume
        # NOTE(review): step 9 of the docstring says the create should fail,
        # while this assertion and log treat success as the expected result.
        # Also, if snap_create returns a (rc, out, err) tuple like the other
        # command wrappers used in this file, assertTrue on it can never
        # fail - confirm the intended expectation and the return type.
        ret = snap_create(self.mnode, self.volname, "snap1", False,
                          "Description with $p3c1al characters!")
        self.assertTrue(ret, ("Failed to create snapshot snap1"))
        g.log.info("Snapshot snap1 of volume %s created Successfully",
                   self.volname)

        # Volume status
        ret = get_volume_info(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to perform gluster volume "
                              "info on volume %s" % self.volname))
        g.log.info("Gluster volume info on volume %s is successful",
                   self.volname)
        # snapshot list
        # NOTE(review): same caveat as snap_create above if snap_list
        # returns a tuple - confirm.
        ret = snap_list(self.mnode)
        self.assertTrue(
            ret, ("Failed to list snapshot of volume %s" % self.volname))
        g.log.info("Snapshot list command for volume %s was successful",
                   self.volname)
    def test_entry_heal_with_quota(self):
        """
        Entry heal must not resurrect a deleted file under a quota limit.

        - Create a 1x3 volume
        - Set quota object limit
        - Create files less than the limit
        - Bring down a brick and create more files until limit is hit
        - Delete one file so that we are below the limit, and create one more
          file
        - Bring the brick back up and launch heal
        - Verify that after heal is complete, the deleted file does not
          re-appear in any of the bricks.
        """
        # pylint: disable=too-many-statements
        mountpoint = self.mounts[0].mountpoint

        # Enable Quota
        g.log.info("Enabling quota on the volume %s", self.volname)
        ret, _, _ = quota_enable(self.mnode, self.volname)
        self.assertEqual(
            ret, 0, "Failed to enable quota on the volume %s" % self.volname)
        g.log.info("Successfully enabled quota on the volume %s", self.volname)

        # Check if quota is enabled
        g.log.info("Validate Quota is enabled on the volume %s", self.volname)
        ret = is_quota_enabled(self.mnode, self.volname)
        self.assertTrue(
            ret, "Quota is not enabled on the volume %s" % self.volname)
        g.log.info("Successfully Validated quota is enabled on volume %s",
                   self.volname)

        # Set quota related options
        options = {
            "quota-deem-statfs": "on",
            "soft-timeout": "0",
            "hard-timeout": "0"
        }
        g.log.info("setting quota volume options %s", options)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, ("Unable to set volume option %s for "
                              "volume %s" % (options, self.volname)))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Create directory on mount
        ret = mkdir(self.mounts[0].client_system, "%s/dir" % mountpoint)
        self.assertTrue(ret, "mkdir failed")

        # Set Quota limit on the directory
        path = "/dir"
        g.log.info(
            "Setting Quota Limit object on the path %s of the "
            "volume %s", path, self.volname)
        ret, _, _ = quota_limit_objects(self.mnode,
                                        self.volname,
                                        path=path,
                                        limit="10")
        self.assertEqual(ret, 0,
                         ("Failed to set quota limit object "
                          "on path %s of the volume %s"
                          % (path, self.volname)))
        g.log.info(
            "Successfully set the Quota limit object on %s of the "
            "volume %s", path, self.volname)

        # Create files while staying below the object limit
        cmd = ("touch %s/dir/file{1..5}" % mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "file creation failed")

        # Bring brick3 offline
        bricks_list = get_all_bricks(self.mnode, self.volname)
        brick3 = bricks_list[2]
        g.log.info('Bringing brick %s offline', brick3)
        ret = bring_bricks_offline(self.volname, brick3)
        self.assertTrue(ret, 'Failed to bring brick %s offline' % brick3)

        ret = are_bricks_offline(self.mnode, self.volname, [brick3])
        self.assertTrue(ret, 'Brick %s is not offline' % brick3)
        g.log.info('Bringing brick %s offline was successful', brick3)

        # Create files until quota object limit
        cmd = ("touch %s/dir/file{6..9}" % mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "file creation failed")

        # The next create must fail
        cmd = ("touch %s/dir/file10" % mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(
            ret, 1, ("Creation of %s/dir/file10 succeeded while "
                     "it was not supposed to." % mountpoint))
        g.log.info(
            "Creation of %s/dir/file10 failed as expected due to "
            "quota object limit.", mountpoint)

        # Delete one file and re-try the create to succeed.
        cmd = ("rm %s/dir/file1" % mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "File deletion failed")
        cmd = ("touch %s/dir/file10" % mountpoint)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "File creation failed")

        # Bring brick3 online and check status
        g.log.info('Bringing brick %s online...', brick3)
        ret = bring_bricks_online(self.mnode, self.volname, [brick3])
        self.assertTrue(ret, 'Failed to bring brick %s online' % brick3)
        g.log.info('Bringing brick %s online is successful', brick3)

        g.log.info("Verifying if brick3 is online....")
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, ("brick3 did not come up"))
        g.log.info("brick3 has come online.")

        # Trigger heal
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Starting heal failed')
        g.log.info('Index heal launched')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # file1 was deleted while brick3 was down, so an accidental
        # conservative merge during heal would recreate it. It must be
        # absent on every brick, while file10 (created after the delete)
        # must be present on every brick. stat exits 0 only when the path
        # exists.
        for brick in bricks_list:
            node, brick_path = brick.split(':')
            ret, _, _ = g.run(node, 'stat %s/dir/file1' % brick_path)
            self.assertNotEqual(
                ret, 0, 'Deleted file1 re-appeared on brick %s' % brick)
            ret, _, _ = g.run(node, 'stat %s/dir/file10' % brick_path)
            self.assertEqual(
                ret, 0, 'file10 is missing on brick %s' % brick)
    def test_add_identical_brick(self):
        """
        Add a brick with an identical path from a newly probed node.

        In this test case:
        1. Create Dist Volume on Node 1
        2. Down brick on Node 1
        3. Peer Probe N2 from N1
        4. Add identical brick on newly added node
        5. Check volume status
        """

        # pylint: disable=too-many-statements
        # Create a distributed volume on Node1
        number_of_brick = 1
        servers_info_from_single_node = {
            self.servers[0]: self.all_servers_info[self.servers[0]]
        }
        self.volname = "testvol"
        bricks_list = form_bricks_list(self.servers[0], self.volname,
                                       number_of_brick, self.servers[0],
                                       servers_info_from_single_node)
        ret, _, _ = volume_create(self.servers[0],
                                  self.volname,
                                  bricks_list,
                                  force=False)
        self.assertEqual(ret, 0, "Volume create failed")
        g.log.info("Volume %s created successfully", self.volname)

        ret, _, _ = volume_start(self.servers[0], self.volname, True)
        self.assertEqual(ret, 0, ("Failed to start the "
                                  "volume %s" % self.volname))
        g.log.info("Get all the bricks of the volume")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")
        g.log.info("Successfully got the list of bricks of volume")

        # Bring the only brick of the volume down
        ret = bring_bricks_offline(self.volname, bricks_list[0])
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Successfully brought the bricks down")

        ret, _, _ = peer_probe(self.servers[0], self.servers[1])
        self.assertEqual(ret, 0, ("peer probe from %s to %s is failed"
                                  % (self.servers[0], self.servers[1])))
        g.log.info("peer probe is success from %s to "
                   "%s", self.servers[0], self.servers[1])

        # wait for some time before add-brick
        time.sleep(2)

        # Replace just host IP to create identical brick.
        # str.replace is used instead of string.replace(), which exists only
        # in Python 2.
        add_bricks = [
            bricks_list[0].replace(self.servers[0], self.servers[1])
        ]
        ret, _, _ = add_brick(self.mnode, self.volname, add_bricks)
        self.assertEqual(ret, 0, "Failed to add the bricks to the volume")
        g.log.info("Successfully added bricks to volume %s", add_bricks[0])

        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Volume start with force failed")

        # Volume status must be retrievable after adding the brick
        vol_status = get_volume_status(self.mnode, self.volname)
        self.assertIsNotNone(
            vol_status, "Failed to get volume "
            "status for %s" % self.volname)
# Example #22
# 0
    def test_heal_client_io_hang(self):
        """
        Client IO must complete while files are healed from the client.

        - Disable heal on the volume
        - Create 100 files under <mountpoint>/test with all bricks up
        - Bring the first brick offline and overwrite the files
        - Bring the brick back online and verify all bricks are online
        - Append to and read the files from the client
        - Check heal info and completion
        - Log volume info and status
        """
        mountpoint = self.mounts[0].mountpoint
        client = self.mounts[0].client_system

        # disable server side heal
        ret = disable_heal(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to disable server side heal"))
        g.log.info("Successfully disabled server side heal")

        # Log Volume Info and Status after disabling heal
        g.log.info("Logging volume info and status")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed "
                              "on volume %s" % self.volname))

        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the bricks list")

        # Create files
        cmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
               "do touch file$i; done" % mountpoint)

        ret, _, err = g.run(client, cmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished creating files while all the bricks are UP')

        # Bring bricks offline
        ret = bring_bricks_offline(self.volname, bricks_list[0:1])
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Successfully brought the bricks down")

        # Start pumping IO from client
        cmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
               "do dd if=/dev/urandom of=file$i bs=1M "
               "count=5;done" % mountpoint)

        ret, _, err = g.run(client, cmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished writing on files while a brick is DOWN')

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname, bricks_list[0:1])
        self.assertTrue(ret, "Failed to bring up the bricks")
        g.log.info("Successfully brought the bricks up")

        # Verifying all bricks online
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, "All bricks are not online")

        # Start client side heal by reading/writing files.
        appendcmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
                     "do dd if=/dev/urandom of=file$i bs=1M "
                     "count=1 oflag=append conv=notrunc;done" % mountpoint)

        readcmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
                   "do dd if=file$i of=/dev/zero bs=1M "
                   "count=5;done" % mountpoint)

        ret, _, err = g.run(client, appendcmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished append on files after bringing bricks online')

        ret, _, err = g.run(client, readcmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished read on files after bringing bricks online')

        # check the heal info and completion
        ec_check_heal_comp(self)

        # Log Volume Info and Status after bringing the brick up
        g.log.info("Logging volume info and status")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed "
                              "on volume %s" % self.volname))
        g.log.info(
            "Successful in logging volume info and status "
            "of volume %s", self.volname)
    def test_heal_info_should_have_fixed_fields(self):
        """
        Heal info must report the expected fields for every brick.

        - Create IO
        - While IO is creating - bring down a couple of bricks
        - Wait for IO to complete
        - Bring up the down bricks
        - Wait for heal to complete
        - Check for fields 'Brick', 'Status', 'Number of entries' in heal info
        """
        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                       "-d 2 -l 2 -f 50 %s" %
                       (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        # IO is in flight and has not been validated yet
        self.io_validation_complete = False

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(
            filter(None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                          bricks_to_bring_offline_dict['cold_tier_bricks'] +
                          bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get heal info. The result itself is validated here; the previous
        # check reused the stale `ret` from the split-brain step.
        g.log.info('Getting heal info...')
        heal_info_dicts = get_heal_info_summary(self.mnode, self.volname)
        self.assertIsNotNone(heal_info_dicts, 'Failed to get heal info')
        g.log.info(heal_info_dicts)

        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')

        # Check all fields in heal info dict
        g.log.info('Checking for all the fields in heal info...')
        for brick in bricks_list:
            g.log.info('Checking fields for %s', brick)
            self.assertEqual(heal_info_dicts[brick]['status'], 'Connected',
                             'Status is not Connected for brick %s' % brick)
            self.assertEqual(heal_info_dicts[brick]['numberOfEntries'], '0',
                             'numberOfEntries is not 0 for brick %s' % brick)

        g.log.info('Successfully checked for all the fields in heal info')
# Example #24
# 0
    def test_multiple_clients_dd_on_same_file_default(self):
        """
        Heal a large file written and read while a brick is cycled.

        - Create 2GB file
        - While creating file, start reading file
        - Bring down brick1
        - Bring back the brick brick1
        - Start healing
        - Bring down brick1
        - Wait for IO to complete
        - Wait for reading to complete
        - Bring back the brick brick1
        - Start healing
        - Wait for heal to complete
        - Check for split-brain
        - Calculate arequals on all the bricks and compare with mountpoint
        """
        # pylint: disable=too-many-statements,too-many-locals
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')

        # The brick that is cycled offline/online throughout the test
        target_brick = bricks_list[1]

        def _bring_target_brick_offline():
            """Bring the target brick offline and confirm it is down."""
            g.log.info('Bringing bricks %s offline...', target_brick)
            ret = bring_bricks_offline(self.volname, [target_brick])
            self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                            target_brick)

            ret = are_bricks_offline(self.mnode, self.volname,
                                     [target_brick])
            self.assertTrue(ret, 'Bricks %s are not offline'
                            % target_brick)
            g.log.info('Bringing bricks %s offline is successful',
                       target_brick)

        def _bring_target_brick_online_and_heal():
            """Bring the target brick back online, then trigger heal."""
            g.log.info('Bringing bricks %s online...', target_brick)
            ret = bring_bricks_online(self.mnode, self.volname,
                                      [target_brick])
            self.assertTrue(ret, 'Failed to bring bricks %s online' %
                            target_brick)
            g.log.info('Bringing bricks %s online is successful',
                       target_brick)

            ret = trigger_heal(self.mnode, self.volname)
            self.assertTrue(ret, 'Heal is not started')
            g.log.info('Healing is started')

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("cd %s ; "
                       "dd if=/dev/urandom of=test_file bs=1M count=2020"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        # IO is in flight and has not been validated yet
        self.io_validation_complete = False

        # Reading files on client side
        all_mounts_procs_read = []
        for mount_obj in self.mounts:
            g.log.info("Reading data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Read files while the write is still running
            g.log.info('Reading files...')
            command = ("python %s read %s"
                       % (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            all_mounts_procs_read.append(proc)

        # First cycle: brick down, back up, start healing
        _bring_target_brick_offline()
        _bring_target_brick_online_and_heal()

        # Bring the brick down again and let the IO finish meanwhile
        _bring_target_brick_offline()

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )

        # Validate reading
        self.assertTrue(
            validate_io_procs(all_mounts_procs_read, self.mounts),
            "Reading failed on some of the clients"
        )
        self.io_validation_complete = True

        # Second cycle: bring the brick back and start healing
        _bring_target_brick_online_and_heal()

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal for mount
        g.log.info('Getting arequal...')
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after healing is successful')
        # The compared value is the text after the last ':' on the last
        # output line
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Get arequal on bricks and compare with mount_point_total
        # It should be the same
        g.log.info('Getting arequal on bricks...')
        for brick in bricks_list:
            g.log.info('Getting arequal on bricks %s...', brick)
            node, brick_path = brick.split(':')
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan'
                       % brick_path)
            ret, arequal, _ = g.run(node, command)
            self.assertEqual(ret, 0, 'Failed to get arequal on brick %s'
                             % brick)
            g.log.info('Getting arequal for %s is successful', brick)
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(mount_point_total, brick_total,
                             'Arequals for mountpoint and %s are not equal'
                             % brick)
            g.log.info('Arequals for mountpoint and %s are equal', brick)
        g.log.info('All arequals are equal')
# Example #25
# 0
    def test_rebalance_with_brick_down(self):
        """
        Rebalance with brick down in replica
        - Create a Replica volume.
        - Bring down one of the brick down in the replica pair
        - Do some IO and create files on the mount point
        - Add a pair of bricks to the volume
        - Initiate rebalance
        - Bring back the brick which was down.
        - After self heal happens, all the files should be present.
        """
        # Log the volume info and status before brick is down.
        log_volume_info_and_status(self.mnode, self.volname)

        # Bring one of the bricks offline
        brick_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(brick_list, "Failed to get the brick list")
        offline_brick = choice(brick_list)
        ret = bring_bricks_offline(self.volname, offline_brick)
        # Without this check the rest of the test could run with all bricks
        # up and pass without exercising the brick-down path.
        self.assertTrue(ret, "Failed to bring brick %s offline"
                        % offline_brick)
        g.log.info("Brick %s is brought offline", offline_brick)

        # Log the volume info and status after brick is down.
        log_volume_info_and_status(self.mnode, self.volname)

        # Create files at mountpoint.
        cmd = (
            "/usr/bin/env python %s create_files "
            "-f 2000 --fixed-file-size 1k --base-file-name file %s"
            % (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(
            self.mounts[0].client_system, cmd, user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)

        # Wait for IO to complete.
        self.assertTrue(wait_for_io_to_complete(self.all_mounts_procs,
                                                self.mounts[0]),
                        "IO failed on some of the clients")
        g.log.info("IO completed on the clients")

        # Compute the arequal checksum before bringing all bricks online
        arequal_before_all_bricks_online = collect_mounts_arequal(self.mounts)

        # Log the volume info and status before expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Expand the volume.
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, "Failed to expand the volume %s" % self.volname)
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Log the volume info after expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Start Rebalance.
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s" % self.volname))
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Wait for rebalance to complete
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s" % self.volname))
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Log the volume info and status before bringing all bricks online
        log_volume_info_and_status(self.mnode, self.volname)

        # Bring all bricks online.
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Not able to start volume with force option")
        g.log.info("Volume start with force option successful.")

        # Log the volume info and status after bringing all bricks online
        log_volume_info_and_status(self.mnode, self.volname)

        # Monitor heal completion.
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "heal has not yet completed")
        g.log.info("Self heal completed")

        # Compute the arequal checksum after all bricks online.
        arequal_after_all_bricks_online = collect_mounts_arequal(self.mounts)

        # Comparing arequal checksum before and after the operations.
        self.assertEqual(arequal_before_all_bricks_online,
                         arequal_after_all_bricks_online,
                         "arequal checksum is NOT MATCHING")
        g.log.info("arequal checksum is SAME")
    def test_client_side_quorum_with_fixed_for_cross3(self):
        """
        Test Script to verify the Client Side Quorum with fixed
        for cross 3 volume

        * Disable self heal daemom
        * set cluster.quorum-type to fixed.
        * start I/O( write and read )from the mount point - must succeed
        * Bring down brick1
        * start I/0 ( write and read ) - must succeed
        * Bring down brick2
        * start I/0 ( write and read ) - must succeed
        * set the cluster.quorum-count to 1
        * start I/0 ( write and read ) - must succeed
        * set the cluster.quorum-count to 2
        * start I/0 ( write and read ) - read and write will fail
        * bring back the brick1 online
        * start I/0 ( write and read ) - must succeed
        * Bring back brick2 online
        * start I/0 ( write and read ) - must succeed
        * set cluster.quorum-type to auto
        * start I/0 ( write and read ) - must succeed
        * Bring down brick1 and brick2
        * start I/0 ( write and read ) - read and write will fail
        * set the cluster.quorum-count to 1
        * start I/0 ( write and read ) - read and write will fail
        * set the cluster.quorum-count to 3
        * start I/0 ( write and read ) - read and write will fail
        * set the quorum-type to none
        * start I/0 ( write and read ) - must succeed

        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-branches
        # Disable self heal daemon
        options = {"cluster.self-heal-daemon": "off"}
        g.log.info("setting %s for the volume %s", options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, ("Unable to set %s for volume %s" %
                              (options, self.volname)))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # set cluster.quorum-type to fixed
        options = {"cluster.quorum-type": "fixed"}
        g.log.info("setting %s for the volume %s", options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, ("Unable to set %s for volume %s" %
                              (options, self.volname)))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # start I/O( write ) - must succeed
        all_mounts_procs = []
        g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint)
        cmd = ("/usr/bin/env python %s create_files "
               "-f 10 --base-file-name file %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on mountpoint %s" % self.mounts[0].mountpoint)

        # read the file
        g.log.info("Start reading files on %s", self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path,
                                                  self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "Reads failed on some of the clients")

        # get the subvolumes
        g.log.info("Starting to get sub-volumes for volume %s", self.volname)
        subvols_dict = get_subvols(self.mnode, self.volname)
        num_subvols = len(subvols_dict['volume_subvols'])
        g.log.info("Number of subvolumes in volume %s:", num_subvols)

        # bring down brick1 for all the subvolumes
        offline_brick1_from_replicasets = []
        for i in range(0, num_subvols):
            subvol_brick_list = subvols_dict['volume_subvols'][i]
            g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list)
            brick_to_bring_offline1 = subvol_brick_list[0]
            g.log.info("Going to bring down the brick process "
                       "for %s", brick_to_bring_offline1)
            ret = bring_bricks_offline(self.volname, brick_to_bring_offline1)
            self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                                  "check the log file for more details."))
            g.log.info("Brought down the brick process "
                       "for %s successfully", brick_to_bring_offline1)
            offline_brick1_from_replicasets.append(brick_to_bring_offline1)

        # start I/0 ( write and read ) - must succeed
        g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("/usr/bin/env python %s create_files "
               "-f 10 --base-file-name testfile %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on mountpoint %s" % self.mounts[0].mountpoint)

        # read the file
        g.log.info("Start reading files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path,
                                                  self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "Reads failed on mountpoint %s" % self.mounts[0].mountpoint)

        # bring down brick2 for all the subvolumes
        offline_brick2_from_replicasets = []
        for i in range(0, num_subvols):
            subvol_brick_list = subvols_dict['volume_subvols'][i]
            g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list)
            brick_to_bring_offline2 = subvol_brick_list[1]
            g.log.info("Going to bring down the brick process "
                       "for %s", brick_to_bring_offline2)
            ret = bring_bricks_offline(self.volname, brick_to_bring_offline2)
            self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                                  "check the log file for more details."))
            g.log.info("Brought down the brick process "
                       "for %s successfully", brick_to_bring_offline2)
            offline_brick2_from_replicasets.append(brick_to_bring_offline2)

        # start I/0 ( write and read ) - must succeed
        g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("/usr/bin/env python %s create_files "
               "-f 10 --base-file-name newfile %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on mountpoint %s" % self.mounts[0].mountpoint)

        # read the file
        g.log.info("Start reading files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path,
                                                  self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "Reads failed on mountpoint %s" % self.mounts[0].mountpoint)

        # set the cluster.quorum-count to 1
        options = {"cluster.quorum-count": "1"}
        g.log.info("setting %s for the volume %s", options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(
            ret, "Unable to set %s for volume %s" % (options, self.volname))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # start I/0 ( write and read ) - must succeed
        g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("/usr/bin/env python %s create_files "
               "-f 10 --base-file-name filename %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on mountpoint %s" % self.mounts[0].mountpoint)

        # read the file
        g.log.info("Start reading files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path,
                                                  self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "Reads failed on mountpoint %s" % self.mounts[0].mountpoint)

        # set the cluster.quorum-count to 2
        options = {"cluster.quorum-count": "2"}
        g.log.info("setting %s for the volume %s", options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, ("Unable to set %s for volume %s" %
                              (options, self.volname)))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # start I/0 ( write and read ) - read and write will fail
        g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("dd if=/dev/urandom of=%s/test_file bs=1M count=1" %
               self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        g.log.info("Validating whether IO failed with "
                   "Transport endpoint is not connected")
        ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs,
                                             self.mounts, self.mount_type)
        self.assertTrue(ret, ("Unexpected Error and IO successful"
                              " on not connected transport endpoint"))
        g.log.info("EXPECTED: Transport endpoint is not connected"
                   " while creating file")

        # read the file
        g.log.info("Start reading files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("cat %s/file1.txt" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        g.log.info("Validating whether IO failed with "
                   "Transport endpoint is not connected")
        ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs,
                                             self.mounts, self.mount_type)
        self.assertTrue(ret, ("Unexpected error and IO successful"
                              " on not connected transport endpoint"))
        g.log.info("EXPECTED: Transport endpoint is not connected"
                   " while reading file")

        # bring back the brick1 online for all subvolumes
        g.log.info("bringing up the brick : %s online",
                   offline_brick1_from_replicasets)
        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            offline_brick1_from_replicasets,
            bring_bricks_online_methods='glusterd_restart')
        self.assertTrue(ret, ("Failed to brought the brick %s online" %
                              offline_brick1_from_replicasets))
        g.log.info("Successfully brought the brick %s online",
                   offline_brick1_from_replicasets)

        # start I/0 ( write and read ) - must succeed
        g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("/usr/bin/env python %s create_files "
               "-f 10 --base-file-name newfilename %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on mountpoint %s" % self.mounts[0].mountpoint)

        # read the file
        g.log.info("Start reading files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path,
                                                  self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "Reads failed on mountpoint %s" % self.mounts[0].mountpoint)

        # Bring back brick2 online
        g.log.info("bringing up the brick : %s online",
                   offline_brick2_from_replicasets)
        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            offline_brick2_from_replicasets,
            bring_bricks_online_methods='glusterd_restart')
        self.assertTrue(ret, ("Failed to brought the brick %s online" %
                              offline_brick2_from_replicasets))
        g.log.info("Successfully brought the brick %s online",
                   offline_brick2_from_replicasets)

        # start I/0 ( write and read ) - must succeed
        g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("/usr/bin/env python %s create_files "
               "-f 10 --base-file-name textfile %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on mountpoint %s" % self.mounts[0].mountpoint)

        # read the file
        g.log.info("Start reading files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path,
                                                  self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "Reads failed on mountpoint %s" % self.mounts[0].mountpoint)

        # set cluster.quorum-type to auto
        options = {"cluster.quorum-type": "auto"}
        g.log.info("setting %s for the volume %s", options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, ("Unable to set %s for volume %s" %
                              (options, self.volname)))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # start I/0 ( write and read ) - must succeed
        g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("/usr/bin/env python %s create_files "
               "-f 10 --base-file-name newtextfile %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on mountpoint %s" % self.mounts[0].mountpoint)

        # read the file
        g.log.info("Start reading files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path,
                                                  self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "Reads failed on mountpoint %s" % self.mounts[0].mountpoint)

        # bring down brick1 and brick2 for all the subvolumes
        for i in range(0, num_subvols):
            subvol_brick_list = subvols_dict['volume_subvols'][i]
            g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list)
            bricks_to_bring_offline = subvol_brick_list[0:2]
            g.log.info("Going to bring down the brick process for %s",
                       bricks_to_bring_offline)
            ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
            self.assertTrue(
                ret, "Failed to bring down the bricks. Please "
                "check the log file for more details.")
            g.log.info("Brought down the brick process "
                       "for %s successfully", bricks_to_bring_offline)

        # start I/0 ( write and read ) - read and write will fail
        all_mounts_procs = []
        g.log.info("Start creating file on mountpoint %s",
                   self.mounts[0].mountpoint)
        cmd = ("dd if=/dev/urandom of=%s/new_test_file bs=1M count=1" %
               self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        g.log.info("Validating whether IO failed with "
                   "Transport endpoint is not connected")
        ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs,
                                             self.mounts, self.mount_type)
        self.assertTrue(ret, ("Unexpected error and IO successful"
                              " on not connected transport endpoint"))
        g.log.info("EXPECTED: Transport endpoint is not connected"
                   " while creating files")

        # read the file
        g.log.info("Start reading files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        g.log.info("Starting reading file")
        cmd = ("cat %s/file1.txt" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        g.log.info("Validating whether IO failed with "
                   "Transport endpoint is not connected")
        ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs,
                                             self.mounts, self.mount_type)
        self.assertTrue(ret, ("Unexpected error and IO successful"
                              " on not connected transport endpoint"))
        g.log.info("EXPECTED: Transport endpoint is not connected"
                   " while reading file")

        # set the cluster.quorum-count to 1
        options = {"cluster.quorum-count": "1"}
        g.log.info("setting %s for the volume %s", options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(
            ret, "Unable to set %s for volume %s" % (options, self.volname))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # start I/0 ( write and read ) - read and write will fail
        g.log.info("Start creating files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("dd if=/dev/urandom of=%s/new_test_file bs=1M count=1" %
               self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        g.log.info("Validating whether IO failed with "
                   "Transport endpoint is not connected")
        ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs,
                                             self.mounts, self.mount_type)
        self.assertTrue(ret, ("Unexpected error and IO successful"
                              " on not connected transport endpoint"))
        g.log.info("EXPECTED: Transport endpoint is not connected"
                   " while creating files")

        # read the file
        g.log.info("Start reading files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("cat %s/file1.txt" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        g.log.info("Validating whether IO failed with "
                   "Transport endpoint is not connected")
        ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs,
                                             self.mounts, self.mount_type)
        self.assertTrue(ret, ("Unexpected error and IO successful"
                              " on not connected transport endpoint"))
        g.log.info("EXPECTED: Transport endpoint is not connected"
                   " while reading file")

        # set the cluster.quorum-count to 3
        options = {"cluster.quorum-count": "3"}
        g.log.info("setting %s for the volume %s", options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(
            ret, "Unable to set %s for volume %s" % (options, self.volname))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # start I/0 ( write and read ) - read and write will fail
        g.log.info("Start creating files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("dd if=/dev/urandom of=%s/new_test_file bs=1M count=1" %
               self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        g.log.info("Validating whether IO failed with "
                   "Transport endpoint is not connected")
        ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs,
                                             self.mounts, self.mount_type)
        self.assertTrue(ret, ("Unexpected error and IO successful"
                              " on not connected transport endpoint"))
        g.log.info("EXPECTED: Transport endpoint is not connected"
                   " while creating files")

        # read the file
        g.log.info("Start reading files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("cat %s/file1.txt" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        g.log.info("Validating whether IO failed with "
                   "Transport endpoint is not connected")
        ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs,
                                             self.mounts, self.mount_type)
        self.assertTrue(ret, ("Unexpected error and IO successful"
                              " on not connected transport endpoint"))
        g.log.info("EXPECTED: Transport endpoint is not connected"
                   " while reading file")

        # set the quorum-type to none
        options = {"cluster.quorum-type": "none"}
        g.log.info("setting %s for the volume %s", options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(
            ret, "Unable to set %s for volume %s" % (options, self.volname))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # start I/0 ( write and read ) - must succeed
        g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = ("/usr/bin/env python %s create_files "
               "-f 10 --base-file-name lastfile %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on mountpoint %s" % self.mounts[0].mountpoint)

        # read the file
        g.log.info("Start reading files on mountpoint %s",
                   self.mounts[0].mountpoint)
        all_mounts_procs = []
        cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path,
                                                  self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "Reads failed on mountpoint %s" % self.mounts[0].mountpoint)

    # Example #27
    # 0
    def test_shd_should_not_crash_executed_heal_info(self):
        """
        Verify heal info reports the expected pending-heal counts while
        bricks are taken down one after another.

        Flow:
        - turn "metadata-self-heal", "entry-self-heal" and "data-self-heal"
          off
        - create 10 files of 1M from every mount
        - bring brick0 offline and confirm it is offline
        - create 1000 files of 1k ("new_file") from every mount
        - get the heal info summary; every brick except brick0 must report
          1000 + 1 entries
        - set "performance.enable-least-priority" to "enable"
        - bring brick1 offline and confirm it is offline
        - set "quorum-type" to "fixed"
        - create 100 files of 1k ("new_new_file") from every mount
        - get the heal info summary; the last brick must report
          1000 + 100 + 1 entries
        """
        # pylint: disable=too-many-statements

        def apply_volume_options(volume_options):
            # Set the given options on the volume and fail the test if the
            # call reports failure.
            g.log.info('Setting options...')
            is_set = set_volume_options(self.mnode, self.volname,
                                        volume_options)
            self.assertTrue(is_set,
                            'Failed to set options %s' % volume_options)
            g.log.info("Successfully set %s for volume %s",
                       volume_options, self.volname)

        def create_files_and_validate(file_args, fresh_proc_list=True):
            # Start create_files on every mount in the background, then wait
            # for all of the started processes and assert they succeeded.
            if fresh_proc_list:
                self.all_mounts_procs = []
            for mount in self.mounts:
                g.log.info("Generating data for %s:%s", mount.client_system,
                           mount.mountpoint)
                g.log.info('Creating files...')
                io_cmd = ("/usr/bin/env python %s create_files %s %s" %
                          (self.script_upload_path, file_args,
                           mount.mountpoint))
                launched = g.run_async(mount.client_system,
                                       io_cmd,
                                       user=mount.user)
                self.all_mounts_procs.append(launched)
            self.io_validation_complete = False

            io_ok = validate_io_procs(self.all_mounts_procs, self.mounts)
            self.assertTrue(io_ok, "IO failed on some of the clients")
            self.io_validation_complete = True

        def take_brick_offline(brick):
            # Bring a single brick down and verify it is reported offline.
            g.log.info('Bringing bricks %s offline', brick)
            went_down = bring_bricks_offline(self.volname, brick)
            self.assertTrue(went_down,
                            'Failed to bring bricks %s offline' % brick)

            is_down = are_bricks_offline(self.mnode, self.volname, [brick])
            self.assertTrue(is_down, 'Bricks %s are not offline' % brick)
            g.log.info('Bringing bricks %s offline is successful', brick)

        def fetch_heal_summary():
            # Return the heal info summary, failing the test if it is None.
            g.log.info("Getting heal info...")
            summary = get_heal_info_summary(self.mnode, self.volname)
            self.assertIsNotNone(summary, 'Failed to get heal info.')
            g.log.info('Success in getting heal info')
            return summary

        all_bricks = get_all_bricks(self.mnode, self.volname)
        first_brick = all_bricks[0]
        second_brick = all_bricks[1]

        # File counts are kept as strings because they are interpolated
        # into the IO command and compared against 'numberOfEntries' as
        # strings.
        files_with_one_brick_off = '1000'
        files_with_two_bricks_off = '100'

        # Disable the client-side self-heal options
        apply_volume_options({
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off"
        })

        # Initial data set; the existing self.all_mounts_procs list is
        # reused here rather than replaced.
        create_files_and_validate("-f 10 --fixed-file-size 1M",
                                  fresh_proc_list=False)

        # First brick goes down, then more files are written
        take_brick_offline(first_brick)
        create_files_and_validate(
            "-f %s --fixed-file-size 1k --base-file-name new_file"
            % files_with_one_brick_off)

        # Every brick after the first must list the new files plus one
        # extra entry (presumably the parent directory - TODO confirm).
        heal_summary = fetch_heal_summary()
        expected_after_one_down = str(int(files_with_one_brick_off) + 1)
        for online_brick in all_bricks[1:]:
            self.assertEqual(heal_summary[online_brick]['numberOfEntries'],
                             expected_after_one_down,
                             'Number of files pending heal is not correct')

        apply_volume_options({"performance.enable-least-priority": "enable"})

        # Second brick goes down; quorum-type is set to fixed before the
        # next round of writes.
        take_brick_offline(second_brick)
        apply_volume_options({"quorum-type": "fixed"})

        create_files_and_validate(
            "-f %s --fixed-file-size 1k --base-file-name new_new_file"
            % files_with_two_bricks_off)

        # Only the last brick is checked: both batches plus one extra entry
        heal_summary = fetch_heal_summary()
        expected_after_two_down = str(
            int(files_with_one_brick_off) +
            int(files_with_two_bricks_off) + 1)
        self.assertEqual(heal_summary[all_bricks[-1]]['numberOfEntries'],
                         expected_after_two_down,
                         'Number of files pending heal is not correct')
    def test_ec_lookup_and_move_operations_few_bricks_are_offline(self):
        """
        Test Steps:
        1. Mount this volume on 3 mount point, c1, c2, and c3
        2. Bring down two bricks offline in each subvol.
        3. On client1: under dir1 create files f{1..10000} run in background
        4. On client2: under root dir of mountpoint touch x{1..1000}
        5. On client3: after step 4 action completed, start creating
           x{1001..10000}
        6. Bring bricks online which were offline(brought up all the bricks
           which were down (2 in each of the two subvols)
        7. While IO on Client1 and Client3 were happening, On client2 move all
           the x* files into dir1
        8. Perform lookup from client 3
        """
        # List two bricks in each subvol
        all_subvols_dict = get_subvols(self.mnode, self.volname)
        subvols = all_subvols_dict['volume_subvols']
        bricks_to_bring_offline = []
        for subvol in subvols:
            self.assertTrue(subvol, "List is empty")
            bricks_to_bring_offline.extend(sample(subvol, 2))

        # Bring two bricks of each subvol offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, "Bricks are still online")
        g.log.info("Bricks are offline %s", bricks_to_bring_offline)

        # Validating the bricks are offline or not
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, "Few of the bricks are still online in"
                             " {} in".format(bricks_to_bring_offline))
        g.log.info("%s bricks are offline as expected",
                   bricks_to_bring_offline)

        # Create directory on client1
        dir_on_mount = self.mounts[0].mountpoint + '/dir1'
        ret = mkdir(self.mounts[0].client_system, dir_on_mount)
        self.assertTrue(ret, "unable to create directory on client"
                             " 1 {}".format(self.mounts[0].client_system))
        g.log.info("Dir1 created on %s successfully",
                   self.mounts[0].client_system)

        # Next IO to be ran in the background so using mount_procs
        # and run_async.
        self.mount_procs = []

        # On client1: under dir1 create files f{1..10000} run in background
        self._run_create_files(file_count=10000, base_name="f_",
                               mpoint=dir_on_mount,
                               client=self.mounts[0].client_system)

        # On client2: under root dir of the mountpoint touch x{1..1000}
        cmd = ("/usr/bin/env python {} create_files -f 1000 --fixed-file-size"
               " 10k --base-file-name x {}".format(self.script_upload_path,
                                                   self.mounts[1].mountpoint))
        ret, _, err = g.run(self.mounts[1].client_system, cmd)
        self.assertEqual(ret, 0, "File creation failed on {} with {}".
                         format(self.mounts[1].client_system, err))
        g.log.info("File creation successful on %s",
                   self.mounts[1].client_system)

        # On client3: start creating x{1001..10000}
        cmd = ("cd {}; for i in `seq 1000 10000`; do touch x$i; done; "
               "cd -".format(self.mounts[2].mountpoint))
        proc = g.run_async(self.mounts[2].client_system, cmd)
        self.mount_procs.append(proc)

        # Bring bricks online with volume start force
        ret, _, err = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, err)
        g.log.info("Volume: %s started successfully", self.volname)

        # Check whether bricks are online or not
        ret = are_bricks_online(self.mnode, self.volname,
                                bricks_to_bring_offline)
        self.assertTrue(ret, "Bricks {} are still offline".
                        format(bricks_to_bring_offline))
        g.log.info("Bricks %s are online now", bricks_to_bring_offline)

        # From client2 move all the files with name starting with x into dir1
        cmd = ("for i in `seq 0 999`; do mv {}/x$i.txt {}; "
               "done".format(self.mounts[1].mountpoint, dir_on_mount))
        proc = g.run_async(self.mounts[1].client_system, cmd)
        self.mount_procs.append(proc)

        # Perform a lookup in loop from client3 for 20 iterations
        cmd = ("ls -R {}".format(self.mounts[2].mountpoint))
        counter = 20
        while counter:
            ret, _, err = g.run(self.mounts[2].client_system, cmd)
            self.assertEqual(ret, 0, "ls while mv operation being carried"
                                     " failed with {}".format(err))
            g.log.debug("ls successful for the %s time", 21-counter)
            counter -= 1

        self.assertTrue(validate_io_procs(self.mount_procs, self.mounts),
                        "IO failed on the clients")
        # Emptying mount_procs for not validating IO in tearDown
        self.mount_procs *= 0

        # Wait for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname,)
        self.assertTrue(ret, "Heal didn't completed in the expected time")
        g.log.info("Heal completed successfully on %s volume", self.volname)
# Example #29
# 0
    def test_data_self_heal_algorithm_diff_heal_command(self):
        """
        Test Volume Option - 'cluster.data-self-heal-algorithm' : 'diff'

        Description:
        - set the volume options
        "metadata-self-heal": "off"
        "entry-self-heal": "off"
        "data-self-heal": "off"
        "data-self-heal-algorithm": "diff"
        - create IO
        - set the volume option "self-heal-daemon": "off"
        - bring down all bricks processes from selected set
        - modify the data
        - get arequal before getting bricks online
        - bring bricks online
        - expand volume by adding bricks to the volume
        - do rebalance
        - set the volume option "self-heal-daemon": "on" and check for daemons
        - start healing
        - check if heal is completed
        - check for split-brain
        - calculate arequal and compare it with the arequal collected
        before the bricks were brought back online
        """
        # pylint: disable=too-many-branches,too-many-statements
        # Setting options
        g.log.info('Setting options...')
        options = {
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "data-self-heal": "off",
            "data-self-heal-algorithm": "diff"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        # Only the options actually set above are reported here;
        # 'self-heal-daemon' is switched off later, after the initial IO.
        g.log.info("Options "
                   "'metadata-self-heal', "
                   "'entry-self-heal', "
                   "'data-self-heal' "
                   "are set to 'off', "
                   "'data-self-heal-algorithm' "
                   "is set to 'diff' successfully")

        # Creating files on client side
        all_mounts_procs = []
        g.log.info("Generating data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        # Creating files
        command = "/usr/bin/env python %s create_files -f 100 %s" % (
            self.script_upload_path, self.mounts[0].mountpoint)

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Disable the self-heal daemon so nothing is healed until the test
        # explicitly re-enables it further below
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # Select bricks to bring offline (empty entries are dropped)
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = [
            brick for brick in (
                bricks_to_bring_offline_dict['hot_tier_bricks'] +
                bricks_to_bring_offline_dict['cold_tier_bricks'] +
                bricks_to_bring_offline_dict['volume_bricks'])
            if brick]

        # Bring brick offline
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data while the selected bricks are down
        all_mounts_procs = []
        g.log.info("Modifying data for %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        command = ("/usr/bin/env python %s create_files -f 100 "
                   "--fixed-file-size 1M %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))

        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Expand volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume...")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        # The message is formatted here: passing a (fmt, arg) tuple as msg
        # would print the raw tuple instead of the volume name.
        self.assertTrue(ret, "Failed to expand volume %s" % self.volname)
        g.log.info("Expanding volume is successful on volume %s", self.volname)

        # Do rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Failed to start rebalance')
        g.log.info('Rebalance is started')

        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Rebalance is not completed')
        g.log.info('Rebalance is completed successfully')

        # Re-enable the self-heal daemon
        g.log.info('Setting options...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options')
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Self-heal daemon check failed: "
                             "self-heal-daemons are not online")
        g.log.info("All self-heal-daemons are online")

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Compare the arequal taken while the bricks were offline with the
        # one taken after heal.
        # NOTE(review): assertItemsEqual exists only on Python 2's
        # unittest.TestCase (Python 3 names it assertCountEqual) - confirm
        # the interpreter this suite runs under.
        self.assertItemsEqual(result_before_online, result_after_online,
                              'Checksums are not equal')
        g.log.info('Checksums are equal')
    def test_file_access(self):
        """
        Test access to a renamed file while one of its subvols is down.

        Steps:
        - create 'testfile' on the mount and find its hashed subvol
        - rename it so that the new name hashes to a different subvol
        - check that the destination hashed subvol holds an entry with
          access mode 1000 (treated here as the linkto file) and that the
          source hashed subvol holds the data file
        - bring down the destination hashed subvol and verify that the file
          can still be stat'ed through a fresh temporary mount
        - bring that subvol back online, bring down the source (cached)
          subvol and verify that stat on the file now fails
        """
        # pylint: disable=protected-access
        # pylint: disable=too-many-locals
        # pylint: disable=too-many-statements
        mount_obj = self.mounts[0]
        mountpoint = mount_obj.mountpoint
        tmp_mount = "/mnt"

        # get subvol list
        subvols = (get_subvols(self.mnode, self.volname))['volume_subvols']
        self.assertIsNotNone(subvols, "failed to get subvols")

        # create a file
        srcfile = mountpoint + '/testfile'
        ret, _, err = g.run(self.clients[0], ("touch %s" % srcfile))
        self.assertEqual(ret, 0, "File creation failed for %s err %s"
                         % (srcfile, err))
        g.log.info("testfile creation successful")

        # find hashed subvol
        srchashed, scount = find_hashed_subvol(subvols, "/", "testfile")
        self.assertIsNotNone(srchashed, "could not find srchashed")
        g.log.info("hashed subvol for srcfile %s subvol count %s",
                   srchashed._host, scount)

        # rename the file such that the new name hashes to a new subvol
        tmp = find_new_hashed(subvols, "/", "testfile")
        self.assertIsNotNone(tmp, "could not find new hashed for dstfile")
        g.log.info("dst file name : %s dst hashed_subvol : %s "
                   "subvol count : %s", tmp.newname,
                   tmp.hashedbrickobject._host, tmp.subvol_count)

        dstname = str(tmp.newname)
        dstfile = mountpoint + "/" + dstname
        dsthashed = tmp.hashedbrickobject
        dcount = tmp.subvol_count
        ret, _, err = g.run(self.clients[0], ("mv %s %s" %
                                              (srcfile, dstfile)))
        self.assertEqual(ret, 0, "rename failed for %s err %s"
                         % (srcfile, err))
        g.log.info("cmd: mv srcfile dstfile successful")

        # check that on dsthash_subvol the file is a linkto file
        filepath = dsthashed._fqpath + "/" + dstname
        file_stat = get_file_stat(dsthashed._host, filepath)
        # Guard the subscript below so a failed stat gives a clear failure
        self.assertIsNotNone(file_stat, "Failed to stat %s on %s"
                             % (filepath, dsthashed._host))
        self.assertEqual(file_stat['access'], "1000",
                         "Expected file permission to be 1000 on subvol %s"
                         % dsthashed._host)
        g.log.info("dsthash_subvol has the expected linkto file")

        # check on srchashed the file is a data file
        filepath = srchashed._fqpath + "/" + dstname
        file_stat = get_file_stat(srchashed._host, filepath)
        self.assertIsNotNone(file_stat, "Failed to stat %s on %s"
                             % (filepath, srchashed._host))
        self.assertNotEqual(file_stat['access'], "1000",
                            "Expected file permission not to be 1000 on "
                            "subvol %s" % srchashed._host)

        # Bring down the hashed subvol of dstfile(linkto file)
        ret = bring_bricks_offline(self.volname, subvols[dcount])
        self.assertTrue(ret, 'Error in bringing down subvolume %s'
                        % subvols[dcount])
        g.log.info('dst subvol %s is offline', subvols[dcount])

        # Need to access the file through a fresh lookup through a new mount
        # create a new dir(choosing server to do a mount)
        ret, _, _ = g.run(self.mnode, ("mkdir -p %s" % tmp_mount))
        self.assertEqual(ret, 0, 'mkdir of mount dir failed')
        g.log.info("mkdir of mount dir succeeded")

        # do a temp mount
        # NOTE(review): this truthiness check only works if mount_volume
        # returns a boolean; if it returns a (ret, out, err) tuple the
        # assertion can never fail - confirm the helper's return type.
        ret = mount_volume(self.volname, self.mount_type, tmp_mount,
                           self.mnode, self.mnode)
        self.assertTrue(ret, 'temporary mount failed')
        g.log.info("temporary mount succeeded")

        # check that file is accessible (stat)
        ret, _, _ = g.run(self.mnode, ("stat %s/%s" % (tmp_mount, dstname)))
        self.assertEqual(ret, 0, 'stat error on for dst file %s' % dstname)
        g.log.info("stat on %s/%s successful", tmp_mount, dstname)

        # cleanup temporary mount
        # NOTE(review): same return-type caveat as for mount_volume above.
        ret = umount_volume(self.mnode, tmp_mount)
        self.assertTrue(ret, 'umount of temporary mount failed')
        g.log.info("umount successful")

        # Bring up the hashed subvol
        ret = bring_bricks_online(self.mnode, self.volname, subvols[dcount],
                                  bring_bricks_online_methods=None)
        self.assertTrue(ret, "Error in bringing back subvol online")
        g.log.info('Subvol is back online')

        # now bring down the cached subvol
        ret = bring_bricks_offline(self.volname, subvols[scount])
        self.assertTrue(ret, 'Error in bringing down subvolume %s'
                        % subvols[scount])
        g.log.info('target subvol %s is offline', subvols[scount])

        # file access should fail: the subvol holding the data is down, so
        # stat is expected to exit with status 1
        ret, _, _ = g.run(self.clients[0], ("stat %s" % dstfile))
        self.assertEqual(ret, 1, 'stat on file %s was expected to fail '
                                 'while its cached subvol is offline'
                         % dstfile)
        g.log.info("dstfile access failed as expected")