Example #1
    def toggle_bricks_and_perform_io(self, file_list, brick_list):
        """
        Kills bricks, does I/O and brings the brick back up.
        """
        # Bring down bricks.
        g.log.info("Going to bring down the brick process for %s", brick_list)
        ret = bring_bricks_offline(self.volname, brick_list)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", brick_list)
        ret = are_bricks_offline(self.mnode, self.volname, brick_list)
        self.assertTrue(ret, 'Bricks %s are not offline' % brick_list)

        # Perform I/O
        for filename in file_list:
            fpath = self.mounts[0].mountpoint + "/test_gfid_split_brain/" + \
                    filename
            cmd = ("dd if=/dev/urandom of=%s bs=1024 count=1" % fpath)
            ret, _, _ = g.run(self.clients[0], cmd)
            self.assertEqual(ret, 0, "Creating %s failed" % fpath)

        # Bring up bricks
        ret = bring_bricks_online(self.mnode, self.volname, brick_list)
        self.assertTrue(ret, 'Failed to bring brick %s online' % brick_list)
        g.log.info('Bringing brick %s online is successful', brick_list)

        # Waiting for bricks to come online
        g.log.info("Waiting for brick process to come online")
        timeout = 30
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname, timeout)
        self.assertTrue(ret, "bricks didn't come online after adding bricks")
        g.log.info("Bricks are online")
Example #2
def wait_for_volume_process_to_be_online(mnode, volname, timeout=300):
    """Waits for the volume's processes to be online until timeout
    Args:
        mnode (str): Node on which commands will be executed.
        volname (str): Name of the volume.
    Kwargs:
        timeout (int): timeout value in seconds to wait for all volume
        processes to be online.
    Returns:
        True if the volume's processes are online within timeout,
        False otherwise
    """
    # Adding import here to avoid cyclic imports
    from glustolibs.gluster.brick_libs import wait_for_bricks_to_be_online

    # Wait for bricks to be online
    bricks_online_status = wait_for_bricks_to_be_online(mnode, volname,
                                                        timeout)
    if not bricks_online_status:
        g.log.error("Failed to wait for the volume '%s' processes "
                    "to be online", volname)
        return False

    # ToDo: Wait for self-heal-daemons to be online

    # TODO: Add any process checks here

    g.log.info("Volume '%s' processes are all online", volname)
    return True
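The two ToDo placeholders inside the function above could be filled with a self-heal-daemon check. A hedged sketch, assuming wait_for_self_heal_daemons_to_be_online lives in glustolibs.gluster.heal_libs (it is called that way in Example #7 further down), would slot in just before the final return:

    # Adding import here to avoid cyclic imports
    from glustolibs.gluster.heal_libs import (
        wait_for_self_heal_daemons_to_be_online)

    # Wait for self-heal daemons to be online (sketch for the ToDo above)
    shd_online = wait_for_self_heal_daemons_to_be_online(mnode, volname,
                                                         timeout)
    if not shd_online:
        g.log.error("Self-heal daemons of volume '%s' are not online",
                    volname)
        return False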
    def tearDown(self):

        # UnMount Volume
        g.log.info("Starting to Unmount Volume %s", self.volname)
        ret = umount_volume(self.mounts[0].client_system,
                            self.mounts[0].mountpoint,
                            mtype=self.mount_type)
        self.assertTrue(ret, ("Failed to Unmount Volume %s" % self.volname))
        g.log.info("Successfully Unmounted Volume %s", self.volname)

        # Clean up all volumes and peer probe to form cluster
        vol_list = get_volume_list(self.mnode)
        if vol_list is not None:
            for volume in vol_list:
                # check all bricks are online
                ret = wait_for_bricks_to_be_online(self.mnode, volume)
                if not ret:
                    raise ExecutionError("Failed to bring bricks online"
                                         "for volume %s" % volume)
                ret = cleanup_volume(self.mnode, volume)
                if not ret:
                    raise ExecutionError("Failed to cleanup volume")
                g.log.info("Volume deleted successfully : %s", volume)

        # Peer probe detached servers
        pool = nodes_from_pool_list(self.mnode)
        for node in pool:
            peer_detach(self.mnode, node)
        ret = peer_probe_servers(self.mnode, self.servers)
        if not ret:
            raise ExecutionError("Failed to probe peer "
                                 "servers %s" % self.servers)
        g.log.info("Peer probe success for detached "
                   "servers %s", self.servers)
        self.get_super_method(self, 'tearDown')()
    def _bring_bricks_online(self):
        """
        Bring bricks online and monitor heal completion
        """
        # Bring bricks online
        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            self.bricks_to_bring_offline,
            bring_bricks_online_methods=['volume_start_force'])
        self.assertTrue(ret, 'Failed to bring bricks online')

        # Wait for bricks to be online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for bricks of volume {} to "
                              "be online".format(self.volname)))
    def bricks_online_and_volume_reset(cls):
        """
        reset the volume if any bricks are offline.
        waits for all bricks to be online and resets
        volume options set
        """
        bricks_offline = get_offline_bricks_list(cls.mnode, cls.volname)
        if bricks_offline:
            ret, _, _ = volume_start(cls.mnode, cls.volname, force=True)
            if ret:
                raise ExecutionError("Failed to force start volume "
                                     "%s" % cls.volname)
        ret = wait_for_bricks_to_be_online(cls.mnode, cls.volname)
        if not ret:
            raise ExecutionError("Failed to bring bricks online"
                                 "for volume %s" % cls.volname)

        ret, _, _ = volume_reset(cls.mnode, cls.volname, force=True)
        if ret:
            raise ExecutionError("Failed to reset volume %s" % cls.volname)
        g.log.info("Successful in volume reset %s", cls.volname)
    def test_volume_create(self):

        # create and start a volume
        self.volume['name'] = "first_volume"
        self.volname = "first_volume"
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertTrue(ret, "Failed to create and start volume")

        # bring a brick down and volume start force should bring it to online

        g.log.info("Get all the bricks of the volume")
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the brick list")
        g.log.info("Successfully got the list of bricks of volume")

        ret = bring_bricks_offline(self.volname, bricks_list[0:2])
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Successfully brought the bricks down")

        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Failed to start the volume")
        g.log.info("Volume start with force is success")

        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to bring the bricks online")
        g.log.info("Volume start with force successfully brought all the "
                   "bricks online")

        # create volume with previously used bricks and different volume name
        self.volname = "second_volume"
        ret, _, _ = volume_create(self.mnode, self.volname, bricks_list)
        self.assertNotEqual(
            ret, 0, "Expected: It should fail to create a "
            "volume with previously used bricks. Actual:"
            "Successfully created the volume with previously"
            " used bricks")
        g.log.info("Failed to create the volume with previously used bricks")

        # create a volume with already existing volume name
        self.volume['name'] = "first_volume"
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertTrue(
            ret, "Expected: It should fail to create a volume"
            " with already existing volume name. Actual: "
            "Successfully created the volume with "
            "already existing volname")
        g.log.info("Failed to create the volume with already existing volname")

        # creating a volume with non existing brick path should fail

        self.volname = "second_volume"
        bricks_list = form_bricks_list(self.mnode, self.volname,
                                       len(self.servers), self.servers,
                                       self.all_servers_info)
        nonexisting_brick_index = random.randint(0, len(bricks_list) - 1)
        non_existing_brick = bricks_list[nonexisting_brick_index].split(":")[0]
        non_existing_path = ":/brick/non_existing_path"
        non_existing_brick = non_existing_brick + non_existing_path
        bricks_list[nonexisting_brick_index] = non_existing_brick

        ret, _, _ = volume_create(self.mnode, self.volname, bricks_list)
        self.assertNotEqual(
            ret, 0, "Expected: Creating a volume with non "
            "existing brick path should fail. Actual: "
            "Successfully created the volume with "
            "non existing brick path")
        g.log.info("Failed to create the volume with non existing brick path")

        # cleanup the volume and peer detach all servers. form two clusters,try
        # to create a volume with bricks whose nodes are in different clusters

        # cleanup volumes
        vol_list = get_volume_list(self.mnode)
        self.assertIsNotNone(vol_list, "Failed to get the volume list")

        for volume in vol_list:
            ret = cleanup_volume(self.mnode, volume)
            self.assertTrue(ret, "Unable to delete volume % s" % volume)

        # peer detach all servers
        ret = peer_detach_servers(self.mnode, self.servers)
        self.assertTrue(ret, "Peer detach to all servers is failed")
        g.log.info("Peer detach to all the servers is success")

        # form cluster 1
        ret, _, _ = peer_probe(self.servers[0], self.servers[1])
        self.assertEqual(
            ret, 0, "Peer probe from %s to %s failed" %
            (self.servers[0], self.servers[1]))
        g.log.info("Peer probe from %s to %s is successful",
                   self.servers[0], self.servers[1])

        # form cluster 2
        ret, _, _ = peer_probe(self.servers[2], self.servers[3])
        self.assertEqual(
            ret, 0, "Peer probe from %s to %s failed" %
            (self.servers[2], self.servers[3]))
        g.log.info("Peer probe from %s to %s is successful",
                   self.servers[2], self.servers[3])

        # Creating a volume with bricks which are part of another
        # cluster should fail
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertFalse(
            ret, "Expected: Creating a volume with bricks"
            " which are part of another cluster should fail."
            " Actual: Successfully created the volume with "
            "bricks which are part of another cluster")
        g.log.info("Failed to create the volume with bricks which are "
                   "part of another cluster")

        # form a cluster, bring a node down. try to create a volume when one of
        # the brick node is down
        ret, _, _ = peer_detach(self.servers[2], self.servers[3])
        self.assertEqual(ret, 0, "Peer detach is failed")
        g.log.info("Peer detach is success")

        ret = peer_probe_servers(self.mnode, self.servers)
        self.assertTrue(ret, "Peer probe is failed")
        g.log.info("Peer probe to all the servers is success")

        random_server = self.servers[random.randint(1, len(self.servers) - 1)]
        ret = stop_glusterd(random_server)
        self.assertTrue(ret, "Glusterd is stopped successfully")

        self.volume['name'] = "third_volume"
        ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
        self.assertFalse(
            ret, "Expected: It should fail to create a volume "
            "when one of the nodes is down. Actual: Successfully "
            "created the volume with a brick whose node is down")

        g.log.info("Failed to create the volume with brick whose node is down")
Example #7
    def test_self_heal_algorithm_full_daemon_off(self):
        """""
        Description:-
        Checking healing when algorithm is set to "full" and self heal daemon
        is "off".
        """""
        # pylint: disable=too-many-statements

        # Setting volume option of self heal & algorithm
        options = {"metadata-self-heal": "disable",
                   "entry-self-heal": "disable",
                   "data-self-heal": "disable",
                   "data-self-heal-algorithm": "full",
                   "self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, "Failed to set the volume options %s" % options)
        g.log.info(" Volume set options success")

        # Select bricks to bring down
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']
        g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)

        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring bricks: %s offline"
                        % bricks_to_bring_offline)
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Validate if bricks are offline
        g.log.info("Validating if bricks: %s are offline",
                   bricks_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, "Not all the bricks in list:%s are offline"
                        % bricks_to_bring_offline)
        g.log.info("Successfully validated that bricks %s are all offline",
                   bricks_to_bring_offline)

        # IO on the mount point
        all_mounts_procs = []
        g.log.info("Creating Files on %s:%s", self.mounts[0].client_system,
                   self.mounts[0].mountpoint)
        cmd = ("cd %s ;for i in `seq 1 100` ;"
               "do dd if=/dev/urandom of=file$i bs=1M "
               "count=1;done"
               % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system, cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(
            validate_io_procs(all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )

        # Collecting arequal before bringing the bricks up
        g.log.info("Collecting arequal before bringing the bricks up")
        result_before = collect_mounts_arequal(self.mounts)

        # Turning self heal daemon ON
        optionstwo = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, optionstwo)
        self.assertTrue(ret, "Failed to turn self-heal ON")
        g.log.info("Volume set options %s: success", optionstwo)

        # Bring bricks online
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring bricks: %s online"
                        % bricks_to_bring_offline)
        g.log.info("Successfully brought all bricks:%s online",
                   bricks_to_bring_offline)

        # Waiting for bricks to come online
        g.log.info("Waiting for brick process to come online")
        ret = wait_for_bricks_to_be_online(self.mnode,
                                           self.volname,
                                           timeout=30)
        self.assertTrue(ret, "bricks didn't come online after adding bricks")
        g.log.info("Bricks are online")

        # Verifying all bricks online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, "Volume %s : All process are not online"
                        % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self heal processes to come online
        g.log.info("Wait for selfheal process to come online")
        ret = wait_for_self_heal_daemons_to_be_online(self.mnode,
                                                      self.volname,
                                                      timeout=300)
        self.assertTrue(ret, "Self-heal process are not online")
        g.log.info("All self heal process are online")

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Self heal didn't complete even after waiting "
                        "for 20 minutes. 20 minutes is too much a time for "
                        "current test workload")
        g.log.info("self-heal is successful after replace-brick operation")

        # Collecting arequal after healing
        g.log.info("Collecting arequal after healing")
        result_after = collect_mounts_arequal(self.mounts)

        # Comparing the results
        g.log.info("comparing both the results")
        self.assertEqual(result_before, result_after, "Arequals are not equal")
Example #8
    def test_ec_replace_brick(self):
        """
        - Start resource consumption tool
        - Create directory dir1
        - Create 5 directory and 5 files in dir of mountpoint
        - Rename all files inside dir1 at mountpoint
        - Create softlink and hardlink of files in dir1 of mountpoint
        - Delete op for deleting all file in one of the dirs inside dir1
        - Change chmod, chown, chgrp
        - Create tiny, small, medium and large file
        - Get arequal before replacing brick
        - Replace brick
        - Get arequal after replacing brick
        - Compare Arequal's
        - Create IO's
        - Replace brick while IO's are going on
        - Validating IO's and waiting for it to complete
        """
        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        # Starting resource consumption using top
        log_file_mem_monitor = '/var/log/glusterfs/mem_usage.log'
        cmd = ("for i in {1..20};do top -n 1 -b|egrep "
               "'RES|gluster' & free -h 2>&1 >> %s ;"
               "sleep 10;done" % (log_file_mem_monitor))
        g.log.info(cmd)
        cmd_list_procs = []
        for server in self.servers:
            proc = g.run_async(server, cmd)
            cmd_list_procs.append(proc)

        # Creating dir1
        ret = mkdir(self.mounts[0].client_system,
                    "%s/dir1" % self.mounts[0].mountpoint)
        self.assertTrue(ret, "Failed to create dir1")
        g.log.info("Directory dir1 on %s created successfully", self.mounts[0])

        # Create 5 dir and 5 files in each dir at mountpoint on dir1
        start, end = 1, 5
        for mount_obj in self.mounts:
            # Number of dir and files to be created.
            dir_range = ("%s..%s" % (str(start), str(end)))
            file_range = ("%s..%s" % (str(start), str(end)))
            # Create dir 1-5 at mountpoint.
            ret = mkdir(mount_obj.client_system,
                        "%s/dir1/dir{%s}" % (mount_obj.mountpoint, dir_range))
            self.assertTrue(ret, "Failed to create directory")
            g.log.info("Directory created successfully")

            # Create files inside each dir.
            cmd = ('touch %s/dir1/dir{%s}/file{%s};' %
                   (mount_obj.mountpoint, dir_range, file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "File creation failed")
            g.log.info("File created successfull")

            # Increment counter so that at next client dir and files are made
            # with diff offset. Like at next client dir will be named
            # dir6, dir7...dir10. Same with files.
            start += 5
            end += 5

        # Rename all files inside dir1 at mountpoint on dir1
        cmd = ('cd %s/dir1/dir1/; '
               'for FILENAME in *;'
               'do mv $FILENAME Unix_$FILENAME; cd ~;'
               'done;' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to rename file on " "client")
        g.log.info("Successfully renamed file on client")

        # Truncate at any dir in mountpoint inside dir1
        # start is an offset to be added to dirname to act on
        # diff files at diff clients.
        start = 1
        for mount_obj in self.mounts:
            cmd = ('cd %s/dir1/dir%s/; '
                   'for FILENAME in *;'
                   'do echo > $FILENAME; cd ~;'
                   'done;' % (mount_obj.mountpoint, str(start)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Truncate failed")
            g.log.info("Truncate of files successfull")

        # Create softlink and hardlink of files in mountpoint. Start is an
        # offset to be added to dirname to act on diff files at diff clients.
        start = 1
        for mount_obj in self.mounts:
            cmd = ('cd %s/dir1/dir%s; '
                   'for FILENAME in *; '
                   'do ln -s $FILENAME softlink_$FILENAME; cd ~;'
                   'done;' % (mount_obj.mountpoint, str(start)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Creating Softlinks have failed")
            g.log.info("Softlink of files have been changed successfully")

            cmd = ('cd %s/dir1/dir%s; '
                   'for FILENAME in *; '
                   'do ln $FILENAME hardlink_$FILENAME; cd ~;'
                   'done;' % (mount_obj.mountpoint, str(start + 1)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Creating Hardlinks have failed")
            g.log.info("Hardlink of files have been changed successfully")
            start += 5

        # chmod, chown, chgrp inside dir1
        # start and end used as offset to access diff files
        # at diff clients.
        start, end = 2, 5
        for mount_obj in self.mounts:
            dir_file_range = '%s..%s' % (str(start), str(end))
            cmd = ('chmod 777 %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing mode of files has failed")
            g.log.info("Mode of files have been changed successfully")

            cmd = ('chown root %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing owner of files has failed")
            g.log.info("Owner of files have been changed successfully")

            cmd = ('chgrp root %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing group of files has failed")
            g.log.info("Group of files have been changed successfully")
            start += 5
            end += 5

        # Create tiny, small, medium and large file
        # at mountpoint. Offset to differ filenames
        # at diff clients.
        offset = 1
        for mount_obj in self.mounts:
            # cd to the mountpoint so the files land on the volume
            cmd = ('cd %s; fallocate -l 100 tiny_file%s.txt'
                   % (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for tiny files failed")
            g.log.info("Fallocate for tiny files successful")

            cmd = ('cd %s; fallocate -l 20M small_file%s.txt'
                   % (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for small files failed")
            g.log.info("Fallocate for small files successful")

            cmd = ('cd %s; fallocate -l 200M medium_file%s.txt'
                   % (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for medium files failed")
            g.log.info("Fallocate for medium files successful")

            cmd = ('cd %s; fallocate -l 1G large_file%s.txt'
                   % (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for large files failed")
            g.log.info("Fallocate for large files successful")
            offset += 1

        # Get arequal before replacing brick
        ret, result_before_replacing_brick = (collect_mounts_arequal(
            self.mounts[0]))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before replacing of brick '
                   'is successful')

        # Replacing a brick of random choice
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Unexpected:Replace brick is not successful")
        g.log.info("Expected : Replace brick is successful")

        # Wait for brick to come online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Unexpected:Bricks are not online")
        g.log.info("Expected : Bricks are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Unexpected:Heal has not yet completed')
        g.log.info('Heal has completed successfully')

        # Check if bricks are online
        all_bricks = get_all_bricks(self.mnode, self.volname)
        ret = are_bricks_online(self.mnode, self.volname, all_bricks)
        self.assertTrue(ret, 'Unexpected:All bricks are not online')
        g.log.info('All bricks are online')

        # Get arequal after replacing brick
        ret, result_after_replacing_brick = (collect_mounts_arequal(
            self.mounts[0]))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after replacing of brick '
                   'is successful')

        # Comparing arequals
        self.assertEqual(
            result_before_replacing_brick, result_after_replacing_brick,
            'Arequals are not equals before replacing '
            'brick and after replacing brick')
        g.log.info('Arequals are equals before replacing brick '
                   'and after replacing brick')

        # Creating files on client side for dir1
        # Write IO
        all_mounts_procs, count = [], 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s/dir1" %
                   (self.script_upload_path1, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count += 10

        # Replacing a brick while IO's are going on
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Unexpected:Replace brick is not successful")
        g.log.info("Expected : Replace brick is successful")

        # Wait for brick to come online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Unexpected:Bricks are not online")
        g.log.info("Expected : Bricks are online")

        # Validating IO's and waiting to complete
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Create 2 directories and start IO's which opens FD
        ret = mkdir(self.mounts[0].client_system,
                    "%s/count{1..2}" % self.mounts[0].mountpoint)
        self.assertTrue(ret, "Failed to create directories")
        g.log.info("Directories created on %s successfully", self.mounts[0])

        all_fd_procs, count = [], 1
        for mount_obj in self.mounts:
            cmd = ("cd %s ;/usr/bin/env python %s -n 10 -t 120 "
                   "-d 5 -c 16 --dir count%s" %
                   (mount_obj.mountpoint, self.script_upload_path2, count))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_fd_procs.append(proc)
            count += 1

        # Replacing a brick while open FD IO's are going on
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Unexpected:Replace brick is not successful")
        g.log.info("Expected : Replace brick is successful")

        # Wait for brick to come online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Unexpected:Bricks are not online")
        g.log.info("Expected : Bricks are online")

        # Validating IO's and waiting to complete
        ret = validate_io_procs(all_fd_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Close connection and check file exist for memory log
        ret = file_exists(self.mnode, '/var/log/glusterfs/mem_usage.log')
        self.assertTrue(ret, "Unexpected:Memory log file does " "not exist")
        g.log.info("Memory log file exists")
        for proc in cmd_list_procs:
            ret, _, _ = proc.async_communicate()
            self.assertEqual(ret, 0, "Memory logging failed")
            g.log.info("Memory logging is successful")
    def test_ec_all_healtypes(self):
        """
        Test steps:
        - Create directory dir1
        - Create files inside dir1
        - Rename all file inside dir1
        - Create softlink and hardlink of files in mountpoint
        - Create tiny, small, medium nd large file
        - Get arequal of dir1
        - Create directory dir2
        - Creating files on dir2
        - Bring down other bricks to max redundancy
        - Create directory dir3
        - Start pumping IO to dir3
        - Validating IO's on dir2 and waiting to complete
        - Bring bricks online
        - Wait for bricks to come online
        - Check if bricks are online
        - Monitor heal completion
        - Get arequal of dir1
        - Compare arequal of dir1
        """

        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        # Get the bricks from the volume
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        mountpoint = self.mounts[0].mountpoint
        client = self.mounts[0].client_system

        # Creating dir1
        ret = mkdir(client, "%s/dir1" % mountpoint)
        self.assertTrue(ret, "Failed to create dir1")
        g.log.info("Directory dir1 on %s created successfully", self.mounts[0])

        # Create files inside dir1
        cmd = ('touch %s/dir1/file{1..5};' % mountpoint)
        ret, _, _ = g.run(client, cmd)
        self.assertFalse(ret, "File creation failed")
        g.log.info("File created successfull")

        # Rename all files inside dir1
        cmd = ('cd %s/dir1/; '
               'for FILENAME in *;'
               'do mv $FILENAME Unix_$FILENAME; cd ~;'
               'done;' % mountpoint)
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "Failed to rename files on " "client")
        g.log.info("Successfully renamed files on client")

        # Create softlink and hardlink of files in mountpoint
        cmd = ('cd %s/dir1/; '
               'for FILENAME in *; '
               'do ln -s $FILENAME softlink_$FILENAME; cd ~;'
               'done;' % mountpoint)
        ret, _, _ = g.run(client, cmd)
        self.assertFalse(ret, "Creating Softlinks have failed")
        g.log.info("Softlink of files have been changed successfully")

        cmd = ('cd %s/dir1/; '
               'for FILENAME in *; '
               'do ln $FILENAME hardlink_$FILENAME; cd ~;'
               'done;' % mountpoint)
        ret, _, _ = g.run(client, cmd)
        self.assertFalse(ret, "Creating Hardlinks have failed")
        g.log.info("Hardlink of files have been changed successfully")

        # Create tiny, small, medium and large file
        # at mountpoint. Offset to differ filenames
        # at diff clients.
        offset = 1
        for mount_obj in self.mounts:
            # cd to the mountpoint so the files land on the volume
            cmd = ('cd %s; fallocate -l 100 tiny_file%s.txt'
                   % (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for tiny files failed")
            g.log.info("Fallocate for tiny files successful")

            cmd = ('cd %s; fallocate -l 20M small_file%s.txt'
                   % (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for small files failed")
            g.log.info("Fallocate for small files successful")

            cmd = ('cd %s; fallocate -l 200M medium_file%s.txt'
                   % (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for medium files failed")
            g.log.info("Fallocate for medium files successful")

            cmd = ('cd %s; fallocate -l 1G large_file%s.txt'
                   % (mount_obj.mountpoint, str(offset)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for large files failed")
            g.log.info("Fallocate for large files successful")
            offset += 1

        # Get arequal of dir1
        ret, result_before_brick_down = (collect_mounts_arequal(self.mounts[0],
                                                                path='dir1/'))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal of dir1 is successful')

        # Creating dir2
        ret = mkdir(self.mounts[0].client_system, "%s/dir2" % mountpoint)
        self.assertTrue(ret, "Failed to create dir2")
        g.log.info("Directory dir2 on %s created successfully", self.mounts[0])

        # Creating files on dir2
        # Write IO
        all_mounts_procs, count = [], 1
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d --dir-depth 2 "
                   "--dir-length 10 --max-num-of-dirs 5 "
                   "--num-of-files 5 %s/dir2" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Bring down other bricks to max redundancy
        # Bringing bricks offline
        bricks_to_offline = sample(bricks_list, 2)
        ret = bring_bricks_offline(self.volname, bricks_to_offline)
        self.assertTrue(ret, 'Bricks not offline')
        g.log.info('Bricks are offline successfully')

        # Creating dir3
        ret = mkdir(self.mounts[0].client_system, "%s/dir3" % mountpoint)
        self.assertTrue(ret, "Failed to create dir2")
        g.log.info("Directory dir2 on %s created successfully", self.mounts[0])

        # Start pumping IO to dir3
        cmd = ("cd %s/dir3; for i in `seq 1 100` ;"
               "do dd if=/dev/urandom of=file$i bs=1M "
               "count=5;done" % mountpoint)

        ret, _, err = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished writing on files while a brick is DOWN')

        appendcmd = ("cd %s/dir3; for i in `seq 1 100` ;"
                     "do dd if=/dev/urandom of=file$i bs=1M "
                     "count=1 oflag=append conv=notrunc;done" % mountpoint)

        readcmd = ("cd %s/dir3; for i in `seq 1 100` ;"
                   "do dd if=file$i of=/dev/null bs=1M "
                   "count=5;done" % mountpoint)

        ret, _, err = g.run(self.mounts[0].client_system, appendcmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished append on files after redundant bricks offline')

        ret, _, err = g.run(self.mounts[0].client_system, readcmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished read on files after redundant bricks offline')

        # Validating IO's on dir2 and waiting to complete
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all IO's")

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname, bricks_to_offline)
        self.assertTrue(ret, 'Bricks not brought online')
        g.log.info('Bricks are online successfully')

        # Wait for brick to come online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Bricks are not online")
        g.log.info("EXPECTED : Bricks are online")

        # Check if bricks are online
        ret = get_offline_bricks_list(self.mnode, self.volname)
        self.assertListEqual(ret, [], 'All bricks are not online')
        g.log.info('All bricks are online')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')
        g.log.info('Heal has completed successfully')

        # Get arequal of dir1
        ret, result_after_brick_up = (collect_mounts_arequal(self.mounts[0],
                                                             path='dir1/'))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal of dir1 is successful')

        # Comparing arequals of dir1
        self.assertEqual(
            result_before_brick_down, result_after_brick_up,
            'Arequals are not equal before and after '
            'bringing down redundant bricks')
        g.log.info('Arequals are equal before and after '
                   'bringing down redundant bricks')
    def test_glusterd_quorum_validation(self):
        """
        -> Creating two volumes and starting them, stop the second volume
        -> set the server quorum and set the ratio to 90
        -> Stop the glusterd in one of the node, so the quorum won't meet
        -> Peer probing a new node should fail
        -> Volume stop will fail
        -> volume delete will fail
        -> volume reset will fail
        -> Start the glusterd on the node where it is stopped
        -> Volume stop, start, delete will succeed once quorum is met
        """
        # pylint: disable=too-many-statements, too-many-branches

        # Peer probe first 3 servers
        servers_info_from_three_nodes = {}
        for server in self.servers[0:3]:
            servers_info_from_three_nodes[server] = self.all_servers_info[
                server]

            # Peer probe the first 3 servers
            ret, _, _ = peer_probe(self.mnode, server)
            self.assertEqual(ret, 0,
                             ("Peer probe failed to one of the server"))
        g.log.info("Peer probe to first 3 nodes succeeded")

        self.volume['servers'] = self.servers[0:3]
        # Create a volume using the first 3 nodes
        ret = setup_volume(self.mnode,
                           servers_info_from_three_nodes,
                           self.volume,
                           force=True)
        self.assertTrue(ret, ("Failed to create and start volume"))
        g.log.info("Volume created and started successfully")

        # Creating another volume and stopping it
        second_volume = "second_volume"
        self.volume['name'] = second_volume
        ret = setup_volume(self.mnode,
                           servers_info_from_three_nodes,
                           self.volume,
                           force=True)
        self.assertTrue(ret, ("Failed to create and start volume"))
        g.log.info("Volume created and started succssfully")

        # stopping the second volume
        g.log.info("Stopping the second volume %s", second_volume)
        ret, _, _ = volume_stop(self.mnode, second_volume)
        self.assertEqual(ret, 0, ("Failed to stop the volume"))
        g.log.info("Successfully stopped second volume %s", second_volume)

        # Setting the server-quorum-type as server
        self.options = {"cluster.server-quorum-type": "server"}
        vol_list = get_volume_list(self.mnode)
        self.assertIsNotNone(vol_list, "Failed to get the volume list")
        g.log.info("Fetched the volume list")
        for volume in vol_list:
            g.log.info(
                "Setting the server-quorum-type as server"
                " on volume %s", volume)
            ret = set_volume_options(self.mnode, volume, self.options)
            self.assertTrue(ret, ("Failed to set the quorum type as a server"
                                  " on volume %s", volume))
        g.log.info("Server Quorum type is set as a server")

        # Setting the server quorum ratio to 90
        self.quorum_percent = {'cluster.server-quorum-ratio': '90%'}
        ret = set_volume_options(self.mnode, 'all', self.quorum_percent)
        self.assertTrue(ret, ("Failed to set the server quorum ratio "
                              "to 90 on servers"))
        g.log.info("Successfully set server quorum ratio to 90% on servers")

        # Stop glusterd on one of the node
        ret = stop_glusterd(self.servers[2])
        self.assertTrue(ret, ("Failed to stop glusterd on "
                              "node %s", self.servers[2]))
        g.log.info("Glusterd stop on the nodes : %s"
                   " succeeded", self.servers[2])

        # Check glusterd is stopped
        ret = is_glusterd_running(self.servers[2])
        self.assertEqual(ret, 1, "Unexpected: Glusterd is running on node")
        g.log.info("Expected: Glusterd stopped on node %s", self.servers[2])

        # Adding a new peer will fail as quorum not met
        ret, _, _ = peer_probe(self.mnode, self.servers[3])
        self.assertNotEqual(ret, 0,
                            "Unexpected: Succeeded to peer probe new node "
                            "%s when quorum is not met" % self.servers[3])
        g.log.info("Failed to peer probe new node as expected"
                   " when quorum not met")

        # Starting the stopped volume should fail as quorum is not met
        ret, _, _ = volume_start(self.mnode, second_volume)
        self.assertNotEqual(
            ret, 0, "Unexpected: Successfully started "
            "volume even when quorum not met.")
        g.log.info(
            "Volume start %s failed as expected when quorum "
            "is not met", second_volume)

        # Stopping the first (started) volume should fail as quorum is not met
        ret, _, _ = volume_stop(self.mnode, self.volname)
        self.assertEqual(
            ret, 1, "Unexpected: Successfully stopped"
            " volume even when quorum is not met")
        g.log.info(
            "volume stop %s failed as expected when quorum "
            "is not met", self.volname)

        # Stopping a volume with force option should fail
        ret, _, _ = volume_stop(self.mnode, self.volname, force=True)
        self.assertNotEqual(
            ret, 0, "Unexpected: Successfully "
            "stopped volume with force. Expected: "
            "Volume stop should fail when quourm is not met")
        g.log.info("volume stop failed as expected when quorum is not met")

        # Deleting a volume should fail. Deleting the second volume.
        ret = volume_delete(self.mnode, second_volume)
        self.assertFalse(
            ret, "Unexpected: Volume delete was "
            "successful even when quourm is not met")
        g.log.info("volume delete failed as expected when quorum is not met")

        # Volume reset should fail when quorum is not met
        ret, _, _ = volume_reset(self.mnode, self.volname)
        self.assertNotEqual(
            ret, 0, "Unexpected: Volume reset was "
            "successful even when quorum is not met")
        g.log.info("volume reset failed as expected when quorum is not met")

        # Volume reset should fail even with force when quorum is not met
        ret, _, _ = volume_reset(self.mnode, self.volname, force=True)
        self.assertNotEqual(
            ret, 0, "Unexpected: Volume reset was "
            "successful with force even "
            "when quourm is not met")
        g.log.info("volume reset failed as expected when quorum is not met")

        # Start glusterd on the node where glusterd is stopped
        ret = start_glusterd(self.servers[2])
        self.assertTrue(ret, "Failed to start glusterd on one node")
        g.log.info("Started glusterd on server"
                   " %s successfully", self.servers[2])

        ret = is_glusterd_running(self.servers[2])
        self.assertEqual(ret, 0, ("glusterd is not running on "
                                  "node %s", self.servers[2]))
        g.log.info("glusterd is running on node" " %s ", self.servers[2])

        # Check peer status whether all peer are in connected state none of the
        # nodes should be in peer rejected state
        halt, counter, _rc = 30, 0, False
        g.log.info("Wait for some seconds, right after glusterd start it "
                   "will create two daemon process it need few seconds "
                   "(like 3-5) to initialize the glusterd")
        while counter < halt:
            ret = is_peer_connected(self.mnode, self.servers[0:3])
            if not ret:
                g.log.info("Peers are not connected state,"
                           " Retry after 2 seconds .......")
                sleep(2)
                counter = counter + 2
            else:
                _rc = True
                g.log.info("Peers are in connected state in the cluster")
                break

        self.assertTrue(_rc, ("Peers are not connected state after "
                              "bringing back glusterd online on the "
                              "nodes in which previously glusterd "
                              "had been stopped"))

        # Check all bricks are online or wait for the bricks to be online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "All bricks are not online")
        g.log.info("All bricks of the volume %s are online", self.volname)

        # Once quorum is met should be able to cleanup the volume
        ret = volume_delete(self.mnode, second_volume)
        self.assertTrue(ret, "Volume delete failed even when quorum is met")
        g.log.info("volume delete succeed without any issues")

        # Volume stop should succeed
        ret, _, _ = volume_stop(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Volume stop failed")
        g.log.info("succeeded stopping the volume as expected")

        # volume reset should succeed
        ret, _, _ = volume_reset(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Volume reset failed ")
        g.log.info("volume reset succeeded as expected when quorum is not met")

        # Peer probe new node should succeed
        ret, _, _ = peer_probe(self.mnode, self.servers[3])
        self.assertEqual(
            ret, 0, ("Failed to peer probe new node even when quorum is met"))
        g.log.info("Succeeded to peer probe new node when quorum met")

        # Check peer status whether all peer are in connected state none of the
        # nodes should be in peer rejected state
        halt, counter, _rc = 30, 0, False
        g.log.info("Wait for some seconds, right after peer probe")
        while counter < halt:
            ret = is_peer_connected(self.mnode, self.servers[0:3])
            if not ret:
                g.log.info("Peers are not connected state,"
                           " Retry after 2 seconds .......")
                sleep(2)
                counter = counter + 2
            else:
                _rc = True
                g.log.info("Peers are in connected state in the cluster")
                break

        self.assertTrue(_rc, ("Peers are not connected state"))
    def test_ec_data_integrity(self):
        """
        Test steps:
        - Create directory dir1
        - Create 5 dir and 5 files in each dir in directory 1
        - Rename all file inside dir1
        - Truncate at any dir in mountpoint inside dir1
        - Create softlink and hardlink of files in mountpoint
        - chmod, chown, chgrp inside dir1
        - Create tiny, small, medium nd large file
        - Creating files on client side for dir1
        - Validating IO's and waiting to complete
        - Get arequal of dir1
        - Bring redundant bricks offline
        - Get arequal of dir1 after 1st set of bricks down
        - Bring redundant bricks offline
        - Get arequal of dir1 after 2nd set of bricks down
        """

        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        brickset_to_offline = []

        # Creating dir1
        ret = mkdir(self.mounts[0].client_system, "%s/dir1"
                    % self.mounts[0].mountpoint)
        self.assertTrue(ret, "Failed to create dir1")
        g.log.info("Directory dir1 on %s created successfully", self.mounts[0])

        # Create 5 dir and 5 files in each dir at mountpoint on dir1
        start, end = 1, 5
        for mount_obj in self.mounts:
            # Number of dir and files to be created.
            dir_range = ("%s..%s" % (str(start), str(end)))
            file_range = ("%s..%s" % (str(start), str(end)))
            # Create dir 1-5 at mountpoint.
            ret = mkdir(mount_obj.client_system, "%s/dir1/dir{%s}"
                        % (mount_obj.mountpoint, dir_range))
            self.assertTrue(ret, "Failed to create directory")
            g.log.info("Directory created successfully")

            # Create files inside each dir.
            cmd = ('touch %s/dir1/dir{%s}/file{%s};'
                   % (mount_obj.mountpoint, dir_range, file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "File creation failed")
            g.log.info("File created successfull")

            # Increment counter so that at next client dir and files are made
            # with diff offset. Like at next client dir will be named
            # dir6, dir7...dir10. Same with files.
            start += 5
            end += 5

        # Rename all files inside dir1 at mountpoint on dir1
        cmd = ('cd %s/dir1/dir1/; '
               'for FILENAME in *;'
               'do mv $FILENAME Unix_$FILENAME; cd ~;'
               'done;'
               % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to rename file on "
                         "client")
        g.log.info("Successfully renamed file on client")

        # Truncate at any dir in mountpoint inside dir1
        # start is an offset to be added to dirname to act on
        # diff files at diff clients.
        start = 1
        for mount_obj in self.mounts:
            cmd = ('cd %s/dir1/dir%s/; '
                   'for FILENAME in *;'
                   'do echo > $FILENAME; cd ~;'
                   'done;'
                   % (mount_obj.mountpoint, str(start)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Truncate failed")
            g.log.info("Truncate of files successfull")

        # Create softlink and hardlink of files in mountpoint
        start = 1
        for mount_obj in self.mounts:
            for link_type, ln_mode in (('softlink', 'ln -s'),
                                       ('hardlink', 'ln')):
                cmd = ('cd %s/dir1/dir%s; '
                       'for FILENAME in *; '
                       'do %s $FILENAME %s_$FILENAME; cd ~;'
                       'done;'
                       % (mount_obj.mountpoint, str(start), ln_mode,
                          link_type))
                ret, _, _ = g.run(mount_obj.client_system, cmd)
                self.assertFalse(ret, "Creating %s have failed" % link_type)
                g.log.info("%s of files created successfully", link_type)
            start += 5

        # chmod, chown, chgrp inside dir1
        # start and end used as offset to access diff files
        # at diff clients.
        start, end = 2, 5
        for mount_obj in self.mounts:
            dir_file_range = '%s..%s' % (str(start), str(end))
            cmd = ('chmod 777 %s/dir1/dir{%s}/file{%s}'
                   % (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing mode of files has failed")
            g.log.info("Mode of files have been changed successfully")

            cmd = ('chown root %s/dir1/dir{%s}/file{%s}'
                   % (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing owner of files has failed")
            g.log.info("Owner of files have been changed successfully")

            cmd = ('chgrp root %s/dir1/dir{%s}/file{%s}'
                   % (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing group of files has failed")
            g.log.info("Group of files have been changed successfully")
            start += 5
            end += 5

        # Create tiny, small, medium and large file
        # at mountpoint. Offset to differ filenames
        # at diff clients.
        offset = 1
        for mount_obj in self.mounts:
            for size, filename in (('100', 'tiny_file'), ('20M', 'small_file'),
                                   ('200M', 'medium_file'),
                                   ('1G', 'large_file')):
                # cd to the mountpoint so the files land on the volume
                cmd = ('cd {}; fallocate -l {} {}{}.txt'
                       .format(mount_obj.mountpoint, size, filename, offset))
                ret, _, _ = g.run(mount_obj.client_system, cmd)
                self.assertFalse(ret, "Fallocate for files failed")
                g.log.info("Fallocate for files successful")
            offset += 1

        # Creating files on client side for dir1
        # Write IO
        all_mounts_procs, count = [], 1
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d --dir-depth 2 "
                   "--dir-length 10 --max-num-of-dirs 5 "
                   "--num-of-files 5 %s/dir1" % (
                       self.script_upload_path, count,
                       mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count += 10

        # Validating IO's and waiting to complete
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all IO's")

        # Get arequal of dir1
        ret, result_before_bricks_down = (
            collect_mounts_arequal(self.mounts[0], path='dir1/'))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal of dir1 '
                   'is successful')

        # Bring redundant bricks offline
        brickset_to_offline = self._bring_redundant_bricks_offline(
            self.mnode, self.volname)

        # Get arequal of dir1 after 1st set of bricks down
        ret, result_after_1st_brickset_down = (
            collect_mounts_arequal(self.mounts[0], path='dir1/'))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal of dir1 '
                   'is successful')

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  brickset_to_offline)
        self.assertTrue(ret, 'Bricks not brought online')
        g.log.info('Bricks are online successfully')

        # Wait for brick to come online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Bricks are not online")
        g.log.info("EXPECTED : Bricks are online")

        # Check if bricks are online
        ret = get_offline_bricks_list(self.mnode, self.volname)
        self.assertListEqual(ret, [], 'Not all bricks are online')
        g.log.info('All bricks are online')

        # Bring redundant bricks offline
        brickset_to_offline = self._bring_redundant_bricks_offline(
            self.mnode, self.volname)

        # Get arequal of dir1 after 2nd set of bricks down
        ret, result_after_2nd_brickset_down = (
            collect_mounts_arequal(self.mounts[0], path='dir1/'))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal of dir1 '
                   'is successful')

        # Comparing arequals
        self.assertEqual(result_before_bricks_down,
                         result_after_1st_brickset_down,
                         'Arequals are not equal before brickset '
                         'down and after 1st brickset down')
        g.log.info('Arequals are equal before brickset down '
                   'and after 1st brickset down')

        self.assertEqual(result_after_2nd_brickset_down,
                         result_after_1st_brickset_down,
                         'Arequals are not equal between 2nd brickset '
                         'down and 1st brickset down')
        g.log.info('Arequals are equal for 2nd brickset down '
                   'and 1st brickset down')
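
    # Note: the helper below is not part of the original example. It is a
    # minimal sketch of what the `_bring_redundant_bricks_offline` method
    # used above could look like, assuming an EC volume where two bricks per
    # subvolume can be taken offline without losing quorum, and assuming the
    # same imports (get_subvols, choice, sample, bring_bricks_offline,
    # are_bricks_offline) used elsewhere in these examples. Treat the
    # hard-coded count of 2 as an assumption, not the original implementation.
    def _bring_redundant_bricks_offline(self, mnode, volname):
        """Bring a redundant set of bricks offline and return them."""
        # Pick one subvolume and select two of its bricks at random
        sub_vols = get_subvols(mnode, volname)
        bricks_list = list(choice(sub_vols['volume_subvols']))
        bricks_to_offline = sample(bricks_list, 2)

        # Kill the selected bricks and confirm they are reported offline
        ret = bring_bricks_offline(volname, bricks_to_offline)
        self.assertTrue(ret, 'Redundant bricks not offline')
        ret = are_bricks_offline(mnode, volname, bricks_to_offline)
        self.assertTrue(ret, 'Bricks %s are not offline' % bricks_to_offline)
        return bricks_to_offline
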
    def test_ec_quorumcount_5(self):
        """
        Test Steps:
        - Write IO's when all bricks are online
        - Get subvol from which bricks to be brought down
        - Set volume disperse quorum count to 5
        - Start writing and reading IO's
        - Bring a brick down, say b1
        - Validate write and read is successful
        - Bring a brick down, say b2
        - Validate write has failed and read is successful
        - Start IO's again while quorum is not met on volume;
          write should fail and read should pass
        - Add-brick and log volume info and status
        - Start Rebalance
        - Wait for rebalance, which should fail as quorum is not met
        - Bring brick online
        - Wait for brick to come online
        - Check if bricks are online
        - Start IO's again when all bricks are online
        - IO's should complete successfully
        - Start IO's again and reset volume
        - Bring down other bricks to max redundancy
        - Validating IO's and waiting to complete
        """

        # pylint: disable=too-many-branches,too-many-statements,too-many-locals

        mountpoint = self.mounts[0].mountpoint
        client1 = self.mounts[0].client_system
        client2 = self.mounts[1].client_system

        # Write IO's when all bricks are online
        writecmd = ("cd %s; for i in `seq 1 100` ;"
                    "do dd if=/dev/urandom of=file$i bs=1M "
                    "count=5;done" % mountpoint)

        # IO's should complete successfully
        ret, _, err = g.run(client1, writecmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished writes on files successfully')

        # Select a subvol from which bricks to be brought down
        sub_vols = get_subvols(self.mnode, self.volname)
        bricks_list1 = list(choice(sub_vols['volume_subvols']))
        brick_1, brick_2 = sample(bricks_list1, 2)

        # Set volume disperse quorum count to 5
        ret = set_volume_options(self.mnode, self.volname,
                                 {"disperse.quorum-count": "5"})
        self.assertTrue(
            ret, 'Failed to set volume {}'
            ' options'.format(self.volname))
        g.log.info('Successfully set disperse quorum on %s', self.volname)

        # Start writing and reading IO's
        procwrite, procread, count = [], [], 1
        for mount_obj in self.mounts:
            writecmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                        "--dirname-start-num %d --dir-depth 5 "
                        "--dir-length 10 --max-num-of-dirs 2 "
                        "--num-of-files 15 %s" %
                        (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               writecmd,
                               user=mount_obj.user)
            procwrite.append(proc)
            count += 10

        self.generate_read_cmd(mountpoint, '1', '10')
        ret = g.run_async(client2, self.readcmd)
        procread.append(ret)

        # Bring 1st brick down
        ret = bring_bricks_offline(self.volname, brick_1)
        self.assertTrue(ret, 'Brick {} is not offline'.format(brick_1))
        g.log.info('Brick %s is offline successfully', brick_1)

        writecmd = ("cd %s; for i in `seq 101 110` ;"
                    "do dd if=/dev/urandom of=file$i bs=1M "
                    "count=5;done" % mountpoint)

        # IO's should complete successfully
        ret, _, err = g.run(client1, writecmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished writes on files successfully')

        self.generate_read_cmd(mountpoint, '101', '110')
        ret, _, err = g.run(client1, self.readcmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished reads on files successfully')

        # Bring 2nd brick down
        ret = bring_bricks_offline(self.volname, brick_2)
        self.assertTrue(ret, 'Brick {} is not offline'.format(brick_2))
        g.log.info('Brick %s is offline successfully', brick_2)

        # Validate write has failed and read is successful
        ret = validate_io_procs(procwrite, self.mounts)
        self.assertFalse(
            ret, 'Write successful even after disperse quorum is '
            'not met')
        g.log.info('EXPECTED - Writes failed as disperse quorum is not met')

        ret = validate_io_procs(procread, self.mounts[1])
        self.assertTrue(ret, 'Read operation failed on the client')
        g.log.info('Reads on files successful')

        # Start IO's again while quorum is not met on volume
        procwrite = []
        writecmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                    "--dirname-start-num 20 --dir-depth 1 "
                    "--dir-length 10 --max-num-of-dirs 1 "
                    "--num-of-files 10 %s" %
                    (self.script_upload_path, mountpoint))
        proc = g.run_async(client1, writecmd)
        procwrite.append(proc)
        ret = validate_io_procs(procwrite, self.mounts[0])
        self.assertFalse(
            ret, 'Write successful even after disperse quorum is '
            'not met')
        g.log.info('EXPECTED - Writes failed as disperse quorum is not met')

        self.generate_read_cmd(mountpoint, '1', '100')
        ret, _, err = g.run(client2, self.readcmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Reads on files successful')

        # Add brick
        ret = expand_volume(self.mnode,
                            self.volname,
                            self.servers,
                            self.all_servers_info,
                            force=True)
        self.assertTrue(
            ret, ("Failed to expand the volume {}".format(self.volname)))
        g.log.info("Expanding volume %s is successful", self.volname)

        # Log Volume Info and Status after expanding the volume
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume {}".format(self.volname)))
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Start Rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ('Failed to start rebalance on the volume'
                                  ' {}'.format(self.volname)))
        g.log.info('Rebalance has started on volume %s', self.volname)

        # Wait for rebalance to complete
        # Which should also fail as quorum is not met
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=600)
        self.assertFalse(
            ret, "Rebalance completed even though disperse quorum "
            "is not met on the volume")
        g.log.info(
            "Expected: Rebalance failed on the volume %s,disperse"
            " quorum is not met", self.volname)

        # Bring brick online
        brick_list = brick_1, brick_2
        ret = bring_bricks_online(self.mnode, self.volname, brick_list)
        self.assertTrue(ret, 'Brick not brought online')
        g.log.info('Brick brought online successfully')

        # Wait for brick to come online
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, 'Bricks are not online')
        g.log.info('EXPECTED : Bricks are online')

        # Check if bricks are online
        ret = get_offline_bricks_list(self.mnode, self.volname)
        self.assertListEqual(ret, [], 'Not all bricks are online')
        g.log.info('All bricks are online')

        # Start IO's again when all bricks are online
        writecmd = ("cd %s; for i in `seq 101 200` ;"
                    "do dd if=/dev/urandom of=file$i bs=1M "
                    "count=5;done" % mountpoint)
        self.generate_read_cmd(mountpoint, '101', '120')

        # IO's should complete successfully
        ret, _, err = g.run(client1, writecmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Writes on client %s successful', client1)

        ret, _, err = g.run(client2, self.readcmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Read on client %s successful', client2)

        # Start IO's again
        all_mounts_procs, count = [], 30
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d --dir-depth 2 "
                   "--dir-length 10 --max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count += 10

        # Reset volume
        ret, _, err = volume_reset(self.mnode, self.volname)
        self.assertEqual(ret, 0, err)
        g.log.info('Reset of volume %s successful', self.volname)

        # Bring down other bricks to max redundancy
        # Bringing bricks offline
        bricks_to_offline = sample(bricks_list1, 2)
        ret = bring_bricks_offline(self.volname, bricks_to_offline)
        self.assertTrue(ret, 'Redundant bricks not offline')
        g.log.info('Redundant bricks are offline successfully')

        # Validating IO's and waiting to complete
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, 'IO failed on some of the clients')
        g.log.info("Successfully validated all IO's")
Example #13
0
    def test_brickreset_ec_volume(self):
        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        """
        - Start resource consumption tool
        - Create IO on dir2 of volume mountpoint
        - Reset brick start
        - Check if brick is offline
        - Reset brick with destination same as source with force while
          running IO's
        - Validating IO's and waiting for it to complete on dir2
        - Remove dir2
        - Create 5 directories and 5 files in each dir of dir1 at mountpoint
        - Rename all files inside dir1 at mountpoint
        - Create softlink and hardlink of files in dir1 of mountpoint
        - Delete op for deleting all files in one of the dirs inside dir1
        - Perform chmod, chown, chgrp
        - Create tiny, small, medium and large file
        - Create IO's
        - Validating IO's and waiting for it to complete
        - Calculate arequal before killing brick
        - Get brick from Volume
        - Reset brick
        - Check if brick is offline
        - Reset brick by giving a different source and dst node
        - Reset brick by giving dst and source same without force
        - Obtain hostname
        - Reset brick with dst-source same force using hostname - Successful
        - Monitor heal completion
        - Bring down other bricks to max redundancy
        - Get arequal after bringing down bricks
        - Bring bricks online
        - Reset brick by giving a same source and dst brick
        - Kill brick manually
        - Check if brick is offline
        - Reset brick by giving a same source and dst brick
        - Wait for brick to come online
        - Bring down other bricks to max redundancy
        - Get arequal after bringing down bricks
        - Bring bricks online
        - Remove brick from backend
        - Check if brick is offline
        - Reset brick by giving dst and source same without force - Successful
        - Monitor heal completion
        - Compare the arequal's calculated
        """
        # Starting resource consumption using top
        log_file_mem_monitor = getcwd() + '/mem_usage.log'
        cmd = ('for i in {1..100};do top -n 1 -b|egrep "RES|gluster" '
               '& free -h 2>&1 >> ' + log_file_mem_monitor +
               ' ;sleep 10;done')
        g.log.info(cmd)
        for mount_obj in self.mounts:
            g.run_async(mount_obj.client_system, cmd)
        bricks_list = []

        # Get the bricks from the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # Creating directory2
        cmd = ('mkdir %s/dir2' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to create directory2")
        g.log.info("Directory 2 on %s created successfully", self.mounts[0])

        # Creating files on client side for dir2
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)

            # Create dirs with file
            g.log.info('Creating dirs with file...')
            command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                       "-d 2 -l 2 -n 2 -f 20 %s/dir2" %
                       (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Reset a brick
        g.log.info('Reset of brick using start')
        brick_reset = choice(bricks_list)
        ret, _, _ = reset_brick(self.mnode, self.volname, brick_reset, "start")
        self.assertEqual(ret, 0, "Failed to reset brick %s" % brick_reset)

        # Check if the brick is offline
        g.log.info("Check the brick status if it is offline")
        offline_bricks = get_offline_bricks_list(self.mnode, self.volname)
        self.assertEqual(offline_bricks[0], brick_reset, "Brick not offline")
        g.log.info("Expected : Brick is offline")

        # Reset brick with dest same as source with force while running IO's
        g.log.info('Reset of brick with same src and dst brick')
        ret, _, _ = reset_brick(self.mnode,
                                self.volname,
                                brick_reset,
                                "commit",
                                brick_reset,
                                force="true")
        self.assertEqual(ret, 0, "Not Expected: Reset brick failed")
        g.log.info("Expected : Reset brick is successful")

        # Validating IO's and waiting to complete
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")

        # Deleting dir2
        cmd = ('rm -rf %s/dir2' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to delete directory2")
        g.log.info("Directory 2 deleted successfully for %s", self.mounts[0])

        del self.all_mounts_procs[:]

        # Creating dir1
        cmd = ('mkdir  %s/dir1' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to create directory1")
        g.log.info("Directory 1 created successfully for %s", self.mounts[0])

        # Create 5 dir and 5 files in each dir at mountpoint on dir1
        start, end = 1, 5
        for mount_obj in self.mounts:
            # Number of dir and files to be created.
            dir_range = str(start) + ".." + str(end)
            file_range = str(start) + ".." + str(end)
            # Create dir 1-5 at mountpoint.
            cmd = ('mkdir %s/dir1/dir{%s};' %
                   (mount_obj.mountpoint, dir_range))
            g.run(mount_obj.client_system, cmd)

            # Create files inside each dir.
            cmd = ('touch %s/dir1/dir{%s}/file{%s};' %
                   (mount_obj.mountpoint, dir_range, file_range))
            g.run(mount_obj.client_system, cmd)

            # Increment counter so that at next client dir and files are made
            # with diff offset. Like at next client dir will be named
            # dir6, dir7...dir10. Same with files.
            start += 5
            end += 5

        # Rename all files inside dir1 at mountpoint on dir1
        clients = []
        for mount_obj in self.mounts:
            clients.append(mount_obj.client_system)
            cmd = ('cd %s/dir1/dir1/; '
                   'for FILENAME in *;'
                   'do mv $FILENAME Unix_$FILENAME; '
                   'done;' % mount_obj.mountpoint)
            g.run_parallel(clients, cmd)

        # Truncate at any dir in mountpoint inside dir1
        # start is an offset to be added to dirname to act on
        # diff files at diff clients.
        start = 1
        for mount_obj in self.mounts:
            cmd = ('cd %s/dir1/dir%s/; '
                   'for FILENAME in *;'
                   'do echo > $FILENAME; '
                   'done;' % (mount_obj.mountpoint, str(start)))
            g.run(mount_obj.client_system, cmd)

        # Create softlink and hardlink of files in mountpoint. Start is an
        # offset to be added to dirname to act on diff files at diff clients.
        start = 1
        for mount_obj in self.mounts:
            cmd = ('cd %s/dir1/dir%s; '
                   'for FILENAME in *; '
                   'do ln -s $FILENAME softlink_$FILENAME; '
                   'done;' % (mount_obj.mountpoint, str(start)))
            g.run(mount_obj.client_system, cmd)
            cmd = ('cd %s/dir1/dir%s; '
                   'for FILENAME in *; '
                   'do ln $FILENAME hardlink_$FILENAME; '
                   'done;' % (mount_obj.mountpoint, str(start + 1)))
            g.run(mount_obj.client_system, cmd)
            start += 5

        # Delete op for deleting all file in one of the dirs. start is being
        # used as offset like in previous testcase in dir1
        start = 1
        for mount_obj in self.mounts:
            cmd = ('cd %s/dir1/dir%s; '
                   'for FILENAME in *; '
                   'do rm -f $FILENAME; '
                   'done;' % (mount_obj.mountpoint, str(start)))
            g.run(mount_obj.client_system, cmd)
            start += 5

        # chmod, chown, chgrp inside dir1
        # start and end used as offset to access diff files
        # at diff clients.
        start, end = 2, 5
        for mount_obj in self.mounts:
            dir_file_range = '%s..%s' % (str(start), str(end))
            cmd = ('chmod 777 %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            g.run(mount_obj.client_system, cmd)

            cmd = ('chown root %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            g.run(mount_obj.client_system, cmd)

            cmd = ('chgrp root %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            g.run(mount_obj.client_system, cmd)

            start += 5
            end += 5

        # Create tiny, small, medium and large file
        # at mountpoint. Offset to differ filenames
        # at diff clients.
        offset = 1
        for mount_obj in self.mounts:
            cmd = 'fallocate -l 100 tiny_file%s.txt' % str(offset)
            g.run(mount_obj.client_system, cmd)
            cmd = 'fallocate -l 20M small_file%s.txt' % str(offset)
            g.run(mount_obj.client_system, cmd)
            cmd = 'fallocate -l 200M medium_file%s.txt' % str(offset)
            g.run(mount_obj.client_system, cmd)
            cmd = 'fallocate -l 1G large_file%s.txt' % str(offset)
            g.run(mount_obj.client_system, cmd)
            offset += 1

        # Creating files on client side for dir1
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Create dirs with file
            g.log.info('Creating dirs with file...')
            command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                       "-d 2 -l 2 -n 2 -f 20 %s/dir1" %
                       (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validating IO's and waiting to complete
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        self.io_validation_complete = True

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")

        # Get arequal before killing the brick
        g.log.info('Getting arequal before killing of brick...')
        ret, result_before_killing_brick = (collect_mounts_arequal(
            self.mounts[0]))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before killing of brick is successful')

        # Reset a brick
        g.log.info('Reset of brick using start')
        ret, _, _ = reset_brick(self.mnode, self.volname, bricks_list[0],
                                "start")
        self.assertEqual(ret, 0,
                         "Failed to reset brick %s" % bricks_list[0])

        # Check if the brick is offline
        g.log.info("Check the brick status if it is offline")
        ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[0]])
        self.assertTrue(ret, "Brick is not offline")
        g.log.info("Expected : Brick is offline")

        # Reset brick by giving a different source and dst brick
        g.log.info('Reset of brick by giving different src and dst brick')
        ret, _, _ = reset_brick(self.mnode, self.volname, bricks_list[0],
                                "commit", bricks_list[1])
        self.assertNotEqual(ret, 0, "Not Expected: Reset brick is successfull")
        g.log.info("Expected : Source and Destination brick must be same for"
                   " reset")

        # Reset brick with destination same as source
        g.log.info('Reset of brick with same src and dst brick')
        ret, _, _ = reset_brick(self.mnode, self.volname, bricks_list[0],
                                "commit", bricks_list[0])
        self.assertNotEqual(ret, 0, "Not Expected : Reset brick is successful")
        g.log.info("Expected : Reset brick failed,Vol id is same use force")

        # Obtain hostname of node
        ret, hostname_node1, _ = g.run(self.mnode, "hostname")
        self.assertEqual(ret, 0,
                         "Failed to obtain hostname of node %s" % self.mnode)
        g.log.info("Obtained hostname of node. IP- %s, hostname- %s",
                   self.mnode, hostname_node1.strip())

        # Reset brick with destination same as source with force using hostname
        g.log.info('Reset of brick with same src and dst brick')
        ret, _, _ = reset_brick(hostname_node1.strip(),
                                self.volname,
                                bricks_list[0],
                                "commit",
                                bricks_list[0],
                                force="true")
        self.assertEqual(ret, 0, "Not Expected: Reset brick failed")
        g.log.info("Expected : Reset brick is successful")

        # Wait for brick to come online
        g.log.info("Waiting for brick to come online")
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Bricks are not online")
        g.log.info("Expected : Bricks are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')
        g.log.info('Heal has completed successfully')

        # Check if bricks are online
        all_bricks = get_all_bricks(self.mnode, self.volname)
        ret = are_bricks_online(self.mnode, self.volname, all_bricks)
        self.assertTrue(ret, 'Not all bricks are online')
        g.log.info('All bricks are online')

        # Bring down other bricks to max redundancy
        # Get List of bricks to bring offline

        # Bringing bricks offline
        ret = bring_bricks_offline(self.volname, bricks_list[1:3])
        self.assertTrue(ret, 'Bricks not offline')
        g.log.info('Bricks are offline successfully')
        sleep(2)

        # Check if 4 bricks are online
        all_bricks = [
            bricks_list[0], bricks_list[3], bricks_list[4], bricks_list[5]
        ]
        ret = are_bricks_online(self.mnode, self.volname, all_bricks)
        self.assertTrue(ret, 'All bricks are not online')
        g.log.info('All bricks are online')

        # Check mount point
        cmd = 'ls -lrt /mnt'
        ret, out, _ = g.run(self.mounts[0].client_system, cmd)
        g.log.info("Client mount point details: %s", out)

        # Get arequal after bringing down bricks
        g.log.info('Getting arequal after bringing down bricks...')
        ret, result_offline_redundant_brick1 = (collect_mounts_arequal(
            self.mounts[0]))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after bringing down bricks '
                   'is successful')

        # Bring bricks online
        list_of_bricks_to_bring_online = bricks_list[1:3]
        ret = bring_bricks_online(self.mnode, self.volname,
                                  list_of_bricks_to_bring_online)
        self.assertTrue(ret, 'Bricks not brought online')
        g.log.info('Bricks are online successfully')

        # Wait for brick to come online
        g.log.info("Waiting for brick to come online")
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Bricks are not online")
        g.log.info("Expected : Bricks are online")

        # Check if bricks are online
        all_bricks = get_all_bricks(self.mnode, self.volname)
        ret = are_bricks_online(self.mnode, self.volname, all_bricks)
        self.assertTrue(ret, 'Not all bricks are online')
        g.log.info('All bricks are online')

        # Reset brick without bringing down brick
        g.log.info('Reset of brick with same src and dst without stopping it')
        ret, _, _ = reset_brick(self.mnode, self.volname, bricks_list[1],
                                "commit", bricks_list[1])
        self.assertNotEqual(ret, 0, "Not Expected: Reset brick passed")
        g.log.info("Expected : Brick reset failed as source brick must be"
                   " stopped")

        # Kill the brick manually
        ret = bring_bricks_offline(self.volname, [bricks_list[1]])
        self.assertTrue(ret, 'Brick not offline')
        g.log.info('Brick is offline successfully')

        # Check if the brick is offline
        g.log.info("Check the brick status if it is offline")
        ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[1]])
        self.assertTrue(ret, "Brick is not offline")
        g.log.info("Expected : Brick is offline")

        # Reset brick with dest same as source after killing brick manually
        g.log.info('Reset of brick with same src and dst brick')
        ret, _, _ = reset_brick(self.mnode,
                                self.volname,
                                bricks_list[1],
                                "commit",
                                bricks_list[1],
                                force="true")
        self.assertEqual(ret, 0, "Not Expected: Reset brick failed")
        g.log.info("Expected : Reset brick is successful")

        # Wait for brick to come online
        g.log.info("Waiting for brick to come online")
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Bricks are not online")
        g.log.info("Expected : Bricks are online")

        # Check if bricks are online
        all_bricks = get_all_bricks(self.mnode, self.volname)
        ret = are_bricks_online(self.mnode, self.volname, all_bricks)
        self.assertTrue(ret, 'Not all bricks are online')
        g.log.info('All bricks are online')

        # Bring down other bricks to max redundancy
        # Bringing bricks offline
        ret = bring_bricks_offline(self.volname, bricks_list[2:4])
        self.assertTrue(ret, 'Bricks not offline')
        g.log.info('Bricks are offline successfully')

        # Check mount point
        cmd = 'ls -lrt /mnt'
        ret, out, _ = g.run(self.mounts[0].client_system, cmd)
        g.log.info("Client mount point details: %s", out)

        # Get arequal after bringing down bricks
        g.log.info('Getting arequal after bringing down redundant bricks...')
        ret, result_offline_redundant_brick2 = (collect_mounts_arequal(
            self.mounts[0]))
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after bringing down redundant bricks '
                   'is successful')

        # Bring bricks online
        list_of_bricks_to_bring_online = bricks_list[2:4]
        ret = bring_bricks_online(self.mnode, self.volname,
                                  list_of_bricks_to_bring_online)
        self.assertTrue(ret, 'Bricks not brought online')
        g.log.info('Bricks are online successfully')

        # Removing brick from backend on the node hosting it
        brick_node, brick_path = bricks_list[0].strip().split(":")
        cmd = "rm -rf %s" % brick_path
        ret, _, _ = g.run(brick_node, cmd)
        self.assertEqual(ret, 0, "Failed to delete brick %s" % bricks_list[0])
        g.log.info("Removed brick %s successfully", bricks_list[0])

        # Check if the brick is offline
        count = 0
        while count <= 20:
            g.log.info("Check the brick status if it is offline")
            ret = are_bricks_offline(self.mnode, self.volname,
                                     [bricks_list[0]])
            if ret:
                break
            sleep(2)
            count += 1
        self.assertTrue(ret, "Brick is not offline")
        g.log.info("Expected : Brick is offline")

        # Reset brick with destination same as source
        g.log.info('Reset of brick with same src and dst brick')
        ret, _, _ = reset_brick(hostname_node1.strip(), self.volname,
                                bricks_list[0], "commit", bricks_list[0])
        self.assertEqual(ret, 0, "Not Expected: Reset brick failed")
        g.log.info("Expected : Reset brick is successful")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')
        g.log.info('Heal has completed successfully')

        # Comparing arequals
        self.assertEqual(
            result_before_killing_brick, result_offline_redundant_brick1,
            'Arequals are not equal before killing brick '
            'processes and after offlining redundant bricks')
        g.log.info('Arequals are equal before killing brick '
                   'processes and after offlining redundant bricks')

        # Comparing arequals
        self.assertEqual(
            result_offline_redundant_brick2, result_offline_redundant_brick1,
            'Arequals are not equal for offlining redundant'
            ' bricks')
        g.log.info('Arequals are equal for offlining redundant bricks')

        # Deleting dir1
        cmd = ('rm -rf %s/dir1' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to delete directory1")
        g.log.info("Directory 1 deleted successfully for %s", self.mounts[0])
Example #14
0
    def test_fops_ec_brickdown(self):
        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        """
        - 1.Start resource consumption tool
        - 2.Create directory dir1
        - 3.Create 5 dir and 5 files in each dir in directory 1
        - 4.Rename all files inside dir1
        - 5.Truncate at any dir in mountpoint inside dir1
        - 6.Create softlink and hardlink of files in mountpoint
        - 7.chmod, chown, chgrp inside dir1
        - 8.Create tiny, small, medium and large file
        - 9.Creating files on client side for dir1
        - 10.Bring redundant bricks down
        - 11.Validating IO's and waiting to complete
        - 12.Creating dir2
        - 13.Creating files on client side for dir2
        - 14.Bring bricks online
        - 15.Wait for brick to come online
        - 16.Check if bricks are online
        - 17.Monitor heal completion
        - 18.Validating IO's and waiting to complete
        """

        # Starting resource consumption using top
        log_file_mem_monitor = '/var/log/glusterfs/mem_usage.log'
        cmd = ('for i in {1..100};do top -n 1 -b|egrep \
              "RES|gluster" & free -h 2>&1 >> %s ; \
              sleep 10;done' % (log_file_mem_monitor))
        g.log.info(cmd)
        for server in self.servers:
            g.run_async(server, cmd)
        bricks_list = []

        # get the bricks from the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Brick list is empty")
        g.log.info("Brick List : %s", bricks_list)

        # Creating dir1
        cmd = ('mkdir  %s/dir1' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to create dir1")
        g.log.info("dir1 created successfully for %s", self.mounts[0])

        # Create 5 dir and 5 files in each dir at mountpoint on dir1
        start, end = 1, 5
        for mount_obj in self.mounts:
            # Number of dir and files to be created.
            dir_range = ("%s..%s" % (str(start), str(end)))
            file_range = ("%s..%s" % (str(start), str(end)))
            # Create dir 1-5 at mountpoint.
            cmd = ('mkdir %s/dir1/dir{%s};' %
                   (mount_obj.mountpoint, dir_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Directory creation failed")
            g.log.info("Directory created successfull")

            # Create files inside each dir.
            cmd = ('touch %s/dir1/dir{%s}/file{%s};' %
                   (mount_obj.mountpoint, dir_range, file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "File creation failed")
            g.log.info("File created successfull")

            # Increment counter so that at next client dir and files are made
            # with diff offset. Like at next client dir will be named
            # dir6, dir7...dir10. Same with files.
            start += 5
            end += 5

        # Rename all files inside dir1 at mountpoint on dir1
        cmd = ('cd %s/dir1/dir1/; '
               'for FILENAME in *;'
               'do mv $FILENAME Unix_$FILENAME; '
               'done;' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to rename file on" "client")
        g.log.info("Successfully renamed file on client")

        # Truncate at any dir in mountpoint inside dir1
        # start is an offset to be added to dirname to act on
        # diff files at diff clients.
        start = 1
        for mount_obj in self.mounts:
            cmd = ('cd %s/dir1/dir%s/; '
                   'for FILENAME in *;'
                   'do echo > $FILENAME; '
                   'done;' % (mount_obj.mountpoint, str(start)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Truncate failed")
            g.log.info("Truncate of files successfull")

        # Create softlink and hardlink of files in mountpoint. Start is an
        # offset to be added to dirname to act on diff files at diff clients.
        start = 1
        for mount_obj in self.mounts:
            cmd = ('cd %s/dir1/dir%s; '
                   'for FILENAME in *; '
                   'do ln -s $FILENAME softlink_$FILENAME; '
                   'done;' % (mount_obj.mountpoint, str(start)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Creating Softlinks have failed")
            g.log.info("Softlink of files have been changed successfully")

            cmd = ('cd %s/dir1/dir%s; '
                   'for FILENAME in *; '
                   'do ln $FILENAME hardlink_$FILENAME; '
                   'done;' % (mount_obj.mountpoint, str(start + 1)))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Creating Hardlinks have failed")
            g.log.info("Hardlink of files have been changed successfully")
            start += 5

        # chmod, chown, chgrp inside dir1
        # start and end used as offset to access diff files
        # at diff clients.
        start, end = 2, 5
        for mount_obj in self.mounts:
            dir_file_range = '%s..%s' % (str(start), str(end))
            cmd = ('chmod 777 %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing mode of files has failed")
            g.log.info("Mode of files have been changed successfully")

            cmd = ('chown root %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing owner of files has failed")
            g.log.info("Owner of files have been changed successfully")

            cmd = ('chgrp root %s/dir1/dir{%s}/file{%s}' %
                   (mount_obj.mountpoint, dir_file_range, dir_file_range))
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Changing group of files has failed")
            g.log.info("Group of files have been changed successfully")
            start += 5
            end += 5

        # Create tiny, small, medium and large file
        # at mountpoint. Offset to differ filenames
        # at diff clients.
        offset = 1
        for mount_obj in self.mounts:
            cmd = 'fallocate -l 100 tiny_file%s.txt' % str(offset)
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for tiny files failed")
            g.log.info("Fallocate for tiny files successfully")

            cmd = 'fallocate -l 20M small_file%s.txt' % str(offset)
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for small files failed")
            g.log.info("Fallocate for small files successfully")

            cmd = 'fallocate -l 200M medium_file%s.txt' % str(offset)
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for medium files failed")
            g.log.info("Fallocate for medium files successfully")

            cmd = 'fallocate -l 1G large_file%s.txt' % str(offset)
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, "Fallocate for large files failed")
            g.log.info("Fallocate for large files successfully")
            offset += 1

        # Creating files on client side for dir1
        # Write IO
        all_mounts_procs = []
        count = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s/dir1" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Bring down other bricks to max redundancy
        # Bringing bricks offline
        ret = bring_bricks_offline(self.volname, bricks_list[2:4])
        self.assertTrue(ret, 'Bricks not offline')
        g.log.info('Bricks are offline successfully')

        # Validating IO's and waiting to complete
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Creating dir2
        cmd = ('mkdir  %s/dir2' % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to create dir2 ")
        g.log.info("dir2 created successfully for %s", self.mounts[0])

        # Creating files on client side for dir2
        # Write IO
        all_mounts_procs = []
        count = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s/dir2" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Bring bricks online
        list_of_bricks_to_bring_online = bricks_list[2:4]
        ret = bring_bricks_online(self.mnode, self.volname,
                                  list_of_bricks_to_bring_online)
        self.assertTrue(ret, 'Bricks not brought online')
        g.log.info('Bricks are online successfully')

        # Wait for brick to come online
        g.log.info("Waiting for brick to come online")
        ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Bricks are not online")
        g.log.info("EXPECTED : Bricks are online")

        # Check if bricks are online
        ret = get_offline_bricks_list(self.mnode, self.volname)
        self.assertListEqual(ret, [], 'Not all bricks are online')
        g.log.info('All bricks are online')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')
        g.log.info('Heal has completed successfully')

        # Validating IO's and waiting to complete
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Check file exist for memory log
        g.log.info("Validating log exists")
        ret = file_exists(self.mnode, '/var/log/glusterfs/mem_usage.log')
        self.assertTrue(ret, "Memory log file does not exist")
        g.log.info("Memory log file exists")