    def test_ec_open_fd(self):
        """
        Test Steps:
        - disable server side heal
        - Create a file
        - Set volume option to implement open FD on file
        - Bring a brick down, say b1
        - Open FD on file
        - Bring brick b1 up
        - write to open FD file
        - Monitor heal
        - Check xattrs ec.version and ec.size of file
        - Check stat of file
        """

        # pylint: disable=too-many-branches,too-many-statements,too-many-locals

        mountpoint = self.mounts[0].mountpoint

        # Disable server side heal
        ret = disable_heal(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to disable server side heal"))
        g.log.info("Successfully disabled server side heal")

        # Log Volume Info and Status after disabling server side heal
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed "
                              "on volume %s", self.volname))

        # Create a file
        cmd = ("cd %s; touch 'file_openfd';" % mountpoint)
        ret, _, err = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished creating a file while all the bricks are UP')

        # Set volume options
        ret = set_volume_options(self.mnode, self.volname,
                                 {"performance.read-after-open": "yes"})
        self.assertTrue(ret, 'Failed to set volume {}'
                        ' options'.format(self.volname))
        g.log.info('Successfully set %s volume options', self.volname)

        # Bringing brick b1 offline
        sub_vols = get_subvols(self.mnode, self.volname)
        subvols_list = sub_vols['volume_subvols']
        bricks_list1 = subvols_list[0]
        brick_b1_down = choice(bricks_list1)
        ret = bring_bricks_offline(self.volname,
                                   brick_b1_down)
        self.assertTrue(ret, 'Brick %s is not offline' % brick_b1_down)
        g.log.info('Brick %s is offline successfully', brick_b1_down)

        node = self.mounts[0].client_system
        # Open FD
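        # open_file_fd is expected to spawn an async process on the client
        # that opens an fd on the file, holds it open for roughly 'time'
        # seconds and writes through it (the pattern checked for later is
        # 'xyz'). This description is inferred from how the test uses the
        # helper, not from its implementation.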
        proc = open_file_fd(mountpoint, time=100,
                            client=node)

        # Bring brick b1 online
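        # 'glusterd_restart' brings the brick back by restarting glusterd on
        # the node hosting it, which is why peer connectivity is re-validated
        # right after this step.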
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [brick_b1_down],
                                  'glusterd_restart')
        self.assertTrue(ret, 'Brick {} is not brought '
                        'online'.format(brick_b1_down))
        g.log.info('Brick %s is online successfully', brick_b1_down)

        # Validate peers are connected
        ret = self.validate_peers_are_connected()
        self.assertTrue(ret, "Peers are not in connected state after bringing"
                        " an offline brick to online via `glusterd restart`")
        g.log.info("Successfully validated peers are in connected state")

        # Check if write to the open FD is successful
        ret, _, _ = proc.async_communicate()
        self.assertEqual(ret, 0, "Write to the open FD failed")
        g.log.info('Write to the open FD is successful')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')
        g.log.info('Heal has completed successfully')

        file_openfd = os.path.join(mountpoint, 'file_openfd')

        # Check if the data written through the open FD is present in the file
        ret = check_if_pattern_in_file(node, 'xyz', file_openfd)
        self.assertEqual(ret, 0, "Pattern 'xyz' not found in the file")
        g.log.info("Pattern 'xyz' exists in the file")

        file_fd = 'file_openfd'

        # Check if EC version is same on all bricks which are up
        ret = validate_xattr_on_all_bricks(bricks_list1, file_fd,
                                           'trusted.ec.version')
        self.assertTrue(ret, "Healing not completed and EC version is "
                        "not updated")
        g.log.info("Healing is completed and EC version is updated")

        # Check if EC size is same on all bricks which are up
        ret = validate_xattr_on_all_bricks(bricks_list1, file_fd,
                                           'trusted.ec.size')
        self.assertTrue(ret, "Healing not completed and EC size is "
                        "not updated")
        g.log.info("Healing is completed and EC size is updated")

        # Check stat of file
        cmd = "cd %s; du -kh file_openfd" % mountpoint
        ret, _, err = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, err)
        g.log.info('File %s is accessible', file_fd)
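
# Illustrative sketch, not part of the original test: a minimal version of an
# xattr consistency check such as validate_xattr_on_all_bricks() used above.
# The helper name and the getfattr invocation are assumptions for illustration;
# the actual library implementation may differ.
def _xattr_same_on_all_bricks(bricks, filename, xattr):
    """Return True when all bricks report the same value for xattr."""
    values = set()
    for brick in bricks:
        node, brick_path = brick.split(':')
        cmd = "getfattr -n %s -e hex %s/%s" % (xattr, brick_path, filename)
        ret, out, _ = g.run(node, cmd)
        if ret != 0:
            return False
        # keep only the 'name=0x...' line from the getfattr output
        value_lines = [line for line in out.splitlines() if '=' in line]
        values.add(value_lines[0] if value_lines else None)
    return len(values) == 1
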
Example #2
    def test_heal_client_io_hang(self):
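        """
        Test Steps:
        - Disable server side heal
        - Create files from the mount point
        - Bring a brick down
        - Write to the files while the brick is down
        - Bring the brick back online and verify all bricks are online
        - Trigger client side heal by appending to and reading the files
        - Check heal info and monitor heal completion
        """
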
        mountpoint = self.mounts[0].mountpoint

        # disable server side heal
        ret = disable_heal(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to disable server side heal"))
        g.log.info("Successfully disabled server side heal")

        # Log Volume Info and Status after disabling server side heal
        g.log.info("Logging volume info and status")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed "
                              "on volume %s", self.volname))

        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the bricks list")

        # Create files
        cmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
               "do touch file$i; done" % mountpoint)

        ret, _, err = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished creating files while all the bricks are UP')

        # Bring bricks offline
        ret = bring_bricks_offline(self.volname, bricks_list[0:1])
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Successfully brought the bricks down")

        # Start pumping IO from client
        cmd = ("cd %s/test; for i in `seq 1 100` ;"
               "do dd if=/dev/urandom of=file$i bs=1M "
               "count=5;done" % mountpoint)

        ret, _, err = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished writing on files while a brick is DOWN')

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname, bricks_list[0:1])
        self.assertTrue(ret, "Failed to bring up the bricks")
        g.log.info("Successfully brought the bricks up")

        # Verifying all bricks online
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, "All bricks are not online")

        # Start client side heal by reading/writing files.
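        # Server side heal is disabled for this test, so appending to and
        # reading the files from the client is what triggers healing here.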
        appendcmd = ("cd %s/test; for i in `seq 1 100` ;"
                     "do dd if=/dev/urandom of=file$i bs=1M "
                     "count=1 oflag=append conv=notrunc;done" % mountpoint)

        readcmd = ("cd %s/test; for i in `seq 1 100` ;"
                   "do dd if=file$i of=/dev/null bs=1M "
                   "count=5;done" % mountpoint)

        ret, _, err = g.run(self.mounts[0].client_system, appendcmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished append on files after bringing bricks online')

        ret, _, err = g.run(self.mounts[0].client_system, readcmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished read on files after bringing bricks online')

        # check the heal info and completion
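        # ec_check_heal_comp() is a helper defined elsewhere in this test
        # module; it is expected to wait for heal to finish (e.g. via
        # monitor_heal_completion) and assert that no heal entries remain.
        # Description inferred from its usage, not from its implementation.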
        ec_check_heal_comp(self)

        # Log Volume Info and Status after bringing the brick up
        g.log.info("Logging volume info and status")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed "
                              "on volume %s", self.volname))
        g.log.info(
            "Successful in logging volume info and status "
            "of volume %s", self.volname)
Example #3
    def test_heal_full_node_reboot(self):
        """
        - Create IO from mountpoint.
        - Calculate arequal from mount.
        - Delete data from backend from the EC volume.
        - Trigger heal full.
        - Disable Heal.
        - Enable heal again and trigger heal full.
        - Reboot a Node.
        - Calculate arequal checksum and compare it.
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)

            # Create dirs with file
            g.log.info('Creating dirs with file...')
            command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                       "-d 2 -l 2 -n 2 -f 20 %s" % (
                           self.script_upload_path,
                           mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal before deleting the files from brick
        g.log.info('Getting arequal before deleting the files from brick...')
        ret, result_before_deleting_data = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before deleting the files from brick '
                   'is successful')

        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']

        # Delete data from backend from the erasure node
        for subvol in subvols:
            erasure = subvol[-1]
            g.log.info('Clearing ec brick %s', erasure)
            node, brick_path = erasure.split(':')
            ret, _, err = g.run(node, 'cd %s/ ; rm -rf *' % brick_path)
            self.assertFalse(ret, err)
            g.log.info('Clearing ec brick %s is successful', erasure)
        g.log.info('Clearing data from the bricks is successful')

        # Trigger heal full
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger full heal.')

        # Disable Heal and Enable Heal Full Again
        g.log.info("Disabling Heal on the Servers")
        ret = disable_heal(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to disable heal")
        g.log.info("Healing is now disabled")

        g.log.info("Enabling Heal Now")
        ret = enable_heal(self.mnode, self.volname)
        self.assertTrue(ret, "Enabling Heal failed")
        g.log.info("Healing is now enabled")
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger full heal.')

        # Reboot a node hosting a brick from each subvolume
        g.log.info("Selecting nodes to reboot from the cluster")
        subvols_dict = get_subvols(self.mnode, self.volname)
        nodes_to_reboot = []
        for subvol in subvols_dict['volume_subvols']:
            # Define nodes to reboot
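            # subvol[1:2] picks only the second brick of each subvolume, so a
            # single brick's node per subvolume is added to the reboot list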
            brick_list = subvol[1:2]
            for brick in brick_list:
                node, brick_path = brick.split(':')
                if node not in nodes_to_reboot:
                    nodes_to_reboot.append(node)

        # Reboot nodes on subvol and wait while rebooting
        g.log.info("Rebooting the nodes %s", nodes_to_reboot)
        ret = reboot_nodes(nodes_to_reboot)
        self.assertTrue(ret, 'Failed to reboot nodes %s '
                        % nodes_to_reboot)

        # Check if nodes are online
        counter = 0
        timeout = 700
        _rc = False
        while counter < timeout:
            ret, reboot_results = are_nodes_online(nodes_to_reboot)
            if not ret:
                g.log.info("Nodes are offline, Retry after 5 seconds ... ")
                sleep(5)
                counter = counter + 5
            else:
                _rc = True
                break

        if not _rc:
            for node in reboot_results:
                if not reboot_results[node]:
                    g.log.error("Node %s is offline even after "
                                "%d minutes", node, timeout / 60.0)
        else:
            g.log.info("All nodes %s are up and running", nodes_to_reboot)

        # Trigger Heal Full
        ret = trigger_heal_full(self.mnode, self.volname)
        if not ret:
            sleep(10)
            ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger full heal.')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after healing
        g.log.info('Getting arequal after healing...')
        ret, result_after_healing = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after healing is successful')

        # Comparing arequals
        self.assertEqual(result_before_deleting_data, result_after_healing,
                         'Arequals before deleting data from the bricks '
                         'and after healing are not equal')
        g.log.info('Arequals before deleting data from the bricks '
                   'and after healing are equal')