def test_handling_data_split_brain(self):
        """
        - create IO
        - calculate arequal from mountpoint
        - set volume option 'self-heal-daemon' to value "off"
        - kill data brick1
        - calculate arequal checksum and compare it
        - modify files and directories
        - bring back all bricks processes online
        - kill data brick3
        - modify files and directories
        - calculate arequal from mountpoint
        - bring back all bricks processes online
        - run the find command to trigger heal from mountpoint
        - set volume option 'self-heal-daemon' to value "on"
        - check if heal is completed
        - check for split-brain
        - read files
        - calculate arequal checksum and compare it
        """
        # pylint: disable=too-many-locals,too-many-statements

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Creating files...')
            command = ("cd %s ; "
                       "for i in `seq 1 10` ; "
                       "do mkdir dir.$i ; "
                       "for j in `seq 1 5` ; "
                       "do dd if=/dev/urandom of=dir.$i/file.$j "
                       "bs=1K count=1 ; "
                       "done ; "
                       "dd if=/dev/urandom of=file.$i bs=1k count=1 ; "
                       "done"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal before getting bricks offline
        g.log.info('Getting arequal before getting bricks offline...')
        ret, result_before_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Setting options
        options = {"self-heal-daemon": "off"}
        g.log.info('Setting options %s for volume %s',
                   options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume: %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick list: %s", bricks_list)

        # Bring brick 1 offline
        bricks_to_bring_offline = [bricks_list[0]]
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Get arequal after getting bricks offline
        g.log.info('Getting arequal after getting bricks offline...')
        ret, result_after_offline = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks offline '
                   'is successful')

        # Comparing arequals before getting bricks offline
        # and after getting bricks offline
        self.assertEqual(result_before_offline, result_after_offline,
                         'Arequals before getting bricks offline '
                         'and after getting bricks offline are not equal')
        g.log.info('Arequals before getting bricks offline '
                   'and after getting bricks offline are equal')

        # Modify the data
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Modify files
            g.log.info('Modifying files...')
            command = ("cd %s ; "
                       "for i in `seq 1 10` ; "
                       "do for j in `seq 1 5` ; "
                       "do dd if=/dev/urandom of=dir.$i/file.$j "
                       "bs=1M count=1 ; "
                       "done ; "
                       "dd if=/dev/urandom of=file.$i bs=1M count=1 ; "
                       "done"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Bring 1-st brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Bring brick 3rd offline
        bricks_to_bring_offline = [bricks_list[-1]]
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                        bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create files
            g.log.info('Modifying files...')
            command = ("cd %s ; "
                       "for i in `seq 1 10` ; "
                       "do for j in `seq 1 5` ; "
                       "do dd if=/dev/urandom of=dir.$i/file.$j "
                       "bs=1M count=1 ; "
                       "done ; "
                       "dd if=/dev/urandom of=file.$i bs=1M count=1 ; "
                       "done"
                       % mount_obj.mountpoint)

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal before getting bricks online
        g.log.info('Getting arequal before getting bricks online...')
        ret, result_before_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks online '
                   'is successful')

        # Bring 3rd brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Mount and unmount mounts
        ret = self.unmount_volume(self.mounts)
        self.assertTrue(ret, 'Failed to unmount %s' % self.volname)

        ret = self.mount_volume(self.mounts)
        self.assertTrue(ret, 'Unable to mount %s' % self.volname)

        # Start heal from mount point
        g.log.info('Starting heal from mount point...')
        for mount_obj in self.mounts:
            g.log.info("Start heal for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            command = "/usr/bin/env python %s read %s" % (
                self.script_upload_path,
                self.mounts[0].mountpoint)
            ret, _, err = g.run(mount_obj.client_system, command)
            self.assertFalse(ret, err)
            g.log.info("Heal triggered for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
        g.log.info('Heal triggered for all mountpoints')

        # Enable self-heal daemon
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, 'Successfully started self heal daemon')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Reading files
        g.log.info('Reading files...')
        for mount_obj in self.mounts:
            g.log.info("Start reading files for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            command = ('cd %s/ ; '
                       'for i in `seq 1 10` ; '
                       'do cat file.$i > /dev/null ; '
                       'for j in `seq 1 5` ; '
                       'do cat dir.$i/file.$j > /dev/null ; '
                       'done ; done'
                       % mount_obj.mountpoint)
            ret, _, err = g.run(mount_obj.client_system, command)
            self.assertFalse(ret, err)
            g.log.info("Reading files successfully for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
        g.log.info('Reading files successfully for all mountpoints')

        # Get arequal after getting bricks online
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Comparing arequals before getting bricks online
        # and after getting bricks online
        self.assertEqual(result_before_online, result_after_online,
                         'Arequals before getting bricks online '
                         'and after getting bricks online are not equal')
        g.log.info('Arequals before getting bricks online '
                   'and after getting bricks online are equal')
コード例 #2
0
    def _perform_brick_ops_and_enable_self_heal(self, op_type):
        '''Refactor of steps common to all tests: Brick down and perform
        metadata/data operations'''
        # First brick in the subvol will always be online and used for self
        # heal, so make keys match brick index
        self.op_cmd = {
            # The operation with key `4` in every op_type will be used for
            # final data consistency check
            # Metadata Operations (owner and permission changes)
            'metadata': {
                2:
                '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
                dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
                3:
                '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
                4:
                '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
            },
            # Data Operations (append data to the files)
            'data': {
                2:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 2K >> dir.$i/file.$j; done;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 3K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 3K >> dir.$i/file.$j; done;
                    done;''',
                4:
                '''cd {0}; for i in `seq 1 6`;
                    do {1} 4K >> file.$i;
                    for j in `seq 1 6`;
                    do {1} 4K >> dir.$i/file.$j; done;
                    done;''',
            },
            # Create files and directories when brick is down with no
            # initial IO
            'gfid': {
                2:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K > file.2.$i; mkdir dir.2.$i;
                    for j in `seq 1 3`;
                    do {1} 2K > dir.2.$i/file.2.$j; done;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K > file.3.$i; mkdir dir.3.$i;
                    for j in `seq 1 3`;
                    do {1} 2K > dir.3.$i/file.3.$j; done;
                    done;''',
                4:
                '''cd {0}; for i in `seq 4 6`;
                    do {1} 2K > file.$i; mkdir dir.$i;
                    for j in `seq 4 6`;
                    do {1} 2K > dir.$i/file.$j; done;
                    done;''',
            },
            # Create different file type with same name while a brick was down
            # with no initial IO and validate failure
            'file_type': {
                2:
                'cd {0}; for i in `seq 1 6`; do {1} 2K > notype.$i; done;',
                3:
                'cd {0}; for i in `seq 1 6`; do mkdir -p notype.$i; done;',
                4:
                '''cd {0}; for i in `seq 1 6`;
                    do {1} 2K > file.$i;
                    for j in `seq 1 6`;
                    do mkdir -p dir.$i; {1} 2K > dir.$i/file.$j; done;
                    done;''',
            },
            # Create symlinks for files and directories while a brick was down
            # Out of 6 files, 6 dirs and 6 files in each dir, symlink
            # outer 2 files, inner 2 files in each dir, 2 dirs and
            # verify it's a symlink(-L) and linking file exists(-e)
            'symlink': {
                2:
                '''cd {0}; for i in `seq 1 2`;
                    do ln -sr file.$i sl_file.2.$i;
                    [ -L sl_file.2.$i ] && [ -e sl_file.2.$i ] || exit -1;
                    for j in `seq 1 2`;
                    do ln -sr dir.$i/file.$j dir.$i/sl_file.2.$j; done;
                    [ -L dir.$i/sl_file.2.$j ] && [ -e dir.$i/sl_file.2.$j ] \
                    || exit -1;
                    done; for k in `seq 3 4`; do ln -sr dir.$k sl_dir.2.$k;
                    [ -L sl_dir.2.$k ] && [ -e sl_dir.2.$k ] || exit -1;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 2`;
                    do ln -sr file.$i sl_file.3.$i;
                    [ -L sl_file.3.$i ] && [ -e sl_file.3.$i ] || exit -1;
                    for j in `seq 1 2`;
                    do ln -sr dir.$i/file.$j dir.$i/sl_file.3.$j; done;
                    [ -L dir.$i/sl_file.3.$j ] && [ -e dir.$i/sl_file.3.$j ] \
                    || exit -1;
                    done; for k in `seq 3 4`; do ln -sr dir.$k sl_dir.3.$k;
                    [ -L sl_dir.3.$k ] && [ -e sl_dir.3.$k ] || exit -1;
                    done;''',
                4:
                '''cd {0}; ln -sr dir.4 sl_dir_new.4; mkdir sl_dir_new.4/dir.1;
                    {1} 4K >> sl_dir_new.4/dir.1/test_file;
                    {1} 4K >> sl_dir_new.4/test_file;
                    ''',
            },
        }
        bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(bricks,
                             'Not able to get list of bricks in the volume')

        # Make first brick always online and start operations from second brick
        for index, brick in enumerate(bricks[1:], start=2):

            # Bring brick offline
            ret = bring_bricks_offline(self.volname, brick)
            self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))
            self.assertTrue(
                are_bricks_offline(self.mnode, self.volname, [brick]),
                'Brick {} is not offline'.format(brick))

            # Perform file/dir operation
            cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
            ret, _, err = g.run(self.client, cmd)
            if op_type == 'file_type' and index == 3:
                # Should fail with ENOTCONN as one brick is down, lookupt can't
                # happen and quorum is not met
                self.assertNotEqual(
                    ret, 0, '{0} should fail as lookup fails, quorum is not '
                    'met'.format(cmd))
                self.assertIn(
                    'Transport', err, '{0} should fail with ENOTCONN '
                    'error'.format(cmd))
            else:
                self.assertEqual(ret, 0,
                                 '{0} failed with {1}'.format(cmd, err))
                self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

            # Bring brick online
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                brick,
                bring_bricks_online_methods='volume_start_force')
            self.assertTrue(
                are_bricks_online(self.mnode, self.volname, [brick]),
                'Brick {} is not online'.format(brick))

        # Assert metadata/data operations resulted in pending heals
        self.assertFalse(is_heal_complete(self.mnode, self.volname))

        # Enable and wait self heal daemon to be online
        self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
                        'Enabling self heal daemon failed')
        self.assertTrue(
            wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
            'Not all self heal daemons are online')
コード例 #3
0
    def test_gfid_split_brain_resolution(self):
        """
        Description: Simulates gfid split brain on multiple files in a dir and
        resolve them via `bigger-file`, `mtime` and `source-brick` methods

        Steps:
        - Create and mount a replicated volume, create a dir and ~10 data files
        - Simulate gfid splits in 9 of the files
        - Resolve each 3 set of files using `bigger-file`, `mtime` and
          `source-bricks` split-brain resoultion methods
        - Trigger and monitor for heal completion
        - Validate all the files are healed and arequal matches for bricks in
          subvols
        """
        io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
        client, m_point = (self.mounts[0].client_system,
                           self.mounts[0].mountpoint)
        arbiter = self.volume_type.find('arbiter') >= 0

        # Disable self-heal daemon and set `quorum-type` option to `none`
        ret = set_volume_options(self.mnode, self.volname, {
            'self-heal-daemon': 'off',
            'cluster.quorum-type': 'none'
        })
        self.assertTrue(
            ret, 'Not able to disable `quorum-type` and '
            '`self-heal` daemon volume options')

        # Create required dir and files from the mount
        split_dir = 'gfid_split_dir'
        file_io = ('cd %s; for i in {1..10}; do ' + io_cmd +
                   ' 1M > %s/file$i; done;')
        ret = mkdir(client, '{}/{}'.format(m_point, split_dir))
        self.assertTrue(ret, 'Unable to create a directory from mount point')
        ret, _, _ = g.run(client, file_io % (m_point, split_dir))

        # `file{4,5,6}` are re-created every time to be used in `bigger-file`
        # resolution method
        cmd = 'rm -rf {0}/file{1} && {2} {3}M > {0}/file{1}'
        split_cmds = {
            1:
            ';'.join(cmd.format(split_dir, i, io_cmd, 2) for i in range(1, 7)),
            2:
            ';'.join(cmd.format(split_dir, i, io_cmd, 3) for i in range(4, 7)),
            3: ';'.join(
                cmd.format(split_dir, i, io_cmd, 1) for i in range(4, 10)),
            4: ';'.join(
                cmd.format(split_dir, i, io_cmd, 1) for i in range(7, 10)),
        }

        # Get subvols and simulate entry split brain
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        self.assertTrue(subvols, 'Not able to get list of subvols')
        msg = ('Unable to bring files under {} dir to entry split brain while '
               '{} are down')
        for index, bricks in enumerate(self._get_two_bricks(subvols, arbiter),
                                       1):
            # Bring down two bricks from each subvol
            ret = bring_bricks_offline(self.volname, list(bricks))
            self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks))

            ret, _, _ = g.run(client,
                              'cd {}; {}'.format(m_point, split_cmds[index]))
            self.assertEqual(ret, 0, msg.format(split_dir, bricks))

            # Bricks will be brought down only two times in case of arbiter and
            # bringing remaining files into split brain for `latest-mtime` heal
            if arbiter and index == 2:
                ret, _, _ = g.run(client,
                                  'cd {}; {}'.format(m_point, split_cmds[4]))
                self.assertEqual(ret, 0, msg.format(split_dir, bricks))

            # Bring offline bricks online
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                bricks,
                bring_bricks_online_methods='volume_start_force')
            self.assertTrue(ret, 'Unable to bring {} online'.format(bricks))

        # Enable self-heal daemon, trigger heal and assert volume is in split
        # brain condition
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, 'Failed to enable self heal daemon')

        ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, 'Not all self heal daemons are online')

        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger index heal on the volume')

        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, 'Volume should be in split brain condition')

        # Select source brick and take note of files in source brick
        stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
        source_bricks = [choice(subvol[0:stop]) for subvol in subvols]
        files = [
            self._get_files_in_brick(path, split_dir) for path in source_bricks
        ]

        # Resolve `file1, file2, file3` gfid split files using `source-brick`
        cmd = ('gluster volume heal ' + self.volname + ' split-brain '
               'source-brick {} /' + split_dir + '/{}')
        for index, source_brick in enumerate(source_bricks):
            for each_file in files[index]:
                run_cmd = cmd.format(source_brick, each_file)
                self._run_cmd_and_assert(run_cmd)

        # Resolve `file4, file5, file6` gfid split files using `bigger-file`
        cmd = ('gluster volume heal ' + self.volname +
               ' split-brain bigger-file /' + split_dir + '/{}')
        for each_file in ('file4', 'file5', 'file6'):
            run_cmd = cmd.format(each_file)
            self._run_cmd_and_assert(run_cmd)

        # Resolve `file7, file8, file9` gfid split files using `latest-mtime`
        cmd = ('gluster volume heal ' + self.volname +
               ' split-brain latest-mtime /' + split_dir + '/{}')
        for each_file in ('file7', 'file8', 'file9'):
            run_cmd = cmd.format(each_file)
            self._run_cmd_and_assert(run_cmd)

        # Unless `shd` is triggered manually/automatically files will still
        # appear in `heal info`
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger full self heal')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, 'All files in volume should be healed after healing files via'
            ' `source-brick`, `bigger-file`, `latest-mtime` methods manually')

        # Validate normal file `file10` and healed files don't differ in
        # subvols via an `arequal`
        for subvol in subvols:
            # Disregard last brick if volume is of arbiter type
            ret, arequal = collect_bricks_arequal(subvol[0:stop])
            self.assertTrue(
                ret, 'Unable to get `arequal` checksum on '
                '{}'.format(subvol[0:stop]))
            self.assertEqual(
                len(set(arequal)), 1, 'Mismatch of `arequal` '
                'checksum among {} is identified'.format(subvol[0:stop]))

        g.log.info('Pass: Resolution of gfid split-brain via `source-brick`, '
                   '`bigger-file` and `latest-mtime` methods is complete')
    def test_afr_gfid_heal(self):
        """
        Description: This test case runs split-brain resolution
                     on a 5 files in split-brain on a 1x2 volume.
                     After resolving split-brain, it makes sure that
                     split brain resolution doesn't work on files
                     already in split brain.
        """

        g.log.info("disabling the self heal daemon")
        ret = disable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, "unable to disable self heal daemon")
        g.log.info("Successfully disabled the self heal daemon")

        # getting list of all bricks
        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "failed to get list of bricks")
        g.log.info("bringing down brick1")
        ret = bring_bricks_offline(self.volname, all_bricks[0:1])
        self.assertTrue(ret, "unable to bring brick1 offline")
        g.log.info("Successfully brought the following brick offline "
                   ": %s", str(all_bricks[0]))
        g.log.info("verifying if brick1 is offline")
        ret = are_bricks_offline(self.mnode, self.volname, all_bricks[0:1])
        self.assertTrue(ret, "brick1 is still online")
        g.log.info("verified: brick1 is offline")

        g.log.info("creating 5 files from mount point")
        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("python %s create_files "
                   "-f 5 --base-file-name test_file --fixed-file-size 1k %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
        # Validate I/O
        g.log.info("Wait for IO to complete and validate IO.....")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")
        g.log.info("Successfully created a file from mount point")

        g.log.info("bringing brick 1 back online")
        ret = bring_bricks_online(self.mnode, self.volname, all_bricks[0:1])
        self.assertIsNotNone(ret, "unable to bring brick 1 online")
        g.log.info("Successfully brought the following brick online "
                   ": %s", str(all_bricks[0]))
        g.log.info("verifying if brick1 is online")
        ret = are_bricks_online(self.mnode, self.volname, all_bricks[0:1])
        self.assertTrue(ret, "brick1 is not online")
        g.log.info("verified: brick1 is online")

        g.log.info("bringing down brick2")
        ret = bring_bricks_offline(self.volname, all_bricks[1:2])
        self.assertTrue(ret, "unable to bring brick2 offline")
        g.log.info("Successfully brought the following brick offline "
                   ": %s", str(all_bricks[1]))
        g.log.info("verifying if brick2 is offline")
        ret = are_bricks_offline(self.mnode, self.volname, all_bricks[1:2])
        self.assertTrue(ret, "brick2 is still online")
        g.log.info("verified: brick2 is offline")

        g.log.info("creating 5 new files of same name from mount point")
        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("python %s create_files "
                   "-f 5 --base-file-name test_file --fixed-file-size 10k %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
        # Validate I/O
        g.log.info("Wait for IO to complete and validate IO.....")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")
        g.log.info("Successfully created a new file of same name "
                   "from mount point")

        g.log.info("bringing brick2 back online")
        ret = bring_bricks_online(self.mnode, self.volname, all_bricks[1:2])
        self.assertIsNotNone(ret, "unable to bring brick2 online")
        g.log.info("Successfully brought the following brick online "
                   ": %s", str(all_bricks[1]))
        g.log.info("verifying if brick2 is online")
        ret = are_bricks_online(self.mnode, self.volname, all_bricks[1:2])
        self.assertTrue(ret, "brick2 is not online")
        g.log.info("verified: brick2 is online")

        g.log.info("enabling the self heal daemon")
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, "failed to enable self heal daemon")
        g.log.info("Successfully enabled the self heal daemon")

        g.log.info("checking if volume is in split-brain")
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, "unable to create split-brain scenario")
        g.log.info("Successfully created split brain scenario")

        g.log.info("resolving split-brain by choosing first brick as "
                   "the source brick")
        node, brick_path = all_bricks[0].split(':')
        for fcount in range(5):
            command = ("gluster v heal " + self.volname + " split-brain "
                       "source-brick " + all_bricks[0] + ' /test_file' +
                       str(fcount) + '.txt')
            ret, _, _ = g.run(node, command)
            self.assertEqual(ret, 0, "command execution not successful")
        # triggering heal
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, "heal not triggered")
        g.log.info("Successfully triggered heal")
        # waiting for heal to complete
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=240)
        self.assertTrue(ret, "heal not completed")
        g.log.info("Heal completed successfully")
        # checking if any file is in split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, "file still in split-brain")
        g.log.info("Successfully resolved split brain situation using "
                   "CLI based resolution")

        g.log.info("resolving split-brain on a file not in split-brain")
        node, brick_path = all_bricks[0].split(':')
        command = ("gluster v heal " + self.volname + " split-brain "
                   "source-brick " + all_bricks[1] + " /test_file0.txt")
        ret, _, _ = g.run(node, command)
        self.assertNotEqual(
            ret, 0, "Unexpected: split-brain resolution "
            "command is successful on a file which"
            " is not in split-brain")
        g.log.info("Expected: split-brian resolution command failed on "
                   "a file which is not in split-brain")

        g.log.info("checking the split-brain status of each file")
        for fcount in range(5):
            fpath = (self.mounts[0].mountpoint + '/test_file' + str(fcount) +
                     '.txt')
            status = get_fattr(self.mounts[0].client_system, fpath,
                               'replica.split-brain-status')
            compare_string = ("The file is not under data or metadata "
                              "split-brain")
            self.assertEqual(
                status.rstrip('\x00'), compare_string,
                "file test_file%s is under"
                " split-brain" % str(fcount))
        g.log.info("none of the files are under split-brain")
コード例 #5
0
    def _perform_brick_ops_and_enable_self_heal(self, op_type):
        '''Refactor of steps common to all tests: Brick down and perform
        metadata/data operations'''
        # First brick in the subvol will always be online and used for self
        # heal, so make keys match brick index
        self.op_cmd = {
            # Metadata Operations (owner and permission changes)
            'metadata': {
                2:
                '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
                dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
                3:
                '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
                # 4 - Will be used for final data consistency check
                4:
                '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
            },
            # Data Operations (append data to the files)
            'data': {
                2:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 2K >> dir.$i/file.$j; done;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 3K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 3K >> dir.$i/file.$j; done;
                    done;''',
                # 4 - Will be used for final data consistency check
                4:
                '''cd {0}; for i in `seq 1 6`;
                    do {1} 4K >> file.$i;
                    for j in `seq 1 6`;
                    do {1} 4K >> dir.$i/file.$j; done;
                    done;''',
            },
        }
        bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(bricks,
                             'Not able to get list of bricks in the volume')

        # Make first brick always online and start operations from second brick
        for index, brick in enumerate(bricks[1:], start=2):

            # Bring brick offline
            ret = bring_bricks_offline(self.volname, brick)
            self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks))

            # Perform metadata/data operation
            cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
            ret, _, err = g.run(self.client, cmd)
            self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
            self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

            # Bring brick online
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                brick,
                bring_bricks_online_methods='volume_start_force')

        # Assert metadata/data operations resulted in pending heals
        self.assertFalse(is_heal_complete(self.mnode, self.volname))

        # Enable and wait self heal daemon to be online
        self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
                        'Enabling self heal daemon failed')
        self.assertTrue(
            wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
            'Not all self heal daemons are online')
コード例 #6
0
    def test_afr_gfid_heal(self):
        """
        Description: This test case runs split-brain resolution CLIs
                     on a file in gfid split-brain on 1x2 volume.
                     1. kill 1 brick
                     2. create a file at mount point
                     3. bring back the killed brick
                     4. kill the other brick
                     5. create same file at mount point
                     6. bring back the killed brick
                     7. try heal from CLI and check if it gets completed
        """

        g.log.info("disabling the self heal daemon")
        ret = disable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, "unable to disable self heal daemon")
        g.log.info("Successfully disabled the self heal daemon")

        # getting list of all bricks
        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "unable to get list of bricks")
        g.log.info("bringing down brick1")
        ret = bring_bricks_offline(self.volname, all_bricks[0])
        self.assertTrue(ret, "unable to bring %s offline" % all_bricks[0])
        g.log.info("Successfully brought the following brick offline "
                   ": %s", all_bricks[0])

        g.log.info("creating a file from mount point")
        all_mounts_procs = []
        cmd = ("/usr/bin/env python %s create_files "
               "-f 1 --base-file-name test_file --fixed-file-size 1k %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system, cmd)
        all_mounts_procs.append(proc)
        # Validate I/O
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        g.log.info("Successfully created a file from mount point")

        g.log.info("bringing brick 1 back online")
        ret = bring_bricks_online(self.mnode, self.volname, [all_bricks[0]])
        self.assertIsNotNone(ret, "unable to bring %s online" % all_bricks[0])
        g.log.info("Successfully brought the following brick online "
                   ": %s", all_bricks[0])

        g.log.info("bringing down brick2")
        ret = bring_bricks_offline(self.volname, all_bricks[1])
        self.assertTrue(ret, "unable to bring %s offline" % all_bricks[0])
        g.log.info("Successfully brought the following brick offline "
                   ": %s", all_bricks[1])

        g.log.info("creating a new file of same name from mount point")
        all_mounts_procs = []
        cmd = ("/usr/bin/env python %s create_files "
               "-f 1 --base-file-name test_file --fixed-file-size 1k %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system, cmd)
        all_mounts_procs.append(proc)
        # Validate I/O
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        g.log.info("Successfully created a new file of same name "
                   "from mount point")

        g.log.info("bringing brick2 back online")
        ret = bring_bricks_online(self.mnode, self.volname, [all_bricks[1]])
        self.assertIsNotNone(ret, "unable to bring %s online" % all_bricks[0])
        g.log.info("Successfully brought the following brick online "
                   ": %s", all_bricks[1])

        g.log.info("enabling the self heal daemon")
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, "failed to enable self heal daemon")
        g.log.info("Successfully enabled the self heal daemon")

        g.log.info("checking if file is in split-brain")
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, "unable to create split-brain scenario")
        g.log.info("Successfully created split brain scenario")

        g.log.info("resolving split-brain by choosing second brick as "
                   "the source brick")
        node, _ = all_bricks[0].split(':')
        command = ("gluster volume heal %s split-brain source-brick %s "
                   "/test_file0.txt" % (self.volname, all_bricks[1]))
        ret, _, _ = g.run(node, command)
        self.assertEqual(ret, 0, "command execution not successful")
        # triggering heal
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, "heal not triggered")
        # waiting for heal to complete
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=120)
        self.assertTrue(ret, "heal not completed")
        # checking if file is in split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, "file still in split-brain")
        g.log.info("Successfully resolved split brain situation using "
                   "CLI based resolution")
コード例 #7
0
    def test_gfid_split_brain_resolution(self):
        """
        - create gfid split-brain of files and resolves them using source-brick
          option of the CLI.
        """

        # pylint: disable=too-many-statements
        # pylint: disable=too-many-locals

        # Disable all self-heals and client-quorum
        options = {
            "self-heal-daemon": "off",
            "data-self-heal": "off",
            "metadata-self-heal": "off",
            "entry-self-heal": "off",
            "cluster.quorum-type": "none"
        }
        g.log.info("setting volume options %s", options)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, ("Unable to set volume option %s for "
                              "volume %s" % (options, self.volname)))
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Create dir inside which I/O will be performed.
        ret = mkdir(self.mounts[0].client_system,
                    "%s/test_gfid_split_brain" % self.mounts[0].mountpoint)
        self.assertTrue(ret, "mkdir failed")

        # get the subvolumes
        g.log.info("Starting to get sub-volumes for volume %s", self.volname)
        subvols_dict = get_subvols(self.mnode, self.volname)
        num_subvols = len(subvols_dict['volume_subvols'])
        g.log.info("Number of subvolumes in volume %s:", num_subvols)

        # Toggle bricks and perform I/O
        file_list = [
            "file1.txt", "file2.txt", "file3.txt", "file4.txt", "file5.txt",
            "file6.txt", "file7.txt", "file8.txt", "file9.txt", "file10.txt"
        ]
        brick_index = 0
        offline_bricks = []
        for _ in range(0, 3):
            for i in range(0, num_subvols):
                subvol_brick_list = subvols_dict['volume_subvols'][i]
                offline_bricks.append(subvol_brick_list[brick_index % 3])
                offline_bricks.append(subvol_brick_list[(brick_index + 1) % 3])
            self.toggle_bricks_and_perform_io(file_list, offline_bricks)
            brick_index += 1
            offline_bricks[:] = []

        # Enable shd
        g.log.info("enabling the self heal daemon")
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, "failed to enable self heal daemon")
        g.log.info("Successfully enabled the self heal daemon")

        # Wait for self heal processes to come online
        g.log.info("Wait for selfheal process to come online")
        timeout = 300
        ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname,
                                                      timeout)
        self.assertTrue(ret, "Self-heal process are not online")
        g.log.info("All self heal process are online")

        # Trigger heal
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Starting heal failed')
        g.log.info('Index heal launched')

        # checking if file is in split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, "Files are not in split-brain as expected.")
        g.log.info("Files are still in split-brain")

        # First brick of each replica will be used as source-brick
        first_brick_list = []
        for i in range(0, num_subvols):
            subvol_brick_list = subvols_dict['volume_subvols'][i]
            brick = subvol_brick_list[0]
            first_brick_list.append(brick)

        # Find which dht subvols the 10 files are present in and trigger heal
        for filename in file_list:
            fpath = self.mounts[0].mountpoint + "/test_gfid_split_brain/" + \
                    filename
            gfile = GlusterFile(self.clients[0], fpath)
            for brick in first_brick_list:
                _, brick_path = brick.split(':')
                match = [
                    brick for item in gfile.hashed_bricks if brick_path in item
                ]
                if match:
                    self.resolve_gfid_split_brain(
                        "/test_gfid_split_brain/" + filename, brick)

        # Trigger heal to complete pending data/metadata heals
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Starting heal failed')
        g.log.info('Index heal launched')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Get arequals and compare
        for i in range(0, num_subvols):
            # Get arequal for first brick
            subvol_brick_list = subvols_dict['volume_subvols'][i]
            node, brick_path = subvol_brick_list[0].split(':')
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan' % brick_path)
            ret, arequal, _ = g.run(node, command)
            first_brick_total = arequal.splitlines()[-1].split(':')[-1]

            # Get arequal for every brick and compare with first brick
            for brick in subvol_brick_list[1:]:
                node, brick_path = brick.split(':')
                command = ('arequal-checksum -p %s '
                           '-i .glusterfs -i .landfill -i .trashcan' %
                           brick_path)
                ret, brick_arequal, _ = g.run(node, command)
                self.assertFalse(ret,
                                 'Failed to get arequal on brick %s' % brick)
                g.log.info('Getting arequal for %s is successful', brick)
                brick_total = brick_arequal.splitlines()[-1].split(':')[-1]

                self.assertEqual(
                    first_brick_total, brick_total,
                    'Arequals for subvol and %s are not equal' % brick)
                g.log.info('Arequals for subvol and %s are equal', brick)