def test_handling_data_split_brain(self):
    """
    - create IO
    - calculate arequal from mountpoint
    - set volume option 'self-heal-daemon' to value "off"
    - kill data brick1
    - calculate arequal checksum and compare it
    - modify files and directories
    - bring back all brick processes online
    - kill data brick3
    - modify files and directories
    - calculate arequal from mountpoint
    - bring back all brick processes online
    - trigger heal from the mount point by reading the files
    - set volume option 'self-heal-daemon' to value "on"
    - check if heal is completed
    - check for split-brain
    - read files
    - calculate arequal checksum and compare it
    """
    # pylint: disable=too-many-locals,too-many-statements

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)

        # Create files
        g.log.info('Creating files...')
        command = ("cd %s ; "
                   "for i in `seq 1 10` ; "
                   "do mkdir dir.$i ; "
                   "for j in `seq 1 5` ; "
                   "do dd if=/dev/urandom of=dir.$i/file.$j "
                   "bs=1K count=1 ; "
                   "done ; "
                   "dd if=/dev/urandom of=file.$i bs=1k count=1 ; "
                   "done"
                   % mount_obj.mountpoint)

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Setting options
    options = {"self-heal-daemon": "off"}
    g.log.info('Setting options %s for volume %s', options, self.volname)
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Get the bricks of the volume
    g.log.info("Fetching bricks for the volume: %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list: %s", bricks_list)

    # Bring brick 1 offline
    bricks_to_bring_offline = [bricks_list[0]]
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Compare arequals before and after getting bricks offline
    self.assertEqual(result_before_offline, result_after_offline,
                     'Arequals before and after getting bricks offline '
                     'are not equal')
    g.log.info('Arequals before and after getting bricks offline '
               'are equal')

    # Modify the data
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)

        # Modify files
        g.log.info('Modifying files...')
        command = ("cd %s ; "
                   "for i in `seq 1 10` ; "
                   "do for j in `seq 1 5` ; "
                   "do dd if=/dev/urandom of=dir.$i/file.$j "
                   "bs=1M count=1 ; "
                   "done ; "
                   "dd if=/dev/urandom of=file.$i bs=1M count=1 ; "
                   "done"
                   % mount_obj.mountpoint)

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Bring brick 1 online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Bring brick 3 (the last brick) offline
    bricks_to_bring_offline = [bricks_list[-1]]
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Modify the data again
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)

        # Modify files
        g.log.info('Modifying files...')
        command = ("cd %s ; "
                   "for i in `seq 1 10` ; "
                   "do for j in `seq 1 5` ; "
                   "do dd if=/dev/urandom of=dir.$i/file.$j "
                   "bs=1M count=1 ; "
                   "done ; "
                   "dd if=/dev/urandom of=file.$i bs=1M count=1 ; "
                   "done"
                   % mount_obj.mountpoint)

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick 3 online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Unmount and mount the volume again
    ret = self.unmount_volume(self.mounts)
    self.assertTrue(ret, 'Failed to unmount %s' % self.volname)
    ret = self.mount_volume(self.mounts)
    self.assertTrue(ret, 'Unable to mount %s' % self.volname)

    # Start heal from mount point by reading all the files
    g.log.info('Starting heal from mount point...')
    for mount_obj in self.mounts:
        g.log.info("Start heal for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        command = ("/usr/bin/env python %s read %s"
                   % (self.script_upload_path, mount_obj.mountpoint))
        ret, _, err = g.run(mount_obj.client_system, command)
        self.assertFalse(ret, err)
        g.log.info("Heal triggered for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
    g.log.info('Heal triggered for all mountpoints')

    # Enable self-heal daemon
    ret = enable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, 'Failed to enable self heal daemon')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Read files
    g.log.info('Reading files...')
    for mount_obj in self.mounts:
        g.log.info("Start reading files for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        command = ('cd %s/ ; '
                   'for i in `seq 1 10` ; '
                   'do cat file.$i > /dev/null ; '
                   'for j in `seq 1 5` ; '
                   'do cat dir.$i/file.$j > /dev/null ; '
                   'done ; done'
                   % mount_obj.mountpoint)
        ret, _, err = g.run(mount_obj.client_system, command)
        self.assertFalse(ret, err)
        g.log.info("Reading files successfully for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
    g.log.info('Reading files successfully for all mountpoints')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Compare arequals before and after getting bricks online
    self.assertEqual(result_before_online, result_after_online,
                     'Arequals before and after getting bricks online '
                     'are not equal')
    g.log.info('Arequals before and after getting bricks online '
               'are equal')
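# A minimal illustrative helper (not part of the original test; the name
# `_compare_mount_arequals` is hypothetical) that factors out the
# "collect arequals from mounts and compare against a baseline" pattern
# the test above repeats for the offline and online checkpoints:
def _compare_mount_arequals(self, baseline, stage):
    """Collect arequals from all mounts and assert they match baseline."""
    ret, current = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal %s' % stage)
    self.assertEqual(baseline, current,
                     'Arequals %s do not match the baseline' % stage)
    g.log.info('Arequals %s match the baseline', stage)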
def _perform_brick_ops_and_enable_self_heal(self, op_type):
    '''Refactor of steps common to all tests: bring a brick down and
    perform metadata/data operations'''
    # First brick in the subvol will always be online and used for self
    # heal, so make keys match the brick index
    self.op_cmd = {
        # The operation with key `4` in every op_type will be used for
        # the final data consistency check

        # Metadata operations (owner and permission changes)
        'metadata': {
            2:
            '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
            dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
            3:
            '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
            dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
            4:
            '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
            dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
        },
        # Data operations (append data to the files)
        'data': {
            2:
            '''cd {0}; for i in `seq 1 3`;
            do {1} 2K >> file.$i;
            for j in `seq 1 3`;
            do {1} 2K >> dir.$i/file.$j; done;
            done;''',
            3:
            '''cd {0}; for i in `seq 1 3`;
            do {1} 3K >> file.$i;
            for j in `seq 1 3`;
            do {1} 3K >> dir.$i/file.$j; done;
            done;''',
            4:
            '''cd {0}; for i in `seq 1 6`;
            do {1} 4K >> file.$i;
            for j in `seq 1 6`;
            do {1} 4K >> dir.$i/file.$j; done;
            done;''',
        },
        # Create files and directories when brick is down with no
        # initial IO
        'gfid': {
            2:
            '''cd {0}; for i in `seq 1 3`;
            do {1} 2K > file.2.$i; mkdir dir.2.$i;
            for j in `seq 1 3`;
            do {1} 2K > dir.2.$i/file.2.$j; done;
            done;''',
            3:
            '''cd {0}; for i in `seq 1 3`;
            do {1} 2K > file.3.$i; mkdir dir.3.$i;
            for j in `seq 1 3`;
            do {1} 2K > dir.3.$i/file.3.$j; done;
            done;''',
            4:
            '''cd {0}; for i in `seq 4 6`;
            do {1} 2K > file.$i; mkdir dir.$i;
            for j in `seq 4 6`;
            do {1} 2K > dir.$i/file.$j; done;
            done;''',
        },
        # Create a different file type with the same name while a brick
        # was down with no initial IO and validate failure
        'file_type': {
            2: 'cd {0}; for i in `seq 1 6`; do {1} 2K > notype.$i; done;',
            3: 'cd {0}; for i in `seq 1 6`; do mkdir -p notype.$i; done;',
            4:
            '''cd {0}; for i in `seq 1 6`;
            do {1} 2K > file.$i;
            for j in `seq 1 6`;
            do mkdir -p dir.$i; {1} 2K > dir.$i/file.$j; done;
            done;''',
        },
        # Create symlinks for files and directories while a brick was down
        # Out of 6 files, 6 dirs and 6 files in each dir, symlink
        # outer 2 files, inner 2 files in each dir, 2 dirs and
        # verify it's a symlink(-L) and the linked file exists(-e)
        'symlink': {
            2:
            '''cd {0}; for i in `seq 1 2`;
            do ln -sr file.$i sl_file.2.$i;
            [ -L sl_file.2.$i ] && [ -e sl_file.2.$i ] || exit -1;
            for j in `seq 1 2`;
            do ln -sr dir.$i/file.$j dir.$i/sl_file.2.$j; done;
            [ -L dir.$i/sl_file.2.$j ] && [ -e dir.$i/sl_file.2.$j ] || \
            exit -1;
            done;
            for k in `seq 3 4`; do ln -sr dir.$k sl_dir.2.$k;
            [ -L sl_dir.2.$k ] && [ -e sl_dir.2.$k ] || exit -1;
            done;''',
            3:
            '''cd {0}; for i in `seq 1 2`;
            do ln -sr file.$i sl_file.3.$i;
            [ -L sl_file.3.$i ] && [ -e sl_file.3.$i ] || exit -1;
            for j in `seq 1 2`;
            do ln -sr dir.$i/file.$j dir.$i/sl_file.3.$j; done;
            [ -L dir.$i/sl_file.3.$j ] && [ -e dir.$i/sl_file.3.$j ] || \
            exit -1;
            done;
            for k in `seq 3 4`; do ln -sr dir.$k sl_dir.3.$k;
            [ -L sl_dir.3.$k ] && [ -e sl_dir.3.$k ] || exit -1;
            done;''',
            4:
            '''cd {0}; ln -sr dir.4 sl_dir_new.4; mkdir sl_dir_new.4/dir.1;
            {1} 4K >> sl_dir_new.4/dir.1/test_file;
            {1} 4K >> sl_dir_new.4/test_file;''',
        },
    }
    bricks = get_online_bricks_list(self.mnode, self.volname)
    self.assertIsNotNone(bricks,
                         'Not able to get list of bricks in the volume')

    # Make the first brick always online and start operations from the
    # second brick
    for index, brick in enumerate(bricks[1:], start=2):

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, brick)
        self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))
        self.assertTrue(
            are_bricks_offline(self.mnode, self.volname, [brick]),
            'Brick {} is not offline'.format(brick))

        # Perform file/dir operation
        cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
        ret, _, err = g.run(self.client, cmd)
        if op_type == 'file_type' and index == 3:
            # Should fail with ENOTCONN as one brick is down, lookup
            # can't happen and quorum is not met
            self.assertNotEqual(
                ret, 0, '{0} should fail as lookup fails and quorum is '
                'not met'.format(cmd))
            self.assertIn(
                'Transport', err,
                '{0} should fail with ENOTCONN error'.format(cmd))
        else:
            self.assertEqual(ret, 0,
                             '{0} failed with {1}'.format(cmd, err))
            self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

        # Bring brick online
        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            brick,
            bring_bricks_online_methods='volume_start_force')
        self.assertTrue(ret, 'Unable to bring {} online'.format(brick))
        self.assertTrue(
            are_bricks_online(self.mnode, self.volname, [brick]),
            'Brick {} is not online'.format(brick))

    # Assert metadata/data operations resulted in pending heals
    self.assertFalse(is_heal_complete(self.mnode, self.volname))

    # Enable and wait for the self heal daemon to be online
    self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
                    'Enabling self heal daemon failed')
    self.assertTrue(
        wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
        'Not all self heal daemons are online')
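# For reference, the command templates above are expanded with
# `str.format(self.fqpath, self.io_cmd)`. Assuming, for illustration,
# that `self.io_cmd` is the random-printable-data generator used in
# `test_gfid_split_brain_resolution` below and that `self.fqpath` is a
# mount path such as /mnt/testvol (both illustrative here), the 'data'
# command for brick index 2 expands to a shell loop along these lines:
#
#   cd /mnt/testvol;
#   for i in `seq 1 3`; do
#       cat /dev/urandom | tr -dc [:space:][:print:] | head -c 2K >> file.$i;
#       for j in `seq 1 3`; do
#           cat /dev/urandom | tr -dc [:space:][:print:] | \
#               head -c 2K >> dir.$i/file.$j;
#       done;
#   done;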
def test_gfid_split_brain_resolution(self):
    """
    Description: Simulates gfid split brain on multiple files in a dir
    and resolves them via `bigger-file`, `latest-mtime` and
    `source-brick` methods

    Steps:
    - Create and mount a replicated volume, create a dir and ~10 data
      files
    - Simulate gfid splits in 9 of the files
    - Resolve each set of 3 files using `source-brick`, `bigger-file`
      and `latest-mtime` split-brain resolution methods
    - Trigger and monitor heal completion
    - Validate all the files are healed and arequal matches for bricks
      in subvols
    """
    io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
    client, m_point = (self.mounts[0].client_system,
                       self.mounts[0].mountpoint)
    arbiter = self.volume_type.find('arbiter') >= 0

    # Disable self-heal daemon and set `quorum-type` option to `none`
    ret = set_volume_options(self.mnode, self.volname, {
        'self-heal-daemon': 'off',
        'cluster.quorum-type': 'none'
    })
    self.assertTrue(ret, 'Not able to disable `quorum-type` and '
                    '`self-heal` daemon volume options')

    # Create required dir and files from the mount
    split_dir = 'gfid_split_dir'
    file_io = ('cd %s; for i in {1..10}; do ' + io_cmd +
               ' 1M > %s/file$i; done;')
    ret = mkdir(client, '{}/{}'.format(m_point, split_dir))
    self.assertTrue(ret, 'Unable to create a directory from mount point')
    ret, _, _ = g.run(client, file_io % (m_point, split_dir))
    self.assertEqual(ret, 0, 'Unable to create initial data files')

    # `file{4,5,6}` are re-created every time to be used in `bigger-file`
    # resolution method
    cmd = 'rm -rf {0}/file{1} && {2} {3}M > {0}/file{1}'
    split_cmds = {
        1: ';'.join(cmd.format(split_dir, i, io_cmd, 2)
                    for i in range(1, 7)),
        2: ';'.join(cmd.format(split_dir, i, io_cmd, 3)
                    for i in range(4, 7)),
        3: ';'.join(cmd.format(split_dir, i, io_cmd, 1)
                    for i in range(4, 10)),
        4: ';'.join(cmd.format(split_dir, i, io_cmd, 1)
                    for i in range(7, 10)),
    }

    # Get subvols and simulate entry split brain
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    self.assertTrue(subvols, 'Not able to get list of subvols')
    msg = ('Unable to bring files under {} dir to entry split brain '
           'while {} are down')
    for index, bricks in enumerate(self._get_two_bricks(subvols, arbiter),
                                   1):
        # Bring down two bricks from each subvol
        ret = bring_bricks_offline(self.volname, list(bricks))
        self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks))

        ret, _, _ = g.run(client,
                          'cd {}; {}'.format(m_point, split_cmds[index]))
        self.assertEqual(ret, 0, msg.format(split_dir, bricks))

        # Bricks will be brought down only two times in case of arbiter,
        # so bring the remaining files into split brain for the
        # `latest-mtime` heal in the same round
        if arbiter and index == 2:
            ret, _, _ = g.run(client,
                              'cd {}; {}'.format(m_point, split_cmds[4]))
            self.assertEqual(ret, 0, msg.format(split_dir, bricks))

        # Bring offline bricks online
        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            bricks,
            bring_bricks_online_methods='volume_start_force')
        self.assertTrue(ret, 'Unable to bring {} online'.format(bricks))

    # Enable self-heal daemon, trigger heal and assert volume is in
    # split brain condition
    ret = enable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, 'Failed to enable self heal daemon')

    ret = wait_for_self_heal_daemons_to_be_online(self.mnode,
                                                  self.volname)
    self.assertTrue(ret, 'Not all self heal daemons are online')

    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger index heal on the volume')

    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, 'Volume should be in split brain condition')

    # Select source brick and take note of files in source brick
    stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
    source_bricks = [choice(subvol[0:stop]) for subvol in subvols]
    files = [
        self._get_files_in_brick(path, split_dir)
        for path in source_bricks
    ]

    # Resolve `file1, file2, file3` gfid split files using `source-brick`
    cmd = ('gluster volume heal ' + self.volname + ' split-brain '
           'source-brick {} /' + split_dir + '/{}')
    for index, source_brick in enumerate(source_bricks):
        for each_file in files[index]:
            run_cmd = cmd.format(source_brick, each_file)
            self._run_cmd_and_assert(run_cmd)

    # Resolve `file4, file5, file6` gfid split files using `bigger-file`
    cmd = ('gluster volume heal ' + self.volname +
           ' split-brain bigger-file /' + split_dir + '/{}')
    for each_file in ('file4', 'file5', 'file6'):
        run_cmd = cmd.format(each_file)
        self._run_cmd_and_assert(run_cmd)

    # Resolve `file7, file8, file9` gfid split files using `latest-mtime`
    cmd = ('gluster volume heal ' + self.volname +
           ' split-brain latest-mtime /' + split_dir + '/{}')
    for each_file in ('file7', 'file8', 'file9'):
        run_cmd = cmd.format(each_file)
        self._run_cmd_and_assert(run_cmd)

    # Unless `shd` is triggered manually/automatically files will still
    # appear in `heal info`
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Unable to trigger full self heal')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(
        ret, 'All files in volume should be healed after healing files '
        'via `source-brick`, `bigger-file` and `latest-mtime` methods '
        'manually')

    # Validate normal file `file10` and healed files don't differ in
    # subvols via an `arequal`
    for subvol in subvols:
        # Disregard the last brick if the volume is of arbiter type
        ret, arequal = collect_bricks_arequal(subvol[0:stop])
        self.assertTrue(
            ret, 'Unable to get `arequal` checksum on '
            '{}'.format(subvol[0:stop]))
        self.assertEqual(
            len(set(arequal)), 1, 'Mismatch of `arequal` checksum among '
            '{} is identified'.format(subvol[0:stop]))

    g.log.info('Pass: Resolution of gfid split-brain via `source-brick`, '
               '`bigger-file` and `latest-mtime` methods is complete')
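# `self._get_two_bricks` is defined elsewhere in this class; the sketch
# below is a hypothetical reconstruction (name `_get_two_bricks_sketch`
# to avoid shadowing the real helper) showing one plausible shape for a
# single replica subvol: yield a random pair of data bricks per
# split-brain round, running only two rounds and never selecting the
# arbiter (last) brick when the volume is of arbiter type.
def _get_two_bricks_sketch(self, subvols, arbiter):
    """Yield two bricks of the first subvol per round of brick kills."""
    from random import sample

    rounds = 2 if arbiter else 4
    stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
    for _ in range(rounds):
        yield tuple(sample(subvols[0][0:stop], 2))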
def test_afr_gfid_heal(self):
    """
    Description: This test case runs split-brain resolution on 5 files
    in split-brain on a 1x2 volume. After resolving split-brain, it
    makes sure that split-brain resolution doesn't work on files that
    are not in split-brain.
    """
    g.log.info("disabling the self heal daemon")
    ret = disable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, "unable to disable self heal daemon")
    g.log.info("Successfully disabled the self heal daemon")

    # getting the list of all bricks
    all_bricks = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(all_bricks, "failed to get list of bricks")

    g.log.info("bringing down brick1")
    ret = bring_bricks_offline(self.volname, all_bricks[0:1])
    self.assertTrue(ret, "unable to bring brick1 offline")
    g.log.info("Successfully brought the following brick offline "
               ": %s", str(all_bricks[0]))

    g.log.info("verifying if brick1 is offline")
    ret = are_bricks_offline(self.mnode, self.volname, all_bricks[0:1])
    self.assertTrue(ret, "brick1 is still online")
    g.log.info("verified: brick1 is offline")

    g.log.info("creating 5 files from mount point")
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("python %s create_files "
               "-f 5 --base-file-name test_file --fixed-file-size 1k %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate I/O
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")
    g.log.info("Successfully created 5 files from mount point")

    g.log.info("bringing brick1 back online")
    ret = bring_bricks_online(self.mnode, self.volname, all_bricks[0:1])
    self.assertTrue(ret, "unable to bring brick1 online")
    g.log.info("Successfully brought the following brick online "
               ": %s", str(all_bricks[0]))

    g.log.info("verifying if brick1 is online")
    ret = are_bricks_online(self.mnode, self.volname, all_bricks[0:1])
    self.assertTrue(ret, "brick1 is not online")
    g.log.info("verified: brick1 is online")

    g.log.info("bringing down brick2")
    ret = bring_bricks_offline(self.volname, all_bricks[1:2])
    self.assertTrue(ret, "unable to bring brick2 offline")
    g.log.info("Successfully brought the following brick offline "
               ": %s", str(all_bricks[1]))

    g.log.info("verifying if brick2 is offline")
    ret = are_bricks_offline(self.mnode, self.volname, all_bricks[1:2])
    self.assertTrue(ret, "brick2 is still online")
    g.log.info("verified: brick2 is offline")

    g.log.info("creating 5 new files of the same name from mount point")
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("python %s create_files "
               "-f 5 --base-file-name test_file --fixed-file-size 10k %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate I/O
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")
    g.log.info("Successfully created 5 new files of the same name "
               "from mount point")

    g.log.info("bringing brick2 back online")
    ret = bring_bricks_online(self.mnode, self.volname, all_bricks[1:2])
    self.assertTrue(ret, "unable to bring brick2 online")
    g.log.info("Successfully brought the following brick online "
               ": %s", str(all_bricks[1]))

    g.log.info("verifying if brick2 is online")
    ret = are_bricks_online(self.mnode, self.volname, all_bricks[1:2])
    self.assertTrue(ret, "brick2 is not online")
    g.log.info("verified: brick2 is online")

    g.log.info("enabling the self heal daemon")
    ret = enable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, "failed to enable self heal daemon")
    g.log.info("Successfully enabled the self heal daemon")

    g.log.info("checking if volume is in split-brain")
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, "unable to create split-brain scenario")
    g.log.info("Successfully created split-brain scenario")

    g.log.info("resolving split-brain by choosing the first brick as "
               "the source brick")
    node, _ = all_bricks[0].split(':')
    for fcount in range(5):
        command = ("gluster v heal " + self.volname + " split-brain "
                   "source-brick " + all_bricks[0] + ' /test_file' +
                   str(fcount) + '.txt')
        ret, _, _ = g.run(node, command)
        self.assertEqual(ret, 0, "command execution not successful")

    # triggering heal
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, "heal not triggered")
    g.log.info("Successfully triggered heal")

    # waiting for heal to complete
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=240)
    self.assertTrue(ret, "heal not completed")
    g.log.info("Heal completed successfully")

    # checking if any file is in split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, "file still in split-brain")
    g.log.info("Successfully resolved split-brain situation using "
               "CLI based resolution")

    g.log.info("resolving split-brain on a file not in split-brain")
    command = ("gluster v heal " + self.volname + " split-brain "
               "source-brick " + all_bricks[1] + " /test_file0.txt")
    ret, _, _ = g.run(node, command)
    self.assertNotEqual(
        ret, 0, "Unexpected: split-brain resolution command is "
        "successful on a file which is not in split-brain")
    g.log.info("Expected: split-brain resolution command failed on "
               "a file which is not in split-brain")

    g.log.info("checking the split-brain status of each file")
    for fcount in range(5):
        fpath = (self.mounts[0].mountpoint + '/test_file' +
                 str(fcount) + '.txt')
        status = get_fattr(self.mounts[0].client_system, fpath,
                           'replica.split-brain-status')
        compare_string = ("The file is not under data or metadata "
                          "split-brain")
        self.assertEqual(
            status.rstrip('\x00'), compare_string,
            "file test_file%s is under split-brain" % str(fcount))
    g.log.info("none of the files are under split-brain")
def _perform_brick_ops_and_enable_self_heal(self, op_type):
    '''Refactor of steps common to all tests: bring a brick down and
    perform metadata/data operations'''
    # First brick in the subvol will always be online and used for self
    # heal, so make keys match the brick index
    self.op_cmd = {
        # Metadata operations (owner and permission changes)
        'metadata': {
            2:
            '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
            dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
            3:
            '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
            dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
            # 4 - Will be used for final data consistency check
            4:
            '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
            dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
        },
        # Data operations (append data to the files)
        'data': {
            2:
            '''cd {0}; for i in `seq 1 3`;
            do {1} 2K >> file.$i;
            for j in `seq 1 3`;
            do {1} 2K >> dir.$i/file.$j; done;
            done;''',
            3:
            '''cd {0}; for i in `seq 1 3`;
            do {1} 3K >> file.$i;
            for j in `seq 1 3`;
            do {1} 3K >> dir.$i/file.$j; done;
            done;''',
            # 4 - Will be used for final data consistency check
            4:
            '''cd {0}; for i in `seq 1 6`;
            do {1} 4K >> file.$i;
            for j in `seq 1 6`;
            do {1} 4K >> dir.$i/file.$j; done;
            done;''',
        },
    }
    bricks = get_online_bricks_list(self.mnode, self.volname)
    self.assertIsNotNone(bricks,
                         'Not able to get list of bricks in the volume')

    # Make the first brick always online and start operations from the
    # second brick
    for index, brick in enumerate(bricks[1:], start=2):

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, brick)
        self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))

        # Perform metadata/data operation
        cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
        ret, _, err = g.run(self.client, cmd)
        self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
        self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

        # Bring brick online
        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            brick,
            bring_bricks_online_methods='volume_start_force')
        self.assertTrue(ret, 'Unable to bring {} online'.format(brick))

    # Assert metadata/data operations resulted in pending heals
    self.assertFalse(is_heal_complete(self.mnode, self.volname))

    # Enable and wait for the self heal daemon to be online
    self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
                    'Enabling self heal daemon failed')
    self.assertTrue(
        wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
        'Not all self heal daemons are online')
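# A minimal usage sketch for the helper above (hypothetical test body;
# assumes `self.fqpath`, `self.io_cmd` and `self.client` are prepared in
# setUp along with the initial `dir.N`/`file.N` layout that the op
# commands expect):
def test_metadata_heal_sketch(self):
    """Illustrative only: cycle bricks offline during metadata ops."""
    self._perform_brick_ops_and_enable_self_heal(op_type='metadata')
    # Once shd is re-enabled, the pending heals should drain
    self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                    'Heal has not yet completed')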
def test_afr_gfid_heal(self):
    """
    Description: This test case runs split-brain resolution CLIs on a
    file in gfid split-brain on a 1x2 volume.
    1. kill 1 brick
    2. create a file at mount point
    3. bring back the killed brick
    4. kill the other brick
    5. create the same file at mount point
    6. bring back the killed brick
    7. try heal from CLI and check if it gets completed
    """
    g.log.info("disabling the self heal daemon")
    ret = disable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, "unable to disable self heal daemon")
    g.log.info("Successfully disabled the self heal daemon")

    # getting the list of all bricks
    all_bricks = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(all_bricks, "unable to get list of bricks")

    g.log.info("bringing down brick1")
    ret = bring_bricks_offline(self.volname, all_bricks[0])
    self.assertTrue(ret, "unable to bring %s offline" % all_bricks[0])
    g.log.info("Successfully brought the following brick offline "
               ": %s", all_bricks[0])

    g.log.info("creating a file from mount point")
    all_mounts_procs = []
    cmd = ("/usr/bin/env python %s create_files "
           "-f 1 --base-file-name test_file --fixed-file-size 1k %s"
           % (self.script_upload_path, self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, cmd)
    all_mounts_procs.append(proc)

    # Validate I/O
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    g.log.info("Successfully created a file from mount point")

    g.log.info("bringing brick1 back online")
    ret = bring_bricks_online(self.mnode, self.volname, [all_bricks[0]])
    self.assertTrue(ret, "unable to bring %s online" % all_bricks[0])
    g.log.info("Successfully brought the following brick online "
               ": %s", all_bricks[0])

    g.log.info("bringing down brick2")
    ret = bring_bricks_offline(self.volname, all_bricks[1])
    self.assertTrue(ret, "unable to bring %s offline" % all_bricks[1])
    g.log.info("Successfully brought the following brick offline "
               ": %s", all_bricks[1])

    g.log.info("creating a new file of the same name from mount point")
    all_mounts_procs = []
    cmd = ("/usr/bin/env python %s create_files "
           "-f 1 --base-file-name test_file --fixed-file-size 1k %s"
           % (self.script_upload_path, self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, cmd)
    all_mounts_procs.append(proc)

    # Validate I/O
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    g.log.info("Successfully created a new file of the same name "
               "from mount point")

    g.log.info("bringing brick2 back online")
    ret = bring_bricks_online(self.mnode, self.volname, [all_bricks[1]])
    self.assertTrue(ret, "unable to bring %s online" % all_bricks[1])
    g.log.info("Successfully brought the following brick online "
               ": %s", all_bricks[1])

    g.log.info("enabling the self heal daemon")
    ret = enable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, "failed to enable self heal daemon")
    g.log.info("Successfully enabled the self heal daemon")

    g.log.info("checking if the file is in split-brain")
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, "unable to create split-brain scenario")
    g.log.info("Successfully created split-brain scenario")

    g.log.info("resolving split-brain by choosing the second brick as "
               "the source brick")
    node, _ = all_bricks[0].split(':')
    command = ("gluster volume heal %s split-brain source-brick %s "
               "/test_file0.txt" % (self.volname, all_bricks[1]))
    ret, _, _ = g.run(node, command)
    self.assertEqual(ret, 0, "command execution not successful")

    # triggering heal
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, "heal not triggered")

    # waiting for heal to complete
    ret = monitor_heal_completion(self.mnode, self.volname,
                                  timeout_period=120)
    self.assertTrue(ret, "heal not completed")

    # checking if the file is in split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, "file still in split-brain")
    g.log.info("Successfully resolved split-brain situation using "
               "CLI based resolution")
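# Illustrative follow-up check (not in the original test): after CLI
# resolution the healed file should be readable from the mount again; a
# hypothetical sanity read could look like:
#
#   ret, _, err = g.run(self.mounts[0].client_system,
#                       'cat %s/test_file0.txt > /dev/null'
#                       % self.mounts[0].mountpoint)
#   self.assertEqual(ret, 0, 'Unable to read healed file: %s' % err)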
def test_gfid_split_brain_resolution(self):
    """
    - create gfid split-brain of files and resolve them using the
      source-brick option of the CLI.
    """
    # pylint: disable=too-many-statements
    # pylint: disable=too-many-locals

    # Disable all self-heals and client-quorum
    options = {"self-heal-daemon": "off",
               "data-self-heal": "off",
               "metadata-self-heal": "off",
               "entry-self-heal": "off",
               "cluster.quorum-type": "none"}
    g.log.info("setting volume options %s", options)
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, ("Unable to set volume option %s for "
                          "volume %s" % (options, self.volname)))
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Create dir inside which I/O will be performed.
    ret = mkdir(self.mounts[0].client_system,
                "%s/test_gfid_split_brain" % self.mounts[0].mountpoint)
    self.assertTrue(ret, "mkdir failed")

    # Get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.volname)
    subvols_dict = get_subvols(self.mnode, self.volname)
    num_subvols = len(subvols_dict['volume_subvols'])
    g.log.info("Number of subvolumes in volume %s: %d",
               self.volname, num_subvols)

    # Toggle bricks and perform I/O
    file_list = ["file1.txt", "file2.txt", "file3.txt", "file4.txt",
                 "file5.txt", "file6.txt", "file7.txt", "file8.txt",
                 "file9.txt", "file10.txt"]
    brick_index = 0
    offline_bricks = []
    for _ in range(0, 3):
        for i in range(0, num_subvols):
            subvol_brick_list = subvols_dict['volume_subvols'][i]
            offline_bricks.append(subvol_brick_list[brick_index % 3])
            offline_bricks.append(
                subvol_brick_list[(brick_index + 1) % 3])
        self.toggle_bricks_and_perform_io(file_list, offline_bricks)
        brick_index += 1
        offline_bricks[:] = []

    # Enable shd
    g.log.info("enabling the self heal daemon")
    ret = enable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, "failed to enable self heal daemon")
    g.log.info("Successfully enabled the self heal daemon")

    # Wait for self heal processes to come online
    g.log.info("Wait for selfheal process to come online")
    timeout = 300
    ret = wait_for_self_heal_daemons_to_be_online(self.mnode,
                                                  self.volname, timeout)
    self.assertTrue(ret, "Self-heal processes are not online")
    g.log.info("All self heal processes are online")

    # Trigger heal
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Starting heal failed')
    g.log.info('Index heal launched')

    # checking if the files are in split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, "Files are not in split-brain as expected.")
    g.log.info("Files are in split-brain as expected")

    # First brick of each replica will be used as source-brick
    first_brick_list = []
    for i in range(0, num_subvols):
        subvol_brick_list = subvols_dict['volume_subvols'][i]
        brick = subvol_brick_list[0]
        first_brick_list.append(brick)

    # Find which dht subvols the 10 files are present in and trigger heal
    for filename in file_list:
        fpath = (self.mounts[0].mountpoint + "/test_gfid_split_brain/" +
                 filename)
        gfile = GlusterFile(self.clients[0], fpath)
        for brick in first_brick_list:
            _, brick_path = brick.split(':')
            match = [item for item in gfile.hashed_bricks
                     if brick_path in item]
            if match:
                self.resolve_gfid_split_brain(
                    "/test_gfid_split_brain/" + filename, brick)

    # Trigger heal to complete pending data/metadata heals
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Starting heal failed')
    g.log.info('Index heal launched')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Get arequals and compare
    for i in range(0, num_subvols):

        # Get arequal for first brick
        subvol_brick_list = subvols_dict['volume_subvols'][i]
        node, brick_path = subvol_brick_list[0].split(':')
        command = ('arequal-checksum -p %s '
                   '-i .glusterfs -i .landfill -i .trashcan'
                   % brick_path)
        ret, arequal, _ = g.run(node, command)
        self.assertFalse(ret, 'Failed to get arequal on brick %s'
                         % subvol_brick_list[0])
        first_brick_total = arequal.splitlines()[-1].split(':')[-1]

        # Get arequal for every brick and compare with first brick
        for brick in subvol_brick_list[1:]:
            node, brick_path = brick.split(':')
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan'
                       % brick_path)
            ret, brick_arequal, _ = g.run(node, command)
            self.assertFalse(ret, 'Failed to get arequal on brick %s'
                             % brick)
            g.log.info('Getting arequal for %s is successful', brick)
            brick_total = brick_arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(first_brick_total, brick_total,
                             'Arequals for subvol and %s are not equal'
                             % brick)
            g.log.info('Arequals for subvol and %s are equal', brick)
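# The per-brick comparison above shells out to `arequal-checksum`
# directly; a more compact equivalent using the library helper already
# used elsewhere in this module (`collect_bricks_arequal`) might look
# like this sketch (the helper name `_assert_subvol_arequals_match` is
# hypothetical):
def _assert_subvol_arequals_match(self, subvol_brick_list):
    """Assert all bricks of one subvol report the same arequal checksum."""
    ret, arequals = collect_bricks_arequal(subvol_brick_list)
    self.assertTrue(ret, 'Unable to get arequal on %s'
                    % subvol_brick_list)
    self.assertEqual(len(set(arequals)), 1,
                     'Arequal mismatch among %s' % subvol_brick_list)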