def task(ctx, config):
    """
    Test handling of divergent entries with prior_version
    prior to log_tail

    config: none

    Requires 3 osds.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'divergent_priors task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )
    ctx.manager = manager

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.wait_for_clean()

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'

    # create 1 pg pool
    log.info('creating foo')
    manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')

    osds = [0, 1, 2]
    for i in osds:
        manager.set_config(i, osd_min_pg_log_entries=1)

    # determine primary
    divergent = manager.get_pg_primary('foo', 0)
    log.info("primary and soon to be divergent is %d", divergent)
    non_divergent = [0, 1, 2]
    non_divergent.remove(divergent)

    log.info('writing initial objects')
    # write 1000 objects
    for i in range(1000):
        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])

    manager.wait_for_clean()

    # blackhole non_divergent
    log.info("blackholing osds %s", str(non_divergent))
    for i in non_divergent:
        manager.set_config(i, filestore_blackhole='')

    # write 1 (divergent) object
    log.info('writing divergent object existing_0')
    rados(
        ctx, mon, ['-p', 'foo', 'put', 'existing_0', dummyfile2],
        wait=False)
    time.sleep(10)
    mon.run(
        args=['killall', '-9', 'rados'],
        wait=True,
        check_status=False)

    # kill all the osds
    log.info('killing all the osds')
    for i in osds:
        manager.kill_osd(i)
    for i in osds:
        manager.mark_down_osd(i)
    for i in osds:
        manager.mark_out_osd(i)

    # bring up non-divergent
    log.info("bringing up non_divergent %s", str(non_divergent))
    for i in non_divergent:
        manager.revive_osd(i)
    for i in non_divergent:
        manager.mark_in_osd(i)

    log.info('making log long to prevent backfill')
    for i in non_divergent:
        manager.set_config(i, osd_min_pg_log_entries=100000)

    # write 1 non-divergent object (ensure that old divergent one is divergent)
    log.info('writing non-divergent object existing_1')
    rados(ctx, mon, ['-p', 'foo', 'put', 'existing_1', dummyfile2])

    manager.wait_for_recovery()

    # ensure no recovery
    log.info('delay recovery')
    for i in non_divergent:
        manager.set_config(i, osd_recovery_delay_start=100000)

    # bring in our divergent friend
    log.info("revive divergent %d", divergent)
    manager.revive_osd(divergent)

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)

    log.info('delay recovery divergent')
    manager.set_config(divergent, osd_recovery_delay_start=100000)
    log.info('mark divergent in')
    manager.mark_in_osd(divergent)

    log.info('wait for peering')
    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])

    log.info("killing divergent %d", divergent)
    manager.kill_osd(divergent)
    log.info("reviving divergent %d", divergent)
    manager.revive_osd(divergent)

    log.info('allowing recovery')
    for i in non_divergent:
        manager.set_config(i, osd_recovery_delay_start=0)

    log.info('reading existing_0')
    exit_status = rados(ctx, mon,
                        ['-p', 'foo', 'get', 'existing_0',
                         '-o', '/tmp/existing'])
    assert exit_status == 0
    log.info("success")
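
# The `tell osd.N flush_pg_stats` triple above recurs in nearly every task
# in this section. A small helper like the following (a sketch, not an
# existing CephManager method) would cut the repetition; it only wraps the
# raw_cluster_cmd call already used verbatim above.
def flush_pg_stats(manager, osd_ids):
    """Ask each listed OSD to publish its PG stats to the monitors."""
    for osd_id in osd_ids:
        manager.raw_cluster_cmd('tell', 'osd.%d' % osd_id, 'flush_pg_stats')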
def configure_regions_and_zones(ctx, config, regions, role_endpoints):
    """
    Configure regions and zones from rados and rgw.
    """
    if not regions:
        log.debug(
            'In rgw.configure_regions_and_zones() and regions is None. '
            'Bailing')
        yield
        return

    log.info('Configuring regions and zones...')

    log.debug('config is %r', config)
    log.debug('regions are %r', regions)
    log.debug('role_endpoints = %r', role_endpoints)

    # extract the zone info
    role_zones = dict([(client, extract_zone_info(ctx, client, c_config))
                       for client, c_config in config.iteritems()])
    log.debug('roles_zones = %r', role_zones)

    # extract the user info and append it to the payload tuple for the given
    # client
    for client, c_config in config.iteritems():
        if not c_config:
            user_info = None
        else:
            user_info = extract_user_info(c_config)

        (region, zone, zone_info) = role_zones[client]
        role_zones[client] = (region, zone, zone_info, user_info)

    region_info = dict([
        (region_name, extract_region_info(region_name, r_config))
        for region_name, r_config in regions.iteritems()])

    fill_in_endpoints(region_info, role_zones, role_endpoints)

    # clear out the old defaults
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    # removing these objects from .rgw.root and the per-zone root pools
    # may or may not matter
    rados(ctx, mon, cmd=['-p', '.rgw.root', 'rm', 'region_info.default'])
    rados(ctx, mon, cmd=['-p', '.rgw.root', 'rm', 'zone_info.default'])

    for client in config.iterkeys():
        for role, (_, zone, zone_info, user_info) in role_zones.iteritems():
            rados(ctx, mon, cmd=['-p', zone_info['domain_root'],
                                 'rm', 'region_info.default'])
            rados(ctx, mon, cmd=['-p', zone_info['domain_root'],
                                 'rm', 'zone_info.default'])

            (remote,) = ctx.cluster.only(role).remotes.keys()
            for pool_info in zone_info['placement_pools']:
                remote.run(args=['ceph', 'osd', 'pool', 'create',
                                 pool_info['val']['index_pool'], '64', '64'])
                if ctx.rgw.ec_data_pool:
                    create_ec_pool(remote, pool_info['val']['data_pool'],
                                   zone, 64)
                else:
                    create_replicated_pool(
                        remote, pool_info['val']['data_pool'], 64)

            rgwadmin(ctx, client,
                     cmd=['-n', client, 'zone', 'set', '--rgw-zone', zone],
                     stdin=StringIO(json.dumps(dict(
                         zone_info.items() + user_info.items()))),
                     check_status=True)

        for region, info in region_info.iteritems():
            region_json = json.dumps(info)
            log.debug('region info is: %s', region_json)
            rgwadmin(ctx, client,
                     cmd=['-n', client, 'region', 'set'],
                     stdin=StringIO(region_json),
                     check_status=True)
            if info['is_master']:
                rgwadmin(ctx, client,
                         cmd=['-n', client,
                              'region', 'default',
                              '--rgw-region', region],
                         check_status=True)

        rgwadmin(ctx, client, cmd=['-n', client, 'regionmap', 'update'])
    yield
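
# The `zone set` payload above merges the zone info with the system user
# info via the Python 2 idiom dict(a.items() + b.items()), in which keys
# from the second dict win. A minimal sketch with made-up values:
zi = {'domain_root': '.rgw.root', 'system_key': {}}
ui = {'system_key': {'user': 'client.0'}}
merged = dict(zi.items() + ui.items())
assert merged['system_key'] == {'user': 'client.0'}  # user info key wins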
def task(ctx, config):
    """
    Test peering.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'peer task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()

    for i in range(3):
        manager.set_config(i, osd_recovery_delay_start=120)

    # take one osd down
    manager.kill_osd(2)
    manager.mark_down_osd(2)

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', 'data', 'get', 'dummy', '-'])

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_recovery()

    # kill another and revive 2, so that some pgs can't peer.
    manager.kill_osd(1)
    manager.mark_down_osd(1)
    manager.revive_osd(2)
    manager.wait_till_osd_is_up(2)

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')

    manager.wait_for_active_or_down()

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')

    # look for down pgs
    num_down_pgs = 0
    pgs = manager.get_pg_stats()
    for pg in pgs:
        out = manager.raw_cluster_cmd('pg', pg['pgid'], 'query')
        log.debug("out string %s", out)
        j = json.loads(out)
        log.info("pg is %s, query json is %s", pg, j)

        if pg['state'].count('down'):
            num_down_pgs += 1
            # verify that it is blocked on osd.1
            rs = j['recovery_state']
            assert len(rs) > 0
            assert rs[0]['name'] == 'Started/Primary/Peering/GetInfo'
            assert rs[1]['name'] == 'Started/Primary/Peering'
            assert rs[1]['blocked']
            assert rs[1]['down_osds_we_would_probe'] == [1]
            assert len(rs[1]['peering_blocked_by']) == 1
            assert rs[1]['peering_blocked_by'][0]['osd'] == 1

    assert num_down_pgs > 0

    # bring it all back
    manager.revive_osd(1)
    manager.wait_till_osd_is_up(1)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()
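
# For reference, the assertions above walk `ceph pg <pgid> query` output
# shaped roughly like this (an illustrative skeleton mirroring only the
# fields the test checks; real output carries many more fields):
example_query = {
    'recovery_state': [
        {'name': 'Started/Primary/Peering/GetInfo'},
        {'name': 'Started/Primary/Peering',
         'blocked': True,
         'down_osds_we_would_probe': [1],
         'peering_blocked_by': [{'osd': 1}]},
    ],
}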
def task(ctx, config):
    """
    Test handling of lost objects.

    A pretty rigid cluster is brought up and tested by this task
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'lost_unfound task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()

    # something that is always there
    dummyfile = '/etc/fstab'

    # take an osd out until the very end
    manager.kill_osd(2)
    manager.mark_down_osd(2)
    manager.mark_out_osd(2)

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', 'data', 'put', 'dummy', dummyfile])

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_recovery()

    # create old objects
    for f in range(1, 10):
        rados(ctx, mon, ['-p', 'data', 'put', 'existing_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', 'data', 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', 'data', 'rm', 'existed_%d' % f])

    # delay recovery, and make the pg log very long (to prevent backfill)
    manager.raw_cluster_cmd(
        'tell', 'osd.1',
        'injectargs',
        '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
        )

    manager.kill_osd(0)
    manager.mark_down_osd(0)

    for f in range(1, 10):
        rados(ctx, mon, ['-p', 'data', 'put', 'new_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', 'data', 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', 'data', 'put', 'existing_%d' % f, dummyfile])

    # bring osd.0 back up, let it peer, but don't replicate the new
    # objects...
    log.info('osd.0 command_args is %s' % 'foo')
    log.info(ctx.daemons.get_daemon('osd', 0).command_args)
    ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
        '--osd-recovery-delay-start', '1000'
        ])
    manager.revive_osd(0)
    manager.mark_in_osd(0)
    manager.wait_till_osd_is_up(0)

    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.wait_till_active()

    # take out osd.1 and the only copy of those objects.
    manager.kill_osd(1)
    manager.mark_down_osd(1)
    manager.mark_out_osd(1)
    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')

    # bring up osd.2 so that things would otherwise, in theory, recover fully
    manager.revive_osd(2)
    manager.mark_in_osd(2)
    manager.wait_till_osd_is_up(2)

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_till_active()
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')

    # verify that there are unfound objects
    unfound = manager.get_num_unfound_objects()
    log.info("there are %d unfound objects" % unfound)
    assert unfound

    # mark stuff lost
    pgs = manager.get_pg_stats()
    for pg in pgs:
        if pg['stat_sum']['num_objects_unfound'] > 0:
            primary = 'osd.%d' % pg['acting'][0]

            # verify that i can list them direct from the osd
            log.info('listing missing/lost in %s state %s', pg['pgid'],
                     pg['state'])
            m = manager.list_pg_missing(pg['pgid'])
            #log.info('%s' % m)
            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
            num_unfound = 0
            for o in m['objects']:
                if len(o['locations']) == 0:
                    num_unfound += 1
            assert m['num_unfound'] == num_unfound

            log.info("deleting unfound in %s on %s", pg['pgid'], primary)
            manager.raw_cluster_cmd('pg', pg['pgid'],
                                    'mark_unfound_lost', 'delete')
        else:
            log.info("no unfound in %s", pg['pgid'])

    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_recovery()

    # verify result: the unfound objects were deleted, so reads should fail
    for f in range(1, 10):
        err = rados(ctx, mon, ['-p', 'data', 'get', 'new_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', 'data', 'get', 'existed_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', 'data', 'get', 'existing_%d' % f, '-'])
        assert err

    # see if osd.1 can cope
    manager.revive_osd(1)
    manager.mark_in_osd(1)
    manager.wait_till_osd_is_up(1)
    manager.wait_for_clean()
def task(ctx, config):
    """
    Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
    configuration settings

    In order for the test to pass, the log-whitelist must be used as follows

        tasks:
            - chef:
            - install:
            - ceph:
                log-whitelist: ['OSD near full', 'OSD full dropping all updates']
            - osd_failsafe_enospc:

    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'osd_failsafe_enospc task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )
    ctx.manager = manager

    # Give 2 seconds for injectargs + osd_op_complaint_time (30) +
    # 2 * osd_heartbeat_interval (6) + 6 padding
    sleep_time = 50

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'

    # create 1 pg pool with 1 rep which can only be on osd.0
    osds = manager.get_osd_dump()
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_out_osd(osd['osd'])

    log.info('creating pool foo')
    manager.create_pool("foo")
    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')

    # State NONE -> NEAR
    log.info('1. Verify warning messages when exceeding nearfull_ratio')

    proc = mon.run(
        args=['daemon-helper', 'kill', 'ceph', '-w'],
        stdin=run.PIPE,
        stdout=StringIO(),
        wait=False,
        )

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs',
                            '--osd_failsafe_nearfull_ratio .00001')

    time.sleep(sleep_time)
    proc.stdin.close()  # causes daemon-helper to send SIGKILL to ceph -w
    proc.exitstatus.get()

    lines = proc.stdout.getvalue().split('\n')

    count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
    assert count == 2, \
        'Incorrect number of warning messages expected 2 got %d' % count
    count = len(filter(lambda line:
                       '[ERR] OSD full dropping all updates' in line, lines))
    assert count == 0, \
        'Incorrect number of error messages expected 0 got %d' % count

    # State NEAR -> FULL
    log.info('2. Verify error messages when exceeding full_ratio')

    proc = mon.run(
        args=['daemon-helper', 'kill', 'ceph', '-w'],
        stdin=run.PIPE,
        stdout=StringIO(),
        wait=False,
        )

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs',
                            '--osd_failsafe_full_ratio .00001')

    time.sleep(sleep_time)
    proc.stdin.close()  # causes daemon-helper to send SIGKILL to ceph -w
    proc.exitstatus.get()

    lines = proc.stdout.getvalue().split('\n')

    count = len(filter(lambda line:
                       '[ERR] OSD full dropping all updates' in line, lines))
    assert count == 2, \
        'Incorrect number of error messages expected 2 got %d' % count

    log.info('3. Verify write failure when exceeding full_ratio')

    # Write data should fail
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
    assert ret != 0, \
        'Expected write failure but it succeeded with exit status 0'

    # Put back default
    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs',
                            '--osd_failsafe_full_ratio .97')
    time.sleep(10)

    # State FULL -> NEAR
    log.info('4. Verify write success when NOT exceeding full_ratio')

    # Write should succeed
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
    assert ret == 0, \
        'Expected write to succeed, but got exit status %d' % ret

    log.info('5. Verify warning messages again when exceeding nearfull_ratio')

    proc = mon.run(
        args=['daemon-helper', 'kill', 'ceph', '-w'],
        stdin=run.PIPE,
        stdout=StringIO(),
        wait=False,
        )

    time.sleep(sleep_time)
    proc.stdin.close()  # causes daemon-helper to send SIGKILL to ceph -w
    proc.exitstatus.get()

    lines = proc.stdout.getvalue().split('\n')

    count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
    assert count == 1 or count == 2, \
        'Incorrect number of warning messages expected 1 or 2 got %d' % count
    count = len(filter(lambda line:
                       '[ERR] OSD full dropping all updates' in line, lines))
    assert count == 0, \
        'Incorrect number of error messages expected 0 got %d' % count

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs',
                            '--osd_failsafe_nearfull_ratio .90')
    time.sleep(10)

    # State NONE -> FULL
    log.info('6. Verify error messages again when exceeding full_ratio')

    proc = mon.run(
        args=['daemon-helper', 'kill', 'ceph', '-w'],
        stdin=run.PIPE,
        stdout=StringIO(),
        wait=False,
        )

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs',
                            '--osd_failsafe_full_ratio .00001')

    time.sleep(sleep_time)
    proc.stdin.close()  # causes daemon-helper to send SIGKILL to ceph -w
    proc.exitstatus.get()

    lines = proc.stdout.getvalue().split('\n')

    count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
    assert count == 0, \
        'Incorrect number of warning messages expected 0 got %d' % count
    count = len(filter(lambda line:
                       '[ERR] OSD full dropping all updates' in line, lines))
    assert count == 2, \
        'Incorrect number of error messages expected 2 got %d' % count

    # State FULL -> NONE
    log.info('7. Verify no messages once settings are back to default')

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs',
                            '--osd_failsafe_full_ratio .97')
    time.sleep(10)

    proc = mon.run(
        args=['daemon-helper', 'kill', 'ceph', '-w'],
        stdin=run.PIPE,
        stdout=StringIO(),
        wait=False,
        )

    time.sleep(sleep_time)
    proc.stdin.close()  # causes daemon-helper to send SIGKILL to ceph -w
    proc.exitstatus.get()

    lines = proc.stdout.getvalue().split('\n')

    count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
    assert count == 0, \
        'Incorrect number of warning messages expected 0 got %d' % count
    count = len(filter(lambda line:
                       '[ERR] OSD full dropping all updates' in line, lines))
    assert count == 0, \
        'Incorrect number of error messages expected 0 got %d' % count

    log.info('Test Passed')

    # Bring all OSDs back in
    manager.remove_pool("foo")
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_in_osd(osd['osd'])
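
# The watch-the-cluster-log pattern above repeats five times. A helper
# along these lines (a sketch, built only from the calls already used in
# the task above) would capture it:
def count_cluster_log_matches(mon, needle, watch_seconds):
    """Watch `ceph -w` for watch_seconds, then count lines containing needle."""
    proc = mon.run(
        args=['daemon-helper', 'kill', 'ceph', '-w'],
        stdin=run.PIPE,
        stdout=StringIO(),
        wait=False,
        )
    time.sleep(watch_seconds)
    proc.stdin.close()  # causes daemon-helper to send SIGKILL to ceph -w
    proc.exitstatus.get()
    lines = proc.stdout.getvalue().split('\n')
    return len([line for line in lines if needle in line])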
def task(ctx, config):
    """
    Test handling of object location going down
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'lost_unfound task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.wait_for_clean()

    # something that is always there
    dummyfile = '/etc/fstab'

    # take 0, 1 out
    manager.mark_out_osd(0)
    manager.mark_out_osd(1)
    manager.wait_for_clean()

    # delay recovery, and make the pg log very long (to prevent backfill)
    for osd_id in range(4):
        manager.raw_cluster_cmd(
            'tell', 'osd.%d' % osd_id,
            'injectargs',
            '--osd-recovery-delay-start 10000 '
            '--osd-min-pg-log-entries 100000000'
            )

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', 'data', 'put', 'dummy', dummyfile])

    # create old objects
    for f in range(1, 10):
        rados(ctx, mon, ['-p', 'data', 'put', 'existing_%d' % f, dummyfile])

    manager.mark_out_osd(3)
    manager.wait_till_active()

    manager.mark_in_osd(0)
    manager.wait_till_active()

    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')

    manager.mark_out_osd(2)
    manager.wait_till_active()

    # bring up 1
    manager.mark_in_osd(1)
    manager.wait_till_active()

    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')

    log.info("Getting unfound objects")
    unfound = manager.get_num_unfound_objects()
    assert not unfound

    manager.kill_osd(2)
    manager.mark_down_osd(2)
    manager.kill_osd(3)
    manager.mark_down_osd(3)

    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')

    log.info("Getting unfound objects")
    unfound = manager.get_num_unfound_objects()
    assert unfound
def task(ctx, config):
    """
    Test handling of lost objects on an ec pool.

    A pretty rigid cluster is brought up and tested by this task
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'lost_unfound task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
    manager.wait_for_clean()

    pool = manager.create_pool_with_unique_name(
        ec_pool=True,
        ec_m=2,
        ec_k=2)

    # something that is always there
    dummyfile = '/etc/fstab'

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_recovery()

    # create old objects
    for f in range(1, 10):
        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'rm', 'existed_%d' % f])

    # delay recovery, and make the pg log very long (to prevent backfill)
    manager.raw_cluster_cmd(
        'tell', 'osd.1',
        'injectargs',
        '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
        )

    manager.kill_osd(0)
    manager.mark_down_osd(0)
    manager.kill_osd(3)
    manager.mark_down_osd(3)

    for f in range(1, 10):
        rados(ctx, mon, ['-p', pool, 'put', 'new_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])

    # take out osd.1 and a necessary shard of those objects.
    manager.kill_osd(1)
    manager.mark_down_osd(1)
    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
    manager.revive_osd(0)
    manager.wait_till_osd_is_up(0)
    manager.revive_osd(3)
    manager.wait_till_osd_is_up(3)

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
    manager.wait_till_active()
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')

    # verify that there are unfound objects
    unfound = manager.get_num_unfound_objects()
    log.info("there are %d unfound objects" % unfound)
    assert unfound

    # mark stuff lost
    pgs = manager.get_pg_stats()
    for pg in pgs:
        if pg['stat_sum']['num_objects_unfound'] > 0:
            # verify that i can list them direct from the osd
            log.info('listing missing/lost in %s state %s', pg['pgid'],
                     pg['state'])
            m = manager.list_pg_missing(pg['pgid'])
            log.info('%s' % m)
            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']

            log.info("deleting unfound in %s", pg['pgid'])
            manager.raw_cluster_cmd('pg', pg['pgid'],
                                    'mark_unfound_lost', 'delete')
        else:
            log.info("no unfound in %s", pg['pgid'])

    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.3', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
    manager.wait_for_recovery()

    # verify result: the unfound objects were deleted, so reads should fail
    for f in range(1, 10):
        err = rados(ctx, mon, ['-p', pool, 'get', 'new_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', pool, 'get', 'existed_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', pool, 'get', 'existing_%d' % f, '-'])
        assert err

    # see if osd.1 can cope
    manager.revive_osd(1)
    manager.wait_till_osd_is_up(1)
    manager.wait_for_clean()
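
# Why the new objects end up unfound: the pool above is created with k=2
# data shards and m=2 coding shards, so any 2 of the 4 shards suffice to
# reconstruct an object. The new_* objects were written while osd.0 and
# osd.3 were down, and osd.1 is then lost. Toy arithmetic (illustrative
# only, under those assumptions):
k, m = 2, 2                      # as passed to create_pool_with_unique_name
shards_per_object = k + m        # 4 shards spread over osd.0-3
surviving_shards_for_new = 1     # only osd.2 held a shard of new_*
assert surviving_shards_for_new < k   # fewer than k shards => unfound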
def configure_regions_and_zones(ctx, config, regions, role_endpoints):
    """
    Configure regions and zones from rados and rgw.
    """
    if not regions:
        yield
        return

    log.info('Configuring regions and zones...')

    log.debug('config is %r', config)
    log.debug('regions are %r', regions)
    log.debug('role_endpoints = %r', role_endpoints)

    role_zones = dict([(client, extract_zone_info(ctx, client, c_config))
                       for client, c_config in config.iteritems()])
    log.debug('roles_zones = %r', role_zones)

    region_info = dict([(region, extract_region_info(region, r_config))
                        for region, r_config in regions.iteritems()])

    fill_in_endpoints(region_info, role_zones, role_endpoints)

    for client in config.iterkeys():
        for region, info in region_info.iteritems():
            region_json = json.dumps(info)
            log.debug('region info is: %s', region_json)
            rgwadmin(ctx, client,
                     cmd=['-n', client, 'region', 'set'],
                     stdin=StringIO(region_json),
                     check_status=True)
            if info['is_master']:
                rgwadmin(ctx, client,
                         cmd=['-n', client,
                              'region', 'default',
                              '--rgw-region', region],
                         check_status=True)

        for role, (_, zone, info) in role_zones.iteritems():
            rgwadmin(ctx, client,
                     cmd=['-n', client, 'zone', 'set', '--rgw-zone', zone],
                     stdin=StringIO(json.dumps(info)),
                     check_status=True)

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    # removing these objects from .rgw.root and the per-zone root pools
    # may or may not matter
    rados(ctx, mon, cmd=['-p', '.rgw.root', 'rm', 'region_info.default'])
    rados(ctx, mon, cmd=['-p', '.rgw.root', 'rm', 'zone_info.default'])

    for client in config.iterkeys():
        rgwadmin(ctx, client, cmd=['-n', client, 'regionmap', 'update'])
        for role, (_, zone, zone_info) in role_zones.iteritems():
            rados(ctx, mon, cmd=['-p', zone_info['domain_root'],
                                 'rm', 'region_info.default'])
            rados(ctx, mon, cmd=['-p', zone_info['domain_root'],
                                 'rm', 'zone_info.default'])

            rgwadmin(ctx, client,
                     cmd=[
                         '-n', client, 'user', 'create',
                         '--uid', zone_info['system_key']['user'],
                         '--access-key', zone_info['system_key']['access_key'],
                         '--secret-key', zone_info['system_key']['secret_key'],
                         '--display-name', zone_info['system_key']['user'],
                     ],
                     check_status=True,
                     )

    yield