def test_replicated_delete_speed(self):
    """
    That deletions of replicated metadata are not pathologically slow
    """
    rank_0_id, rank_1_id = self._setup_two_ranks()

    self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0")
    self.mds_cluster.mds_fail_restart(rank_1_id)
    self.fs.wait_for_daemons()

    file_count = 10

    self.mount_a.create_n_files("delete_me/file", file_count)

    self._force_migrate(rank_1_id, "delete_me",
                        self.mount_a.path_to_ino("delete_me/file_0"))

    begin = datetime.datetime.now()
    self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")])
    end = datetime.datetime.now()

    # What we're really checking here is that we are completing client
    # operations immediately rather than delaying until the next tick.
    tick_period = float(self.fs.get_config("mds_tick_interval",
                                           service_type="mds"))

    duration = (end - begin).total_seconds()
    self.assertLess(duration, (file_count * tick_period) * 0.25)
def test_purge_on_shutdown(self):
    """
    That when an MDS rank is shut down, its purge queue is drained in the
    process.
    """
    rank_0_id, rank_1_id = self._setup_two_ranks()

    self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0")
    self.mds_cluster.mds_fail_restart(rank_1_id)
    self.fs.wait_for_daemons()

    file_count = 5

    self.mount_a.create_n_files("delete_me/file", file_count)

    self._force_migrate(rank_0_id, rank_1_id, "/delete_me",
                        self.mount_a.path_to_ino("delete_me/file_0"))

    self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")])
    self.mount_a.umount_wait()

    # See all the strays go into purge queue
    self._wait_for_counter("mds_cache", "strays_created", file_count,
                           mds_id=rank_1_id)
    self._wait_for_counter("mds_cache", "strays_enqueued", file_count,
                           mds_id=rank_1_id)
    self.assertEqual(
        self.get_stat("mds_cache", "num_strays", mds_id=rank_1_id), 0)

    # See nothing get purged from the purge queue (yet)
    time.sleep(10)
    self.assertEqual(
        self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0)

    # Shut down rank 1
    self.fs.set_max_mds(1)
    self.fs.deactivate(1)

    # It shouldn't proceed past stopping because it's still not allowed
    # to purge
    time.sleep(10)
    self.assertEqual(
        self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0)
    self.assertFalse(self._is_stopped(1))

    # Permit the daemon to start purging again
    self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.{0}'.format(rank_1_id),
                                        'injectargs',
                                        "--mds_max_purge_files 100")

    # It should now proceed through shutdown
    self.wait_until_true(lambda: self._is_stopped(1), timeout=60)

    # ...and in the process purge all that data
    self.await_data_pool_empty()
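# Both purge tests above call _setup_two_ranks().  Below is a minimal sketch of
# what that helper is assumed to do, built only from calls that already appear
# in these tests; the real helper may obtain the rank ids differently.
def _setup_two_ranks(self):
    # Ask for two active MDS ranks and wait for them to come up.
    self.fs.set_max_mds(2)
    status = self.fs.wait_for_daemons()

    # Return the daemon names currently holding rank 0 and rank 1.
    rank_0_id = self.fs.get_rank(rank=0, status=status)['name']
    rank_1_id = self.fs.get_rank(rank=1, status=status)['name']
    return rank_0_id, rank_1_id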
def conduct_neg_test_for_write_caps(self, filepaths, mounts):
    cmdargs = ['echo', 'some random data', Raw('|'), 'sudo', 'tee']

    for mount in mounts:
        for path in filepaths:
            if path.find(mount.hostfs_mntpt) != -1:
                cmdargs.append(path)
                mount.negtestcmd(args=cmdargs, retval=1,
                                 errmsg='permission denied')
                # Drop the path again so the next iteration builds a fresh
                # "echo ... | sudo tee <path>" command.
                cmdargs.pop(-1)
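# A hedged usage sketch for the helper above (the test name, file paths and
# mounts here are illustrative, not taken from the original tests): after
# granting a client read-only caps on both mounts, confirm that writing the
# visible files fails with EACCES.
def test_read_only_client_cannot_write_example(self):
    filepaths = [self.mount_a.hostfs_mntpt + '/file_on_fs1',
                 self.mount_b.hostfs_mntpt + '/file_on_fs2']
    self.conduct_neg_test_for_write_caps(filepaths,
                                         [self.mount_a, self.mount_b])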
def _install_deb_repo(self):
    self.remote.run(
        args=[
            'echo', 'deb', self.base_url, self.codename, 'main',
            Raw('|'), 'sudo', 'tee',
            '/etc/apt/sources.list.d/{proj}.list'.format(proj=self.project),
        ],
        stdout=StringIO(),
    )
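# For illustration (values hypothetical): with project "ceph", base_url
# "https://download.example.com/debian" and codename "focal", the command above
# pipes the line
#     deb https://download.example.com/debian focal main
# through `sudo tee` into /etc/apt/sources.list.d/ceph.list on the remote.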
def test_get_pmlogextract_cmd(self):
    obj = self.klass(
        hosts=['host1'],
        time_from='now-3h',
        time_until='now-1h',
    )
    expected = [
        'pmlogextract',
        '-S', 'now-3h',
        '-T', 'now-1h',
        Raw('/var/log/pcp/pmlogger/host1/*.0'),
    ]
    assert obj.get_pmlogextract_cmd('host1') == expected
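# A minimal sketch of a get_pmlogextract_cmd() implementation that would satisfy
# the expectation above.  The attribute names (self.time_from, self.time_until)
# and the archive path are assumptions taken from the constructor kwargs and the
# expected list; the real grabber class may differ.
def get_pmlogextract_cmd(self, host):
    cmd = ['pmlogextract', '-S', self.time_from, '-T', self.time_until]
    # pmlogger archives for each host live under /var/log/pcp/pmlogger/<host>/;
    # the glob is wrapped in Raw so the remote shell expands it.
    cmd.append(Raw('/var/log/pcp/pmlogger/{0}/*.0'.format(host)))
    return cmd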
def test_single_path_rootsquash(self):
    filedata, filename = 'some data on fs 1', 'file_on_fs1'
    filepath = os_path_join(self.mount_a.hostfs_mntpt, filename)
    self.mount_a.write_file(filepath, filedata)

    keyring = self.fs.authorize(self.client_id, ('/', 'rw', 'root_squash'))
    keyring_path = self.mount_a.client_remote.mktemp(data=keyring)
    self.mount_a.remount(client_id=self.client_id,
                         client_keyring_path=keyring_path,
                         cephfs_mntpt='/')

    if filepath.find(self.mount_a.hostfs_mntpt) != -1:
        # can read, but not write as root
        contents = self.mount_a.read_file(filepath)
        self.assertEqual(filedata, contents)

        cmdargs = ['echo', 'some random data', Raw('|'), 'sudo', 'tee',
                   filepath]
        self.mount_a.negtestcmd(args=cmdargs, retval=1,
                                errmsg='permission denied')
def test_kill_mdstable(self):
    """
    check snaptable transaction
    """
    if not isinstance(self.mount_a, FuseMount):
        self.skipTest("Require FUSE client to forcibly kill mount")

    self.fs.set_allow_new_snaps(True)
    self.fs.set_max_mds(2)
    status = self.fs.wait_for_daemons()

    grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))

    # setup subtrees
    self.mount_a.run_shell(["mkdir", "-p", "d1/dir"])
    self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
    self.wait_until_true(lambda: self._check_subtree(1, '/d1', status=status),
                         timeout=30)

    last_created = self._get_last_created_snap(rank=0, status=status)

    # mds_kill_mdstable_at:
    #  1: MDSTableServer::handle_prepare
    #  2: MDSTableServer::_prepare_logged
    #  5: MDSTableServer::handle_commit
    #  6: MDSTableServer::_commit_logged
    for i in [1, 2, 5, 6]:
        log.info("testing snapserver mds_kill_mdstable_at={0}".format(i))

        status = self.fs.status()
        rank0 = self.fs.get_rank(rank=0, status=status)
        self.fs.rank_freeze(True, rank=0)
        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)],
                          rank=0, status=status)
        proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)],
                                      wait=False)
        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0),
                             timeout=grace * 2)
        self.delete_mds_coredump(rank0['name'])

        self.fs.rank_fail(rank=0)
        self.fs.mds_restart(rank0['name'])
        self.wait_for_daemon_start([rank0['name']])
        status = self.fs.wait_for_daemons()

        proc.wait()
        last_created += 1
        self.wait_until_true(lambda: self._get_last_created_snap(rank=0) == last_created,
                             timeout=30)

    self.set_conf("mds", "mds_reconnect_timeout", "5")

    self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])

    # set mds_kill_mdstable_at, also kill snapclient
    for i in [2, 5, 6]:
        log.info("testing snapserver mds_kill_mdstable_at={0}, "
                 "also kill snapclient".format(i))

        status = self.fs.status()
        last_created = self._get_last_created_snap(rank=0, status=status)

        rank0 = self.fs.get_rank(rank=0, status=status)
        rank1 = self.fs.get_rank(rank=1, status=status)
        self.fs.rank_freeze(True, rank=0)  # prevent failover...
        self.fs.rank_freeze(True, rank=1)  # prevent failover...
        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)],
                          rank=0, status=status)
        proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)],
                                      wait=False)
        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0),
                             timeout=grace * 2)
        self.delete_mds_coredump(rank0['name'])

        self.fs.rank_signal(signal.SIGKILL, rank=1)

        self.mount_a.kill()
        self.mount_a.kill_cleanup()

        self.fs.rank_fail(rank=0)
        self.fs.mds_restart(rank0['name'])
        self.wait_for_daemon_start([rank0['name']])
        self.fs.wait_for_state('up:resolve', rank=0, timeout=MDS_RESTART_GRACE)
        if i in [2, 5]:
            self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
        elif i == 6:
            self.assertEqual(len(self._get_pending_snap_update(rank=0)), 0)
            self.assertGreater(self._get_last_created_snap(rank=0), last_created)

        self.fs.rank_fail(rank=1)
        self.fs.mds_restart(rank1['name'])
        self.wait_for_daemon_start([rank1['name']])
        self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)

        if i in [2, 5]:
            self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0,
                                 timeout=30)
            if i == 2:
                self.assertEqual(self._get_last_created_snap(rank=0), last_created)
            else:
                self.assertGreater(self._get_last_created_snap(rank=0), last_created)

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()
        self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])

    # mds_kill_mdstable_at:
    #  3: MDSTableClient::handle_request (got agree)
    #  4: MDSTableClient::commit
    #  7: MDSTableClient::handle_request (got ack)
    for i in [3, 4, 7]:
        log.info("testing snapclient mds_kill_mdstable_at={0}".format(i))

        last_created = self._get_last_created_snap(rank=0)

        status = self.fs.status()
        rank1 = self.fs.get_rank(rank=1, status=status)
        self.fs.rank_freeze(True, rank=1)  # prevent failover...
        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)],
                          rank=1, status=status)
        proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)],
                                      wait=False)
        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1),
                             timeout=grace * 2)
        self.delete_mds_coredump(rank1['name'])

        self.mount_a.kill()
        self.mount_a.kill_cleanup()

        if i in [3, 4]:
            self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
        elif i == 7:
            self.assertEqual(len(self._get_pending_snap_update(rank=0)), 0)
            self.assertGreater(self._get_last_created_snap(rank=0), last_created)

        self.fs.rank_fail(rank=1)
        self.fs.mds_restart(rank1['name'])
        self.wait_for_daemon_start([rank1['name']])
        status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)

        if i in [3, 4]:
            self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0,
                                 timeout=30)
            if i == 3:
                self.assertEqual(self._get_last_created_snap(rank=0), last_created)
            else:
                self.assertGreater(self._get_last_created_snap(rank=0), last_created)

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()
        self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])

    # mds_kill_mdstable_at:
    #  3: MDSTableClient::handle_request (got agree)
    #  8: MDSTableServer::handle_rollback
    log.info("testing snapclient mds_kill_mdstable_at=3, "
             "snapserver mds_kill_mdstable_at=8")

    last_created = self._get_last_created_snap(rank=0)

    status = self.fs.status()
    rank0 = self.fs.get_rank(rank=0, status=status)
    rank1 = self.fs.get_rank(rank=1, status=status)
    self.fs.rank_freeze(True, rank=0)
    self.fs.rank_freeze(True, rank=1)
    self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8"],
                      rank=0, status=status)
    self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3"],
                      rank=1, status=status)
    proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4"], wait=False)
    self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1),
                         timeout=grace * 2)
    self.delete_mds_coredump(rank1['name'])

    self.mount_a.kill()
    self.mount_a.kill_cleanup()

    self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)

    self.fs.rank_fail(rank=1)
    self.fs.mds_restart(rank1['name'])
    self.wait_for_daemon_start([rank1['name']])

    # rollback triggers assertion
    self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0),
                         timeout=grace * 2)
    self.delete_mds_coredump(rank0['name'])
    self.fs.rank_fail(rank=0)
    self.fs.mds_restart(rank0['name'])
    self.wait_for_daemon_start([rank0['name']])
    self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)

    # mds.1 should re-send rollback message
    self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0,
                         timeout=30)
    self.assertEqual(self._get_last_created_snap(rank=0), last_created)

    self.mount_a.mount()
    self.mount_a.wait_until_mounted()
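# The snapshot tests above and below rely on a handful of small helpers for
# inspecting the MDS snap table through the admin socket.  A minimal sketch,
# assuming a "dump snaps" asok command and its output field names; the real
# helpers may differ in detail.
def _get_snapserver_dump(self, rank=0, status=None):
    # Rank 0 hosts the snap table server; dump its view of the table.
    return self.fs.rank_asok(["dump", "snaps", "--server"],
                             rank=rank, status=status)["snapserver"]

def _get_last_created_snap(self, rank=0, status=None):
    return int(self._get_snapserver_dump(rank, status=status)["last_created"])

def _get_last_destroyed_snap(self, rank=0, status=None):
    return int(self._get_snapserver_dump(rank, status=status)["last_destroyed"])

def _get_pending_snap_update(self, rank=0, status=None):
    return self._get_snapserver_dump(rank, status=status)["pending_update"]

def _get_pending_snap_destroy(self, rank=0, status=None):
    return self._get_snapserver_dump(rank, status=status)["pending_destroy"]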
def test_snapclient_cache(self):
    """
    check if snapclient cache gets synced properly
    """
    self.fs.set_allow_new_snaps(True)
    self.fs.set_max_mds(3)
    status = self.fs.wait_for_daemons()

    grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))

    self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"])
    self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"])
    self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
    self.mount_a.setfattr("d0/d1", "ceph.dir.pin", "1")
    self.mount_a.setfattr("d0/d2", "ceph.dir.pin", "2")
    self.wait_until_true(lambda: self._check_subtree(2, '/d0/d2', status=status),
                         timeout=30)
    self.wait_until_true(lambda: self._check_subtree(1, '/d0/d1', status=status),
                         timeout=5)
    self.wait_until_true(lambda: self._check_subtree(0, '/d0', status=status),
                         timeout=5)

    def _check_snapclient_cache(snaps_dump, cache_dump=None, rank=0):
        if cache_dump is None:
            cache_dump = self._get_snapclient_dump(rank=rank)
        for key, value in cache_dump.items():
            if value != snaps_dump[key]:
                return False
        return True

    # sync after mksnap
    last_created = self._get_last_created_snap(rank=0)
    self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s1", "d0/d1/dir/.snap/s2"])
    self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0,
                         timeout=30)
    self.assertGreater(self._get_last_created_snap(rank=0), last_created)

    snaps_dump = self._get_snapserver_dump(rank=0)
    self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0))
    self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1))
    self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2))

    # sync after rmsnap
    last_destroyed = self._get_last_destroyed_snap(rank=0)
    self.mount_a.run_shell(["rmdir", "d0/d1/dir/.snap/s1"])
    self.wait_until_true(lambda: len(self._get_pending_snap_destroy(rank=0)) == 0,
                         timeout=30)
    self.assertGreater(self._get_last_destroyed_snap(rank=0), last_destroyed)

    snaps_dump = self._get_snapserver_dump(rank=0)
    self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0))
    self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1))
    self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2))

    # sync during mds recovers
    self.fs.rank_fail(rank=2)
    status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
    self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2))

    self.fs.rank_fail(rank=0)
    self.fs.rank_fail(rank=1)
    status = self.fs.wait_for_daemons()
    self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
    self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0))
    self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1))
    self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2))

    # kill at MDSTableClient::handle_notify_prep
    status = self.fs.status()
    rank2 = self.fs.get_rank(rank=2, status=status)
    self.fs.rank_freeze(True, rank=2)
    self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"],
                      rank=2, status=status)
    proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False)
    self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2),
                         timeout=grace * 2)
    self.delete_mds_coredump(rank2['name'])

    # mksnap should wait for notify ack from mds.2
    self.assertFalse(proc.finished)

    # mksnap should proceed after mds.2 fails
    self.fs.rank_fail(rank=2)
    self.wait_until_true(lambda: proc.finished, timeout=30)

    self.fs.mds_restart(rank2['name'])
    self.wait_for_daemon_start([rank2['name']])
    status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)

    self.mount_a.run_shell(["rmdir", Raw("d0/d1/dir/.snap/*")])

    # kill at MDSTableClient::commit
    # the recovering mds should sync all mds' cache when it enters resolve stage
    self.set_conf("mds", "mds_reconnect_timeout", "5")
    for i in range(1, 4):
        status = self.fs.status()
        rank2 = self.fs.get_rank(rank=2, status=status)
        self.fs.rank_freeze(True, rank=2)
        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"],
                          rank=2, status=status)
        last_created = self._get_last_created_snap(rank=0)
        proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)],
                                      wait=False)
        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2),
                             timeout=grace * 2)
        self.delete_mds_coredump(rank2['name'])

        self.mount_a.kill()
        self.mount_a.kill_cleanup()

        self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)

        if i in [2, 4]:
            self.fs.rank_fail(rank=0)
        if i in [3, 4]:
            self.fs.rank_fail(rank=1)

        self.fs.rank_fail(rank=2)
        self.fs.mds_restart(rank2['name'])
        self.wait_for_daemon_start([rank2['name']])
        status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)

        rank0_cache = self._get_snapclient_dump(rank=0)
        rank1_cache = self._get_snapclient_dump(rank=1)
        rank2_cache = self._get_snapclient_dump(rank=2)

        self.assertGreater(int(rank0_cache["last_created"]), last_created)
        self.assertEqual(rank0_cache, rank1_cache)
        self.assertEqual(rank0_cache, rank2_cache)

        self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0,
                             timeout=30)

        snaps_dump = self._get_snapserver_dump(rank=0)
        self.assertEqual(snaps_dump["last_created"], rank0_cache["last_created"])
        self.assertTrue(_check_snapclient_cache(snaps_dump, cache_dump=rank0_cache))

    self.mount_a.mount()
    self.mount_a.wait_until_mounted()
    self.mount_a.run_shell(["rmdir", Raw("d0/d2/dir/.snap/*")])
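# _check_snapclient_cache() above compares each rank's snapclient cache against
# the server dump via _get_snapclient_dump().  A minimal sketch of that helper,
# assuming a "dump snaps" asok command without the --server flag; the real
# implementation may differ.
def _get_snapclient_dump(self, rank=0, status=None):
    return self.fs.rank_asok(["dump", "snaps"], rank=rank, status=status)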
@contextlib.contextmanager
def task(ctx, config):
    """
    Go through filesystem creation with a synthetic failure in an MDS
    in its 'up:creating' state, to exercise the retry behaviour.
    """
    # Grab handles to the teuthology objects of interest
    mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
    if len(mdslist) != 1:
        # Require exactly one MDS, the code path for creation failure when
        # a standby is available is different
        raise RuntimeError("This task requires exactly one MDS")
    mds_id = mdslist[0]

    (mds_remote,) = ctx.cluster.only(
        'mds.{_id}'.format(_id=mds_id)).remotes.keys()
    manager = ceph_manager.CephManager(
        mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # Handles for the filesystem and the MDS daemon.  These are assumptions:
    # the surrounding harness may construct them differently.
    fs = Filesystem(ctx)
    mds = ctx.daemons.get_daemon('mds', mds_id)

    # Stop MDS
    fs.set_max_mds(0)
    fs.mds_stop(mds_id)
    fs.mds_fail(mds_id)

    # Reset the filesystem so that next start will go into CREATING
    manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
    manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")

    # Start the MDS with mds_kill_create_at set, it will crash during creation
    mds.restart_with_args(["--mds_kill_create_at=1"])
    try:
        mds.wait_for_exit()
    except CommandFailedError as e:
        if e.exitstatus == 1:
            log.info("MDS creation killed as expected")
        else:
            log.error("Unexpected status code %s" % e.exitstatus)
            raise

    # Since I have intentionally caused a crash, I will clean up the resulting
    # core file to avoid task.internal.coredump seeing it as a failure.
    log.info("Removing core file from synthetic MDS failure")
    mds_remote.run(args=[
        'rm', '-f', Raw("{archive}/coredump/*.core".format(
            archive=misc.get_archive_dir(ctx)))
    ])

    # It should have left the MDS map state still in CREATING
    status = fs.status().get_mds(mds_id)
    assert status['state'] == 'up:creating'

    # Start the MDS again without the kill flag set, it should proceed with
    # creation successfully
    mds.restart()

    # Wait for state ACTIVE
    fs.wait_for_state("up:active", timeout=120, mds_id=mds_id)

    # The system should be back up in a happy healthy state, go ahead and run
    # any further tasks inside this context.
    yield
def run_shell_payload(self, payload, **kwargs):
    return self.run_shell(["bash", "-c", Raw(f"'{payload}'")], **kwargs)
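# Example use of the wrapper above (illustrative): the whole payload is passed
# to the remote shell as a single quoted `bash -c` argument, so '&&', globs and
# redirection are interpreted on the client rather than split by teuthology.
# Note that a payload containing single quotes would break this naive quoting.
#
#     mount.run_shell_payload("mkdir -p subdir && touch subdir/file")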