def unlock_one(ctx, name, user, description=None):
    name = misc.canonicalize_hostname(name, user=None)
    if not teuthology.provision.destroy_if_vm(ctx, name, user, description):
        log.error('destroy failed for %s', name)
        return False
    request = dict(name=name, locked=False, locked_by=user,
                   description=description)
    uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '')
    with safe_while(
            sleep=1, increment=0.5, action="unlock %s" % name) as proceed:
        while proceed():
            try:
                response = requests.put(uri, json.dumps(request))
                break
            # Work around https://github.com/kennethreitz/requests/issues/2364
            except requests.ConnectionError as e:
                log.warn("Saw %s while unlocking; retrying...", str(e))
    success = response.ok
    if success:
        log.info('unlocked %s', name)
    else:
        try:
            reason = response.json().get('message')
        except ValueError:
            reason = str(response.status_code)
        log.error('failed to unlock {node}. reason: {reason}'.format(
            node=name, reason=reason))
    return success
def listdir(path):
    with safe_while(sleep=1, increment=1, tries=10) as proceed:
        while proceed():
            try:
                return os.listdir(path)
            except OSError:
                log.exception("Failed to list %s !" % path)
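# All of these snippets lean on the same idiom: safe_while (from
# teuthology.contextutil) yields a `proceed` callable that returns True until
# a bounded number of tries is exhausted, sleeping between attempts.  As a
# point of reference, here is a minimal, self-contained sketch of that
# contract.  The defaults and the MaxWhileTries exception name are
# assumptions modeled on the teuthology implementation, not a verbatim copy.
import time


class MaxWhileTries(Exception):
    """Raised when a safe_while loop runs out of tries (assumed name)."""


class safe_while(object):
    """Sketch of the retry helper used throughout these snippets.

    Usage:
        with safe_while(sleep=1, tries=10) as proceed:
            while proceed():
                ...  # attempt the operation; break/return on success
    """
    def __init__(self, sleep=6, increment=0, tries=10, action=None,
                 _raise=True):
        self.sleep = sleep          # seconds to wait before each retry
        self.increment = increment  # added to the sleep after every attempt
        self.tries = tries          # number of attempts before giving up
        self.action = action        # human-readable label for the error
        self._raise = _raise        # raise MaxWhileTries vs. return False
        self.counter = 0

    def __enter__(self):
        return self._proceed

    def __exit__(self, exc_type, exc_val, exc_tb):
        return False  # never swallow exceptions raised by the loop body

    def _proceed(self):
        self.counter += 1
        if self.counter == 1:
            return True  # the first attempt runs immediately
        if self.counter > self.tries:
            if self._raise:
                raise MaxWhileTries("reached maximum tries (%s) for: %s" %
                                    (self.tries, self.action or 'operation'))
            return False
        time.sleep(self.sleep + (self.counter - 2) * self.increment)
        return True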
def get_addresses(self):
    """
    Return the list of IPs associated with instance_id in OpenStack.
    """
    with safe_while(sleep=2, tries=30,
                    action="get ip " + self["id"]) as proceed:
        while proceed():
            found = re.match(r'.*\d+', self["addresses"])
            if found:
                return self["addresses"]
            self.set_info()
def get_addresses(instance_id):
    """
    Return the list of IPs associated with instance_id in OpenStack.
    """
    with safe_while(sleep=2, tries=30,
                    action="get ip " + instance_id) as proceed:
        while proceed():
            instance = misc.sh("openstack server show -f json " + instance_id)
            addresses = OpenStack.get_value(json.loads(instance), 'addresses')
            found = re.match(r'.*\d+', addresses)
            if found:
                return addresses
def _wait_for_ready(self):
    with safe_while(sleep=6, tries=20) as proceed:
        while proceed():
            try:
                self.remote.connect()
                break
            except (
                socket.error,
                NoValidConnectionsError,
                AuthenticationException,
            ):
                pass
    cmd = "while [ ! -e '%s' ]; do sleep 5; done" % self._sentinel_path
    self.remote.run(args=cmd, timeout=600)
    log.info("Node is ready: %s", self.node)
def wait_for_ceph_health(self):
    with contextutil.safe_while(sleep=15, tries=6,
                                action='check health') as proceed:
        (remote,) = self.ctx.cluster.only('mon.a').remotes
        remote.run(args=['sudo', 'ceph', 'osd', 'tree'])
        remote.run(args=['sudo', 'ceph', '-s'])
        log.info("Waiting for Ceph health to reach HEALTH_OK or HEALTH_WARN")
        while proceed():
            out = StringIO()
            remote.run(args=['sudo', 'ceph', 'health'], stdout=out)
            out = out.getvalue().split(None, 1)[0]
            log.info("cluster in state: %s", out)
            if out in ('HEALTH_OK', 'HEALTH_WARN'):
                break
def check_power(self, state, timeout=None):
    """
    Check power.  Retry if EOF encountered on power check read.
    """
    timeout = timeout or self.timeout
    sleep_time = 4.0
    with safe_while(
            sleep=sleep_time,
            tries=int(timeout / sleep_time),
            _raise=False,
            action='wait for power %s' % state) as proceed:
        while proceed():
            c = self._pexpect_spawn_ipmi('power status')
            r = c.expect(['Chassis Power is {s}'.format(s=state),
                          pexpect.EOF, pexpect.TIMEOUT], timeout=1)
            if r == 0:
                return True
    return False
def retry(function, *args, **kwargs):
    """
    Call a function (returning its results), retrying if any of the
    exceptions in RETRY_EXCEPTIONS are raised
    """
    with safe_while(sleep=1, tries=24, increment=1) as proceed:
        tries = 0
        while proceed():
            tries += 1
            try:
                result = function(*args, **kwargs)
                if tries > 1:
                    log.debug(
                        "'%s' succeeded after %s tries",
                        function.__name__,
                        tries,
                    )
                return result
            except RETRY_EXCEPTIONS:
                pass
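# A hedged usage sketch for the wrapper above.  The flaky describe_node
# function, its URL, and this particular RETRY_EXCEPTIONS tuple are
# hypothetical stand-ins; the real module defines its own tuple.
import requests

RETRY_EXCEPTIONS = (requests.ConnectionError, requests.Timeout)


def describe_node(name):
    # Hypothetical flaky call: transient network errors get retried by
    # retry(); any other exception propagates immediately.
    return requests.get('http://lock.example.com/nodes/' + name,
                        timeout=5).json()


# Retries up to 24 times; with sleep=1 and increment=1 the pauses grow
# roughly as 1s, 2s, 3s, ... between attempts.
info = retry(describe_node, 'smithi001')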
def cloud_init_wait(self, name_or_ip):
    """
    Wait for cloud-init to complete on the name_or_ip OpenStack instance.
    """
    log.debug('cloud_init_wait ' + name_or_ip)
    client_args = {
        'user_at_host': '@'.join((self.username, name_or_ip)),
        'timeout': 240,
        'retry': False,
    }
    if self.key_filename:
        log.debug("using key " + self.key_filename)
        client_args['key_filename'] = self.key_filename
    with safe_while(sleep=30, tries=100,
                    action="cloud_init_wait " + name_or_ip) as proceed:
        success = False
        # CentOS 6.6 logs in /var/log/cloud-init-output.log
        # CentOS 7.0 logs in /var/log/cloud-init.log
        all_done = ("tail /var/log/cloud-init*.log ; " +
                    " test -f /tmp/init.out && tail /tmp/init.out ; " +
                    " grep '" + self.up_string + "' " +
                    "/var/log/cloud-init*.log")
        while proceed():
            try:
                client = connection.connect(**client_args)
            except paramiko.PasswordRequiredException as e:
                raise Exception(
                    "The private key requires a passphrase.\n"
                    "Create a new key with:"
                    " openstack keypair create myself > myself.pem\n"
                    " chmod 600 myself.pem\n"
                    "and call teuthology-openstack with the options\n"
                    " --key-name myself --key-filename myself.pem\n")
            except paramiko.AuthenticationException as e:
                log.debug('cloud_init_wait AuthenticationException ' + str(e))
                continue
            except socket.timeout as e:
                log.debug('cloud_init_wait connect socket.timeout ' + str(e))
                continue
            except socket.error as e:
                log.debug('cloud_init_wait connect socket.error ' + str(e))
                continue
            except Exception as e:
                transients = ('Incompatible ssh peer', 'Unknown server')
                # retry on known-transient SSH errors; re-raise anything else
                if any(transient in str(e) for transient in transients):
                    continue
                log.exception('cloud_init_wait ' + name_or_ip)
                raise
            log.debug('cloud_init_wait ' + all_done)
            try:
                stdin, stdout, stderr = client.exec_command(all_done)
                stdout.channel.settimeout(5)
                out = stdout.read()
                log.debug('cloud_init_wait stdout ' + all_done + ' ' + out)
            except socket.timeout as e:
                client.close()
                log.debug('cloud_init_wait socket.timeout ' + all_done)
                continue
            except socket.error as e:
                client.close()
                log.debug('cloud_init_wait socket.error ' + str(e) +
                          ' ' + all_done)
                continue
            log.debug('cloud_init_wait stderr ' + all_done +
                      ' ' + stderr.read())
            if stdout.channel.recv_exit_status() == 0:
                success = True
            client.close()
            if success:
                break
        return success
def cli_test(ctx, config):
    """
    Exercise the most commonly used ceph-deploy CLI subcommands, make sure
    they all work, and start the cluster via the init system.
    """
    log.info('Ceph-deploy Test')
    if config is None:
        config = {}
    test_branch = ''
    conf_dir = teuthology.get_testdir(ctx) + "/cdtest"

    def execute_cdeploy(admin, cmd, path):
        """Execute ceph-deploy commands, using either the git checkout
        (path) or the installed package."""
        args = ['cd', conf_dir, run.Raw(';')]
        if path:
            args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
        else:
            args.append('ceph-deploy')
        args.append(run.Raw(cmd))
        ec = admin.run(args=args, check_status=False).exitstatus
        if ec != 0:
            raise RuntimeError(
                "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(
                    cmd=cmd, ec=ec))

    if config.get('rhbuild'):
        path = None
    else:
        path = teuthology.get_testdir(ctx)
        # test on branch from config eg: wip-* , master or next etc
        # packages for all distro's should exist for wip*
        if ctx.config.get('branch'):
            branch = ctx.config.get('branch')
            test_branch = ' --dev={branch} '.format(branch=branch)
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    for node, role in mons.remotes.items():
        admin = node
        admin.run(args=['mkdir', conf_dir], check_status=False)
        nodename = admin.shortname
        system_type = teuthology.get_system_type(admin)
        if config.get('rhbuild'):
            admin.run(args=['sudo', 'yum', 'install', 'ceph-deploy', '-y'])
    log.info('system type is %s', system_type)
    osds = ctx.cluster.only(teuthology.is_type('osd'))

    for remote, roles in osds.remotes.items():
        devs = teuthology.get_scratch_devices(remote)
        log.info("roles %s", roles)
        if len(devs) < 3:
            log.error(
                'Test needs minimum of 3 devices, only found %s',
                str(devs))
            raise RuntimeError("Needs minimum of 3 devices ")

    conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir)
    new_cmd = 'new ' + nodename
    execute_cdeploy(admin, new_cmd, path)
    if config.get('conf') is not None:
        confp = config.get('conf')
        for section, keys in confp.items():
            lines = '[{section}]\n'.format(section=section)
            teuthology.append_lines_to_file(admin, conf_path, lines,
                                            sudo=True)
            for key, value in keys.items():
                log.info("[%s] %s = %s" % (section, key, value))
                lines = '{key} = {value}\n'.format(key=key, value=value)
                teuthology.append_lines_to_file(admin, conf_path, lines,
                                                sudo=True)
    new_mon_install = 'install {branch} --mon '.format(
        branch=test_branch) + nodename
    new_mgr_install = 'install {branch} --mgr '.format(
        branch=test_branch) + nodename
    new_osd_install = 'install {branch} --osd '.format(
        branch=test_branch) + nodename
    new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename
    create_initial = 'mon create-initial '
    mgr_create = 'mgr create ' + nodename
    # either use create-keys or push command
    push_keys = 'admin ' + nodename
    execute_cdeploy(admin, new_mon_install, path)
    execute_cdeploy(admin, new_mgr_install, path)
    execute_cdeploy(admin, new_osd_install, path)
    execute_cdeploy(admin, new_admin, path)
    execute_cdeploy(admin, create_initial, path)
    execute_cdeploy(admin, mgr_create, path)
    execute_cdeploy(admin, push_keys, path)

    for i in range(3):
        zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i])
        prepare = 'osd prepare ' + "{n}:{d}".format(n=nodename, d=devs[i])
        execute_cdeploy(admin, zap_disk, path)
        execute_cdeploy(admin, prepare, path)

    log.info("list files for debugging purpose to check file permissions")
    admin.run(args=['ls', run.Raw('-lt'), conf_dir])
    remote.run(args=['sudo', 'ceph', '-s'], check_status=False)
    r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
    out = r.stdout.getvalue()
    log.info('Ceph health: %s', out.rstrip('\n'))
    log.info("Waiting for cluster to become healthy")
    with contextutil.safe_while(sleep=10, tries=6,
                                action='check health') as proceed:
        while proceed():
            r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
            out = r.stdout.getvalue()
            if out.split(None, 1)[0] == 'HEALTH_OK':
                break
    rgw_install = 'install {branch} --rgw {node}'.format(
        branch=test_branch,
        node=nodename,
    )
    rgw_create = 'rgw create ' + nodename
    execute_cdeploy(admin, rgw_install, path)
    execute_cdeploy(admin, rgw_create, path)
    log.info('All ceph-deploy cli tests passed')
    try:
        yield
    finally:
        log.info("cleaning up")
        ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'],
                        check_status=False)
        time.sleep(4)
        for i in range(3):
            umount_dev = "{d}1".format(d=devs[i])
            r = remote.run(args=['sudo', 'umount', run.Raw(umount_dev)])
        cmd = 'purge ' + nodename
        execute_cdeploy(admin, cmd, path)
        cmd = 'purgedata ' + nodename
        execute_cdeploy(admin, cmd, path)
        log.info("Removing temporary dir")
        admin.run(
            args=['rm', run.Raw('-rf'), run.Raw(conf_dir)],
            check_status=False)
        if config.get('rhbuild'):
            admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y'])
def test_mirroring_init_failure_with_recovery(self):
    """Test if the mirror daemon can recover from an init failure"""
    # disable mgr mirroring plugin as it would try to load the dir map on
    # mirroring enabled for a filesystem (and throw up errors in the logs)
    self.disable_mirroring_module()

    # enable mirroring through mon interface -- this should result in the
    # mirror daemon failing to enable mirroring due to absence of the
    # `cephfs_mirror` index object.
    self.mgr_cluster.mon_manager.raw_cluster_cmd(
        "fs", "mirror", "enable", self.primary_fs_name)

    # need safe_while since non-failed status pops up as mirroring is
    # restarted internally in mirror daemon.
    with safe_while(sleep=5, tries=20,
                    action='wait for failed state') as proceed:
        while proceed():
            try:
                # verify via asok
                res = self.mirror_daemon_command(
                    f'mirror status for fs: {self.primary_fs_name}',
                    'fs', 'mirror', 'status',
                    f'{self.primary_fs_name}@{self.primary_fs_id}')
                if 'state' not in res:
                    continue
                self.assertTrue(res['state'] == "failed")
                break
            except:
                pass

    # create the index object and check daemon recovery
    try:
        p = self.mount_a.client_remote.run(
            args=['rados', '-p', self.fs.metadata_pool_name, 'create',
                  'cephfs_mirror'],
            stdout=StringIO(), stderr=StringIO(), timeout=30,
            check_status=True, label="create index object")
        p.wait()
    except CommandFailedError as ce:
        log.warn(
            f'mirror daemon command to create mirror index object '
            f'failed: {ce}')
        raise
    time.sleep(30)
    res = self.mirror_daemon_command(
        f'mirror status for fs: {self.primary_fs_name}',
        'fs', 'mirror', 'status',
        f'{self.primary_fs_name}@{self.primary_fs_id}')
    self.assertTrue(res['peers'] == {})
    self.assertTrue(res['snap_dirs']['dir_count'] == 0)

    self.mgr_cluster.mon_manager.raw_cluster_cmd(
        "fs", "mirror", "disable", self.primary_fs_name)
    time.sleep(10)
    # verify via asok
    try:
        self.mirror_daemon_command(
            f'mirror status for fs: {self.primary_fs_name}',
            'fs', 'mirror', 'status',
            f'{self.primary_fs_name}@{self.primary_fs_id}')
    except CommandFailedError:
        pass
    else:
        raise RuntimeError('expected admin socket to be unavailable')
def cloud_init_wait(self, instance):
    """
    Wait for cloud-init to complete on the given OpenStack instance.
    """
    ip = instance.get_floating_ip_or_ip()
    log.debug('cloud_init_wait ' + ip)
    client_args = {
        'user_at_host': '@'.join((self.username, ip)),
        'timeout': 240,
        'retry': False,
    }
    if self.key_filename:
        log.debug("using key " + self.key_filename)
        client_args['key_filename'] = self.key_filename
    with safe_while(sleep=30, tries=30,
                    action="cloud_init_wait " + ip) as proceed:
        success = False
        # CentOS 6.6 logs in /var/log/cloud-init-output.log
        # CentOS 7.0 logs in /var/log/cloud-init.log
        tail = ("tail --follow=name --retry"
                " /var/log/cloud-init*.log /tmp/init.out")
        while proceed():
            try:
                client = connection.connect(**client_args)
            except paramiko.PasswordRequiredException as e:
                raise Exception(
                    "The private key requires a passphrase.\n"
                    "Create a new key with:"
                    " openstack keypair create myself > myself.pem\n"
                    " chmod 600 myself.pem\n"
                    "and call teuthology-openstack with the options\n"
                    " --key-name myself --key-filename myself.pem\n")
            except paramiko.AuthenticationException as e:
                log.debug('cloud_init_wait AuthenticationException ' + str(e))
                continue
            except socket.timeout as e:
                log.debug('cloud_init_wait connect socket.timeout ' + str(e))
                continue
            except socket.error as e:
                log.debug('cloud_init_wait connect socket.error ' + str(e))
                continue
            except Exception as e:
                transients = ('Incompatible ssh peer', 'Unknown server')
                # retry on known-transient SSH errors; re-raise anything else
                if any(transient in str(e) for transient in transients):
                    continue
                log.exception('cloud_init_wait ' + ip)
                raise
            log.debug('cloud_init_wait ' + tail)
            try:
                # get the I/O channel to iterate line by line
                transport = client.get_transport()
                channel = transport.open_session()
                channel.get_pty()
                channel.settimeout(240)
                output = channel.makefile('r', 1)
                channel.exec_command(tail)
                for line in iter(output.readline, b''):
                    log.info(line.strip())
                    if self.up_string in line:
                        success = True
                        break
            except socket.timeout as e:
                client.close()
                log.debug('cloud_init_wait socket.timeout ' + tail)
                continue
            except socket.error as e:
                client.close()
                log.debug('cloud_init_wait socket.error ' + str(e) +
                          ' ' + tail)
                continue
            client.close()
            if success:
                break
        return success
def task(ctx, config):
    """
    Run watch_notify_same_primary

    The config should be as follows:

    watch_notify_same_primary:
        clients: [client list]

    The client list should contain 1 client

    The test requires 3 osds.

    example:

    tasks:
    - ceph:
    - watch_notify_same_primary:
        clients: [client.0]
    - interactive:
    """
    log.info('Beginning watch_notify_same_primary...')

    assert isinstance(config, dict), \
        "please list clients to run on"

    clients = config.get('clients', ['client.0'])
    assert len(clients) == 1
    role = clients[0]
    assert isinstance(role, six.string_types)
    PREFIX = 'client.'
    assert role.startswith(PREFIX)
    (remote,) = ctx.cluster.only(role).remotes.keys()
    manager = ctx.managers['ceph']
    manager.raw_cluster_cmd('osd', 'set', 'noout')

    pool = manager.create_pool_with_unique_name()

    def obj(n):
        return "foo-{num}".format(num=n)

    def start_watch(n):
        remote.run(
            args=["rados", "-p", pool, "put", obj(n), "/etc/resolv.conf"],
            logger=log.getChild('watch.{id}'.format(id=n)))
        proc = remote.run(
            args=["rados", "-p", pool, "watch", obj(n)],
            stdin=run.PIPE,
            stdout=BytesIO(),
            stderr=BytesIO(),
            wait=False)
        return proc

    num = 20
    watches = [start_watch(i) for i in range(num)]

    # wait for them all to register
    for i in range(num):
        with safe_while() as proceed:
            while proceed():
                lines = remote.sh(
                    ["rados", "-p", pool, "listwatchers", obj(i)])
                num_watchers = lines.count('watcher=')
                log.info('i see %d watchers for %s', num_watchers, obj(i))
                if num_watchers >= 1:
                    break

    def notify(n, msg):
        remote.run(
            args=["rados", "-p", pool, "notify", obj(n), msg],
            logger=log.getChild('notify.{id}'.format(id=n)))

    [notify(n, 'notify1') for n in range(len(watches))]

    manager.kill_osd(0)
    manager.mark_down_osd(0)

    [notify(n, 'notify2') for n in range(len(watches))]

    try:
        yield
    finally:
        log.info('joining watch_notify_stress')
        for watch in watches:
            watch.stdin.write("\n")
        run.wait(watches)
        for watch in watches:
            lines = watch.stdout.getvalue().split("\n")
            got1 = False
            got2 = False
            for l in lines:
                if 'notify1' in l:
                    got1 = True
                if 'notify2' in l:
                    got2 = True
            log.info(lines)
            assert got1 and got2

        manager.revive_osd(0)
        manager.remove_pool(pool)
def task(ctx, config):
    """
    Deploy rook-ceph cluster

      tasks:
      - kubeadm:
      - rook:
          branch: wip-foo
          spec:
            mon:
              count: 1

    The spec item is deep-merged against the cluster.yaml.  The branch,
    sha1, or image items are used to determine the Ceph container image.
    """
    if not config:
        config = {}
    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    log.info('Rook start')

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))
    teuthology.deep_merge(config, overrides.get('rook', {}))
    log.info('Config: ' + str(config))

    # set up cluster context
    if not hasattr(ctx, 'rook'):
        ctx.rook = {}
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    if cluster_name not in ctx.rook:
        ctx.rook[cluster_name] = argparse.Namespace()

    ctx.rook[cluster_name].remote = list(ctx.cluster.remotes.keys())[0]

    # image
    teuth_defaults = teuth_config.get('defaults', {})
    cephadm_defaults = teuth_defaults.get('cephadm', {})
    containers_defaults = cephadm_defaults.get('containers', {})
    container_image_name = containers_defaults.get('image', None)
    if 'image' in config:
        ctx.rook[cluster_name].image = config.get('image')
    else:
        sha1 = config.get('sha1')
        flavor = config.get('flavor', 'default')
        if sha1:
            if flavor == "crimson":
                ctx.rook[cluster_name].image = \
                    container_image_name + ':' + sha1 + '-' + flavor
            else:
                ctx.rook[cluster_name].image = \
                    container_image_name + ':' + sha1
        else:
            # hmm, fall back to branch?
            branch = config.get('branch', 'master')
            ctx.rook[cluster_name].image = container_image_name + ':' + branch
    log.info('Ceph image is %s' % ctx.rook[cluster_name].image)

    with contextutil.nested(
            lambda: rook_operator(ctx, config),
            lambda: ceph_log(ctx, config),
            lambda: rook_cluster(ctx, config),
            lambda: rook_toolbox(ctx, config),
            lambda: wait_for_orch(ctx, config),
            lambda: rook_post_config(ctx, config),
            lambda: wait_for_osds(ctx, config),
            lambda: ceph_config_keyring(ctx, config),
            lambda: ceph_clients(ctx, config),
    ):
        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        ctx.managers[cluster_name] = CephManager(
            ctx.rook[cluster_name].remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + cluster_name),
            cluster=cluster_name,
            rook=True,
        )
        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=config)
            log.info('Rook complete, yielding')
            yield
        finally:
            to_remove = []
            ret = _shell(ctx, config, ['ceph', 'orch', 'ls', '-f', 'json'],
                         stdout=BytesIO())
            if ret.exitstatus == 0:
                r = json.loads(ret.stdout.getvalue().decode('utf-8'))
                for service in r:
                    if service['service_type'] in ['rgw', 'mds', 'nfs',
                                                   'rbd-mirror']:
                        _shell(ctx, config,
                               ['ceph', 'orch', 'rm',
                                service['service_name']])
                        to_remove.append(service['service_name'])
                with safe_while(
                        sleep=10, tries=90,
                        action="waiting for service removal") as proceed:
                    while proceed():
                        ret = _shell(ctx, config,
                                     ['ceph', 'orch', 'ls', '-f', 'json'],
                                     stdout=BytesIO())
                        if ret.exitstatus == 0:
                            r = json.loads(
                                ret.stdout.getvalue().decode('utf-8'))
                            still_up = [service['service_name']
                                        for service in r]
                            matches = set(still_up).intersection(to_remove)
                            if not matches:
                                break
            log.info('Tearing down rook')
def task(ctx, config):
    """
    Run watch_notify_same_primary

    The config should be as follows:

    watch_notify_same_primary:
        clients: [client list]

    The client list should contain 1 client

    The test requires 3 osds.

    example:

    tasks:
    - ceph:
    - watch_notify_same_primary:
        clients: [client.0]
    - interactive:
    """
    log.info('Beginning watch_notify_same_primary...')

    assert isinstance(config, dict), \
        "please list clients to run on"

    clients = config.get('clients', ['client.0'])
    assert len(clients) == 1
    role = clients[0]
    assert isinstance(role, basestring)
    PREFIX = 'client.'
    assert role.startswith(PREFIX)
    (remote,) = ctx.cluster.only(role).remotes.iterkeys()
    ctx.manager.raw_cluster_cmd('osd', 'set', 'noout')

    pool = ctx.manager.create_pool_with_unique_name()

    def obj(n):
        return "foo-{num}".format(num=n)

    def start_watch(n):
        remote.run(
            args=["rados", "-p", pool, "put", obj(n), "/etc/resolv.conf"],
            logger=log.getChild('watch.{id}'.format(id=n)))
        proc = remote.run(
            args=["rados", "-p", pool, "watch", obj(n)],
            stdin=run.PIPE,
            stdout=StringIO(),
            stderr=StringIO(),
            wait=False)
        return proc

    num = 20
    watches = [start_watch(i) for i in range(num)]

    # wait for them all to register
    for i in range(num):
        with safe_while() as proceed:
            while proceed():
                proc = remote.run(
                    args=["rados", "-p", pool, "listwatchers", obj(i)],
                    stdout=StringIO())
                lines = proc.stdout.getvalue()
                num_watchers = lines.count('watcher=')
                log.info('i see %d watchers for %s', num_watchers, obj(i))
                if num_watchers >= 1:
                    break

    def notify(n, msg):
        remote.run(
            args=["rados", "-p", pool, "notify", obj(n), msg],
            logger=log.getChild('notify.{id}'.format(id=n)))

    [notify(n, 'notify1') for n in range(len(watches))]

    ctx.manager.kill_osd(0)
    ctx.manager.mark_down_osd(0)

    [notify(n, 'notify2') for n in range(len(watches))]

    try:
        yield
    finally:
        log.info('joining watch_notify_stress')
        for watch in watches:
            watch.stdin.write("\n")
        run.wait(watches)
        for watch in watches:
            lines = watch.stdout.getvalue().split("\n")
            got1 = False
            got2 = False
            for l in lines:
                if 'notify1' in l:
                    got1 = True
                if 'notify2' in l:
                    got2 = True
            log.info(lines)
            assert got1 and got2

        ctx.manager.revive_osd(0)
        ctx.manager.remove_pool(pool)
def report_job(self, run_name, job_id, job_info=None, dead=False):
    """
    Report a single job to the results server.

    :param run_name: The name of the run.  The run must already exist.
    :param job_id:   The job's id
    :param job_info: The job's info dict. Optional - if not present, we
                     look at the archive.
    """
    if job_info is not None and not isinstance(job_info, dict):
        raise TypeError("job_info must be a dict")
    run_uri = "{base}/runs/{name}/jobs/".format(
        base=self.base_uri, name=run_name,
    )
    if job_info is None:
        job_info = self.serializer.job_info(run_name, job_id)
    if dead and get_status(job_info) is None:
        set_status(job_info, 'dead')
    job_json = json.dumps(job_info)
    headers = {'content-type': 'application/json'}

    inc = random.uniform(0, 1)
    with safe_while(
            sleep=1, increment=inc,
            action=f'report job {job_id}') as proceed:
        while proceed():
            response = self.session.post(run_uri, data=job_json,
                                         headers=headers)

            if response.status_code == 200:
                return

            # This call is wrapped in a try/except because of:
            # http://tracker.ceph.com/issues/8166
            try:
                resp_json = response.json()
            except ValueError:
                resp_json = dict()

            if resp_json:
                msg = resp_json.get('message', '')
            else:
                msg = response.text

            if msg and msg.endswith('already exists'):
                job_uri = os.path.join(run_uri, job_id, '')
                response = self.session.put(job_uri, data=job_json,
                                            headers=headers)
                if response.status_code == 200:
                    return
            elif msg:
                self.log.error(
                    "POST to {uri} failed with status {status}: {msg}".format(
                        uri=run_uri,
                        status=response.status_code,
                        msg=msg,
                    ))
    response.raise_for_status()
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid
    testdir = teuthology.get_testdir(ctx)
    num_mons = 1

    try:
        for remote, roles in ctx.cluster.remotes.items():
            for mon in [r for r in roles
                        if teuthology.is_type('mon', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mon)
                if c_ == cluster_name and \
                        id_ == ctx.ceph[cluster_name].first_mon:
                    continue
                log.info('Adding %s on %s' % (mon, remote.shortname))
                num_mons += 1
                _shell(ctx, cluster_name, remote, [
                    'ceph', 'orchestrator', 'mon', 'update',
                    str(num_mons),
                    remote.shortname + ':' +
                    ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                ])
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

                with contextutil.safe_while(sleep=1, tries=180) as proceed:
                    while proceed():
                        log.info('Waiting for %d mons in monmap...'
                                 % (num_mons))
                        r = _shell(
                            ctx=ctx,
                            cluster_name=cluster_name,
                            remote=remote,
                            args=['ceph', 'mon', 'dump', '-f', 'json'],
                            stdout=StringIO(),
                        )
                        j = json.loads(r.stdout.getvalue())
                        if len(j['mons']) == num_mons:
                            break

        # refresh ceph.conf files for all mons + first mgr
        """
        for remote, roles in ctx.cluster.remotes.items():
            for mon in [r for r in roles
                        if teuthology.is_type('mon', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mon)
                _shell(ctx, cluster_name, remote, [
                    'ceph', 'orchestrator', 'service', 'redeploy',
                    'mon', id_,
                ])
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote, [
            'ceph', 'orchestrator', 'service', 'redeploy',
            'mgr', ctx.ceph[cluster_name].first_mgr,
        ])
        """

        yield
    finally:
        pass
def cloud_init_wait(self, name_or_ip):
    """
    Wait for cloud-init to complete on the name_or_ip OpenStack instance.
    """
    log.debug('cloud_init_wait ' + name_or_ip)
    client_args = {
        'user_at_host': '@'.join((self.username, name_or_ip)),
        'timeout': 10,
        'retry': False,
    }
    if self.key_filename:
        log.debug("using key " + self.key_filename)
        client_args['key_filename'] = self.key_filename
    with safe_while(sleep=2, tries=600,
                    action="cloud_init_wait " + name_or_ip) as proceed:
        success = False
        # CentOS 6.6 logs in /var/log/cloud-init-output.log
        # CentOS 7.0 logs in /var/log/cloud-init.log
        all_done = ("tail /var/log/cloud-init*.log ; " +
                    " test -f /tmp/init.out && tail /tmp/init.out ; " +
                    " grep '" + self.up_string + "' " +
                    "/var/log/cloud-init*.log")
        while proceed():
            try:
                client = connection.connect(**client_args)
            except paramiko.PasswordRequiredException as e:
                raise Exception(
                    "The private key requires a passphrase.\n"
                    "Create a new key with:"
                    " openstack keypair create myself > myself.pem\n"
                    " chmod 600 myself.pem\n"
                    "and call teuthology-openstack with the options\n"
                    " --key-name myself --key-filename myself.pem\n")
            except paramiko.AuthenticationException as e:
                log.debug('cloud_init_wait AuthenticationException ' + str(e))
                continue
            except socket.timeout as e:
                log.debug('cloud_init_wait connect socket.timeout ' + str(e))
                continue
            except socket.error as e:
                log.debug('cloud_init_wait connect socket.error ' + str(e))
                continue
            except Exception as e:
                transients = ('Incompatible ssh peer', 'Unknown server')
                # retry on known-transient SSH errors; re-raise anything else
                if any(transient in str(e) for transient in transients):
                    continue
                log.exception('cloud_init_wait ' + name_or_ip)
                raise
            log.debug('cloud_init_wait ' + all_done)
            try:
                stdin, stdout, stderr = client.exec_command(all_done)
                stdout.channel.settimeout(5)
                out = stdout.read()
                log.debug('cloud_init_wait stdout ' + all_done + ' ' + out)
            except socket.timeout as e:
                client.close()
                log.debug('cloud_init_wait socket.timeout ' + all_done)
                continue
            except socket.error as e:
                client.close()
                log.debug('cloud_init_wait socket.error ' + str(e) +
                          ' ' + all_done)
                continue
            log.debug('cloud_init_wait stderr ' + all_done +
                      ' ' + stderr.read())
            if stdout.channel.recv_exit_status() == 0:
                success = True
            client.close()
            if success:
                break
        return success
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        daemons = {}
        if config.get('add_mons_via_daemon_add'):
            # This is the old way of adding mons that works with the (early)
            # octopus cephadm scheduler.
            num_mons = 1
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    if c_ == cluster_name and \
                            id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    num_mons += 1
                    _shell(ctx, cluster_name, remote, [
                        'ceph', 'orch', 'daemon', 'add', 'mon',
                        remote.shortname + ':' +
                        ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                    ])
                    ctx.daemons.register_daemon(
                        remote, 'mon', id_,
                        cluster=cluster_name,
                        fsid=fsid,
                        logger=log.getChild(mon),
                        wait=False,
                        started=True,
                    )
                    daemons[mon] = (remote, id_)

                    with contextutil.safe_while(sleep=1,
                                                tries=180) as proceed:
                        while proceed():
                            log.info('Waiting for %d mons in monmap...'
                                     % (num_mons))
                            r = _shell(
                                ctx=ctx,
                                cluster_name=cluster_name,
                                remote=remote,
                                args=['ceph', 'mon', 'dump', '-f', 'json'],
                                stdout=StringIO(),
                            )
                            j = json.loads(r.stdout.getvalue())
                            if len(j['mons']) == num_mons:
                                break
        else:
            nodes = []
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    nodes.append(remote.shortname + ':' +
                                 ctx.ceph[cluster_name].mons[mon] +
                                 '=' + id_)
                    if c_ == cluster_name and \
                            id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    daemons[mon] = (remote, id_)

            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mon',
                str(len(nodes)) + ';' + ';'.join(nodes)]
            )
            for mon, (remote, id_) in daemons.items():
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

            with contextutil.safe_while(sleep=1, tries=180) as proceed:
                while proceed():
                    log.info('Waiting for %d mons in monmap...'
                             % (len(nodes)))
                    r = _shell(
                        ctx=ctx,
                        cluster_name=cluster_name,
                        remote=remote,
                        args=['ceph', 'mon', 'dump', '-f', 'json'],
                        stdout=StringIO(),
                    )
                    j = json.loads(r.stdout.getvalue())
                    if len(j['mons']) == len(nodes):
                        break

        # refresh our (final) ceph.conf file
        bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
        log.info('Generating final ceph.conf file...')
        r = _shell(
            ctx=ctx,
            cluster_name=cluster_name,
            remote=bootstrap_remote,
            args=['ceph', 'config', 'generate-minimal-conf'],
            stdout=StringIO(),
        )
        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()
        yield
    finally:
        pass
def rook_cluster(ctx, config):
    cluster_name = config['cluster']

    # count how many OSDs we'll create
    num_devs = 0
    num_hosts = 0
    for remote in ctx.cluster.remotes.keys():
        ls = remote.read_file('/scratch_devs').decode(
            'utf-8').strip().splitlines()
        num_devs += len(ls)
        num_hosts += 1
    ctx.rook[cluster_name].num_osds = num_devs

    # config
    ceph_conf = build_initial_config(ctx, config)
    ceph_conf_fp = BytesIO()
    ceph_conf.write(ceph_conf_fp)
    log.info(f'Config:\n{ceph_conf_fp.getvalue()}')
    _kubectl(ctx, ceph_conf, ['create', '-f', '-'], stdin=yaml.dump({
        'apiVersion': 'v1',
        'kind': 'ConfigMap',
        'metadata': {
            'name': 'rook-config-override',
            'namespace': 'rook-ceph'},
        'data': {
            'config': ceph_conf_fp.getvalue()
        }
    }))

    # cluster
    cluster = {
        'apiVersion': 'ceph.rook.io/v1',
        'kind': 'CephCluster',
        'metadata': {'name': 'rook-ceph', 'namespace': 'rook-ceph'},
        'spec': {
            'cephVersion': {
                'image': ctx.rook[cluster_name].image,
                'allowUnsupported': True,
            },
            'dataDirHostPath': '/var/lib/rook',
            'skipUpgradeChecks': True,
            'mgr': {
                'count': 1,
                'modules': [
                    {'name': 'rook', 'enabled': True},
                ],
            },
            'mon': {
                'count': num_hosts,
                'allowMultiplePerNode': True,
            },
        }
    }
    teuthology.deep_merge(cluster['spec'], config.get('spec', {}))

    cluster_yaml = yaml.dump(cluster)
    log.info(f'Cluster:\n{cluster_yaml}')
    try:
        ctx.rook[cluster_name].remote.write_file('cluster.yaml', cluster_yaml)
        _kubectl(ctx, config, ['create', '-f', 'cluster.yaml'])
        yield
    except Exception as e:
        log.exception(e)
        raise
    finally:
        _kubectl(ctx, config, ['delete', '-f', 'cluster.yaml'],
                 check_status=False)

        # wait for cluster to shut down
        log.info('Waiting for cluster to stop')
        running = True
        with safe_while(sleep=5, tries=100,
                        action="wait for teardown") as proceed:
            while running and proceed():
                p = _kubectl(
                    ctx, config,
                    ['-n', 'rook-ceph', 'get', 'pods'],
                    stdout=BytesIO(),
                )
                running = False
                for line in p.stdout.getvalue().decode(
                        'utf-8').strip().splitlines():
                    name, ready, status, _ = line.split(None, 3)
                    if (
                            name != 'NAME'
                            and not name.startswith('csi-')
                            and not name.startswith('rook-ceph-operator-')
                            and not name.startswith('rook-ceph-tools-')
                    ):
                        running = True
                        break

        _kubectl(
            ctx, config,
            ['-n', 'rook-ceph', 'delete', 'configmap',
             'rook-config-override'],
            check_status=False,
        )
        ctx.rook[cluster_name].remote.run(args=['rm', '-f', 'cluster.yaml'])
def connect(user_at_host, host_key=None, keep_alive=False, timeout=60,
            _SSHClient=None, _create_key=None, retry=True, key_filename=None):
    """
    ssh connection routine.

    :param user_at_host: user@host
    :param host_key:     ssh key
    :param keep_alive:   keep_alive indicator
    :param timeout:      timeout in seconds
    :param _SSHClient:   client, default is paramiko ssh client
    :param _create_key:  routine to create a key (defaults to local
                         create_key)
    :param retry:        Whether or not to retry failed connection attempts
                         (eventually giving up if none succeed).  Default is
                         True
    :param key_filename: Optionally override which private key to use.
    :return: ssh connection.
    """
    user, host = split_user(user_at_host)
    if _SSHClient is None:
        _SSHClient = paramiko.SSHClient
    ssh = _SSHClient()

    if _create_key is None:
        _create_key = create_key

    if host_key is None:
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        if config.verify_host_keys is True:
            ssh.load_system_host_keys()
    else:
        keytype, key = host_key.split(' ', 1)
        ssh.get_host_keys().add(
            hostname=host,
            keytype=keytype,
            key=_create_key(keytype, key)
        )

    connect_args = dict(
        hostname=host,
        username=user,
        timeout=timeout,
    )

    ssh_config_path = os.path.expanduser("~/.ssh/config")
    if os.path.exists(ssh_config_path):
        ssh_config = paramiko.SSHConfig()
        ssh_config.parse(open(ssh_config_path))
        opts = ssh_config.lookup(host)
        if not key_filename and 'identityfile' in opts:
            key_filename = opts['identityfile']
    if key_filename:
        if not isinstance(key_filename, list):
            key_filename = [key_filename]
        key_filename = [os.path.expanduser(f) for f in key_filename]
        connect_args['key_filename'] = key_filename

    log.debug(connect_args)

    if not retry:
        ssh.connect(**connect_args)
    else:
        # Retries are implemented using safe_while
        with safe_while(sleep=1, action='connect to ' + host) as proceed:
            while proceed():
                try:
                    ssh.connect(**connect_args)
                    break
                except paramiko.AuthenticationException:
                    log.exception(
                        "Error connecting to {host}".format(host=host))
    ssh.get_transport().set_keepalive(keep_alive)
    return ssh
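# A hedged usage sketch for connect() above; the host name and key path are
# hypothetical.  With retry=True (the default), authentication failures are
# retried by safe_while until its tries run out.
ssh = connect('ubuntu@smithi001.example.com',
              keep_alive=True,
              key_filename='~/.ssh/id_rsa_teuthology')  # hypothetical key
stdin, stdout, stderr = ssh.exec_command('uname -a')
print(stdout.read().decode('utf-8'))
ssh.close()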
def rook_operator(ctx, config):
    cluster_name = config['cluster']

    rook_branch = config.get('rook_branch', 'master')
    rook_git_url = config.get('rook_git_url', 'https://github.com/rook/rook')

    log.info(f'Cloning {rook_git_url} branch {rook_branch}')
    ctx.rook[cluster_name].remote.run(
        args=[
            'rm', '-rf', 'rook',
            run.Raw('&&'),
            'git', 'clone', '--single-branch', '--branch', rook_branch,
            rook_git_url, 'rook',
        ]
    )

    # operator.yaml
    operator_yaml = ctx.rook[cluster_name].remote.read_file(
        'rook/cluster/examples/kubernetes/ceph/operator.yaml'
    )
    rook_image = config.get('rook_image')
    if rook_image:
        log.info(f'Patching operator to use image {rook_image}')
        crs = list(yaml.load_all(operator_yaml, Loader=yaml.FullLoader))
        assert len(crs) == 2
        crs[1]['spec']['template']['spec']['containers'][0]['image'] = \
            rook_image
        operator_yaml = yaml.dump_all(crs)
    ctx.rook[cluster_name].remote.write_file('operator.yaml', operator_yaml)

    op_job = None
    try:
        log.info('Deploying operator')
        _kubectl(ctx, config, [
            'create',
            '-f', 'rook/cluster/examples/kubernetes/ceph/crds.yaml',
            '-f', 'rook/cluster/examples/kubernetes/ceph/common.yaml',
            '-f', 'operator.yaml',
        ])

        # on centos:
        if teuthology.get_distro(ctx) == 'centos':
            _kubectl(ctx, config, [
                '-n', 'rook-ceph',
                'set', 'env', 'deploy/rook-ceph-operator',
                'ROOK_HOSTPATH_REQUIRES_PRIVILEGED=true'
            ])

        # wait for operator
        op_name = None
        with safe_while(sleep=10, tries=90,
                        action="wait for operator") as proceed:
            while not op_name and proceed():
                p = _kubectl(
                    ctx, config,
                    ['-n', 'rook-ceph', 'get', 'pods',
                     '-l', 'app=rook-ceph-operator'],
                    stdout=BytesIO(),
                )
                for line in p.stdout.getvalue().decode(
                        'utf-8').strip().splitlines():
                    name, ready, status, _ = line.split(None, 3)
                    if status == 'Running':
                        op_name = name
                        break

        # log operator output
        op_job = _kubectl(
            ctx,
            config,
            ['-n', 'rook-ceph', 'logs', '-f', op_name],
            wait=False,
            logger=log.getChild('operator'),
        )

        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        log.info('Cleaning up rook operator')
        _kubectl(ctx, config, [
            'delete',
            '-f', 'operator.yaml',
        ])
        if False:
            # don't bother since we'll tear down k8s anyway (and this
            # mysteriously fails sometimes when deleting some of the CRDs...
            # not sure why!)
            _kubectl(ctx, config, [
                'delete',
                '-f', 'rook/cluster/examples/kubernetes/ceph/common.yaml',
            ])
            _kubectl(ctx, config, [
                'delete',
                '-f', 'rook/cluster/examples/kubernetes/ceph/crds.yaml',
            ])
        ctx.rook[cluster_name].remote.run(
            args=['rm', '-rf', 'rook', 'operator.yaml'])
        if op_job:
            op_job.wait()
        run.wait(ctx.cluster.run(args=['sudo', 'rm', '-rf', '/var/lib/rook']))
def cli_test(ctx, config):
    """
    Exercise the most commonly used ceph-deploy CLI subcommands, make sure
    they all work, and start the cluster via the init system.
    """
    log.info('Ceph-deploy Test')
    if config is None:
        config = {}
    test_branch = ''
    conf_dir = teuthology.get_testdir(ctx) + "/cdtest"

    def execute_cdeploy(admin, cmd, path):
        """Execute ceph-deploy commands, using either the git checkout
        (path) or the installed package."""
        args = ['cd', conf_dir, run.Raw(';')]
        if path:
            args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
        else:
            args.append('ceph-deploy')
        args.append(run.Raw(cmd))
        ec = admin.run(args=args, check_status=False).exitstatus
        if ec != 0:
            raise RuntimeError(
                "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(
                    cmd=cmd, ec=ec))

    if config.get('rhbuild'):
        path = None
    else:
        path = teuthology.get_testdir(ctx)
        # test on branch from config eg: wip-* , master or next etc
        # packages for all distro's should exist for wip*
        if ctx.config.get('branch'):
            branch = ctx.config.get('branch')
            test_branch = ' --dev={branch} '.format(branch=branch)
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    for node, role in mons.remotes.iteritems():
        admin = node
        admin.run(args=['mkdir', conf_dir], check_status=False)
        nodename = admin.shortname
        system_type = teuthology.get_system_type(admin)
        if config.get('rhbuild'):
            admin.run(args=['sudo', 'yum', 'install', 'ceph-deploy', '-y'])
    log.info('system type is %s', system_type)
    osds = ctx.cluster.only(teuthology.is_type('osd'))

    for remote, roles in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        log.info("roles %s", roles)
        if len(devs) < 3:
            log.error(
                'Test needs minimum of 3 devices, only found %s',
                str(devs))
            raise RuntimeError("Needs minimum of 3 devices ")

    conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir)
    new_cmd = 'new ' + nodename
    execute_cdeploy(admin, new_cmd, path)
    if config.get('conf') is not None:
        confp = config.get('conf')
        for section, keys in confp.iteritems():
            lines = '[{section}]\n'.format(section=section)
            teuthology.append_lines_to_file(admin, conf_path, lines,
                                            sudo=True)
            for key, value in keys.iteritems():
                log.info("[%s] %s = %s" % (section, key, value))
                lines = '{key} = {value}\n'.format(key=key, value=value)
                teuthology.append_lines_to_file(admin, conf_path, lines,
                                                sudo=True)
    new_mon_install = 'install {branch} --mon '.format(
        branch=test_branch) + nodename
    new_mgr_install = 'install {branch} --mgr '.format(
        branch=test_branch) + nodename
    new_osd_install = 'install {branch} --osd '.format(
        branch=test_branch) + nodename
    new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename
    create_initial = 'mon create-initial '
    # either use create-keys or push command
    push_keys = 'admin ' + nodename
    execute_cdeploy(admin, new_mon_install, path)
    execute_cdeploy(admin, new_mgr_install, path)
    execute_cdeploy(admin, new_osd_install, path)
    execute_cdeploy(admin, new_admin, path)
    execute_cdeploy(admin, create_initial, path)
    execute_cdeploy(admin, push_keys, path)

    for i in range(3):
        zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i])
        prepare = 'osd prepare ' + "{n}:{d}".format(n=nodename, d=devs[i])
        execute_cdeploy(admin, zap_disk, path)
        execute_cdeploy(admin, prepare, path)

    log.info("list files for debugging purpose to check file permissions")
    admin.run(args=['ls', run.Raw('-lt'), conf_dir])
    remote.run(args=['sudo', 'ceph', '-s'], check_status=False)
    r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
    out = r.stdout.getvalue()
    log.info('Ceph health: %s', out.rstrip('\n'))
    log.info("Waiting for cluster to become healthy")
    with contextutil.safe_while(sleep=10, tries=6,
                                action='check health') as proceed:
        while proceed():
            r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
            out = r.stdout.getvalue()
            if out.split(None, 1)[0] == 'HEALTH_OK':
                break
    rgw_install = 'install {branch} --rgw {node}'.format(
        branch=test_branch,
        node=nodename,
    )
    rgw_create = 'rgw create ' + nodename
    execute_cdeploy(admin, rgw_install, path)
    execute_cdeploy(admin, rgw_create, path)
    log.info('All ceph-deploy cli tests passed')
    try:
        yield
    finally:
        log.info("cleaning up")
        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                              'sudo', 'service', 'ceph', 'stop',
                              run.Raw('||'),
                              'sudo', 'systemctl', 'stop', 'ceph.target'],
                        check_status=False)
        time.sleep(4)
        for i in range(3):
            umount_dev = "{d}1".format(d=devs[i])
            r = remote.run(args=['sudo', 'umount', run.Raw(umount_dev)])
        cmd = 'purge ' + nodename
        execute_cdeploy(admin, cmd, path)
        cmd = 'purgedata ' + nodename
        execute_cdeploy(admin, cmd, path)
        log.info("Removing temporary dir")
        admin.run(
            args=['rm', run.Raw('-rf'), run.Raw(conf_dir)],
            check_status=False)
        if config.get('rhbuild'):
            admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y'])
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid
    num_mons = 1

    try:
        for remote, roles in ctx.cluster.remotes.items():
            for mon in [r for r in roles
                        if teuthology.is_type('mon', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mon)
                if c_ == cluster_name and \
                        id_ == ctx.ceph[cluster_name].first_mon:
                    continue
                log.info('Adding %s on %s' % (mon, remote.shortname))
                num_mons += 1
                _shell(ctx, cluster_name, remote, [
                    'ceph', 'orch', 'daemon', 'add', 'mon',
                    remote.shortname + ':' +
                    ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                ])
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

                with contextutil.safe_while(sleep=1, tries=180) as proceed:
                    while proceed():
                        log.info('Waiting for %d mons in monmap...'
                                 % (num_mons))
                        r = _shell(
                            ctx=ctx,
                            cluster_name=cluster_name,
                            remote=remote,
                            args=['ceph', 'mon', 'dump', '-f', 'json'],
                            stdout=StringIO(),
                        )
                        j = json.loads(r.stdout.getvalue())
                        if len(j['mons']) == num_mons:
                            break

        # refresh our (final) ceph.conf file
        log.info('Generating final ceph.conf file...')
        r = _shell(
            ctx=ctx,
            cluster_name=cluster_name,
            remote=remote,
            args=['ceph', 'config', 'generate-minimal-conf'],
            stdout=StringIO(),
        )
        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()
        yield
    finally:
        pass
def task(ctx, config):
    log.info('Setting up nvme_loop on scratch devices...')
    host = 'hostnqn'
    port = '1'
    devs_by_remote = {}
    old_scratch_by_remote = {}
    for remote, roles in ctx.cluster.remotes.items():
        devs = teuthology.get_scratch_devices(remote)
        devs_by_remote[remote] = devs
        base = '/sys/kernel/config/nvmet'
        remote.run(
            args=[
                'sudo', 'modprobe', 'nvme_loop',
                run.Raw('&&'),
                'sudo', 'mkdir', '-p', f'{base}/hosts/{host}',
                run.Raw('&&'),
                'sudo', 'mkdir', '-p', f'{base}/ports/{port}',
                run.Raw('&&'),
                'echo', 'loop',
                run.Raw('|'),
                'sudo', 'tee', f'{base}/ports/{port}/addr_trtype',
            ]
        )
        for dev in devs:
            short = dev.split('/')[-1]
            log.info(f'Connecting nvme_loop {remote.shortname}:{dev}...')
            remote.run(
                args=[
                    'sudo', 'mkdir', '-p', f'{base}/subsystems/{short}',
                    run.Raw('&&'),
                    'echo', '1',
                    run.Raw('|'),
                    'sudo', 'tee',
                    f'{base}/subsystems/{short}/attr_allow_any_host',
                    run.Raw('&&'),
                    'sudo', 'mkdir', '-p',
                    f'{base}/subsystems/{short}/namespaces/1',
                    run.Raw('&&'),
                    'echo', dev,
                    run.Raw('|'),
                    'sudo', 'tee',
                    f'{base}/subsystems/{short}/namespaces/1/device_path',
                    run.Raw('&&'),
                    'echo', '1',
                    run.Raw('|'),
                    'sudo', 'tee',
                    f'{base}/subsystems/{short}/namespaces/1/enable',
                    run.Raw('&&'),
                    'sudo', 'ln', '-s',
                    f'{base}/subsystems/{short}',
                    f'{base}/ports/{port}/subsystems/{short}',
                    run.Raw('&&'),
                    'sudo', 'nvme', 'connect',
                    '-t', 'loop', '-n', short, '-q', host,
                ]
            )

        # identify nvme_loops devices
        old_scratch_by_remote[remote] = remote.read_file('/scratch_devs')

        with contextutil.safe_while(sleep=1, tries=15) as proceed:
            while proceed():
                p = remote.run(args=['sudo', 'nvme', 'list'],
                               stdout=StringIO())
                new_devs = []
                for line in p.stdout.getvalue().splitlines():
                    dev, _, vendor = line.split()[0:3]
                    if dev.startswith('/dev/') and vendor == 'Linux':
                        new_devs.append(dev)
                log.info(f'new_devs {new_devs}')
                assert len(new_devs) <= len(devs)
                if len(new_devs) == len(devs):
                    break

        remote.write_file(
            path='/scratch_devs',
            data='\n'.join(new_devs) + '\n',
            sudo=True,
        )

    try:
        yield

    finally:
        for remote, devs in devs_by_remote.items():
            for dev in devs:
                short = dev.split('/')[-1]
                log.info(
                    f'Disconnecting nvme_loop {remote.shortname}:{dev}...')
                remote.run(
                    args=['sudo', 'nvme', 'disconnect', '-n', short],
                    check_status=False,
                )
            remote.write_file(
                path='/scratch_devs',
                data=old_scratch_by_remote[remote],
                sudo=True,
            )