def wait_for_reboot(ctx, need_install, timeout, distro=False):
    """
    Loop reconnecting and checking kernel versions until
    they're all correct or the timeout is exceeded.

    :param ctx: Context
    :param need_install: map of client role to the kernel version that
                         still needs to be installed.
    :param timeout: number of seconds before we time out.
    """
    import time
    starttime = time.time()
    while need_install:
        teuthology.reconnect(ctx, timeout)
        # iterate over a copy so entries can be deleted during the loop
        for client in list(need_install.keys()):
            if 'distro' in str(need_install[client]):
                distro = True
            log.info('Checking client {client} for new kernel version...'.format(client=client))
            try:
                if distro:
                    (remote,) = ctx.cluster.only(client).remotes.keys()
                    assert not need_to_install_distro(remote), \
                        'failed to install new distro kernel version within timeout'
                else:
                    assert not need_to_install(ctx, client, need_install[client]), \
                        'failed to install new kernel version within timeout'
                del need_install[client]
            except Exception:
                log.exception("Saw exception")
                # ignore connection resets and asserts while time is left
                if time.time() - starttime > timeout:
                    raise
        time.sleep(1)
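# A minimal usage sketch for the variant above, assuming a teuthology
# Context `ctx` is in scope; the role name and the 'distro' value are
# hypothetical examples inferred from the checks inside the loop.
need_install = {'client.0': 'distro'}  # request the distro kernel on client.0
wait_for_reboot(ctx, need_install, timeout=300)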
def reboot(ctx, remotes):
    for remote in remotes:
        if stale_kernel_mount(remote):
            log.warn('Stale kernel mount on %s!', remote.name)
            log.info('force/no-sync rebooting %s', remote.name)
            # -n is ignored in systemd versions through v229, which means this
            # only works on trusty -- on 7.3 (v219) and xenial (v229) reboot -n
            # still calls sync().
            # args = ['sync', run.Raw('&'),
            #         'sleep', '5', run.Raw(';'),
            #         'sudo', 'reboot', '-f', '-n']
            args = ['for', 'sysrq', 'in', 's', 'u', 'b', run.Raw(';'),
                    'do', 'echo', run.Raw('$sysrq'), run.Raw('|'),
                    'sudo', 'tee', '/proc/sysrq-trigger', run.Raw(';'),
                    'done']
        else:
            log.info('rebooting %s', remote.name)
            args = ['sudo', 'reboot']
        try:
            remote.run(args=args, wait=False)
        except Exception:
            log.exception('ignoring exception during reboot command')
    # we just ignore these procs because reboot -f doesn't actually
    # send anything back to the ssh client!
    if remotes:
        log.info('waiting for nodes to reboot')
        time.sleep(8)  # if we try and reconnect too quickly, it succeeds!
        reconnect(ctx, 480)  # allow 8 minutes for the reboots
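# The args list above expands to this shell loop on the remote host:
#     for sysrq in s u b ; do echo $sysrq | sudo tee /proc/sysrq-trigger ; done
# i.e. emergency sync (s), remount everything read-only (u), then an
# immediate reboot (b). A standalone sketch of the same sequence, assuming
# passwordless sudo and CONFIG_MAGIC_SYSRQ enabled on the target:
import subprocess

for key in ('s', 'u', 'b'):
    # 'b' reboots the machine instantly, so the last iteration never
    # reports a status; failures are deliberately ignored.
    subprocess.run('echo %s | sudo tee /proc/sysrq-trigger' % key,
                   shell=True, check=False)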
def revive_osd(self, osd, timeout=75):
    """
    Revive osds by either power cycling (if indicated by the config)
    or by restarting.
    """
    if self.config.get('powercycle'):
        (remote,) = self.ctx.cluster.only(
            'osd.{o}'.format(o=osd)).remotes.keys()
        self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(
            o=osd, s=remote.name))
        assert remote.console is not None, \
            "powercycling requested but RemoteConsole is not initialized. Check ipmi config."
        remote.console.power_on()
        if not remote.console.check_status(300):
            raise Exception(
                'Failed to revive osd.{o} via ipmi'.format(o=osd))
        teuthology.reconnect(self.ctx, 60, [remote])
        ceph_task.mount_osd_data(self.ctx, remote, str(osd))
        ceph_task.make_admin_daemon_dir(self.ctx, remote)
        self.ctx.daemons.get_daemon('osd', osd).reset()
    self.ctx.daemons.get_daemon('osd', osd).restart()
    # wait for dump_ops_in_flight; this command doesn't appear
    # until after the signal handler is installed and it is safe
    # to stop the osd again without making valgrind leak checks
    # unhappy. see #5924.
    self.wait_run_admin_socket(osd, args=['dump_ops_in_flight'],
                               timeout=timeout)
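# Hypothetical power-off counterpart suggested by the 'kill_osd' log
# message above; a sketch under the assumption that RemoteConsole also
# exposes power_off(), not the verbatim teuthology method.
def kill_osd_sketch(self, osd):
    if self.config.get('powercycle'):
        (remote,) = self.ctx.cluster.only(
            'osd.{o}'.format(o=osd)).remotes.keys()
        remote.console.power_off()  # assumed counterpart of power_on()
    else:
        self.ctx.daemons.get_daemon('osd', osd).stop()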
def kill_cleanup(self):
    assert not self.mounted

    # We need to do a sleep here because we don't know how long it will
    # take for a hard_reset to be effected.
    time.sleep(30)

    try:
        # Wait for node to come back up after reboot
        misc.reconnect(None, 300, [self.client_remote])
    except Exception:
        # attempt to get some useful debug output:
        con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
                                                self.ipmi_user,
                                                self.ipmi_password,
                                                self.ipmi_domain)
        con.check_status(timeout=60)
        raise

    # Verify the node is responsive before cleaning up
    self.client_remote.run(args=['uptime'], timeout=10)

    # Remove mount directory
    self.client_remote.run(
        args=[
            'rmdir',
            '--',
            self.mountpoint,
        ],
        timeout=(5 * 60),
    )
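# The sleep(30) above compensates for a prior hard reset; a hypothetical
# kill() counterpart (a sketch inferred from the comment, not the verbatim
# method) would look roughly like:
def kill_sketch(self):
    con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
                                            self.ipmi_user,
                                            self.ipmi_password,
                                            self.ipmi_domain)
    con.hard_reset()  # assumed RemoteConsole method; returns immediately
    self.mounted = False  # kill_cleanup() asserts this before running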
def kill_cleanup(self):
    assert not self.mounted

    con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
                                            self.ipmi_user,
                                            self.ipmi_password,
                                            self.ipmi_domain)
    con.power_on()

    # Wait for node to come back up after reboot
    misc.reconnect(None, 300, [self.client_remote])

    # Remove mount directory
    self.client_remote.run(args=["rmdir", "--", self.mountpoint])
def revive_osd(self, osd, timeout=75):
    if self.config.get('powercycle'):
        (remote,) = self.ctx.cluster.only(
            'osd.{o}'.format(o=osd)).remotes.keys()
        self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(
            o=osd, s=remote.name))
        assert remote.console is not None, \
            "powercycling requested but RemoteConsole is not initialized. Check ipmi config."
        remote.console.power_on()
        if not remote.console.check_status(300):
            raise Exception('Failed to revive osd.{o} via ipmi'.format(o=osd))
        teuthology.reconnect(self.ctx, 60, [remote])
        ceph_task.mount_osd_data(self.ctx, remote, str(osd))
        ceph_task.make_admin_daemon_dir(self.ctx, remote)
        self.ctx.daemons.get_daemon('osd', osd).reset()
    self.ctx.daemons.get_daemon('osd', osd).restart()
    self.wait_run_admin_socket(osd, timeout=timeout)
def kill_cleanup(self):
    assert not self.mounted

    con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
                                            self.ipmi_user,
                                            self.ipmi_password,
                                            self.ipmi_domain)
    con.power_on()

    # Wait for node to come back up after reboot
    misc.reconnect(None, 300, [self.client_remote])

    # Remove mount directory
    self.client_remote.run(
        args=[
            'rmdir',
            '--',
            self.mountpoint,
        ],
    )
def reboot(ctx, remotes, log):
    import time
    nodes = {}
    for remote in remotes:
        log.info('rebooting %s', remote.name)
        proc = remote.run(
            # note use of -n to force a no-sync reboot
            args=['sudo', 'reboot', '-f', '-n'],
            wait=False)
        nodes[remote] = proc
    # we just ignore these procs because reboot -f doesn't actually
    # send anything back to the ssh client!
    #for remote, proc in nodes.iteritems():
    #    proc.exitstatus.get()
    from teuthology.misc import reconnect
    if remotes:
        log.info('waiting for nodes to reboot')
        time.sleep(5)  # if we try and reconnect too quickly, it succeeds!
        reconnect(ctx, 480)  # allow 8 minutes for the reboots
def reboot(ctx, remotes, log):
    import time
    nodes = {}
    for remote in remotes:
        log.info("rebooting %s", remote.name)
        # note use of -n to force a no-sync reboot
        proc = remote.run(args=["sudo", "reboot", "-f", "-n"], wait=False)
        nodes[remote] = proc
    # we just ignore these procs because reboot -f doesn't actually
    # send anything back to the ssh client!
    # for remote, proc in nodes.iteritems():
    #     proc.exitstatus.get()
    from teuthology.misc import reconnect
    if remotes:
        log.info("waiting for nodes to reboot")
        time.sleep(5)  # if we try and reconnect too quickly, it succeeds!
        reconnect(ctx, 480)  # allow 8 minutes for the reboots
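# Every variant in this set ends by calling teuthology.misc.reconnect; a
# minimal sketch of that polling pattern, assuming each remote exposes a
# reconnect() method returning True once SSH is reachable again:
import time

def reconnect_sketch(remotes, timeout, interval=10):
    starttime = time.time()
    pending = list(remotes)
    while pending:
        # keep only the nodes that still refuse SSH connections
        pending = [r for r in pending if not r.reconnect()]
        if not pending:
            return
        if time.time() - starttime > timeout:
            raise RuntimeError('nodes did not come back within %ds' % timeout)
        time.sleep(interval)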
def task(ctx, config):
    """
    Run chef-solo on all nodes.

    Optional parameters:

    tasks:
    - chef:
        script_url: # override default location for solo-from-scratch for Chef
        chef_repo: # override default Chef repo used by solo-from-scratch
        chef_branch: # to choose a different upstream branch for ceph-qa-chef
    """
    log.info("Running chef-solo...")

    if config is None:
        config = {}
    assert isinstance(config, dict), "chef - need config"
    chef_script = config.get(
        "script_url",
        "http://git.ceph.com/?p=ceph-qa-chef.git;a=blob_plain;f=solo/solo-from-scratch;hb=HEAD"
    )
    chef_repo = config.get("chef_repo", "")
    chef_branch = config.get("chef_branch", "")
    run.wait(
        ctx.cluster.run(
            args=[
                "wget",
                # '-q',
                "-O-",
                # 'https://raw.github.com/ceph/ceph-qa-chef/master/solo/solo-from-scratch',
                chef_script,
                run.Raw("|"),
                run.Raw("CHEF_REPO={repo}".format(repo=chef_repo)),
                run.Raw("CHEF_BRANCH={branch}".format(branch=chef_branch)),
                "sh",
                "-x",
            ],
            wait=False,
        )
    )

    log.info("Reconnecting after ceph-qa-chef run")
    misc.reconnect(ctx, 10)  # Reconnect for ulimit and other ceph-qa-chef changes
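# A hypothetical job-config fragment exercising the overrides above (the
# repo URL and branch name are made-up examples):
#
#     tasks:
#     - chef:
#         chef_repo: https://example.com/ceph-qa-chef.git
#         chef_branch: wip-qa-chef
#
# which reaches task() as the equivalent of:
task(ctx, {'chef_repo': 'https://example.com/ceph-qa-chef.git',
           'chef_branch': 'wip-qa-chef'})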
def revive_osd(self, osd, timeout=75):
    if self.config.get('powercycle'):
        (remote,) = self.ctx.cluster.only(
            'osd.{o}'.format(o=osd)).remotes.keys()
        self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(
            o=osd, s=remote.name))
        assert remote.console is not None, \
            "powercycling requested but RemoteConsole is not initialized. Check ipmi config."
        remote.console.power_on()
        if not remote.console.check_status(300):
            raise Exception('Failed to revive osd.{o} via ipmi'.format(o=osd))
        teuthology.reconnect(self.ctx, 60, [remote])
        ceph_task.mount_osd_data(self.ctx, remote, str(osd))
        ceph_task.make_admin_daemon_dir(self.ctx, remote)
        self.ctx.daemons.get_daemon('osd', osd).reset()
    self.ctx.daemons.get_daemon('osd', osd).restart()
    # wait for dump_ops_in_flight; this command doesn't appear
    # until after the signal handler is installed and it is safe
    # to stop the osd again without making valgrind leak checks
    # unhappy. see #5924.
    self.wait_run_admin_socket(osd, args=['dump_ops_in_flight'],
                               timeout=timeout)
def wait_for_reboot(ctx, need_install, timeout):
    """
    Loop reconnecting and checking kernel versions until
    they're all correct or the timeout is exceeded.
    """
    import time
    starttime = time.time()
    while need_install:
        teuthology.reconnect(ctx, timeout)
        # iterate over a copy so entries can be deleted during the loop
        for client in list(need_install.keys()):
            log.info('Checking client {client} for new kernel version...'.format(client=client))
            try:
                assert not need_to_install(ctx, client, need_install[client]), \
                    'failed to install new kernel version within timeout'
                del need_install[client]
            except Exception:
                # ignore connection resets and asserts while time is left
                if time.time() - starttime > timeout:
                    raise
        time.sleep(1)
def wait_for_reboot(ctx, need_install, timeout):
    """
    Loop reconnecting and checking kernel versions until
    they're all correct or the timeout is exceeded.
    """
    import time
    starttime = time.time()
    while need_install:
        teuthology.reconnect(ctx, timeout)
        # iterate over a copy so entries can be deleted during the loop
        for client in list(need_install.keys()):
            log.info(
                'Checking client {client} for new kernel version...'.format(
                    client=client))
            try:
                assert not need_to_install(ctx, client, need_install[client]), \
                    'failed to install new kernel version within timeout'
                del need_install[client]
            except Exception:
                # ignore connection resets and asserts while time is left
                if time.time() - starttime > timeout:
                    raise
        time.sleep(1)
def wait_for_reboot(ctx, need_install, timeout, distro=False):
    """
    Loop reconnecting and checking kernel versions until
    they're all correct or the timeout is exceeded.

    :param ctx: Context
    :param need_install: map of client role to the kernel version that
                         still needs to be installed.
    :param timeout: number of seconds before we time out.
    """
    import time
    # do not try to reconnect immediately after triggering the reboot,
    # because the reboot sequence might not have started yet (!) --
    # see https://tracker.ceph.com/issues/44187
    time.sleep(30)
    starttime = time.time()
    while need_install:
        teuthology.reconnect(ctx, timeout)
        for client in list(need_install.keys()):
            if 'distro' in str(need_install[client]):
                distro = True
            log.info(
                'Checking client {client} for new kernel version...'.format(
                    client=client))
            try:
                if distro:
                    (remote,) = ctx.cluster.only(client).remotes.keys()
                    assert not need_to_install_distro(remote), \
                        'failed to install new distro kernel version within timeout'
                else:
                    assert not need_to_install(ctx, client, need_install[client]), \
                        'failed to install new kernel version within timeout'
                del need_install[client]
            except Exception:
                log.exception("Saw exception")
                # ignore connection resets and asserts while time is left
                if time.time() - starttime > timeout:
                    raise
        time.sleep(1)
def poweron(ctx, config):
    """
    tasks:
        ceph-ipmi.poweron: [osd.0]
        check_status: false
    """
    # normalize None before the type check so an empty config is accepted
    if config is None:
        config = {}
    assert isinstance(config, dict) or isinstance(config, list), \
        "task ceph_ipmi only supports a list or dictionary for configuration"
    if isinstance(config, list):
        config = dict((role, None) for role in config)
    roles = config.keys()
    last_remote = []
    for role in roles:
        (remote,) = ctx.cluster.only(role).remotes.keys()
        cluster_name, _, _ = teuthology.split_role(role)
        if remote not in last_remote:
            log.info("Powering on host containing %s" % role)
            ipmi = IpmiCapabilities(
                remote,
                ctx.teuthology_config.get('ipmi_user', None),
                ctx.teuthology_config.get('ipmi_password', None),
                ctx.teuthology_config.get('ipmi_domain', None),
                timeout=180)
            ipmi.power_on()
            last_remote.append(remote)
            if config.get('check_status', True):
                ipmi.check_status()
    teuthology.reconnect(ctx, 360)
    yield
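# The config normalization above makes a bare role list and a role->None
# mapping equivalent; a quick self-contained check of that behaviour
# (hypothetical roles):
config = ['osd.0', 'osd.1']  # as in: ceph-ipmi.poweron: [osd.0, osd.1]
if isinstance(config, list):
    config = dict((role, None) for role in config)
assert config == {'osd.0': None, 'osd.1': None}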
def reboot(ctx, remotes, log):
    from .orchestra import run
    import time
    nodes = {}
    for remote in remotes:
        log.info('rebooting %s', remote.name)
        proc = remote.run(
            # note use of -n to force a no-sync reboot
            args=[
                'timeout', '5', 'sync',
                run.Raw(';'),
                'sudo', 'reboot', '-f', '-n',
            ],
            wait=False,
        )
        nodes[remote] = proc
    # we just ignore these procs because reboot -f doesn't actually
    # send anything back to the ssh client!
    #for remote, proc in nodes.iteritems():
    #    proc.exitstatus.get()
    from teuthology.misc import reconnect
    if remotes:
        log.info('waiting for nodes to reboot')
        time.sleep(5)  # if we try and reconnect too quickly, it succeeds!
        reconnect(ctx, 480)  # allow 8 minutes for the reboots
def task(ctx, config):
    """
    Run chef-solo on all nodes.
    """
    log.info('Running chef-solo...')

    run.wait(
        ctx.cluster.run(
            args=[
                'wget',
                # '-q',
                '-O-',
                # 'https://raw.github.com/ceph/ceph-qa-chef/master/solo/solo-from-scratch',
                'http://ceph.com/git/?p=ceph-qa-chef.git;a=blob_plain;f=solo/solo-from-scratch;hb=HEAD',
                run.Raw('|'),
                'sh',
                '-x',
            ],
            wait=False,
        ))

    log.info('Reconnecting after ceph-qa-chef run')
    misc.reconnect(ctx, 10)  # Reconnect for ulimit and other ceph-qa-chef changes
def task(ctx, config):
    """
    Run chef-solo on all nodes.
    """
    log.info('Running chef-solo...')

    run.wait(
        ctx.cluster.run(
            args=[
                'wget',
                # '-q',
                '-O-',
                # 'https://raw.github.com/ceph/ceph-qa-chef/master/solo/solo-from-scratch',
                'http://ceph.com/git/?p=ceph-qa-chef.git;a=blob_plain;f=solo/solo-from-scratch;hb=HEAD',
                run.Raw('|'),
                'sh',
                '-x',
            ],
            wait=False,
        )
    )

    log.info('Reconnecting after ceph-qa-chef run')
    misc.reconnect(ctx, 10)  # Reconnect for ulimit and other ceph-qa-chef changes
def task(ctx, config):
    """
    - tasks:
        ceph-deploy:
        systemd:

    Test ceph systemd services can start, stop and restart and
    check for any failed services and report back errors
    """
    for remote, roles in ctx.cluster.remotes.items():
        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph'])
        units = remote.sh('sudo systemctl list-units | grep ceph',
                          check_status=False)
        log.info(units)
        if units.find('failed') != -1:
            log.info("Ceph services in failed state")

        # test overall service stop and start using ceph.target
        # ceph.target tests are meant for ceph systemd tests
        # and not actual process testing using 'ps'
        log.info("Stopping all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        log.info(status)
        log.info("Checking process status")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')
        # check the captured unit status, not the ps output, and compare
        # find() against -1 (find() returns an offset, not a boolean)
        if status.find('Active: inactive') != -1:
            log.info("Successfully stopped all ceph services")
        else:
            log.info("Failed to stop ceph services")

        log.info("Starting all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        log.info(status)
        if status.find('Active: active') != -1:
            log.info("Successfully started all Ceph services")
        else:
            log.info("Failed to start Ceph services")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')
        log.info(ps_eaf)
        time.sleep(4)

        # test individual services start stop
        name = remote.shortname
        mon_name = 'ceph-mon@' + name + '.service'
        mds_name = 'ceph-mds@' + name + '.service'
        mgr_name = 'ceph-mgr@' + name + '.service'
        mon_role_name = 'mon.' + name
        mds_role_name = 'mds.' + name
        mgr_role_name = 'mgr.' + name
        m_osd = re.search(r'--id (\d+) --setuser ceph', ps_eaf)
        if m_osd:
            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
            remote.run(args=['sudo', 'systemctl', 'status', osd_service])
            remote.run(args=['sudo', 'systemctl', 'stop', osd_service])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, osd_service)
            log.info(status)
            if status.find('Active: inactive') != -1:
                log.info("Successfully stopped single osd ceph service")
            else:
                log.info("Failed to stop ceph osd services")
            remote.sh(['sudo', 'systemctl', 'start', osd_service])
            time.sleep(4)
        if mon_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mon_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mon_name)
            if status.find('Active: inactive') != -1:
                log.info("Successfully stopped single mon ceph service")
            else:
                log.info("Failed to stop ceph mon service")
            remote.run(args=['sudo', 'systemctl', 'start', mon_name])
            time.sleep(4)
        if mgr_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mgr_name)
            if status.find('Active: inactive') != -1:
                log.info("Successfully stopped single ceph mgr service")
            else:
                log.info("Failed to stop ceph mgr service")
            remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
            time.sleep(4)
        if mds_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mds_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mds_name)
            if status.find('Active: inactive') != -1:
                log.info("Successfully stopped single ceph mds service")
            else:
                log.info("Failed to stop ceph mds service")
            remote.run(args=['sudo', 'systemctl', 'start', mds_name])
            time.sleep(4)

    # reboot all nodes and verify the systemd units restart
    # workunit that runs would fail if any of the systemd units doesn't start
    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
    # avoid immediate reconnect
    time.sleep(120)
    reconnect(ctx, 480)  # reconnect all nodes
    # for debug info
    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph'])
    # wait for HEALTH_OK
    mon = get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon).remotes.keys()
    wait_until_healthy(ctx, mon_remote, use_sudo=True)
    yield
def task(ctx, config):
    """
    - tasks:
        ceph-deploy:
        systemd:

    Test ceph systemd services can start, stop and restart and
    check for any failed services and report back errors
    """
    for remote, roles in ctx.cluster.remotes.items():
        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph'])
        r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO(),
                       check_status=False)
        log.info(r.stdout.getvalue())
        if r.stdout.getvalue().find('failed') != -1:
            log.info("Ceph services in failed state")

        # test overall service stop and start using ceph.target
        # ceph.target tests are meant for ceph systemd tests
        # and not actual process testing using 'ps'
        log.info("Stopping all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
                       stdout=StringIO(), check_status=False)
        # keep the unit status before r is reused for the ps output below
        status = r.stdout.getvalue()
        log.info(status)
        log.info("Checking process status")
        r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO())
        if status.find('Active: inactive') != -1:
            log.info("Successfully stopped all ceph services")
        else:
            log.info("Failed to stop ceph services")

        log.info("Starting all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
                       stdout=StringIO())
        log.info(r.stdout.getvalue())
        if r.stdout.getvalue().find('Active: active') != -1:
            log.info("Successfully started all Ceph services")
        else:
            log.info("Failed to start Ceph services")
        r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO())
        log.info(r.stdout.getvalue())
        time.sleep(4)

        # test individual services start stop
        name = remote.shortname
        mon_name = 'ceph-mon@' + name + '.service'
        mds_name = 'ceph-mds@' + name + '.service'
        mgr_name = 'ceph-mgr@' + name + '.service'
        mon_role_name = 'mon.' + name
        mds_role_name = 'mds.' + name
        mgr_role_name = 'mgr.' + name
        m_osd = re.search(r'--id (\d+) --setuser ceph', r.stdout.getvalue())
        if m_osd:
            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
            remote.run(args=['sudo', 'systemctl', 'status', osd_service])
            remote.run(args=['sudo', 'systemctl', 'stop', osd_service])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', osd_service],
                           stdout=StringIO(), check_status=False)
            log.info(r.stdout.getvalue())
            if r.stdout.getvalue().find('Active: inactive') != -1:
                log.info("Successfully stopped single osd ceph service")
            else:
                log.info("Failed to stop ceph osd services")
            remote.run(args=['sudo', 'systemctl', 'start', osd_service])
            time.sleep(4)
        if mon_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mon_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mon_name],
                           stdout=StringIO(), check_status=False)
            if r.stdout.getvalue().find('Active: inactive') != -1:
                log.info("Successfully stopped single mon ceph service")
            else:
                log.info("Failed to stop ceph mon service")
            remote.run(args=['sudo', 'systemctl', 'start', mon_name])
            time.sleep(4)
        if mgr_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mgr_name],
                           stdout=StringIO(), check_status=False)
            if r.stdout.getvalue().find('Active: inactive') != -1:
                log.info("Successfully stopped single ceph mgr service")
            else:
                log.info("Failed to stop ceph mgr service")
            remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
            time.sleep(4)
        if mds_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mds_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mds_name],
                           stdout=StringIO(), check_status=False)
            if r.stdout.getvalue().find('Active: inactive') != -1:
                log.info("Successfully stopped single ceph mds service")
            else:
                log.info("Failed to stop ceph mds service")
            remote.run(args=['sudo', 'systemctl', 'start', mds_name])
            time.sleep(4)

    # reboot all nodes and verify the systemd units restart
    # workunit that runs would fail if any of the systemd units doesn't start
    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
    # avoid immediate reconnect
    time.sleep(120)
    reconnect(ctx, 480)  # reconnect all nodes
    # for debug info
    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph'])
    # wait for HEALTH_OK
    mon = get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon).remotes.keys()
    wait_until_healthy(ctx, mon_remote)
    yield
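# wait_until_healthy is called above but not shown; a minimal sketch of
# such a poll, assuming remote.sh returns command output as a string (the
# timeout and interval values here are arbitrary):
import time

def wait_until_healthy_sketch(remote, timeout=300, use_sudo=False):
    cmd = 'ceph health'
    if use_sudo:
        cmd = 'sudo ' + cmd
    starttime = time.time()
    while True:
        health = remote.sh(cmd)
        if 'HEALTH_OK' in health:
            return
        if time.time() - starttime > timeout:
            raise RuntimeError('cluster did not reach HEALTH_OK: %s' % health)
        time.sleep(10)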