Example no. 1
def wait_for_reboot(ctx, need_install, timeout, distro=False):
    """
    Loop reconnecting and checking kernel versions until
    they're all correct or the timeout is exceeded.

    :param ctx: Context
    :param need_install: map of clients to the kernel version each still needs installed.
    :param timeout: number of seconds before we time out.
    """
    import time
    starttime = time.time()
    while need_install:
        teuthology.reconnect(ctx, timeout)
        for client in list(need_install.keys()):
            if 'distro' in str(need_install[client]):
                distro = True
            log.info('Checking client {client} for new kernel version...'.format(client=client))
            try:
                if distro:
                    (remote,) = ctx.cluster.only(client).remotes.keys()
                    assert not need_to_install_distro(remote), \
                            'failed to install new distro kernel version within timeout'

                else:
                    assert not need_to_install(ctx, client, need_install[client]), \
                            'failed to install new kernel version within timeout'
                del need_install[client]
            except Exception:
                log.exception("Saw exception")
                # ignore connection resets and asserts while time is left
                if time.time() - starttime > timeout:
                    raise
        time.sleep(1)
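
The loop above is a retry-with-deadline pattern: keep reconnecting and re-checking, treat connection resets and failed assertions as transient, and only re-raise once the overall timeout has elapsed. A minimal, framework-free sketch of the same idea (the helper name and signature are ours, not teuthology's):

import time

def retry_until(check, timeout, interval=1):
    """Keep calling check(); swallow failures while time remains, re-raise after the deadline."""
    start = time.time()
    while True:
        try:
            assert check(), 'condition not met yet'
            return
        except Exception:
            # transient failures (connection resets, asserts) are ignored until the deadline
            if time.time() - start > timeout:
                raise
        time.sleep(interval)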
Example no. 2
def reboot(ctx, remotes):
    for remote in remotes:
        if stale_kernel_mount(remote):
            log.warn('Stale kernel mount on %s!', remote.name)
            log.info('force/no-sync rebooting %s', remote.name)
            # -n is ignored in systemd versions through v229, which means this
            # only works on trusty -- on 7.3 (v219) and xenial (v229) reboot -n
            # still calls sync().
            # args = ['sync', run.Raw('&'),
            #         'sleep', '5', run.Raw(';'),
            #         'sudo', 'reboot', '-f', '-n']
            args = [
                'for', 'sysrq', 'in', 's', 'u', 'b',
                run.Raw(';'), 'do', 'echo',
                run.Raw('$sysrq'),
                run.Raw('|'), 'sudo', 'tee', '/proc/sysrq-trigger',
                run.Raw(';'), 'done'
            ]
        else:
            log.info('rebooting %s', remote.name)
            args = ['sudo', 'reboot']
        try:
            remote.run(args=args, wait=False)
        except Exception:
            log.exception('ignoring exception during reboot command')
        # we just ignore these procs because reboot -f doesn't actually
        # send anything back to the ssh client!
    if remotes:
        log.info('waiting for nodes to reboot')
        time.sleep(8)  # if we try and reconnect too quickly, it succeeds!
        reconnect(ctx, 480)  # allow 8 minutes for the reboots
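
For reference, the sysrq branch above assembles a single shell pipeline on the remote host; a hedged, framework-free sketch of that pipeline (assuming run.Raw simply marks tokens that must not be shell-quoted):

def sysrq_reboot_command():
    """The shell line the args list above builds (illustration only): 's' forces an
    emergency sync, 'u' remounts filesystems read-only, 'b' reboots immediately,
    all through the kernel's magic-sysrq interface, so a stale mount or systemd's
    implicit sync cannot stall the reboot."""
    return ('for sysrq in s u b ; do '
            'echo $sysrq | sudo tee /proc/sysrq-trigger ; done')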
Example no. 3
    def revive_osd(self, osd, timeout=75):
        """
        Revive osds by either power cycling (if indicated by the config)
        or by restarting.
        """
        if self.config.get('powercycle'):
            (remote, ) = self.ctx.cluster.only(
                'osd.{o}'.format(o=osd)).remotes.iterkeys()
            self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(
                o=osd, s=remote.name))
            assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized.  Check ipmi config."
            remote.console.power_on()
            if not remote.console.check_status(300):
                raise Exception(
                    'Failed to revive osd.{o} via ipmi'.format(o=osd))
            teuthology.reconnect(self.ctx, 60, [remote])
            ceph_task.mount_osd_data(self.ctx, remote, str(osd))
            ceph_task.make_admin_daemon_dir(self.ctx, remote)
            self.ctx.daemons.get_daemon('osd', osd).reset()
        self.ctx.daemons.get_daemon('osd', osd).restart()
        # wait for dump_ops_in_flight; this command doesn't appear
        # until after the signal handler is installed and it is safe
        # to stop the osd again without making valgrind leak checks
        # unhappy.  see #5924.
        self.wait_run_admin_socket(osd,
                                   args=['dump_ops_in_flight'],
                                   timeout=timeout)
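
The final wait_run_admin_socket call polls the OSD's admin socket until dump_ops_in_flight answers. A rough, framework-free sketch of that polling idea using the standard 'ceph daemon' CLI (the helper name, timings and error message are ours):

import subprocess
import time

def wait_admin_socket(osd_id, timeout=75, interval=2):
    """Poll 'ceph daemon osd.N dump_ops_in_flight' until it succeeds or the timeout expires."""
    deadline = time.time() + timeout
    while True:
        proc = subprocess.run(
            ['sudo', 'ceph', 'daemon', 'osd.%d' % osd_id, 'dump_ops_in_flight'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if proc.returncode == 0:
            return
        if time.time() > deadline:
            raise RuntimeError('admin socket for osd.%d never came up' % osd_id)
        time.sleep(interval)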
Example no. 4
    def kill_cleanup(self):
        assert not self.mounted

        # We need to do a sleep here because we don't know how long it will
        # take for a hard_reset to be effected.
        time.sleep(30)

        try:
            # Wait for node to come back up after reboot
            misc.reconnect(None, 300, [self.client_remote])
        except:
            # attempt to get some useful debug output:
            con = orchestra_remote.getRemoteConsole(
                self.client_remote.hostname, self.ipmi_user,
                self.ipmi_password, self.ipmi_domain)
            con.check_status(timeout=60)
            raise

        # Sanity-check that the node is responsive again
        self.client_remote.run(args=['uptime'], timeout=10)

        # Remove mount directory
        self.client_remote.run(
            args=[
                'rmdir',
                '--',
                self.mountpoint,
            ],
            timeout=(5 * 60),
        )
Example no. 5
    def kill_cleanup(self):
        assert not self.mounted

        con = orchestra_remote.getRemoteConsole(
            self.client_remote.hostname, self.ipmi_user, self.ipmi_password, self.ipmi_domain
        )
        con.power_on()

        # Wait for node to come back up after reboot
        misc.reconnect(None, 300, [self.client_remote])

        # Remove mount directory
        self.client_remote.run(args=["rmdir", "--", self.mountpoint])
Example no. 6
    def revive_osd(self, osd, timeout=75):
        if self.config.get('powercycle'):
            (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
            self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name))
            assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized.  Check ipmi config."
            remote.console.power_on()
            if not remote.console.check_status(300):
                raise Exception('Failed to revive osd.{o} via ipmi'.format(o=osd))
            teuthology.reconnect(self.ctx, 60, [remote])
            ceph_task.mount_osd_data(self.ctx, remote, str(osd))
            ceph_task.make_admin_daemon_dir(self.ctx, remote)
            self.ctx.daemons.get_daemon('osd', osd).reset()
        self.ctx.daemons.get_daemon('osd', osd).restart()
        self.wait_run_admin_socket(osd, timeout=timeout)
Example no. 7
    def kill_cleanup(self):
        assert not self.mounted

        con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
                                                self.ipmi_user,
                                                self.ipmi_password,
                                                self.ipmi_domain)
        con.power_on()

        # Wait for node to come back up after reboot
        misc.reconnect(None, 300, [self.client_remote])

        # Remove mount directory
        self.client_remote.run(args=[
            'rmdir',
            '--',
            self.mountpoint,
        ])
Example no. 8
def reboot(ctx, remotes, log):
    import time
    nodes = {}
    for remote in remotes:
        log.info('rebooting %s', remote.name)
        proc = remote.run(  # note use of -n to force a no-sync reboot
            args=['sudo', 'reboot', '-f', '-n'],
            wait=False)
        nodes[remote] = proc
        # we just ignore these procs because reboot -f doesn't actually
        # send anything back to the ssh client!
        #for remote, proc in nodes.iteritems():
        #proc.exitstatus.get()
    from teuthology.misc import reconnect
    if remotes:
        log.info('waiting for nodes to reboot')
        time.sleep(5)  #if we try and reconnect too quickly, it succeeds!
        reconnect(ctx, 480)  #allow 8 minutes for the reboots
Example no. 9
def reboot(ctx, remotes, log):
    import time

    nodes = {}
    for remote in remotes:
        log.info("rebooting %s", remote.name)
        proc = remote.run(args=["sudo", "reboot", "-f", "-n"], wait=False)  # note use of -n to force a no-sync reboot
        nodes[remote] = proc
        # we just ignore these procs because reboot -f doesn't actually
        # send anything back to the ssh client!
        # for remote, proc in nodes.iteritems():
        # proc.exitstatus.get()
    from teuthology.misc import reconnect

    if remotes:
        log.info("waiting for nodes to reboot")
        time.sleep(5)  # if we try and reconnect too quickly, it succeeds!
        reconnect(ctx, 480)  # allow 8 minutes for the reboots
Example no. 10
def task(ctx, config):
    """
    Run chef-solo on all nodes.

    Optional parameters:
    tasks:
    - chef:
        script_url: # override default location for solo-from-scratch for Chef
        chef_repo: # override default Chef repo used by solo-from-scratch
        chef_branch: # to choose a different upstream branch for ceph-qa-chef
    """
    log.info("Running chef-solo...")

    if config is None:
        config = {}

    assert isinstance(config, dict), "chef - need config"
    chef_script = config.get(
        "script_url", "http://git.ceph.com/?p=ceph-qa-chef.git;a=blob_plain;f=solo/solo-from-scratch;hb=HEAD"
    )
    chef_repo = config.get("chef_repo", "")
    chef_branch = config.get("chef_branch", "")
    run.wait(
        ctx.cluster.run(
            args=[
                "wget",
                #                '-q',
                "-O-",
                #                'https://raw.github.com/ceph/ceph-qa-chef/master/solo/solo-from-scratch',
                chef_script,
                run.Raw("|"),
                run.Raw("CHEF_REPO={repo}".format(repo=chef_repo)),
                run.Raw("CHEF_BRANCH={branch}".format(branch=chef_branch)),
                "sh",
                "-x",
            ],
            wait=False,
        )
    )

    log.info("Reconnecting after ceph-qa-chef run")
    misc.reconnect(ctx, 10)  # Reconnect for ulimit and other ceph-qa-chef changes
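
For reference, the optional parameters from the docstring correspond to a config dict like the one below when the task is invoked by the teuthology runner; the URL is the default from the code above, the branch name is purely illustrative:

example_config = {
    'script_url': 'http://git.ceph.com/?p=ceph-qa-chef.git;a=blob_plain;'
                  'f=solo/solo-from-scratch;hb=HEAD',   # default used above
    'chef_repo': '',            # '' is the default in the code above
    'chef_branch': 'wip-qa',    # hypothetical branch name; default is ''
}
# task(ctx, example_config)     # ctx is supplied by the teuthology runner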
Example no. 11
    def revive_osd(self, osd, timeout=75):
        if self.config.get('powercycle'):
            (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
            self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name))
            assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized.  Check ipmi config."
            remote.console.power_on()
            if not remote.console.check_status(300):
                raise Exception('Failed to revive osd.{o} via ipmi'.format(o=osd))
            teuthology.reconnect(self.ctx, 60, [remote])
            ceph_task.mount_osd_data(self.ctx, remote, str(osd))
            ceph_task.make_admin_daemon_dir(self.ctx, remote)
            self.ctx.daemons.get_daemon('osd', osd).reset()
        self.ctx.daemons.get_daemon('osd', osd).restart()
        # wait for dump_ops_in_flight; this command doesn't appear
        # until after the signal handler is installed and it is safe
        # to stop the osd again without making valgrind leak checks
        # unhappy.  see #5924.
        self.wait_run_admin_socket(osd,
                                   args=['dump_ops_in_flight'],
                                   timeout=timeout)
Example no. 12
def wait_for_reboot(ctx, need_install, timeout):
    """
    Loop reconnecting and checking kernel versions until
    they're all correct or the timeout is exceeded.
    """
    import time
    starttime = time.time()
    while need_install:
        teuthology.reconnect(ctx, timeout)
        for client in need_install.keys():
            log.info('Checking client {client} for new kernel version...'.format(client=client))
            try:
                assert not need_to_install(ctx, client, need_install[client]), \
                        'failed to install new kernel version within timeout'
                del need_install[client]
            except:
                # ignore connection resets and asserts while time is left
                if time.time() - starttime > timeout:
                    raise
        time.sleep(1)
Example no. 13
def wait_for_reboot(ctx, need_install, timeout):
    """
    Loop reconnecting and checking kernel versions until
    they're all correct or the timeout is exceeded.
    """
    import time
    starttime = time.time()
    while need_install:
        teuthology.reconnect(ctx, timeout)
        for client in need_install.keys():
            log.info(
                'Checking client {client} for new kernel version...'.format(
                    client=client))
            try:
                assert not need_to_install(ctx, client, need_install[client]), \
                        'failed to install new kernel version within timeout'
                del need_install[client]
            except:
                # ignore connection resets and asserts while time is left
                if time.time() - starttime > timeout:
                    raise
        time.sleep(1)
Example no. 14
def wait_for_reboot(ctx, need_install, timeout, distro=False):
    """
    Loop reconnecting and checking kernel versions until
    they're all correct or the timeout is exceeded.

    :param ctx: Context
    :param need_install: map of clients to the kernel version each still needs installed.
    :param timeout: number of seconds before we time out.
    """
    import time
    # do not try to reconnect immediately after triggering the reboot,
    # because the reboot sequence might not have started yet (!) --
    # see https://tracker.ceph.com/issues/44187
    time.sleep(30)
    starttime = time.time()
    while need_install:
        teuthology.reconnect(ctx, timeout)
        for client in list(need_install.keys()):
            if 'distro' in str(need_install[client]):
                distro = True
            log.info(
                'Checking client {client} for new kernel version...'.format(
                    client=client))
            try:
                if distro:
                    (remote, ) = ctx.cluster.only(client).remotes.keys()
                    assert not need_to_install_distro(remote), \
                            'failed to install new distro kernel version within timeout'

                else:
                    assert not need_to_install(ctx, client, need_install[client]), \
                            'failed to install new kernel version within timeout'
                del need_install[client]
            except Exception:
                log.exception("Saw exception")
                # ignore connection resets and asserts while time is left
                if time.time() - starttime > timeout:
                    raise
        time.sleep(1)
Example no. 15
def poweron(ctx, config):
    """
    tasks:
        ceph-ipmi.poweron: [osd.0]
        check_status: false

    """

    if config is None:
        config = {}
    elif isinstance(config, list):
        config = dict((role, None) for role in config)

    assert isinstance(config, dict), \
        "task ceph_ipmi only supports a list or dictionary for configuration"
    roles = config.keys()
    last_remote = []

    for role in roles:
        (remote, ) = ctx.cluster.only(role).remotes.iterkeys()
        cluster_name, _, _ = teuthology.split_role(role)
        if remote not in last_remote:
            log.info("Powering on host containing %s" % role)
            ipmi = IpmiCapabilities(
                remote,
                ctx.teuthology_config.get('ipmi_user', None),
                ctx.teuthology_config.get('ipmi_password', None),
                ctx.teuthology_config.get('ipmi_domain', None),
                timeout=180)
            ipmi.power_on()
            last_remote.append(remote)

            if config.get('check_status', True):
                ipmi.check_status()

            teuthology.reconnect(ctx, 360)

    yield
Example no. 16
def reboot(ctx, remotes, log):
    from .orchestra import run
    import time
    nodes = {}
    for remote in remotes:
        log.info('rebooting %s', remote.name)
        proc = remote.run( # note use of -n to force a no-sync reboot
            args=[
                'timeout', '5', 'sync',
                run.Raw(';'),
                'sudo', 'reboot', '-f', '-n'
                ],
            wait=False
            )
        nodes[remote] = proc
        # we just ignore these procs because reboot -f doesn't actually
        # send anything back to the ssh client!
        #for remote, proc in nodes.iteritems():
        #proc.exitstatus.get()
    from teuthology.misc import reconnect
    if remotes:
        log.info('waiting for nodes to reboot')
        time.sleep(5) #if we try and reconnect too quickly, it succeeds!
        reconnect(ctx, 480)     #allow 8 minutes for the reboots
Example no. 17
def task(ctx, config):
    """
    Run chef-solo on all nodes.
    """
    log.info('Running chef-solo...')

    run.wait(
        ctx.cluster.run(
            args=[
                'wget',
                #                '-q',
                '-O-',
                #                'https://raw.github.com/ceph/ceph-qa-chef/master/solo/solo-from-scratch',
                'http://ceph.com/git/?p=ceph-qa-chef.git;a=blob_plain;f=solo/solo-from-scratch;hb=HEAD',
                run.Raw('|'),
                'sh',
                '-x',
            ],
            wait=False,
        ))

    log.info('Reconnecting after ceph-qa-chef run')
    misc.reconnect(ctx,
                   10)  #Reconnect for ulimit and other ceph-qa-chef changes
Example no. 18
def task(ctx, config):
    """
    Run chef-solo on all nodes.
    """
    log.info('Running chef-solo...')

    run.wait(
        ctx.cluster.run(
            args=[
                'wget',
#                '-q',
                '-O-',
#                'https://raw.github.com/ceph/ceph-qa-chef/master/solo/solo-from-scratch',
                'http://ceph.com/git/?p=ceph-qa-chef.git;a=blob_plain;f=solo/solo-from-scratch;hb=HEAD',
                run.Raw('|'),
                'sh',
                '-x',
                ],
            wait=False,
            )
        )

    log.info('Reconnecting after ceph-qa-chef run')
    misc.reconnect(ctx, 10)     #Reconnect for ulimit and other ceph-qa-chef changes
Example no. 19
def task(ctx, config):
    """
      - tasks:
          ceph-deploy:
          systemd:

    Test that Ceph systemd services can start, stop and restart, and
    check for any failed services and report back errors
    """
    for remote, roles in ctx.cluster.remotes.items():
        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph'])
        units = remote.sh('sudo systemctl list-units | grep ceph',
                          check_status=False)
        log.info(units)
        if 'failed' in units:
            log.info("Ceph services in failed state")

        # test overall service stop and start using ceph.target
        # ceph.target tests are meant for ceph systemd tests
        # and not actual process testing using 'ps'
        log.info("Stopping all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        log.info(status)
        log.info("Checking process status")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')
        if 'Active: inactive' in status:
            log.info("Successfully stopped all ceph services")
        else:
            log.info("Failed to stop ceph services")

        log.info("Starting all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        log.info(status)
        if 'Active: active' in status:
            log.info("Successfully started all Ceph services")
        else:
            log.info("info", "Failed to start Ceph services")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')
        log.info(ps_eaf)
        time.sleep(4)

        # test individual services start stop
        name = remote.shortname
        mon_name = 'ceph-mon@' + name + '.service'
        mds_name = 'ceph-mds@' + name + '.service'
        mgr_name = 'ceph-mgr@' + name + '.service'
        mon_role_name = 'mon.' + name
        mds_role_name = 'mds.' + name
        mgr_role_name = 'mgr.' + name
        m_osd = re.search(r'--id (\d+) --setuser ceph', ps_eaf)
        if m_osd:
            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
            remote.run(args=['sudo', 'systemctl', 'status', osd_service])
            remote.run(args=['sudo', 'systemctl', 'stop', osd_service])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, osd_service)
            log.info(status)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single osd ceph service")
            else:
                log.info("Failed to stop ceph osd services")
            remote.sh(['sudo', 'systemctl', 'start', osd_service])
            time.sleep(4)
        if mon_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mon_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mon_name)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single mon ceph service")
            else:
                log.info("Failed to stop ceph mon service")
            remote.run(args=['sudo', 'systemctl', 'start', mon_name])
            time.sleep(4)
        if mgr_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mgr_name)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single ceph mgr service")
            else:
                log.info("Failed to stop ceph mgr service")
            remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
            time.sleep(4)
        if mds_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mds_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mds_name)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single ceph mds service")
            else:
                log.info("Failed to stop ceph mds service")
            remote.run(args=['sudo', 'systemctl', 'start', mds_name])
            time.sleep(4)

    # reboot all nodes and verify the systemd units restart
    # the workunit that runs next would fail if any of the systemd units doesn't start
    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
    # avoid immediate reconnect
    time.sleep(120)
    reconnect(ctx, 480)  # reconnect all nodes
    # for debug info
    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph'])
    # wait for HEALTH_OK
    mon = get_first_mon(ctx, config)
    (mon_remote, ) = ctx.cluster.only(mon).remotes.keys()
    wait_until_healthy(ctx, mon_remote, use_sudo=True)
    yield
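
_remote_service_status is used throughout this task but is not shown here; a plausible minimal implementation (an assumption, not necessarily the project's actual helper) would just capture systemctl's status output:

def _remote_service_status(remote, service):
    """Return the output of 'systemctl status <service>' without failing the run."""
    return remote.sh('sudo systemctl status %s' % service, check_status=False)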
Example no. 20
def task(ctx, config):
    """
      - tasks:
          ceph-deploy:
          systemd:

    Test that Ceph systemd services can start, stop and restart, and
    check for any failed services and report back errors
    """
    for remote, roles in ctx.cluster.remotes.iteritems():
        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                         'grep', 'ceph'])
        r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO(),
                       check_status=False)
        log.info(r.stdout.getvalue())
        if 'failed' in r.stdout.getvalue():
            log.info("Ceph services in failed state")

        # test overall service stop and start using ceph.target
        # ceph.target tests are meant for ceph systemd tests
        # and not actual process testing using 'ps'
        log.info("Stopping all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
                       stdout=StringIO(), check_status=False)
        status = r.stdout.getvalue()
        log.info(status)
        log.info("Checking process status")
        r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO())
        if 'Active: inactive' in status:
            log.info("Successfully stopped all ceph services")
        else:
            log.info("Failed to stop ceph services")

        log.info("Starting all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
                       stdout=StringIO())
        log.info(r.stdout.getvalue())
        if 'Active: active' in r.stdout.getvalue():
            log.info("Successfully started all Ceph services")
        else:
            log.info("info", "Failed to start Ceph services")
        r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO())
        log.info(r.stdout.getvalue())
        time.sleep(4)

        # test individual services start stop
        name = remote.shortname
        mon_name = 'ceph-mon@' + name + '.service'
        mds_name = 'ceph-mds@' + name + '.service'
        mgr_name = 'ceph-mgr@' + name + '.service'
        mon_role_name = 'mon.' + name
        mds_role_name = 'mds.' + name
        mgr_role_name = 'mgr.' + name
        m_osd = re.search(r'--id (\d+) --setuser ceph', r.stdout.getvalue())
        if m_osd:
            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
            remote.run(args=['sudo', 'systemctl', 'status',
                             osd_service])
            remote.run(args=['sudo', 'systemctl', 'stop',
                             osd_service])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', osd_service],
                           stdout=StringIO(), check_status=False)
            log.info(r.stdout.getvalue())
            if 'Active: inactive' in r.stdout.getvalue():
                log.info("Successfully stopped single osd ceph service")
            else:
                log.info("Failed to stop ceph osd services")
            remote.run(args=['sudo', 'systemctl', 'start',
                             osd_service])
            time.sleep(4)
        if mon_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mon_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mon_name],
                           stdout=StringIO(), check_status=False)
            if 'Active: inactive' in r.stdout.getvalue():
                log.info("Successfully stopped single mon ceph service")
            else:
                log.info("Failed to stop ceph mon service")
            remote.run(args=['sudo', 'systemctl', 'start', mon_name])
            time.sleep(4)
        if mgr_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mgr_name],
                           stdout=StringIO(), check_status=False)
            if 'Active: inactive' in r.stdout.getvalue():
                log.info("Successfully stopped single ceph mgr service")
            else:
                log.info("Failed to stop ceph mgr service")
            remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
            time.sleep(4)
        if mds_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mds_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mds_name],
                           stdout=StringIO(), check_status=False)
            if 'Active: inactive' in r.stdout.getvalue():
                log.info("Successfully stopped single ceph mds service")
            else:
                log.info("Failed to stop ceph mds service")
            remote.run(args=['sudo', 'systemctl', 'start', mds_name])
            time.sleep(4)

    # reboot all nodes and verify the systemd units restart
    # the workunit that runs next would fail if any of the systemd units doesn't start
    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
    # avoid immediate reconnect
    time.sleep(120)
    reconnect(ctx, 480)  # reconnect all nodes
    # for debug info
    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                          'grep', 'ceph'])
    # wait for HEALTH_OK
    mon = get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon).remotes.iterkeys()
    wait_until_healthy(ctx, mon_remote)
    yield
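
Both variants of this task end with the same reboot/reconnect dance: fire the reboot without waiting, sleep through a grace period so a too-early reconnect cannot spuriously succeed against the not-yet-rebooted host, then retry until a deadline. A framework-free sketch of that pattern (host names, port and timings are illustrative only):

import socket
import time

def wait_for_ssh(hosts, grace=120, timeout=480, port=22):
    """Wait out a grace period, then poll the SSH port until every host answers or the deadline passes."""
    time.sleep(grace)                      # let the reboot actually begin
    deadline = time.time() + timeout
    pending = set(hosts)
    while pending and time.time() < deadline:
        for host in sorted(pending):
            try:
                with socket.create_connection((host, port), timeout=5):
                    pending.discard(host)  # sshd is answering again
            except OSError:
                pass
        time.sleep(5)
    if pending:
        raise RuntimeError('hosts never came back: %s' % ', '.join(sorted(pending)))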