Example #1
def unlock_one(ctx, name, user, description=None):
    name = misc.canonicalize_hostname(name, user=None)
    if not teuthology.provision.destroy_if_vm(ctx, name, user, description):
        log.error('destroy failed for %s', name)
        return False
    request = dict(name=name, locked=False, locked_by=user,
                   description=description)
    uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '')
    with safe_while(
            sleep=1, increment=0.5, action="unlock %s" % name) as proceed:
        while proceed():
            try:
                response = requests.put(uri, json.dumps(request))
                break
            # Work around https://github.com/kennethreitz/requests/issues/2364
            except requests.ConnectionError as e:
                log.warn("Saw %s while unlocking; retrying...", str(e))
    success = response.ok
    if success:
        log.info('unlocked %s', name)
    else:
        try:
            reason = response.json().get('message')
        except ValueError:
            reason = str(response.status_code)
        log.error('failed to unlock {node}. reason: {reason}'.format(
            node=name, reason=reason))
    return success
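All of the examples on this page revolve around teuthology.contextutil.safe_while: a context manager that yields a proceed() callable to drive a retry loop with sleep and backoff. As orientation before the remaining examples, here is a simplified, self-contained sketch of that pattern. It illustrates the contract as it is used above and below, not the actual teuthology implementation; the parameter names simply mirror the calls in the examples.

import contextlib
import time


class MaxTriesExceeded(Exception):
    """Raised when the retry budget is exhausted (illustrative stand-in)."""


@contextlib.contextmanager
def simple_safe_while(sleep=6, increment=0, tries=10, action=None, _raise=True):
    """Simplified sketch of the safe_while pattern used in these examples.

    Yields a callable; each call (after the first) sleeps, then bumps the
    attempt counter and returns True while attempts remain.  Once the budget
    is spent it raises, or returns False when _raise is False.
    """
    state = {'attempt': 0}

    def proceed():
        if state['attempt'] > 0:
            # Linear backoff: base sleep plus increment per extra attempt.
            time.sleep(sleep + increment * (state['attempt'] - 1))
        state['attempt'] += 1
        if state['attempt'] > tries:
            if _raise:
                raise MaxTriesExceeded(
                    "gave up after %d tries while trying to %s"
                    % (tries, action or "retry"))
            return False
        return True

    yield proceed


# Hypothetical usage mirroring the shape of the examples:
with simple_safe_while(sleep=1, tries=5, action="contact the lock server") as proceed:
    while proceed():
        break  # replace with the operation being retried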
Example #2
def listdir(path):
    with safe_while(sleep=1, increment=1, tries=10) as proceed:
        while proceed():
            try:
                return os.listdir(path)
            except OSError:
                log.exception("Failed to list %s !" % path)
Example #3
 def get_addresses(self):
     """
     Return the list of IPs associated with instance_id in OpenStack.
     """
     with safe_while(sleep=2, tries=30, action="get ip " + self["id"]) as proceed:
         while proceed():
             found = re.match(r".*\d+", self["addresses"])
             if found:
                 return self["addresses"]
             self.set_info()
Example #4
 def get_addresses(instance_id):
     """
     Return the list of IPs associated with instance_id in OpenStack.
     """
     with safe_while(sleep=2, tries=30,
                     action="get ip " + instance_id) as proceed:
         while proceed():
             instance = misc.sh("openstack server show -f json " +
                                instance_id)
             addresses = OpenStack.get_value(json.loads(instance),
                                             'addresses')
             found = re.match(r'.*\d+', addresses)
             if found:
                 return addresses
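A small follow-up on Examples #3 and #4: the `.*\d+` match only confirms that the addresses field contains a digit, and the caller still receives the raw string. If the individual IPs are actually needed, a hedged helper along these lines could extract them; the 'net-name=ip, ip' layout of the OpenStack addresses field is an assumption here, so treat this as a sketch rather than a guaranteed parser.

import re


def parse_openstack_addresses(addresses):
    """Best-effort extraction of IPv4 addresses from an OpenStack
    'addresses' string such as 'fsf-lan=10.10.10.4, 64.111.4.132'."""
    return re.findall(r'\b\d{1,3}(?:\.\d{1,3}){3}\b', addresses or '')


# Hypothetical usage with the return value of get_addresses():
# ips = parse_openstack_addresses(get_addresses('my-instance-id'))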
Example #5
 def _wait_for_ready(self):
     with safe_while(sleep=6, tries=20) as proceed:
         while proceed():
             try:
                 self.remote.connect()
                 break
             except (
                 socket.error,
                 NoValidConnectionsError,
                 AuthenticationException,
             ):
                 pass
     cmd = "while [ ! -e '%s' ]; do sleep 5; done" % self._sentinel_path
     self.remote.run(args=cmd, timeout=600)
     log.info("Node is ready: %s", self.node)
Example #6
 def wait_for_ceph_health(self):
     with contextutil.safe_while(sleep=15, tries=6,
                                 action='check health') as proceed:
         (remote,) = self.ctx.cluster.only('mon.a').remotes
         remote.run(args=['sudo', 'ceph', 'osd', 'tree'])
         remote.run(args=['sudo', 'ceph', '-s'])
         log.info("Waiting for Ceph health to reach HEALTH_OK \
                     or HEALTH WARN")
         while proceed():
             out = StringIO()
             remote.run(args=['sudo', 'ceph', 'health'], stdout=out)
             out = out.getvalue().split(None, 1)[0]
             log.info("cluster in state: %s", out)
             if out in ('HEALTH_OK', 'HEALTH_WARN'):
                 break
Example #7
 def check_power(self, state, timeout=None):
     """
     Check power.  Retry if EOF encountered on power check read.
     """
     timeout = timeout or self.timeout
     sleep_time = 4.0
     with safe_while(
             sleep=sleep_time,
             tries=int(timeout / sleep_time),
             _raise=False,
             action='wait for power %s' % state) as proceed:
         while proceed():
             c = self._pexpect_spawn_ipmi('power status')
             r = c.expect(['Chassis Power is {s}'.format(
                 s=state), pexpect.EOF, pexpect.TIMEOUT], timeout=1)
             if r == 0:
                 return True
     return False
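Example #7 is the only snippet here that passes _raise=False, so exhausting the retry budget simply falls out of the loop and the method returns False instead of raising. A hedged, dependency-free helper expressing the same "poll until a predicate holds or give up" shape might look like this; the names, defaults, and the ipmi_power_status() call are illustrative.

import time


def poll_until(predicate, timeout=60.0, sleep=4.0):
    """Call predicate() every `sleep` seconds until it returns True.

    Returns True on success, or False once `timeout` seconds have elapsed,
    mirroring the _raise=False behaviour of check_power() above.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if predicate():
            return True
        time.sleep(sleep)
    return False


# Hypothetical usage:
# powered_on = poll_until(lambda: ipmi_power_status() == 'on', timeout=120)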
Example #8
def retry(function, *args, **kwargs):
    """
    Call a function (returning its results), retrying if any of the exceptions
    in RETRY_EXCEPTIONS are raised
    """
    with safe_while(sleep=1, tries=24, increment=1) as proceed:
        tries = 0
        while proceed():
            tries += 1
            try:
                result = function(*args, **kwargs)
                if tries > 1:
                    log.debug(
                        "'%s' succeeded after %s tries",
                        function.__name__,
                        tries,
                    )
                return result
            except RETRY_EXCEPTIONS:
                pass
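A hedged usage sketch for Example #8: retry() passes *args and **kwargs straight through, so it can wrap any flaky call whose failures raise one of the exceptions listed in RETRY_EXCEPTIONS (defined elsewhere in the module; assumed here to include requests.ConnectionError). The helper below is purely illustrative of the call shape.

import requests


def fetch_run_summary(url):
    """Hypothetical flaky call: fetch JSON from a results server."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.json()


# summary = retry(fetch_run_summary, 'https://paddles.example.com/runs/latest')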
Example #9
 def cloud_init_wait(self, name_or_ip):
     """
     Wait for cloud-init to complete on the name_or_ip OpenStack instance.
     """
     log.debug('cloud_init_wait ' + name_or_ip)
     client_args = {
         'user_at_host': '@'.join((self.username, name_or_ip)),
         'timeout': 240,
         'retry': False,
     }
     if self.key_filename:
         log.debug("using key " + self.key_filename)
         client_args['key_filename'] = self.key_filename
     with safe_while(sleep=30, tries=100,
                     action="cloud_init_wait " + name_or_ip) as proceed:
         success = False
         # CentOS 6.6 logs in /var/log/cloud-init-output.log
         # CentOS 7.0 logs in /var/log/cloud-init.log
         all_done = ("tail /var/log/cloud-init*.log ; " +
                     " test -f /tmp/init.out && tail /tmp/init.out ; " +
                     " grep '" + self.up_string + "' " +
                     "/var/log/cloud-init*.log")
         while proceed():
             try:
                 client = connection.connect(**client_args)
             except paramiko.PasswordRequiredException as e:
                 raise Exception(
                     "The private key requires a passphrase.\n"
                     "Create a new key with:"
                     "  openstack keypair create myself > myself.pem\n"
                     "  chmod 600 myself.pem\n"
                     "and call teuthology-openstack with the options\n"
                     " --key-name myself --key-filename myself.pem\n")
             except paramiko.AuthenticationException as e:
                 log.debug('cloud_init_wait AuthenticationException ' + str(e))
                 continue
             except socket.timeout as e:
                 log.debug('cloud_init_wait connect socket.timeout ' + str(e))
                 continue
             except socket.error as e:
                 log.debug('cloud_init_wait connect socket.error ' + str(e))
                 continue
             except Exception as e:
                 transients = ('Incompatible ssh peer', 'Unknown server')
                 if any(transient in str(e) for transient in transients):
                     # transient ssh failure: retry via the outer while loop
                     continue
                 log.exception('cloud_init_wait ' + name_or_ip)
                 raise
             log.debug('cloud_init_wait ' + all_done)
             try:
                 stdin, stdout, stderr = client.exec_command(all_done)
                 stdout.channel.settimeout(5)
                 out = stdout.read()
                 log.debug('cloud_init_wait stdout ' + all_done + ' ' + out)
             except socket.timeout as e:
                 client.close()
                 log.debug('cloud_init_wait socket.timeout ' + all_done)
                 continue
             except socket.error as e:
                 client.close()
                 log.debug('cloud_init_wait socket.error ' + str(e) + ' ' + all_done)
                 continue
             log.debug('cloud_init_wait stderr ' + all_done +
                       ' ' + stderr.read())
             if stdout.channel.recv_exit_status() == 0:
                 success = True
             client.close()
             if success:
                 break
         return success
Example #10
def cli_test(ctx, config):
    """
     ceph-deploy cli to exercise most commonly use cli's and ensure
     all commands works and also startup the init system.

    """
    log.info('Ceph-deploy Test')
    if config is None:
        config = {}
    test_branch = ''
    conf_dir = teuthology.get_testdir(ctx) + "/cdtest"

    def execute_cdeploy(admin, cmd, path):
        """Execute ceph-deploy commands """
        """Either use git path or repo path """
        args = ['cd', conf_dir, run.Raw(';')]
        if path:
            args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
        else:
            args.append('ceph-deploy')
        args.append(run.Raw(cmd))
        ec = admin.run(args=args, check_status=False).exitstatus
        if ec != 0:
            raise RuntimeError(
                "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(cmd=cmd, ec=ec))

    if config.get('rhbuild'):
        path = None
    else:
        path = teuthology.get_testdir(ctx)
        # test on the branch from config, e.g. wip-*, master or next
        # packages for all distros should exist for wip* branches
        if ctx.config.get('branch'):
            branch = ctx.config.get('branch')
            test_branch = ' --dev={branch} '.format(branch=branch)
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    for node, role in mons.remotes.items():
        admin = node
        admin.run(args=['mkdir', conf_dir], check_status=False)
        nodename = admin.shortname
    system_type = teuthology.get_system_type(admin)
    if config.get('rhbuild'):
        admin.run(args=['sudo', 'yum', 'install', 'ceph-deploy', '-y'])
    log.info('system type is %s', system_type)
    osds = ctx.cluster.only(teuthology.is_type('osd'))

    for remote, roles in osds.remotes.items():
        devs = teuthology.get_scratch_devices(remote)
        log.info("roles %s", roles)
        if (len(devs) < 3):
            log.error(
                'Test needs minimum of 3 devices, only found %s',
                str(devs))
            raise RuntimeError("Needs minimum of 3 devices ")

    conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir)
    new_cmd = 'new ' + nodename
    execute_cdeploy(admin, new_cmd, path)
    if config.get('conf') is not None:
        confp = config.get('conf')
        for section, keys in confp.items():
            lines = '[{section}]\n'.format(section=section)
            teuthology.append_lines_to_file(admin, conf_path, lines,
                                            sudo=True)
            for key, value in keys.items():
                log.info("[%s] %s = %s" % (section, key, value))
                lines = '{key} = {value}\n'.format(key=key, value=value)
                teuthology.append_lines_to_file(admin, conf_path, lines,
                                                sudo=True)
    new_mon_install = 'install {branch} --mon '.format(
        branch=test_branch) + nodename
    new_mgr_install = 'install {branch} --mgr '.format(
        branch=test_branch) + nodename
    new_osd_install = 'install {branch} --osd '.format(
        branch=test_branch) + nodename
    new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename
    create_initial = 'mon create-initial '
    mgr_create = 'mgr create ' + nodename
    # either use create-keys or push command
    push_keys = 'admin ' + nodename
    execute_cdeploy(admin, new_mon_install, path)
    execute_cdeploy(admin, new_mgr_install, path)
    execute_cdeploy(admin, new_osd_install, path)
    execute_cdeploy(admin, new_admin, path)
    execute_cdeploy(admin, create_initial, path)
    execute_cdeploy(admin, mgr_create, path)
    execute_cdeploy(admin, push_keys, path)

    for i in range(3):
        zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i])
        prepare = 'osd prepare ' + "{n}:{d}".format(n=nodename, d=devs[i])
        execute_cdeploy(admin, zap_disk, path)
        execute_cdeploy(admin, prepare, path)

    log.info("list files for debugging purpose to check file permissions")
    admin.run(args=['ls', run.Raw('-lt'), conf_dir])
    remote.run(args=['sudo', 'ceph', '-s'], check_status=False)
    r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
    out = r.stdout.getvalue()
    log.info('Ceph health: %s', out.rstrip('\n'))
    log.info("Waiting for cluster to become healthy")
    with contextutil.safe_while(sleep=10, tries=6,
                                action='check health') as proceed:
        while proceed():
            r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
            out = r.stdout.getvalue()
            if (out.split(None, 1)[0] == 'HEALTH_OK'):
                break
    rgw_install = 'install {branch} --rgw {node}'.format(
        branch=test_branch,
        node=nodename,
    )
    rgw_create = 'rgw create ' + nodename
    execute_cdeploy(admin, rgw_install, path)
    execute_cdeploy(admin, rgw_create, path)
    log.info('All ceph-deploy cli tests passed')
    try:
        yield
    finally:
        log.info("cleaning up")
        ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'],
                        check_status=False)
        time.sleep(4)
        for i in range(3):
            umount_dev = "{d}1".format(d=devs[i])
            r = remote.run(args=['sudo', 'umount', run.Raw(umount_dev)])
        cmd = 'purge ' + nodename
        execute_cdeploy(admin, cmd, path)
        cmd = 'purgedata ' + nodename
        execute_cdeploy(admin, cmd, path)
        log.info("Removing temporary dir")
        admin.run(
            args=[
                'rm',
                run.Raw('-rf'),
                run.Raw(conf_dir)],
            check_status=False)
        if config.get('rhbuild'):
            admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y'])
Example #11
    def test_mirroring_init_failure_with_recovery(self):
        """Test if the mirror daemon can recover from a init failure"""

        # disable the mgr mirroring plugin as it would try to load the dir map
        # when mirroring is enabled for a filesystem (and throw up errors in
        # the logs)
        self.disable_mirroring_module()

        # enable mirroring through mon interface -- this should result in the mirror daemon
        # failing to enable mirroring due to absence of `cephfs_mirror` index object.

        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "mirror", "enable",
                                                     self.primary_fs_name)
        # need safe_while since non-failed status pops up as mirroring is restarted
        # internally in mirror daemon.
        with safe_while(sleep=5, tries=20,
                        action='wait for failed state') as proceed:
            while proceed():
                try:
                    # verify via asok
                    res = self.mirror_daemon_command(
                        f'mirror status for fs: {self.primary_fs_name}', 'fs',
                        'mirror', 'status',
                        f'{self.primary_fs_name}@{self.primary_fs_id}')
                    if 'state' not in res:
                        return
                    self.assertTrue(res['state'] == "failed")
                    return True
                except:
                    pass

        # create the index object and check daemon recovery
        try:
            p = self.mount_a.client_remote.run(args=[
                'rados', '-p', self.fs.metadata_pool_name, 'create',
                'cephfs_mirror'
            ],
                                               stdout=StringIO(),
                                               stderr=StringIO(),
                                               timeout=30,
                                               check_status=True,
                                               label="create index object")
            p.wait()
        except CommandFailedError as ce:
            log.warn(
                f'mirror daemon command to create mirror index object failed: {ce}'
            )
            raise
        time.sleep(30)
        res = self.mirror_daemon_command(
            f'mirror status for fs: {self.primary_fs_name}', 'fs', 'mirror',
            'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
        self.assertTrue(res['peers'] == {})
        self.assertTrue(res['snap_dirs']['dir_count'] == 0)

        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "mirror", "disable",
                                                     self.primary_fs_name)
        time.sleep(10)
        # verify via asok
        try:
            self.mirror_daemon_command(
                f'mirror status for fs: {self.primary_fs_name}', 'fs',
                'mirror', 'status',
                f'{self.primary_fs_name}@{self.primary_fs_id}')
        except CommandFailedError:
            pass
        else:
            raise RuntimeError('expected admin socket to be unavailable')
Example #12
 def cloud_init_wait(self, instance):
     """
     Wait for cloud-init to complete on the name_or_ip OpenStack instance.
     """
     ip = instance.get_floating_ip_or_ip()
     log.debug('cloud_init_wait ' + ip)
     client_args = {
         'user_at_host': '@'.join((self.username, ip)),
         'timeout': 240,
         'retry': False,
     }
     if self.key_filename:
         log.debug("using key " + self.key_filename)
         client_args['key_filename'] = self.key_filename
     with safe_while(sleep=30, tries=30,
                     action="cloud_init_wait " + ip) as proceed:
         success = False
         # CentOS 6.6 logs in /var/log/cloud-init-output.log
         # CentOS 7.0 logs in /var/log/cloud-init.log
         tail = ("tail --follow=name --retry"
                     " /var/log/cloud-init*.log /tmp/init.out")
         while proceed():
             try:
                 client = connection.connect(**client_args)
             except paramiko.PasswordRequiredException as e:
                 raise Exception(
                     "The private key requires a passphrase.\n"
                     "Create a new key with:"
                     "  openstack keypair create myself > myself.pem\n"
                     "  chmod 600 myself.pem\n"
                     "and call teuthology-openstack with the options\n"
                     " --key-name myself --key-filename myself.pem\n")
             except paramiko.AuthenticationException as e:
                 log.debug('cloud_init_wait AuthenticationException ' + str(e))
                 continue
             except socket.timeout as e:
                 log.debug('cloud_init_wait connect socket.timeout ' + str(e))
                 continue
             except socket.error as e:
                 log.debug('cloud_init_wait connect socket.error ' + str(e))
                 continue
             except Exception as e:
                 transients = ('Incompatible ssh peer', 'Unknown server')
                 if any(transient in str(e) for transient in transients):
                     # transient ssh failure: retry via the outer while loop
                     continue
                 log.exception('cloud_init_wait ' + ip)
                 raise
             log.debug('cloud_init_wait ' + tail)
             try:
                 # get the I/O channel to iterate line by line
                 transport = client.get_transport()
                 channel = transport.open_session()
                 channel.get_pty()
                 channel.settimeout(240)
                 output = channel.makefile('r', 1)
                 channel.exec_command(tail)
                 for line in iter(output.readline, b''):
                     log.info(line.strip())
                     if self.up_string in line:
                         success = True
                         break
             except socket.timeout as e:
                 client.close()
                 log.debug('cloud_init_wait socket.timeout ' + tail)
                 continue
             except socket.error as e:
                 client.close()
                 log.debug('cloud_init_wait socket.error ' + str(e) + ' ' + tail)
                 continue
             client.close()
             if success:
                 break
         return success
Example #13
def task(ctx, config):
    """
    Run watch_notify_same_primary

    The config should be as follows:

    watch_notify_same_primary:
        clients: [client list]

    The client list should contain 1 client

    The test requires 3 osds.

    example:

    tasks:
    - ceph:
    - watch_notify_same_primary:
        clients: [client.0]
    - interactive:
    """
    log.info('Beginning watch_notify_same_primary...')
    assert isinstance(config, dict), \
        "please list clients to run on"

    clients = config.get('clients', ['client.0'])
    assert len(clients) == 1
    role = clients[0]
    assert isinstance(role, six.string_types)
    PREFIX = 'client.'
    assert role.startswith(PREFIX)
    (remote, ) = ctx.cluster.only(role).remotes.keys()
    manager = ctx.managers['ceph']
    manager.raw_cluster_cmd('osd', 'set', 'noout')

    pool = manager.create_pool_with_unique_name()

    def obj(n):
        return "foo-{num}".format(num=n)

    def start_watch(n):
        remote.run(
            args=["rados", "-p", pool, "put",
                  obj(n), "/etc/resolv.conf"],
            logger=log.getChild('watch.{id}'.format(id=n)))
        proc = remote.run(args=["rados", "-p", pool, "watch",
                                obj(n)],
                          stdin=run.PIPE,
                          stdout=BytesIO(),
                          stderr=BytesIO(),
                          wait=False)
        return proc

    num = 20

    watches = [start_watch(i) for i in range(num)]

    # wait for them all to register
    for i in range(num):
        with safe_while() as proceed:
            while proceed():
                lines = remote.sh(
                    ["rados", "-p", pool, "listwatchers",
                     obj(i)])
                num_watchers = lines.count('watcher=')
                log.info('i see %d watchers for %s', num_watchers, obj(i))
                if num_watchers >= 1:
                    break

    def notify(n, msg):
        remote.run(args=["rados", "-p", pool, "notify",
                         obj(n), msg],
                   logger=log.getChild('notify.{id}'.format(id=n)))

    [notify(n, 'notify1') for n in range(len(watches))]

    manager.kill_osd(0)
    manager.mark_down_osd(0)

    [notify(n, 'notify2') for n in range(len(watches))]

    try:
        yield
    finally:
        log.info('joining watch_notify_stress')
        for watch in watches:
            watch.stdin.write("\n")

        run.wait(watches)

        for watch in watches:
            lines = watch.stdout.getvalue().split("\n")
            got1 = False
            got2 = False
            for l in lines:
                if 'notify1' in l:
                    got1 = True
                if 'notify2' in l:
                    got2 = True
            log.info(lines)
            assert got1 and got2

        manager.revive_osd(0)
        manager.remove_pool(pool)
Example #14
def task(ctx, config):
    """
    Deploy rook-ceph cluster

      tasks:
      - kubeadm:
      - rook:
          branch: wip-foo
          spec:
            mon:
              count: 1

    The spec item is deep-merged against the cluster.yaml.  The branch, sha1, or
    image items are used to determine the Ceph container image.
    """
    if not config:
        config = {}
    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    log.info('Rook start')

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))
    teuthology.deep_merge(config, overrides.get('rook', {}))
    log.info('Config: ' + str(config))

    # set up cluster context
    if not hasattr(ctx, 'rook'):
        ctx.rook = {}
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    if cluster_name not in ctx.rook:
        ctx.rook[cluster_name] = argparse.Namespace()

    ctx.rook[cluster_name].remote = list(ctx.cluster.remotes.keys())[0]

    # image
    teuth_defaults = teuth_config.get('defaults', {})
    cephadm_defaults = teuth_defaults.get('cephadm', {})
    containers_defaults = cephadm_defaults.get('containers', {})
    container_image_name = containers_defaults.get('image', None)
    if 'image' in config:
        ctx.rook[cluster_name].image = config.get('image')
    else:
        sha1 = config.get('sha1')
        flavor = config.get('flavor', 'default')
        if sha1:
            if flavor == "crimson":
                ctx.rook[
                    cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
            else:
                ctx.rook[
                    cluster_name].image = container_image_name + ':' + sha1
        else:
            # hmm, fall back to branch?
            branch = config.get('branch', 'master')
            ctx.rook[cluster_name].image = container_image_name + ':' + branch
    log.info('Ceph image is %s' % ctx.rook[cluster_name].image)

    with contextutil.nested(
            lambda: rook_operator(ctx, config),
            lambda: ceph_log(ctx, config),
            lambda: rook_cluster(ctx, config),
            lambda: rook_toolbox(ctx, config),
            lambda: wait_for_orch(ctx, config),
            lambda: rook_post_config(ctx, config),
            lambda: wait_for_osds(ctx, config),
            lambda: ceph_config_keyring(ctx, config),
            lambda: ceph_clients(ctx, config),
    ):
        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        ctx.managers[cluster_name] = CephManager(
            ctx.rook[cluster_name].remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + cluster_name),
            cluster=cluster_name,
            rook=True,
        )
        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=config)
            log.info('Rook complete, yielding')
            yield

        finally:
            to_remove = []
            ret = _shell(ctx,
                         config, ['ceph', 'orch', 'ls', '-f', 'json'],
                         stdout=BytesIO())
            if ret.exitstatus == 0:
                r = json.loads(ret.stdout.getvalue().decode('utf-8'))
                for service in r:
                    if service['service_type'] in [
                            'rgw', 'mds', 'nfs', 'rbd-mirror'
                    ]:
                        _shell(ctx, config,
                               ['ceph', 'orch', 'rm', service['service_name']])
                        to_remove.append(service['service_name'])
                with safe_while(
                        sleep=10, tries=90,
                        action="waiting for service removal") as proceed:
                    while proceed():
                        ret = _shell(ctx,
                                     config,
                                     ['ceph', 'orch', 'ls', '-f', 'json'],
                                     stdout=BytesIO())
                        if ret.exitstatus == 0:
                            r = json.loads(
                                ret.stdout.getvalue().decode('utf-8'))
                            still_up = [
                                service['service_name'] for service in r
                            ]
                            matches = set(still_up).intersection(to_remove)
                            if not matches:
                                break
            log.info('Tearing down rook')
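The rook task docstring in Example #14 says the spec item is deep-merged against the cluster.yaml. A hedged, self-contained illustration of what that merge means follows; it is a generic recursive dict merge, not necessarily the exact semantics of teuthology.deep_merge.

def deep_merge_sketch(base, overrides):
    """Recursively fold `overrides` into `base` and return the result
    (illustrative semantics only)."""
    merged = dict(base)
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge_sketch(merged[key], value)
        else:
            merged[key] = value
    return merged


# With the docstring's example config (spec: {mon: {count: 1}}), a default
# cluster spec like {'mon': {'count': 3, 'allowMultiplePerNode': True}}
# would become {'mon': {'count': 1, 'allowMultiplePerNode': True}}.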
Example #15
 def cloud_init_wait(self, instance):
     """
     Wait for cloud-init to complete on the name_or_ip OpenStack instance.
     """
     ip = instance.get_floating_ip_or_ip()
     log.debug('cloud_init_wait ' + ip)
     client_args = {
         'user_at_host': '@'.join((self.username, ip)),
         'timeout': 240,
         'retry': False,
     }
     if self.key_filename:
         log.debug("using key " + self.key_filename)
         client_args['key_filename'] = self.key_filename
     with safe_while(sleep=30, tries=30,
                     action="cloud_init_wait " + ip) as proceed:
         success = False
         # CentOS 6.6 logs in /var/log/cloud-init-output.log
         # CentOS 7.0 logs in /var/log/cloud-init.log
         tail = ("tail --follow=name --retry"
                 " /var/log/cloud-init*.log /tmp/init.out")
         while proceed():
             try:
                 client = connection.connect(**client_args)
             except paramiko.PasswordRequiredException as e:
                 raise Exception(
                     "The private key requires a passphrase.\n"
                     "Create a new key with:"
                     "  openstack keypair create myself > myself.pem\n"
                     "  chmod 600 myself.pem\n"
                     "and call teuthology-openstack with the options\n"
                     " --key-name myself --key-filename myself.pem\n")
             except paramiko.AuthenticationException as e:
                 log.debug('cloud_init_wait AuthenticationException ' +
                           str(e))
                 continue
             except socket.timeout as e:
                 log.debug('cloud_init_wait connect socket.timeout ' +
                           str(e))
                 continue
             except socket.error as e:
                 log.debug('cloud_init_wait connect socket.error ' + str(e))
                 continue
             except Exception as e:
                 transients = ('Incompatible ssh peer', 'Unknown server')
                 if any(transient in str(e) for transient in transients):
                     # transient ssh failure: retry via the outer while loop
                     continue
                 log.exception('cloud_init_wait ' + ip)
                 raise
             log.debug('cloud_init_wait ' + tail)
             try:
                 # get the I/O channel to iterate line by line
                 transport = client.get_transport()
                 channel = transport.open_session()
                 channel.get_pty()
                 channel.settimeout(240)
                 output = channel.makefile('r', 1)
                 channel.exec_command(tail)
                 for line in iter(output.readline, b''):
                     log.info(line.strip())
                     if self.up_string in line:
                         success = True
                         break
             except socket.timeout as e:
                 client.close()
                 log.debug('cloud_init_wait socket.timeout ' + tail)
                 continue
             except socket.error as e:
                 client.close()
                 log.debug('cloud_init_wait socket.error ' + str(e) + ' ' +
                           tail)
                 continue
             client.close()
             if success:
                 break
         return success
Example #16
def task(ctx, config):
    """
    Run watch_notify_same_primary

    The config should be as follows:

    watch_notify_same_primary:
        clients: [client list]

    The client list should contain 1 client

    The test requires 3 osds.

    example:

    tasks:
    - ceph:
    - watch_notify_same_primary:
        clients: [client.0]
    - interactive:
    """
    log.info('Beginning watch_notify_same_primary...')
    assert isinstance(config, dict), \
        "please list clients to run on"

    clients = config.get('clients', ['client.0'])
    assert len(clients) == 1
    role = clients[0]
    assert isinstance(role, basestring)
    PREFIX = 'client.'
    assert role.startswith(PREFIX)
    (remote,) = ctx.cluster.only(role).remotes.iterkeys()
    ctx.manager.raw_cluster_cmd('osd', 'set', 'noout')

    pool = ctx.manager.create_pool_with_unique_name()
    def obj(n): return "foo-{num}".format(num=n)
    def start_watch(n):
        remote.run(
            args = [
                "rados",
                "-p", pool,
                "put",
                obj(n),
                "/etc/resolv.conf"],
            logger=log.getChild('watch.{id}'.format(id=n)))
        proc = remote.run(
            args = [
                "rados",
                "-p", pool,
                "watch",
                obj(n)],
            stdin=run.PIPE,
            stdout=StringIO(),
            stderr=StringIO(),
            wait=False)
        return proc

    num = 20

    watches = [start_watch(i) for i in range(num)]

    # wait for them all to register
    for i in range(num):
        with safe_while() as proceed:
            while proceed():
                proc = remote.run(
                    args = [
                        "rados",
                        "-p", pool,
                        "listwatchers",
                        obj(i)],
                    stdout=StringIO())
                lines = proc.stdout.getvalue()
                num_watchers = lines.count('watcher=')
                log.info('i see %d watchers for %s', num_watchers, obj(i))
                if num_watchers >= 1:
                    break

    def notify(n, msg):
        remote.run(
            args = [
                "rados",
                "-p", pool,
                "notify",
                obj(n),
                msg],
            logger=log.getChild('notify.{id}'.format(id=n)))

    [notify(n, 'notify1') for n in range(len(watches))]

    ctx.manager.kill_osd(0)
    ctx.manager.mark_down_osd(0)

    [notify(n, 'notify2') for n in range(len(watches))]

    try:
        yield
    finally:
        log.info('joining watch_notify_stress')
        for watch in watches:
            watch.stdin.write("\n")

        run.wait(watches)

        for watch in watches:
            lines = watch.stdout.getvalue().split("\n")
            got1 = False
            got2 = False
            for l in lines:
                if 'notify1' in l:
                    got1 = True
                if 'notify2' in l:
                    got2 = True
            log.info(lines)
            assert got1 and got2

        ctx.manager.revive_osd(0)
        ctx.manager.remove_pool(pool)
Example #17
    def report_job(self, run_name, job_id, job_info=None, dead=False):
        """
        Report a single job to the results server.

        :param run_name: The name of the run. The run must already exist.
        :param job_id:   The job's id
        :param job_info: The job's info dict. Optional - if not present, we
                         look at the archive.
        """
        if job_info is not None and not isinstance(job_info, dict):
            raise TypeError("job_info must be a dict")
        run_uri = "{base}/runs/{name}/jobs/".format(
            base=self.base_uri,
            name=run_name,
        )
        if job_info is None:
            job_info = self.serializer.job_info(run_name, job_id)
        if dead and get_status(job_info) is None:
            set_status(job_info, 'dead')
        job_json = json.dumps(job_info)
        headers = {'content-type': 'application/json'}

        inc = random.uniform(0, 1)
        with safe_while(sleep=1, increment=inc,
                        action=f'report job {job_id}') as proceed:
            while proceed():
                response = self.session.post(run_uri,
                                             data=job_json,
                                             headers=headers)

                if response.status_code == 200:
                    return

                # This call is wrapped in a try/except because of:
                #  http://tracker.ceph.com/issues/8166
                try:
                    resp_json = response.json()
                except ValueError:
                    resp_json = dict()

                if resp_json:
                    msg = resp_json.get('message', '')
                else:
                    msg = response.text

                if msg and msg.endswith('already exists'):
                    job_uri = os.path.join(run_uri, job_id, '')
                    response = self.session.put(job_uri,
                                                data=job_json,
                                                headers=headers)
                    if response.status_code == 200:
                        return
                elif msg:
                    self.log.error(
                        "POST to {uri} failed with status {status}: {msg}".
                        format(
                            uri=run_uri,
                            status=response.status_code,
                            msg=msg,
                        ))
        response.raise_for_status()
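A brief, hedged usage note for report_job() above: the random 0-1 second increment gives each worker its own backoff jitter, so a fleet of workers that all hit a hiccup on the results server do not retry in lockstep. A hypothetical wrapper showing the call shape (the reporter instance, run name, and job ids are illustrative):

def report_finished_jobs(reporter, run_name, job_ids):
    """Hypothetical wrapper: push a list of finished jobs to the results
    server; dead=True only takes effect for jobs that never set a status."""
    for job_id in job_ids:
        reporter.report_job(run_name, job_id, dead=True)


# report_finished_jobs(reporter,
#                      'teuthology-2021-01-01_00:00:00-rados-main-distro',
#                      ['123456', '123457'])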
Example #18
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid
    testdir = teuthology.get_testdir(ctx)
    num_mons = 1

    try:
        for remote, roles in ctx.cluster.remotes.items():
            for mon in [
                    r for r in roles
                    if teuthology.is_type('mon', cluster_name)(r)
            ]:
                c_, _, id_ = teuthology.split_role(mon)
                if c_ == cluster_name and id_ == ctx.ceph[
                        cluster_name].first_mon:
                    continue
                log.info('Adding %s on %s' % (mon, remote.shortname))
                num_mons += 1
                _shell(ctx, cluster_name, remote, [
                    'ceph',
                    'orchestrator',
                    'mon',
                    'update',
                    str(num_mons),
                    remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] +
                    '=' + id_,
                ])
                ctx.daemons.register_daemon(
                    remote,
                    'mon',
                    id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

                with contextutil.safe_while(sleep=1, tries=180) as proceed:
                    while proceed():
                        log.info('Waiting for %d mons in monmap...' %
                                 (num_mons))
                        r = _shell(
                            ctx=ctx,
                            cluster_name=cluster_name,
                            remote=remote,
                            args=[
                                'ceph',
                                'mon',
                                'dump',
                                '-f',
                                'json',
                            ],
                            stdout=StringIO(),
                        )
                        j = json.loads(r.stdout.getvalue())
                        if len(j['mons']) == num_mons:
                            break

        # refresh ceph.conf files for all mons + first mgr
        """
        for remote, roles in ctx.cluster.remotes.items():
            for mon in [r for r in roles
                        if teuthology.is_type('mon', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mon)
                _shell(ctx, cluster_name, remote, [
                    'ceph', 'orchestrator', 'service', 'redeploy',
                    'mon', id_,
                ])
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote, [
            'ceph', 'orchestrator', 'service', 'redeploy',
            'mgr', ctx.ceph[cluster_name].first_mgr,
        ])
        """

        yield

    finally:
        pass
Example #19
 def cloud_init_wait(self, name_or_ip):
     """
     Wait for cloud-init to complete on the name_or_ip OpenStack instance.
     """
     log.debug('cloud_init_wait ' + name_or_ip)
     client_args = {
         'user_at_host': '@'.join((self.username, name_or_ip)),
         'timeout': 10,
         'retry': False,
     }
     if self.key_filename:
         log.debug("using key " + self.key_filename)
         client_args['key_filename'] = self.key_filename
     with safe_while(sleep=2, tries=600,
                     action="cloud_init_wait " + name_or_ip) as proceed:
         success = False
         # CentOS 6.6 logs in /var/log/cloud-init-output.log
         # CentOS 7.0 logs in /var/log/cloud-init.log
         all_done = ("tail /var/log/cloud-init*.log ; " +
                     " test -f /tmp/init.out && tail /tmp/init.out ; " +
                     " grep '" + self.up_string + "' " +
                     "/var/log/cloud-init*.log")
         while proceed():
             try:
                 client = connection.connect(**client_args)
             except paramiko.PasswordRequiredException as e:
                 raise Exception(
                     "The private key requires a passphrase.\n"
                     "Create a new key with:"
                     "  openstack keypair create myself > myself.pem\n"
                     "  chmod 600 myself.pem\n"
                     "and call teuthology-openstack with the options\n"
                     " --key-name myself --key-filename myself.pem\n")
             except paramiko.AuthenticationException as e:
                 log.debug('cloud_init_wait AuthenticationException ' + str(e))
                 continue
             except socket.timeout as e:
                 log.debug('cloud_init_wait connect socket.timeout ' + str(e))
                 continue
             except socket.error as e:
                 log.debug('cloud_init_wait connect socket.error ' + str(e))
                 continue
             except Exception as e:
                 transients = ('Incompatible ssh peer', 'Unknown server')
                 if any(transient in str(e) for transient in transients):
                     # transient ssh failure: retry via the outer while loop
                     continue
                 log.exception('cloud_init_wait ' + name_or_ip)
                 raise
             log.debug('cloud_init_wait ' + all_done)
             try:
                 stdin, stdout, stderr = client.exec_command(all_done)
                 stdout.channel.settimeout(5)
                 out = stdout.read()
                 log.debug('cloud_init_wait stdout ' + all_done + ' ' + out)
             except socket.timeout as e:
                 client.close()
                 log.debug('cloud_init_wait socket.timeout ' + all_done)
                 continue
             except socket.error as e:
                 client.close()
                 log.debug('cloud_init_wait socket.error ' + str(e) + ' ' + all_done)
                 continue
             log.debug('cloud_init_wait stderr ' + all_done +
                       ' ' + stderr.read())
             if stdout.channel.recv_exit_status() == 0:
                 success = True
             client.close()
             if success:
                 break
         return success
Example #20
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        daemons = {}
        if config.get('add_mons_via_daemon_add'):
            # This is the old way of adding mons that works with the (early) octopus
            # cephadm scheduler.
            num_mons = 1
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    num_mons += 1
                    _shell(ctx, cluster_name, remote, [
                        'ceph', 'orch', 'daemon', 'add', 'mon',
                        remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                    ])
                    ctx.daemons.register_daemon(
                        remote, 'mon', id_,
                        cluster=cluster_name,
                        fsid=fsid,
                        logger=log.getChild(mon),
                        wait=False,
                        started=True,
                    )
                    daemons[mon] = (remote, id_)

                    with contextutil.safe_while(sleep=1, tries=180) as proceed:
                        while proceed():
                            log.info('Waiting for %d mons in monmap...' % (num_mons))
                            r = _shell(
                                ctx=ctx,
                                cluster_name=cluster_name,
                                remote=remote,
                                args=[
                                    'ceph', 'mon', 'dump', '-f', 'json',
                                ],
                                stdout=StringIO(),
                            )
                            j = json.loads(r.stdout.getvalue())
                            if len(j['mons']) == num_mons:
                                break
        else:
            nodes = []
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    nodes.append(remote.shortname
                                 + ':' + ctx.ceph[cluster_name].mons[mon]
                                 + '=' + id_)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    daemons[mon] = (remote, id_)

            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mon',
                str(len(nodes)) + ';' + ';'.join(nodes)]
                   )
            for mon, (remote, id_) in daemons.items():
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

            with contextutil.safe_while(sleep=1, tries=180) as proceed:
                while proceed():
                    log.info('Waiting for %d mons in monmap...' % (len(nodes)))
                    r = _shell(
                        ctx=ctx,
                        cluster_name=cluster_name,
                        remote=remote,
                        args=[
                            'ceph', 'mon', 'dump', '-f', 'json',
                        ],
                        stdout=StringIO(),
                    )
                    j = json.loads(r.stdout.getvalue())
                    if len(j['mons']) == len(nodes):
                        break

        # refresh our (final) ceph.conf file
        bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
        log.info('Generating final ceph.conf file...')
        r = _shell(
            ctx=ctx,
            cluster_name=cluster_name,
            remote=bootstrap_remote,
            args=[
                'ceph', 'config', 'generate-minimal-conf',
            ],
            stdout=StringIO(),
        )
        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()

        yield

    finally:
        pass
Example #21
def rook_cluster(ctx, config):
    cluster_name = config['cluster']

    # count how many OSDs we'll create
    num_devs = 0
    num_hosts = 0
    for remote in ctx.cluster.remotes.keys():
        ls = remote.read_file('/scratch_devs').decode(
            'utf-8').strip().splitlines()
        num_devs += len(ls)
        num_hosts += 1
    ctx.rook[cluster_name].num_osds = num_devs

    # config
    ceph_conf = build_initial_config(ctx, config)
    ceph_conf_fp = BytesIO()
    ceph_conf.write(ceph_conf_fp)
    log.info(f'Config:\n{ceph_conf_fp.getvalue()}')
    _kubectl(ctx,
             ceph_conf, ['create', '-f', '-'],
             stdin=yaml.dump({
                 'apiVersion': 'v1',
                 'kind': 'ConfigMap',
                 'metadata': {
                     'name': 'rook-config-override',
                     'namespace': 'rook-ceph'
                 },
                 'data': {
                     'config': ceph_conf_fp.getvalue()
                 }
             }))

    # cluster
    cluster = {
        'apiVersion': 'ceph.rook.io/v1',
        'kind': 'CephCluster',
        'metadata': {
            'name': 'rook-ceph',
            'namespace': 'rook-ceph'
        },
        'spec': {
            'cephVersion': {
                'image': ctx.rook[cluster_name].image,
                'allowUnsupported': True,
            },
            'dataDirHostPath': '/var/lib/rook',
            'skipUpgradeChecks': True,
            'mgr': {
                'count': 1,
                'modules': [
                    {
                        'name': 'rook',
                        'enabled': True
                    },
                ],
            },
            'mon': {
                'count': num_hosts,
                'allowMultiplePerNode': True,
            },
        }
    }
    teuthology.deep_merge(cluster['spec'], config.get('spec', {}))

    cluster_yaml = yaml.dump(cluster)
    log.info(f'Cluster:\n{cluster_yaml}')
    try:
        ctx.rook[cluster_name].remote.write_file('cluster.yaml', cluster_yaml)
        _kubectl(ctx, config, ['create', '-f', 'cluster.yaml'])
        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        _kubectl(ctx,
                 config, ['delete', '-f', 'cluster.yaml'],
                 check_status=False)

        # wait for cluster to shut down
        log.info('Waiting for cluster to stop')
        running = True
        with safe_while(sleep=5, tries=100,
                        action="wait for teardown") as proceed:
            while running and proceed():
                p = _kubectl(
                    ctx,
                    config,
                    ['-n', 'rook-ceph', 'get', 'pods'],
                    stdout=BytesIO(),
                )
                running = False
                for line in p.stdout.getvalue().decode(
                        'utf-8').strip().splitlines():
                    name, ready, status, _ = line.split(None, 3)
                    if (name != 'NAME' and not name.startswith('csi-')
                            and not name.startswith('rook-ceph-operator-')
                            and not name.startswith('rook-ceph-tools-')):
                        running = True
                        break

        _kubectl(
            ctx,
            config,
            ['-n', 'rook-ceph', 'delete', 'configmap', 'rook-config-override'],
            check_status=False,
        )
        ctx.rook[cluster_name].remote.run(args=['rm', '-f', 'cluster.yaml'])
Example #22
def connect(user_at_host, host_key=None, keep_alive=False, timeout=60,
            _SSHClient=None, _create_key=None, retry=True, key_filename=None):
    """
    ssh connection routine.

    :param user_at_host: user@host
    :param host_key: ssh key
    :param keep_alive: keep_alive indicator
    :param timeout:    timeout in seconds
    :param _SSHClient: client, default is paramiko ssh client
    :param _create_key: routine to create a key (defaults to local create_key)
    :param retry:       Whether or not to retry failed connection attempts
                        (eventually giving up if none succeed). Default is True
    :param key_filename:  Optionally override which private key to use.
    :return: ssh connection.
    """
    user, host = split_user(user_at_host)
    if _SSHClient is None:
        _SSHClient = paramiko.SSHClient
    ssh = _SSHClient()

    if _create_key is None:
        _create_key = create_key

    if host_key is None:
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        if config.verify_host_keys is True:
            ssh.load_system_host_keys()

    else:
        keytype, key = host_key.split(' ', 1)
        ssh.get_host_keys().add(
            hostname=host,
            keytype=keytype,
            key=_create_key(keytype, key)
            )

    connect_args = dict(
        hostname=host,
        username=user,
        timeout=timeout
    )

    ssh_config_path = os.path.expanduser("~/.ssh/config")
    if os.path.exists(ssh_config_path):
        ssh_config = paramiko.SSHConfig()
        ssh_config.parse(open(ssh_config_path))
        opts = ssh_config.lookup(host)
        if not key_filename and 'identityfile' in opts:
            key_filename = opts['identityfile']

    if key_filename:
        if not isinstance(key_filename, list):
            key_filename = [key_filename]
        key_filename = [os.path.expanduser(f) for f in key_filename]
        connect_args['key_filename'] = key_filename

    log.debug(connect_args)

    if not retry:
        ssh.connect(**connect_args)
    else:
        # Retries are implemented using safe_while
        with safe_while(sleep=1, action='connect to ' + host) as proceed:
            while proceed():
                try:
                    ssh.connect(**connect_args)
                    break
                except paramiko.AuthenticationException:
                    log.exception(
                        "Error connecting to {host}".format(host=host))
    ssh.get_transport().set_keepalive(keep_alive)
    return ssh
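A hedged usage example for connect() above: with retry left at its default of True, the AuthenticationException branch keeps retrying until safe_while gives up, so transient authentication failures (for example while a freshly provisioned host finishes key setup) are retried rather than fatal. The host name and key path below are illustrative.

ssh = connect(
    'ubuntu@node1.example.com',             # illustrative user@host
    keep_alive=True,
    timeout=60,
    key_filename='~/.ssh/teuthology.pem',   # illustrative key path
)
_, stdout, _ = ssh.exec_command('uptime')
print(stdout.read().decode())
ssh.close()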
Example #23
def rook_operator(ctx, config):
    cluster_name = config['cluster']
    rook_branch = config.get('rook_branch', 'master')
    rook_git_url = config.get('rook_git_url', 'https://github.com/rook/rook')

    log.info(f'Cloning {rook_git_url} branch {rook_branch}')
    ctx.rook[cluster_name].remote.run(args=[
        'rm',
        '-rf',
        'rook',
        run.Raw('&&'),
        'git',
        'clone',
        '--single-branch',
        '--branch',
        rook_branch,
        rook_git_url,
        'rook',
    ])

    # operator.yaml
    operator_yaml = ctx.rook[cluster_name].remote.read_file(
        'rook/cluster/examples/kubernetes/ceph/operator.yaml')
    rook_image = config.get('rook_image')
    if rook_image:
        log.info(f'Patching operator to use image {rook_image}')
        crs = list(yaml.load_all(operator_yaml, Loader=yaml.FullLoader))
        assert len(crs) == 2
        crs[1]['spec']['template']['spec']['containers'][0][
            'image'] = rook_image
        operator_yaml = yaml.dump_all(crs)
    ctx.rook[cluster_name].remote.write_file('operator.yaml', operator_yaml)

    op_job = None
    try:
        log.info('Deploying operator')
        _kubectl(ctx, config, [
            'create',
            '-f',
            'rook/cluster/examples/kubernetes/ceph/crds.yaml',
            '-f',
            'rook/cluster/examples/kubernetes/ceph/common.yaml',
            '-f',
            'operator.yaml',
        ])

        # on centos:
        if teuthology.get_distro(ctx) == 'centos':
            _kubectl(ctx, config, [
                '-n', 'rook-ceph', 'set', 'env', 'deploy/rook-ceph-operator',
                'ROOK_HOSTPATH_REQUIRES_PRIVILEGED=true'
            ])

        # wait for operator
        op_name = None
        with safe_while(sleep=10, tries=90,
                        action="wait for operator") as proceed:
            while not op_name and proceed():
                p = _kubectl(
                    ctx,
                    config,
                    [
                        '-n', 'rook-ceph', 'get', 'pods', '-l',
                        'app=rook-ceph-operator'
                    ],
                    stdout=BytesIO(),
                )
                for line in p.stdout.getvalue().decode(
                        'utf-8').strip().splitlines():
                    name, ready, status, _ = line.split(None, 3)
                    if status == 'Running':
                        op_name = name
                        break

        # log operator output
        op_job = _kubectl(
            ctx,
            config,
            ['-n', 'rook-ceph', 'logs', '-f', op_name],
            wait=False,
            logger=log.getChild('operator'),
        )

        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        log.info('Cleaning up rook operator')
        _kubectl(ctx, config, [
            'delete',
            '-f',
            'operator.yaml',
        ])
        if False:
            # don't bother since we'll tear down k8s anyway (and this mysteriously
            # fails sometimes when deleting some of the CRDs... not sure why!)
            _kubectl(ctx, config, [
                'delete',
                '-f',
                'rook/cluster/examples/kubernetes/ceph/common.yaml',
            ])
            _kubectl(ctx, config, [
                'delete',
                '-f',
                'rook/cluster/examples/kubernetes/ceph/crds.yaml',
            ])
        ctx.rook[cluster_name].remote.run(
            args=['rm', '-rf', 'rook', 'operator.yaml'])
        if op_job:
            op_job.wait()
        run.wait(ctx.cluster.run(args=['sudo', 'rm', '-rf', '/var/lib/rook']))
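
rook_operator above follows teuthology's setup/teardown generator pattern: everything before yield is setup, and the finally block is teardown. The sketch below shows how a parent task typically composes such generators. It assumes the generators are wrapped with contextlib.contextmanager (the decorator is not visible in the extracted snippet) and that teuthology.contextutil.nested accepts zero-argument callables returning context managers, entering them in order and exiting them in reverse; example_subtask and task here are illustrative names.

import contextlib
import logging

from teuthology import contextutil

log = logging.getLogger(__name__)


@contextlib.contextmanager
def example_subtask(ctx, config):
    """Hypothetical subtask: setup before yield, teardown in finally."""
    log.info('setting up example_subtask')
    try:
        yield
    finally:
        # Teardown runs even if a later subtask or the test body fails.
        log.info('tearing down example_subtask')


@contextlib.contextmanager
def task(ctx, config):
    """Hypothetical parent task composing subtasks like rook_operator."""
    with contextutil.nested(
        lambda: example_subtask(ctx, config),
        lambda: rook_operator(ctx, config),  # the generator shown above
    ):
        yield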
Example #24
0
def cli_test(ctx, config):
    """
     ceph-deploy cli to exercise most commonly use cli's and ensure
     all commands works and also startup the init system.

    """
    log.info('Ceph-deploy Test')
    if config is None:
        config = {}
    test_branch = ''
    conf_dir = teuthology.get_testdir(ctx) + "/cdtest"

    def execute_cdeploy(admin, cmd, path):
        """Execute a ceph-deploy command, using either the ceph-deploy
        binary from the git checkout (when path is set) or the installed
        package."""
        args = ['cd', conf_dir, run.Raw(';')]
        if path:
            args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
        else:
            args.append('ceph-deploy')
        args.append(run.Raw(cmd))
        ec = admin.run(args=args, check_status=False).exitstatus
        if ec != 0:
            raise RuntimeError(
                "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(cmd=cmd, ec=ec))

    if config.get('rhbuild'):
        path = None
    else:
        path = teuthology.get_testdir(ctx)
        # test on branch from config eg: wip-* , master or next etc
        # packages for all distro's should exist for wip*
        if ctx.config.get('branch'):
            branch = ctx.config.get('branch')
            test_branch = ' --dev={branch} '.format(branch=branch)
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    for node, role in mons.remotes.items():
        admin = node
        admin.run(args=['mkdir', conf_dir], check_status=False)
        nodename = admin.shortname
    system_type = teuthology.get_system_type(admin)
    if config.get('rhbuild'):
        admin.run(args=['sudo', 'yum', 'install', 'ceph-deploy', '-y'])
    log.info('system type is %s', system_type)
    osds = ctx.cluster.only(teuthology.is_type('osd'))

    for remote, roles in osds.remotes.items():
        devs = teuthology.get_scratch_devices(remote)
        log.info("roles %s", roles)
        if len(devs) < 3:
            log.error(
                'Test needs a minimum of 3 scratch devices, only found %s',
                str(devs))
            raise RuntimeError("Needs a minimum of 3 scratch devices")

    conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir)
    new_cmd = 'new ' + nodename
    execute_cdeploy(admin, new_cmd, path)
    if config.get('conf') is not None:
        confp = config.get('conf')
        for section, keys in confp.items():
            lines = '[{section}]\n'.format(section=section)
            teuthology.append_lines_to_file(admin, conf_path, lines,
                                            sudo=True)
            for key, value in keys.items():
                log.info("[%s] %s = %s" % (section, key, value))
                lines = '{key} = {value}\n'.format(key=key, value=value)
                teuthology.append_lines_to_file(admin, conf_path, lines,
                                                sudo=True)
    new_mon_install = 'install {branch} --mon '.format(
        branch=test_branch) + nodename
    new_mgr_install = 'install {branch} --mgr '.format(
        branch=test_branch) + nodename
    new_osd_install = 'install {branch} --osd '.format(
        branch=test_branch) + nodename
    new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename
    create_initial = 'mon create-initial '
    # either use create-keys or push command
    push_keys = 'admin ' + nodename
    execute_cdeploy(admin, new_mon_install, path)
    execute_cdeploy(admin, new_mgr_install, path)
    execute_cdeploy(admin, new_osd_install, path)
    execute_cdeploy(admin, new_admin, path)
    execute_cdeploy(admin, create_initial, path)
    execute_cdeploy(admin, push_keys, path)

    for i in range(3):
        zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i])
        prepare = 'osd prepare ' + "{n}:{d}".format(n=nodename, d=devs[i])
        execute_cdeploy(admin, zap_disk, path)
        execute_cdeploy(admin, prepare, path)

    log.info("list files for debugging purpose to check file permissions")
    admin.run(args=['ls', run.Raw('-lt'), conf_dir])
    remote.run(args=['sudo', 'ceph', '-s'], check_status=False)
    r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
    out = r.stdout.getvalue()
    log.info('Ceph health: %s', out.rstrip('\n'))
    log.info("Waiting for cluster to become healthy")
    with contextutil.safe_while(sleep=10, tries=6,
                                action='check health') as proceed:
        while proceed():
            r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
            out = r.stdout.getvalue()
            if out.split(None, 1)[0] == 'HEALTH_OK':
                break
    rgw_install = 'install {branch} --rgw {node}'.format(
        branch=test_branch,
        node=nodename,
    )
    rgw_create = 'rgw create ' + nodename
    execute_cdeploy(admin, rgw_install, path)
    execute_cdeploy(admin, rgw_create, path)
    log.info('All ceph-deploy cli tests passed')
    try:
        yield
    finally:
        log.info("cleaning up")
        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                              'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
                              'sudo', 'systemctl', 'stop', 'ceph.target'],
                        check_status=False)
        time.sleep(4)
        for i in range(3):
            umount_dev = "{d}1".format(d=devs[i])
            r = remote.run(args=['sudo', 'umount', run.Raw(umount_dev)])
        cmd = 'purge ' + nodename
        execute_cdeploy(admin, cmd, path)
        cmd = 'purgedata ' + nodename
        execute_cdeploy(admin, cmd, path)
        log.info("Removing temporary dir")
        admin.run(
            args=[
                'rm',
                run.Raw('-rf'),
                run.Raw(conf_dir)],
            check_status=False)
        if config.get('rhbuild'):
            admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y'])
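
The HEALTH_OK wait in cli_test is a second common safe_while shape: polling until a predicate holds rather than retrying on exceptions. A reusable sketch of that polling helper follows; wait_until and its parameters are illustrative, not part of teuthology, and safe_while is again assumed to raise MaxWhileTries when the attempts run out.

import logging

from teuthology.contextutil import safe_while

log = logging.getLogger(__name__)


def wait_until(predicate, sleep=10, tries=6, action='poll'):
    """Poll a zero-argument predicate until it returns True.

    Raises teuthology.exceptions.MaxWhileTries (via safe_while) if the
    predicate never becomes true within the allotted attempts.
    """
    with safe_while(sleep=sleep, tries=tries, action=action) as proceed:
        while proceed():
            if predicate():
                return
            log.debug('%s: condition not met yet; retrying...', action)


# Illustrative usage mirroring the health check above (remote is assumed
# to be a teuthology Remote; StringIO comes from io):
# wait_until(
#     lambda: remote.run(args=['sudo', 'ceph', 'health'],
#                        stdout=StringIO())
#                   .stdout.getvalue().startswith('HEALTH_OK'),
#     action='check health',
# )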
Example #25
0
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid
    num_mons = 1

    try:
        for remote, roles in ctx.cluster.remotes.items():
            for mon in [r for r in roles
                        if teuthology.is_type('mon', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mon)
                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                    continue
                log.info('Adding %s on %s' % (mon, remote.shortname))
                num_mons += 1
                _shell(ctx, cluster_name, remote, [
                    'ceph', 'orch', 'daemon', 'add', 'mon',
                    remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                ])
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

                with contextutil.safe_while(sleep=1, tries=180) as proceed:
                    while proceed():
                        log.info('Waiting for %d mons in monmap...' % (num_mons))
                        r = _shell(
                            ctx=ctx,
                            cluster_name=cluster_name,
                            remote=remote,
                            args=[
                                'ceph', 'mon', 'dump', '-f', 'json',
                            ],
                            stdout=StringIO(),
                        )
                        j = json.loads(r.stdout.getvalue())
                        if len(j['mons']) == num_mons:
                            break

        # refresh our (final) ceph.conf file
        log.info('Generating final ceph.conf file...')
        r = _shell(
            ctx=ctx,
            cluster_name=cluster_name,
            remote=remote,
            args=[
                'ceph', 'config', 'generate-minimal-conf',
            ],
            stdout=StringIO(),
        )
        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()

        yield

    finally:
        pass
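
ceph_mons registers each new mon with ctx.daemons so that later tasks can manage it. A short sketch of what that enables is below; it assumes teuthology's DaemonGroup exposes get_daemon(type_, id_, cluster) and per-daemon restart(), as the ceph QA suites use, so treat those names as assumptions rather than a guaranteed API.

# Sketch only: restarting a mon that ceph_mons registered above.
# get_daemon()/restart() follow the DaemonGroup interface used by the
# ceph QA suites; treat them as assumptions if your teuthology differs.
def restart_registered_mon(ctx, cluster_name, mon_id):
    daemon = ctx.daemons.get_daemon('mon', mon_id, cluster_name)
    if daemon is None:
        raise RuntimeError('mon.%s was never registered' % mon_id)
    daemon.restart()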
Example #26
0
def task(ctx, config):
    log.info('Setting up nvme_loop on scratch devices...')
    host = 'hostnqn'
    port = '1'
    devs_by_remote = {}
    old_scratch_by_remote = {}
    for remote, roles in ctx.cluster.remotes.items():
        devs = teuthology.get_scratch_devices(remote)
        devs_by_remote[remote] = devs
        base = '/sys/kernel/config/nvmet'
        remote.run(args=[
            'sudo',
            'modprobe',
            'nvme_loop',
            run.Raw('&&'),
            'sudo',
            'mkdir',
            '-p',
            f'{base}/hosts/{host}',
            run.Raw('&&'),
            'sudo',
            'mkdir',
            '-p',
            f'{base}/ports/{port}',
            run.Raw('&&'),
            'echo',
            'loop',
            run.Raw('|'),
            'sudo',
            'tee',
            f'{base}/ports/{port}/addr_trtype',
        ])
        for dev in devs:
            short = dev.split('/')[-1]
            log.info(f'Connecting nvme_loop {remote.shortname}:{dev}...')
            remote.run(args=[
                'sudo',
                'mkdir',
                '-p',
                f'{base}/subsystems/{short}',
                run.Raw('&&'),
                'echo',
                '1',
                run.Raw('|'),
                'sudo',
                'tee',
                f'{base}/subsystems/{short}/attr_allow_any_host',
                run.Raw('&&'),
                'sudo',
                'mkdir',
                '-p',
                f'{base}/subsystems/{short}/namespaces/1',
                run.Raw('&&'),
                'echo',
                dev,
                run.Raw('|'),
                'sudo',
                'tee',
                f'{base}/subsystems/{short}/namespaces/1/device_path',
                run.Raw('&&'),
                'echo',
                '1',
                run.Raw('|'),
                'sudo',
                'tee',
                f'{base}/subsystems/{short}/namespaces/1/enable',
                run.Raw('&&'),
                'sudo',
                'ln',
                '-s',
                f'{base}/subsystems/{short}',
                f'{base}/ports/{port}/subsystems/{short}',
                run.Raw('&&'),
                'sudo',
                'nvme',
                'connect',
                '-t',
                'loop',
                '-n',
                short,
                '-q',
                host,
            ])

        # identify nvme_loop devices
        old_scratch_by_remote[remote] = remote.read_file('/scratch_devs')

        with contextutil.safe_while(sleep=1, tries=15) as proceed:
            while proceed():
                p = remote.run(args=['sudo', 'nvme', 'list'],
                               stdout=StringIO())
                new_devs = []
                for line in p.stdout.getvalue().splitlines():
                    dev, _, vendor = line.split()[0:3]
                    if dev.startswith('/dev/') and vendor == 'Linux':
                        new_devs.append(dev)
                log.info(f'new_devs {new_devs}')
                assert len(new_devs) <= len(devs)
                if len(new_devs) == len(devs):
                    break

        remote.write_file(path='/scratch_devs',
                          data='\n'.join(new_devs) + '\n',
                          sudo=True)

    try:
        yield

    finally:
        for remote, devs in devs_by_remote.items():
            for dev in devs:
                short = dev.split('/')[-1]
                log.info(
                    f'Disconnecting nvme_loop {remote.shortname}:{dev}...')
                remote.run(
                    args=['sudo', 'nvme', 'disconnect', '-n', short],
                    check_status=False,
                )
            remote.write_file(path='/scratch_devs',
                              data=old_scratch_by_remote[remote],
                              sudo=True)