Example #1
0
def healthy(ctx, config):
    """
    Block until every OSD is up and the health monitor reports HEALTH_OK.

    If the cluster has any MDS role, also wait (300 second timeout) for the
    filesystem daemons.

    :param ctx: Context
    :param config: Configuration. Anything that is not a dict is treated as
                   an empty dict; the optional ``cluster`` key names the
                   cluster and defaults to ``ceph``.
    """
    if not isinstance(config, dict):
        config = {}
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)

    mon_role = teuthology.get_first_mon(ctx, config, cluster_name)
    # Exactly one remote is expected to carry the first-mon role.
    (admin_remote,) = ctx.cluster.only(mon_role).remotes.keys()

    shared_kwargs = dict(remote=admin_remote, ceph_cluster=cluster_name)
    teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, **shared_kwargs)
    teuthology.wait_until_healthy(ctx, **shared_kwargs)

    mds_nodes = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    if not mds_nodes.remotes:
        return
    # Some MDSs exist, wait for them to be healthy
    fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
    fs.wait_for_daemons(timeout=300)
Example #2
0
def cephfs_setup(ctx, config):
    """
    Create a CephFS filesystem if the cluster has MDS roles, then yield.

    When at least one remote carries an ``mds`` role, a ``Filesystem`` is
    created (unless ``legacy_configured()`` reports it already exists) and
    ``ceph mds set_max_mds`` is run on the first monitor's remote with the
    number of active MDS roles.

    :param ctx: Context
    :param config: Configuration, passed through to ``get_first_mon``
    """
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config)
    # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        ceph_fs = Filesystem(ctx)
        if not ceph_fs.legacy_configured():
            ceph_fs.create()

        def is_active_mds(role):
            # Roles ending in '-s' or containing '-s-' are not counted as
            # active (presumably the standby naming convention).
            return (role.startswith('mds.')
                    and not role.endswith('-s')
                    and '-s-' not in role)

        num_active = sum(
            1
            for remote_roles in mdss.remotes.values()
            for role in remote_roles
            if is_active_mds(role)
        )
        mon_remote.run(args=[
            'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph',
            'mds', 'set_max_mds',
            str(num_active)
        ])

    yield
Example #3
0
    def execute_playbook(self):
        """
        Choose the installer node and run the ansible playbook.

        Records the first-mon remote, the installer remote and the command
        line on ``self`` (``ceph_first_mon``, ``ceph_installer``, ``args``),
        then calls ``run_rh_playbook()`` when the config has a truthy
        ``rhbuild`` entry and ``run_playbook()`` otherwise.
        """
        playbook_cmd = [
            'ANSIBLE_STDOUT_CALLBACK=debug',
            'ansible-playbook',
            '-vv',
            '-i',
            'inven.yml',
            'site.yml',
        ]
        log.debug("Running %s", playbook_cmd)
        # If there is an installer.0 node, use that for the installer.
        # Otherwise, use the first mon node as installer node.
        installer_nodes = self.ctx.cluster.only('installer.0')
        mon_nodes = self.ctx.cluster.only(
            misc.get_first_mon(self.ctx, self.config))
        (first_mon_remote,) = mon_nodes.remotes.keys()
        installer_remote = first_mon_remote
        if installer_nodes.remotes:
            (installer_remote,) = installer_nodes.remotes.keys()
        self.ceph_first_mon = first_mon_remote
        self.ceph_installer = installer_remote
        self.args = playbook_cmd
        if not self.config.get('rhbuild'):
            self.run_playbook()
            return
        self.run_rh_playbook()
Example #4
0
def task(ctx, config):
    """
    Test monitor recovery from OSD

    Removes the monitors via ``_nuke_mons``, rebuilds the monitor database
    with ``_rebuild_db``, then revives mons, mgrs and OSDs in that order.

    :param ctx: Context
    :param config: dict or None. The optional ``keyring_path`` key overrides
                   the default ``/etc/ceph/<cluster>.keyring``.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    first_mon = teuthology.get_first_mon(ctx, config)
    # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'))

    mons = ctx.cluster.only(teuthology.is_type('mon'))
    # note down the first cluster_name and mon_id
    # we will recover it later on
    cluster_name, _, mon_id = teuthology.split_role(first_mon)
    _nuke_mons(manager, mons, mon_id)
    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
        cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)
    _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path)
    _revive_mons(manager, mons, mon_id, keyring_path)
    _revive_mgrs(ctx, manager)
    _revive_osds(ctx, manager)
Example #5
0
def task(ctx, config):
    """
    Stress test the monitor by thrashing them while another task/workunit
    is running.

    Please refer to MonitorThrasher class for further information on the
    available options.

    A ``MonitorThrasher`` is created before yielding. On exit it is joined
    and the task waits for the monitor quorum to reach the number of
    monitors reported by ``_get_mons``.

    :param ctx: Context
    :param config: dict or None; handed to ``MonitorThrasher``
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mon_thrash task only accepts a dict for configuration'
    assert len(_get_mons(ctx)) > 2, \
        'mon_thrash task requires at least 3 monitors'
    log.info('Beginning mon_thrash...')
    first_mon = teuthology.get_first_mon(ctx, config)
    # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )
    thrash_proc = MonitorThrasher(ctx,
        manager, config,
        logger=log.getChild('mon_thrasher'))
    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining mon_thrasher')
        thrash_proc.do_join()
        mons = _get_mons(ctx)
        manager.wait_for_mon_quorum_size(len(mons))
Example #6
0
    def __init__(self, ctx, admin_remote=None):
        """
        Collect the MDS ids, admin remote, monitor manager and first client.

        :param ctx: Context
        :param admin_remote: remote to run admin commands on; when None the
                             single remote holding the first-mon role is used
        :raises RuntimeError: if the cluster has no ``mds`` role
        """
        self._ctx = ctx

        self.mds_ids = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
        if not self.mds_ids:
            raise RuntimeError("This task requires at least one MDS")

        first_mon = misc.get_first_mon(ctx, None)
        if admin_remote is None:
            # .keys() works on both Python 2 and 3; .iterkeys() is
            # Python 2 only.
            (self.admin_remote,) = ctx.cluster.only(first_mon).remotes.keys()
        else:
            self.admin_remote = admin_remote
        self.mon_manager = ceph_manager.CephManager(
            self.admin_remote, ctx=ctx, logger=log.getChild('ceph_manager'))
        if hasattr(self._ctx, "daemons"):
            # Presence of 'daemons' attribute implies ceph task rather than ceph_deploy task
            self.mds_daemons = {
                mds_id: self._ctx.daemons.get_daemon('mds', mds_id)
                for mds_id in self.mds_ids
            }

        # The first client role is used; an IndexError is raised here when
        # the cluster has no client role at all.
        client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client'))
        self.client_id = client_list[0]
        client_role = "client.{0}".format(self.client_id)
        self.client_remote = list(
            misc.get_clients(ctx=ctx, roles=[client_role]))[0][1]
Example #7
0
def is_healthy(ctx, config):
    """Wait until a Ceph cluster is healthy.

    Runs ``sudo ceph health`` from the test directory on the first monitor's
    remote, up to 90 times with a 10 second pause after each unsuccessful
    attempt, and returns as soon as the first word of the output is
    ``HEALTH_OK``.

    :param ctx: Context
    :param config: Configuration, passed through to ``get_first_mon``
    :raises RuntimeError: if HEALTH_OK is not seen within the 90 attempts
    """
    testdir = teuthology.get_testdir(ctx)
    ceph_admin = teuthology.get_first_mon(ctx, config)
    (remote,) = ctx.cluster.only(ceph_admin).remotes.keys()
    max_tries = 90  # 90 tries * 10 secs --> 15 minutes
    # range(max_tries) performs all 90 attempts; the previous counter check
    # gave up before running the 90th.
    for _ in range(max_tries):
        r = remote.run(
            args=[
                'cd',
                '{tdir}'.format(tdir=testdir),
                run.Raw('&&'),
                'sudo', 'ceph',
                'health',
                ],
            stdout=StringIO(),
            logger=log.getChild('health'),
            )
        out = r.stdout.getvalue()
        log.info('Ceph health: %s', out.rstrip('\n'))
        # Empty or whitespace-only output splits to []; comparing a slice
        # avoids the IndexError that indexing [0] would raise.
        if out.split(None, 1)[:1] == ['HEALTH_OK']:
            return
        time.sleep(10)

    msg = "ceph health was unable to get 'HEALTH_OK' after waiting 15 minutes"
    raise RuntimeError(msg)
Example #8
0
def cephfs_setup(ctx, config):
    """
    Create a CephFS filesystem if the cluster has MDS roles, then yield.

    When at least one remote carries an ``mds`` role, a ``Filesystem`` is
    created (unless ``legacy_configured()`` reports it already exists) and
    ``ceph mds set_max_mds`` is run on the first monitor's remote with the
    number of active MDS roles.

    :param ctx: Context
    :param config: Configuration, passed through to ``get_first_mon``
    """
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config)
    # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        ceph_fs = Filesystem(ctx)
        if not ceph_fs.legacy_configured():
            ceph_fs.create()

        def is_active_mds(role):
            # Roles ending in '-s' or containing '-s-' are not counted as
            # active (presumably the standby naming convention).
            return (role.startswith('mds.')
                    and not role.endswith('-s')
                    and '-s-' not in role)

        num_active = sum(
            1
            for remote_roles in mdss.remotes.values()
            for role in remote_roles
            if is_active_mds(role)
        )
        mon_remote.run(args=[
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph',
            'mds', 'set_max_mds', str(num_active)])

    yield
Example #9
0
def wait_for_mon_quorum(ctx, config):
    """
    Poll remote ``ceph quorum_status`` until the quorum matches ``config``.

    The command runs on the first monitor's remote once per second; the
    function returns when the sorted ``quorum_names`` from its JSON output
    equal the sorted ``config`` list. There is no timeout.

    :param ctx: Context
    :param config: Configuration; must be a list of monitor names
    """

    assert isinstance(config, list)
    mon_role = teuthology.get_first_mon(ctx, config)
    (remote,) = ctx.cluster.only(mon_role).remotes.keys()
    quorum_cmd = ['ceph', 'quorum_status']
    while True:
        proc = remote.run(
            args=quorum_cmd,
            stdout=StringIO(),
            logger=log.getChild('quorum_status'),
        )
        status = json.loads(proc.stdout.getvalue())
        in_quorum = status.get('quorum_names', [])
        log.debug('Quorum: %s', in_quorum)
        if sorted(in_quorum) == sorted(config):
            return
        time.sleep(1)
    def __init__(self, ctx, manager, config, logger):
        """
        Store the collaborators and read the skew-check settings.

        ``ceph-mon --show-config-value mon_clock_drift_allowed`` is run on a
        remote holding the first-mon role; its output is the default for
        ``max-skew``.

        :param ctx: Context
        :param manager: stored as ``self.manager``
        :param config: dict or None. Keys read: ``interval`` (default 30.0),
                       ``max-skew``, ``expect-skew`` (False), ``never-fail``
                       (False), ``at-least-once`` (True),
                       ``at-least-once-timeout`` (600.0)
        :param logger: stored as ``self.logger``
        """
        self.ctx = ctx
        self.manager = manager

        self.stopping = False
        self.logger = logger
        self.config = config

        if self.config is None:
            self.config = dict()

        self.check_interval = float(self.config.get('interval', 30.0))

        first_mon = teuthology.get_first_mon(ctx, config)
        # dict.keys() is a non-subscriptable view on Python 3, so materialise
        # it before taking the first remote.
        remote = list(ctx.cluster.only(first_mon).remotes.keys())[0]
        proc = remote.run(
            args=[
                'sudo',
                'ceph-mon',
                # drops the first four characters of the role
                # (presumably the 'mon.' prefix)
                '-i', first_mon[4:],
                '--show-config-value', 'mon_clock_drift_allowed'
                ], stdout=StringIO(), wait=True
                )
        self.max_skew = self.config.get('max-skew', float(proc.stdout.getvalue()))

        self.expect_skew = self.config.get('expect-skew', False)
        self.never_fail = self.config.get('never-fail', False)
        self.at_least_once = self.config.get('at-least-once', True)
        self.at_least_once_timeout = self.config.get('at-least-once-timeout', 600.0)
def task(ctx, config):
    """
    Use class ClockSkewCheck to check for clock skews on the monitors.
    This task will spawn a thread running ClockSkewCheck's do_check().

    All the configuration will be directly handled by ClockSkewCheck,
    so please refer to the class documentation for further information.

    On exit the check is told to finish and the spawned greenlet's result is
    fetched with ``get()``.

    :param ctx: Context
    :param config: dict or None; handed to ``ClockSkewCheck``
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mon_clock_skew_check task only accepts a dict for configuration'
    log.info('Beginning mon_clock_skew_check...')
    first_mon = teuthology.get_first_mon(ctx, config)
    # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    skew_check = ClockSkewCheck(ctx,
        manager, config,
        logger=log.getChild('mon_clock_skew_check'))
    skew_check_thread = gevent.spawn(skew_check.do_check)
    try:
        yield
    finally:
        log.info('joining mon_clock_skew_check')
        skew_check.finish()
        skew_check_thread.get()
Example #12
0
    def execute_playbook(self):
        """
        Execute ansible-playbook

        Records the first-mon remote, the installer remote and the command
        line on ``self`` (``ceph_first_mon``, ``ceph_installer``, ``args``),
        then calls ``run_rh_playbook()`` when the config has a truthy
        ``rhbuild`` entry and ``run_playbook()`` otherwise.
        """

        args = [
            'ANSIBLE_STDOUT_CALLBACK=debug',
            'ansible-playbook', '-vv',
            '-i', 'inven.yml', 'site.yml'
        ]
        log.debug("Running %s", args)
        # If there is an installer.0 node, use that for the installer.
        # Otherwise, use the first mon node as installer node.
        ansible_loc = self.ctx.cluster.only('installer.0')
        # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
        (ceph_first_mon,) = self.ctx.cluster.only(
            misc.get_first_mon(self.ctx,
                               self.config)).remotes.keys()
        if ansible_loc.remotes:
            (ceph_installer,) = ansible_loc.remotes.keys()
        else:
            ceph_installer = ceph_first_mon
        self.ceph_first_mon = ceph_first_mon
        self.ceph_installer = ceph_installer
        self.args = args
        if self.config.get('rhbuild'):
            self.run_rh_playbook()
        else:
            self.run_playbook()
Example #13
0
def wait_for_mon_quorum(ctx, config):
    """
    Poll remote ``sudo ceph quorum_status`` until the expected mons are in
    quorum.

    The command runs on the first monitor's remote once per second; the
    function returns when the sorted ``quorum_names`` from its JSON output
    equal the sorted list of expected monitors. There is no timeout.

    :param ctx: Context
    :param config: Configuration. Either a dict with a ``daemons`` list and
                   an optional ``cluster`` name (default ``ceph``), or a
                   plain list of monitor names.
    """
    given_dict = isinstance(config, dict)
    if not given_dict:
        assert isinstance(config, list)
    expected_mons = config['daemons'] if given_dict else config
    cluster_name = config.get('cluster', 'ceph') if given_dict else 'ceph'

    mon_role = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(mon_role).remotes.keys()
    quorum_cmd = ['sudo', 'ceph', 'quorum_status']
    while True:
        proc = remote.run(
            args=quorum_cmd,
            stdout=StringIO(),
            logger=log.getChild('quorum_status'),
        )
        status = json.loads(proc.stdout.getvalue())
        in_quorum = status.get('quorum_names', [])
        log.debug('Quorum: %s', in_quorum)
        if sorted(in_quorum) == sorted(expected_mons):
            return
        time.sleep(1)
Example #14
0
    def test_standby_for_invalid_fscid(self):
        """
        Check that stopping the active MDS does not drop a mon from quorum
        when another MDS is configured with an invalid standby fscid.

        The monitor quorum is recorded before the MDS daemons are
        reconfigured and again after the active MDS has been stopped and
        failed; the two must be equal.
        """
        # Set invalid standby_fscid with other mds standby_rank
        # stopping active mds service should not end up in mon crash

        # Get configured mons in the cluster
        first_mon = teuthology.get_first_mon(self.ctx, self.configs_set)
        # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
        (mon,) = self.ctx.cluster.only(first_mon).remotes.keys()
        manager = CephManager(
            mon,
            ctx=self.ctx,
            logger=log.getChild('ceph_manager'),
        )
        configured_mons = manager.get_mon_quorum()

        use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
        mds_a, mds_b, mds_c = use_daemons
        log.info("Using MDS daemons: {0}".format(use_daemons))

        def set_standby_for_rank(leader_rank, follower_id):
            self.set_conf("mds.{0}".format(follower_id),
                          "mds_standby_for_rank", leader_rank)

        # Create one fs
        fs_a = self.mds_cluster.newfs("cephfs")

        # Set all the daemons to have a rank assignment but no other
        # standby preferences.
        set_standby_for_rank(0, mds_a)
        set_standby_for_rank(0, mds_b)

        # Set third daemon to have invalid fscid assignment and no other
        # standby preferences
        invalid_fscid = 123
        self.set_conf("mds.{0}".format(mds_c), "mds_standby_for_fscid",
                      invalid_fscid)

        # Restart all the daemons to make the standby preference applied
        self.mds_cluster.mds_restart(mds_a)
        self.mds_cluster.mds_restart(mds_b)
        self.mds_cluster.mds_restart(mds_c)
        self.wait_for_daemon_start([mds_a, mds_b, mds_c])

        # Stop active mds daemon service of fs.
        # The previous condition was the tuple `(names, [mds_a])`, which is
        # always truthy, so mds_a was stopped even when mds_b was active.
        # NOTE(review): assumes get_active_names() returns a list - confirm.
        if fs_a.get_active_names() == [mds_a]:
            active_mds = mds_a
        else:
            active_mds = mds_b
        self.mds_cluster.mds_stop(active_mds)
        self.mds_cluster.mds_fail(active_mds)
        fs_a.wait_for_daemons()

        # Get active mons from cluster
        active_mons = manager.get_mon_quorum()

        # Check for active quorum mon status and configured mon status
        self.assertEqual(
            active_mons, configured_mons,
            "Not all mons are in quorum Invalid standby invalid fscid test failed!"
        )
Example #15
0
def is_healthy(ctx, config):
    """Wait until a Ceph cluster is healthy.

    Runs ``sudo ceph health`` from the test directory on the first monitor's
    remote, up to 90 times with a 10 second pause after each unsuccessful
    attempt, and returns as soon as the first word of the output is
    ``HEALTH_OK``.

    :param ctx: Context
    :param config: Configuration, passed through to ``get_first_mon``
    :raises RuntimeError: if HEALTH_OK is not seen within the 90 attempts
    """
    testdir = teuthology.get_testdir(ctx)
    ceph_admin = teuthology.get_first_mon(ctx, config)
    (remote, ) = ctx.cluster.only(ceph_admin).remotes.keys()
    max_tries = 90  # 90 tries * 10 secs --> 15 minutes
    # range(max_tries) performs all 90 attempts; the previous counter check
    # gave up before running the 90th.
    for _ in range(max_tries):
        r = remote.run(
            args=[
                'cd',
                '{tdir}'.format(tdir=testdir),
                run.Raw('&&'),
                'sudo',
                'ceph',
                'health',
            ],
            stdout=StringIO(),
            logger=log.getChild('health'),
        )
        out = r.stdout.getvalue()
        log.debug('Ceph health: %s', out.rstrip('\n'))
        # Empty or whitespace-only output splits to []; comparing a slice
        # avoids the IndexError that indexing [0] would raise.
        if out.split(None, 1)[:1] == ['HEALTH_OK']:
            return
        time.sleep(10)

    msg = "ceph health was unable to get 'HEALTH_OK' after waiting 15 minutes"
    raise RuntimeError(msg)
Example #16
0
def task(ctx, config):
    """
    Test monitor recovery from OSD

    Removes the monitors via ``_nuke_mons``, rebuilds the monitor database
    with ``_rebuild_db``, then revives mons, mgrs and OSDs in that order.

    :param ctx: Context
    :param config: dict or None. The optional ``keyring_path`` key overrides
                   the default ``/etc/ceph/<cluster>.keyring``.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    first_mon = teuthology.get_first_mon(ctx, config)
    # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
    (mon, ) = ctx.cluster.only(first_mon).remotes.keys()
    manager = ceph_manager.CephManager(mon,
                                       ctx=ctx,
                                       logger=log.getChild('ceph_manager'))

    mons = ctx.cluster.only(teuthology.is_type('mon'))
    # note down the first cluster_name and mon_id
    # we will recover it later on
    cluster_name, _, mon_id = teuthology.split_role(first_mon)
    _nuke_mons(manager, mons, mon_id)
    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
        cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)
    _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path)
    _revive_mons(manager, mons, mon_id, keyring_path)
    _revive_mgrs(ctx, manager)
    _revive_osds(ctx, manager)
Example #17
0
def download_ceph_deploy(ctx, config):
    """
    Clone and bootstrap ceph-deploy on the first-mon node, then yield.

    The checkout lives in ``<testdir>/ceph-deploy`` and is removed with
    ``rm -rf`` when the generator is resumed or closed.

    :param ctx: Context
    :param config: Configuration, passed through to ``get_first_mon``
    """
    log.info('Downloading ceph-deploy...')
    testdir = teuthology.get_testdir(ctx)
    ceph_admin = teuthology.get_first_mon(ctx, config)
    checkout_dir = '{tdir}/ceph-deploy'.format(tdir=testdir)

    clone_cmd = ['git', 'clone', 'git://ceph.com/ceph-deploy.git', checkout_dir]
    ctx.cluster.only(ceph_admin).run(args=clone_cmd)

    bootstrap_cmd = ['cd', checkout_dir, run.Raw('&&'), './bootstrap']
    ctx.cluster.only(ceph_admin).run(args=bootstrap_cmd)

    try:
        yield
    finally:
        log.info('Removing ceph-deploy ...')
        cleanup_cmd = ['rm', '-rf', checkout_dir]
        ctx.cluster.only(ceph_admin).run(args=cleanup_cmd)
Example #18
0
  def __init__(self, ctx, manager, config, logger):
    """
    Store the collaborators and read the skew-check settings.

    ``ceph-mon --show-config-value mon_clock_drift_allowed`` is run on a
    remote holding the first-mon role; its output is the default for
    ``max-skew``.

    :param ctx: Context
    :param manager: stored as ``self.manager``
    :param config: dict or None. Keys read: ``interval`` (default 30.0),
                   ``max-skew``, ``expect-skew`` (False), ``never-fail``
                   (False), ``at-least-once`` (True),
                   ``at-least-once-timeout`` (600.0)
    :param logger: stored as ``self.logger``
    """
    self.ctx = ctx
    self.manager = manager

    self.stopping = False
    self.logger = logger
    self.config = config

    if self.config is None:
      self.config = dict()

    self.check_interval = float(self.config.get('interval', 30.0))

    first_mon = teuthology.get_first_mon(ctx, config)
    # dict.keys() is a non-subscriptable view on Python 3, so materialise
    # it before taking the first remote.
    remote = list(ctx.cluster.only(first_mon).remotes.keys())[0]
    proc = remote.run(
        args=[
          'sudo',
          'ceph-mon',
          # drops the first four characters of the role
          # (presumably the 'mon.' prefix)
          '-i', first_mon[4:],
          '--show-config-value', 'mon_clock_drift_allowed'
          ], stdout=StringIO(), wait=True
        )
    self.max_skew = self.config.get('max-skew', float(proc.stdout.getvalue()))

    self.expect_skew = self.config.get('expect-skew', False)
    self.never_fail = self.config.get('never-fail', False)
    self.at_least_once = self.config.get('at-least-once', True)
    self.at_least_once_timeout = self.config.get('at-least-once-timeout', 600.0)
Example #19
0
def task(ctx, config):
    """
    Check once whether the monitors report MON_CLOCK_SKEW.

    Waits for the full monitor quorum, sleeps for ``interval`` seconds, then
    inspects the ``checks`` entry of the monitor health report.

    :param ctx: Context
    :param config: dict or None. Keys read: ``interval`` (default 30.0) and
                   ``expect-skew`` (default False).
    :raises RuntimeError: if the presence of MON_CLOCK_SKEW does not match
                          ``expect-skew``
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mon_clock_skew_check task only accepts a dict for configuration'
    interval = float(config.get('interval', 30.0))
    expect_skew = config.get('expect-skew', False)

    log.info('Beginning mon_clock_skew_check...')
    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.keys()
    manager = ceph_manager.CephManager(
        mon_remote, ctx=ctx, logger=log.getChild('ceph_manager'))

    manager.wait_for_mon_quorum_size(len(teuthology.get_mon_names(ctx)))

    # wait a bit
    log.info('sleeping for {s} seconds'.format(s=interval))
    time.sleep(interval)

    health = manager.get_mon_health(True)
    log.info('got health %s' % health)
    skew_reported = 'MON_CLOCK_SKEW' in health['checks']
    if expect_skew and not skew_reported:
        raise RuntimeError('expected MON_CLOCK_SKEW but got none')
    if skew_reported and not expect_skew:
        raise RuntimeError('got MON_CLOCK_SKEW but expected none')
Example #20
0
def download_ceph_deploy(ctx, config):
    """
    Downloads ceph-deploy from the ceph.com git mirror and (by default)
    switches to the master branch. If the `ceph-deploy-branch` is specified, it
    will use that instead.

    The branch is read from ``config["ceph-deploy"]["ceph-deploy-branch"]``.
    The checkout lives in ``<testdir>/ceph-deploy``, is bootstrapped before
    yielding, and is removed with ``rm -rf`` on exit.

    :param ctx: Context
    :param config: dict of task configuration
    """
    log.info("Downloading ceph-deploy...")
    testdir = teuthology.get_testdir(ctx)
    ceph_admin = teuthology.get_first_mon(ctx, config)
    # Fall back to master when the "ceph-deploy" section is missing, null, or
    # present without a branch; previously the latter two produced
    # an AttributeError or `git clone -b None`.
    ceph_deploy_section = config.get("ceph-deploy") or {}
    ceph_deploy_branch = ceph_deploy_section.get("ceph-deploy-branch", "master")

    ctx.cluster.only(ceph_admin).run(
        args=[
            "git",
            "clone",
            "-b",
            ceph_deploy_branch,
            teuth_config.ceph_git_base_url + "ceph-deploy.git",
            "{tdir}/ceph-deploy".format(tdir=testdir),
        ]
    )
    ctx.cluster.only(ceph_admin).run(
        args=["cd", "{tdir}/ceph-deploy".format(tdir=testdir), run.Raw("&&"), "./bootstrap"]
    )

    try:
        yield
    finally:
        log.info("Removing ceph-deploy ...")
        ctx.cluster.only(ceph_admin).run(args=["rm", "-rf", "{tdir}/ceph-deploy".format(tdir=testdir)])
Example #21
0
def task(ctx, config):
  """
  Use class ClockSkewCheck to check for clock skews on the monitors.
  This task will spawn a thread running ClockSkewCheck's do_check().

  All the configuration will be directly handled by ClockSkewCheck,
  so please refer to the class documentation for further information.

  On exit the check is told to finish and the spawned greenlet's result is
  fetched with ``get()``.

  :param ctx: Context
  :param config: dict or None; handed to ``ClockSkewCheck``
  """
  if config is None:
    config = {}
  assert isinstance(config, dict), \
      'mon_clock_skew_check task only accepts a dict for configuration'
  log.info('Beginning mon_clock_skew_check...')
  first_mon = teuthology.get_first_mon(ctx, config)
  # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
  (mon,) = ctx.cluster.only(first_mon).remotes.keys()
  manager = ceph_manager.CephManager(
      mon,
      ctx=ctx,
      logger=log.getChild('ceph_manager'),
      )

  skew_check = ClockSkewCheck(ctx,
      manager, config,
      logger=log.getChild('mon_clock_skew_check'))
  skew_check_thread = gevent.spawn(skew_check.do_check)
  try:
    yield
  finally:
    log.info('joining mon_clock_skew_check')
    skew_check.finish()
    skew_check_thread.get()
Example #22
0
def task(ctx, config):
    """
    Test [deep] repair in several situations:
      Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]

    The config should be as follows:

      Must include the log-whitelist below
      Must enable filestore_debug_inject_read_err config

    example:

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
          - 'candidate had a stat error'
          - 'candidate had a read error'
          - 'deep-scrub 0 missing, 1 inconsistent objects'
          - 'deep-scrub 0 missing, 4 inconsistent objects'
          - 'deep-scrub 1 errors'
          - 'deep-scrub 4 errors'
          - '!= known omap_digest'
          - 'repair 0 missing, 1 inconsistent objects'
          - 'repair 0 missing, 4 inconsistent objects'
          - 'repair 1 errors, 1 fixed'
          - 'repair 4 errors, 4 fixed'
          - 'scrub 0 missing, 1 inconsistent'
          - 'scrub 1 errors'
          - 'size 1 != known size'
        conf:
          osd:
            filestore debug inject read err: true
    - repair_test:

    A ``CephManager`` is created and attached to ``ctx.manager`` when the
    context does not already have one.

    :param ctx: Context
    :param config: dict or None
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'repair_test task only accepts a dict for config'

    if not hasattr(ctx, 'manager'):
        first_mon = teuthology.get_first_mon(ctx, config)
        # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
        (mon, ) = ctx.cluster.only(first_mon).remotes.keys()
        ctx.manager = ceph_manager.CephManager(
            mon, ctx=ctx, logger=log.getChild('ceph_manager'))

    ctx.manager.wait_for_all_up()

    repair_test_1(ctx, mdataerr, choose_primary, "scrub")
    repair_test_1(ctx, mdataerr, choose_replica, "scrub")
    repair_test_1(ctx, dataerr, choose_primary, "deep-scrub")
    repair_test_1(ctx, dataerr, choose_replica, "deep-scrub")
    repair_test_1(ctx, trunc, choose_primary, "scrub")
    repair_test_1(ctx, trunc, choose_replica, "scrub")
    repair_test_2(ctx, config, choose_primary)
    repair_test_2(ctx, config, choose_replica)

    repair_test_erasure_code(ctx, hinfoerr, 'primary', "deep-scrub")
Example #23
0
def download_ceph_deploy(ctx, config):
    """
    Clone the requested ceph-deploy branch on the first-mon node and
    bootstrap it, then yield.

    The branch comes from the ``ceph-deploy-branch`` config key and defaults
    to ``master``. The checkout lives in ``<testdir>/ceph-deploy`` and is
    removed with ``rm -rf`` on exit.

    :param ctx: Context
    :param config: dict of task configuration
    """
    log.info('Downloading ceph-deploy...')
    testdir = teuthology.get_testdir(ctx)
    ceph_admin = ctx.cluster.only(teuthology.get_first_mon(ctx, config))
    branch = config.get('ceph-deploy-branch', 'master')
    checkout_dir = '{tdir}/ceph-deploy'.format(tdir=testdir)
    repo_url = teuth_config.ceph_git_base_url + 'ceph-deploy.git'

    ceph_admin.run(args=['git', 'clone', '-b', branch, repo_url, checkout_dir])
    ceph_admin.run(args=['cd', checkout_dir, run.Raw('&&'), './bootstrap'])

    try:
        yield
    finally:
        log.info('Removing ceph-deploy ...')
        ceph_admin.run(args=['rm', '-rf', checkout_dir])
Example #24
0
def healthy(ctx, config):
    """
    Wait for the OSDs to come up and for the health monitor to report
    HEALTH_OK; when MDS roles exist, also wait for the filesystem daemons.

    :param ctx: Context
    :param config: Configuration. A non-dict value is replaced by an empty
                   dict; the ``cluster`` key (default ``ceph``) names the
                   cluster to wait on.
    """
    settings = config if isinstance(config, dict) else dict()
    cluster_name = settings.get('cluster', 'ceph')
    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)

    first_mon_role = teuthology.get_first_mon(ctx, settings, cluster_name)
    mon_remotes = ctx.cluster.only(first_mon_role).remotes
    # Exactly one remote is expected to carry the first-mon role.
    (mon_remote,) = mon_remotes.keys()

    teuthology.wait_until_osds_up(
        ctx, cluster=ctx.cluster, remote=mon_remote,
        ceph_cluster=cluster_name)
    teuthology.wait_until_healthy(
        ctx, remote=mon_remote, ceph_cluster=cluster_name)

    is_mds = teuthology.is_type('mds', cluster_name)
    if ctx.cluster.only(is_mds).remotes:
        # Some MDSs exist, wait for them to be healthy
        # TODO: make Filesystem cluster-aware
        Filesystem(ctx).wait_for_daemons(timeout=300)
Example #25
0
def crush_setup(ctx, config):
    """
    Set the CRUSH tunables profile on the first monitor's remote, then yield.

    :param ctx: Context
    :param config: dict; ``crush_tunables`` selects the profile
                   (default ``default``)
    """
    first_mon = teuthology.get_first_mon(ctx, config)
    # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
    (mon_remote, ) = ctx.cluster.only(first_mon).remotes.keys()

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    mon_remote.run(args=['sudo', 'ceph', 'osd', 'crush', 'tunables', profile])
    yield
Example #26
0
def crush_setup(ctx, config):
    """
    Set the CRUSH tunables profile for the named cluster, then yield.

    :param ctx: Context
    :param config: dict; ``cluster`` is required (KeyError when absent) and
                   ``crush_tunables`` selects the profile (default
                   ``default``)
    """
    cluster_name = config["cluster"]
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()

    profile = config.get("crush_tunables", "default")
    log.info("Setting crush tunables to %s", profile)
    mon_remote.run(args=["sudo", "ceph", "--cluster", cluster_name, "osd", "crush", "tunables", profile])
    yield
Example #27
0
def wait_for_osds_up(ctx, config):
    """
    Wait, via ``teuthology.wait_until_osds_up``, for the cluster's OSDs.

    :param ctx: Context
    :param config: Configuration, passed through to ``get_first_mon``
    """
    log.info('Waiting until ceph osds are all up...')
    mon_role = teuthology.get_first_mon(ctx, config)
    # Exactly one remote is expected to carry the first-mon role.
    (admin_remote,) = ctx.cluster.only(mon_role).remotes.keys()
    teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, remote=admin_remote)
Example #28
0
def crush_setup(ctx, config):
    """
    Set the CRUSH tunables profile on the first monitor's remote, then yield.

    :param ctx: Context
    :param config: dict; ``crush_tunables`` selects the profile
                   (default ``default``)
    """
    first_mon = teuthology.get_first_mon(ctx, config)
    # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    mon_remote.run(
        args=['sudo', 'ceph', 'osd', 'crush', 'tunables', profile])
    yield
    def test_standby_for_invalid_fscid(self):
        """
        Check that stopping the active MDS does not drop a mon from quorum
        when another MDS is configured with an invalid standby fscid.

        The monitor quorum is recorded before the MDS daemons are
        reconfigured and again after the active MDS has been stopped and
        failed; the two must be equal.
        """
        # Set invalid standby_fscid with other mds standby_rank
        # stopping active mds service should not end up in mon crash

        # Get configured mons in the cluster
        first_mon = teuthology.get_first_mon(self.ctx, self.configs_set)
        # .keys() works on both Python 2 and 3; .iterkeys() is Python 2 only.
        (mon,) = self.ctx.cluster.only(first_mon).remotes.keys()
        manager = CephManager(
            mon,
            ctx=self.ctx,
            logger=log.getChild('ceph_manager'),
        )
        configured_mons = manager.get_mon_quorum()

        use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
        mds_a, mds_b, mds_c = use_daemons
        log.info("Using MDS daemons: {0}".format(use_daemons))

        def set_standby_for_rank(leader_rank, follower_id):
            self.set_conf("mds.{0}".format(follower_id),
                          "mds_standby_for_rank", leader_rank)

        # Create one fs
        fs_a = self.mds_cluster.newfs("cephfs")

        # Set all the daemons to have a rank assignment but no other
        # standby preferences.
        set_standby_for_rank(0, mds_a)
        set_standby_for_rank(0, mds_b)

        # Set third daemon to have invalid fscid assignment and no other
        # standby preferences
        invalid_fscid = 123
        self.set_conf("mds.{0}".format(mds_c), "mds_standby_for_fscid", invalid_fscid)

        # Restart all the daemons to make the standby preference applied
        self.mds_cluster.mds_restart(mds_a)
        self.mds_cluster.mds_restart(mds_b)
        self.mds_cluster.mds_restart(mds_c)
        self.wait_for_daemon_start([mds_a, mds_b, mds_c])

        # Stop active mds daemon service of fs.
        # The previous condition was the tuple `(names, [mds_a])`, which is
        # always truthy, so mds_a was stopped even when mds_b was active.
        # NOTE(review): assumes get_active_names() returns a list - confirm.
        if fs_a.get_active_names() == [mds_a]:
            active_mds = mds_a
        else:
            active_mds = mds_b
        self.mds_cluster.mds_stop(active_mds)
        self.mds_cluster.mds_fail(active_mds)
        fs_a.wait_for_daemons()

        # Get active mons from cluster
        active_mons = manager.get_mon_quorum()

        # Check for active quorum mon status and configured mon status
        self.assertEqual(active_mons, configured_mons, "Not all mons are in quorum Invalid standby invalid fscid test failed!")
Example #30
0
def task(ctx, config):
    """
    Die if {testdir}/err exists or if an OSD dumps core.

    Waits until as many OSDs are reported 'up' as there are osd roles in the
    cluster, then polls each ``osd.N`` remote forever.  An Exception is
    raised as soon as the err marker file exists on a remote, or the last
    line of that OSD's log contains 'end dump'.  The function never returns
    normally.

    :param ctx: Context
    :param config: Configuration (None is treated as an empty dict)
    """
    config = {} if config is None else config

    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon_remote, ctx=ctx, logger=log.getChild('ceph_manager'))

    # Block until every OSD is up before starting to watch them.
    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    err_marker = '{tdir}/err'.format(tdir=teuthology.get_testdir(ctx))

    while True:
        for osd_id in range(num_osds):
            (osd_remote,) = ctx.cluster.only(
                'osd.%d' % osd_id).remotes.iterkeys()

            # `test -e` exits 0 when the marker file exists.
            marker_check = osd_remote.run(
                args=['test', '-e', err_marker],
                wait=True,
                check_status=False,
            )
            if marker_check.exitstatus == 0:
                log.info("osd %d has an error" % osd_id)
                raise Exception("osd %d error" % osd_id)

            # grep -q exits 0 when the final log line contains 'end dump'.
            osd_log = '/var/log/ceph/osd.%d.log' % (osd_id)
            dump_check = osd_remote.run(
                args=[
                    'tail', '-1', osd_log,
                    run.Raw('|'),
                    'grep', '-q', 'end dump',
                ],
                wait=True,
                check_status=False,
            )
            if dump_check.exitstatus == 0:
                log.info("osd %d dumped core" % osd_id)
                raise Exception("osd %d dumped core" % osd_id)

        time.sleep(5)
Example #31
0
def task(ctx, config):
    """
    Test [deep] repair in several situations:
      Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]

    The config should be as follows:

      Must include the log-whitelist below
      Must enable filestore_debug_inject_read_err config

    example:

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - 'candidate had a read error'
        - 'deep-scrub 0 missing, 1 inconsistent objects'
        - 'deep-scrub 0 missing, 4 inconsistent objects'
        - 'deep-scrub 1 errors'
        - 'deep-scrub 4 errors'
        - '!= known omap_digest'
        - 'repair 0 missing, 1 inconsistent objects'
        - 'repair 0 missing, 4 inconsistent objects'
        - 'repair 1 errors, 1 fixed'
        - 'repair 4 errors, 4 fixed'
        - 'scrub 0 missing, 1 inconsistent'
        - 'scrub 1 errors'
        - 'size 1 != known size'
        conf:
          osd:
            filestore debug inject read err: true
    - repair_test:

    :param ctx: Context
    :param config: Configuration (None is treated as an empty dict)
    """
    config = {} if config is None else config
    assert isinstance(config, dict), \
        'repair_test task only accepts a dict for config'

    # Reuse a manager already attached to ctx; otherwise create one.
    if not hasattr(ctx, 'manager'):
        mon_role = teuthology.get_first_mon(ctx, config)
        (mon_remote,) = ctx.cluster.only(mon_role).remotes.iterkeys()
        ctx.manager = ceph_manager.CephManager(
            mon_remote, ctx=ctx, logger=log.getChild('ceph_manager'))

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    # Do not start until every OSD is reported up.
    while len(ctx.manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    # (corruption factory, victim chooser factory, scrub type)
    single_object_cases = [
        (mdataerr, choose_primary, "scrub"),
        (mdataerr, choose_replica, "scrub"),
        (dataerr, choose_primary, "deep-scrub"),
        (dataerr, choose_replica, "deep-scrub"),
        (trunc, choose_primary, "scrub"),
        (trunc, choose_replica, "scrub"),
    ]
    tests = [
        gen_repair_test_1(corrupter(ctx), chooser(ctx), scrub_type)
        for corrupter, chooser, scrub_type in single_object_cases
    ]
    tests.append(gen_repair_test_2(choose_primary(ctx)))
    tests.append(gen_repair_test_2(choose_replica(ctx)))

    for repair_case in tests:
        run_test(ctx, config, repair_case)
Example #32
0
def crush_setup(ctx, config):
    """
    Apply the configured crush tunables profile, then yield.

    Reads ``config['cluster']`` (required) and ``config['crush_tunables']``
    (defaults to 'default') and runs ``ceph osd crush tunables <profile>``
    through ``_shell`` on the cluster's bootstrap remote.

    :param ctx: Context
    :param config: Configuration dict; must contain 'cluster'
    """
    cluster_name = config['cluster']

    # The result is not used afterwards, but the single-element unpacking
    # still raises unless exactly one remote carries the first mon role.
    mon_role = teuthology.get_first_mon(ctx, config, cluster_name)
    (_mon_remote,) = ctx.cluster.only(mon_role).remotes.keys()

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
    _shell(
        ctx,
        cluster_name,
        bootstrap_remote,
        args=['ceph', 'osd', 'crush', 'tunables', profile],
    )
    yield
def setup(ctx, config):
    """
    Attach a CephManager to ``ctx`` and populate the POOLNAME pool.

    Calls ``clear_pools()`` on the new manager, creates POOLNAME with
    ``config.num_pgs`` placement groups and writes objects into it using the
    ``num_objects``, ``object_size``, ``creation_time_limit`` and
    ``create_threads`` attributes of ``config``.

    :param ctx: Context; ``ctx.manager`` is (re)assigned
    :param config: Configuration object exposing the attributes above
    """
    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon_remote,
        ctx=ctx,
        logger=log.getChild("ceph_manager"),
    )
    ctx.manager = manager

    manager.clear_pools()
    manager.create_pool(POOLNAME, config.num_pgs)

    log.info("populating pool")
    manager.rados_write_objects(
        POOLNAME,
        config.num_objects,
        config.object_size,
        config.creation_time_limit,
        config.create_threads,
    )
    log.info("done populating pool")
Example #34
0
 def setup(self):
     """
     Prepare the CBT run: resolve the first mon remote, generate and store
     the CBT configuration, then check out CBT and install dependencies.

     The configuration is dumped as YAML to ``cbt/cbt_config.yaml`` under
     the archive directory on the first mon remote.
     """
     super(CBT, self).setup()

     mon_role = misc.get_first_mon(self.ctx, self.config)
     self.first_mon = self.ctx.cluster.only(mon_role).remotes.keys()[0]

     self.cbt_config = self.generate_cbt_config()
     self.log.info('cbt configuration is %s', self.cbt_config)

     archive_dir = misc.get_archive_dir(self.ctx)
     self.cbt_dir = os.path.join(archive_dir, 'cbt')
     # The directory is created on every remote in the cluster.
     self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', self.cbt_dir])

     config_path = os.path.join(self.cbt_dir, 'cbt_config.yaml')
     config_yaml = yaml.safe_dump(self.cbt_config, default_flow_style=False)
     misc.write_file(self.first_mon, config_path, config_yaml)

     self.checkout_cbt()
     self.install_dependencies()
Example #35
0
def healthy(ctx, config):
    """
    Wait for the OSDs to be up and then for the cluster to be healthy.

    Both waits are delegated to teuthology helpers and run against the
    remote that hosts the first mon.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph is healthy...')
    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.keys()
    teuthology.wait_until_osds_up(cluster=ctx.cluster, remote=mon_remote)
    teuthology.wait_until_healthy(remote=mon_remote)
Example #36
0
def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test:

    Waits for all OSDs to be up and the cluster to be clean, writes data
    into the 'rbd' pool, picks a victim PG/object and runs the corruption
    and repair sub-tests against it.

    :param ctx: Context
    :param config: Configuration dict (None is treated as an empty dict)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), "scrub_test task only accepts a dict for configuration"
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, "osd")
    log.info("num_osds is %s", num_osds)

    manager = ceph_manager.CephManager(mon, ctx=ctx, logger=log.getChild("ceph_manager"))

    while len(manager.get_osd_status()["up"]) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd("tell", "osd.%d" % i, "flush_pg_stats")
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ["-p", "rbd", "bench", "--no-cleanup", "1", "write", "-b", "4096"])
    log.info("err is %d", p.exitstatus)

    # wait for some PG to have data that we can mess with
    pg, acting = wait_for_victim_pg(manager)
    osd = acting[0]

    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
    # Capture each result so the logged status belongs to the command that
    # just ran rather than to the earlier bench invocation.
    p = manager.do_rados(mon, ["-p", "rbd", "setomapval", obj_name, "key", "val"])
    log.info("err is %d", p.exitstatus)
    p = manager.do_rados(mon, ["-p", "rbd", "setomapheader", obj_name, "hdr"])
    log.info("err is %d", p.exitstatus)

    log.info("messing with PG %s on osd %d", pg, osd)
    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, "rbd")
    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd, obj_name, obj_path)
    log.info("test successful!")
Example #37
0
def healthy(ctx, config):
    """
    Block until the OSDs are up and the cluster health check passes.

    The first mon's remote is used for both teuthology wait helpers.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph is healthy...')
    (mon_remote,) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)
    ).remotes.keys()
    wait_kwargs = dict(remote=mon_remote)
    teuthology.wait_until_osds_up(cluster=ctx.cluster, **wait_kwargs)
    teuthology.wait_until_healthy(**wait_kwargs)
Example #38
0
def execute_ceph_deploy(ctx, config, cmd):
    """
    Remotely execute a ceph_deploy command.

    The command runs from ``{testdir}/ceph-deploy`` on the remote hosting
    the first mon.  ``cmd`` is passed to the shell unquoted.

    :param ctx: Context
    :param config: Configuration
    :param cmd: Shell command string to run
    :returns: Exit status of the command (a failure does not raise)
    """
    deploy_dir = "{tdir}/ceph-deploy".format(tdir=teuthology.get_testdir(ctx))
    admin_role = teuthology.get_first_mon(ctx, config)
    (admin_remote,) = ctx.cluster.only(admin_role).remotes.iterkeys()
    result = admin_remote.run(
        args=["cd", deploy_dir, run.Raw("&&"), run.Raw(cmd)],
        check_status=False,
    )
    return result.exitstatus
Example #39
0
def wait_for_osds_up(ctx, config):
    """
    Block until teuthology reports every osd as up.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph osds are all up...')
    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon_remote,
    )
Example #40
0
 def setup(self):
     """
     Set up a CBT run.

     Steps, in order: parent setup, locate the first mon remote, build the
     CBT configuration, create the ``cbt`` directory under the archive dir,
     write ``cbt_config.yaml`` there via the first mon remote, check out CBT
     and install its dependencies.
     """
     super(CBT, self).setup()

     first_mon_role = misc.get_first_mon(self.ctx, self.config)
     mon_cluster = self.ctx.cluster.only(first_mon_role)
     self.first_mon = mon_cluster.remotes.keys()[0]

     self.cbt_config = self.generate_cbt_config()
     self.log.info('cbt configuration is %s', self.cbt_config)

     self.cbt_dir = os.path.join(misc.get_archive_dir(self.ctx), 'cbt')
     mkdir_args = ['mkdir', '-p', '-m0755', '--', self.cbt_dir]
     self.ctx.cluster.run(args=mkdir_args)

     misc.write_file(
         self.first_mon,
         os.path.join(self.cbt_dir, 'cbt_config.yaml'),
         yaml.safe_dump(self.cbt_config, default_flow_style=False),
     )

     self.checkout_cbt()
     self.install_dependencies()
Example #41
0
def wait_for_osds_up(ctx, config):
    """
    Wait for all osd's to come up.

    :param ctx: Context
    :param config: Configuration; may be None, in which case the default
                   cluster name 'ceph' is used
    """
    log.info("Waiting until ceph osds are all up...")
    # A task given no configuration receives None; treat it as empty so the
    # 'cluster' lookup below does not raise AttributeError.
    if config is None:
        config = {}
    cluster_name = config.get("cluster", "ceph")
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    # NOTE(review): cluster_name is only used to pick the mon; it is not
    # forwarded to wait_until_osds_up -- confirm whether that helper accepts
    # a cluster name in this teuthology version.
    teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, remote=mon0_remote)
Example #42
0
def task(ctx, config):
    """
    Benchmark the recovery system.

    Generates objects with smalliobench, runs it normally to get a
    baseline performance measurement, then marks an OSD out and reruns
    to measure performance during recovery.

    The config should be as follows:

    recovery_bench:
        duration: <seconds for each measurement run>
        num_objects: <number of objects>
        io_size: <io size in bytes>

    example:

    tasks:
    - ceph:
    - recovery_bench:
        duration: 60
        num_objects: 500
        io_size: 4096

    This function itself only waits for all OSDs to be up, constructs a
    RecoveryBencher, yields, and joins the bencher on exit.
    """
    config = {} if config is None else config
    assert isinstance(config, dict), \
        'recovery_bench task only accepts a dict for configuration'

    log.info('Beginning recovery bench...')

    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.iterkeys()
    manager = ceph_manager.CephManager(
        mon_remote, ctx=ctx, logger=log.getChild('ceph_manager'))

    expected_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    while True:
        if len(manager.get_osd_status()['up']) >= expected_osds:
            break
        manager.sleep(10)

    bencher = RecoveryBencher(manager, config)
    try:
        yield
    finally:
        # Join even when the body of the enclosing task raised.
        log.info('joining recovery bencher')
        bencher.do_join()
def task(ctx, config):
    """
    Benchmark the recovery system.

    Generates objects with smalliobench, runs it normally to get a
    baseline performance measurement, then marks an OSD out and reruns
    to measure performance during recovery.

    The config should be as follows:

    recovery_bench:
        duration: <seconds for each measurement run>
        num_objects: <number of objects>
        io_size: <io size in bytes>

    example:

    tasks:
    - ceph:
    - recovery_bench:
        duration: 60
        num_objects: 500
        io_size: 4096

    :param ctx: Context
    :param config: Configuration dict (None is treated as an empty dict)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'recovery_bench task only accepts a dict for configuration'

    log.info('Beginning recovery bench...')

    (mon_remote,) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)
    ).remotes.iterkeys()
    manager_logger = log.getChild('ceph_manager')
    manager = ceph_manager.CephManager(mon_remote, ctx=ctx, logger=manager_logger)

    # Hold off until the number of up OSDs reaches the number of osd roles.
    osd_total = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    while len(manager.get_osd_status()['up']) < osd_total:
        manager.sleep(10)

    recovery_bencher = RecoveryBencher(manager, config)
    try:
        yield
    finally:
        log.info('joining recovery bencher')
        recovery_bencher.do_join()
Example #44
0
def test_get_first_mon():
    """
    Check misc.get_first_mon against a table of role layouts.

    Each case is (roles per remote, cluster name, expected first mon role).
    The remotes mapping is faked with integer keys on a Mock cluster.
    """
    cases = [
        ([['mon.a', 'osd.0', 'mon.c']], 'ceph', 'mon.a'),
        ([['ceph.mon.a', 'osd.0', 'ceph.mon.c']], 'ceph', 'ceph.mon.a'),
        ([['mon.a', 'osd.0', 'mon.c'], ['ceph.mon.b']], 'ceph', 'ceph.mon.b'),
        ([['mon.a', 'osd.0', 'mon.c'], ['foo.mon.a']], 'ceph', 'mon.a'),
        ([['foo.mon.b', 'osd.0', 'mon.c'], ['foo.mon.a']], 'foo', 'foo.mon.a'),
    ]
    for roles_by_remote, cluster_name, want in cases:
        ctx = argparse.Namespace()
        ctx.cluster = Mock()
        ctx.cluster.remotes = dict(enumerate(roles_by_remote))
        got = misc.get_first_mon(ctx, None, cluster_name)
        assert want == got
Example #45
0
    def thread():
        """
        Thread spawned by gevent.

        For every configured run, starts the command in ``args`` once per
        client role with ``--pool <pool>`` appended, waits for all started
        processes, then removes the pools created during that run.

        ``ctx``, ``config`` and ``args`` are not parameters; they are taken
        from the enclosing scope, which is outside this block.
        """
        # Create ctx.manager only when nothing has attached one to ctx yet.
        if not hasattr(ctx, 'manager'):
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
            ctx.manager = CephManager(
                mon,
                ctx=ctx,
                logger=log.getChild('ceph_manager'),
                )

        # Default role list, used when config has no 'clients' entry.
        clients = ['client.{id}'.format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
        log.info('clients are %s' % clients)
        if config.get('ec_pool', False):
            erasure_code_profile = config.get('erasure_code_profile', {})
            # The profile name is False when the profile dict has no 'name'.
            erasure_code_profile_name = erasure_code_profile.get('name', False)
            ctx.manager.create_erasure_code_profile(erasure_code_profile_name, **erasure_code_profile)
        else:
            erasure_code_profile_name = False
        for i in range(int(config.get('runs', '1'))):
            log.info("starting run %s out of %s", str(i), config.get('runs', '1'))
            # client id -> process started for that client in this run
            tests = {}
            # NOTE(review): this is the list object stored in config, so the
            # pop() below removes entries for later runs as well -- confirm.
            existing_pools = config.get('pools', [])
            created_pools = []
            for role in config.get('clients', clients):
                assert isinstance(role, basestring)
                PREFIX = 'client.'
                assert role.startswith(PREFIX)
                id_ = role[len(PREFIX):]

                pool = config.get('pool', None)
                # NOTE(review): when 'pool' is set in config the condition
                # below is false, so the else branch replaces it with a
                # newly created pool -- confirm this is intended.
                if not pool and existing_pools:
                    pool = existing_pools.pop()
                else:
                    pool = ctx.manager.create_pool_with_unique_name(erasure_code_profile_name=erasure_code_profile_name)
                    created_pools.append(pool)

                (remote,) = ctx.cluster.only(role).remotes.iterkeys()
                # wait=False: the process is started here and waited on
                # after the loop, once every client has been started.
                proc = remote.run(
                    args=["CEPH_CLIENT_ID={id_}".format(id_=id_)] + args +
                    ["--pool", pool],
                    logger=log.getChild("rados.{id}".format(id=id_)),
                    stdin=run.PIPE,
                    wait=False
                    )
                tests[id_] = proc
            run.wait(tests.itervalues())

            # Only pools created in this run are removed; pools taken from
            # the 'pools' config entry are left in place.
            for pool in created_pools:
                ctx.manager.remove_pool(pool)
Example #46
0
def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    Both waits are delegated to teuthology helpers, run against the remote
    hosting the first mon.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph is healthy...')
    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon_remote,
    )
    teuthology.wait_until_healthy(ctx, remote=mon_remote)
Example #47
0
    def __init__(self, ctx):
        """
        Collect the MDS ids, mon remote/manager, MDS daemon handles and the
        first client's id and remote from the test context.

        :param ctx: Context
        :raises RuntimeError: if the cluster has no mds role
        """
        self._ctx = ctx

        self.mds_ids = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
        if not self.mds_ids:
            raise RuntimeError("This task requires at least one MDS")

        mon_role = misc.get_first_mon(ctx, None)
        (self.mon_remote,) = ctx.cluster.only(mon_role).remotes.iterkeys()
        self.mon_manager = ceph_manager.CephManager(
            self.mon_remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager'),
        )
        self.mds_daemons = {
            mds_id: self._ctx.daemons.get_daemon('mds', mds_id)
            for mds_id in self.mds_ids
        }

        # Only the first client role is used.
        all_clients = list(misc.all_roles_of_type(self._ctx.cluster, 'client'))
        self.client_id = all_clients[0]
        client_role = "client.{0}".format(self.client_id)
        matching_clients = list(misc.get_clients(ctx=ctx, roles=[client_role]))
        self.client_remote = matching_clients[0][1]
Example #48
0
def cephfs_setup(ctx, config):
    """
    Create a CephFS filesystem when the cluster has MDS roles, then yield.

    If a pool named 'metadata' is already listed by the mon the filesystem
    is assumed to exist and creation is skipped.  ``set_max_mds`` is then
    issued with the number of active (non-standby) MDS roles.

    :param ctx: Context
    :param config: Configuration
    """
    coverage_dir = '{tdir}/archive/coverage'.format(
        tdir=teuthology.get_testdir(ctx))

    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        try:
            lspools = mon_remote.run(
                args=['sudo', 'ceph', '--format=json-pretty', 'osd', 'lspools'],
                stdout=StringIO(),
            )
            pool_names = [
                entry['poolname']
                for entry in json.loads(lspools.stdout.getvalue())
            ]
            metadata_pool_exists = 'metadata' in pool_names
        except CommandFailedError as e:
            # For use in upgrade tests, Ceph cuttlefish and earlier don't support
            # structured output (--format) from the CLI.
            if e.exitstatus != 22:
                raise
            metadata_pool_exists = True

        # In case we are using an older Ceph which creates FS by default
        if metadata_pool_exists:
            log.info("Metadata pool already exists, skipping")
        else:
            Filesystem(ctx).create()

        def is_active_mds(role):
            # Standby roles either end with '-s' or contain '-s-'.
            if not role.startswith('mds.'):
                return False
            return not role.endswith('-s') and '-s-' not in role

        num_active = 0
        for remote_roles in mdss.remotes.values():
            for role in remote_roles:
                if is_active_mds(role):
                    num_active += 1

        mon_remote.run(args=[
            'adjust-ulimits', 'ceph-coverage', coverage_dir,
            'ceph', 'mds', 'set_max_mds', str(num_active),
        ])

    yield
Example #49
0
def cephfs_setup(ctx, config):
    """
    Create the CephFS pools and filesystem when MDS roles exist, then yield.

    When no pool named 'metadata' is listed, the 'metadata' and 'data' pools
    are created and ``ceph mds newfs 1 2`` is run.  ``set_max_mds`` is then
    issued with the number of active (non-standby) MDS roles.

    :param ctx: Context
    :param config: Configuration
    """
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        try:
            lspools = mon_remote.run(
                args=['sudo', 'ceph', '--format=json-pretty', 'osd', 'lspools'],
                stdout=StringIO(),
            )
            listed = json.loads(lspools.stdout.getvalue())
            metadata_pool_exists = 'metadata' in [entry['poolname'] for entry in listed]
        except CommandFailedError as e:
            # For use in upgrade tests, Ceph cuttlefish and earlier don't support
            # structured output (--format) from the CLI.
            if e.exitstatus != 22:
                raise
            metadata_pool_exists = True

        # In case we are using an older Ceph which creates FS by default
        if metadata_pool_exists:
            log.info("Metadata pool already exists, skipping")
        else:
            for pool_name in ('metadata', 'data'):
                mon_remote.run(
                    args=['sudo', 'ceph', 'osd', 'pool', 'create', pool_name, '256'])

            # Use 'newfs' to work with either old or new Ceph, until the 'fs new'
            # stuff is all landed ('ceph fs new default metadata data').
            mon_remote.run(args=['sudo', 'ceph', 'mds', 'newfs', '1', '2'])

        def is_active_mds(role):
            # Standby roles either end with '-s' or contain '-s-'.
            return (role.startswith('mds.')
                    and not role.endswith('-s')
                    and '-s-' not in role)

        active_roles = [
            role
            for remote_roles in mdss.remotes.values()
            for role in remote_roles
            if is_active_mds(role)
        ]
        mon_remote.run(args=[
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph',
            'mds', 'set_max_mds', str(len(active_roles))])

    yield
Example #50
0
def execute_ceph_deploy(ctx, config, cmd):
    """
    Run ``cmd`` from ``{testdir}/ceph-deploy`` on the first mon's remote.

    :param ctx: Context
    :param config: Configuration
    :param cmd: Shell command string, passed to the shell unquoted
    :returns: Exit status of the command (a failure does not raise)
    """
    testdir = teuthology.get_testdir(ctx)
    admin_role = teuthology.get_first_mon(ctx, config)
    (admin_remote,) = ctx.cluster.only(admin_role).remotes.iterkeys()
    shell_args = [
        'cd',
        '{tdir}/ceph-deploy'.format(tdir=testdir),
        run.Raw('&&'),
        run.Raw(cmd),
    ]
    return admin_remote.run(args=shell_args, check_status=False).exitstatus
Example #51
0
def is_healthy(ctx, config):
    """
    Wait until a Ceph cluster is healthy.

    Runs ``sudo ceph health`` from the test directory on the first mon's
    remote once per second until the first word of its output is
    ``HEALTH_OK``.  There is no timeout.

    :param ctx: Context
    :param config: Configuration
    """
    testdir = teuthology.get_testdir(ctx)
    ceph_admin = teuthology.get_first_mon(ctx, config)
    (remote,) = ctx.cluster.only(ceph_admin).remotes.keys()
    while True:
        r = remote.run(
            args=["cd", "{tdir}".format(tdir=testdir), run.Raw("&&"), "sudo", "ceph", "health"],
            stdout=StringIO(),
            logger=log.getChild("health"),
        )
        out = r.stdout.getvalue()
        log.debug("Ceph health: %s", out.rstrip("\n"))
        # split() returns [] for empty or whitespace-only output; treat that
        # as "not healthy yet" and poll again instead of raising IndexError.
        words = out.split(None, 1)
        if words and words[0] == "HEALTH_OK":
            break
        time.sleep(1)
Example #52
0
def execute_ceph_deploy(ctx, config, cmd):
    """
    Execute a ceph-deploy command on the first mon's remote.

    The shell changes into ``{testdir}/ceph-deploy`` before running ``cmd``.

    :param ctx: Context
    :param config: Configuration
    :param cmd: Shell command string, passed to the shell unquoted
    :returns: Exit status of the command (a failure does not raise)
    """
    workdir = '{tdir}/ceph-deploy'.format(tdir=teuthology.get_testdir(ctx))
    (admin_remote,) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)
    ).remotes.iterkeys()
    completed = admin_remote.run(
        args=['cd', workdir, run.Raw('&&'), run.Raw(cmd)],
        check_status=False,
    )
    return completed.exitstatus
Example #53
0
def task(ctx, config):
    """
    Run scrub periodically. Randomly chooses an OSD to scrub.

    The config should be as follows:

    scrub:
        frequency: <seconds between scrubs>
        deep: <bool for deepness>

    example:

    tasks:
    - ceph:
    - scrub:
        frequency: 30
        deep: 0

    This function itself waits for all OSDs to be up, constructs a Scrubber
    from the manager and config, yields, and joins the scrubber on exit.
    """
    config = {} if config is None else config
    assert isinstance(config, dict), \
        'scrub task only accepts a dict for configuration'

    log.info('Beginning scrub...')

    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.iterkeys()
    manager = ceph_manager.CephManager(
        mon_remote, ctx=ctx, logger=log.getChild('ceph_manager'))

    expected_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    while True:
        if len(manager.get_osd_status()['up']) >= expected_osds:
            break
        time.sleep(10)

    scrubber = Scrubber(manager, config)
    try:
        yield
    finally:
        # Join even when the body of the enclosing task raised.
        log.info('joining scrub')
        scrubber.do_join()
Example #54
0
def task(ctx, config):
    """
    Run scrub periodically. Randomly chooses an OSD to scrub.

    The config should be as follows:

    scrub:
        frequency: <seconds between scrubs>
        deep: <bool for deepness>

    example:

    tasks:
    - ceph:
    - scrub:
        frequency: 30
        deep: 0

    :param ctx: Context
    :param config: Configuration dict (None is treated as an empty dict)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub task only accepts a dict for configuration'

    log.info('Beginning scrub...')

    (mon_remote,) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)
    ).remotes.iterkeys()
    manager_logger = log.getChild('ceph_manager')
    manager = ceph_manager.CephManager(mon_remote, ctx=ctx, logger=manager_logger)

    # Hold off until the number of up OSDs reaches the number of osd roles.
    osd_total = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    while len(manager.get_osd_status()['up']) < osd_total:
        time.sleep(10)

    scrub_worker = Scrubber(manager, config)
    try:
        yield
    finally:
        log.info('joining scrub')
        scrub_worker.do_join()
Example #55
0
def task(ctx, config):
    """
    Test [deep] repair in several situations:
      Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]

    The config should be as follows:

      Must include the log-whitelist below
      Must enable filestore_debug_inject_read_err config

    example:

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - 'candidate had a read error'
        - 'deep-scrub 0 missing, 1 inconsistent objects'
        - 'deep-scrub 0 missing, 4 inconsistent objects'
        - 'deep-scrub 1 errors'
        - 'deep-scrub 4 errors'
        - '!= known omap_digest'
        - 'repair 0 missing, 1 inconsistent objects'
        - 'repair 0 missing, 4 inconsistent objects'
        - 'repair 1 errors, 1 fixed'
        - 'repair 4 errors, 4 fixed'
        - 'scrub 0 missing, 1 inconsistent'
        - 'scrub 1 errors'
        - 'size 1 != known size'
        conf:
          osd:
            filestore debug inject read err: true
    - repair_test:

    :param ctx: Context
    :param config: Configuration (None is treated as an empty dict)
    """
    config = {} if config is None else config
    assert isinstance(config, dict), \
        'repair_test task only accepts a dict for config'

    # Reuse a manager already attached to ctx; otherwise create one.
    if not hasattr(ctx, 'manager'):
        mon_role = teuthology.get_first_mon(ctx, config)
        (mon_remote,) = ctx.cluster.only(mon_role).remotes.iterkeys()
        ctx.manager = ceph_manager.CephManager(
            mon_remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager'),
        )

    # All test descriptions are generated up front, in this order, before
    # any of them is run.
    tests = []
    for make_error, scrub_type in (
            (mdataerr, "scrub"),
            (dataerr, "deep-scrub"),
            (trunc, "scrub")):
        for make_chooser in (choose_primary, choose_replica):
            tests.append(
                gen_repair_test_1(make_error(ctx), make_chooser(ctx), scrub_type))
    for make_chooser in (choose_primary, choose_replica):
        tests.append(gen_repair_test_2(make_chooser(ctx)))

    for repair_case in tests:
        run_test(ctx, config, repair_case)
Example #56
0
def setup(ctx, config):
    """
    Build ``ctx.manager`` and fill the POOLNAME pool with objects.

    After ``clear_pools()``, POOLNAME is created with ``config.num_pgs``
    placement groups and ``rados_write_objects`` is called with the object
    count, object size, time limit and thread count read from ``config``.

    :param ctx: Context; ``ctx.manager`` is (re)assigned
    :param config: Configuration object exposing ``num_pgs``,
                   ``num_objects``, ``object_size``,
                   ``creation_time_limit`` and ``create_threads``
    """
    (mon_remote,) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)
    ).remotes.iterkeys()
    ctx.manager = ceph_manager.CephManager(
        mon_remote, ctx=ctx, logger=log.getChild('ceph_manager'))

    ctx.manager.clear_pools()
    ctx.manager.create_pool(POOLNAME, config.num_pgs)

    log.info("populating pool")
    write_args = (
        POOLNAME,
        config.num_objects,
        config.object_size,
        config.creation_time_limit,
        config.create_threads,
    )
    ctx.manager.rados_write_objects(*write_args)
    log.info("done populating pool")
Example #57
0
    def thread():
        """
        Thread spawned by gevent.

        For every configured run, starts the command in ``args`` once per
        client role with ``--pool <pool>`` appended, waits for all started
        processes, then removes the pools created during that run.

        ``ctx``, ``config`` and ``args`` are not parameters; they are taken
        from the enclosing scope, which is outside this block.
        """
        # Create ctx.manager only when nothing has attached one to ctx yet.
        if not hasattr(ctx, 'manager'):
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
            ctx.manager = CephManager(
                mon,
                ctx=ctx,
                logger=log.getChild('ceph_manager'),
                )

        # Default role list, used when config has no 'clients' entry.
        clients = ['client.{id}'.format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
        log.info('clients are %s' % clients)
        for i in range(int(config.get('runs', '1'))):
            log.info("starting run %s out of %s", str(i), config.get('runs', '1'))
            # client id -> process started for that client in this run
            tests = {}
            # NOTE(review): this is the list object stored in config, so the
            # pop() below removes entries for later runs as well -- confirm.
            existing_pools = config.get('pools', [])
            created_pools = []
            for role in config.get('clients', clients):
                assert isinstance(role, basestring)
                PREFIX = 'client.'
                assert role.startswith(PREFIX)
                id_ = role[len(PREFIX):]

                pool = config.get('pool', None)
                # NOTE(review): when 'pool' is set in config the condition
                # below is false, so the else branch replaces it with a
                # newly created pool -- confirm this is intended.
                if not pool and existing_pools:
                    pool = existing_pools.pop()
                else:
                    pool = ctx.manager.create_pool_with_unique_name(ec_pool=config.get('ec_pool', False))
                    created_pools.append(pool)

                (remote,) = ctx.cluster.only(role).remotes.iterkeys()
                # wait=False: the process is started here and waited on
                # after the loop, once every client has been started.
                proc = remote.run(
                    args=["CEPH_CLIENT_ID={id_}".format(id_=id_)] + args +
                    ["--pool", pool],
                    logger=log.getChild("rados.{id}".format(id=id_)),
                    stdin=run.PIPE,
                    wait=False
                    )
                tests[id_] = proc
            run.wait(tests.itervalues())

            # Only pools created in this run are removed; pools taken from
            # the 'pools' config entry are left in place.
            for pool in created_pools:
                ctx.manager.remove_pool(pool)