Example #1
def healthy(ctx, config):
    """
    Wait for all OSDs to be up, and for the Ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
Example #2
def healthy(ctx, config):
    """
    Wait for all OSDs to be up, and for the Ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
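
A minimal usage sketch for the cluster-aware variant above, assuming a teuthology-style ctx object is already in scope; the 'backup' cluster name is purely illustrative:

# healthy() accepts an optional 'cluster' key in its config dict and falls
# back to the default cluster name 'ceph' when config is None or not a dict.
healthy(ctx, {'cluster': 'backup'})   # wait on a secondary cluster (illustrative name)
healthy(ctx, None)                    # same as waiting on the default 'ceph' cluster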
Example #3
    def __init__(self, ctx, manager, mds_cluster, config, logger,
                 failure_group, weight):
        super(MDSThrasher, self).__init__()

        self.ctx = ctx
        self.manager = manager
        assert self.manager.is_clean()
        self.mds_cluster = mds_cluster

        self.stopping = Event()
        self.logger = logger
        self.config = config

        self.randomize = bool(self.config.get('randomize', True))
        self.max_thrash_delay = float(self.config.get('thrash_delay', 30.0))
        self.thrash_in_replay = float(
            self.config.get('thrash_in_replay', False))
        assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(
            v=self.thrash_in_replay)

        self.max_replay_thrash_delay = float(
            self.config.get('max_replay_thrash_delay', 4.0))

        self.max_revive_delay = float(self.config.get('max_revive_delay',
                                                      10.0))

        self.failure_group = failure_group
        self.weight = weight

        # TODO support multiple filesystems: will require behavioural change to select
        # which filesystem to act on when doing rank-ish things
        self.fs = Filesystem(self.ctx)
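
The constructor above checks that 'thrash_in_replay' stays within [0.0, 1.0]. A hedged sketch of that pattern as a reusable helper for any probability-style option; 'bounded_probability' is a hypothetical name, not part of the upstream class:

def bounded_probability(config, key, default=0.0):
    """Read a probability-style option from a thrasher config dict and check
    that it falls within [0.0, 1.0], mirroring the 'thrash_in_replay' assert."""
    value = float(config.get(key, default))
    assert 0.0 <= value <= 1.0, \
        '{k} ({v}) must be between [0.0, 1.0]'.format(k=key, v=value)
    return value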
Example #4
def cephfs_setup(ctx, config):
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because it requires the mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        ceph_fs = Filesystem(ctx)
        if not ceph_fs.legacy_configured():
            ceph_fs.create()

        is_active_mds = lambda role: role.startswith('mds.') and not role.endswith('-s') and role.find('-s-') == -1
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])
        mon_remote.run(args=[
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph',
            'mds', 'set_max_mds', str(num_active)])

    yield
Example #5
def cephfs_setup(ctx, config):
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote, ) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because it requires the mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        ceph_fs = Filesystem(ctx)
        if not ceph_fs.legacy_configured():
            ceph_fs.create()

        is_active_mds = lambda role: role.startswith(
            'mds.') and not role.endswith('-s') and role.find('-s-') == -1
        all_roles = [
            item for remote_roles in mdss.remotes.values()
            for item in remote_roles
        ]
        num_active = len([r for r in all_roles if is_active_mds(r)])
        mon_remote.run(args=[
            'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph',
            'mds', 'set_max_mds',
            str(num_active)
        ])

    yield
Example #6
    def setupfs(self, name=None):
        if name is None and self.fs is not None:
            # Previous mount existed, reuse the old name
            name = self.fs.name
        self.fs = Filesystem(self.ctx, name=name)
        log.info('Wait for MDS to reach steady state...')
        self.fs.wait_for_daemons()
        log.info('Ready to start {}...'.format(type(self).__name__))
Example #7
def task(ctx, config):
    """
    Prepare MDS cluster for upgrade.

    This task reduces ranks to 1 and stops all standbys.
    """

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'snap-upgrade task only accepts a dict for configuration'

    fs = Filesystem(ctx)
    status = fs.getinfo()

    fs.set_max_mds(1)
    fs.reach_max_mds()

    # Stop standbys now to minimize time rank 0 is down in subsequent:
    # tasks:
    # - ceph.stop: [mds.*]
    rank0 = fs.get_rank(rank=0, status=status)
    for daemon in ctx.daemons.iter_daemons_of_role('mds',
                                                   fs.mon_manager.cluster):
        if rank0['name'] != daemon.id_:
            daemon.stop()

    for i in range(1, 10):
        time.sleep(5)  # time for FSMap to update
        status = fs.getinfo()
        if len(list(status.get_standbys())) == 0:
            break
    assert (len(list(status.get_standbys())) == 0)
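
The tail of this task polls the FSMap until no standbys remain. A hedged sketch of that loop as a reusable helper, relying only on the Filesystem.getinfo()/get_standbys() calls already used above; 'wait_for_no_standbys' is a hypothetical name:

import time

def wait_for_no_standbys(fs, tries=10, interval=5):
    """Poll the FSMap until no standby MDS daemons remain, as in the loop above."""
    for _ in range(tries):
        status = fs.getinfo()
        if len(list(status.get_standbys())) == 0:
            return status
        time.sleep(interval)  # give the FSMap time to update
    raise RuntimeError("standby MDS daemons did not stop in time")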
Example #8
def clients_evicted(ctx, config):
    """
    Check clients are evicted, unmount (cleanup) if so.
    """

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    clients = config.get('clients')

    if clients is None:
        clients = {("client."+client_id): True for client_id in ctx.mounts}

    log.info("clients is {}".format(str(clients)))

    fs = Filesystem(ctx)
    status = fs.status()

    has_session = set()
    mounts = {}
    for client in clients:
        client_id = re.match("^client.([0-9]+)$", client).groups(1)[0]
        mounts[client] = ctx.mounts.get(client_id)

    for rank in fs.get_ranks(status=status):
        ls = fs.rank_asok(['session', 'ls'], rank=rank['rank'], status=status)
        for session in ls:
            for client, evicted in clients.viewitems():
                mount = mounts.get(client)
                if mount is not None:
                    global_id = mount.get_global_id()
                    if session['id'] == global_id:
                        if evicted:
                            raise RuntimeError("client still has session: {}".format(str(session)))
                        else:
                            log.info("client {} has a session with MDS {}.{}".format(client, fs.id, rank['rank']))
                            has_session.add(client)

    no_session = set(clients) - has_session
    should_assert = False
    for client, evicted in clients.viewitems():
        mount = mounts.get(client)
        if mount is not None:
            if evicted:
                log.info("confirming client {} is blacklisted".format(client))
                assert mount.is_blacklisted()
            elif client in no_session:
                log.info("client {} should not be evicted but has no session with an MDS".format(client))
                mount.is_blacklisted() # for debugging
                should_assert = True
    if should_assert:
        raise RuntimeError("some clients which should not be evicted have no session with an MDS?")
Example #9
def clients_evicted(ctx, config):
    """
    Check clients are evicted, unmount (cleanup) if so.
    """

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    clients = config.get('clients')

    if clients is None:
        clients = {("client."+client_id): True for client_id in ctx.mounts}

    log.info("clients is {}".format(str(clients)))

    fs = Filesystem(ctx)
    status = fs.status()

    has_session = set()
    mounts = {}
    for client in clients:
        client_id = re.match("^client.([0-9]+)$", client).groups(1)[0]
        mounts[client] = ctx.mounts.get(client_id)

    for rank in fs.get_ranks(status=status):
        ls = fs.rank_asok(['session', 'ls'], rank=rank['rank'], status=status)
        for session in ls:
            for client, evicted in six.viewitems(clients):
                mount = mounts.get(client)
                if mount is not None:
                    global_id = mount.get_global_id()
                    if session['id'] == global_id:
                        if evicted:
                            raise RuntimeError("client still has session: {}".format(str(session)))
                        else:
                            log.info("client {} has a session with MDS {}.{}".format(client, fs.id, rank['rank']))
                            has_session.add(client)

    no_session = set(clients) - has_session
    should_assert = False
    for client, evicted in six.viewitems(clients):
        mount = mounts.get(client)
        if mount is not None:
            if evicted:
                log.info("confirming client {} is blacklisted".format(client))
                assert mount.is_blacklisted()
            elif client in no_session:
                log.info("client {} should not be evicted but has no session with an MDS".format(client))
                mount.is_blacklisted() # for debugging
                should_assert = True
    if should_assert:
        raise RuntimeError("some clients which should not be evicted have no session with an MDS?")
Example #10
def task(ctx, config):
    """
    Execute CephFS client recovery test suite.

    Requires:
    - An outer ceph_fuse task with at least two clients
    - That the clients are on a separate host to the MDS
    """
    fs = Filesystem(ctx)

    # Pick out the clients we will use from the configuration
    # =======================================================
    if len(ctx.mounts) < 2:
        raise RuntimeError("Need at least two clients")
    mount_a = ctx.mounts.values()[0]
    mount_b = ctx.mounts.values()[1]

    if not isinstance(mount_a, FuseMount) or not isinstance(mount_b, FuseMount):
        # kclient kill() power cycles nodes, so requires clients to each be on
        # their own node
        if mount_a.client_remote.hostname == mount_b.client_remote.hostname:
            raise RuntimeError("kclient clients must be on separate nodes")

    # Check we have at least one remote client for use with network-dependent tests
    # =============================================================================
    if mount_a.client_remote.hostname in fs.get_mds_hostnames():
        raise RuntimeError("Require first client to on separate server from MDSs")

    # Stash references on ctx so that we can easily debug in interactive mode
    # =======================================================================
    ctx.filesystem = fs
    ctx.mount_a = mount_a
    ctx.mount_b = mount_b

    run_tests(ctx, config, TestClientRecovery, {
        "mds_reconnect_timeout": int(fs.mds_asok(
            ['config', 'get', 'mds_reconnect_timeout']
        )['mds_reconnect_timeout']),
        "mds_session_timeout": int(fs.mds_asok(
            ['config', 'get', 'mds_session_timeout']
        )['mds_session_timeout']),
        "ms_max_backoff": int(fs.mds_asok(
            ['config', 'get', 'ms_max_backoff']
        )['ms_max_backoff']),
        "fs": fs,
        "mount_a": mount_a,
        "mount_b": mount_b
    })

    # Continue to any downstream tasks
    # ================================
    yield
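
The run_tests() call above repeats the same fs.mds_asok(['config', 'get', key]) lookup for three settings. A hedged helper could condense that pattern; 'get_mds_config_int' is a hypothetical name and assumes the asok reply maps the key to its value, as shown in the example:

def get_mds_config_int(fs, key):
    """Fetch one integer config value from an active MDS via the admin socket."""
    return int(fs.mds_asok(['config', 'get', key])[key])

# e.g. get_mds_config_int(fs, 'mds_reconnect_timeout')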
Example #11
def cephfs_setup(ctx, config):
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote, ) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because it requires the mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        try:
            proc = mon_remote.run(args=[
                'sudo', 'ceph', '--format=json-pretty', 'osd', 'lspools'
            ],
                                  stdout=StringIO())
            pools = json.loads(proc.stdout.getvalue())
            metadata_pool_exists = 'metadata' in [p['poolname'] for p in pools]
        except CommandFailedError as e:
            # For use in upgrade tests, Ceph cuttlefish and earlier don't support
            # structured output (--format) from the CLI.
            if e.exitstatus == 22:
                metadata_pool_exists = True
            else:
                raise

        # In case we are using an older Ceph which creates FS by default
        if metadata_pool_exists:
            log.info("Metadata pool already exists, skipping")
        else:
            ceph_fs = Filesystem(ctx)
            ceph_fs.create()

        is_active_mds = lambda role: role.startswith(
            'mds.') and not role.endswith('-s') and role.find('-s-') == -1
        all_roles = [
            item for remote_roles in mdss.remotes.values()
            for item in remote_roles
        ]
        num_active = len([r for r in all_roles if is_active_mds(r)])
        mon_remote.run(args=[
            'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph', 'mds',
            'set_max_mds',
            str(num_active)
        ])

    yield
Example #12
def task(ctx, config):
    fs = Filesystem(ctx)

    # Pick out the clients we will use from the configuration
    # =======================================================
    if len(ctx.mounts) < 2:
        raise RuntimeError("Need at least two clients")
    mount_a = ctx.mounts.values()[0]
    mount_b = ctx.mounts.values()[1]

    if not isinstance(mount_a, FuseMount) or not isinstance(
            mount_b, FuseMount):
        # kclient kill() power cycles nodes, so requires clients to each be on
        # their own node
        if mount_a.client_remote.hostname == mount_b.client_remote.hostname:
            raise RuntimeError("kclient clients must be on separate nodes")

    # Stash references on ctx so that we can easily debug in interactive mode
    # =======================================================================
    ctx.filesystem = fs
    ctx.mount_a = mount_a
    ctx.mount_b = mount_b

    run_tests(ctx, config, TestClientLimits, {
        'fs': fs,
        'mount_a': mount_a,
        'mount_b': mount_b
    })

    # Continue to any downstream tasks
    # ================================
    yield
Example #13
    def __init__(self, ctx, manager, mds_cluster, config, logger, failure_group, weight):
        super(MDSThrasher, self).__init__()

        self.ctx = ctx
        self.manager = manager
        assert self.manager.is_clean()
        self.mds_cluster = mds_cluster

        self.stopping = Event()
        self.logger = logger
        self.config = config

        self.randomize = bool(self.config.get('randomize', True))
        self.max_thrash_delay = float(self.config.get('thrash_delay', 30.0))
        self.thrash_in_replay = float(self.config.get('thrash_in_replay', False))
        assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(
            v=self.thrash_in_replay)

        self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0))

        self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))

        self.failure_group = failure_group
        self.weight = weight

        # TODO support multiple filesystems: will require behavioural change to select
        # which filesystem to act on when doing rank-ish things
        self.fs = Filesystem(self.ctx)
Example #14
def task(ctx, config):
    fs = Filesystem(ctx)

    # Pick out the clients we will use from the configuration
    # =======================================================
    if len(ctx.mounts) < 2:
        raise RuntimeError("Need at least two clients")
    mount_a = ctx.mounts.values()[0]
    mount_b = ctx.mounts.values()[1]

    # Stash references on ctx so that we can easily debug in interactive mode
    # =======================================================================
    ctx.filesystem = fs
    ctx.mount_a = mount_a
    ctx.mount_b = mount_b

    run_tests(ctx, config, TestClusterFull, {
        'fs': fs,
        'mount_a': mount_a,
        'mount_b': mount_b
    })

    # Continue to any downstream tasks
    # ================================
    yield
Example #15
    def setupfs(self, name=None):
        if name is None and self.fs is not None:
            # Previous mount existed, reuse the old name
            name = self.fs.name
        self.fs = Filesystem(self.ctx, name=name)
        log.info('Wait for MDS to reach steady state...')
        self.fs.wait_for_daemons()
        log.info('Ready to start {}...'.format(type(self).__name__))
Example #16
def task(ctx, config):
    """
    Prepare MDS cluster for upgrade.

    This task reduces ranks to 1 and stops all standbys.
    """

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'snap-upgrade task only accepts a dict for configuration'

    fs = Filesystem(ctx)
    fs.getinfo()  # load name
    fs.set_allow_standby_replay(False)
    fs.set_max_mds(1)
    fs.reach_max_mds()
Example #17
def task(ctx, config):
    """
    Prepare MDS cluster for upgrade.

    This task reduces ranks to 1 and stops all standbys.
    """

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'snap-upgrade task only accepts a dict for configuration'

    fs = Filesystem(ctx)
    status = fs.getinfo()

    fs.set_max_mds(1)
    status = fs.getinfo()
    targets = list(filter(lambda r: r['rank'] >= 1, fs.get_ranks(status=status)))
    if len(targets) > 0:
        # deactivate mds in descending order of rank
        targets = sorted(targets, key=lambda r: r['rank'], reverse=True)
        for target in targets:
            log.info("deactivating rank %d" % target['rank'])
            fs.deactivate(target['rank'])
            status = fs.getinfo()  # refresh the map after each deactivation

    assert(fs.get_mds_map(status=status)['max_mds'] == 1)
    assert(fs.get_mds_map(status=status)['in'] == [0])

    # Stop standbys now to minimize time rank 0 is down in subsequent:
    # tasks:
    # - ceph.stop: [mds.*]
    rank0 = fs.get_rank(rank=0, status=status)
    for daemon in ctx.daemons.iter_daemons_of_role('mds', fs.mon_manager.cluster):
        if rank0['name'] != daemon.id_:
            daemon.stop()

    for i in range(1, 10):
        time.sleep(5) # time for FSMap to update
        status = fs.getinfo()
        if len(list(status.get_standbys())) == 0:
            break
    assert(len(list(status.get_standbys())) == 0)
Example #18
def ready(ctx, config):
    """
    That the file system is ready for clients.
    """

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    timeout = config.get('timeout', 300)

    mdsc = MDSCluster(ctx)
    status = mdsc.status()

    for filesystem in status.get_filesystems():
        fs = Filesystem(ctx, fscid=filesystem['id'])
        fs.wait_for_daemons(timeout=timeout, status=status)
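
A short usage sketch for the task above, assuming ready() is invoked directly with a teuthology-style ctx; the longer timeout value is illustrative:

ready(ctx, {'timeout': 600})  # allow up to ten minutes per filesystem
ready(ctx, None)              # default 300-second timeout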
Example #19
def cephfs_setup(ctx, config):
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because it requires the mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        try:
            proc = mon_remote.run(args=['sudo', 'ceph', '--format=json-pretty', 'osd', 'lspools'],
                                  stdout=StringIO())
            pools = json.loads(proc.stdout.getvalue())
            metadata_pool_exists = 'metadata' in [p['poolname'] for p in pools]
        except CommandFailedError as e:
            # For use in upgrade tests, Ceph cuttlefish and earlier don't support
            # structured output (--format) from the CLI.
            if e.exitstatus == 22:
                metadata_pool_exists = True
            else:
                raise

        # In case we are using an older Ceph which creates FS by default
        if metadata_pool_exists:
            log.info("Metadata pool already exists, skipping")
        else:
            ceph_fs = Filesystem(ctx)
            ceph_fs.create()

        is_active_mds = lambda role: role.startswith('mds.') and not role.endswith('-s') and role.find('-s-') == -1
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])
        mon_remote.run(args=[
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph',
            'mds', 'set_max_mds', str(num_active)])

    yield
Example #20
def healthy(ctx, config):
    """
    Wait for all OSDs to be up, and for the Ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph is healthy...')
    firstmon = teuthology.get_first_mon(ctx, config)
    (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, remote=mon0_remote)
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
    )

    if ctx.cluster.only(teuthology.is_type('mds')).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)
        ceph_fs.wait_for_daemons(timeout=300)
Example #21
def cephfs_setup(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because it requires the mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
        if not ceph_fs.legacy_configured():
            ceph_fs.create()

        is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])
        mon_remote.run(
            args=[
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'ceph', 'mds', 'set', 'allow_multimds', 'true',
                '--yes-i-really-mean-it'],
            check_status=False,  # probably old version, upgrade test
        )
        mon_remote.run(args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph',
            '--cluster', cluster_name,
            'mds', 'set_max_mds', str(num_active)])

    yield
Example #22
def cephfs_setup(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because it requires the mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
        if not ceph_fs.legacy_configured():
            ceph_fs.create()

        is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])
        mon_remote.run(
            args=[
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'ceph', 'mds', 'set', 'allow_multimds', 'true',
                '--yes-i-really-mean-it'],
            check_status=False,  # probably old version, upgrade test
        )
        mon_remote.run(args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph',
            '--cluster', cluster_name,
            'mds', 'set_max_mds', str(num_active)])

    yield
Example #23
def task(ctx, config):
    """
    Upgrade CephFS file system snap format.
    """

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'snap-upgrade task only accepts a dict for configuration'

    fs = Filesystem(ctx)

    mds_map = fs.get_mds_map()
    assert(mds_map['max_mds'] == 1)

    json = fs.rank_tell(["scrub", "start", "/", "force", "recursive", "repair"])
    if not json or json['return_code'] == 0:
        log.info("scrub / completed")
    else:
        log.info("scrub / failed: {}".format(json))

    json = fs.rank_tell(["scrub", "start", "~mdsdir", "force", "recursive", "repair"])
    if not json or json['return_code'] == 0:
        log.info("scrub ~mdsdir completed")
    else:
        log.info("scrub / failed: {}".format(json))

    for i in range(0, 10):
        mds_map = fs.get_mds_map()
        if (mds_map['flags'] & (1<<1)) != 0 and (mds_map['flags'] & (1<<4)) != 0:
            break
        time.sleep(10)
    assert((mds_map['flags'] & (1<<1)) != 0) # Test CEPH_MDSMAP_ALLOW_SNAPS
    assert((mds_map['flags'] & (1<<4)) != 0) # Test CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS
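
The two bit tests above correspond to the MDSMap flags named in the trailing comments. A hedged sketch making those masks explicit; the constant names follow the comments in the example and are local conveniences rather than an exported API:

CEPH_MDSMAP_ALLOW_SNAPS = 1 << 1
CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS = 1 << 4

def snap_flags_set(mds_map):
    """True when both snapshot-related flags are present in the MDS map."""
    flags = mds_map['flags']
    return bool(flags & CEPH_MDSMAP_ALLOW_SNAPS) and \
        bool(flags & CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS)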
Example #24
def healthy(ctx, config):
    """
    Wait for all OSDs to be up, and for the Ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph is healthy...')
    firstmon = teuthology.get_first_mon(ctx, config)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
    )

    if ctx.cluster.only(teuthology.is_type('mds')).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)
        ceph_fs.wait_for_daemons(timeout=300)
Example #25
def task(ctx, config):
    fs = Filesystem(ctx)
    mount_a = ctx.mounts.values()[0]

    # Stash references on ctx so that we can easily debug in interactive mode
    # =======================================================================
    ctx.filesystem = fs
    ctx.mount_a = mount_a

    run_tests(ctx, config, TestMDSAutoRepair, {
        'fs': fs,
        'mount_a': mount_a,
    })

    # Continue to any downstream tasks
    # ================================
    yield
Example #26
def build_ceph_cluster(ctx, config):
    """Build a ceph cluster"""

    # Expect to find ceph_admin on the first mon by ID, same place that the download task
    # puts it.  Remember this here, because subsequently IDs will change from those in
    # the test config to those that ceph-deploy invents.

    (ceph_admin,) = ctx.cluster.only('mon.a').remotes.keys()

    def execute_ceph_deploy(cmd):
        """Remotely execute a ceph_deploy command"""
        return ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(cmd),
            ],
            check_status=False,
        ).exitstatus

    def ceph_disk_osd_create(ctx, config):
        node_dev_list = get_dev_for_osd(ctx, config)
        no_of_osds = 0
        for d in node_dev_list:
            node = d[0]
            for disk in d[1:]:
                zap = './ceph-deploy disk zap ' + node + ' ' + disk
                estatus = execute_ceph_deploy(zap)
                if estatus != 0:
                    raise RuntimeError("ceph-deploy: Failed to zap osds")
            osd_create_cmd = './ceph-deploy osd create '
            # first check for filestore, default is bluestore with ceph-deploy
            if config.get('filestore') is not None:
                osd_create_cmd += '--filestore '
            elif config.get('bluestore') is not None:
                osd_create_cmd += '--bluestore '
            if config.get('dmcrypt') is not None:
                osd_create_cmd += '--dmcrypt '
            osd_create_cmd += ":".join(d)
            estatus_osd = execute_ceph_deploy(osd_create_cmd)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                raise RuntimeError("ceph-deploy: Failed to create osds")
        return no_of_osds

    def ceph_volume_osd_create(ctx, config):
        osds = ctx.cluster.only(teuthology.is_type('osd'))
        no_of_osds = 0
        for remote in osds.remotes.keys():
            # all devs should be lvm
            osd_create_cmd = './ceph-deploy osd create --debug ' + remote.shortname + ' '
            # default is bluestore so we just need config item for filestore
            roles = ctx.cluster.remotes[remote]
            dev_needed = len([role for role in roles
                              if role.startswith('osd')])
            all_devs = teuthology.get_scratch_devices(remote)
            log.info("node={n}, need_devs={d}, available={a}".format(
                        n=remote.shortname,
                        d=dev_needed,
                        a=all_devs,
                        ))
            devs = all_devs[0:dev_needed]
            # rest of the devices can be used for journal if required
            jdevs = dev_needed
            for device in devs:
                device_split = device.split('/')
                lv_device = device_split[-2] + '/' + device_split[-1]
                if config.get('filestore') is not None:
                    osd_create_cmd += '--filestore --data ' + lv_device + ' '
                    # filestore with ceph-volume also needs journal disk
                    try:
                        jdevice = all_devs.pop(jdevs)
                    except IndexError:
                        raise RuntimeError("No device available for \
                                            journal configuration")
                    jdevice_split = jdevice.split('/')
                    j_lv = jdevice_split[-2] + '/' + jdevice_split[-1]
                    osd_create_cmd += '--journal ' + j_lv
                else:
                    osd_create_cmd += ' --data ' + lv_device
                estatus_osd = execute_ceph_deploy(osd_create_cmd)
                if estatus_osd == 0:
                    log.info('successfully created osd')
                    no_of_osds += 1
                else:
                    raise RuntimeError("ceph-deploy: Failed to create osds")
        return no_of_osds

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        testdir = teuthology.get_testdir(ctx)
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            for var, val in cbranch.items():
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_role(ctx, 'mds')
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_role(ctx, 'mon')
        mon_nodes = " ".join(mon_node)
        # skip mgr based on config item
        # this is needed when test uses latest code to install old ceph
        # versions
        skip_mgr = config.get('skip-mgr', False)
        if not skip_mgr:
            mgr_nodes = get_nodes_using_role(ctx, 'mgr')
            mgr_nodes = " ".join(mgr_nodes)
        new_mon = './ceph-deploy new' + " " + mon_nodes
        if not skip_mgr:
            mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
        mon_hostname = mon_nodes.split(' ')[0]
        mon_hostname = str(mon_hostname)
        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes

        if mon_nodes is None:
            raise RuntimeError("no monitor nodes in the config file")

        estatus_new = execute_ceph_deploy(new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        testdir = teuthology.get_testdir(ctx)
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)

        if config.get('conf') is not None:
            confp = config.get('conf')
            for section, keys in confp.items():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
                                                sudo=True)
                for key, value in keys.items():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(
                        ceph_admin, conf_path, lines, sudo=True)

        # install ceph
        dev_branch = ctx.config['branch']
        branch = '--dev={branch}'.format(branch=dev_branch)
        if ceph_branch:
            option = ceph_branch
        else:
            option = branch
        install_nodes = './ceph-deploy install ' + option + " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")
        # install ceph-test package too
        install_nodes2 = './ceph-deploy install --tests ' + option + \
                         " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes2)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph-test")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum, so let's
        # try the next block which will wait up to 15 minutes to gatherkeys.
        execute_ceph_deploy(mon_create_nodes)

        estatus_gather = execute_ceph_deploy(gather_keys)
        if estatus_gather != 0:
            raise RuntimeError("ceph-deploy: Failed during gather keys")

        # install admin key on mons (ceph-create-keys doesn't do this any more)
        mons = ctx.cluster.only(teuthology.is_type('mon'))
        for remote in mons.remotes.keys():
            execute_ceph_deploy('./ceph-deploy admin ' + remote.shortname)

        # create osd's
        if config.get('use-ceph-volume', False):
            no_of_osds = ceph_volume_osd_create(ctx, config)
        else:
            # this method will only work with ceph-deploy v1.5.39 or older
            no_of_osds = ceph_disk_osd_create(ctx, config)

        if not skip_mgr:
            execute_ceph_deploy(mgr_create)

        if mds_nodes:
            estatus_mds = execute_ceph_deploy(deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            for d in range(1, len(mon_node)):
                mon_destroy_nodes = './ceph-deploy mon destroy' + \
                    " " + mon_node[d]
                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")



        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
            )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
            )

            clients = ctx.cluster.only(teuthology.is_type('client'))
            for remot, roles_for_host in clients.remotes.items():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    mon0_remote.run(
                        args=[
                            'cd',
                            '{tdir}'.format(tdir=testdir),
                            run.Raw('&&'),
                            'sudo', 'bash', '-c',
                            run.Raw('"'), 'ceph',
                            'auth',
                            'get-or-create',
                            'client.{id}'.format(id=id_),
                            'mds', 'allow',
                            'mon', 'allow *',
                            'osd', 'allow *',
                            run.Raw('>'),
                            client_keyring,
                            run.Raw('"'),
                        ],
                    )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=client_keyring,
                        data=key_data,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=admin_keyring_path,
                        data=admin_keyring,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=conf_path,
                        data=conf_data,
                        perms='0644'
                    )

            if mds_nodes:
                log.info('Configuring CephFS...')
                Filesystem(ctx, create=True)
        elif not config.get('only_mon'):
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")
        # create rbd pool
        ceph_admin.run(
            args=[
                'sudo', 'ceph', '--cluster', 'ceph',
                'osd', 'pool', 'create', 'rbd', '128', '128'],
            check_status=False)
        ceph_admin.run(
            args=[
                'sudo', 'ceph', '--cluster', 'ceph',
                'osd', 'pool', 'application', 'enable',
                'rbd', 'rbd', '--yes-i-really-mean-it'
                ],
            check_status=False)
        yield

    except Exception:
        log.info(
            "Error encountered, logging exception before tearing down ceph-deploy")
        log.info(traceback.format_exc())
        raise
    finally:
        if config.get('keep_running'):
            return
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'],
                        check_status=False)
        time.sleep(4)

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
                              'grep', '-v', 'grep', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)
        ctx.cluster.run(args=['sudo', 'systemctl', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.items():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Prevent these from being undefined if the try block fails
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge' + " " + all_nodes
        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(purgedata_nodes)
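
Each client set up in the example above receives three sudo_write_file() calls with identical permissions. A hedged sketch of a small loop condensing those writes; 'push_client_files' is a hypothetical helper and reuses the same teuthology module alias imported by the example:

def push_client_files(remote, files, perms='0644'):
    """Write (path, data) pairs to a client host with the same permissions."""
    for path, data in files:
        teuthology.sudo_write_file(remote=remote, path=path,
                                   data=data, perms=perms)

# e.g. push_client_files(remot, [(client_keyring, key_data),
#                                (admin_keyring_path, admin_keyring),
#                                (conf_path, conf_data)])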
Example #27
    def setupfs(self, name=None):
        self.fs = Filesystem(self.ctx, name=name)
        log.info('Wait for MDS to reach steady state...')
        self.fs.wait_for_daemons()
        log.info('Ready to start {}...'.format(type(self).__name__))
Example #28
def build_ceph_cluster(ctx, config):
    """Build a ceph cluster"""

    # Expect to find ceph_admin on the first mon by ID, same place that the download task
    # puts it.  Remember this here, because subsequently IDs will change from those in
    # the test config to those that ceph-deploy invents.
    (ceph_admin,) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)).remotes.iterkeys()

    def execute_ceph_deploy(cmd):
        """Remotely execute a ceph_deploy command"""
        return ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(cmd),
            ],
            check_status=False,
        ).exitstatus

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        testdir = teuthology.get_testdir(ctx)
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            for var, val in cbranch.iteritems():
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_role(ctx, 'mds')
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_role(ctx, 'mon')
        mon_nodes = " ".join(mon_node)
        new_mon = './ceph-deploy new' + " " + mon_nodes
        mon_hostname = mon_nodes.split(' ')[0]
        mon_hostname = str(mon_hostname)
        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
        no_of_osds = 0

        if mon_nodes is None:
            raise RuntimeError("no monitor nodes in the config file")

        estatus_new = execute_ceph_deploy(new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        testdir = teuthology.get_testdir(ctx)
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)

        if config.get('conf') is not None:
            confp = config.get('conf')
            for section, keys in confp.iteritems():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
                                                sudo=True)
                for key, value in keys.iteritems():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(
                        ceph_admin, conf_path, lines, sudo=True)

        # install ceph
        dev_branch = ctx.config['branch']
        branch = '--dev={branch}'.format(branch=dev_branch)
        if ceph_branch:
            option = ceph_branch
        else:
            option = branch
        install_nodes = './ceph-deploy install ' + option + " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")
        # install ceph-test package too
        install_nodes2 = './ceph-deploy install --tests ' + option + \
                         " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes2)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph-test")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum, so let's
        # try the next block which will wait up to 15 minutes to gatherkeys.
        execute_ceph_deploy(mon_create_nodes)

        # create-keys is explicit now
        # http://tracker.ceph.com/issues/16036
        mons = ctx.cluster.only(teuthology.is_type('mon'))
        for remote in mons.remotes.iterkeys():
            remote.run(args=['sudo', 'ceph-create-keys', '--cluster', 'ceph',
                             '--id', remote.shortname])

        estatus_gather = execute_ceph_deploy(gather_keys)
        if mds_nodes:
            estatus_mds = execute_ceph_deploy(deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            for d in range(1, len(mon_node)):
                mon_destroy_nodes = './ceph-deploy mon destroy' + \
                    " " + mon_node[d]
                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        node_dev_list = get_dev_for_osd(ctx, config)
        for d in node_dev_list:
            node = d[0]
            for disk in d[1:]:
                zap = './ceph-deploy disk zap ' + node + ':' + disk
                estatus = execute_ceph_deploy(zap)
                if estatus != 0:
                    raise RuntimeError("ceph-deploy: Failed to zap osds")
            osd_create_cmd = './ceph-deploy osd create '
            if config.get('dmcrypt') is not None:
                osd_create_cmd += '--dmcrypt '
            osd_create_cmd += ":".join(d)
            estatus_osd = execute_ceph_deploy(osd_create_cmd)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                raise RuntimeError("ceph-deploy: Failed to create osds")

        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
            )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
            )

            clients = ctx.cluster.only(teuthology.is_type('client'))
            for remot, roles_for_host in clients.remotes.iteritems():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    mon0_remote.run(
                        args=[
                            'cd',
                            '{tdir}'.format(tdir=testdir),
                            run.Raw('&&'),
                            'sudo', 'bash', '-c',
                            run.Raw('"'), 'ceph',
                            'auth',
                            'get-or-create',
                            'client.{id}'.format(id=id_),
                            'mds', 'allow',
                            'mon', 'allow *',
                            'osd', 'allow *',
                            run.Raw('>'),
                            client_keyring,
                            run.Raw('"'),
                        ],
                    )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=client_keyring,
                        data=key_data,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=admin_keyring_path,
                        data=admin_keyring,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=conf_path,
                        data=conf_data,
                        perms='0644'
                    )

            if mds_nodes:
                log.info('Configuring CephFS...')
                ceph_fs = Filesystem(ctx)
                if not ceph_fs.legacy_configured():
                    ceph_fs.create()
        elif not config.get('only_mon'):
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")
        yield

    except Exception:
        log.info(
            "Error encountered, logging exception before tearing down ceph-deploy")
        log.info(traceback.format_exc())
        raise
    finally:
        if config.get('keep_running'):
            return
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                              'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
                              'sudo', 'systemctl', 'stop', 'ceph.target'])

        # Are you really not running anymore?
        # try first with the init tooling
        # ignoring the status so this becomes informational only
        ctx.cluster.run(
            args=[
                'sudo', 'status', 'ceph-all', run.Raw('||'),
                'sudo', 'service', 'ceph', 'status', run.Raw('||'),
                'sudo', 'systemctl', 'status', 'ceph.target'],
            check_status=False)

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
                              'grep', '-v', 'grep', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Prevent these from being undefined if the try block fails
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge' + " " + all_nodes
        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(purgedata_nodes)
Exemple #29
0
def task(ctx, config):
    """
    Stress test the mds by thrashing while another task/workunit
    is running.

    Please refer to MDSThrasher class for further information on the
    available options.
    """

    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mds_thrash task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 1, \
        'mds_thrash task requires at least 2 metadata servers'

    # choose random seed
    if 'seed' in config:
        seed = int(config['seed'])
    else:
        seed = int(time.time())
    log.info('mds thrasher using random seed: {seed}'.format(seed=seed))
    random.seed(seed)

    (first, ) = ctx.cluster.only(
        'mds.{_id}'.format(_id=mdslist[0])).remotes.keys()
    manager = ceph_manager.CephManager(
        first,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    status = mds_cluster.status()
    while True:
        steady = True
        for info in status.get_all():
            state = info['state']
            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
                steady = False
                break
        if steady:
            break
        sleep(2)
        status = mds_cluster.status()
    log.info('Ready to start thrashing')

    manager.wait_for_clean()
    assert manager.is_clean()

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    for fs in status.get_filesystems():
        thrasher = MDSThrasher(ctx, manager, config, Filesystem(ctx, fs['id']),
                               fs['mdsmap']['max_mds'])
        thrasher.start()
        ctx.ceph[config['cluster']].thrashers.append(thrasher)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining mds_thrasher')
        thrasher.stop()
        if thrasher.exception is not None:
            raise RuntimeError('error during thrashing')
        thrasher.join()
        log.info('done joining')
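
A minimal sketch of the same steady-state wait, bounded so it cannot spin forever. It assumes safe_while is the helper from teuthology.contextutil that other snippets in this listing use; this is an illustration, not the upstream implementation:

from teuthology.contextutil import safe_while

def wait_for_mds_steady_state(mds_cluster, tries=150, delay=2):
    """Poll the MDS map until every daemon is active, standby or standby-replay."""
    steady = ('up:active', 'up:standby', 'up:standby-replay')
    with safe_while(sleep=delay, tries=tries) as proceed:
        while proceed():
            status = mds_cluster.status()
            if all(info['state'] in steady for info in status.get_all()):
                return status
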
Exemple #30
0
class CephFSMount(object):
    def __init__(self,
                 ctx,
                 test_dir,
                 client_id,
                 client_remote,
                 client_keyring_path=None,
                 hostfs_mntpt=None,
                 cephfs_name=None,
                 cephfs_mntpt=None,
                 brxnet=None):
        """
        :param test_dir: Global teuthology test dir
        :param client_id: Client ID, the 'foo' in client.foo
        :param client_keyring_path: path to keyring for given client_id
        :param client_remote: Remote instance for the host where client will
                              run
        :param hostfs_mntpt: Path to directory on the FS on which Ceph FS will
                             be mounted
        :param cephfs_name: Name of Ceph FS to be mounted
        :param cephfs_mntpt: Path to directory inside Ceph FS that will be
                             mounted as root
        """
        self.mounted = False
        self.ctx = ctx
        self.test_dir = test_dir

        self._verify_attrs(client_id=client_id,
                           client_keyring_path=client_keyring_path,
                           hostfs_mntpt=hostfs_mntpt,
                           cephfs_name=cephfs_name,
                           cephfs_mntpt=cephfs_mntpt)

        self.client_id = client_id
        self.client_keyring_path = client_keyring_path
        self.client_remote = client_remote
        if hostfs_mntpt:
            self.hostfs_mntpt = hostfs_mntpt
            self.hostfs_mntpt_dirname = os.path.basename(self.hostfs_mntpt)
        else:
            self.hostfs_mntpt = os.path.join(self.test_dir,
                                             f'mnt.{self.client_id}')
        self.cephfs_name = cephfs_name
        self.cephfs_mntpt = cephfs_mntpt

        self.fs = None

        self._netns_name = None
        self.nsid = -1
        if brxnet is None:
            self.ceph_brx_net = '192.168.0.0/16'
        else:
            self.ceph_brx_net = brxnet

        self.test_files = ['a', 'b', 'c']

        self.background_procs = []

    # This cleans up stale netnses left behind by previously
    # failed test cases.
    @staticmethod
    def cleanup_stale_netnses_and_bridge(remote):
        p = remote.run(args=['ip', 'netns', 'list'],
                       stdout=StringIO(),
                       timeout=(5 * 60))
        p = p.stdout.getvalue().strip()

        # Get the netns name list
        netns_list = re.findall(r'ceph-ns-[^()\s][-.\w]+[^():\s]', p)

        # Remove the stale netnses
        for ns in netns_list:
            ns_name = ns.split()[0]
            args = ['sudo', 'ip', 'netns', 'delete', '{0}'.format(ns_name)]
            try:
                remote.run(args=args, timeout=(5 * 60), omit_sudo=False)
            except Exception:
                pass

        # Remove the stale 'ceph-brx'
        try:
            args = ['sudo', 'ip', 'link', 'delete', 'ceph-brx']
            remote.run(args=args, timeout=(5 * 60), omit_sudo=False)
        except Exception:
            pass

    def _parse_netns_name(self):
        self._netns_name = '-'.join(
            ["ceph-ns", re.sub(r'/+', "-", self.mountpoint)])

    @property
    def mountpoint(self):
        if self.hostfs_mntpt is None:
            self.hostfs_mntpt = os.path.join(self.test_dir,
                                             self.hostfs_mntpt_dirname)
        return self.hostfs_mntpt

    @mountpoint.setter
    def mountpoint(self, path):
        if not isinstance(path, str):
            raise RuntimeError('path should be of str type.')
        self._mountpoint = self.hostfs_mntpt = path

    @property
    def netns_name(self):
        if self._netns_name is None:
            self._parse_netns_name()
        return self._netns_name

    @netns_name.setter
    def netns_name(self, name):
        self._netns_name = name

    def assert_that_ceph_fs_exists(self):
        output = self.client_remote.run(args='ceph fs ls', stdout=StringIO()).\
            stdout.getvalue()
        if self.cephfs_name:
            assert self.cephfs_name in output, \
                'expected ceph fs is not present on the cluster'
            log.info(
                f'Mounting Ceph FS {self.cephfs_name}; just confirmed its presence on cluster'
            )
        else:
            assert 'No filesystems enabled' not in output, \
                'ceph cluster has no ceph fs, not even the default ceph fs'
            log.info(
                'Mounting default Ceph FS; just confirmed its presence on cluster'
            )

    def assert_and_log_minimum_mount_details(self):
        """
        Make sure we have minimum details required for mounting. Ideally, this
        method should be called at the beginning of the mount method.
        """
        if not self.client_id or not self.client_remote or \
           not self.hostfs_mntpt:
            errmsg = ('Mounting CephFS requires at least the following '
                      'details to be provided:\n'
                      '1. the client ID,\n2. the mountpoint, and\n'
                      '3. the remote machine where CephFS will be mounted.\n')
            raise RuntimeError(errmsg)

        self.assert_that_ceph_fs_exists()

        log.info('Mounting Ceph FS. Following are details of mount; remember '
                 '"None" represents Python type None -')
        log.info(
            f'self.client_remote.hostname = {self.client_remote.hostname}')
        log.info(f'self.client.name = client.{self.client_id}')
        log.info(f'self.hostfs_mntpt = {self.hostfs_mntpt}')
        log.info(f'self.cephfs_name = {self.cephfs_name}')
        log.info(f'self.cephfs_mntpt = {self.cephfs_mntpt}')
        log.info(f'self.client_keyring_path = {self.client_keyring_path}')
        if self.client_keyring_path:
            log.info('keyring content -\n' + get_file(self.client_remote,
                                                      self.client_keyring_path,
                                                      sudo=True).decode())

    def is_mounted(self):
        return self.mounted

    def setupfs(self, name=None):
        if name is None and self.fs is not None:
            # Previous mount existed, reuse the old name
            name = self.fs.name
        self.fs = Filesystem(self.ctx, name=name)
        log.info('Wait for MDS to reach steady state...')
        self.fs.wait_for_daemons()
        log.info('Ready to start {}...'.format(type(self).__name__))

    def _create_mntpt(self, cwd=None):
        stderr = StringIO()
        # Use 0000 mode to prevent undesired modifications to the mountpoint on
        # the local file system.
        script = f'mkdir -m 0000 -p -v {self.hostfs_mntpt}'.split()
        try:
            self.client_remote.run(args=script,
                                   timeout=(15 * 60),
                                   stderr=stderr)
        except CommandFailedError:
            if 'file exists' not in stderr.getvalue().lower():
                raise

    @property
    def _nsenter_args(self):
        return ['nsenter', f'--net=/var/run/netns/{self.netns_name}']

    def _set_filemode_on_mntpt(self):
        stderr = StringIO()
        try:
            self.client_remote.run(
                args=['sudo', 'chmod', '1777', self.hostfs_mntpt],
                stderr=stderr,
                timeout=(5 * 60))
        except CommandFailedError:
            # the client does not have write permissions in the caps it holds
            # for the Ceph FS that was just mounted.
            if 'permission denied' in stderr.getvalue().lower():
                pass

    def _setup_brx_and_nat(self):
        # The IP for ceph-brx should be the last valid address in the network
        ip = IP(self.ceph_brx_net)[-2]
        mask = self.ceph_brx_net.split('/')[1]
        brd = IP(self.ceph_brx_net).broadcast()

        brx = self.client_remote.run(args=['ip', 'addr'],
                                     stderr=StringIO(),
                                     stdout=StringIO(),
                                     timeout=(5 * 60))
        brx = re.findall(r'inet .* ceph-brx', brx.stdout.getvalue())
        if brx:
            # If the 'ceph-brx' already exists, then check whether
            # the new net is conflicting with it
            _ip, _mask = brx[0].split()[1].split('/', 1)
            if _ip != "{}".format(ip) or _mask != mask:
                raise RuntimeError(
                    "Conflict with existing ceph-brx {0}, new {1}/{2}".format(
                        brx[0].split()[1], ip, mask))

        # Setup the ceph-brx and always use the last valid IP
        if not brx:
            log.info("Setuping the 'ceph-brx' with {0}/{1}".format(ip, mask))

            self.run_shell_payload(f"""
                set -e
                sudo ip link add name ceph-brx type bridge
                sudo ip addr flush dev ceph-brx
                sudo ip link set ceph-brx up
                sudo ip addr add {ip}/{mask} brd {brd} dev ceph-brx
            """,
                                   timeout=(5 * 60),
                                   omit_sudo=False,
                                   cwd='/')

        args = "echo 1 | sudo tee /proc/sys/net/ipv4/ip_forward"
        self.client_remote.run(args=args, timeout=(5 * 60), omit_sudo=False)

        # Setup the NAT
        p = self.client_remote.run(args=['route'],
                                   stderr=StringIO(),
                                   stdout=StringIO(),
                                   timeout=(5 * 60))
        p = re.findall(r'default .*', p.stdout.getvalue())
        if not p:
            raise RuntimeError("No default gw found")
        gw = p[0].split()[7]

        self.run_shell_payload(f"""
            set -e
            sudo iptables -A FORWARD -o {gw} -i ceph-brx -j ACCEPT
            sudo iptables -A FORWARD -i {gw} -o ceph-brx -j ACCEPT
            sudo iptables -t nat -A POSTROUTING -s {ip}/{mask} -o {gw} -j MASQUERADE
        """,
                               timeout=(5 * 60),
                               omit_sudo=False,
                               cwd='/')
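
    # Address plan (descriptive note): IP(self.ceph_brx_net)[-2] is the last
    # assignable address in the bridge network ([-1] is the broadcast address)
    # and is reserved for the 'ceph-brx' bridge itself.  _setup_netns() below
    # hands out the remaining addresses to the per-mount network namespaces,
    # skipping the network address ips[0] and treating ips[-2] as the
    # "out of addresses" sentinel.  With the default 192.168.0.0/16 the bridge
    # gets 192.168.255.254 and clients start from 192.168.0.1.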

    def _setup_netns(self):
        p = self.client_remote.run(args=['ip', 'netns', 'list'],
                                   stderr=StringIO(),
                                   stdout=StringIO(),
                                   timeout=(5 * 60)).stdout.getvalue().strip()

        # Get the netns name list
        netns_list = re.findall(r'[^()\s][-.\w]+[^():\s]', p)

        out = re.search(r"{0}".format(self.netns_name), p)
        if out is None:
            # Get a unique nsid for the new netns
            nsid = 0
            p = self.client_remote.run(args=['ip', 'netns', 'list-id'],
                                       stderr=StringIO(),
                                       stdout=StringIO(),
                                       timeout=(5 * 60)).stdout.getvalue()
            while True:
                out = re.search(r"nsid {} ".format(nsid), p)
                if out is None:
                    break

                nsid += 1

            # Add a new netns and set its id
            self.run_shell_payload(f"""
                set -e
                sudo ip netns add {self.netns_name}
                sudo ip netns set {self.netns_name} {nsid}
            """,
                                   timeout=(5 * 60),
                                   omit_sudo=False,
                                   cwd='/')
            self.nsid = nsid
        else:
            # The netns already exists and may have been suspended by self.kill()
            self.resume_netns()

            nsid = int(
                re.search(r"{0} \(id: (\d+)\)".format(self.netns_name),
                          p).group(1))
            self.nsid = nsid
            return

        # Get one ip address for netns
        ips = IP(self.ceph_brx_net)
        for ip in ips:
            found = False
            if ip == ips[0]:
                continue
            if ip == ips[-2]:
                raise RuntimeError("we have ran out of the ip addresses")

            for ns in netns_list:
                ns_name = ns.split()[0]
                args = [
                    'sudo', 'ip', 'netns', 'exec', '{0}'.format(ns_name), 'ip',
                    'addr'
                ]
                try:
                    p = self.client_remote.run(args=args,
                                               stderr=StringIO(),
                                               stdout=StringIO(),
                                               timeout=(5 * 60),
                                               omit_sudo=False)
                    q = re.search("{0}".format(ip), p.stdout.getvalue())
                    if q is not None:
                        found = True
                        break
                except CommandFailedError:
                    if "No such file or directory" in p.stderr.getvalue():
                        pass
                    if "Invalid argument" in p.stderr.getvalue():
                        pass

            if not found:
                break

        mask = self.ceph_brx_net.split('/')[1]
        brd = IP(self.ceph_brx_net).broadcast()

        log.info("Setuping the netns '{0}' with {1}/{2}".format(
            self.netns_name, ip, mask))

        # Setup the veth interfaces
        brxip = IP(self.ceph_brx_net)[-2]
        self.run_shell_payload(f"""
            set -e
            sudo ip link add veth0 netns {self.netns_name} type veth peer name brx.{nsid}
            sudo ip netns exec {self.netns_name} ip addr add {ip}/{mask} brd {brd} dev veth0
            sudo ip netns exec {self.netns_name} ip link set veth0 up
            sudo ip netns exec {self.netns_name} ip link set lo up
            sudo ip netns exec {self.netns_name} ip route add default via {brxip}
        """,
                               timeout=(5 * 60),
                               omit_sudo=False,
                               cwd='/')

        # Bring up the brx interface and join it to 'ceph-brx'
        self.run_shell_payload(f"""
            set -e
            sudo ip link set brx.{nsid} up
            sudo ip link set dev brx.{nsid} master ceph-brx
        """,
                               timeout=(5 * 60),
                               omit_sudo=False,
                               cwd='/')

    def _cleanup_netns(self):
        if self.nsid == -1:
            return
        log.info("Removing the netns '{0}'".format(self.netns_name))

        # Delete the netns and the peer veth interface
        self.run_shell_payload(f"""
            set -e
            sudo ip link set brx.{self.nsid} down
            sudo ip link delete dev brx.{self.nsid}
            sudo ip netns delete {self.netns_name}
        """,
                               timeout=(5 * 60),
                               omit_sudo=False,
                               cwd='/')

        self.nsid = -1

    def _cleanup_brx_and_nat(self):
        brx = self.client_remote.run(args=['ip', 'addr'],
                                     stderr=StringIO(),
                                     stdout=StringIO(),
                                     timeout=(5 * 60))
        brx = re.findall(r'inet .* ceph-brx', brx.stdout.getvalue())
        if not brx:
            return

        # If this was the last netns, delete the 'ceph-brx'
        args = ['sudo', 'ip', 'link', 'show']
        p = self.client_remote.run(args=args,
                                   stdout=StringIO(),
                                   timeout=(5 * 60),
                                   omit_sudo=False)
        _list = re.findall(r'brx\.', p.stdout.getvalue().strip())
        if len(_list) != 0:
            return

        log.info("Removing the 'ceph-brx'")

        self.run_shell_payload("""
            set -e
            sudo ip link set ceph-brx down
            sudo ip link delete ceph-brx
        """,
                               timeout=(5 * 60),
                               omit_sudo=False,
                               cwd='/')

        # Drop the iptables NAT rules
        ip = IP(self.ceph_brx_net)[-2]
        mask = self.ceph_brx_net.split('/')[1]

        p = self.client_remote.run(args=['route'],
                                   stderr=StringIO(),
                                   stdout=StringIO(),
                                   timeout=(5 * 60))
        p = re.findall(r'default .*', p.stdout.getvalue())
        if not p:
            raise RuntimeError("No default gw found")
        gw = p[0].split()[7]
        self.run_shell_payload(f"""
            set -e
            sudo iptables -D FORWARD -o {gw} -i ceph-brx -j ACCEPT
            sudo iptables -D FORWARD -i {gw} -o ceph-brx -j ACCEPT
            sudo iptables -t nat -D POSTROUTING -s {ip}/{mask} -o {gw} -j MASQUERADE
        """,
                               timeout=(5 * 60),
                               omit_sudo=False,
                               cwd='/')

    def setup_netns(self):
        """
        Setup the netns for the mountpoint.
        """
        log.info("Setting the '{0}' netns for '{1}'".format(
            self._netns_name, self.mountpoint))
        self._setup_brx_and_nat()
        self._setup_netns()

    def cleanup_netns(self):
        """
        Cleanup the netns for the mountpoint.
        """
        # We defer cleaning the netnses and bridge until the last
        # mountpoint is unmounted; this is a temporary workaround
        # for issue #46282.

        # log.info("Cleaning the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint))
        # self._cleanup_netns()
        # self._cleanup_brx_and_nat()
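        # The namespaces and the 'ceph-brx' bridge left behind here are
        # expected to be reaped by cleanup_stale_netnses_and_bridge() (the
        # static method near the top of this class) at the start of a later
        # run.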

    def suspend_netns(self):
        """
        Suspend the netns veth interface.
        """
        if self.nsid == -1:
            return

        log.info("Suspending the '{0}' netns for '{1}'".format(
            self._netns_name, self.mountpoint))

        args = [
            'sudo', 'ip', 'link', 'set', 'brx.{0}'.format(self.nsid), 'down'
        ]
        self.client_remote.run(args=args, timeout=(5 * 60), omit_sudo=False)

    def resume_netns(self):
        """
        Resume the netns veth interface.
        """
        if self.nsid == -1:
            return

        log.info("Resuming the '{0}' netns for '{1}'".format(
            self._netns_name, self.mountpoint))

        args = ['sudo', 'ip', 'link', 'set', 'brx.{0}'.format(self.nsid), 'up']
        self.client_remote.run(args=args, timeout=(5 * 60), omit_sudo=False)

    def mount(self, mntopts=[], check_status=True, **kwargs):
        """
        kwargs is expected to contain the same arguments as those accepted
        by self.update_attrs().
        """
        raise NotImplementedError()

    def mount_wait(self, **kwargs):
        """
        Accepts arguments same as self.mount().
        """
        self.mount(**kwargs)
        self.wait_until_mounted()

    def umount(self):
        raise NotImplementedError()

    def umount_wait(self, force=False, require_clean=False, timeout=None):
        """

        :param force: Expect that the mount will not shutdown cleanly: kill
                      it hard.
        :param require_clean: Wait for the Ceph client associated with the
                              mount (e.g. ceph-fuse) to terminate, and
                              raise if it doesn't do so cleanly.
        :param timeout: amount of time to wait for the umount command to finish
        :return:
        """
        raise NotImplementedError()

    def _verify_attrs(self, **kwargs):
        """
        Verify that client_id, client_keyring_path, client_remote, hostfs_mntpt,
        cephfs_name, cephfs_mntpt are either type str or None.
        """
        for k, v in kwargs.items():
            if v is not None and not isinstance(v, str):
                raise RuntimeError('value of attributes should be either str '
                                   f'or None. {k} - {v}')

    def update_attrs(self,
                     client_id=None,
                     client_keyring_path=None,
                     client_remote=None,
                     hostfs_mntpt=None,
                     cephfs_name=None,
                     cephfs_mntpt=None):
        if not (client_id or client_keyring_path or client_remote
                or cephfs_name or cephfs_mntpt or hostfs_mntpt):
            return

        self._verify_attrs(client_id=client_id,
                           client_keyring_path=client_keyring_path,
                           hostfs_mntpt=hostfs_mntpt,
                           cephfs_name=cephfs_name,
                           cephfs_mntpt=cephfs_mntpt)

        if client_id:
            self.client_id = client_id
        if client_keyring_path:
            self.client_keyring_path = client_keyring_path
        if client_remote:
            self.client_remote = client_remote
        if hostfs_mntpt:
            self.hostfs_mntpt = hostfs_mntpt
        if cephfs_name:
            self.cephfs_name = cephfs_name
        if cephfs_mntpt:
            self.cephfs_mntpt = cephfs_mntpt

    def remount(self, **kwargs):
        """
        Update the mount object's attributes and attempt to remount with the
        new values for these attributes.

        1. Run umount_wait().
        2. Run update_attrs().
        3. Run mount().

        Accepts the arguments of self.mount() and self.update_attrs(), plus
        one extra: wait, which can be True or False.
        """
        self.umount_wait()
        assert not self.mounted

        mntopts = kwargs.pop('mntopts', [])
        check_status = kwargs.pop('check_status', True)
        wait = kwargs.pop('wait', True)

        self.update_attrs(**kwargs)

        retval = self.mount(mntopts=mntopts, check_status=check_status)
        # avoid this scenario (again): the mount command might have failed and
        # check_status might have silenced the exception, yet we would attempt
        # to wait, which might lead to an error.
        if retval is None and wait:
            self.wait_until_mounted()

        return retval
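
    # Illustrative only (not part of the upstream class): a typical call might
    # look like
    #     mount.remount(cephfs_name='cephfs2', cephfs_mntpt='/dir', wait=True)
    # where the keyword arguments are those accepted by update_attrs() plus
    # the mntopts/check_status/wait knobs popped above.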

    def kill(self):
        """
        Suspend the netns veth interface to make the client disconnected
        from the ceph cluster
        """
        log.info('Killing connection on {0}...'.format(
            self.client_remote.name))
        self.suspend_netns()

    def kill_cleanup(self):
        """
        Follow up ``kill`` to get to a clean unmounted state.
        """
        log.info('Cleaning up killed connection on {0}'.format(
            self.client_remote.name))
        self.umount_wait(force=True)

    def cleanup(self):
        """
        Remove the mount point.

        Prerequisite: the client is not mounted.
        """
        log.info('Cleaning up mount {0}'.format(self.client_remote.name))
        stderr = StringIO()
        try:
            self.client_remote.run(args=['rmdir', '--', self.mountpoint],
                                   cwd=self.test_dir,
                                   stderr=stderr,
                                   timeout=(60 * 5),
                                   check_status=False)
        except CommandFailedError:
            if "no such file or directory" not in stderr.getvalue().lower():
                raise

        self.cleanup_netns()

    def wait_until_mounted(self):
        raise NotImplementedError()

    def get_keyring_path(self):
        return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id)

    def get_key_from_keyfile(self):
        # XXX: don't call run_shell(), since CephFS might be unmounted.
        keyring = self.client_remote.run(
            args=['sudo', 'cat', self.client_keyring_path],
            stdout=StringIO(),
            omit_sudo=False).stdout.getvalue()
        for line in keyring.split('\n'):
            if line.find('key') != -1:
                return line[line.find('=') + 1:].strip()

    @property
    def config_path(self):
        """
        Path to ceph.conf: override this if you're not a normal systemwide ceph install
        :return: str
        """
        return "/etc/ceph/ceph.conf"

    @contextmanager
    def mounted_wait(self):
        """
        A context manager, from an initially unmounted state, to mount
        this, yield, and then unmount and clean up.
        """
        self.mount()
        self.wait_until_mounted()
        try:
            yield
        finally:
            self.umount_wait()

    def create_file(self,
                    filename='testfile',
                    dirname=None,
                    user=None,
                    check_status=True):
        assert (self.is_mounted())

        if not os.path.isabs(filename):
            if dirname:
                if os.path.isabs(dirname):
                    path = os.path.join(dirname, filename)
                else:
                    path = os.path.join(self.hostfs_mntpt, dirname, filename)
            else:
                path = os.path.join(self.hostfs_mntpt, filename)
        else:
            path = filename

        if user:
            args = [
                'sudo', '-u', user, '-s', '/bin/bash', '-c', 'touch ' + path
            ]
        else:
            args = 'touch ' + path

        return self.client_remote.run(args=args, check_status=check_status)

    def create_files(self):
        assert (self.is_mounted())

        for suffix in self.test_files:
            log.info("Creating file {0}".format(suffix))
            self.client_remote.run(args=[
                'sudo', 'touch',
                os.path.join(self.hostfs_mntpt, suffix)
            ])

    def test_create_file(self,
                         filename='testfile',
                         dirname=None,
                         user=None,
                         check_status=True):
        return self.create_file(filename=filename,
                                dirname=dirname,
                                user=user,
                                check_status=False)

    def check_files(self):
        assert (self.is_mounted())

        for suffix in self.test_files:
            log.info("Checking file {0}".format(suffix))
            r = self.client_remote.run(
                args=['sudo', 'ls',
                      os.path.join(self.hostfs_mntpt, suffix)],
                check_status=False)
            if r.exitstatus != 0:
                raise RuntimeError(
                    "Expected file {0} not found".format(suffix))

    def write_file(self, path, data, perms=None):
        """
        Write the given data at the given path and set the given perms to the
        file on the path.
        """
        if path.find(self.hostfs_mntpt) == -1:
            path = os.path.join(self.hostfs_mntpt, path)

        sudo_write_file(self.client_remote, path, data)

        if perms:
            self.run_shell(args=f'chmod {perms} {path}')

    def read_file(self, path):
        """
        Return the data from the file on given path.
        """
        if path.find(self.hostfs_mntpt) == -1:
            path = os.path.join(self.hostfs_mntpt, path)

        return self.run_shell(args=['sudo', 'cat', path], omit_sudo=False).\
            stdout.getvalue().strip()

    def create_destroy(self):
        assert (self.is_mounted())

        filename = "{0} {1}".format(datetime.datetime.now(), self.client_id)
        log.debug("Creating test file {0}".format(filename))
        self.client_remote.run(
            args=['sudo', 'touch',
                  os.path.join(self.hostfs_mntpt, filename)])
        log.debug("Deleting test file {0}".format(filename))
        self.client_remote.run(args=[
            'sudo', 'rm', '-f',
            os.path.join(self.hostfs_mntpt, filename)
        ])

    def _run_python(self, pyscript, py_version='python3'):
        return self.client_remote.run(args=[
            'sudo', 'adjust-ulimits', 'daemon-helper', 'kill', py_version,
            '-c', pyscript
        ],
                                      wait=False,
                                      stdin=run.PIPE,
                                      stdout=StringIO())

    def run_python(self, pyscript, py_version='python3'):
        p = self._run_python(pyscript, py_version)
        p.wait()
        return p.stdout.getvalue().strip()

    def run_shell(self, args, omit_sudo=True, timeout=900, **kwargs):
        args = args.split() if isinstance(args, str) else args
        # XXX: all commands run with the CephFS mount as CWD must be executed
        # with superuser privileges when tests are run using teuthology.
        if args[0] != 'sudo':
            args.insert(0, 'sudo')
        cwd = kwargs.pop('cwd', self.mountpoint)
        stdout = kwargs.pop('stdout', StringIO())
        stderr = kwargs.pop('stderr', StringIO())

        return self.client_remote.run(args=args,
                                      cwd=cwd,
                                      timeout=timeout,
                                      stdout=stdout,
                                      stderr=stderr,
                                      **kwargs)

    def run_shell_payload(self, payload, **kwargs):
        return self.run_shell(["bash", "-c", Raw(f"'{payload}'")], **kwargs)

    def run_as_user(self, **kwargs):
        """
        Besides the arguments defined for run_shell() this method also
        accepts argument 'user'.
        """
        args = kwargs.pop('args')
        user = kwargs.pop('user')
        if isinstance(args, str):
            args = ['sudo', '-u', user, '-s', '/bin/bash', '-c', args]
        elif isinstance(args, list):
            # join the list into a single command string for 'bash -c'
            cmd = ' '.join(args)

            args = ['sudo', '-u', user, '-s', '/bin/bash', '-c', cmd]

        kwargs['args'] = args
        return self.run_shell(**kwargs)

    def run_as_root(self, **kwargs):
        """
        Accepts same arguments as run_shell().
        """
        kwargs['user'] = 'root'
        return self.run_as_user(**kwargs)

    def _verify(self, proc, retval=None, errmsg=None):
        if retval:
            msg = ('expected return value: {}\nreceived return value: '
                   '{}\n'.format(retval, proc.returncode))
            assert proc.returncode == retval, msg

        if errmsg:
            stderr = proc.stderr.getvalue().lower()
            msg = ('didn\'t find given string in stderr -\nexpected string: '
                   '{}\nreceived error message: {}\nnote: received error '
                   'message is converted to lowercase'.format(errmsg, stderr))
            assert errmsg in stderr, msg

    def negtestcmd(self,
                   args,
                   retval=None,
                   errmsg=None,
                   stdin=None,
                   cwd=None,
                   wait=True):
        """
        Conduct a negative test for the given command.

        retval and errmsg are parameters to confirm the cause of command
        failure.
        """
        proc = self.run_shell(args=args,
                              wait=wait,
                              stdin=stdin,
                              cwd=cwd,
                              check_status=False)
        self._verify(proc, retval, errmsg)
        return proc

    def negtestcmd_as_user(self,
                           args,
                           user,
                           retval=None,
                           errmsg=None,
                           stdin=None,
                           cwd=None,
                           wait=True):
        proc = self.run_as_user(args=args,
                                user=user,
                                wait=wait,
                                stdin=stdin,
                                cwd=cwd,
                                check_status=False)
        self._verify(proc, retval, errmsg)
        return proc

    def negtestcmd_as_root(self,
                           args,
                           retval=None,
                           errmsg=None,
                           stdin=None,
                           cwd=None,
                           wait=True):
        proc = self.run_as_root(args=args,
                                wait=wait,
                                stdin=stdin,
                                cwd=cwd,
                                check_status=False)
        self._verify(proc, retval, errmsg)
        return proc

    def open_no_data(self, basename):
        """
        A pure metadata operation
        """
        assert (self.is_mounted())

        path = os.path.join(self.hostfs_mntpt, basename)

        p = self._run_python(
            dedent("""
            f = open("{path}", 'w')
            """.format(path=path)))
        p.wait()

    def open_background(self, basename="background_file", write=True):
        """
        Open a file for writing, then block such that the client
        will hold a capability.

        Don't return until the remote process has got as far as opening
        the file, then return the RemoteProcess instance.
        """
        assert (self.is_mounted())

        path = os.path.join(self.hostfs_mntpt, basename)

        if write:
            pyscript = dedent("""
                import time

                with open("{path}", 'w') as f:
                    f.write('content')
                    f.flush()
                    f.write('content2')
                    while True:
                        time.sleep(1)
                """).format(path=path)
        else:
            pyscript = dedent("""
                import time

                with open("{path}", 'r') as f:
                    while True:
                        time.sleep(1)
                """).format(path=path)

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)

        # This wait would not be sufficient if the file had already
        # existed, but it's simple and in practice users of open_background
        # are not using it on existing files.
        self.wait_for_visible(basename)

        return rproc

    def wait_for_dir_empty(self, dirname, timeout=30):
        dirpath = os.path.join(self.hostfs_mntpt, dirname)
        with safe_while(sleep=5, tries=(timeout // 5)) as proceed:
            while proceed():
                p = self.run_shell_payload(f"stat -c %h {dirpath}")
                nr_links = int(p.stdout.getvalue().strip())
                if nr_links == 2:
                    return

    def wait_for_visible(self, basename="background_file", timeout=30):
        i = 0
        while i < timeout:
            r = self.client_remote.run(
                args=['sudo', 'ls',
                      os.path.join(self.hostfs_mntpt, basename)],
                check_status=False)
            if r.exitstatus == 0:
                log.debug("File {0} became visible from {1} after {2}s".format(
                    basename, self.client_id, i))
                return
            else:
                time.sleep(1)
                i += 1

        raise RuntimeError(
            "Timed out after {0}s waiting for {1} to become visible from {2}".
            format(i, basename, self.client_id))

    def lock_background(self, basename="background_file", do_flock=True):
        """
        Open and lock files for writing; hold the locks in a background process
        """
        assert (self.is_mounted())

        path = os.path.join(self.hostfs_mntpt, basename)

        script_builder = """
            import time
            import fcntl
            import struct"""
        if do_flock:
            script_builder += """
            f1 = open("{path}-1", 'w')
            fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)"""
        script_builder += """
            f2 = open("{path}-2", 'w')
            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            while True:
                time.sleep(1)
            """

        pyscript = dedent(script_builder).format(path=path)

        log.info("lock_background file {0}".format(basename))
        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def lock_and_release(self, basename="background_file"):
        assert (self.is_mounted())

        path = os.path.join(self.hostfs_mntpt, basename)

        script = """
            import time
            import fcntl
            import struct
            f1 = open("{path}-1", 'w')
            fcntl.flock(f1, fcntl.LOCK_EX)
            f2 = open("{path}-2", 'w')
            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            """
        pyscript = dedent(script).format(path=path)

        log.info("lock_and_release file {0}".format(basename))
        return self._run_python(pyscript)

    def check_filelock(self, basename="background_file", do_flock=True):
        assert (self.is_mounted())

        path = os.path.join(self.hostfs_mntpt, basename)

        script_builder = """
            import fcntl
            import errno
            import struct"""
        if do_flock:
            script_builder += """
            f1 = open("{path}-1", 'r')
            try:
                fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)
            except IOError as e:
                if e.errno == errno.EAGAIN:
                    pass
            else:
                raise RuntimeError("flock on file {path}-1 not found")"""
        script_builder += """
            f2 = open("{path}-2", 'r')
            try:
                lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
                fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            except IOError as e:
                if e.errno == errno.EAGAIN:
                    pass
            else:
                raise RuntimeError("posix lock on file {path}-2 not found")
            """
        pyscript = dedent(script_builder).format(path=path)

        log.info("check lock on file {0}".format(basename))
        self.client_remote.run(args=['sudo', 'python3', '-c', pyscript])

    def write_background(self, basename="background_file", loop=False):
        """
        Open a file and write to it in the background; the writer exits after
        a single write unless ``loop`` is True, in which case it keeps writing.
        :param basename: name of the file under the mountpoint
        :return: the background RemoteProcess
        """
        assert (self.is_mounted())

        path = os.path.join(self.hostfs_mntpt, basename)

        pyscript = dedent("""
            import os
            import time

            fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0o644)
            try:
                while True:
                    os.write(fd, b'content')
                    time.sleep(1)
                    if not {loop}:
                        break
            except IOError as e:
                pass
            os.close(fd)
            """).format(path=path, loop=str(loop))

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def write_n_mb(self, filename, n_mb, seek=0, wait=True):
        """
        Write the requested number of megabytes to a file
        """
        assert (self.is_mounted())

        return self.run_shell([
            "dd", "if=/dev/urandom", "of={0}".format(filename), "bs=1M",
            "conv=fdatasync", "count={0}".format(int(n_mb)), "seek={0}".format(
                int(seek))
        ],
                              wait=wait)

    def write_test_pattern(self, filename, size):
        log.info("Writing {0} bytes to {1}".format(size, filename))
        return self.run_python(
            dedent("""
            import zlib
            path = "{path}"
            with open(path, 'w') as f:
                for i in range(0, {size}):
                    val = zlib.crc32(str(i).encode('utf-8')) & 7
                    f.write(chr(val))
        """.format(path=os.path.join(self.hostfs_mntpt, filename), size=size)))

    def validate_test_pattern(self, filename, size):
        log.info("Validating {0} bytes from {1}".format(size, filename))
        return self.run_python(
            dedent("""
            import zlib
            path = "{path}"
            with open(path, 'r') as f:
                bytes = f.read()
            if len(bytes) != {size}:
                raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format(
                    len(bytes), {size}
                ))
            for i, b in enumerate(bytes):
                val = zlib.crc32(str(i).encode('utf-8')) & 7
                if b != chr(val):
                    raise RuntimeError("Bad data at offset {{0}}".format(i))
        """.format(path=os.path.join(self.hostfs_mntpt, filename), size=size)))

    def open_n_background(self, fs_path, count):
        """
        Open N files for writing, hold them open in a background process

        :param fs_path: Path relative to CephFS root, e.g. "foo/bar"
        :return: a RemoteProcess
        """
        assert (self.is_mounted())

        abs_path = os.path.join(self.hostfs_mntpt, fs_path)

        pyscript = dedent("""
            import sys
            import time
            import os

            n = {count}
            abs_path = "{abs_path}"

            if not os.path.exists(abs_path):
                os.makedirs(abs_path)

            handles = []
            for i in range(0, n):
                fname = "file_"+str(i)
                path = os.path.join(abs_path, fname)
                handles.append(open(path, 'w'))

            while True:
                time.sleep(1)
            """).format(abs_path=abs_path, count=count)

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def create_n_files(self, fs_path, count, sync=False):
        assert (self.is_mounted())

        abs_path = os.path.join(self.hostfs_mntpt, fs_path)

        pyscript = dedent("""
            import sys
            import time
            import os

            n = {count}
            abs_path = "{abs_path}"

            if not os.path.exists(os.path.dirname(abs_path)):
                os.makedirs(os.path.dirname(abs_path))

            for i in range(0, n):
                fname = "{{0}}_{{1}}".format(abs_path, i)
                with open(fname, 'w') as f:
                    f.write('content')
                    if {sync}:
                        f.flush()
                        os.fsync(f.fileno())
            """).format(abs_path=abs_path, count=count, sync=str(sync))

        self.run_python(pyscript)

    def teardown(self):
        for p in self.background_procs:
            log.info("Terminating background process")
            self._kill_background(p)

        self.background_procs = []

    def _kill_background(self, p):
        if p.stdin:
            p.stdin.close()
            try:
                p.wait()
            except (CommandFailedError, ConnectionLostError):
                pass

    def kill_background(self, p):
        """
        For a process that was returned by one of the _background member functions,
        kill it hard.
        """
        self._kill_background(p)
        self.background_procs.remove(p)

    def send_signal(self, signal):
        signal = signal.lower()
        if signal not in ['sigstop', 'sigcont', 'sigterm', 'sigkill']:
            raise NotImplementedError

        self.client_remote.run(
            args=['sudo', 'kill', '-{0}'.format(signal), self.client_pid],
            omit_sudo=False)

    def get_global_id(self):
        raise NotImplementedError()

    def get_global_inst(self):
        raise NotImplementedError()

    def get_global_addr(self):
        raise NotImplementedError()

    def get_osd_epoch(self):
        raise NotImplementedError()

    def get_op_read_count(self):
        raise NotImplementedError()

    def lstat(self, fs_path, follow_symlinks=False, wait=True):
        # lstat never follows the final symlink; honour the caller's wait flag
        return self.stat(fs_path, follow_symlinks=False, wait=wait)

    def stat(self, fs_path, follow_symlinks=True, wait=True):
        """
        stat a file, and return the result as a dictionary like this:
        {
          "st_ctime": 1414161137.0,
          "st_mtime": 1414161137.0,
          "st_nlink": 33,
          "st_gid": 0,
          "st_dev": 16777218,
          "st_size": 1190,
          "st_ino": 2,
          "st_uid": 0,
          "st_mode": 16877,
          "st_atime": 1431520593.0
        }

        Raises exception on absent file.
        """
        abs_path = os.path.join(self.hostfs_mntpt, fs_path)
        if follow_symlinks:
            stat_call = "os.stat('" + abs_path + "')"
        else:
            stat_call = "os.lstat('" + abs_path + "')"

        pyscript = dedent("""
            import os
            import stat
            import json
            import sys

            try:
                s = {stat_call}
            except OSError as e:
                sys.exit(e.errno)

            attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"]
            print(json.dumps(
                dict([(a, getattr(s, a)) for a in attrs]),
                indent=2))
            """).format(stat_call=stat_call)
        proc = self._run_python(pyscript)
        if wait:
            proc.wait()
            return json.loads(proc.stdout.getvalue().strip())
        else:
            return proc

    def touch(self, fs_path):
        """
        Create a dentry if it doesn't already exist.  This python
        implementation exists because the usual command line tool doesn't
        pass through error codes like EIO.

        :param fs_path:
        :return:
        """
        abs_path = os.path.join(self.hostfs_mntpt, fs_path)
        pyscript = dedent("""
            import sys
            import errno

            try:
                f = open("{path}", "w")
                f.close()
            except IOError as e:
                sys.exit(errno.EIO)
            """).format(path=abs_path)
        proc = self._run_python(pyscript)
        proc.wait()

    def path_to_ino(self, fs_path, follow_symlinks=True):
        abs_path = os.path.join(self.hostfs_mntpt, fs_path)

        if follow_symlinks:
            pyscript = dedent("""
                import os
                import stat

                print(os.stat("{path}").st_ino)
                """).format(path=abs_path)
        else:
            pyscript = dedent("""
                import os
                import stat

                print(os.lstat("{path}").st_ino)
                """).format(path=abs_path)

        proc = self._run_python(pyscript)
        proc.wait()
        return int(proc.stdout.getvalue().strip())

    def path_to_nlink(self, fs_path):
        abs_path = os.path.join(self.hostfs_mntpt, fs_path)

        pyscript = dedent("""
            import os
            import stat

            print(os.stat("{path}").st_nlink)
            """).format(path=abs_path)

        proc = self._run_python(pyscript)
        proc.wait()
        return int(proc.stdout.getvalue().strip())

    def ls(self, path=None):
        """
        Wrap ls: return a list of strings
        """
        cmd = ["ls"]
        if path:
            cmd.append(path)

        ls_text = self.run_shell(cmd).stdout.getvalue().strip()

        if ls_text:
            return ls_text.split("\n")
        else:
            # Special case because otherwise split on empty string
            # gives you [''] instead of []
            return []

    def setfattr(self, path, key, val):
        """
        Wrap setfattr.

        :param path: relative to mount point
        :param key: xattr name
        :param val: xattr value
        :return: None
        """
        self.run_shell(["setfattr", "-n", key, "-v", val, path])

    def getfattr(self, path, attr):
        """
        Wrap getfattr: return the values of a named xattr on one file, or
        None if the attribute is not found.

        :return: a string
        """
        p = self.run_shell(["getfattr", "--only-values", "-n", attr, path],
                           wait=False)
        try:
            p.wait()
        except CommandFailedError as e:
            if e.exitstatus == 1 and "No such attribute" in p.stderr.getvalue(
            ):
                return None
            else:
                raise

        return str(p.stdout.getvalue())

    def df(self):
        """
        Wrap df: return a dict of usage fields in bytes
        """

        p = self.run_shell(["df", "-B1", "."])
        lines = p.stdout.getvalue().strip().split("\n")
        fs, total, used, avail = lines[1].split()[:4]
        log.warning(lines)

        return {
            "total": int(total),
            "used": int(used),
            "available": int(avail)
        }

    def dir_checksum(self, path=None, follow_symlinks=False):
        cmd = ["find"]
        if follow_symlinks:
            cmd.append("-L")
        if path:
            cmd.append(path)
        cmd.extend(["-type", "f", "-exec", "md5sum", "{}", "+"])
        checksum_text = self.run_shell(cmd).stdout.getvalue().strip()
        checksum_sorted = sorted(checksum_text.split('\n'),
                                 key=lambda v: v.split()[1])
        return hashlib.md5(
            ('\n'.join(checksum_sorted)).encode('utf-8')).hexdigest()
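
A hedged usage sketch for a concrete CephFSMount subclass (for example the FuseMount used by the ceph-fuse task below); the mount object is assumed to come from a teuthology run, and only methods defined above are called:

def exercise_mount(mount):
    # Pick the default filesystem, wait for its MDS daemons, then mount.
    mount.setupfs()
    mount.mount_wait()
    try:
        # Smoke test: create the canned test files, list them, stat one.
        mount.create_files()
        mount.check_files()
        print(mount.stat('a')['st_size'])
    finally:
        mount.umount_wait()
        mount.cleanup()
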
Exemple #31
0
def task(ctx, config):
    """
    Mount/unmount a ``ceph-fuse`` client.

    The config is optional and defaults to mounting on all clients. If
    a config is given, it is expected to be a list of clients to do
    this operation on. This lets you e.g. set up one client with
    ``ceph-fuse`` and another with ``kclient``.

    Example that mounts all clients::

        tasks:
        - ceph:
        - ceph-fuse:
        - interactive:

    Example that uses both ``kclient`` and ``ceph-fuse``::

        tasks:
        - ceph:
        - ceph-fuse: [client.0]
        - kclient: [client.1]
        - interactive:

    Example that enables valgrind::

        tasks:
        - ceph:
        - ceph-fuse:
            client.0:
              valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
        - interactive:

    Example that stops an already-mounted client:

    ::

        tasks:
            - ceph:
            - ceph-fuse: [client.0]
            - ... do something that requires the FS mounted ...
            - ceph-fuse:
                client.0:
                    mounted: false
            - ... do something that requires the FS unmounted ...

    Example that adds more generous wait time for mount (for virtual machines)::

        tasks:
        - ceph:
        - ceph-fuse:
            client.0:
              mount_wait: 60 # default is 0, do not wait before checking /sys/
              mount_timeout: 120 # default is 30, give up if /sys/ is not populated
        - interactive:

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Mounting ceph-fuse clients...')

    testdir = teuthology.get_testdir(ctx)
    config = get_client_configs(ctx, config)

    # List clients we will configure mounts for, default is all clients
    clients = list(teuthology.get_clients(ctx=ctx, roles=filter(lambda x: 'client.' in x, config.keys())))

    all_mounts = getattr(ctx, 'mounts', {})
    mounted_by_me = {}

    log.info('Wait for MDS to reach steady state...')
    mds_cluster = MDSCluster(ctx)
    status = mds_cluster.status()
    for filesystem in status.get_filesystems():
        fs = Filesystem(ctx, fscid=filesystem['id']) 
        fs.wait_for_daemons()
    log.info('Ready to start ceph-fuse...')

    # Construct any new FuseMount instances
    for id_, remote in clients:
        client_config = config.get("client.%s" % id_)
        if client_config is None:
            client_config = {}

        if id_ not in all_mounts:
            fuse_mount = FuseMount(client_config, testdir, id_, remote)
            all_mounts[id_] = fuse_mount
        else:
            # Catch bad configs where someone has e.g. tried to use ceph-fuse and kcephfs for the same client
            assert isinstance(all_mounts[id_], FuseMount)

        if not config.get("disabled", False) and client_config.get('mounted', True):
            mounted_by_me[id_] = all_mounts[id_]

    ctx.mounts = all_mounts

    # Mount any clients we have been asked to (default to mount all)
    for mount in mounted_by_me.values():
        mount.mount()

    for mount in mounted_by_me.values():
        mount.wait_until_mounted()

    # Umount any pre-existing clients that we have not been asked to mount
    for client_id in set(all_mounts.keys()) - set(mounted_by_me.keys()):
        mount = all_mounts[client_id]
        if mount.is_mounted():
            mount.umount_wait()

    try:
        yield all_mounts
    finally:
        log.info('Unmounting ceph-fuse clients...')

        for mount in mounted_by_me.values():
            # Conditional because an inner context might have umounted it
            if mount.is_mounted():
                mount.umount_wait()
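
A hedged sketch of how a later task in the same job could reuse the mounts this task stores in ctx.mounts; the function name and the file it touches are illustrative, not part of teuthology:

import logging

log = logging.getLogger(__name__)

def consume_mounts(ctx, config):
    """Run a trivial command on every ceph-fuse mount set up earlier."""
    for client_id, mount in ctx.mounts.items():
        if mount.is_mounted():
            log.info('Touching a file on client.%s', client_id)
            mount.run_shell(['touch', 'smoke-test-file'])
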
Exemple #32
0
class CephFSMount(object):
    def __init__(self, ctx, test_dir, client_id, client_remote, brxnet):
        """
        :param test_dir: Global teuthology test dir
        :param client_id: Client ID, the 'foo' in client.foo
        :param client_remote: Remote instance for the host where client will run
        """

        self.ctx = ctx
        self.test_dir = test_dir
        self.client_id = client_id
        self.client_remote = client_remote
        self.mountpoint_dir_name = 'mnt.{id}'.format(id=self.client_id)
        self._mountpoint = None
        self.fs = None
        self.mounted = False
        self._netns_name = None
        self.nsid = -1
        if brxnet is None:
            self.ceph_brx_net = '192.168.0.0/16'
        else:
            self.ceph_brx_net = brxnet

        self.test_files = ['a', 'b', 'c']

        self.background_procs = []

    def _parse_netns_name(self):
        self._netns_name = '-'.join(
            ["ceph-ns", re.sub(r'/+', "-", self.mountpoint)])

    @property
    def mountpoint(self):
        if self._mountpoint is None:
            self._mountpoint = os.path.join(
                self.test_dir,
                '{dir_name}'.format(dir_name=self.mountpoint_dir_name))
        return self._mountpoint

    @mountpoint.setter
    def mountpoint(self, path):
        if not isinstance(path, str):
            raise RuntimeError('path should be of str type.')
        self._mountpoint = path

    @property
    def netns_name(self):
        if self._netns_name is None:
            self._parse_netns_name()
        return self._netns_name

    @netns_name.setter
    def netns_name(self, name):
        if not isinstance(name, str):
            raise RuntimeError('name should be of str type.')
        self._netns_name = name

    def is_mounted(self):
        return self.mounted

    def setupfs(self, name=None):
        if name is None and self.fs is not None:
            # Previous mount existed, reuse the old name
            name = self.fs.name
        self.fs = Filesystem(self.ctx, name=name)
        log.info('Wait for MDS to reach steady state...')
        self.fs.wait_for_daemons()
        log.info('Ready to start {}...'.format(type(self).__name__))

    def _bringup_network_manager_service(self):
        args = ["sudo", "bash", "-c", "systemctl start NetworkManager"]
        self.client_remote.run(args=args, timeout=(5 * 60))

    def _setup_brx_and_nat(self):
        # The IP for ceph-brx should be the last valid address in the brx net
        ip = IP(self.ceph_brx_net)[-2]
        mask = self.ceph_brx_net.split('/')[1]
        brd = IP(self.ceph_brx_net).broadcast()

        brx = self.client_remote.run(args=['ip', 'addr'],
                                     stderr=StringIO(),
                                     stdout=StringIO(),
                                     timeout=(5 * 60))
        brx = re.findall(r'inet .* ceph-brx', brx.stdout.getvalue())
        if brx:
            # If the 'ceph-brx' already exists, then check whether
            # the new net is conflicting with it
            _ip, _mask = brx[0].split()[1].split('/', 1)
            if _ip != "{}".format(ip) or _mask != mask:
                raise RuntimeError(
                    "Conflict with existing ceph-brx {0}, new {1}/{2}".format(
                        brx[0].split()[1], ip, mask))

        # Setup the ceph-brx and always use the last valid IP
        if not brx:
            log.info("Setuping the 'ceph-brx' with {0}/{1}".format(ip, mask))

            args = [
                "sudo", "bash", "-c", "ip link add name ceph-brx type bridge"
            ]
            self.client_remote.run(args=args, timeout=(5 * 60))
            args = ["sudo", "bash", "-c", "ip link set ceph-brx up"]
            self.client_remote.run(args=args, timeout=(5 * 60))
            args = [
                "sudo", "bash", "-c",
                "ip addr add {0}/{1} brd {2} dev ceph-brx".format(
                    ip, mask, brd)
            ]
            self.client_remote.run(args=args, timeout=(5 * 60))

        args = "echo 1 | sudo tee /proc/sys/net/ipv4/ip_forward"
        self.client_remote.run(args=args, timeout=(5 * 60))

        # Setup the NAT
        p = self.client_remote.run(args=['route'],
                                   stderr=StringIO(),
                                   stdout=StringIO(),
                                   timeout=(5 * 60))
        p = re.findall(r'default .*', p.stdout.getvalue())
        if not p:
            raise RuntimeError("No default gw found")
        gw = p[0].split()[7]
        args = [
            "sudo", "bash", "-c",
            "iptables -A FORWARD -o {0} -i ceph-brx -j ACCEPT".format(gw)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))
        args = [
            "sudo", "bash", "-c",
            "iptables -A FORWARD -i {0} -o ceph-brx -j ACCEPT".format(gw)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))
        args = [
            "sudo", "bash", "-c",
            "iptables -t nat -A POSTROUTING -s {0}/{1} -o {2} -j MASQUERADE".
            format(ip, mask, gw)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))

    def _setup_netns(self):
        p = self.client_remote.run(args=['ip', 'netns', 'list'],
                                   stderr=StringIO(),
                                   stdout=StringIO(),
                                   timeout=(5 * 60))
        p = p.stdout.getvalue().strip()
        if re.match(self.netns_name, p) is not None:
            raise RuntimeError("the netns '{}' already exists!".format(
                self.netns_name))

        # Get the netns name list
        netns_list = re.findall(r'[^()\s][-.\w]+[^():\s]', p)

        # Get a unique netns id
        nsid = 0
        while True:
            p = self.client_remote.run(args=['ip', 'netns', 'list-id'],
                                       stderr=StringIO(),
                                       stdout=StringIO(),
                                       timeout=(5 * 60))
            p = re.search(r"nsid {} ".format(nsid), p.stdout.getvalue())
            if p is None:
                break

            nsid += 1

        self.nsid = nsid

        # Add one new netns and set it id
        args = [
            "sudo", "bash", "-c", "ip netns add {0}".format(self.netns_name)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))
        args = [
            "sudo", "bash", "-c",
            "ip netns set {0} {1}".format(self.netns_name, nsid)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))

        # Get one ip address for netns
        ips = IP(self.ceph_brx_net)
        for ip in ips:
            found = False
            if ip == ips[0]:
                continue
            if ip == ips[-2]:
                raise RuntimeError("we have ran out of the ip addresses")

            for ns in netns_list:
                ns_name = ns.split()[0]
                args = [
                    "sudo", "bash", "-c",
                    "ip netns exec {0} ip addr".format(ns_name)
                ]
                p = self.client_remote.run(args=args,
                                           stderr=StringIO(),
                                           stdout=StringIO(),
                                           timeout=(5 * 60))
                q = re.search("{0}".format(ip), p.stdout.getvalue())
                if q is not None:
                    found = True
                    break

            if not found:
                break

        mask = self.ceph_brx_net.split('/')[1]
        brd = IP(self.ceph_brx_net).broadcast()

        log.info("Setuping the netns '{0}' with {1}/{2}".format(
            self.netns_name, ip, mask))

        # Setup the veth interfaces
        args = [
            "sudo", "bash", "-c",
            "ip link add veth0 netns {0} type veth peer name brx.{1}".format(
                self.netns_name, nsid)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))
        args = [
            "sudo", "bash", "-c",
            "ip netns exec {0} ip addr add {1}/{2} brd {3} dev veth0".format(
                self.netns_name, ip, mask, brd)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))
        args = [
            "sudo", "bash", "-c",
            "ip netns exec {0} ip link set veth0 up".format(self.netns_name)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))
        args = [
            "sudo", "bash", "-c",
            "ip netns exec {0} ip link set lo up".format(self.netns_name)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))

        brxip = IP(self.ceph_brx_net)[-2]
        args = [
            "sudo", "bash", "-c",
            "ip netns exec {0} ip route add default via {1}".format(
                self.netns_name, brxip)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))

        # Bring up the brx interface and join it to 'ceph-brx'
        args = ["sudo", "bash", "-c", "ip link set brx.{0} up".format(nsid)]
        self.client_remote.run(args=args, timeout=(5 * 60))
        args = [
            "sudo", "bash", "-c",
            "ip link set dev brx.{0} master ceph-brx".format(nsid)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))

    def _cleanup_netns(self):
        if self.nsid == -1:
            return
        log.info("Removing the netns '{0}'".format(self.netns_name))

        # Delete the netns and the peer veth interface
        args = [
            "sudo", "bash", "-c", "ip link set brx.{0} down".format(self.nsid)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))
        args = [
            "sudo", "bash", "-c", "ip link delete brx.{0}".format(self.nsid)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))

        args = [
            "sudo", "bash", "-c", "ip netns delete {0}".format(self.netns_name)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))

        self.nsid = -1

    def _cleanup_brx_and_nat(self):
        brx = self.client_remote.run(args=['ip', 'addr'],
                                     stderr=StringIO(),
                                     stdout=StringIO(),
                                     timeout=(5 * 60))
        brx = re.findall(r'inet .* ceph-brx', brx.stdout.getvalue())
        if not brx:
            return

        # If this is the last netns, delete the ceph-brx
        args = ["sudo", "bash", "-c", "ip link show"]
        p = self.client_remote.run(args=args,
                                   stdout=StringIO(),
                                   timeout=(5 * 60))
        _list = re.findall(r'brx\.', p.stdout.getvalue().strip())
        if len(_list) != 0:
            return

        log.info("Removing the 'ceph-brx'")

        args = ["sudo", "bash", "-c", "ip link set ceph-brx down"]
        self.client_remote.run(args=args, timeout=(5 * 60))
        args = ["sudo", "bash", "-c", "ip link delete ceph-brx"]
        self.client_remote.run(args=args, timeout=(5 * 60))

        # Drop the iptables NAT rules
        ip = IP(self.ceph_brx_net)[-2]
        mask = self.ceph_brx_net.split('/')[1]

        p = self.client_remote.run(args=['route'],
                                   stderr=StringIO(),
                                   stdout=StringIO(),
                                   timeout=(5 * 60))
        p = re.findall(r'default .*', p.stdout.getvalue())
        if not p:
            raise RuntimeError("No default gw found")
        gw = p[0].split()[7]
        args = [
            "sudo", "bash", "-c",
            "iptables -D FORWARD -o {0} -i ceph-brx -j ACCEPT".format(gw)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))
        args = [
            "sudo", "bash", "-c",
            "iptables -D FORWARD -i {0} -o ceph-brx -j ACCEPT".format(gw)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))
        args = [
            "sudo", "bash", "-c",
            "iptables -t nat -D POSTROUTING -s {0}/{1} -o {2} -j MASQUERADE".
            format(ip, mask, gw)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))

    def setup_netns(self):
        """
        Setup the netns for the mountpoint.
        """
        log.info("Setting the '{0}' netns for '{1}'".format(
            self._netns_name, self.mountpoint))
        self._setup_brx_and_nat()
        self._setup_netns()

    def cleanup_netns(self):
        """
        Cleanup the netns for the mountpoint.
        """
        log.info("Cleaning the '{0}' netns for '{1}'".format(
            self._netns_name, self.mountpoint))
        self._cleanup_netns()
        self._cleanup_brx_and_nat()

    def suspend_netns(self):
        """
        Suspend the netns veth interface.
        """
        if self.nsid == -1:
            return

        log.info("Suspending the '{0}' netns for '{1}'".format(
            self._netns_name, self.mountpoint))

        args = [
            "sudo", "bash", "-c", "ip link set brx.{0} down".format(self.nsid)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))

    def resume_netns(self):
        """
        Resume the netns veth interface.
        """
        if self.nsid == -1:
            return

        log.info("Resuming the '{0}' netns for '{1}'".format(
            self._netns_name, self.mountpoint))

        args = [
            "sudo", "bash", "-c", "ip link set brx.{0} up".format(self.nsid)
        ]
        self.client_remote.run(args=args, timeout=(5 * 60))

    def mount(self,
              mount_path=None,
              mount_fs_name=None,
              mountpoint=None,
              mount_options=[]):
        raise NotImplementedError()

    def mount_wait(self,
                   mount_path=None,
                   mount_fs_name=None,
                   mountpoint=None,
                   mount_options=[]):
        self.mount(mount_path=mount_path,
                   mount_fs_name=mount_fs_name,
                   mountpoint=mountpoint,
                   mount_options=mount_options)
        self.wait_until_mounted()

    def umount(self):
        raise NotImplementedError()

    def umount_wait(self, force=False, require_clean=False):
        """

        :param force: Expect that the mount will not shutdown cleanly: kill
                      it hard.
        :param require_clean: Wait for the Ceph client associated with the
                              mount (e.g. ceph-fuse) to terminate, and
                              raise if it doesn't do so cleanly.
        :return:
        """
        raise NotImplementedError()

    def kill(self):
        """
        Suspend the netns veth interface to make the client disconnected
        from the ceph cluster
        """
        log.info('Killing connection on {0}...'.format(
            self.client_remote.name))
        self.suspend_netns()

    def kill_cleanup(self):
        """
        Follow up ``kill`` to get to a clean unmounted state.
        """
        log.info('Cleaning up killed connection on {0}'.format(
            self.client_remote.name))
        self.umount_wait(force=True)

    def cleanup(self):
        """
        Remove the mount point.

        Prerequisite: the client is not mounted.
        """
        stderr = StringIO()
        try:
            self.client_remote.run(
                args=[
                    'rmdir',
                    '--',
                    self.mountpoint,
                ],
                cwd=self.test_dir,
                stderr=stderr,
                timeout=(60 * 5),
                check_status=False,
            )
        except CommandFailedError:
            if "No such file or directory" in stderr.getvalue():
                pass
            else:
                raise

        self.cleanup_netns()

    def wait_until_mounted(self):
        raise NotImplementedError()

    def get_keyring_path(self):
        return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id)

    @property
    def config_path(self):
        """
        Path to ceph.conf: override this if you're not a normal systemwide ceph install
        :return: str
        """
        return "/etc/ceph/ceph.conf"

    @contextmanager
    def mounted_wait(self):
        """
        A context manager, from an initially unmounted state, to mount
        this, yield, and then unmount and clean up.
        """
        self.mount()
        self.wait_until_mounted()
        try:
            yield
        finally:
            self.umount_wait()
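
    # Usage sketch for mounted_wait(), assuming `mount` is an instance of a
    # concrete subclass (e.g. a FuseMount):
    #
    #     with mount.mounted_wait():
    #         mount.run_shell(['ls', '-l'])
    #
    # Leaving the block unmounts the client via umount_wait().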

    def is_blacklisted(self):
        addr = self.get_global_addr()
        blacklist = json.loads(
            self.fs.mon_manager.raw_cluster_cmd("osd", "blacklist", "ls",
                                                "--format=json"))
        for b in blacklist:
            if addr == b["addr"]:
                return True
        return False

    def create_file(self,
                    filename='testfile',
                    dirname=None,
                    user=None,
                    check_status=True):
        assert (self.is_mounted())

        if not os.path.isabs(filename):
            if dirname:
                if os.path.isabs(dirname):
                    path = os.path.join(dirname, filename)
                else:
                    path = os.path.join(self.mountpoint, dirname, filename)
            else:
                path = os.path.join(self.mountpoint, filename)
        else:
            path = filename

        if user:
            args = [
                'sudo', '-u', user, '-s', '/bin/bash', '-c', 'touch ' + path
            ]
        else:
            args = 'touch ' + path

        return self.client_remote.run(args=args, check_status=check_status)

    def create_files(self):
        assert (self.is_mounted())

        for suffix in self.test_files:
            log.info("Creating file {0}".format(suffix))
            self.client_remote.run(
                args=['sudo', 'touch',
                      os.path.join(self.mountpoint, suffix)])

    def test_create_file(self,
                         filename='testfile',
                         dirname=None,
                         user=None,
                         check_status=True):
        return self.create_file(filename=filename,
                                dirname=dirname,
                                user=user,
                                check_status=False)

    def check_files(self):
        assert (self.is_mounted())

        for suffix in self.test_files:
            log.info("Checking file {0}".format(suffix))
            r = self.client_remote.run(
                args=['sudo', 'ls',
                      os.path.join(self.mountpoint, suffix)],
                check_status=False)
            if r.exitstatus != 0:
                raise RuntimeError(
                    "Expected file {0} not found".format(suffix))

    def create_destroy(self):
        assert (self.is_mounted())

        filename = "{0} {1}".format(datetime.datetime.now(), self.client_id)
        log.debug("Creating test file {0}".format(filename))
        self.client_remote.run(
            args=['sudo', 'touch',
                  os.path.join(self.mountpoint, filename)])
        log.debug("Deleting test file {0}".format(filename))
        self.client_remote.run(
            args=['sudo', 'rm', '-f',
                  os.path.join(self.mountpoint, filename)])

    def _run_python(self, pyscript, py_version='python3'):
        return self.client_remote.run(args=[
            'sudo', 'adjust-ulimits', 'daemon-helper', 'kill', py_version,
            '-c', pyscript
        ],
                                      wait=False,
                                      stdin=run.PIPE,
                                      stdout=StringIO())

    def run_python(self, pyscript, py_version='python3'):
        p = self._run_python(pyscript, py_version)
        p.wait()
        return six.ensure_str(p.stdout.getvalue().strip())

    def run_shell(self,
                  args,
                  wait=True,
                  stdin=None,
                  check_status=True,
                  cwd=None,
                  omit_sudo=True):
        args = args.split() if isinstance(args, str) else args
        # XXX: all commands run with the CephFS mount as CWD must be executed with
        #  superuser privileges when tests are being run using teuthology.
        if args[0] != 'sudo':
            args.insert(0, 'sudo')
        if not cwd:
            cwd = self.mountpoint

        return self.client_remote.run(args=args,
                                      stdin=stdin,
                                      wait=wait,
                                      stdout=StringIO(),
                                      stderr=StringIO(),
                                      cwd=cwd,
                                      check_status=check_status)

    def run_as_user(self,
                    args,
                    user,
                    wait=True,
                    stdin=None,
                    check_status=True,
                    cwd=None):
        if isinstance(args, str):
            args = 'sudo -u %s -s /bin/bash -c %s' % (user, args)
        elif isinstance(args, list):
            cmdlist = args
            cmd = ''
            for i in cmdlist:
                cmd = cmd + i + ' '
            args = ['sudo', '-u', user, '-s', '/bin/bash', '-c']
            args.append(cmd)
        if not cwd:
            cwd = self.mountpoint

        return self.client_remote.run(args=args,
                                      wait=wait,
                                      stdin=stdin,
                                      stdout=StringIO(),
                                      stderr=StringIO(),
                                      check_status=check_status,
                                      cwd=cwd)

    def run_as_root(self,
                    args,
                    wait=True,
                    stdin=None,
                    check_status=True,
                    cwd=None):
        if isinstance(args, str):
            args = 'sudo ' + args
        if isinstance(args, list):
            args.insert(0, 'sudo')
        if not cwd:
            cwd = self.mountpoint

        return self.client_remote.run(args=args,
                                      wait=wait,
                                      stdin=stdin,
                                      stdout=StringIO(),
                                      stderr=StringIO(),
                                      check_status=check_status,
                                      cwd=cwd)

    def _verify(self, proc, retval=None, errmsg=None):
        if retval:
            msg = ('expected return value: {}\nreceived return value: '
                   '{}\n'.format(retval, proc.returncode))
            assert proc.returncode == retval, msg

        if errmsg:
            stderr = proc.stderr.getvalue().lower()
            msg = ('didn\'t find given string in stderr -\nexpected string: '
                   '{}\nreceived error message: {}\nnote: received error '
                   'message is converted to lowercase'.format(errmsg, stderr))
            assert errmsg in stderr, msg

    def negtestcmd(self,
                   args,
                   retval=None,
                   errmsg=None,
                   stdin=None,
                   cwd=None,
                   wait=True):
        """
        Conduct a negative test for the given command.

        retval and errmsg are parameters to confirm the cause of command
        failure.
        """
        proc = self.run_shell(args=args,
                              wait=wait,
                              stdin=stdin,
                              cwd=cwd,
                              check_status=False)
        self._verify(proc, retval, errmsg)
        return proc

    def negtestcmd_as_user(self,
                           args,
                           user,
                           retval=None,
                           errmsg=None,
                           stdin=None,
                           cwd=None,
                           wait=True):
        proc = self.run_as_user(args=args,
                                user=user,
                                wait=wait,
                                stdin=stdin,
                                cwd=cwd,
                                check_status=False)
        self._verify(proc, retval, errmsg)
        return proc

    def negtestcmd_as_root(self,
                           args,
                           retval=None,
                           errmsg=None,
                           stdin=None,
                           cwd=None,
                           wait=True):
        proc = self.run_as_root(args=args,
                                wait=wait,
                                stdin=stdin,
                                cwd=cwd,
                                check_status=False)
        self._verify(proc, retval, errmsg)
        return proc

    def open_no_data(self, basename):
        """
        A pure metadata operation
        """
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        p = self._run_python(
            dedent("""
            f = open("{path}", 'w')
            """.format(path=path)))
        p.wait()

    def open_background(self, basename="background_file", write=True):
        """
        Open a file for writing, then block such that the client
        will hold a capability.

        Don't return until the remote process has got as far as opening
        the file, then return the RemoteProcess instance.
        """
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        if write:
            pyscript = dedent("""
                import time

                with open("{path}", 'w') as f:
                    f.write('content')
                    f.flush()
                    f.write('content2')
                    while True:
                        time.sleep(1)
                """).format(path=path)
        else:
            pyscript = dedent("""
                import time

                with open("{path}", 'r') as f:
                    while True:
                        time.sleep(1)
                """).format(path=path)

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)

        # This wait would not be sufficient if the file had already
        # existed, but it's simple and in practice users of open_background
        # are not using it on existing files.
        self.wait_for_visible(basename)

        return rproc
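
    # One possible way to combine open_background() with the cleanup helpers
    # defined below (illustrative sketch only):
    #
    #     rproc = mount.open_background('background_file')
    #     # ... exercise the cluster while the client holds the capability ...
    #     mount.kill_background(rproc)   # or mount.teardown() to reap them all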

    def wait_for_dir_empty(self, dirname, timeout=30):
        i = 0
        dirpath = os.path.join(self.mountpoint, dirname)
        while i < timeout:
            nr_entries = int(self.getfattr(dirpath, "ceph.dir.entries"))
            if nr_entries == 0:
                log.debug(
                    "Directory {0} seen empty from {1} after {2}s ".format(
                        dirname, self.client_id, i))
                return
            else:
                time.sleep(1)
                i += 1

        raise RuntimeError(
            "Timed out after {0}s waiting for {1} to become empty from {2}".
            format(i, dirname, self.client_id))

    def wait_for_visible(self, basename="background_file", timeout=30):
        i = 0
        while i < timeout:
            r = self.client_remote.run(
                args=['sudo', 'ls',
                      os.path.join(self.mountpoint, basename)],
                check_status=False)
            if r.exitstatus == 0:
                log.debug("File {0} became visible from {1} after {2}s".format(
                    basename, self.client_id, i))
                return
            else:
                time.sleep(1)
                i += 1

        raise RuntimeError(
            "Timed out after {0}s waiting for {1} to become visible from {2}".
            format(i, basename, self.client_id))

    def lock_background(self, basename="background_file", do_flock=True):
        """
        Open and lock files for writing, holding the lock in a background process
        """
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        script_builder = """
            import time
            import fcntl
            import struct"""
        if do_flock:
            script_builder += """
            f1 = open("{path}-1", 'w')
            fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)"""
        script_builder += """
            f2 = open("{path}-2", 'w')
            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            while True:
                time.sleep(1)
            """

        pyscript = dedent(script_builder).format(path=path)

        log.info("lock_background file {0}".format(basename))
        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def lock_and_release(self, basename="background_file"):
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        script = """
            import time
            import fcntl
            import struct
            f1 = open("{path}-1", 'w')
            fcntl.flock(f1, fcntl.LOCK_EX)
            f2 = open("{path}-2", 'w')
            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            """
        pyscript = dedent(script).format(path=path)

        log.info("lock_and_release file {0}".format(basename))
        return self._run_python(pyscript)

    def check_filelock(self, basename="background_file", do_flock=True):
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        script_builder = """
            import fcntl
            import errno
            import struct"""
        if do_flock:
            script_builder += """
            f1 = open("{path}-1", 'r')
            try:
                fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)
            except IOError as e:
                if e.errno == errno.EAGAIN:
                    pass
            else:
                raise RuntimeError("flock on file {path}-1 not found")"""
        script_builder += """
            f2 = open("{path}-2", 'r')
            try:
                lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
                fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            except IOError as e:
                if e.errno == errno.EAGAIN:
                    pass
            else:
                raise RuntimeError("posix lock on file {path}-2 not found")
            """
        pyscript = dedent(script_builder).format(path=path)

        log.info("check lock on file {0}".format(basename))
        self.client_remote.run(args=['sudo', 'python3', '-c', pyscript])

    def write_background(self, basename="background_file", loop=False):
        """
        Open a file for writing, complete as soon as you can
        :param basename:
        :return:
        """
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        pyscript = dedent("""
            import os
            import time

            fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0o644)
            try:
                while True:
                    os.write(fd, b'content')
                    time.sleep(1)
                    if not {loop}:
                        break
            except IOError as e:
                pass
            os.close(fd)
            """).format(path=path, loop=str(loop))

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def write_n_mb(self, filename, n_mb, seek=0, wait=True):
        """
        Write the requested number of megabytes to a file
        """
        assert (self.is_mounted())

        return self.run_shell([
            "dd", "if=/dev/urandom", "of={0}".format(filename), "bs=1M",
            "conv=fdatasync", "count={0}".format(int(n_mb)), "seek={0}".format(
                int(seek))
        ],
                              wait=wait)

    def write_test_pattern(self, filename, size):
        log.info("Writing {0} bytes to {1}".format(size, filename))
        return self.run_python(
            dedent("""
            import zlib
            path = "{path}"
            with open(path, 'w') as f:
                for i in range(0, {size}):
                    val = zlib.crc32(str(i).encode('utf-8')) & 7
                    f.write(chr(val))
        """.format(path=os.path.join(self.mountpoint, filename), size=size)))

    def validate_test_pattern(self, filename, size):
        log.info("Validating {0} bytes from {1}".format(size, filename))
        return self.run_python(
            dedent("""
            import zlib
            path = "{path}"
            with open(path, 'r') as f:
                bytes = f.read()
            if len(bytes) != {size}:
                raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format(
                    len(bytes), {size}
                ))
            for i, b in enumerate(bytes):
                val = zlib.crc32(str(i).encode('utf-8')) & 7
                if b != chr(val):
                    raise RuntimeError("Bad data at offset {{0}}".format(i))
        """.format(path=os.path.join(self.mountpoint, filename), size=size)))

    def open_n_background(self, fs_path, count):
        """
        Open N files for writing, hold them open in a background process

        :param fs_path: Path relative to CephFS root, e.g. "foo/bar"
        :return: a RemoteProcess
        """
        assert (self.is_mounted())

        abs_path = os.path.join(self.mountpoint, fs_path)

        pyscript = dedent("""
            import sys
            import time
            import os

            n = {count}
            abs_path = "{abs_path}"

            if not os.path.exists(os.path.dirname(abs_path)):
                os.makedirs(os.path.dirname(abs_path))

            handles = []
            for i in range(0, n):
                fname = "{{0}}_{{1}}".format(abs_path, i)
                handles.append(open(fname, 'w'))

            while True:
                time.sleep(1)
            """).format(abs_path=abs_path, count=count)

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def create_n_files(self, fs_path, count, sync=False):
        assert (self.is_mounted())

        abs_path = os.path.join(self.mountpoint, fs_path)

        pyscript = dedent("""
            import sys
            import time
            import os

            n = {count}
            abs_path = "{abs_path}"

            if not os.path.exists(os.path.dirname(abs_path)):
                os.makedirs(os.path.dirname(abs_path))

            for i in range(0, n):
                fname = "{{0}}_{{1}}".format(abs_path, i)
                with open(fname, 'w') as f:
                    f.write('content')
                    if {sync}:
                        f.flush()
                        os.fsync(f.fileno())
            """).format(abs_path=abs_path, count=count, sync=str(sync))

        self.run_python(pyscript)

    def teardown(self):
        for p in self.background_procs:
            log.info("Terminating background process")
            self._kill_background(p)

        self.background_procs = []

    def _kill_background(self, p):
        if p.stdin:
            p.stdin.close()
            try:
                p.wait()
            except (CommandFailedError, ConnectionLostError):
                pass

    def kill_background(self, p):
        """
        For a process that was returned by one of the _background member functions,
        kill it hard.
        """
        self._kill_background(p)
        self.background_procs.remove(p)

    def send_signal(self, signal):
        signal = signal.lower()
        if signal not in ['sigstop', 'sigcont', 'sigterm', 'sigkill']:
            raise NotImplementedError

        self.client_remote.run(
            args=['sudo', 'kill', '-{0}'.format(signal), self.client_pid],
            omit_sudo=False)

    def get_global_id(self):
        raise NotImplementedError()

    def get_global_inst(self):
        raise NotImplementedError()

    def get_global_addr(self):
        raise NotImplementedError()

    def get_osd_epoch(self):
        raise NotImplementedError()

    def lstat(self, fs_path, follow_symlinks=False, wait=True):
        return self.stat(fs_path, follow_symlinks=False, wait=wait)

    def stat(self, fs_path, follow_symlinks=True, wait=True):
        """
        stat a file, and return the result as a dictionary like this:
        {
          "st_ctime": 1414161137.0,
          "st_mtime": 1414161137.0,
          "st_nlink": 33,
          "st_gid": 0,
          "st_dev": 16777218,
          "st_size": 1190,
          "st_ino": 2,
          "st_uid": 0,
          "st_mode": 16877,
          "st_atime": 1431520593.0
        }

        Raises exception on absent file.
        """
        abs_path = os.path.join(self.mountpoint, fs_path)
        if follow_symlinks:
            stat_call = "os.stat('" + abs_path + "')"
        else:
            stat_call = "os.lstat('" + abs_path + "')"

        pyscript = dedent("""
            import os
            import stat
            import json
            import sys

            try:
                s = {stat_call}
            except OSError as e:
                sys.exit(e.errno)

            attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"]
            print(json.dumps(
                dict([(a, getattr(s, a)) for a in attrs]),
                indent=2))
            """).format(stat_call=stat_call)
        proc = self._run_python(pyscript)
        if wait:
            proc.wait()
            return json.loads(proc.stdout.getvalue().strip())
        else:
            return proc

    def touch(self, fs_path):
        """
        Create a dentry if it doesn't already exist.  This python
        implementation exists because the usual command line tool doesn't
        pass through error codes like EIO.

        :param fs_path:
        :return:
        """
        abs_path = os.path.join(self.mountpoint, fs_path)
        pyscript = dedent("""
            import sys
            import errno

            try:
                f = open("{path}", "w")
                f.close()
            except IOError as e:
                sys.exit(errno.EIO)
            """).format(path=abs_path)
        proc = self._run_python(pyscript)
        proc.wait()

    def path_to_ino(self, fs_path, follow_symlinks=True):
        abs_path = os.path.join(self.mountpoint, fs_path)

        if follow_symlinks:
            pyscript = dedent("""
                import os
                import stat

                print(os.stat("{path}").st_ino)
                """).format(path=abs_path)
        else:
            pyscript = dedent("""
                import os
                import stat

                print(os.lstat("{path}").st_ino)
                """).format(path=abs_path)

        proc = self._run_python(pyscript)
        proc.wait()
        return int(proc.stdout.getvalue().strip())

    def path_to_nlink(self, fs_path):
        abs_path = os.path.join(self.mountpoint, fs_path)

        pyscript = dedent("""
            import os
            import stat

            print(os.stat("{path}").st_nlink)
            """).format(path=abs_path)

        proc = self._run_python(pyscript)
        proc.wait()
        return int(proc.stdout.getvalue().strip())

    def ls(self, path=None):
        """
        Wrap ls: return a list of strings
        """
        cmd = ["ls"]
        if path:
            cmd.append(path)

        ls_text = self.run_shell(cmd).stdout.getvalue().strip()

        if ls_text:
            return ls_text.split("\n")
        else:
            # Special case because otherwise split on empty string
            # gives you [''] instead of []
            return []

    def setfattr(self, path, key, val):
        """
        Wrap setfattr.

        :param path: relative to mount point
        :param key: xattr name
        :param val: xattr value
        :return: None
        """
        self.run_shell(["setfattr", "-n", key, "-v", val, path])

    def getfattr(self, path, attr):
        """
        Wrap getfattr: return the values of a named xattr on one file, or
        None if the attribute is not found.

        :return: a string
        """
        p = self.run_shell(["getfattr", "--only-values", "-n", attr, path],
                           wait=False)
        try:
            p.wait()
        except CommandFailedError as e:
            if e.exitstatus == 1 and "No such attribute" in p.stderr.getvalue(
            ):
                return None
            else:
                raise

        return str(p.stdout.getvalue())

    def df(self):
        """
        Wrap df: return a dict of usage fields in bytes
        """

        p = self.run_shell(["df", "-B1", "."])
        lines = p.stdout.getvalue().strip().split("\n")
        fs, total, used, avail = lines[1].split()[:4]
        log.warning(lines)

        return {
            "total": int(total),
            "used": int(used),
            "available": int(avail)
        }
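
# A minimal sketch of a concrete mount type: the only pieces CephFSMount leaves
# unimplemented are mount(), umount(), umount_wait(), wait_until_mounted() and
# the get_global_*/get_osd_epoch() helpers. The class name and the commands
# below are illustrative assumptions, not the real FuseMount/KernelMount
# implementations.
class TrivialLocalMount(CephFSMount):
    def mount(self, mount_path=None, mount_fs_name=None, mountpoint=None,
              mount_options=[]):
        # A real implementation would invoke ceph-fuse or mount.ceph here.
        self.client_remote.run(args=['mkdir', '-p', self.mountpoint])
        self.mounted = True

    def wait_until_mounted(self):
        # Nothing asynchronous happens in this sketch.
        assert self.mounted

    def umount(self):
        self.mounted = False

    def umount_wait(self, force=False, require_clean=False):
        self.umount()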
Exemple #33
0
def build_ceph_cluster(ctx, config):
    """Build a ceph cluster"""

    # Expect to find ceph_admin on the first mon by ID, same place that the download task
    # puts it.  Remember this here, because subsequently IDs will change from those in
    # the test config to those that ceph-deploy invents.
    (ceph_admin,) = ctx.cluster.only(teuthology.get_first_mon(ctx, config)).remotes.iterkeys()

    def execute_ceph_deploy(cmd):
        """Remotely execute a ceph_deploy command"""
        return ceph_admin.run(
            args=["cd", "{tdir}/ceph-deploy".format(tdir=testdir), run.Raw("&&"), run.Raw(cmd)], check_status=False
        ).exitstatus

    try:
        log.info("Building ceph cluster using ceph-deploy...")
        testdir = teuthology.get_testdir(ctx)
        ceph_branch = None
        if config.get("branch") is not None:
            cbranch = config.get("branch")
            for var, val in cbranch.iteritems():
                ceph_branch = "--{var}={val}".format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_role(ctx, "mds")
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_role(ctx, "mon")
        mon_nodes = " ".join(mon_node)
        new_mon = "./ceph-deploy new" + " " + mon_nodes
        mon_hostname = mon_nodes.split(" ")[0]
        mon_hostname = str(mon_hostname)
        gather_keys = "./ceph-deploy gatherkeys" + " " + mon_hostname
        deploy_mds = "./ceph-deploy mds create" + " " + mds_nodes
        no_of_osds = 0

        if mon_nodes is None:
            raise RuntimeError("no monitor nodes in the config file")

        estatus_new = execute_ceph_deploy(new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info("adding config inputs...")
        testdir = teuthology.get_testdir(ctx)
        conf_path = "{tdir}/ceph-deploy/ceph.conf".format(tdir=testdir)

        if config.get("conf") is not None:
            confp = config.get("conf")
            for section, keys in confp.iteritems():
                lines = "[{section}]\n".format(section=section)
                teuthology.append_lines_to_file(ceph_admin, conf_path, lines, sudo=True)
                for key, value in keys.iteritems():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = "{key} = {value}\n".format(key=key, value=value)
                    teuthology.append_lines_to_file(ceph_admin, conf_path, lines, sudo=True)

        # install ceph
        install_nodes = "./ceph-deploy install " + (ceph_branch if ceph_branch else "--dev=master") + " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")
        # install ceph-test package too
        install_nodes2 = (
            "./ceph-deploy install --tests " + (ceph_branch if ceph_branch else "--dev=master") + " " + all_nodes
        )
        estatus_install = execute_ceph_deploy(install_nodes2)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph-test")

        mon_create_nodes = "./ceph-deploy mon create-initial"
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum, so let's
        # try the next block which will wait up to 15 minutes to gatherkeys.
        execute_ceph_deploy(mon_create_nodes)

        estatus_gather = execute_ceph_deploy(gather_keys)
        max_gather_tries = 90
        gather_tries = 0
        while estatus_gather != 0:
            gather_tries += 1
            if gather_tries >= max_gather_tries:
                msg = "ceph-deploy was not able to gatherkeys after 15 minutes"
                raise RuntimeError(msg)
            estatus_gather = execute_ceph_deploy(gather_keys)
            time.sleep(10)

        if mds_nodes:
            estatus_mds = execute_ceph_deploy(deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get("test_mon_destroy") is not None:
            for d in range(1, len(mon_node)):
                mon_destroy_nodes = "./ceph-deploy mon destroy" + " " + mon_node[d]
                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        node_dev_list = get_dev_for_osd(ctx, config)
        for d in node_dev_list:
            node = d[0]
            for disk in d[1:]:
                zap = "./ceph-deploy disk zap " + node + ":" + disk
                estatus = execute_ceph_deploy(zap)
                if estatus != 0:
                    raise RuntimeError("ceph-deploy: Failed to zap osds")
            osd_create_cmd = "./ceph-deploy osd create "
            if config.get("dmcrypt") is not None:
                osd_create_cmd += "--dmcrypt "
            osd_create_cmd += ":".join(d)
            estatus_osd = execute_ceph_deploy(osd_create_cmd)
            if estatus_osd == 0:
                log.info("successfully created osd")
                no_of_osds += 1
            else:
                raise RuntimeError("ceph-deploy: Failed to create osds")

        if config.get("wait-for-healthy", True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info("Setting up client nodes...")
            conf_path = "/etc/ceph/ceph.conf"
            admin_keyring_path = "/etc/ceph/ceph.client.admin.keyring"
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(remote=mon0_remote, path=conf_path, sudo=True)
            admin_keyring = teuthology.get_file(remote=mon0_remote, path=admin_keyring_path, sudo=True)

            clients = ctx.cluster.only(teuthology.is_type("client"))
            for remot, roles_for_host in clients.remotes.iteritems():
                for id_ in teuthology.roles_of_type(roles_for_host, "client"):
                    client_keyring = "/etc/ceph/ceph.client.{id}.keyring".format(id=id_)
                    mon0_remote.run(
                        args=[
                            "cd",
                            "{tdir}".format(tdir=testdir),
                            run.Raw("&&"),
                            "sudo",
                            "bash",
                            "-c",
                            run.Raw('"'),
                            "ceph",
                            "auth",
                            "get-or-create",
                            "client.{id}".format(id=id_),
                            "mds",
                            "allow",
                            "mon",
                            "allow *",
                            "osd",
                            "allow *",
                            run.Raw(">"),
                            client_keyring,
                            run.Raw('"'),
                        ]
                    )
                    key_data = teuthology.get_file(remote=mon0_remote, path=client_keyring, sudo=True)
                    teuthology.sudo_write_file(remote=remot, path=client_keyring, data=key_data, perms="0644")
                    teuthology.sudo_write_file(remote=remot, path=admin_keyring_path, data=admin_keyring, perms="0644")
                    teuthology.sudo_write_file(remote=remot, path=conf_path, data=conf_data, perms="0644")

            if mds_nodes:
                log.info("Configuring CephFS...")
                ceph_fs = Filesystem(ctx, admin_remote=clients.remotes.keys()[0])
                if not ceph_fs.legacy_configured():
                    ceph_fs.create()
        elif not config.get("only_mon"):
            raise RuntimeError("The cluster is NOT operational due to insufficient OSDs")
        yield

    except Exception:
        log.info("Error encountered, logging exception before tearing down ceph-deploy")
        log.info(traceback.format_exc())
        raise
    finally:
        if config.get("keep_running"):
            return
        log.info("Stopping ceph...")
        ctx.cluster.run(
            args=[
                "sudo",
                "stop",
                "ceph-all",
                run.Raw("||"),
                "sudo",
                "service",
                "ceph",
                "stop",
                run.Raw("||"),
                "sudo",
                "systemctl",
                "stop",
                "ceph.target",
            ]
        )

        # Are you really not running anymore?
        # try first with the init tooling
        # ignoring the status so this becomes informational only
        ctx.cluster.run(
            args=[
                "sudo",
                "status",
                "ceph-all",
                run.Raw("||"),
                "sudo",
                "service",
                "ceph",
                "status",
                run.Raw("||"),
                "sudo",
                "systemctl",
                "status",
                "ceph.target",
            ],
            check_status=False,
        )

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(
            args=["sudo", "ps", "aux", run.Raw("|"), "grep", "-v", "grep", run.Raw("|"), "grep", "ceph"],
            check_status=False,
        )

        if ctx.archive is not None:
            # archive mon data, too
            log.info("Archiving mon data...")
            path = os.path.join(ctx.archive, "data")
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type("mon"))
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith("mon."):
                        teuthology.pull_directory_tarball(remote, "/var/lib/ceph/mon", path + "/" + role + ".tgz")

            log.info("Compressing logs...")
            run.wait(
                ctx.cluster.run(
                    args=[
                        "sudo",
                        "find",
                        "/var/log/ceph",
                        "-name",
                        "*.log",
                        "-print0",
                        run.Raw("|"),
                        "sudo",
                        "xargs",
                        "-0",
                        "--no-run-if-empty",
                        "--",
                        "gzip",
                        "--",
                    ],
                    wait=False,
                )
            )

            log.info("Archiving logs...")
            path = os.path.join(ctx.archive, "remote")
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, "/var/log/ceph", os.path.join(sub, "log"))

        # Prevent these from being undefined if the try block fails
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = "./ceph-deploy purge" + " " + all_nodes
        purgedata_nodes = "./ceph-deploy purgedata" + " " + all_nodes

        log.info("Purging package...")
        execute_ceph_deploy(purge_nodes)
        log.info("Purging data...")
        execute_ceph_deploy(purgedata_nodes)
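
# A sketch of the configuration keys build_ceph_cluster() consults above; the
# values shown are illustrative assumptions, not defaults.
example_ceph_deploy_config = {
    'branch': {'dev': 'master'},   # turned into "--dev=master" for ceph-deploy install
    'conf': {                      # extra ceph.conf entries appended on the admin node
        'global': {'osd pool default size': 2},
    },
    'dmcrypt': True,               # adds --dmcrypt to the "osd create" command
    # 'test_mon_destroy': True,    # if present, destroys every monitor but the first
    'wait-for-healthy': True,      # wait for health once at least two OSDs are created
    'only_mon': False,             # when true, tolerate having too few OSDs
    'keep_running': False,         # when true, skip the teardown in the finally block
}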
Exemple #34
0
class MDSThrasher(Greenlet):
    """
    MDSThrasher::

    The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).

    The config is optional.  Many of the config parameters are a maximum value
    to use when selecting a random value from a range.  To always use the maximum
    value, set randomize to false.  The config is a dict containing some or all of:

    seed: [no default] seed the random number generator

    randomize: [default: true] enables randomization; when false, the max/min values are used directly

    max_thrash: [default: 1] the maximum number of MDSs that will be thrashed at
      any given time.

    max_thrash_delay: [default: 30] maximum number of seconds to delay before
      thrashing again.

    max_revive_delay: [default: 10] maximum number of seconds to delay before
      bringing back a thrashed MDS

    thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
      during replay.  Value should be between 0.0 and 1.0

    max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
      the replay state before thrashing

    thrash_weights: allows specific MDSs to be thrashed more/less frequently.  This option
      overrides anything specified by max_thrash.  This option is a dict containing
      mds.x: weight pairs.  For example, [mds.a: 0.7, mds.b: 0.3, mds.c: 0.0].  Each weight
      is a value from 0.0 to 1.0.  Any MDSs not specified will be automatically
      given a weight of 0.0.  For a given MDS, by default the trasher delays for up
      to max_thrash_delay, trashes, waits for the MDS to recover, and iterates.  If a non-zero
      weight is specified for an MDS, for each iteration the thrasher chooses whether to thrash
      during that iteration based on a random value [0-1] not exceeding the weight of that MDS.

    Examples::


      The following example sets the likelihood that mds.a will be thrashed
      to 80%, mds.b to 20%, and other MDSs will not be thrashed.  It also sets the
      likelihood that an MDS will be thrashed in replay to 40%.
      Thrash weights do not have to sum to 1.

      tasks:
      - ceph:
      - mds_thrash:
          thrash_weights:
            - mds.a: 0.8
            - mds.b: 0.2
          thrash_in_replay: 0.4
      - ceph-fuse:
      - workunit:
          clients:
            all: [suites/fsx.sh]

      The following example disables randomization, and uses the max delay values:

      tasks:
      - ceph:
      - mds_thrash:
          max_thrash_delay: 10
          max_revive_delay: 1
          max_replay_thrash_delay: 4
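
      A further combination (values are only illustrative), using the seed,
      max_thrash and max_revive_delay options described above:

      tasks:
      - ceph:
      - mds_thrash:
          seed: 31337
          max_thrash: 2
          max_revive_delay: 20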

    """
    def __init__(self, ctx, manager, mds_cluster, config, logger,
                 failure_group, weight):
        super(MDSThrasher, self).__init__()

        self.ctx = ctx
        self.manager = manager
        assert self.manager.is_clean()
        self.mds_cluster = mds_cluster

        self.stopping = Event()
        self.logger = logger
        self.config = config

        self.randomize = bool(self.config.get('randomize', True))
        self.max_thrash_delay = float(self.config.get('thrash_delay', 30.0))
        self.thrash_in_replay = float(
            self.config.get('thrash_in_replay', False))
        assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(
            v=self.thrash_in_replay)

        self.max_replay_thrash_delay = float(
            self.config.get('max_replay_thrash_delay', 4.0))

        self.max_revive_delay = float(self.config.get('max_revive_delay',
                                                      10.0))

        self.failure_group = failure_group
        self.weight = weight

        # TODO support multiple filesystems: will require behavioural change to select
        # which filesystem to act on when doing rank-ish things
        self.fs = Filesystem(self.ctx)

    def _run(self):
        try:
            self.do_thrash()
        except:
            # Log exceptions here so we get the full backtrace (it's lost
            # by the time someone does a .get() on this greenlet)
            self.logger.exception("Exception in do_thrash:")
            raise

    def log(self, x):
        """Write data to logger assigned to this MDThrasher"""
        self.logger.info(x)

    def stop(self):
        self.stopping.set()

    def kill_mds(self, mds):
        if self.config.get('powercycle'):
            (remote, ) = (self.ctx.cluster.only(
                'mds.{m}'.format(m=mds)).remotes.iterkeys())
            self.log('kill_mds on mds.{m} doing powercycle of {s}'.format(
                m=mds, s=remote.name))
            self._assert_ipmi(remote)
            remote.console.power_off()
        else:
            self.ctx.daemons.get_daemon('mds', mds).stop()

    @staticmethod
    def _assert_ipmi(remote):
        assert remote.console.has_ipmi_credentials, (
            "powercycling requested but RemoteConsole is not "
            "initialized.  Check ipmi config.")

    def kill_mds_by_rank(self, rank):
        """
        kill_mds wrapper to kill based on rank passed.
        """
        status = self.mds_cluster.get_mds_info_by_rank(rank)
        self.kill_mds(status['name'])

    def revive_mds(self, mds, standby_for_rank=None):
        """
        Revive mds -- do an IPMI powercycle (if indicated by the config)
        and then restart (using --hot-standby if specified).
        """
        if self.config.get('powercycle'):
            (remote, ) = (self.ctx.cluster.only(
                'mds.{m}'.format(m=mds)).remotes.iterkeys())
            self.log('revive_mds on mds.{m} doing powercycle of {s}'.format(
                m=mds, s=remote.name))
            self._assert_ipmi(remote)
            remote.console.power_on()
            self.manager.make_admin_daemon_dir(self.ctx, remote)
        args = []
        if standby_for_rank:
            args.extend(['--hot-standby', standby_for_rank])
        self.ctx.daemons.get_daemon('mds', mds).restart(*args)

    def revive_mds_by_rank(self, rank, standby_for_rank=None):
        """
        revive_mds wrapper to revive based on rank passed.
        """
        status = self.mds_cluster.get_mds_info_by_rank(rank)
        self.revive_mds(status['name'], standby_for_rank)

    def get_mds_status_all(self):
        return self.fs.get_mds_map()

    def do_thrash(self):
        """
        Perform the random thrashing action
        """

        self.log('starting mds_do_thrash for failure group: ' + ', '.join(
            ['mds.{_id}'.format(_id=_f) for _f in self.failure_group]))
        while not self.stopping.is_set():
            delay = self.max_thrash_delay
            if self.randomize:
                delay = random.randrange(0.0, self.max_thrash_delay)

            if delay > 0.0:
                self.log('waiting for {delay} secs before thrashing'.format(
                    delay=delay))
                self.stopping.wait(delay)
                if self.stopping.is_set():
                    continue

            # random.randrange() only yields integers; use uniform() to get a
            # float in [0.0, 1.0) for the weighted skip decision
            skip = random.uniform(0.0, 1.0)
            if self.weight < 1.0 and skip > self.weight:
                self.log(
                    'skipping thrash iteration with skip ({skip}) > weight ({weight})'
                    .format(skip=skip, weight=self.weight))
                continue

            # find the active mds in the failure group
            statuses = [
                self.mds_cluster.get_mds_info(m) for m in self.failure_group
            ]
            actives = filter(lambda s: s and s['state'] == 'up:active',
                             statuses)
            assert len(
                actives) == 1, 'Can only have one active in a failure group'

            active_mds = actives[0]['name']
            active_rank = actives[0]['rank']

            self.log('kill mds.{id} (rank={r})'.format(id=active_mds,
                                                       r=active_rank))
            self.kill_mds_by_rank(active_rank)

            # wait for mon to report killed mds as crashed
            last_laggy_since = None
            itercount = 0
            while True:
                failed = self.fs.get_mds_map()['failed']
                status = self.mds_cluster.get_mds_info(active_mds)
                if not status:
                    break
                if 'laggy_since' in status:
                    last_laggy_since = status['laggy_since']
                    break
                if any([(f == active_mds) for f in failed]):
                    break
                self.log(
                    'waiting till mds map indicates mds.{_id} is laggy/crashed, in failed state, or mds.{_id} is removed from mdsmap'
                    .format(_id=active_mds))
                itercount = itercount + 1
                if itercount > 10:
                    self.log('mds map: {status}'.format(
                        status=self.mds_cluster.get_fs_map()))
                time.sleep(2)
            if last_laggy_since:
                self.log(
                    'mds.{_id} reported laggy/crashed since: {since}'.format(
                        _id=active_mds, since=last_laggy_since))
            else:
                self.log('mds.{_id} down, removed from mdsmap'.format(
                    _id=active_mds))

            # wait for a standby mds to takeover and become active
            takeover_mds = None
            takeover_rank = None
            itercount = 0
            while True:
                statuses = [
                    self.mds_cluster.get_mds_info(m)
                    for m in self.failure_group
                ]
                actives = filter(lambda s: s and s['state'] == 'up:active',
                                 statuses)
                if len(actives) > 0:
                    assert len(
                        actives
                    ) == 1, 'Can only have one active in failure group'
                    takeover_mds = actives[0]['name']
                    takeover_rank = actives[0]['rank']
                    break
                itercount = itercount + 1
                if itercount > 10:
                    self.log('mds map: {status}'.format(
                        status=self.mds_cluster.get_fs_map()))

            self.log('New active mds is mds.{_id}'.format(_id=takeover_mds))

            # wait for a while before restarting old active to become new
            # standby
            delay = self.max_revive_delay
            if self.randomize:
                delay = random.randrange(0.0, self.max_revive_delay)

            self.log(
                'waiting for {delay} secs before reviving mds.{id}'.format(
                    delay=delay, id=active_mds))
            time.sleep(delay)

            self.log('reviving mds.{id}'.format(id=active_mds))
            self.revive_mds(active_mds, standby_for_rank=takeover_rank)

            status = {}
            while True:
                status = self.mds_cluster.get_mds_info(active_mds)
                if status and (status['state'] == 'up:standby'
                               or status['state'] == 'up:standby-replay'):
                    break
                self.log(
                    'waiting till mds map indicates mds.{_id} is in standby or standby-replay'
                    .format(_id=active_mds))
                time.sleep(2)
            self.log('mds.{_id} reported in {state} state'.format(
                _id=active_mds, state=status['state']))

            # don't do replay thrashing right now
            continue
            # this might race with replay -> active transition...
            if status['state'] == 'up:replay' and random.randrange(
                    0.0, 1.0) < self.thrash_in_replay:

                delay = self.max_replay_thrash_delay
                if self.randomize:
                    delay = random.randrange(0.0, self.max_replay_thrash_delay)
                time.sleep(delay)
                self.log('kill replaying mds.{id}'.format(id=self.to_kill))
                self.kill_mds(self.to_kill)

                delay = self.max_revive_delay
                if self.randomize:
                    delay = random.randrange(0.0, self.max_revive_delay)

                self.log(
                    'waiting for {delay} secs before reviving mds.{id}'.format(
                        delay=delay, id=self.to_kill))
                time.sleep(delay)

                self.log('revive mds.{id}'.format(id=self.to_kill))
                self.revive_mds(self.to_kill)
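
The thrash_weights behaviour documented above comes down to a per-iteration biased
coin flip against the MDS's weight.  A minimal standalone sketch of that decision,
separate from teuthology (the helper name should_thrash and the use of
random.uniform are illustrative):

import random

def should_thrash(weight, rng=random):
    """Return True if this iteration should thrash an MDS whose configured
    weight is `weight` (a float in [0.0, 1.0]): a weight of 1.0 always
    thrashes, 0.0 never does, and values in between thrash with that
    probability, mirroring the thrash_weights semantics described above."""
    assert 0.0 <= weight <= 1.0
    return rng.uniform(0.0, 1.0) <= weight

# Rough check: an MDS weighted 0.8 is thrashed on roughly 80% of iterations.
hits = sum(should_thrash(0.8) for _ in range(10000))
print("thrashed on ~{0:.0f}% of iterations".format(100.0 * hits / 10000))
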
Example #35
0
def build_ceph_cluster(ctx, config):
    """Build a ceph cluster"""

    # Expect to find ceph_admin on the first mon by ID, same place that the download task
    # puts it.  Remember this here, because subsequently IDs will change from those in
    # the test config to those that ceph-deploy invents.
    (ceph_admin,) = ctx.cluster.only(teuthology.get_first_mon(ctx, config)).remotes.iterkeys()

    def execute_ceph_deploy(cmd):
        """Remotely execute a ceph_deploy command"""
        return ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(cmd),
            ],
            check_status=False,
        ).exitstatus

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        testdir = teuthology.get_testdir(ctx)
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            for var, val in cbranch.iteritems():
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_role(ctx, 'mds')
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_role(ctx, 'mon')
        mon_nodes = " ".join(mon_node)
        new_mon = './ceph-deploy new'+" "+mon_nodes
        install_nodes = './ceph-deploy install ' + (ceph_branch if ceph_branch else "--dev=master") + " " + all_nodes
        mon_hostname = mon_nodes.split(' ')[0]
        mon_hostname = str(mon_hostname)
        gather_keys = './ceph-deploy gatherkeys'+" "+mon_hostname
        deploy_mds = './ceph-deploy mds create'+" "+mds_nodes
        no_of_osds = 0

        if mon_nodes is None:
            raise RuntimeError("no monitor nodes in the config file")

        estatus_new = execute_ceph_deploy(new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        testdir = teuthology.get_testdir(ctx)
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)

        if config.get('conf') is not None:
            confp = config.get('conf')
            for section, keys in confp.iteritems():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
                                                sudo=True)
                for key, value in keys.iteritems():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
                                                    sudo=True)

        estatus_install = execute_ceph_deploy(install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum, so let's
        # try the next block, which will wait up to 15 minutes to gatherkeys.
        execute_ceph_deploy(mon_create_nodes)

        estatus_gather = execute_ceph_deploy(gather_keys)
        max_gather_tries = 90
        gather_tries = 0
        while (estatus_gather != 0):
            gather_tries += 1
            if gather_tries >= max_gather_tries:
                msg = 'ceph-deploy was not able to gatherkeys after 15 minutes'
                raise RuntimeError(msg)
            estatus_gather = execute_ceph_deploy(gather_keys)
            time.sleep(10)

        if mds_nodes:
            estatus_mds = execute_ceph_deploy(deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            for d in range(1, len(mon_node)):
                mon_destroy_nodes = './ceph-deploy mon destroy'+" "+mon_node[d]
                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        node_dev_list = get_dev_for_osd(ctx, config)
        osd_create_cmd = './ceph-deploy osd create --zap-disk '
        for d in node_dev_list:
            if config.get('dmcrypt') is not None:
                osd_create_cmd_d = osd_create_cmd+'--dmcrypt'+" "+d
            else:
                osd_create_cmd_d = osd_create_cmd+d
            estatus_osd = execute_ceph_deploy(osd_create_cmd_d)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                disks = d.split(':')
                dev_disk = disks[0]+":"+disks[1]
                j_disk = disks[0]+":"+disks[2]
                zap_disk = './ceph-deploy disk zap '+dev_disk+" "+j_disk
                execute_ceph_deploy(zap_disk)
                estatus_osd = execute_ceph_deploy(osd_create_cmd_d)
                if estatus_osd == 0:
                    log.info('successfully created osd')
                    no_of_osds += 1
                else:
                    raise RuntimeError("ceph-deploy: Failed to create osds")

        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
                )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
                )

            clients = ctx.cluster.only(teuthology.is_type('client'))
            for remot, roles_for_host in clients.remotes.iteritems():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    mon0_remote.run(
                        args=[
                            'cd',
                            '{tdir}'.format(tdir=testdir),
                            run.Raw('&&'),
                            'sudo', 'bash', '-c',
                            run.Raw('"'), 'ceph',
                            'auth',
                            'get-or-create',
                            'client.{id}'.format(id=id_),
                            'mds', 'allow',
                            'mon', 'allow *',
                            'osd', 'allow *',
                            run.Raw('>'),
                            client_keyring,
                            run.Raw('"'),
                            ],
                        )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                        )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=client_keyring,
                        data=key_data,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=admin_keyring_path,
                        data=admin_keyring,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=conf_path,
                        data=conf_data,
                        perms='0644'
                    )

            log.info('Configuring CephFS...')
            ceph_fs = Filesystem(ctx, admin_remote=clients.remotes.keys()[0])
            if not ceph_fs.legacy_configured():
                ceph_fs.create()
        else:
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")
        yield

    except Exception:
        log.info("Error encountered, logging exception before tearing down ceph-deploy")
        log.info(traceback.format_exc())
        raise
    finally:
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                              'sudo', 'service', 'ceph', 'stop' ])

        # Are you really not running anymore?
        # try first with the init tooling
        # ignoring the status so this becomes informational only
        ctx.cluster.run(args=['sudo', 'status', 'ceph-all', run.Raw('||'),
                              'sudo', 'service',  'ceph', 'status'],
                              check_status=False)

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
                              'grep', '-v', 'grep', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                        ],
                    wait=False,
                    ),
                )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Prevent these from being undefined if the try block fails
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge'+" "+all_nodes
        purgedata_nodes = './ceph-deploy purgedata'+" "+all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(purgedata_nodes)
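
The gatherkeys loop above retries a flaky ceph-deploy step roughly every 10 seconds
for up to 90 attempts (about 15 minutes).  A generic sketch of that
retry-until-zero-exit pattern, assuming only a callable that returns a shell exit
status (the name retry_until_success is illustrative):

import time

def retry_until_success(run_cmd, cmd, max_tries=90, interval=10):
    """Re-run `cmd` through `run_cmd` (which returns a shell exit status)
    until it exits 0, sleeping `interval` seconds between attempts;
    raise RuntimeError after `max_tries` failures."""
    for _ in range(max_tries):
        if run_cmd(cmd) == 0:
            return
        time.sleep(interval)
    raise RuntimeError('{0!r} still failing after {1} tries (~{2} minutes)'.format(
        cmd, max_tries, max_tries * interval // 60))

# Against the example above, the gatherkeys loop would reduce to something like:
#     retry_until_success(execute_ceph_deploy, gather_keys)
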
Example #36
0
def cephfs_setup(ctx, config):
    cluster_name = config["cluster"]
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = "{tdir}/archive/coverage".format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type("mds", cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info("Setting up CephFS filesystem...")

        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        if not ceph_fs.legacy_configured():
            ceph_fs.create()

        is_active_mds = lambda role: "mds." in role and not role.endswith("-s") and "-s-" not in role
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])
        mon_remote.run(
            args=[
                "sudo",
                "adjust-ulimits",
                "ceph-coverage",
                coverage_dir,
                "ceph",
                "mds",
                "set",
                "allow_multimds",
                "true",
                "--yes-i-really-mean-it",
            ],
            check_status=False,  # probably old version, upgrade test
        )
        mon_remote.run(
            args=[
                "sudo",
                "adjust-ulimits",
                "ceph-coverage",
                coverage_dir,
                "ceph",
                "--cluster",
                cluster_name,
                "mds",
                "set_max_mds",
                str(num_active),
            ]
        )
        mon_remote.run(
            args=[
                "sudo",
                "adjust-ulimits",
                "ceph-coverage",
                coverage_dir,
                "ceph",
                "mds",
                "set",
                "allow_dirfrags",
                "true",
                "--yes-i-really-mean-it",
            ],
            check_status=False,  # probably old version, upgrade test
        )

    yield
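
The num_active computation in the example above filters standby roles out of the
MDS role list before calling set_max_mds.  A standalone sketch of that filter, with
made-up role names used purely for illustration:

def count_active_mds_roles(roles):
    """Count MDS roles that are not standby roles, using the same test as the
    example above: standby roles end in '-s' or contain '-s-'."""
    def is_active(role):
        return 'mds.' in role and not role.endswith('-s') and '-s-' not in role
    return len([r for r in roles if is_active(r)])

# Two active daemons plus one standby and one client:
print(count_active_mds_roles(['mds.a', 'mds.b', 'mds.a-s', 'client.0']))  # -> 2
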
Example #37
0
class CephFSMount(object):
    def __init__(self, ctx, test_dir, client_id, client_remote):
        """
        :param test_dir: Global teuthology test dir
        :param client_id: Client ID, the 'foo' in client.foo
        :param client_remote: Remote instance for the host where client will run
        """

        self.ctx = ctx
        self.test_dir = test_dir
        self.client_id = client_id
        self.client_remote = client_remote
        self.mountpoint_dir_name = 'mnt.{id}'.format(id=self.client_id)
        self.fs = None

        self.test_files = ['a', 'b', 'c']

        self.background_procs = []

    @property
    def mountpoint(self):
        return os.path.join(
            self.test_dir, '{dir_name}'.format(dir_name=self.mountpoint_dir_name))

    def is_mounted(self):
        raise NotImplementedError()

    def setupfs(self, name=None):
        if name is None and self.fs is not None:
            # Previous mount existed, reuse the old name
            name = self.fs.name
        self.fs = Filesystem(self.ctx, name=name)
        log.info('Wait for MDS to reach steady state...')
        self.fs.wait_for_daemons()
        log.info('Ready to start {}...'.format(type(self).__name__))

    def mount(self, mount_path=None, mount_fs_name=None):
        raise NotImplementedError()

    def umount(self):
        raise NotImplementedError()

    def umount_wait(self, force=False, require_clean=False):
        """

        :param force: Expect that the mount will not shutdown cleanly: kill
                      it hard.
        :param require_clean: Wait for the Ceph client associated with the
                              mount (e.g. ceph-fuse) to terminate, and
                              raise if it doesn't do so cleanly.
        :return:
        """
        raise NotImplementedError()

    def kill_cleanup(self):
        raise NotImplementedError()

    def kill(self):
        raise NotImplementedError()

    def cleanup(self):
        raise NotImplementedError()

    def wait_until_mounted(self):
        raise NotImplementedError()

    def get_keyring_path(self):
        return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id)

    @property
    def config_path(self):
        """
        Path to ceph.conf: override this if you're not a normal systemwide ceph install
        :return: string
        """
        return "/etc/ceph/ceph.conf"

    @contextmanager
    def mounted(self):
        """
        A context manager, from an initially unmounted state, to mount
        this, yield, and then unmount and clean up.
        """
        self.mount()
        self.wait_until_mounted()
        try:
            yield
        finally:
            self.umount_wait()

    def is_blacklisted(self):
        addr = self.get_global_addr()
        blacklist = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "blacklist", "ls", "--format=json"))
        for b in blacklist:
            if addr == b["addr"]:
                return True
        return False

    def create_file(self, filename='testfile', dirname=None, user=None,
                    check_status=True):
        assert(self.is_mounted())

        if not os.path.isabs(filename):
            if dirname:
                if os.path.isabs(dirname):
                    path = os.path.join(dirname, filename)
                else:
                    path = os.path.join(self.mountpoint, dirname, filename)
            else:
                path = os.path.join(self.mountpoint, filename)
        else:
            path = filename

        if user:
            args = ['sudo', '-u', user, '-s', '/bin/bash', '-c', 'touch ' + path]
        else:
            args = 'touch ' + path

        return self.client_remote.run(args=args, check_status=check_status)

    def create_files(self):
        assert(self.is_mounted())

        for suffix in self.test_files:
            log.info("Creating file {0}".format(suffix))
            self.client_remote.run(args=[
                'sudo', 'touch', os.path.join(self.mountpoint, suffix)
            ])

    def test_create_file(self, filename='testfile', dirname=None, user=None,
                         check_status=True):
        return self.create_file(filename=filename, dirname=dirname, user=user,
                                check_status=False)

    def check_files(self):
        assert(self.is_mounted())

        for suffix in self.test_files:
            log.info("Checking file {0}".format(suffix))
            r = self.client_remote.run(args=[
                'sudo', 'ls', os.path.join(self.mountpoint, suffix)
            ], check_status=False)
            if r.exitstatus != 0:
                raise RuntimeError("Expected file {0} not found".format(suffix))

    def create_destroy(self):
        assert(self.is_mounted())

        filename = "{0} {1}".format(datetime.datetime.now(), self.client_id)
        log.debug("Creating test file {0}".format(filename))
        self.client_remote.run(args=[
            'sudo', 'touch', os.path.join(self.mountpoint, filename)
        ])
        log.debug("Deleting test file {0}".format(filename))
        self.client_remote.run(args=[
            'sudo', 'rm', '-f', os.path.join(self.mountpoint, filename)
        ])

    def _run_python(self, pyscript, py_version='python'):
        return self.client_remote.run(
               args=['sudo', 'adjust-ulimits', 'daemon-helper', 'kill',
                     py_version, '-c', pyscript], wait=False, stdin=run.PIPE,
               stdout=StringIO())

    def run_python(self, pyscript, py_version='python'):
        p = self._run_python(pyscript, py_version)
        p.wait()
        return p.stdout.getvalue().strip()

    def run_shell(self, args, wait=True, stdin=None, check_status=True,
                  omit_sudo=True):
        args = ["cd", self.mountpoint, run.Raw('&&'), "sudo"] + args
        return self.client_remote.run(args=args, stdout=StringIO(),
                                      stderr=StringIO(), wait=wait,
                                      stdin=stdin, check_status=check_status,
                                      omit_sudo=omit_sudo)

    def open_no_data(self, basename):
        """
        A pure metadata operation
        """
        assert(self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        p = self._run_python(dedent(
            """
            f = open("{path}", 'w')
            """.format(path=path)
        ))
        p.wait()

    def open_background(self, basename="background_file"):
        """
        Open a file for writing, then block such that the client
        will hold a capability.

        Don't return until the remote process has got as far as opening
        the file, then return the RemoteProcess instance.
        """
        assert(self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        pyscript = dedent("""
            import time

            f = open("{path}", 'w')
            f.write('content')
            f.flush()
            f.write('content2')
            while True:
                time.sleep(1)
            """).format(path=path)

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)

        # This wait would not be sufficient if the file had already
        # existed, but it's simple and in practice users of open_background
        # are not using it on existing files.
        self.wait_for_visible(basename)

        return rproc

    def wait_for_visible(self, basename="background_file", timeout=30):
        i = 0
        while i < timeout:
            r = self.client_remote.run(args=[
                'sudo', 'ls', os.path.join(self.mountpoint, basename)
            ], check_status=False)
            if r.exitstatus == 0:
                log.debug("File {0} became visible from {1} after {2}s".format(
                    basename, self.client_id, i))
                return
            else:
                time.sleep(1)
                i += 1

        raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
            i, basename, self.client_id))

    def lock_background(self, basename="background_file", do_flock=True):
        """
        Open and lock files for writing, hold the locks in a background process
        """
        assert(self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        script_builder = """
            import time
            import fcntl
            import struct"""
        if do_flock:
            script_builder += """
            f1 = open("{path}-1", 'w')
            fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)"""
        script_builder += """
            f2 = open("{path}-2", 'w')
            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            while True:
                time.sleep(1)
            """

        pyscript = dedent(script_builder).format(path=path)

        log.info("lock_background file {0}".format(basename))
        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def lock_and_release(self, basename="background_file"):
        assert(self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        script = """
            import time
            import fcntl
            import struct
            f1 = open("{path}-1", 'w')
            fcntl.flock(f1, fcntl.LOCK_EX)
            f2 = open("{path}-2", 'w')
            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            """
        pyscript = dedent(script).format(path=path)

        log.info("lock_and_release file {0}".format(basename))
        return self._run_python(pyscript)

    def check_filelock(self, basename="background_file", do_flock=True):
        assert(self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        script_builder = """
            import fcntl
            import errno
            import struct"""
        if do_flock:
            script_builder += """
            f1 = open("{path}-1", 'r')
            try:
                fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)
            except IOError, e:
                if e.errno == errno.EAGAIN:
                    pass
            else:
                raise RuntimeError("flock on file {path}-1 not found")"""
        script_builder += """
            f2 = open("{path}-2", 'r')
            try:
                lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
                fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            except IOError, e:
                if e.errno == errno.EAGAIN:
                    pass
            else:
                raise RuntimeError("posix lock on file {path}-2 not found")
            """
        pyscript = dedent(script_builder).format(path=path)

        log.info("check lock on file {0}".format(basename))
        self.client_remote.run(args=[
            'sudo', 'python', '-c', pyscript
        ])

    def write_background(self, basename="background_file", loop=False):
        """
        Open a file for writing, complete as soon as you can
        :param basename:
        :return:
        """
        assert(self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        pyscript = dedent("""
            import os
            import time

            fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0644)
            try:
                while True:
                    os.write(fd, 'content')
                    time.sleep(1)
                    if not {loop}:
                        break
            except IOError, e:
                pass
            os.close(fd)
            """).format(path=path, loop=str(loop))

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def write_n_mb(self, filename, n_mb, seek=0, wait=True):
        """
        Write the requested number of megabytes to a file
        """
        assert(self.is_mounted())

        return self.run_shell(["dd", "if=/dev/urandom", "of={0}".format(filename),
                               "bs=1M", "conv=fdatasync",
                               "count={0}".format(n_mb),
                               "seek={0}".format(seek)
                               ], wait=wait)

    def write_test_pattern(self, filename, size):
        log.info("Writing {0} bytes to {1}".format(size, filename))
        return self.run_python(dedent("""
            import zlib
            path = "{path}"
            f = open(path, 'w')
            for i in range(0, {size}):
                val = zlib.crc32("%s" % i) & 7
                f.write(chr(val))
            f.close()
        """.format(
            path=os.path.join(self.mountpoint, filename),
            size=size
        )))

    def validate_test_pattern(self, filename, size):
        log.info("Validating {0} bytes from {1}".format(size, filename))
        return self.run_python(dedent("""
            import zlib
            path = "{path}"
            f = open(path, 'r')
            bytes = f.read()
            f.close()
            if len(bytes) != {size}:
                raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format(
                    len(bytes), {size}
                ))
            for i, b in enumerate(bytes):
                val = zlib.crc32("%s" % i) & 7
                if b != chr(val):
                    raise RuntimeError("Bad data at offset {{0}}".format(i))
        """.format(
            path=os.path.join(self.mountpoint, filename),
            size=size
        )))

    def open_n_background(self, fs_path, count):
        """
        Open N files for writing, hold them open in a background process

        :param fs_path: Path relative to CephFS root, e.g. "foo/bar"
        :return: a RemoteProcess
        """
        assert(self.is_mounted())

        abs_path = os.path.join(self.mountpoint, fs_path)

        pyscript = dedent("""
            import sys
            import time
            import os

            n = {count}
            abs_path = "{abs_path}"

            if not os.path.exists(os.path.dirname(abs_path)):
                os.makedirs(os.path.dirname(abs_path))

            handles = []
            for i in range(0, n):
                fname = "{{0}}_{{1}}".format(abs_path, i)
                handles.append(open(fname, 'w'))

            while True:
                time.sleep(1)
            """).format(abs_path=abs_path, count=count)

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def create_n_files(self, fs_path, count, sync=False):
        assert(self.is_mounted())

        abs_path = os.path.join(self.mountpoint, fs_path)

        pyscript = dedent("""
            import sys
            import time
            import os

            n = {count}
            abs_path = "{abs_path}"

            if not os.path.exists(os.path.dirname(abs_path)):
                os.makedirs(os.path.dirname(abs_path))

            for i in range(0, n):
                fname = "{{0}}_{{1}}".format(abs_path, i)
                h = open(fname, 'w')
                h.write('content')
                if {sync}:
                    h.flush()
                    os.fsync(h.fileno())
                h.close()
            """).format(abs_path=abs_path, count=count, sync=str(sync))

        self.run_python(pyscript)

    def teardown(self):
        for p in self.background_procs:
            log.info("Terminating background process")
            self._kill_background(p)

        self.background_procs = []

    def _kill_background(self, p):
        if p.stdin:
            p.stdin.close()
            try:
                p.wait()
            except (CommandFailedError, ConnectionLostError):
                pass

    def kill_background(self, p):
        """
        For a process that was returned by one of the _background member functions,
        kill it hard.
        """
        self._kill_background(p)
        self.background_procs.remove(p)

    def get_global_id(self):
        raise NotImplementedError()

    def get_global_inst(self):
        raise NotImplementedError()

    def get_global_addr(self):
        raise NotImplementedError()

    def get_osd_epoch(self):
        raise NotImplementedError()

    def stat(self, fs_path, wait=True):
        """
        stat a file, and return the result as a dictionary like this:
        {
          "st_ctime": 1414161137.0,
          "st_mtime": 1414161137.0,
          "st_nlink": 33,
          "st_gid": 0,
          "st_dev": 16777218,
          "st_size": 1190,
          "st_ino": 2,
          "st_uid": 0,
          "st_mode": 16877,
          "st_atime": 1431520593.0
        }

        Raises exception on absent file.
        """
        abs_path = os.path.join(self.mountpoint, fs_path)

        pyscript = dedent("""
            import os
            import stat
            import json
            import sys

            try:
                s = os.stat("{path}")
            except OSError as e:
                sys.exit(e.errno)

            attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"]
            print json.dumps(
                dict([(a, getattr(s, a)) for a in attrs]),
                indent=2)
            """).format(path=abs_path)
        proc = self._run_python(pyscript)
        if wait:
            proc.wait()
            return json.loads(proc.stdout.getvalue().strip())
        else:
            return proc

    def touch(self, fs_path):
        """
        Create a dentry if it doesn't already exist.  This python
        implementation exists because the usual command line tool doesn't
        pass through error codes like EIO.

        :param fs_path:
        :return:
        """
        abs_path = os.path.join(self.mountpoint, fs_path)
        pyscript = dedent("""
            import sys
            import errno

            try:
                f = open("{path}", "w")
                f.close()
            except IOError as e:
                sys.exit(errno.EIO)
            """).format(path=abs_path)
        proc = self._run_python(pyscript)
        proc.wait()

    def path_to_ino(self, fs_path, follow_symlinks=True):
        abs_path = os.path.join(self.mountpoint, fs_path)

        if follow_symlinks:
            pyscript = dedent("""
                import os
                import stat

                print os.stat("{path}").st_ino
                """).format(path=abs_path)
        else:
            pyscript = dedent("""
                import os
                import stat

                print os.lstat("{path}").st_ino
                """).format(path=abs_path)

        proc = self._run_python(pyscript)
        proc.wait()
        return int(proc.stdout.getvalue().strip())

    def path_to_nlink(self, fs_path):
        abs_path = os.path.join(self.mountpoint, fs_path)

        pyscript = dedent("""
            import os
            import stat

            print os.stat("{path}").st_nlink
            """).format(path=abs_path)

        proc = self._run_python(pyscript)
        proc.wait()
        return int(proc.stdout.getvalue().strip())

    def ls(self, path=None):
        """
        Wrap ls: return a list of strings
        """
        cmd = ["ls"]
        if path:
            cmd.append(path)

        ls_text = self.run_shell(cmd).stdout.getvalue().strip()

        if ls_text:
            return ls_text.split("\n")
        else:
            # Special case because otherwise split on empty string
            # gives you [''] instead of []
            return []

    def setfattr(self, path, key, val):
        """
        Wrap setfattr.

        :param path: relative to mount point
        :param key: xattr name
        :param val: xattr value
        :return: None
        """
        self.run_shell(["setfattr", "-n", key, "-v", val, path])

    def getfattr(self, path, attr):
        """
        Wrap getfattr: return the values of a named xattr on one file, or
        None if the attribute is not found.

        :return: a string
        """
        p = self.run_shell(["getfattr", "--only-values", "-n", attr, path], wait=False)
        try:
            p.wait()
        except CommandFailedError as e:
            if e.exitstatus == 1 and "No such attribute" in p.stderr.getvalue():
                return None
            else:
                raise

        return p.stdout.getvalue()

    def df(self):
        """
        Wrap df: return a dict of usage fields in bytes
        """

        p = self.run_shell(["df", "-B1", "."])
        lines = p.stdout.getvalue().strip().split("\n")
        fs, total, used, avail = lines[1].split()[:4]
        log.warn(lines)

        return {
            "total": int(total),
            "used": int(used),
            "available": int(avail)
        }
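
The df() method above shells out to `df -B1 .` and keeps three fields from the
second line of output.  A small sketch of just that parsing step on canned output
(the sample text is invented for illustration):

def parse_df_bytes(df_output):
    """Parse `df -B1` output into total/used/available byte counts,
    mirroring the dict that CephFSMount.df() above returns."""
    lines = df_output.strip().split("\n")
    fs, total, used, avail = lines[1].split()[:4]
    return {"total": int(total), "used": int(used), "available": int(avail)}

# Invented sample output purely for illustration:
sample = (
    "Filesystem      1B-blocks       Used   Available Use% Mounted on\n"
    "ceph-fuse      1073741824  104857600   968884224  10% /mnt/cephfs\n"
)
print(parse_df_bytes(sample))
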
class MDSThrasher(Greenlet):
    """
    MDSThrasher::

    The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).

    The config is optional.  Many of the config parameters are a maximum value
    to use when selecting a random value from a range.  To always use the maximum
    value, set randomize to false.  The config is a dict containing some or all of:

    seed: [no default] seed the random number generator

    randomize: [default: true] enables randomization of the delay values; when
      false, the maximum values are always used

    max_thrash: [default: 1] the maximum number of MDSs that will be thrashed at
      any given time.

    max_thrash_delay: [default: 30] maximum number of seconds to delay before
      thrashing again.

    max_revive_delay: [default: 10] maximum number of seconds to delay before
      bringing back a thrashed MDS

    thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
      during replay.  Value should be between 0.0 and 1.0

    max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
      the replay state before thrashing

    thrash_weights: allows specific MDSs to be thrashed more/less frequently.  This option
      overrides anything specified by max_thrash.  This option is a dict containing
      mds.x: weight pairs.  For example, [mds.a: 0.7, mds.b: 0.3, mds.c: 0.0].  Each weight
      is a value from 0.0 to 1.0.  Any MDSs not specified will be automatically
      given a weight of 0.0.  For a given MDS, by default the thrasher delays for up
      to max_thrash_delay, thrashes, waits for the MDS to recover, and iterates.  If a non-zero
      weight is specified for an MDS, for each iteration the thrasher chooses whether to thrash
      during that iteration based on a random value [0-1] not exceeding the weight of that MDS.

    Examples::


      The following example sets the likelihood that mds.a will be thrashed
      to 80%, mds.b to 20%, and other MDSs will not be thrashed.  It also sets the
      likelihood that an MDS will be thrashed in replay to 40%.
      Thrash weights do not have to sum to 1.

      tasks:
      - ceph:
      - mds_thrash:
          thrash_weights:
            - mds.a: 0.8
            - mds.b: 0.2
          thrash_in_replay: 0.4
      - ceph-fuse:
      - workunit:
          clients:
            all: [suites/fsx.sh]

      The following example disables randomization, and uses the max delay values:

      tasks:
      - ceph:
      - mds_thrash:
          randomize: false
          max_thrash_delay: 10
          max_revive_delay: 1
          max_replay_thrash_delay: 4

    """

    def __init__(self, ctx, manager, mds_cluster, config, logger, failure_group, weight):
        super(MDSThrasher, self).__init__()

        self.ctx = ctx
        self.manager = manager
        assert self.manager.is_clean()
        self.mds_cluster = mds_cluster

        self.stopping = Event()
        self.logger = logger
        self.config = config

        self.randomize = bool(self.config.get('randomize', True))
        self.max_thrash_delay = float(self.config.get('thrash_delay', 30.0))
        self.thrash_in_replay = float(self.config.get('thrash_in_replay', False))
        assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(
            v=self.thrash_in_replay)

        self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0))

        self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))

        self.failure_group = failure_group
        self.weight = weight

        # TODO support multiple filesystems: will require behavioural change to select
        # which filesystem to act on when doing rank-ish things
        self.fs = Filesystem(self.ctx)

    def _run(self):
        try:
            self.do_thrash()
        except:
            # Log exceptions here so we get the full backtrace (it's lost
            # by the time someone does a .get() on this greenlet)
            self.logger.exception("Exception in do_thrash:")
            raise

    def log(self, x):
        """Write data to logger assigned to this MDThrasher"""
        self.logger.info(x)

    def stop(self):
        self.stopping.set()

    def kill_mds(self, mds):
        if self.config.get('powercycle'):
            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
                         remotes.iterkeys())
            self.log('kill_mds on mds.{m} doing powercycle of {s}'.
                     format(m=mds, s=remote.name))
            self._assert_ipmi(remote)
            remote.console.power_off()
        else:
            self.ctx.daemons.get_daemon('mds', mds).stop()

    @staticmethod
    def _assert_ipmi(remote):
        assert remote.console.has_ipmi_credentials, (
            "powercycling requested but RemoteConsole is not "
            "initialized.  Check ipmi config.")

    def kill_mds_by_rank(self, rank):
        """
        kill_mds wrapper to kill based on rank passed.
        """
        status = self.mds_cluster.get_mds_info_by_rank(rank)
        self.kill_mds(status['name'])

    def revive_mds(self, mds, standby_for_rank=None):
        """
        Revive mds -- do an IPMI powercycle (if indicated by the config)
        and then restart (using --hot-standby if specified).
        """
        if self.config.get('powercycle'):
            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
                         remotes.iterkeys())
            self.log('revive_mds on mds.{m} doing powercycle of {s}'.
                     format(m=mds, s=remote.name))
            self._assert_ipmi(remote)
            remote.console.power_on()
            self.manager.make_admin_daemon_dir(self.ctx, remote)
        args = []
        if standby_for_rank:
            args.extend(['--hot-standby', standby_for_rank])
        self.ctx.daemons.get_daemon('mds', mds).restart(*args)

    def revive_mds_by_rank(self, rank, standby_for_rank=None):
        """
        revive_mds wrapper to revive based on rank passed.
        """
        status = self.mds_cluster.get_mds_info_by_rank(rank)
        self.revive_mds(status['name'], standby_for_rank)

    def get_mds_status_all(self):
        return self.fs.get_mds_map()

    def do_thrash(self):
        """
        Perform the random thrashing action
        """

        self.log('starting mds_do_thrash for failure group: ' + ', '.join(
            ['mds.{_id}'.format(_id=_f) for _f in self.failure_group]))
        while not self.stopping.is_set():
            delay = self.max_thrash_delay
            if self.randomize:
                delay = random.randrange(0.0, self.max_thrash_delay)

            if delay > 0.0:
                self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
                self.stopping.wait(delay)
                if self.stopping.is_set():
                    continue

            # random.randrange() only yields integers; use uniform() to get a
            # float in [0.0, 1.0) for the weighted skip decision
            skip = random.uniform(0.0, 1.0)
            if self.weight < 1.0 and skip > self.weight:
                self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip,
                                                                                                   weight=self.weight))
                continue

            # find the active mds in the failure group
            statuses = [self.mds_cluster.get_mds_info(m) for m in self.failure_group]
            actives = filter(lambda s: s and s['state'] == 'up:active', statuses)
            assert len(actives) == 1, 'Can only have one active in a failure group'

            active_mds = actives[0]['name']
            active_rank = actives[0]['rank']

            self.log('kill mds.{id} (rank={r})'.format(id=active_mds, r=active_rank))
            self.manager.kill_mds_by_rank(active_rank)

            # wait for mon to report killed mds as crashed
            last_laggy_since = None
            itercount = 0
            while True:
                failed = self.fs.get_mds_map()['failed']
                status = self.mds_cluster.get_mds_info(active_mds)
                if not status:
                    break
                if 'laggy_since' in status:
                    last_laggy_since = status['laggy_since']
                    break
                if any([(f == active_mds) for f in failed]):
                    break
                self.log(
                    'waiting till mds map indicates mds.{_id} is laggy/crashed, in failed state, or mds.{_id} is removed from mdsmap'.format(
                        _id=active_mds))
                itercount = itercount + 1
                if itercount > 10:
                    self.log('mds map: {status}'.format(status=self.mds_cluster.get_fs_map()))
                time.sleep(2)
            if last_laggy_since:
                self.log(
                    'mds.{_id} reported laggy/crashed since: {since}'.format(_id=active_mds, since=last_laggy_since))
            else:
                self.log('mds.{_id} down, removed from mdsmap'.format(_id=active_mds))

            # wait for a standby mds to takeover and become active
            takeover_mds = None
            takeover_rank = None
            itercount = 0
            while True:
                statuses = [self.mds_cluster.get_mds_info(m) for m in self.failure_group]
                actives = filter(lambda s: s and s['state'] == 'up:active', statuses)
                if len(actives) > 0:
                    assert len(actives) == 1, 'Can only have one active in failure group'
                    takeover_mds = actives[0]['name']
                    takeover_rank = actives[0]['rank']
                    break
                itercount = itercount + 1
                if itercount > 10:
                    self.log('mds map: {status}'.format(status=self.mds_cluster.get_fs_map()))

            self.log('New active mds is mds.{_id}'.format(_id=takeover_mds))

            # wait for a while before restarting old active to become new
            # standby
            delay = self.max_revive_delay
            if self.randomize:
                delay = random.randrange(0.0, self.max_revive_delay)

            self.log('waiting for {delay} secs before reviving mds.{id}'.format(
                delay=delay, id=active_mds))
            time.sleep(delay)

            self.log('reviving mds.{id}'.format(id=active_mds))
            self.manager.revive_mds(active_mds, standby_for_rank=takeover_rank)

            status = {}
            while True:
                status = self.mds_cluster.get_mds_info(active_mds)
                if status and (status['state'] == 'up:standby' or status['state'] == 'up:standby-replay'):
                    break
                self.log(
                    'waiting till mds map indicates mds.{_id} is in standby or standby-replay'.format(_id=active_mds))
                time.sleep(2)
            self.log('mds.{_id} reported in {state} state'.format(_id=active_mds, state=status['state']))

            # don't do replay thrashing right now
            continue
            # this might race with replay -> active transition...
            if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay:

                delay = self.max_replay_thrash_delay
                if self.randomize:
                    delay = random.randrange(0.0, self.max_replay_thrash_delay)
                time.sleep(delay)
                self.log('kill replaying mds.{id}'.format(id=self.to_kill))
                self.manager.kill_mds(self.to_kill)

                delay = self.max_revive_delay
                if self.randomize:
                    delay = random.randrange(0.0, self.max_revive_delay)

                self.log('waiting for {delay} secs before reviving mds.{id}'.format(
                    delay=delay, id=self.to_kill))
                time.sleep(delay)

                self.log('revive mds.{id}'.format(id=self.to_kill))
                self.manager.revive_mds(self.to_kill)
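
The first wait loop in do_thrash above keeps polling until one of three things is
true of the killed MDS: it vanished from the mds map, it reports laggy_since, or it
appears in the failed list.  A pure-function sketch of that classification (the
function name and return labels are illustrative):

def classify_killed_mds(status, failed, mds_name):
    """Mirror the three break conditions in the wait loop above.

    status: the killed MDS's entry from the mds map, or None if it has been
            removed from the map
    failed: the 'failed' list from the mds map
    Returns 'removed', 'laggy', 'failed', or 'pending' (keep waiting).
    """
    if not status:
        return 'removed'
    if 'laggy_since' in status:
        return 'laggy'
    if any(f == mds_name for f in failed):
        return 'failed'
    return 'pending'

# e.g.: classify_killed_mds(None, [], 'a')                          -> 'removed'
#       classify_killed_mds({'laggy_since': '2016-08-01'}, [], 'a') -> 'laggy'
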
def task(ctx, config):
    """
    Execute CephFS client recovery test suite.

    Requires:
    - An outer ceph_fuse task with at least two clients
    - That the clients are on a separate host to the MDS
    """
    fs = Filesystem(ctx)

    # Pick out the clients we will use from the configuration
    # =======================================================
    if len(ctx.mounts) < 2:
        raise RuntimeError("Need at least two clients")
    mount_a = ctx.mounts.values()[0]
    mount_b = ctx.mounts.values()[1]

    if not isinstance(mount_a, FuseMount) or not isinstance(
            mount_b, FuseMount):
        # kclient kill() power cycles nodes, so requires clients to each be on
        # their own node
        if mount_a.client_remote.hostname == mount_b.client_remote.hostname:
            raise RuntimeError("kclient clients must be on separate nodes")

    # Check we have at least one remote client for use with network-dependent tests
    # =============================================================================
    if mount_a.client_remote.hostname in fs.get_mds_hostnames():
        raise RuntimeError(
            "Require first client to on separate server from MDSs")

    # Stash references on ctx so that we can easily debug in interactive mode
    # =======================================================================
    ctx.filesystem = fs
    ctx.mount_a = mount_a
    ctx.mount_b = mount_b

    run_tests(
        ctx, config, TestClientRecovery, {
            "mds_reconnect_timeout": int(
                fs.mds_asok(['config', 'get', 'mds_reconnect_timeout'])
                ['mds_reconnect_timeout']),
            "mds_session_timeout": int(
                fs.mds_asok(['config', 'get', 'mds_session_timeout'])
                ['mds_session_timeout']),
            "ms_max_backoff": int(
                fs.mds_asok(['config', 'get', 'ms_max_backoff'])
                ['ms_max_backoff']),
            "fs": fs,
            "mount_a": mount_a,
            "mount_b": mount_b,
        })

    # Continue to any downstream tasks
    # ================================
    yield
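The test parameters above are assembled by querying the MDS admin socket once per config key and casting the result to int. A hedged helper for that repetition is sketched below; int_configs and asok_get are hypothetical names, with asok_get standing in for a callable like fs.mds_asok.

# Hedged sketch: collapse the repeated "ask the admin socket for an int
# config value" pattern into a helper.  asok_get is a stand-in for a
# callable such as fs.mds_asok; it is not part of the teuthology API.
def int_configs(asok_get, keys):
    """Return {key: int(value)} for each requested config key."""
    out = {}
    for key in keys:
        out[key] = int(asok_get(['config', 'get', key])[key])
    return out


if __name__ == '__main__':
    # Fake admin socket returning canned values, for illustration only.
    canned = {'mds_reconnect_timeout': '45', 'mds_session_timeout': '60'}
    fake_asok = lambda args: {args[2]: canned[args[2]]}
    print(int_configs(fake_asok, ['mds_reconnect_timeout', 'mds_session_timeout']))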
Exemple #40
0
def task(ctx, config):
    """
    Mount/unmount a ``ceph-fuse`` client.

    The config is optional and defaults to mounting on all clients. If
    a config is given, it is expected to be a list of clients to do
    this operation on. This lets you e.g. set up one client with
    ``ceph-fuse`` and another with ``kclient``.

    Example that mounts all clients::

        tasks:
        - ceph:
        - ceph-fuse:
        - interactive:

    Example that uses both ``kclient`` and ``ceph-fuse``::

        tasks:
        - ceph:
        - ceph-fuse: [client.0]
        - kclient: [client.1]
        - interactive:

    Example that enables valgrind::

        tasks:
        - ceph:
        - ceph-fuse:
            client.0:
              valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
        - interactive:

    Example that stops an already-mounted client:

    ::

        tasks:
            - ceph:
            - ceph-fuse: [client.0]
            - ... do something that requires the FS mounted ...
            - ceph-fuse:
                client.0:
                    mounted: false
            - ... do something that requires the FS unmounted ...

    Example that adds more generous wait time for mount (for virtual machines)::

        tasks:
        - ceph:
        - ceph-fuse:
            client.0:
              mount_wait: 60 # default is 0, do not wait before checking /sys/
              mount_timeout: 120 # default is 30, give up if /sys/ is not populated
        - interactive:

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Mounting ceph-fuse clients...')

    testdir = teuthology.get_testdir(ctx)
    config = get_client_configs(ctx, config)

    # List clients we will configure mounts for, default is all clients
    clients = list(
        teuthology.get_clients(ctx=ctx,
                               roles=filter(lambda x: 'client.' in x,
                                            config.keys())))

    all_mounts = getattr(ctx, 'mounts', {})
    mounted_by_me = {}

    log.info('Wait for MDS to reach steady state...')
    mds_cluster = MDSCluster(ctx)
    status = mds_cluster.status()
    for filesystem in status.get_filesystems():
        fs = Filesystem(ctx, fscid=filesystem['id'])
        fs.wait_for_daemons()
    log.info('Ready to start ceph-fuse...')

    # Construct any new FuseMount instances
    for id_, remote in clients:
        client_config = config.get("client.%s" % id_)
        if client_config is None:
            client_config = {}

        if id_ not in all_mounts:
            fuse_mount = FuseMount(client_config, testdir, id_, remote)
            all_mounts[id_] = fuse_mount
        else:
            # Catch bad configs where someone has e.g. tried to use ceph-fuse and kcephfs for the same client
            assert isinstance(all_mounts[id_], FuseMount)

        if not config.get("disabled", False) and client_config.get(
                'mounted', True):
            mounted_by_me[id_] = all_mounts[id_]

    ctx.mounts = all_mounts

    # Mount any clients we have been asked to (default to mount all)
    for mount in mounted_by_me.values():
        mount.mount()

    for mount in mounted_by_me.values():
        mount.wait_until_mounted()

    # Umount any pre-existing clients that we have not been asked to mount
    for client_id in set(all_mounts.keys()) - set(mounted_by_me.keys()):
        mount = all_mounts[client_id]
        if mount.is_mounted():
            mount.umount_wait()

    try:
        yield all_mounts
    finally:
        log.info('Unmounting ceph-fuse clients...')

        for mount in mounted_by_me.values():
            # Conditional because an inner context might have umounted it
            if mount.is_mounted():
                mount.umount_wait()
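The bookkeeping in the task above reduces to a selection step: clients whose config has ``mounted: true`` (the default) are mounted, and anything already in ctx.mounts but not selected this time is unmounted. The standalone sketch below isolates that selection logic; select_mounts is a hypothetical helper and plain strings stand in for FuseMount objects.

# Standalone sketch of the mount-selection bookkeeping used above; plain
# strings stand in for FuseMount objects and nothing here touches a cluster.
def select_mounts(all_mounts, client_configs):
    """Split mounts into (to_mount, to_umount) based on per-client config."""
    mounted_by_me = {}
    for client_id, mount in all_mounts.items():
        cfg = client_configs.get(client_id) or {}
        if cfg.get('mounted', True):
            mounted_by_me[client_id] = mount
    to_umount = set(all_mounts) - set(mounted_by_me)
    return mounted_by_me, to_umount


if __name__ == '__main__':
    mounts = {'0': 'fuse-mount-0', '1': 'fuse-mount-1'}
    configs = {'1': {'mounted': False}}
    to_mount, to_umount = select_mounts(mounts, configs)
    print(to_mount)   # only client '0' gets (re)mounted
    print(to_umount)  # client '1' is unmounted if currently mounted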
Exemple #41
0
class CephFSMount(object):
    def __init__(self, ctx, test_dir, client_id, client_remote):
        """
        :param test_dir: Global teuthology test dir
        :param client_id: Client ID, the 'foo' in client.foo
        :param client_remote: Remote instance for the host where client will run
        """

        self.ctx = ctx
        self.test_dir = test_dir
        self.client_id = client_id
        self.client_remote = client_remote
        self.mountpoint_dir_name = 'mnt.{id}'.format(id=self.client_id)
        self.fs = None

        self.test_files = ['a', 'b', 'c']

        self.background_procs = []

    @property
    def mountpoint(self):
        return os.path.join(
            self.test_dir,
            '{dir_name}'.format(dir_name=self.mountpoint_dir_name))

    def is_mounted(self):
        raise NotImplementedError()

    def setupfs(self, name=None):
        if name is None and self.fs is not None:
            # Previous mount existed, reuse the old name
            name = self.fs.name
        self.fs = Filesystem(self.ctx, name=name)
        log.info('Wait for MDS to reach steady state...')
        self.fs.wait_for_daemons()
        log.info('Ready to start {}...'.format(type(self).__name__))

    def mount(self, mount_path=None, mount_fs_name=None):
        raise NotImplementedError()

    def umount(self):
        raise NotImplementedError()

    def umount_wait(self, force=False, require_clean=False):
        """

        :param force: Expect that the mount will not shutdown cleanly: kill
                      it hard.
        :param require_clean: Wait for the Ceph client associated with the
                              mount (e.g. ceph-fuse) to terminate, and
                              raise if it doesn't do so cleanly.
        :return:
        """
        raise NotImplementedError()

    def kill_cleanup(self):
        raise NotImplementedError()

    def kill(self):
        raise NotImplementedError()

    def cleanup(self):
        raise NotImplementedError()

    def wait_until_mounted(self):
        raise NotImplementedError()

    def get_keyring_path(self):
        return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id)

    @property
    def config_path(self):
        """
        Path to ceph.conf: override this if you're not a normal systemwide ceph install
        :return: string
        """
        return "/etc/ceph/ceph.conf"

    @contextmanager
    def mounted(self):
        """
        A context manager, from an initially unmounted state, to mount
        this, yield, and then unmount and clean up.
        """
        self.mount()
        self.wait_until_mounted()
        try:
            yield
        finally:
            self.umount_wait()
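    # Example usage of mounted() (a hedged sketch; assumes a concrete
    # subclass such as FuseMount and a live teuthology ctx):
    #
    #     with mount.mounted():
    #         mount.create_files()
    #         mount.check_files()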

    def is_blacklisted(self):
        addr = self.get_global_addr()
        blacklist = json.loads(
            self.fs.mon_manager.raw_cluster_cmd("osd", "blacklist", "ls",
                                                "--format=json"))
        for b in blacklist:
            if addr == b["addr"]:
                return True
        return False

    def create_file(self,
                    filename='testfile',
                    dirname=None,
                    user=None,
                    check_status=True):
        assert (self.is_mounted())

        if not os.path.isabs(filename):
            if dirname:
                if os.path.isabs(dirname):
                    path = os.path.join(dirname, filename)
                else:
                    path = os.path.join(self.mountpoint, dirname, filename)
            else:
                path = os.path.join(self.mountpoint, filename)
        else:
            path = filename

        if user:
            args = [
                'sudo', '-u', user, '-s', '/bin/bash', '-c', 'touch ' + path
            ]
        else:
            args = 'touch ' + path

        return self.client_remote.run(args=args, check_status=check_status)

    def create_files(self):
        assert (self.is_mounted())

        for suffix in self.test_files:
            log.info("Creating file {0}".format(suffix))
            self.client_remote.run(
                args=['sudo', 'touch',
                      os.path.join(self.mountpoint, suffix)])

    def test_create_file(self,
                         filename='testfile',
                         dirname=None,
                         user=None,
                         check_status=True):
        # Note: the command is always run with check_status=False; callers
        # inspect the returned RemoteProcess rather than relying on this
        # parameter to raise.
        return self.create_file(filename=filename,
                                dirname=dirname,
                                user=user,
                                check_status=False)

    def check_files(self):
        assert (self.is_mounted())

        for suffix in self.test_files:
            log.info("Checking file {0}".format(suffix))
            r = self.client_remote.run(
                args=['sudo', 'ls',
                      os.path.join(self.mountpoint, suffix)],
                check_status=False)
            if r.exitstatus != 0:
                raise RuntimeError(
                    "Expected file {0} not found".format(suffix))

    def create_destroy(self):
        assert (self.is_mounted())

        filename = "{0} {1}".format(datetime.datetime.now(), self.client_id)
        log.debug("Creating test file {0}".format(filename))
        self.client_remote.run(
            args=['sudo', 'touch',
                  os.path.join(self.mountpoint, filename)])
        log.debug("Deleting test file {0}".format(filename))
        self.client_remote.run(
            args=['sudo', 'rm', '-f',
                  os.path.join(self.mountpoint, filename)])

    def _run_python(self, pyscript, py_version='python'):
        return self.client_remote.run(args=[
            'sudo', 'adjust-ulimits', 'daemon-helper', 'kill', py_version,
            '-c', pyscript
        ],
                                      wait=False,
                                      stdin=run.PIPE,
                                      stdout=StringIO())

    def run_python(self, pyscript, py_version='python'):
        p = self._run_python(pyscript, py_version)
        p.wait()
        return p.stdout.getvalue().strip()

    def run_shell(self,
                  args,
                  wait=True,
                  stdin=None,
                  check_status=True,
                  omit_sudo=True):
        if isinstance(args, str):
            args = args.split()

        args = ["cd", self.mountpoint, run.Raw('&&'), "sudo"] + args
        return self.client_remote.run(args=args,
                                      stdout=StringIO(),
                                      stderr=StringIO(),
                                      wait=wait,
                                      stdin=stdin,
                                      check_status=check_status,
                                      omit_sudo=omit_sudo)

    def open_no_data(self, basename):
        """
        A pure metadata operation
        """
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        p = self._run_python(
            dedent("""
            f = open("{path}", 'w')
            """.format(path=path)))
        p.wait()

    def open_background(self, basename="background_file", write=True):
        """
        Open a file for writing, then block such that the client
        will hold a capability.

        Don't return until the remote process has got as far as opening
        the file, then return the RemoteProcess instance.
        """
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        if write:
            pyscript = dedent("""
                import time

                f = open("{path}", 'w')
                f.write('content')
                f.flush()
                f.write('content2')
                while True:
                    time.sleep(1)
                """).format(path=path)
        else:
            pyscript = dedent("""
                import time

                f = open("{path}", 'r')
                while True:
                    time.sleep(1)
                """).format(path=path)

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)

        # This wait would not be sufficient if the file had already
        # existed, but it's simple and in practice users of open_background
        # are not using it on existing files.
        self.wait_for_visible(basename)

        return rproc
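    # Example usage of open_background() (hedged sketch; assumes a concrete
    # mount subclass):
    #
    #     p = mount.open_background("held_file")
    #     # ... exercise client/MDS behaviour while the capability is held ...
    #     mount.kill_background(p)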

    def wait_for_dir_empty(self, dirname, timeout=30):
        i = 0
        dirpath = os.path.join(self.mountpoint, dirname)
        while i < timeout:
            nr_entries = int(self.getfattr(dirpath, "ceph.dir.entries"))
            if nr_entries == 0:
                log.debug(
                    "Directory {0} seen empty from {1} after {2}s ".format(
                        dirname, self.client_id, i))
                return
            else:
                time.sleep(1)
                i += 1

        raise RuntimeError(
            "Timed out after {0}s waiting for {1} to become empty from {2}".
            format(i, dirname, self.client_id))

    def wait_for_visible(self, basename="background_file", timeout=30):
        i = 0
        while i < timeout:
            r = self.client_remote.run(
                args=['sudo', 'ls',
                      os.path.join(self.mountpoint, basename)],
                check_status=False)
            if r.exitstatus == 0:
                log.debug("File {0} became visible from {1} after {2}s".format(
                    basename, self.client_id, i))
                return
            else:
                time.sleep(1)
                i += 1

        raise RuntimeError(
            "Timed out after {0}s waiting for {1} to become visible from {2}".
            format(i, basename, self.client_id))

    def lock_background(self, basename="background_file", do_flock=True):
        """
        Open and lock files for writing, holding the lock in a background process
        """
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        script_builder = """
            import time
            import fcntl
            import struct"""
        if do_flock:
            script_builder += """
            f1 = open("{path}-1", 'w')
            fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)"""
        script_builder += """
            f2 = open("{path}-2", 'w')
            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            while True:
                time.sleep(1)
            """

        pyscript = dedent(script_builder).format(path=path)

        log.info("lock_background file {0}".format(basename))
        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def lock_and_release(self, basename="background_file"):
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        script = """
            import time
            import fcntl
            import struct
            f1 = open("{path}-1", 'w')
            fcntl.flock(f1, fcntl.LOCK_EX)
            f2 = open("{path}-2", 'w')
            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            """
        pyscript = dedent(script).format(path=path)

        log.info("lock_and_release file {0}".format(basename))
        return self._run_python(pyscript)

    def check_filelock(self, basename="background_file", do_flock=True):
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        script_builder = """
            import fcntl
            import errno
            import struct"""
        if do_flock:
            script_builder += """
            f1 = open("{path}-1", 'r')
            try:
                fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)
            except IOError, e:
                if e.errno == errno.EAGAIN:
                    pass
            else:
                raise RuntimeError("flock on file {path}-1 not found")"""
        script_builder += """
            f2 = open("{path}-2", 'r')
            try:
                lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
                fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
            except IOError, e:
                if e.errno == errno.EAGAIN:
                    pass
            else:
                raise RuntimeError("posix lock on file {path}-2 not found")
            """
        pyscript = dedent(script_builder).format(path=path)

        log.info("check lock on file {0}".format(basename))
        self.client_remote.run(args=['sudo', 'python', '-c', pyscript])

    def write_background(self, basename="background_file", loop=False):
        """
        Open a file for writing in a background process.  The process
        writes once and exits unless ``loop`` is true, in which case it
        keeps writing until it is killed.

        :param basename: name of the file, relative to the mountpoint
        :return: the RemoteProcess running the writer
        """
        assert (self.is_mounted())

        path = os.path.join(self.mountpoint, basename)

        pyscript = dedent("""
            import os
            import time

            fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0644)
            try:
                while True:
                    os.write(fd, 'content')
                    time.sleep(1)
                    if not {loop}:
                        break
            except IOError, e:
                pass
            os.close(fd)
            """).format(path=path, loop=str(loop))

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def write_n_mb(self, filename, n_mb, seek=0, wait=True):
        """
        Write the requested number of megabytes to a file
        """
        assert (self.is_mounted())

        return self.run_shell([
            "dd", "if=/dev/urandom", "of={0}".format(filename), "bs=1M",
            "conv=fdatasync", "count={0}".format(n_mb), "seek={0}".format(seek)
        ],
                              wait=wait)

    def write_test_pattern(self, filename, size):
        log.info("Writing {0} bytes to {1}".format(size, filename))
        return self.run_python(
            dedent("""
            import zlib
            path = "{path}"
            f = open(path, 'w')
            for i in range(0, {size}):
                val = zlib.crc32("%s" % i) & 7
                f.write(chr(val))
            f.close()
        """.format(path=os.path.join(self.mountpoint, filename), size=size)))

    def validate_test_pattern(self, filename, size):
        log.info("Validating {0} bytes from {1}".format(size, filename))
        return self.run_python(
            dedent("""
            import zlib
            path = "{path}"
            f = open(path, 'r')
            bytes = f.read()
            f.close()
            if len(bytes) != {size}:
                raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format(
                    len(bytes), {size}
                ))
            for i, b in enumerate(bytes):
                val = zlib.crc32("%s" % i) & 7
                if b != chr(val):
                    raise RuntimeError("Bad data at offset {{0}}".format(i))
        """.format(path=os.path.join(self.mountpoint, filename), size=size)))

    def open_n_background(self, fs_path, count):
        """
        Open N files for writing, hold them open in a background process

        :param fs_path: Path relative to CephFS root, e.g. "foo/bar"
        :return: a RemoteProcess
        """
        assert (self.is_mounted())

        abs_path = os.path.join(self.mountpoint, fs_path)

        pyscript = dedent("""
            import sys
            import time
            import os

            n = {count}
            abs_path = "{abs_path}"

            if not os.path.exists(os.path.dirname(abs_path)):
                os.makedirs(os.path.dirname(abs_path))

            handles = []
            for i in range(0, n):
                fname = "{{0}}_{{1}}".format(abs_path, i)
                handles.append(open(fname, 'w'))

            while True:
                time.sleep(1)
            """).format(abs_path=abs_path, count=count)

        rproc = self._run_python(pyscript)
        self.background_procs.append(rproc)
        return rproc

    def create_n_files(self, fs_path, count, sync=False):
        assert (self.is_mounted())

        abs_path = os.path.join(self.mountpoint, fs_path)

        pyscript = dedent("""
            import sys
            import time
            import os

            n = {count}
            abs_path = "{abs_path}"

            if not os.path.exists(os.path.dirname(abs_path)):
                os.makedirs(os.path.dirname(abs_path))

            for i in range(0, n):
                fname = "{{0}}_{{1}}".format(abs_path, i)
                h = open(fname, 'w')
                h.write('content')
                if {sync}:
                    h.flush()
                    os.fsync(h.fileno())
                h.close()
            """).format(abs_path=abs_path, count=count, sync=str(sync))

        self.run_python(pyscript)

    def teardown(self):
        for p in self.background_procs:
            log.info("Terminating background process")
            self._kill_background(p)

        self.background_procs = []

    def _kill_background(self, p):
        if p.stdin:
            p.stdin.close()
            try:
                p.wait()
            except (CommandFailedError, ConnectionLostError):
                pass

    def kill_background(self, p):
        """
        For a process that was returned by one of the _background member functions,
        kill it hard.
        """
        self._kill_background(p)
        self.background_procs.remove(p)

    def send_signal(self, signal):
        signal = signal.lower()
        if signal not in ['sigstop', 'sigcont', 'sigterm', 'sigkill']:
            raise NotImplementedError

        self.client_remote.run(
            args=['sudo', 'kill', '-{0}'.format(signal), self.client_pid],
            omit_sudo=False)

    def get_global_id(self):
        raise NotImplementedError()

    def get_global_inst(self):
        raise NotImplementedError()

    def get_global_addr(self):
        raise NotImplementedError()

    def get_osd_epoch(self):
        raise NotImplementedError()

    def lstat(self, fs_path, follow_symlinks=False, wait=True):
        return self.stat(fs_path, follow_symlinks=False, wait=wait)

    def stat(self, fs_path, follow_symlinks=True, wait=True):
        """
        stat a file, and return the result as a dictionary like this:
        {
          "st_ctime": 1414161137.0,
          "st_mtime": 1414161137.0,
          "st_nlink": 33,
          "st_gid": 0,
          "st_dev": 16777218,
          "st_size": 1190,
          "st_ino": 2,
          "st_uid": 0,
          "st_mode": 16877,
          "st_atime": 1431520593.0
        }

        Raises exception on absent file.
        """
        abs_path = os.path.join(self.mountpoint, fs_path)
        if follow_symlinks:
            stat_call = "os.stat('" + abs_path + "')"
        else:
            stat_call = "os.lstat('" + abs_path + "')"

        pyscript = dedent("""
            import os
            import stat
            import json
            import sys

            try:
                s = {stat_call}
            except OSError as e:
                sys.exit(e.errno)

            attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"]
            print(json.dumps(
                dict([(a, getattr(s, a)) for a in attrs]),
                indent=2))
            """).format(stat_call=stat_call)
        proc = self._run_python(pyscript)
        if wait:
            proc.wait()
            return json.loads(proc.stdout.getvalue().strip())
        else:
            return proc

    def touch(self, fs_path):
        """
        Create a dentry if it doesn't already exist.  This python
        implementation exists because the usual command line tool doesn't
        pass through error codes like EIO.

        :param fs_path:
        :return:
        """
        abs_path = os.path.join(self.mountpoint, fs_path)
        pyscript = dedent("""
            import sys
            import errno

            try:
                f = open("{path}", "w")
                f.close()
            except IOError as e:
                sys.exit(errno.EIO)
            """).format(path=abs_path)
        proc = self._run_python(pyscript)
        proc.wait()

    def path_to_ino(self, fs_path, follow_symlinks=True):
        abs_path = os.path.join(self.mountpoint, fs_path)

        if follow_symlinks:
            pyscript = dedent("""
                import os
                import stat

                print(os.stat("{path}").st_ino)
                """).format(path=abs_path)
        else:
            pyscript = dedent("""
                import os
                import stat

                print(os.lstat("{path}").st_ino)
                """).format(path=abs_path)

        proc = self._run_python(pyscript)
        proc.wait()
        return int(proc.stdout.getvalue().strip())

    def path_to_nlink(self, fs_path):
        abs_path = os.path.join(self.mountpoint, fs_path)

        pyscript = dedent("""
            import os
            import stat

            print(os.stat("{path}").st_nlink)
            """).format(path=abs_path)

        proc = self._run_python(pyscript)
        proc.wait()
        return int(proc.stdout.getvalue().strip())

    def ls(self, path=None):
        """
        Wrap ls: return a list of strings
        """
        cmd = ["ls"]
        if path:
            cmd.append(path)

        ls_text = self.run_shell(cmd).stdout.getvalue().strip()

        if ls_text:
            return ls_text.split("\n")
        else:
            # Special case because otherwise split on empty string
            # gives you [''] instead of []
            return []

    def setfattr(self, path, key, val):
        """
        Wrap setfattr.

        :param path: relative to mount point
        :param key: xattr name
        :param val: xattr value
        :return: None
        """
        self.run_shell(["setfattr", "-n", key, "-v", val, path])

    def getfattr(self, path, attr):
        """
        Wrap getfattr: return the value of a named xattr on one file, or
        None if the attribute is not found.

        :return: a string
        """
        p = self.run_shell(["getfattr", "--only-values", "-n", attr, path],
                           wait=False)
        try:
            p.wait()
        except CommandFailedError as e:
            if e.exitstatus == 1 and "No such attribute" in p.stderr.getvalue():
                return None
            else:
                raise

        return p.stdout.getvalue()

    def df(self):
        """
        Wrap df: return a dict of usage fields in bytes
        """

        p = self.run_shell(["df", "-B1", "."])
        lines = p.stdout.getvalue().strip().split("\n")
        fs, total, used, avail = lines[1].split()[:4]
        log.warn(lines)

        return {
            "total": int(total),
            "used": int(used),
            "available": int(avail)
        }
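Most of the helpers above work by formatting a small dedented Python script, executing it with ``python -c`` (remotely, via _run_python), and parsing whatever the script prints. The snippet below reproduces that technique locally with subprocess, mirroring how stat() gets structured JSON back from the script; it is a local illustration only, not the teuthology execution path.

# Local illustration of the "dedent a script, run it with python -c, parse
# the output" technique used by _run_python()/stat(); subprocess stands in
# for the remote execution that teuthology performs.
import json
import subprocess
import sys
from textwrap import dedent


def local_stat(path):
    pyscript = dedent("""
        import os, json
        s = os.stat("{path}")
        attrs = ["st_mode", "st_size", "st_mtime"]
        print(json.dumps(dict((a, getattr(s, a)) for a in attrs)))
        """).format(path=path)
    out = subprocess.check_output([sys.executable, '-c', pyscript])
    return json.loads(out.decode('utf-8').strip())


if __name__ == '__main__':
    print(local_stat('.'))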
Exemple #42
0
def task(ctx, config):
    """
    Stress test the mds by running scrub iterations while another task/workunit
    is running.
    Example config:

    - fwd_scrub:
      scrub_timeout: 300
      sleep_between_iterations: 1
    """

    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'fwd_scrub task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 0, \
        'fwd_scrub task requires at least 1 metadata server'

    (first, ) = ctx.cluster.only(f'mds.{mdslist[0]}').remotes.keys()
    manager = ceph_manager.CephManager(
        first,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    status = mds_cluster.status()
    while True:
        steady = True
        for info in status.get_all():
            state = info['state']
            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
                steady = False
                break
        if steady:
            break
        sleep(2)
        status = mds_cluster.status()

    log.info('Ready to start scrub thrashing')

    manager.wait_for_clean()
    assert manager.is_clean()

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    for fs in status.get_filesystems():
        fwd_scrubber = ForwardScrubber(Filesystem(ctx, fscid=fs['id']),
                                       config['scrub_timeout'],
                                       config['sleep_between_iterations'])
        fwd_scrubber.start()
        ctx.ceph[config['cluster']].thrashers.append(fwd_scrubber)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining ForwardScrubbers')
        stop_all_fwd_scrubbers(ctx.ceph[config['cluster']].thrashers)
        log.info('done joining')
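The steady-state wait above polls the MDS map until every daemon is active, standby, or standby-replay, and will loop indefinitely. Below is a hedged standalone sketch of the same polling pattern with an explicit timeout added; get_states is a stand-in for querying the MDS map and is not a teuthology call.

# Hedged sketch of the "wait until every daemon is in an allowed state"
# polling loop, with an explicit timeout.  get_states is a stand-in for
# querying the MDS map; it is not part of the teuthology API.
import time

ALLOWED = ('up:active', 'up:standby', 'up:standby-replay')


def wait_for_steady_state(get_states, timeout=300, interval=2):
    elapsed = 0
    while True:
        states = get_states()
        if all(s in ALLOWED for s in states):
            return
        if elapsed >= timeout:
            raise RuntimeError("MDS cluster not steady after %ds: %s"
                               % (elapsed, states))
        time.sleep(interval)
        elapsed += interval


if __name__ == '__main__':
    wait_for_steady_state(lambda: ['up:active', 'up:standby'], timeout=10)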
def task(ctx, config):
    """
    Run the CephFS test cases.

    Run everything in tasks/cephfs/test_*.py:

    ::

        tasks:
          - install:
          - ceph:
          - ceph-fuse:
          - cephfs_test_runner:

    `modules` argument allows running only some specific modules:

    ::

        tasks:
            ...
          - cephfs_test_runner:
              modules:
                - tasks.cephfs.test_sessionmap
                - tasks.cephfs.test_auto_repair

    By default, any cases that can't be run on the current cluster configuration
    will generate a failure.  When the optional `fail_on_skip` argument is set
    to false, any tests that can't be run on the current configuration will
    simply be skipped:

    ::

        tasks:
            ...
          - cephfs_test_runner:
              fail_on_skip: false

    """

    ceph_cluster = CephCluster(ctx)

    if len(list(misc.all_roles_of_type(ctx.cluster, 'mds'))):
        mds_cluster = MDSCluster(ctx)
        fs = Filesystem(ctx)
    else:
        mds_cluster = None
        fs = None

    if len(list(misc.all_roles_of_type(ctx.cluster, 'mgr'))):
        mgr_cluster = MgrCluster(ctx)
    else:
        mgr_cluster = None

    # Mount objects, sorted by ID
    if hasattr(ctx, 'mounts'):
        mounts = [
            v for k, v in sorted(ctx.mounts.items(),
                                 lambda a, b: cmp(a[0], b[0]))
        ]
    else:
        # The test configuration has a filesystem but no fuse/kclient mounts
        mounts = []

    decorating_loader = DecoratingLoader({
        "ctx": ctx,
        "mounts": mounts,
        "fs": fs,
        "ceph_cluster": ceph_cluster,
        "mds_cluster": mds_cluster,
        "mgr_cluster": mgr_cluster,
    })

    fail_on_skip = config.get('fail_on_skip', True)

    # Put useful things onto ctx for interactive debugging
    ctx.fs = fs
    ctx.mds_cluster = mds_cluster
    ctx.mgr_cluster = mgr_cluster

    # Depending on config, either load specific modules, or scan for modules
    if config and 'modules' in config and config['modules']:
        module_suites = []
        for mod_name in config['modules']:
            # Test names like cephfs.test_auto_repair
            module_suites.append(decorating_loader.loadTestsFromName(mod_name))
        overall_suite = suite.TestSuite(module_suites)
    else:
        # Default, run all tests
        overall_suite = decorating_loader.discover(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "cephfs/"))

    if ctx.config.get("interactive-on-error", False):
        InteractiveFailureResult.ctx = ctx
        result_class = InteractiveFailureResult
    else:
        result_class = unittest.TextTestResult

    class LoggingResult(result_class):
        def startTest(self, test):
            log.info("Starting test: {0}".format(self.getDescription(test)))
            return super(LoggingResult, self).startTest(test)

        def addSkip(self, test, reason):
            if fail_on_skip:
                # Don't just call addFailure because that requires a traceback
                self.failures.append((test, reason))
            else:
                super(LoggingResult, self).addSkip(test, reason)

    # Execute!
    result = unittest.TextTestRunner(stream=LogStream(),
                                     resultclass=LoggingResult,
                                     verbosity=2,
                                     failfast=True).run(overall_suite)

    if not result.wasSuccessful():
        result.printErrors()  # duplicate output at end for convenience

        bad_tests = []
        for test, error in result.errors:
            bad_tests.append(str(test))
        for test, failure in result.failures:
            bad_tests.append(str(test))

        raise RuntimeError("Test failure: {0}".format(", ".join(bad_tests)))

    yield
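When fail_on_skip is true, LoggingResult above records skipped tests as failures rather than letting them pass silently. The standalone snippet below demonstrates that result-class pattern against plain unittest, without the teuthology plumbing; the test case is purely illustrative.

# Standalone demonstration of the "treat skips as failures" result-class
# pattern used by LoggingResult, against plain unittest.
import unittest


class SkipsAreFailuresResult(unittest.TextTestResult):
    def addSkip(self, test, reason):
        # Record the skip as a failure instead of silently passing over it.
        self.failures.append((test, reason))


class Demo(unittest.TestCase):
    def test_skipped(self):
        self.skipTest("not runnable on this configuration")


if __name__ == '__main__':
    runner = unittest.TextTestRunner(resultclass=SkipsAreFailuresResult,
                                     verbosity=2)
    result = runner.run(unittest.TestLoader().loadTestsFromTestCase(Demo))
    print("successful: %s" % result.wasSuccessful())  # False: the skip counts as a failure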
Exemple #44
0
def task(ctx, config):
    fs = Filesystem(ctx)

    run_test(ctx, config, fs)