# Imports assumed by the excerpts below; module paths follow the ceph-qa
# tree and may differ between Ceph versions.
import logging
import random
import time

from gevent.event import Event
from gevent.greenlet import Greenlet

from tasks.cephfs.filesystem import Filesystem

log = logging.getLogger(__name__)


def task(ctx, config):
    """
    Upgrade CephFS file system snap format.
    """

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'snap-upgrade task only accepts a dict for configuration'

    fs = Filesystem(ctx)

    mds_map = fs.get_mds_map()
    assert(mds_map['max_mds'] == 1)

    json = fs.rank_tell(["scrub", "start", "/", "force", "recursive", "repair"])
    if not json or json['return_code'] == 0:
        log.info("scrub / completed")
    else:
        log.info("scrub / failed: {}".format(json))

    json = fs.rank_tell(["scrub", "start", "~mdsdir", "force", "recursive", "repair"])
    if not json or json['return_code'] == 0:
        log.info("scrub ~mdsdir completed")
    else:
        log.info("scrub / failed: {}".format(json))

    for i in range(0, 10):
        mds_map = fs.get_mds_map()
        if (mds_map['flags'] & (1<<1)) != 0 and (mds_map['flags'] & (1<<4)) != 0:
            break
        time.sleep(10)
    assert((mds_map['flags'] & (1<<1)) != 0) # Test CEPH_MDSMAP_ALLOW_SNAPS
    assert((mds_map['flags'] & (1<<4)) != 0) # Test CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS
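
The polling loop above waits on two MDS map feature-flag bits; the bit positions come from the comments on the asserts (CEPH_MDSMAP_ALLOW_SNAPS is bit 1, CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS is bit 4). A minimal sketch of the same check with named constants; the constant and helper names here are illustrative, not part of the task:

# Illustrative sketch: constant values mirror the comments in the task above.
CEPH_MDSMAP_ALLOW_SNAPS = 1 << 1
CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS = 1 << 4

def snap_flags_set(mds_map):
    """Return True once both snapshot feature flags are present in the MDS map."""
    flags = mds_map['flags']
    return (flags & CEPH_MDSMAP_ALLOW_SNAPS) != 0 and \
           (flags & CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS) != 0
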
Example #2
def task(ctx, config):
    """
    Prepare MDS cluster for upgrade.

    This task reduces ranks to 1 and stops all standbys.
    """

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mds_pre_upgrade task only accepts a dict for configuration'

    fs = Filesystem(ctx)
    status = fs.getinfo()

    fs.set_max_mds(1)
    status = fs.getinfo()
    targets = list(filter(lambda r: r['rank'] >= 1, fs.get_ranks(status=status)))
    if len(targets) > 0:
        # deactivate mds in descending order of rank
        targets = sorted(targets, key=lambda r: r['rank'], reverse=True)
        for target in targets:
            log.info("deactivating rank %d" % target['rank'])
            fs.deactivate(target['rank'])
        # wait for the deactivated ranks to leave the mds map before the
        # assertions below
        for i in range(0, 30):
            time.sleep(5)
            status = fs.getinfo()
            if fs.get_mds_map(status=status)['in'] == [0]:
                break

    assert(fs.get_mds_map(status=status)['max_mds'] == 1)
    assert(fs.get_mds_map(status=status)['in'] == [0])

    # Stop standbys now to minimize the time rank 0 is down during the
    # subsequent task:
    #   - ceph.stop: [mds.*]
    rank0 = fs.get_rank(rank=0, status=status)
    for daemon in ctx.daemons.iter_daemons_of_role('mds', fs.mon_manager.cluster):
        if rank0['name'] != daemon.id_:
            daemon.stop()

    for i in range(1, 10):
        time.sleep(5) # time for FSMap to update
        status = fs.getinfo()
        if len(list(status.get_standbys())) == 0:
            break
    assert(len(list(status.get_standbys())) == 0)
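
Both tasks above use the same poll-then-check pattern: query fs.getinfo() in a bounded loop, sleep between attempts, then assert the final state. A hypothetical helper capturing that pattern, for illustration only (wait_until is not part of the teuthology API):

import time

def wait_until(condition, attempts=10, interval=5):
    """Call condition() up to `attempts` times, sleeping `interval` seconds between tries."""
    for _ in range(attempts):
        if condition():
            return True
        time.sleep(interval)
    return condition()

# e.g. the standby-draining loop above could be expressed as:
#   assert wait_until(lambda: len(list(fs.getinfo().get_standbys())) == 0)

Example #3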
class MDSThrasher(Greenlet):
    """
    MDSThrasher::

    The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).

    The config is optional.  Many of the config parameters are a maximum value
    to use when selecting a random value from a range.  To always use the maximum
    value, set randomize to false.  The config is a dict containing some or all of:

    seed: [no default] seed the random number generator

    randomize: [default: true] enables randomization of the delays; when false,
      the maximum values are always used

    max_thrash: [default: 1] the maximum number of MDSs that will be thrashed at
      any given time.

    max_thrash_delay: [default: 30] maximum number of seconds to delay before
      thrashing again.

    max_revive_delay: [default: 10] maximum number of seconds to delay before
      bringing back a thrashed MDS

    thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
      during replay.  Value should be between 0.0 and 1.0

    max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
      the replay state before thrashing

    thrash_weights: allows specific MDSs to be thrashed more/less frequently.  This option
      overrides anything specified by max_thrash.  This option is a dict containing
      mds.x: weight pairs.  For example, [mds.a: 0.7, mds.b: 0.3, mds.c: 0.0].  Each weight
      is a value from 0.0 to 1.0.  Any MDSs not specified will be automatically
      given a weight of 0.0.  For a given MDS, by default the thrasher delays for up
      to max_thrash_delay, thrashes, waits for the MDS to recover, and iterates.  If a non-zero
      weight is specified for an MDS, for each iteration the thrasher chooses whether to thrash
      during that iteration based on a random value [0-1] not exceeding the weight of that MDS.

    Examples::


      The following example sets the likelihood that mds.a will be thrashed
      to 80%, mds.b to 20%, and other MDSs will not be thrashed.  It also sets the
      likelihood that an MDS will be thrashed in replay to 40%.
      Thrash weights do not have to sum to 1.

      tasks:
      - ceph:
      - mds_thrash:
          thrash_weights:
            - mds.a: 0.8
            - mds.b: 0.2
          thrash_in_replay: 0.4
      - ceph-fuse:
      - workunit:
          clients:
            all: [suites/fsx.sh]

      The following example disables randomization, and uses the max delay values:

      tasks:
      - ceph:
      - mds_thrash:
          max_thrash_delay: 10
          max_revive_delay: 1
          max_replay_thrash_delay: 4

    """

    def __init__(self, ctx, manager, mds_cluster, config, logger, failure_group, weight):
        super(MDSThrasher, self).__init__()

        self.ctx = ctx
        self.manager = manager
        assert self.manager.is_clean()
        self.mds_cluster = mds_cluster

        self.stopping = Event()
        self.logger = logger
        self.config = config

        self.randomize = bool(self.config.get('randomize', True))
        self.max_thrash_delay = float(self.config.get('max_thrash_delay', 30.0))
        self.thrash_in_replay = float(self.config.get('thrash_in_replay', 0.0))
        assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(
            v=self.thrash_in_replay)

        self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0))

        self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))

        self.failure_group = failure_group
        self.weight = weight

        # TODO support multiple filesystems: will require behavioural change to select
        # which filesystem to act on when doing rank-ish things
        self.fs = Filesystem(self.ctx)

    def _run(self):
        try:
            self.do_thrash()
        except:
            # Log exceptions here so we get the full backtrace (it's lost
            # by the time someone does a .get() on this greenlet)
            self.logger.exception("Exception in do_thrash:")
            raise

    def log(self, x):
        """Write data to logger assigned to this MDThrasher"""
        self.logger.info(x)

    def stop(self):
        self.stopping.set()

    def kill_mds(self, mds):
        if self.config.get('powercycle'):
            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
                         remotes.keys())
            self.log('kill_mds on mds.{m} doing powercycle of {s}'.
                     format(m=mds, s=remote.name))
            self._assert_ipmi(remote)
            remote.console.power_off()
        else:
            self.ctx.daemons.get_daemon('mds', mds).stop()

    @staticmethod
    def _assert_ipmi(remote):
        assert remote.console.has_ipmi_credentials, (
            "powercycling requested but RemoteConsole is not "
            "initialized.  Check ipmi config.")

    def kill_mds_by_rank(self, rank):
        """
        kill_mds wrapper to kill based on rank passed.
        """
        status = self.mds_cluster.get_mds_info_by_rank(rank)
        self.kill_mds(status['name'])

    def revive_mds(self, mds, standby_for_rank=None):
        """
        Revive mds -- do an ipmi powercycle (if indicated by the config)
        and then restart (using --hot-standby if specified).
        """
        if self.config.get('powercycle'):
            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
                         remotes.keys())
            self.log('revive_mds on mds.{m} doing powercycle of {s}'.
                     format(m=mds, s=remote.name))
            self._assert_ipmi(remote)
            remote.console.power_on()
            self.manager.make_admin_daemon_dir(self.ctx, remote)
        args = []
        if standby_for_rank:
            args.extend(['--hot-standby', standby_for_rank])
        self.ctx.daemons.get_daemon('mds', mds).restart(*args)

    def revive_mds_by_rank(self, rank, standby_for_rank=None):
        """
        revive_mds wrapper to revive based on rank passed.
        """
        status = self.mds_cluster.get_mds_info_by_rank(rank)
        self.revive_mds(status['name'], standby_for_rank)

    def get_mds_status_all(self):
        return self.fs.get_mds_map()

    def do_thrash(self):
        """
        Perform the random thrashing action
        """

        self.log('starting mds_do_thrash for failure group: ' + ', '.join(
            ['mds.{_id}'.format(_id=_f) for _f in self.failure_group]))
        while not self.stopping.is_set():
            delay = self.max_thrash_delay
            if self.randomize:
                delay = random.uniform(0, self.max_thrash_delay)

            if delay > 0.0:
                self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
                self.stopping.wait(delay)
                if self.stopping.is_set():
                    continue

            skip = random.random()
            if self.weight < 1.0 and skip > self.weight:
                self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip,
                                                                                                   weight=self.weight))
                continue

            # find the active mds in the failure group
            statuses = [self.mds_cluster.get_mds_info(m) for m in self.failure_group]
            actives = [s for s in statuses if s and s['state'] == 'up:active']
            assert len(actives) == 1, 'Can only have one active in a failure group'

            active_mds = actives[0]['name']
            active_rank = actives[0]['rank']

            self.log('kill mds.{id} (rank={r})'.format(id=active_mds, r=active_rank))
            self.kill_mds_by_rank(active_rank)

            # wait for mon to report killed mds as crashed
            last_laggy_since = None
            itercount = 0
            while True:
                failed = self.fs.get_mds_map()['failed']
                status = self.mds_cluster.get_mds_info(active_mds)
                if not status:
                    break
                if 'laggy_since' in status:
                    last_laggy_since = status['laggy_since']
                    break
                if any([(f == active_mds) for f in failed]):
                    break
                self.log(
                    'waiting till mds map indicates mds.{_id} is laggy/crashed, in failed state, or mds.{_id} is removed from mdsmap'.format(
                        _id=active_mds))
                itercount = itercount + 1
                if itercount > 10:
                    self.log('mds map: {status}'.format(status=self.mds_cluster.get_fs_map()))
                time.sleep(2)
            if last_laggy_since:
                self.log(
                    'mds.{_id} reported laggy/crashed since: {since}'.format(_id=active_mds, since=last_laggy_since))
            else:
                self.log('mds.{_id} down, removed from mdsmap'.format(_id=active_mds, since=last_laggy_since))

            # wait for a standby mds to takeover and become active
            takeover_mds = None
            takeover_rank = None
            itercount = 0
            while True:
                statuses = [self.mds_cluster.get_mds_info(m) for m in self.failure_group]
                actives = [s for s in statuses if s and s['state'] == 'up:active']
                if len(actives) > 0:
                    assert len(actives) == 1, 'Can only have one active in failure group'
                    takeover_mds = actives[0]['name']
                    takeover_rank = actives[0]['rank']
                    break
                itercount = itercount + 1
                if itercount > 10:
                    self.log('mds map: {status}'.format(status=self.mds_cluster.get_fs_map()))
                time.sleep(2)  # avoid busy-polling while waiting for the takeover

            self.log('New active mds is mds.{_id}'.format(_id=takeover_mds))

            # wait for a while before restarting old active to become new
            # standby
            delay = self.max_revive_delay
            if self.randomize:
                delay = random.uniform(0, self.max_revive_delay)

            self.log('waiting for {delay} secs before reviving mds.{id}'.format(
                delay=delay, id=active_mds))
            time.sleep(delay)

            self.log('reviving mds.{id}'.format(id=active_mds))
            self.revive_mds(active_mds, standby_for_rank=takeover_rank)

            status = {}
            while True:
                status = self.mds_cluster.get_mds_info(active_mds)
                if status and (status['state'] == 'up:standby' or status['state'] == 'up:standby-replay'):
                    break
                self.log(
                    'waiting till mds map indicates mds.{_id} is in standby or standby-replay'.format(_id=active_mds))
                time.sleep(2)
            self.log('mds.{_id} reported in {state} state'.format(_id=active_mds, state=status['state']))

            # don't do replay thrashing right now
            continue
            # this might race with replay -> active transition...
            if status['state'] == 'up:replay' and random.random() < self.thrash_in_replay:

                delay = self.max_replay_thrash_delay
                if self.randomize:
                    delay = random.uniform(0, self.max_replay_thrash_delay)
                time.sleep(delay)
                self.log('kill replaying mds.{id}'.format(id=self.to_kill))
                self.kill_mds(self.to_kill)

                delay = self.max_revive_delay
                if self.randomize:
                    delay = random.uniform(0, self.max_revive_delay)

                self.log('waiting for {delay} secs before reviving mds.{id}'.format(
                    delay=delay, id=self.to_kill))
                time.sleep(delay)

                self.log('revive mds.{id}'.format(id=self.to_kill))
                self.revive_mds(self.to_kill)
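
The thrash_weights behaviour described in the docstring boils down to a per-iteration draw: thrash when a uniform value in [0, 1) does not exceed the MDS's weight (a weight of 1.0 or more always thrashes). A standalone sketch of that decision, with a hypothetical helper name:

import random

def should_thrash(weight):
    """Thrash this iteration iff a uniform draw does not exceed the MDS's weight."""
    return weight >= 1.0 or random.random() <= weight

# With weight 0.8 roughly four out of five iterations thrash the MDS;
# with weight 0.0 it is never thrashed.
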
Example #4
class MDSThrasher(Greenlet):
    """
    MDSThrasher::

    The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).

    The config is optional.  Many of the config parameters are a maximum value
    to use when selecting a random value from a range.  To always use the maximum
    value, set randomize to false.  The config is a dict containing some or all of:

    seed: [no default] seed the random number generator

    randomize: [default: true] enables randomization of the delays; when false,
      the maximum values are always used

    max_thrash: [default: 1] the maximum number of MDSs that will be thrashed at
      any given time.

    max_thrash_delay: [default: 30] maximum number of seconds to delay before
      thrashing again.

    max_revive_delay: [default: 10] maximum number of seconds to delay before
      bringing back a thrashed MDS

    thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
      during replay.  Value should be between 0.0 and 1.0

    max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
      the replay state before thrashing

    thrash_weights: allows specific MDSs to be thrashed more/less frequently.  This option
      overrides anything specified by max_thrash.  This option is a dict containing
      mds.x: weight pairs.  For example, [mds.a: 0.7, mds.b: 0.3, mds.c: 0.0].  Each weight
      is a value from 0.0 to 1.0.  Any MDSs not specified will be automatically
      given a weight of 0.0.  For a given MDS, by default the thrasher delays for up
      to max_thrash_delay, thrashes, waits for the MDS to recover, and iterates.  If a non-zero
      weight is specified for an MDS, for each iteration the thrasher chooses whether to thrash
      during that iteration based on a random value [0-1] not exceeding the weight of that MDS.

    Examples::


      The following example sets the likelihood that mds.a will be thrashed
      to 80%, mds.b to 20%, and other MDSs will not be thrashed.  It also sets the
      likelihood that an MDS will be thrashed in replay to 40%.
      Thrash weights do not have to sum to 1.

      tasks:
      - ceph:
      - mds_thrash:
          thrash_weights:
            - mds.a: 0.8
            - mds.b: 0.2
          thrash_in_replay: 0.4
      - ceph-fuse:
      - workunit:
          clients:
            all: [suites/fsx.sh]

      The following example disables randomization, and uses the max delay values:

      tasks:
      - ceph:
      - mds_thrash:
          max_thrash_delay: 10
          max_revive_delay: 1
          max_replay_thrash_delay: 4

    """
    def __init__(self, ctx, manager, mds_cluster, config, logger,
                 failure_group, weight):
        super(MDSThrasher, self).__init__()

        self.ctx = ctx
        self.manager = manager
        assert self.manager.is_clean()
        self.mds_cluster = mds_cluster

        self.stopping = Event()
        self.logger = logger
        self.config = config

        self.randomize = bool(self.config.get('randomize', True))
        self.max_thrash_delay = float(self.config.get('max_thrash_delay', 30.0))
        self.thrash_in_replay = float(
            self.config.get('thrash_in_replay', 0.0))
        assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(
            v=self.thrash_in_replay)

        self.max_replay_thrash_delay = float(
            self.config.get('max_replay_thrash_delay', 4.0))

        self.max_revive_delay = float(self.config.get('max_revive_delay',
                                                      10.0))

        self.failure_group = failure_group
        self.weight = weight

        # TODO support multiple filesystems: will require behavioural change to select
        # which filesystem to act on when doing rank-ish things
        self.fs = Filesystem(self.ctx)

    def _run(self):
        try:
            self.do_thrash()
        except:
            # Log exceptions here so we get the full backtrace (it's lost
            # by the time someone does a .get() on this greenlet)
            self.logger.exception("Exception in do_thrash:")
            raise

    def log(self, x):
        """Write data to logger assigned to this MDThrasher"""
        self.logger.info(x)

    def stop(self):
        self.stopping.set()

    def kill_mds(self, mds):
        if self.config.get('powercycle'):
            (remote, ) = (self.ctx.cluster.only(
                'mds.{m}'.format(m=mds)).remotes.keys())
            self.log('kill_mds on mds.{m} doing powercycle of {s}'.format(
                m=mds, s=remote.name))
            self._assert_ipmi(remote)
            remote.console.power_off()
        else:
            self.ctx.daemons.get_daemon('mds', mds).stop()

    @staticmethod
    def _assert_ipmi(remote):
        assert remote.console.has_ipmi_credentials, (
            "powercycling requested but RemoteConsole is not "
            "initialized.  Check ipmi config.")

    def kill_mds_by_rank(self, rank):
        """
        kill_mds wrapper to kill based on rank passed.
        """
        status = self.mds_cluster.get_mds_info_by_rank(rank)
        self.kill_mds(status['name'])

    def revive_mds(self, mds, standby_for_rank=None):
        """
        Revive mds -- do an ipmi powercycle (if indicated by the config)
        and then restart (using --hot-standby if specified).
        """
        if self.config.get('powercycle'):
            (remote, ) = (self.ctx.cluster.only(
                'mds.{m}'.format(m=mds)).remotes.keys())
            self.log('revive_mds on mds.{m} doing powercycle of {s}'.format(
                m=mds, s=remote.name))
            self._assert_ipmi(remote)
            remote.console.power_on()
            self.manager.make_admin_daemon_dir(self.ctx, remote)
        args = []
        if standby_for_rank:
            args.extend(['--hot-standby', standby_for_rank])
        self.ctx.daemons.get_daemon('mds', mds).restart(*args)

    def revive_mds_by_rank(self, rank, standby_for_rank=None):
        """
        revive_mds wrapper to revive based on rank passed.
        """
        status = self.mds_cluster.get_mds_info_by_rank(rank)
        self.revive_mds(status['name'], standby_for_rank)

    def get_mds_status_all(self):
        return self.fs.get_mds_map()

    def do_thrash(self):
        """
        Perform the random thrashing action
        """

        self.log('starting mds_do_thrash for failure group: ' + ', '.join(
            ['mds.{_id}'.format(_id=_f) for _f in self.failure_group]))
        while not self.stopping.is_set():
            delay = self.max_thrash_delay
            if self.randomize:
                delay = random.uniform(0, self.max_thrash_delay)

            if delay > 0.0:
                self.log('waiting for {delay} secs before thrashing'.format(
                    delay=delay))
                self.stopping.wait(delay)
                if self.stopping.is_set():
                    continue

            skip = random.random()
            if self.weight < 1.0 and skip > self.weight:
                self.log(
                    'skipping thrash iteration with skip ({skip}) > weight ({weight})'
                    .format(skip=skip, weight=self.weight))
                continue

            # find the active mds in the failure group
            statuses = [
                self.mds_cluster.get_mds_info(m) for m in self.failure_group
            ]
            actives = [s for s in statuses if s and s['state'] == 'up:active']
            assert len(actives) == 1, 'Can only have one active in a failure group'

            active_mds = actives[0]['name']
            active_rank = actives[0]['rank']

            self.log('kill mds.{id} (rank={r})'.format(id=active_mds,
                                                       r=active_rank))
            self.kill_mds_by_rank(active_rank)

            # wait for mon to report killed mds as crashed
            last_laggy_since = None
            itercount = 0
            while True:
                failed = self.fs.get_mds_map()['failed']
                status = self.mds_cluster.get_mds_info(active_mds)
                if not status:
                    break
                if 'laggy_since' in status:
                    last_laggy_since = status['laggy_since']
                    break
                if any([(f == active_mds) for f in failed]):
                    break
                self.log(
                    'waiting till mds map indicates mds.{_id} is laggy/crashed, in failed state, or mds.{_id} is removed from mdsmap'
                    .format(_id=active_mds))
                itercount = itercount + 1
                if itercount > 10:
                    self.log('mds map: {status}'.format(
                        status=self.mds_cluster.get_fs_map()))
                time.sleep(2)
            if last_laggy_since:
                self.log(
                    'mds.{_id} reported laggy/crashed since: {since}'.format(
                        _id=active_mds, since=last_laggy_since))
            else:
                self.log('mds.{_id} down, removed from mdsmap'.format(
                    _id=active_mds, since=last_laggy_since))

            # wait for a standby mds to takeover and become active
            takeover_mds = None
            takeover_rank = None
            itercount = 0
            while True:
                statuses = [
                    self.mds_cluster.get_mds_info(m)
                    for m in self.failure_group
                ]
                actives = [s for s in statuses if s and s['state'] == 'up:active']
                if len(actives) > 0:
                    assert len(actives) == 1, 'Can only have one active in failure group'
                    takeover_mds = actives[0]['name']
                    takeover_rank = actives[0]['rank']
                    break
                itercount = itercount + 1
                if itercount > 10:
                    self.log('mds map: {status}'.format(
                        status=self.mds_cluster.get_fs_map()))
                time.sleep(2)  # avoid busy-polling while waiting for the takeover

            self.log('New active mds is mds.{_id}'.format(_id=takeover_mds))

            # wait for a while before restarting old active to become new
            # standby
            delay = self.max_revive_delay
            if self.randomize:
                delay = random.uniform(0, self.max_revive_delay)

            self.log(
                'waiting for {delay} secs before reviving mds.{id}'.format(
                    delay=delay, id=active_mds))
            time.sleep(delay)

            self.log('reviving mds.{id}'.format(id=active_mds))
            self.revive_mds(active_mds, standby_for_rank=takeover_rank)

            status = {}
            while True:
                status = self.mds_cluster.get_mds_info(active_mds)
                if status and (status['state'] == 'up:standby'
                               or status['state'] == 'up:standby-replay'):
                    break
                self.log(
                    'waiting till mds map indicates mds.{_id} is in standby or standby-replay'
                    .format(_id=active_mds))
                time.sleep(2)
            self.log('mds.{_id} reported in {state} state'.format(
                _id=active_mds, state=status['state']))

            # don't do replay thrashing right now
            continue
            # this might race with replay -> active transition...
            if status['state'] == 'up:replay' and random.random() < self.thrash_in_replay:

                delay = self.max_replay_thrash_delay
                if self.randomize:
                    delay = random.uniform(0, self.max_replay_thrash_delay)
                time.sleep(delay)
                self.log('kill replaying mds.{id}'.format(id=self.to_kill))
                self.kill_mds(self.to_kill)

                delay = self.max_revive_delay
                if self.randomize:
                    delay = random.uniform(0, self.max_revive_delay)

                self.log(
                    'waiting for {delay} secs before reviving mds.{id}'.format(
                        delay=delay, id=self.to_kill))
                time.sleep(delay)

                self.log('revive mds.{id}'.format(id=self.to_kill))
                self.revive_mds(self.to_kill)
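
For context, a minimal sketch of how a surrounding task might drive the thrasher greenlet; ctx, manager, mds_cluster, failure_group, and weight are placeholders assumed to be prepared by the caller, and the empty config means the defaults documented in the docstring apply:

import logging

logger = logging.getLogger('mds_thrasher.example')
thrasher = MDSThrasher(ctx, manager, mds_cluster, config={}, logger=logger,
                       failure_group=failure_group, weight=1.0)
thrasher.start()   # gevent schedules MDSThrasher._run(), which calls do_thrash()
try:
    pass           # ... run client workloads while the MDSs are being thrashed ...
finally:
    thrasher.stop()  # set the stopping Event checked by do_thrash()
    thrasher.join()  # wait for the greenlet to exit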