Example #1
    def __init__(self, fsid, cluster_name):
        """
        Requiring cluster_name and fsid is redundant (ideally everything would
        speak in terms of fsid), but it is convenient, because the librados
        interface wants a cluster name when you create a client, and otherwise
        we would have to look it up via ceph.conf.
        """
        # getChild isn't in 2.6
        logname = '.'.join((log.name, self.__class__.__name__))
        self.log = logging.getLogger(logname)
        self.requested_at = now()
        self.completed_at = None

        # This is actually kind of overkill compared with having a counter
        # somewhere, but it's easy.
        self.id = str(uuid.uuid4())

        self._minion_id = None
        self.fsid = fsid
        self._cluster_name = cluster_name

        self.jid = None

        self.state = self.NEW
        self.result = None
        self.error = False
        self.error_message = ""

        # Time at which we last believed the current JID to be really running
        self.alive_at = None
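As an aside on the docstring above: a minimal sketch of the librados detail it refers to, assuming the standard python-rados binding. The paths, client id and cluster name below are conventional defaults, not values taken from this module.

import rados

# Hedged sketch: python-rados asks for the cluster name when building a
# client handle, which is why this class keeps cluster_name alongside fsid.
cluster = rados.Rados(
    clustername='ceph',                  # the name the constructor wants
    rados_id='admin',                    # conventional client id, assumed
    conffile='/etc/ceph/ceph.conf',      # otherwise we would look things up here
)
cluster.connect()
print(cluster.get_fsid())                # the fsid we would rather key on
cluster.shutdown()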
    def _is_favorite(self, minion_id):
        """
        Check if this minion is the one which we are currently treating
        as the primary source of updates, and promote it to be the
        favourite if the favourite has not sent a heartbeat since
        rlyeh->favorite_timeout_s.

        :return: True if this minion was the favorite or has just been
                promoted.
        """
        t_now = now()
        self._last_heartbeat[minion_id] = t_now

        if self._favorite_mon is None:
            log.debug("%s is my new favourite" % minion_id)
            self._set_favorite(minion_id)
            return True
        elif minion_id != self._favorite_mon:
            # Consider whether this minion should become my new favourite: has it been
            # too long since my current favourite reported in?
            time_since = t_now - self._last_heartbeat[self._favorite_mon]
            favorite_timeout_s = self._servers.get_contact_period(
                self._favorite_mon) * FAVORITE_TIMEOUT_FACTOR
            if time_since > datetime.timedelta(seconds=favorite_timeout_s):
                log.debug(
                    "My old favourite, %s, has not sent a heartbeat for %s: %s is my new favourite"
                    % (self._favorite_mon, time_since, minion_id))
                self._set_favorite(minion_id)

        return minion_id == self._favorite_mon
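To make the promotion rule concrete, here is a small self-contained sketch of the timeout test above; the contact period and FAVORITE_TIMEOUT_FACTOR values are invented for illustration.

import datetime

FAVORITE_TIMEOUT_FACTOR = 3              # assumed value, for illustration only
contact_period_s = 20                    # pretend the favourite reports every ~20s
favorite_timeout_s = contact_period_s * FAVORITE_TIMEOUT_FACTOR   # 60s

t_now = datetime.datetime.utcnow()
last_heartbeat_from_favorite = t_now - datetime.timedelta(seconds=90)
time_since = t_now - last_heartbeat_from_favorite

# 90s of silence > 60s timeout, so the next minion to report in gets promoted.
print(time_since > datetime.timedelta(seconds=favorite_timeout_s))   # True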
Example #3
    def _submit(self):
        self.jid = remote.run_job(self._minion_id, self._cmd, self._args)
        self.alive_at = now()

        self.log.info("Request %s started job %s" % (self.id, self.jid))

        return self.jid
Example #4
    def complete(self):
        """
        Call this when you're all done
        """
        assert self.state != self.COMPLETE
        assert self.jid is None

        self.log.info("Request %s completed with error=%s (%s)" % (self.id, self.error, self.error_message))
        self.state = self.COMPLETE
        self.completed_at = now()
Example #5
    def on_tick_response(self, minion_id, jobs):
        """
        Update the alive_at parameter of requests to record that they
        are still running remotely.

        :param jobs: The response from a saltutil.running call
        """
        log.debug("RequestCollection.on_tick_response: %s from %s" % (len(jobs), minion_id))
        for job in jobs:
            try:
                request = self._by_jid[job['jid']]
            except KeyError:
                # Not one of mine, ignore it
                pass
            else:
                request.alive_at = now()
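For reference, the loop above only needs each job entry to carry a 'jid' key; here is a fabricated example of the matching (the dictionary contents are invented, not a captured Salt response).

# Fabricated data: one JID we own, one we do not.
_by_jid = {'20240101000000000001': 'request-A'}    # normally maps JID -> request

jobs = [
    {'jid': '20240101000000000001', 'fun': 'ceph.rados_commands'},   # ours
    {'jid': '20240101000000000002', 'fun': 'test.sleep'},            # ignored
]

for job in jobs:
    try:
        request = _by_jid[job['jid']]
    except KeyError:
        pass                                        # not one of ours
    else:
        print("would refresh alive_at for %s" % request)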
Example #6
    def _submit(self, commands=None):
        if commands is None:
            commands = self._commands

        self.log.debug("%s._submit: %s/%s/%s" % (self.__class__.__name__,
                                                 self._minion_id,
                                                 self._cluster_name,
                                                 commands))

        self.jid = remote.run_job(self._minion_id, 'ceph.rados_commands',
                                  {'fsid': self.fsid,
                                   'cluster_name': self._cluster_name,
                                   'commands': commands})
        self.log.info("Request %s started job %s" % (self.id, self.jid))
        self.alive_at = now()

        return self.jid
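To make the payload concrete, here is an invented example of the arguments passed to remote.run_job above; the (prefix, argument-dict) pair shape of commands is an assumption about what ceph.rados_commands expects, and the fsid is a made-up value.

# Invented payload, for illustration only; real command lists are built by
# the request factories rather than written by hand.
job_args = {
    'fsid': '00000000-1111-2222-3333-444444444444',
    'cluster_name': 'ceph',
    'commands': [
        ('osd set', {'key': 'noout'}),    # assumed (prefix, args) pair shape
    ],
}
print(job_args['commands'])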
Example #7
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

        # Identify JIDs that haven't had a saltutil.running response for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                    request.id, request.jid, _now, request.alive_at
                ))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs; on the next tick we
        # will check whether alive_at was updated, which indicates non-staleness
        if query_minions:
            log.info("RequestCollection.tick: sending get_running for {0}".format(query_minions))
            self._remote.get_running(list(query_minions))
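The staleness cutoff above is pure walltime arithmetic; here is a short sketch with an assumed TICK_PERIOD of 20 seconds (the real constant is defined elsewhere in the module).

import datetime

TICK_PERIOD = 20                                        # assumed value
cutoff = datetime.timedelta(seconds=TICK_PERIOD * 3)    # 60 seconds of silence

_now = datetime.datetime.utcnow()
alive_at = _now - datetime.timedelta(seconds=75)        # last sign of life 75s ago

# 75s > 60s, so this request would be errored out with "Lost contact".
print(_now - alive_at > cutoff)                         # True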
    def on_sync_object(self, minion_id, data):
        if minion_id != self._favorite_mon:
            log.debug("Ignoring map from %s, it is not my favourite (%s)" %
                      (minion_id, self._favorite_mon))
            return

        assert data['fsid'] == self.fsid

        sync_object = data['data']

        sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
        new_object = self.inject_sync_object(minion_id, data['type'],
                                             data['version'], sync_object)
        if new_object:
            self._requests.on_map(self.fsid, sync_type, new_object)
            self._persister.update_sync_object(
                self.fsid, self.name,
                sync_type.str, new_object.version if isinstance(
                    new_object.version, int) else None, now(), sync_object)
        else:
            log.warn(
                "ClusterMonitor.on_sync_object: stale object received from %s"
                % minion_id)
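For orientation, the data argument handled above carries the fsid, the map type, a version and the map payload itself; the field names below are inferred from the accesses in the method, and the values are invented.

data = {
    'fsid': 'cafecafe-0000-0000-0000-000000000000',         # must match self.fsid
    'type': 'mon_status',                                   # key into SYNC_OBJECT_STR_TYPE
    'version': 12,                                          # epoch for versioned maps
    'data': {'election_epoch': 12, 'quorum': [0, 1, 2]},    # the sync object itself
}
print(data['type'])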
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably handling a replay from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        try:
            # TODO clean up unused 'since' argument
            jid = remote.run_job(
                minion_id, 'ceph.get_cluster_object', {
                    'cluster_name': self._cluster_name,
                    'sync_type': sync_type.str,
                    'since': None
                })
        except Unavailable:
            # Don't throw an exception: if a fetch fails we will end up
            # issuing another one on the next heartbeat
            log.error("Failed to start fetch job %s/%s" %
                      (minion_id, sync_type))
        else:
            log.debug("SyncObjects.fetch: jid=%s" % jid)

    def on_version(self, reported_by, sync_type, new_version):
        """
        Notify me that a particular version of a particular map exists.

        I may choose to initiate RPC to retrieve the map
        """
        log.debug("SyncObjects.on_version %s/%s/%s" %
                  (reported_by, sync_type.str, new_version))
        old_version = self.get_version(sync_type)
        if sync_type.cmp(new_version, old_version) > 0:
            known_version = self._known_versions[sync_type]
            if sync_type.cmp(new_version, known_version) > 0:
                # We are out of date: request an up-to-date copy
                log.info("Advanced known version %s/%s %s->%s" %
                         (self._cluster_name, sync_type.str, known_version,
                          new_version))
                self._known_versions[sync_type] = new_version
            else:
                log.info("on_version: %s is newer than %s" %
                         (new_version, old_version))

            # If we already have a request out for this type of map, then consider
            # cancelling it if we've already waited for a while.
            if self._fetching_at[sync_type] is not None:
                if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                    log.info("Fetch already underway for %s" % sync_type.str)
                    return
                else:
                    log.warn("Abandoning fetch for %s started at %s" %
                             (sync_type.str, self._fetching_at[sync_type]))

            log.info(
                "on_version: fetching %s/%s from %s, currently got %s, know %s"
                % (sync_type, new_version, reported_by, old_version,
                   known_version))
            self.fetch(reported_by, sync_type)
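Finally, the flow above hinges on sync_type.cmp; as a hedged sketch, for integer-versioned maps it behaves like a plain epoch comparison (the helper below is an assumption for illustration, not the real implementation).

def cmp_versions(a, b):
    # Assumed behaviour for epoch-versioned maps only.
    return (a > b) - (a < b)

old_version = 40     # version we have injected locally
known_version = 41   # newest version we have heard about
new_version = 42     # version a minion just reported

if cmp_versions(new_version, old_version) > 0:
    if cmp_versions(new_version, known_version) > 0:
        print("advance known version to %s and fetch" % new_version)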