def __init__(self, fsid, cluster_name):
    """
    Requiring cluster_name and fsid is redundant (ideally everything would
    speak in terms of fsid), but convenient because the librados interface
    wants a cluster name when you create a client, and otherwise we would
    have to look it up via ceph.conf.
    """
    # getChild isn't in 2.6
    logname = '.'.join((log.name, self.__class__.__name__))
    self.log = logging.getLogger(logname)

    self.requested_at = now()
    self.completed_at = None

    # This is actually kind of overkill compared with having a counter
    # somewhere, but it's easy.
    self.id = uuid.uuid4().__str__()

    self._minion_id = None
    self.fsid = fsid
    self._cluster_name = cluster_name

    self.jid = None

    self.state = self.NEW
    self.result = None
    self.error = False
    self.error_message = ""

    # Time at which we last believed the current JID to be really running
    self.alive_at = None
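
# --- Illustrative sketch (not part of the original source) ---
# The docstring above notes that librados wants a cluster name when creating a
# client, which is why this class carries cluster_name alongside fsid.  A
# minimal sketch using the python-rados bindings; the conf path and client
# name are assumptions and a reachable cluster is required.
import rados

cluster = rados.Rados(conffile='/etc/ceph/ceph.conf',
                      clustername='ceph',      # the cluster_name this class stores
                      name='client.admin')
cluster.connect()
try:
    print(cluster.get_fsid())                  # the fsid this class also stores
finally:
    cluster.shutdown()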
def _is_favorite(self, minion_id):
    """
    Check if this minion is the one which we are currently treating as the
    primary source of updates, and promote it to be the favourite if the
    current favourite has not sent a heartbeat within favorite_timeout_s.

    :return True if this minion was the favourite or has just been promoted.
    """
    t_now = now()
    self._last_heartbeat[minion_id] = t_now

    if self._favorite_mon is None:
        log.debug("%s is my new favourite" % minion_id)
        self._set_favorite(minion_id)
        return True
    elif minion_id != self._favorite_mon:
        # Consider whether this minion should become my new favourite: has it
        # been too long since my current favourite reported in?
        time_since = t_now - self._last_heartbeat[self._favorite_mon]
        favorite_timeout_s = self._servers.get_contact_period(self._favorite_mon) * FAVORITE_TIMEOUT_FACTOR
        if time_since > datetime.timedelta(seconds=favorite_timeout_s):
            log.debug("My old favourite, %s, has not sent a heartbeat for %s: %s is my new favourite" % (
                self._favorite_mon, time_since, minion_id
            ))
            self._set_favorite(minion_id)

    return minion_id == self._favorite_mon
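
# --- Illustrative sketch (not part of the original source) ---
# The favourite-promotion rule above, reduced to plain datetimes and a dict of
# heartbeats.  FAVORITE_TIMEOUT_FACTOR, CONTACT_PERIOD_S and the minion names
# are assumptions standing in for the real configuration.
import datetime

FAVORITE_TIMEOUT_FACTOR = 3
CONTACT_PERIOD_S = 10          # hypothetical per-minion reporting interval

last_heartbeat = {}


def pick_favorite(current_favorite, minion_id, t_now):
    """Return which minion should be the favourite after this heartbeat."""
    last_heartbeat[minion_id] = t_now
    if current_favorite is None:
        return minion_id
    if minion_id != current_favorite:
        silence = t_now - last_heartbeat[current_favorite]
        if silence > datetime.timedelta(seconds=CONTACT_PERIOD_S * FAVORITE_TIMEOUT_FACTOR):
            return minion_id
    return current_favorite


t0 = datetime.datetime.utcnow()
favorite = pick_favorite(None, "mon.a", t0)
# mon.b reports 40s later; mon.a has been silent longer than 30s, so it is demoted
favorite = pick_favorite(favorite, "mon.b", t0 + datetime.timedelta(seconds=40))
print(favorite)   # -> mon.b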
def _submit(self):
    self.jid = remote.run_job(self._minion_id, self._cmd, self._args)
    self.alive_at = now()
    self.log.info("Request %s started job %s" % (self.id, self.jid))

    return self.jid
def complete(self):
    """
    Call this when you're all done
    """
    assert self.state != self.COMPLETE
    assert self.jid is None

    self.log.info("Request %s completed with error=%s (%s)" % (self.id, self.error, self.error_message))
    self.state = self.COMPLETE
    self.completed_at = now()
def on_tick_response(self, minion_id, jobs):
    """
    Update the alive_at parameter of requests to record that they are still
    running remotely.

    :param jobs: The response from a saltutil.running
    """
    log.debug("RequestCollection.on_tick_response: %s from %s" % (len(jobs), minion_id))
    for job in jobs:
        try:
            request = self._by_jid[job['jid']]
        except KeyError:
            # Not one of mine, ignore it
            pass
        else:
            request.alive_at = now()
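
# --- Illustrative sketch (not part of the original source) ---
# How a saltutil.running-style response maps back onto requests via their JID.
# FakeRequest, the JID strings and the job dicts are assumptions used purely
# for demonstration.
import datetime


class FakeRequest(object):
    def __init__(self, jid):
        self.jid = jid
        self.alive_at = None


by_jid = {'20140101000000000000': FakeRequest('20140101000000000000')}

running_jobs = [
    {'jid': '20140101000000000000'},   # one of ours: alive_at is refreshed
    {'jid': '20991231235959999999'},   # someone else's job: ignored
]

for job in running_jobs:
    request = by_jid.get(job['jid'])
    if request is not None:
        request.alive_at = datetime.datetime.utcnow()

print(by_jid['20140101000000000000'].alive_at)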
def _submit(self, commands=None):
    if commands is None:
        commands = self._commands

    self.log.debug("%s._submit: %s/%s/%s" % (
        self.__class__.__name__, self._minion_id, self._cluster_name, commands))

    self.jid = remote.run_job(self._minion_id, 'ceph.rados_commands', {
        'fsid': self.fsid,
        'cluster_name': self._cluster_name,
        'commands': commands
    })
    self.log.info("Request %s started job %s" % (self.id, self.jid))
    self.alive_at = now()

    return self.jid
def tick(self):
    """
    For walltime-based monitoring of running requests.  Long-running requests
    get a periodic call to saltutil.running to verify that things really are
    still happening.
    """
    if not self._by_jid:
        return
    else:
        log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

    # Identify JIDs which haven't had a saltutil.running response for too long.
    # Kill requests in a separate phase because request:JID is not 1:1
    stale_jobs = set()
    _now = now()
    for request in self._by_jid.values():
        if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
            log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                request.id, request.jid, _now, request.alive_at
            ))
            stale_jobs.add(request)

    # Any identified stale jobs are errored out.
    for request in stale_jobs:
        with self._update_index(request):
            request.set_error("Lost contact")
            request.jid = None
            request.complete()

    # Identify minions associated with JIDs in flight
    query_minions = set()
    for jid, request in self._by_jid.items():
        query_minions.add(request.minion_id)

    # Attempt to emit a saltutil.running to ping jobs; on the next tick we will
    # see whether alive_at was updated to indicate non-staleness.
    if query_minions:
        log.info("RequestCollection.tick: sending get_running for {0}".format(query_minions))
        self._remote.get_running(list(query_minions))
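
# --- Illustrative sketch (not part of the original source) ---
# The staleness rule used in tick() above: a request is treated as lost if no
# saltutil.running response has refreshed alive_at within three tick periods.
# TICK_PERIOD and the timestamps are assumptions.
import datetime

TICK_PERIOD = 20   # hypothetical seconds between ticks


def is_stale(alive_at, t_now):
    return t_now - alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3)


t_now = datetime.datetime.utcnow()
print(is_stale(t_now - datetime.timedelta(seconds=30), t_now))    # False: recently alive
print(is_stale(t_now - datetime.timedelta(seconds=120), t_now))   # True: lost contact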
def on_sync_object(self, minion_id, data):
    if minion_id != self._favorite_mon:
        log.debug("Ignoring map from %s, it is not my favourite (%s)" % (minion_id, self._favorite_mon))

    assert data['fsid'] == self.fsid

    sync_object = data['data']

    sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
    new_object = self.inject_sync_object(minion_id, data['type'], data['version'], sync_object)
    if new_object:
        self._requests.on_map(self.fsid, sync_type, new_object)
        self._persister.update_sync_object(
            self.fsid,
            self.name,
            sync_type.str,
            new_object.version if isinstance(new_object.version, int) else None,
            now(),
            sync_object)
    else:
        log.warn("ClusterMonitor.on_sync_object: stale object received from %s" % minion_id)
def fetch(self, minion_id, sync_type):
    log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
    if minion_id is None:
        # We're probably being replayed to from the database
        log.warn("SyncObjects.fetch called with minion_id=None")
        return

    self._fetching_at[sync_type] = now()
    try:
        # TODO clean up unused 'since' argument
        jid = remote.run_job(minion_id, 'ceph.get_cluster_object', {
            'cluster_name': self._cluster_name,
            'sync_type': sync_type.str,
            'since': None
        })
    except Unavailable:
        # Don't throw an exception, because if a fetch fails we should end up
        # issuing another one on the next heartbeat
        log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
    else:
        log.debug("SyncObjects.fetch: jid=%s" % jid)
def on_version(self, reported_by, sync_type, new_version):
    """
    Notify me that a particular version of a particular map exists.

    I may choose to initiate RPC to retrieve the map.
    """
    log.debug("SyncObjects.on_version %s/%s/%s" % (reported_by, sync_type.str, new_version))
    old_version = self.get_version(sync_type)
    if sync_type.cmp(new_version, old_version) > 0:
        known_version = self._known_versions[sync_type]
        if sync_type.cmp(new_version, known_version) > 0:
            # We are out of date: request an up-to-date copy
            log.info("Advanced known version %s/%s %s->%s" % (
                self._cluster_name, sync_type.str, known_version, new_version))
            self._known_versions[sync_type] = new_version
        else:
            log.info("on_version: %s is newer than %s" % (new_version, old_version))

        # If we already have a request out for this type of map, then consider
        # cancelling it if we've already waited for a while.
        if self._fetching_at[sync_type] is not None:
            if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                log.info("Fetch already underway for %s" % sync_type.str)
                return
            else:
                log.warn("Abandoning fetch for %s started at %s" % (
                    sync_type.str, self._fetching_at[sync_type]))

        log.info("on_version: fetching %s/%s from %s, currently got %s, know %s" % (
            sync_type, new_version, reported_by, old_version, known_version))
        self.fetch(reported_by, sync_type)
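
# --- Illustrative sketch (not part of the original source) ---
# A simplified version of the "abandon a slow fetch" decision in on_version,
# using plain integers as map versions.  FETCH_TIMEOUT, the helper name and
# the example values are assumptions for demonstration only.
import datetime

FETCH_TIMEOUT = datetime.timedelta(seconds=10)


def should_start_fetch(new_version, old_version, fetching_at, t_now):
    """Return True if a fetch should be issued for this newly reported version."""
    if new_version <= old_version:
        # Nothing newer than what we already hold
        return False
    if fetching_at is not None and t_now - fetching_at < FETCH_TIMEOUT:
        # A fetch is already underway and has not yet timed out
        return False
    return True


t_now = datetime.datetime.utcnow()
print(should_start_fetch(5, 4, None, t_now))                                      # True: nothing in flight
print(should_start_fetch(5, 4, t_now - datetime.timedelta(seconds=3), t_now))     # False: fetch in flight
print(should_start_fetch(5, 4, t_now - datetime.timedelta(seconds=30), t_now))    # True: old fetch abandoned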