def lustre_check(self):
    """
    Check Router health at Lustre level.

    Check LNET routing capabilities and change object state based on
    the results:

    - ``/proc/sys/lnet/routes`` is missing: state is set to OFFLINE and
      the method returns.
    - first line reads "routing enabled": state is set to MOUNTED.
    - first line reads "routing disabled": state is set to TARGET_ERROR
      and ComponentError is raised.
    - anything else, or the file cannot be read: state is set to
      RUNTIME_ERROR and ComponentError is raised.
    """
    routes_path = "/proc/sys/lnet/routes"

    # LNET is not loaded
    if not os.path.isfile(routes_path):
        self.state = OFFLINE
        return

    # Read routing information. Only the first line is needed; the
    # context manager guarantees the file is closed even on error.
    try:
        with open(routes_path) as routes:
            state = routes.readline().strip().lower()
    except (IOError, OSError):
        self.state = RUNTIME_ERROR
        raise ComponentError(self, "Could not read routing information")

    # routing info tells this is ok?
    if state == "routing enabled":
        self.state = MOUNTED
    elif state == "routing disabled":
        self.state = TARGET_ERROR
        raise ComponentError(self, "Misconfigured router")
    else:
        self.state = RUNTIME_ERROR
        raise ComponentError(self, "Bad routing status")
def full_check(self, mountdata=True):
    """
    Device type check.

    Stat ``self.dev`` and make sure it is a block device. The
    `mountdata` argument is accepted but not used here.

    Raise ComponentError if the device cannot be stat'ed (the message
    is the OSError text) or if it is not a block device.
    """
    try:
        dev_mode = os.stat(self.dev).st_mode
    except OSError as exp:
        raise ComponentError(self, str(exp))

    if stat.S_ISBLK(dev_mode):
        return

    raise ComponentError(self, "bad journal device")
def lustre_check(self):
    """
    Check Client health at Lustre level.

    The state is first reset to None (undefined), then:

    - no ``/proc/fs/lustre/lov/<fsname>-clilov-*`` entry: state is set
      to OFFLINE and the method returns.
    - otherwise ``/proc/mounts`` is scanned for lustre mounts on
      ``self.mount_path``. The first match, when the clilov entry is
      loaded, sets state to MOUNTED and records ``self.mtpt``. Any
      other match sets state to CLIENT_ERROR and raises ComponentError.
    - loaded but not mounted: state is set to CLIENT_ERROR and
      ComponentError is raised.

    Finally, ``self._lustre_check_proc_state()`` is called to look for
    evictions.
    """
    self.state = None   # Undefined

    proc_lov_match = glob("/proc/fs/lustre/lov/%s-clilov-*"
                          % self.fs.fs_name)
    if not proc_lov_match:
        self.state = OFFLINE
        return

    #
    # There is at least one clilov declared. Check for coherence.
    #
    loaded = os.path.isdir(proc_lov_match[0])

    # check for presence in /proc/mounts
    mount_pattern = " %s lustre " % self.mount_path
    curr_lnetdev = None
    with open("/proc/mounts", 'r') as f_proc_mounts:
        for line in f_proc_mounts:
            if line.find(mount_pattern) <= 0:
                continue

            lnetdev, mntp = line.split(' ', 2)[0:2]

            # Only the first matching mount is accepted. Without the
            # `curr_lnetdev is None` test, a second match silently
            # replaced the first one and the duplicate-mount errors
            # below could never be reported for a loaded client.
            if loaded and curr_lnetdev is None:
                curr_lnetdev = lnetdev
                self.state = MOUNTED
                self.mtpt = mntp
                continue

            self.state = CLIENT_ERROR
            if lnetdev != curr_lnetdev:
                raise ComponentError(
                    self, "conflicting mounts "
                    "detected for %s and %s on %s" %
                    (lnetdev, curr_lnetdev, self.mount_path))
            raise ComponentError(
                self, "multiple mounts "
                "detected for %s (%s)" % (lnetdev, self.mount_path))

    if loaded and self.state != MOUNTED:
        # up but not mounted = incoherent state
        self.state = CLIENT_ERROR
        raise ComponentError(
            self, "incoherent client state for FS '%s'"
            " (not mounted but loaded. Mount in "
            "progress?)" % self.fs.fs_name)

    # Look for some evictions
    self._lustre_check_proc_state()
def full_check(self, mountdata=True):
    """
    Device check.

    Make sure ``self.dev`` can be stat'ed. The `mountdata` argument is
    accepted but not used here.

    Raise ComponentError, with the OSError text as message, if the
    stat call fails.
    """
    # NOTE(review): no stat.S_ISBLK() test is done here, so any existing
    # path is accepted -- confirm this is intended.
    try:
        os.stat(self.dev)
    except OSError as exp:
        raise ComponentError(self, str(exp))
class Journal(Component):
    """
    Manage a target external journal device.

    A Journal is built from its owning target and a device path. The
    filesystem, server, enabled flag and mode handed to Component are
    taken from the target.
    """

    TYPE = 'journal'

    def __init__(self, target, device):
        """Attach a journal on `device` to `target`."""
        Component.__init__(self, target.fs, target.server,
                           target.action_enabled, target._mode)
        self.target = target
        self.dev = device

    @property
    def label(self):
        """Journal label, identical to uniqueid()."""
        return self.uniqueid()

    def uniqueid(self):
        """Return the target unique id suffixed with '_jdev'."""
        return "%s_jdev" % self.target.uniqueid()

    def longtext(self):
        """Return a description built from the target id and the device."""
        return "%s journal (%s)" % (self.target.get_id(), self.dev)

    def full_check(self, mountdata=True):
        """
        Device type check.

        Stat the journal device and make sure it is a block device. The
        `mountdata` argument is accepted but not used here.

        Raise ComponentError if the device cannot be stat'ed or is not
        a block device.
        """
        try:
            info = os.stat(self.dev)
        except OSError as exp:
            raise ComponentError(self, str(exp))
        if not stat.S_ISBLK(info[stat.ST_MODE]):
            raise ComponentError(self, "bad journal device")
def _lustre_check_proc_state(self):
    """
    Check current target status in /proc/fs/lustre/*/*/state

    ``self.proc_states`` is rebuilt as a dict mapping each
    ``current_state`` value found to the number of state files
    reporting it. A state other than 'FULL' is not counted when the
    matching filesystem component reports itself as inactive.

    If at least one 'EVICTED' state was counted, ``self.state`` is set
    to CLIENT_ERROR and ComponentError is raised.
    """
    self.proc_states = {}

    # Extracts the target name (e.g. <fsname>-OST0000) from the entry
    # path. The filesystem name is escaped so it is matched literally.
    target_re = re.compile(r'/(%s-\w{3}[0-9a-fA-F]{4})-'
                           % re.escape(self.fs.fs_name))

    for entry in glob("/proc/fs/lustre/??c/%s-*/state" % self.fs.fs_name):
        # Context manager: the file is closed even if parsing fails.
        with open(entry, 'r') as f_state:
            for line in f_state:
                if not line.startswith('current_state:'):
                    continue

                state_name = line.split(None, 1)[1].strip()

                # Ignore inactive targets
                if state_name != 'FULL':
                    mo = target_re.search(entry)
                    try:
                        if not self.fs.components[mo.group(1)].is_active():
                            break
                    except (AttributeError, KeyError):
                        # No regex match or unknown component:
                        # count this state anyway.
                        pass

                self.proc_states.setdefault(state_name, 0)
                self.proc_states[state_name] += 1
                # Stop reading other file lines
                break

    if 'EVICTED' in self.proc_states:
        self.state = CLIENT_ERROR
        raise ComponentError(
            self, 'client connection error (%d evictions)' %
            self.proc_states['EVICTED'])
def raise_if_started(self, message):
    """
    Raise a ComponentError unless the target local state is OFFLINE.

    The error text is prefixed with `message` and says the target is
    "started" when is_started() is true, "busy" otherwise. The local
    state is set to TARGET_ERROR before raising.
    """
    if self.local_state == OFFLINE:
        return

    # is_started() is evaluated before local_state is overwritten below.
    if self.is_started():
        template = "%s: target %s (%s) is started"
    else:
        template = "%s: target %s (%s) is busy"

    self.local_state = TARGET_ERROR
    details = (message, self.label, self.dev)
    raise ComponentError(self, template % details)
def full_check(self, mountdata=True):
    """
    Sanity checks for device files and Lustre status.

    Run, in order: the device check, the mountdata check (skipped if
    `mountdata` is False, so target content is not analyzed) and, when
    a journal is attached, the journal full check.

    Any ComponentError or DiskDeviceError raised by these steps sets
    the local state to TARGET_ERROR and is re-raised as a
    ComponentError bound to this target, with the same message.
    """
    # check for disk level status
    try:
        self._device_check()
        if mountdata:
            self._mountdata_check(self.label)
        if self.journal:
            self.journal.full_check()
    except (ComponentError, DiskDeviceError) as error:
        self.local_state = TARGET_ERROR
        raise ComponentError(self, str(error))
def failover(self, candidates):
    """
    Helper method to switch the Target current server using a list of
    candidates.

    The candidates are matched against the failover servers of this
    target:

    - exactly one match: it becomes the current server and True is
      returned.
    - no match: nothing is changed and False is returned.
    - more than one match: the choice is ambiguous and ComponentError
      is raised.
    """
    matches = self.failservers.select(candidates)
    count = len(matches)

    # Several possible failover nodes: cannot decide.
    if count > 1:
        raise ComponentError(self, "More than one failover server matches.")

    if count != 1:
        return False

    self.server = matches[0]
    return True
def lustre_check(self):
    """
    Check target health at Lustre level.

    The local state is first reset to None (unknown), then set from
    what is found in /proc:

    - neither ``mntdev`` nor ``recovery_status`` exists for this label:
      OFFLINE.
    - ``recovery_status`` exists without ``mntdev``: TARGET_ERROR, and
      ComponentError is raised.
    - ``mntdev`` exists: the device it names is recorded in
      ``self.mntdev`` and looked up in ``/proc/mounts``. Found with
      type "lustre": MOUNTED. Otherwise TARGET_ERROR, and
      ComponentError is raised.

    For a mounted target whose TYPE differs from MGT.TYPE, the
    recovery_status file is then read. If its status is RECOVERING, the
    local state becomes RECOVERING and ``self.recov_info`` is filled
    with the remaining time and client counters. If the file cannot be
    opened, the state is TARGET_ERROR and ComponentError is raised.
    """
    self.local_state = None   # Unknown

    # find pathnames matching wanted lustre procfs
    # (Since Lustre 2.4. More than one path could be returned.
    # The first one is fine.)
    mntdev_path = glob('/proc/fs/lustre/*/%s/mntdev' % self.label)
    recov_path = glob('/proc/fs/lustre/*/%s/recovery_status' % self.label)
    assert len(recov_path) <= 1

    # check for label presence in /proc : is this lustre target started?
    if not mntdev_path:
        if not recov_path:
            self.local_state = OFFLINE
            return
        self.local_state = TARGET_ERROR
        raise ComponentError(self, "incoherent state in " \
                             "/proc/fs/lustre for %s" % self.label)

    # get target's real device
    with open(mntdev_path[0]) as fproc:
        self.mntdev = fproc.readline().rstrip('\n')

    # NOTE(review): `loaded` is always True at this point, so the
    # `not loaded` branches below cannot trigger. They are kept as-is;
    # confirm whether they can be dropped.
    loaded = True

    # check for presence in /proc/mounts
    with open("/proc/mounts", 'r') as f_proc_mounts:
        for line in f_proc_mounts:
            if line.find("%s " % self.mntdev) == 0:
                if line.split(' ', 3)[2] == "lustre":
                    if loaded:
                        self.local_state = MOUNTED
                    else:
                        self.local_state = TARGET_ERROR
                        raise ComponentError(self, "multiple " \
                            " mounts detected for %s" % self.label)

    if self.local_state != MOUNTED and loaded:
        self.local_state = TARGET_ERROR
        # up but not mounted = incoherent state
        # check for loaded state: ST, UP...
        raise ComponentError(self, "incoherent state for %s " \
                             "(started but not mounted?)" % self.label)

    if self.local_state == MOUNTED and not loaded:
        self.local_state = TARGET_ERROR
        # mounted but not up = incoherent state
        # /etc/fstab was not correctly cleaned
        raise ComponentError(self, "incoherent state for %s " \
                             "(mounted but not started?)" % self.label)

    if self.local_state == MOUNTED and self.TYPE != MGT.TYPE:

        # check for MDT or OST recovery (MGS doesn't make any recovery)
        try:
            fproc = open(recov_path[0], 'r')
        except (IOError, IndexError):
            self.local_state = TARGET_ERROR
            raise ComponentError(self, "recovery_state file not " \
                                 "found for %s" % self.label)

        try:
            # Stays None when the file has no "status:" line, so the
            # comparison below is always safe.
            status = None
            for line in fproc:
                if line.startswith("status:"):
                    status = line.rstrip().split(' ', 2)[1]
                    break

            #
            # Recovering information depends on Lustre version.
            #
            # VERSION:           2.0              1.8                      1.6
            #
            # connected_clients: connect/TOTAL    connect/TOTAL            connect/TOTAL
            # req_replay:        req_replay       ---                      ---
            # lock_repay:        lock_replay      ---                      ---
            # delayed_client:    ---              delay/TOTAL              ---
            # completed_clients: connect-replay   TOTAL-recov-delay/TOTAL  TOTAL-recov/TOTAL
            # evicted_clients:   stale            ---                      ---
            #
            if status == "RECOVERING":
                time_remaining = "??"
                completed = -1
                evicted = 0
                total = 0

                # Continue reading the same file after the status line.
                for line in fproc:
                    line = line.strip()
                    if line.startswith("time_remaining:"):
                        time_remaining = line.split(' ', 1)[1]
                    elif line.startswith("connected_clients:"):
                        total = int(line.split('/', 1)[1])
                    elif line.startswith("evicted_clients:"):
                        evicted = int(line.split(' ', 1)[1])
                    elif line.startswith("completed_clients:"):
                        completed = line.split(' ', 1)[1]
                        completed = int(completed.split('/', 1)[0])

                self.local_state = RECOVERING
                self.recov_info = "%ss (%s/%s)" % (time_remaining,
                                                   completed + evicted,
                                                   total)
        finally:
            fproc.close()