def check_mddev(self, dev):
    """
    Underlying method to check the state of a MD device.

    Reads the sysfs entries below /sys/block/<dev>/md, collects them in a
    RaidState object (with one SlaveState per dev-* directory) and derives
    a Nagios state and a status message from the collected values.

    @raise NPReadTimeoutError: on timeout reading a particular file
                               in sys filesystem
    @raise IOError: if a sysfilesystem file disappears since start
                    of this script

    @param dev: the name of the MD device to check (e.g. 'md0', 'md400')
    @type dev: str

    @return: a tuple of two values:
                * the numeric (Nagios) state
                * a textual description of the state
    @rtype: tuple of int and str

    """

    log.debug("Checking device %r ...", dev)

    # Define directories and files in sysfs
    # /sys/block/mdX
    base_dir = os.sep + os.path.join('sys', 'block', dev)
    # /sys/block/mdX/md
    base_mddir = os.path.join(base_dir, 'md')
    # /sys/block/mdX/md/array_state
    array_state_file = os.path.join(base_mddir, 'array_state')
    # /sys/block/mdX/md/degraded
    degraded_file = os.path.join(base_mddir, 'degraded')
    # /sys/block/mdX/md/raid_disks
    raid_disks_file = os.path.join(base_mddir, 'raid_disks')
    # /sys/block/mdX/md/level
    raid_level_file = os.path.join(base_mddir, 'level')
    # /sys/block/mdX/md/suspended
    suspended_file = os.path.join(base_mddir, 'suspended')
    # /sys/block/mdX/md/sync_action
    sync_action_file = os.path.join(base_mddir, 'sync_action')
    # /sys/block/mdX/md/sync_completed
    sync_completed_file = os.path.join(base_mddir, 'sync_completed')
    # /sys/block/mdX/md/dev-*
    slavedir_pattern = os.path.join(base_mddir, 'dev-*')

    for sys_dir in (base_dir, base_mddir):
        if not os.path.isdir(sys_dir):
            raise IOError(errno.ENOENT, "Directory doesn't exists.", sys_dir)

    state = RaidState(dev)

    # Array status
    state.array_state = self.read_file(array_state_file).strip()

    # RAID level
    state.raid_level = self.read_file(raid_level_file).strip()

    # degraded state, if available
    if os.path.exists(degraded_file):
        state.degraded = bool(int(self.read_file(degraded_file)))

    # number of raid disks
    state.nr_raid_disks = int(self.read_file(raid_disks_file))

    # suspended state, if available
    if os.path.exists(suspended_file):
        state.suspended = bool(int(self.read_file(suspended_file)))

    # state of synchronisation, if available
    if os.path.exists(sync_action_file):
        state.sync_action = self.read_file(sync_action_file).strip()

    # state of synchronisation process, if available
    if os.path.exists(sync_completed_file):
        sync_state = self.read_file(sync_completed_file).strip()
        match = re_sync_completed.search(sync_state)
        if match:
            state.sectors_synced = int(match.group(1))
            state.sectors_total = int(match.group(2))
            # Guard against a division by zero
            if state.sectors_total:
                state.sync_completed = (
                    float(state.sectors_synced) / float(state.sectors_total))

    # Pre-fill every expected slot with None, so that a slot for which no
    # slave directory is found below is reported as failing.
    for slot in range(state.nr_raid_disks):
        state.raid_devices[slot] = None

    if self.verbose > 3:
        log.debug(
            "Searching for slave dirs with pattern %r ...", slavedir_pattern)
    slavedirs = glob.glob(slavedir_pattern)
    if self.verbose > 2:
        log.debug("Found slave dirs: %r", slavedirs)

    for slave_dir in slavedirs:

        if self.verbose > 3:
            log.debug("Checking slave dir %r ...", slave_dir)

        # Defining some sysfs files
        # /sys/block/mdX/md/dev-XYZ/state
        slave_state_file = os.path.join(slave_dir, 'state')
        # /sys/block/mdX/md/dev-XYZ/slot
        slave_slot_file = os.path.join(slave_dir, 'slot')
        # /sys/block/mdX/md/dev-XYZ/block
        slave_block_file = os.path.join(slave_dir, 'block')

        # Reading some status files; a slot file without a number means
        # the slave currently has no slot in the array
        try:
            slave_slot = int(self.read_file(slave_slot_file))
        except ValueError:
            slave_slot = None
        slave_state = self.read_file(slave_state_file).strip()
        is_spare = (slave_state == 'spare')

        rd_link = None
        if slave_slot is not None:
            rd_link = os.path.join(base_mddir, 'rd%d' % (slave_slot))

        # Retrieving the slave block device: only the basename of the
        # resolved link target is used to build the /dev/<name> path
        block_target = os.readlink(slave_block_file)
        slave_block_device = os.path.normpath(os.path.join(
            os.path.dirname(slave_block_file), block_target))
        slave_bd_basename = os.path.basename(slave_block_device)
        slave_block_device = os.sep + os.path.join('dev', slave_bd_basename)

        slave = SlaveState(slave_slot, slave_dir)
        slave.block_device = slave_block_device
        slave.state = slave_state

        # Check existence of the rdX link
        slave.rdlink = rd_link
        slave.rdlink_exists = bool(
            rd_link is not None and os.path.exists(rd_link))

        # Assign slave as a raid, a spare or a failed device
        state.slaves.append(slave_bd_basename)
        if is_spare:
            state.spare_devices[slave_bd_basename] = slave
        elif rd_link is None or slave_state == 'faulty':
            state.failed_devices[slave_bd_basename] = slave
        else:
            state.raid_devices[slave_slot] = slave

    if self.verbose > 2:
        log.debug("Status results for %r:\n%s", dev, pp(state.as_dict()))

    # And evaluate the results ....
    state_id = nagios.state.ok

    # Check the array state
    state_msg = "%s - %s" % (dev, state.array_state)
    if state.array_state not in (
            'readonly', 'read-auto', 'clean', 'active', 'active-idle'):
        if state.array_state == 'write-pending':
            state_id = nagios.state.warning
        # 'readonly' is accepted as a good state above, so it can never
        # reach this branch and is not listed here.
        elif state.array_state in ('clear', 'inactive'):
            state_id = nagios.state.critical
        else:
            state_id = nagios.state.unknown

    if not self.spare_ok:
        # Check for existing spare devices
        # (list() keeps the message free of a 'dict_keys(...)' wrapper)
        spare_names = list(state.spare_devices.keys())
        if spare_names:
            state_msg += ", has spares %r" % (spare_names,)
            state_id = max_state(state_id, nagios.state.warning)

    # Check degraded and synchronisation state
    if state.degraded:
        state_msg += ", degraded"
        if state.sync_action is None:
            state_id = max_state(state_id, nagios.state.critical)
            state_msg += ", unknown sync action"
        elif state.sync_action == 'idle':
            state_id = max_state(state_id, nagios.state.critical)
            state_msg += ", idle"
        elif state.sync_action in ('resync', 'recover', 'check', 'repair'):
            state_id = max_state(state_id, nagios.state.warning)
            state_msg += ", " + state.sync_action
        else:
            state_id = max_state(state_id, nagios.state.unknown)
            state_msg += ", sync " + state.sync_action

        # Add percentage of sync completed to output
        # NOTE(review): nesting below 'if state.degraded' was reconstructed
        # from a source without indentation - confirm against the original.
        if state.sync_completed is not None:
            state_msg += " %.1f%%" % ((state.sync_completed * 100))

    # Check state of slave devices
    for i in state.raid_devices:

        log.debug("Evaluating state of raid_device[%r]", i)
        raid_device = state.raid_devices[i]

        if raid_device is None:
            # A missing member is only a warning while a sync is running
            if state.sync_action in ('resync', 'recover', 'check', 'repair'):
                state_id = max_state(state_id, nagios.state.warning)
            else:
                state_id = max_state(state_id, nagios.state.critical)
            state_msg += ", raid_device[%r] fails" % (i)
            continue

        if raid_device.state in ('in_sync', 'writemostly'):
            continue

        bd = os.path.basename(raid_device.block_device)
        state_msg += ", raid_device[%r]=%s %s" % (i, bd, raid_device.state)
        # NOTE(review): the critical escalation is assumed to belong to the
        # missing-rdX-link case together with the " failed" suffix - confirm.
        if not raid_device.rdlink_exists:
            state_msg += " failed"
            state_id = max_state(state_id, nagios.state.critical)

    failed_names = list(state.failed_devices.keys())
    if failed_names:
        state_msg += ", failed %r" % (failed_names,)
        state_id = max_state(state_id, nagios.state.critical)

    return (state_id, state_msg)
def call(self):
    """
    Method to call the plugin directly.

    Calls self.megacli() with '-PdList', splits the output into one record
    per physical drive (a new record starts at every 'Enclosure Device ID'
    line), stores the records in self.drive / self.drive_list, evaluates
    error counters, firmware state and foreign state of every drive, adds
    performance data and hands the resulting state and message over to
    self.exit().

    A drive without a 'Firmware state' line is reported as having a wrong
    firmware state (critical); a drive without a 'Foreign State' line is
    treated as having no foreign state.
    """

    state = nagios.state.ok
    out = "State of physical drives of MegaRaid adapter %d seems to be okay." % (
        self.adapter_nr)

    # Enclosure Device ID: 0
    re_enc = re.compile(r'^\s*Enclosure\s+Device\s+ID\s*:\s*(\d+)', re.IGNORECASE)
    # Slot Number: 23
    re_slot = re.compile(r'^\s*Slot\s+Number\s*:\s*(\d+)', re.IGNORECASE)
    # Device Id: 6
    re_dev_id = re.compile(r'^\s*Device\s+Id\s*:\s*(\d+)', re.IGNORECASE)
    # Media Error Count: 0
    re_media_errors = re.compile(
        r'^\s*Media\s+Error\s+Count\s*:\s*(\d+)', re.IGNORECASE)
    # Other Error Count: 0
    re_other_errors = re.compile(
        r'^\s*Other\s+Error\s+Count\s*:\s*(\d+)', re.IGNORECASE)
    # Predictive Failure Count: 0
    re_pred_failures = re.compile(
        r'^\s*Predictive\s+Failure\s+Count\s*:\s*(\d+)', re.IGNORECASE)
    # Firmware state: Online, Spun Up
    re_fw_state = re.compile(r'^\s*Firmware\s+state\s*:\s*(\S+.*)', re.IGNORECASE)
    # Foreign State: None
    re_foreign_state = re.compile(
        r'^\s*Foreign\s+state\s*:\s*(\S+.*)', re.IGNORECASE)

    # Per-drive fields in the order they are tried; the flag tells whether
    # the captured value is stored as an integer or as the raw text.
    field_patterns = (
        ('slot', re_slot, True),
        ('dev_id', re_dev_id, True),
        ('media_errors', re_media_errors, True),
        ('other_errors', re_other_errors, True),
        ('predictive_failures', re_pred_failures, True),
        ('fw_state', re_fw_state, False),
        ('foreign_state', re_foreign_state, False),
    )

    good_fw_states = (
        r'Online,\s+Spun\s+Up',
        r'Hotspare,\s+Spun\s+Up',
        r'Hotspare,\s+Spun\s+Down',
        r'Unconfigured\(good\),\s+Spun\s+Up',
        r'Unconfigured\(good\),\s+Spun\s+Down',
    )
    warn_fw_states = (
        r'Rebuild',
        r'Copyback',
    )

    good_fw_pattern = r'^\s*(?:' + r'|'.join(good_fw_states) + r')\s*$'
    warn_fw_pattern = r'^\s*(?:' + r'|'.join(warn_fw_states) + r')\s*$'

    re_good_fw_state = re.compile(good_fw_pattern, re.IGNORECASE)
    re_warn_fw_state = re.compile(warn_fw_pattern, re.IGNORECASE)

    def register_drive(dev):
        # A record is only kept if both enclosure and slot are known,
        # because they form the identifier of the drive.
        if dev and ('enclosure' in dev) and ('slot' in dev):
            drive_id = '[%d:%d]' % (dev['enclosure'], dev['slot'])
            self.drive_list.append(drive_id)
            self.drive[drive_id] = dev

    drives_total = 0
    args = ('-PdList',)
    (stdoutdata, stderrdata, ret, exit_code) = self.megacli(args)
    if self.verbose > 3:
        log.debug("Output on StdOut:\n%s", stdoutdata)

    cur_dev = None
    for line in stdoutdata.splitlines():

        line = line.strip()

        m = re_enc.search(line)
        if m:
            # A new drive starts: save the previous one first
            register_drive(cur_dev)
            drives_total += 1
            cur_dev = {
                'enclosure': int(m.group(1)),
                'media_errors': 0,
                'other_errors': 0,
                'predictive_failures': 0,
                'fw_state': None,
                'foreign_state': None,
            }
            continue

        for key, pattern, as_int in field_patterns:
            m = pattern.search(line)
            if not m:
                continue
            # Lines seen before the first enclosure line are dropped
            if cur_dev:
                value = m.group(1)
                cur_dev[key] = int(value) if as_int else value
            break

    # Save the last drive of the output
    register_drive(cur_dev)

    # These counters hold the number of affected drives, not error sums
    media_errors = 0
    other_errors = 0
    predictive_failures = 0
    fw_state_wrong = 0
    foreign_state_wrong = 0
    errors = []

    for pd_id in self.drive_list:

        cur_dev = self.drive[pd_id]
        found_errors = False
        drv_desc = []
        disk_state = nagios.state.ok

        if cur_dev['media_errors']:
            disk_state = max_state(disk_state, nagios.state.critical)
            found_errors = True
            drv_desc.append("%d media errors" % (cur_dev['media_errors']))
            media_errors += 1

        # Other errors are reported in the message, but do not raise
        # the state of the drive.
        if cur_dev['other_errors']:
            found_errors = True
            drv_desc.append("%d other errors" % (cur_dev['other_errors']))
            other_errors += 1

        if cur_dev['predictive_failures']:
            disk_state = max_state(disk_state, nagios.state.critical)
            found_errors = True
            drv_desc.append(
                "%d predictive failures" % (cur_dev['predictive_failures']))
            predictive_failures += 1

        # fw_state stays None if no 'Firmware state' line was found for
        # this drive; that counts as a wrong (critical) firmware state
        # instead of failing inside the regex search.
        fw_state = cur_dev['fw_state']
        if fw_state is None or not re_good_fw_state.search(fw_state):
            if fw_state is not None and re_warn_fw_state.search(fw_state):
                disk_state = max_state(disk_state, nagios.state.warning)
            else:
                disk_state = max_state(disk_state, nagios.state.critical)
            found_errors = True
            drv_desc.append("wrong firmware state %r" % (fw_state,))
            fw_state_wrong += 1

        # foreign_state stays None if no 'Foreign State' line was found;
        # this is treated like a reported state of "None".
        foreign_state = cur_dev['foreign_state']
        if foreign_state is not None and foreign_state.lower() != "none":
            disk_state = max_state(disk_state, nagios.state.critical)
            found_errors = True
            drv_desc.append("wrong foreign state %r" % (foreign_state,))
            foreign_state_wrong += 1

        if found_errors:
            state = max_state(state, disk_state)
            dd = "drive %s has " % (pd_id)
            dd += ' and '.join(drv_desc)
            errors.append(dd)

        if found_errors or self.verbose > 1:
            log.debug(
                "State of drive %s is %s.", pd_id,
                nagios.plugin.functions.STATUS_TEXT[disk_state])

    log.debug("Found %d drives.", drives_total)
    if self.verbose > 2:
        log.debug("Found Pds:\n%s", self.drive_list)
        log.debug("Found Pd data:\n%s", self.drive)

    if errors:
        out = ', '.join(errors)

    self.add_perfdata(label='drives_total', value=drives_total, uom='')
    self.add_perfdata(label='media_errors', value=media_errors, uom='')
    self.add_perfdata(label='other_errors', value=other_errors, uom='')
    self.add_perfdata(
        label='predictive_failures', value=predictive_failures, uom='')
    self.add_perfdata(label='wrong_fw_state', value=fw_state_wrong, uom='')
    self.add_perfdata(
        label='wrong_foreign_state', value=foreign_state_wrong, uom='')

    self.exit(state, out)
def call(self):
    """
    Method to call the plugin directly.

    Calls self.megacli() with '-LdInfo -L <ld_number>', parses RAID level,
    size, state, number of drives, span depth, cache flag and a running
    consistency check from the output, and hands the resulting state and
    message over to self.exit().

    If the output reports that the virtual drive does not exist, the
    matching line is passed to self.die().
    """

    state = nagios.state.ok

    # Adapter 0: Virtual Drive 55 Does not Exist.
    re_not_exists = re.compile(
        r'^.*Virtual\s+Drive\s+\d+\s+Does\s+not\s+Exist\.', re.IGNORECASE)

    # RAID Level : Primary-1, Secondary-0, RAID Level Qualifier-0
    re_raid_level = re.compile(
        r'^\s*RAID\s+Level\s*:\s+Primary-(\d+)', re.IGNORECASE)

    # Size : 2.728 TB
    re_size = re.compile(
        r'^\s*Size\s*:\s+(\d+(?:\.\d*)?)\s*(\S+)?', re.IGNORECASE)

    # State : Optimal
    re_state = re.compile(r'^\s*State\s*:\s+(\S+)', re.IGNORECASE)

    # Number Of Drives : 2
    re_number = re.compile(
        r'^\s*Number\s+Of\s+Drives\s*:\s+(\d+)', re.IGNORECASE)

    # Span Depth : 1
    re_span = re.compile(r'^\s*Span\s+Depth\s*:\s+(\d+)', re.IGNORECASE)

    # Is VD Cached: Yes
    # Is VD Cached: No
    re_cached = re.compile(
        r'^\s*Is\s+VD\s+Cached\s*:\s+(\S+)', re.IGNORECASE)

    # Check Consistency: Completed 95%, Taken 8 min
    re_consist = re.compile(
        r'Check\s+Consistency\s*:\s+Completed\s+(\d+)%,\s+Taken\s+(\d+)\s*min',
        re.IGNORECASE)

    raid_level = None
    size_val = None
    size_unit = None
    ld_state = None
    pd_number = None
    span_depth = None
    ld_cached = None
    consist_percent = None
    consist_min = None

    args = ('-LdInfo', '-L', ("%d" % (self.ld_number)))
    (stdoutdata, stderrdata, ret, exit_code) = self.megacli(args)
    if self.verbose > 2:
        log.debug("Output on StdOut:\n%s", stdoutdata)

    for line in stdoutdata.splitlines():

        line = line.strip()

        # Logical Drive not exists
        if re_not_exists.search(line):
            self.die(line)

        match = re_raid_level.search(line)
        if match:
            raid_level = int(match.group(1))
            continue

        match = re_size.search(line)
        if match:
            size_val = float(match.group(1))
            size_unit = match.group(2)
            continue

        match = re_state.search(line)
        if match:
            ld_state = match.group(1)
            continue

        match = re_number.search(line)
        if match:
            pd_number = int(match.group(1))
            continue

        match = re_span.search(line)
        if match:
            span_depth = int(match.group(1))
            continue

        match = re_cached.search(line)
        if match:
            ld_cached = match.group(1)
            continue

        match = re_consist.search(line)
        if match:
            consist_percent = int(match.group(1))
            consist_min = int(match.group(2))

    # Anything but an 'optimal' state from a successful call is critical.
    # A missing state is always printed as 'unknown', also when the call
    # itself failed.
    if not ld_state:
        ld_state = 'unknown'
        state = nagios.state.critical
    elif exit_code or ld_state.lower() != 'optimal':
        state = nagios.state.critical

    consistency_out = ''
    if consist_percent is not None:
        if self.warn_on_consistency_check:
            state = max_state(state, nagios.state.warning)
        consistency_out = ", consistency check completed: %d%%, taken %d min." % (
            consist_percent, consist_min)

    cached_out = ', cached: No'
    if ld_cached:
        cached_out = ', cached: %s' % (ld_cached)
    if self.cached:
        # A cached LD was demanded, so anything but 'yes' is a warning
        if not ld_cached or ld_cached.lower() != 'yes':
            state = max_state(state, nagios.state.warning)

    # 9999 is shown as number of drives if it could not be evaluated
    pd_count = 9999
    if pd_number:
        pd_count = pd_number
        if span_depth and span_depth > 1:
            # Spanned LD: the drive count is per span, and a primary level
            # below 10 is shown as spanned level (1 -> 10, 5 -> 50, ...)
            pd_count = pd_number * span_depth
            if raid_level is not None and raid_level < 10:
                raid_level *= 10

    # raid_level stays None if no 'RAID Level' line was found; formatting
    # it with %d would raise a TypeError.
    if raid_level is None:
        raid_out = 'RAID-unknown'
    else:
        raid_out = 'RAID-%d' % (raid_level)

    size_out = ''
    if size_val:
        if size_unit:
            size_out = ', %s %s' % (str(size_val), size_unit)
        else:
            size_out = ', %s' % (str(size_val))

    out = "State of LD %d of MegaRaid adapter %d (%s, %d drives%s%s%s): %s." % (
        self.ld_number, self.adapter_nr, raid_out, pd_count,
        size_out, cached_out, consistency_out, ld_state)

    self.exit(state, out)
def call(self):
    """
    Method to call the plugin directly.

    Calls self.megacli() with '-AdpBbuCmd -GetBbuStatus', parses battery
    type, battery state and a list of status fields from the output and
    hands the resulting state and message over to self.exit().

    The state is critical if the call returned a non-zero exit code or if
    the battery state is missing or not 'optimal'. Every status field
    deviating from its expected value adds a remark to the message and
    may raise the state further.
    """

    state = nagios.state.ok

    # The values of these two fields are taken up to the end of the line
    # and are stored without changing their case.
    re_batt_type = re.compile(r'^\s*BatteryType\s*:\s*(\S+.*)', re.IGNORECASE)
    re_batt_state = re.compile(r'^\s*Battery\s*State\s*:\s*(\S+.*)', re.IGNORECASE)

    # Single-word status fields, in the order they are parsed and
    # evaluated. Each entry consists of:
    #   * the pattern of the output line
    #   * the (lower case) value which is considered to be okay
    #   * the state to raise on another value (None: only mention it)
    #   * the format of the remark added to the output
    status_fields = (
        (r'^\s*Voltage\s*:\s+(\S+)',
            'ok', nagios.state.critical, "Voltage is %r."),
        (r'^\s*Temperature\s*:\s+(\S+)',
            'ok', nagios.state.warning, "Temperature is %r."),
        (r'^\s*Learn\s+Cycle\s+Requested\s*:\s+(\S+)',
            'no', None, "Learn Cycle Requested: %r."),
        (r'^\s*Learn\s+Cycle\s+Active\s*:\s+(\S+)',
            'no', None, "Learn Cycle Active: %r."),
        (r'^\s*Learn\s+Cycle\s+Status\s*:\s+(\S+)',
            'ok', nagios.state.warning, "Learn Cycle Status: %r."),
        (r'^\s*Learn\s+Cycle\s+Timeout\s*:\s+(\S+)',
            'no', nagios.state.warning, "Learn Cycle Timeout: %r."),
        (r'^\s*I2c\s+Errors\s+Detected\s*:\s+(\S+)',
            'no', nagios.state.warning, "I2c Errors Detected %r."),
        (r'^\s*Battery\s+Pack\s+Missing\s*:\s+(\S+)',
            'no', nagios.state.critical, "Battery Pack Missing: %r."),
        (r'^\s*Battery\s+Replacement\s+required\s*:\s+(\S+)',
            'no', nagios.state.critical, "Battery Replacement required: %r."),
        (r'^\s*Remaining\s+Capacity\s+Low\s*:\s+(\S+)',
            'no', nagios.state.warning, "Remaining Capacity Low: %r."),
        (r'^\s*Periodic\s+Learn\s+Required\s*:\s+(\S+)',
            'no', nagios.state.warning, "Periodic Learn Required: %r."),
        (r'^\s*Transparent\s+Learn\s*:\s+(\S+)',
            'no', nagios.state.warning, "Transparent Learn: %r."),
        (r'^\s*No\s+space\s+to\s+cache\s+offload\s*:\s+(\S+)',
            'no', nagios.state.warning, "No space to cache offload %r."),
        (r'^\s*Pack\s+is\s+about\s+to\s+fail\s+.*:\s+(\S+)',
            'no', nagios.state.warning,
            "Pack is about to fail & should be replaced: %r."),
        (r'^\s*Module\s+microcode\s+update\s+required\s*:\s+(\S+)',
            'no', nagios.state.warning,
            "Module microcode update required: %r."),
    )
    status_patterns = [
        re.compile(field[0], re.IGNORECASE) for field in status_fields]
    # Parsed values in the same order as status_fields (None: not found)
    status_values = [None] * len(status_fields)

    args = ('-AdpBbuCmd', '-GetBbuStatus')
    (stdoutdata, stderrdata, ret, exit_code) = self.megacli(args)
    if self.verbose > 2:
        log.debug("Output on StdOut:\n%s", stdoutdata)

    batt_type = 'unknown'
    batt_state = None

    for line in stdoutdata.splitlines():

        line = line.strip()

        match = re_batt_type.search(line)
        if match:
            batt_type = match.group(1)
            continue

        match = re_batt_state.search(line)
        if match:
            batt_state = match.group(1)
            continue

        # Only the first matching pattern is used for a line; if a field
        # appears more than once, its last occurrence wins.
        for idx, pattern in enumerate(status_patterns):
            match = pattern.search(line)
            if match:
                status_values[idx] = match.group(1).lower()
                break

    # A missing battery state is always printed as 'unknown', also when
    # the call itself failed.
    if not batt_state:
        batt_state = 'unknown'
        state = nagios.state.critical
    elif exit_code or batt_state.lower() != 'optimal':
        state = nagios.state.critical

    add_infos = []
    for field, value in zip(status_fields, status_values):
        (_, good_value, severity, info_fmt) = field
        # Fields not found in the output are not evaluated
        if not value or value == good_value:
            continue
        if severity is not None:
            # Escalate from the current state (and not from the
            # max_state function itself)
            state = max_state(state, severity)
        add_infos.append(info_fmt % (value,))

    add_info = ''
    if add_infos:
        add_info = '; ' + ', '.join(add_infos)

    out = "State of BBU of MegaRaid adapter %d (type %s): %s%s" % (
        self.adapter_nr, batt_type, batt_state, add_info)

    self.exit(state, out)
def call(self):
    """
    Method to call the plugin directly.

    Calls self.megacli() with '-AdpBbuCmd -GetBbuStatus', parses battery
    type, battery state and a list of status fields from the output and
    hands the resulting state and message over to self.exit().

    The state is critical if the call returned a non-zero exit code or if
    the battery state is missing or not 'optimal'. Every status field
    deviating from its expected value adds a remark to the message and
    may raise the state further.
    """

    state = nagios.state.ok

    # The values of these two fields are taken up to the end of the line
    # and are stored without changing their case.
    re_batt_type = re.compile(r'^\s*BatteryType\s*:\s*(\S+.*)', re.IGNORECASE)
    re_batt_state = re.compile(r'^\s*Battery\s*State\s*:\s*(\S+.*)', re.IGNORECASE)

    # Single-word status fields, in the order they are parsed and
    # evaluated. Each entry consists of:
    #   * the pattern of the output line
    #   * the (lower case) value which is considered to be okay
    #   * the state to raise on another value (None: only mention it)
    #   * the format of the remark added to the output
    status_fields = (
        (r'^\s*Voltage\s*:\s+(\S+)',
            'ok', nagios.state.critical, "Voltage is %r."),
        (r'^\s*Temperature\s*:\s+(\S+)',
            'ok', nagios.state.warning, "Temperature is %r."),
        (r'^\s*Learn\s+Cycle\s+Requested\s*:\s+(\S+)',
            'no', None, "Learn Cycle Requested: %r."),
        (r'^\s*Learn\s+Cycle\s+Active\s*:\s+(\S+)',
            'no', None, "Learn Cycle Active: %r."),
        (r'^\s*Learn\s+Cycle\s+Status\s*:\s+(\S+)',
            'ok', nagios.state.warning, "Learn Cycle Status: %r."),
        (r'^\s*Learn\s+Cycle\s+Timeout\s*:\s+(\S+)',
            'no', nagios.state.warning, "Learn Cycle Timeout: %r."),
        (r'^\s*I2c\s+Errors\s+Detected\s*:\s+(\S+)',
            'no', nagios.state.warning, "I2c Errors Detected %r."),
        (r'^\s*Battery\s+Pack\s+Missing\s*:\s+(\S+)',
            'no', nagios.state.critical, "Battery Pack Missing: %r."),
        (r'^\s*Battery\s+Replacement\s+required\s*:\s+(\S+)',
            'no', nagios.state.critical, "Battery Replacement required: %r."),
        (r'^\s*Remaining\s+Capacity\s+Low\s*:\s+(\S+)',
            'no', nagios.state.warning, "Remaining Capacity Low: %r."),
        (r'^\s*Periodic\s+Learn\s+Required\s*:\s+(\S+)',
            'no', nagios.state.warning, "Periodic Learn Required: %r."),
        (r'^\s*Transparent\s+Learn\s*:\s+(\S+)',
            'no', nagios.state.warning, "Transparent Learn: %r."),
        (r'^\s*No\s+space\s+to\s+cache\s+offload\s*:\s+(\S+)',
            'no', nagios.state.warning, "No space to cache offload %r."),
        (r'^\s*Pack\s+is\s+about\s+to\s+fail\s+.*:\s+(\S+)',
            'no', nagios.state.warning,
            "Pack is about to fail & should be replaced: %r."),
        (r'^\s*Module\s+microcode\s+update\s+required\s*:\s+(\S+)',
            'no', nagios.state.warning,
            "Module microcode update required: %r."),
    )
    status_patterns = [
        re.compile(field[0], re.IGNORECASE) for field in status_fields]
    # Parsed values in the same order as status_fields (None: not found)
    status_values = [None] * len(status_fields)

    args = ('-AdpBbuCmd', '-GetBbuStatus')
    (stdoutdata, stderrdata, ret, exit_code) = self.megacli(args)
    if self.verbose > 2:
        log.debug("Output on StdOut:\n%s", stdoutdata)

    batt_type = 'unknown'
    batt_state = None

    for line in stdoutdata.splitlines():

        line = line.strip()

        match = re_batt_type.search(line)
        if match:
            batt_type = match.group(1)
            continue

        match = re_batt_state.search(line)
        if match:
            batt_state = match.group(1)
            continue

        # Only the first matching pattern is used for a line; if a field
        # appears more than once, its last occurrence wins.
        for idx, pattern in enumerate(status_patterns):
            match = pattern.search(line)
            if match:
                status_values[idx] = match.group(1).lower()
                break

    # A missing battery state is always printed as 'unknown', also when
    # the call itself failed.
    if not batt_state:
        batt_state = 'unknown'
        state = nagios.state.critical
    elif exit_code or batt_state.lower() != 'optimal':
        state = nagios.state.critical

    add_infos = []
    for field, value in zip(status_fields, status_values):
        (_, good_value, severity, info_fmt) = field
        # Fields not found in the output are not evaluated
        if not value or value == good_value:
            continue
        if severity is not None:
            # Escalate from the current state (and not from the
            # max_state function itself)
            state = max_state(state, severity)
        add_infos.append(info_fmt % (value,))

    add_info = ''
    if add_infos:
        add_info = '; ' + ', '.join(add_infos)

    out = "State of BBU of MegaRaid adapter %d (type %s): %s%s" % (
        self.adapter_nr, batt_type, batt_state, add_info)

    self.exit(state, out)
def call(self):
    """
    Method to call the plugin directly.

    Calls self.megacli() with '-LdInfo -L <ld_number>', parses RAID level,
    size, state, number of drives, span depth, cache flag and a running
    consistency check from the output, and hands the resulting state and
    message over to self.exit().

    If the output reports that the virtual drive does not exist, the
    matching line is passed to self.die().
    """

    state = nagios.state.ok

    # Adapter 0: Virtual Drive 55 Does not Exist.
    re_not_exists = re.compile(
        r'^.*Virtual\s+Drive\s+\d+\s+Does\s+not\s+Exist\.', re.IGNORECASE)

    # RAID Level : Primary-1, Secondary-0, RAID Level Qualifier-0
    re_raid_level = re.compile(
        r'^\s*RAID\s+Level\s*:\s+Primary-(\d+)', re.IGNORECASE)

    # Size : 2.728 TB
    re_size = re.compile(
        r'^\s*Size\s*:\s+(\d+(?:\.\d*)?)\s*(\S+)?', re.IGNORECASE)

    # State : Optimal
    re_state = re.compile(r'^\s*State\s*:\s+(\S+)', re.IGNORECASE)

    # Number Of Drives : 2
    re_number = re.compile(
        r'^\s*Number\s+Of\s+Drives\s*:\s+(\d+)', re.IGNORECASE)

    # Span Depth : 1
    re_span = re.compile(r'^\s*Span\s+Depth\s*:\s+(\d+)', re.IGNORECASE)

    # Is VD Cached: Yes
    # Is VD Cached: No
    re_cached = re.compile(
        r'^\s*Is\s+VD\s+Cached\s*:\s+(\S+)', re.IGNORECASE)

    # Check Consistency: Completed 95%, Taken 8 min
    re_consist = re.compile(
        r'Check\s+Consistency\s*:\s+Completed\s+(\d+)%,\s+Taken\s+(\d+)\s*min',
        re.IGNORECASE)

    raid_level = None
    size_val = None
    size_unit = None
    ld_state = None
    pd_number = None
    span_depth = None
    ld_cached = None
    consist_percent = None
    consist_min = None

    args = ('-LdInfo', '-L', ("%d" % (self.ld_number)))
    (stdoutdata, stderrdata, ret, exit_code) = self.megacli(args)
    if self.verbose > 2:
        log.debug("Output on StdOut:\n%s", stdoutdata)

    for line in stdoutdata.splitlines():

        line = line.strip()

        # Logical Drive not exists
        if re_not_exists.search(line):
            self.die(line)

        match = re_raid_level.search(line)
        if match:
            raid_level = int(match.group(1))
            continue

        match = re_size.search(line)
        if match:
            size_val = float(match.group(1))
            size_unit = match.group(2)
            continue

        match = re_state.search(line)
        if match:
            ld_state = match.group(1)
            continue

        match = re_number.search(line)
        if match:
            pd_number = int(match.group(1))
            continue

        match = re_span.search(line)
        if match:
            span_depth = int(match.group(1))
            continue

        match = re_cached.search(line)
        if match:
            ld_cached = match.group(1)
            continue

        match = re_consist.search(line)
        if match:
            consist_percent = int(match.group(1))
            consist_min = int(match.group(2))

    # Anything but an 'optimal' state from a successful call is critical.
    # A missing state is always printed as 'unknown', also when the call
    # itself failed.
    if not ld_state:
        ld_state = 'unknown'
        state = nagios.state.critical
    elif exit_code or ld_state.lower() != 'optimal':
        state = nagios.state.critical

    consistency_out = ''
    if consist_percent is not None:
        if self.warn_on_consistency_check:
            state = max_state(state, nagios.state.warning)
        consistency_out = ", consistency check completed: %d%%, taken %d min." % (
            consist_percent, consist_min)

    cached_out = ', cached: No'
    if ld_cached:
        cached_out = ', cached: %s' % (ld_cached)
    if self.cached:
        # A cached LD was demanded, so anything but 'yes' is a warning
        if not ld_cached or ld_cached.lower() != 'yes':
            state = max_state(state, nagios.state.warning)

    # 9999 is shown as number of drives if it could not be evaluated
    pd_count = 9999
    if pd_number:
        pd_count = pd_number
        if span_depth and span_depth > 1:
            # Spanned LD: the drive count is per span, and a primary level
            # below 10 is shown as spanned level (1 -> 10, 5 -> 50, ...)
            pd_count = pd_number * span_depth
            if raid_level is not None and raid_level < 10:
                raid_level *= 10

    # raid_level stays None if no 'RAID Level' line was found; formatting
    # it with %d would raise a TypeError.
    if raid_level is None:
        raid_out = 'RAID-unknown'
    else:
        raid_out = 'RAID-%d' % (raid_level)

    size_out = ''
    if size_val:
        if size_unit:
            size_out = ', %s %s' % (str(size_val), size_unit)
        else:
            size_out = ', %s' % (str(size_val))

    out = "State of LD %d of MegaRaid adapter %d (%s, %d drives%s%s%s): %s." % (
        self.ld_number, self.adapter_nr, raid_out, pd_count,
        size_out, cached_out, consistency_out, ld_state)

    self.exit(state, out)
def call(self):
    """
    Method to call the plugin directly.

    Passes '-PdList' to self.megacli() and splits the returned standard
    output into one record per physical drive; a new record starts at
    every 'Enclosure Device ID' line. Records having both an enclosure
    and a slot number are stored in self.drive (keyed by '[enc:slot]')
    and their keys are appended to self.drive_list.

    Every stored drive is then evaluated:
      * media errors and predictive failures => critical
      * other errors are reported, but don't raise the state
      * a firmware state not in the list of good states => warning for
        'Rebuild' and 'Copyback', else critical; a missing firmware
        state is handled like an unknown one (critical)
      * a foreign state other than 'None' => critical; if the output
        contains no foreign state for a drive, this check is skipped

    At the end the performance data are added and self.exit() is called
    with the worst state found and the summary message.
    """

    state = nagios.state.ok
    out = "State of physical drives of MegaRaid adapter %d seems to be okay." % (
        self.adapter_nr)

    # Enclosure Device ID: 0
    re_enc = re.compile(r'^\s*Enclosure\s+Device\s+ID\s*:\s*(\d+)', re.IGNORECASE)

    # Slot Number: 23
    re_slot = re.compile(r'^\s*Slot\s+Number\s*:\s*(\d+)', re.IGNORECASE)

    # Device Id: 6
    re_dev_id = re.compile(r'^\s*Device\s+Id\s*:\s*(\d+)', re.IGNORECASE)

    # Media Error Count: 0
    re_media_errors = re.compile(r'^\s*Media\s+Error\s+Count\s*:\s*(\d+)', re.IGNORECASE)

    # Other Error Count: 0
    re_other_errors = re.compile(r'^\s*Other\s+Error\s+Count\s*:\s*(\d+)', re.IGNORECASE)

    # Predictive Failure Count: 0
    re_pred_failures = re.compile(
        r'^\s*Predictive\s+Failure\s+Count\s*:\s*(\d+)', re.IGNORECASE)

    # Firmware state: Online, Spun Up
    re_fw_state = re.compile(r'^\s*Firmware\s+state\s*:\s*(\S+.*)', re.IGNORECASE)

    # Foreign State: None
    re_foreign_state = re.compile(r'^\s*Foreign\s+state\s*:\s*(\S+.*)', re.IGNORECASE)

    good_fw_states = (
        r'Online,\s+Spun\s+Up',
        r'Hotspare,\s+Spun\s+Up',
        r'Hotspare,\s+Spun\s+Down',
        r'Unconfigured\(good\),\s+Spun\s+Up',
        r'Unconfigured\(good\),\s+Spun\s+Down',
    )
    warn_fw_states = (
        r'Rebuild',
        r'Copyback',
    )

    good_fw_pattern = r'^\s*(?:' + r'|'.join(good_fw_states) + r')\s*$'
    warn_fw_pattern = r'^\s*(?:' + r'|'.join(warn_fw_states) + r')\s*$'

    re_good_fw_state = re.compile(good_fw_pattern, re.IGNORECASE)
    re_warn_fw_state = re.compile(warn_fw_pattern, re.IGNORECASE)

    # Per-drive fields: (pattern, key in the drive record, converter).
    # A converter of None keeps the matched text unchanged.
    field_parsers = (
        (re_slot, 'slot', int),
        (re_dev_id, 'dev_id', int),
        (re_media_errors, 'media_errors', int),
        (re_other_errors, 'other_errors', int),
        (re_pred_failures, 'predictive_failures', int),
        (re_fw_state, 'fw_state', None),
        (re_foreign_state, 'foreign_state', None),
    )

    def store_drive(dev):
        # Registers a finished drive record, if it can be identified
        # by enclosure and slot number.
        if not dev:
            return
        if ('enclosure' in dev) and ('slot' in dev):
            pd_id = '[%d:%d]' % (dev['enclosure'], dev['slot'])
            self.drive_list.append(pd_id)
            self.drive[pd_id] = dev

    drives_total = 0

    args = ('-PdList', )
    # NOTE(review): exit_code is not evaluated here, in contrast to the
    # check of logical drives - confirm whether self.megacli() handles
    # failures on its own.
    (stdoutdata, stderrdata, ret, exit_code) = self.megacli(args)
    if self.verbose > 3:
        log.debug("Output on StdOut:\n%s", stdoutdata)

    cur_dev = None

    for line in stdoutdata.splitlines():

        line = line.strip()

        m = re_enc.search(line)
        if m:
            # A new drive record starts, the previous one is complete.
            store_drive(cur_dev)
            drives_total += 1
            cur_dev = {
                'enclosure': int(m.group(1)),
                'media_errors': 0,
                'other_errors': 0,
                'predictive_failures': 0,
                'fw_state': None,
                'foreign_state': None,
            }
            continue

        if cur_dev is None:
            # Nothing to assign to before the first enclosure line
            continue

        for (pattern, key, convert) in field_parsers:
            m = pattern.search(line)
            if m:
                value = m.group(1)
                if convert is not None:
                    value = convert(value)
                cur_dev[key] = value
                break

    # The last record is not followed by another enclosure line
    store_drive(cur_dev)

    media_errors = 0
    other_errors = 0
    predictive_failures = 0
    fw_state_wrong = 0
    foreign_state_wrong = 0

    errors = []

    for pd_id in self.drive_list:

        cur_dev = self.drive[pd_id]
        drv_desc = []
        disk_state = nagios.state.ok

        if cur_dev['media_errors']:
            disk_state = max_state(disk_state, nagios.state.critical)
            drv_desc.append("%d media errors" % (cur_dev['media_errors']))
            media_errors += 1

        # Other errors are reported, but they don't change the state
        if cur_dev['other_errors']:
            drv_desc.append("%d other errors" % (cur_dev['other_errors']))
            other_errors += 1

        if cur_dev['predictive_failures']:
            disk_state = max_state(disk_state, nagios.state.critical)
            drv_desc.append(
                "%d predictive failures" % (cur_dev['predictive_failures']))
            predictive_failures += 1

        # fw_state stays None if the output had no 'Firmware state' line
        # for this drive - searching in None would raise a TypeError, so
        # it is handled like an unknown (wrong) state.
        fw_state = cur_dev['fw_state']
        if fw_state is None or not re_good_fw_state.search(fw_state):
            if fw_state is not None and re_warn_fw_state.search(fw_state):
                disk_state = max_state(disk_state, nagios.state.warning)
            else:
                disk_state = max_state(disk_state, nagios.state.critical)
            drv_desc.append("wrong firmware state %r" % (fw_state, ))
            fw_state_wrong += 1

        # foreign_state stays None if the output had no 'Foreign State'
        # line for this drive; then there is nothing to complain about.
        foreign_state = cur_dev['foreign_state']
        if foreign_state is not None and foreign_state.lower() != "none":
            disk_state = max_state(disk_state, nagios.state.critical)
            drv_desc.append("wrong foreign state %r" % (foreign_state, ))
            foreign_state_wrong += 1

        # Every finding above has left a description
        found_errors = bool(drv_desc)

        if found_errors:
            state = max_state(state, disk_state)
            dd = "drive %s has " % (pd_id)
            dd += ' and '.join(drv_desc)
            errors.append(dd)

        if found_errors or self.verbose > 1:
            log.debug(
                "State of drive %s is %s.", pd_id,
                nagios.plugin.functions.STATUS_TEXT[disk_state])

    log.debug("Found %d drives.", drives_total)
    if self.verbose > 2:
        log.debug("Found Pds:\n%s", self.drive_list)
        log.debug("Found Pd data:\n%s", self.drive)

    if errors:
        out = ', '.join(errors)

    self.add_perfdata(label='drives_total', value=drives_total, uom='')
    self.add_perfdata(label='media_errors', value=media_errors, uom='')
    self.add_perfdata(label='other_errors', value=other_errors, uom='')
    self.add_perfdata(label='predictive_failures', value=predictive_failures, uom='')
    self.add_perfdata(label='wrong_fw_state', value=fw_state_wrong, uom='')
    self.add_perfdata(label='wrong_foreign_state', value=foreign_state_wrong, uom='')

    self.exit(state, out)