async def check(self):
    """Check system/CA certificates for parse failures, expiration and revocation.

    Returns a list of Alert objects.
    """
    alerts = []
    # system certs/cas
    certs = await self.middleware.call('certificate.query', [['certificate', '!=', None]])
    certs.extend(await self.middleware.call('certificateauthority.query'))
    # service certs/cas
    check_for_revocation = await self._get_service_certs()
    check_for_revocation.extend(await self._get_cas())
    parsed = {}
    for cert in certs:
        # make sure the certs have been parsed correctly
        if not cert['parsed']:
            alerts.append(Alert(
                CertificateParsingFailedAlertClass,
                {"type": cert["cert_type"].capitalize(), "name": cert["name"]},
            ))
        else:
            # check the parsed certificate(s) for expiration.
            # BUG FIX: the original compared `cert['cert_type'].capitalize()`
            # (which yields 'Certificate') against the all-caps literal
            # 'CERTIFICATE' — that can never match, so the expiration branch
            # was unreachable. Compare case-insensitively instead.
            if cert['cert_type'].upper() == 'CERTIFICATE':
                diff = (datetime.strptime(cert['until'], '%a %b %d %H:%M:%S %Y') - datetime.utcnow()).days
                if diff < 10:
                    if diff >= 0:
                        alerts.append(Alert(
                            CertificateIsExpiringSoonAlertClass if diff <= 2 else CertificateIsExpiringAlertClass,
                            {'name': cert['name'], 'days': diff},
                            key=[cert['name']],
                        ))
                    else:
                        alerts.append(Alert(
                            CertificateExpiredAlertClass,
                            {'name': cert['name']},
                            key=[cert['name']],
                        ))
            parsed[cert['id']] = cert['revoked']
    # check the parsed certificate(s) for revocation
    for i in filter(lambda i: parsed.get(i['id']), check_for_revocation):
        alerts.append(Alert(
            CertificateRevokedAlertClass,
            {'service': i['service'], 'type': i['type']},
        ))
    return alerts
async def __run_source(self, source_name):
    """Run one alert source by name and normalize its result to a list of Alerts.

    Connectivity-style CallErrors produce a short message; any other failure
    captures the full traceback. UnavailableException propagates unchanged.
    """
    alert_source = ALERT_SOURCES[source_name]
    try:
        alerts = (await alert_source.check()) or []
    except UnavailableException:
        raise
    except Exception as e:
        transient = (errno.ECONNABORTED, errno.ECONNREFUSED, errno.ECONNRESET,
                     errno.EHOSTDOWN, errno.ETIMEDOUT)
        if isinstance(e, CallError) and e.errno in transient:
            # Connection-level failures: the short message is enough detail.
            detail = str(e)
        else:
            detail = traceback.format_exc()
        alerts = [
            Alert(AlertSourceRunFailedAlertClass, args={
                "source_name": alert_source.name,
                "traceback": detail,
            })
        ]
    else:
        if not isinstance(alerts, list):
            alerts = [alerts]
        for alert in alerts:
            alert.source = source_name
    return alerts
async def check(self):
    """Warn when the SATA DOM boot device's wear level is low (M/Z-series only)."""
    dmi = await self.middleware.call("system.dmidecode_info")
    if not dmi["system-product-name"].startswith(("TRUENAS-M", "TRUENAS-Z")):
        return []
    alerts = []
    for disk in await self.middleware.call("boot.get_disks"):
        if not disk.startswith("sda"):
            continue
        lifetime = await self.middleware.call("disk.sata_dom_lifetime_left", disk)
        if lifetime is None:
            continue
        percent = int(lifetime * 100 + 0.5)  # round fraction to nearest percent
        if lifetime <= 0.1:
            alerts.append(Alert(SATADOMWearCriticalAlertClass,
                                {"disk": disk, "lifetime": percent}))
        elif lifetime <= 0.2:
            alerts.append(Alert(SATADOMWearWarningAlertClass,
                                {"disk": disk, "lifetime": percent}))
    return alerts
async def check(self):
    """NTP health check: alert when there is no active peer or the offset is too big.

    Returns an Alert, or None when everything is healthy (or it is too early
    after boot to judge).
    """
    # Give ntpd 5 minutes after boot to synchronize before judging.
    uptime_seconds = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
    if uptime_seconds < 300:
        return
    try:
        peers = await self.middleware.call("system.ntpserver.peers")
    except Exception:
        self.middleware.logger.warning("Failed to retrieve peers.", exc_info=True)
        peers = []
    if not peers:
        return
    active_peer = [x for x in peers if x['status'].endswith('PEER')]
    if not active_peer:
        return Alert(
            NTPHealthCheckAlertClass,
            {'reason': f'No NTP peers: {[{x["remote"]: x["status"]} for x in peers]}'}
        )
    peer = active_peer[0]
    # BUG FIX: the offset is signed (negative when the local clock is ahead);
    # compare its magnitude against the 5-minute (300000 ms) limit, otherwise
    # a large negative offset would incorrectly pass the check.
    if abs(peer['offset']) < 300000:
        return
    return Alert(
        NTPHealthCheckAlertClass,
        {'reason': f'{peer["remote"]} has an offset of {peer["offset"]}, which exceeds permitted value of 5 minutes.'}
    )
async def check(self):
    """Alert when snapshot counts exceed the total or per-dataset limits.

    Returns the first applicable Alert (total limit takes precedence), or None.
    """
    per_dataset_limit = await self.middleware.call("pool.snapshottask.max_count")
    total_limit = await self.middleware.call("pool.snapshottask.max_total_count")
    total = 0
    counts = defaultdict(int)
    for snapshot in await self.middleware.call("zfs.snapshot.query", [], {"select": ["name"]}):
        total += 1
        counts[snapshot["name"].split("@")[0]] += 1
    if total > total_limit:
        return Alert(
            SnapshotTotalCountAlertClass,
            {"count": total, "max": total_limit},
            key=None,
        )
    # Report the alphabetically-first dataset that exceeds its limit.
    for dataset in sorted(counts):
        if counts[dataset] > per_dataset_limit:
            return Alert(
                SnapshotCountAlertClass,
                {"dataset": dataset, "count": counts[dataset], "max": per_dataset_limit},
                key=None,
            )
def check_sync(self):
    """Alert about pools with new feature flags available and out-of-date
    ZFS filesystem versions."""
    alerts = []
    for pool in self.middleware.call_sync("pool.query"):
        if not self.is_upgraded(pool):
            alerts.append(Alert(
                "New feature flags are available for volume %s. Refer "
                "to the \"Upgrading a ZFS Pool\" section of the User "
                "Guide for instructions.",
                pool["name"],
            ))
    # `zfs upgrade` with no arguments lists filesystems below the current version.
    result = subprocess.run(
        "zfs upgrade | grep FILESYSTEM",
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding="utf8",
    )
    if result.stdout.strip(" ").strip("\n"):
        alerts.append(Alert(
            "ZFS filesystem version is out of date. Consider upgrading"
            " using \"zfs upgrade\" command line."))
    return alerts
async def check(self):
    """HA health check: internal interfaces present, versions match, VRRP
    states agree, and failover status is sane.

    Returns a list of Alerts (empty when healthy). Raises UnavailableException
    while the remote node is not ready.
    """
    if not await self.middleware.call('failover.licensed'):
        return []
    elif not await self.middleware.call('failover.internal_interfaces'):
        return [Alert(FailoverInterfaceNotFoundAlertClass)]
    try:
        if not await self.middleware.call('failover.call_remote', 'system.ready'):
            raise UnavailableException()
        local_version = await self.middleware.call('system.version')
        remote_version = await self.middleware.call('failover.call_remote', 'system.version')
        if local_version != remote_version:
            return [Alert(TrueNASVersionsMismatchAlertClass)]
        local_states = await self.middleware.call('failover.vip.get_states')
        remote_states = await self.middleware.call('failover.call_remote', 'failover.vip.get_states')
        if errors := await self.middleware.call('failover.vip.check_states', local_states, remote_states):
            return [Alert(VRRPStatesDoNotAgreeAlertClass, {'error': err}) for err in errors]
    except CallError as e:
        # ECONNREFUSED means the other node is simply down; anything else is
        # worth surfacing as a failed status check.
        if e.errno != errno.ECONNREFUSED:
            return [Alert(FailoverStatusCheckFailedAlertClass, [str(e)])]
    if await self.middleware.call('failover.status') in ('ERROR', 'UNKNOWN'):
        return [Alert(FailoverFailedAlertClass)]
    return []
async def check(self):
    """Alert for certificates/CAs expiring within 10 days or already expired."""
    certs = await self.middleware.call('certificate.query', [['certificate', '!=', None]])
    certs += await self.middleware.call('certificateauthority.query')
    alerts = []
    for cert in certs:
        if not cert['parsed']:
            continue
        remaining = (datetime.strptime(cert['until'], '%a %b %d %H:%M:%S %Y') - datetime.utcnow()).days
        if remaining >= 10:
            continue
        if remaining < 0:
            alerts.append(Alert(CertificateExpiredAlertClass,
                                {'name': cert['name']},
                                key=[cert['name']]))
        else:
            # "Expiring soon" escalates within the final 2 days.
            klass = CertificateIsExpiringSoonAlertClass if remaining <= 2 else CertificateIsExpiringAlertClass
            alerts.append(Alert(klass,
                                {'name': cert['name'], 'days': remaining},
                                key=[cert['name']]))
    return alerts
async def check(self):
    """Surface flag files left behind when certificates/CAs could not be used."""
    alerts = []
    if os.path.exists("/tmp/alert_invalid_ssl_nginx"):
        alerts.append(Alert(
            "System does not support certificates with keys shorter than 1024 bits. "
            "HTTPS will not be enabled until a certificate having at least 1024 bit "
            "keylength is provided",
        ))
    if os.path.exists("/tmp/alert_invalid_ssl_conf"):
        alerts.append(Alert("Certificate setup failed for HTTPS to be enabled"))
    # One flag file per bad certificate; the name after the second "_" is the
    # certificate's name.
    for path in glob.glob("/var/tmp/alert_invalidcert_*"):
        alerts.append(Alert(
            "The Certificate: %(cert_name)s is either malformed "
            "or invalid and cannot be used for any services. "
            "This Alert will remain here until the certificate is deleted",
            {"cert_name": path.split("_", 2)[-1]},
        ))
    for path in glob.glob("/var/tmp/alert_invalidCA_*"):
        alerts.append(Alert(
            "The Certificate Authority(CA): %(CA_name)s is either "
            "malformed or invalid and cannot be used for any services. "
            "This Alert will remain here until the CA is deleted",
            {"CA_name": path.split("_", 2)[-1]},
        ))
    return alerts
async def check(self):
    """Emit success/failure alerts for enabled replication tasks based on
    their last run state."""
    alerts = []
    for replication in await self.middleware.call(
            "replication.query", [["enabled", "=", True]]):
        if replication["state"]["state"] == "FINISHED":
            alerts.append(Alert(
                ReplicationSuccessAlertClass,
                {"name": replication["name"]},
                key=[replication["state"]["datetime"].isoformat()],
                datetime=replication["state"]["datetime"],
            ))
        if replication["state"]["state"] == "ERROR":
            # BUG FIX: the ERROR branch previously raised
            # ReplicationSuccessAlertClass, mislabeling failed replications
            # as successes (its args even included the error message).
            alerts.append(Alert(
                ReplicationFailedAlertClass,
                {
                    "name": replication["name"],
                    "message": replication["state"]["error"],
                },
                key=[replication["state"]["datetime"].isoformat()],
                datetime=replication["state"]["datetime"],
            ))
    return alerts
async def check(self):
    """Per enclosure: emit an unhealthy alert for each problematic element,
    or a single healthy alert when none were found."""
    alerts = []
    for num, enc in enumerate(await self.middleware.call('enclosure.query')):
        healthy = True
        # Flatten every element of every element group in this enclosure.
        for ele in sum([e['elements'] for e in enc['elements']], []):
            # Only fall through (via `pass`) for statuses that indicate a
            # problem; everything else is skipped with `continue`.
            if ele['status'] in ['Critical', 'Noncritical', 'Unrecoverable']:
                pass
            elif ele['status'] == 'Not Installed' and ele['name'] in ['Power Supply']:
                pass
            else:
                continue
            # Enclosure element is CRITICAL in single head, ignore this for now
            # See #11918
            if ele['name'] == 'Enclosure':
                continue
            # The 1.8V sensor is bugged on the echostream enclosure. The
            # management chip loses it's mind and claims undervoltage, but
            # scoping this confirms the voltage is fine.
            # Ignore alerts from this element.
            # #10077
            if enc['name'] == 'ECStream 3U16+4R-4X6G.3 d10c':
                if ele['descriptor'] == '1.8V Sensor':
                    continue
            healthy = False
            alerts.append(
                Alert(
                    EnclosureUnhealthyAlertClass,
                    args=[
                        num, enc['name'],
                        f"{ele['name']} {hex(ele['slot'])} {ele['descriptor']}",
                        ele['status'], ele['value_raw']
                    ],
                ))
        if healthy:
            alerts.append(
                Alert(
                    EnclosureHealthyAlertClass,
                    args=[num, enc['name']],
                ))
    return alerts
async def check(self):
    """On licensed HA systems, alert when disks present on one node are
    missing from the other.

    Returns a single-alert list, or None when there is no mismatch.
    """
    licensed = await self.middleware.call('failover.licensed')
    if licensed and (md := await self.middleware.call('failover.mismatch_disks')):
        if md['missing_remote']:
            return [
                Alert(DisksAreNotPresentOnStandbyNodeAlertClass,
                      {'serials': ', '.join(md['missing_remote'])})
            ]
        # BUG FIX: the second branch re-tested `missing_remote` (making it
        # dead code) and reported the standby node's serials; the active-node
        # alert must be driven by `missing_local`.
        if md['missing_local']:
            return [
                Alert(DisksAreNotPresentOnActiveNodeAlertClass,
                      {'serials': ', '.join(md['missing_local'])})
            ]
def produce_nvdimm_alerts(i, critical_health, nvdimm_health, es_health):
    """Build alerts for NVDIMM `i` from its raw sysctl health output blobs."""
    alerts = []
    critical_health = parse_sysctl(critical_health)
    nvdimm_health = parse_sysctl(nvdimm_health)
    es_health = parse_sysctl(es_health)
    # Bit 0x4 is tolerated; any other set bit in the critical-health mask is a problem.
    if int(critical_health["Critical Health Info"].split(":")[0], 16) & ~0x4:
        alerts.append(Alert(NVDIMMAlertClass, {
            "i": i,
            "k": "Critical Health Info",
            "value": critical_health["Critical Health Info"],
        }))
    for key in ("Module Health", "Error Threshold Status", "Warning Threshold Status"):
        if nvdimm_health[key] != "0x0":
            alerts.append(Alert(NVDIMMAlertClass, {
                "i": i,
                "k": key,
                "value": nvdimm_health[key],
            }))
    # NVM and ES lifetime percentages: warn below 20%, critical at 10% or less.
    for name, raw in (("NVM", nvdimm_health["NVM Lifetime"]),
                      ("ES", es_health["ES Lifetime Percentage"])):
        lifetime = int(raw.rstrip("%"))
        if lifetime < 20:
            klass = NVDIMMLifetimeWarningAlertClass if lifetime > 10 else NVDIMMLifetimeCriticalAlertClass
            alerts.append(Alert(klass, {
                "i": i,
                "name": name,
                "value": lifetime,
            }))
    return alerts
async def check(self):
    """Legacy pool-health check via notifier.zpool_status; alerts for any
    non-HEALTHY decrypted pool."""
    if not await self.enabled():
        return
    alerts = []
    for pool in await self.middleware.call("pool.query"):
        if not pool["is_decrypted"]:
            continue
        state, status = await self.middleware.call("notifier.zpool_status", pool["name"])
        if state == "HEALTHY":
            continue
        if not (await self.middleware.call("system.is_freenas")):
            # Best-effort enclosure sync on TrueNAS hardware; failure is ignored.
            try:
                await self.middleware.call("notifier.zpool_enclosure_sync", pool["name"])
            except Exception:
                pass
        alerts.append(Alert(
            "The volume %(volume)s state is %(state)s: %(status)s",
            {
                "volume": pool["name"],
                "state": state,
                "status": status,
            }))
    return alerts
def check_sync(self):
    """Report every pool whose scrub is currently paused."""
    with libzfs.ZFS() as zfs:
        return [
            Alert(ScrubPausedAlertClass, pool.name)
            for pool in zfs.pools
            if pool.scrub.pause is not None
        ]
def check_sync(self):
    """Alert when SSH login failures are found in the system logs.

    Returns an Alert, or None when there were no failures.
    """
    failures = get_login_failures(datetime.now(), catmsgs())
    if not failures:
        return
    return Alert(SSHLoginFailuresAlertClass, {
        "count": len(failures),
        # Log lines are bytes; decode leniently for display.
        "failures": b"".join(failures).decode("utf-8", "ignore"),
    })
async def check(self):
    """Alert for scrubs that have been paused for more than 8 hours."""
    alerts = []
    for pool in await self.middleware.call("pool.query"):
        paused_at = pool["scan"]["pause"]
        if paused_at is not None and paused_at < datetime.now() - timedelta(hours=8):
            alerts.append(Alert(ScrubPausedAlertClass, pool["name"]))
    return alerts
def check_sync(self):
    """Report core files in the system cores directory.

    Known-noisy cores (syslog-ng on FreeBSD, su) and cores older than 5 days
    are deleted instead of reported. Returns an Alert or None.
    """
    cores = "/var/db/system/cores"
    try:
        entries = os.listdir(cores)
    except Exception:
        return
    corefiles = []
    for name in entries:
        if not name.endswith(".core"):
            continue
        path = os.path.join(cores, name)
        if not os.path.isfile(path):
            continue
        if (IS_FREEBSD and name == "syslog-ng.core") or name == "su.core":
            os.unlink(path)
        elif os.stat(path).st_mtime < time.time() - 86400 * 5:
            # Stale core (> 5 days old): clean it up silently.
            os.unlink(path)
        else:
            corefiles.append(name)
    if corefiles:
        return Alert(CoreFilesArePresentAlertClass,
                     {"corefiles": ', '.join(corefiles)})
async def test(self, data):
    """Send a test alert through the given alert-service configuration.

    Returns True when the service accepted the test alert, False otherwise
    (all failures are logged, not raised).
    """
    await self._validate(data, "alert_service_test")
    factory = ALERT_SERVICES_FACTORIES.get(data["type"])
    if factory is None:
        self.logger.error("Alert service %r does not exist", data["type"])
        return False
    try:
        service = factory(self.middleware, data["attributes"])
    except Exception:
        self.logger.error(
            "Error creating alert service %r with parameters=%r",
            data["type"], data["attributes"], exc_info=True)
        return False
    test_alert = Alert(
        title="Test alert",
        node="A",
        datetime=datetime.utcnow(),
        level=AlertLevel.INFO,
    )
    try:
        await service.send([test_alert], [], [test_alert])
    except Exception:
        self.logger.error("Error in alert service %r", data["type"], exc_info=True)
        return False
    return True
def check_sync(self):
    """Alert when smartd is enabled but not running.

    Skipped when virtualized (SMART is unavailable to guests) or when this
    node is not the failover MASTER.
    """
    enabled = self.middleware.call_sync(
        "datastore.query", "services.services",
        [("srv_service", "=", "smartd"), ("srv_enable", "=", True)])
    if not enabled:
        return
    # sysctl kern.vm_guest will return a hypervisor name, or the string "none"
    # if FreeNAS is running on bare iron.
    vm_guest = subprocess.run(
        ["/sbin/sysctl", "-n", "kern.vm_guest"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        encoding="utf8",
    ).stdout.strip()
    if vm_guest != "none":
        # We got something other than "none", maybe "vmware", "xen", "vbox".
        # smartd not running in these environments isn't a huge deal, so skip alerting.
        return
    try:
        if self.middleware.call_sync("notifier.failover_status") != "MASTER":
            return
    except Exception:
        return
    proc = subprocess.Popen(["/usr/sbin/service", "smartd", "status"],
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            encoding="utf8")
    status = proc.communicate()[0]
    if proc.returncode == 1:
        return Alert(status)
def _produce_sensor_alert(self, sensor): if sensor["lowarn"] and sensor["value"] < sensor["lowarn"]: relative = "below" if sensor["value"] < sensor["locrit"]: level = "critical" else: level = "recommended" elif sensor["hiwarn"] and sensor["value"] > sensor["hiwarn"]: relative = "above" if sensor["value"] > sensor["hicrit"]: level = "critical" else: level = "recommended" else: return return Alert( SensorAlertClass, { "name": sensor["name"], "relative": relative, "level": level, "value": sensor["value"], "desc": sensor["desc"], }, key=[sensor["name"], relative, level], )
async def check(self):
    """Alert for every decrypted pool that is not healthy, syncing enclosure
    LEDs on TrueNAS hardware first (best-effort)."""
    if not await self.enabled():
        return
    alerts = []
    for pool in await self.middleware.call("pool.query"):
        if not pool["is_decrypted"] or pool['healthy']:
            continue
        if not (await self.middleware.call("system.is_freenas")):
            try:
                await self.middleware.call("enclosure.sync_zpool", pool["name"])
            except Exception:
                pass
        alerts.append(Alert(
            VolumeStatusAlertClass,
            {
                "volume": pool["name"],
                "state": pool["status"],
                "status": pool["status_detail"],
            }))
    return alerts
def check_sync(self):
    """Alert FreeNAS Mini (C2750D4I board) owners running BMC firmware
    older than 0.30, linking to update instructions."""
    product = pipeopen(
        "/usr/local/sbin/dmidecode -s system-product-name").communicate()[0].strip()
    board = pipeopen(
        "/usr/local/sbin/dmidecode -s baseboard-product-name").communicate()[0].strip()
    if "freenas" not in product.lower() or board != "C2750D4I":
        return
    mcinfo = pipeopen("/usr/local/bin/ipmitool mc info").communicate()[0]
    match = re.search(r"Firmware Revision.*: (\S+)", mcinfo, flags=re.M)
    if not match:
        return
    raw = match.group(1)
    try:
        version = [int(part) for part in raw.split(".")]
    except ValueError:
        logger.warning("Failed to parse BMC firmware version: {}".format(raw))
        return
    # Only firmware 0.x with x < 30 triggers the alert.
    if len(version) < 2 or not (version[0] == 0 and version[1] < 30):
        return
    return Alert(
        "FreeNAS Mini Critical IPMI Firmware Update - Your "
        "Mini has an available IPMI firmware update, please "
        "click <a href=\"%s\" target=\"_blank\">here</a> for "
        "installation instructions",
        "https://support.ixsystems.com/index.php?/Knowledgebase/Article/View/287"
    )
def check_sync(self):
    """Report VMware login failures recorded (pickled) by the snapshot code."""
    try:
        with LockFile(VMWARELOGIN_FAILS):
            with open(VMWARELOGIN_FAILS, "rb") as f:
                fails = pickle.load(f)
    except Exception:
        # No failure file (or unreadable): nothing to report.
        return
    alerts = []
    for oid, errmsg in list(fails.items()):
        try:
            vmware = self.middleware.call_sync(
                "datastore.query", "storage.vmwareplugin",
                [["id", "=", oid]], {"get": True})
        except IndexError:
            # The VMware configuration was deleted; skip its stale entry.
            continue
        alerts.append(Alert(VMWareLoginFailedAlertClass, {
            "hostname": vmware["hostname"],
            "error": errmsg,
        }))
    return alerts
def check_sync(self):
    """Alert FreeNAS Mini (C2750D4I board) owners running BMC firmware
    older than 0.30."""
    dmi = self.middleware.call_sync('system.dmidecode_info')
    if "freenas" not in dmi['system-product-name'].lower():
        return
    if dmi['baseboard-product-name'] != "C2750D4I":
        return
    mcinfo = subprocess.run(
        ["ipmitool", "mc", "info"],
        capture_output=True,
        text=True,
    ).stdout
    match = re.search(r"Firmware Revision.*: (\S+)", mcinfo, flags=re.M)
    if not match:
        return
    raw = match.group(1)
    try:
        version = [int(part) for part in raw.split(".")]
    except ValueError:
        logger.warning("Failed to parse BMC firmware version: {}".format(raw))
        return
    # Only firmware 0.x with x < 30 triggers the alert.
    if len(version) >= 2 and version[0] == 0 and version[1] < 30:
        return Alert(FreeNASBMCAlertClass)
async def check(self):
    """Legacy replication failure alerts based on the task's last result message."""
    ok_messages = ("Succeeded", "Up to date", "Waiting", "Running", "", None)
    alerts = []
    for replication in await self.middleware.call(
            "replication.query", [["enabled", "=", True]]):
        message = replication["lastresult"].get("msg")
        if message in ok_messages:
            continue
        alerts.append(Alert(
            "Replication %(replication)s failed: %(message)s",
            {
                "replication": "%s -> %s:%s" % (
                    replication["filesystem"],
                    replication["remote_hostname"],
                    replication["zfs"],
                ),
                "message": message,
            },
        ))
    return alerts
async def _produce_alerts_for_ipmitool_output(self, output):
    """Turn IPMI SEL output into alerts, skipping records at or before the
    persisted "dismissed" timestamp."""
    alerts = []
    records = parse_ipmitool_output(output)
    if records:
        if await self.middleware.call("keyvalue.has_key", self.dismissed_datetime_kv_key):
            # NOTE(review): the stored value is presumably tz-aware while
            # record timestamps are naive, hence the tzinfo strip — confirm.
            dismissed_datetime = ((await self.middleware.call(
                "keyvalue.get", self.dismissed_datetime_kv_key)).replace(tzinfo=None))
        else:
            # Prevent notifying about existing alerts on first install/upgrade
            dismissed_datetime = max(record.datetime for record in records)
            await self.middleware.call("keyvalue.set", self.dismissed_datetime_kv_key, dismissed_datetime)
    for record in records:
        if record.datetime <= dismissed_datetime:
            continue
        # id/datetime are not alert arguments; the timestamp is passed
        # separately and also folded into the dedup key.
        args = dict(record._asdict())
        args.pop("id")
        args.pop("datetime")
        alerts.append(
            Alert(
                IPMISELAlertClass,
                args,
                key=[args, record.datetime.isoformat()],
                datetime=record.datetime,
            ))
    return alerts
def check_sync(self):
    """Alert for configured fibre-channel-to-target mappings whose HBA port
    is no longer present (per `ctladm portlist`)."""
    portlist_xml = etree.fromstring(
        subprocess.check_output(["ctladm", "portlist", "-x"], encoding="utf-8"))
    present = set()
    for frontend in portlist_xml.xpath("//frontend_type[text()='camtgt']"):
        port = frontend.getparent()
        present.add((
            port.xpath("./port_name")[0].text,
            port.xpath("./physical_port")[0].text,
            port.xpath("./virtual_port")[0].text,
        ))
    alerts = []
    for mapping in self.middleware.call_sync(
            "datastore.query", "services.fibrechanneltotarget"):
        fq_fc_port = mapping["fc_port"]
        # Normalize "name" or "name/phys" to "name/phys/virt" by padding with "/0".
        while fq_fc_port.count("/") < 2:
            fq_fc_port += "/0"
        if tuple(fq_fc_port.split("/", 2)) not in present:
            alerts.append(Alert(
                FCHBANotPresentAlertClass,
                {
                    "port": mapping["fc_port"],
                    "target": mapping["fc_target"]["iscsi_target_name"],
                }))
    return alerts
def check_sync(self):
    """Alert FreeNAS Mini (C2750D4I board) owners running BMC firmware
    older than 0.30."""
    product = pipeopen(
        "/usr/local/sbin/dmidecode -s system-product-name").communicate()[0].strip()
    board = pipeopen(
        "/usr/local/sbin/dmidecode -s baseboard-product-name").communicate()[0].strip()
    if "freenas" not in product.lower() or board != "C2750D4I":
        return
    mcinfo = pipeopen("/usr/local/bin/ipmitool mc info").communicate()[0]
    found = re.search(r"Firmware Revision.*: (\S+)", mcinfo, flags=re.M)
    if not found:
        return
    raw = found.group(1)
    try:
        version = [int(chunk) for chunk in raw.split(".")]
    except ValueError:
        logger.warning("Failed to parse BMC firmware version: {}".format(raw))
        return
    # Only firmware 0.x with x < 30 triggers the alert.
    if len(version) >= 2 and version[0] == 0 and version[1] < 30:
        return Alert(FreeNASBMCAlertClass)
async def initialize(self):
    """Startup: restore persisted alerts and discover alert source/service classes."""
    # On TrueNAS hardware (not FreeNAS), record if this is failover node B.
    if not await self.middleware.call("system.is_freenas"):
        if await self.middleware.call("notifier.failover_node") == "B":
            self.node = "B"
    # Re-hydrate alerts persisted in the database into in-memory Alert objects,
    # indexed by node -> source -> key.
    for alert in await self.middleware.call("datastore.query", "system.alert"):
        del alert["id"]
        alert["level"] = AlertLevel(alert["level"])
        alert = Alert(**alert)
        self.alerts[alert.node][alert.source][alert.key] = alert
    # Give every policy the restored alert set so it starts from a known state.
    for policy in self.policies.values():
        policy.receive_alerts(datetime.utcnow(), self.alerts)
    # Discover AlertSource subclasses: the main "alert/source" directory is
    # scanned first, then each overlay directory (overlays may add sources).
    # The listed base classes are excluded from instantiation.
    main_sources_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), os.path.pardir, "alert", "source")
    sources_dirs = [os.path.join(overlay_dir, "alert", "source") for overlay_dir in self.middleware.overlay_dirs]
    sources_dirs.insert(0, main_sources_dir)
    for sources_dir in sources_dirs:
        for module in load_modules(sources_dir):
            for cls in load_classes(module, AlertSource, (FilePresenceAlertSource, ThreadedAlertSource, OneShotAlertSource)):
                source = cls(self.middleware)
                ALERT_SOURCES[source.name] = source
    # Same discovery pattern for alert service (notification backend) factories,
    # registered by their declared name rather than instantiated here.
    main_services_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), os.path.pardir, "alert", "service")
    services_dirs = [os.path.join(overlay_dir, "alert", "service") for overlay_dir in self.middleware.overlay_dirs]
    services_dirs.insert(0, main_services_dir)
    for services_dir in services_dirs:
        for module in load_modules(services_dir):
            for cls in load_classes(module, _AlertService, (ThreadedAlertService, ProThreadedAlertService)):
                ALERT_SERVICES_FACTORIES[cls.name()] = cls