Example #1
0
    async def check(self):
        """Return the failover alerts that currently apply.

        The checks run in a fixed order and the first one that fails decides
        the result:

        1. ``failover.licensed`` is falsy -> no alerts.
        2. ``failover.internal_interfaces`` is falsy -> interface-not-found
           alert.
        3. Local and remote ``system.version`` differ -> version mismatch
           alert.
        4. ``failover.vip.check_states`` reports errors -> one alert per
           reported error.
        5. A ``CallError`` other than ``ECONNREFUSED`` -> status-check-failed
           alert carrying the error text.
        6. ``failover.status`` is ``ERROR`` or ``UNKNOWN`` -> failover-failed
           alert.

        Returns:
            A list of ``Alert`` objects, possibly empty.

        Raises:
            UnavailableException: ``system.ready`` queried through
                ``failover.call_remote`` returned a falsy value.
        """
        call = self.middleware.call

        licensed = await call('failover.licensed')
        if not licensed:
            return []

        internal_interfaces = await call('failover.internal_interfaces')
        if not internal_interfaces:
            return [Alert(FailoverInterfaceNotFoundAlertClass)]

        try:
            remote_ready = await call('failover.call_remote', 'system.ready')
            if not remote_ready:
                raise UnavailableException()

            ours = await call('system.version')
            theirs = await call('failover.call_remote', 'system.version')
            if ours != theirs:
                return [Alert(TrueNASVersionsMismatchAlertClass)]

            local_states = await call('failover.vip.get_states')
            remote_states = await call('failover.call_remote',
                                       'failover.vip.get_states')
            problems = await call('failover.vip.check_states',
                                  local_states, remote_states)
            if problems:
                return [
                    Alert(VRRPStatesDoNotAgreeAlertClass, {'error': problem})
                    for problem in problems
                ]
        except CallError as exc:
            # A refused connection is not reported here; execution falls
            # through to the overall failover status check below.
            if exc.errno != errno.ECONNREFUSED:
                return [Alert(FailoverStatusCheckFailedAlertClass, [str(exc)])]

        status = await call('failover.status')
        if status in ('ERROR', 'UNKNOWN'):
            return [Alert(FailoverFailedAlertClass)]

        return []
Example #2
0
    async def check(self):
        """Build capacity alerts for the pools returned by ``zfs.pool.query``.

        Each pool's ``properties.capacity.parsed`` value is read as an integer
        percentage. Pools where that value is missing or not numeric are
        skipped. A pool produces at most one alert, for the highest threshold
        it has reached (90 critical, 80 warning, 70 notice).

        Returns:
            A list of ``Alert`` objects keyed by pool name.

        Raises:
            UnavailableException: some pool sits exactly one percent below a
                threshold (89, 79 or 69). The whole check is aborted so the
                alert does not flap while usage hovers around the boundary.
        """
        pools = await self.middleware.call("zfs.pool.query")

        raised = []
        for pool in pools:
            try:
                percent_used = int(pool["properties"]["capacity"]["parsed"])
            except (KeyError, ValueError):
                # No usable capacity figure for this pool.
                continue

            levels = (
                (90, ZpoolCapacityCriticalAlertClass),
                (80, ZpoolCapacityWarningAlertClass),
                (70, ZpoolCapacityNoticeAlertClass),
            )
            for threshold, alert_class in levels:
                if percent_used == threshold - 1:
                    # Just under a boundary: leave the alert in whatever
                    # state it was in before rather than toggling it.
                    raise UnavailableException()
                if percent_used < threshold:
                    continue

                details = {
                    "volume": pool["name"],
                    "capacity": percent_used,
                }
                raised.append(Alert(alert_class, details, key=[pool["name"]]))
                break

        return raised
Example #3
0
    def check_sync(self):
        """Alert when the reporting database uses more than 1 GiB.

        The size is the ``used`` figure that ``psutil.disk_usage`` reports for
        ``/var/db/collectd/rrd``.

        Returns:
            An ``Alert`` whose single argument is the human-readable size when
            the limit is exceeded, otherwise ``None``.

        Raises:
            UnavailableException: the RRD path does not exist.
        """
        limit_bytes = 1024 ** 3  # 1 GiB

        try:
            used_bytes = psutil.disk_usage('/var/db/collectd/rrd').used
        except FileNotFoundError:
            raise UnavailableException()

        if used_bytes <= limit_bytes:
            return None

        readable = humanfriendly.format_size(used_bytes)
        return Alert('Reporting database size (%s) is above 1 GiB',
                     args=[readable])
    def check_sync(self):
        """Raise a ``ReportingDbAlertClass`` alert above 1 GiB of RRD usage.

        The size is the ``used`` figure that ``psutil.disk_usage`` reports for
        ``/var/db/collectd/rrd``.

        Returns:
            An ``Alert`` (with ``key=None``) whose args are the human-readable
            size when the limit is exceeded, otherwise ``None``.

        Raises:
            UnavailableException: the RRD path does not exist.
        """
        limit_bytes = 1024 ** 3  # 1 GiB

        try:
            used_bytes = psutil.disk_usage('/var/db/collectd/rrd').used
        except FileNotFoundError:
            raise UnavailableException()

        if used_bytes <= limit_bytes:
            return None

        readable = humanfriendly.format_size(used_bytes)
        return Alert(ReportingDbAlertClass, args=readable, key=None)
Example #5
0
    async def __call__(self, args):
        """Run ``ipmitool`` with ``args`` and return its standard output.

        Consecutive failures are counted in ``self.errors``; a successful run
        resets the counter to zero.

        Args:
            args: list of command-line arguments appended after ``ipmitool``.

        Returns:
            The command's stdout, decoded as UTF-8 with undecodable bytes
            ignored.

        Raises:
            UnavailableException: the command failed and fewer than five
                failures have accumulated.
            CallError: the command failed and this is at least the fifth
                consecutive failure; the message carries the exit code and
                stderr.
        """
        result = await run(
            ["ipmitool"] + args,
            check=False,
            encoding="utf8",
            errors="ignore",
        )

        if result.returncode == 0:
            self.errors = 0
            return result.stdout

        self.errors += 1
        if self.errors >= 5:
            raise CallError(
                f"ipmitool failed (code={result.returncode}): {result.stderr}")

        # Still below the failure limit: report "unavailable" instead of an
        # error.
        raise UnavailableException()
Example #6
0
    def check_sync(self):
        """Alert when RRD usage exceeds a fixed limit of 1610611911 bytes.

        The size is the ``used`` figure that ``shutil.disk_usage`` reports for
        ``/var/db/collectd/rrd``.

        Returns:
            An ``Alert`` for ``ReportingDbAlertClass`` (``key=None``) carrying
            the formatted ``used`` and ``threshold`` sizes when the limit is
            exceeded, otherwise ``None``.

        Raises:
            UnavailableException: the RRD path does not exist.
        """
        limit_bytes = 1610611911  # bytes

        try:
            used_bytes = shutil.disk_usage('/var/db/collectd/rrd').used
        except FileNotFoundError:
            raise UnavailableException()

        if used_bytes <= limit_bytes:
            return None

        # ``zfs list`` prints sizes in binary units (KiB/MiB/GiB) while
        # format_size() defaults to decimal ones. Binary formatting is
        # requested so the numbers shown to the user match ``zfs list``.
        details = {
            'used': format_size(used_bytes, binary=True),
            'threshold': format_size(limit_bytes, binary=True),
        }
        return Alert(ReportingDbAlertClass, details, key=None)
Example #7
0
    def check_sync(self):
        """Alert when RRD usage exceeds 1 GiB plus 1 MiB per queried disk.

        The size is the ``used`` figure that ``shutil.disk_usage`` reports for
        ``/var/db/collectd/rrd``. The limit grows by 1 MiB for every entry
        returned by ``disk.query``.

        Returns:
            An ``Alert`` for ``ReportingDbAlertClass`` (``key=None``) carrying
            the formatted ``used`` and ``threshold`` sizes when the limit is
            exceeded, otherwise ``None``.

        Raises:
            UnavailableException: the RRD path does not exist.
        """
        try:
            used_bytes = shutil.disk_usage('/var/db/collectd/rrd').used
        except FileNotFoundError:
            raise UnavailableException()

        disks = self.middleware.call_sync('disk.query')
        limit_bytes = 1024 ** 3 + len(disks) * 1024 ** 2

        if used_bytes <= limit_bytes:
            return None

        # ``zfs list`` prints sizes in binary units (KiB/MiB/GiB) while
        # format_size() defaults to decimal ones. Binary formatting is
        # requested so the numbers shown to the user match ``zfs list``.
        details = {
            'used': format_size(used_bytes, binary=True),
            'threshold': format_size(limit_bytes, binary=True),
        }
        return Alert(ReportingDbAlertClass, details, key=None)
Example #8
0
    def check_sync(self):
        """Collect failover-related alerts and return them as a list.

        The method returns early with an empty list when ``failover.licensed``
        is falsy, and with a single interface-not-found alert when
        ``failover.internal_interfaces`` is falsy. Otherwise it compares local
        and remote versions and VIP states, then inspects ``failover.status``,
        the CARP lines of the first internal interface, the
        ``kern.cam.ctl.ha_link`` sysctl and, when status is ``BACKUP``, the
        passphrase keys listed in ``FAILOVER_JSON``.

        Returns:
            list: ``Alert`` objects, possibly empty.

        Raises:
            UnavailableException: ``system.ready`` queried through
                ``failover.call_remote`` returned a falsy value.
        """
        alerts = []

        # ``failover.licensed`` is falsy: nothing to check.
        if not self.middleware.call_sync('failover.licensed'):
            return alerts

        # Without internal interfaces none of the checks below can run.
        if not self.middleware.call_sync('failover.internal_interfaces'):
            alerts.append(Alert(FailoverInterfaceNotFoundAlertClass))
            return alerts

        try:
            # Result is unused; the call matters only for the CallError it
            # may raise.
            self.middleware.call_sync('failover.call_remote', 'core.ping')

            local_version = self.middleware.call_sync('system.version')
            remote_version = self.middleware.call_sync('failover.call_remote',
                                                       'system.version')
            # A version mismatch is reported on its own; later checks are
            # skipped.
            if local_version != remote_version:
                return [Alert(TrueNASVersionsMismatchAlertClass)]

            if not self.middleware.call_sync('failover.call_remote',
                                             'system.ready'):
                raise UnavailableException()

            local = self.middleware.call_sync('failover.vip.get_states')
            remote = self.middleware.call_sync('failover.call_remote',
                                               'failover.vip.get_states')

            # One alert per error reported by ``failover.vip.check_states``.
            errors = self.middleware.call_sync('failover.vip.check_states',
                                               local, remote)
            for error in errors:
                alerts.append(
                    Alert(
                        CARPStatesDoNotAgreeAlertClass,
                        {"error": error},
                    ))

        except CallError as e:
            # ECONNREFUSED is ignored and the remaining checks still run.
            # Any other CallError replaces everything collected so far with
            # a single status-check-failed alert.
            if e.errno != errno.ECONNREFUSED:
                return [Alert(FailoverStatusCheckFailedAlertClass, [str(e)])]

        status = self.middleware.call_sync('failover.status')

        if status == 'ERROR':
            # Use the contents of /tmp/.failover_failed as the error message
            # when the file exists and is non-empty.
            errmsg = None
            if os.path.exists('/tmp/.failover_failed'):
                with open('/tmp/.failover_failed', 'r') as fh:
                    errmsg = fh.read()
            if not errmsg:
                errmsg = 'Unknown error'

            alerts.append(Alert(FailoverFailedAlertClass, [errmsg]))

        elif status not in ('MASTER', 'BACKUP', 'SINGLE'):
            # Any status other than ERROR and the three listed here raises
            # the external-link alert.
            alerts.append(Alert(ExternalFailoverLinkStatusAlertClass))

        internal_ifaces = self.middleware.call_sync(
            'failover.internal_interfaces')
        if internal_ifaces:
            # Keep only the ifconfig lines of the first internal interface
            # that match ``vhid (10|20) `` and contain ``carp:``.
            # NOTE(review): the interface name is interpolated into a
            # shell=True command string; presumably it is trusted middleware
            # output -- confirm.
            p1 = subprocess.Popen(
                "/sbin/ifconfig %s|grep -E 'vhid (10|20) '|grep 'carp:'" %
                internal_ifaces[0],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                shell=True,
                encoding='utf8',
            )
            stdout = p1.communicate()[0].strip()
            # After strip(), exactly one newline means exactly two matching
            # lines. Any other count raises the internal-link alert unless
            # status is SINGLE.
            if status != "SINGLE" and stdout.count("\n") != 1:
                alerts.append(Alert(InternalFailoverLinkStatusAlertClass))

        if status != "SINGLE":
            # Best effort: any failure reading the sysctl is silently ignored.
            try:
                if sysctl.filter('kern.cam.ctl.ha_link')[0].value == 1:
                    alerts.append(Alert(CTLHALinkAlertClass))
            except Exception:
                pass

        if status == 'BACKUP':
            # fobj stays None when FAILOVER_JSON cannot be read or parsed.
            fobj = None
            try:
                with open(FAILOVER_JSON, 'r') as f:
                    fobj = json.loads(f.read())
            except Exception:
                pass
            # NOTE(review): if fobj is still None, the subscript below raises
            # TypeError, which ``except Exception`` swallows, so the key
            # check is skipped silently. The same handler also hides a
            # failure of the middleware calls inside this block.
            try:
                if len(fobj['phrasedvolumes']) > 0:
                    keys = self.middleware.call_sync(
                        'failover.encryption_keys')['geli']
                    not_found = False
                    # One alert for every listed pool with no entry in the
                    # 'geli' keys.
                    for pool in fobj['phrasedvolumes']:
                        if pool not in keys:
                            not_found = True
                            alerts.append(
                                Alert(NoFailoverPassphraseKeysAlertClass,
                                      {'pool': pool}))
                    if not_found:
                        # At least one key is missing: invoke
                        # ``failover.sync_keys_to_remote_node`` through
                        # ``failover.call_remote``.
                        self.middleware.call_sync(
                            'failover.call_remote',
                            'failover.sync_keys_to_remote_node')
            except Exception:
                pass

        return alerts