def check(zone=None, slowdown=80, _max=90):
    """Report the CPU temperature read from the kernel's thermal zones.

    Args:
        zone: Identifier of the thermal zone to read (it is interpolated
            into ``/sys/class/thermal/thermal_zone<zone>/temp``). When
            ``None`` (or an empty string), zones 0-2 are scanned and the
            hottest reading is reported.
        slowdown: Temperature in °C at or above which the CPU is reported
            as overheated.
        _max: Temperature in °C shown as the upper end of the status bar.

    Returns:
        dict: Status with the keys ``value``, ``bar_min``, ``bar_max``,
        ``bar_percent``, ``healthy`` and ``reason``.

    Raises:
        Exception: If no zone yielded a reading above 0°C (missing,
            unreadable and unparsable zones are skipped).
    """
    # Decide on "not given" explicitly instead of by truthiness, so that an
    # explicit zone 0 is honoured rather than triggering a scan of all zones.
    if zone is None or zone == "":
        zones = range(3)
    else:
        zones = [zone]

    hottest = 0
    for x in zones:
        try:
            with open("/sys/class/thermal/thermal_zone%s/temp" % x) as f:
                reading = f.read()
            # The last three digits are dropped to obtain whole degrees
            # (the sysfs value is presumably in millidegrees Celsius).
            value = int(reading.strip()[:-3])
        except (OSError, ValueError):
            # Zone is missing, unreadable, or did not report a usable
            # number; try the next one instead of failing the whole check.
            continue
        if value > hottest:
            hottest = value

    if not hottest:
        raise Exception("Could not find any thermal zone")

    status = {
        "value": "%s°C" % hottest,
        "bar_min": "15°C",
        "bar_max": "%s°C" % _max,
        "bar_percent": bar_percent(hottest, _max, 15),
    }
    if hottest < slowdown:
        status["healthy"] = True
        status["reason"] = "CPU temperature nominal"
    else:
        status["healthy"] = False
        status["reason"] = "CPU overheated"
    return status
def check(_min=216, _max=253):
    """Report the UPS line voltage and whether it lies inside the window.

    Args:
        _min: Lower voltage bound; also the lower end of the status bar.
        _max: Upper voltage bound; also the upper end of the status bar.

    Returns:
        dict: Status with the keys ``bar_min``, ``bar_max``,
        ``bar_percent``, ``value`` and ``healthy``. ``healthy`` is true only
        when the voltage is strictly between ``_min`` and ``_max``.
    """
    # Only the "LINEV" entry of the UPS data is used by this check.
    line_voltage = get_ups_data()["LINEV"]
    return {
        "bar_min": "%sV" % _min,
        "bar_max": "%sV" % _max,
        "bar_percent": bar_percent(line_voltage, _max, _min),
        "value": "%sV" % line_voltage,
        "healthy": _min < line_voltage < _max,
    }
def check(host, _min=10, _max=35):
    """Fetch a temperature reading from a networked sensor and rate it.

    The sensor is queried at ``http://<host>/fresh.xml``; the reading is
    taken from the ``val`` attribute of the document root's first child and
    truncated to an integer.

    Args:
        host: Host name or address of the sensor.
        _min: Lower bound in °C; also the lower end of the status bar.
        _max: Upper bound in °C; also the upper end of the status bar.

    Returns:
        dict: Status with the keys ``value``, ``bar_min``, ``bar_max``,
        ``bar_percent`` and ``healthy``. ``healthy`` is true only when the
        reading is strictly between ``_min`` and ``_max``.
    """
    response = requests.get("http://%s/fresh.xml" % host, timeout=5)
    # Fail loudly on HTTP error statuses instead of parsing an error page.
    response.raise_for_status()

    first_child = ET.fromstring(response.text)[0]
    # The attribute may carry a fractional part; go through float first.
    degrees = int(float(first_child.attrib["val"]))

    return {
        "value": "%s°C" % degrees,
        "bar_min": "%s°C" % _min,
        "bar_max": "%s°C" % _max,
        "bar_percent": bar_percent(degrees, _max, _min),
        "healthy": _min < degrees < _max,
    }
def check(slowdown=88, _max=93):
    """Report the GPU temperature as printed by ``nvidia-smi``.

    Runs ``nvidia-smi -q -d TEMPERATURE`` and parses its ``key : value``
    lines. The slowdown and shutdown thresholds reported by the tool
    override the arguments when they are present and numeric.

    Args:
        slowdown: Fallback temperature in °C at or above which the GPU is
            reported as overheated.
        _max: Fallback temperature in °C shown as the upper end of the
            status bar.

    Returns:
        dict: Status with the keys ``value``, ``bar_min``, ``bar_max``,
        ``bar_percent``, ``healthy`` and ``reason``.

    Raises:
        KeyError: If the output has no "GPU Current Temp" entry.
        ValueError: If the current temperature is not numeric.
        Errors raised by ``run`` (command missing, non-zero exit status or
        timeout) propagate unchanged.
    """
    result = run(
        ["nvidia-smi", "-q", "-d", "TEMPERATURE"],
        timeout=10,
        check=True,
        stdout=PIPE,
    )

    state = {}
    for line in result.stdout.decode().splitlines():
        try:
            # Lines without exactly one colon (headers, timestamps) are
            # not key/value pairs and are skipped.
            key, val = line.split(":")
        except ValueError:
            continue
        state[key.strip()] = val.strip()

    # The last two characters are dropped before parsing (presumably the
    # trailing " C" unit suffix).
    value = int(state["GPU Current Temp"][:-2])

    # Each threshold is looked up on its own: when the tool omits it or
    # prints a non-numeric placeholder, don't change the default for that
    # threshold, and don't let it prevent the other one from being used.
    try:
        slowdown = int(state["GPU Slowdown Temp"][:-2])
    except (KeyError, ValueError):
        pass
    try:
        _max = int(state["GPU Shutdown Temp"][:-2])
    except (KeyError, ValueError):
        pass

    status = {
        "value": "%s°C" % value,
        "bar_min": "15°C",
        "bar_max": "%s°C" % _max,
        "bar_percent": bar_percent(value, _max, 15),
    }
    if value < slowdown:
        status["healthy"] = True
        status["reason"] = "GPU temperature nominal"
    else:
        status["healthy"] = False
        status["reason"] = "GPU overheated"
    return status
def check(_max=300):
    """Report the normalised CPU load as a percentage.

    The first value returned by ``os.getloadavg()`` is divided by the
    number of CPUs, i.e. the average number of waiting processes per
    processor, and expressed as a whole percentage.

    Args:
        _max: Load percentage at or above which the CPU is reported as
            overloaded.

    Returns:
        dict: Status with the keys ``value``, ``bar_min``, ``bar_max``,
        ``bar_percent``, ``healthy`` and ``reason``.
    """
    per_cpu_load = os.getloadavg()[0] / multiprocessing.cpu_count()
    percent = int(per_cpu_load * 100)
    nominal = percent < _max

    return {
        "value": "%s%%" % percent,
        "bar_min": "0%",
        "bar_max": "100%",
        # The bar is capped at 100% even though _max may be higher.
        "bar_percent": bar_percent(percent, 100),
        "healthy": nominal,
        "reason": "CPU usage nominal" if nominal else "CPU overloaded",
    }
def check(mountpoint="/"):
    """Report how full the filesystem mounted at ``mountpoint`` is.

    Usage is computed as the total size minus the space available to
    unprivileged users (``f_bavail``).

    Args:
        mountpoint: Path on the filesystem to inspect.

    Returns:
        dict: Status with the keys ``value``, ``bytes``, ``bar_min``,
        ``bar_max``, ``bar_percent``, ``healthy`` and ``reason``. The
        filesystem is unhealthy once usage reaches 90% of the total.

    Raises:
        OSError: If ``mountpoint`` cannot be inspected (e.g. it does not
            exist).
    """
    s = os.statvfs(mountpoint)
    # f_blocks and f_bavail are counted in units of f_frsize (the fragment
    # size). f_bsize is only the preferred I/O block size and gives wrong
    # byte counts on filesystems where the two differ.
    free = s.f_frsize * s.f_bavail
    total = s.f_frsize * s.f_blocks
    usage = total - free
    percent = bar_percent(usage, total)

    status = {
        "value": human_bytes(usage),
        "bytes": usage,
        "bar_min": "0 GB",
        "bar_max": human_bytes(total),
        "bar_percent": percent,
    }
    if usage < 0.9 * total:
        status["healthy"] = True
        status["reason"] = "Disk usage nominal"
    else:
        status["healthy"] = False
        status["reason"] = "Disk is nearly full"
    return status
def check():
    """Report the RAM used by applications, excluding buffers and cache.

    Reads ``/proc/meminfo`` and computes
    ``MemTotal - MemFree - Buffers - Cached``
    (see http://www.linuxatemyram.com/ for the rationale).

    Returns:
        dict: Status with the keys ``value``, ``bytes``, ``bar_min``,
        ``bar_max``, ``bar_percent``, ``healthy`` and ``reason``. Memory is
        unhealthy once usage reaches 90% of the total.

    Raises:
        OSError: If ``/proc/meminfo`` cannot be read.
        KeyError: If one of the four required fields is missing.
    """
    # Values in /proc/meminfo are in kB.
    info = {}
    with open("/proc/meminfo") as f:
        for line in f:
            # Raw string: "\w", "\s" and "\d" are regex classes, not string
            # escapes, and are invalid escape sequences in a plain literal.
            m = re.search(r"(\w+):\s*(\d+)", line)
            if m:
                info[m.group(1)] = int(m.group(2))

    used = info["MemTotal"] - info["MemFree"] - info["Buffers"] - info["Cached"]
    total = info["MemTotal"] * 1024
    # used by applications, not cache/buffers
    value = used * 1024

    status = {
        "value": human_bytes(value),
        "bytes": value,
        "bar_min": "0 GB",
        "bar_max": human_bytes(total),
        "bar_percent": bar_percent(value, total),
    }
    if value < 0.9 * total:
        status["healthy"] = True
        status["reason"] = "RAM usage nominal"
    else:
        status["healthy"] = False
        status["reason"] = "RAM usage too high"
    return status