def wmi_calculate_raw_average_time(table, row, column):
    """Return the per-second rate of a WMI counter divided by its "_Base" rate.

    Both the raw counter and its base counter are converted to rates using
    the WMI sample time; 0 is returned while the base rate is zero (i.e. no
    progress between samples yet).
    """
    raw_value = int(table.get(row, column))
    raw_base = int(table.get(row, column + "_Base"))
    timestamp = get_wmi_time(table, row)

    prefix = "%s_%s" % (column, table.name())
    value_rate = get_rate(prefix, timestamp, raw_value)
    base_rate = get_rate(prefix + "_Base", timestamp, raw_base)

    if base_rate == 0:
        return 0
    # true division: the result is a fractional ratio, not an integer count
    return value_rate / base_rate
def wmi_yield_raw_persec(
    table: WMITable,
    row: Union[str, int],
    column: Union[str, int],
    infoname: Optional[str],
    perfvar: Optional[str],
    levels=None,
):
    """Check a WMI counter as a per-second rate against optional levels."""
    # A table may have been present at discovery time and vanished since;
    # treat that as "nothing happened" instead of an error.
    if table is None:
        return 0, "", []

    if row == "":
        row = 0

    try:
        value = table.get(row, column)
        assert value
    except KeyError:
        return 3, "Item not present anymore", []

    rate = get_rate(
        "%s_%s" % (column, table.name),
        get_wmi_time(table, row),
        int(value),
    )
    return check_levels(rate, perfvar, get_levels_quadruple(levels), infoname=infoname)
def check_firewall_if(item, params, data):
    """Yield one (state, infotext, perfdata) result per firewall counter."""
    infotext_names = {
        "ip4_in_blocked": "Incoming IPv4 packets blocked: ",
    }
    now = time.time()
    for what, counter in data.items():
        countername = "firewall_if-%s.%s" % (what, item)
        pkt_rate = get_rate(countername, now, counter, onwrap=RAISE)

        # Optionally smooth the rate over the configured backlog before the
        # levels are applied; perfdata always carries the raw rate.
        backlog_minutes = params.get("averaging")
        checked_value = (
            get_average(countername, now, pkt_rate, backlog_minutes)
            if backlog_minutes
            else pkt_rate
        )

        state, text, extraperf = check_levels(
            checked_value,
            what,
            params.get(what),
            human_readable_func=lambda x: "%.2f pkts/s" % x,
            infoname=infotext_names[what],
        )
        perfdata: List[Any] = [(what, pkt_rate)] + extraperf[:1]  # type: ignore[operator]
        yield state, text, perfdata
def check_hp_msa_io(item, params, parsed):
    """Derive per-disk read/write throughput rates and delegate to check_diskstat_dict."""
    now = time.time()
    disks = {}
    for key, values in parsed.items():
        disk = disks.setdefault(key, {})
        # A counter may be missing or malformed; skip that direction silently.
        try:
            disk["read_throughput"] = get_rate(
                "%s_read" % key, now, int(values["data-read-numeric"])
            )
        except (KeyError, ValueError, TypeError):
            pass
        try:
            disk["write_throughput"] = get_rate(
                "%s_write" % key, now, int(values["data-written-numeric"])
            )
        except (KeyError, ValueError, TypeError):
            pass
    return check_diskstat_dict(item, params, disks)
def wmi_calculate_raw_average_time(
    table: WMITable,
    row: Union[str, int],
    column: str,
) -> float:
    """Return the measure rate divided by its "_Base" rate, or 0 while the base rate is zero."""
    measure = table.get(row, column)
    base = table.get(row, column + "_Base")
    assert measure
    assert base

    when = get_wmi_time(table, row)
    counter = "%s_%s" % (column, table.name)
    measure_rate = get_rate(counter, when, int(measure))
    base_rate = get_rate("%s_Base" % counter, when, int(base))

    if base_rate == 0:
        return 0
    # true division: the result is a fractional ratio of the two rates
    return measure_rate / base_rate
def check_cpu_util_linux_container(_no_item, params, parsed):
    """Check container CPU utilization derived from container vs. system ticks."""
    con_ticks = parsed.get("container_ticks")
    sys_ticks = parsed.get("system_ticks")
    num_cpus = parsed.get("num_cpus")
    if None in (con_ticks, sys_ticks, num_cpus):
        return

    # The system tick counter acts as the "clock" for the rate computation
    # here, not wall time.
    tick_rate = get_rate("container_ticks", sys_ticks, con_ticks)
    utilization = tick_rate * num_cpus * 100.0
    return check_cpu_util(utilization, params, perf_max=num_cpus * 100)
def handle_graylog_messages(messages, params):
    """Yield results for the total, averaged and time-windowed graylog message counts."""
    # 1) total message count
    yield check_levels(
        messages,
        "messages",
        params.get("msgs_upper", (None, None)) + params.get("msgs_lower", (None, None)),
        human_readable_func=int,
        infoname="Total number of messages",
    )

    # 2) averaged message rate over the configured backlog (minutes)
    avg_key = "msgs_avg"
    avg_minutes = params.get(avg_key, 30)
    now = time.time()
    msg_rate = get_rate("graylog_%s.rate" % avg_key, now, messages)
    avg_rate = get_average("graylog_%s.avg" % avg_key, now, msg_rate, avg_minutes)
    yield check_levels(
        avg_rate,
        avg_key,
        params.get("msgs_avg_upper", (None, None))
        + params.get("msgs_avg_lower", (None, None)),
        infoname="Average number of messages (%s)" % get_age_human_readable(avg_minutes * 60),
    )

    # 3) message count difference over the configured timespan (seconds)
    diff_key = "msgs_diff"
    timespan = params.get(diff_key, 1800)
    diff = _get_value_diff("graylog_%s" % diff_key, messages, timespan)
    yield check_levels(
        diff,
        "graylog_diff",
        params.get("%s_upper" % diff_key, (None, None))
        + params.get("%s_lower" % diff_key, (None, None)),
        infoname="Total number of messages last %s" % get_age_human_readable(timespan),
    )
def check_azure_metric(  # pylint: disable=too-many-locals
        resource,
        metric_key,
        cmk_key,
        display_name,
        levels=None,
        levels_lower=None,
        use_rate=False):
    """Check a single Azure metric of a resource, optionally converted to a rate."""
    metric = resource.get('metrics', {}).get(metric_key)
    if metric is None:
        return None

    if use_rate:
        countername = "%s.%s" % (resource['id'], metric_key)
        value = get_rate(countername, time.time(), metric.value)
        unit = "%s_rate" % metric.unit
    else:
        value, unit = metric.value, metric.unit

    if value is None:
        return 3, "Metric %s is 'None'" % display_name, []

    # normalize to SI units
    if unit == "milli_seconds":
        value /= 1000.
    elif unit == "seconds_rate":
        # Seconds measured, but rate computed -> seconds per second: the
        # fraction of the period during which something happened, e.g. CPU
        # time -> percentage of time the CPU was busy.
        value *= 100.
        unit = "percent"

    return check_levels(
        value,
        cmk_key,
        (levels or (None, None)) + (levels_lower or (None, None)),
        infoname=display_name,
        human_readable_func=_AZURE_METRIC_FMT.get(unit, str),  # type: ignore[arg-type]
        boundaries=(0, None),
    )
def _check_diskstat_old(item, params, this_time, info):
    """Legacy summary of read or write throughput summed over all devices.

    Counter values are 512-byte sectors; the infotext reports MB/s, the
    perfdata kB/s.
    """
    if item == 'read':
        index = 2  # sectors read
    elif item == 'write':
        index = 3  # sectors written
    else:
        return (3, "invalid item %s" % (item,))

    total_sectors = 0
    for line in info:
        # a non-None node column means clustered data, which is unsupported here
        if line[0] is not None:
            return 3, "read/write mode not supported in a cluster"
        # skip summed-up multi-device entries such as "sda sdb"
        if ' ' not in line[1]:
            total_sectors += int(line[index])

    sectors_per_sec = get_rate("diskstat." + item, this_time, total_sectors)
    mb_per_s = sectors_per_sec / 2048.0  # 512-byte sectors -> MB/s
    kb_per_s = sectors_per_sec / 2.0
    return (0, "%.1f MB/s" % mb_per_s, [(item, "%f" % kb_per_s)])
def _check_diskstat_old(
    item: str, params: Any, this_time: float, info: Sequence[Sequence[Any]]
) -> Union[Tuple[int, str], Tuple[int, str, Sequence[Tuple[str, str]]]]:
    """Legacy summary of read or write throughput summed over all devices.

    Counter values are 512-byte sectors; the infotext reports MB/s, the
    perfdata kB/s.
    """
    try:
        # column holding the sector counter for the requested direction
        index = {"read": 2, "write": 3}[item]
    except KeyError:
        return (3, "invalid item %s" % (item, ))

    total_sectors = 0
    for line in info:
        if line[0] is not None:  # node column set -> clustered data
            return 3, "read/write mode not supported in a cluster"
        if " " not in line[1]:  # skip summed-up entries such as "sda sdb"
            total_sectors += int(line[index])

    per_sec = get_rate("diskstat." + item, this_time, total_sectors)
    return (
        0,
        "%.1f MB/s" % (per_sec / 2048.0),  # 512-byte sectors -> MB/s
        [(item, "%f" % (per_sec / 2.0))],  # perfdata in kB/s
    )
def size_trend(
    check,
    item,
    resource,
    levels,
    used_mb,
    size_mb: float,
    timestamp=None,
):  # pylint: disable=function-redefined
    """Trend computation for size related checks of disks, ram, etc.

    Trends are computed in two steps. In the first step the delta to
    the last check is computed, using a normal check_mk counter.
    In the second step an average over that counter is computed to
    make a long-term prediction.

    Note:
      This function is experimental and may change in future releases.
      Use at your own risk!

    Args:
      check (str): The name of the check, e.g. "df".
      item (str): The name of the item, e.g. the mountpoint "/" for df.
      resource (str): The resource in question, e.g. "disk", "ram", "swap".
      levels (dict): Level parameters for the trend computation. Items:
          "trend_range"          : 24,       # interval for the trend in hours
          "trend_perfdata"       : True      # generate perfomance data for trends
          "trend_bytes"          : (10, 20), # change during trend_range
          "trend_shrinking_bytes": (16, 32), # Bytes of shrinking during trend_range
          "trend_perc"           : (1, 2),   # percent change during trend_range
          "trend_shrinking_perc" : (1, 2),   # percent decreasing change during trend_range
          "trend_timeleft"       : (72, 48)  # time left in hours until full
          "trend_showtimeleft    : True      # display time left in infotext
        The item "trend_range" is required. All other items are optional.
      timestamp (float, optional): Time in secs used to calculate the rate
        and average. Defaults to "None".
      used_mb (float): Used space in MB.
      size_mb (float): Max. available space in MB.

    Returns:
      A tuple of (state, infotext, perfdata) for the trend computation.
      If a MKCounterWrapped occurs (i.e. there is not enough data
      present for the trend computation) the tuple (0, '', []) is
      returned.
    """
    # perfdata entries are either (name, value) or a full
    # (name, value, warn, crit, min, max) tuple
    perfdata: List[
        Union[
            Tuple[str, float],
            Tuple[str, float, Optional[float], Optional[float], Optional[float], Optional[float]],
        ]
    ]
    state, infotext, perfdata, problems = 0, "", [], []

    MB = 1024.0 * 1024.0
    H24 = 60 * 60 * 24

    range_hours = levels["trend_range"]
    range_sec = range_hours * 3600.0
    if not timestamp:
        timestamp = time.time()

    # compute current rate in MB/s by computing delta since last check;
    # negative rates are allowed since usage may shrink
    try:
        rate = get_rate(
            "%s.%s.delta" % (check, item), timestamp, used_mb, allow_negative=True, onwrap=RAISE
        )
    except MKCounterWrapped:
        # need more data for computing a trend
        return 0, "", []

    if levels.get("trend_perfdata"):
        perfdata.append(("growth", rate * H24))  # growth in MB per day

    # average trend in MB/s, initialized with zero (by default)
    rate_avg = get_average("%s.%s.trend" % (check, item), timestamp, rate, range_sec / 60.0)

    # extrapolated change (in MB) over one full trend range
    trend = rate_avg * range_sec
    sign = "+" if trend > 0 else ""
    infotext += ", trend: %s%s / %g hours" % (
        sign,
        get_bytes_human_readable(trend * MB),
        range_hours,
    )

    # levels for performance data (filled in by the byte/percent branches below)
    warn_perf: Optional[float] = None
    crit_perf: Optional[float] = None

    # apply levels for absolute growth / interval
    trend_bytes = levels.get("trend_bytes")
    if trend_bytes:
        wa, cr = trend_bytes
        warn_perf, crit_perf = wa / MB, cr / MB
        if trend * MB >= wa:
            # the "(!" opened here is closed with ")" after the crit check
            problems.append(
                "growing too fast (warn/crit at %s/%s per %.1f h)(!"
                % (
                    get_bytes_human_readable(wa),
                    get_bytes_human_readable(cr),
                    range_hours,
                )
            )
            state = max(1, state)
            if trend * MB >= cr:
                state = 2
                problems[-1] += "!"
            problems[-1] += ")"

    # absolute shrinking levels
    tmp_state, tmp_problem = _check_shrinking(
        trend * MB,
        levels.get("trend_shrinking_bytes"),
        range_hours,
        get_bytes_human_readable,
    )
    if tmp_state > 0:
        state = max(state, tmp_state)
        problems.append(tmp_problem)

    # apply levels for growth relative to filesystem size
    trend_perc: Optional[Tuple[float, float]] = levels.get("trend_perc")
    if trend_perc:
        wa_perc, cr_perc = trend_perc
        wa = wa_perc / 100.0 * size_mb
        cr = cr_perc / 100.0 * size_mb
        # keep the stricter of absolute and relative levels for perfdata
        if warn_perf is not None:
            assert crit_perf is not None
            warn_perf = min(warn_perf, wa)
            crit_perf = min(crit_perf, cr)
        else:
            warn_perf, crit_perf = wa, cr
        if trend >= wa:
            problems.append(
                "growing too fast (warn/crit at %s/%s per %.1f h)(!"
                % (
                    get_percent_human_readable(wa_perc),
                    get_percent_human_readable(cr_perc),
                    range_hours,
                )
            )
            state = max(1, state)
            if trend >= cr:
                state = 2
                problems[-1] += "!"
            problems[-1] += ")"

    # relative shrinking levels
    tmp_state, tmp_problem = _check_shrinking(
        100 * trend / size_mb,
        levels.get("trend_shrinking_perc"),
        range_hours,
        get_percent_human_readable,
    )
    if tmp_state > 0:
        state = max(state, tmp_state)
        problems.append(tmp_problem)

    # compute time until filesystem is full (only for positive trend, of course)

    # The start value of hours_left is negative. The pnp graph and the perfometer
    # will interpret this as infinite -> not growing
    hours_left = -1
    if trend > 0:

        def format_hours(hours):
            # human-readable rendering of a duration given in hours
            if hours > 365 * 24:
                return "more than a year"
            elif hours > 90 * 24:
                return "%0d months" % (hours / (30 * 24))  # fixed: true-division
            elif hours > 4 * 7 * 24:  # 4 weeks
                return "%0d weeks" % (hours / (7 * 24))  # fixed: true-division
            elif hours > 7 * 24:  # 1 week
                return "%0.1f weeks" % (hours / (7 * 24))  # fixed: true-division
            elif hours > 2 * 24:  # 2 days
                return "%0.1f days" % (hours / 24)  # fixed: true-division
            return "%d hours" % hours

        hours_left = (size_mb - used_mb) / trend * range_hours
        hours_txt = format_hours(hours_left)

        timeleft = levels.get("trend_timeleft")
        if timeleft:
            wa, cr = timeleft
            if hours_left <= cr:
                state = 2
                problems.append("only %s until %s full(!!)" % (hours_txt, resource))
            elif hours_left <= wa:
                state = max(state, 1)
                problems.append("only %s until %s full(!)" % (hours_txt, resource))
            elif hours_left <= wa * 2 or levels.get("trend_showtimeleft"):
                problems.append("time left until %s full: %s" % (resource, hours_txt))
        elif levels.get("trend_showtimeleft"):
            # no time-left levels configured, but the user wants the info anyway
            problems.append("time left until %s full: %s" % (resource, hours_txt))

    if levels.get("trend_perfdata"):
        perfdata.append(
            (
                "trend",
                rate_avg * H24,  # averaged growth per day
                (warn_perf / range_sec * H24) if warn_perf is not None else None,
                (crit_perf / range_sec * H24) if crit_perf is not None else None,
                0,
                1.0 * size_mb / range_hours,
            )
        )

    if levels.get("trend_showtimeleft"):
        perfdata.append(("trend_hoursleft", hours_left))

    if problems:
        infotext += " - %s" % ", ".join(problems)

    return state, infotext, perfdata
def check_temperature_trend(temp, params, output_unit, crit, crit_lower, unique_name):
    """Check the temperature trend (rate of change) and the projected time
    until a critical limit is reached.

    Returns a (status, infotext) tuple accumulated via the local combiner.
    """
    def combiner(status, infotext):
        # Accumulates the worst status and joins all infotexts; state is kept
        # in attributes on the function object itself.
        if "status" in dir(combiner):
            combiner.status = max(combiner.status, status)
        else:
            combiner.status = status

        if "infotext" in dir(combiner):
            combiner.infotext += ", " + infotext
        else:
            combiner.infotext = infotext

    # NOTE(review): if get_rate raises MKCounterWrapped before the first
    # combiner() call, combiner.status/.infotext are never set and the final
    # return raises AttributeError — confirm this is handled by the caller.
    try:
        trend_range_min = params["period"]
        this_time = time.time()

        # first compute current rate in C/s by computing delta since last check
        rate = get_rate("temp.%s.delta" % unique_name, this_time, temp, allow_negative=True)

        # average trend, initialize with zero (by default), rate_avg is in C/s
        rate_avg = get_average("temp.%s.trend" % unique_name, this_time, rate, trend_range_min)

        # rate_avg is growth in C/s, trend is in C per trend range minutes
        trend = float(rate_avg * trend_range_min * 60.0)
        sign = "+" if trend > 0 else ""
        combiner(
            0,
            "rate: %s%s/%g min" % (sign, render_temp(trend, output_unit, True), trend_range_min))

        if "trend_levels" in params:
            warn_upper_trend, crit_upper_trend = params["trend_levels"]
        else:
            warn_upper_trend = crit_upper_trend = None
        # it may be unclear to the user if he should specify temperature decrease as a negative
        # number or positive. This works either way. Having a positive lower bound makes no
        # sense anyway.
        if "trend_levels_lower" in params:
            warn_lower_trend, crit_lower_trend = [
                abs(x) * -1 for x in params["trend_levels_lower"]
            ]
        else:
            warn_lower_trend = crit_lower_trend = None

        if crit_upper_trend is not None and trend > crit_upper_trend:
            combiner(
                2,
                "rising faster than %s/%g min(!!)" % (render_temp(
                    crit_upper_trend, output_unit, True), trend_range_min),
            )
        elif warn_upper_trend is not None and trend > warn_upper_trend:
            combiner(
                1,
                "rising faster than %s/%g min(!)" % (render_temp(
                    warn_upper_trend, output_unit, True), trend_range_min),
            )
        elif crit_lower_trend is not None and trend < crit_lower_trend:
            combiner(
                2,
                "falling faster than %s/%g min(!!)" % (render_temp(
                    crit_lower_trend, output_unit, True), trend_range_min),
            )
        elif warn_lower_trend is not None and trend < warn_lower_trend:
            combiner(
                1,
                "falling faster than %s/%g min(!)" % (render_temp(
                    warn_lower_trend, output_unit, True), trend_range_min),
            )

        if "trend_timeleft" in params:
            # compute time until temperature limit is reached
            # The start value of minutes_left is negative. The pnp graph and the perfometer
            # will interpret this as infinite -> not growing
            limit = crit if trend > 0 else crit_lower
            if limit:  # crit levels may not be set, especially lower level
                diff_to_limit = limit - temp
                if rate_avg != 0.0:
                    minutes_left = (diff_to_limit / rate_avg) / 60.0  # fixed: true-division
                else:
                    minutes_left = float("inf")

                def format_minutes(minutes):
                    # render a duration in minutes as "Xh YYm" or "N minutes"
                    if minutes > 60:  # hours
                        hours = int(minutes / 60.0)
                        minutes += -int(hours) * 60
                        return "%dh %02dm" % (hours, minutes)
                    return "%d minutes" % minutes

                # NOTE: this rebinds the local name `crit` (the parameter is no
                # longer needed at this point)
                warn, crit = params["trend_timeleft"]
                if minutes_left <= crit:
                    combiner(
                        2, "%s until temp limit reached(!!)" % format_minutes(minutes_left))
                elif minutes_left <= warn:
                    combiner(
                        1, "%s until temp limit reached(!)" % format_minutes(minutes_left))
    except MKCounterWrapped:
        # not enough counter data yet — report whatever was accumulated so far
        pass

    return combiner.status, combiner.infotext
def check_diskstat_line(this_time, item, params, line, mode='sectors'):
    """Check one diskstat data line: throughput, IOs, latency and queue lengths.

    `line` layout: [node, device, read_ctr, write_ctr, (read_ios, write_ios,
    (time_ms, (read_ql_ctr, write_ql_ctr)))]; trailing fields are optional.
    `mode` selects whether the read/write counters are 512-byte sectors or
    plain bytes. Returns a (status, infotext, perfdata) tuple.
    """
    average_range = params.get("average")
    if average_range == 0:
        average_range = None  # disable averaging when 0 is set

    perfdata = []
    infos = []
    status = 0
    node = line[0]
    if node is not None and node != "":
        infos.append("Node %s" % node)

    # --- read/write throughput ---
    for what, ctr in [("read", line[2]), ("write", line[3])]:
        if node:
            countername = "diskstat.%s.%s.%s" % (node, item, what)
        else:
            countername = "diskstat.%s.%s" % (item, what)

        # unpack levels now, need also for perfdata
        levels = params.get(what)
        if isinstance(levels, tuple):
            warn, crit = levels
        else:
            warn, crit = None, None

        per_sec = get_rate(countername, this_time, int(ctr))
        if mode == 'sectors':
            # compute IO rate in bytes/sec
            bytes_per_sec = per_sec * 512
        elif mode == 'bytes':
            bytes_per_sec = per_sec

        dsname = what

        # compute average of the rate over ___ minutes
        if average_range is not None:
            perfdata.append((dsname, bytes_per_sec, warn, crit))
            bytes_per_sec = get_average(countername + ".avg", this_time, bytes_per_sec,
                                        average_range)
            dsname += ".avg"

        # check levels
        state, text, extraperf = check_levels(bytes_per_sec,
                                              dsname,
                                              levels,
                                              scale=1048576,
                                              statemarkers=True,
                                              unit='/s',
                                              human_readable_func=get_bytes_human_readable,
                                              infoname=what)
        if text:
            infos.append(text)
        status = max(state, status)
        perfdata += extraperf

    # Add performance data for averaged IO
    # (interleave raw and averaged values: read, read.avg, write, write.avg)
    if average_range is not None:
        perfdata = [perfdata[0], perfdata[2], perfdata[1], perfdata[3]]

    # Process IOs when available
    ios_per_sec = None
    if len(line) >= 6 and line[4] >= 0 and line[5] > 0:
        reads, writes = map(int, line[4:6])
        if "read_ios" in params:
            warn, crit = params["read_ios"]
            if reads >= crit:
                infos.append('Read operations: %d (!!)' % (reads))
                status = 2
            elif reads >= warn:
                infos.append('Read operations: %d (!)' % (reads))
                status = max(status, 1)
        else:
            warn, crit = None, None
        if "write_ios" in params:
            warn, crit = params["write_ios"]
            if writes >= crit:
                infos.append('Write operations: %d (!!)' % (writes))
                status = 2
            elif writes >= warn:
                infos.append('Write operations: %d (!)' % (writes))
                status = max(status, 1)
        else:
            warn, crit = None, None
        ios = reads + writes
        # NOTE: countername still holds the last value from the read/write
        # loop above (the "write" counter name), so the .ios counter is keyed
        # off that name
        ios_per_sec = get_rate(countername + ".ios", this_time, ios)
        infos.append("IOs: %.2f/sec" % ios_per_sec)

        if params.get("latency_perfdata"):
            perfdata.append(("ios", ios_per_sec))

    # Do Latency computation if this information is available:
    if len(line) >= 7 and line[6] >= 0:
        timems = int(line[6])
        # see NOTE above: countername is the leftover "write" counter name
        timems_per_sec = get_rate(countername + ".time", this_time, timems)
        if not ios_per_sec:
            latency = 0.0
        else:
            latency = timems_per_sec / ios_per_sec  # fixed: true-division
        infos.append("Latency: %.2fms" % latency)
        if "latency" in params:
            warn, crit = params["latency"]
            if latency >= crit:
                status = 2
                infos[-1] += "(!!)"
            elif latency >= warn:
                status = max(status, 1)
                infos[-1] += "(!)"
        else:
            warn, crit = None, None

        if params.get("latency_perfdata"):
            perfdata.append(("latency", latency, warn, crit))

    # Queue Lengths (currently only Windows). Windows uses counters here.
    # I have not understood, why....
    if len(line) >= 9:
        for what, ctr in [("read", line[7]), ("write", line[8])]:
            countername = "diskstat.%s.ql.%s" % (item, what)
            levels = params.get(what + "_ql")
            if levels:
                warn, crit = levels
            else:
                warn, crit = None, None

            qlx = get_rate(countername, this_time, int(ctr))
            ql = qlx / 10000000.0  # scale the raw counter rate to a queue length
            infos.append(what.title() + " Queue: %.2f" % ql)

            # check levels
            if levels is not None:
                if ql >= crit:
                    status = 2
                    infos[-1] += "(!!)"
                elif ql >= warn:
                    status = max(status, 1)
                    infos[-1] += "(!)"

            if params.get("ql_perfdata"):
                perfdata.append((what + "_ql", ql))

    return (status, ", ".join(infos), perfdata)