def _check(
    params: Params,
    section: PodContainers,
    curr_timestamp_seconds: int,
    host_value_store: MutableMapping[str, Any],
) -> CheckResult:
    """Check the pod's total container restart count and its restart rate.

    Args:
        params: rule parameters; ``restart_count`` and ``restart_rate`` are
            either the literal ``"no_levels"`` or a tuple whose second element
            holds the upper levels.
        section: parsed pod containers section.
        curr_timestamp_seconds: current timestamp, injected for testability.
        host_value_store: persisted state used by the rate computation.
    """
    # Total restarts summed over every container of the pod.
    restart_count = sum(container.restart_count for container in section.containers.values())
    yield from check_levels(
        restart_count,
        levels_upper=params["restart_count"][1] if params["restart_count"] != "no_levels" else None,
        metric_name="kube_pod_restart_count",
        render_func=str,
        label="Total",
    )
    restart_rate = _calc_restart_rate_in_last_hour(
        restart_count,
        curr_timestamp_seconds,
        host_value_store,
    )
    # None when no rate could be computed yet (presumably not enough history
    # in the value store) — in that case the rate result is simply omitted.
    if restart_rate is not None:
        yield from check_levels(
            restart_rate,
            levels_upper=params["restart_rate"][1] if params["restart_rate"] != "no_levels" else None,
            metric_name="kube_pod_restart_rate",
            render_func=str,
            label="In last hour",
        )
def check_resource_quota_resource(
    params: Params,
    resource_usage: Optional[PerformanceUsage],
    hard_requirement: Optional[HardResourceRequirement],
    resource_type: ResourceType,
    render_func: Callable[[float], str],
):
    """Check result for resource quota usage & requirement

    While the general picture is similar to check_resource, there is one key
    difference:

    * for resources in check_resource, the resource section contains an
      aggregation of the request and limit values of the underlying containers.
      In resource quota, the configured hard spec value is taken instead
      (aggregated configured values vs single configured value)
      -> while the API data is mandatory for check_resource, it is optional for
      resource quota and the service is allowed to only display the performance
      usage value
    """
    usage = resource_usage.resource.usage if resource_usage is not None else None
    if usage is not None:
        yield from check_levels(
            usage,
            label="Usage",
            levels_upper=params["usage"][1] if params["usage"] != "no_levels" else None,
            metric_name=f"kube_{resource_type}_usage",
            render_func=render_func,
            boundaries=(0.0, None),
        )

    # Without a configured hard spec there is nothing more to report.
    if hard_requirement is None:
        return

    for requirement_type, requirement_value in [
        ("request", hard_requirement.request),
        ("limit", hard_requirement.limit),
    ]:
        if requirement_value is None:
            # user has not configured a value for this requirement
            continue
        requirement_type = cast(RequirementType, requirement_type)
        if requirement_value != 0.0 and usage is not None:
            # Both a non-zero requirement and a usage value: report utilization.
            yield from check_with_utilization(
                usage,
                resource_type=resource_type,
                requirement_type=requirement_type,
                kubernetes_object=None,
                requirement_value=requirement_value,
                params=params,
                render_func=render_func,
            )
        else:  # requirements with no usage
            yield from check_levels(
                requirement_value,
                label=absolute_title[requirement_type],
                metric_name=f"kube_{resource_type}_{requirement_type}",
                render_func=render_func,
                boundaries=(0.0, None),
            )
def check(params: Mapping[str, Any], section: PodConditions) -> CheckResult:
    """Check every condition in the section. Return one result if all conditions
    passed. Otherwise, return four results if one or more conditions are faulty
    or missing, defining each state according to `last_transition_time` and the
    respective levels in `params`.

    A pod transitions through the conditions in the order specified in
    `LOGICAL_ORDER`. The last two conditions, `containersready` and `ready`,
    can be in a failed state simultaneously. When a condition is missing
    (i.e. is `None`), it means that the previous condition is in a failed
    state."""
    if all(cond and cond.status for _, cond in section):
        yield Result(state=State.OK, summary="Ready, all conditions passed")
        return

    section_dict = section.dict()
    curr_timestamp = time.time()

    for name in LOGICAL_ORDER:
        cond_service_text = ADDITIONAL_SERVICE_TEXT[name]
        cond = section_dict[name]
        if cond is not None:
            time_diff = curr_timestamp - cond[
                "last_transition_time"]  # keep the last-seen one
            if cond["status"] is True:
                yield Result(state=State.OK, summary=cond_service_text.passed)
                continue
            summary_prefix = f"{cond_service_text.not_passed} ({cond['reason']}: {cond['detail']})"
        else:
            summary_prefix = cond_service_text.not_passed
        # NOTE(review): if the first condition in LOGICAL_ORDER were missing,
        # `time_diff` would be used before assignment below — presumably the
        # first condition is always present; confirm against the section model.
        for result in check_levels(time_diff,
                                   levels_upper=get_levels_for(params, name),
                                   render_func=render.timespan):
            yield Result(state=result.state,
                         summary=f"{summary_prefix} for {result.summary}")
def check_with_utilization(
    usage: float,
    resource_type: Literal["memory", "cpu"],
    requirement_type: Literal["limit", "request"],
    requirement_value: float,
    param: Param,
    render_func: Callable[[float], str],
) -> Iterable[Union[Metric, Result]]:
    """Yield a utilization Result (usage as percent of the requirement) plus
    the corresponding Metric.

    The summary produced by ``check_levels`` is re-worded so it also shows the
    absolute usage and requirement values.
    """
    utilization = usage * 100.0 / requirement_value
    result, metric = check_levels(
        utilization,
        levels_upper=param[1] if param != "no_levels" else None,
        metric_name=f"kube_{resource_type}_{requirement_type}_utilization",
        render_func=render.percent,
        boundaries=(0.0, None),
    )
    assert isinstance(result, Result)
    # The first word of check_levels' summary is the rendered percentage; any
    # remaining words ("(warn/crit at ...)") are re-appended after our text.
    percentage, *warn_crit = result.summary.split()
    yield Result(
        state=result.state,
        summary=" ".join([
            f"{requirement_type.title()} utilization: {percentage} - {render_func(usage)} of {render_func(requirement_value)}"
        ] + warn_crit),
    )
    yield metric
def check_free_pods(vs_result: VSResultPercent, pod_resources: PodResources,
                    allocatable_pods: int) -> CheckResult:
    """Yield a (notice-only) result for the number of free pod slots.

    Percentage levels are converted into absolute pod counts based on the
    allocatable capacity; absolute levels are used as configured.
    """
    # At the cluster level there can be more pods pending than space available,
    # so the raw difference may be negative — clamp it at zero.
    occupied = len(pod_resources.pending) + len(pod_resources.running)
    free_count = max(0, allocatable_pods - occupied)

    if vs_result == "no_levels":
        levels = None
    elif vs_result[0] == "levels_abs":
        levels = Levels(*vs_result[1])
    else:  # vs_result[0] == "levels_perc"
        levels = Levels(*(math.ceil(pct * allocatable_pods / 100)
                          for pct in vs_result[1]))

    yield from check_levels(
        value=free_count,
        label="Free",
        metric_name="kube_pod_free",
        levels_lower=levels,
        render_func=lambda count: str(int(count)),
        notice_only=True,
    )
def check(params: Mapping[str, Tuple[int, int]], section: PodContainers) -> CheckResult:
    """Report the pod's total restart count and its rate over the last hour."""
    total_restarts = 0
    for container in section.containers.values():
        total_restarts += container.restart_count

    yield from check_levels(
        total_restarts,
        levels_upper=params.get("restart_count"),
        metric_name="kube_pod_restart_count",
        render_func=str,
        label="Total",
    )

    hourly_rate = _calc_restart_rate_in_last_hour(total_restarts)
    yield from check_levels(
        hourly_rate,
        levels_upper=params.get("restart_rate"),
        metric_name="kube_pod_restart_rate",
        render_func=str,
        label="In last hour",
    )
def check_resource(
    params: Params,
    resource_usage: Optional[PerformanceUsage],
    resources: Resources,
    allocatable_resource: Optional[AllocatableResource],
    resource_type: ResourceType,
    render_func: Callable[[float], str],
) -> CheckResult:
    """Check usage, requests, limits and allocatable capacity of one resource.

    Yields a plain usage result (when performance data is available) and, per
    requirement returned by ``requirements_for_object``, either a utilization
    result or the bare requirement value when no usage can be related to it.
    """
    if resource_usage is not None:
        usage = resource_usage.resource.usage
        yield from check_levels(
            usage,
            label="Usage",
            levels_upper=params["usage"][1] if params["usage"] != "no_levels" else None,
            metric_name=f"kube_{resource_type}_usage",
            render_func=render_func,
            boundaries=(0.0, None),
        )
    for requirement_type, kubernetes_object, requirement in requirements_for_object(
        resources, allocatable_resource
    ):
        if requirement != 0.0 and resource_usage is not None:
            # Usage known and requirement non-zero: report utilization, and
            # additionally publish the absolute requirement as its own metric
            # (check_with_utilization only emits the utilization metric).
            result, metric = check_with_utilization(
                usage,
                resource_type,
                requirement_type,
                kubernetes_object,
                requirement,
                params,
                render_func,
            )
            yield Metric(f"kube_{resource_type}_{requirement_type}", requirement)
        else:  # requirements with no usage
            result, metric = check_levels(
                requirement,
                label=absolute_title[requirement_type],
                metric_name=f"kube_{resource_type}_{requirement_type}",
                render_func=render_func,
                boundaries=(0.0, None),
            )
        assert isinstance(result, Result)
        summary = result.summary
        if requirement_type in ["request", "limit"]:
            # Enrich the summary with per-container configuration details.
            summary = f"{result.summary} ({count_overview(resources, requirement_type)})"
        yield Result(state=result.state, summary=summary)
        yield metric
def check_resource(
    params: Params,
    usage: Optional[Usage],
    resources: Resources,
    resource_type: Literal["memory", "cpu"],
    render_func: Callable[[float], str],
) -> CheckResult:
    """Check usage, requests and limits of one resource type.

    Yields a plain usage result (when performance data is present) and, per
    requirement from ``iterate_resources``, either a utilization result or the
    bare configured value.
    """
    if usage is not None:
        total_usage = usage.usage
        yield from check_levels(
            total_usage,
            label="Usage",
            levels_upper=params["usage"][1] if params["usage"] != "no_levels" else None,
            metric_name=f"kube_{resource_type}_usage",
            render_func=render_func,
            boundaries=(0.0, None),
        )
    for requirement_name, requirement in iterate_resources(resources):
        if requirement != 0.0 and usage is not None:
            # Usage known and requirement non-zero: report utilization, and
            # additionally publish the absolute requirement as its own metric
            # (check_with_utilization only emits the utilization metric).
            result, metric = check_with_utilization(
                total_usage,
                resource_type,
                requirement_name,
                requirement,
                params[requirement_name],
                render_func,
            )
            yield Metric(f"kube_{resource_type}_{requirement_name}", requirement)
        else:  # requirements with no usage
            result, metric = check_levels(
                requirement,
                label=requirement_name.title(),
                metric_name=f"kube_{resource_type}_{requirement_name}",
                render_func=render_func,
                boundaries=(0.0, None),
            )
        assert isinstance(result, Result)
        # Enrich the summary with per-container configuration details.
        yield Result(
            state=result.state,
            summary=
            f"{result.summary} ({count_overview(resources, requirement_name)})",
        )
        yield metric
def check(params: Params, section: PodContainers) -> CheckResult:
    """Report the pod's total restart count and the restart rate in the last hour."""
    total = sum(c.restart_count for c in section.containers.values())

    count_levels = (
        None if params["restart_count"] == "no_levels" else params["restart_count"][1]
    )
    yield from check_levels(
        total,
        levels_upper=count_levels,
        metric_name="kube_pod_restart_count",
        render_func=str,
        label="Total",
    )

    rate_levels = (
        None if params["restart_rate"] == "no_levels" else params["restart_rate"][1]
    )
    yield from check_levels(
        _calc_restart_rate_in_last_hour(total),
        levels_upper=rate_levels,
        metric_name="kube_pod_restart_rate",
        render_func=str,
        label="In last hour",
    )
def check_proxmox_ve_vm_backup_status(
    now: datetime,
    params: Mapping[str, Any],
    section: Section,
) -> CheckResult:
    """If conditions provided calculate and compare age of last backup agains
    provided levels and define result status accordingly

    >>> for result in check_proxmox_ve_vm_backup_status(
    ...     datetime.strptime("2020-12-07 21:28:02", '%Y-%m-%d %H:%M:%S'),
    ...     {'age_levels_upper': (93600, 180000)},
    ...     parse_proxmox_ve_vm_backup_status([[
    ...       '{"last_backup": {'
    ...       ' "archive_name": "/some/where/vzdump-qemu-109-2020_12_06-21_28_02.vma.zst",'
    ...       ' "archive_size": 1099511627776,'
    ...       ' "started_time": "2020-12-06 21:28:02",'
    ...       ' "transfer_time": 100}}']])):
    ...   print(result)
    Result(state=<State.OK: 0>, summary='Age: 1 day 0 hours')
    Metric('age', 86400.0, levels=(93600.0, 180000.0))
    Result(state=<State.OK: 0>, summary='Time: 2020-12-06 21:28:02')
    Result(state=<State.OK: 0>, summary='Size: 1.00 TiB')
    Result(state=<State.OK: 0>, summary='Bandwidth: 11.0 GB/s')
    """
    age_levels_upper = params.get("age_levels_upper")
    last_backup = section.get("last_backup")
    if not last_backup:
        # Whether a missing backup is a problem depends on levels being set.
        yield (Result(state=State.CRIT, summary="No backup found")
               if age_levels_upper else  #
               Result(state=State.OK, summary="No backup found and none needed"))
        return
    if "error" in last_backup:
        yield Result(
            state=State.CRIT,
            summary=f"Last backup failed with message {last_backup['error']!r}",
        )
        return
    # Proxmox VE logs only provide time stamps w/o time zone so we have to hope the Proxmox VE node
    # is located close to us
    started_time = last_backup.get("started_time")
    if started_time:
        yield from check_levels(
            value=(now - started_time).total_seconds(),
            levels_upper=age_levels_upper,
            metric_name="age",
            render_func=render.timespan,
            label="Age",
        )
    yield Result(state=State.OK, summary=f"Time: {last_backup.get('started_time')}")
    yield Result(state=State.OK, summary=f"Size: {render.bytes(last_backup['archive_size'])}")
    # Fix: the bandwidth computation previously indexed last_backup['transfer_time']
    # unconditionally, raising KeyError for records without a transfer time and
    # ZeroDivisionError for a zero one. Guard both cases and skip the result
    # instead (matching the behavior of later revisions of this check).
    transfer_time = last_backup.get("transfer_time", 0)
    if not transfer_time:
        return
    transfer_size = last_backup.get("transfer_size", last_backup.get("archive_size", 0))
    yield Result(
        state=State.OK,
        summary=f"Bandwidth: {render.iobandwidth(transfer_size / transfer_time)}",
    )
def check_mobileiron_misc(params: Mapping[str, Any], section: Section) -> CheckResult:
    """Check the device's available capacity against the configured upper levels.

    Fix: the previous truthiness test (``if capacity := ...:``) silently skipped
    the check when the reported capacity was exactly 0 — arguably the most
    alert-worthy value. Only a missing value (None) is skipped now.
    """
    if (available_capacity := section.availableCapacity) is not None:
        yield from check_levels(
            label="Available capacity",
            value=available_capacity,
            levels_upper=params.get("available_capacity"),
            metric_name="capacity_perc",
            render_func=render.percent,
        )
def check_checkpoint_connections(
    params,
    section: Section,
) -> CheckResult:
    """Compare the current number of firewall connections against upper levels."""
    current_connections = section.current
    upper_levels = params["levels"]
    yield from check_levels(
        value=current_connections,
        levels_upper=upper_levels,
        metric_name="connections",
        label="Current connections",
        render_func=str,
    )
def check_kube_pod_status(
    params: Params,
    section_kube_pod_containers: Optional[PodContainers],
    section_kube_pod_init_containers: Optional[PodContainers],
    section_kube_pod_lifecycle: Optional[PodLifeCycle],
) -> CheckResult:
    """Report the pod's status message and how long the pod has been inside the
    current status group, applying the group's configured levels.

    The value store keeps, per status group, the accumulated time spent in each
    individual status, so levels apply to the total time within the group.
    """
    assert section_kube_pod_lifecycle is not None, "Missing Api data"
    pod_containers = _pod_containers(section_kube_pod_containers)
    pod_init_containers = _pod_containers(section_kube_pod_init_containers)

    status_message = _pod_status_message(
        pod_containers,
        pod_init_containers,
        section_kube_pod_lifecycle,
    )

    now = time.time()
    value_store = get_value_store()
    group_levels, group_statuses = _get_group_from_params(
        status_message, params)
    if value_store.get("group") != group_statuses:
        # Entered a different status group: restart the bookkeeping.
        value_store["group"] = group_statuses
        value_store["duration_per_status"] = {status_message: 0.0}
    else:
        # Same group as last run: attribute the elapsed interval to the status
        # that was active during it.
        previous_status = value_store["previous_status"]
        value_store["duration_per_status"][
            previous_status] += now - value_store["previous_time"]
        value_store["duration_per_status"].setdefault(status_message, 0.0)
    value_store["previous_time"] = now
    value_store["previous_status"] = status_message

    levels = None if group_levels == "no_levels" else group_levels[1]
    if levels is None:
        yield Result(state=State.OK, summary=status_message)
    else:
        # NOTE(review): the generator variable shadows the `time` module here
        # (harmless, but worth renaming on the next behavioral change).
        for result in check_levels(
                sum(time for time in value_store["duration_per_status"].values()),
                render_func=render.timespan,
                levels_upper=levels,
        ):
            yield Result(state=result.state,
                         summary=f"{status_message}: since {result.summary}")
    if len(value_store["duration_per_status"]) > 1:
        # List every status seen within the current group and its duration.
        seen_statuses = ", ".join(
            f"{s} ({render.timespan(t)})"
            for s, t in value_store["duration_per_status"].items())
        yield Result(state=State.OK, notice=f"Seen: {seen_statuses}")
    yield from _container_status_details(pod_init_containers)
    yield from _container_status_details(pod_containers)
def check(params: KubeContainersLevelsUpperLower, section: ContainerCount) -> CheckResult:
    """Yield one leveled result per container category, plus a computed total.

    Upper/lower levels are looked up per category from `params` under the keys
    ``<name>_upper`` / ``<name>_lower``.
    """
    counts = section.dict()
    counts["total"] = sum(counts.values())
    for category, count in counts.items():
        upper = params.get(f"{category}_upper")
        lower = params.get(f"{category}_lower")
        yield from check_levels(
            count,
            levels_upper=upper,
            levels_lower=lower,
            metric_name=f"kube_node_container_count_{category}",
            label=f"{category.title()}",
        )
def check(params: K8sContainersLevelsUpperLower, section: ContainerCount) -> CheckResult:
    """Yield one leveled result per container category, plus a computed total.

    Levels come from ``params[<name>]``, a dict with optional ``levels_upper``
    and ``levels_lower`` entries.
    """
    counts = section.dict()
    counts["total"] = sum(counts.values())
    for category, count in counts.items():
        level_spec = params.get(category, {})
        assert isinstance(level_spec, dict)
        yield from check_levels(
            count,
            levels_upper=level_spec.get("levels_upper"),
            levels_lower=level_spec.get("levels_lower"),
            metric_name=f"k8s_node_container_count_{category}",
            label=f"Number of {category} node containers",
        )
def check(params: Mapping[str, Optional[Tuple[float, float]]],
          section: PodConditions) -> CheckResult:
    """Yield one result per pod condition; failed conditions are rated by how
    long ago they last transitioned, using the per-condition upper levels."""
    now = int(time())
    for cond_name, cond in section:
        title = cond_name.title()
        if cond.status:
            yield Result(state=State.OK, summary=f"{title} condition passed")
        else:
            elapsed = now - cond.last_transition_time
            prefix = f"{title} condition not passed ({cond.reason}: {cond.detail})"
            for leveled in check_levels(elapsed,
                                        levels_upper=params.get(cond_name),
                                        render_func=render.timespan):
                yield Result(state=leveled.state,
                             summary=f"{prefix} for {leveled.summary}")
def check_proxmox_ve_snapshot_age(params: Mapping[str, Any], section: Section) -> CheckResult:
    """Rate the age of the oldest VM snapshot against the configured levels."""
    if not section["snaptimes"]:
        yield Result(state=State.OK, summary="No snapshot found")
        return
    # timestamps and timezones...
    # Clamp at zero so clock skew cannot produce a negative age.
    age = max(time.time() - min(section["snaptimes"]), 0)
    # NOTE(review): passing the levels tuple as graph `boundaries` looks odd —
    # boundaries usually describe the expected value range (e.g. (0, None));
    # confirm this is intentional before changing it.
    yield from check_levels(
        age,
        levels_upper=params["oldest_levels"],
        metric_name="age",
        render_func=render.timespan,
        label="Age",
        boundaries=params["oldest_levels"],
    )
def check_apache_status(item: str, params: Mapping[str, Any], section: Section) -> CheckResult:
    """Check one Apache instance: request/byte rates, worker and slot gauges,
    and the scoreboard.

    The "Total Accesses" / "Total kBytes" counters are converted into
    per-second rates via the value store before reporting.
    """
    if item.endswith(":None"):
        # fix item name discovered before werk 2763
        item = item[:-5]
    data = section.get(item)
    if data is None:
        return

    this_time = int(time.time())
    value_store = get_value_store()

    if "Total Accesses" in data:
        data["ReqPerSec"] = get_rate(value_store,
                                     "apache_status_%s_accesses" % item,
                                     this_time, data.pop("Total Accesses"))
    if "Total kBytes" in data:
        # Counter is in kBytes — scale to bytes before computing the rate.
        data["BytesPerSec"] = get_rate(value_store,
                                       "apache_status_%s_bytes" % item,
                                       this_time,
                                       data.pop("Total kBytes") * 1024)

    for key, label in ((k, l) for k, l in _CHECK_LEVEL_ENTRIES if k in data):
        value = data[key]
        # OpenSlots is the only entry where FEWER is worse → lower levels.
        levels_are_lower = key == "OpenSlots"
        # Only the most prominent gauges go into the summary; the rest end up
        # in the details (notice-only).
        notice_only = key not in {
            "Uptime", "IdleWorkers", "BusyWorkers", "TotalSlots"
        }
        renderer = None
        if key == "Uptime":
            renderer = render.timespan
        elif not isinstance(value, float):
            renderer = lambda i: "%d" % int(i)
        yield from check_levels(
            value,
            metric_name=key.replace(" ", "_"),
            levels_lower=params.get(key) if levels_are_lower else None,
            levels_upper=None if levels_are_lower else params.get(key),
            render_func=renderer,
            label=label,
            notice_only=notice_only,
        )
    yield from _scoreboard_results(data)
def check(params: Mapping[str, VSResultAge], section: PodConditions) -> CheckResult:
    """Check every condition in the section. Return one result if all conditions
    passed. Otherwise, return four results if one or more conditions are faulty
    or missing, defining each state according to `last_transition_time` and the
    respective levels in `params`.

    A pod transitions through the conditions in the order specified in
    `LOGICAL_ORDER`. The last two conditions, `containersready` and `ready`,
    can be in a failed state simultaneously. When a condition is missing
    (i.e. is `None`), it means that the previous condition is in a failed
    state."""
    section_dict = section.dict()

    if all(cond and cond.status for _, cond in section):
        yield Result(
            state=State.OK,
            summary="Ready, all conditions passed",
            # Details list every present condition with its full description.
            details="\n".join([
                condition_detailed_description(name, cond["status"],
                                               cond["reason"], cond["detail"])
                for name in LOGICAL_ORDER
                if (cond := section_dict.get(name)) is not None
            ]),
        )
        return

    curr_timestamp = time.time()
    for name in LOGICAL_ORDER:
        cond = section_dict[name]
        if cond is not None:
            time_diff = curr_timestamp - cond[
                "last_transition_time"]  # keep the last-seen one
            if (status := cond["status"]) is True:
                yield Result(state=State.OK,
                             summary=condition_short_description(
                                 name, str(status)))
                continue
            summary_prefix = condition_detailed_description(
                name, status, cond["reason"], cond["detail"])
        else:
            summary_prefix = condition_short_description(name, "False")
        # NOTE(review): if the very first condition in LOGICAL_ORDER were
        # missing, `time_diff` would be used before assignment here —
        # presumably that cannot happen; confirm against the section model.
        for result in check_levels(time_diff,
                                   levels_upper=get_levels_for(params, name),
                                   render_func=render.timespan):
            yield Result(state=result.state,
                         summary=f"{summary_prefix} for {result.summary}")
def _fileinfo_check_function(
    check_definition: List[MetricInfo],
    params: Mapping[str, Any],
) -> CheckResult:
    """Emit one leveled result per metric in the check definition.

    Upper/lower levels are read from `params` under ``max<key>`` / ``min<key>``;
    metrics without a value are skipped.
    """
    no_levels = (None, None)
    for metric_info in check_definition:
        if metric_info.value is None:
            continue
        yield from check_levels(
            metric_info.value,
            levels_upper=params.get("max" + metric_info.key, no_levels),
            levels_lower=params.get("min" + metric_info.key, no_levels),
            metric_name=metric_info.key,
            label=metric_info.title,
            render_func=metric_info.verbose_func,
        )
def _check_individual_files(
    params: Mapping[str, Any],
    file_name: str,
    file_size: int,
    file_age: int,
    skip_ok_files: bool,
) -> CheckResult:
    '''
    This function checks individual files against levels defined for the file
    group. This is done to generate information for the long output.
    '''
    for key, value in [
        ("age_oldest", file_age),
        ("age_newest", file_age),
        ("size_smallest", file_size),
        ("size_largest", file_size),
    ]:
        levels_upper = params.get("max" + key, (None, None))
        levels_lower = params.get("min" + key, (None, None))
        # check_levels is only used for rating here; its results are consumed
        # to obtain the worst state and are not yielded to the caller.
        results = check_levels(
            value,
            metric_name=key,
            levels_upper=levels_upper,
            levels_lower=levels_lower,
        )
        overall_state = max(r.state.value for r in results if isinstance(r, Result))
        # NOTE(review): `return` ends the whole generator after the FIRST key
        # that rates OK, so later keys (e.g. size levels) are never evaluated
        # for this file — confirm this early exit is intended rather than
        # `continue`.
        if skip_ok_files and State(overall_state) == State.OK:
            return
    age = render.timespan(file_age)
    size = render.filesize(file_size)
    yield Result(
        state=State.OK,
        notice=f"[{file_name}] Age: {age}, Size: {size}",
    )
def check_with_utilization(
    usage: float,
    resource_type: ResourceType,
    requirement_type: RequirementType,
    kubernetes_object: Optional[AllocatableKubernetesObject],
    requirement_value: float,
    params: Params,
    render_func: Callable[[float], str],
) -> Iterable[Union[Metric, Result]]:
    """Yield a utilization Result (usage as percent of the requirement) and the
    corresponding Metric.

    Metric name, levels lookup and title differ depending on whether the
    requirement belongs to an allocatable Kubernetes object or to a plain
    request/limit requirement.
    """
    utilization = usage * 100.0 / requirement_value
    if kubernetes_object is None:
        metric_name = f"kube_{resource_type}_{requirement_type}_utilization"
        # "allocatable" requirements always come with a kubernetes_object.
        assert requirement_type != "allocatable"
        param = params[requirement_type]
        title = utilization_title[requirement_type]
    else:
        metric_name = f"kube_{resource_type}_{kubernetes_object}_{requirement_type}_utilization"
        param = params[kubernetes_object]
        title = utilization_title[kubernetes_object]
    result, metric = check_levels(
        utilization,
        levels_upper=param[1] if param != "no_levels" else None,
        metric_name=metric_name,
        render_func=render.percent,
        boundaries=(0.0, None),
    )
    assert isinstance(result, Result)
    # The first word of check_levels' summary is the rendered percentage; any
    # remaining words ("(warn/crit at ...)") are re-appended after our text.
    percentage, *warn_crit = result.summary.split()
    yield Result(
        state=result.state,
        summary=" ".join(
            [f"{title}: {percentage} - {render_func(usage)} of {render_func(requirement_value)}"]
            + warn_crit
        ),
    )
    yield metric
def check_kube_pod_status(
    params: Params,
    section_kube_pod_containers: Optional[PodContainers],
    section_kube_pod_init_containers: Optional[PodContainers],
    section_kube_pod_lifecycle: Optional[PodLifeCycle],
) -> CheckResult:
    """Report the pod's status message and, if levels are configured for it,
    how long the pod has been in that status.

    The value store holds a single entry mapping the current status message to
    the timestamp when it was first seen; it is reset on every status change.
    """
    assert section_kube_pod_lifecycle is not None, "Missing Api data"
    pod_containers = _pod_containers(section_kube_pod_containers)
    pod_init_containers = _pod_containers(section_kube_pod_init_containers)

    status_message = _pod_status_message(
        pod_containers,
        pod_init_containers,
        section_kube_pod_lifecycle,
    )

    now = time.time()
    value_store = get_value_store()
    if status_message not in value_store:
        # Status changed (or first run): drop the old entry, restart timing.
        value_store.clear()
        value_store[status_message] = now

    levels = _get_levels_from_params(status_message, params)
    if levels is None:
        yield Result(state=State.OK, summary=status_message)
    else:
        for result in check_levels(
                now - value_store[status_message],
                render_func=render.timespan,
                levels_upper=levels,
        ):
            yield Result(state=result.state,
                         summary=f"{status_message}: since {result.summary}")

    yield from _container_status_details(pod_init_containers)
    yield from _container_status_details(pod_containers)
def check_proxmox_ve_disk_usage(params: Mapping[str, Any], section: Section) -> CheckResult:
    """Report filesystem usage with levels derived from percentage thresholds.

    >>> for result in check_proxmox_ve_disk_usage(
    ...     {"levels": (80., 90.)},
    ...     parse_proxmox_ve_disk_usage([['{"disk": 1073741824, "max_disk": 2147483648}']])):
    ...   print(result)
    Result(state=<State.OK: 0>, summary='Usage: 1.07 GB')
    Metric('fs_used', 1073741824.0, levels=(1717986918.4, 1932735283.2), boundaries=(0.0, 2147483648.0))
    """
    used_bytes = section.get("disk", 0)
    total_bytes = section.get("max_disk", 0)
    warn_pct, crit_pct = params.get("levels", (0., 0.))

    if total_bytes == 0:
        yield Result(state=State.WARN, summary="Size of filesystem is 0 MB")
        return

    # Convert the configured percentages into absolute byte thresholds.
    levels_bytes = (warn_pct / 100 * total_bytes, crit_pct / 100 * total_bytes)
    yield from check_levels(
        value=used_bytes,
        levels_upper=levels_bytes,
        boundaries=(0, total_bytes),
        metric_name="fs_used",
        render_func=render.disksize,
        label="Usage",
    )
def check_proxmox_ve_vm_backup_status(
    now: datetime,
    params: Mapping[str, Any],
    section: Section,
) -> CheckResult:
    """If conditions provided calculate and compare age of last backup agains
    provided levels and define result status accordingly

    >>> for result in check_proxmox_ve_vm_backup_status(
    ...     datetime.strptime("2020-12-07 21:28:02", '%Y-%m-%d %H:%M:%S'),
    ...     {'age_levels_upper': (93600, 180000)},
    ...     parse_proxmox_ve_vm_backup_status([[
    ...       ' {"last_backup": {'
    ...       '   "started_time": "2020-12-06 21:28:02",'
    ...       '   "total_duration": 140,'
    ...       '   "archive_name": "/tmp/vzdump-qemu-109-2020_12_06-21_28_02.vma.zst",'
    ...       '   "upload_amount": 10995116277,'
    ...       '   "upload_total": 1099511627776,'
    ...       '   "upload_time": 120'
    ...       ' }}'
    ...     ]])):
    ...   print(result)
    Result(state=<State.OK: 0>, summary='Age: 1 day 0 hours')
    Metric('age', 86400.0, levels=(93600.0, 180000.0), boundaries=(0.0, None))
    Result(state=<State.OK: 0>, summary='Time: 2020-12-06 21:28:02')
    Result(state=<State.OK: 0>, summary='Duration: 2 minutes 20 seconds')
    Result(state=<State.OK: 0>, summary='Name: /tmp/vzdump-qemu-109-2020_12_06-21_28_02.vma.zst')
    Result(state=<State.OK: 0>, summary='Dedup rate: 100.00')
    Result(state=<State.OK: 0>, summary='Bandwidth: 91.6 MB/s')
    """
    age_levels_upper = params.get("age_levels_upper")
    last_backup = section.get("last_backup")
    if not last_backup:
        # Whether a missing backup is a problem depends on levels being set.
        yield (Result(state=State.CRIT, summary="No backup found")
               if age_levels_upper else  #
               Result(state=State.OK, summary="No backup found and none needed"))
        return
    if "error" in last_backup:
        yield Result(
            state=State.CRIT,
            summary=f"Last backup failed with message {last_backup['error']!r}",
        )
        return
    # Proxmox VE backup logs only provide time stamps without time zone so we have to hope
    # the Proxmox VE node is located close to us
    started_time = last_backup.get("started_time")
    if started_time:
        yield from check_levels(
            value=(now - started_time).total_seconds(),
            levels_upper=age_levels_upper,
            metric_name="age",
            render_func=render.timespan,
            label="Age",
            boundaries=(0, None),
        )
    yield Result(
        state=State.OK,
        summary=f"Time: {started_time}",
    )
    yield Result(
        state=State.OK,
        summary=f"Duration: {render.timespan(last_backup['total_duration'])}",
    )
    if 'archive_name' in last_backup:
        yield Result(state=State.OK,
                     summary=f"Name: {last_backup['archive_name']}")
    if 'archive_size' in last_backup:
        yield Result(
            state=State.OK,
            summary=f"Size: {render.bytes(last_backup['archive_size'])}")

    # Bandwidth: prefer a precomputed value, otherwise fall back through the
    # historic key sets; variants that know the raw amount also yield a dedup
    # rate. Zero durations end the check without a bandwidth result.
    if all(k in last_backup for k in {'bytes_written_size', 'bytes_written_bandwidth'}):
        bandwidth = last_backup['bytes_written_bandwidth']
    elif all(k in last_backup for k in {'transfer_size', 'transfer_time'}):
        if last_backup['transfer_time'] == 0:
            return
        bandwidth = last_backup['transfer_size'] / last_backup['transfer_time']
    elif all(k in last_backup for k in {'upload_amount', 'upload_total', 'upload_time'}):
        if last_backup['upload_amount'] > 0:
            dedup_rate = last_backup['upload_total'] / last_backup[
                'upload_amount']
            yield Result(state=State.OK,
                         summary=f"Dedup rate: {dedup_rate:.2f}")
        if last_backup['upload_time'] == 0:
            return
        bandwidth = last_backup['upload_amount'] / last_backup['upload_time']
    elif all(k in last_backup for k in {'backup_amount', 'backup_total', 'backup_time'}):
        if last_backup['backup_amount'] > 0:
            dedup_rate = last_backup['backup_total'] / last_backup[
                'backup_amount']
            yield Result(state=State.OK,
                         summary=f"Dedup rate: {dedup_rate:.2f}")
        if last_backup['backup_time'] == 0:
            return
        bandwidth = last_backup['backup_amount'] / last_backup['backup_time']
    else:
        return

    yield Result(state=State.OK,
                 summary=f"Bandwidth: {render.iobandwidth(bandwidth)}")
condition_name = name.upper() if (status := condition["status"]) is CONDITIONS_OK_MAPPINGS[name]: yield Result( state=State.OK, summary=condition_short_description(condition_name, status), details=condition_detailed_description(condition_name, status, condition["reason"], condition["message"]), ) continue time_difference = current_timestamp - condition["last_transition_time"] check_result = list( check_levels( time_difference, levels_upper=condition_levels(params=params, condition=name), render_func=render.timespan, )) result = check_result[0] yield Result( state=result.state, summary= f"{condition_detailed_description(condition_name, condition['status'], condition['reason'], condition['message'])} for {result.summary}", ) register.check_plugin( name="kube_deployment_conditions", service_name="Condition", discovery_function=discovery, check_function=check,
def check_entity_sensors_fan(
    item: str,
    params: Mapping[str, Any],
    section: EntitySensorSection,
) -> CheckResult:
    """Check one fan sensor: operational status plus fan speed with levels.

    The speed metric is only emitted when 'output_metrics' is enabled in the
    rule; lower levels are mandatory (see check_default_parameters below).
    """
    if not (sensor_reading := section.get('fan', {}).get(item)):
        return

    yield Result(state=sensor_reading.state,
                 summary=f"Operational status: {sensor_reading.status_descr}")

    yield from check_levels(
        value=sensor_reading.reading,
        metric_name="fan" if params.get('output_metrics') else None,
        levels_upper=params.get("upper"),
        levels_lower=params["lower"],
        render_func=lambda r: f'{int(r)} {sensor_reading.unit}',
        label="Speed",
        boundaries=(0, None),
    )


register.check_plugin(
    name='entity_sensors_fan',
    sections=['entity_sensors'],
    service_name='Fan %s',
    discovery_function=discover_entity_sensors_fan,
    check_function=check_entity_sensors_fan,
    check_ruleset_name='hw_fans',
    check_default_parameters={'lower': (2000, 1000)},  # customer request
)
def check_proxmox_ve_vm_backup_status(
    now: datetime,
    params: Mapping[str, Any],
    section: Section,
) -> CheckResult:
    """If conditions provided calculate and compare age of last backup agains
    provided levels and define result status accordingly

    >>> for result in check_proxmox_ve_vm_backup_status(
    ...     datetime.strptime("2020-12-07 21:28:02+01:00", '%Y-%m-%d %H:%M:%S%z'),
    ...     {'age_levels_upper': (93600, 180000)},
    ...     parse_proxmox_ve_vm_backup_status([[
    ...       ' {"last_backup": {'
    ...       '   "started_time": "2020-12-06 21:28:02+0000",'
    ...       '   "total_duration": 140,'
    ...       '   "archive_name": "/tmp/vzdump-qemu-109-2020_12_06-21_28_02.vma.zst",'
    ...       '   "upload_amount": 10995116277,'
    ...       '   "upload_total": 1099511627776,'
    ...       '   "upload_time": 120}}'
    ...     ]])):
    ...   print(result)
    Result(state=<State.OK: 0>, summary='Age: 23 hours 0 minutes')
    Metric('age', 82800.0, levels=(93600.0, 180000.0), boundaries=(0.0, None))
    Result(state=<State.OK: 0>, summary='Server local start time: 2020-12-06 21:28:02+00:00')
    Result(state=<State.OK: 0>, summary='Duration: 2 minutes 20 seconds')
    Metric('backup_duration', 140.0, boundaries=(0.0, None))
    Result(state=<State.OK: 0>, summary='Name: /tmp/vzdump-qemu-109-2020_12_06-21_28_02.vma.zst')
    Result(state=<State.OK: 0>, summary='Dedup rate: 100.00')
    Result(state=<State.OK: 0>, summary='Bandwidth: 91.6 MB/s')
    Metric('backup_avgspeed', 91625968.975, boundaries=(0.0, None))
    """
    age_levels_upper = params.get("age_levels_upper")
    duration_levels_upper = params.get("duration_levels_upper")
    # Lower bandwidth levels are scaled by 10^6 — presumably configured in
    # MB/s and compared in bytes/s; confirm against the rule spec.
    bandwidth_levels_lower_bytes = params.get("bandwidth_levels_lower")
    bandwidth_levels_lower = (
        (
            bandwidth_levels_lower_bytes[0] * 1000 * 1000,
            bandwidth_levels_lower_bytes[1] * 1000 * 1000,
        )
        if bandwidth_levels_lower_bytes
        else None
    )
    last_backup = section.get("last_backup")
    if not last_backup:
        # Whether a missing backup is a problem depends on levels being set.
        yield (
            Result(state=State.CRIT, summary="No backup found")
            if age_levels_upper
            else Result(state=State.OK, summary="No backup found and none needed")  #
        )
        return
    if "error" in last_backup:
        yield Result(
            state=State.CRIT,
            summary=f"Last backup failed with message {last_backup['error']!r}",
        )
        return
    # Proxmox VE backup logs only provide time stamps without time zone so the special agent
    # explicitly converted them to utc
    started_time = last_backup.get("started_time")
    if started_time:
        yield from check_levels(
            value=(now - started_time.astimezone(timezone.utc)).total_seconds(),
            levels_upper=age_levels_upper,
            metric_name="age",
            render_func=render.timespan,
            label="Age",
            boundaries=(0, None),
        )
    yield Result(
        state=State.OK,
        summary=f"Server local start time: {started_time}",
    )
    yield from check_levels(
        value=last_backup["total_duration"],
        levels_upper=duration_levels_upper,
        metric_name="backup_duration",
        render_func=render.timespan,
        label="Duration",
        boundaries=(0, None),
    )

    if "archive_name" in last_backup:
        yield Result(state=State.OK, summary=f"Name: {last_backup['archive_name']}")
    if "archive_size" in last_backup:
        yield Result(state=State.OK, summary=f"Size: {render.bytes(last_backup['archive_size'])}")

    # Bandwidth: prefer a precomputed value, otherwise fall back through the
    # historic key sets; variants that know the raw amount also yield a dedup
    # rate. Zero durations end the check without a bandwidth result.
    if all(k in last_backup for k in ("bytes_written_size", "bytes_written_bandwidth")):
        bandwidth = last_backup["bytes_written_bandwidth"]
    elif all(k in last_backup for k in ("transfer_size", "transfer_time")):
        if last_backup["transfer_time"] == 0:
            return
        bandwidth = last_backup["transfer_size"] / last_backup["transfer_time"]
    elif all(k in last_backup for k in ("upload_amount", "upload_total", "upload_time")):
        if last_backup["upload_amount"] > 0:
            dedup_rate = last_backup["upload_total"] / last_backup["upload_amount"]
            yield Result(state=State.OK, summary=f"Dedup rate: {dedup_rate:.2f}")
        if last_backup["upload_time"] == 0:
            return
        bandwidth = last_backup["upload_amount"] / last_backup["upload_time"]
    elif all(k in last_backup for k in ("backup_amount", "backup_total", "backup_time")):
        if last_backup["backup_amount"] > 0:
            dedup_rate = last_backup["backup_total"] / last_backup["backup_amount"]
            yield Result(state=State.OK, summary=f"Dedup rate: {dedup_rate:.2f}")
        if last_backup["backup_time"] == 0:
            return
        bandwidth = last_backup["backup_amount"] / last_backup["backup_time"]
    else:
        return

    yield from check_levels(
        value=bandwidth,
        levels_lower=bandwidth_levels_lower,
        metric_name="backup_avgspeed",
        render_func=render.iobandwidth,
        label="Bandwidth",
        boundaries=(0, None),
    )
if pool.status == "Running" and pool.cache_mode == "ReadWrite": state = State.OK elif pool.status == "Running" and pool.cache_mode != "ReadWrite": state = State.WARN else: state = State.CRIT yield Result( state=state, summary=f"{pool.pool_type} pool {pool.name} is {pool.status}, its cache is in {pool.cache_mode} mode", ) yield from check_levels( value=pool.percent_allocated, metric_name="pool_allocation", levels_upper=params["allocated_pools_percentage_upper"], render_func=render.percent, label="Pool allocation", boundaries=(0, 100), ) register.check_plugin( name="sansymphony_pool", discovery_function=discover_sansymphony_pool, check_function=check_sansymphony_pool, service_name="Sansymphony Pool %s", check_ruleset_name="sansymphony_pool", check_default_parameters={"allocated_pools_percentage_upper": (80.0, 90.0)}, )