Ejemplo n.º 1
0
    def get_summaries(self, items, item_type, get_hour_data_function):
        """Build one hourly availability/response-time summary per item.

        For every entry of ``items`` the status records returned by
        ``get_hour_data_function(item["id"], self.current_hour)`` are folded
        into a summary dict with the keys ``id``, ``hour``, ``availability``
        (percentage of records whose ``availability`` is truthy) and
        ``response_time`` (mean ``response_time`` over the available
        records that carry one).  Both figures stay ``0`` when there is
        nothing to average.  Each summary is written to the console log, and
        a ``WARNING`` event is emitted for every SLA the summary violates.

        Args:
            items: iterable of dicts; ``id`` and ``name`` are read, and the
                optional ``sla_response_time`` / ``sla_availability`` keys
                enable the corresponding SLA check.
            item_type: label used in console messages and events.
            get_hour_data_function: callable ``(item_id, hour)`` returning an
                iterable of records exposing ``availability`` and
                ``response_time`` by key.

        Returns:
            dict mapping each item id to its summary dict.
        """
        log_console(
            f"Calculating {item_type} summaries for {self.current_hour}")
        hourly_summaries = {}

        for item in items:
            status_records = get_hour_data_function(item["id"],
                                                    self.current_hour)

            hourly_summary = {
                "id": item["id"],
                "hour": str(datetime.fromisoformat(self.current_hour)),
                "availability": 0,
                "response_time": 0,
            }

            # Single pass only: the records may come from a one-shot iterable.
            total_records = 0
            available_records = 0
            timed_records = 0
            response_time_total = 0
            for record in status_records:
                total_records += 1
                if not record["availability"]:
                    continue
                available_records += 1
                # An available record without a measured response time would
                # otherwise break the sum with a TypeError; leave it out of
                # the mean instead.
                if record["response_time"] is not None:
                    response_time_total += record["response_time"]
                    timed_records += 1

            if timed_records > 0:
                hourly_summary["response_time"] = (response_time_total /
                                                   timed_records)
            if total_records > 0:
                hourly_summary["availability"] = (100 * available_records /
                                                  total_records)

            log_console(
                f"Summary: {item_type} hourly summary for {item['name']}: {hourly_summary}"
            )
            hourly_summaries[item["id"]] = hourly_summary

            # The SLA threshold is compared against the mean divided by 1000
            # (presumably milliseconds -> seconds; confirm against the
            # producers of the status records).
            rsp_time_in_seconds = hourly_summary["response_time"] / 1000
            if ("sla_response_time" in item
                    and rsp_time_in_seconds > item["sla_response_time"]):
                info = f"SLA response time violation, {rsp_time_in_seconds:.2f} > {item['sla_response_time']}"
                # isoformat(timespec=...) always yields millisecond precision;
                # slicing str(datetime.now()) loses the seconds whenever the
                # microsecond field happens to be zero.
                log_event(
                    datetime.now().isoformat(sep=" ",
                                             timespec="milliseconds"),
                    item_type, item["name"], "WARNING", info)
            if ("sla_availability" in item and
                    hourly_summary["availability"] < item["sla_availability"]):
                info = f"SLA availability violation, {hourly_summary['availability']:.2f} < {item['sla_availability']}"
                log_event(
                    datetime.now().isoformat(sep=" ",
                                             timespec="milliseconds"),
                    item_type, item["name"], "WARNING", info)

        return hourly_summaries
    def monitor(self, interval):
        """Periodically fetch and store every device's running configuration.

        Runs until ``self.terminate`` becomes truthy.  Each cycle re-reads
        the list of device ids, re-fetches every device, requests its live
        ``config`` via ``get_device_info`` and, on success, stores
        ``config["config"]["running"]`` with ``record_device_config`` and
        emits an ``INFO`` event.  Devices that cannot be read are logged to
        the console and skipped.

        Args:
            interval: pause between cycles, in seconds.  The pause is taken
                in slices of at most 10 seconds so that a termination request
                is noticed promptly.
        """
        while not self.terminate:

            device_ids = get_all_device_ids()
            log_console(
                f"Monitor: Beginning Configuration monitoring for {len(device_ids)} devices"
            )

            for device_id in device_ids:

                if self.terminate:
                    break

                # Re-retrieve the device as it may have been changed since
                # the id list was built.
                result, device = get_device(device_id=device_id)

                if result != "success":
                    log_console(
                        f"Configuration Monitor: Error retrieving device from DB. id: {device_id}, error: {device}"
                    )
                    continue

                try:
                    result, config = get_device_info(device,
                                                     "config",
                                                     get_live_info=True)
                # NOTE(review): BaseException also traps KeyboardInterrupt and
                # SystemExit; kept as-is to preserve the existing best-effort
                # behaviour -- confirm whether Exception would be enough.
                except BaseException as e:
                    log_console(
                        f"!!! Exception getting device info in configuration monitoring for {device['name']}: {repr(e)}"
                    )
                    continue

                if result != "success":
                    log_console(
                        f"!!! Unable to get device info (config) for {device['name']}"
                    )
                    continue

                # If we made it here, we got the configuration, so store it in the DB
                record_device_config(device_id, config["config"]["running"])
                log_event(
                    # Always millisecond precision; slicing str(datetime.now())
                    # drops the seconds when the microsecond field is zero.
                    datetime.now().isoformat(sep=" ", timespec="milliseconds"),
                    "configuration",
                    device['name'],
                    "INFO",
                    f"Stored configuration for: {device['name']}",
                )

            # Sleep the full interval in short slices.  A fixed count of
            # 10-second naps would truncate any remainder and would not sleep
            # at all for intervals below 10 seconds (busy loop).
            remaining = interval
            while remaining > 0 and not self.terminate:
                nap = min(10, remaining)
                sleep(nap)
                remaining -= nap

        log_console("...gracefully exiting monitor:configuration")
Ejemplo n.º 3
0
    def monitor(self, interval):
        """Ping every known host in a loop and persist the outcome.

        Runs until ``self.terminate`` becomes truthy.  Each cycle reads all
        hosts, pings each one with the system ``ping`` command and updates
        the host dict: on success ``availability`` is set to ``True`` and
        ``response_time`` / ``last_heard`` are refreshed; when ``ping`` exits
        with a non-zero status ``availability`` is set to ``False`` and an
        ``INFO`` event is emitted.  The host is then passed to
        ``record_host_status`` and ``set_host`` in either case.

        Args:
            interval: pause between cycles, in seconds.  The pause is taken
                in slices of at most 10 seconds so that a termination request
                is noticed promptly.
        """
        while not self.terminate:

            hosts = get_all_hosts()
            log_console(
                f"monitor:host Beginning monitoring for {len(hosts)} hosts")
            for host in hosts:

                if self.terminate:
                    break

                log_console(f"--- monitor:host pinging {host['ip_address']}")
                try:
                    ping_output = subprocess.check_output([
                        "ping", "-c3", "-n", "-i0.5", "-W2",
                        str(host["ip_address"])
                    ])
                except subprocess.CalledProcessError:
                    # check_output raises this on a non-zero exit status.
                    host["availability"] = False
                    log_event(
                        datetime.now().isoformat(sep=" ",
                                                 timespec="milliseconds"),
                        "host monitor",
                        host["name"],
                        "INFO",
                        f"Availability failed for host: {host['name']}",
                    )
                else:
                    host["availability"] = True
                    # The helper receives the str() of the raw bytes, exactly
                    # as before.
                    host["response_time"] = get_response_time(str(ping_output))
                    # Always millisecond precision; slicing str(datetime.now())
                    # drops the seconds when the microsecond field is zero.
                    host["last_heard"] = datetime.now().isoformat(
                        sep=" ", timespec="milliseconds")

                record_host_status(host)
                set_host(host)

            # Sleep the full interval in short slices.  A fixed count of
            # 10-second naps would truncate any remainder and would not sleep
            # at all for intervals below 10 seconds (busy loop).
            remaining = interval
            while remaining > 0 and not self.terminate:
                nap = min(10, remaining)
                time.sleep(nap)
                remaining -= nap

        log_console("...gracefully exiting monitor:host")
Ejemplo n.º 4
0
    def monitor(self, interval):
        """Check every known service in a loop and persist the outcome.

        Runs until ``self.terminate`` becomes truthy.  Each cycle reads all
        services and calls ``get_avail_and_rsp_time`` for each one.  The
        service dict always receives the new ``availability``; when the
        service is available its ``response_time`` (the measured value times
        1000, truncated to int) and ``last_heard`` are refreshed as well.
        The service is then passed to ``record_service_status`` and
        ``set_service``; an unavailable service additionally produces a
        ``WARNING`` event.

        Args:
            interval: pause between cycles, in seconds.  The pause is taken
                in slices of at most 10 seconds so that a termination request
                is noticed promptly.
        """
        log_console(f"Service monitoring starting, interval={interval}")
        while not self.terminate:

            services = get_all_services()
            log_console(
                f"Monitor: Beginning monitoring for {len(services)} services")
            for service in services:

                if self.terminate:
                    break

                log_console(f"--- service monitor for {service['name']}")
                availability, response_time = get_avail_and_rsp_time(service)
                service["availability"] = availability
                if availability:
                    service["response_time"] = int(response_time * 1000)
                    # Always millisecond precision; slicing str(datetime.now())
                    # drops the seconds when the microsecond field is zero.
                    service["last_heard"] = datetime.now().isoformat(
                        sep=" ", timespec="milliseconds")

                record_service_status(service)
                set_service(service)

                if not availability:
                    # Emitted after the status has been stored, matching the
                    # original ordering.
                    log_event(
                        datetime.now().isoformat(sep=" ",
                                                 timespec="milliseconds"),
                        "service monitor",
                        service["name"],
                        "WARNING",
                        f"Availability failed for service: {service['name']}",
                    )

            # Sleep the full interval in short slices.  A fixed count of
            # 10-second naps would truncate any remainder and would not sleep
            # at all for intervals below 10 seconds (busy loop).
            remaining = interval
            while remaining > 0 and not self.terminate:
                nap = min(10, remaining)
                time.sleep(nap)
                remaining -= nap

        log_console("...gracefully exiting monitor:service")
Ejemplo n.º 5
0
def get_device_status(device):
    """Probe a device once and return its current status as a dict.

    Devices whose ``os`` is ``ios``, ``iosxe`` or ``nxos-ssh`` and whose
    ``transport`` is ``napalm`` are probed by requesting ``environment``
    info; every other device is probed by requesting live ``facts``.  The
    time spent in ``get_device_info`` is taken as the response time.

    Args:
        device: dict describing the device; ``os``, ``transport`` and
            ``name`` are read here.

    Returns:
        dict with the keys ``availability`` (bool), ``response_time``
        (elapsed time times 1000, truncated to int, or ``None``), ``cpu``,
        ``memory`` and ``last_heard`` (timestamp string or ``None``).  Only
        ``availability`` is set (to ``False``) when the probe fails; ``cpu``
        and ``memory`` are filled only when environment info was returned.
    """
    device_status = {
        "availability": False,
        "response_time": None,
        "cpu": None,
        "memory": None,
        "last_heard": None,
    }

    env = None
    response_time = None

    uses_environment = (device["os"] in {"ios", "iosxe", "nxos-ssh"}
                        and device["transport"] == "napalm")
    info_kind = "environment" if uses_environment else "facts"

    try:
        # perf_counter is meant for measuring durations; the wall clock can
        # jump (e.g. NTP adjustment) while the request is in flight.
        started = time.perf_counter()
        if uses_environment:
            result, env = get_device_info(device, "environment")
        else:
            # Only the result code matters for the facts probe.
            result, _ = get_device_info(device, "facts", get_live_info=True)
        response_time = time.perf_counter() - started
    # NOTE(review): BaseException also traps KeyboardInterrupt and SystemExit;
    # kept as-is to preserve the existing best-effort behaviour -- confirm
    # whether Exception would be enough.
    except BaseException as e:
        info = f"!!! Exception in monitoring device, get {info_kind}: {repr(e)}"
        log_console(info)
        log_event(
            # Always millisecond precision; slicing str(datetime.now()) drops
            # the seconds when the microsecond field is zero.
            datetime.now().isoformat(sep=" ", timespec="milliseconds"),
            "device", device["name"], "SEVERE", info)
        result = "failed"

    if result != "success":
        log_event(
            datetime.now().isoformat(sep=" ", timespec="milliseconds"),
            "device monitor",
            device["name"],
            "SEVERE",
            f"Availability failed for device: {device['name']}",
        )
        return device_status

    device_status["availability"] = True
    # Compare against None: a measured 0.0 is still a valid response time.
    if response_time is not None:
        device_status["response_time"] = int(response_time * 1000)
    device_status["last_heard"] = datetime.now().isoformat(
        sep=" ", timespec="milliseconds")

    if env:
        device_status["cpu"] = calculate_cpu(env["environment"]["cpu"])
        device_status["memory"] = calculate_memory(
            env["environment"]["memory"])

    return device_status
Ejemplo n.º 6
0
    def monitor(self, interval):
        """Poll every device in a loop and persist its status.

        Runs until ``self.terminate`` becomes truthy.  Each cycle re-reads
        the device ids and re-fetches every device.

        * ``HTTP-REST`` devices are not polled: if ``last_heard`` is set and
          older than ``MAX_NOT_HEARD_SECONDS`` the device is marked
          unavailable and stored; otherwise nothing is written.
        * Every other device has its ``hostname`` resolved, is probed with
          ``get_device_status`` and is stored with the refreshed
          ``ip_address``, ``availability``, ``response_time``, ``cpu`` and
          ``memory`` (``last_heard`` only when the probe produced one).

        Args:
            interval: pause between cycles, in seconds.  The pause is taken
                in slices of at most 10 seconds so that a termination request
                is noticed promptly.
        """
        while not self.terminate:

            # We get device IDs every time through, so that we can then re-retrieve the device object.
            # The reason for this is because other entities may have changed device (e.g. SDWAN heartbeats)
            device_ids = get_all_device_ids()
            log_console(
                f"Monitor: Beginning monitoring for {len(device_ids)} devices")

            for device_id in device_ids:

                # Checked before any work on the device, as in the other
                # monitors, so shutdown does not wait on DB or DNS lookups.
                if self.terminate:
                    break

                # Re-retrieve the device as it may have been changed since
                # the id list was built.
                result, device = get_device(device_id=device_id)

                if result != "success":
                    log_console(
                        f"Device Monitor: Error retrieving device from DB. id: {device_id}, error: {device}"
                    )
                    continue

                if device["transport"] == "HTTP-REST":
                    # HTTP-REST devices (e.g. sdwan) communicate to us, we
                    # don't poll them; only their silence is evaluated.
                    if not device["last_heard"]:
                        continue

                    last_heard_time = datetime.strptime(
                        device["last_heard"], "%Y-%m-%d %H:%M:%S.%f")
                    # Routed through log_console like every other console
                    # message of the monitors (was a bare print).
                    log_console(
                        f"now: {datetime.now()}, last_heard: {last_heard_time}"
                    )
                    if (datetime.now() - last_heard_time) > timedelta(
                            seconds=MAX_NOT_HEARD_SECONDS):
                        device["availability"] = False
                        record_device_status(device)
                        set_device(device)

                    continue

                try:
                    ip_address = socket.gethostbyname(device["hostname"])
                except (socket.error, socket.gaierror) as e:
                    # NOTE(review): the message says "continuing to next
                    # device", but the device is still probed below with
                    # ip_address left as None -- confirm which is intended.
                    info = f"!!! Caught socket error {repr(e)}, continuing to next device"
                    log_console(info)
                    log_event(
                        # Always millisecond precision; slicing
                        # str(datetime.now()) drops the seconds when the
                        # microsecond field is zero.
                        datetime.now().isoformat(sep=" ",
                                                 timespec="milliseconds"),
                        "device", device['name'], "SEVERE", info)
                    ip_address = None

                log_console(
                    f"--- monitor:device get environment {device['name']}")
                device_status = get_device_status(device)

                device["ip_address"] = ip_address
                for field in ("availability", "response_time", "cpu",
                              "memory"):
                    device[field] = device_status[field]

                # Keep the previous last_heard when this probe did not
                # produce a new one.
                if device_status["last_heard"]:
                    device["last_heard"] = device_status["last_heard"]

                record_device_status(device)
                set_device(device)

            # Sleep the full interval in short slices.  A fixed count of
            # 10-second naps would truncate any remainder and would not sleep
            # at all for intervals below 10 seconds (busy loop).
            remaining = interval
            while remaining > 0 and not self.terminate:
                nap = min(10, remaining)
                sleep(nap)
                remaining -= nap

        log_console("...gracefully exiting monitor:device")