Beispiel #1
0
class MonitorHandler(object):
    # telemetry
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    # host health
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    # cgroup data period
    CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5)
    CGROUP_TELEMETRY_REPORTING_PERIOD = datetime.timedelta(minutes=30)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)
    # log network configuration
    LOG_NETWORK_CONFIGURATION_PERIOD = datetime.timedelta(minutes=1)
    # Resetting loggers period
    RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12)

    def __init__(self):
        """
        Build the handler's initial state. No thread is started and no protocol is contacted here.

        One PeriodicOperation is created per recurring task; each is constructed with the task's name,
        the bound method implementing it and the matching *_PERIOD class constant. The protocol objects,
        the health service and the IMDS client start out as None; init_protocols() and
        init_imds_client() assign them.
        """
        self.osutil = get_osutil()

        # Assigned later by init_protocols() / init_imds_client()
        self.imds_client = None
        self.protocol = None
        self.protocol_util = None
        self.health_service = None

        # Assigned by start(); should_run is cleared by stop()
        self.event_thread = None
        self.should_run = True

        # Last network configuration seen by log_altered_network_configuration()
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self._reset_loggers_op = PeriodicOperation(
            "reset_loggers",
            self.reset_loggers,
            self.RESET_LOGGERS_PERIOD)
        self._collect_and_send_events_op = PeriodicOperation(
            "collect_and_send_events",
            self.collect_and_send_events,
            self.EVENT_COLLECTION_PERIOD)
        self._send_telemetry_heartbeat_op = PeriodicOperation(
            "send_telemetry_heartbeat",
            self.send_telemetry_heartbeat,
            self.TELEMETRY_HEARTBEAT_PERIOD)
        self._poll_telemetry_metrics_op = PeriodicOperation(
            "poll_telemetry_metrics usage",
            self.poll_telemetry_metrics,
            self.CGROUP_TELEMETRY_POLLING_PERIOD)
        self._send_telemetry_metrics_op = PeriodicOperation(
            "send_telemetry_metrics usage",
            self.send_telemetry_metrics,
            self.CGROUP_TELEMETRY_REPORTING_PERIOD)
        self._send_host_plugin_heartbeat_op = PeriodicOperation(
            "send_host_plugin_heartbeat",
            self.send_host_plugin_heartbeat,
            self.HOST_PLUGIN_HEARTBEAT_PERIOD)
        self._send_imds_heartbeat_op = PeriodicOperation(
            "send_imds_heartbeat",
            self.send_imds_heartbeat,
            self.IMDS_HEARTBEAT_PERIOD)
        self._log_altered_network_configuration_op = PeriodicOperation(
            "log_altered_network_configuration",
            self.log_altered_network_configuration,
            self.LOG_NETWORK_CONFIGURATION_PERIOD)

        self.heartbeat_id = str(uuid.uuid4()).upper()

        # Health is tracked over a longer window than the period at which heartbeats are sent
        self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        """
        Start the monitor thread; the thread initializes its own protocol objects and IMDS client
        (start() is called with init_data=True).
        """
        self.start(init_data=True)

    def stop(self):
        """
        Ask the monitor loop to exit and, if the thread is still running, wait for it to finish.
        """
        self.should_run = False
        if not self.is_alive():
            return
        self.join()

    def join(self):
        """
        Block until the monitor thread terminates.

        NOTE(review): event_thread is None until start() has been called, so calling join() before
        start() raises AttributeError.
        """
        self.event_thread.join()

    def stopped(self):
        """
        Return True once the handler has been asked to stop, i.e. should_run is no longer truthy.
        """
        return not self.should_run

    def init_protocols(self):
        """
        Create the protocol util, the protocol and the health service used by the monitor thread.

        The initialization of ProtocolUtil for the Monitor thread should be done within the thread itself rather
        than in the ExtHandler thread. This avoids concurrency issues, as each thread then has its own
        ProtocolUtil object as per the SingletonPerThread model.
        """
        util = get_protocol_util()
        self.protocol_util = util

        protocol = util.get_protocol()
        self.protocol = protocol

        endpoint = protocol.get_endpoint()
        self.health_service = HealthService(endpoint)

    def init_imds_client(self):
        """
        Create the IMDS client from the wireserver endpoint reported by protocol_util.

        init_protocols() must have run first, since protocol_util is None until then.
        """
        self.imds_client = get_imds_client(self.protocol_util.get_wireserver_endpoint())

    def is_alive(self):
        """
        Return True if the monitor thread has been created and is currently running.
        """
        thread = self.event_thread
        if thread is None:
            return False
        return thread.is_alive()

    def start(self, init_data=False):
        """
        Create and start the daemon thread, named "MonitorHandler", that executes daemon().

        :param init_data: forwarded to daemon(); when True the thread initializes the protocol objects and
                          the IMDS client before entering its loop.
        """
        thread = threading.Thread(target=self.daemon, args=(init_data, ))
        thread.daemon = True
        thread.name = "MonitorHandler"

        # Publish the thread before starting it so that is_alive()/join() can see it right away
        self.event_thread = thread
        thread.start()

    def collect_and_send_events(self):
        """
        Send the events returned by collect_events() through the protocol.

        Nothing is reported when the collected list is empty.
        """
        pending = collect_events()

        if len(pending.events) == 0:
            return

        self.protocol.report_event(pending)

    def daemon(self, init_data=False):
        """
        Body of the monitor thread: run every periodic operation, sleep, and repeat until stopped.

        An exception raised during an iteration is logged as a warning and the remainder of that iteration
        is skipped; the loop itself keeps running.

        :param init_data: when True, call init_protocols() and init_imds_client() before entering the loop.
        """
        if init_data:
            self.init_protocols()
            self.init_imds_client()

        # The loop sleeps for the shortest of these periods between iterations
        periods = (
            MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
            MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD,
            MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD,
            MonitorHandler.EVENT_COLLECTION_PERIOD,
            MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
            MonitorHandler.IMDS_HEARTBEAT_PERIOD,
        )
        sleep_seconds = min(periods).seconds

        while not self.stopped():
            try:
                self.protocol.update_host_plugin_from_goal_state()

                self._send_telemetry_heartbeat_op.run()
                self._poll_telemetry_metrics_op.run()
                # send_telemetry_metrics will be removed in favor of poll_telemetry_metrics(), which will
                # directly send the perf data for each cgroup.
                self._send_telemetry_metrics_op.run()
                self._collect_and_send_events_op.run()
                self._send_host_plugin_heartbeat_op.run()
                self._send_imds_heartbeat_op.run()
                self._log_altered_network_configuration_op.run()
                self._reset_loggers_op.run()
            except Exception as error:
                logger.warn(
                    "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}",
                    ustr(error))

            time.sleep(sleep_seconds)

    def reset_loggers(self):
        """
        Reset the periodic-logging state by calling logger.reset_periodic().

        The loggers maintain hash-tables in memory and they need to be cleaned up from time to time.
        For reference, please check the azurelinuxagent.common.logger.Logger and
        azurelinuxagent.common.event.EventLogger classes.
        """
        logger.reset_periodic()

    def send_imds_heartbeat(self):
        """
        Validate IMDS and report the resulting health signal to the health service.

        Meant to run every IMDS_HEARTBEAT_PERIOD. Each call records the outcome of imds_client.validate()
        in imds_errorstate (reset on success, incr on failure); the reported signal is 'Healthy' while that
        error state is not triggered. Any exception is reported as a failed ImdsHeartbeat event instead of
        being raised.
        """
        try:
            validated, response = self.imds_client.validate()

            if not validated:
                self.imds_errorstate.incr()
            else:
                self.imds_errorstate.reset()

            is_healthy = self.imds_errorstate.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

            self.health_service.report_imds_status(is_healthy, response)

        except Exception as error:
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ImdsHeartbeat,
                      is_success=False,
                      message="Exception sending imds heartbeat: {0}".format(ustr(error)),
                      log_event=False)

    def send_host_plugin_heartbeat(self):
        """
        Query HostGAPlugin health and report the resulting signal to the health service.

        Meant to run every HOST_PLUGIN_HEARTBEAT_PERIOD. Each call records the plugin's current health in
        host_plugin_errorstate (reset on success, incr on failure); the reported signal is 'Healthy' while
        that error state is not triggered. An unhealthy signal additionally emits a
        HostPluginHeartbeatExtended event. Any exception is reported as a failed HostPluginHeartbeat event
        instead of being raised.
        """
        try:
            plugin = self.protocol.client.get_host_plugin()
            plugin.ensure_initialized()

            if not plugin.get_health():
                self.host_plugin_errorstate.incr()
            else:
                self.host_plugin_errorstate.reset()

            is_healthy = self.host_plugin_errorstate.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", is_healthy)

            self.health_service.report_host_plugin_heartbeat(is_healthy)

            if not is_healthy:
                unhealthy_message = '{0} since successful heartbeat'.format(
                    self.host_plugin_errorstate.fail_time)
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeatExtended,
                          is_success=False,
                          message=unhealthy_message,
                          log_event=False)

        except Exception as error:
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HostPluginHeartbeat,
                      is_success=False,
                      message="Exception sending host plugin heartbeat: {0}".format(ustr(error)),
                      log_event=False)

    def send_telemetry_heartbeat(self):
        """
        Emit an HttpErrors event when any of the I/O error counters is greater than zero.

        The counters ("hostplugin", "protocol" and "other") are obtained from, and reset by,
        IOErrorCounter.get_and_reset().
        """
        counters = IOErrorCounter.get_and_reset()
        error_counts = tuple(counters.get(kind) for kind in ("hostplugin", "protocol", "other"))

        if not any(count > 0 for count in error_counts):
            return

        add_event(name=AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.HttpErrors,
                  is_success=True,
                  message="hostplugin:{0};protocol:{1};other:{2}".format(*error_counts),
                  log_event=False)

    def poll_telemetry_metrics(self):
        """
        Poll the tracked cgroups and send every metric obtained directly via report_metric().

        Nothing is returned; when the poll yields no metrics, nothing is reported.
        """
        polled = CGroupsTelemetry.poll_all_tracked()

        if not polled:
            return

        for sample in polled:
            report_metric(sample.category, sample.counter, sample.instance, sample.value)

    def send_telemetry_metrics(self):
        """
        Send the aggregated cgroup performance metrics as a single ExtensionMetricsData event.

        Nothing is sent when CGroupsTelemetry.report_all_tracked() returns no metrics. This method is
        expected to be removed in favor of sending performance metrics directly.
        """
        tracked_metrics = CGroupsTelemetry.report_all_tracked()

        if not tracked_metrics:
            return

        telemetry = generate_extension_metrics_telemetry_dictionary(
            schema_version=1.0, performance_metrics=tracked_metrics)
        add_event(name=AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.ExtensionMetricsData,
                  is_success=True,
                  message=ustr(telemetry),
                  log_event=False)

    def log_altered_network_configuration(self):
        """
        Log the route table and the NIC state whenever either differs from what the previous call saw.

        The route table is compared through a hash of its raw lines; the NIC state is compared directly.
        The very first NIC state is logged as "Initial", later changes as "Updated".
        """
        raw_routes = self.osutil.read_route_table()
        routes_hash = hash_strings(raw_routes)
        if routes_hash != self.last_route_table_hash:
            self.last_route_table_hash = routes_hash
            routes = self.osutil.get_list_of_routes(raw_routes)
            routes_as_json = ",".join(networkutil.RouteEntry.to_json(route) for route in routes)
            logger.info("Route table: [{0}]".format(routes_as_json))

        current_nic_state = self.osutil.get_nic_state()
        if current_nic_state != self.last_nic_state:
            if self.last_nic_state == {}:
                description = "Initial"
            else:
                description = "Updated"
            nics = ", ".join(str(nic) for nic in current_nic_state.values())
            logger.info("{0} NIC state: [{1}]".format(description, nics))
            self.last_nic_state = current_nic_state
Beispiel #2
0
class MonitorHandler(object):

    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    CGROUP_TELEMETRY_PERIOD = datetime.timedelta(minutes=5)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    def __init__(self):
        """
        Build the handler's initial state. No thread is started and no protocol is contacted here.

        Every last_* timestamp starts as None, meaning the corresponding periodic task has not run yet.
        The protocol and the health service are assigned later by init_protocols().
        """
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        # Assigned by init_protocols()
        self.protocol = None
        self.health_service = None

        # Assigned by start(); should_run is cleared by stop()
        self.event_thread = None
        self.should_run = True

        # Time of the last execution of each periodic task
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_cgroup_telemetry = None
        self.last_host_plugin_heartbeat = None
        self.last_imds_heartbeat = None

        # Last network configuration seen by log_altered_network_configuration()
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        # Number of telemetry heartbeats sent so far, and the parameters added to every collected event
        self.counter = 0
        self.sysinfo = []

        self.heartbeat_id = str(uuid.uuid4()).upper()

        # Health is tracked over a longer window than the period at which heartbeats are sent
        self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        """
        Initialize the protocol, the system information attached to events and the cgroup tracking
        on the calling thread, then start the monitor thread.
        """
        self.init_protocols()
        self.init_sysinfo()
        self.init_cgroups()
        self.start()

    def stop(self):
        """
        Ask the monitor loop to exit and, if the thread is still running, wait for it to finish.
        """
        self.should_run = False
        if not self.is_alive():
            return
        self.event_thread.join()

    def init_protocols(self):
        """
        Obtain the protocol from protocol_util and create the health service for the protocol's endpoint.
        """
        protocol = self.protocol_util.get_protocol()
        self.protocol = protocol
        self.health_service = HealthService(protocol.endpoint)

    def is_alive(self):
        """
        Return True if the monitor thread has been created and is currently running.
        """
        thread = self.event_thread
        if thread is None:
            return False
        return thread.is_alive()

    def start(self):
        """
        Create and start the daemon thread that executes daemon().
        """
        thread = threading.Thread(target=self.daemon)
        thread.daemon = True

        # Publish the thread before starting it so that is_alive()/stop() can see it right away
        self.event_thread = thread
        thread.start()

    def init_sysinfo(self):
        """
        Populate self.sysinfo with the telemetry parameters that add_sysinfo() attaches to every event.

        Three sources are queried in turn: the OS utilities (RAM, processors), the protocol (VM identity)
        and IMDS (location, subscription, image). A failure of one source is logged as a warning and does
        not prevent the remaining sources from being queried.
        """
        os_version = "{0}:{1}-{2}-{3}:{4}".format(platform.system(),
                                                  DISTRO_NAME,
                                                  DISTRO_VERSION,
                                                  DISTRO_CODE_NAME,
                                                  platform.release())
        self.sysinfo.append(TelemetryEventParam("OSVersion", os_version))
        self.sysinfo.append(TelemetryEventParam("GAVersion", CURRENT_AGENT))

        try:
            # Query both values before appending, so that a failure adds neither of them
            total_memory = self.osutil.get_total_mem()
            processor_cores = self.osutil.get_processor_cores()
            self.sysinfo.append(TelemetryEventParam("RAM", total_memory))
            self.sysinfo.append(TelemetryEventParam("Processors", processor_cores))
        except OSUtilError as e:
            logger.warn("Failed to get system info: {0}", e)

        # (telemetry parameter name, attribute of the object returned by protocol.get_vminfo())
        protocol_fields = (
            ("VMName", "vmName"),
            ("TenantName", "tenantName"),
            ("RoleName", "roleName"),
            ("RoleInstanceName", "roleInstanceName"),
            ("ContainerId", "containerId"),
        )
        try:
            vminfo = self.protocol.get_vminfo()
            for param_name, attribute in protocol_fields:
                self.sysinfo.append(TelemetryEventParam(param_name, getattr(vminfo, attribute)))
        except ProtocolError as e:
            logger.warn("Failed to get system info: {0}", e)

        # (telemetry parameter name, attribute of the object returned by imds_client.get_compute())
        imds_fields = (
            ('Location', 'location'),
            ('SubscriptionId', 'subscriptionId'),
            ('ResourceGroupName', 'resourceGroupName'),
            ('VMId', 'vmId'),
            ('ImageOrigin', 'image_origin'),
        )
        try:
            compute = self.imds_client.get_compute()
            for param_name, attribute in imds_fields:
                self.sysinfo.append(TelemetryEventParam(param_name, getattr(compute, attribute)))
        except (HttpError, ValueError) as e:
            logger.warn("failed to get IMDS info: {0}", e)

    def collect_event(self, evt_file_name):
        """
        Read an event file, delete it and return its content.

        The content is decoded as UTF-8; bytes that cannot be decoded are dropped.

        :param evt_file_name: path of the event file to process
        :return: the decoded content of the file
        :raises EventError: if the file cannot be opened, read or deleted
        """
        try:
            logger.verbose("Found event file: {0}", evt_file_name)
            with open(evt_file_name, "rb") as evt_file:
                data_str = evt_file.read().decode("utf-8", 'ignore')
            logger.verbose("Processed event file: {0}", evt_file_name)
            os.remove(evt_file_name)
            return data_str
        except (IOError, OSError) as e:
            # os.remove() reports failures as OSError, which is a class distinct from IOError on Python 2;
            # both must be converted so that callers only have to handle EventError.
            msg = "Failed to process {0}, {1}".format(evt_file_name, e)
            raise EventError(msg)

    def collect_and_send_events(self):
        """
        Every EVENT_COLLECTION_PERIOD, send the events stored as *.tld files in the "events" directory.

        Each file is read and deleted by collect_event(), parsed, completed with the system information
        (add_sysinfo) and added to a single list that is reported through the protocol. Files that cannot
        be read or decoded are logged and skipped. Nothing is reported when no event was collected.

        The collection timestamp is refreshed after every attempt, whether or not events were found or
        the attempt succeeded.
        """
        if self.last_event_collection is None:
            # Make the first call due immediately
            self.last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD

        if datetime.datetime.utcnow() < (self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD):
            return

        try:
            event_list = TelemetryEventList()
            event_dir = os.path.join(conf.get_lib_dir(), "events")
            for event_file in os.listdir(event_dir):
                if not event_file.endswith(".tld"):
                    continue
                event_file_path = os.path.join(event_dir, event_file)
                try:
                    data_str = self.collect_event(event_file_path)
                except EventError as e:
                    logger.error("{0}", e)
                    continue

                try:
                    event = parse_event(data_str)
                    self.add_sysinfo(event)
                    event_list.events.append(event)
                except (ValueError, ProtocolError) as e:
                    logger.warn("Failed to decode event file: {0}", e)
                    continue

            # Deliberately not an early return: the timestamp below must be refreshed even when there
            # is nothing to send.
            if len(event_list.events) > 0:
                try:
                    self.protocol.report_event(event_list)
                except ProtocolError as e:
                    logger.error("{0}", e)
        except Exception as e:
            logger.warn("Failed to send events: {0}", e)

        self.last_event_collection = datetime.datetime.utcnow()

    def daemon(self):
        """
        Body of the monitor thread: invoke every periodic task, sleep, and repeat while should_run is set.

        Each task decides by itself whether it is due; the loop sleeps for the shortest of the task
        periods between iterations.
        """
        periods = (
            MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
            MonitorHandler.CGROUP_TELEMETRY_PERIOD,
            MonitorHandler.EVENT_COLLECTION_PERIOD,
            MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
            MonitorHandler.IMDS_HEARTBEAT_PERIOD,
        )
        sleep_seconds = min(periods).seconds

        while self.should_run:
            self.send_telemetry_heartbeat()
            self.send_cgroup_telemetry()
            self.collect_and_send_events()
            self.send_host_plugin_heartbeat()
            self.send_imds_heartbeat()
            self.log_altered_network_configuration()

            time.sleep(sleep_seconds)

    def add_sysinfo(self, event):
        """
        Replace the system-information parameters of an event with the values held by this handler.

        Every parameter of the event whose name matches one of the parameters in self.sysinfo is dropped
        (and logged at verbose level), then all of self.sysinfo is appended. event.parameters is modified
        in place.

        :param event: the event to update; its parameters attribute must behave like a list
        """
        sysinfo_names = set(v.name for v in self.sysinfo)

        kept = []
        for param in event.parameters:
            if param.name in sysinfo_names:
                logger.verbose("Remove existing event parameter: [{0}:{1}]",
                               param.name, param.value)
            else:
                kept.append(param)

        # The list is rebuilt after the scan instead of calling remove() during it: removing from a list
        # while iterating over it makes the iterator skip the element that follows each removal, which
        # would leave stale parameters behind.
        event.parameters[:] = kept
        event.parameters.extend(self.sysinfo)

    def send_imds_heartbeat(self):
        """
        Every IMDS_HEARTBEAT_PERIOD, validate IMDS and report the resulting health signal.

        Each run records the outcome of imds_client.validate() in imds_errorstate (reset on success, incr
        on failure); the reported signal is 'Healthy' while that error state is not triggered. Any
        exception is reported as a failed ImdsHeartbeat event instead of being raised. Calls made before
        the period has elapsed do nothing.
        """
        period = MonitorHandler.IMDS_HEARTBEAT_PERIOD

        if self.last_imds_heartbeat is None:
            # Make the first call due immediately
            self.last_imds_heartbeat = datetime.datetime.utcnow() - period

        if datetime.datetime.utcnow() < (self.last_imds_heartbeat + period):
            return

        try:
            validated, response = self.imds_client.validate()

            if not validated:
                self.imds_errorstate.incr()
            else:
                self.imds_errorstate.reset()

            is_healthy = self.imds_errorstate.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

            self.health_service.report_imds_status(is_healthy, response)

        except Exception as error:
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ImdsHeartbeat,
                      is_success=False,
                      message="Exception sending imds heartbeat: {0}".format(ustr(error)),
                      log_event=False)

        self.last_imds_heartbeat = datetime.datetime.utcnow()

    def send_host_plugin_heartbeat(self):
        """
        Every HOST_PLUGIN_HEARTBEAT_PERIOD, query HostGAPlugin health and report the resulting signal.

        Each run records the plugin's current health in host_plugin_errorstate (reset on success, incr on
        failure); the reported signal is 'Healthy' while that error state is not triggered. Any exception
        is reported as a failed HostPluginHeartbeat event instead of being raised. Calls made before the
        period has elapsed do nothing.
        """
        period = MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if self.last_host_plugin_heartbeat is None:
            # Make the first call due immediately
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - period

        if datetime.datetime.utcnow() < (self.last_host_plugin_heartbeat + period):
            return

        try:
            plugin = self.protocol.client.get_host_plugin()
            plugin.ensure_initialized()

            if not plugin.get_health():
                self.host_plugin_errorstate.incr()
            else:
                self.host_plugin_errorstate.reset()

            is_healthy = self.host_plugin_errorstate.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", is_healthy)

            self.health_service.report_host_plugin_heartbeat(is_healthy)

        except Exception as error:
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HostPluginHeartbeat,
                      is_success=False,
                      message="Exception sending host plugin heartbeat: {0}".format(ustr(error)),
                      log_event=False)

        self.last_host_plugin_heartbeat = datetime.datetime.utcnow()

    def send_telemetry_heartbeat(self):
        """
        Every TELEMETRY_HEARTBEAT_PERIOD, emit the agent HeartBeat event and, if needed, an HttpErrors event.

        The heartbeat message is "<incarnation>;<counter>;<heartbeat_id>;<dropped packets>"; the counter is
        incremented after each heartbeat. An HttpErrors event follows when any of the I/O error counters
        returned (and reset) by IOErrorCounter.get_and_reset() is greater than zero. Failures are logged
        as warnings. Calls made before the period has elapsed do nothing.
        """
        period = MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if self.last_telemetry_heartbeat is None:
            # Make the first call due immediately
            self.last_telemetry_heartbeat = datetime.datetime.utcnow() - period

        if datetime.datetime.utcnow() < (self.last_telemetry_heartbeat + period):
            return

        try:
            incarnation = self.protocol.get_incarnation()
            dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint)
            heartbeat = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets)

            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HeartBeat,
                      is_success=True,
                      message=heartbeat,
                      log_event=False)

            self.counter += 1

            counters = IOErrorCounter.get_and_reset()
            error_counts = tuple(counters.get(kind) for kind in ("hostplugin", "protocol", "other"))

            if any(count > 0 for count in error_counts):
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HttpErrors,
                          is_success=True,
                          message="hostplugin:{0};protocol:{1};other:{2}".format(*error_counts),
                          log_event=False)
        except Exception as e:
            logger.warn("Failed to send heartbeat: {0}", e)

        self.last_telemetry_heartbeat = datetime.datetime.utcnow()

    @staticmethod
    def init_cgroups():
        """
        Start tracking cgroup metrics via CGroupsTelemetry.

        Any exception raised while setting up the tracking is logged as an error, together with its
        traceback, and is not propagated to the caller.
        """
        # Track metrics for the roll-up cgroup and for the agent cgroup
        try:
            CGroupsTelemetry.track_cgroup(CGroups.for_extension(""))
            CGroupsTelemetry.track_agent()
        except Exception as e:
            logger.error(
                "monitor: Exception tracking wrapper and agent: {0} [{1}]", e,
                traceback.format_exc())

    def send_cgroup_telemetry(self):
        """
        Every CGROUP_TELEMETRY_PERIOD, report the metrics of the tracked cgroups and refresh the tracked set.

        The first call only starts the clock, so the first report happens one period later. On each run,
        every metric with a value greater than zero is sent with report_metric(), then CGroupsTelemetry is
        updated with the current extension handlers so that new extensions get tracked. Failures of either
        step are logged as warnings (with traceback) and do not prevent the timestamp from being refreshed.
        """
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        # The schedule is driven by this task's own timestamp. The telemetry-heartbeat timestamp follows
        # a different period and can still be None when this method is called.
        if datetime.datetime.utcnow() < (self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            return

        try:
            for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked().items():
                for metric_group, metric_name, value in metrics:
                    if value > 0:
                        report_metric(metric_group, metric_name, cgroup_name, value)
        except Exception as e:
            logger.warn("Failed to collect performance metrics: {0} [{1}]",
                        e, traceback.format_exc())

        # Look for extension cgroups we're not already tracking and track them
        try:
            CGroupsTelemetry.update_tracked(
                self.protocol.client.get_current_handlers())
        except Exception as e:
            logger.warn(
                "Monitor: updating tracked extensions raised {0}: {1}", e,
                traceback.format_exc())

        self.last_cgroup_telemetry = datetime.datetime.utcnow()

    def log_altered_network_configuration(self):
        """
        Log the route table and the NIC state whenever either differs from what the previous call saw.

        The route table is compared through a hash of its raw lines; the NIC state is compared directly.
        The very first NIC state is logged as "Initial", later changes as "Updated".
        """
        raw_routes = self.osutil.read_route_table()
        routes_hash = hash_strings(raw_routes)
        if routes_hash != self.last_route_table_hash:
            self.last_route_table_hash = routes_hash
            routes = self.osutil.get_list_of_routes(raw_routes)
            routes_as_json = ",".join(networkutil.RouteEntry.to_json(route) for route in routes)
            logger.info("Route table: [{0}]".format(routes_as_json))

        current_nic_state = self.osutil.get_nic_state()
        if current_nic_state != self.last_nic_state:
            if self.last_nic_state == {}:
                description = "Initial"
            else:
                description = "Updated"
            nics = ", ".join(str(nic) for nic in current_nic_state.values())
            logger.info("{0} NIC state: [{1}]".format(description, nics))
            self.last_nic_state = current_nic_state
class MonitorHandler(object):
    # Minimum time between two runs of collect_and_send_events
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    # Minimum time between two runs of send_telemetry_heartbeat
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    # extension metrics period (polling is used by poll_telemetry_metrics, reporting by send_telemetry_metrics)
    CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5)
    CGROUP_TELEMETRY_REPORTING_PERIOD = datetime.timedelta(minutes=30)
    # host plugin (heartbeat: time between health reports; health: min_timedelta of the host plugin ErrorState)
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds (heartbeat: time between health reports; health: min_timedelta of the IMDS ErrorState)
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    # Resetting loggers period (minimum time between two logger resets in reset_loggers)
    RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12)

    def __init__(self):
        """
        Create a handler that is not running yet; run() initializes the protocol and starts the thread.
        """
        # Collaborators
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        # Assigned by init_protocols()
        self.protocol = self.health_service = None

        # Monitoring thread (created by start()) and the flag that keeps its loop going
        self.event_thread = None
        self.should_run = True

        # Time of the last run of each periodic operation; None until the operation is first invoked
        self.last_reset_loggers_time = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_cgroup_polling_telemetry = None
        self.last_cgroup_report_telemetry = None
        self.last_host_plugin_heartbeat = None
        self.last_imds_heartbeat = None

        # Last network configuration that was logged
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        # Telemetry heartbeat state: number of heartbeats sent and an identifier for this handler instance
        self.counter = 0
        self.heartbeat_id = str(uuid.uuid4()).upper()

        # sys_info parameters added to every event; filled in by init_sysinfo()
        self.sysinfo = []

        # Error states used to derive the health reported for the host plugin and for IMDS
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        self.init_protocols()
        self.init_sysinfo()
        self.start()

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.event_thread.join()

    def init_protocols(self):
        """
        Obtain the protocol from the protocol util and create the health service for the protocol's endpoint.
        """
        self.protocol = self.protocol_util.get_protocol()
        endpoint = self.protocol.endpoint
        self.health_service = HealthService(endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self):
        self.event_thread = threading.Thread(target=self.daemon)
        self.event_thread.setDaemon(True)
        self.event_thread.setName("MonitorHandler")
        self.event_thread.start()

    def init_sysinfo(self):
        """
        Populate self.sysinfo with the parameters that add_sysinfo() attaches to every event.

        The parameters come from the platform/distro (OSVersion, ExecutionMode), the OS util (RAM, Processors),
        the protocol's VM info and the IMDS compute info. A failure to query one of the last three sources is
        logged as a warning and the corresponding parameters are left out.
        """
        sysinfo = self.sysinfo

        os_version = "{0}:{1}-{2}-{3}:{4}".format(
            platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME,
            platform.release())
        sysinfo.append(TelemetryEventParam("OSVersion", os_version))
        sysinfo.append(
            TelemetryEventParam("ExecutionMode", AGENT_EXECUTION_MODE))

        try:
            # Query both values before adding either, so they are added together or not at all
            total_memory = self.osutil.get_total_mem()
            processor_count = self.osutil.get_processor_cores()
            sysinfo.append(TelemetryEventParam("RAM", total_memory))
            sysinfo.append(TelemetryEventParam("Processors", processor_count))
        except OSUtilError as e:
            logger.warn("Failed to get system info: {0}", ustr(e))

        # (parameter name, attribute of the protocol's VM info)
        protocol_fields = (
            ("VMName", "vmName"),
            ("TenantName", "tenantName"),
            ("RoleName", "roleName"),
            ("RoleInstanceName", "roleInstanceName"),
        )
        try:
            protocol_vminfo = self.protocol.get_vminfo()
            for param_name, attribute in protocol_fields:
                sysinfo.append(
                    TelemetryEventParam(param_name,
                                        getattr(protocol_vminfo, attribute)))
        except ProtocolError as e:
            logger.warn("Failed to get system info: {0}", ustr(e))

        # (parameter name, attribute of the IMDS compute info)
        imds_fields = (
            ('Location', 'location'),
            ('SubscriptionId', 'subscriptionId'),
            ('ResourceGroupName', 'resourceGroupName'),
            ('VMId', 'vmId'),
            ('ImageOrigin', 'image_origin'),
        )
        try:
            compute_info = self.imds_client.get_compute()
            for param_name, attribute in imds_fields:
                sysinfo.append(
                    TelemetryEventParam(param_name,
                                        getattr(compute_info, attribute)))
        except (HttpError, ValueError) as e:
            logger.warn("failed to get IMDS info: {0}", ustr(e))

    @staticmethod
    def collect_event(evt_file_name):
        """
        Read an event file, delete it, and return its content decoded as UTF-8.

        The file is deleted whether or not it could be processed, so that a file that cannot be read or
        decoded is not picked up again.

        :param evt_file_name: Path of the event file
        :return: Content of the file, as text
        :raises EventError: if the file cannot be read, decoded or deleted
        """
        try:
            logger.verbose("Found event file: {0}", evt_file_name)
            with open(evt_file_name, "rb") as evt_file:
                # if fail to open or delete the file, throw exception
                data_str = evt_file.read().decode("utf-8")
            logger.verbose("Processed event file: {0}", evt_file_name)
            os.remove(evt_file_name)
            return data_str
        except (IOError, OSError, UnicodeDecodeError) as e:
            # Best-effort cleanup: the file may already be gone, or may be the very thing that could not be
            # deleted. A failure here must not mask the original error nor escape as a non-EventError, which
            # callers that only handle EventError would not expect.
            try:
                os.remove(evt_file_name)
            except OSError:
                pass
            msg = "Failed to process {0}, {1}".format(evt_file_name, e)
            raise EventError(msg)

    def collect_and_send_events(self):
        """
        Periodically read, parse, and send events located in the events folder. Currently, this is done every minute.
        Any .tld file dropped in the events folder will be emitted. These event files can be created either by the
        agent or the extensions. We don't have control over extension's events parameters, but we will override
        any values they might have set for sys_info parameters.
        """
        if self.last_event_collection is None:
            self.last_event_collection = datetime.datetime.utcnow(
            ) - MonitorHandler.EVENT_COLLECTION_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_event_collection +
                MonitorHandler.EVENT_COLLECTION_PERIOD):
            try:
                event_list = TelemetryEventList()
                event_dir = os.path.join(conf.get_lib_dir(), "events")
                event_files = os.listdir(event_dir)
                for event_file in event_files:
                    if not event_file.endswith(".tld"):
                        continue
                    event_file_path = os.path.join(event_dir, event_file)
                    try:
                        data_str = self.collect_event(event_file_path)
                    except EventError as e:
                        logger.error("{0}", ustr(e))
                        continue

                    try:
                        event = parse_event(data_str)
                        self.add_sysinfo(event)
                        event_list.events.append(event)
                    except (ValueError, ProtocolError) as e:
                        logger.warn("Failed to decode event file: {0}",
                                    ustr(e))
                        continue

                if len(event_list.events) == 0:
                    return

                try:
                    self.protocol.report_event(event_list)
                except ProtocolError as e:
                    logger.error("{0}", ustr(e))
            except Exception as e:
                logger.warn("Failed to send events: {0}", ustr(e))

            self.last_event_collection = datetime.datetime.utcnow()

    def daemon(self):
        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
                        MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds
        while self.should_run:
            self.send_telemetry_heartbeat()
            self.poll_telemetry_metrics()
            self.send_telemetry_metrics()
            self.collect_and_send_events()
            self.send_host_plugin_heartbeat()
            self.send_imds_heartbeat()
            self.log_altered_network_configuration()
            self.reset_loggers()
            time.sleep(min_delta)

    def reset_loggers(self):
        """
        The loggers maintain hash-tables in memory and they need to be cleaned up from time to time.
        For reference, please check azurelinuxagent.common.logger.Logger and
        azurelinuxagent.common.event.EventLogger classes
        """
        time_now = datetime.datetime.utcnow()
        if not self.last_reset_loggers_time:
            self.last_reset_loggers_time = time_now

        if time_now >= (self.last_reset_loggers_time +
                        MonitorHandler.RESET_LOGGERS_PERIOD):
            try:
                logger.reset_periodic()
            finally:
                self.last_reset_loggers_time = time_now

    def add_sysinfo(self, event):
        """
        Replace the parameters of an event with its own parameters plus the agent's sys_info parameters.

        This is called on every event read from the events folder before it is reported, whether the event was
        produced by the agent or by an extension. Parameters of the event whose name matches a sys_info
        parameter are dropped in favor of the agent's value. Default parameters (ContainerId, GAVersion, etc)
        are populated on the fly only for agent events, so they are added here when the event lacks them.

        :param event: Event to be enriched; its parameters attribute is replaced
        """
        sysinfo_names = [sysinfo_param.name for sysinfo_param in self.sysinfo]

        # Refer: azurelinuxagent.common.event.EventLogger.add_default_parameters_to_event for agent specific values.
        defaults = [
            ("ContainerId", get_container_id_from_env()),
            ("GAVersion", CURRENT_AGENT),
            ("OpcodeName", ""),
            ("EventTid", 0),
            ("EventPid", 0),
            ("TaskName", ""),
            ("KeywordName", ""),
        ]

        # Keep only the parameters that are not going to be overwritten by the sys_info values
        enriched = [
            param for param in event.parameters
            if param.name not in sysinfo_names
        ]

        # Add sys_info params populated by the agent
        enriched.extend(self.sysinfo)

        # Add the default params the event does not have yet
        for name, value in defaults:
            if name not in event:
                enriched.append(TelemetryEventParam(name, value))

        event.parameters = enriched

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """

        if self.last_imds_heartbeat is None:
            self.last_imds_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.IMDS_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_imds_heartbeat +
                MonitorHandler.IMDS_HEARTBEAT_PERIOD):
            try:
                is_currently_healthy, response = self.imds_client.validate()

                if is_currently_healthy:
                    self.imds_errorstate.reset()
                else:
                    self.imds_errorstate.incr()

                is_healthy = self.imds_errorstate.is_triggered() is False
                logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

                self.health_service.report_imds_status(is_healthy, response)

            except Exception as e:
                msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.ImdsHeartbeat,
                          is_success=False,
                          message=msg,
                          log_event=False)

            self.last_imds_heartbeat = datetime.datetime.utcnow()

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        if self.last_host_plugin_heartbeat is None:
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_host_plugin_heartbeat +
                MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD):
            try:
                host_plugin = self.protocol.client.get_host_plugin()
                host_plugin.ensure_initialized()
                is_currently_healthy = host_plugin.get_health()

                if is_currently_healthy:
                    self.host_plugin_errorstate.reset()
                else:
                    self.host_plugin_errorstate.incr()

                is_healthy = self.host_plugin_errorstate.is_triggered(
                ) is False
                logger.verbose("HostGAPlugin health: {0}", is_healthy)

                self.health_service.report_host_plugin_heartbeat(is_healthy)

                if not is_healthy:
                    add_event(
                        name=AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.HostPluginHeartbeatExtended,
                        is_success=False,
                        message='{0} since successful heartbeat'.format(
                            self.host_plugin_errorstate.fail_time),
                        log_event=False)

            except Exception as e:
                msg = "Exception sending host plugin heartbeat: {0}".format(
                    ustr(e))
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeat,
                          is_success=False,
                          message=msg,
                          log_event=False)

            self.last_host_plugin_heartbeat = datetime.datetime.utcnow()

    def send_telemetry_heartbeat(self):

        if self.last_telemetry_heartbeat is None:
            self.last_telemetry_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_telemetry_heartbeat +
                MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD):
            try:
                incarnation = self.protocol.get_incarnation()
                dropped_packets = self.osutil.get_firewall_dropped_packets(
                    self.protocol.endpoint)
                msg = "{0};{1};{2};{3}".format(incarnation, self.counter,
                                               self.heartbeat_id,
                                               dropped_packets)

                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HeartBeat,
                          is_success=True,
                          message=msg,
                          log_event=False)

                self.counter += 1

                io_errors = IOErrorCounter.get_and_reset()
                hostplugin_errors = io_errors.get("hostplugin")
                protocol_errors = io_errors.get("protocol")
                other_errors = io_errors.get("other")

                if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0:
                    msg = "hostplugin:{0};protocol:{1};other:{2}".format(
                        hostplugin_errors, protocol_errors, other_errors)
                    add_event(name=AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.HttpErrors,
                              is_success=True,
                              message=msg,
                              log_event=False)
            except Exception as e:
                logger.warn("Failed to send heartbeat: {0}", ustr(e))

            self.last_telemetry_heartbeat = datetime.datetime.utcnow()

    def poll_telemetry_metrics(self):
        time_now = datetime.datetime.utcnow()
        if not self.last_cgroup_polling_telemetry:
            self.last_cgroup_polling_telemetry = time_now

        if time_now >= (self.last_cgroup_polling_telemetry +
                        MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD):
            CGroupsTelemetry.poll_all_tracked()
            self.last_cgroup_polling_telemetry = time_now

    def send_telemetry_metrics(self):
        time_now = datetime.datetime.utcnow()

        if not self.last_cgroup_report_telemetry:
            self.last_cgroup_report_telemetry = time_now

        if time_now >= (self.last_cgroup_report_telemetry +
                        MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD):
            performance_metrics = CGroupsTelemetry.report_all_tracked()
            self.last_cgroup_report_telemetry = time_now

            if performance_metrics:
                message = generate_extension_metrics_telemetry_dictionary(
                    schema_version=1.0,
                    performance_metrics=performance_metrics)
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.ExtensionMetricsData,
                          is_success=True,
                          message=ustr(message),
                          log_event=False)

    def log_altered_network_configuration(self):
        """
        Log the route table and the NIC state whenever either one differs from what the previous call saw.

        The route table is compared through a digest of its raw content (self.last_route_table_hash) and the
        NIC state is compared directly with self.last_nic_state; each is refreshed when a change is detected.
        """
        raw_routes = self.osutil.read_route_table()
        route_table_hash = hash_strings(raw_routes)
        if route_table_hash != self.last_route_table_hash:
            self.last_route_table_hash = route_table_hash
            routes = self.osutil.get_list_of_routes(raw_routes)
            routes_as_json = ",".join(
                networkutil.RouteEntry.to_json(route) for route in routes)
            logger.info("Route table: [{0}]".format(routes_as_json))

        current_nic_state = self.osutil.get_nic_state()
        if current_nic_state != self.last_nic_state:
            # The state is an empty dict until the first one is logged
            if self.last_nic_state == {}:
                description = "Initial"
            else:
                description = "Updated"
            nics = ", ".join(str(nic) for nic in current_nic_state.values())
            logger.info("{0} NIC state: [{1}]".format(description, nics))
            self.last_nic_state = current_nic_state
Beispiel #4
0
class MonitorHandler(object):
    # Minimum time between two runs of collect_and_send_events
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    # Minimum time between two runs of send_telemetry_heartbeat
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    # Period used by send_cgroup_telemetry
    CGROUP_TELEMETRY_PERIOD = datetime.timedelta(minutes=5)
    # host plugin (heartbeat: time between health reports; health: min_timedelta of the host plugin ErrorState)
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds (heartbeat: time between health reports; health: min_timedelta of the IMDS ErrorState)
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    def __init__(self):
        """
        Create a handler that is not running yet; run() initializes the protocol and starts the thread.
        """
        # Collaborators
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        # Assigned by init_protocols()
        self.protocol = self.health_service = None

        # Monitoring thread (created by start()) and the flag that keeps its loop going
        self.event_thread = None
        self.should_run = True

        # Time of the last run of each periodic operation; None until the operation is first invoked
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_cgroup_telemetry = None
        self.last_host_plugin_heartbeat = None
        self.last_imds_heartbeat = None

        # Last network configuration that was logged
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        # Telemetry heartbeat state: number of heartbeats sent and an identifier for this handler instance
        self.counter = 0
        self.heartbeat_id = str(uuid.uuid4()).upper()

        # sys_info parameters added to every event; filled in by init_sysinfo()
        self.sysinfo = []

        # Error states used to derive the health reported for the host plugin and for IMDS
        host_plugin_window = MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD
        self.host_plugin_errorstate = ErrorState(min_timedelta=host_plugin_window)
        imds_window = MonitorHandler.IMDS_HEALTH_PERIOD
        self.imds_errorstate = ErrorState(min_timedelta=imds_window)

    def run(self):
        self.init_protocols()
        self.init_sysinfo()
        self.init_cgroups()
        self.start()

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.event_thread.join()

    def init_protocols(self):
        """
        Obtain the protocol from the protocol util and create the health service for the protocol's endpoint.
        """
        self.protocol = self.protocol_util.get_protocol()
        endpoint = self.protocol.endpoint
        self.health_service = HealthService(endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self):
        self.event_thread = threading.Thread(target=self.daemon)
        self.event_thread.setDaemon(True)
        self.event_thread.start()

    def init_sysinfo(self):
        """
        Populate self.sysinfo with the parameters that add_sysinfo() attaches to every event.

        The parameters come from the platform/distro and agent version (OSVersion, GAVersion), the OS util
        (RAM, Processors), the protocol's VM info and the IMDS compute info. A failure to query one of the
        last three sources is logged as a warning and the corresponding parameters are left out.
        """
        sysinfo = self.sysinfo

        os_version = "{0}:{1}-{2}-{3}:{4}".format(
            platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME,
            platform.release())
        sysinfo.append(TelemetryEventParam("OSVersion", os_version))
        sysinfo.append(TelemetryEventParam("GAVersion", CURRENT_AGENT))

        try:
            # Query both values before adding either, so they are added together or not at all
            total_memory = self.osutil.get_total_mem()
            processor_count = self.osutil.get_processor_cores()
            sysinfo.append(TelemetryEventParam("RAM", total_memory))
            sysinfo.append(TelemetryEventParam("Processors", processor_count))
        except OSUtilError as e:
            logger.warn("Failed to get system info: {0}", e)

        # (parameter name, attribute of the protocol's VM info)
        protocol_fields = (
            ("VMName", "vmName"),
            ("TenantName", "tenantName"),
            ("RoleName", "roleName"),
            ("RoleInstanceName", "roleInstanceName"),
            ("ContainerId", "containerId"),
        )
        try:
            protocol_vminfo = self.protocol.get_vminfo()
            for param_name, attribute in protocol_fields:
                value = getattr(protocol_vminfo, attribute)
                sysinfo.append(TelemetryEventParam(param_name, value))
        except ProtocolError as e:
            logger.warn("Failed to get system info: {0}", e)

        # (parameter name, attribute of the IMDS compute info)
        imds_fields = (
            ('Location', 'location'),
            ('SubscriptionId', 'subscriptionId'),
            ('ResourceGroupName', 'resourceGroupName'),
            ('VMId', 'vmId'),
            ('ImageOrigin', 'image_origin'),
        )
        try:
            compute_info = self.imds_client.get_compute()
            for param_name, attribute in imds_fields:
                value = getattr(compute_info, attribute)
                sysinfo.append(TelemetryEventParam(param_name, value))
        except (HttpError, ValueError) as e:
            logger.warn("failed to get IMDS info: {0}", e)

    def collect_event(self, evt_file_name):
        """
        Read an event file, delete it, and return its content decoded as UTF-8 (undecodable bytes are
        ignored).

        :param evt_file_name: Path of the event file
        :return: Content of the file, as text
        :raises EventError: if an IOError occurs while reading or deleting the file
        """
        try:
            logger.verbose("Found event file: {0}", evt_file_name)
            with open(evt_file_name, "rb") as evt_file:
                raw_content = evt_file.read()
            content = raw_content.decode("utf-8", 'ignore')
            logger.verbose("Processed event file: {0}", evt_file_name)
            os.remove(evt_file_name)
        except IOError as e:
            raise EventError(
                "Failed to process {0}, {1}".format(evt_file_name, e))
        return content

    def collect_and_send_events(self):
        if self.last_event_collection is None:
            self.last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD

        if datetime.datetime.utcnow() >= (self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD):
            try:
                event_list = TelemetryEventList()
                event_dir = os.path.join(conf.get_lib_dir(), "events")
                event_files = os.listdir(event_dir)
                for event_file in event_files:
                    if not event_file.endswith(".tld"):
                        continue
                    event_file_path = os.path.join(event_dir, event_file)
                    try:
                        data_str = self.collect_event(event_file_path)
                    except EventError as e:
                        logger.error("{0}", e)
                        continue

                    try:
                        event = parse_event(data_str)
                        self.add_sysinfo(event)
                        event_list.events.append(event)
                    except (ValueError, ProtocolError) as e:
                        logger.warn("Failed to decode event file: {0}", e)
                        continue

                if len(event_list.events) == 0:
                    return

                try:
                    self.protocol.report_event(event_list)
                except ProtocolError as e:
                    logger.error("{0}", e)
            except Exception as e:
                logger.warn("Failed to send events: {0}", e)

            self.last_event_collection = datetime.datetime.utcnow()

    def daemon(self):
        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
                        MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds
        while self.should_run:
            self.send_telemetry_heartbeat()
            self.send_cgroup_telemetry()
            self.collect_and_send_events()
            self.send_host_plugin_heartbeat()
            self.send_imds_heartbeat()
            self.log_altered_network_configuration()
            time.sleep(min_delta)

    def add_sysinfo(self, event):
        sysinfo_names = [v.name for v in self.sysinfo]
        for param in event.parameters:
            if param.name in sysinfo_names:
                logger.verbose("Remove existing event parameter: [{0}:{1}]",
                               param.name,
                               param.value)
                event.parameters.remove(param)
        event.parameters.extend(self.sysinfo)

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """

        if self.last_imds_heartbeat is None:
            self.last_imds_heartbeat = datetime.datetime.utcnow() - MonitorHandler.IMDS_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (self.last_imds_heartbeat + MonitorHandler.IMDS_HEARTBEAT_PERIOD):
            try:
                is_currently_healthy, response = self.imds_client.validate()

                if is_currently_healthy:
                    self.imds_errorstate.reset()
                else:
                    self.imds_errorstate.incr()

                is_healthy = self.imds_errorstate.is_triggered() is False
                logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

                self.health_service.report_imds_status(is_healthy, response)

            except Exception as e:
                msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.ImdsHeartbeat,
                    is_success=False,
                    message=msg,
                    log_event=False)

            self.last_imds_heartbeat = datetime.datetime.utcnow()

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        if self.last_host_plugin_heartbeat is None:
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD):
            try:
                host_plugin = self.protocol.client.get_host_plugin()
                host_plugin.ensure_initialized()
                is_currently_healthy = host_plugin.get_health()

                if is_currently_healthy:
                    self.host_plugin_errorstate.reset()
                else:
                    self.host_plugin_errorstate.incr()

                is_healthy = self.host_plugin_errorstate.is_triggered() is False
                logger.verbose("HostGAPlugin health: {0}", is_healthy)

                self.health_service.report_host_plugin_heartbeat(is_healthy)

                if not is_healthy:
                    add_event(
                        name=AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.HostPluginHeartbeatExtended,
                        is_success=False,
                        message='{0} since successful heartbeat'.format(self.host_plugin_errorstate.fail_time),
                        log_event=False)

            except Exception as e:
                msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e))
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HostPluginHeartbeat,
                    is_success=False,
                    message=msg,
                    log_event=False)

            self.last_host_plugin_heartbeat = datetime.datetime.utcnow()

    def send_telemetry_heartbeat(self):
        """
        Emit the agent HeartBeat telemetry event, at most once per TELEMETRY_HEARTBEAT_PERIOD.

        On the very first call the stored timestamp is backdated by one full period, so the
        heartbeat is sent right away. The heartbeat message has the form
        "incarnation;counter;heartbeat_id;dropped_packets". Afterwards the counters returned by
        IOErrorCounter.get_and_reset() are inspected and an HttpErrors event is emitted when any
        of them is positive.

        Any exception raised while building or sending the events is logged as a warning; the
        timestamp is refreshed regardless of whether the attempt succeeded.
        """
        period = MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if self.last_telemetry_heartbeat is None:
            self.last_telemetry_heartbeat = datetime.datetime.utcnow() - period

        # Guard clause: nothing to do until a full period has elapsed.
        next_due = self.last_telemetry_heartbeat + period
        if datetime.datetime.utcnow() < next_due:
            return

        try:
            incarnation = self.protocol.get_incarnation()
            dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint)
            heartbeat_fields = (incarnation, self.counter, self.heartbeat_id, dropped_packets)
            heartbeat_message = "{0};{1};{2};{3}".format(*heartbeat_fields)

            add_event(
                name=AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.HeartBeat,
                is_success=True,
                message=heartbeat_message,
                log_event=False)

            self.counter += 1

            io_errors = IOErrorCounter.get_and_reset()
            # Order matters: it is the order in which the counts appear in the message.
            error_counts = (
                io_errors.get("hostplugin"),
                io_errors.get("protocol"),
                io_errors.get("other"),
            )

            if any(count > 0 for count in error_counts):
                errors_message = "hostplugin:{0};protocol:{1};other:{2}".format(*error_counts)
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HttpErrors,
                    is_success=True,
                    message=errors_message,
                    log_event=False)
        except Exception as error:
            logger.warn("Failed to send heartbeat: {0}", error)

        self.last_telemetry_heartbeat = datetime.datetime.utcnow()

    @staticmethod
    def init_cgroups():
        """
        Start tracking metrics for the roll-up cgroup and for the agent's own cgroup.

        Any exception raised while setting up tracking is downgraded to a warning (with the
        traceback written at verbose level) instead of being propagated to the caller.
        """
        try:
            rollup_cgroup = CGroups.for_extension("")
            CGroupsTelemetry.track_cgroup(rollup_cgroup)
            CGroupsTelemetry.track_agent()
        except Exception as error:
            # An exception is raised when a hierarchy is not mounted. That situation is
            # not unexpected, so it only warrants a warning.
            logger.warn("Monitor: cgroups not initialized: {0}", ustr(error))
            logger.verbose(traceback.format_exc())

    def send_cgroup_telemetry(self):
        """
        Collect and report cgroup metrics, at most once per CGROUP_TELEMETRY_PERIOD.

        The first call only records the current time, so the first collection happens one
        full period later. When a collection is due:

        * every tracked cgroup's metrics are collected; values greater than zero are reported
          through report_metric;
        * a CGroupsLimitsCrossed event is emitted when a "Memory" metric reaches the cgroup's
          "memory" threshold or a "Process" metric reaches its "cpu" threshold;
        * the set of tracked extension cgroups is refreshed from the current handlers.

        Failures in either step are logged as warnings (traceback at verbose level) and do not
        propagate; the timestamp is refreshed after every attempt.
        """
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        # Gate on this operation's own timestamp. The check previously added the period to
        # last_telemetry_heartbeat, which raised TypeError while that attribute was still None
        # and otherwise ignored last_cgroup_telemetry altogether. The period is read through
        # self, which resolves to the class constant unless it is overridden.
        if datetime.datetime.utcnow() >= (self.last_cgroup_telemetry + self.CGROUP_TELEMETRY_PERIOD):
            try:
                metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
                for cgroup_name, metrics in metric_reported.items():
                    thresholds = metric_threshold[cgroup_name]

                    for metric_group, metric_name, value in metrics:
                        # Only positive values are reported as metrics.
                        if value > 0:
                            report_metric(metric_group, metric_name, cgroup_name, value)

                        if metric_group == "Memory":
                            if value >= thresholds["memory"]:
                                msg = "CGroup {0}: Crossed the Memory Threshold. Current Value:{1}, Threshold:{2}.".format(
                                    cgroup_name, value, thresholds["memory"])
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

                        if metric_group == "Process":
                            if value >= thresholds["cpu"]:
                                msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format(
                                    cgroup_name, value, thresholds["cpu"])
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

            except Exception as e:
                logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()

    def log_altered_network_configuration(self):
        """
        Log the route table and the NIC state whenever either differs from the last observed value.

        The route table is compared through a digest of its raw lines; the NIC state is compared
        directly. The remembered values are updated whenever a change is detected.
        """
        raw_routes = self.osutil.read_route_table()
        routes_digest = hash_strings(raw_routes)
        if routes_digest != self.last_route_table_hash:
            # Remember the digest before parsing, exactly as the check is ordered.
            self.last_route_table_hash = routes_digest
            parsed_routes = self.osutil.get_list_of_routes(raw_routes)
            to_json = networkutil.RouteEntry.to_json
            routes_json = ",".join([to_json(route) for route in parsed_routes])
            logger.info("Route table: [{0}]".format(routes_json))

        current_nic_state = self.osutil.get_nic_state()
        if current_nic_state != self.last_nic_state:
            # An empty previous state means this is the first observation.
            if self.last_nic_state == {}:
                label = "Initial"
            else:
                label = "Updated"
            nic_summary = ", ".join([str(nic) for nic in current_nic_state.values()])
            logger.info("{0} NIC state: [{1}]".format(label, nic_summary))
            self.last_nic_state = current_nic_state
Beispiel #5
0
class MonitorHandler(object):
    """
    Runs the agent's periodic monitoring operations on a background daemon thread.

    The operations (event collection, HostGAPlugin and IMDS heartbeats, plus several
    self-contained operation objects) are created in __init__ and executed in order by the
    thread's main loop until stop() is requested.
    """
    # telemetry
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    def __init__(self):
        """
        Build the list of periodic operations and the error-state trackers.

        Protocol, health service and IMDS client are left unset here; they are created by
        init_protocols() and init_imds_client().
        """
        self.osutil = get_osutil()
        self.imds_client = None

        self.event_thread = None
        # Executed in this order on every iteration of the main loop.
        self._periodic_operations = [
            ResetPeriodicLogMessagesOperation(),
            PeriodicOperation("collect_and_send_events",
                              self.collect_and_send_events,
                              self.EVENT_COLLECTION_PERIOD),
            ReportNetworkErrorsOperation(),
            PollResourceUsageOperation(),
            PeriodicOperation("send_host_plugin_heartbeat",
                              self.send_host_plugin_heartbeat,
                              self.HOST_PLUGIN_HEARTBEAT_PERIOD),
            PeriodicOperation("send_imds_heartbeat", self.send_imds_heartbeat,
                              self.IMDS_HEARTBEAT_PERIOD),
            ReportNetworkConfigurationChangesOperation(),
        ]
        self.protocol = None
        self.protocol_util = None
        self.health_service = None

        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        """Start the monitor thread, initializing protocol and IMDS client inside it."""
        self.start(init_data=True)

    def stop(self):
        """Ask the main loop to exit and, if the thread is running, wait for it to finish."""
        self.should_run = False
        if self.is_alive():
            self.join()

    def join(self):
        """Block until the monitor thread terminates. The thread must have been started."""
        self.event_thread.join()

    def stopped(self):
        """Return True once stop() has been requested."""
        return not self.should_run

    def init_protocols(self):
        """Create the protocol utility, the protocol and the health service for this thread."""
        # The initialization of ProtocolUtil for the Monitor thread should be done within the thread itself rather
        # than initializing it in the ExtHandler thread. This is done to avoid any concurrency issues as each
        # thread would now have its own ProtocolUtil object as per the SingletonPerThread model.
        self.protocol_util = get_protocol_util()
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.get_endpoint())

    def init_imds_client(self):
        """Create the IMDS client from the wireserver endpoint; requires init_protocols() first."""
        wireserver_endpoint = self.protocol_util.get_wireserver_endpoint()
        self.imds_client = get_imds_client(wireserver_endpoint)

    def is_alive(self):
        """Return True when the monitor thread has been created and is still running."""
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self, init_data=False):
        """
        Create and start the monitor thread.

        :param init_data: when True, the thread initializes the protocol and the IMDS client
                          before entering its main loop.
        """
        self.event_thread = threading.Thread(target=self.daemon,
                                             args=(init_data, ))
        # Thread.setDaemon() and Thread.setName() are deprecated since Python 3.10 and emit
        # DeprecationWarning; the equivalent attributes are available on every supported version.
        self.event_thread.daemon = True
        self.event_thread.name = "MonitorHandler"
        self.event_thread.start()

    def daemon(self, init_data=False):
        """
        Body of the monitor thread.

        Optionally initializes the protocol and the IMDS client, then loops until stop() is
        requested: refreshes the host plugin from the goal state, calls run() on every periodic
        operation and sleeps until the next operation is due. An error inside one iteration is
        logged and only that iteration is skipped; an error outside the loop (for example
        during initialization) is logged and ends the thread.

        :param init_data: whether to call init_protocols() and init_imds_client() first.
        """
        try:
            if init_data:
                self.init_protocols()
                self.init_imds_client()

            while not self.stopped():
                try:
                    self.protocol.update_host_plugin_from_goal_state()

                    for op in self._periodic_operations:
                        op.run()

                except Exception as e:
                    logger.error(
                        "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}",
                        ustr(e))
                finally:
                    # Sleep even after a failed iteration so the loop does not spin.
                    PeriodicOperation.sleep_until_next_operation(
                        self._periodic_operations)
        except Exception as e:
            logger.error(
                "An error occurred in the monitor thread; will exit the thread.\n{0}",
                ustr(e))

    def collect_and_send_events(self):
        """
        Periodically send any events located in the events folder.

        Nothing is reported when the collected event list is empty.
        """
        event_list = collect_events()

        if len(event_list.events) > 0:
            self.protocol.report_event(event_list)

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.

        Any exception is reported as a failed ImdsHeartbeat event instead of being propagated.
        """
        try:
            is_currently_healthy, response = self.imds_client.validate()

            if is_currently_healthy:
                self.imds_errorstate.reset()
            else:
                self.imds_errorstate.incr()

            # Healthy as long as the error state has not been triggered.
            is_healthy = self.imds_errorstate.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

            self.health_service.report_imds_status(is_healthy, response)

        except Exception as e:
            msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ImdsHeartbeat,
                      is_success=False,
                      message=msg,
                      log_event=False)

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.

        When the signal is not healthy, a HostPluginHeartbeatExtended event carrying the error
        state's fail_time is also emitted. Any exception is reported as a failed
        HostPluginHeartbeat event instead of being propagated.
        """
        try:
            host_plugin = self.protocol.client.get_host_plugin()
            host_plugin.ensure_initialized()
            is_currently_healthy = host_plugin.get_health()

            if is_currently_healthy:
                self.host_plugin_errorstate.reset()
            else:
                self.host_plugin_errorstate.incr()

            # Healthy as long as the error state has not been triggered.
            is_healthy = self.host_plugin_errorstate.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", is_healthy)

            self.health_service.report_host_plugin_heartbeat(is_healthy)

            if not is_healthy:
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeatExtended,
                          is_success=False,
                          message='{0} since successful heartbeat'.format(
                              self.host_plugin_errorstate.fail_time),
                          log_event=False)

        except Exception as e:
            msg = "Exception sending host plugin heartbeat: {0}".format(
                ustr(e))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HostPluginHeartbeat,
                      is_success=False,
                      message=msg,
                      log_event=False)