class MonitorHandler(object): # telemetry EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) # host health TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) # cgroup data period CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5) CGROUP_TELEMETRY_REPORTING_PERIOD = datetime.timedelta(minutes=30) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) # imds IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3) # log network configuration LOG_NETWORK_CONFIGURATION_PERIOD = datetime.timedelta(minutes=1) # Resetting loggers period RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12) def __init__(self): self.osutil = get_osutil() self.imds_client = None self.event_thread = None self._reset_loggers_op = PeriodicOperation("reset_loggers", self.reset_loggers, self.RESET_LOGGERS_PERIOD) self._collect_and_send_events_op = PeriodicOperation( "collect_and_send_events", self.collect_and_send_events, self.EVENT_COLLECTION_PERIOD) self._send_telemetry_heartbeat_op = PeriodicOperation( "send_telemetry_heartbeat", self.send_telemetry_heartbeat, self.TELEMETRY_HEARTBEAT_PERIOD) self._poll_telemetry_metrics_op = PeriodicOperation( "poll_telemetry_metrics usage", self.poll_telemetry_metrics, self.CGROUP_TELEMETRY_POLLING_PERIOD) self._send_telemetry_metrics_op = PeriodicOperation( "send_telemetry_metrics usage", self.send_telemetry_metrics, self.CGROUP_TELEMETRY_REPORTING_PERIOD) self._send_host_plugin_heartbeat_op = PeriodicOperation( "send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD) self._send_imds_heartbeat_op = PeriodicOperation( "send_imds_heartbeat", self.send_imds_heartbeat, self.IMDS_HEARTBEAT_PERIOD) self._log_altered_network_configuration_op = PeriodicOperation( "log_altered_network_configuration", self.log_altered_network_configuration, self.LOG_NETWORK_CONFIGURATION_PERIOD) self.protocol = None self.protocol_util = None self.health_service = None self.last_route_table_hash = b'' self.last_nic_state = {} self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState( min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) self.imds_errorstate = ErrorState( min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD) def run(self): self.start(init_data=True) def stop(self): self.should_run = False if self.is_alive(): self.join() def join(self): self.event_thread.join() def stopped(self): return not self.should_run def init_protocols(self): # The initialization of ProtocolUtil for the Monitor thread should be done within the thread itself rather # than initializing it in the ExtHandler thread. This is done to avoid any concurrency issues as each # thread would now have its own ProtocolUtil object as per the SingletonPerThread model. self.protocol_util = get_protocol_util() self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.get_endpoint()) def init_imds_client(self): wireserver_endpoint = self.protocol_util.get_wireserver_endpoint() self.imds_client = get_imds_client(wireserver_endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self, init_data=False): self.event_thread = threading.Thread(target=self.daemon, args=(init_data, )) self.event_thread.setDaemon(True) self.event_thread.setName("MonitorHandler") self.event_thread.start() def collect_and_send_events(self): """ Periodically send any events located in the events folder """ event_list = collect_events() if len(event_list.events) > 0: self.protocol.report_event(event_list) def daemon(self, init_data=False): if init_data: self.init_protocols() self.init_imds_client() min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD, MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD, MonitorHandler.EVENT_COLLECTION_PERIOD, MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD, MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds while not self.stopped(): try: self.protocol.update_host_plugin_from_goal_state() self._send_telemetry_heartbeat_op.run() self._poll_telemetry_metrics_op.run() # This will be removed in favor of poll_telemetry_metrics() and it'll directly send the perf data for # each cgroup. self._send_telemetry_metrics_op.run() self._collect_and_send_events_op.run() self._send_host_plugin_heartbeat_op.run() self._send_imds_heartbeat_op.run() self._log_altered_network_configuration_op.run() self._reset_loggers_op.run() except Exception as e: logger.warn( "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}", ustr(e)) time.sleep(min_delta) def reset_loggers(self): """ The loggers maintain hash-tables in memory and they need to be cleaned up from time to time. For reference, please check azurelinuxagent.common.logger.Logger and azurelinuxagent.common.event.EventLogger classes """ logger.reset_periodic() def send_imds_heartbeat(self): """ Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have successfully called and validated a response in the last IMDS_HEALTH_PERIOD. """ try: is_currently_healthy, response = self.imds_client.validate() if is_currently_healthy: self.imds_errorstate.reset() else: self.imds_errorstate.incr() is_healthy = self.imds_errorstate.is_triggered() is False logger.verbose("IMDS health: {0} [{1}]", is_healthy, response) self.health_service.report_imds_status(is_healthy, response) except Exception as e: msg = "Exception sending imds heartbeat: {0}".format(ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ImdsHeartbeat, is_success=False, message=msg, log_event=False) def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered() is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) if not is_healthy: add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeatExtended, is_success=False, message='{0} since successful heartbeat'.format( self.host_plugin_errorstate.fail_time), log_event=False) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format( ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False) def send_telemetry_heartbeat(self): io_errors = IOErrorCounter.get_and_reset() hostplugin_errors = io_errors.get("hostplugin") protocol_errors = io_errors.get("protocol") other_errors = io_errors.get("other") if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: msg = "hostplugin:{0};protocol:{1};other:{2}".format( hostplugin_errors, protocol_errors, other_errors) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HttpErrors, is_success=True, message=msg, log_event=False) def poll_telemetry_metrics(self): """ This method polls the tracked cgroups to get data from the cgroups filesystem and send the data directly. :return: List of Metrics (which would be sent to PerfCounterMetrics directly. """ metrics = CGroupsTelemetry.poll_all_tracked() if metrics: for metric in metrics: report_metric(metric.category, metric.counter, metric.instance, metric.value) def send_telemetry_metrics(self): """ The send_telemetry_metrics would soon be removed in favor of sending performance metrics directly. """ performance_metrics = CGroupsTelemetry.report_all_tracked() if performance_metrics: message = generate_extension_metrics_telemetry_dictionary( schema_version=1.0, performance_metrics=performance_metrics) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ExtensionMetricsData, is_success=True, message=ustr(message), log_event=False) def log_altered_network_configuration(self): """ Check various pieces of network configuration and, if altered since the last check, log the new state. """ raw_route_list = self.osutil.read_route_table() digest = hash_strings(raw_route_list) if digest != self.last_route_table_hash: self.last_route_table_hash = digest route_list = self.osutil.get_list_of_routes(raw_route_list) logger.info("Route table: [{0}]".format(",".join( map(networkutil.RouteEntry.to_json, route_list)))) nic_state = self.osutil.get_nic_state() if nic_state != self.last_nic_state: description = "Initial" if self.last_nic_state == {} else "Updated" logger.info("{0} NIC state: [{1}]".format( description, ", ".join(map(str, nic_state.values())))) self.last_nic_state = nic_state
class MonitorHandler(object): EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) def __init__(self): self.osutil = get_osutil() self.protocol_util = get_protocol_util() self.imds_client = get_imds_client() self.event_thread = None self.last_event_collection = None self.last_telemetry_heartbeat = None self.last_host_plugin_heartbeat = None self.protocol = None self.health_service = None self.counter = 0 self.sysinfo = [] self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) def run(self): self.init_protocols() self.init_sysinfo() self.start() def stop(self): self.should_run = False if self.is_alive(): self.event_thread.join() def init_protocols(self): self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self): self.event_thread = threading.Thread(target=self.daemon) self.event_thread.setDaemon(True) self.event_thread.start() def init_sysinfo(self): osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, platform.release()) self.sysinfo.append(TelemetryEventParam("OSVersion", osversion)) self.sysinfo.append( TelemetryEventParam("GAVersion", CURRENT_AGENT)) try: ram = self.osutil.get_total_mem() processors = self.osutil.get_processor_cores() self.sysinfo.append(TelemetryEventParam("RAM", ram)) self.sysinfo.append(TelemetryEventParam("Processors", processors)) except OSUtilError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.protocol.get_vminfo() self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName)) self.sysinfo.append(TelemetryEventParam("TenantName", vminfo.tenantName)) self.sysinfo.append(TelemetryEventParam("RoleName", vminfo.roleName)) self.sysinfo.append(TelemetryEventParam("RoleInstanceName", vminfo.roleInstanceName)) self.sysinfo.append(TelemetryEventParam("ContainerId", vminfo.containerId)) except ProtocolError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.imds_client.get_compute() self.sysinfo.append(TelemetryEventParam('Location', vminfo.location)) self.sysinfo.append(TelemetryEventParam('SubscriptionId', vminfo.subscriptionId)) self.sysinfo.append(TelemetryEventParam('ResourceGroupName', vminfo.resourceGroupName)) self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId)) self.sysinfo.append(TelemetryEventParam('ImageOrigin', vminfo.image_origin)) except (HttpError, ValueError) as e: logger.warn("failed to get IMDS info: {0}", e) def collect_event(self, evt_file_name): try: logger.verbose("Found event file: {0}", evt_file_name) with open(evt_file_name, "rb") as evt_file: # if fail to open or delete the file, throw exception data_str = evt_file.read().decode("utf-8", 'ignore') logger.verbose("Processed event file: {0}", evt_file_name) os.remove(evt_file_name) return data_str except IOError as e: msg = "Failed to process {0}, {1}".format(evt_file_name, e) raise EventError(msg) def collect_and_send_events(self): if self.last_event_collection is None: self.last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD if datetime.datetime.utcnow() >= (self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD): try: event_list = TelemetryEventList() event_dir = os.path.join(conf.get_lib_dir(), "events") event_files = os.listdir(event_dir) for event_file in event_files: if not event_file.endswith(".tld"): continue event_file_path = os.path.join(event_dir, event_file) try: data_str = self.collect_event(event_file_path) except EventError as e: logger.error("{0}", e) continue try: event = parse_event(data_str) self.add_sysinfo(event) event_list.events.append(event) except (ValueError, ProtocolError) as e: logger.warn("Failed to decode event file: {0}", e) continue if len(event_list.events) == 0: return try: self.protocol.report_event(event_list) except ProtocolError as e: logger.error("{0}", e) except Exception as e: logger.warn("Failed to send events: {0}", e) self.last_event_collection = datetime.datetime.utcnow() def daemon(self): min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, MonitorHandler.EVENT_COLLECTION_PERIOD, MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD).seconds while self.should_run: self.send_telemetry_heartbeat() self.collect_and_send_events() self.send_host_plugin_heartbeat() time.sleep(min_delta) def add_sysinfo(self, event): sysinfo_names = [v.name for v in self.sysinfo] for param in event.parameters: if param.name in sysinfo_names: logger.verbose("Remove existing event parameter: [{0}:{1}]", param.name, param.value) event.parameters.remove(param) event.parameters.extend(self.sysinfo) def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ if self.last_host_plugin_heartbeat is None: self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= (self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD): try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered() is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e)) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False) self.last_host_plugin_heartbeat = datetime.datetime.utcnow() def send_telemetry_heartbeat(self): if self.last_telemetry_heartbeat is None: self.last_telemetry_heartbeat = datetime.datetime.utcnow() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD): try: incarnation = self.protocol.get_incarnation() dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint) msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HeartBeat, is_success=True, message=msg, log_event=False) self.counter += 1 io_errors = IOErrorCounter.get_and_reset() hostplugin_errors = io_errors.get("hostplugin") protocol_errors = io_errors.get("protocol") other_errors = io_errors.get("other") if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors, protocol_errors, other_errors) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HttpErrors, is_success=True, message=msg, log_event=False) except Exception as e: logger.warn("Failed to send heartbeat: {0}", e) self.last_telemetry_heartbeat = datetime.datetime.utcnow()
class MonitorHandler(object): EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) CGROUP_TELEMETRY_PERIOD = datetime.timedelta(minutes=5) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) # imds IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3) def __init__(self): self.osutil = get_osutil() self.protocol_util = get_protocol_util() self.imds_client = get_imds_client() self.event_thread = None self.last_event_collection = None self.last_telemetry_heartbeat = None self.last_cgroup_telemetry = None self.last_host_plugin_heartbeat = None self.last_imds_heartbeat = None self.protocol = None self.health_service = None self.last_route_table_hash = b'' self.last_nic_state = {} self.counter = 0 self.sysinfo = [] self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState( min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) self.imds_errorstate = ErrorState( min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD) def run(self): self.init_protocols() self.init_sysinfo() self.init_cgroups() self.start() def stop(self): self.should_run = False if self.is_alive(): self.event_thread.join() def init_protocols(self): self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self): self.event_thread = threading.Thread(target=self.daemon) self.event_thread.setDaemon(True) self.event_thread.start() def init_sysinfo(self): osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, platform.release()) self.sysinfo.append(TelemetryEventParam("OSVersion", osversion)) self.sysinfo.append(TelemetryEventParam("GAVersion", CURRENT_AGENT)) try: ram = self.osutil.get_total_mem() processors = self.osutil.get_processor_cores() self.sysinfo.append(TelemetryEventParam("RAM", ram)) self.sysinfo.append(TelemetryEventParam("Processors", processors)) except OSUtilError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.protocol.get_vminfo() self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName)) self.sysinfo.append( TelemetryEventParam("TenantName", vminfo.tenantName)) self.sysinfo.append( TelemetryEventParam("RoleName", vminfo.roleName)) self.sysinfo.append( TelemetryEventParam("RoleInstanceName", vminfo.roleInstanceName)) self.sysinfo.append( TelemetryEventParam("ContainerId", vminfo.containerId)) except ProtocolError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.imds_client.get_compute() self.sysinfo.append( TelemetryEventParam('Location', vminfo.location)) self.sysinfo.append( TelemetryEventParam('SubscriptionId', vminfo.subscriptionId)) self.sysinfo.append( TelemetryEventParam('ResourceGroupName', vminfo.resourceGroupName)) self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId)) self.sysinfo.append( TelemetryEventParam('ImageOrigin', vminfo.image_origin)) except (HttpError, ValueError) as e: logger.warn("failed to get IMDS info: {0}", e) def collect_event(self, evt_file_name): try: logger.verbose("Found event file: {0}", evt_file_name) with open(evt_file_name, "rb") as evt_file: # if fail to open or delete the file, throw exception data_str = evt_file.read().decode("utf-8", 'ignore') logger.verbose("Processed event file: {0}", evt_file_name) os.remove(evt_file_name) return data_str except IOError as e: msg = "Failed to process {0}, {1}".format(evt_file_name, e) raise EventError(msg) def collect_and_send_events(self): if self.last_event_collection is None: self.last_event_collection = datetime.datetime.utcnow( ) - MonitorHandler.EVENT_COLLECTION_PERIOD if datetime.datetime.utcnow() >= ( self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD): try: event_list = TelemetryEventList() event_dir = os.path.join(conf.get_lib_dir(), "events") event_files = os.listdir(event_dir) for event_file in event_files: if not event_file.endswith(".tld"): continue event_file_path = os.path.join(event_dir, event_file) try: data_str = self.collect_event(event_file_path) except EventError as e: logger.error("{0}", e) continue try: event = parse_event(data_str) self.add_sysinfo(event) event_list.events.append(event) except (ValueError, ProtocolError) as e: logger.warn("Failed to decode event file: {0}", e) continue if len(event_list.events) == 0: return try: self.protocol.report_event(event_list) except ProtocolError as e: logger.error("{0}", e) except Exception as e: logger.warn("Failed to send events: {0}", e) self.last_event_collection = datetime.datetime.utcnow() def daemon(self): min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, MonitorHandler.CGROUP_TELEMETRY_PERIOD, MonitorHandler.EVENT_COLLECTION_PERIOD, MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD, MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds while self.should_run: self.send_telemetry_heartbeat() self.send_cgroup_telemetry() self.collect_and_send_events() self.send_host_plugin_heartbeat() self.send_imds_heartbeat() self.log_altered_network_configuration() time.sleep(min_delta) def add_sysinfo(self, event): sysinfo_names = [v.name for v in self.sysinfo] for param in event.parameters: if param.name in sysinfo_names: logger.verbose("Remove existing event parameter: [{0}:{1}]", param.name, param.value) event.parameters.remove(param) event.parameters.extend(self.sysinfo) def send_imds_heartbeat(self): """ Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have successfully called and validated a response in the last IMDS_HEALTH_PERIOD. """ if self.last_imds_heartbeat is None: self.last_imds_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.IMDS_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_imds_heartbeat + MonitorHandler.IMDS_HEARTBEAT_PERIOD): try: is_currently_healthy, response = self.imds_client.validate() if is_currently_healthy: self.imds_errorstate.reset() else: self.imds_errorstate.incr() is_healthy = self.imds_errorstate.is_triggered() is False logger.verbose("IMDS health: {0} [{1}]", is_healthy, response) self.health_service.report_imds_status(is_healthy, response) except Exception as e: msg = "Exception sending imds heartbeat: {0}".format(ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ImdsHeartbeat, is_success=False, message=msg, log_event=False) self.last_imds_heartbeat = datetime.datetime.utcnow() def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ if self.last_host_plugin_heartbeat is None: self.last_host_plugin_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD): try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered( ) is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format( ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False) self.last_host_plugin_heartbeat = datetime.datetime.utcnow() def send_telemetry_heartbeat(self): if self.last_telemetry_heartbeat is None: self.last_telemetry_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD): try: incarnation = self.protocol.get_incarnation() dropped_packets = self.osutil.get_firewall_dropped_packets( self.protocol.endpoint) msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HeartBeat, is_success=True, message=msg, log_event=False) self.counter += 1 io_errors = IOErrorCounter.get_and_reset() hostplugin_errors = io_errors.get("hostplugin") protocol_errors = io_errors.get("protocol") other_errors = io_errors.get("other") if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: msg = "hostplugin:{0};protocol:{1};other:{2}".format( hostplugin_errors, protocol_errors, other_errors) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HttpErrors, is_success=True, message=msg, log_event=False) except Exception as e: logger.warn("Failed to send heartbeat: {0}", e) self.last_telemetry_heartbeat = datetime.datetime.utcnow() @staticmethod def init_cgroups(): # Track metrics for the roll-up cgroup and for the agent cgroup try: CGroupsTelemetry.track_cgroup(CGroups.for_extension("")) CGroupsTelemetry.track_agent() except Exception as e: logger.error( "monitor: Exception tracking wrapper and agent: {0} [{1}]", e, traceback.format_exc()) def send_cgroup_telemetry(self): if self.last_cgroup_telemetry is None: self.last_cgroup_telemetry = datetime.datetime.utcnow() if datetime.datetime.utcnow() >= ( self.last_telemetry_heartbeat + MonitorHandler.CGROUP_TELEMETRY_PERIOD): try: for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked( ).items(): for metric_group, metric_name, value in metrics: if value > 0: report_metric(metric_group, metric_name, cgroup_name, value) except Exception as e: logger.warn("Failed to collect performance metrics: {0} [{1}]", e, traceback.format_exc()) # Look for extension cgroups we're not already tracking and track them try: CGroupsTelemetry.update_tracked( self.protocol.client.get_current_handlers()) except Exception as e: logger.warn( "Monitor: updating tracked extensions raised {0}: {1}", e, traceback.format_exc()) self.last_cgroup_telemetry = datetime.datetime.utcnow() def log_altered_network_configuration(self): """ Check various pieces of network configuration and, if altered since the last check, log the new state. """ raw_route_list = self.osutil.read_route_table() digest = hash_strings(raw_route_list) if digest != self.last_route_table_hash: self.last_route_table_hash = digest route_list = self.osutil.get_list_of_routes(raw_route_list) logger.info("Route table: [{0}]".format(",".join( map(networkutil.RouteEntry.to_json, route_list)))) nic_state = self.osutil.get_nic_state() if nic_state != self.last_nic_state: description = "Initial" if self.last_nic_state == {} else "Updated" logger.info("{0} NIC state: [{1}]".format( description, ", ".join(map(str, nic_state.values())))) self.last_nic_state = nic_state
class MonitorHandler(object): EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) CGROUP_TELEMETRY_PERIOD = datetime.timedelta(minutes=5) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) # imds IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3) def __init__(self): self.osutil = get_osutil() self.protocol_util = get_protocol_util() self.imds_client = get_imds_client() self.event_thread = None self.last_event_collection = None self.last_telemetry_heartbeat = None self.last_cgroup_telemetry = None self.last_host_plugin_heartbeat = None self.last_imds_heartbeat = None self.protocol = None self.health_service = None self.last_route_table_hash = b'' self.last_nic_state = {} self.counter = 0 self.sysinfo = [] self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) self.imds_errorstate = ErrorState(min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD) def run(self): self.init_protocols() self.init_sysinfo() self.init_cgroups() self.start() def stop(self): self.should_run = False if self.is_alive(): self.event_thread.join() def init_protocols(self): self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self): self.event_thread = threading.Thread(target=self.daemon) self.event_thread.setDaemon(True) self.event_thread.start() def init_sysinfo(self): osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, platform.release()) self.sysinfo.append(TelemetryEventParam("OSVersion", osversion)) self.sysinfo.append( TelemetryEventParam("GAVersion", CURRENT_AGENT)) try: ram = self.osutil.get_total_mem() processors = self.osutil.get_processor_cores() self.sysinfo.append(TelemetryEventParam("RAM", ram)) self.sysinfo.append(TelemetryEventParam("Processors", processors)) except OSUtilError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.protocol.get_vminfo() self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName)) self.sysinfo.append(TelemetryEventParam("TenantName", vminfo.tenantName)) self.sysinfo.append(TelemetryEventParam("RoleName", vminfo.roleName)) self.sysinfo.append(TelemetryEventParam("RoleInstanceName", vminfo.roleInstanceName)) self.sysinfo.append(TelemetryEventParam("ContainerId", vminfo.containerId)) except ProtocolError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.imds_client.get_compute() self.sysinfo.append(TelemetryEventParam('Location', vminfo.location)) self.sysinfo.append(TelemetryEventParam('SubscriptionId', vminfo.subscriptionId)) self.sysinfo.append(TelemetryEventParam('ResourceGroupName', vminfo.resourceGroupName)) self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId)) self.sysinfo.append(TelemetryEventParam('ImageOrigin', vminfo.image_origin)) except (HttpError, ValueError) as e: logger.warn("failed to get IMDS info: {0}", e) def collect_event(self, evt_file_name): try: logger.verbose("Found event file: {0}", evt_file_name) with open(evt_file_name, "rb") as evt_file: # if fail to open or delete the file, throw exception data_str = evt_file.read().decode("utf-8", 'ignore') logger.verbose("Processed event file: {0}", evt_file_name) os.remove(evt_file_name) return data_str except IOError as e: msg = "Failed to process {0}, {1}".format(evt_file_name, e) raise EventError(msg) def collect_and_send_events(self): if self.last_event_collection is None: self.last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD if datetime.datetime.utcnow() >= (self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD): try: event_list = TelemetryEventList() event_dir = os.path.join(conf.get_lib_dir(), "events") event_files = os.listdir(event_dir) for event_file in event_files: if not event_file.endswith(".tld"): continue event_file_path = os.path.join(event_dir, event_file) try: data_str = self.collect_event(event_file_path) except EventError as e: logger.error("{0}", e) continue try: event = parse_event(data_str) self.add_sysinfo(event) event_list.events.append(event) except (ValueError, ProtocolError) as e: logger.warn("Failed to decode event file: {0}", e) continue if len(event_list.events) == 0: return try: self.protocol.report_event(event_list) except ProtocolError as e: logger.error("{0}", e) except Exception as e: logger.warn("Failed to send events: {0}", e) self.last_event_collection = datetime.datetime.utcnow() def daemon(self): min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, MonitorHandler.CGROUP_TELEMETRY_PERIOD, MonitorHandler.EVENT_COLLECTION_PERIOD, MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD, MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds while self.should_run: self.send_telemetry_heartbeat() self.send_cgroup_telemetry() self.collect_and_send_events() self.send_host_plugin_heartbeat() self.send_imds_heartbeat() self.log_altered_network_configuration() time.sleep(min_delta) def add_sysinfo(self, event): sysinfo_names = [v.name for v in self.sysinfo] for param in event.parameters: if param.name in sysinfo_names: logger.verbose("Remove existing event parameter: [{0}:{1}]", param.name, param.value) event.parameters.remove(param) event.parameters.extend(self.sysinfo) def send_imds_heartbeat(self): """ Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have successfully called and validated a response in the last IMDS_HEALTH_PERIOD. """ if self.last_imds_heartbeat is None: self.last_imds_heartbeat = datetime.datetime.utcnow() - MonitorHandler.IMDS_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= (self.last_imds_heartbeat + MonitorHandler.IMDS_HEARTBEAT_PERIOD): try: is_currently_healthy, response = self.imds_client.validate() if is_currently_healthy: self.imds_errorstate.reset() else: self.imds_errorstate.incr() is_healthy = self.imds_errorstate.is_triggered() is False logger.verbose("IMDS health: {0} [{1}]", is_healthy, response) self.health_service.report_imds_status(is_healthy, response) except Exception as e: msg = "Exception sending imds heartbeat: {0}".format(ustr(e)) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ImdsHeartbeat, is_success=False, message=msg, log_event=False) self.last_imds_heartbeat = datetime.datetime.utcnow() def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ if self.last_host_plugin_heartbeat is None: self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD): try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered() is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) if not is_healthy: add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeatExtended, is_success=False, message='{0} since successful heartbeat'.format(self.host_plugin_errorstate.fail_time), log_event=False) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e)) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False) self.last_host_plugin_heartbeat = datetime.datetime.utcnow() def send_telemetry_heartbeat(self): if self.last_telemetry_heartbeat is None: self.last_telemetry_heartbeat = datetime.datetime.utcnow() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD): try: incarnation = self.protocol.get_incarnation() dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint) msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HeartBeat, is_success=True, message=msg, log_event=False) self.counter += 1 io_errors = IOErrorCounter.get_and_reset() hostplugin_errors = io_errors.get("hostplugin") protocol_errors = io_errors.get("protocol") other_errors = io_errors.get("other") if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors, protocol_errors, other_errors) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HttpErrors, is_success=True, message=msg, log_event=False) except Exception as e: logger.warn("Failed to send heartbeat: {0}", e) self.last_telemetry_heartbeat = datetime.datetime.utcnow() @staticmethod def init_cgroups(): # Track metrics for the roll-up cgroup and for the agent cgroup try: CGroupsTelemetry.track_cgroup(CGroups.for_extension("")) CGroupsTelemetry.track_agent() except Exception as e: # when a hierarchy is not mounted, we raise an exception # and we should therefore only issue a warning, since this # is not unexpected logger.warn("Monitor: cgroups not initialized: {0}", ustr(e)) logger.verbose(traceback.format_exc()) def send_cgroup_telemetry(self): if self.last_cgroup_telemetry is None: self.last_cgroup_telemetry = datetime.datetime.utcnow() if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.CGROUP_TELEMETRY_PERIOD): try: metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked() for cgroup_name, metrics in metric_reported.items(): thresholds = metric_threshold[cgroup_name] for metric_group, metric_name, value in metrics: if value > 0: report_metric(metric_group, metric_name, cgroup_name, value) if metric_group == "Memory": if value >= thresholds["memory"]: msg = "CGroup {0}: Crossed the Memory Threshold. Current Value:{1}, Threshold:{2}.".format( cgroup_name, value, thresholds["memory"]) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.CGroupsLimitsCrossed, is_success=True, message=msg, log_event=True) if metric_group == "Process": if value >= thresholds["cpu"]: msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format( cgroup_name, value, thresholds["cpu"]) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.CGroupsLimitsCrossed, is_success=True, message=msg, log_event=True) except Exception as e: logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e)) logger.verbose(traceback.format_exc()) # Look for extension cgroups we're not already tracking and track them try: CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers()) except Exception as e: logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e)) logger.verbose(traceback.format_exc()) self.last_cgroup_telemetry = datetime.datetime.utcnow() def log_altered_network_configuration(self): """ Check various pieces of network configuration and, if altered since the last check, log the new state. """ raw_route_list = self.osutil.read_route_table() digest = hash_strings(raw_route_list) if digest != self.last_route_table_hash: self.last_route_table_hash = digest route_list = self.osutil.get_list_of_routes(raw_route_list) logger.info("Route table: [{0}]".format(",".join(map(networkutil.RouteEntry.to_json, route_list)))) nic_state = self.osutil.get_nic_state() if nic_state != self.last_nic_state: description = "Initial" if self.last_nic_state == {} else "Updated" logger.info("{0} NIC state: [{1}]".format(description, ", ".join(map(str, nic_state.values())))) self.last_nic_state = nic_state
def test_reporting(self, patch_post, patch_add_event): health_service = HealthService('endpoint') health_service.report_host_plugin_status(is_healthy=True, response='response') self.assertEqual(1, patch_post.call_count) self.assertEqual(0, patch_add_event.call_count) self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_STATUS_OBSERVATION_NAME, is_healthy=True, value='response', description='') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_status(is_healthy=False, response='error') self.assertEqual(2, patch_post.call_count) self.assertEqual(1, patch_add_event.call_count) self.assert_telemetry(call_args=patch_add_event.call_args, response='error') self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_STATUS_OBSERVATION_NAME, is_healthy=False, value='error', description='') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_extension_artifact(is_healthy=True, source='source', response='response') self.assertEqual(3, patch_post.call_count) self.assertEqual(1, patch_add_event.call_count) self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME, is_healthy=True, value='response', description='source') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_extension_artifact(is_healthy=False, source='source', response='response') self.assertEqual(4, patch_post.call_count) self.assertEqual(2, patch_add_event.call_count) self.assert_telemetry(call_args=patch_add_event.call_args, response='response') self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME, is_healthy=False, value='response', description='source') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_heartbeat(is_healthy=True) self.assertEqual(5, patch_post.call_count) self.assertEqual(2, patch_add_event.call_count) self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, is_healthy=True, value='', description='') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_heartbeat(is_healthy=False) self.assertEqual(3, patch_add_event.call_count) self.assert_telemetry(call_args=patch_add_event.call_args) self.assertEqual(6, patch_post.call_count) self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, is_healthy=False, value='', description='') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_versions(is_healthy=True, response='response') self.assertEqual(7, patch_post.call_count) self.assertEqual(3, patch_add_event.call_count) self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_VERSIONS_OBSERVATION_NAME, is_healthy=True, value='response', description='') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_versions(is_healthy=False, response='response') self.assertEqual(8, patch_post.call_count) self.assertEqual(4, patch_add_event.call_count) self.assert_telemetry(call_args=patch_add_event.call_args, response='response') self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_VERSIONS_OBSERVATION_NAME, is_healthy=False, value='response', description='') self.assertEqual(0, len(health_service.observations)) patch_post.side_effect = HttpError() health_service.report_host_plugin_versions(is_healthy=True, response='') self.assertEqual(9, patch_post.call_count) self.assertEqual(4, patch_add_event.call_count) self.assertEqual(0, len(health_service.observations))
class MonitorHandler(object): EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) # extension metrics period CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5) CGROUP_TELEMETRY_REPORTING_PERIOD = datetime.timedelta(minutes=30) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) # imds IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3) # Resetting loggers period RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12) def __init__(self): self.osutil = get_osutil() self.protocol_util = get_protocol_util() self.imds_client = get_imds_client() self.event_thread = None self.last_reset_loggers_time = None self.last_event_collection = None self.last_telemetry_heartbeat = None self.last_cgroup_polling_telemetry = None self.last_cgroup_report_telemetry = None self.last_host_plugin_heartbeat = None self.last_imds_heartbeat = None self.protocol = None self.health_service = None self.last_route_table_hash = b'' self.last_nic_state = {} self.counter = 0 self.sysinfo = [] self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState( min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) self.imds_errorstate = ErrorState( min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD) def run(self): self.init_protocols() self.init_sysinfo() self.start() def stop(self): self.should_run = False if self.is_alive(): self.event_thread.join() def init_protocols(self): self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self): self.event_thread = threading.Thread(target=self.daemon) self.event_thread.setDaemon(True) self.event_thread.setName("MonitorHandler") self.event_thread.start() def init_sysinfo(self): osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, platform.release()) self.sysinfo.append(TelemetryEventParam("OSVersion", osversion)) self.sysinfo.append( TelemetryEventParam("ExecutionMode", AGENT_EXECUTION_MODE)) try: ram = self.osutil.get_total_mem() processors = self.osutil.get_processor_cores() self.sysinfo.append(TelemetryEventParam("RAM", ram)) self.sysinfo.append(TelemetryEventParam("Processors", processors)) except OSUtilError as e: logger.warn("Failed to get system info: {0}", ustr(e)) try: vminfo = self.protocol.get_vminfo() self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName)) self.sysinfo.append( TelemetryEventParam("TenantName", vminfo.tenantName)) self.sysinfo.append( TelemetryEventParam("RoleName", vminfo.roleName)) self.sysinfo.append( TelemetryEventParam("RoleInstanceName", vminfo.roleInstanceName)) except ProtocolError as e: logger.warn("Failed to get system info: {0}", ustr(e)) try: vminfo = self.imds_client.get_compute() self.sysinfo.append( TelemetryEventParam('Location', vminfo.location)) self.sysinfo.append( TelemetryEventParam('SubscriptionId', vminfo.subscriptionId)) self.sysinfo.append( TelemetryEventParam('ResourceGroupName', vminfo.resourceGroupName)) self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId)) self.sysinfo.append( TelemetryEventParam('ImageOrigin', vminfo.image_origin)) except (HttpError, ValueError) as e: logger.warn("failed to get IMDS info: {0}", ustr(e)) @staticmethod def collect_event(evt_file_name): try: logger.verbose("Found event file: {0}", evt_file_name) with open(evt_file_name, "rb") as evt_file: # if fail to open or delete the file, throw exception data_str = evt_file.read().decode("utf-8") logger.verbose("Processed event file: {0}", evt_file_name) os.remove(evt_file_name) return data_str except (IOError, UnicodeDecodeError) as e: os.remove(evt_file_name) msg = "Failed to process {0}, {1}".format(evt_file_name, e) raise EventError(msg) def collect_and_send_events(self): """ Periodically read, parse, and send events located in the events folder. Currently, this is done every minute. Any .tld file dropped in the events folder will be emitted. These event files can be created either by the agent or the extensions. We don't have control over extension's events parameters, but we will override any values they might have set for sys_info parameters. """ if self.last_event_collection is None: self.last_event_collection = datetime.datetime.utcnow( ) - MonitorHandler.EVENT_COLLECTION_PERIOD if datetime.datetime.utcnow() >= ( self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD): try: event_list = TelemetryEventList() event_dir = os.path.join(conf.get_lib_dir(), "events") event_files = os.listdir(event_dir) for event_file in event_files: if not event_file.endswith(".tld"): continue event_file_path = os.path.join(event_dir, event_file) try: data_str = self.collect_event(event_file_path) except EventError as e: logger.error("{0}", ustr(e)) continue try: event = parse_event(data_str) self.add_sysinfo(event) event_list.events.append(event) except (ValueError, ProtocolError) as e: logger.warn("Failed to decode event file: {0}", ustr(e)) continue if len(event_list.events) == 0: return try: self.protocol.report_event(event_list) except ProtocolError as e: logger.error("{0}", ustr(e)) except Exception as e: logger.warn("Failed to send events: {0}", ustr(e)) self.last_event_collection = datetime.datetime.utcnow() def daemon(self): min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD, MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD, MonitorHandler.EVENT_COLLECTION_PERIOD, MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD, MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds while self.should_run: self.send_telemetry_heartbeat() self.poll_telemetry_metrics() self.send_telemetry_metrics() self.collect_and_send_events() self.send_host_plugin_heartbeat() self.send_imds_heartbeat() self.log_altered_network_configuration() self.reset_loggers() time.sleep(min_delta) def reset_loggers(self): """ The loggers maintain hash-tables in memory and they need to be cleaned up from time to time. For reference, please check azurelinuxagent.common.logger.Logger and azurelinuxagent.common.event.EventLogger classes """ time_now = datetime.datetime.utcnow() if not self.last_reset_loggers_time: self.last_reset_loggers_time = time_now if time_now >= (self.last_reset_loggers_time + MonitorHandler.RESET_LOGGERS_PERIOD): try: logger.reset_periodic() finally: self.last_reset_loggers_time = time_now def add_sysinfo(self, event): """ This method is called after parsing the event file in the events folder and before emitting it. This means all events, either coming from the agent or from the extensions, are passed through this method. The purpose is to add a static list of sys_info parameters such as VMName, Region, RAM, etc. If the sys_info parameters are already populated in the event, they will be overwritten by the sys_info values obtained from the agent. Since the ContainerId parameter is only populated on the fly for the agent events because it is not a static sys_info parameter, an event coming from an extension will not have it, so we explicitly add it. :param event: Event to be enriched with sys_info parameters :return: Event with all parameters added, ready to be reported """ sysinfo_names = [v.name for v in self.sysinfo] final_parameters = [] # Refer: azurelinuxagent.common.event.EventLogger.add_default_parameters_to_event for agent specific values. # # Default fields are only populated by Agent and not the extension. Agent will fill up any event if they don't # have the default params. Example: GAVersion and ContainerId are populated for agent events on the fly, # but not for extension events. Add it if it's missing. default_values = [("ContainerId", get_container_id_from_env()), ("GAVersion", CURRENT_AGENT), ("OpcodeName", ""), ("EventTid", 0), ("EventPid", 0), ("TaskName", ""), ("KeywordName", "")] for param in event.parameters: # Discard any sys_info parameters already in the event, since they will be overwritten if param.name in sysinfo_names: continue final_parameters.append(param) # Add sys_info params populated by the agent final_parameters.extend(self.sysinfo) for default_value in default_values: if default_value[0] not in event: final_parameters.append( TelemetryEventParam(default_value[0], default_value[1])) event.parameters = final_parameters def send_imds_heartbeat(self): """ Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have successfully called and validated a response in the last IMDS_HEALTH_PERIOD. """ if self.last_imds_heartbeat is None: self.last_imds_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.IMDS_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_imds_heartbeat + MonitorHandler.IMDS_HEARTBEAT_PERIOD): try: is_currently_healthy, response = self.imds_client.validate() if is_currently_healthy: self.imds_errorstate.reset() else: self.imds_errorstate.incr() is_healthy = self.imds_errorstate.is_triggered() is False logger.verbose("IMDS health: {0} [{1}]", is_healthy, response) self.health_service.report_imds_status(is_healthy, response) except Exception as e: msg = "Exception sending imds heartbeat: {0}".format(ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ImdsHeartbeat, is_success=False, message=msg, log_event=False) self.last_imds_heartbeat = datetime.datetime.utcnow() def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ if self.last_host_plugin_heartbeat is None: self.last_host_plugin_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD): try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered( ) is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) if not is_healthy: add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeatExtended, is_success=False, message='{0} since successful heartbeat'.format( self.host_plugin_errorstate.fail_time), log_event=False) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format( ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False) self.last_host_plugin_heartbeat = datetime.datetime.utcnow() def send_telemetry_heartbeat(self): if self.last_telemetry_heartbeat is None: self.last_telemetry_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD): try: incarnation = self.protocol.get_incarnation() dropped_packets = self.osutil.get_firewall_dropped_packets( self.protocol.endpoint) msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HeartBeat, is_success=True, message=msg, log_event=False) self.counter += 1 io_errors = IOErrorCounter.get_and_reset() hostplugin_errors = io_errors.get("hostplugin") protocol_errors = io_errors.get("protocol") other_errors = io_errors.get("other") if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: msg = "hostplugin:{0};protocol:{1};other:{2}".format( hostplugin_errors, protocol_errors, other_errors) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HttpErrors, is_success=True, message=msg, log_event=False) except Exception as e: logger.warn("Failed to send heartbeat: {0}", ustr(e)) self.last_telemetry_heartbeat = datetime.datetime.utcnow() def poll_telemetry_metrics(self): time_now = datetime.datetime.utcnow() if not self.last_cgroup_polling_telemetry: self.last_cgroup_polling_telemetry = time_now if time_now >= (self.last_cgroup_polling_telemetry + MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD): CGroupsTelemetry.poll_all_tracked() self.last_cgroup_polling_telemetry = time_now def send_telemetry_metrics(self): time_now = datetime.datetime.utcnow() if not self.last_cgroup_report_telemetry: self.last_cgroup_report_telemetry = time_now if time_now >= (self.last_cgroup_report_telemetry + MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD): performance_metrics = CGroupsTelemetry.report_all_tracked() self.last_cgroup_report_telemetry = time_now if performance_metrics: message = generate_extension_metrics_telemetry_dictionary( schema_version=1.0, performance_metrics=performance_metrics) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ExtensionMetricsData, is_success=True, message=ustr(message), log_event=False) def log_altered_network_configuration(self): """ Check various pieces of network configuration and, if altered since the last check, log the new state. """ raw_route_list = self.osutil.read_route_table() digest = hash_strings(raw_route_list) if digest != self.last_route_table_hash: self.last_route_table_hash = digest route_list = self.osutil.get_list_of_routes(raw_route_list) logger.info("Route table: [{0}]".format(",".join( map(networkutil.RouteEntry.to_json, route_list)))) nic_state = self.osutil.get_nic_state() if nic_state != self.last_nic_state: description = "Initial" if self.last_nic_state == {} else "Updated" logger.info("{0} NIC state: [{1}]".format( description, ", ".join(map(str, nic_state.values())))) self.last_nic_state = nic_state
class MonitorHandler(object): # telemetry EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) # imds IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3) def __init__(self): self.osutil = get_osutil() self.imds_client = None self.event_thread = None self._periodic_operations = [ ResetPeriodicLogMessagesOperation(), PeriodicOperation("collect_and_send_events", self.collect_and_send_events, self.EVENT_COLLECTION_PERIOD), ReportNetworkErrorsOperation(), PollResourceUsageOperation(), PeriodicOperation("send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD), PeriodicOperation("send_imds_heartbeat", self.send_imds_heartbeat, self.IMDS_HEARTBEAT_PERIOD), ReportNetworkConfigurationChangesOperation(), ] self.protocol = None self.protocol_util = None self.health_service = None self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState( min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) self.imds_errorstate = ErrorState( min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD) def run(self): self.start(init_data=True) def stop(self): self.should_run = False if self.is_alive(): self.join() def join(self): self.event_thread.join() def stopped(self): return not self.should_run def init_protocols(self): # The initialization of ProtocolUtil for the Monitor thread should be done within the thread itself rather # than initializing it in the ExtHandler thread. This is done to avoid any concurrency issues as each # thread would now have its own ProtocolUtil object as per the SingletonPerThread model. self.protocol_util = get_protocol_util() self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.get_endpoint()) def init_imds_client(self): wireserver_endpoint = self.protocol_util.get_wireserver_endpoint() self.imds_client = get_imds_client(wireserver_endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self, init_data=False): self.event_thread = threading.Thread(target=self.daemon, args=(init_data, )) self.event_thread.setDaemon(True) self.event_thread.setName("MonitorHandler") self.event_thread.start() def daemon(self, init_data=False): try: if init_data: self.init_protocols() self.init_imds_client() while not self.stopped(): try: self.protocol.update_host_plugin_from_goal_state() for op in self._periodic_operations: op.run() except Exception as e: logger.error( "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}", ustr(e)) finally: PeriodicOperation.sleep_until_next_operation( self._periodic_operations) except Exception as e: logger.error( "An error occurred in the monitor thread; will exit the thread.\n{0}", ustr(e)) def collect_and_send_events(self): """ Periodically send any events located in the events folder """ event_list = collect_events() if len(event_list.events) > 0: self.protocol.report_event(event_list) def send_imds_heartbeat(self): """ Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have successfully called and validated a response in the last IMDS_HEALTH_PERIOD. """ try: is_currently_healthy, response = self.imds_client.validate() if is_currently_healthy: self.imds_errorstate.reset() else: self.imds_errorstate.incr() is_healthy = self.imds_errorstate.is_triggered() is False logger.verbose("IMDS health: {0} [{1}]", is_healthy, response) self.health_service.report_imds_status(is_healthy, response) except Exception as e: msg = "Exception sending imds heartbeat: {0}".format(ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ImdsHeartbeat, is_success=False, message=msg, log_event=False) def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered() is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) if not is_healthy: add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeatExtended, is_success=False, message='{0} since successful heartbeat'.format( self.host_plugin_errorstate.fail_time), log_event=False) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format( ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False)