def init_protocols(self): # The initialization of ProtocolUtil for the Monitor thread should be done within the thread itself rather # than initializing it in the ExtHandler thread. This is done to avoid any concurrency issues as each # thread would now have its own ProtocolUtil object as per the SingletonPerThread model. self.protocol_util = get_protocol_util() self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.get_endpoint())
def test_observation_length(self): health_service = HealthService('endpoint') # make 100 observations for i in range(0, 100): health_service._observe(is_healthy=True, name='{0}'.format(i)) # ensure we keep only 10 self.assertEqual(10, len(health_service.observations)) # ensure we keep the most recent 10 self.assertEqual('90', health_service.observations[0].name) self.assertEqual('99', health_service.observations[9].name)
def __init__(self, endpoint, container_id, role_config_name): if endpoint is None: raise ProtocolError("HostGAPlugin: Endpoint not provided") self.is_initialized = False self.is_available = False self.api_versions = None self.endpoint = endpoint self.container_id = container_id self.deployment_id = None self.role_config_name = role_config_name self.manifest_uri = None self.health_service = HealthService(endpoint) self.fetch_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE) self.status_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE) self.fetch_last_timestamp = None self.status_last_timestamp = None
def test_it_should_not_send_a_health_signal_when_the_hearbeat_fails(self): with _mock_wire_protocol() as protocol: with patch('azurelinuxagent.common.event.EventLogger.add_event') as add_event_patcher: health_service_post_requests = [] def http_get_handler(url, *_, **__): if self.is_host_plugin_health_request(url): del health_service_post_requests[:] # clear the requests; after this error there should be no more POSTs raise IOError('A CLIENT ERROR') return None def http_post_handler(url, _, **__): if self.is_health_service_request(url): health_service_post_requests.append(url) return MockHttpResponse(status=200) return None protocol.set_http_handlers(http_get_handler=http_get_handler, http_post_handler=http_post_handler) health_service = HealthService(protocol.get_endpoint()) SendHostPluginHeartbeat(protocol, health_service).run() self.assertEqual(0, len(health_service_post_requests), "No health signals should have been posted: {0}".format(health_service_post_requests)) heartbeat_events = [kwargs for _, kwargs in add_event_patcher.call_args_list if kwargs['op'] == 'HostPluginHeartbeat'] self.assertTrue(len(heartbeat_events) == 1, "The monitor thread should have reported exactly 1 telemetry event for an unhealthy host ga plugin") self.assertFalse(heartbeat_events[0]['is_success'], 'The reported event should indicate failure') self.assertIn('A CLIENT ERROR', heartbeat_events[0]['message'], 'The failure does not include the expected message')
def daemon(self): try: # The protocol needs to be instantiated in the monitor thread itself (to avoid concurrency issues with the protocol object each # thread uses a different instance as per the SingletonPerThread model. protocol_util = get_protocol_util() protocol = protocol_util.get_protocol() health_service = HealthService(protocol.get_endpoint()) periodic_operations = [ ResetPeriodicLogMessages(), ReportNetworkErrors(), PollResourceUsage(), SendHostPluginHeartbeat(protocol, health_service), SendImdsHeartbeat(protocol_util, health_service) ] report_network_configuration_changes = ReportNetworkConfigurationChanges() if conf.get_monitor_network_configuration_changes(): periodic_operations.append(report_network_configuration_changes) else: logger.info("Monitor.NetworkConfigurationChanges is disabled.") report_network_configuration_changes.log_network_configuration() while not self.stopped(): try: for op in periodic_operations: op.run() finally: PeriodicOperation.sleep_until_next_operation(periodic_operations) except Exception as e: logger.error("An error occurred in the monitor thread; will exit the thread.\n{0}", ustr(e))
def test_it_should_report_a_telemetry_event_when_host_plugin_is_not_healthy( self): with _mock_wire_protocol() as protocol: # the error triggers only after ERROR_STATE_DELTA_DEFAULT with patch( 'azurelinuxagent.common.errorstate.ErrorState.is_triggered', return_value=True): with patch('azurelinuxagent.common.event.EventLogger.add_event' ) as add_event_patcher: def http_get_handler(url, *_, **__): if self.is_host_plugin_health_request(url): return MockHttpResponse(status=503) return None protocol.set_http_handlers( http_get_handler=http_get_handler) health_service = HealthService(protocol.get_endpoint()) SendHostPluginHeartbeat(protocol, health_service).run() heartbeat_events = [ kwargs for _, kwargs in add_event_patcher.call_args_list if kwargs['op'] == 'HostPluginHeartbeatExtended' ] self.assertTrue( len(heartbeat_events) == 1, "The monitor thread should have reported exactly 1 telemetry event for an unhealthy host ga plugin" ) self.assertFalse( heartbeat_events[0]['is_success'], 'The reported event should indicate failure')
def test_it_should_report_host_ga_health(self): with _mock_wire_protocol() as protocol: def http_post_handler(url, _, **__): if self.is_health_service_request(url): http_post_handler.health_service_posted = True return MockHttpResponse(status=200) return None http_post_handler.health_service_posted = False protocol.set_http_handlers(http_post_handler=http_post_handler) health_service = HealthService(protocol.get_endpoint()) SendHostPluginHeartbeat(protocol, health_service).run() self.assertTrue(http_post_handler.health_service_posted, "The monitor thread did not report host ga plugin health")
def test_observation_json(self): health_service = HealthService('endpoint') health_service.observations.append(Observation(name='name', is_healthy=True, value='value', description='description')) expected_json = '{"Source": "WALinuxAgent", ' \ '"Api": "reporttargethealth", ' \ '"Version": "1.0", ' \ '"Observations": [{' \ '"Value": "value", ' \ '"ObservationName": "name", ' \ '"Description": "description", ' \ '"IsHealthy": true' \ '}]}' expected = sorted(json.loads(expected_json).items()) actual = sorted(json.loads(health_service.as_json).items()) self.assertEqual(expected, actual)
def init_protocols(self): self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.endpoint)
class MonitorHandler(object): EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) def __init__(self): self.osutil = get_osutil() self.protocol_util = get_protocol_util() self.imds_client = get_imds_client() self.event_thread = None self.last_event_collection = None self.last_telemetry_heartbeat = None self.last_host_plugin_heartbeat = None self.protocol = None self.health_service = None self.counter = 0 self.sysinfo = [] self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) def run(self): self.init_protocols() self.init_sysinfo() self.start() def stop(self): self.should_run = False if self.is_alive(): self.event_thread.join() def init_protocols(self): self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self): self.event_thread = threading.Thread(target=self.daemon) self.event_thread.setDaemon(True) self.event_thread.start() def init_sysinfo(self): osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, platform.release()) self.sysinfo.append(TelemetryEventParam("OSVersion", osversion)) self.sysinfo.append( TelemetryEventParam("GAVersion", CURRENT_AGENT)) try: ram = self.osutil.get_total_mem() processors = self.osutil.get_processor_cores() self.sysinfo.append(TelemetryEventParam("RAM", ram)) self.sysinfo.append(TelemetryEventParam("Processors", processors)) except OSUtilError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.protocol.get_vminfo() self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName)) self.sysinfo.append(TelemetryEventParam("TenantName", vminfo.tenantName)) self.sysinfo.append(TelemetryEventParam("RoleName", vminfo.roleName)) self.sysinfo.append(TelemetryEventParam("RoleInstanceName", vminfo.roleInstanceName)) self.sysinfo.append(TelemetryEventParam("ContainerId", vminfo.containerId)) except ProtocolError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.imds_client.get_compute() self.sysinfo.append(TelemetryEventParam('Location', vminfo.location)) self.sysinfo.append(TelemetryEventParam('SubscriptionId', vminfo.subscriptionId)) self.sysinfo.append(TelemetryEventParam('ResourceGroupName', vminfo.resourceGroupName)) self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId)) self.sysinfo.append(TelemetryEventParam('ImageOrigin', vminfo.image_origin)) except (HttpError, ValueError) as e: logger.warn("failed to get IMDS info: {0}", e) def collect_event(self, evt_file_name): try: logger.verbose("Found event file: {0}", evt_file_name) with open(evt_file_name, "rb") as evt_file: # if fail to open or delete the file, throw exception data_str = evt_file.read().decode("utf-8", 'ignore') logger.verbose("Processed event file: {0}", evt_file_name) os.remove(evt_file_name) return data_str except IOError as e: msg = "Failed to process {0}, {1}".format(evt_file_name, e) raise EventError(msg) def collect_and_send_events(self): if self.last_event_collection is None: self.last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD if datetime.datetime.utcnow() >= (self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD): try: event_list = TelemetryEventList() event_dir = os.path.join(conf.get_lib_dir(), "events") event_files = os.listdir(event_dir) for event_file in event_files: if not event_file.endswith(".tld"): continue event_file_path = os.path.join(event_dir, event_file) try: data_str = self.collect_event(event_file_path) except EventError as e: logger.error("{0}", e) continue try: event = parse_event(data_str) self.add_sysinfo(event) event_list.events.append(event) except (ValueError, ProtocolError) as e: logger.warn("Failed to decode event file: {0}", e) continue if len(event_list.events) == 0: return try: self.protocol.report_event(event_list) except ProtocolError as e: logger.error("{0}", e) except Exception as e: logger.warn("Failed to send events: {0}", e) self.last_event_collection = datetime.datetime.utcnow() def daemon(self): min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, MonitorHandler.EVENT_COLLECTION_PERIOD, MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD).seconds while self.should_run: self.send_telemetry_heartbeat() self.collect_and_send_events() self.send_host_plugin_heartbeat() time.sleep(min_delta) def add_sysinfo(self, event): sysinfo_names = [v.name for v in self.sysinfo] for param in event.parameters: if param.name in sysinfo_names: logger.verbose("Remove existing event parameter: [{0}:{1}]", param.name, param.value) event.parameters.remove(param) event.parameters.extend(self.sysinfo) def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ if self.last_host_plugin_heartbeat is None: self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= (self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD): try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered() is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e)) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False) self.last_host_plugin_heartbeat = datetime.datetime.utcnow() def send_telemetry_heartbeat(self): if self.last_telemetry_heartbeat is None: self.last_telemetry_heartbeat = datetime.datetime.utcnow() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD): try: incarnation = self.protocol.get_incarnation() dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint) msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HeartBeat, is_success=True, message=msg, log_event=False) self.counter += 1 io_errors = IOErrorCounter.get_and_reset() hostplugin_errors = io_errors.get("hostplugin") protocol_errors = io_errors.get("protocol") other_errors = io_errors.get("other") if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors, protocol_errors, other_errors) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HttpErrors, is_success=True, message=msg, log_event=False) except Exception as e: logger.warn("Failed to send heartbeat: {0}", e) self.last_telemetry_heartbeat = datetime.datetime.utcnow()
class MonitorHandler(object): # telemetry EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) # imds IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3) def __init__(self): self.osutil = get_osutil() self.imds_client = None self.event_thread = None self._periodic_operations = [ ResetPeriodicLogMessagesOperation(), PeriodicOperation("collect_and_send_events", self.collect_and_send_events, self.EVENT_COLLECTION_PERIOD), ReportNetworkErrorsOperation(), PollResourceUsageOperation(), PeriodicOperation("send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD), PeriodicOperation("send_imds_heartbeat", self.send_imds_heartbeat, self.IMDS_HEARTBEAT_PERIOD), ReportNetworkConfigurationChangesOperation(), ] self.protocol = None self.protocol_util = None self.health_service = None self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState( min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) self.imds_errorstate = ErrorState( min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD) def run(self): self.start(init_data=True) def stop(self): self.should_run = False if self.is_alive(): self.join() def join(self): self.event_thread.join() def stopped(self): return not self.should_run def init_protocols(self): # The initialization of ProtocolUtil for the Monitor thread should be done within the thread itself rather # than initializing it in the ExtHandler thread. This is done to avoid any concurrency issues as each # thread would now have its own ProtocolUtil object as per the SingletonPerThread model. self.protocol_util = get_protocol_util() self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.get_endpoint()) def init_imds_client(self): wireserver_endpoint = self.protocol_util.get_wireserver_endpoint() self.imds_client = get_imds_client(wireserver_endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self, init_data=False): self.event_thread = threading.Thread(target=self.daemon, args=(init_data, )) self.event_thread.setDaemon(True) self.event_thread.setName("MonitorHandler") self.event_thread.start() def daemon(self, init_data=False): try: if init_data: self.init_protocols() self.init_imds_client() while not self.stopped(): try: self.protocol.update_host_plugin_from_goal_state() for op in self._periodic_operations: op.run() except Exception as e: logger.error( "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}", ustr(e)) finally: PeriodicOperation.sleep_until_next_operation( self._periodic_operations) except Exception as e: logger.error( "An error occurred in the monitor thread; will exit the thread.\n{0}", ustr(e)) def collect_and_send_events(self): """ Periodically send any events located in the events folder """ event_list = collect_events() if len(event_list.events) > 0: self.protocol.report_event(event_list) def send_imds_heartbeat(self): """ Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have successfully called and validated a response in the last IMDS_HEALTH_PERIOD. """ try: is_currently_healthy, response = self.imds_client.validate() if is_currently_healthy: self.imds_errorstate.reset() else: self.imds_errorstate.incr() is_healthy = self.imds_errorstate.is_triggered() is False logger.verbose("IMDS health: {0} [{1}]", is_healthy, response) self.health_service.report_imds_status(is_healthy, response) except Exception as e: msg = "Exception sending imds heartbeat: {0}".format(ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ImdsHeartbeat, is_success=False, message=msg, log_event=False) def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered() is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) if not is_healthy: add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeatExtended, is_success=False, message='{0} since successful heartbeat'.format( self.host_plugin_errorstate.fail_time), log_event=False) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format( ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False)
class MonitorHandler(object): # telemetry EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) # host health TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) # cgroup data period CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5) CGROUP_TELEMETRY_REPORTING_PERIOD = datetime.timedelta(minutes=30) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) # imds IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3) # log network configuration LOG_NETWORK_CONFIGURATION_PERIOD = datetime.timedelta(minutes=1) # Resetting loggers period RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12) def __init__(self): self.osutil = get_osutil() self.imds_client = None self.event_thread = None self._reset_loggers_op = PeriodicOperation("reset_loggers", self.reset_loggers, self.RESET_LOGGERS_PERIOD) self._collect_and_send_events_op = PeriodicOperation( "collect_and_send_events", self.collect_and_send_events, self.EVENT_COLLECTION_PERIOD) self._send_telemetry_heartbeat_op = PeriodicOperation( "send_telemetry_heartbeat", self.send_telemetry_heartbeat, self.TELEMETRY_HEARTBEAT_PERIOD) self._poll_telemetry_metrics_op = PeriodicOperation( "poll_telemetry_metrics usage", self.poll_telemetry_metrics, self.CGROUP_TELEMETRY_POLLING_PERIOD) self._send_telemetry_metrics_op = PeriodicOperation( "send_telemetry_metrics usage", self.send_telemetry_metrics, self.CGROUP_TELEMETRY_REPORTING_PERIOD) self._send_host_plugin_heartbeat_op = PeriodicOperation( "send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD) self._send_imds_heartbeat_op = PeriodicOperation( "send_imds_heartbeat", self.send_imds_heartbeat, self.IMDS_HEARTBEAT_PERIOD) self._log_altered_network_configuration_op = PeriodicOperation( "log_altered_network_configuration", self.log_altered_network_configuration, self.LOG_NETWORK_CONFIGURATION_PERIOD) self.protocol = None self.protocol_util = None self.health_service = None self.last_route_table_hash = b'' self.last_nic_state = {} self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState( min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) self.imds_errorstate = ErrorState( min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD) def run(self): self.start(init_data=True) def stop(self): self.should_run = False if self.is_alive(): self.join() def join(self): self.event_thread.join() def stopped(self): return not self.should_run def init_protocols(self): # The initialization of ProtocolUtil for the Monitor thread should be done within the thread itself rather # than initializing it in the ExtHandler thread. This is done to avoid any concurrency issues as each # thread would now have its own ProtocolUtil object as per the SingletonPerThread model. self.protocol_util = get_protocol_util() self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.get_endpoint()) def init_imds_client(self): wireserver_endpoint = self.protocol_util.get_wireserver_endpoint() self.imds_client = get_imds_client(wireserver_endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self, init_data=False): self.event_thread = threading.Thread(target=self.daemon, args=(init_data, )) self.event_thread.setDaemon(True) self.event_thread.setName("MonitorHandler") self.event_thread.start() def collect_and_send_events(self): """ Periodically send any events located in the events folder """ event_list = collect_events() if len(event_list.events) > 0: self.protocol.report_event(event_list) def daemon(self, init_data=False): if init_data: self.init_protocols() self.init_imds_client() min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD, MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD, MonitorHandler.EVENT_COLLECTION_PERIOD, MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD, MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds while not self.stopped(): try: self.protocol.update_host_plugin_from_goal_state() self._send_telemetry_heartbeat_op.run() self._poll_telemetry_metrics_op.run() # This will be removed in favor of poll_telemetry_metrics() and it'll directly send the perf data for # each cgroup. self._send_telemetry_metrics_op.run() self._collect_and_send_events_op.run() self._send_host_plugin_heartbeat_op.run() self._send_imds_heartbeat_op.run() self._log_altered_network_configuration_op.run() self._reset_loggers_op.run() except Exception as e: logger.warn( "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}", ustr(e)) time.sleep(min_delta) def reset_loggers(self): """ The loggers maintain hash-tables in memory and they need to be cleaned up from time to time. For reference, please check azurelinuxagent.common.logger.Logger and azurelinuxagent.common.event.EventLogger classes """ logger.reset_periodic() def send_imds_heartbeat(self): """ Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have successfully called and validated a response in the last IMDS_HEALTH_PERIOD. """ try: is_currently_healthy, response = self.imds_client.validate() if is_currently_healthy: self.imds_errorstate.reset() else: self.imds_errorstate.incr() is_healthy = self.imds_errorstate.is_triggered() is False logger.verbose("IMDS health: {0} [{1}]", is_healthy, response) self.health_service.report_imds_status(is_healthy, response) except Exception as e: msg = "Exception sending imds heartbeat: {0}".format(ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ImdsHeartbeat, is_success=False, message=msg, log_event=False) def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered() is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) if not is_healthy: add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeatExtended, is_success=False, message='{0} since successful heartbeat'.format( self.host_plugin_errorstate.fail_time), log_event=False) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format( ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False) def send_telemetry_heartbeat(self): io_errors = IOErrorCounter.get_and_reset() hostplugin_errors = io_errors.get("hostplugin") protocol_errors = io_errors.get("protocol") other_errors = io_errors.get("other") if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: msg = "hostplugin:{0};protocol:{1};other:{2}".format( hostplugin_errors, protocol_errors, other_errors) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HttpErrors, is_success=True, message=msg, log_event=False) def poll_telemetry_metrics(self): """ This method polls the tracked cgroups to get data from the cgroups filesystem and send the data directly. :return: List of Metrics (which would be sent to PerfCounterMetrics directly. """ metrics = CGroupsTelemetry.poll_all_tracked() if metrics: for metric in metrics: report_metric(metric.category, metric.counter, metric.instance, metric.value) def send_telemetry_metrics(self): """ The send_telemetry_metrics would soon be removed in favor of sending performance metrics directly. """ performance_metrics = CGroupsTelemetry.report_all_tracked() if performance_metrics: message = generate_extension_metrics_telemetry_dictionary( schema_version=1.0, performance_metrics=performance_metrics) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ExtensionMetricsData, is_success=True, message=ustr(message), log_event=False) def log_altered_network_configuration(self): """ Check various pieces of network configuration and, if altered since the last check, log the new state. """ raw_route_list = self.osutil.read_route_table() digest = hash_strings(raw_route_list) if digest != self.last_route_table_hash: self.last_route_table_hash = digest route_list = self.osutil.get_list_of_routes(raw_route_list) logger.info("Route table: [{0}]".format(",".join( map(networkutil.RouteEntry.to_json, route_list)))) nic_state = self.osutil.get_nic_state() if nic_state != self.last_nic_state: description = "Initial" if self.last_nic_state == {} else "Updated" logger.info("{0} NIC state: [{1}]".format( description, ", ".join(map(str, nic_state.values())))) self.last_nic_state = nic_state
class MonitorHandler(object): EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) CGROUP_TELEMETRY_PERIOD = datetime.timedelta(minutes=5) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) # imds IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3) def __init__(self): self.osutil = get_osutil() self.protocol_util = get_protocol_util() self.imds_client = get_imds_client() self.event_thread = None self.last_event_collection = None self.last_telemetry_heartbeat = None self.last_cgroup_telemetry = None self.last_host_plugin_heartbeat = None self.last_imds_heartbeat = None self.protocol = None self.health_service = None self.last_route_table_hash = b'' self.last_nic_state = {} self.counter = 0 self.sysinfo = [] self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState( min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) self.imds_errorstate = ErrorState( min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD) def run(self): self.init_protocols() self.init_sysinfo() self.init_cgroups() self.start() def stop(self): self.should_run = False if self.is_alive(): self.event_thread.join() def init_protocols(self): self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self): self.event_thread = threading.Thread(target=self.daemon) self.event_thread.setDaemon(True) self.event_thread.start() def init_sysinfo(self): osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, platform.release()) self.sysinfo.append(TelemetryEventParam("OSVersion", osversion)) self.sysinfo.append(TelemetryEventParam("GAVersion", CURRENT_AGENT)) try: ram = self.osutil.get_total_mem() processors = self.osutil.get_processor_cores() self.sysinfo.append(TelemetryEventParam("RAM", ram)) self.sysinfo.append(TelemetryEventParam("Processors", processors)) except OSUtilError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.protocol.get_vminfo() self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName)) self.sysinfo.append( TelemetryEventParam("TenantName", vminfo.tenantName)) self.sysinfo.append( TelemetryEventParam("RoleName", vminfo.roleName)) self.sysinfo.append( TelemetryEventParam("RoleInstanceName", vminfo.roleInstanceName)) self.sysinfo.append( TelemetryEventParam("ContainerId", vminfo.containerId)) except ProtocolError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.imds_client.get_compute() self.sysinfo.append( TelemetryEventParam('Location', vminfo.location)) self.sysinfo.append( TelemetryEventParam('SubscriptionId', vminfo.subscriptionId)) self.sysinfo.append( TelemetryEventParam('ResourceGroupName', vminfo.resourceGroupName)) self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId)) self.sysinfo.append( TelemetryEventParam('ImageOrigin', vminfo.image_origin)) except (HttpError, ValueError) as e: logger.warn("failed to get IMDS info: {0}", e) def collect_event(self, evt_file_name): try: logger.verbose("Found event file: {0}", evt_file_name) with open(evt_file_name, "rb") as evt_file: # if fail to open or delete the file, throw exception data_str = evt_file.read().decode("utf-8", 'ignore') logger.verbose("Processed event file: {0}", evt_file_name) os.remove(evt_file_name) return data_str except IOError as e: msg = "Failed to process {0}, {1}".format(evt_file_name, e) raise EventError(msg) def collect_and_send_events(self): if self.last_event_collection is None: self.last_event_collection = datetime.datetime.utcnow( ) - MonitorHandler.EVENT_COLLECTION_PERIOD if datetime.datetime.utcnow() >= ( self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD): try: event_list = TelemetryEventList() event_dir = os.path.join(conf.get_lib_dir(), "events") event_files = os.listdir(event_dir) for event_file in event_files: if not event_file.endswith(".tld"): continue event_file_path = os.path.join(event_dir, event_file) try: data_str = self.collect_event(event_file_path) except EventError as e: logger.error("{0}", e) continue try: event = parse_event(data_str) self.add_sysinfo(event) event_list.events.append(event) except (ValueError, ProtocolError) as e: logger.warn("Failed to decode event file: {0}", e) continue if len(event_list.events) == 0: return try: self.protocol.report_event(event_list) except ProtocolError as e: logger.error("{0}", e) except Exception as e: logger.warn("Failed to send events: {0}", e) self.last_event_collection = datetime.datetime.utcnow() def daemon(self): min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, MonitorHandler.CGROUP_TELEMETRY_PERIOD, MonitorHandler.EVENT_COLLECTION_PERIOD, MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD, MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds while self.should_run: self.send_telemetry_heartbeat() self.send_cgroup_telemetry() self.collect_and_send_events() self.send_host_plugin_heartbeat() self.send_imds_heartbeat() self.log_altered_network_configuration() time.sleep(min_delta) def add_sysinfo(self, event): sysinfo_names = [v.name for v in self.sysinfo] for param in event.parameters: if param.name in sysinfo_names: logger.verbose("Remove existing event parameter: [{0}:{1}]", param.name, param.value) event.parameters.remove(param) event.parameters.extend(self.sysinfo) def send_imds_heartbeat(self): """ Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have successfully called and validated a response in the last IMDS_HEALTH_PERIOD. """ if self.last_imds_heartbeat is None: self.last_imds_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.IMDS_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_imds_heartbeat + MonitorHandler.IMDS_HEARTBEAT_PERIOD): try: is_currently_healthy, response = self.imds_client.validate() if is_currently_healthy: self.imds_errorstate.reset() else: self.imds_errorstate.incr() is_healthy = self.imds_errorstate.is_triggered() is False logger.verbose("IMDS health: {0} [{1}]", is_healthy, response) self.health_service.report_imds_status(is_healthy, response) except Exception as e: msg = "Exception sending imds heartbeat: {0}".format(ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ImdsHeartbeat, is_success=False, message=msg, log_event=False) self.last_imds_heartbeat = datetime.datetime.utcnow() def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ if self.last_host_plugin_heartbeat is None: self.last_host_plugin_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD): try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered( ) is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format( ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False) self.last_host_plugin_heartbeat = datetime.datetime.utcnow() def send_telemetry_heartbeat(self): if self.last_telemetry_heartbeat is None: self.last_telemetry_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD): try: incarnation = self.protocol.get_incarnation() dropped_packets = self.osutil.get_firewall_dropped_packets( self.protocol.endpoint) msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HeartBeat, is_success=True, message=msg, log_event=False) self.counter += 1 io_errors = IOErrorCounter.get_and_reset() hostplugin_errors = io_errors.get("hostplugin") protocol_errors = io_errors.get("protocol") other_errors = io_errors.get("other") if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: msg = "hostplugin:{0};protocol:{1};other:{2}".format( hostplugin_errors, protocol_errors, other_errors) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HttpErrors, is_success=True, message=msg, log_event=False) except Exception as e: logger.warn("Failed to send heartbeat: {0}", e) self.last_telemetry_heartbeat = datetime.datetime.utcnow() @staticmethod def init_cgroups(): # Track metrics for the roll-up cgroup and for the agent cgroup try: CGroupsTelemetry.track_cgroup(CGroups.for_extension("")) CGroupsTelemetry.track_agent() except Exception as e: logger.error( "monitor: Exception tracking wrapper and agent: {0} [{1}]", e, traceback.format_exc()) def send_cgroup_telemetry(self): if self.last_cgroup_telemetry is None: self.last_cgroup_telemetry = datetime.datetime.utcnow() if datetime.datetime.utcnow() >= ( self.last_telemetry_heartbeat + MonitorHandler.CGROUP_TELEMETRY_PERIOD): try: for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked( ).items(): for metric_group, metric_name, value in metrics: if value > 0: report_metric(metric_group, metric_name, cgroup_name, value) except Exception as e: logger.warn("Failed to collect performance metrics: {0} [{1}]", e, traceback.format_exc()) # Look for extension cgroups we're not already tracking and track them try: CGroupsTelemetry.update_tracked( self.protocol.client.get_current_handlers()) except Exception as e: logger.warn( "Monitor: updating tracked extensions raised {0}: {1}", e, traceback.format_exc()) self.last_cgroup_telemetry = datetime.datetime.utcnow() def log_altered_network_configuration(self): """ Check various pieces of network configuration and, if altered since the last check, log the new state. """ raw_route_list = self.osutil.read_route_table() digest = hash_strings(raw_route_list) if digest != self.last_route_table_hash: self.last_route_table_hash = digest route_list = self.osutil.get_list_of_routes(raw_route_list) logger.info("Route table: [{0}]".format(",".join( map(networkutil.RouteEntry.to_json, route_list)))) nic_state = self.osutil.get_nic_state() if nic_state != self.last_nic_state: description = "Initial" if self.last_nic_state == {} else "Updated" logger.info("{0} NIC state: [{1}]".format( description, ", ".join(map(str, nic_state.values())))) self.last_nic_state = nic_state
class HostPluginProtocol(object): _is_default_channel = False FETCH_REPORTING_PERIOD = datetime.timedelta(minutes=1) STATUS_REPORTING_PERIOD = datetime.timedelta(minutes=1) def __init__(self, endpoint, container_id, role_config_name): if endpoint is None: raise ProtocolError("HostGAPlugin: Endpoint not provided") self.is_initialized = False self.is_available = False self.api_versions = None self.endpoint = endpoint self.container_id = container_id self.deployment_id = None self.role_config_name = role_config_name self.manifest_uri = None self.health_service = HealthService(endpoint) self.fetch_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE) self.status_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE) self.fetch_last_timestamp = None self.status_last_timestamp = None @staticmethod def is_default_channel(): return HostPluginProtocol._is_default_channel @staticmethod def set_default_channel(is_default): HostPluginProtocol._is_default_channel = is_default def ensure_initialized(self): if not self.is_initialized: self.api_versions = self.get_api_versions() self.is_available = API_VERSION in self.api_versions self.is_initialized = self.is_available from azurelinuxagent.common.event import WALAEventOperation, report_event report_event(WALAEventOperation.InitializeHostPlugin, is_success=self.is_available) return self.is_available def get_health(self): """ Call the /health endpoint :return: True if 200 received, False otherwise """ url = URI_FORMAT_HEALTH.format(self.endpoint, HOST_PLUGIN_PORT) logger.verbose("HostGAPlugin: Getting health from [{0}]", url) response = restutil.http_get(url, max_retry=1) return restutil.request_succeeded(response) def get_api_versions(self): url = URI_FORMAT_GET_API_VERSIONS.format(self.endpoint, HOST_PLUGIN_PORT) logger.verbose("HostGAPlugin: Getting API versions at [{0}]" .format(url)) return_val = [] error_response = '' is_healthy = False try: headers = {HEADER_CONTAINER_ID: self.container_id} response = restutil.http_get(url, headers) if restutil.request_failed(response): error_response = restutil.read_response_error(response) logger.error("HostGAPlugin: Failed Get API versions: {0}".format(error_response)) is_healthy = not restutil.request_failed_at_hostplugin(response) else: return_val = ustr(remove_bom(response.read()), encoding='utf-8') is_healthy = True except HttpError as e: logger.error("HostGAPlugin: Exception Get API versions: {0}".format(e)) self.health_service.report_host_plugin_versions(is_healthy=is_healthy, response=error_response) return return_val def get_artifact_request(self, artifact_url, artifact_manifest_url=None): if not self.ensure_initialized(): raise ProtocolError("HostGAPlugin: Host plugin channel is not available") if textutil.is_str_none_or_whitespace(artifact_url): raise ProtocolError("HostGAPlugin: No extension artifact url was provided") url = URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint, HOST_PLUGIN_PORT) headers = {HEADER_VERSION: API_VERSION, HEADER_CONTAINER_ID: self.container_id, HEADER_HOST_CONFIG_NAME: self.role_config_name, HEADER_ARTIFACT_LOCATION: artifact_url} if artifact_manifest_url is not None: headers[HEADER_ARTIFACT_MANIFEST_LOCATION] = artifact_manifest_url return url, headers def report_fetch_health(self, uri, is_healthy=True, source='', response=''): if uri != URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint, HOST_PLUGIN_PORT): return if self.should_report(is_healthy, self.fetch_error_state, self.fetch_last_timestamp, HostPluginProtocol.FETCH_REPORTING_PERIOD): self.fetch_last_timestamp = datetime.datetime.utcnow() health_signal = self.fetch_error_state.is_triggered() is False self.health_service.report_host_plugin_extension_artifact(is_healthy=health_signal, source=source, response=response) def report_status_health(self, is_healthy, response=''): if self.should_report(is_healthy, self.status_error_state, self.status_last_timestamp, HostPluginProtocol.STATUS_REPORTING_PERIOD): self.status_last_timestamp = datetime.datetime.utcnow() health_signal = self.status_error_state.is_triggered() is False self.health_service.report_host_plugin_status(is_healthy=health_signal, response=response) @staticmethod def should_report(is_healthy, error_state, last_timestamp, period): """ Determine whether a health signal should be reported :param is_healthy: whether the current measurement is healthy :param error_state: the error state which is tracking time since failure :param last_timestamp: the last measurement time stamp :param period: the reporting period :return: True if the signal should be reported, False otherwise """ if is_healthy: # we only reset the error state upon success, since we want to keep # reporting the failure; this is different to other uses of error states # which do not have a separate periodicity error_state.reset() else: error_state.incr() if last_timestamp is None: last_timestamp = datetime.datetime.utcnow() - period return datetime.datetime.utcnow() >= (last_timestamp + period) def put_vm_log(self, content): raise NotImplementedError("Unimplemented") def put_vm_status(self, status_blob, sas_url, config_blob_type=None): """ Try to upload the VM status via the host plugin /status channel :param sas_url: the blob SAS url to pass to the host plugin :param config_blob_type: the blob type from the extension config :type status_blob: StatusBlob """ if not self.ensure_initialized(): raise ProtocolError("HostGAPlugin: HostGAPlugin is not available") if status_blob is None or status_blob.vm_status is None: raise ProtocolError("HostGAPlugin: Status blob was not provided") logger.verbose("HostGAPlugin: Posting VM status") blob_type = status_blob.type if status_blob.type else config_blob_type if blob_type == "BlockBlob": self._put_block_blob_status(sas_url, status_blob) else: self._put_page_blob_status(sas_url, status_blob) def _put_block_blob_status(self, sas_url, status_blob): url = URI_FORMAT_PUT_VM_STATUS.format(self.endpoint, HOST_PLUGIN_PORT) response = restutil.http_put(url, data=self._build_status_data( sas_url, status_blob.get_block_blob_headers(len(status_blob.data)), bytearray(status_blob.data, encoding='utf-8')), headers=self._build_status_headers()) if restutil.request_failed(response): error_response = restutil.read_response_error(response) is_healthy = not restutil.request_failed_at_hostplugin(response) self.report_status_health(is_healthy=is_healthy, response=error_response) raise HttpError("HostGAPlugin: Put BlockBlob failed: {0}" .format(error_response)) else: self.report_status_health(is_healthy=True) logger.verbose("HostGAPlugin: Put BlockBlob status succeeded") def _put_page_blob_status(self, sas_url, status_blob): url = URI_FORMAT_PUT_VM_STATUS.format(self.endpoint, HOST_PLUGIN_PORT) # Convert the status into a blank-padded string whose length is modulo 512 status = bytearray(status_blob.data, encoding='utf-8') status_size = int((len(status) + 511) / 512) * 512 status = bytearray(status_blob.data.ljust(status_size), encoding='utf-8') # First, initialize an empty blob response = restutil.http_put(url, data=self._build_status_data( sas_url, status_blob.get_page_blob_create_headers(status_size)), headers=self._build_status_headers()) if restutil.request_failed(response): error_response = restutil.read_response_error(response) is_healthy = not restutil.request_failed_at_hostplugin(response) self.report_status_health(is_healthy=is_healthy, response=error_response) raise HttpError("HostGAPlugin: Failed PageBlob clean-up: {0}" .format(error_response)) else: self.report_status_health(is_healthy=True) logger.verbose("HostGAPlugin: PageBlob clean-up succeeded") # Then, upload the blob in pages if sas_url.count("?") <= 0: sas_url = "{0}?comp=page".format(sas_url) else: sas_url = "{0}&comp=page".format(sas_url) start = 0 end = 0 while start < len(status): # Create the next page end = start + min(len(status) - start, MAXIMUM_PAGEBLOB_PAGE_SIZE) page_size = int((end - start + 511) / 512) * 512 buf = bytearray(page_size) buf[0: end - start] = status[start: end] # Send the page response = restutil.http_put(url, data=self._build_status_data( sas_url, status_blob.get_page_blob_page_headers(start, end), buf), headers=self._build_status_headers()) if restutil.request_failed(response): error_response = restutil.read_response_error(response) is_healthy = not restutil.request_failed_at_hostplugin(response) self.report_status_health(is_healthy=is_healthy, response=error_response) raise HttpError( "HostGAPlugin Error: Put PageBlob bytes " "[{0},{1}]: {2}".format(start, end, error_response)) # Advance to the next page (if any) start = end def _build_status_data(self, sas_url, blob_headers, content=None): headers = [] for name in iter(blob_headers.keys()): headers.append({ 'headerName': name, 'headerValue': blob_headers[name] }) data = { 'requestUri': sas_url, 'headers': headers } if not content is None: data['content'] = self._base64_encode(content) return json.dumps(data, sort_keys=True) def _build_status_headers(self): return { HEADER_VERSION: API_VERSION, "Content-type": "application/json", HEADER_CONTAINER_ID: self.container_id, HEADER_HOST_CONFIG_NAME: self.role_config_name } def _base64_encode(self, data): s = base64.b64encode(bytes(data)) if PY_VERSION_MAJOR > 2: return s.decode('utf-8') return s
class MonitorHandler(object): EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) CGROUP_TELEMETRY_PERIOD = datetime.timedelta(minutes=5) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) # imds IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3) def __init__(self): self.osutil = get_osutil() self.protocol_util = get_protocol_util() self.imds_client = get_imds_client() self.event_thread = None self.last_event_collection = None self.last_telemetry_heartbeat = None self.last_cgroup_telemetry = None self.last_host_plugin_heartbeat = None self.last_imds_heartbeat = None self.protocol = None self.health_service = None self.last_route_table_hash = b'' self.last_nic_state = {} self.counter = 0 self.sysinfo = [] self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) self.imds_errorstate = ErrorState(min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD) def run(self): self.init_protocols() self.init_sysinfo() self.init_cgroups() self.start() def stop(self): self.should_run = False if self.is_alive(): self.event_thread.join() def init_protocols(self): self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self): self.event_thread = threading.Thread(target=self.daemon) self.event_thread.setDaemon(True) self.event_thread.start() def init_sysinfo(self): osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, platform.release()) self.sysinfo.append(TelemetryEventParam("OSVersion", osversion)) self.sysinfo.append( TelemetryEventParam("GAVersion", CURRENT_AGENT)) try: ram = self.osutil.get_total_mem() processors = self.osutil.get_processor_cores() self.sysinfo.append(TelemetryEventParam("RAM", ram)) self.sysinfo.append(TelemetryEventParam("Processors", processors)) except OSUtilError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.protocol.get_vminfo() self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName)) self.sysinfo.append(TelemetryEventParam("TenantName", vminfo.tenantName)) self.sysinfo.append(TelemetryEventParam("RoleName", vminfo.roleName)) self.sysinfo.append(TelemetryEventParam("RoleInstanceName", vminfo.roleInstanceName)) self.sysinfo.append(TelemetryEventParam("ContainerId", vminfo.containerId)) except ProtocolError as e: logger.warn("Failed to get system info: {0}", e) try: vminfo = self.imds_client.get_compute() self.sysinfo.append(TelemetryEventParam('Location', vminfo.location)) self.sysinfo.append(TelemetryEventParam('SubscriptionId', vminfo.subscriptionId)) self.sysinfo.append(TelemetryEventParam('ResourceGroupName', vminfo.resourceGroupName)) self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId)) self.sysinfo.append(TelemetryEventParam('ImageOrigin', vminfo.image_origin)) except (HttpError, ValueError) as e: logger.warn("failed to get IMDS info: {0}", e) def collect_event(self, evt_file_name): try: logger.verbose("Found event file: {0}", evt_file_name) with open(evt_file_name, "rb") as evt_file: # if fail to open or delete the file, throw exception data_str = evt_file.read().decode("utf-8", 'ignore') logger.verbose("Processed event file: {0}", evt_file_name) os.remove(evt_file_name) return data_str except IOError as e: msg = "Failed to process {0}, {1}".format(evt_file_name, e) raise EventError(msg) def collect_and_send_events(self): if self.last_event_collection is None: self.last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD if datetime.datetime.utcnow() >= (self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD): try: event_list = TelemetryEventList() event_dir = os.path.join(conf.get_lib_dir(), "events") event_files = os.listdir(event_dir) for event_file in event_files: if not event_file.endswith(".tld"): continue event_file_path = os.path.join(event_dir, event_file) try: data_str = self.collect_event(event_file_path) except EventError as e: logger.error("{0}", e) continue try: event = parse_event(data_str) self.add_sysinfo(event) event_list.events.append(event) except (ValueError, ProtocolError) as e: logger.warn("Failed to decode event file: {0}", e) continue if len(event_list.events) == 0: return try: self.protocol.report_event(event_list) except ProtocolError as e: logger.error("{0}", e) except Exception as e: logger.warn("Failed to send events: {0}", e) self.last_event_collection = datetime.datetime.utcnow() def daemon(self): min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, MonitorHandler.CGROUP_TELEMETRY_PERIOD, MonitorHandler.EVENT_COLLECTION_PERIOD, MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD, MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds while self.should_run: self.send_telemetry_heartbeat() self.send_cgroup_telemetry() self.collect_and_send_events() self.send_host_plugin_heartbeat() self.send_imds_heartbeat() self.log_altered_network_configuration() time.sleep(min_delta) def add_sysinfo(self, event): sysinfo_names = [v.name for v in self.sysinfo] for param in event.parameters: if param.name in sysinfo_names: logger.verbose("Remove existing event parameter: [{0}:{1}]", param.name, param.value) event.parameters.remove(param) event.parameters.extend(self.sysinfo) def send_imds_heartbeat(self): """ Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have successfully called and validated a response in the last IMDS_HEALTH_PERIOD. """ if self.last_imds_heartbeat is None: self.last_imds_heartbeat = datetime.datetime.utcnow() - MonitorHandler.IMDS_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= (self.last_imds_heartbeat + MonitorHandler.IMDS_HEARTBEAT_PERIOD): try: is_currently_healthy, response = self.imds_client.validate() if is_currently_healthy: self.imds_errorstate.reset() else: self.imds_errorstate.incr() is_healthy = self.imds_errorstate.is_triggered() is False logger.verbose("IMDS health: {0} [{1}]", is_healthy, response) self.health_service.report_imds_status(is_healthy, response) except Exception as e: msg = "Exception sending imds heartbeat: {0}".format(ustr(e)) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ImdsHeartbeat, is_success=False, message=msg, log_event=False) self.last_imds_heartbeat = datetime.datetime.utcnow() def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ if self.last_host_plugin_heartbeat is None: self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD): try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered() is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) if not is_healthy: add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeatExtended, is_success=False, message='{0} since successful heartbeat'.format(self.host_plugin_errorstate.fail_time), log_event=False) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e)) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False) self.last_host_plugin_heartbeat = datetime.datetime.utcnow() def send_telemetry_heartbeat(self): if self.last_telemetry_heartbeat is None: self.last_telemetry_heartbeat = datetime.datetime.utcnow() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD): try: incarnation = self.protocol.get_incarnation() dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint) msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HeartBeat, is_success=True, message=msg, log_event=False) self.counter += 1 io_errors = IOErrorCounter.get_and_reset() hostplugin_errors = io_errors.get("hostplugin") protocol_errors = io_errors.get("protocol") other_errors = io_errors.get("other") if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors, protocol_errors, other_errors) add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HttpErrors, is_success=True, message=msg, log_event=False) except Exception as e: logger.warn("Failed to send heartbeat: {0}", e) self.last_telemetry_heartbeat = datetime.datetime.utcnow() @staticmethod def init_cgroups(): # Track metrics for the roll-up cgroup and for the agent cgroup try: CGroupsTelemetry.track_cgroup(CGroups.for_extension("")) CGroupsTelemetry.track_agent() except Exception as e: # when a hierarchy is not mounted, we raise an exception # and we should therefore only issue a warning, since this # is not unexpected logger.warn("Monitor: cgroups not initialized: {0}", ustr(e)) logger.verbose(traceback.format_exc()) def send_cgroup_telemetry(self): if self.last_cgroup_telemetry is None: self.last_cgroup_telemetry = datetime.datetime.utcnow() if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.CGROUP_TELEMETRY_PERIOD): try: metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked() for cgroup_name, metrics in metric_reported.items(): thresholds = metric_threshold[cgroup_name] for metric_group, metric_name, value in metrics: if value > 0: report_metric(metric_group, metric_name, cgroup_name, value) if metric_group == "Memory": if value >= thresholds["memory"]: msg = "CGroup {0}: Crossed the Memory Threshold. Current Value:{1}, Threshold:{2}.".format( cgroup_name, value, thresholds["memory"]) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.CGroupsLimitsCrossed, is_success=True, message=msg, log_event=True) if metric_group == "Process": if value >= thresholds["cpu"]: msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format( cgroup_name, value, thresholds["cpu"]) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.CGroupsLimitsCrossed, is_success=True, message=msg, log_event=True) except Exception as e: logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e)) logger.verbose(traceback.format_exc()) # Look for extension cgroups we're not already tracking and track them try: CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers()) except Exception as e: logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e)) logger.verbose(traceback.format_exc()) self.last_cgroup_telemetry = datetime.datetime.utcnow() def log_altered_network_configuration(self): """ Check various pieces of network configuration and, if altered since the last check, log the new state. """ raw_route_list = self.osutil.read_route_table() digest = hash_strings(raw_route_list) if digest != self.last_route_table_hash: self.last_route_table_hash = digest route_list = self.osutil.get_list_of_routes(raw_route_list) logger.info("Route table: [{0}]".format(",".join(map(networkutil.RouteEntry.to_json, route_list)))) nic_state = self.osutil.get_nic_state() if nic_state != self.last_nic_state: description = "Initial" if self.last_nic_state == {} else "Updated" logger.info("{0} NIC state: [{1}]".format(description, ", ".join(map(str, nic_state.values())))) self.last_nic_state = nic_state
class MonitorHandler(object): EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) # extension metrics period CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5) CGROUP_TELEMETRY_REPORTING_PERIOD = datetime.timedelta(minutes=30) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) # imds IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3) # Resetting loggers period RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12) def __init__(self): self.osutil = get_osutil() self.protocol_util = get_protocol_util() self.imds_client = get_imds_client() self.event_thread = None self.last_reset_loggers_time = None self.last_event_collection = None self.last_telemetry_heartbeat = None self.last_cgroup_polling_telemetry = None self.last_cgroup_report_telemetry = None self.last_host_plugin_heartbeat = None self.last_imds_heartbeat = None self.protocol = None self.health_service = None self.last_route_table_hash = b'' self.last_nic_state = {} self.counter = 0 self.sysinfo = [] self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState( min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) self.imds_errorstate = ErrorState( min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD) def run(self): self.init_protocols() self.init_sysinfo() self.start() def stop(self): self.should_run = False if self.is_alive(): self.event_thread.join() def init_protocols(self): self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.endpoint) def is_alive(self): return self.event_thread is not None and self.event_thread.is_alive() def start(self): self.event_thread = threading.Thread(target=self.daemon) self.event_thread.setDaemon(True) self.event_thread.setName("MonitorHandler") self.event_thread.start() def init_sysinfo(self): osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, platform.release()) self.sysinfo.append(TelemetryEventParam("OSVersion", osversion)) self.sysinfo.append( TelemetryEventParam("ExecutionMode", AGENT_EXECUTION_MODE)) try: ram = self.osutil.get_total_mem() processors = self.osutil.get_processor_cores() self.sysinfo.append(TelemetryEventParam("RAM", ram)) self.sysinfo.append(TelemetryEventParam("Processors", processors)) except OSUtilError as e: logger.warn("Failed to get system info: {0}", ustr(e)) try: vminfo = self.protocol.get_vminfo() self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName)) self.sysinfo.append( TelemetryEventParam("TenantName", vminfo.tenantName)) self.sysinfo.append( TelemetryEventParam("RoleName", vminfo.roleName)) self.sysinfo.append( TelemetryEventParam("RoleInstanceName", vminfo.roleInstanceName)) except ProtocolError as e: logger.warn("Failed to get system info: {0}", ustr(e)) try: vminfo = self.imds_client.get_compute() self.sysinfo.append( TelemetryEventParam('Location', vminfo.location)) self.sysinfo.append( TelemetryEventParam('SubscriptionId', vminfo.subscriptionId)) self.sysinfo.append( TelemetryEventParam('ResourceGroupName', vminfo.resourceGroupName)) self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId)) self.sysinfo.append( TelemetryEventParam('ImageOrigin', vminfo.image_origin)) except (HttpError, ValueError) as e: logger.warn("failed to get IMDS info: {0}", ustr(e)) @staticmethod def collect_event(evt_file_name): try: logger.verbose("Found event file: {0}", evt_file_name) with open(evt_file_name, "rb") as evt_file: # if fail to open or delete the file, throw exception data_str = evt_file.read().decode("utf-8") logger.verbose("Processed event file: {0}", evt_file_name) os.remove(evt_file_name) return data_str except (IOError, UnicodeDecodeError) as e: os.remove(evt_file_name) msg = "Failed to process {0}, {1}".format(evt_file_name, e) raise EventError(msg) def collect_and_send_events(self): """ Periodically read, parse, and send events located in the events folder. Currently, this is done every minute. Any .tld file dropped in the events folder will be emitted. These event files can be created either by the agent or the extensions. We don't have control over extension's events parameters, but we will override any values they might have set for sys_info parameters. """ if self.last_event_collection is None: self.last_event_collection = datetime.datetime.utcnow( ) - MonitorHandler.EVENT_COLLECTION_PERIOD if datetime.datetime.utcnow() >= ( self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD): try: event_list = TelemetryEventList() event_dir = os.path.join(conf.get_lib_dir(), "events") event_files = os.listdir(event_dir) for event_file in event_files: if not event_file.endswith(".tld"): continue event_file_path = os.path.join(event_dir, event_file) try: data_str = self.collect_event(event_file_path) except EventError as e: logger.error("{0}", ustr(e)) continue try: event = parse_event(data_str) self.add_sysinfo(event) event_list.events.append(event) except (ValueError, ProtocolError) as e: logger.warn("Failed to decode event file: {0}", ustr(e)) continue if len(event_list.events) == 0: return try: self.protocol.report_event(event_list) except ProtocolError as e: logger.error("{0}", ustr(e)) except Exception as e: logger.warn("Failed to send events: {0}", ustr(e)) self.last_event_collection = datetime.datetime.utcnow() def daemon(self): min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD, MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD, MonitorHandler.EVENT_COLLECTION_PERIOD, MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD, MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds while self.should_run: self.send_telemetry_heartbeat() self.poll_telemetry_metrics() self.send_telemetry_metrics() self.collect_and_send_events() self.send_host_plugin_heartbeat() self.send_imds_heartbeat() self.log_altered_network_configuration() self.reset_loggers() time.sleep(min_delta) def reset_loggers(self): """ The loggers maintain hash-tables in memory and they need to be cleaned up from time to time. For reference, please check azurelinuxagent.common.logger.Logger and azurelinuxagent.common.event.EventLogger classes """ time_now = datetime.datetime.utcnow() if not self.last_reset_loggers_time: self.last_reset_loggers_time = time_now if time_now >= (self.last_reset_loggers_time + MonitorHandler.RESET_LOGGERS_PERIOD): try: logger.reset_periodic() finally: self.last_reset_loggers_time = time_now def add_sysinfo(self, event): """ This method is called after parsing the event file in the events folder and before emitting it. This means all events, either coming from the agent or from the extensions, are passed through this method. The purpose is to add a static list of sys_info parameters such as VMName, Region, RAM, etc. If the sys_info parameters are already populated in the event, they will be overwritten by the sys_info values obtained from the agent. Since the ContainerId parameter is only populated on the fly for the agent events because it is not a static sys_info parameter, an event coming from an extension will not have it, so we explicitly add it. :param event: Event to be enriched with sys_info parameters :return: Event with all parameters added, ready to be reported """ sysinfo_names = [v.name for v in self.sysinfo] final_parameters = [] # Refer: azurelinuxagent.common.event.EventLogger.add_default_parameters_to_event for agent specific values. # # Default fields are only populated by Agent and not the extension. Agent will fill up any event if they don't # have the default params. Example: GAVersion and ContainerId are populated for agent events on the fly, # but not for extension events. Add it if it's missing. default_values = [("ContainerId", get_container_id_from_env()), ("GAVersion", CURRENT_AGENT), ("OpcodeName", ""), ("EventTid", 0), ("EventPid", 0), ("TaskName", ""), ("KeywordName", "")] for param in event.parameters: # Discard any sys_info parameters already in the event, since they will be overwritten if param.name in sysinfo_names: continue final_parameters.append(param) # Add sys_info params populated by the agent final_parameters.extend(self.sysinfo) for default_value in default_values: if default_value[0] not in event: final_parameters.append( TelemetryEventParam(default_value[0], default_value[1])) event.parameters = final_parameters def send_imds_heartbeat(self): """ Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have successfully called and validated a response in the last IMDS_HEALTH_PERIOD. """ if self.last_imds_heartbeat is None: self.last_imds_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.IMDS_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_imds_heartbeat + MonitorHandler.IMDS_HEARTBEAT_PERIOD): try: is_currently_healthy, response = self.imds_client.validate() if is_currently_healthy: self.imds_errorstate.reset() else: self.imds_errorstate.incr() is_healthy = self.imds_errorstate.is_triggered() is False logger.verbose("IMDS health: {0} [{1}]", is_healthy, response) self.health_service.report_imds_status(is_healthy, response) except Exception as e: msg = "Exception sending imds heartbeat: {0}".format(ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ImdsHeartbeat, is_success=False, message=msg, log_event=False) self.last_imds_heartbeat = datetime.datetime.utcnow() def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ if self.last_host_plugin_heartbeat is None: self.last_host_plugin_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD): try: host_plugin = self.protocol.client.get_host_plugin() host_plugin.ensure_initialized() is_currently_healthy = host_plugin.get_health() if is_currently_healthy: self.host_plugin_errorstate.reset() else: self.host_plugin_errorstate.incr() is_healthy = self.host_plugin_errorstate.is_triggered( ) is False logger.verbose("HostGAPlugin health: {0}", is_healthy) self.health_service.report_host_plugin_heartbeat(is_healthy) if not is_healthy: add_event( name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeatExtended, is_success=False, message='{0} since successful heartbeat'.format( self.host_plugin_errorstate.fail_time), log_event=False) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format( ustr(e)) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, log_event=False) self.last_host_plugin_heartbeat = datetime.datetime.utcnow() def send_telemetry_heartbeat(self): if self.last_telemetry_heartbeat is None: self.last_telemetry_heartbeat = datetime.datetime.utcnow( ) - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD if datetime.datetime.utcnow() >= ( self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD): try: incarnation = self.protocol.get_incarnation() dropped_packets = self.osutil.get_firewall_dropped_packets( self.protocol.endpoint) msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HeartBeat, is_success=True, message=msg, log_event=False) self.counter += 1 io_errors = IOErrorCounter.get_and_reset() hostplugin_errors = io_errors.get("hostplugin") protocol_errors = io_errors.get("protocol") other_errors = io_errors.get("other") if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: msg = "hostplugin:{0};protocol:{1};other:{2}".format( hostplugin_errors, protocol_errors, other_errors) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.HttpErrors, is_success=True, message=msg, log_event=False) except Exception as e: logger.warn("Failed to send heartbeat: {0}", ustr(e)) self.last_telemetry_heartbeat = datetime.datetime.utcnow() def poll_telemetry_metrics(self): time_now = datetime.datetime.utcnow() if not self.last_cgroup_polling_telemetry: self.last_cgroup_polling_telemetry = time_now if time_now >= (self.last_cgroup_polling_telemetry + MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD): CGroupsTelemetry.poll_all_tracked() self.last_cgroup_polling_telemetry = time_now def send_telemetry_metrics(self): time_now = datetime.datetime.utcnow() if not self.last_cgroup_report_telemetry: self.last_cgroup_report_telemetry = time_now if time_now >= (self.last_cgroup_report_telemetry + MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD): performance_metrics = CGroupsTelemetry.report_all_tracked() self.last_cgroup_report_telemetry = time_now if performance_metrics: message = generate_extension_metrics_telemetry_dictionary( schema_version=1.0, performance_metrics=performance_metrics) add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ExtensionMetricsData, is_success=True, message=ustr(message), log_event=False) def log_altered_network_configuration(self): """ Check various pieces of network configuration and, if altered since the last check, log the new state. """ raw_route_list = self.osutil.read_route_table() digest = hash_strings(raw_route_list) if digest != self.last_route_table_hash: self.last_route_table_hash = digest route_list = self.osutil.get_list_of_routes(raw_route_list) logger.info("Route table: [{0}]".format(",".join( map(networkutil.RouteEntry.to_json, route_list)))) nic_state = self.osutil.get_nic_state() if nic_state != self.last_nic_state: description = "Initial" if self.last_nic_state == {} else "Updated" logger.info("{0} NIC state: [{1}]".format( description, ", ".join(map(str, nic_state.values())))) self.last_nic_state = nic_state
def test_reporting(self, patch_post, patch_add_event): health_service = HealthService('endpoint') health_service.report_host_plugin_status(is_healthy=True, response='response') self.assertEqual(1, patch_post.call_count) self.assertEqual(0, patch_add_event.call_count) self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_STATUS_OBSERVATION_NAME, is_healthy=True, value='response', description='') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_status(is_healthy=False, response='error') self.assertEqual(2, patch_post.call_count) self.assertEqual(1, patch_add_event.call_count) self.assert_telemetry(call_args=patch_add_event.call_args, response='error') self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_STATUS_OBSERVATION_NAME, is_healthy=False, value='error', description='') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_extension_artifact(is_healthy=True, source='source', response='response') self.assertEqual(3, patch_post.call_count) self.assertEqual(1, patch_add_event.call_count) self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME, is_healthy=True, value='response', description='source') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_extension_artifact(is_healthy=False, source='source', response='response') self.assertEqual(4, patch_post.call_count) self.assertEqual(2, patch_add_event.call_count) self.assert_telemetry(call_args=patch_add_event.call_args, response='response') self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME, is_healthy=False, value='response', description='source') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_heartbeat(is_healthy=True) self.assertEqual(5, patch_post.call_count) self.assertEqual(2, patch_add_event.call_count) self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, is_healthy=True, value='', description='') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_heartbeat(is_healthy=False) self.assertEqual(3, patch_add_event.call_count) self.assert_telemetry(call_args=patch_add_event.call_args) self.assertEqual(6, patch_post.call_count) self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, is_healthy=False, value='', description='') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_versions(is_healthy=True, response='response') self.assertEqual(7, patch_post.call_count) self.assertEqual(3, patch_add_event.call_count) self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_VERSIONS_OBSERVATION_NAME, is_healthy=True, value='response', description='') self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_versions(is_healthy=False, response='response') self.assertEqual(8, patch_post.call_count) self.assertEqual(4, patch_add_event.call_count) self.assert_telemetry(call_args=patch_add_event.call_args, response='response') self.assert_observation(call_args=patch_post.call_args, name=HealthService.HOST_PLUGIN_VERSIONS_OBSERVATION_NAME, is_healthy=False, value='response', description='') self.assertEqual(0, len(health_service.observations)) patch_post.side_effect = HttpError() health_service.report_host_plugin_versions(is_healthy=True, response='') self.assertEqual(9, patch_post.call_count) self.assertEqual(4, patch_add_event.call_count) self.assertEqual(0, len(health_service.observations))
class HostPluginProtocol(object): _is_default_channel = False FETCH_REPORTING_PERIOD = datetime.timedelta(minutes=1) STATUS_REPORTING_PERIOD = datetime.timedelta(minutes=1) def __init__(self, endpoint, container_id, role_config_name): if endpoint is None: raise ProtocolError("HostGAPlugin: Endpoint not provided") self.is_initialized = False self.is_available = False self.api_versions = None self.endpoint = endpoint self.container_id = container_id self.deployment_id = None self.role_config_name = role_config_name self.manifest_uri = None self.health_service = HealthService(endpoint) self.fetch_error_state = ErrorState( min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE) self.status_error_state = ErrorState( min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE) self.fetch_last_timestamp = None self.status_last_timestamp = None @staticmethod def is_default_channel(): return HostPluginProtocol._is_default_channel @staticmethod def set_default_channel(is_default): HostPluginProtocol._is_default_channel = is_default def ensure_initialized(self): if not self.is_initialized: self.api_versions = self.get_api_versions() self.is_available = API_VERSION in self.api_versions self.is_initialized = self.is_available from azurelinuxagent.common.event import WALAEventOperation, report_event report_event(WALAEventOperation.InitializeHostPlugin, is_success=self.is_available) return self.is_available def get_health(self): """ Call the /health endpoint :return: True if 200 received, False otherwise """ url = URI_FORMAT_HEALTH.format(self.endpoint, HOST_PLUGIN_PORT) logger.verbose("HostGAPlugin: Getting health from [{0}]", url) status_ok = False try: response = restutil.http_get(url, max_retry=1) status_ok = restutil.request_succeeded(response) except HttpError as e: logger.verbose("HostGAPlugin: Exception getting health", ustr(e)) return status_ok def get_api_versions(self): url = URI_FORMAT_GET_API_VERSIONS.format(self.endpoint, HOST_PLUGIN_PORT) logger.verbose( "HostGAPlugin: Getting API versions at [{0}]".format(url)) return_val = [] error_response = '' is_healthy = False try: headers = {HEADER_CONTAINER_ID: self.container_id} response = restutil.http_get(url, headers) if restutil.request_failed(response): error_response = restutil.read_response_error(response) logger.error( "HostGAPlugin: Failed Get API versions: {0}".format( error_response)) else: return_val = ustr(remove_bom(response.read()), encoding='utf-8') is_healthy = True except HttpError as e: logger.error( "HostGAPlugin: Exception Get API versions: {0}".format(e)) self.health_service.report_host_plugin_versions( is_healthy=is_healthy, response=error_response) return return_val def get_artifact_request(self, artifact_url, artifact_manifest_url=None): if not self.ensure_initialized(): raise ProtocolError( "HostGAPlugin: Host plugin channel is not available") if textutil.is_str_none_or_whitespace(artifact_url): raise ProtocolError( "HostGAPlugin: No extension artifact url was provided") url = URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint, HOST_PLUGIN_PORT) headers = { HEADER_VERSION: API_VERSION, HEADER_CONTAINER_ID: self.container_id, HEADER_HOST_CONFIG_NAME: self.role_config_name, HEADER_ARTIFACT_LOCATION: artifact_url } if artifact_manifest_url is not None: headers[HEADER_ARTIFACT_MANIFEST_LOCATION] = artifact_manifest_url return url, headers def report_fetch_health(self, uri, is_healthy=True, source='', response=''): if uri != URI_FORMAT_GET_EXTENSION_ARTIFACT.format( self.endpoint, HOST_PLUGIN_PORT): return if self.should_report(is_healthy, self.fetch_error_state, self.fetch_last_timestamp, HostPluginProtocol.FETCH_REPORTING_PERIOD): self.fetch_last_timestamp = datetime.datetime.utcnow() health_signal = self.fetch_error_state.is_triggered() is False self.health_service.report_host_plugin_extension_artifact( is_healthy=health_signal, source=source, response=response) def report_status_health(self, is_healthy, response=''): if self.should_report(is_healthy, self.status_error_state, self.status_last_timestamp, HostPluginProtocol.STATUS_REPORTING_PERIOD): self.status_last_timestamp = datetime.datetime.utcnow() health_signal = self.status_error_state.is_triggered() is False self.health_service.report_host_plugin_status( is_healthy=health_signal, response=response) @staticmethod def should_report(is_healthy, error_state, last_timestamp, period): """ Determine whether a health signal should be reported :param is_healthy: whether the current measurement is healthy :param error_state: the error state which is tracking time since failure :param last_timestamp: the last measurement time stamp :param period: the reporting period :return: True if the signal should be reported, False otherwise """ if is_healthy: # we only reset the error state upon success, since we want to keep # reporting the failure; this is different to other uses of error states # which do not have a separate periodicity error_state.reset() else: error_state.incr() if last_timestamp is None: last_timestamp = datetime.datetime.utcnow() - period return datetime.datetime.utcnow() >= (last_timestamp + period) def put_vm_log(self, content): raise NotImplementedError("Unimplemented") def put_vm_status(self, status_blob, sas_url, config_blob_type=None): """ Try to upload the VM status via the host plugin /status channel :param sas_url: the blob SAS url to pass to the host plugin :param config_blob_type: the blob type from the extension config :type status_blob: StatusBlob """ if not self.ensure_initialized(): raise ProtocolError("HostGAPlugin: HostGAPlugin is not available") if status_blob is None or status_blob.vm_status is None: raise ProtocolError("HostGAPlugin: Status blob was not provided") logger.verbose("HostGAPlugin: Posting VM status") blob_type = status_blob.type if status_blob.type else config_blob_type if blob_type == "BlockBlob": self._put_block_blob_status(sas_url, status_blob) else: self._put_page_blob_status(sas_url, status_blob) def _put_block_blob_status(self, sas_url, status_blob): url = URI_FORMAT_PUT_VM_STATUS.format(self.endpoint, HOST_PLUGIN_PORT) response = restutil.http_put(url, data=self._build_status_data( sas_url, status_blob.get_block_blob_headers( len(status_blob.data)), bytearray(status_blob.data, encoding='utf-8')), headers=self._build_status_headers()) if restutil.request_failed(response): error_response = restutil.read_response_error(response) is_healthy = not restutil.request_failed_at_hostplugin(response) self.report_status_health(is_healthy=is_healthy, response=error_response) raise HttpError("HostGAPlugin: Put BlockBlob failed: {0}".format( error_response)) else: self.report_status_health(is_healthy=True) logger.verbose("HostGAPlugin: Put BlockBlob status succeeded") def _put_page_blob_status(self, sas_url, status_blob): url = URI_FORMAT_PUT_VM_STATUS.format(self.endpoint, HOST_PLUGIN_PORT) # Convert the status into a blank-padded string whose length is modulo 512 status = bytearray(status_blob.data, encoding='utf-8') status_size = int((len(status) + 511) / 512) * 512 status = bytearray(status_blob.data.ljust(status_size), encoding='utf-8') # First, initialize an empty blob response = restutil.http_put( url, data=self._build_status_data( sas_url, status_blob.get_page_blob_create_headers(status_size)), headers=self._build_status_headers()) if restutil.request_failed(response): error_response = restutil.read_response_error(response) is_healthy = not restutil.request_failed_at_hostplugin(response) self.report_status_health(is_healthy=is_healthy, response=error_response) raise HttpError( "HostGAPlugin: Failed PageBlob clean-up: {0}".format( error_response)) else: self.report_status_health(is_healthy=True) logger.verbose("HostGAPlugin: PageBlob clean-up succeeded") # Then, upload the blob in pages if sas_url.count("?") <= 0: sas_url = "{0}?comp=page".format(sas_url) else: sas_url = "{0}&comp=page".format(sas_url) start = 0 end = 0 while start < len(status): # Create the next page end = start + min(len(status) - start, MAXIMUM_PAGEBLOB_PAGE_SIZE) page_size = int((end - start + 511) / 512) * 512 buf = bytearray(page_size) buf[0:end - start] = status[start:end] # Send the page response = restutil.http_put( url, data=self._build_status_data( sas_url, status_blob.get_page_blob_page_headers(start, end), buf), headers=self._build_status_headers()) if restutil.request_failed(response): error_response = restutil.read_response_error(response) is_healthy = not restutil.request_failed_at_hostplugin( response) self.report_status_health(is_healthy=is_healthy, response=error_response) raise HttpError("HostGAPlugin Error: Put PageBlob bytes " "[{0},{1}]: {2}".format( start, end, error_response)) # Advance to the next page (if any) start = end def _build_status_data(self, sas_url, blob_headers, content=None): headers = [] for name in iter(blob_headers.keys()): headers.append({ 'headerName': name, 'headerValue': blob_headers[name] }) data = {'requestUri': sas_url, 'headers': headers} if not content is None: data['content'] = self._base64_encode(content) return json.dumps(data, sort_keys=True) def _build_status_headers(self): return { HEADER_VERSION: API_VERSION, "Content-type": "application/json", HEADER_CONTAINER_ID: self.container_id, HEADER_HOST_CONFIG_NAME: self.role_config_name } def _base64_encode(self, data): s = base64.b64encode(bytes(data)) if PY_VERSION_MAJOR > 2: return s.decode('utf-8') return s