def send_host_plugin_heartbeat(self):
    """
    Report the HostGAPlugin health signal, at most once per
    HOST_PLUGIN_HEARTBEAT_PERIOD.

    Each run queries the host plugin, resets or increments the host plugin
    error state accordingly, and reports 'healthy' for as long as that error
    state has not triggered. Any exception is turned into a failed
    HostPluginHeartbeat telemetry event instead of propagating.
    """
    period = MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

    # Back-date the initial timestamp so the first call reports immediately.
    if self.last_host_plugin_heartbeat is None:
        self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - period

    # Guard clause: nothing to do until a full period has elapsed.
    if datetime.datetime.utcnow() < (self.last_host_plugin_heartbeat + period):
        return

    try:
        plugin = self.protocol.client.get_host_plugin()
        plugin.ensure_initialized()

        if plugin.get_health():
            self.host_plugin_errorstate.reset()
        else:
            self.host_plugin_errorstate.incr()

        is_healthy = self.host_plugin_errorstate.is_triggered() is False
        logger.verbose("HostGAPlugin health: {0}", is_healthy)
        self.health_service.report_host_plugin_heartbeat(is_healthy)
    except Exception as error:
        failure = "Exception sending host plugin heartbeat: {0}".format(ustr(error))
        add_event(name=AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.HostPluginHeartbeat,
                  is_success=False,
                  message=failure,
                  log_event=False)

    # The timestamp is refreshed whether or not the report succeeded.
    self.last_host_plugin_heartbeat = datetime.datetime.utcnow()
def is_log_collection_allowed():
    """
    Decide whether periodic log collection may run, and record the decision.

    Three conditions must all hold:
      1) Log collection is enabled in the configuration.
      2) Cgroups are enabled (needed for resource limiting of the collection).
      3) The Python version is newer than 2.6, so the ZipFile library used
         when collecting is supported.

    The outcome is logged and emitted as a LogCollection telemetry event.
    Returns the combined result of the three checks.
    """
    conf_enabled = conf.get_collect_logs()
    cgroups_enabled = CGroupConfigurator.get_instance().enabled()

    if PY_VERSION_MAJOR == 2:
        supported_python = PY_VERSION_MINOR >= 7
    else:
        supported_python = PY_VERSION_MAJOR == 3

    is_allowed = conf_enabled and cgroups_enabled and supported_python

    msg = ("Checking if log collection is allowed at this time [{0}]. All three conditions must be met: "
           "configuration enabled [{1}], cgroups enabled [{2}], python supported: [{3}]").format(
        is_allowed, conf_enabled, cgroups_enabled, supported_python)
    logger.info(msg)
    add_event(name=AGENT_NAME,
              version=CURRENT_VERSION,
              op=WALAEventOperation.LogCollection,
              is_success=is_allowed,
              message=msg,
              log_event=False)

    return is_allowed
def test_save_event_cleanup(self):
    """
    Seed a fresh event directory with 2000 event files, add one more event,
    and check that only 1000 files remain: the oldest survivor is
    'test event 1001' and the newest is the event just added.
    """
    event_dir = tempfile.mkdtemp()
    init_event_logger(event_dir)

    first_id = 1491004920536531
    for index in range(2000):
        event_path = os.path.join(event_dir, '{0}.tld'.format(ustr(first_id + index)))
        with open(event_path, 'w') as handle:
            handle.write('test event {0}'.format(index))

    found = os.listdir(event_dir)
    self.assertTrue(len(found) == 2000,
                    "{0} events found, 2000 expected".format(len(found)))

    add_event('test', message='last event')

    # File names are numeric ids, so sorting orders the events oldest-first.
    found = os.listdir(event_dir)
    found.sort()
    self.assertTrue(len(found) == 1000,
                    "{0} events found, 1000 expected".format(len(found)))

    with open(os.path.join(event_dir, found[0])) as oldest:
        oldest_text = oldest.read()
    self.assertTrue('test event 1001' in oldest_text)

    with open(os.path.join(event_dir, found[-1])) as newest:
        newest_text = newest.read()
    self.assertTrue('last event' in newest_text)
def send_imds_heartbeat(self):
    """
    Report the IMDS health signal, at most once per IMDS_HEARTBEAT_PERIOD.

    Each run validates IMDS through the IMDS client, resets or increments the
    IMDS error state accordingly, and reports 'healthy' for as long as that
    error state has not triggered. Any exception is turned into a failed
    ImdsHeartbeat telemetry event instead of propagating.
    """
    period = MonitorHandler.IMDS_HEARTBEAT_PERIOD

    # Back-date the initial timestamp so the first call reports immediately.
    if self.last_imds_heartbeat is None:
        self.last_imds_heartbeat = datetime.datetime.utcnow() - period

    # Guard clause: nothing to do until a full period has elapsed.
    if datetime.datetime.utcnow() < (self.last_imds_heartbeat + period):
        return

    try:
        currently_healthy, response = self.imds_client.validate()
        if currently_healthy:
            self.imds_errorstate.reset()
        else:
            self.imds_errorstate.incr()

        is_healthy = self.imds_errorstate.is_triggered() is False
        logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)
        self.health_service.report_imds_status(is_healthy, response)
    except Exception as error:
        failure = "Exception sending imds heartbeat: {0}".format(ustr(error))
        add_event(name=AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.ImdsHeartbeat,
                  is_success=False,
                  message=failure,
                  log_event=False)

    # The timestamp is refreshed whether or not the report succeeded.
    self.last_imds_heartbeat = datetime.datetime.utcnow()
def _send_heartbeat_telemetry(self, protocol):
    """
    Emit the agent heartbeat as a telemetry event and as a log line, at most
    once per UpdateHandler.TELEMETRY_HEARTBEAT_PERIOD.

    :param protocol: protocol object; only get_endpoint() is used, to query
        the firewall's dropped-packet count for that endpoint.

    The telemetry message and the log line are built from the same snapshot
    of the counters, taken BEFORE the counters are advanced/reset. Formatting
    the log line after the update would report a counter that is one ahead of
    the telemetry and an UpdateGSErrors value that is always 0.
    """
    period = UpdateHandler.TELEMETRY_HEARTBEAT_PERIOD

    # Back-date the initial timestamp so the first call reports immediately.
    if self._last_telemetry_heartbeat is None:
        self._last_telemetry_heartbeat = datetime.utcnow() - period

    if datetime.utcnow() < (self._last_telemetry_heartbeat + period):
        return

    dropped_packets = self.osutil.get_firewall_dropped_packets(protocol.get_endpoint())
    auto_update_enabled = 1 if conf.get_autoupdate_enabled() else 0

    # One snapshot feeds both messages so they can never disagree.
    heartbeat_values = (self._heartbeat_counter,
                        self._heartbeat_id,
                        dropped_packets,
                        self._heartbeat_update_goal_state_error_count,
                        auto_update_enabled)
    telemetry_msg = "{0};{1};{2};{3};{4}".format(*heartbeat_values)
    debug_log_msg = "[DEBUG HeartbeatCounter: {0};HeartbeatId: {1};DroppedPackets: {2};" \
                    "UpdateGSErrors: {3};AutoUpdate: {4}]".format(*heartbeat_values)

    # Write the heartbeat event and log line.
    add_event(name=AGENT_NAME,
              version=CURRENT_VERSION,
              op=WALAEventOperation.HeartBeat,
              is_success=True,
              message=telemetry_msg,
              log_event=False)
    logger.info(u"[HEARTBEAT] Agent {0} is running as the goal state agent {1}",
                CURRENT_AGENT, debug_log_msg)

    # Only now advance/reset the counters for the next period.
    self._heartbeat_counter += 1
    self._heartbeat_update_goal_state_error_count = 0
    self._last_telemetry_heartbeat = datetime.utcnow()
def report_ext_handlers_status(self):
    """
    Collect the status of every known extension handler into a VMStatus
    (agent marked "Ready") and report it through the protocol.

    An ExtensionError for one handler, or a ProtocolError while reporting,
    is recorded as a failed telemetry event rather than raised.
    """
    vm_status = VMStatus()
    vm_status.vmAgent.version = str(CURRENT_VERSION)
    vm_status.vmAgent.status = "Ready"
    vm_status.vmAgent.message = "Guest Agent is running"

    handlers = [] if self.ext_handlers is None else self.ext_handlers.extHandlers
    for handler in handlers:
        try:
            self.report_ext_handler_status(vm_status, handler)
        except ExtensionError as error:
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      is_success=False,
                      message=ustr(error))

    logger.verbose("Report vm agent status")
    try:
        self.protocol.report_vm_status(vm_status)
        if self.log_report:
            logger.verbose("Successfully reported vm agent status")
    except ProtocolError as error:
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  is_success=False,
                  message="Failed to report vm agent status: {0}".format(error))
def disable(self, reason):
    """
    Turn off cgroup-based resource usage monitoring.

    Clears the enabled flag, logs the reason and emits it as a
    CGroupsDisabled telemetry event, then resets the CPU quota and the
    cgroups telemetry state.

    :param reason: text interpolated into the log/telemetry message.
    """
    self._cgroups_enabled = False

    message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason)
    # Logged as INFO for now; in the future it should be logged as WARNING.
    logger.info(message)
    add_event(op=WALAEventOperation.CGroupsDisabled,
              message=message,
              is_success=False,
              log_event=False)

    self.__reset_cpu_quota()
    CGroupsTelemetry.reset()
def log_cgroup_warn(format_string, *args):
    """
    Format a cgroups warning, write it to the log at WARN level and emit it
    as a failed CGroupsInfo telemetry event.

    :param format_string: str.format-style template.
    :param args: positional values substituted into the template.
    """
    warning = format_string.format(*args)
    logger.warn(warning)
    add_event(op=WALAEventOperation.CGroupsInfo,
              message=warning,
              is_success=False,
              log_event=False)
def test_save_event_cleanup(self):
    """
    Seed self.tmp_dir with 2000 event files, add one more event, and check
    that only 1000 files remain: the oldest survivor is 'test event 1002'
    and the newest is the event just added.
    """
    first_id = 1491004920536531
    for index in range(2000):
        event_path = os.path.join(self.tmp_dir, '{0}.tld'.format(ustr(first_id + index)))
        with open(event_path, 'w') as handle:
            handle.write('test event {0}'.format(index))

    found = os.listdir(self.tmp_dir)
    self.assertTrue(len(found) == 2000,
                    "{0} events found, 2000 expected".format(len(found)))

    add_event('test', message='last event')

    # File names are numeric ids, so sorting orders the events oldest-first.
    found = os.listdir(self.tmp_dir)
    found.sort()
    self.assertTrue(len(found) == 1000,
                    "{0} events found, 1000 expected".format(len(found)))

    with open(os.path.join(self.tmp_dir, found[0])) as oldest:
        oldest_text = oldest.read()
    self.assertTrue('test event 1002' in oldest_text)

    with open(os.path.join(self.tmp_dir, found[-1])) as newest:
        newest_text = newest.read()
    self.assertTrue('last event' in newest_text)
def __collect_azure_unit_telemetry():
    """
    Log which systemd units (or, failing that, cgroups) carry an "azure" name.

    First lists systemd units matching 'azure*' and, for each one found,
    queries its "Slice" property and logs it. Only when no such unit was
    found does it fall back to scanning the output of 'systemd-cgls' for an
    azure.slice entry. Command failures are logged as warnings, not raised.
    """
    azure_units = []

    try:
        units = shellutil.run_command(['systemctl', 'list-units', 'azure*', '-all'])
        for line in units.split('\n'):
            # Capture the unit name (first token starting with "azure", case-insensitive);
            # the whole line is kept as the unit's description.
            match = re.match(r'\s?(azure[^\s]*)\s?', line, re.IGNORECASE)
            if match is not None:
                azure_units.append((match.group(1), line))
    except shellutil.CommandError as command_error:
        _log_cgroup_warning("Failed to list systemd units: {0}", ustr(command_error))

    for unit_name, unit_description in azure_units:
        # Falls back to "Unknown" when the Slice property cannot be queried.
        unit_slice = "Unknown"
        try:
            unit_slice = systemd.get_unit_property(unit_name, "Slice")
        except Exception as exception:
            _log_cgroup_warning("Failed to query Slice for {0}: {1}", unit_name, ustr(exception))

        _log_cgroup_info("Found an Azure unit under slice {0}: {1}", unit_slice, unit_description)

    if len(azure_units) == 0:
        try:
            cgroups = shellutil.run_command('systemd-cgls')
            for line in cgroups.split('\n'):
                # The line must start with one or more characters above U+00FF,
                # presumably the tree-drawing glyphs systemd-cgls prints - TODO confirm.
                if re.match(r'[^\x00-\xff]+azure\.slice\s*', line, re.UNICODE):
                    # Note: the full command output is logged once per matching line.
                    logger.info(ustr("Found a cgroup for azure.slice\n{0}").format(cgroups))
                    # Don't add the output of systemd-cgls to the telemetry, since currently it does not support Unicode
                    add_event(op=WALAEventOperation.CGroupsInfo, message="Found a cgroup for azure.slice")
        except shellutil.CommandError as command_error:
            # NOTE(review): this message says "systemd units" although the failing command is systemd-cgls.
            _log_cgroup_warning("Failed to list systemd units: {0}", ustr(command_error))
def send_telemetry_metrics(self):
    """
    Emit the tracked cgroup performance metrics as an ExtensionMetricsData
    telemetry event, at most once per CGROUP_TELEMETRY_REPORTING_PERIOD.

    The send_telemetry_metrics would soon be removed in favor of sending
    performance metrics directly.

    :return: None
    """
    now = datetime.datetime.utcnow()

    # If there is an issue in reporting, it should not take down whole monitor thread.
    try:
        if not self.last_cgroup_report_telemetry:
            self.last_cgroup_report_telemetry = now

        due_at = self.last_cgroup_report_telemetry + MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD
        if now < due_at:
            return

        performance_metrics = CGroupsTelemetry.report_all_tracked()
        self.last_cgroup_report_telemetry = now

        # Nothing is emitted when there are no metrics to report.
        if not performance_metrics:
            return

        message = generate_extension_metrics_telemetry_dictionary(
            schema_version=1.0, performance_metrics=performance_metrics)
        add_event(name=AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.ExtensionMetricsData,
                  is_success=True,
                  message=ustr(message),
                  log_event=False)
    except Exception as error:
        logger.warn("Could not report all the tracked telemetry due to {0}", ustr(error))
def _download(self):
    """
    Download the agent package, trying each package URI in turn.

    For every URI a direct fetch is attempted first; when it fails and the
    host plugin is available, the same artifact is requested through the
    host plugin. The loop stops at the first successful fetch.

    :raises UpdateError: if no package file exists after all URIs were tried
        (a failed Download telemetry event is emitted first).
    """
    for uri in self.pkg.uris:
        if self._fetch(uri.uri):
            break

        host_available = self.host is not None and self.host.ensure_initialized()
        if not host_available:
            logger.warn("Download unsuccessful, host plugin not available")
            continue

        logger.warn("Download unsuccessful, falling back to host plugin")
        host_uri, headers = self.host.get_artifact_request(uri.uri, self.host.manifest_uri)
        if host_uri is not None and headers is not None and self._fetch(host_uri, headers=headers):
            break

    if os.path.isfile(self.get_agent_pkg_path()):
        return

    msg = u"Unable to download Agent {0} from any URI".format(self.name)
    add_event(AGENT_NAME,
              op=WALAEventOperation.Download,
              version=CURRENT_VERSION,
              is_success=False,
              message=msg)
    raise UpdateError(msg)
def send_extension_healthstore_heartbeat(self):
    """
    Ask the health service to report extension health observations, at most
    once per EXTENSION_HEALTHSTORE_HEARTBEAT_PERIOD.

    Any exception is turned into a failed ExtensionHeathstoreHeartbeat
    telemetry event instead of propagating.
    """
    period = MonitorHandler.EXTENSION_HEALTHSTORE_HEARTBEAT_PERIOD

    # Back-date the initial timestamp so the first call reports immediately.
    if self.last_extension_healthstore_heartbeat is None:
        self.last_extension_healthstore_heartbeat = datetime.datetime.utcnow() - period

    if datetime.datetime.utcnow() < (self.last_extension_healthstore_heartbeat + period):
        return

    try:
        self.health_service.report_extension_health_observations()
    except Exception as error:
        failure = "Exception sending extension healthstore heartbeat: {0}".format(ustr(error))
        add_event(name=AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.ExtensionHeathstoreHeartbeat,
                  is_success=False,
                  message=failure,
                  log_event=False)

    # The timestamp is refreshed whether or not the report succeeded.
    self.last_extension_healthstore_heartbeat = datetime.datetime.utcnow()
def get_status_file_path(self, extension=None):
    """
    Work out which sequence number to use and the matching status file path.

    The largest sequence number reported by get_largest_seq_no() is the
    default. When the extension carries a sequenceNumber that parses as an
    int, that value wins; a disagreement between the two is recorded as a
    SequenceNumberMismatch telemetry event. An unparsable value is logged
    and ignored.

    :param extension: optional object exposing a `sequenceNumber` attribute.
    :return: (seq_no, path); path is None when seq_no is not greater than -1.
    """
    seq_no = self.get_largest_seq_no()

    # Issue 1116: use the sequence number from goal state where possible
    has_goal_state_seq_no = extension is not None and extension.sequenceNumber is not None
    if has_goal_state_seq_no:
        try:
            gs_seq_no = int(extension.sequenceNumber)
            if gs_seq_no != seq_no:
                mismatch = "Goal state: {0}, disk: {1}".format(gs_seq_no, seq_no)
                add_event(AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.SequenceNumberMismatch,
                          is_success=False,
                          message=mismatch,
                          log_event=False)
            seq_no = gs_seq_no
        except ValueError:
            logger.error('Sequence number [{0}] does not appear to be valid'.format(
                extension.sequenceNumber))

    if seq_no > -1:
        return seq_no, os.path.join(self.get_status_dir(), "{0}.status".format(seq_no))
    return seq_no, None
def _ensure_downloaded(self):
    """
    Make sure this agent's package has been downloaded and unpacked.

    Does nothing when the agent is already downloaded. Otherwise downloads
    and unpacks it, then logs and emits a successful Install telemetry event.

    :raises UpdateError: if the agent has no package information.
    """
    logger.verbose(u"Ensuring Agent {0} is downloaded", self.name)

    if self.is_downloaded:
        logger.verbose(u"Agent {0} was previously downloaded - skipping download", self.name)
        return

    if self.pkg is None:
        raise UpdateError(u"Agent {0} is missing package and download URIs".format(self.name))

    self._download()
    self._unpack()

    success_message = u"Agent {0} downloaded successfully".format(self.name)
    logger.verbose(success_message)
    add_event(AGENT_NAME,
              version=self.version,
              op=WALAEventOperation.Install,
              is_success=True,
              message=success_message)
def test_save_event(self):
    """Adding one event creates exactly one file in tmp_dir, with a .tld extension."""
    add_event('test', message='test event')

    file_count = len(os.listdir(self.tmp_dir))
    self.assertTrue(file_count == 1)

    # checking the extension of the file created.
    for event_file in os.listdir(self.tmp_dir):
        extension = event_file[-4:]
        self.assertEqual(".tld", extension)
def report_ext_handlers_status(self):
    """
    Collect the status of every known extension handler into a VMStatus
    (agent marked "Ready") and report it through the protocol.

    An ExtensionError for one handler, or a ProtocolError while reporting,
    is recorded as a failed ExtensionProcessing telemetry event rather than
    raised.
    """
    vm_status = VMStatus(status="Ready", message="Guest Agent is running")

    handlers = [] if self.ext_handlers is None else self.ext_handlers.extHandlers
    for handler in handlers:
        try:
            self.report_ext_handler_status(vm_status, handler)
        except ExtensionError as error:
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=ustr(error))

    logger.verbose("Report vm agent status")
    try:
        self.protocol.report_vm_status(vm_status)
        if self.log_report:
            logger.verbose("Completed vm agent status report")
    except ProtocolError as error:
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.ExtensionProcessing,
                  is_success=False,
                  message="Failed to report vm agent status: {0}".format(error))
def send_imds_heartbeat(self):
    """
    Validate IMDS and report the resulting health signal.

    The IMDS error state is reset on a healthy validation and incremented
    otherwise; 'healthy' is reported for as long as that error state has not
    triggered. This variant does no period bookkeeping of its own. Any
    exception is turned into a failed ImdsHeartbeat telemetry event.
    """
    try:
        currently_healthy, response = self.imds_client.validate()

        update_error_state = self.imds_errorstate.reset if currently_healthy else self.imds_errorstate.incr
        update_error_state()

        is_healthy = self.imds_errorstate.is_triggered() is False
        logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)
        self.health_service.report_imds_status(is_healthy, response)
    except Exception as error:
        failure = "Exception sending imds heartbeat: {0}".format(ustr(error))
        add_event(name=AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.ImdsHeartbeat,
                  is_success=False,
                  message=failure,
                  log_event=False)
def test_collect_and_send_with_call_wireserver_returns_http_error(self, mock_lib_dir, *args):
    """
    When the telemetry POST yields an HttpError, the monitor handler logs
    exactly one warning and the event directory ends up empty.
    """
    mock_lib_dir.return_value = self.lib_dir
    fileutil.mkdir(self.event_dir)
    add_event(name="MonitorTests",
              op=WALAEventOperation.HeartBeat,
              is_success=True,
              message="Test heartbeat")

    with _create_monitor_handler(enabled_operations=["collect_and_send_events"]) as monitor_handler:

        def fail_telemetry_posts(url, _, **__):
            # Only telemetry requests get the error; everything else is left alone.
            return HttpError("A test exception") if self.is_telemetry_request(url) else None

        wire_protocol = monitor_handler.get_mock_wire_protocol()
        wire_protocol.set_http_handlers(http_post_handler=fail_telemetry_posts)

        with patch("azurelinuxagent.common.logger.warn") as mock_warn:
            monitor_handler.run_and_wait()

            self.assertEqual(1, mock_warn.call_count)
            self.assertEqual(0, len(os.listdir(self.event_dir)))
def _download(self):
    """
    Download the agent package, trying each package URI in turn.

    Unless the host plugin is already the default channel, a direct fetch is
    attempted first. Otherwise (or when the direct fetch fails) the artifact
    is requested through the host plugin; a successful host-plugin download
    makes the host plugin the default channel.

    :raises UpdateError: if no package file exists after all URIs were tried
        (a failed Download telemetry event is emitted first).
    """
    for uri in self.pkg.uris:
        if not HostPluginProtocol.is_default_channel() and self._fetch(uri.uri):
            break

        if self.host is None or not self.host.ensure_initialized():
            logger.error("No download channels available")
            continue

        if HostPluginProtocol.is_default_channel():
            logger.verbose("Using host plugin as default channel")
        else:
            logger.warn("Download unsuccessful, falling back to host plugin")

        host_uri, headers = self.host.get_artifact_request(uri.uri, self.host.manifest_uri)
        if not self._fetch(host_uri, headers=headers):
            logger.warn("Host plugin download unsuccessful")
            continue

        # The host plugin worked: remember it as the default channel.
        if not HostPluginProtocol.is_default_channel():
            logger.verbose("Setting host plugin as default channel")
            HostPluginProtocol.set_default_channel(True)
        break

    if os.path.isfile(self.get_agent_pkg_path()):
        return

    msg = u"Unable to download Agent {0} from any URI".format(self.name)
    add_event(AGENT_NAME,
              op=WALAEventOperation.Download,
              version=CURRENT_VERSION,
              is_success=False,
              message=msg)
    raise UpdateError(msg)
def run(self, child_args=None):
    """
    Daemon entry point: log version information, prepare the environment and
    cgroups, then keep running self.daemon() for as long as self.running.

    An exception escaping the daemon is reported as an UnhandledError
    telemetry event (with the traceback), after which the loop sleeps
    15 seconds and restarts the daemon.

    :param child_args: passed through unchanged to self.daemon().
    """
    startup_banner = (
        ("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION),
        ("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION),
        ("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO),
    )
    for banner_args in startup_banner:
        logger.info(*banner_args)

    self.check_pid()
    self.initialize_environment()
    CGroups.setup()

    # If FIPS is enabled, set the OpenSSL environment variable
    # Note:
    # -- Subprocesses inherit the current environment
    if conf.get_fips_enabled():
        os.environ[OPENSSL_FIPS_ENVIRONMENT] = '1'

    while self.running:
        try:
            self.daemon(child_args)
        except Exception:
            trace = traceback.format_exc()
            add_event(name=AGENT_NAME,
                      is_success=False,
                      message=ustr(trace),
                      op=WALAEventOperation.UnhandledError)
            logger.warn("Daemon ended with exception -- Sleep 15 seconds and restart daemon")
            time.sleep(15)
def report_event(self, message="", is_success=True):
    """
    Emit a telemetry event for this extension handler, tagged with the
    handler's name and version and the current self.operation.

    :param message: event message text.
    :param is_success: whether the operation is reported as successful.
    """
    handler_version = self.ext_handler.properties.version
    handler_name = self.ext_handler.name
    add_event(name=handler_name,
              version=handler_version,
              op=self.operation,
              is_success=is_success,
              message=message)
def run(self, child_args=None):
    """
    Daemon entry point: log version information, prepare the environment and
    cgroups, then keep running self.daemon() for as long as self.running.

    An exception escaping the daemon is reported as an UnhandledError
    telemetry event (with the traceback), after which the loop sleeps
    15 seconds and restarts the daemon.

    :param child_args: passed through unchanged to self.daemon().
    """
    logger.info("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION)
    logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION)
    logger.info("Python: {0}.{1}.{2}",
                PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)

    self.check_pid()
    self.initialize_environment()
    CGroups.setup()

    # If FIPS is enabled, set the OpenSSL environment variable
    # Note:
    # -- Subprocesses inherit the current environment
    fips_enabled = conf.get_fips_enabled()
    if fips_enabled:
        os.environ[OPENSSL_FIPS_ENVIRONMENT] = '1'

    while self.running:
        try:
            self.daemon(child_args)
        except Exception:
            formatted_traceback = traceback.format_exc()
            add_event(name=AGENT_NAME,
                      op=WALAEventOperation.UnhandledError,
                      is_success=False,
                      message=ustr(formatted_traceback))
            logger.warn("Daemon ended with exception -- Sleep 15 seconds and restart daemon")
            time.sleep(15)
def report_event(self, message, is_success=False, duration=0, operation=WALAEventOperation.Provision):
    """
    Emit a telemetry event on behalf of the agent.

    :param message: event message text.
    :param is_success: whether the operation is reported as successful.
    :param duration: duration value forwarded to the event.
    :param operation: operation the event is filed under (Provision by default).
    """
    event_fields = dict(name=AGENT_NAME,
                        message=message,
                        duration=duration,
                        is_success=is_success,
                        op=operation)
    add_event(**event_fields)
def _evaluate_deployments(self):
    """
    Process agents that are in safe-deployment mode and not yet deployed.

    Those agents are enabled, the agents each of them blacklists are
    blacklisted, and each is then marked deployed. A failure is logged as a
    warning. A Deploy telemetry event carrying the outcome is emitted only
    when at least one such agent was selected.
    """
    pending = []
    succeeded = False
    failure_message = ""

    try:
        pending = [agent for agent in self.agents
                   if agent.in_safe_deployment_mode and not agent.safe_deploy.is_deployed]
        self._enable_agents(pending)

        # Collect every blacklist up front, before any of them is applied.
        blacklists = [agent.safe_deploy.blacklisted for agent in pending]
        for blacklist in blacklists:
            self._blacklist_agents(blacklist)

        for agent in pending:
            agent.mark_deployed()

        succeeded = True
    except Exception as error:
        failure_message = "Exception evaluating agents for safe deployment: {0}".format(error)
        logger.warn(failure_message)

    if pending:
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.Deploy,
                  is_success=succeeded,
                  message=failure_message)
def get_status_file_path(self, extension=None):
    """
    Return the sequence number to use together with its status file path.

    Starts from get_largest_seq_no(). If the extension supplies a
    sequenceNumber that converts to int, that number takes precedence, and a
    SequenceNumberMismatch telemetry event is emitted when it differs from
    the starting value. A value that cannot be converted is logged as an
    error and otherwise ignored.

    :param extension: optional object exposing a `sequenceNumber` attribute.
    :return: (seq_no, path); path stays None unless seq_no is greater than -1.
    """
    disk_seq_no = self.get_largest_seq_no()
    seq_no = disk_seq_no

    # Issue 1116: use the sequence number from goal state where possible
    if extension is not None and extension.sequenceNumber is not None:
        try:
            goal_state_seq_no = int(extension.sequenceNumber)
            if goal_state_seq_no != disk_seq_no:
                add_event(AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.SequenceNumberMismatch,
                          is_success=False,
                          message="Goal state: {0}, disk: {1}".format(goal_state_seq_no, disk_seq_no),
                          log_event=False)
            seq_no = goal_state_seq_no
        except ValueError:
            logger.error('Sequence number [{0}] does not appear to be valid'.format(extension.sequenceNumber))

    status_path = None
    if seq_no > -1:
        status_file = "{0}.status".format(seq_no)
        status_path = os.path.join(self.get_status_dir(), status_file)

    return seq_no, status_path
def _download(self):
    """
    Download the agent package over HTTP, trying each package URI in turn
    and stopping at the first response with status OK, whose body is written
    to the agent package path.

    An HttpError for one URI is logged as a warning and the next URI is tried.

    :raises UpdateError: if no package file exists after all URIs were tried
        (a failed Download telemetry event is emitted first).
    """
    for uri in self.pkg.uris:
        try:
            response = restutil.http_get(uri.uri, chk_proxy=True)
            if response.status != restutil.httpclient.OK:
                continue

            payload = response.read()
            fileutil.write_file(self.get_agent_pkg_path(), bytearray(payload), asbin=True)
            logger.info(u"Agent {0} downloaded from {1}", self.name, uri.uri)
            break
        except restutil.HttpError:
            logger.warn(u"Agent {0} download from {1} failed", self.name, uri.uri)

    if os.path.isfile(self.get_agent_pkg_path()):
        return

    msg = u"Unable to download Agent {0} from any URI".format(self.name)
    add_event(AGENT_NAME,
              op=WALAEventOperation.Download,
              version=CURRENT_VERSION,
              is_success=False,
              message=msg)
    raise UpdateError(msg)
def run(self, child_args=None):
    """
    Daemon entry point: log version information, prepare the environment,
    then keep running self.daemon() for as long as self.running.

    An exception escaping the daemon is reported as an UnhandledError
    telemetry event (with the traceback), after which the loop sleeps
    15 seconds and restarts the daemon.

    :param child_args: passed through unchanged to self.daemon().
    """
    #
    # The Container ID in telemetry events is retrieved from the goal state. We can fetch the goal state
    # only after protocol detection, which is done during provisioning.
    #
    # Be aware that telemetry events emitted before that will not include the Container ID.
    #
    startup_banner = (
        ("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION),
        ("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION),
        ("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO),
    )
    for banner_args in startup_banner:
        logger.info(*banner_args)

    self.check_pid()
    self.initialize_environment()

    # If FIPS is enabled, set the OpenSSL environment variable
    # Note:
    # -- Subprocesses inherit the current environment
    if conf.get_fips_enabled():
        os.environ[OPENSSL_FIPS_ENVIRONMENT] = '1'

    while self.running:
        try:
            self.daemon(child_args)
        except Exception:
            trace = traceback.format_exc()
            add_event(name=AGENT_NAME,
                      is_success=False,
                      message=ustr(trace),
                      op=WALAEventOperation.UnhandledError)
            logger.warn("Daemon ended with exception -- Sleep 15 seconds and restart daemon")
            time.sleep(15)
def test_should_log_errors_if_failed_operation_and_not_empty_event_dir(
        self, mock_logger_info, mock_logger_warn, mock_logger_error, mock_reporter):
    """
    A failed event that should be emitted is logged exactly once at ERROR
    level (never at WARN or INFO), with the event's name, operation, message
    and a 0 as the trailing positional arguments of the log call.

    Uses assertEqual rather than the deprecated assertEquals alias, which no
    longer exists in unittest from Python 3.12 onwards.
    """
    mock_reporter.event_dir = "dummy"

    with patch("azurelinuxagent.common.event.should_emit_event", return_value=True) as mock_should_emit_event:
        with patch("azurelinuxagent.common.event.mark_event_status"):
            with patch("azurelinuxagent.common.event.EventLogger._add_event"):
                add_event("dummy name",
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.Download,
                          is_success=False,
                          message="dummy event message")

                self.assertEqual(1, mock_should_emit_event.call_count)
                self.assertEqual(1, mock_logger_error.call_count)
                self.assertEqual(0, mock_logger_warn.call_count)
                self.assertEqual(0, mock_logger_info.call_count)

                # args[0] is the log format string; the remaining positionals
                # are the values substituted into it.
                args = mock_logger_error.call_args[0]
                self.assertEqual(('dummy name', 'Download', 'dummy event message', 0), args[1:])
def test_save_event_message_with_non_ascii_characters(self):
    """
    An event whose message comes from stdout/stderr files containing
    non-ASCII characters is saved and read back with 8 parameters, and with
    its Name and Message intact.
    """
    test_data_dir = os.path.join(data_dir, "events", "collect_and_send_extension_stdout_stderror")
    stdout_path = os.path.join(test_data_dir, "dummy_stdout_with_non_ascii_characters")
    stderr_path = os.path.join(test_data_dir, "dummy_stderr_with_non_ascii_characters")

    msg = ""
    with open(stdout_path, mode="r+b") as stdout:
        with open(stderr_path, mode="r+b") as stderr:
            msg = read_output(stdout, stderr)

    duration = elapsed_milliseconds(datetime.utcnow())

    # Blank lines are dropped from the captured output before it is logged.
    non_empty_lines = [line for line in msg.split('\n') if line != ""]
    log_msg = "{0}\n{1}".format("DummyCmd", "\n".join(non_empty_lines))

    add_event('test_extension', message=log_msg, duration=duration)

    for tld_file in os.listdir(self.tmp_dir):
        event_str = MonitorHandler.collect_event(os.path.join(self.tmp_dir, tld_file))
        event_json = json.loads(event_str)

        self.assertEqual(len(event_json["parameters"]), 8)

        for parameter in event_json["parameters"]:
            if parameter["name"] == "Name":
                self.assertEqual(parameter["value"], "test_extension")
            if parameter["name"] == "Message":
                self.assertEqual(parameter["value"], log_msg)
def _download(self):
    """
    Download the agent package, trying each package URI in turn.

    A direct fetch is attempted first unless the host plugin is already the
    default channel. If the direct path is skipped or fails, the artifact is
    requested through the host plugin; when that succeeds the host plugin
    becomes the default channel.

    :raises UpdateError: if no package file exists after all URIs were tried
        (a failed Download telemetry event is emitted first).
    """
    for package_uri in self.pkg.uris:
        if not HostPluginProtocol.is_default_channel() and self._fetch(package_uri.uri):
            break

        host_ready = self.host is not None and self.host.ensure_initialized()
        if not host_ready:
            logger.error("No download channels available")
            continue

        if not HostPluginProtocol.is_default_channel():
            logger.warn("Download unsuccessful, falling back to host plugin")
        else:
            logger.verbose("Using host plugin as default channel")

        artifact_uri, headers = self.host.get_artifact_request(package_uri.uri, self.host.manifest_uri)
        if self._fetch(artifact_uri, headers=headers):
            # The host plugin worked: remember it as the default channel.
            if not HostPluginProtocol.is_default_channel():
                logger.verbose("Setting host plugin as default channel")
                HostPluginProtocol.set_default_channel(True)
            break

        logger.warn("Host plugin download unsuccessful")

    if not os.path.isfile(self.get_agent_pkg_path()):
        msg = u"Unable to download Agent {0} from any URI".format(self.name)
        add_event(AGENT_NAME,
                  op=WALAEventOperation.Download,
                  version=CURRENT_VERSION,
                  is_success=False,
                  message=msg)
        raise UpdateError(msg)
def __log_network_setup_service_logs(self):
    """
    Fetch the network setup service's journal entries since boot, log them,
    and emit a PersistFirewallRules telemetry event.

    The event's message is the journal output when it could be fetched, or
    the description of the fetch failure otherwise; its success flag is the
    negation of __verify_network_setup_service_failed().
    """
    # Get logs from journalctl - https://www.freedesktop.org/software/systemd/man/journalctl.html
    cmd = ["journalctl", "-u", self._network_setup_service_name, "-b", "--utc"]
    service_failed = self.__verify_network_setup_service_failed()

    try:
        journal_output = shellutil.run_command(cmd)
        msg = ustr("Logs from the {0} since system boot:\n {1}").format(
            self._network_setup_service_name, journal_output)
        logger.info(msg)
    except CommandError as error:
        msg = "Unable to fetch service logs, Command: {0} failed with ExitCode: {1}\nStdout: {2}\nStderr: {3}".format(
            ' '.join(cmd), error.returncode, error.stdout, error.stderr)
        logger.warn(msg)
    except Exception as error:
        msg = "Ran into unexpected error when getting logs for {0} service. Error: {1}".format(
            self._network_setup_service_name, textutil.format_exception(error))
        logger.warn(msg)

    # Log service status and logs if we can fetch them from journalctl and send it to Kusto,
    # else just log the error of the failure of fetching logs
    add_event(op=WALAEventOperation.PersistFirewallRules,
              is_success=(not service_failed),
              message=msg,
              log_event=False)
def __init__(self):
    """
    Ensures the cgroups file system is mounted and selects the correct API to interact with it.

    After construction:
      * self._cgroups_supported - what the OS utility reports about cgroups support.
      * self._enabled - True only when the platform supports cgroups AND both
        mounting and API creation succeeded.
      * self._cgroups_api - the created API object, or None whenever cgroups
        are unsupported or initialization failed.

    The outcome is logged and emitted as an InitializeCGroups telemetry event.
    """
    osutil = get_osutil()

    self._cgroups_supported = osutil.is_cgroups_supported()
    # Define the attribute on every path: if mounting or API creation raises,
    # it would otherwise never be assigned and any later access would fail
    # with AttributeError instead of seeing "no API available".
    self._cgroups_api = None

    if self._cgroups_supported:
        self._enabled = True
        try:
            osutil.mount_cgroups()
            self._cgroups_api = CGroupsApi.create()
            status = "The cgroup filesystem is ready to use"
        except Exception as e:
            # Best effort: a failure here disables cgroups rather than aborting construction.
            status = ustr(e)
            self._enabled = False
    else:
        self._enabled = False
        status = "Cgroups are not supported by the platform"

    logger.info("CGroups Status: {0}".format(status))

    add_event(AGENT_NAME,
              version=CURRENT_VERSION,
              op=WALAEventOperation.InitializeCGroups,
              is_success=self._enabled,
              message=status,
              log_event=False)
def test_save_event_message_with_non_ascii_characters(self):
    """
    An event whose message comes from stdout/stderr files containing
    non-ASCII characters is saved with exactly 15 parameters, each holding
    either the value passed to add_event or the expected default. The clock,
    process id and thread name are patched so those fields are predictable.
    """
    test_data_dir = os.path.join(data_dir, "events", "collect_and_send_extension_stdout_stderror")
    stdout_path = os.path.join(test_data_dir, "dummy_stdout_with_non_ascii_characters")
    stderr_path = os.path.join(test_data_dir, "dummy_stderr_with_non_ascii_characters")

    msg = ""
    with open(stdout_path, mode="r+b") as stdout:
        with open(stderr_path, mode="r+b") as stderr:
            msg = read_output(stdout, stderr)

    duration = elapsed_milliseconds(datetime.utcnow())

    # Blank lines are dropped from the captured output before it is logged.
    non_empty_lines = [line for line in msg.split('\n') if line != ""]
    log_msg = "{0}\n{1}".format("DummyCmd", "\n".join(non_empty_lines))

    with patch("azurelinuxagent.common.event.datetime") as patch_datetime:
        patch_datetime.utcnow = Mock(return_value=datetime.strptime("2019-01-01 01:30:00", '%Y-%m-%d %H:%M:%S'))
        with patch('os.getpid', return_value=42):
            with patch("threading.Thread.getName", return_value="HelloWorldTask"):
                add_event('test_extension', message=log_msg, duration=duration)

    for tld_file in os.listdir(self.tmp_dir):
        event_str = MonitorHandler.collect_event(os.path.join(self.tmp_dir, tld_file))
        event_json = json.loads(event_str)

        self.assertEqual(len(event_json["parameters"]), 15)

        # Checking the contents passed above, and also validating the default values that were passed in.
        expected_values = {
            "Name": "test_extension",
            "Message": log_msg,
            "Version": str(CURRENT_VERSION),
            'IsInternal': False,
            'Operation': 'Unknown',
            'OperationSuccess': True,
            'Duration': 0,
            'ExtensionType': '',
            'ContainerId': 'UNINITIALIZED',
            'OpcodeName': '2019-01-01 01:30:00',
            'EventTid': threading.current_thread().ident,
            'EventPid': 42,
            'TaskName': 'HelloWorldTask',
            'KeywordName': '',
            'GAVersion': str(CURRENT_AGENT),
        }

        for parameter in event_json["parameters"]:
            field_name = parameter["name"]
            if field_name in expected_values:
                self.assertEqual(parameter["value"], expected_values[field_name])
            else:
                self.assertFalse(True, "Contains a field outside the defaults expected. Field Name: {0}".
                                 format(parameter['name']))
def send_cgroup_telemetry(self):
    """
    Collect the metrics of all tracked cgroups and report them, at most once
    per MonitorHandler.CGROUP_TELEMETRY_PERIOD.

    Every metric with a value above 0 is reported via report_metric. In
    addition, a CGroupsLimitsCrossed event is emitted when a "Memory" metric
    reaches the cgroup's "memory" threshold or a "Process" metric reaches its
    "cpu" threshold. Afterwards the set of tracked extension cgroups is
    refreshed from the current handlers. Failures in either step are logged
    as warnings and never propagate.

    The period check uses self.last_cgroup_telemetry, the timestamp this
    method initialises and refreshes. Comparing against
    self.last_telemetry_heartbeat would tie this schedule to a different
    timestamp that this method never updates.
    """
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = datetime.datetime.utcnow()

    if datetime.datetime.utcnow() >= (self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
        try:
            metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
            for cgroup_name, metrics in metric_reported.items():
                thresholds = metric_threshold[cgroup_name]

                for metric_group, metric_name, value in metrics:
                    if value > 0:
                        report_metric(metric_group, metric_name, cgroup_name, value)

                    if metric_group == "Memory" and value >= thresholds["memory"]:
                        msg = "CGroup {0}: Crossed the Memory Threshold. Current Value:{1}, Threshold:{2}.".format(
                            cgroup_name, value, thresholds["memory"])
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)

                    if metric_group == "Process" and value >= thresholds["cpu"]:
                        msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format(
                            cgroup_name, value, thresholds["cpu"])
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)
        except Exception as e:
            logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
            logger.verbose(traceback.format_exc())

        # Look for extension cgroups we're not already tracking and track them
        try:
            CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
        except Exception as e:
            logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
            logger.verbose(traceback.format_exc())

        self.last_cgroup_telemetry = datetime.datetime.utcnow()
def send_cgroup_telemetry(self):
    """
    Collect the metrics of all tracked cgroups and report them, at most once
    per MonitorHandler.CGROUP_TELEMETRY_PERIOD.

    Every metric with a value above 0 is reported via report_metric. In
    addition, a warning is logged and a CGroupsLimitsCrossed event emitted
    when a "Memory" metric reaches the cgroup's memory limit or a "Process"
    metric reaches its cpu limit. Afterwards the set of tracked extension
    cgroups is refreshed from the current handlers. Failures in either step
    are logged as warnings and never propagate.

    The period check uses self.last_cgroup_telemetry, the timestamp this
    method initialises and refreshes. Comparing against
    self.last_telemetry_heartbeat would tie this schedule to a different
    timestamp that this method never updates.
    """
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = datetime.datetime.utcnow()

    if datetime.datetime.utcnow() >= (self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
        try:
            metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
            for cgroup_name, metrics in metric_reported.items():
                thresholds = metric_threshold[cgroup_name]

                for metric_group, metric_name, value in metrics:
                    if value > 0:
                        report_metric(metric_group, metric_name, cgroup_name, value)

                    if metric_group == "Memory":
                        # Memory is collected in bytes, and limit is set in megabytes.
                        if value >= CGroups._format_memory_value('megabytes', thresholds.memory_limit):
                            msg = "CGroup {0}: Crossed the Memory Threshold. " \
                                  "Current Value: {1} bytes, Threshold: {2} megabytes." \
                                .format(cgroup_name, value, thresholds.memory_limit)
                            logger.warn(msg)
                            add_event(name=AGENT_NAME,
                                      version=CURRENT_VERSION,
                                      op=WALAEventOperation.CGroupsLimitsCrossed,
                                      is_success=True,
                                      message=msg,
                                      log_event=True)

                    if metric_group == "Process":
                        if value >= thresholds.cpu_limit:
                            msg = "CGroup {0}: Crossed the Processor Threshold. " \
                                  "Current Value: {1}, Threshold: {2}." \
                                .format(cgroup_name, value, thresholds.cpu_limit)
                            logger.warn(msg)
                            add_event(name=AGENT_NAME,
                                      version=CURRENT_VERSION,
                                      op=WALAEventOperation.CGroupsLimitsCrossed,
                                      is_success=True,
                                      message=msg,
                                      log_event=True)
        except Exception as e:
            logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
            logger.verbose(traceback.format_exc())

        # Look for extension cgroups we're not already tracking and track them
        try:
            CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
        except Exception as e:
            logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
            logger.verbose(traceback.format_exc())

        self.last_cgroup_telemetry = datetime.datetime.utcnow()
def daemon(self):
    """
    Loop forever: emit a HeartBeat event whenever more than 30 minutes have
    elapsed since the previous one, then collect and send events, and sleep
    60 seconds before the next iteration.

    A failure in collect_and_send_events() is logged as a warning and does
    not stop the loop.
    """
    heartbeat_interval = datetime.timedelta(minutes=30)
    # datetime.min guarantees a heartbeat on the very first iteration.
    previous_heartbeat = datetime.datetime.min

    while True:
        elapsed = datetime.datetime.now() - previous_heartbeat
        if elapsed > heartbeat_interval:
            previous_heartbeat = datetime.datetime.now()
            add_event(op=WALAEventOperation.HeartBeat,
                      name=CURRENT_AGENT,
                      version=CURRENT_VERSION,
                      is_success=True)

        try:
            self.collect_and_send_events()
        except Exception as error:
            logger.warn("Failed to send events: {0}", error)

        time.sleep(60)
def _report_failures(self):
    """
    Emit one failed HealthObservation event per unhealthy observation in
    self.observations, carrying the observation's as_obj serialized as JSON.

    Any exception is logged at verbose level and not propagated.
    """
    try:
        logger.verbose("HealthService: report failures as telemetry")
        # Imported locally, as in the original implementation.
        from azurelinuxagent.common.event import add_event, WALAEventOperation

        unhealthy = (obs for obs in self.observations if not obs.is_healthy)
        for observation in unhealthy:
            payload = json.dumps(observation.as_obj)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HealthObservation,
                      is_success=False,
                      message=payload)
    except Exception as error:
        logger.verbose("HealthService: could not report failures: {0}".format(ustr(error)))
def _emit_restart_event(self):
    """
    Log and emit a failed Restart event when self._is_clean_start is false,
    then call self._set_sentinal() unconditionally.
    """
    unclean_shutdown = not self._is_clean_start
    if unclean_shutdown:
        text = u"{0} did not terminate cleanly".format(CURRENT_AGENT)
        logger.info(text)
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.Restart,
                  is_success=False,
                  message=text)

    self._set_sentinal()
def setup(suppress_process_add=False):
    """
    Only needs to be called once, and should be called from the -daemon instance of the agent.
        Mount the cgroup fs if necessary
        Create wrapper cgroups for agent-plus-extensions and set limits on them;
        Add this process to the "agent" cgroup, if required
    Actual collection of metrics from cgroups happens in the -run-exthandlers instance

    :param suppress_process_add: when True, only the cgroup filesystem is
        mounted; wrapper groups are not created and the process is not
        added to (or tracked in) any cgroup.

    Any failure disables cgroups; the outcome is logged and reported through
    an InitializeCGroups event.
    """
    if not CGroups.enabled():
        status = "not supported by platform"
        CGroups.disable()
    else:
        try:
            CGroups._osutil.mount_cgroups()
            if not suppress_process_add:
                # Creates /sys/fs/cgroup/{cpu,memory}/WALinuxAgent wrapper cgroup
                CGroups._setup_wrapper_groups()
                daemon_pid = int(os.getpid())
                if not CGroups.is_systemd_manager():
                    # Creates /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent cgroup
                    agent_cgroup = CGroups.for_extension(AGENT_CGROUP_NAME)
                    logger.info("Daemon process id {0} is tracked in cgroup {1}".format(
                        daemon_pid, agent_cgroup.name))
                    agent_cgroup.add(daemon_pid)
                    agent_cgroup.set_limits()
                else:
                    # When daemon is running as a service, it's called walinuxagent.service
                    # and is created and tracked by systemd, so we don't explicitly add the PID ourselves,
                    # just track it for our reporting purposes
                    service_name = AGENT_CGROUP_NAME.lower() + ".service"
                    agent_cgroup = CGroups.for_systemd_service(service_name)
                    logger.info("Daemon process id {0} is tracked in systemd cgroup {1}".format(
                        daemon_pid, agent_cgroup.name))
                    # systemd sets limits; any limits we write would be overwritten
            status = "successfully set up agent cgroup"
        except CGroupsException as cgroups_error:
            status = cgroups_error.msg
            CGroups.disable()
        except Exception as generic_error:
            status = ustr(generic_error)
            CGroups.disable()

    logger.info("CGroups: {0}".format(status))

    from azurelinuxagent.common.event import add_event, WALAEventOperation
    add_event(AGENT_NAME,
              version=CURRENT_VERSION,
              op=WALAEventOperation.InitializeCGroups,
              is_success=CGroups.enabled(),
              message=status,
              log_event=False)
def _ensure_partition_assigned(self):
    """
    Assign the VM to a partition (0 - 99). Downloaded updates may be configured
    to run on only some VMs; the assigned partition determines eligibility.

    Nothing happens when the partition file already exists. Otherwise the
    partition is derived from the current UTC microsecond, written to the
    partition file and reported through a Partition event.
    """
    if os.path.exists(self._partition_file):
        return

    microsecond = datetime.utcnow().microsecond
    assigned = ustr(int(microsecond / 10000))
    fileutil.write_file(self._partition_file, assigned)
    add_event(AGENT_NAME,
              version=CURRENT_VERSION,
              op=WALAEventOperation.Partition,
              is_success=True,
              message=assigned)
def __init__(self, path=None, pkg=None, host=None):
    """
    Load an agent either from an on-disk agent directory or from a package.

    :param path: agent directory; when given, the version is taken from the
        first group of AGENT_DIR_PATTERN matched against it.
    :param pkg: package description; its ``version`` is used when no path
        is given.
    :param host: stored on the instance as ``self.host``.
    :raises UpdateError: when ``path`` does not match AGENT_DIR_PATTERN or
        no version could be determined.
    :raises ResourceGoneError, IOError: re-raised unchanged from the
        download/load step.

    Any other exception from the download/load step marks the agent as
    failed and is reported as a failed Install event instead of propagating.
    """
    self.pkg = pkg
    self.host = host
    version = None
    if path is not None:
        m = AGENT_DIR_PATTERN.match(path)
        if m is None:
            raise UpdateError(u"Illegal agent directory: {0}".format(path))
        version = m.group(1)
    elif self.pkg is not None:
        version = pkg.version

    if version is None:
        raise UpdateError(u"Illegal agent version: {0}".format(version))
    self.version = FlexibleVersion(version)

    location = u"disk" if path is not None else u"package"
    logger.verbose(u"Loading Agent {0} from {1}", self.name, location)

    self.error = GuestAgentError(self.get_agent_error_file())
    self.error.load()

    try:
        self._ensure_downloaded()
        self._ensure_loaded()
    except Exception as e:
        # ResourceGoneError is propagated as-is.
        #
        # The agent was improperly blacklisting versions due to a timeout
        # encountered while downloading a later version. Errors of type
        # socket.error are IOError, so this should provide sufficient
        # protection against a large class of I/O operation failures.
        if isinstance(e, (ResourceGoneError, IOError)):
            raise

        # Note the failure, blacklist the agent if the package downloaded
        # - An exception with a downloaded package indicates the package
        #   is corrupt (e.g., missing the HandlerManifest.json file)
        self.mark_failure(is_fatal=os.path.isfile(self.get_agent_pkg_path()))

        msg = u"Agent {0} install failed with exception: {1}".format(
            self.name, ustr(e))
        logger.warn(msg)
        add_event(
            AGENT_NAME,
            version=self.version,
            op=WALAEventOperation.Install,
            is_success=False,
            message=msg)
def activate_resource_disk(self):
    """
    Mount the resource disk using the configured mount point and filesystem
    and write the data-loss warning file into it.

    :return: the mount point returned by mount_resource_disk(); None
        (implicitly) when mounting raised ResourceDiskError, in which case
        the error is logged and reported as a failed ActivateResourceDisk
        event.

    Failing to write the warning file only logs a warning.
    """
    logger.info("Activate resource disk")
    try:
        configured_mount_point = conf.get_resourcedisk_mountpoint()
        filesystem = conf.get_resourcedisk_filesystem()
        mount_point = self.mount_resource_disk(configured_mount_point, filesystem)

        warning_path = os.path.join(mount_point, DATALOSS_WARNING_FILE_NAME)
        try:
            fileutil.write_file(warning_path, DATA_LOSS_WARNING)
        except IOError as io_error:
            logger.warn("Failed to write data loss warnning:{0}", io_error)

        return mount_point
    except ResourceDiskError as disk_error:
        logger.error("Failed to mount resource disk {0}", disk_error)
        add_event(name="WALA",
                  is_success=False,
                  message=ustr(disk_error),
                  op=WALAEventOperation.ActivateResourceDisk)
def _emit_restart_event(self):
    """
    When self._is_clean_start is false, log and emit a failed Restart event
    whose message includes the content of the sentinel file.

    This is best effort: any exception is swallowed so that the caller is
    never interrupted, but it is recorded at verbose level instead of being
    discarded silently.
    """
    try:
        if not self._is_clean_start:
            msg = u"Agent did not terminate cleanly: {0}".format(
                fileutil.read_file(self._sentinel_file_path()))
            logger.info(msg)
            add_event(
                AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.Restart,
                is_success=False,
                message=msg)
    except Exception as e:
        # Keep the best-effort contract (never raise), but leave a trace so
        # a missing restart event can be diagnosed.
        logger.verbose("Failed to emit restart event: {0}", e)

    return
def _ensure_downloaded(self):
    """
    Make sure the agent package is available locally.

    Blacklisted agents are skipped. Previously downloaded agents only get
    their manifest loaded. Otherwise the package is downloaded and
    unpacked, its manifest, error state and supported state are loaded, and
    a successful Install event is emitted.

    Any exception (including the UpdateError raised for a missing package)
    marks the agent as failed and is reported as a failed Install event; it
    is not propagated.
    """
    try:
        logger.verbose(u"Ensuring Agent {0} is downloaded", self.name)

        if self.is_blacklisted:
            logger.info(u"Agent {0} is blacklisted - skipping download", self.name)
            return

        if self.is_downloaded:
            logger.verbose(u"Agent {0} was previously downloaded - skipping download", self.name)
            self._load_manifest()
            return

        if self.pkg is None:
            raise UpdateError(u"Agent {0} is missing package and download URIs".format(
                self.name))

        # Each step runs in the original order; the first failure aborts the rest.
        for step in (self._download,
                     self._unpack,
                     self._load_manifest,
                     self._load_error,
                     self._load_supported):
            step()

        success_msg = u"Agent {0} downloaded successfully".format(self.name)
        logger.verbose(success_msg)
        add_event(AGENT_NAME,
                  version=self.version,
                  op=WALAEventOperation.Install,
                  is_success=True,
                  message=success_msg)

    except Exception as error:
        # Note the failure, blacklist the agent if the package downloaded
        # - An exception with a downloaded package indicates the package
        #   is corrupt (e.g., missing the HandlerManifest.json file)
        package_present = os.path.isfile(self.get_agent_pkg_path())
        self.mark_failure(is_fatal=package_present)

        failure_msg = u"Agent {0} download failed with exception: {1}".format(self.name, ustr(error))
        logger.warn(failure_msg)
        add_event(AGENT_NAME,
                  version=self.version,
                  op=WALAEventOperation.Install,
                  is_success=False,
                  message=failure_msg)
def run(self):
    """
    Log agent, OS and Python version information, call self.check_pid(),
    then keep calling self.daemon() for as long as self.running is true.

    When daemon() raises, the traceback is reported as an UnhandledError
    event and the loop retries after sleeping 15 seconds.
    """
    logger.info("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION)
    logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION)
    logger.info("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR,
                PY_VERSION_MICRO)

    self.check_pid()

    while self.running:
        try:
            self.daemon()
        except Exception:
            trace_text = traceback.format_exc()
            add_event("WALA",
                      is_success=False,
                      message=ustr(trace_text),
                      op=WALAEventOperation.UnhandledError)
            logger.info("Sleep 15 seconds and restart daemon")
            time.sleep(15)
def set_limits(self):
    """
    Set per-hierarchy limits based on the cgroup name (agent or particular extension)

    Nothing is done when limit enforcement is disabled in the configuration,
    when the cgroup has no name, or when the name contains one of the
    excluded entries. The outcome is always reported through a
    SetCGroupsLimits event; an exception from setting either limit is
    re-raised after being appended to the event message.
    """
    if not conf.get_cgroups_enforce_limits() or self.name is None:
        return

    for excluded in conf.get_cgroups_excluded():
        if excluded in self.name.lower():
            logger.info('No cgroups limits for {0}'.format(self.name))
            return

    # Extension defaults: fixed CPU limit, memory as a percentage of total
    # memory with a lower bound.
    pct_of_total = round(self._osutil.get_total_mem() * DEFAULT_MEM_LIMIT_PCT / 100, 0)
    mem_limit = max(DEFAULT_MEM_LIMIT_MIN_MB, pct_of_total)
    cpu_limit = DEFAULT_CPU_LIMIT_EXT

    # The agent's own cgroup gets its own CPU limit and a capped memory limit.
    is_agent = AGENT_NAME.lower() in self.name.lower()
    if is_agent:
        cpu_limit = DEFAULT_CPU_LIMIT_AGENT
        mem_limit = min(DEFAULT_MEM_LIMIT_MAX_MB, mem_limit)

    msg = '{0}: {1}% {2}mb'.format(self.name, cpu_limit, mem_limit)
    logger.info("Setting cgroups limits for {0}".format(msg))

    success = False
    try:
        self.set_cpu_limit(cpu_limit)
        self.set_memory_limit(mem_limit)
        success = True
    except Exception as limit_error:
        msg = '[{0}] {1}'.format(msg, ustr(limit_error))
        raise
    finally:
        from azurelinuxagent.common.event import add_event, WALAEventOperation
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.SetCGroupsLimits,
                  is_success=success,
                  message=msg,
                  log_event=False)
def run(self):
    """
    When self.os_util.jit_enabled is set, fetch the protocol and its current
    incarnation; if the incarnation differs from the stored one, store it,
    fetch the remote access configuration and handle it.

    Any exception is logged as an error (with traceback) and reported as a
    failed RemoteAccessHandling event; it is not propagated.
    """
    try:
        if not self.os_util.jit_enabled:
            return

        self.protocol = self.protocol_util.get_protocol()
        latest_incarnation = self.protocol.get_incarnation()
        if self.incarnation == latest_incarnation:
            return

        # something changed. Handle remote access if any.
        self.incarnation = latest_incarnation
        self.remote_access = self.protocol.client.get_remote_access()
        self.handle_remote_access()
    except Exception as error:
        details = u"Exception processing remote access handler: {0} {1}".format(
            ustr(error), traceback.format_exc())
        logger.error(details)
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.RemoteAccessHandling,
                  is_success=False,
                  message=details)
def setup(suppress_process_add=False):
    """
    Only needs to be called once, and should be called from the -daemon instance of the agent.
        Mount the cgroup fs if necessary
        Create wrapper cgroups for agent-plus-extensions and set limits on them;
        Add this process to the "agent" cgroup, if required
    Actual collection of metrics from cgroups happens in the -run-exthandlers instance

    :param suppress_process_add: when True, only the cgroup filesystem is
        mounted; wrapper groups are not created and the process is not
        added to any cgroup.

    Any failure disables cgroups; the outcome is logged and reported through
    an InitializeCGroups event.
    """
    if not CGroups.enabled():
        status = "not supported by platform"
        CGroups.disable()
    else:
        try:
            CGroups._osutil.mount_cgroups()
            if not suppress_process_add:
                CGroups._setup_wrapper_groups()
                daemon_pid = int(os.getpid())
                if CGroups.is_systemd_manager():
                    agent_cgroup = CGroups.for_systemd_service(AGENT_NAME)
                    logger.info("Add daemon process pid {0} to {1} systemd cgroup".format(
                        daemon_pid, agent_cgroup.name))
                    # systemd sets limits; any limits we write would be overwritten
                else:
                    agent_cgroup = CGroups.for_extension(AGENT_NAME)
                    logger.info("Add daemon process pid {0} to {1} cgroup".format(
                        daemon_pid, agent_cgroup.name))
                    agent_cgroup.add(daemon_pid)
                    agent_cgroup.set_limits()
            status = "ok"
        except CGroupsException as cgroups_error:
            status = cgroups_error.msg
            CGroups.disable()
        except Exception as generic_error:
            status = ustr(generic_error)
            CGroups.disable()

    logger.info("CGroups: {0}".format(status))

    from azurelinuxagent.common.event import add_event, WALAEventOperation
    add_event(AGENT_NAME,
              version=CURRENT_VERSION,
              op=WALAEventOperation.InitializeCGroups,
              is_success=CGroups.enabled(),
              message=status,
              log_event=False)
def run(self):
    """
    Fetch the extension handlers and their etag from the protocol, handle
    them, remember the etag in self.last_etag and report handler status.

    A ProtocolError while fetching is logged as a warning, reported as a
    failed event, and ends the run early.
    """
    self.ext_handlers = None
    etag = None
    try:
        self.protocol = self.protocol_util.get_protocol()
        self.ext_handlers, etag = self.protocol.get_ext_handlers()
    except ProtocolError as protocol_error:
        failure = u"Exception retrieving extension handlers: {0}".format(ustr(protocol_error))
        logger.warn(failure)
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  is_success=False,
                  message=failure)
        return

    logger.verbose(u"Handle extensions updates for incarnation {0}".format(etag))

    # Log status report success on new config
    self.log_report = True
    self.handle_ext_handlers(etag)
    self.last_etag = etag

    self.report_ext_handlers_status()
def set_limits(self):
    """
    Set per-hierarchy limits based on the cgroup name (agent or particular extension)

    Nothing is done when limit enforcement is disabled in the configuration,
    when the cgroup has no name, or when the name contains one of the
    excluded entries. The outcome is always reported through a
    SetCGroupsLimits event; an exception from setting either limit is
    re-raised after being appended to the event message. On success the
    applied limits are stored in self.threshold.
    """
    if not conf.get_cgroups_enforce_limits() or self.name is None:
        return

    for excluded in conf.get_cgroups_excluded():
        if excluded in self.name.lower():
            logger.info('No cgroups limits for {0}'.format(self.name))
            return

    cpu_limit = self.get_cpu_limits()
    mem_limit = self.get_memory_limits()

    msg = '{0}: {1}% {2}mb'.format(self.name, cpu_limit, mem_limit)
    logger.info("Setting cgroups limits for {0}".format(msg))

    success = False
    try:
        self.set_cpu_limit(cpu_limit)
        self.set_memory_limit(mem_limit)
        success = True
    except Exception as limit_error:
        msg = '[{0}] {1}'.format(msg, ustr(limit_error))
        raise
    finally:
        from azurelinuxagent.common.event import add_event, WALAEventOperation
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.SetCGroupsLimits,
                  is_success=success,
                  message=msg,
                  log_event=False)

    # Record the limits that were applied (only reached when no exception was raised).
    self.threshold = {"cpu": cpu_limit, "memory": mem_limit}
def run(self):
    """
    Fetch the extension handlers and their etag, then process them.

    Fetch phase: on success the get-artifact error state is reset. On
    failure it is incremented; once it is triggered a failed
    GetArtifactExtended event is emitted and the state is reset, otherwise
    the failure is only logged as a warning. Either way the run ends.

    Processing phase: handle the extension handlers, remember the etag in
    self.last_etag, report status and clean up outdated handlers. Any
    exception is logged as a warning and reported as a failed
    ExtensionProcessing event.

    No exception is propagated from either phase.
    """
    self.ext_handlers, etag = None, None
    try:
        self.protocol = self.protocol_util.get_protocol()
        self.ext_handlers, etag = self.protocol.get_ext_handlers()
        self.get_artifact_error_state.reset()
    except Exception as e:
        msg = u"Exception retrieving extension handlers: {0}".format(ustr(e))
        self.get_artifact_error_state.incr()

        if self.get_artifact_error_state.is_triggered():
            # The placeholder must be "{0}"; the previous "{0)" made
            # str.format() raise ValueError from inside this handler.
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.GetArtifactExtended,
                      is_success=False,
                      message="Failed to get extension artifact for over "
                              "{0}: {1}".format(self.get_artifact_error_state.min_timedelta, msg))
            self.get_artifact_error_state.reset()
        else:
            logger.warn(msg)
        return

    try:
        msg = u"Handle extensions updates for incarnation {0}".format(etag)
        logger.verbose(msg)
        # Log status report success on new config
        self.log_report = True
        self.handle_ext_handlers(etag)
        self.last_etag = etag

        self.report_ext_handlers_status()
        self.cleanup_outdated_handlers()
    except Exception as e:
        msg = u"Exception processing extension handlers: {0}".format(
            ustr(e))
        logger.warn(msg)
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.ExtensionProcessing,
                  is_success=False,
                  message=msg)
        return
def _download(self):
    """
    Try to download the agent package from each package URI in random order.

    For every URI the direct channel is tried first unless the host plugin
    is already the default channel. If that is skipped or fails and a host
    plugin is available and initialized, the download is attempted through
    it; on success the host plugin becomes the default channel.

    :raises ResourceGoneError: re-raised from the host plugin fetch, after
        making the host plugin the default channel.
    :raises UpdateError: when no package file exists after all attempts (a
        failed Download event is emitted first).
    """
    # NOTE: this is the package's own list, so the shuffle reorders
    # self.pkg.uris in place.
    candidates = self.pkg.uris
    random.shuffle(candidates)

    for candidate in candidates:
        if not HostPluginProtocol.is_default_channel() and self._fetch(candidate.uri):
            break

        elif self.host is not None and self.host.ensure_initialized():
            if HostPluginProtocol.is_default_channel():
                logger.verbose("Using host plugin as default channel")
            else:
                logger.warn("Download failed, switching to host plugin")

            host_uri, headers = self.host.get_artifact_request(candidate.uri, self.host.manifest_uri)
            try:
                fetched = self._fetch(host_uri, headers=headers, use_proxy=False)
                if fetched:
                    if not HostPluginProtocol.is_default_channel():
                        logger.verbose("Setting host plugin as default channel")
                        HostPluginProtocol.set_default_channel(True)
                    break

                logger.warn("Host plugin download failed")

            # If the HostPlugin rejects the request,
            # let the error continue, but set to use the HostPlugin
            except ResourceGoneError:
                HostPluginProtocol.set_default_channel(True)
                raise

        else:
            logger.error("No download channels available")

    if os.path.isfile(self.get_agent_pkg_path()):
        return

    failure = u"Unable to download Agent {0} from any URI".format(self.name)
    add_event(AGENT_NAME,
              op=WALAEventOperation.Download,
              version=CURRENT_VERSION,
              is_success=False,
              message=failure)
    raise UpdateError(failure)
def send_telemetry_heartbeat(self):
    """
    Emit a HeartBeat event once every
    MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD.

    The message is "incarnation;counter;heartbeat_id;dropped_packets". After
    a heartbeat the counter is incremented and, if any I/O error counter is
    positive, an HttpErrors event with the per-category counts is emitted.
    Any exception is logged as a warning; the timestamp is updated whether
    or not the heartbeat succeeded.
    """
    period = MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

    if self.last_telemetry_heartbeat is None:
        # Back-date the timestamp so the first call sends immediately.
        self.last_telemetry_heartbeat = datetime.datetime.utcnow() - period

    if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + period):
        try:
            incarnation = self.protocol.get_incarnation()
            dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint)
            heartbeat_msg = "{0};{1};{2};{3}".format(
                incarnation, self.counter, self.heartbeat_id, dropped_packets)

            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HeartBeat,
                      is_success=True,
                      message=heartbeat_msg,
                      log_event=False)

            self.counter += 1

            io_errors = IOErrorCounter.get_and_reset()
            hostplugin_errors = io_errors.get("hostplugin")
            protocol_errors = io_errors.get("protocol")
            other_errors = io_errors.get("other")

            counts = (hostplugin_errors, protocol_errors, other_errors)
            if any(count > 0 for count in counts):
                errors_msg = "hostplugin:{0};protocol:{1};other:{2}".format(*counts)
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HttpErrors,
                          is_success=True,
                          message=errors_msg,
                          log_event=False)
        except Exception as error:
            logger.warn("Failed to send heartbeat: {0}", error)

        self.last_telemetry_heartbeat = datetime.datetime.utcnow()
def _download(self):
    """
    Try each package URI in order: first directly, then (when a host is
    configured) through the artifact request built by the host plugin. Stop
    at the first successful fetch.

    :raises UpdateError: when no package file exists after all attempts (a
        failed Download event is emitted first).
    """
    for candidate in self.pkg.uris:
        if self._fetch(candidate.uri):
            break

        if self.host is None:
            continue

        logger.info("Download unsuccessful, falling back to host plugin")
        host_uri, headers = self.host.get_artifact_request(candidate.uri, self.host.manifest_uri)
        if self._fetch(host_uri, headers=headers):
            break

    if os.path.isfile(self.get_agent_pkg_path()):
        return

    failure = u"Unable to download Agent {0} from any URI".format(self.name)
    add_event(AGENT_NAME,
              op=WALAEventOperation.Download,
              version=CURRENT_VERSION,
              is_success=False,
              message=failure)
    raise UpdateError(failure)
def test_save_event_rollover(self):
    """
    After 1000 events have been written, adding one more must keep the
    event directory at 1000 files by dropping the oldest ('first event').

    The temporary event directory is removed even when an assertion fails.
    """
    tmp_evt = tempfile.mkdtemp()

    def read_event(file_name):
        # Return the text of one event file in the temporary directory.
        with open(os.path.join(tmp_evt, file_name)) as event_fh:
            return event_fh.read()

    try:
        init_event_logger(tmp_evt)
        add_event('test', message='first event')
        for i in range(0, 999):
            add_event('test', message='test event {0}'.format(i))

        events = sorted(os.listdir(tmp_evt))
        self.assertEqual(1000, len(events))

        self.assertTrue('first event' in read_event(events[0]))

        # One more event pushes the directory over its limit.
        add_event('test', message='last event')

        events = sorted(os.listdir(tmp_evt))
        self.assertEqual(1000, len(events),
                         "{0} events found, 1000 expected".format(len(events)))

        first_event_text = read_event(events[0])
        self.assertFalse('first event' in first_event_text)
        self.assertTrue('test event 0' in first_event_text)

        self.assertTrue('last event' in read_event(events[-1]))
    finally:
        shutil.rmtree(tmp_evt)
def _ensure_downloaded(self):
    """
    Download and unpack the agent package unless it was downloaded before,
    then emit a successful Install event.

    :raises UpdateError: when the agent is not downloaded and has no package.
    """
    logger.verbose(u"Ensuring Agent {0} is downloaded", self.name)

    if self.is_downloaded:
        logger.verbose(u"Agent {0} was previously downloaded - skipping download", self.name)
        return

    if self.pkg is None:
        raise UpdateError(u"Agent {0} is missing package and download URIs".format(
            self.name))

    self._download()
    self._unpack()

    success_msg = u"Agent {0} downloaded successfully".format(self.name)
    logger.verbose(success_msg)
    add_event(AGENT_NAME,
              version=self.version,
              op=WALAEventOperation.Install,
              is_success=True,
              message=success_msg)