def test_get_handler(self):
    """
    Call each component's factory function once, in a fixed order.

    There are no assertions; the test passes when none of the calls raises.
    """
    handler_factories = (
        osutil.get_osutil,
        protocol.get_protocol_util,
        dhcp.get_dhcp_handler,
        provision.get_provision_handler,
        deprovision.get_deprovision_handler,
        daemon.get_daemon_handler,
        resourcedisk.get_resourcedisk_handler,
        scvmm.get_scvmm_handler,
        monitor.get_monitor_handler,
        update.get_update_handler,
        exthandlers.get_exthandlers_handler,
    )

    # Return values are intentionally discarded
    for create in handler_factories:
        create()
def run(self):
    """
    Main loop of the goal state agent: watches for agent and extension updates.

    Starts the monitor and environment handlers, migrates handler state and
    then repeatedly runs the extension handler, sleeping GOAL_STATE_INTERVAL
    between passes. The loop ends when self.running becomes false, when the
    agent is orphaned, or when an upgrade is available. A clean end of the
    loop calls self._shutdown() and sys.exit(0); an exception raised inside
    the guarded section is logged and leads to sys.exit(1).
    """
    logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

    # Launch monitoring threads
    from azurelinuxagent.ga.monitor import get_monitor_handler
    monitor = get_monitor_handler()
    monitor.run()

    from azurelinuxagent.ga.env import get_env_handler
    environment = get_env_handler()
    environment.run()

    from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
    extensions = get_exthandlers_handler()
    migrate_handler_state()

    try:
        self._ensure_no_orphans()
        self._emit_restart_event()

        # TODO: Add means to stop running
        while self.running:
            if self._is_orphaned:
                logger.info("Goal state agent {0} was orphaned -- exiting", CURRENT_AGENT)
                break

            if self._upgrade_available():
                # The loop always ends here; the message is only logged when an agent is listed
                if len(self.agents) > 0:
                    logger.info(
                        u"Agent {0} discovered {1} as an update and will exit",
                        CURRENT_AGENT,
                        self.agents[0].name)
                break

            extensions.run()

            time.sleep(GOAL_STATE_INTERVAL)

    except Exception as error:
        logger.warn(u"Agent {0} failed with exception: {1}", CURRENT_AGENT, ustr(error))
        logger.warn(traceback.format_exc())
        sys.exit(1)
    else:
        # Reached only when the guarded section raised nothing, so the failure path
        # cannot get here even if sys.exit() returns instead of raising
        self._shutdown()
        sys.exit(0)
def test_error_heartbeat_creates_no_signal(self, patch_report_heartbeat, patch_http_get, patch_add_event, *args):
    """
    When the HTTP GET used by the host plugin heartbeat raises IOError, no
    health report is made and exactly one failed 'HostPluginHeartbeat'
    telemetry event carrying the error text is added.
    """
    handler = get_monitor_handler()

    wire_protocol = WireProtocol('endpoint')
    wire_protocol.update_goal_state = MagicMock()

    get_protocol_patch = patch('azurelinuxagent.common.protocol.util.ProtocolUtil.get_protocol',
                               return_value=wire_protocol)
    with get_protocol_patch:
        handler.init_protocols()

        # Backdate the last heartbeat by one hour
        now = datetime.datetime.utcnow()
        handler.last_host_plugin_heartbeat = now - timedelta(hours=1)

        patch_http_get.side_effect = IOError('client error')
        handler.send_host_plugin_heartbeat()

        # health report should not be made
        self.assertEqual(0, patch_report_heartbeat.call_count)

        # telemetry with failure details is sent
        self.assertEqual(1, patch_add_event.call_count)
        event_kwargs = patch_add_event.call_args[1]
        self.assertEqual('HostPluginHeartbeat', event_kwargs['op'])
        self.assertTrue('client error' in event_kwargs['message'])
        self.assertEqual(False, event_kwargs['is_success'])

        handler.stop()
def test_it_should_invoke_all_periodic_operations(self):
    """
    Run the monitor handler once with network-configuration monitoring
    enabled and once with it disabled, and check that the set of
    PeriodicOperation subclasses whose run() was invoked matches the
    expected set for each setting.
    """
    def record_operation(operation):
        # Stand-in for PeriodicOperation.run: only notes which operation class was run
        recorded.append(operation.__class__.__name__)

    always_expected = [
        PollResourceUsage.__name__,
        ReportNetworkErrors.__name__,
        ResetPeriodicLogMessages.__name__,
        SendHostPluginHeartbeat.__name__,
        SendImdsHeartbeat.__name__,
    ]

    with _mock_wire_protocol():
        # Two values of 'stopped' per handler run: one loop pass, then stop
        with patch("azurelinuxagent.ga.monitor.MonitorHandler.stopped", side_effect=[False, True, False, True]):
            with patch("time.sleep"):
                with patch.object(PeriodicOperation, "run", side_effect=record_operation, autospec=True):
                    with patch("azurelinuxagent.common.conf.get_monitor_network_configuration_changes") as monitor_network_changes:
                        for network_changes in [True, False]:
                            monitor_network_changes.return_value = network_changes

                            recorded = []

                            handler = get_monitor_handler()
                            handler.run()
                            handler.join()

                            expected = list(always_expected)
                            if network_changes:
                                expected.append(ReportNetworkConfigurationChanges.__name__)

                            self.assertEqual(sorted(recorded), sorted(expected), "The monitor thread did not invoke the expected operations")
def run(self):
    """
    This is the main loop which watches for agent and extension updates.

    Starts the monitor and environment handlers, then repeatedly runs the
    extension handler with a 25 second sleep between passes. The loop ends
    when self.running becomes false or when self._ensure_latest_agent()
    reports that a newer agent is available.

    Exits the process with status 0 when the loop ends normally and with
    status 1 when an exception is raised inside the loop.
    """
    logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

    # Launch monitoring threads
    from azurelinuxagent.ga.monitor import get_monitor_handler
    get_monitor_handler().run()

    from azurelinuxagent.ga.env import get_env_handler
    get_env_handler().run()

    from azurelinuxagent.ga.exthandlers import get_exthandlers_handler
    exthandlers_handler = get_exthandlers_handler()

    # TODO: Add means to stop running
    try:
        while self.running:
            if self._ensure_latest_agent():
                if len(self.agents) > 0:
                    logger.info(
                        u"Agent {0} discovered {1} as an update and will exit",
                        CURRENT_AGENT,
                        self.agents[0].name)
                break

            exthandlers_handler.run()

            # Pause between goal state passes (seconds)
            time.sleep(25)

    except Exception as e:
        logger.warn(u"Agent {0} failed with exception: {1}", CURRENT_AGENT, ustr(e))
        sys.exit(1)
        # Return explicitly: if sys.exit() does not terminate (e.g. it is mocked in
        # unit tests) the failure path must not fall through to sys.exit(0) below
        return

    sys.exit(0)
    return
def test_send_extension_metrics_telemetry_for_unsupported_cgroup(self, patch_periodic_warn, patch_add_metric, *args):
    """
    Polling telemetry with a tracked CGroup whose controller is "io" must
    emit one periodic warning and add no metrics.
    """
    io_cgroup = CGroup("cgroup_name", "/test/path", "io")
    CGroupsTelemetry._tracked.append(io_cgroup)

    handler = get_monitor_handler()
    handler.init_protocols()

    # Backdate both cgroup telemetry timestamps by one hour
    for timestamp_attribute in ("last_cgroup_polling_telemetry", "last_cgroup_report_telemetry"):
        setattr(handler, timestamp_attribute, datetime.datetime.utcnow() - timedelta(hours=1))

    handler.poll_telemetry_metrics()

    self.assertEqual(1, patch_periodic_warn.call_count)
    # No metrics should be sent.
    self.assertEqual(0, patch_add_metric.call_count)

    handler.stop()
def test_send_extension_metrics_telemetry(self, patch_report_all_tracked, patch_poll_all_tracked, patch_add_event, patch_add_metric, *args):
    """
    With three polled metric values and a non-empty tracked report, one
    poll and one report are performed, one event is added and three
    metrics are added.
    """
    def timestamp():
        return str(datetime.datetime.utcnow())

    def sample_series():
        # Five numeric entries followed by two timestamp strings
        return [1, 1, 1, 1, 1, timestamp(), timestamp()]

    patch_poll_all_tracked.return_value = [
        MetricValue(category, counter, 1, 1)
        for category, counter in (
            ("Process", "% Processor Time"),
            ("Memory", "Total Memory Usage"),
            ("Memory", "Max Memory Usage"),
        )
    ]

    patch_report_all_tracked.return_value = {
        "memory": {
            "cur_mem": sample_series(),
            "max_mem": sample_series()
        },
        "cpu": {
            "cur_cpu": sample_series()
        }
    }

    handler = get_monitor_handler()
    handler.init_protocols()

    # Backdate both cgroup telemetry timestamps by one hour
    for timestamp_attribute in ("last_cgroup_polling_telemetry", "last_cgroup_report_telemetry"):
        setattr(handler, timestamp_attribute, datetime.datetime.utcnow() - timedelta(hours=1))

    handler.poll_telemetry_metrics()
    handler.send_telemetry_metrics()

    self.assertEqual(1, patch_poll_all_tracked.call_count)
    self.assertEqual(1, patch_report_all_tracked.call_count)
    self.assertEqual(1, patch_add_event.call_count)
    # Three metrics being sent.
    self.assertEqual(3, patch_add_metric.call_count)

    handler.stop()
def test_send_extension_metrics_telemetry_for_empty_cgroup(self, patch_report_all_tracked, patch_poll_all_tracked, patch_add_event, patch_add_metric, *args):
    """
    When polling yields no metric values and the tracked report is empty,
    the poll and report are each performed once but no event and no metric
    is added.
    """
    patch_report_all_tracked.return_value = {}
    patch_poll_all_tracked.return_value = []

    handler = get_monitor_handler()
    handler.init_protocols()

    # Backdate both cgroup telemetry timestamps by one hour
    for timestamp_attribute in ("last_cgroup_polling_telemetry", "last_cgroup_report_telemetry"):
        setattr(handler, timestamp_attribute, datetime.datetime.utcnow() - timedelta(hours=1))

    handler.poll_telemetry_metrics()
    handler.send_telemetry_metrics()

    expected_call_counts = (
        (1, patch_poll_all_tracked),
        (1, patch_report_all_tracked),
        (0, patch_add_event),
        (0, patch_add_metric),
    )
    for expected_count, patched in expected_call_counts:
        self.assertEqual(expected_count, patched.call_count)

    handler.stop()
def test_send_extension_metrics_telemetry_handling_cpu_cgroup_exceptions_errno2(self, patch_periodic_warn, patch_cpu_usage, patch_add_metric, *args):
    """
    When reading CPU usage raises an IOError whose errno is 2, polling
    telemetry must neither emit a periodic warning nor add any metric.
    """
    cpu_usage_error = IOError()
    cpu_usage_error.errno = 2
    patch_cpu_usage.side_effect = cpu_usage_error

    cpu_cgroup = CpuCgroup("cgroup_name", "/test/path")
    CGroupsTelemetry._tracked.append(cpu_cgroup)

    handler = get_monitor_handler()
    handler.init_protocols()

    # Backdate both cgroup telemetry timestamps by one hour
    for timestamp_attribute in ("last_cgroup_polling_telemetry", "last_cgroup_report_telemetry"):
        setattr(handler, timestamp_attribute, datetime.datetime.utcnow() - timedelta(hours=1))

    handler.poll_telemetry_metrics()

    self.assertEqual(0, patch_periodic_warn.call_count)
    # No metrics should be sent.
    self.assertEqual(0, patch_add_metric.call_count)

    handler.stop()
def _create_monitor_handler(enabled_operations=None, iterations=1):
    """
    Creates an instance of MonitorHandler that
        * Uses a mock_wire_protocol for network requests,
        * Executes only the operations given in the 'enabled_operations' parameter
          (None or an empty collection means every operation is executed),
        * Runs its main loop only the number of times given in the 'iterations' parameter, and
        * Does not sleep at the end of each iteration

    The yielded MonitorHandler is augmented with 2 methods:
        * get_mock_wire_protocol() - returns the mock protocol
        * run_and_wait() - invokes run() and join() on the MonitorHandler

    NOTE(review): this is a generator that yields exactly once inside the active patches;
    presumably it is wrapped as a context manager by a decorator outside this block -- confirm.
    """
    # None stands in for the former mutable default ([]); both mean "do not filter operations"
    if enabled_operations is None:
        enabled_operations = []

    def run(self):
        # Delegate to the real PeriodicOperation.run only for enabled operations
        if not enabled_operations or self._name in enabled_operations:
            run.original_definition(self)
    # Capture the real implementation before PeriodicOperation.run is patched below
    run.original_definition = PeriodicOperation.run

    with mock_wire_protocol(DATA_FILE) as protocol:
        protocol_util = MagicMock()
        protocol_util.get_protocol = Mock(return_value=protocol)
        with patch("azurelinuxagent.ga.monitor.get_protocol_util", return_value=protocol_util):
            with patch.object(PeriodicOperation, "run", side_effect=run, autospec=True):
                # 'stopped' reports False for each requested iteration and then True
                with patch("azurelinuxagent.ga.monitor.MonitorHandler.stopped", side_effect=[False] * iterations + [True]):
                    with patch("time.sleep"):
                        def run_and_wait():
                            monitor_handler.run()
                            monitor_handler.join()

                        monitor_handler = get_monitor_handler()
                        monitor_handler.get_mock_wire_protocol = lambda: protocol
                        monitor_handler.run_and_wait = run_and_wait
                        yield monitor_handler
def run(self, debug=False):
    """
    This is the main loop which watches for agent and extension updates.

    :param debug: when True, the orphan check (self._is_orphaned) is not
        evaluated, so the loop does not exit on that condition.

    When the loop ends without an exception, self._shutdown() is called and
    the process exits with status 0. If an exception is raised anywhere in
    the guarded section, the failure message is passed to
    self._set_sentinel(), logged together with the traceback, and the
    process exits with status 1.
    """
    try:
        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        #
        # Fetch the goal state one time; some components depend on information provided by the goal state and this
        # call ensures the required info is initialized (e.g telemetry depends on the container ID.)
        #
        protocol = self.protocol_util.get_protocol()
        protocol.update_goal_state()

        initialize_event_logger_vminfo_common_parameters(protocol)

        # Log OS-specific info.
        os_info_msg = u"Distro: {0}-{1}; OSUtil: {2}; AgentService: {3}; Python: {4}.{5}.{6}".format(
            DISTRO_NAME, DISTRO_VERSION, type(self.osutil).__name__, self.osutil.service_name, PY_VERSION_MAJOR,
            PY_VERSION_MINOR, PY_VERSION_MICRO)
        logger.info(os_info_msg)
        add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg)

        # Launch monitoring threads
        from azurelinuxagent.ga.monitor import get_monitor_handler
        monitor_thread = get_monitor_handler()
        monitor_thread.run()

        from azurelinuxagent.ga.env import get_env_handler
        env_thread = get_env_handler()
        env_thread.run()

        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler(protocol)
        migrate_handler_state()

        from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
        remote_access_handler = get_remote_access_handler(protocol)

        # One-time initialization steps performed before entering the loop
        self._ensure_no_orphans()
        self._emit_restart_event()
        self._emit_changes_in_default_configuration()
        self._ensure_partition_assigned()
        self._ensure_readonly_files()
        self._ensure_cgroups_initialized()

        # The sleep between passes depends on whether extensions are enabled in the configuration
        goal_state_interval = conf.get_goal_state_period() if conf.get_extensions_enabled() else GOAL_STATE_INTERVAL_DISABLED

        while self.running:
            #
            # Check that the parent process (the agent's daemon) is still running
            #
            if not debug and self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            #
            # Check that all the threads are still running
            #
            if not monitor_thread.is_alive():
                logger.warn(u"Monitor thread died, restarting")
                monitor_thread.start()

            if not env_thread.is_alive():
                logger.warn(u"Environment thread died, restarting")
                env_thread.start()

            #
            # Process the goal state
            #
            if not protocol.try_update_goal_state():
                # Count the failed update; extensions are not processed on this pass
                self._heartbeat_update_goal_state_error_count += 1
            else:
                if self._upgrade_available(protocol):
                    available_agent = self.get_latest_agent()
                    if available_agent is None:
                        logger.info(
                            "Agent {0} is reverting to the installed agent -- exiting",
                            CURRENT_AGENT)
                    else:
                        logger.info(
                            u"Agent {0} discovered update {1} -- exiting",
                            CURRENT_AGENT,
                            available_agent.name)
                    # Leave the loop in both cases
                    break

                utc_start = datetime.utcnow()

                # Remember the etag so a change made by exthandlers_handler.run() can be detected
                last_etag = exthandlers_handler.last_etag
                exthandlers_handler.run()

                remote_access_handler.run()

                if last_etag != exthandlers_handler.last_etag:
                    self._ensure_readonly_files()
                    duration = elapsed_milliseconds(utc_start)
                    logger.info(
                        'ProcessGoalState completed [incarnation {0}; {1} ms]',
                        exthandlers_handler.last_etag, duration)
                    add_event(AGENT_NAME,
                              op=WALAEventOperation.ProcessGoalState,
                              duration=duration,
                              message="Incarnation {0}".format(
                                  exthandlers_handler.last_etag))

            # Runs on every pass, whether or not the goal state update succeeded
            self._send_heartbeat_telemetry(protocol)
            time.sleep(goal_state_interval)

    except Exception as e:
        msg = u"Agent {0} failed with exception: {1}".format(
            CURRENT_AGENT, ustr(e))
        self._set_sentinel(msg=msg)
        logger.warn(msg)
        logger.warn(traceback.format_exc())
        sys.exit(1)
        # additional return here because sys.exit is mocked in unit tests
        return

    self._shutdown()
    sys.exit(0)
def run(self):
    """
    Main loop of the goal state agent: watches for agent and extension updates.

    Starts the monitor and environment handlers, migrates handler state and
    then loops. Each pass runs the extension handler, adds a
    ProcessGoalState event, and enables the test agent (ending the loop) if
    one is returned by get_test_agent() and it is in slice. The loop also
    ends when self.running becomes false, when the agent is orphaned, or
    when an upgrade is available.

    A clean end of the loop calls self._shutdown() and sys.exit(0); an
    exception raised inside the guarded section is logged and leads to
    sys.exit(1).
    """
    logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

    # Launch monitoring threads
    from azurelinuxagent.ga.monitor import get_monitor_handler
    monitor = get_monitor_handler()
    monitor.run()

    from azurelinuxagent.ga.env import get_env_handler
    environment = get_env_handler()
    environment.run()

    from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
    extensions = get_exthandlers_handler()
    migrate_handler_state()

    try:
        # Earliest time at which the next event is logged even if the etag did not change
        next_logged_event = datetime.utcnow()

        self._ensure_no_orphans()
        self._emit_restart_event()

        while self.running:
            if self._is_orphaned:
                logger.info("Goal state agent {0} was orphaned -- exiting", CURRENT_AGENT)
                break

            if self._upgrade_available():
                # The loop always ends here; the message is only logged when an agent is listed
                if len(self.agents) > 0:
                    logger.info(
                        u"Agent {0} discovered {1} as an update and will exit",
                        CURRENT_AGENT,
                        self.agents[0].name)
                break

            started_at = datetime.utcnow()

            etag_before = extensions.last_etag
            extensions.run()

            # The clock is only consulted when the etag did not change
            etag_changed = etag_before != extensions.last_etag
            log_event = etag_changed or datetime.utcnow() >= next_logged_event

            duration = elapsed_milliseconds(started_at)
            add_event(
                AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.ProcessGoalState,
                is_success=True,
                duration=duration,
                log_event=log_event)

            if log_event:
                next_logged_event = next_logged_event + timedelta(minutes=REPORT_STATUS_INTERVAL)

            candidate = self.get_test_agent()
            if candidate is not None and candidate.in_slice:
                candidate.enable()
                logger.info(u"Enabled Agent {0} as test agent", candidate.name)
                break

            time.sleep(GOAL_STATE_INTERVAL)

    except Exception as error:
        logger.warn(u"Agent {0} failed with exception: {1}", CURRENT_AGENT, ustr(error))
        logger.warn(traceback.format_exc())
        sys.exit(1)
    else:
        # Reached only when the guarded section raised nothing, so the failure path
        # cannot get here even if sys.exit() returns instead of raising
        self._shutdown()
        sys.exit(0)
def run(self):
    """
    Main loop of the goal state agent: watches for agent and extension updates.

    Starts the monitor and environment handlers, migrates handler state and
    then repeatedly runs the extension handler, sleeping GOAL_STATE_INTERVAL
    between passes. A ProcessGoalState event is added whenever the extension
    handler's etag changes during a pass. The loop ends when self.running
    becomes false, when the agent is orphaned, or when an upgrade is
    available.

    A clean end of the loop calls self._shutdown() and sys.exit(0); an
    exception raised inside the guarded section is logged and leads to
    sys.exit(1).
    """
    logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

    # Launch monitoring threads
    from azurelinuxagent.ga.monitor import get_monitor_handler
    monitor = get_monitor_handler()
    monitor.run()

    from azurelinuxagent.ga.env import get_env_handler
    environment = get_env_handler()
    environment.run()

    from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
    extensions = get_exthandlers_handler()
    migrate_handler_state()

    try:
        self._ensure_no_orphans()
        self._emit_restart_event()
        self._ensure_partition_assigned()

        while self.running:
            if self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            if self._upgrade_available():
                newer_agent = self.get_latest_agent()
                if newer_agent is not None:
                    logger.info(
                        u"Agent {0} discovered update {1} -- exiting",
                        CURRENT_AGENT,
                        newer_agent.name)
                else:
                    logger.info(
                        "Agent {0} is reverting to the installed agent -- exiting",
                        CURRENT_AGENT)
                break

            started_at = datetime.utcnow()

            etag_before = extensions.last_etag
            extensions.run()

            if etag_before != extensions.last_etag:
                duration = elapsed_milliseconds(started_at)
                incarnation_message = "Incarnation {0}".format(extensions.last_etag)
                add_event(
                    AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.ProcessGoalState,
                    is_success=True,
                    duration=duration,
                    message=incarnation_message,
                    log_event=True)

            time.sleep(GOAL_STATE_INTERVAL)

    except Exception as error:
        logger.warn(u"Agent {0} failed with exception: {1}", CURRENT_AGENT, ustr(error))
        logger.warn(traceback.format_exc())
        sys.exit(1)
    else:
        # Kept out of the failure path because sys.exit is mocked in unit tests
        # and may therefore return instead of terminating
        self._shutdown()
        sys.exit(0)
def run(self):
    """
    This is the main loop which watches for agent and extension updates.

    When the loop ends without an exception, self._shutdown() is called and
    the process exits with status 0. If an exception is raised anywhere in
    the guarded section, the failure message is passed to
    self._set_sentinel(), logged together with the traceback, and the
    process exits with status 1.
    """
    try:
        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        # Launch monitoring threads
        from azurelinuxagent.ga.monitor import get_monitor_handler
        monitor_thread = get_monitor_handler()
        monitor_thread.run()

        from azurelinuxagent.ga.env import get_env_handler
        env_thread = get_env_handler()
        env_thread.run()

        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler()
        migrate_handler_state()

        from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
        remote_access_handler = get_remote_access_handler()

        # One-time initialization steps performed before entering the loop
        self._ensure_no_orphans()
        self._emit_restart_event()
        self._ensure_partition_assigned()
        self._ensure_readonly_files()

        while self.running:
            if self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            # Restart either background handler if its thread is no longer alive
            if not monitor_thread.is_alive():
                logger.warn(u"Monitor thread died, restarting")
                monitor_thread.start()

            if not env_thread.is_alive():
                logger.warn(u"Environment thread died, restarting")
                env_thread.start()

            if self._upgrade_available():
                available_agent = self.get_latest_agent()
                if available_agent is None:
                    logger.info(
                        "Agent {0} is reverting to the installed agent -- exiting",
                        CURRENT_AGENT)
                else:
                    logger.info(
                        u"Agent {0} discovered update {1} -- exiting",
                        CURRENT_AGENT,
                        available_agent.name)
                # Leave the loop in both cases
                break

            utc_start = datetime.utcnow()

            # Remember the etag so a change made by exthandlers_handler.run() can be detected
            last_etag = exthandlers_handler.last_etag
            exthandlers_handler.run()

            remote_access_handler.run()

            if last_etag != exthandlers_handler.last_etag:
                self._ensure_readonly_files()
                add_event(
                    AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.ProcessGoalState,
                    is_success=True,
                    duration=elapsed_milliseconds(utc_start),
                    message="Incarnation {0}".format(
                        exthandlers_handler.last_etag),
                    log_event=True)

            time.sleep(GOAL_STATE_INTERVAL)

    except Exception as e:
        msg = u"Agent {0} failed with exception: {1}".format(
            CURRENT_AGENT, ustr(e))
        self._set_sentinel(msg=msg)
        logger.warn(msg)
        logger.warn(traceback.format_exc())
        sys.exit(1)
        # additional return here because sys.exit is mocked in unit tests
        return

    self._shutdown()
    sys.exit(0)
def run(self, debug=False):
    """
    This is the main loop which watches for agent and extension updates.

    :param debug: when True, the orphan check (self._is_orphaned) is not
        evaluated, so the loop does not exit on that condition.

    When the loop ends without an exception, self._shutdown() is called and
    the process exits with status 0. If an exception is raised anywhere in
    the guarded section, the failure message is passed to
    self._set_sentinel(), logged together with the traceback, and the
    process exits with status 1.
    """
    try:
        # NOTE: Do not add any telemetry events until after the monitoring handler has been started with the
        # call to 'monitor_thread.run()'. That method call initializes the protocol, which is needed in order to
        # load the goal state and update the container id in memory. Any telemetry events sent before this happens
        # will result in an uninitialized container id value.

        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        # Log OS-specific info locally.
        os_info_msg = u"Distro info: {0} {1}, osutil class being used: {2}, " \
                      u"agent service name: {3}".format(DISTRO_NAME, DISTRO_VERSION,
                                                        type(self.osutil).__name__, self.osutil.service_name)
        logger.info(os_info_msg)

        # Launch monitoring threads
        from azurelinuxagent.ga.monitor import get_monitor_handler
        monitor_thread = get_monitor_handler()
        monitor_thread.run()

        # NOTE: Any telemetry events added from this point on will be properly populated with the container id.

        from azurelinuxagent.ga.env import get_env_handler
        env_thread = get_env_handler()
        env_thread.run()

        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler()
        migrate_handler_state()

        from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
        remote_access_handler = get_remote_access_handler()

        # One-time initialization steps performed before entering the loop
        self._ensure_no_orphans()
        self._emit_restart_event()
        self._ensure_partition_assigned()
        self._ensure_readonly_files()
        self._ensure_cgroups_initialized()

        # Send OS-specific info as a telemetry event after the monitoring thread has been initialized, and with
        # it the container id too.
        add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg)

        # The sleep between passes depends on whether extensions are enabled in the configuration
        goal_state_interval = GOAL_STATE_INTERVAL \
            if conf.get_extensions_enabled() \
            else GOAL_STATE_INTERVAL_DISABLED

        while self.running:
            if not debug and self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            # Restart either background handler if its thread is no longer alive
            if not monitor_thread.is_alive():
                logger.warn(u"Monitor thread died, restarting")
                monitor_thread.start()

            if not env_thread.is_alive():
                logger.warn(u"Environment thread died, restarting")
                env_thread.start()

            if self._upgrade_available():
                available_agent = self.get_latest_agent()
                if available_agent is None:
                    logger.info(
                        "Agent {0} is reverting to the installed agent -- exiting",
                        CURRENT_AGENT)
                else:
                    logger.info(
                        u"Agent {0} discovered update {1} -- exiting",
                        CURRENT_AGENT,
                        available_agent.name)
                # Leave the loop in both cases
                break

            utc_start = datetime.utcnow()

            # Remember the etag so a change made by exthandlers_handler.run() can be detected
            last_etag = exthandlers_handler.last_etag
            exthandlers_handler.run()

            remote_access_handler.run()

            if last_etag != exthandlers_handler.last_etag:
                self._ensure_readonly_files()
                duration = elapsed_milliseconds(utc_start)
                logger.info(
                    'ProcessGoalState completed [incarnation {0}; {1} ms]',
                    exthandlers_handler.last_etag, duration)
                add_event(AGENT_NAME,
                          op=WALAEventOperation.ProcessGoalState,
                          duration=duration,
                          message="Incarnation {0}".format(
                              exthandlers_handler.last_etag))

            time.sleep(goal_state_interval)

    except Exception as e:
        msg = u"Agent {0} failed with exception: {1}".format(
            CURRENT_AGENT, ustr(e))
        self._set_sentinel(msg=msg)
        logger.warn(msg)
        logger.warn(traceback.format_exc())
        sys.exit(1)
        # additional return here because sys.exit is mocked in unit tests
        return

    self._shutdown()
    sys.exit(0)
def run(self):
    """
    Main loop of the goal state agent: watches for agent and extension updates.

    After starting the monitor and environment handlers and migrating
    handler state, every pass of the loop
        * stops if the agent is orphaned or an upgrade is available,
        * runs the extension handler and adds a ProcessGoalState event,
        * enables the test agent and stops if one is available and in slice,
        * otherwise sleeps GOAL_STATE_INTERVAL.

    A clean end of the loop calls self._shutdown() and sys.exit(0); an
    exception raised inside the guarded section is logged and leads to
    sys.exit(1).
    """
    logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

    # Launch monitoring threads
    from azurelinuxagent.ga.monitor import get_monitor_handler
    monitor_handler = get_monitor_handler()
    monitor_handler.run()

    from azurelinuxagent.ga.env import get_env_handler
    env_handler = get_env_handler()
    env_handler.run()

    from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
    ext_handler = get_exthandlers_handler()
    migrate_handler_state()

    try:
        # Events are logged when the etag changes or once this deadline has passed
        log_deadline = datetime.utcnow()

        self._ensure_no_orphans()
        self._emit_restart_event()

        while self.running:
            if self._is_orphaned:
                logger.info("Goal state agent {0} was orphaned -- exiting", CURRENT_AGENT)
                break

            if self._upgrade_available():
                # The loop always ends here; the message is only logged when an agent is listed
                if len(self.agents) > 0:
                    logger.info(
                        u"Agent {0} discovered {1} as an update and will exit",
                        CURRENT_AGENT,
                        self.agents[0].name)
                break

            pass_started = datetime.utcnow()

            previous_etag = ext_handler.last_etag
            ext_handler.run()

            # The deadline is only compared when the etag did not change
            if previous_etag != ext_handler.last_etag:
                log_event = True
            else:
                log_event = datetime.utcnow() >= log_deadline

            add_event(
                AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.ProcessGoalState,
                is_success=True,
                duration=elapsed_milliseconds(pass_started),
                log_event=log_event)

            if log_event:
                log_deadline = log_deadline + timedelta(minutes=REPORT_STATUS_INTERVAL)

            trial_agent = self.get_test_agent()
            if trial_agent is not None and trial_agent.in_slice:
                trial_agent.enable()
                logger.info(u"Enabled Agent {0} as test agent", trial_agent.name)
                break

            time.sleep(GOAL_STATE_INTERVAL)

    except Exception as failure:
        logger.warn(u"Agent {0} failed with exception: {1}", CURRENT_AGENT, ustr(failure))
        logger.warn(traceback.format_exc())
        sys.exit(1)
    else:
        # Reached only when the guarded section raised nothing, so the failure path
        # cannot get here even if sys.exit() returns instead of raising
        self._shutdown()
        sys.exit(0)
def run(self):
    """
    Main loop of the goal state agent: watches for agent and extension updates.

    Starts the monitor and environment handlers, creates the extension and
    remote access handlers, runs the one-time initialization steps and then
    loops. Each pass restarts a background handler whose thread is no longer
    alive, runs the extension and remote access handlers and, if the
    extension handler's etag changed, logs and reports a ProcessGoalState
    event. The loop ends when self.running becomes false, when the agent is
    orphaned, or when an upgrade is available.

    A clean end of the loop calls self._shutdown() and sys.exit(0). Any
    exception is passed as a message to self._set_sentinel(), logged with
    its traceback, and leads to sys.exit(1).
    """
    try:
        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        # Launch monitoring threads
        from azurelinuxagent.ga.monitor import get_monitor_handler
        monitor_thread = get_monitor_handler()
        monitor_thread.run()

        from azurelinuxagent.ga.env import get_env_handler
        env_thread = get_env_handler()
        env_thread.run()

        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        extensions = get_exthandlers_handler()
        migrate_handler_state()

        from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
        remote_access = get_remote_access_handler()

        self._ensure_no_orphans()
        self._emit_restart_event()
        self._ensure_partition_assigned()
        self._ensure_readonly_files()

        # Each background handler paired with the warning logged before it is restarted
        watched_threads = (
            (monitor_thread, u"Monitor thread died, restarting"),
            (env_thread, u"Environment thread died, restarting"),
        )

        while self.running:
            if self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            for worker, death_notice in watched_threads:
                if not worker.is_alive():
                    logger.warn(death_notice)
                    worker.start()

            if self._upgrade_available():
                newer_agent = self.get_latest_agent()
                if newer_agent is not None:
                    logger.info(
                        u"Agent {0} discovered update {1} -- exiting",
                        CURRENT_AGENT,
                        newer_agent.name)
                else:
                    logger.info(
                        "Agent {0} is reverting to the installed agent -- exiting",
                        CURRENT_AGENT)
                break

            started_at = datetime.utcnow()

            etag_before = extensions.last_etag
            extensions.run()

            remote_access.run()

            if etag_before != extensions.last_etag:
                self._ensure_readonly_files()
                duration = elapsed_milliseconds(started_at)
                logger.info('ProcessGoalState completed [incarnation {0}; {1} ms]',
                            extensions.last_etag,
                            duration)
                incarnation_message = "Incarnation {0}".format(extensions.last_etag)
                add_event(
                    AGENT_NAME,
                    op=WALAEventOperation.ProcessGoalState,
                    duration=duration,
                    message=incarnation_message)

            time.sleep(GOAL_STATE_INTERVAL)

    except Exception as error:
        msg = u"Agent {0} failed with exception: {1}".format(CURRENT_AGENT, ustr(error))
        self._set_sentinel(msg=msg)
        logger.warn(msg)
        logger.warn(traceback.format_exc())
        sys.exit(1)
    else:
        # Kept out of the failure path because sys.exit is mocked in unit tests
        # and may therefore return instead of terminating
        self._shutdown()
        sys.exit(0)
def run(self, debug=False):
    """
    This is the main loop which watches for agent and extension updates.

    :param debug: when True, the orphan check (self._is_orphaned) is not
        evaluated, so the loop does not exit on that condition.

    When the loop ends without an exception, self._shutdown() is called and
    the process exits with status 0. If an exception is raised anywhere in
    the guarded section, the failure message is passed to
    self._set_sentinel(), logged together with the traceback, and the
    process exits with status 1.
    """
    try:
        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        #
        # Fetch the goal state one time; some components depend on information provided by the goal state and this
        # call ensures the required info is initialized (e.g telemetry depends on the container ID.)
        #
        protocol = self.protocol_util.get_protocol()
        protocol.update_goal_state()

        # Initialize the common parameters for telemetry events
        initialize_event_logger_vminfo_common_parameters(protocol)

        # Log OS-specific info.
        os_info_msg = u"Distro: {dist_name}-{dist_ver}; "\
            u"OSUtil: {util_name}; AgentService: {service_name}; "\
            u"Python: {py_major}.{py_minor}.{py_micro}; "\
            u"systemd: {systemd}; "\
            u"LISDrivers: {lis_ver}; "\
            u"logrotate: {has_logrotate};".format(
                dist_name=DISTRO_NAME, dist_ver=DISTRO_VERSION,
                util_name=type(self.osutil).__name__,
                service_name=self.osutil.service_name,
                py_major=PY_VERSION_MAJOR, py_minor=PY_VERSION_MINOR,
                py_micro=PY_VERSION_MICRO, systemd=systemd.is_systemd(),
                lis_ver=get_lis_version(), has_logrotate=has_logrotate()
            )

        logger.info(os_info_msg)
        add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg)

        #
        # Perform initialization tasks
        #
        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler(protocol)
        migrate_handler_state()

        from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
        remote_access_handler = get_remote_access_handler(protocol)

        self._ensure_no_orphans()
        self._emit_restart_event()
        self._emit_changes_in_default_configuration()
        self._ensure_partition_assigned()
        self._ensure_readonly_files()
        self._ensure_cgroups_initialized()
        self._ensure_extension_telemetry_state_configured_properly(protocol)
        self._ensure_firewall_rules_persisted(dst_ip=protocol.get_endpoint())

        # Get all thread handlers
        telemetry_handler = get_send_telemetry_events_handler(self.protocol_util)
        all_thread_handlers = [
            get_monitor_handler(),
            get_env_handler(),
            telemetry_handler,
            get_collect_telemetry_events_handler(telemetry_handler)
        ]

        # The log collector handler is only added when log collection is allowed
        if is_log_collection_allowed():
            all_thread_handlers.append(get_collect_logs_handler())

        # Launch all monitoring threads
        for thread_handler in all_thread_handlers:
            thread_handler.run()

        # The sleep between passes depends on whether extensions are enabled in the configuration
        goal_state_interval = conf.get_goal_state_period() if conf.get_extensions_enabled() else GOAL_STATE_INTERVAL_DISABLED

        while self.running:
            #
            # Check that the parent process (the agent's daemon) is still running
            #
            if not debug and self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            #
            # Check that all the threads are still running
            #
            for thread_handler in all_thread_handlers:
                if not thread_handler.is_alive():
                    logger.warn("{0} thread died, restarting".format(
                        thread_handler.get_thread_name()))
                    thread_handler.start()

            #
            # Process the goal state
            #
            if not protocol.try_update_goal_state():
                # Count the failed update; extensions are not processed on this pass
                self._heartbeat_update_goal_state_error_count += 1
            else:
                if self._upgrade_available(protocol):
                    available_agent = self.get_latest_agent()
                    if available_agent is None:
                        logger.info(
                            "Agent {0} is reverting to the installed agent -- exiting",
                            CURRENT_AGENT)
                    else:
                        logger.info(
                            u"Agent {0} discovered update {1} -- exiting",
                            CURRENT_AGENT,
                            available_agent.name)
                    # Leave the loop in both cases
                    break

                utc_start = datetime.utcnow()

                # Remember the etag so a change made by exthandlers_handler.run() can be detected
                last_etag = exthandlers_handler.last_etag
                exthandlers_handler.run()

                remote_access_handler.run()

                if last_etag != exthandlers_handler.last_etag:
                    self._ensure_readonly_files()
                    duration = elapsed_milliseconds(utc_start)
                    activity_id, correlation_id, gs_creation_time = exthandlers_handler.get_goal_state_debug_metadata()
                    msg = 'ProcessGoalState completed [Incarnation: {0}; {1} ms; Activity Id: {2}; Correlation Id: {3}; GS Creation Time: {4}]'.format(
                        exthandlers_handler.last_etag, duration, activity_id,
                        correlation_id, gs_creation_time)
                    logger.info(msg)
                    add_event(AGENT_NAME,
                              op=WALAEventOperation.ProcessGoalState,
                              duration=duration,
                              message=msg)

            # Runs on every pass, whether or not the goal state update succeeded
            self._send_heartbeat_telemetry(protocol)
            time.sleep(goal_state_interval)

    except Exception as error:
        msg = u"Agent {0} failed with exception: {1}".format(
            CURRENT_AGENT, ustr(error))
        self._set_sentinel(msg=msg)
        logger.warn(msg)
        logger.warn(traceback.format_exc())
        sys.exit(1)
        # additional return here because sys.exit is mocked in unit tests
        return

    self._shutdown()
    sys.exit(0)