def test_foreach_controller_should_handle_errors_in_individual_controllers(self):
    """
    An exception raised by the operation on one controller must be reported through logger.warn
    and the operation must still be executed on the memory controller.
    """
    successful_controllers = []

    def controller_operation(controller):
        # Fail only for the cpu controller; any other controller is recorded as successful
        if controller == 'cpu':
            raise Exception('A test exception')
        successful_controllers.append(controller)

    with patch("azurelinuxagent.common.cgroupapi.logger.warn") as mock_logger_warn:
        CGroupsApi._foreach_controller(controller_operation, 'A dummy message')

        self.assertIn('memory', successful_controllers, 'The operation was not executed on the memory controller')
        self.assertEqual(
            len(successful_controllers), 1,
            'The operation was not executed on unexpected controllers: {0}'.format(successful_controllers))

        # call_args is an (args, kwargs) pair; only the positional arguments of the warning are verified
        args, _ = mock_logger_warn.call_args
        (message_format, controller, error, message) = args

        # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
        self.assertEqual(message_format, 'Error in cgroup controller "{0}": {1}. {2}')
        self.assertEqual(controller, 'cpu')
        self.assertEqual(error, 'A test exception')
        self.assertEqual(message, 'A dummy message')
def __init__(self):
    """
    Ensures the cgroups file system is mounted and selects the correct API to interact with it

    Sets the following attributes:
        _cgroups_supported: the value reported by osutil.is_cgroups_supported()
        _enabled: True only when cgroups are supported and the mount/API selection succeeded
        _cgroups_api: the object returned by CGroupsApi.create(), or None when cgroups are not
            supported or initialization failed

    The outcome is logged and reported with an InitializeCGroups event; errors raised while mounting
    the file system or creating the API are captured in the status message instead of propagating.
    """
    osutil = get_osutil()

    # Start from the "disabled" state so that both attributes are always defined, even when
    # mount_cgroups() or CGroupsApi.create() raise below.
    self._enabled = False
    self._cgroups_api = None

    self._cgroups_supported = osutil.is_cgroups_supported()

    if self._cgroups_supported:
        try:
            osutil.mount_cgroups()
            self._cgroups_api = CGroupsApi.create()
            self._enabled = True
            status = "The cgroup filesystem is ready to use"
        except Exception as e:
            # Best effort: report the failure as the status and leave cgroups disabled
            status = ustr(e)
    else:
        status = "Cgroups are not supported by the platform"

    logger.info("CGroups Status: {0}".format(status))

    add_event(
        AGENT_NAME,
        version=CURRENT_VERSION,
        op=WALAEventOperation.InitializeCGroups,
        is_success=self._enabled,
        message=status,
        log_event=False)
def test_create_should_return_a_FileSystemCgroupsApi_on_non_systemd_platforms(self):
    """
    CGroupsApi.create() must return an object whose type is exactly FileSystemCgroupsApi when
    CGroupsApi._is_systemd() reports False.
    """
    # Make the systemd detection report that systemd is not in use
    systemd_not_detected = patch("azurelinuxagent.common.cgroupapi.CGroupsApi._is_systemd", return_value=False)

    with systemd_not_detected:
        created_api = CGroupsApi.create()
        created_type = type(created_api)
        self.assertTrue(created_type == FileSystemCgroupsApi)
def test_is_systemd_should_return_false_when_systemd_does_not_manage_current_process(self):
    """
    CGroupsApi._is_systemd() must return a false value when fileutil.read_file serves the canned
    contents below for /proc/cgroups and /proc/self/cgroup.
    """
    real_read_file = fileutil.read_file

    # Canned file contents, keyed by the path they replace
    canned_contents = {
        "/proc/cgroups": """ #subsys_name hierarchy num_cgroups enabled cpuset 11 1 1 cpu 3 77 1 cpuacct 3 77 1 blkio 10 70 1 memory 12 124 1 devices 9 70 1 freezer 4 1 1 net_cls 2 1 1 perf_event 7 1 1 net_prio 2 1 1 hugetlb 8 1 1 pids 5 76 1 rdma 6 1 1 """,
        "/proc/self/cgroup": """ 3:name=systemd:/ 2:memory:/walinuxagent.service 1:cpu,cpuacct:/walinuxagent.service """,
    }

    def mock_read_file(filepath, asbin=False, remove_bom=False, encoding='utf-8'):
        # Any path without canned contents is delegated to the real implementation
        if filepath not in canned_contents:
            return real_read_file(filepath, asbin=asbin, remove_bom=remove_bom, encoding=encoding)
        return canned_contents[filepath]

    with patch("azurelinuxagent.common.cgroupapi.fileutil.read_file", mock_read_file):
        detected = CGroupsApi._is_systemd()

    self.assertFalse(detected)
def test_foreach_controller_should_execute_operation_on_all_mounted_controllers(self):
    """
    CGroupsApi._foreach_controller() must execute the operation on the cpu and memory controllers,
    and on exactly two controllers in total.
    """
    visited = []

    def controller_operation(controller):
        visited.append(controller)

    CGroupsApi._foreach_controller(controller_operation, 'A dummy message')

    # The setUp method mocks azurelinuxagent.common.cgroupapi.CGROUPS_FILE_SYSTEM_ROOT to have the cpu and memory controllers mounted
    expected_controllers = (
        ('cpu', 'The operation was not executed on the cpu controller'),
        ('memory', 'The operation was not executed on the memory controller'),
    )
    for controller_name, failure_message in expected_controllers:
        self.assertIn(controller_name, visited, failure_message)

    count_message = 'The operation was not executed on unexpected controllers: {0}'.format(visited)
    self.assertEqual(len(visited), 2, count_message)
def initialize(self):
    """
    Runs the cgroups setup checks for the agent and, when they all pass and a CPU cgroup path was found
    for the agent, enables cgroups and starts tracking that CPU cgroup.

    The method returns early (leaving cgroups as they were) when: it was already initialized, cgroup
    monitoring is not supported on the distro, systemd is not detected, the legacy cgroups check fails,
    or the agent's unit is in a slice other than _AZURE_SLICE or "system.slice".

    Exceptions are not propagated; they are reported with _log_cgroup_warning. self._initialized is set
    to True on every exit path, so later calls return immediately.
    """
    try:
        if self._initialized:
            return

        # check whether cgroup monitoring is supported on the current distro
        self._cgroups_supported = CGroupsApi.cgroups_supported()
        if not self._cgroups_supported:
            logger.info("Cgroup monitoring is not supported on {0}", get_distro())
            return

        # check that systemd is detected correctly
        self._cgroups_api = SystemdCgroupsApi()
        if not systemd.is_systemd():
            _log_cgroup_warning("systemd was not detected on {0}", get_distro())
            return

        _log_cgroup_info("systemd version: {0}", systemd.get_version())

        # This is temporarily disabled while we analyze telemetry. Likely it will be removed.
        # self.__collect_azure_unit_telemetry()
        # self.__collect_agent_unit_files_telemetry()

        if not self.__check_no_legacy_cgroups():
            return

        # The agent is expected to run either in the Azure slice or directly under system.slice
        agent_unit_name = systemd.get_agent_unit_name()
        agent_slice = systemd.get_unit_property(agent_unit_name, "Slice")
        if agent_slice not in (_AZURE_SLICE, "system.slice"):
            _log_cgroup_warning("The agent is within an unexpected slice: {0}", agent_slice)
            return

        self.__setup_azure_slice()

        cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers()
        self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroups(agent_slice, cpu_controller_root, memory_controller_root)

        # Only the CPU cgroup is tracked here; cgroups are enabled only when its path is known
        if self._agent_cpu_cgroup_path is not None:
            _log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path)
            self.enable()
            CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))

        _log_cgroup_info('Cgroups enabled: {0}', self._cgroups_enabled)

    except Exception as exception:
        # Initialization is best effort: report the error instead of failing the caller
        _log_cgroup_warning("Error initializing cgroups: {0}", ustr(exception))
    finally:
        # Mark as initialized even on failure/early return so the checks are not repeated
        self._initialized = True
def _check_processes_in_agent_cgroup(self):
    """
    Verifies that the agent's cgroup includes only the current process, its parent, commands started using shellutil and instances of systemd-run
    (those processes correspond, respectively, to the extension handler, the daemon, commands started by the extension handler, and the systemd-run
    commands used to start extensions on their own cgroup).
    Other processes started by the agent (e.g. extensions) and processes not started by the agent (e.g. services installed by extensions) are reported
    as unexpected, since they should belong to their own cgroup.

    Errors raised while inspecting the processes are logged as a warning and are not propagated; any unexpected
    processes collected before the error are still reported.

    Raises a CGroupsException if the check fails (at most 5 unexpected processes are included in the message)
    """
    unexpected = []

    try:
        daemon = os.getppid()
        extension_handler = os.getpid()
        agent_commands = set(shellutil.get_running_commands())
        systemd_run_commands = set(self._cgroups_api.get_systemd_run_commands())
        agent_cgroup = CGroupsApi.get_processes_in_cgroup(self._agent_cpu_cgroup_path)
        # get the running commands again in case new commands started or completed while we were fetching the processes in the cgroup;
        agent_commands.update(shellutil.get_running_commands())
        systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())

        for process in agent_cgroup:
            # Note that the agent uses systemd-run to start extensions; systemd-run belongs to the agent cgroup, though the extensions don't.
            if process in (daemon, extension_handler) or process in systemd_run_commands:
                continue
            # systemd_run_commands contains the shell that started systemd-run, so we also need to check for the parent
            if self._get_parent(process) in systemd_run_commands and self._get_command(process) == 'systemd-run':
                continue
            # check if the process is a command started by the agent or a descendant of one of those commands
            current = process
            while current != 0 and current not in agent_commands:
                current = self._get_parent(current)
            # reaching 0 means no ancestor of the process is a command started by the agent
            if current == 0:
                unexpected.append(self.__format_process(process))
                if len(unexpected) >= 5:  # collect just a small sample
                    break
    except Exception as exception:
        # Pass the error text as a format argument instead of pre-formatting the message: _log_cgroup_warning
        # takes a format string plus arguments, and error text containing braces must not be treated as a format.
        _log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}", ustr(exception))

    if unexpected:
        raise CGroupsException("The agent's cgroup includes unexpected processes: {0}".format(unexpected))
def test_cgroups_should_be_supported_only_on_ubuntu_16_and_later(self):
    """
    CGroupsApi.cgroups_supported() must return True for the Ubuntu releases listed below and False
    for the other distros, given the value returned by get_distro().
    """
    supported_distros = [
        ['ubuntu', '16.04', 'xenial'],
        ['ubuntu', '16.10', 'yakkety'],
        ['ubuntu', '18.04', 'bionic'],
        ['ubuntu', '18.10', 'cosmic'],
        ['ubuntu', '20.04', 'focal'],
        ['ubuntu', '20.10', 'groovy'],
    ]
    unsupported_distros = [
        ['centos', '7.5', 'Source'],
        ['redhat', '7.7', 'Maipo'],
        ['redhat', '7.7.1908', 'Core'],
        ['bigip', '15.0.1', 'Final'],
        ['gaia', '273.562', 'R80.30'],
        ['debian', '9.1', ''],
    ]

    # Supported distros are checked first, then the unsupported ones
    expectations = [(distro, True) for distro in supported_distros]
    expectations.extend((distro, False) for distro in unsupported_distros)

    for distro, expected in expectations:
        distro_patch = patch("azurelinuxagent.common.cgroupapi.get_distro", return_value=distro)
        with distro_patch:
            actual = CGroupsApi.cgroups_supported()
            self.assertEqual(actual, expected, "cgroups_supported() failed on {0}".format(distro))
def test_is_systemd_should_return_false_when_systemd_does_not_manage_current_process(self):
    """
    CGroupsApi._is_systemd() must return a false value when "/run/systemd/system/" does not exist.

    The test also verifies that the implementation actually checked that path, so that it fails
    loudly if the detection logic stops using it.
    """
    real_path_exists = os.path.exists
    # Mutable holder so the nested function can record that the path was checked
    observed = {"path_tested": False}

    def mock_path_exists(path):
        # Every other path is delegated to the real os.path.exists
        if path != "/run/systemd/system/":
            return real_path_exists(path)
        observed["path_tested"] = True
        return False

    with patch("azurelinuxagent.common.cgroupapi.os.path.exists", mock_path_exists):
        detected = CGroupsApi._is_systemd()

    self.assertFalse(detected)
    self.assertTrue(
        observed["path_tested"],
        'The expected path was not tested; the implementation of CGroupsApi._is_systemd() may have changed.')
def is_log_collection_allowed():
    """
    Decides whether periodic log collection may run, logs the decision and reports it with a
    LogCollection event.

    There are three conditions that need to be met in order to allow periodic log collection:
        1) It should be enabled in the configuration.
        2) The system must be using systemd to manage services. Needed for resource limiting of the log collection.
        3) The python version must be greater than 2.6 in order to support the ZipFile library used when collecting.

    Returns the result of combining the three conditions with "and".
    """
    enabled_in_conf = conf.get_collect_logs()
    has_systemd = CGroupsApi.is_systemd()

    # Python 2 requires at least 2.7; otherwise only Python 3 is accepted
    if PY_VERSION_MAJOR == 2:
        python_ok = PY_VERSION_MINOR >= 7
    else:
        python_ok = PY_VERSION_MAJOR == 3

    allowed = enabled_in_conf and has_systemd and python_ok

    msg_format = (
        "Checking if log collection is allowed at this time [{0}]. All three conditions must be met: "
        "configuration enabled [{1}], systemd present [{2}], python supported: [{3}]"
    )
    msg = msg_format.format(allowed, enabled_in_conf, has_systemd, python_ok)

    logger.info(msg)
    add_event(
        name=AGENT_NAME,
        version=CURRENT_VERSION,
        op=WALAEventOperation.LogCollection,
        is_success=allowed,
        message=msg,
        log_event=False)

    return allowed
def initialize(self):
    """
    Checks whether cgroup monitoring can be used and, if so, enables cgroups and starts tracking the
    agent's CPU and memory cgroups (each one only when both its controller mount point and the agent's
    relative path within it are known).

    Returns early, without enabling cgroups, when: already initialized, monitoring is not supported on
    the distro, the API created by CGroupsApi.create() is not a SystemdCgroupsApi, or legacy cgroups
    were found for the daemon.

    Exceptions are logged and reported with a CGroupsInitialize event instead of being propagated;
    self._initialized is set to True on every exit path.
    """
    def report_info(format_string, *args):
        # Log at INFO level and send the same text as a CGroupsInfo event
        text = format_string.format(*args)
        logger.info(text)
        add_event(op=WALAEventOperation.CGroupsInfo, message=text)

    def report_warning(format_string, *args):
        # Log at WARNING level and send the same text as a failed CGroupsInfo event
        text = format_string.format(*args)
        logger.warn(text)
        add_event(op=WALAEventOperation.CGroupsInfo, message=text, is_success=False, log_event=False)

    try:
        if self._initialized:
            return

        #
        # check whether cgroup monitoring is supported on the current distro
        #
        self._cgroups_supported = CGroupsApi.cgroups_supported()
        if not self._cgroups_supported:
            logger.info("Cgroup monitoring is not supported on {0}", get_distro())
            return

        #
        # check systemd
        #
        api = CGroupsApi.create()
        self._cgroups_api = api
        if not isinstance(api, SystemdCgroupsApi):
            message = "systemd was not detected on {0}".format(get_distro())
            logger.warn(message)
            add_event(op=WALAEventOperation.CGroupsInitialize, is_success=False, message=message, log_event=False)
            return

        report_info("systemd version: {0}", api.get_systemd_version())

        #
        # Older versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent. When running
        # under systemd this could produce invalid resource usage data. Do not enable cgroups under this condition.
        #
        legacy_count = api.cleanup_legacy_cgroups()
        if legacy_count > 0:
            report_warning("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.")
            return

        #
        # check v1 controllers
        #
        cpu_mount_point, memory_mount_point = api.get_cgroup_mount_points()

        if cpu_mount_point is None:
            report_warning("The CPU cgroup controller is not mounted")
        else:
            logger.info("The CPU cgroup controller is mounted at {0}", cpu_mount_point)

        if memory_mount_point is None:
            report_warning("The memory cgroup controller is not mounted")
        else:
            logger.info("The memory cgroup controller is mounted at {0}", memory_mount_point)

        #
        # check v2 controllers
        #
        v2_mount_point, v2_controllers = api.get_cgroup2_controllers()
        if v2_mount_point is not None:
            report_warning("cgroups v2 mounted at {0}. Controllers: [{1}]", v2_mount_point, v2_controllers)

        #
        # check the cgroups for the agent
        #
        unit_name = api.get_agent_unit_name()
        cpu_relative_path, memory_relative_path = api.get_process_cgroup_relative_paths("self")

        if cpu_relative_path is None:
            report_warning("The agent's process is not within a CPU cgroup")
        else:
            report_info('CPUAccounting: {0}', api.get_unit_property(unit_name, "CPUAccounting"))

        if memory_relative_path is None:
            report_warning("The agent's process is not within a memory cgroup")
        else:
            report_info('MemoryAccounting: {0}', api.get_unit_property(unit_name, "MemoryAccounting"))

        #
        # All good, enable cgroups and start monitoring the agent
        #
        self._cgroups_enabled = True

        if cpu_mount_point is not None and cpu_relative_path is not None:
            self._agent_cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_relative_path)
            CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, self._agent_cpu_cgroup_path))
        else:
            logger.info("Will not track CPU for the agent's cgroup")

        if memory_mount_point is not None and memory_relative_path is not None:
            self._agent_memory_cgroup_path = os.path.join(memory_mount_point, memory_relative_path)
            CGroupsTelemetry.track_cgroup(MemoryCgroup(unit_name, self._agent_memory_cgroup_path))
        else:
            logger.info("Will not track memory for the agent's cgroup")

        report_info(
            "Agent cgroups: CPU: {0} -- MEMORY: {1}",
            self._agent_cpu_cgroup_path,
            self._agent_memory_cgroup_path)

    except Exception as e:
        message = "Error initializing cgroups: {0}".format(ustr(e))
        logger.warn(message)
        add_event(op=WALAEventOperation.CGroupsInitialize, is_success=False, message=message, log_event=False)
    finally:
        self._initialized = True
def run(self, debug=False):  # pylint: disable=R0912
    """
    This is the main loop which watches for agent and extension updates.

    Before entering the loop it fetches the goal state once, initializes the common telemetry parameters,
    reports OS information, runs the _ensure_*/_emit_* startup steps and launches the handler threads.
    Each iteration of the loop (while self.running) restarts handler threads that are no longer alive,
    tries to update the goal state, runs the extension and remote access handlers, sends the heartbeat
    telemetry and sleeps for the goal state interval. The loop ends when the agent is orphaned or an
    agent upgrade is available.

    :param debug: when True, the check for an orphaned agent is skipped.

    The method ends by calling sys.exit: with 1 when an exception was raised (after setting the sentinel
    and logging the error), otherwise with 0 after self._shutdown().
    """
    try:
        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        #
        # Fetch the goal state one time; some components depend on information provided by the goal state and this
        # call ensures the required info is initialized (e.g telemetry depends on the container ID.)
        #
        protocol = self.protocol_util.get_protocol()
        protocol.update_goal_state()

        # Initialize the common parameters for telemetry events
        initialize_event_logger_vminfo_common_parameters(protocol)

        # Log OS-specific info.
        os_info_msg = u"Distro: {dist_name}-{dist_ver}; "\
            u"OSUtil: {util_name}; AgentService: {service_name}; "\
            u"Python: {py_major}.{py_minor}.{py_micro}; "\
            u"systemd: {systemd}; "\
            u"LISDrivers: {lis_ver}; "\
            u"logrotate: {has_logrotate};".format(
                dist_name=DISTRO_NAME, dist_ver=DISTRO_VERSION,
                util_name=type(self.osutil).__name__,
                service_name=self.osutil.service_name,
                py_major=PY_VERSION_MAJOR, py_minor=PY_VERSION_MINOR,
                py_micro=PY_VERSION_MICRO, systemd=CGroupsApi.is_systemd(),
                lis_ver=get_lis_version(), has_logrotate=has_logrotate()
            )

        logger.info(os_info_msg)
        add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg)

        #
        # Perform initialization tasks
        #
        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler(protocol)
        migrate_handler_state()

        from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
        remote_access_handler = get_remote_access_handler(protocol)

        self._ensure_no_orphans()
        self._emit_restart_event()
        self._emit_changes_in_default_configuration()
        self._ensure_partition_assigned()
        self._ensure_readonly_files()
        self._ensure_cgroups_initialized()
        self._ensure_extension_telemetry_state_configured_properly(
            protocol)

        # Get all thread handlers
        telemetry_handler = get_send_telemetry_events_handler(
            self.protocol_util)
        all_thread_handlers = [
            get_monitor_handler(),
            get_env_handler(),
            telemetry_handler,
            get_collect_telemetry_events_handler(telemetry_handler)
        ]

        # The log collector thread is optional; it is added only when log collection is allowed
        if is_log_collection_allowed():
            all_thread_handlers.append(get_collect_logs_handler())

        # Launch all monitoring threads
        for thread_handler in all_thread_handlers:
            thread_handler.run()

        # A different interval is used between iterations when extensions are disabled
        goal_state_interval = conf.get_goal_state_period(
        ) if conf.get_extensions_enabled(
        ) else GOAL_STATE_INTERVAL_DISABLED

        while self.running:
            #
            # Check that the parent process (the agent's daemon) is still running
            #
            if not debug and self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting",
                            CURRENT_AGENT)
                break

            #
            # Check that all the threads are still running
            #
            for thread_handler in all_thread_handlers:
                if not thread_handler.is_alive():
                    logger.warn("{0} thread died, restarting".format(
                        thread_handler.get_thread_name()))
                    thread_handler.start()

            #
            # Process the goal state
            #
            if not protocol.try_update_goal_state():
                # Only count the failure; the heartbeat and the sleep below still happen
                self._heartbeat_update_goal_state_error_count += 1
            else:
                if self._upgrade_available(protocol):
                    # Leave the loop in both cases; only the log message differs
                    available_agent = self.get_latest_agent()
                    if available_agent is None:
                        logger.info(
                            "Agent {0} is reverting to the installed agent -- exiting",
                            CURRENT_AGENT)
                    else:
                        logger.info(
                            u"Agent {0} discovered update {1} -- exiting",
                            CURRENT_AGENT, available_agent.name)
                    break

                utc_start = datetime.utcnow()

                # Remember the etag seen before running the handler so a change can be detected afterwards
                last_etag = exthandlers_handler.last_etag
                exthandlers_handler.run()

                remote_access_handler.run()

                # The ProcessGoalState event is reported only when the handler's etag changed
                if last_etag != exthandlers_handler.last_etag:
                    self._ensure_readonly_files()
                    duration = elapsed_milliseconds(utc_start)
                    logger.info(
                        'ProcessGoalState completed [incarnation {0}; {1} ms]',
                        exthandlers_handler.last_etag, duration)
                    add_event(AGENT_NAME,
                              op=WALAEventOperation.ProcessGoalState,
                              duration=duration,
                              message="Incarnation {0}".format(
                                  exthandlers_handler.last_etag))

            self._send_heartbeat_telemetry(protocol)
            time.sleep(goal_state_interval)

    except Exception as e:  # pylint: disable=C0103
        msg = u"Agent {0} failed with exception: {1}".format(
            CURRENT_AGENT, ustr(e))
        self._set_sentinel(msg=msg)
        logger.warn(msg)
        logger.warn(traceback.format_exc())
        sys.exit(1)
        # additional return here because sys.exit is mocked in unit tests
        return

    self._shutdown()
    sys.exit(0)