def test_start_extension_command_should_start_tracking_the_extension_cgroups(self):
    # CPU usage is initialized when we begin tracking a CPU cgroup; since this test does not
    # retrieve the CPU usage, there is no need for initialization
    with patch("azurelinuxagent.common.cgroup.CpuCgroup.initialize_cpu_usage"):
        CGroupConfigurator.get_instance().start_extension_command(
            extension_name="Microsoft.Compute.TestExtension-1.2.3",
            command="date",
            timeout=300,
            shell=False,
            cwd=self.tmp_dir,
            env={},
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

    self.assertTrue(CGroupsTelemetry.is_tracked(os.path.join(
        self.cgroups_file_system_root, "cpu", "walinuxagent.extensions/Microsoft.Compute.TestExtension_1.2.3")))
    self.assertTrue(CGroupsTelemetry.is_tracked(os.path.join(
        self.cgroups_file_system_root, "memory", "walinuxagent.extensions/Microsoft.Compute.TestExtension_1.2.3")))

def test_cleanup_legacy_cgroups_should_disable_cgroups_when_the_daemon_was_added_to_the_legacy_cgroup_on_systemd(self, _):
    # Set up a mock /var/run/waagent.pid file
    daemon_pid = "42"
    daemon_pid_file = os.path.join(self.tmp_dir, "waagent.pid")
    fileutil.write_file(daemon_pid_file, daemon_pid + "\n")

    # Set up old controller cgroups and add the daemon PID to them
    CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "cpu", daemon_pid)
    CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "memory", daemon_pid)

    # Start tracking a couple of dummy cgroups
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "cpu"))
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "memory"))

    cgroup_configurator = CGroupConfigurator.get_instance()

    with patch("azurelinuxagent.common.cgroupconfigurator.add_event") as mock_add_event:
        with patch("azurelinuxagent.common.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file):
            cgroup_configurator.cleanup_legacy_cgroups()

    self.assertEqual(len(mock_add_event.call_args_list), 1)
    _, kwargs = mock_add_event.call_args_list[0]
    self.assertEqual(kwargs['op'], 'CGroupsCleanUp')
    self.assertFalse(kwargs['is_success'])
    self.assertEqual(
        kwargs['message'],
        "Failed to process legacy cgroups. Collection of resource usage data will be disabled. [CGroupsException] The daemon's PID ({0}) was already added to the legacy cgroup; this invalidates resource usage data.".format(daemon_pid))

    self.assertFalse(cgroup_configurator.enabled())
    self.assertEqual(len(CGroupsTelemetry._tracked), 0)

def _assert_polled_metrics_equal(self, metrics, cpu_metric_value, memory_metric_value, max_memory_metric_value,
                                 proc_stat_memory_usage_value, pids=None):
    for metric in metrics:
        self.assertIn(metric.category, ["Process", "Memory"])
        if metric.category == "Process":
            self.assertEqual(metric.counter, "% Processor Time")
            self.assertEqual(metric.value, cpu_metric_value)
        if metric.category == "Memory":
            self.assertIn(metric.counter, ["Total Memory Usage", "Max Memory Usage", "Memory Used by Process"])
            if metric.counter == "Total Memory Usage":
                self.assertEqual(metric.value, memory_metric_value)
            elif metric.counter == "Max Memory Usage":
                self.assertEqual(metric.value, max_memory_metric_value)
            elif metric.counter == "Memory Used by Process":
                if pids:
                    processes_instances = [CGroupsTelemetry.get_process_info_summary(pid) for pid in pids]
                else:
                    processes_instances = [CGroupsTelemetry.get_process_info_summary(pid)
                                           for pid in TestCGroupsTelemetry.TestProcessIds]
                self.assertIn(metric.instance, processes_instances)
                self.assertEqual(metric.value, proc_stat_memory_usage_value)

def test_telemetry_polling_with_inactive_cgroups(self, *_):
    num_extensions = 5
    no_extensions_expected = 0

    self._track_new_extension_cgroups(num_extensions)
    self._assert_cgroups_are_tracked(num_extensions)

    metrics = CGroupsTelemetry.poll_all_tracked()

    for i in range(num_extensions):
        self.assertFalse(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i)))
        self.assertFalse(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i)))

    self.assertEqual(len(CGroupsTelemetry._cgroup_metrics), num_extensions)
    self._assert_calculated_resource_metrics_equal([], [], [], [], proc_ids=None)
    self.assertEqual(len(metrics), 0)

    collected_metrics = CGroupsTelemetry.report_all_tracked()
    self._assert_extension_metrics_data(collected_metrics, num_extensions, [], [], [], [],
                                        is_cpu_present=False, is_memory_present=False)
    self.assertEqual(len(CGroupsTelemetry._cgroup_metrics), no_extensions_expected)
    self._assert_calculated_resource_metrics_equal([], [], [], [], [])

def exercise_telemetry_instantiation(self, test_cgroup):
    test_extension_name = test_cgroup.name
    CGroupsTelemetry.track_cgroup(test_cgroup)
    self.assertIn('cpu', test_cgroup.cgroups)
    self.assertIn('memory', test_cgroup.cgroups)
    self.assertTrue(CGroupsTelemetry.is_tracked(test_extension_name))
    consume_cpu_time()
    time.sleep(1)

    metrics, limits = CGroupsTelemetry.report_all_tracked()

    my_metrics = metrics[test_extension_name]
    self.assertEqual(len(my_metrics), 2)
    for item in my_metrics:
        metric_family, metric_name, metric_value = item
        if metric_family == "Process":
            self.assertEqual(metric_name, "% Processor Time")
            self.assertGreater(metric_value, 0.0)
        elif metric_family == "Memory":
            self.assertEqual(metric_name, "Total Memory Usage")
            self.assertGreater(metric_value, 100000)
        else:
            self.fail("Unknown metric {0}/{1} value {2}".format(metric_family, metric_name, metric_value))

    my_limits = limits[test_extension_name]
    self.assertIsInstance(my_limits, CGroupsLimits, msg="is not the correct instance")
    self.assertGreater(my_limits.cpu_limit, 0.0)
    self.assertGreater(my_limits.memory_limit, 0.0)

def disable(self, reason):
    self._cgroups_enabled = False

    message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason)
    logger.info(message)  # log as INFO for now, in the future it should be logged as WARNING
    add_event(op=WALAEventOperation.CGroupsDisabled, message=message, is_success=False, log_event=False)

    self.__reset_cpu_quota()
    CGroupsTelemetry.reset()

def track_cgroups(extension_cgroups):
    try:
        for cgroup in extension_cgroups:
            CGroupsTelemetry.track_cgroup(cgroup)
    except Exception as e:
        logger.warn("Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. "
                    "Error: {1}".format(cgroup.path, ustr(e)))

def __impl():
    cgroups = self._cgroups_api.create_agent_cgroups()

    if track_cgroups:
        for cgroup in cgroups:
            CGroupsTelemetry.track_cgroup(cgroup)

    return cgroups

def _track_new_extension_cgroups(num_extensions):
    for i in range(num_extensions):
        dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i))
        CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup)

        dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", "dummy_extension_{0}".format(i))
        CGroupsTelemetry.track_cgroup(dummy_memory_cgroup)

def test_cgroup_is_tracked(self, *args):
    num_extensions = 5
    self._track_new_extension_cgroups(num_extensions)
    self._assert_cgroups_are_tracked(num_extensions)
    self.assertFalse(CGroupsTelemetry.is_tracked("not_present_cpu_dummy_path"))
    self.assertFalse(CGroupsTelemetry.is_tracked("not_present_memory_dummy_path"))

def test_start_extension_command_should_disable_cgroups_and_invoke_the_command_directly_if_systemd_times_out(self, _):
    with self._get_cgroup_configurator() as configurator:
        # Systemd has its own internal timeout, which is shorter than the timeout we define for extension
        # operations. When systemd times out, it writes a message to stderr and exits with exit code 1.
        # In that case we recognize the failure by the non-zero exit code, not as a timeout.
        configurator.mocks.add_command(MockCommand(
            "systemd-run",
            return_value=1,
            stdout='',
            stderr='Failed to start transient scope unit: Connection timed out'))

        with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout:
            with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr:
                with patch("subprocess.Popen", wraps=subprocess.Popen) as popen_patch:
                    CGroupsTelemetry.reset()

                    configurator.start_extension_command(
                        extension_name="Microsoft.Compute.TestExtension-1.2.3",
                        command="echo 'success'",
                        timeout=300,
                        shell=True,
                        cwd=self.tmp_dir,
                        env={},
                        stdout=stdout,
                        stderr=stderr)

                    self.assertFalse(configurator.enabled(), "Cgroups should have been disabled")

                    extension_calls = [args[0] for (args, _) in popen_patch.call_args_list
                                       if "echo 'success'" in args[0]]
                    self.assertEqual(2, len(extension_calls),
                                     "The extension should have been called twice. Got: {0}".format(extension_calls))
                    self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0],
                                  "The first call to the extension should have used systemd")
                    self.assertNotIn("systemd-run", extension_calls[1],
                                     "The second call to the extension should not have used systemd")

                    self.assertEqual(len(CGroupsTelemetry._tracked), 0, "No cgroups should have been created")

def setUp(self):
    AgentTestCase.setUp(self)
    event.init_event_logger(os.path.join(self.tmp_dir, EVENTS_DIRECTORY))
    CGroupsTelemetry.reset()
    clear_singleton_instances(ProtocolUtil)
    protocol = WireProtocol('endpoint')
    protocol.update_goal_state = MagicMock()
    self.get_protocol = patch('azurelinuxagent.common.protocol.util.ProtocolUtil.get_protocol', return_value=protocol)
    self.get_protocol.start()

def disable_cgroups(exception):
    # Nested helper: 'self' and 'message' are captured from the enclosing scope
    self.disable()
    CGroupsTelemetry.reset()
    add_event(
        AGENT_NAME,
        version=CURRENT_VERSION,
        op=WALAEventOperation.CGroupsCleanUp,
        is_success=False,
        log_event=False,
        message='{0} {1}'.format(message, ustr(exception)))

def poll_telemetry_metrics(self):
    time_now = datetime.datetime.utcnow()
    if not self.last_cgroup_polling_telemetry:
        self.last_cgroup_polling_telemetry = time_now

    if time_now >= (self.last_cgroup_polling_telemetry + MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD):
        CGroupsTelemetry.poll_all_tracked()
        self.last_cgroup_polling_telemetry = time_now

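# Note on the polling gate above: CGROUP_TELEMETRY_POLLING_PERIOD must be a
# datetime.timedelta for the comparison to work, e.g. (hypothetical value):
#
#     CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5)
#
# so poll_all_tracked() runs at most once per period regardless of how often
# the monitor thread invokes poll_telemetry_metrics().
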
def test_start_extension_command_should_invoke_the_command_directly_if_systemd_times_out(self, _):
    # Systemd has its own internal timeout, which is shorter than the timeout we define for extension
    # operations. When systemd times out, it writes a message to stderr and exits with exit code 1.
    # In that case we recognize the failure by the non-zero exit code, not as a timeout.
    original_popen = subprocess.Popen
    systemd_timeout_command = "echo 'Failed to start transient scope unit: Connection timed out' >&2 && exit 1"

    def mock_popen(*args, **kwargs):
        # If trying to invoke systemd, mock what would happen if systemd timed out internally:
        # write the failure to stderr and exit with exit code 1.
        new_args = args
        if "systemd-run" in args[0]:
            new_args = (systemd_timeout_command,)

        return original_popen(new_args, **kwargs)

    expected_output = "[stdout]\n{0}\n\n\n[stderr]\n"

    with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout:
        with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr:
            with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", side_effect=mock_popen) as popen_patch:
                CGroupsTelemetry.reset()

                SystemdCgroupsApi().start_extension_command(
                    extension_name="Microsoft.Compute.TestExtension-1.2.3",
                    command="echo 'success'",
                    timeout=300,
                    shell=True,
                    cwd=self.tmp_dir,
                    env={},
                    stdout=stdout,
                    stderr=stderr)

                extension_calls = [args[0] for (args, _) in popen_patch.call_args_list
                                   if "echo 'success'" in args[0]]

                self.assertEqual(2, len(extension_calls), "The extension should have been invoked exactly twice")
                self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0],
                              "The first call to the extension should have used systemd")
                self.assertEqual("echo 'success'", extension_calls[1],
                                 "The second call to the extension should not have used systemd")

                self.assertEqual(len(CGroupsTelemetry._tracked), 0, "No cgroups should have been created")

def __reset_cpu_quota():
    """
    Removes any CPUQuota on the agent.

    NOTE: This resets the quota on the agent's default drop-in file; any local overrides
          on the VM will take precedence over this setting.
    """
    logger.info("Resetting agent's CPUQuota")
    if CGroupConfigurator._Impl.__try_set_cpu_quota(''):  # setting an empty value resets to the default (infinity)
        CGroupsTelemetry.set_track_throttled_time(False)

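# Illustrative note (not part of the agent's code): per systemd.resource-control(5),
# assigning the empty string to CPUQuota= unsets any previously configured quota.
# The drop-in written by __try_set_cpu_quota('') presumably reduces to something like:
#
#     [Service]
#     CPUQuota=
#
# followed by a daemon-reload so systemd picks up the change.
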
def test_cleanup_legacy_cgroups_should_disable_cgroups_when_it_fails_to_process_legacy_cgroups(self):
    # Set up a mock /var/run/waagent.pid file
    daemon_pid = "42"
    daemon_pid_file = os.path.join(self.tmp_dir, "waagent.pid")
    fileutil.write_file(daemon_pid_file, daemon_pid + "\n")

    # Set up old controller cgroups and add the daemon PID to them
    CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "cpu", daemon_pid)
    CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "memory", daemon_pid)

    # Set up new controller cgroups and add the extension handler's PID to them
    CGroupsTools.create_agent_cgroup(self.cgroups_file_system_root, "cpu", "999")
    CGroupsTools.create_agent_cgroup(self.cgroups_file_system_root, "memory", "999")

    def mock_append_file(filepath, contents, **kwargs):
        if re.match(r'/.*/cpu/.*/cgroup.procs', filepath):
            raise OSError(errno.ENOSPC, os.strerror(errno.ENOSPC))
        fileutil.append_file(filepath, contents, **kwargs)

    # Start tracking a couple of dummy cgroups
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "cpu"))
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "memory"))

    cgroup_configurator = CGroupConfigurator.get_instance()

    with patch("azurelinuxagent.common.cgroupconfigurator.add_event") as mock_add_event:
        with patch("azurelinuxagent.common.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file):
            with patch("azurelinuxagent.common.cgroupapi.fileutil.append_file", side_effect=mock_append_file):
                cgroup_configurator.cleanup_legacy_cgroups()

    self.assertEqual(len(mock_add_event.call_args_list), 1)
    _, kwargs = mock_add_event.call_args_list[0]
    self.assertEqual(kwargs['op'], 'CGroupsCleanUp')
    self.assertFalse(kwargs['is_success'])
    self.assertEqual(
        kwargs['message'],
        'Failed to process legacy cgroups. Collection of resource usage data will be disabled. [Errno 28] No space left on device')

    self.assertFalse(cgroup_configurator.enabled())
    self.assertEqual(len(CGroupsTelemetry._tracked), 0)

def test_disable_should_reset_tracked_cgroups(self):
    configurator = CGroupConfigurator.get_instance()

    # Start tracking a couple of dummy cgroups
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "cpu"))
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "memory"))

    configurator.disable()

    self.assertFalse(configurator.enabled())
    self.assertEqual(len(CGroupsTelemetry._tracked), 0)

def __set_cpu_quota(quota):
    """
    Sets the agent's CPU quota to the given percentage (100% == 1 CPU).

    NOTE: This is done using a drop-in file in the default drop-in directory; any local
          overrides on the VM will take precedence over this setting.
    """
    quota_percentage = "{0}%".format(quota)
    _log_cgroup_info("Ensuring the agent's CPUQuota is {0}", quota_percentage)
    if CGroupConfigurator._Impl.__try_set_cpu_quota(quota_percentage):
        CGroupsTelemetry.set_track_throttled_time(True)

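# Illustrative example (hypothetical value): __set_cpu_quota(20) formats the quota as "20%",
# which presumably lands in a systemd drop-in along the lines of:
#
#     [Service]
#     CPUQuota=20%
#
# With CPUQuota=20%, systemd throttles the unit to 20% of one CPU; throttled-time tracking
# is then enabled so telemetry can report how often the agent hits that limit.
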
def test_telemetry_polling_to_generate_transient_logs_index_error(self):
    num_extensions = 1
    self._track_new_extension_cgroups(num_extensions)

    # Generating a different kind of error (non-IOError) to check the logging.
    # Trying to invoke IndexError during the getParameter call
    with patch("azurelinuxagent.common.utils.fileutil.read_file", return_value=''):
        with patch("azurelinuxagent.common.logger.periodic_warn") as patch_periodic_warn:
            expected_call_count = 2  # 1 periodic warning for the cpu cgroups, and 1 for memory
            for data_count in range(1, 10):
                CGroupsTelemetry.poll_all_tracked()
                self.assertEqual(expected_call_count, patch_periodic_warn.call_count)

def stop_tracking_unit_cgroups(self, unit_name):
    """
    TODO: remove Memory cgroups from tracked list.
    """
    try:
        cpu_cgroup_path, _ = self._cgroups_api.get_unit_cgroup_paths(unit_name)

        if cpu_cgroup_path is not None:
            CGroupsTelemetry.stop_tracking(CpuCgroup(unit_name, cpu_cgroup_path))
    except Exception as exception:
        logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception))

def test_process_cgroup_metric_with_no_cpu_cgroup_mounted(self, *args):
    num_extensions = 5
    self._track_new_extension_cgroups(num_extensions)

    with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage:
        with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage:
            with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active:
                patch_is_active.return_value = True

                current_memory = 209715200  # example: 200 MB
                current_max_memory = 471859200  # example: 450 MB

                patch_get_memory_usage.return_value = current_memory
                patch_get_memory_max_usage.return_value = current_max_memory

                num_polls = 10
                for data_count in range(1, num_polls + 1):
                    metrics = CGroupsTelemetry.poll_all_tracked()

                    self.assertEqual(len(CGroupsTelemetry._cgroup_metrics), num_extensions)
                    self._assert_calculated_resource_metrics_equal(
                        cpu_usage=[],
                        memory_usage=[current_memory] * data_count,
                        max_memory_usage=[current_max_memory] * data_count,
                        memory_statm_memory_usage=[TestCGroupsTelemetry.TestProcStatmMemoryUsed] * data_count,
                        proc_ids=TestCGroupsTelemetry.TestProcessIds)
                    # Only memory is populated, CPU is not; thus 5 metrics per cgroup.
                    self.assertEqual(len(metrics), num_extensions * 5)
                    self._assert_polled_metrics_equal(metrics, 0, current_memory, current_max_memory,
                                                      TestCGroupsTelemetry.TestProcStatmMemoryUsed)

                collected_metrics = CGroupsTelemetry.report_all_tracked()
                self._assert_extension_metrics_data(
                    collected_metrics, num_extensions,
                    [],
                    [TestCGroupsTelemetry.TestProcStatmMemoryUsed] * num_polls,
                    [current_memory] * num_polls,
                    [current_max_memory] * num_polls,
                    is_cpu_present=False)

                self.assertEqual(len(CGroupsTelemetry._cgroup_metrics), num_extensions)
                self._assert_calculated_resource_metrics_equal([], [], [], [], [])

def test_disable_should_reset_tracked_cgroups(self):
    # Start tracking a couple of dummy cgroups
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "cpu"))
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "memory"))

    CGroupConfiguratorSystemdTestCase._get_new_cgroup_configurator_instance().disable()

    self.assertEqual(len(CGroupsTelemetry._tracked), 0)

def test_cgroup_pruning(self, *args):
    num_extensions = 5
    num_controllers = 2
    self._track_new_extension_cgroups(num_extensions)
    self._assert_cgroups_are_tracked(num_extensions)
    self.assertEqual(num_extensions * num_controllers, len(CGroupsTelemetry._tracked))

    CGroupsTelemetry.prune_all_tracked()

    for i in range(num_extensions):
        self.assertFalse(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i)))
        self.assertFalse(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i)))

    self.assertEqual(0, len(CGroupsTelemetry._tracked))

def start_tracking_unit_cgroups(self, unit_name):
    """
    TODO: Start tracking Memory Cgroups
    """
    try:
        cpu_cgroup_path, _ = self._cgroups_api.get_unit_cgroup_paths(unit_name)

        if cpu_cgroup_path is None:
            logger.info("The CPU controller is not mounted; will not track resource usage")
        else:
            CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, cpu_cgroup_path))
    except Exception as exception:
        logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(exception))

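# Illustrative usage (hypothetical unit name): start_tracking_unit_cgroups and its counterpart
# stop_tracking_unit_cgroups above bracket the lifetime of an extension's systemd service:
#
#     configurator = CGroupConfigurator.get_instance()
#     configurator.start_tracking_unit_cgroups("Microsoft.Compute.TestExtension.service")
#     # ... while the service runs, poll_all_tracked() picks up its CPU cgroup ...
#     configurator.stop_tracking_unit_cgroups("Microsoft.Compute.TestExtension.service")
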
def test_telemetry_polling_to_not_generate_transient_logs_ioerror_file_not_found(self, patch_periodic_warn):
    num_extensions = 1
    self._track_new_extension_cgroups(num_extensions)
    self.assertEqual(0, patch_periodic_warn.call_count)

    # Not expecting logs present for io_error with errno=errno.ENOENT
    io_error_2 = IOError()
    io_error_2.errno = errno.ENOENT

    with patch("azurelinuxagent.common.utils.fileutil.read_file", side_effect=io_error_2):
        poll_count = 1
        for data_count in range(poll_count, 10):
            CGroupsTelemetry.poll_all_tracked()
            self.assertEqual(0, patch_periodic_warn.call_count)

def test_process_cgroup_metric_with_no_memory_cgroup_mounted(self, *args):
    num_extensions = 5
    for i in range(num_extensions):
        dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i))
        CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup)

        dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", "dummy_extension_{0}".format(i))
        CGroupsTelemetry.track_cgroup(dummy_memory_cgroup)

    with patch("azurelinuxagent.common.cgroup.CpuCgroup._get_cpu_percent") as patch_get_cpu_percent:
        with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage:
            with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active:
                patch_is_active.return_value = True
                patch_get_memory_usage.side_effect = Exception("File not found")

                current_cpu = 30
                patch_get_cpu_percent.return_value = current_cpu

                poll_count = 1
                for data_count in range(poll_count, 10):
                    CGroupsTelemetry.poll_all_tracked()

                    self.assertEqual(len(CGroupsTelemetry._cgroup_metrics), num_extensions)
                    self._assert_cgroup_metrics_equal(cpu_usage=[current_cpu] * data_count,
                                                      memory_usage=[], max_memory_usage=[])

                CGroupsTelemetry.report_all_tracked()

                self.assertEqual(len(CGroupsTelemetry._cgroup_metrics), num_extensions)
                self._assert_cgroup_metrics_equal([], [], [])

def test_telemetry_polling_with_inactive_cgroups(self, *_):
    num_extensions = 5
    no_extensions_expected = 0

    self._track_new_extension_cgroups(num_extensions)
    self._assert_cgroups_are_tracked(num_extensions)

    metrics = CGroupsTelemetry.poll_all_tracked()

    for i in range(num_extensions):
        self.assertFalse(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i)))
        self.assertFalse(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i)))

    self.assertEqual(len(metrics), 0)

def test_enable_should_not_track_throttled_time_when_setting_the_cpu_quota_fails(self):
    with self._get_cgroup_configurator(enable=False) as configurator:
        if CGroupsTelemetry.get_track_throttled_time():
            raise Exception("Test setup should not start tracking Throttle Time")

        configurator.mocks.add_file(UnitFilePaths.cpu_quota, Exception("A TEST EXCEPTION"))

        configurator.enable()

        self.assertFalse(CGroupsTelemetry.get_track_throttled_time(), "Throttle time should not be tracked")

def initialize(self):
    try:
        if self._initialized:
            return

        # check whether cgroup monitoring is supported on the current distro
        self._cgroups_supported = CGroupsApi.cgroups_supported()
        if not self._cgroups_supported:
            logger.info("Cgroup monitoring is not supported on {0}", get_distro())
            return

        # check that systemd is detected correctly
        self._cgroups_api = SystemdCgroupsApi()
        if not systemd.is_systemd():
            _log_cgroup_warning("systemd was not detected on {0}", get_distro())
            return

        _log_cgroup_info("systemd version: {0}", systemd.get_version())

        # This is temporarily disabled while we analyze telemetry. Likely it will be removed.
        # self.__collect_azure_unit_telemetry()
        # self.__collect_agent_unit_files_telemetry()

        if not self.__check_no_legacy_cgroups():
            return

        agent_unit_name = systemd.get_agent_unit_name()
        agent_slice = systemd.get_unit_property(agent_unit_name, "Slice")
        if agent_slice not in (_AZURE_SLICE, "system.slice"):
            _log_cgroup_warning("The agent is within an unexpected slice: {0}", agent_slice)
            return

        self.__setup_azure_slice()

        cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers()
        self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroups(
            agent_slice, cpu_controller_root, memory_controller_root)

        if self._agent_cpu_cgroup_path is not None:
            _log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path)
            self.enable()
            CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))

        _log_cgroup_info('Cgroups enabled: {0}', self._cgroups_enabled)

    except Exception as exception:
        _log_cgroup_warning("Error initializing cgroups: {0}", ustr(exception))
    finally:
        self._initialized = True

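# Illustrative note (assumption, for reference only): systemd.get_unit_property(unit, "Slice")
# in the initialization above presumably wraps a query equivalent to:
#
#     systemctl show walinuxagent.service --property Slice
#     # => Slice=system.slice   (or Slice=azure.slice once the agent has been moved)
#
# which is how the agent decides whether it is running in an expected slice before
# setting up the azure.slice hierarchy. The agent unit name varies by distro, so
# "walinuxagent.service" here is only an example.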