def test_get_current_cpu_total_exception_handling(self):
    """_get_current_cpu_total propagates IOError for a bogus path and raises CGroupsException for a malformed cgroup path."""
    cgroup = CpuCgroup("test_extension", "dummy_path")
    self.assertRaises(IOError, cgroup._get_current_cpu_total)
    # Trying to raise ERRNO 20: the cgroup path points at the cpuacct.stat file itself, not a directory.
    cgroup = CpuCgroup("test_extension", os.path.join(data_dir, "cgroups", "cpu_mount", "cpuacct.stat"))
    self.assertRaises(CGroupsException, cgroup._get_current_cpu_total)
def test_is_active(self):
    """A cgroup becomes active once its 'tasks' file lists at least one pid."""
    cgroup = CpuCgroup("test_extension", self.tmp_dir)
    self.assertEqual(False, cgroup.is_active())
    # Write a pid into the tasks file to simulate a tracked process.
    with open(os.path.join(self.tmp_dir, "tasks"), mode="wb") as tasks_file:
        tasks_file.write(str(1000).encode())
    self.assertEqual(True, cgroup.is_active())
def test_is_active_file_not_present(self, patch_periodic_warn):
    """is_active returns False quietly (no periodic warning) when the tasks file or the cgroup directory is missing."""
    cpu_cgroup = CpuCgroup("test_extension", self.tmp_dir)
    self.assertEqual(False, cpu_cgroup.is_active())
    memory_cgroup = MemoryCgroup(
        "test_extension",
        os.path.join(self.tmp_dir, "this_cgroup_does_not_exist"))
    self.assertEqual(False, memory_cgroup.is_active())
    self.assertEqual(0, patch_periodic_warn.call_count)
def test_get_cpu_usage(self, patch_get_proc_stat, *args):
    """get_cpu_usage reports the CPU percentage consumed since the cgroup was created."""
    cgroups_data_dir = os.path.join(data_dir, "cgroups")
    patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(cgroups_data_dir, "dummy_proc_stat"))
    cgroup = CpuCgroup("test_extension", os.path.join(cgroups_data_dir, "cpu_mount"))
    # Mocking CPU consumption: advance /proc/stat to a later snapshot.
    patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(cgroups_data_dir, "dummy_proc_stat_updated"))
    self.assertEqual(5.114, cgroup.get_cpu_usage())
def test_collect(self, patch_get_proc_stat, *args):
    """collect() returns a '% Processor Time' metric for the 'cpu' controller."""
    cgroups_data_dir = os.path.join(data_dir, "cgroups")
    patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(cgroups_data_dir, "dummy_proc_stat"))
    cgroup = CpuCgroup("test_extension", os.path.join(cgroups_data_dir, "cpu_mount"))
    # Mocking CPU consumption: advance /proc/stat to a later snapshot.
    patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(cgroups_data_dir, "dummy_proc_stat_updated"))
    metric = cgroup.collect()[0]
    self.assertEqual("cpu", metric.controller)
    self.assertEqual("% Processor Time", metric.metric_name)
    self.assertEqual(5.114, metric.value)
def test_initialize_cpu_usage_should_set_current_cpu_usage(self):
    """initialize_cpu_usage snapshots both the cgroup's and the overall system's CPU counters."""
    cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")
    # Redirect the files the cgroup reads to the t0 fixture snapshots.
    TestCpuCgroup.mock_read_file_map = {
        "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"),
        os.path.join(cgroup.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t0"),
    }
    cgroup.initialize_cpu_usage()
    self.assertEqual(cgroup._current_cgroup_cpu, 63763)  # pylint: disable=protected-access
    self.assertEqual(cgroup._current_system_cpu, 5496872)  # pylint: disable=protected-access
def test_get_throttled_time_should_return_the_value_since_its_last_invocation(self):
    """get_throttled_time reports the delta (in seconds) of cpu.stat's throttled_time since initialization."""
    cpu_stat_file = os.path.join(self.tmp_dir, "cpu.stat")
    shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t0"), cpu_stat_file)  # throttled_time = 50
    cgroup = CpuCgroup("test", self.tmp_dir)
    cgroup.initialize_cpu_usage()
    shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t1"), cpu_stat_file)  # throttled_time = 2075541442327
    # The counters are in nanoseconds; the delta is converted to seconds.
    expected = float(2075541442327 - 50) / 1E9
    self.assertEqual(cgroup.get_throttled_time(), expected, "The value of throttled_time is incorrect")
def test_initialie_cpu_usage_should_set_the_cgroup_usage_to_0_when_the_cgroup_does_not_exist(self):
    """When the cgroup files are missing (ENOENT), the cgroup CPU counter starts at 0 while the system counter is still read."""
    # NOTE(review): "initialie" in the method name looks like a typo for "initialize";
    # kept as-is to preserve the public test identifier.
    cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")
    enoent_error = IOError()
    enoent_error.errno = errno.ENOENT  # "No such file or directory"
    TestCpuCgroup.mock_read_file_map = {
        "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"),
        os.path.join(cgroup.path, "cpuacct.stat"): enoent_error,
    }
    cgroup.initialize_cpu_usage()
    self.assertEqual(cgroup._current_cgroup_cpu, 0)  # pylint: disable=protected-access
    self.assertEqual(cgroup._current_system_cpu, 5496872)  # check the system usage just for test sanity # pylint: disable=protected-access
def test_generate_extension_metrics_telemetry_dictionary(self, *args):  # pylint: disable=unused-argument
    # Verifies that CGroupsTelemetry tracks one CPU and one memory cgroup per extension
    # and that poll_all_tracked() consumes the mocked per-poll metric values.
    num_polls = 10
    num_extensions = 1
    cpu_percent_values = [random.randint(0, 100) for _ in range(num_polls)]
    # only verifying calculations and not validity of the values.
    memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)]
    max_memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)]
    # no need to initialize the CPU usage, since we mock get_cpu_usage() below
    with patch("azurelinuxagent.common.cgroup.CpuCgroup.initialize_cpu_usage"):
        for i in range(num_extensions):
            dummy_cpu_cgroup = CpuCgroup("dummy_extension_{0}".format(i), "dummy_cpu_path_{0}".format(i))
            CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup)
            dummy_memory_cgroup = MemoryCgroup("dummy_extension_{0}".format(i), "dummy_memory_path_{0}".format(i))
            CGroupsTelemetry.track_cgroup(dummy_memory_cgroup)
    # One CPU plus one memory cgroup should be tracked per extension.
    self.assertEqual(2 * num_extensions, len(CGroupsTelemetry._tracked))
    with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage:
        with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage:
            with patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage:
                with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active:
                    for i in range(num_polls):
                        patch_is_active.return_value = True
                        patch_get_cpu_usage.return_value = cpu_percent_values[i]
                        patch_get_memory_usage.return_value = memory_usage_values[i]  # example 200 MB
                        patch_get_memory_max_usage.return_value = max_memory_usage_values[i]  # example 450 MB
                        CGroupsTelemetry.poll_all_tracked()
def test_telemetry_in_place_leaf_cgroup(self): """ Ensure this leaf (i.e. not root of cgroup tree) cgroup has distinct metrics from the root cgroup. """ # Does nothing on systems where the default cgroup for a randomly-created process (like this test invocation) # is the root cgroup. cg = make_self_cgroups() root = make_root_cgroups() if cg.cgroups['cpu'] != root.cgroups['cpu']: ct = CGroupsTelemetry("test", cg) cpu = CpuCgroup(ct) self.assertLess(cpu._current_cpu_total, cpu._current_system_cpu) consume_cpu_time() # Eat some CPU time.sleep(1) # Generate some idle time cpu._update_cpu_data() self.assertLess(cpu._current_cpu_total, cpu._current_system_cpu)
def _track_new_extension_cgroups(num_extensions):
    """Register one CPU and one memory cgroup with CGroupsTelemetry for each dummy extension."""
    for index in range(num_extensions):
        CGroupsTelemetry.track_cgroup(
            CpuCgroup("dummy_extension_{0}".format(index), "dummy_cpu_path_{0}".format(index)))
        CGroupsTelemetry.track_cgroup(
            MemoryCgroup("dummy_extension_{0}".format(index), "dummy_memory_path_{0}".format(index)))
def test_telemetry_inplace(self):
    """
    Test raw measures and basic statistics for the cgroup in which this process is currently running.
    """
    cg = make_self_cgroups()
    self.assertIn('cpu', cg.cgroups)
    self.assertIn('memory', cg.cgroups)
    telemetry = CGroupsTelemetry("test", cg)
    cpu = CpuCgroup(telemetry)
    self.assertGreater(cpu._current_system_cpu, 0)
    consume_cpu_time()  # Eat some CPU
    cpu._update_cpu_data()
    # After an update, both the cgroup and system counters must have advanced.
    self.assertGreater(cpu._current_cpu_total, cpu._previous_cpu_total)
    self.assertGreater(cpu._current_system_cpu, cpu._previous_system_cpu)
    percent_used = cpu._get_cpu_percent()
    self.assertGreater(percent_used, 0)
def test_cpu_cgroup_create(self, patch_get_proc_stat):
    """A freshly created CpuCgroup snapshots system CPU from /proc/stat and zeroes its own counters."""
    patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(data_dir, "cgroups", "dummy_proc_stat"))
    cgroup = CpuCgroup("test_extension", "dummy_path")
    self.assertEqual(398488, cgroup._current_system_cpu)
    # All cgroup-side counters start at zero until the first update.
    self.assertEqual(0, cgroup._current_cpu_total)
    self.assertEqual(0, cgroup._previous_cpu_total)
    self.assertEqual(0, cgroup._previous_system_cpu)
    self.assertEqual("cpu", cgroup.controller)
def test_send_extension_metrics_telemetry_handling_cpu_cgroup_exceptions_errno2(self, patch_periodic_warn,  # pylint: disable=unused-argument
                                                                                patch_cpu_usage, patch_add_metric, *args):
    """An ENOENT (errno 2) from get_cpu_usage is tolerated: no warning is logged and no metric is emitted."""
    enoent = IOError()
    enoent.errno = 2
    patch_cpu_usage.side_effect = enoent
    CGroupsTelemetry._tracked.append(CpuCgroup("cgroup_name", "/test/path"))
    PollResourceUsage().run()
    self.assertEqual(0, patch_periodic_warn.call_count)
    self.assertEqual(0, patch_add_metric.call_count)  # No metrics should be sent.
def stop_tracking_unit_cgroups(self, unit_name):
    """
    Stop tracking the CPU cgroup of the given systemd unit.

    TODO: remove Memory cgroups from tracked list.
    """
    try:
        unit_cpu_path, _ = self._cgroups_api.get_unit_cgroup_paths(unit_name)
        if unit_cpu_path is None:
            return  # no CPU controller for this unit; nothing to untrack
        CGroupsTelemetry.stop_tracking(CpuCgroup(unit_name, unit_cpu_path))
    except Exception as exception:
        # Best effort: tracking cleanup must never break the caller.
        logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception))
def test_cpu_telemetry(self):
    """ Test Cpu telemetry class """
    cg = make_self_cgroups()
    self.assertIn('cpu', cg.cgroups)
    ct = CGroupsTelemetry('test', cg)
    self.assertIs(cg, ct.cgroup)
    cpu = CpuCgroup(ct)
    self.assertIs(cg, cpu.cgt.cgroup)
    ticks_before = cpu._current_cpu_total
    consume_cpu_time()  # burn CPU so the tick counter moves
    time.sleep(1)  # let some wall-clock time pass as well
    cpu._update_cpu_data()
    ticks_after = cpu._current_cpu_total
    self.assertGreater(ticks_after, ticks_before)
    p2 = cpu._get_cpu_percent()
    self.assertGreater(p2, 0)
    # when running under PyCharm, this is often > 100
    # on a multi-core machine
    self.assertLess(p2, 200)
def test_get_cpu_usage_should_return_the_cpu_usage_since_its_last_invocation(self):
    """Each get_cpu_usage call reports usage relative to the previous snapshot, not the initial one."""
    cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")
    cpuacct_path = os.path.join(cgroup.path, "cpuacct.stat")

    def use_snapshot(suffix):
        # Redirect /proc/stat and cpuacct.stat reads to the fixture files for time 'suffix'.
        TestCpuCgroup.mock_read_file_map = {
            "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_" + suffix),
            cpuacct_path: os.path.join(data_dir, "cgroups", "cpuacct.stat_" + suffix),
        }

    use_snapshot("t0")
    cgroup.initialize_cpu_usage()
    use_snapshot("t1")
    self.assertEqual(cgroup.get_cpu_usage(), 0.031)
    use_snapshot("t2")
    self.assertEqual(cgroup.get_cpu_usage(), 0.045)
def start_tracking_unit_cgroups(self, unit_name):
    """
    Start tracking the CPU cgroup of the given systemd unit.

    TODO: Start tracking Memory Cgroups
    """
    try:
        unit_cpu_path, _ = self._cgroups_api.get_unit_cgroup_paths(unit_name)
        if unit_cpu_path is None:
            logger.info("The CPU controller is not mounted; will not track resource usage")
            return
        CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, unit_cpu_path))
    except Exception as exception:
        # Best effort: failing to set up tracking must never break the caller.
        logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(exception))
def initialize(self):
    """One-time setup: verify cgroup/systemd support, validate the agent's slice, and start tracking the agent's CPU cgroup."""
    try:
        if self._initialized:
            return
        # check whether cgroup monitoring is supported on the current distro
        self._cgroups_supported = CGroupsApi.cgroups_supported()
        if not self._cgroups_supported:
            logger.info("Cgroup monitoring is not supported on {0}", get_distro())
            return
        # check that systemd is detected correctly
        self._cgroups_api = SystemdCgroupsApi()
        if not systemd.is_systemd():
            _log_cgroup_warning("systemd was not detected on {0}", get_distro())
            return
        _log_cgroup_info("systemd version: {0}", systemd.get_version())
        # This is temporarily disabled while we analyze telemetry. Likely it will be removed.
        # self.__collect_azure_unit_telemetry()
        # self.__collect_agent_unit_files_telemetry()
        if not self.__check_no_legacy_cgroups():
            return
        agent_unit_name = systemd.get_agent_unit_name()
        agent_slice = systemd.get_unit_property(agent_unit_name, "Slice")
        # Only the known slices are supported; anything else means an unexpected deployment.
        if agent_slice not in (_AZURE_SLICE, "system.slice"):
            _log_cgroup_warning("The agent is within an unexpected slice: {0}", agent_slice)
            return
        self.__setup_azure_slice()
        cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers()
        self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroups(agent_slice, cpu_controller_root, memory_controller_root)
        # Enforcement/tracking is enabled only when the agent's CPU cgroup was resolved.
        if self._agent_cpu_cgroup_path is not None:
            _log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path)
            self.enable()
            CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))
        _log_cgroup_info('Cgroups enabled: {0}', self._cgroups_enabled)
    except Exception as exception:
        _log_cgroup_warning("Error initializing cgroups: {0}", ustr(exception))
    finally:
        # Mark initialized even on failure so initialization is attempted only once.
        self._initialized = True
def disable(self, reason, disableCgroups):
    """Disable cgroup enforcement for the agent, the extensions, or both, then log and telemeter the reason."""
    # Todo: disable/reset extension when ext quotas introduced
    if disableCgroups == DisableCgroups.ALL:  # disable all
        self._agent_cgroups_enabled = False
        self._extensions_cgroups_enabled = False
        self.__reset_agent_cpu_quota()
        # Dropping everything, so also clear all telemetry tracking.
        CGroupsTelemetry.reset()
    elif disableCgroups == DisableCgroups.AGENT:  # disable agent
        self._agent_cgroups_enabled = False
        self.__reset_agent_cpu_quota()
        # Only the agent's own CPU cgroup stops being tracked.
        CGroupsTelemetry.stop_tracking(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))
    elif disableCgroups == DisableCgroups.EXTENSIONS:  # disable extensions
        self._extensions_cgroups_enabled = False
    message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason)
    logger.info(message)  # log as INFO for now, in the future it should be logged as WARNING
    add_event(op=WALAEventOperation.CGroupsDisabled, message=message, is_success=False, log_event=False)
def stop_tracking_extension_cgroups(self, extension_name):
    """
    Stop tracking the extension's CPU cgroup.

    TODO: remove extension Memory cgroups from tracked list
    """
    try:
        extension_slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name)
        cgroup_relative_path = os.path.join(_AZURE_VMEXTENSIONS_SLICE, extension_slice_name)
        cpu_cgroup_mountpoint, _ = self._cgroups_api.get_cgroup_mount_points()
        # Fix: os.path.join never returns None, so the original check on the joined path was
        # ineffective — if the CPU controller is not mounted, the mount point is what is None
        # (and joining None would raise TypeError). Check the mount point instead.
        if cpu_cgroup_mountpoint is not None:
            cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)
            CGroupsTelemetry.stop_tracking(CpuCgroup(extension_name, cpu_cgroup_path))
    except Exception as exception:
        # Best effort: tracking cleanup must never break the caller.
        logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception))
def test_initialize_cpu_usage_should_raise_an_exception_when_called_more_than_once(self):
    """A second initialize_cpu_usage() call must fail with CGroupsException."""
    cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")
    # Redirect the files the cgroup reads to the t0 fixture snapshots.
    TestCpuCgroup.mock_read_file_map = {
        "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"),
        os.path.join(cgroup.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t0"),
    }
    cgroup.initialize_cpu_usage()
    with self.assertRaises(CGroupsException):
        cgroup.initialize_cpu_usage()
def test_send_extension_metrics_telemetry_handling_cpu_cgroup_exceptions_errno2(
        self, patch_periodic_warn, patch_cpu_usage, patch_add_metric, *args):
    """An ENOENT (errno 2) from get_cpu_usage is tolerated by the monitor handler: no warnings, no metrics."""
    enoent = IOError()
    enoent.errno = 2
    patch_cpu_usage.side_effect = enoent
    CGroupsTelemetry._tracked.append(CpuCgroup("cgroup_name", "/test/path"))
    monitor_handler = get_monitor_handler()
    monitor_handler.init_protocols()
    # Backdate the last polling/reporting timestamps so the handler acts immediately.
    monitor_handler.last_cgroup_polling_telemetry = datetime.datetime.utcnow() - timedelta(hours=1)
    monitor_handler.last_cgroup_report_telemetry = datetime.datetime.utcnow() - timedelta(hours=1)
    monitor_handler.poll_telemetry_metrics()
    self.assertEqual(0, patch_periodic_warn.call_count)
    self.assertEqual(0, patch_add_metric.call_count)  # No metrics should be sent.
    monitor_handler.stop()
def test_get_cpu_usage_should_return_the_cpu_usage_since_its_last_invocation(self):
    """CPU usage is the per-interval delta, scaled by 100 and the number of processor cores."""
    osutil = get_osutil()
    cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")
    cpuacct_path = os.path.join(cgroup.path, "cpuacct.stat")

    def use_snapshot(suffix):
        # Redirect /proc/stat and cpuacct.stat reads to the fixture files for time 'suffix'.
        TestCpuCgroup.mock_read_file_map = {
            "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_" + suffix),
            cpuacct_path: os.path.join(data_dir, "cgroups", "cpuacct.stat_" + suffix),
        }

    use_snapshot("t0")
    cgroup.initialize_cpu_usage()
    use_snapshot("t1")
    self.assertEqual(
        cgroup.get_cpu_usage(),
        round(100.0 * 0.000307697876885 * osutil.get_processor_cores(), 3))
    use_snapshot("t2")
    self.assertEqual(
        cgroup.get_cpu_usage(),
        round(100.0 * 0.000445181085968 * osutil.get_processor_cores(), 3))
def test_get_tracked_metrics_should_return_the_throttled_time(self):
    """Throttled time is reported only when explicitly requested via track_throttled_time=True."""
    cgroup = CpuCgroup("test", os.path.join(data_dir, "cgroups"))
    cgroup.initialize_cpu_usage()

    def find_throttled_time(metrics):
        # Keep only the THROTTLED_TIME counters.
        return [m for m in metrics if m.counter == MetricsCounter.THROTTLED_TIME]

    found = find_throttled_time(cgroup.get_tracked_metrics())
    self.assertTrue(
        len(found) == 0,
        "get_tracked_metrics should not fetch the throttled time by default. Found: {0}".format(found))

    found = find_throttled_time(cgroup.get_tracked_metrics(track_throttled_time=True))
    # Fix: the original failure message said "by default", contradicting the explicit
    # track_throttled_time=True argument this assertion is actually exercising.
    self.assertTrue(
        len(found) == 1,
        "get_tracked_metrics should have fetched the throttled time when track_throttled_time=True. Found: {0}".format(found))
def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure):
    """Run an extension command in a transient systemd scope under the extension's slice, track its cgroup, and wait for completion."""
    scope = "{0}_{1}".format(cmd_name, uuid.uuid4())
    extension_slice_name = self.get_extension_slice_name(extension_name)
    with self._systemd_run_commands_lock:
        process = subprocess.Popen(  # pylint: disable=W1509
            "systemd-run --unit={0} --scope --slice={1} {2}".format(scope, extension_slice_name, command),
            shell=shell,
            cwd=cwd,
            stdout=stdout,
            stderr=stderr,
            env=env,
            preexec_fn=os.setsid)
        # We start systemd-run with shell == True so process.pid is the shell's pid, not the pid for systemd-run
        self._systemd_run_commands.append(process.pid)
    scope_name = scope + '.scope'
    logger.info("Started extension in unit '{0}'", scope_name)
    try:
        cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name)
        cpu_cgroup_mountpoint, _ = self.get_cgroup_mount_points()
        if cpu_cgroup_mountpoint is None:
            logger.info("The CPU controller is not mounted; will not track resource usage")
        else:
            cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)
            CGroupsTelemetry.track_cgroup(CpuCgroup(extension_name, cpu_cgroup_path))
    except IOError as e:
        if e.errno == 2:  # 'No such file or directory'
            logger.info("The extension command already completed; will not track resource usage")
        logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))
    except Exception as e:
        # Tracking is best effort; failure here must not prevent the extension from running.
        logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))
    # Wait for process completion or timeout
    try:
        return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, stderr=stderr, error_code=error_code)
    except ExtensionError as e:
        # The extension didn't terminate successfully. Determine whether it was due to systemd errors or
        # extension errors.
        if not self._is_systemd_failure(scope, stderr):
            # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error
            raise
        # There was an issue with systemd-run. We need to log it and retry the extension without systemd.
        process_output = read_output(stdout, stderr)
        # Reset the stdout and stderr
        stdout.truncate(0)
        stderr.truncate(0)
        if isinstance(e, ExtensionOperationError):
            # no-member: Instance of 'ExtensionError' has no 'exit_code' member (no-member) - Disabled: e is actually an ExtensionOperationError
            err_msg = 'Systemd process exited with code %s and output %s' % (e.exit_code, process_output)  # pylint: disable=no-member
        else:
            err_msg = "Systemd timed-out, output: %s" % process_output
        raise SystemdRunError(err_msg)
    finally:
        # Always drop the pid from the in-flight list, whether the command succeeded or not.
        with self._systemd_run_commands_lock:
            self._systemd_run_commands.remove(process.pid)
def initialize(self):
    """One-time setup: detect cgroup/systemd support, inspect the v1/v2 controllers, and start tracking the agent's CPU and memory cgroups."""
    try:
        if self._initialized:
            return
        #
        # check whether cgroup monitoring is supported on the current distro
        #
        self._cgroups_supported = CGroupsApi.cgroups_supported()
        if not self._cgroups_supported:
            logger.info("Cgroup monitoring is not supported on {0}", get_distro())
            return
        #
        # check systemd
        #
        self._cgroups_api = CGroupsApi.create()
        if not isinstance(self._cgroups_api, SystemdCgroupsApi):
            message = "systemd was not detected on {0}".format(get_distro())
            logger.warn(message)
            add_event(op=WALAEventOperation.CGroupsInitialize, is_success=False, message=message, log_event=False)
            return

        # Local helpers: log and emit a telemetry event in one call.
        def log_cgroup_info(format_string, *args):
            message = format_string.format(*args)
            logger.info(message)
            add_event(op=WALAEventOperation.CGroupsInfo, message=message)

        def log_cgroup_warn(format_string, *args):
            message = format_string.format(*args)
            logger.warn(message)
            add_event(op=WALAEventOperation.CGroupsInfo, message=message, is_success=False, log_event=False)

        log_cgroup_info("systemd version: {0}", self._cgroups_api.get_systemd_version())
        #
        # Older versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent. When running
        # under systemd this could produce invalid resource usage data. Do not enable cgroups under this condition.
        #
        legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups()
        if legacy_cgroups > 0:
            log_cgroup_warn("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.")
            return
        #
        # check v1 controllers
        #
        cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points()
        if cpu_controller_root is not None:
            logger.info("The CPU cgroup controller is mounted at {0}", cpu_controller_root)
        else:
            log_cgroup_warn("The CPU cgroup controller is not mounted")
        if memory_controller_root is not None:
            logger.info("The memory cgroup controller is mounted at {0}", memory_controller_root)
        else:
            log_cgroup_warn("The memory cgroup controller is not mounted")
        #
        # check v2 controllers
        #
        cgroup2_mountpoint, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers()
        if cgroup2_mountpoint is not None:
            log_cgroup_warn("cgroups v2 mounted at {0}. Controllers: [{1}]", cgroup2_mountpoint, cgroup2_controllers)
        #
        # check the cgroups for the agent
        #
        agent_unit_name = self._cgroups_api.get_agent_unit_name()
        cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths("self")
        if cpu_cgroup_relative_path is None:
            log_cgroup_warn("The agent's process is not within a CPU cgroup")
        else:
            cpu_accounting = self._cgroups_api.get_unit_property(agent_unit_name, "CPUAccounting")
            log_cgroup_info('CPUAccounting: {0}', cpu_accounting)
        if memory_cgroup_relative_path is None:
            log_cgroup_warn("The agent's process is not within a memory cgroup")
        else:
            memory_accounting = self._cgroups_api.get_unit_property(agent_unit_name, "MemoryAccounting")
            log_cgroup_info('MemoryAccounting: {0}', memory_accounting)
        #
        # All good, enable cgroups and start monitoring the agent
        #
        self._cgroups_enabled = True
        if cpu_controller_root is None or cpu_cgroup_relative_path is None:
            logger.info("Will not track CPU for the agent's cgroup")
        else:
            self._agent_cpu_cgroup_path = os.path.join(cpu_controller_root, cpu_cgroup_relative_path)
            CGroupsTelemetry.track_cgroup(CpuCgroup(agent_unit_name, self._agent_cpu_cgroup_path))
        if memory_controller_root is None or memory_cgroup_relative_path is None:
            logger.info("Will not track memory for the agent's cgroup")
        else:
            self._agent_memory_cgroup_path = os.path.join(memory_controller_root, memory_cgroup_relative_path)
            CGroupsTelemetry.track_cgroup(MemoryCgroup(agent_unit_name, self._agent_memory_cgroup_path))
        log_cgroup_info("Agent cgroups: CPU: {0} -- MEMORY: {1}", self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path)
    except Exception as e:
        message = "Error initializing cgroups: {0}".format(ustr(e))
        logger.warn(message)
        add_event(op=WALAEventOperation.CGroupsInitialize, is_success=False, message=message, log_event=False)
    finally:
        # Mark initialized even on failure so initialization is attempted only once.
        self._initialized = True
def test_is_active_incorrect_file(self, patch_periodic_warn):
    """Pointing a cgroup at a plain file (not a directory) yields is_active() == False plus one periodic warning."""
    tasks_path = os.path.join(self.tmp_dir, "tasks")
    open(tasks_path, mode="wb").close()  # create an empty file where a cgroup directory is expected
    cgroup = CpuCgroup("test_extension", tasks_path)
    self.assertEqual(False, cgroup.is_active())
    self.assertEqual(1, patch_periodic_warn.call_count)
def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,
                            error_code=ExtensionErrorCodes.PluginUnknownFailure):
    """Run an extension command in a transient systemd scope, track its cgroups, and fall back to a plain invocation if systemd-run itself fails."""
    scope = "{0}_{1}".format(self._get_extension_cgroup_name(extension_name), uuid.uuid4())
    process = subprocess.Popen(
        "systemd-run --unit={0} --scope {1}".format(scope, command),
        shell=shell,
        cwd=cwd,
        stdout=stdout,
        stderr=stderr,
        env=env,
        preexec_fn=os.setsid)
    scope_name = scope + '.scope'
    logger.info("Started extension in unit '{0}'", scope_name)
    try:
        # systemd-run creates the scope under the system slice by default
        cgroup_relative_path = os.path.join('system.slice', scope_name)
        cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self.get_cgroup_mount_points()
        if cpu_cgroup_mountpoint is None:
            logger.info("The CPU controller is not mounted; will not track resource usage")
        else:
            cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)
            CGroupsTelemetry.track_cgroup(CpuCgroup(extension_name, cpu_cgroup_path))
        if memory_cgroup_mountpoint is None:
            logger.info("The memory controller is not mounted; will not track resource usage")
        else:
            memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path)
            CGroupsTelemetry.track_cgroup(MemoryCgroup(extension_name, memory_cgroup_path))
    except IOError as e:
        if e.errno == 2:  # 'No such file or directory'
            logger.info("The extension command already completed; will not track resource usage")
        logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))
    except Exception as e:
        # Tracking is best effort; failure here must not prevent the extension from running.
        logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))
    # Wait for process completion or timeout
    try:
        process_output = handle_process_completion(process=process, command=command, timeout=timeout,
                                                   stdout=stdout, stderr=stderr, error_code=error_code)
    except ExtensionError as e:
        # The extension didn't terminate successfully. Determine whether it was due to systemd errors or
        # extension errors.
        process_output = read_output(stdout, stderr)
        systemd_failure = self._is_systemd_failure(scope, process_output)
        if not systemd_failure:
            # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error
            raise
        else:
            # There was an issue with systemd-run. We need to log it and retry the extension without systemd.
            err_msg = 'Systemd process exited with code %s and output %s' % (e.exit_code, process_output) \
                if isinstance(e, ExtensionOperationError) else "Systemd timed-out, output: %s" % process_output
            event_msg = 'Failed to run systemd-run for unit {0}.scope. ' \
                        'Will retry invoking the extension without systemd. ' \
                        'Systemd-run error: {1}'.format(scope, err_msg)
            add_event(op=WALAEventOperation.InvokeCommandUsingSystemd, is_success=False, log_event=False, message=event_msg)
            logger.warn(event_msg)
            # Reset the stdout and stderr
            stdout.truncate(0)
            stderr.truncate(0)
            # Try invoking the process again, this time without systemd-run
            logger.info('Extension invocation using systemd failed, falling back to regular invocation '
                        'without cgroups tracking.')
            process = subprocess.Popen(command, shell=shell, cwd=cwd, env=env, stdout=stdout,
                                       stderr=stderr, preexec_fn=os.setsid)
            process_output = handle_process_completion(process=process, command=command, timeout=timeout,
                                                       stdout=stdout, stderr=stderr, error_code=error_code)
            return process_output
    # The process terminated in time and successfully
    return process_output
def test_get_cpu_usage_should_raise_an_exception_when_initialize_cpu_usage_has_not_been_invoked(self):
    """Calling get_cpu_usage() before initialize_cpu_usage() must raise CGroupsException."""
    cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")
    with self.assertRaises(CGroupsException):
        cgroup.get_cpu_usage()