def run(self, child_args=None):
    """
    Log version information, prepare the process and keep the daemon alive.

    After the one-time setup, ``self.daemon`` is invoked repeatedly for as
    long as ``self.running`` is true. Any exception escaping the daemon is
    reported as an UnhandledError event and followed by a 15 second pause
    before the next attempt.

    :param child_args: passed through unchanged to ``self.daemon``
    """
    # Startup banner: agent, OS and interpreter versions
    logger.info("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION)
    logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION)
    logger.info("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)

    self.check_pid()
    self.initialize_environment()
    CGroups.setup()

    # With FIPS enabled, flag it through the OpenSSL environment variable.
    # Subprocesses inherit the current environment, so they see it as well.
    fips_enabled = conf.get_fips_enabled()
    if fips_enabled:
        os.environ[OPENSSL_FIPS_ENVIRONMENT] = '1'

    while self.running:
        try:
            self.daemon(child_args)
        except Exception:
            # Report the failure, then wait before the loop restarts the daemon
            trace_text = traceback.format_exc()
            add_event(name=AGENT_NAME,
                      is_success=False,
                      message=ustr(trace_text),
                      op=WALAEventOperation.UnhandledError)
            logger.warn("Daemon ended with exception -- Sleep 15 seconds and restart daemon")
            time.sleep(15)
def run(self, child_args=None):
    """
    Entry point of the daemon: announce versions, set up, then supervise.

    ``self.daemon(child_args)`` is called in a loop while ``self.running``
    holds. When it raises, the traceback is sent as an UnhandledError event,
    a warning is logged and the loop sleeps 15 seconds before retrying.

    :param child_args: forwarded as-is to ``self.daemon``
    """
    logger.info("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION)
    logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION)
    logger.info("Python: {0}.{1}.{2}",
                PY_VERSION_MAJOR,
                PY_VERSION_MINOR,
                PY_VERSION_MICRO)

    # One-time process preparation
    self.check_pid()
    self.initialize_environment()
    CGroups.setup()

    # If FIPS is enabled, set the OpenSSL environment variable.
    # Note: subprocesses inherit the current environment.
    if conf.get_fips_enabled():
        os.environ[OPENSSL_FIPS_ENVIRONMENT] = '1'

    while self.running:
        try:
            self.daemon(child_args)
        except Exception:
            failure_report = ustr(traceback.format_exc())
            add_event(name=AGENT_NAME,
                      is_success=False,
                      message=failure_report,
                      op=WALAEventOperation.UnhandledError)
            logger.warn("Daemon ended with exception -- Sleep 15 seconds and restart daemon")
            # Back off before the surrounding loop starts the daemon again
            time.sleep(15)
def pre_exec_function():
    """
    Hook that adjusts process state before the target process starts.

    It runs in the window between the fork() and the exec() of sub-process
    creation: the process first becomes the leader of a new session, then
    CGroups is asked to add it to the extension cgroup named after
    ``self.ext_handler.name`` (``self`` comes from the enclosing scope).

    :return:
    """
    os.setsid()
    extension_name = self.ext_handler.name
    CGroups.add_to_extension_cgroup(extension_name)
def test_cgroup_utilities(self):
    """
    The 'cpu' and 'memory' hierarchy ids must be positive and distinct
    """
    hierarchy_ids = {}
    for controller in ('cpu', 'memory'):
        hierarchy_ids[controller] = CGroups.get_hierarchy_id(controller)
        # The id must convert to a strictly positive integer
        self.assertGreater(int(hierarchy_ids[controller]), 0)

    self.assertNotEqual(hierarchy_ids['cpu'], hierarchy_ids['memory'])
def init_cgroups():
    """
    Start tracking metrics for the roll-up cgroup and for the agent cgroup.

    Any exception raised while setting up tracking is logged and swallowed,
    so callers never see a failure from this function.
    """
    try:
        CGroupsTelemetry.track_cgroup(CGroups.for_extension(""))
        CGroupsTelemetry.track_agent()
    except Exception as e:
        # When a hierarchy is not mounted an exception is raised; that is
        # not unexpected, so report it as a warning rather than an error
        # and keep the stack trace for verbose logging only.
        logger.warn("Monitor: cgroups not initialized: {0}", ustr(e))
        logger.verbose(traceback.format_exc())
def init_cgroups():
    """
    Create the wrapper cgroup and start tracking the agent cgroup.

    Failures are logged as a warning (stack trace at verbose level) and
    are not propagated to the caller.
    """
    try:
        # Creates the wrapper cgroup for everything under the agent:
        #   /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/
        # It is only an umbrella for the agent and extension cgroups, so
        # it is created here but deliberately not tracked.
        CGroups.for_extension("")

        # Creates the agent's own cgroup (daemon and extension handler):
        #   /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent
        # On systemd systems it would already exist under /system.slice.
        CGroupsTelemetry.track_agent()
    except Exception as error:
        # An exception is raised when a hierarchy is not mounted; that is
        # not unexpected, hence a warning only.
        reason = ustr(error)
        logger.warn("Monitor: cgroups not initialized: {0}", reason)
        logger.verbose(traceback.format_exc())
def init_cgroups():
    """
    Start tracking metrics for the roll-up cgroup and for the agent cgroup.

    Any exception raised while setting up tracking is logged and swallowed,
    so callers never see a failure from this function.
    """
    try:
        CGroupsTelemetry.track_cgroup(CGroups.for_extension(""))
        CGroupsTelemetry.track_agent()
    except Exception as e:
        # When a hierarchy is not mounted an exception is raised; that is
        # not unexpected, so report it as a warning rather than an error
        # and keep the stack trace for verbose logging only.
        logger.warn("Monitor: cgroups not initialized: {0}", ustr(e))
        logger.verbose(traceback.format_exc())
def send_cgroup_telemetry(self):
    """
    Report metrics of all tracked cgroups once per telemetry period.

    When at least ``MonitorHandler.CGROUP_TELEMETRY_PERIOD`` has elapsed
    since ``self.last_cgroup_telemetry``:

    * every collected metric with a positive value is reported;
    * a warning and a CGroupsLimitsCrossed event are emitted for each
      "Memory" or "Process" metric that reaches its threshold;
    * the set of tracked extension cgroups is refreshed from the current
      handlers;
    * ``self.last_cgroup_telemetry`` is set to the current UTC time.

    Exceptions from collection and from the refresh are logged as warnings
    (stack trace at verbose level) and do not propagate.
    """
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = datetime.datetime.utcnow()

    # Gate on this method's own timestamp, so the cgroup reporting period
    # is measured from the last cgroup report and not from the heartbeat.
    next_report = self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD
    if datetime.datetime.utcnow() < next_report:
        return

    try:
        metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
        for cgroup_name, metrics in metric_reported.items():
            thresholds = metric_threshold[cgroup_name]

            for metric_group, metric_name, value in metrics:
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)

                if metric_group == "Memory":
                    # Memory is collected in bytes, and limit is set in megabytes.
                    if value >= CGroups._format_memory_value('megabytes', thresholds.memory_limit):
                        msg = "CGroup {0}: Crossed the Memory Threshold. " \
                              "Current Value: {1} bytes, Threshold: {2} megabytes." \
                              .format(cgroup_name, value, thresholds.memory_limit)

                        logger.warn(msg)
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)

                if metric_group == "Process":
                    if value >= thresholds.cpu_limit:
                        msg = "CGroup {0}: Crossed the Processor Threshold. " \
                              "Current Value: {1}, Threshold: {2}." \
                              .format(cgroup_name, value, thresholds.cpu_limit)

                        logger.warn(msg)
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)
    except Exception as e:
        logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
    except Exception as e:
        logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    self.last_cgroup_telemetry = datetime.datetime.utcnow()
def init_cgroups():
    """
    Start tracking the roll-up cgroup and the agent cgroup.

    Failures are logged as a warning (stack trace at verbose level) and
    are not propagated to the caller.
    """
    try:
        # CGroups.for_extension("") yields the roll-up cgroup to track
        rollup_cgroup = CGroups.for_extension("")
        CGroupsTelemetry.track_cgroup(rollup_cgroup)
        CGroupsTelemetry.track_agent()
    except Exception as error:
        # An exception is raised when a hierarchy is not mounted; since
        # that is not unexpected, only a warning is issued.
        reason = ustr(error)
        logger.warn("Monitor: cgroups not initialized: {0}", reason)
        logger.verbose(traceback.format_exc())
def make_root_cgroups():
    """
    Create the CGroups object named "root" for the topmost cgroup.

    Its path callback maps a hierarchy name to BASE_CGROUPS/<hierarchy>.

    :return: CGroups for most-encompassing cgroup
    :rtype: CGroups
    """
    def path_maker(hierarchy, _):
        # The second argument is ignored: the root path depends only on
        # the hierarchy name.
        hierarchy_root = os.path.join(BASE_CGROUPS, hierarchy)
        return hierarchy_root

    root_cgroups = CGroups("root", path_maker)
    return root_cgroups
def make_self_cgroups():
    """
    Create the CGroups object named "inplace" for the cgroup this process
    already belongs to.

    :return: CGroups containing this process
    :rtype: CGroups
    """
    def path_maker(hierarchy, __):
        # NOTE(review): the 'cpu' hierarchy id is used to look up the path
        # for every hierarchy -- confirm that this is intended.
        cpu_hierarchy_id = CGroups.get_hierarchy_id('cpu')
        own_path = CGroups.get_my_cgroup_path(cpu_hierarchy_id)
        return os.path.join(BASE_CGROUPS, hierarchy, own_path)

    self_cgroups = CGroups("inplace", path_maker)
    return self_cgroups
def launch_command(self, cmd, timeout=300, extension_error_code=1000, env=None):
    """
    Run an extension command in a shell from the handler's base directory.

    The child process is moved to a new session and added to the
    extension's cgroup before exec; its cgroup is then tracked for
    telemetry. Output is gathered by ``capture_from_process`` and, on
    success, reported as an event together with the elapsed time.

    :param cmd: command, relative to the base directory; leading path
                separators are stripped
    :param timeout: passed to ``capture_from_process``
    :param extension_error_code: code attached to any ExtensionError raised
    :param env: optional extra environment variables; the mapping itself
                is not modified
    :raises ExtensionError: if the process cannot be launched, has not
                terminated after capture, or exits with a non-zero code
    """
    begin_utc = datetime.datetime.utcnow()
    self.logger.verbose("Launch command: [{0}]", cmd)
    base_dir = self.get_base_dir()

    # Build the child's environment in a fresh dict so the caller's mapping
    # is left untouched. os.environ is applied last, so on a key clash the
    # value from the current process environment is the one that is used.
    child_env = {} if env is None else dict(env)
    child_env.update(os.environ)

    # Some extensions erroneously begin cmd with a slash; don't interpret those
    # as root-relative. (Issue #1170)
    full_path = os.path.join(base_dir, cmd.lstrip(os.path.sep))

    def pre_exec_function():
        """
        Change process state before the actual target process is started. Effectively, this runs between
        the fork() and the exec() of sub-process creation.
        :return:
        """
        os.setsid()
        CGroups.add_to_extension_cgroup(self.ext_handler.name)

    try:
        # This should be .run(), but due to the wide variety
        # of Python versions we must support we must use .communicate().
        process = subprocess.Popen(full_path,
                                   shell=True,
                                   cwd=base_dir,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   env=child_env,
                                   preexec_fn=pre_exec_function)
    except OSError as e:
        raise ExtensionError("Failed to launch '{0}': {1}".format(full_path, e.strerror),
                             code=extension_error_code)

    cg = CGroups.for_extension(self.ext_handler.name)
    CGroupsTelemetry.track_extension(self.ext_handler.name, cg)
    msg = capture_from_process(process, cmd, timeout, extension_error_code)

    ret = process.poll()
    if ret is None:
        raise ExtensionError("Process {0} was not terminated: {1}\n{2}".format(process.pid, cmd, msg),
                             code=extension_error_code)
    if ret != 0:
        raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg),
                             code=extension_error_code)

    duration = elapsed_milliseconds(begin_utc)
    # Drop empty lines from the captured output before logging/reporting it
    log_msg = "{0}\n{1}".format(cmd, "\n".join([line for line in msg.split('\n') if line != ""]))
    self.logger.verbose(log_msg)
    self.report_event(message=log_msg, duration=duration, log_event=False)
def test_format_memory_value(self):
    """
    Check the values CGroups._format_memory_value yields for (unit, amount) pairs
    """
    # (expected result, unit, amount)
    conversions = (
        (-1, 'bytes', None),
        (2048, 'kilobytes', 2),
        (0, 'kilobytes', 0),
        (2048000, 'kilobytes', 2000),
        (2048 * 1024, 'megabytes', 2),
        ((1024 + 512) * 1024 * 1024, 'gigabytes', 1.5),
    )
    for expected, unit, amount in conversions:
        self.assertEqual(expected, CGroups._format_memory_value(unit, amount))

    # The mixed-case unit name 'KiloBytes' is expected to be rejected
    self.assertRaises(CGroupsException, CGroups._format_memory_value, 'KiloBytes', 1)
def test_format_memory_value(self):
    """
    Verify CGroups._format_memory_value for a table of units and amounts
    """
    one_mebibyte = 1024 * 1024
    # (unit, amount, expected result)
    cases = [
        ('bytes', None, -1),
        ('kilobytes', 2, 2048),
        ('kilobytes', 0, 0),
        ('kilobytes', 2000, 2048000),
        ('megabytes', 2, 2 * one_mebibyte),
        ('gigabytes', 1.5, (1024 + 512) * one_mebibyte),
    ]
    for unit, amount, expected in cases:
        actual = CGroups._format_memory_value(unit, amount)
        self.assertEqual(expected, actual)

    # An unknown spelling of the unit ('KiloBytes') must raise
    self.assertRaises(CGroupsException, CGroups._format_memory_value, 'KiloBytes', 1)
def assert_limits(self, _, patch_set_cpu, patch_set_memory_limit, patch_get_enforce, patch_add_event,
                  ext_name, expected_cpu_limit, limits_enforced=True, exception_raised=False):
    """
    Call set_limits() on the cgroup of ``ext_name`` and verify the outcome.

    The ``patch_*`` arguments are used as mock objects (``return_value``,
    ``side_effect``, ``called`` and ``call_args`` are accessed);
    presumably they are injected by patch decorators on the caller.

    :param _: unused
    :param patch_set_cpu: mock whose first call argument is the CPU limit
    :param patch_set_memory_limit: mock whose first call argument is the memory limit
    :param patch_get_enforce: mock whose return value is set to ``limits_enforced``
    :param patch_add_event: mock whose keyword arguments describe the emitted event
    :param ext_name: extension name passed to CGroups.for_extension
    :param expected_cpu_limit: expected CPU limit; limits are expected to be
                               applied only when it is greater than zero
    :param limits_enforced: value returned by ``patch_get_enforce``
    :param exception_raised: when True, setting the memory limit raises
                             CGroupsException and set_limits() must raise too
    """
    should_limit = expected_cpu_limit > 0
    patch_get_enforce.return_value = limits_enforced

    if exception_raised:
        patch_set_memory_limit.side_effect = CGroupsException('set_memory_limit error')

    try:
        cg = CGroups.for_extension(ext_name)
        cg.set_limits()
        if exception_raised:
            self.fail('exception expected')
    except CGroupsException:
        if not exception_raised:
            self.fail('exception not expected')

    self.assertEqual(should_limit, patch_set_cpu.called)
    self.assertEqual(should_limit, patch_set_memory_limit.called)
    self.assertEqual(should_limit, patch_add_event.called)

    if should_limit:
        actual_cpu_limit = patch_set_cpu.call_args[0][0]
        actual_memory_limit = patch_set_memory_limit.call_args[0][0]
        event_kw_args = patch_add_event.call_args[1]

        self.assertEqual(expected_cpu_limit, actual_cpu_limit)
        # Dedicated assertions report both operands when they fail
        self.assertGreaterEqual(actual_memory_limit, DEFAULT_MEM_LIMIT_MIN_MB)
        self.assertEqual(event_kw_args['op'], 'SetCGroupsLimits')
        self.assertEqual(event_kw_args['is_success'], not exception_raised)
        self.assertIn('{0}%'.format(expected_cpu_limit), event_kw_args['message'])
        self.assertIn(ext_name, event_kw_args['message'])
        # The error text appears in the event message only on failure
        self.assertEqual(exception_raised, 'set_memory_limit error' in event_kw_args['message'])
def test_telemetry_instantiation_as_superuser(self):
    """
    Move this process into a new extension cgroup and collect all metrics.
    """
    # Remember the cgroups this process belongs to before the test
    original_cgroups = make_self_cgroups()

    # Put the process into a different cgroup, consume some resources,
    # and ensure we see them end-to-end
    extension_cgroups = CGroups.for_extension("agent_unittest")
    extension_cgroups.add(os.getpid())
    for controller in ('cpu', 'memory'):
        self.assertNotEqual(original_cgroups.cgroups[controller],
                            extension_cgroups.cgroups[controller])

    self.exercise_telemetry_instantiation(extension_cgroups)

    # Undo the changes: stop tracking and move the process back
    CGroupsTelemetry.stop_tracking("agent_unittest")
    original_cgroups.add(os.getpid())
def assert_limits(self, _, patch_set_cpu, patch_set_memory_limit, patch_get_enforce, patch_add_event,
                  ext_name, expected_cpu_limit, limits_enforced=True, exception_raised=False):
    """
    Call set_limits() on the cgroup of ``ext_name`` and verify the outcome.

    The ``patch_*`` arguments are used as mock objects (``return_value``,
    ``side_effect``, ``called`` and ``call_args`` are accessed);
    presumably they are injected by patch decorators on the caller.

    :param _: unused
    :param patch_set_cpu: mock whose first call argument is the CPU limit
    :param patch_set_memory_limit: mock whose first call argument is the memory limit
    :param patch_get_enforce: mock whose return value is set to ``limits_enforced``
    :param patch_add_event: mock whose keyword arguments describe the emitted event
    :param ext_name: extension name passed to CGroups.for_extension
    :param expected_cpu_limit: expected CPU limit; limits are expected to be
                               applied only when it is greater than zero
    :param limits_enforced: value returned by ``patch_get_enforce``
    :param exception_raised: when True, setting the memory limit raises
                             CGroupsException and set_limits() must raise too
    """
    should_limit = expected_cpu_limit > 0
    patch_get_enforce.return_value = limits_enforced

    if exception_raised:
        patch_set_memory_limit.side_effect = CGroupsException('set_memory_limit error')

    try:
        cg = CGroups.for_extension(ext_name)
        cg.set_limits()
        if exception_raised:
            self.fail('exception expected')
    except CGroupsException:
        if not exception_raised:
            self.fail('exception not expected')

    self.assertEqual(should_limit, patch_set_cpu.called)
    self.assertEqual(should_limit, patch_set_memory_limit.called)
    self.assertEqual(should_limit, patch_add_event.called)

    if should_limit:
        actual_cpu_limit = patch_set_cpu.call_args[0][0]
        actual_memory_limit = patch_set_memory_limit.call_args[0][0]
        event_kw_args = patch_add_event.call_args[1]

        self.assertEqual(expected_cpu_limit, actual_cpu_limit)
        # Dedicated assertions report both operands when they fail
        self.assertGreaterEqual(actual_memory_limit, DEFAULT_MEM_LIMIT_MIN_MB)
        self.assertEqual(event_kw_args['op'], 'SetCGroupsLimits')
        self.assertEqual(event_kw_args['is_success'], not exception_raised)
        self.assertIn('{0}%'.format(expected_cpu_limit), event_kw_args['message'])
        self.assertIn(ext_name, event_kw_args['message'])
        # The error text appears in the event message only on failure
        self.assertEqual(exception_raised, 'set_memory_limit error' in event_kw_args['message'])
def setUpClass(cls):
    """
    Call CGroups.setup(True) once for the class, then run the parent
    class's setUpClass.
    """
    CGroups.setup(True)
    parent = super(AgentTestCase, cls)
    parent.setUpClass()
def path_maker(hierarchy, __):
    """
    Return BASE_CGROUPS/<hierarchy>/<path of this process's cgroup>.

    The second argument is ignored.
    """
    # NOTE(review): the 'cpu' hierarchy id is used to look up the path for
    # every hierarchy -- confirm that this is intended.
    cpu_hierarchy_id = CGroups.get_hierarchy_id('cpu')
    own_path = CGroups.get_my_cgroup_path(cpu_hierarchy_id)
    return os.path.join(BASE_CGROUPS, hierarchy, own_path)