def init_cgroups():
    """
    Start telemetry tracking for the roll-up cgroup and for the agent cgroup.

    Any exception raised while setting up tracking is logged as an error,
    together with its traceback, and is not propagated to the caller.
    """
    try:
        # The cgroup for the empty extension name is the roll-up cgroup.
        rollup_cgroup = CGroups.for_extension("")
        CGroupsTelemetry.track_cgroup(rollup_cgroup)
        CGroupsTelemetry.track_agent()
    except Exception as error:
        stack_trace = traceback.format_exc()
        logger.error("monitor: Exception tracking wrapper and agent: {0} [{1}]", error, stack_trace)
def exercise_telemetry_instantiation(self, test_cgroup):
    """
    Track ``test_cgroup``, consume some CPU, then validate what telemetry collects for it.

    The cgroup must expose both the 'cpu' and 'memory' controllers. After collection,
    exactly two metrics are expected under the cgroup's name (one "Process" metric and/or
    one "Memory" metric, each with a plausible value), plus a CGroupsLimits instance
    whose cpu and memory limits are both positive.

    :param test_cgroup: cgroup object to track; its ``name`` is used as the tracking key
    """
    name = test_cgroup.name
    CGroupsTelemetry.track_cgroup(test_cgroup)
    for controller in ('cpu', 'memory'):
        self.assertIn(controller, test_cgroup.cgroups)
    self.assertTrue(CGroupsTelemetry.is_tracked(name))

    # Generate some load and let a second elapse so there is something to measure
    consume_cpu_time()
    time.sleep(1)

    all_metrics, all_limits = CGroupsTelemetry.collect_all_tracked()

    collected = all_metrics[name]
    self.assertEqual(len(collected), 2)

    # metric family -> (expected metric name, value the metric must exceed)
    expectations = {
        "Process": ("% Processor Time", 0.0),
        "Memory": ("Total Memory Usage", 100000),
    }
    for family, metric, reading in collected:
        if family not in expectations:
            self.fail("Unknown metric {0}/{1} value {2}".format(family, metric, reading))
        expected_metric, lower_bound = expectations[family]
        self.assertEqual(metric, expected_metric)
        self.assertGreater(reading, lower_bound)

    cgroup_limits = all_limits[name]
    self.assertIsInstance(cgroup_limits, CGroupsLimits, msg="is not the correct instance")
    self.assertGreater(cgroup_limits.cpu_limit, 0.0)
    self.assertGreater(cgroup_limits.memory_limit, 0.0)
def exercise_telemetry_instantiation(self, test_cgroup):
    """
    Shared check: begin tracking the given cgroup and verify the collected telemetry.

    Verifies that the cgroup has 'cpu' and 'memory' entries, that it is reported as
    tracked, that two metrics are collected for it with the expected names and
    minimum values, and that its limits are a CGroupsLimits with positive values.

    :param test_cgroup: the cgroup to track; metrics and limits are looked up by its name
    """
    extension_name = test_cgroup.name
    CGroupsTelemetry.track_cgroup(test_cgroup)

    tracked_controllers = test_cgroup.cgroups
    self.assertIn('cpu', tracked_controllers)
    self.assertIn('memory', tracked_controllers)
    self.assertTrue(CGroupsTelemetry.is_tracked(extension_name))

    consume_cpu_time()
    time.sleep(1)

    collected = CGroupsTelemetry.collect_all_tracked()
    metrics_by_name, limits_by_name = collected

    extension_metrics = metrics_by_name[extension_name]
    self.assertEqual(len(extension_metrics), 2)
    for family, counter, amount in extension_metrics:
        # Reject anything that is neither a CPU nor a memory metric up front
        if family not in ("Process", "Memory"):
            self.fail("Unknown metric {0}/{1} value {2}".format(family, counter, amount))

        if family == "Process":
            self.assertEqual(counter, "% Processor Time")
            self.assertGreater(amount, 0.0)
        else:
            self.assertEqual(counter, "Total Memory Usage")
            self.assertGreater(amount, 100000)

    extension_limits = limits_by_name[extension_name]
    self.assertIsInstance(extension_limits, CGroupsLimits, msg="is not the correct instance")
    self.assertGreater(extension_limits.cpu_limit, 0.0)
    self.assertGreater(extension_limits.memory_limit, 0.0)
def send_cgroup_telemetry(self):
    """
    Report the metrics of all tracked cgroups, at most once per
    MonitorHandler.CGROUP_TELEMETRY_PERIOD.

    On the first call the reference timestamp (self.last_cgroup_telemetry) is
    initialised and nothing is reported. Once a full period has elapsed since that
    timestamp, every collected metric with a value greater than zero is passed to
    report_metric, the set of tracked extension cgroups is refreshed from the
    current handlers, and the timestamp is advanced.

    Failures in either step are logged as warnings and do not propagate.
    """
    now = datetime.datetime.utcnow()
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = now

    # The period must be measured against this method's own timestamp. The check
    # used to read self.last_telemetry_heartbeat, which this method never updates,
    # so the cgroup timer was ignored (and an unset heartbeat timestamp would raise).
    # NOTE(review): assumes CGROUP_TELEMETRY_PERIOD is a datetime.timedelta - confirm.
    if now < self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD:
        return

    try:
        for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked().items():
            for metric_group, metric_name, value in metrics:
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)
    except Exception as e:
        logger.warn("Failed to collect performance metrics: {0} [{1}]", e, traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
    except Exception as e:
        logger.warn("Monitor: updating tracked extensions raised {0}: {1}", e, traceback.format_exc())

    self.last_cgroup_telemetry = datetime.datetime.utcnow()
def send_cgroup_telemetry(self):
    """
    Report metrics and threshold crossings for all tracked cgroups, at most once per
    MonitorHandler.CGROUP_TELEMETRY_PERIOD.

    On the first call the reference timestamp (self.last_cgroup_telemetry) is
    initialised and nothing is reported. Once a full period has elapsed:

    * every collected metric with a value greater than zero is passed to report_metric;
    * a "Memory" metric at or above thresholds["memory"], or a "Process" metric at or
      above thresholds["cpu"], emits a CGroupsLimitsCrossed event;
    * the set of tracked extension cgroups is refreshed from the current handlers;
    * the reference timestamp is advanced.

    Failures in collection or in the refresh are logged (warning plus verbose
    traceback) and do not propagate.
    """
    now = datetime.datetime.utcnow()
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = now

    # The period must be measured against this method's own timestamp. The check
    # used to read self.last_telemetry_heartbeat, which this method never updates,
    # so the cgroup timer was ignored (and an unset heartbeat timestamp would raise).
    # NOTE(review): assumes CGROUP_TELEMETRY_PERIOD is a datetime.timedelta - confirm.
    if now < self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD:
        return

    try:
        metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
        for cgroup_name, metrics in metric_reported.items():
            thresholds = metric_threshold[cgroup_name]

            for metric_group, metric_name, value in metrics:
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)

                # A metric belongs to a single group, so the two checks are exclusive
                if metric_group == "Memory":
                    if value >= thresholds["memory"]:
                        msg = "CGroup {0}: Crossed the Memory Threshold. Current Value:{1}, Threshold:{2}.".format(
                            cgroup_name, value, thresholds["memory"])
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)
                elif metric_group == "Process":
                    if value >= thresholds["cpu"]:
                        msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format(
                            cgroup_name, value, thresholds["cpu"])
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)
    except Exception as e:
        logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
    except Exception as e:
        logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    self.last_cgroup_telemetry = datetime.datetime.utcnow()
def init_cgroups():
    """
    Register the roll-up cgroup and the agent cgroup with CGroupsTelemetry.

    Errors are not raised to the caller; they are reported through logger.error
    along with the formatted traceback.
    """
    try:
        # Roll-up cgroup first (empty extension name), then the agent's own cgroup
        CGroupsTelemetry.track_cgroup(
            CGroups.for_extension("")
        )
        CGroupsTelemetry.track_agent()
    except Exception as exc:
        logger.error(
            "monitor: Exception tracking wrapper and agent: {0} [{1}]",
            exc,
            traceback.format_exc(),
        )
def send_cgroup_telemetry(self):
    """
    Report metrics and limit crossings for all tracked cgroups, at most once per
    MonitorHandler.CGROUP_TELEMETRY_PERIOD.

    On the first call the reference timestamp (self.last_cgroup_telemetry) is
    initialised and nothing is reported. Once a full period has elapsed:

    * every collected metric with a value greater than zero is passed to report_metric;
    * a "Memory" metric at or above the cgroup's memory limit (converted with
      CGroups._format_memory_value), or a "Process" metric at or above its cpu
      limit, is logged as a warning and emitted as a CGroupsLimitsCrossed event;
    * the set of tracked extension cgroups is refreshed from the current handlers;
    * the reference timestamp is advanced.

    Failures in collection or in the refresh are logged (warning plus verbose
    traceback) and do not propagate.
    """
    now = datetime.datetime.utcnow()
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = now

    # The period must be measured against this method's own timestamp. The check
    # used to read self.last_telemetry_heartbeat, which this method never updates,
    # so the cgroup timer was ignored (and an unset heartbeat timestamp would raise).
    # NOTE(review): assumes CGROUP_TELEMETRY_PERIOD is a datetime.timedelta - confirm.
    if now < self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD:
        return

    try:
        metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
        for cgroup_name, metrics in metric_reported.items():
            thresholds = metric_threshold[cgroup_name]

            for metric_group, metric_name, value in metrics:
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)

                if metric_group == "Memory":
                    # Memory is collected in bytes, and limit is set in megabytes.
                    if value >= CGroups._format_memory_value('megabytes', thresholds.memory_limit):
                        msg = ("CGroup {0}: Crossed the Memory Threshold. "
                               "Current Value: {1} bytes, Threshold: {2} megabytes."
                               ).format(cgroup_name, value, thresholds.memory_limit)
                        logger.warn(msg)
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)

                if metric_group == "Process":
                    if value >= thresholds.cpu_limit:
                        msg = ("CGroup {0}: Crossed the Processor Threshold. "
                               "Current Value: {1}, Threshold: {2}."
                               ).format(cgroup_name, value, thresholds.cpu_limit)
                        logger.warn(msg)
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)
    except Exception as e:
        logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
    except Exception as e:
        logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    self.last_cgroup_telemetry = datetime.datetime.utcnow()
def init_cgroups():
    """
    Start telemetry tracking for the roll-up cgroup and for the agent cgroup.

    A failure here is expected on systems where a cgroup hierarchy is not
    mounted, so it is reported as a warning (with the traceback at verbose
    level) rather than as an error, and it is not propagated.
    """
    try:
        rollup_cgroup = CGroups.for_extension("")
        CGroupsTelemetry.track_cgroup(rollup_cgroup)
        CGroupsTelemetry.track_agent()
    except Exception as error:
        # An unmounted hierarchy surfaces as an exception; that is not
        # unexpected, hence only a warning.
        reason = ustr(error)
        logger.warn("Monitor: cgroups not initialized: {0}", reason)
        logger.verbose(traceback.format_exc())
def launch_command(self, cmd, timeout=300, extension_error_code=1000, env=None):
    """
    Run an extension command from the handler's base directory and report the result.

    The command is started through the shell in its own session, is added to the
    extension's cgroup before it executes, and the extension's cgroup is registered
    for telemetry. Its output is gathered with capture_from_process and reported
    through self.report_event together with the elapsed time.

    :param cmd: command line, interpreted relative to the handler's base directory
    :param timeout: passed to capture_from_process
    :param extension_error_code: error code attached to any ExtensionError raised here
    :param env: optional extra environment variables for the child process; the
        dictionary is not modified. Values from os.environ take precedence over it.
    :raises ExtensionError: if the process cannot be launched (OSError), if it has not
        terminated after its output was captured, or if it exits with a non-zero code
    """
    begin_utc = datetime.datetime.utcnow()
    self.logger.verbose("Launch command: [{0}]", cmd)
    base_dir = self.get_base_dir()

    # Build the child environment on a private copy: updating the caller's
    # dictionary in place would leak the whole process environment into it.
    # Precedence is unchanged - os.environ still overrides caller-supplied keys.
    child_env = dict(env) if env is not None else {}
    child_env.update(os.environ)

    # Some extensions erroneously begin cmd with a slash; don't interpret those
    # as root-relative. (Issue #1170)
    full_path = os.path.join(base_dir, cmd.lstrip(os.path.sep))

    def pre_exec_function():
        """
        Change process state before the actual target process is started. Effectively, this runs between
        the fork() and the exec() of sub-process creation.
        :return:
        """
        os.setsid()
        CGroups.add_to_extension_cgroup(self.ext_handler.name)

    try:
        # subprocess.run() would be the natural choice, but it is not available on
        # all the Python versions that must be supported, so Popen is used directly.
        process = subprocess.Popen(full_path,
                                   shell=True,
                                   cwd=base_dir,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   env=child_env,
                                   preexec_fn=pre_exec_function)
    except OSError as e:
        raise ExtensionError("Failed to launch '{0}': {1}".format(full_path, e.strerror),
                             code=extension_error_code)

    cg = CGroups.for_extension(self.ext_handler.name)
    CGroupsTelemetry.track_extension(self.ext_handler.name, cg)
    msg = capture_from_process(process, cmd, timeout, extension_error_code)

    ret = process.poll()
    if ret is None:
        # poll() returns None while the process is still running
        raise ExtensionError("Process {0} was not terminated: {1}\n{2}".format(process.pid, cmd, msg),
                             code=extension_error_code)
    if ret != 0:
        raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg),
                             code=extension_error_code)

    duration = elapsed_milliseconds(begin_utc)
    # Drop empty lines from the captured output before logging it
    log_msg = "{0}\n{1}".format(cmd, "\n".join(line for line in msg.split('\n') if line != ""))

    self.logger.verbose(log_msg)
    self.report_event(message=log_msg, duration=duration, log_event=False)
def test_telemetry_instantiation_as_superuser(self):
    """
    Tracking a new cgroup for an extension; collect all metrics.

    The test process is moved into a dedicated "agent_unittest" cgroup for the
    duration of the checks and is always moved back to its initial cgroup
    afterwards, even when an assertion fails.
    """
    # Record initial state
    initial_cgroup = make_self_cgroups()

    # Put the process into a different cgroup, consume some resources, ensure we see them end-to-end
    test_cgroup = CGroups.for_extension("agent_unittest")
    test_cgroup.add(os.getpid())
    try:
        self.assertNotEqual(initial_cgroup.cgroups['cpu'], test_cgroup.cgroups['cpu'])
        self.assertNotEqual(initial_cgroup.cgroups['memory'], test_cgroup.cgroups['memory'])

        self.exercise_telemetry_instantiation(test_cgroup)
    finally:
        # Restore initial state. Without this, a failed assertion would leave the
        # test process in the test cgroup and the cgroup tracked for later tests.
        try:
            # Tracking only starts inside exercise_telemetry_instantiation, so it
            # may not have happened if an earlier assertion failed.
            if CGroupsTelemetry.is_tracked("agent_unittest"):
                CGroupsTelemetry.stop_tracking("agent_unittest")
        finally:
            initial_cgroup.add(os.getpid())
def init_cgroups():
    """
    Create the wrapper cgroup and start tracking the agent cgroup.

    A failure here is expected on systems where a cgroup hierarchy is not
    mounted, so it is reported as a warning (with the traceback at verbose
    level) and is not propagated.
    """
    try:
        # Wrapper cgroup for everything under the agent:
        #   /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/
        # It is only an umbrella for the agent and extension cgroups, so it is
        # created here but deliberately not tracked.
        CGroups.for_extension("")

        # Agent cgroup (daemon and extension handler):
        #   /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent
        # On systemd systems it would already have been set up under /system.slice
        CGroupsTelemetry.track_agent()
    except Exception as error:
        # An unmounted hierarchy surfaces as an exception; that is not
        # unexpected, hence only a warning.
        reason = ustr(error)
        logger.warn("Monitor: cgroups not initialized: {0}", reason)
        logger.verbose(traceback.format_exc())
def send_cgroup_telemetry(self):
    """
    Report the metrics of all tracked cgroups, at most once per
    MonitorHandler.CGROUP_TELEMETRY_PERIOD.

    The first call only initialises the reference timestamp
    (self.last_cgroup_telemetry). When a full period has elapsed since that
    timestamp, each collected metric whose value is greater than zero is passed to
    report_metric, the tracked extension cgroups are refreshed from the current
    handlers, and the timestamp is moved forward.

    Exceptions raised by either step are logged as warnings and swallowed.
    """
    current_time = datetime.datetime.utcnow()
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = current_time

    # Compare against the timestamp this method maintains. The previous check read
    # self.last_telemetry_heartbeat, which is never updated here, so the cgroup
    # period was not honoured (and an unset heartbeat timestamp would raise).
    # NOTE(review): assumes CGROUP_TELEMETRY_PERIOD is a datetime.timedelta - confirm.
    next_due = self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD
    if current_time >= next_due:
        try:
            for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked().items():
                for metric_group, metric_name, value in metrics:
                    if value > 0:
                        report_metric(metric_group, metric_name, cgroup_name, value)
        except Exception as e:
            logger.warn("Failed to collect performance metrics: {0} [{1}]", e, traceback.format_exc())

        # Look for extension cgroups we're not already tracking and track them
        try:
            CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
        except Exception as e:
            logger.warn("Monitor: updating tracked extensions raised {0}: {1}", e, traceback.format_exc())

        self.last_cgroup_telemetry = datetime.datetime.utcnow()
def test_memory_telemetry(self):
    """
    Memory telemetry: the raw usage file is readable and non-empty, and the
    Memory class reports more than 100000 bytes for this process's cgroup.
    """
    own_cgroup = make_self_cgroups()

    # The raw counter file must exist and contain something
    usage_text = own_cgroup.get_file_contents('memory', 'memory.usage_in_bytes')
    self.assertIsNotNone(usage_text)
    self.assertGreater(len(usage_text), 0)
    self.assertIn('memory', own_cgroup.cgroups)

    telemetry = CGroupsTelemetry('test', own_cgroup)
    self.assertIs(own_cgroup, telemetry.cgroup)

    memory_probe = Memory(telemetry)
    self.assertGreater(memory_probe.get_memory_usage(), 100000)
def test_telemetry_in_place_leaf_cgroup(self):
    """
    A leaf cgroup (i.e. one that is not the root of the cgroup tree) must report
    less CPU than the system as a whole, both before and after doing some work.
    """
    own_cgroup = make_self_cgroups()
    root_cgroup = make_root_cgroups()

    # Nothing to verify on systems where a freshly created process (such as this
    # test run) lands in the root cgroup by default.
    running_in_leaf = own_cgroup.cgroups['cpu'] != root_cgroup.cgroups['cpu']
    if not running_in_leaf:
        return

    cpu_probe = Cpu(CGroupsTelemetry("test", own_cgroup))
    self.assertLess(cpu_probe.current_cpu_total, cpu_probe.current_system_cpu)

    consume_cpu_time()  # Burn some CPU
    time.sleep(1)       # ...and then sit idle for a while
    cpu_probe.update()

    self.assertLess(cpu_probe.current_cpu_total, cpu_probe.current_system_cpu)
def test_telemetry_inplace(self):
    """
    Raw CPU measures and the derived percentage for the cgroup this process is
    currently running in: the counters must advance after consuming CPU and the
    resulting percentage must be positive.
    """
    own_cgroup = make_self_cgroups()
    for controller in ('cpu', 'memory'):
        self.assertIn(controller, own_cgroup.cgroups)

    telemetry = CGroupsTelemetry("test", own_cgroup)
    cpu_probe = Cpu(telemetry)
    self.assertGreater(cpu_probe.current_system_cpu, 0)

    consume_cpu_time()  # Burn some CPU
    cpu_probe.update()

    # Both the cgroup counter and the system-wide counter must have moved forward
    self.assertGreater(cpu_probe.current_cpu_total, cpu_probe.previous_cpu_total)
    self.assertGreater(cpu_probe.current_system_cpu, cpu_probe.previous_system_cpu)

    self.assertGreater(cpu_probe.get_cpu_percent(), 0)
def test_cpu_telemetry(self):
    """
    Cpu telemetry: the CPU total grows after doing work, and the reported
    percentage is positive and below 200.
    """
    own_cgroup = make_self_cgroups()
    self.assertIn('cpu', own_cgroup.cgroups)

    telemetry = CGroupsTelemetry('test', own_cgroup)
    self.assertIs(own_cgroup, telemetry.cgroup)

    cpu_probe = Cpu(telemetry)
    self.assertIs(own_cgroup, cpu_probe.cgt.cgroup)

    total_before = cpu_probe.current_cpu_total
    consume_cpu_time()
    time.sleep(1)
    cpu_probe.update()
    total_after = cpu_probe.current_cpu_total
    self.assertGreater(total_after, total_before)

    percent_used = cpu_probe.get_cpu_percent()
    self.assertGreater(percent_used, 0)
    # On a multi-core machine this is often > 100 when running under PyCharm,
    # so the upper bound is 200 rather than 100.
    self.assertLess(percent_used, 200)