Beispiel #1
0
 def init_cgroups():
     # Track metrics for the roll-up cgroup and for the agent cgroup
     try:
         CGroupsTelemetry.track_cgroup(CGroups.for_extension(""))
         CGroupsTelemetry.track_agent()
     except Exception as e:
         logger.error("monitor: Exception tracking wrapper and agent: {0} [{1}]", e, traceback.format_exc())
Beispiel #2
0
    def exercise_telemetry_instantiation(self, test_cgroup):
        test_extension_name = test_cgroup.name
        CGroupsTelemetry.track_cgroup(test_cgroup)
        self.assertIn('cpu', test_cgroup.cgroups)
        self.assertIn('memory', test_cgroup.cgroups)
        self.assertTrue(CGroupsTelemetry.is_tracked(test_extension_name))
        consume_cpu_time()
        time.sleep(1)
        metrics, limits = CGroupsTelemetry.collect_all_tracked()
        my_metrics = metrics[test_extension_name]
        self.assertEqual(len(my_metrics), 2)
        for item in my_metrics:
            metric_family, metric_name, metric_value = item
            if metric_family == "Process":
                self.assertEqual(metric_name, "% Processor Time")
                self.assertGreater(metric_value, 0.0)
            elif metric_family == "Memory":
                self.assertEqual(metric_name, "Total Memory Usage")
                self.assertGreater(metric_value, 100000)
            else:
                self.fail("Unknown metric {0}/{1} value {2}".format(
                    metric_family, metric_name, metric_value))

        my_limits = limits[test_extension_name]
        self.assertIsInstance(my_limits,
                              CGroupsLimits,
                              msg="is not the correct instance")
        self.assertGreater(my_limits.cpu_limit, 0.0)
        self.assertGreater(my_limits.memory_limit, 0.0)
Beispiel #3
0
    def exercise_telemetry_instantiation(self, test_cgroup):
        test_extension_name = test_cgroup.name
        CGroupsTelemetry.track_cgroup(test_cgroup)
        self.assertIn('cpu', test_cgroup.cgroups)
        self.assertIn('memory', test_cgroup.cgroups)
        self.assertTrue(CGroupsTelemetry.is_tracked(test_extension_name))
        consume_cpu_time()
        time.sleep(1)
        metrics, limits = CGroupsTelemetry.collect_all_tracked()
        my_metrics = metrics[test_extension_name]
        self.assertEqual(len(my_metrics), 2)
        for item in my_metrics:
            metric_family, metric_name, metric_value = item
            if metric_family == "Process":
                self.assertEqual(metric_name, "% Processor Time")
                self.assertGreater(metric_value, 0.0)
            elif metric_family == "Memory":
                self.assertEqual(metric_name, "Total Memory Usage")
                self.assertGreater(metric_value, 100000)
            else:
                self.fail("Unknown metric {0}/{1} value {2}".format(metric_family, metric_name, metric_value))

        my_limits = limits[test_extension_name]
        self.assertIsInstance(my_limits, CGroupsLimits, msg="is not the correct instance")
        self.assertGreater(my_limits.cpu_limit, 0.0)
        self.assertGreater(my_limits.memory_limit, 0.0)
Beispiel #4
0
    def send_cgroup_telemetry(self):
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        if datetime.datetime.utcnow() >= (
                self.last_telemetry_heartbeat +
                MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked(
                ).items():
                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name,
                                          cgroup_name, value)
            except Exception as e:
                logger.warn("Failed to collect performance metrics: {0} [{1}]",
                            e, traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(
                    self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn(
                    "Monitor: updating tracked extensions raised {0}: {1}", e,
                    traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()
Beispiel #5
0
    def send_cgroup_telemetry(self):
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        if datetime.datetime.utcnow() >= (
                self.last_telemetry_heartbeat +
                MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked(
                )
                for cgroup_name, metrics in metric_reported.items():
                    thresholds = metric_threshold[cgroup_name]

                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name,
                                          cgroup_name, value)

                        if metric_group == "Memory":
                            if value >= thresholds["memory"]:
                                msg = "CGroup {0}: Crossed the Memory Threshold. Current Value:{1}, Threshold:{2}.".format(
                                    cgroup_name, value, thresholds["memory"])
                                add_event(
                                    name=AGENT_NAME,
                                    version=CURRENT_VERSION,
                                    op=WALAEventOperation.CGroupsLimitsCrossed,
                                    is_success=True,
                                    message=msg,
                                    log_event=True)

                        if metric_group == "Process":
                            if value >= thresholds["cpu"]:
                                msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format(
                                    cgroup_name, value, thresholds["cpu"])
                                add_event(
                                    name=AGENT_NAME,
                                    version=CURRENT_VERSION,
                                    op=WALAEventOperation.CGroupsLimitsCrossed,
                                    is_success=True,
                                    message=msg,
                                    log_event=True)

            except Exception as e:
                logger.warn(
                    "Monitor: failed to collect cgroups performance metrics: {0}",
                    ustr(e))
                logger.verbose(traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(
                    self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn(
                    "Monitor: failed to update cgroups tracked extensions: {0}",
                    ustr(e))
                logger.verbose(traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()
Beispiel #6
0
 def init_cgroups():
     # Track metrics for the roll-up cgroup and for the agent cgroup
     try:
         CGroupsTelemetry.track_cgroup(CGroups.for_extension(""))
         CGroupsTelemetry.track_agent()
     except Exception as e:
         logger.error(
             "monitor: Exception tracking wrapper and agent: {0} [{1}]", e,
             traceback.format_exc())
Beispiel #7
0
    def send_cgroup_telemetry(self):
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
                for cgroup_name, metrics in metric_reported.items():
                    thresholds = metric_threshold[cgroup_name]

                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name, cgroup_name, value)

                        if metric_group == "Memory":
                            # Memory is collected in bytes, and limit is set in megabytes.
                            if value >= CGroups._format_memory_value('megabytes', thresholds.memory_limit):
                                msg = "CGroup {0}: Crossed the Memory Threshold. " \
                                      "Current Value: {1} bytes, Threshold: {2} megabytes." \
                                       .format(cgroup_name, value, thresholds.memory_limit)

                                logger.warn(msg)
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

                        if metric_group == "Process":
                            if value >= thresholds.cpu_limit:
                                msg = "CGroup {0}: Crossed the Processor Threshold. " \
                                      "Current Value: {1}, Threshold: {2}." \
                                       .format(cgroup_name, value, thresholds.cpu_limit)

                                logger.warn(msg)
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

            except Exception as e:
                logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()
Beispiel #8
0
 def init_cgroups():
     # Track metrics for the roll-up cgroup and for the agent cgroup
     try:
         CGroupsTelemetry.track_cgroup(CGroups.for_extension(""))
         CGroupsTelemetry.track_agent()
     except Exception as e:
         # when a hierarchy is not mounted, we raise an exception
         # and we should therefore only issue a warning, since this
         # is not unexpected
         logger.warn("Monitor: cgroups not initialized: {0}", ustr(e))
         logger.verbose(traceback.format_exc())
Beispiel #9
0
 def init_cgroups():
     # Track metrics for the roll-up cgroup and for the agent cgroup
     try:
         CGroupsTelemetry.track_cgroup(CGroups.for_extension(""))
         CGroupsTelemetry.track_agent()
     except Exception as e:
         # when a hierarchy is not mounted, we raise an exception
         # and we should therefore only issue a warning, since this
         # is not unexpected
         logger.warn("Monitor: cgroups not initialized: {0}", ustr(e))
         logger.verbose(traceback.format_exc())
Beispiel #10
0
    def launch_command(self, cmd, timeout=300, extension_error_code=1000, env=None):
        begin_utc = datetime.datetime.utcnow()
        self.logger.verbose("Launch command: [{0}]", cmd)
        base_dir = self.get_base_dir()

        if env is None:
            env = {}
        env.update(os.environ)

        try:
            # This should be .run(), but due to the wide variety
            # of Python versions we must support we must use .communicate().
            # Some extensions erroneously begin cmd with a slash; don't interpret those
            # as root-relative. (Issue #1170)
            full_path = os.path.join(base_dir, cmd.lstrip(os.path.sep))

            def pre_exec_function():
                """
                Change process state before the actual target process is started. Effectively, this runs between
                the fork() and the exec() of sub-process creation.
                :return:
                """
                os.setsid()
                CGroups.add_to_extension_cgroup(self.ext_handler.name)

            process = subprocess.Popen(full_path,
                                       shell=True,
                                       cwd=base_dir,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=env,
                                       preexec_fn=pre_exec_function)
        except OSError as e:
            raise ExtensionError("Failed to launch '{0}': {1}".format(full_path, e.strerror),
                                 code=extension_error_code)

        cg = CGroups.for_extension(self.ext_handler.name)
        CGroupsTelemetry.track_extension(self.ext_handler.name, cg)
        msg = capture_from_process(process, cmd, timeout, extension_error_code)

        ret = process.poll()
        if ret is None:
            raise ExtensionError("Process {0} was not terminated: {1}\n{2}".format(process.pid, cmd, msg),
                                 code=extension_error_code)
        if ret != 0:
            raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg),
                                 code=extension_error_code)

        duration = elapsed_milliseconds(begin_utc)
        log_msg = "{0}\n{1}".format(cmd, "\n".join([line for line in msg.split('\n') if line != ""]))
        self.logger.verbose(log_msg)
        self.report_event(message=log_msg, duration=duration, log_event=False)
Beispiel #11
0
    def launch_command(self, cmd, timeout=300, extension_error_code=1000, env=None):
        begin_utc = datetime.datetime.utcnow()
        self.logger.verbose("Launch command: [{0}]", cmd)
        base_dir = self.get_base_dir()

        if env is None:
            env = {}
        env.update(os.environ)

        try:
            # This should be .run(), but due to the wide variety
            # of Python versions we must support we must use .communicate().
            # Some extensions erroneously begin cmd with a slash; don't interpret those
            # as root-relative. (Issue #1170)
            full_path = os.path.join(base_dir, cmd.lstrip(os.path.sep))

            def pre_exec_function():
                """
                Change process state before the actual target process is started. Effectively, this runs between
                the fork() and the exec() of sub-process creation.
                :return:
                """
                os.setsid()
                CGroups.add_to_extension_cgroup(self.ext_handler.name)

            process = subprocess.Popen(full_path,
                                       shell=True,
                                       cwd=base_dir,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=env,
                                       preexec_fn=pre_exec_function)
        except OSError as e:
            raise ExtensionError("Failed to launch '{0}': {1}".format(full_path, e.strerror),
                                 code=extension_error_code)

        cg = CGroups.for_extension(self.ext_handler.name)
        CGroupsTelemetry.track_extension(self.ext_handler.name, cg)
        msg = capture_from_process(process, cmd, timeout, extension_error_code)

        ret = process.poll()
        if ret is None:
            raise ExtensionError("Process {0} was not terminated: {1}\n{2}".format(process.pid, cmd, msg),
                                 code=extension_error_code)
        if ret != 0:
            raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg),
                                 code=extension_error_code)

        duration = elapsed_milliseconds(begin_utc)
        log_msg = "{0}\n{1}".format(cmd, "\n".join([line for line in msg.split('\n') if line != ""]))
        self.logger.verbose(log_msg)
        self.report_event(message=log_msg, duration=duration, log_event=False)
Beispiel #12
0
    def test_telemetry_instantiation_as_superuser(self):
        """
        Tracking a new cgroup for an extension; collect all metrics.
        """
        # Record initial state
        initial_cgroup = make_self_cgroups()

        # Put the process into a different cgroup, consume some resources, ensure we see them end-to-end
        test_cgroup = CGroups.for_extension("agent_unittest")
        test_cgroup.add(os.getpid())
        self.assertNotEqual(initial_cgroup.cgroups['cpu'], test_cgroup.cgroups['cpu'])
        self.assertNotEqual(initial_cgroup.cgroups['memory'], test_cgroup.cgroups['memory'])

        self.exercise_telemetry_instantiation(test_cgroup)

        # Restore initial state
        CGroupsTelemetry.stop_tracking("agent_unittest")
        initial_cgroup.add(os.getpid())
Beispiel #13
0
 def init_cgroups():
     # Track metrics for the wrapper cgroup and for the agent cgroup
     try:
         # This creates the wrapper cgroup for everything under agent,
         # /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/
         # There is no need in tracking this cgroup, as it only serves
         # as an umbrella for the agent and extensions cgroups
         CGroups.for_extension("")
         # This creates the agent's cgroup (for the daemon and extension handler)
         # /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent
         # If the system is using systemd, it would have already been set up under /system.slice
         CGroupsTelemetry.track_agent()
     except Exception as e:
         # when a hierarchy is not mounted, we raise an exception
         # and we should therefore only issue a warning, since this
         # is not unexpected
         logger.warn("Monitor: cgroups not initialized: {0}", ustr(e))
         logger.verbose(traceback.format_exc())
    def test_telemetry_instantiation_as_superuser(self):
        """
        Tracking a new cgroup for an extension; collect all metrics.
        """
        # Record initial state
        initial_cgroup = make_self_cgroups()

        # Put the process into a different cgroup, consume some resources, ensure we see them end-to-end
        test_cgroup = CGroups.for_extension("agent_unittest")
        test_cgroup.add(os.getpid())
        self.assertNotEqual(initial_cgroup.cgroups['cpu'], test_cgroup.cgroups['cpu'])
        self.assertNotEqual(initial_cgroup.cgroups['memory'], test_cgroup.cgroups['memory'])

        self.exercise_telemetry_instantiation(test_cgroup)

        # Restore initial state
        CGroupsTelemetry.stop_tracking("agent_unittest")
        initial_cgroup.add(os.getpid())
Beispiel #15
0
    def send_cgroup_telemetry(self):
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked().items():
                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name, cgroup_name, value)
            except Exception as e:
                logger.warn("Failed to collect performance metrics: {0} [{1}]", e, traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn("Monitor: updating tracked extensions raised {0}: {1}", e, traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()
Beispiel #16
0
 def test_memory_telemetry(self):
     """
     Test Memory telemetry class
     """
     cg = make_self_cgroups()
     raw_usage_file_contents = cg.get_file_contents('memory', 'memory.usage_in_bytes')
     self.assertIsNotNone(raw_usage_file_contents)
     self.assertGreater(len(raw_usage_file_contents), 0)
     self.assertIn('memory', cg.cgroups)
     ct = CGroupsTelemetry('test', cg)
     self.assertIs(cg, ct.cgroup)
     memory = Memory(ct)
     usage_in_bytes = memory.get_memory_usage()
     self.assertGreater(usage_in_bytes, 100000)
Beispiel #17
0
    def test_telemetry_in_place_leaf_cgroup(self):
        """
        Ensure this leaf (i.e. not root of cgroup tree) cgroup has distinct metrics from the root cgroup.
        """
        # Does nothing on systems where the default cgroup for a randomly-created process (like this test invocation)
        # is the root cgroup.
        cg = make_self_cgroups()
        root = make_root_cgroups()
        if cg.cgroups['cpu'] != root.cgroups['cpu']:
            ct = CGroupsTelemetry("test", cg)
            cpu = Cpu(ct)
            self.assertLess(cpu.current_cpu_total, cpu.current_system_cpu)

            consume_cpu_time()  # Eat some CPU
            time.sleep(1)  # Generate some idle time
            cpu.update()
            self.assertLess(cpu.current_cpu_total, cpu.current_system_cpu)
Beispiel #18
0
    def test_telemetry_inplace(self):
        """
        Test raw measures and basic statistics for the cgroup in which this process is currently running.
        """
        cg = make_self_cgroups()
        self.assertIn('cpu', cg.cgroups)
        self.assertIn('memory', cg.cgroups)
        ct = CGroupsTelemetry("test", cg)
        cpu = Cpu(ct)
        self.assertGreater(cpu.current_system_cpu, 0)

        consume_cpu_time()  # Eat some CPU
        cpu.update()

        self.assertGreater(cpu.current_cpu_total, cpu.previous_cpu_total)
        self.assertGreater(cpu.current_system_cpu, cpu.previous_system_cpu)

        percent_used = cpu.get_cpu_percent()
        self.assertGreater(percent_used, 0)
Beispiel #19
0
 def test_cpu_telemetry(self):
     """
     Test Cpu telemetry class
     """
     cg = make_self_cgroups()
     self.assertIn('cpu', cg.cgroups)
     ct = CGroupsTelemetry('test', cg)
     self.assertIs(cg, ct.cgroup)
     cpu = Cpu(ct)
     self.assertIs(cg, cpu.cgt.cgroup)
     ticks_before = cpu.current_cpu_total
     consume_cpu_time()
     time.sleep(1)
     cpu.update()
     ticks_after = cpu.current_cpu_total
     self.assertGreater(ticks_after, ticks_before)
     p2 = cpu.get_cpu_percent()
     self.assertGreater(p2, 0)
     # when running under PyCharm, this is often > 100
     # on a multi-core machine
     self.assertLess(p2, 200)