예제 #1
0
    def run(self, child_args=None):
        """
        Log startup information, prepare the process and keep the daemon running.

        After the one-time setup, self.daemon(child_args) is invoked again and again
        for as long as self.running is true. An exception escaping the daemon is
        reported as an UnhandledError event, a warning is logged, and the daemon is
        restarted after a 15 second pause.

        :param child_args: handed unchanged to self.daemon()
        """
        logger.info("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION)
        logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION)
        logger.info("Python: {0}.{1}.{2}",
                    PY_VERSION_MAJOR,
                    PY_VERSION_MINOR,
                    PY_VERSION_MICRO)

        self.check_pid()
        self.initialize_environment()

        CGroups.setup()

        # Subprocesses inherit the current environment, so the OpenSSL FIPS
        # variable is exported here when FIPS is enabled.
        fips_enabled = conf.get_fips_enabled()
        if fips_enabled:
            os.environ[OPENSSL_FIPS_ENVIRONMENT] = '1'

        while self.running:
            try:
                self.daemon(child_args)
            except Exception:
                stack_trace = traceback.format_exc()
                add_event(name=AGENT_NAME,
                          is_success=False,
                          message=ustr(stack_trace),
                          op=WALAEventOperation.UnhandledError)
                logger.warn("Daemon ended with exception -- Sleep 15 seconds and restart daemon")
                time.sleep(15)
예제 #2
0
    def run(self, child_args=None):
        """
        Entry point of the daemon: log version details, set up, then supervise the daemon loop.

        The setup consists of check_pid(), initialize_environment(), CGroups.setup() and,
        when FIPS is enabled in the configuration, exporting the OpenSSL FIPS variable.
        Afterwards self.daemon(child_args) is called repeatedly while self.running is
        true; if it raises, the traceback is sent as an UnhandledError event and the
        loop waits 15 seconds before the next attempt.

        :param child_args: forwarded as-is to self.daemon()
        """
        logger.info("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION)
        logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION)
        logger.info(
            "Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO
        )

        self.check_pid()
        self.initialize_environment()

        CGroups.setup()

        # The environment set here is the one subprocesses inherit, which is
        # why the FIPS switch is written into os.environ.
        if conf.get_fips_enabled():
            os.environ[OPENSSL_FIPS_ENVIRONMENT] = '1'

        while self.running:
            try:
                self.daemon(child_args)
            except Exception:
                failure_trace = traceback.format_exc()
                add_event(
                    name=AGENT_NAME,
                    is_success=False,
                    message=ustr(failure_trace),
                    op=WALAEventOperation.UnhandledError,
                )
                logger.warn(
                    "Daemon ended with exception -- Sleep 15 seconds and restart daemon"
                )
                time.sleep(15)
예제 #3
0
 def pre_exec_function():
     """
     Hook that runs in the child between fork() and exec(), before the target process starts.

     It calls os.setsid() and then passes the extension handler's name to
     CGroups.add_to_extension_cgroup().
     :return:
     """
     os.setsid()
     extension_name = self.ext_handler.name
     CGroups.add_to_extension_cgroup(extension_name)
 def pre_exec_function():
     """
     Adjust process state after fork() but before exec() of the sub-process.

     First os.setsid() is called, then the name of the extension handler is
     handed to CGroups.add_to_extension_cgroup().
     :return:
     """
     os.setsid()
     handler_name = self.ext_handler.name
     CGroups.add_to_extension_cgroup(handler_name)
예제 #5
0
 def test_cgroup_utilities(self):
     """
     Check CGroups.get_hierarchy_id for the 'cpu' and 'memory' hierarchies.

     Each id must convert to a positive integer, and the two ids must differ.
     """
     hierarchy_ids = {}
     for controller in ('cpu', 'memory'):
         hierarchy_ids[controller] = CGroups.get_hierarchy_id(controller)
         self.assertGreater(int(hierarchy_ids[controller]), 0)
     self.assertNotEqual(hierarchy_ids['cpu'], hierarchy_ids['memory'])
예제 #6
0
 def test_cgroup_utilities(self):
     """
     Exercise the cgroup metadata query helper get_hierarchy_id.

     The ids returned for 'cpu' and 'memory' must each be a positive integer
     (after int() conversion) and must not be equal to each other.
     """
     ids_by_controller = {}
     for controller_name in ('cpu', 'memory'):
         found_id = CGroups.get_hierarchy_id(controller_name)
         self.assertGreater(int(found_id), 0)
         ids_by_controller[controller_name] = found_id
     self.assertNotEqual(ids_by_controller['cpu'], ids_by_controller['memory'])
예제 #7
0
 def init_cgroups():
     """
     Start telemetry tracking for the roll-up cgroup and for the agent cgroup.

     Any exception is logged as an error together with its traceback and is
     not propagated.
     """
     try:
         # CGroups.for_extension("") provides the roll-up cgroup to track
         rollup_cgroup = CGroups.for_extension("")
         CGroupsTelemetry.track_cgroup(rollup_cgroup)
         CGroupsTelemetry.track_agent()
     except Exception as error:
         logger.error("monitor: Exception tracking wrapper and agent: {0} [{1}]", error, traceback.format_exc())
예제 #8
0
 def init_cgroups():
     """
     Create the wrapper cgroup and start tracking the agent's cgroup.

     A failure is logged as a warning, with the traceback at verbose level,
     and is not propagated.
     """
     try:
         # Wrapper cgroup for everything under the agent:
         #   /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/
         # It is only an umbrella for the agent and extension cgroups, so it
         # is created here but deliberately not tracked.
         CGroups.for_extension("")
         # Agent's own cgroup (daemon and extension handler):
         #   /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent
         # On a systemd system it would already exist under /system.slice.
         CGroupsTelemetry.track_agent()
     except Exception as error:
         # An exception is raised when a hierarchy is not mounted; that is
         # not unexpected, hence a warning rather than an error.
         reason = ustr(error)
         logger.warn("Monitor: cgroups not initialized: {0}", reason)
         logger.verbose(traceback.format_exc())
예제 #9
0
 def init_cgroups():
     """
     Register the roll-up cgroup and the agent cgroup with CGroupsTelemetry.

     If anything raises, the exception and its traceback are logged as an
     error and the function returns normally.
     """
     try:
         wrapper_cgroup = CGroups.for_extension("")
         CGroupsTelemetry.track_cgroup(wrapper_cgroup)
         CGroupsTelemetry.track_agent()
     except Exception as failure:
         logger.error("monitor: Exception tracking wrapper and agent: {0} [{1}]",
                      failure,
                      traceback.format_exc())
예제 #10
0
    def send_cgroup_telemetry(self):
        """
        Report metrics of all tracked cgroups once per MonitorHandler.CGROUP_TELEMETRY_PERIOD.

        On the first call the cgroup telemetry timestamp is initialised. On later calls,
        once the period has elapsed since that timestamp, the method:

        * collects the metrics of every tracked cgroup and reports each value greater than 0;
        * logs a warning and emits a CGroupsLimitsCrossed event whenever a "Memory" or
          "Process" metric reaches its threshold;
        * asks CGroupsTelemetry to update the set of tracked extension cgroups;
        * records the current time as the new cgroup telemetry timestamp.

        Exceptions raised while collecting or while updating the tracked set are logged
        as warnings (traceback at verbose level) and are not propagated.
        """
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        # Measure the period from this method's own timestamp (last_cgroup_telemetry),
        # which is the one initialised above and refreshed at the end of each run --
        # not from the heartbeat timestamp.
        next_collection = self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD
        if datetime.datetime.utcnow() < next_collection:
            return

        def report_limit_crossed(msg):
            # Shared reporting for the memory and processor thresholds
            logger.warn(msg)
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.CGroupsLimitsCrossed,
                      is_success=True,
                      message=msg,
                      log_event=True)

        try:
            metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
            for cgroup_name, metrics in metric_reported.items():
                thresholds = metric_threshold[cgroup_name]

                for metric_group, metric_name, value in metrics:
                    if value > 0:
                        report_metric(metric_group, metric_name, cgroup_name, value)

                    if metric_group == "Memory":
                        # Memory is collected in bytes, and limit is set in megabytes.
                        if value >= CGroups._format_memory_value('megabytes', thresholds.memory_limit):
                            report_limit_crossed(
                                "CGroup {0}: Crossed the Memory Threshold. "
                                "Current Value: {1} bytes, Threshold: {2} megabytes."
                                .format(cgroup_name, value, thresholds.memory_limit))

                    if metric_group == "Process":
                        if value >= thresholds.cpu_limit:
                            report_limit_crossed(
                                "CGroup {0}: Crossed the Processor Threshold. "
                                "Current Value: {1}, Threshold: {2}."
                                .format(cgroup_name, value, thresholds.cpu_limit))

        except Exception as e:
            logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
            logger.verbose(traceback.format_exc())

        # Look for extension cgroups we're not already tracking and track them
        try:
            CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
        except Exception as e:
            logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
            logger.verbose(traceback.format_exc())

        self.last_cgroup_telemetry = datetime.datetime.utcnow()
예제 #11
0
 def init_cgroups():
     """
     Start telemetry tracking for the roll-up cgroup and for the agent cgroup.

     A failure is logged as a warning, with the traceback at verbose level,
     and is not propagated.
     """
     try:
         rollup_cgroup = CGroups.for_extension("")
         CGroupsTelemetry.track_cgroup(rollup_cgroup)
         CGroupsTelemetry.track_agent()
     except Exception as error:
         # An exception is raised when a hierarchy is not mounted; since that
         # is not unexpected, only a warning is issued.
         reason = ustr(error)
         logger.warn("Monitor: cgroups not initialized: {0}", reason)
         logger.verbose(traceback.format_exc())
def make_root_cgroups():
    """
    Construct the CGroups object that describes the topmost cgroup.

    :return: CGroups for most-encompassing cgroup
    :rtype: CGroups
    """
    def root_path(hierarchy, _):
        # The second argument is ignored: the path is the hierarchy's
        # directory directly below BASE_CGROUPS.
        hierarchy_root = os.path.join(BASE_CGROUPS, hierarchy)
        return hierarchy_root

    root_cgroups = CGroups("root", root_path)
    return root_cgroups
예제 #13
0
 def init_cgroups():
     """
     Register the roll-up cgroup and the agent cgroup with CGroupsTelemetry.

     If anything raises, a warning is logged (traceback at verbose level) and
     the function returns normally.
     """
     try:
         wrapper_cgroup = CGroups.for_extension("")
         CGroupsTelemetry.track_cgroup(wrapper_cgroup)
         CGroupsTelemetry.track_agent()
     except Exception as failure:
         # Raised, for instance, when a hierarchy is not mounted. That is an
         # expected situation, so it is reported as a warning only.
         failure_text = ustr(failure)
         logger.warn("Monitor: cgroups not initialized: {0}", failure_text)
         logger.verbose(traceback.format_exc())
def make_self_cgroups():
    """
    Construct a CGroups object for the cgroup this process already belongs to.

    :return: CGroups containing this process
    :rtype: CGroups
    """
    def own_path(hierarchy, __):
        # NOTE(review): the suffix is always looked up through the 'cpu'
        # hierarchy id, whatever `hierarchy` is -- confirm this is intended.
        cpu_hierarchy_id = CGroups.get_hierarchy_id('cpu')
        own_suffix = CGroups.get_my_cgroup_path(cpu_hierarchy_id)
        return os.path.join(BASE_CGROUPS, hierarchy, own_suffix)

    self_cgroups = CGroups("inplace", own_path)
    return self_cgroups
예제 #15
0
    def launch_command(self, cmd, timeout=300, extension_error_code=1000, env=None):
        """
        Run an extension command from the handler's base directory and report the outcome.

        The command is started through the shell in a new session (os.setsid) and is
        added to the extension's cgroup before exec. The extension cgroup is then
        registered for telemetry, the output is gathered by capture_from_process, and
        on success the command and its non-empty output lines are logged and reported
        as an event.

        :param cmd: command relative to the base directory; leading path separators are stripped
        :param timeout: forwarded to capture_from_process
        :param extension_error_code: error code attached to any ExtensionError raised here
        :param env: optional mapping of extra environment variables for the child. It is
                    copied, never modified. Variables present in os.environ override
                    entries of the same name.
        :raises ExtensionError: if starting the process fails with OSError, if the process
                                has not terminated after capture_from_process returns, or
                                if it exits with a non-zero code
        """
        begin_utc = datetime.datetime.utcnow()
        self.logger.verbose("Launch command: [{0}]", cmd)
        base_dir = self.get_base_dir()

        # Build the child's environment in a fresh dict so the caller's mapping is
        # never mutated. As before, os.environ values take precedence.
        child_env = {} if env is None else dict(env)
        child_env.update(os.environ)

        # Some extensions erroneously begin cmd with a slash; don't interpret those
        # as root-relative. (Issue #1170)
        full_path = os.path.join(base_dir, cmd.lstrip(os.path.sep))

        def pre_exec_function():
            """
            Change process state before the actual target process is started. Effectively, this runs between
            the fork() and the exec() of sub-process creation.
            :return:
            """
            os.setsid()
            CGroups.add_to_extension_cgroup(self.ext_handler.name)

        try:
            # This should be .run(), but due to the wide variety
            # of Python versions we must support we must use Popen.
            process = subprocess.Popen(full_path,
                                       shell=True,
                                       cwd=base_dir,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=child_env,
                                       preexec_fn=pre_exec_function)
        except OSError as e:
            raise ExtensionError("Failed to launch '{0}': {1}".format(full_path, e.strerror),
                                 code=extension_error_code)

        cg = CGroups.for_extension(self.ext_handler.name)
        CGroupsTelemetry.track_extension(self.ext_handler.name, cg)
        msg = capture_from_process(process, cmd, timeout, extension_error_code)

        ret = process.poll()
        if ret is None:
            raise ExtensionError("Process {0} was not terminated: {1}\n{2}".format(process.pid, cmd, msg),
                                 code=extension_error_code)
        if ret != 0:
            raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg),
                                 code=extension_error_code)

        duration = elapsed_milliseconds(begin_utc)
        # Drop empty lines from the captured output before logging it
        log_msg = "{0}\n{1}".format(cmd, "\n".join([line for line in msg.split('\n') if line != ""]))
        self.logger.verbose(log_msg)
        self.report_event(message=log_msg, duration=duration, log_event=False)
예제 #16
0
    def launch_command(self, cmd, timeout=300, extension_error_code=1000, env=None):
        """
        Run an extension command from the handler's base directory and report the outcome.

        The command is started through the shell in a new session (os.setsid) and is
        added to the extension's cgroup before exec. The extension cgroup is then
        registered for telemetry, the output is gathered by capture_from_process, and
        on success the command and its non-empty output lines are logged and reported
        as an event.

        :param cmd: command relative to the base directory; leading path separators are stripped
        :param timeout: forwarded to capture_from_process
        :param extension_error_code: error code attached to any ExtensionError raised here
        :param env: optional mapping of extra environment variables for the child. It is
                    copied, never modified. Variables present in os.environ override
                    entries of the same name.
        :raises ExtensionError: if starting the process fails with OSError, if the process
                                has not terminated after capture_from_process returns, or
                                if it exits with a non-zero code
        """
        begin_utc = datetime.datetime.utcnow()
        self.logger.verbose("Launch command: [{0}]", cmd)
        base_dir = self.get_base_dir()

        # Build the child's environment in a fresh dict so the caller's mapping is
        # never mutated. As before, os.environ values take precedence.
        child_env = {} if env is None else dict(env)
        child_env.update(os.environ)

        # Some extensions erroneously begin cmd with a slash; don't interpret those
        # as root-relative. (Issue #1170)
        full_path = os.path.join(base_dir, cmd.lstrip(os.path.sep))

        def pre_exec_function():
            """
            Change process state before the actual target process is started. Effectively, this runs between
            the fork() and the exec() of sub-process creation.
            :return:
            """
            os.setsid()
            CGroups.add_to_extension_cgroup(self.ext_handler.name)

        try:
            # This should be .run(), but due to the wide variety
            # of Python versions we must support we must use Popen.
            process = subprocess.Popen(full_path,
                                       shell=True,
                                       cwd=base_dir,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=child_env,
                                       preexec_fn=pre_exec_function)
        except OSError as e:
            raise ExtensionError("Failed to launch '{0}': {1}".format(full_path, e.strerror),
                                 code=extension_error_code)

        cg = CGroups.for_extension(self.ext_handler.name)
        CGroupsTelemetry.track_extension(self.ext_handler.name, cg)
        msg = capture_from_process(process, cmd, timeout, extension_error_code)

        ret = process.poll()
        if ret is None:
            raise ExtensionError("Process {0} was not terminated: {1}\n{2}".format(process.pid, cmd, msg),
                                 code=extension_error_code)
        if ret != 0:
            raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg),
                                 code=extension_error_code)

        duration = elapsed_milliseconds(begin_utc)
        # Drop empty lines from the captured output before logging it
        log_msg = "{0}\n{1}".format(cmd, "\n".join([line for line in msg.split('\n') if line != ""]))
        self.logger.verbose(log_msg)
        self.report_event(message=log_msg, duration=duration, log_event=False)
예제 #17
0
 def test_format_memory_value(self):
     """
     Check CGroups._format_memory_value for several units and amounts.

     Each row is (expected result, unit, amount); an unrecognised unit
     spelling ('KiloBytes') must raise CGroupsException.
     """
     expectations = [
         (-1, 'bytes', None),
         (2048, 'kilobytes', 2),
         (0, 'kilobytes', 0),
         (2048000, 'kilobytes', 2000),
         (2048 * 1024, 'megabytes', 2),
         ((1024 + 512) * 1024 * 1024, 'gigabytes', 1.5),
     ]
     for expected, unit, amount in expectations:
         self.assertEqual(expected, CGroups._format_memory_value(unit, amount))
     self.assertRaises(CGroupsException, CGroups._format_memory_value, 'KiloBytes', 1)
예제 #18
0
 def test_format_memory_value(self):
     """
     Verify the values CGroups._format_memory_value returns per unit.

     The cases are listed as (unit, amount, expected result). The final
     check ensures the unit 'KiloBytes' is rejected with CGroupsException.
     """
     cases = (
         ('bytes', None, -1),
         ('kilobytes', 2, 2048),
         ('kilobytes', 0, 0),
         ('kilobytes', 2000, 2048000),
         ('megabytes', 2, 2048 * 1024),
         ('gigabytes', 1.5, (1024 + 512) * 1024 * 1024),
     )
     for unit, amount, expected in cases:
         actual = CGroups._format_memory_value(unit, amount)
         self.assertEqual(expected, actual)
     self.assertRaises(CGroupsException, CGroups._format_memory_value, 'KiloBytes', 1)
예제 #19
0
    def assert_limits(self, _, patch_set_cpu, patch_set_memory_limit, patch_get_enforce, patch_add_event,
                      ext_name, expected_cpu_limit, limits_enforced=True, exception_raised=False):
        """
        Call set_limits() on the cgroups of an extension and verify what was applied.

        Limits are expected to be set exactly when expected_cpu_limit is greater than 0.
        In that case the CPU limit, the memory limit and the emitted event are checked.

        :param _: unused
        :param patch_set_cpu: mock whose first positional call argument is the CPU limit
        :param patch_set_memory_limit: mock whose first positional call argument is the memory limit
        :param patch_get_enforce: mock whose return_value is set to limits_enforced
        :param patch_add_event: mock whose keyword call arguments describe the event
        :param ext_name: extension name passed to CGroups.for_extension()
        :param expected_cpu_limit: CPU limit expected to be applied
        :param limits_enforced: value returned by patch_get_enforce
        :param exception_raised: when True, setting the memory limit raises CGroupsException
                                 and set_limits() is expected to propagate it
        """
        should_limit = expected_cpu_limit > 0
        patch_get_enforce.return_value = limits_enforced

        if exception_raised:
            patch_set_memory_limit.side_effect = CGroupsException('set_memory_limit error')

        try:
            CGroups.for_extension(ext_name).set_limits()
        except CGroupsException:
            if not exception_raised:
                self.fail('exception not expected')
        else:
            if exception_raised:
                self.fail('exception expected')

        for patched in (patch_set_cpu, patch_set_memory_limit, patch_add_event):
            self.assertEqual(should_limit, patched.called)

        if not should_limit:
            return

        actual_cpu_limit = patch_set_cpu.call_args[0][0]
        actual_memory_limit = patch_set_memory_limit.call_args[0][0]
        event_kw_args = patch_add_event.call_args[1]

        self.assertEqual(expected_cpu_limit, actual_cpu_limit)
        self.assertTrue(actual_memory_limit >= DEFAULT_MEM_LIMIT_MIN_MB)
        self.assertEqual(event_kw_args['op'], 'SetCGroupsLimits')
        self.assertEqual(event_kw_args['is_success'], not exception_raised)

        # The event message names the CPU percentage and the extension, and mentions
        # the memory-limit error only when that error was injected.
        cpu_text = '{0}%'.format(expected_cpu_limit)
        self.assertTrue(cpu_text in event_kw_args['message'])
        self.assertTrue(ext_name in event_kw_args['message'])
        error_mentioned = 'set_memory_limit error' in event_kw_args['message']
        self.assertEqual(exception_raised, error_mentioned)
예제 #20
0
    def test_telemetry_instantiation_as_superuser(self):
        """
        Tracking a new cgroup for an extension; collect all metrics.

        The test process is moved into the "agent_unittest" extension cgroup while the
        checks run. Afterwards tracking of that cgroup is stopped and the process is
        moved back to its initial cgroup, even when one of the checks fails.
        """
        # Record initial state
        initial_cgroup = make_self_cgroups()

        # Put the process into a different cgroup, consume some resources, ensure we see them end-to-end
        test_cgroup = CGroups.for_extension("agent_unittest")
        test_cgroup.add(os.getpid())
        try:
            self.assertNotEqual(initial_cgroup.cgroups['cpu'], test_cgroup.cgroups['cpu'])
            self.assertNotEqual(initial_cgroup.cgroups['memory'], test_cgroup.cgroups['memory'])

            self.exercise_telemetry_instantiation(test_cgroup)
        finally:
            # Restore initial state. The nested finally guarantees the process is moved
            # back even if stopping the tracking raises.
            try:
                CGroupsTelemetry.stop_tracking("agent_unittest")
            finally:
                initial_cgroup.add(os.getpid())
예제 #21
0
    def test_telemetry_instantiation_as_superuser(self):
        """
        Tracking a new cgroup for an extension; collect all metrics.

        The test process is moved into the "agent_unittest" extension cgroup while the
        checks run. Afterwards tracking of that cgroup is stopped and the process is
        moved back to its initial cgroup, even when one of the checks fails.
        """
        # Record initial state
        initial_cgroup = make_self_cgroups()

        # Put the process into a different cgroup, consume some resources, ensure we see them end-to-end
        test_cgroup = CGroups.for_extension("agent_unittest")
        test_cgroup.add(os.getpid())
        try:
            self.assertNotEqual(initial_cgroup.cgroups['cpu'], test_cgroup.cgroups['cpu'])
            self.assertNotEqual(initial_cgroup.cgroups['memory'], test_cgroup.cgroups['memory'])

            self.exercise_telemetry_instantiation(test_cgroup)
        finally:
            # Restore initial state. The nested finally guarantees the process is moved
            # back even if stopping the tracking raises.
            try:
                CGroupsTelemetry.stop_tracking("agent_unittest")
            finally:
                initial_cgroup.add(os.getpid())
예제 #22
0
    def assert_limits(self,
                      _,
                      patch_set_cpu,
                      patch_set_memory_limit,
                      patch_get_enforce,
                      patch_add_event,
                      ext_name,
                      expected_cpu_limit,
                      limits_enforced=True,
                      exception_raised=False):
        """
        Run set_limits() for the cgroups of ext_name and check the resulting calls.

        Setting limits is expected only for an expected_cpu_limit above 0. When limits
        are expected, the applied CPU limit, the lower bound of the memory limit and
        the contents of the emitted event are verified.

        :param _: unused
        :param patch_set_cpu: mock; its first positional call argument is the CPU limit
        :param patch_set_memory_limit: mock; its first positional call argument is the memory limit
        :param patch_get_enforce: mock; its return_value is set to limits_enforced
        :param patch_add_event: mock; its keyword call arguments describe the event
        :param ext_name: extension name given to CGroups.for_extension()
        :param expected_cpu_limit: CPU limit expected to be applied
        :param limits_enforced: value patch_get_enforce returns
        :param exception_raised: when True, the memory-limit mock raises CGroupsException
                                 and set_limits() must let it propagate
        """
        should_limit = expected_cpu_limit > 0
        patch_get_enforce.return_value = limits_enforced

        if exception_raised:
            injected_error = CGroupsException('set_memory_limit error')
            patch_set_memory_limit.side_effect = injected_error

        try:
            extension_cgroups = CGroups.for_extension(ext_name)
            extension_cgroups.set_limits()
        except CGroupsException:
            if not exception_raised:
                self.fail('exception not expected')
        else:
            if exception_raised:
                self.fail('exception expected')

        self.assertEqual(should_limit, patch_set_cpu.called)
        self.assertEqual(should_limit, patch_set_memory_limit.called)
        self.assertEqual(should_limit, patch_add_event.called)

        if not should_limit:
            return

        applied_cpu_limit = patch_set_cpu.call_args[0][0]
        applied_memory_limit = patch_set_memory_limit.call_args[0][0]
        event_kw_args = patch_add_event.call_args[1]

        self.assertEqual(expected_cpu_limit, applied_cpu_limit)
        self.assertTrue(applied_memory_limit >= DEFAULT_MEM_LIMIT_MIN_MB)
        self.assertEqual(event_kw_args['op'], 'SetCGroupsLimits')
        self.assertEqual(event_kw_args['is_success'], not exception_raised)

        # The message must carry the CPU percentage and the extension name; the
        # injected error text appears exactly when the error was injected.
        expected_percentage = '{0}%'.format(expected_cpu_limit)
        self.assertTrue(expected_percentage in event_kw_args['message'])
        self.assertTrue(ext_name in event_kw_args['message'])
        error_in_message = 'set_memory_limit error' in event_kw_args['message']
        self.assertEqual(exception_raised, error_in_message)
예제 #23
0
 def setUpClass(cls):
     """
     Call CGroups.setup(True) once for the class, then run the parent class setup.
     """
     CGroups.setup(True)
     parent = super(AgentTestCase, cls)
     parent.setUpClass()
예제 #24
0
 def path_maker(hierarchy, __):
     """
     Build the path of this process's cgroup below BASE_CGROUPS for a hierarchy.

     The second argument is ignored. The path suffix is obtained from
     CGroups.get_my_cgroup_path using the id of the 'cpu' hierarchy.
     """
     cpu_hierarchy_id = CGroups.get_hierarchy_id('cpu')
     own_suffix = CGroups.get_my_cgroup_path(cpu_hierarchy_id)
     return os.path.join(BASE_CGROUPS, hierarchy, own_suffix)
예제 #25
0
 def setUpClass(cls):
     """
     Perform class-level setup: CGroups.setup(True) first, then the parent's setUpClass.
     """
     CGroups.setup(True)
     base_case = super(AgentTestCase, cls)
     base_case.setUpClass()
예제 #26
0
 def path_maker(hierarchy, __):
     """
     Return BASE_CGROUPS/<hierarchy>/<suffix> for the cgroup of the current process.

     The suffix comes from CGroups.get_my_cgroup_path, queried with the id of
     the 'cpu' hierarchy; the second argument is not used.
     """
     cpu_id = CGroups.get_hierarchy_id('cpu')
     my_suffix = CGroups.get_my_cgroup_path(cpu_id)
     return os.path.join(BASE_CGROUPS, hierarchy, my_suffix)