Example #1
0
    def send_cgroup_telemetry(self):
        """
        Report the metrics of the tracked cgroups and flag crossed resource limits.

        The work is done at most once per MonitorHandler.CGROUP_TELEMETRY_PERIOD,
        measured from self.last_cgroup_telemetry. When no timestamp is recorded yet,
        the current time is stored as the start of the first period.

        For each tracked cgroup, every metric with a positive value is passed to
        report_metric. "Memory" and "Process" metrics are additionally compared with
        the cgroup's thresholds; a value at or above the threshold is logged as a
        warning and sent as a CGroupsLimitsCrossed event.

        Afterwards the set of tracked cgroups is refreshed from the current
        extension handlers. Failures in either step are logged and swallowed so
        they do not propagate to the caller.
        """
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        # The period is measured against this method's own timestamp (initialized above
        # and refreshed at the end), not against the timer of another periodic operation.
        if datetime.datetime.utcnow() >= (self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
                for cgroup_name, metrics in metric_reported.items():
                    thresholds = metric_threshold[cgroup_name]

                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name, cgroup_name, value)

                        if metric_group == "Memory":
                            # Memory is collected in bytes, and limit is set in megabytes.
                            if value >= CGroups._format_memory_value('megabytes', thresholds.memory_limit):
                                msg = "CGroup {0}: Crossed the Memory Threshold. " \
                                      "Current Value: {1} bytes, Threshold: {2} megabytes." \
                                       .format(cgroup_name, value, thresholds.memory_limit)

                                logger.warn(msg)
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

                        if metric_group == "Process":
                            if value >= thresholds.cpu_limit:
                                msg = "CGroup {0}: Crossed the Processor Threshold. " \
                                      "Current Value: {1}, Threshold: {2}." \
                                       .format(cgroup_name, value, thresholds.cpu_limit)

                                logger.warn(msg)
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

            except Exception as e:
                # Best effort: a telemetry failure is logged, never raised.
                logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()
Example #2
0
 def test_report_metric(self, mock_event):
     """
     report_metric should hand exactly one JSON event to the mocked sink.

     The event must carry the expected provider id and a 'Counter' parameter
     whose value is the reported counter name.
     """
     import json

     provider_id = "69B669B9-4AF8-4C50-BDC4-6006FA76E975"

     event.report_metric("cpu", "%idle", "_total", 10.0)
     self.assertEqual(1, mock_event.call_count)

     # First positional argument of the single recorded call is the serialized event.
     serialized = mock_event.call_args[0][0]
     self.assertIn(provider_id, serialized)
     self.assertIn("%idle", serialized)

     event_dictionary = json.loads(serialized)
     self.assertEqual(event_dictionary['providerId'], provider_id)

     # Stop at the first parameter named 'Counter'; None means there is none.
     counter = next((entry for entry in event_dictionary["parameters"] if entry['name'] == 'Counter'), None)
     if counter is None:
         self.fail("Counter '%idle' not found in event parameters: {0}".format(repr(event_dictionary)))
     self.assertEqual(counter['value'], '%idle')
Example #3
0
 def test_report_metric(self, mock_event):
     """
     report_metric should hand exactly one JSON event to the mocked sink.

     The event must carry event.TELEMETRY_EVENT_PROVIDER_ID as its provider id
     and a 'Counter' parameter whose value is the reported counter name.
     """
     import json

     event.report_metric("cpu", "%idle", "_total", 10.0)
     self.assertEqual(1, mock_event.call_count)

     # First positional argument of the single recorded call is the serialized event.
     serialized = mock_event.call_args[0][0]
     self.assertIn(event.TELEMETRY_EVENT_PROVIDER_ID, serialized)
     self.assertIn("%idle", serialized)

     event_dictionary = json.loads(serialized)
     self.assertEqual(event_dictionary['providerId'],
                      event.TELEMETRY_EVENT_PROVIDER_ID)

     # Stop at the first parameter named 'Counter'; None means there is none.
     counter = next(
         (entry for entry in event_dictionary["parameters"]
          if entry['name'] == 'Counter'),
         None)
     if counter is None:
         self.fail(
             "Counter '%idle' not found in event parameters: {0}".format(
                 repr(event_dictionary)))
     self.assertEqual(counter['value'], '%idle')
 def test_report_metric(self, mock_event):
     """
     Check the event emitted by report_metric for a single counter.

     Exactly one call must reach the mocked sink; its JSON payload must name
     the expected provider id and contain a 'Counter' parameter set to '%idle'.
     """
     import json

     expected_provider = "69B669B9-4AF8-4C50-BDC4-6006FA76E975"
     counter_name = "%idle"

     event.report_metric("cpu", counter_name, "_total", 10.0)
     self.assertEqual(1, mock_event.call_count)

     payload = mock_event.call_args[0][0]
     for fragment in (expected_provider, counter_name):
         self.assertIn(fragment, payload)

     event_dictionary = json.loads(payload)
     self.assertEqual(event_dictionary['providerId'], expected_provider)

     # Only the first 'Counter' parameter is inspected.
     found = False
     for parameter in event_dictionary["parameters"]:
         if parameter['name'] != 'Counter':
             continue
         self.assertEqual(parameter['value'], '%idle')
         found = True
         break

     if not found:
         self.fail(
             "Counter '%idle' not found in event parameters: {0}".format(
                 repr(event_dictionary)))
Example #5
0
 def test_report_metric_should_create_events_that_have_all_the_parameters_in_the_telemetry_schema(
         self):
     """
     Run the shared schema check against an event created by report_metric.

     The expected parameters are keyed by the GuestAgentPerfCounterEventsSchema
     names and mirror the arguments passed to report_metric.
     """
     def create_metric_event():
         # Invoked by the shared helper when it needs the event to be created.
         return report_metric("cpu", "%idle", "total", 12.34)

     schema = GuestAgentPerfCounterEventsSchema
     expected = {
         schema.Category: 'cpu',
         schema.Counter: '%idle',
         schema.Instance: 'total',
         schema.Value: 12.34
     }

     self._test_create_event_function_should_create_events_that_have_all_the_parameters_in_the_telemetry_schema(
         create_event_function=create_metric_event,
         expected_parameters=expected)
Example #6
0
    def send_cgroup_telemetry(self):
        """
        Report the metrics of the tracked cgroups and refresh the tracked set.

        The work is done at most once per MonitorHandler.CGROUP_TELEMETRY_PERIOD,
        measured from self.last_cgroup_telemetry. When no timestamp is recorded yet,
        the current time is stored as the start of the first period.

        Every collected metric with a positive value is passed to report_metric,
        then the tracked cgroups are updated from the current extension handlers.
        Failures in either step are logged as warnings and swallowed.
        """
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        # The period is measured against this method's own timestamp (initialized above
        # and refreshed at the end), not against the timer of another periodic operation.
        if datetime.datetime.utcnow() >= (self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked().items():
                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name, cgroup_name, value)
            except Exception as e:
                # Best effort: a telemetry failure is logged, never raised.
                logger.warn("Failed to collect performance metrics: {0} [{1}]", e, traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn("Monitor: updating tracked extensions raised {0}: {1}", e, traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()
Example #7
0
 def test_report_metric_should_create_events_that_have_all_the_parameters_in_the_telemetry_schema(
         self):
     """
     Run the shared schema check against an event created by report_metric.

     The expected parameters mirror the arguments passed to report_metric, plus
     an empty 'ExtensionType'.
     """
     def create_metric_event():
         # Invoked by the shared helper when it needs the event to be created.
         return report_metric("cpu", "%idle", "total", 12.34)

     expected = {
         'Category': 'cpu',
         'Counter': '%idle',
         'Instance': 'total',
         'Value': 12.34,
         'ExtensionType': ''
     }

     self._test_create_event_function_should_create_events_that_have_all_the_parameters_in_the_telemetry_schema(
         create_event_function=create_metric_event,
         expected_parameters=expected)
Example #8
0
    def _operation_impl(self):
        """
        Check the agent's cgroup for unexpected processes, then report cgroup metrics.

        The process check produces an error message when the agent's cgroup contains
        a command line that CGroupConfigurator.is_agent_process does not recognize,
        or when the check itself raises. A message is reported (logged and sent as a
        CGroupsDebug event) only when it differs from the previously recorded one,
        and at most 5 messages are reported overall.

        Finally, every metric returned by CGroupsTelemetry.poll_all_tracked is
        passed to report_metric. Exceptions raised while polling or reporting
        metrics are not handled here.
        """
        #
        # Check the processes in the agent cgroup
        #
        processes_check_error = None
        try:
            processes = CGroupConfigurator.get_instance(
            ).get_processes_in_agent_cgroup()

            if processes is not None:
                unexpected_processes = sorted(
                    command_line for (_, command_line) in processes
                    if not CGroupConfigurator.is_agent_process(command_line))

                if unexpected_processes:
                    processes_check_error = "The agent's cgroup includes unexpected processes: {0}".format(
                        ustr(unexpected_processes))
        except Exception as e:
            processes_check_error = "Failed to check the processes in the agent's cgroup: {0}".format(
                ustr(e))

        # Report a small sample of errors
        if processes_check_error != self._last_error and self._error_count < 5:
            # Record the transition even when the error has cleared, so that the same
            # error is reported again if it comes back later.
            self._last_error = processes_check_error
            # A cleared error (None) is not a message: do not log it, do not send it
            # as an event, and do not spend one of the limited report slots on it.
            if processes_check_error is not None:
                self._error_count += 1
                logger.info(processes_check_error)
                add_event(op=WALAEventOperation.CGroupsDebug,
                          message=processes_check_error)

        #
        # Report metrics
        #
        metrics = CGroupsTelemetry.poll_all_tracked()

        for metric in metrics:
            report_metric(metric.category, metric.counter, metric.instance,
                          metric.value)
Example #9
0
    def send_cgroup_telemetry(self):
        """
        Report the metrics of the tracked cgroups and refresh the tracked set.

        The work is done at most once per MonitorHandler.CGROUP_TELEMETRY_PERIOD,
        measured from self.last_cgroup_telemetry. When no timestamp is recorded yet,
        the current time is stored as the start of the first period.

        Every collected metric with a positive value is passed to report_metric,
        then the tracked cgroups are updated from the current extension handlers.
        Failures in either step are logged (warning plus verbose traceback) and
        swallowed.
        """
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        # The period is measured against this method's own timestamp (initialized above
        # and refreshed at the end), not against the timer of another periodic operation.
        if datetime.datetime.utcnow() >= (self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked().items():
                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name, cgroup_name, value)
            except Exception as e:
                # Best effort: a telemetry failure is logged, never raised.
                logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()
Example #10
0
    def poll_telemetry_metrics(self):
        """
        Poll the tracked cgroups and report each resulting metric.

        Polling happens only once MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD has
        elapsed since self.last_cgroup_polling_telemetry; when no previous time is
        recorded, the current time is stored as the starting point. Each polled
        metric is passed to report_metric.

        Nothing is returned. Any exception is logged as a warning and swallowed.
        """
        try:  # A failure while reporting must not propagate to the caller.
            now = datetime.datetime.utcnow()
            if not self.last_cgroup_polling_telemetry:
                self.last_cgroup_polling_telemetry = now

            next_poll = (self.last_cgroup_polling_telemetry +
                         MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD)
            if now < next_poll:
                return

            polled = CGroupsTelemetry.poll_all_tracked()
            self.last_cgroup_polling_telemetry = now

            # A falsy result (None or empty) means there is nothing to report.
            for item in polled or ():
                report_metric(item.category, item.counter,
                              item.instance, item.value)
        except Exception as e:
            logger.warn("Could not poll all the tracked telemetry due to {0}",
                        ustr(e))
Example #11
0
    def send_cgroup_telemetry(self):
        """
        Report the metrics of the tracked cgroups and flag crossed resource limits.

        The work is done at most once per MonitorHandler.CGROUP_TELEMETRY_PERIOD,
        measured from self.last_cgroup_telemetry. When no timestamp is recorded yet,
        the current time is stored as the start of the first period.

        For each tracked cgroup, every metric with a positive value is passed to
        report_metric. "Memory" and "Process" metrics are additionally compared with
        the cgroup's thresholds; a value at or above the threshold is sent as a
        CGroupsLimitsCrossed event.

        Afterwards the set of tracked cgroups is refreshed from the current
        extension handlers. Failures in either step are logged and swallowed so
        they do not propagate to the caller.
        """
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        # The period is measured against this method's own timestamp (initialized above
        # and refreshed at the end), not against the timer of another periodic operation.
        if datetime.datetime.utcnow() >= (
                self.last_cgroup_telemetry +
                MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
                for cgroup_name, metrics in metric_reported.items():
                    thresholds = metric_threshold[cgroup_name]

                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name,
                                          cgroup_name, value)

                        if metric_group == "Memory":
                            # Memory is collected in bytes, and limit is set in megabytes.
                            if value >= CGroups._format_memory_value(
                                    'megabytes', thresholds.memory_limit):
                                msg = "CGroup {0}: Crossed the Memory Threshold. " \
                                      "Current Value:{1} bytes, Threshold:{2} megabytes.".format(
                                          cgroup_name, value, thresholds.memory_limit)
                                add_event(
                                    name=AGENT_NAME,
                                    version=CURRENT_VERSION,
                                    op=WALAEventOperation.CGroupsLimitsCrossed,
                                    is_success=True,
                                    message=msg,
                                    log_event=True)

                        if metric_group == "Process":
                            if value >= thresholds.cpu_limit:
                                msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format(
                                    cgroup_name, value, thresholds.cpu_limit)
                                add_event(
                                    name=AGENT_NAME,
                                    version=CURRENT_VERSION,
                                    op=WALAEventOperation.CGroupsLimitsCrossed,
                                    is_success=True,
                                    message=msg,
                                    log_event=True)

            except Exception as e:
                # Best effort: a telemetry failure is logged, never raised.
                logger.warn(
                    "Monitor: failed to collect cgroups performance metrics: {0}",
                    ustr(e))
                logger.verbose(traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(
                    self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn(
                    "Monitor: failed to update cgroups tracked extensions: {0}",
                    ustr(e))
                logger.verbose(traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()