def send_cgroup_telemetry(self):
    """
    Report the metrics of the tracked cgroups once per CGROUP_TELEMETRY_PERIOD.

    When the period has elapsed, every metric returned by
    CGroupsTelemetry.collect_all_tracked() with a value above zero is sent
    through report_metric(). "Memory" and "Process" metrics are also compared
    with the thresholds of their cgroup; a value at or above the threshold is
    logged as a warning and reported as a CGroupsLimitsCrossed event.
    Afterwards the set of tracked extension cgroups is refreshed from the
    current handlers and the timestamp of the last report is updated.

    The first call only starts the timer (assuming the period is positive).
    Failures while collecting or while updating the tracked cgroups are
    logged and never propagated to the caller.
    """
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = datetime.datetime.utcnow()

    # Gate on this method's own timestamp. The previous code compared against
    # last_telemetry_heartbeat, which this method never sets or updates, so the
    # period was tied to an unrelated timer (and failed if it was still None).
    next_report = self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD
    if datetime.datetime.utcnow() < next_report:
        return

    try:
        metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
        for cgroup_name, metrics in metric_reported.items():
            thresholds = metric_threshold[cgroup_name]

            for metric_group, metric_name, value in metrics:
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)

                if metric_group == "Memory":
                    # Memory is collected in bytes, and limit is set in megabytes.
                    if value >= CGroups._format_memory_value('megabytes', thresholds.memory_limit):
                        msg = ("CGroup {0}: Crossed the Memory Threshold. "
                               "Current Value: {1} bytes, Threshold: {2} megabytes."
                               ).format(cgroup_name, value, thresholds.memory_limit)
                        logger.warn(msg)
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)

                if metric_group == "Process":
                    if value >= thresholds.cpu_limit:
                        msg = ("CGroup {0}: Crossed the Processor Threshold. "
                               "Current Value: {1}, Threshold: {2}."
                               ).format(cgroup_name, value, thresholds.cpu_limit)
                        logger.warn(msg)
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)
    except Exception as e:
        # Best effort: a collection failure must not stop the caller.
        logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
    except Exception as e:
        logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    # Restart the period only after the whole pass has been attempted.
    self.last_cgroup_telemetry = datetime.datetime.utcnow()
def test_report_metric(self, mock_event):
    """
    report_metric() should hand exactly one JSON event to the mocked sink.

    The serialized event must carry the expected provider id and a 'Counter'
    parameter whose value is '%idle'. mock_event is presumably injected by a
    patch decorator that is outside this view.
    """
    import json

    provider_id = "69B669B9-4AF8-4C50-BDC4-6006FA76E975"

    event.report_metric("cpu", "%idle", "_total", 10.0)
    self.assertEqual(1, mock_event.call_count)

    # First positional argument of the recorded call: the serialized event.
    payload = mock_event.call_args[0][0]
    self.assertIn(provider_id, payload)
    self.assertIn("%idle", payload)

    decoded = json.loads(payload)
    self.assertEqual(decoded['providerId'], provider_id)

    # Only the first parameter named 'Counter' is inspected.
    not_found = object()
    counter = next((p for p in decoded["parameters"] if p['name'] == 'Counter'), not_found)
    if counter is not_found:
        self.fail("Counter '%idle' not found in event parameters: {0}".format(repr(decoded)))
    self.assertEqual(counter['value'], '%idle')
def test_report_metric(self, mock_event):
    """
    report_metric() should hand exactly one JSON event to the mocked sink.

    The serialized event must carry event.TELEMETRY_EVENT_PROVIDER_ID and a
    'Counter' parameter whose value is '%idle'. mock_event is presumably
    injected by a patch decorator that is outside this view.
    """
    import json

    provider_id = event.TELEMETRY_EVENT_PROVIDER_ID

    event.report_metric("cpu", "%idle", "_total", 10.0)
    self.assertEqual(1, mock_event.call_count)

    # First positional argument of the recorded call: the serialized event.
    payload = mock_event.call_args[0][0]
    self.assertIn(provider_id, payload)
    self.assertIn("%idle", payload)

    decoded = json.loads(payload)
    self.assertEqual(decoded['providerId'], provider_id)

    # Only the first parameter named 'Counter' is inspected.
    not_found = object()
    counter = next((p for p in decoded["parameters"] if p['name'] == 'Counter'), not_found)
    if counter is not_found:
        self.fail("Counter '%idle' not found in event parameters: {0}".format(repr(decoded)))
    self.assertEqual(counter['value'], '%idle')
def test_report_metric(self, mock_event):
    """
    Check the event produced by report_metric() for the '%idle' cpu counter.

    Exactly one call must reach the mocked sink, and its JSON payload must
    contain the expected provider id plus a 'Counter' parameter set to
    '%idle'. mock_event is presumably injected by a patch decorator that is
    outside this view.
    """
    import json

    expected_provider = "69B669B9-4AF8-4C50-BDC4-6006FA76E975"

    event.report_metric("cpu", "%idle", "_total", 10.0)
    self.assertEqual(1, mock_event.call_count)

    # The serialized event is the first positional argument of the call.
    serialized = mock_event.call_args[0][0]
    self.assertIn(expected_provider, serialized)
    self.assertIn("%idle", serialized)

    parsed = json.loads(serialized)
    self.assertEqual(parsed['providerId'], expected_provider)

    # Stop at the first parameter named 'Counter', as a for/break would.
    missing = object()
    counter = next((item for item in parsed["parameters"] if item['name'] == 'Counter'), missing)
    if counter is missing:
        self.fail("Counter '%idle' not found in event parameters: {0}".format(repr(parsed)))
    self.assertEqual(counter['value'], '%idle')
def test_report_metric_should_create_events_that_have_all_the_parameters_in_the_telemetry_schema(self):
    """
    Run the shared schema check against an event created by report_metric().

    The shared helper receives a callable that reports the '%idle' cpu metric
    and the perf-counter parameters that the resulting event is expected to
    contain.
    """
    def create_metric_event():
        return report_metric("cpu", "%idle", "total", 12.34)

    expected = {
        GuestAgentPerfCounterEventsSchema.Category: 'cpu',
        GuestAgentPerfCounterEventsSchema.Counter: '%idle',
        GuestAgentPerfCounterEventsSchema.Instance: 'total',
        GuestAgentPerfCounterEventsSchema.Value: 12.34,
    }

    self._test_create_event_function_should_create_events_that_have_all_the_parameters_in_the_telemetry_schema(
        create_event_function=create_metric_event,
        expected_parameters=expected)
def send_cgroup_telemetry(self):
    """
    Report the metrics of the tracked cgroups once per CGROUP_TELEMETRY_PERIOD.

    When the period has elapsed, every metric returned by
    CGroupsTelemetry.collect_all_tracked() with a value above zero is sent
    through report_metric(); then the set of tracked extension cgroups is
    refreshed from the current handlers and the timestamp of the last report
    is updated.

    The first call only starts the timer (assuming the period is positive).
    Failures while collecting or while updating the tracked cgroups are
    logged and never propagated to the caller.
    """
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = datetime.datetime.utcnow()

    # Gate on this method's own timestamp. The previous code compared against
    # last_telemetry_heartbeat, which this method never sets or updates, so the
    # period was tied to an unrelated timer (and failed if it was still None).
    if datetime.datetime.utcnow() < (self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
        return

    try:
        for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked().items():
            for metric_group, metric_name, value in metrics:
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)
    except Exception as e:
        # Best effort: a collection failure must not stop the caller.
        logger.warn("Failed to collect performance metrics: {0} [{1}]", e, traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
    except Exception as e:
        logger.warn("Monitor: updating tracked extensions raised {0}: {1}", e, traceback.format_exc())

    # Restart the period only after the whole pass has been attempted.
    self.last_cgroup_telemetry = datetime.datetime.utcnow()
def test_report_metric_should_create_events_that_have_all_the_parameters_in_the_telemetry_schema(self):
    """
    Run the shared schema check against an event created by report_metric().

    The shared helper receives a callable that reports the '%idle' cpu metric
    and the parameters that the resulting event is expected to contain,
    including an empty 'ExtensionType'.
    """
    def create_metric_event():
        return report_metric("cpu", "%idle", "total", 12.34)

    expected = {
        'Category': 'cpu',
        'Counter': '%idle',
        'Instance': 'total',
        'Value': 12.34,
        'ExtensionType': '',
    }

    self._test_create_event_function_should_create_events_that_have_all_the_parameters_in_the_telemetry_schema(
        create_event_function=create_metric_event,
        expected_parameters=expected)
def _operation_impl(self):
    """
    Check the agent's cgroup for unexpected processes, then report cgroup metrics.

    Every command line in the agent's cgroup that
    CGroupConfigurator.is_agent_process() rejects is collected into a sorted
    list and turned into an error message; a failure of the check itself also
    becomes an error message. A message is logged and sent as a CGroupsDebug
    event only when it differs from the last one reported and fewer than five
    have been reported so far. Finally every metric returned by
    CGroupsTelemetry.poll_all_tracked() is sent through report_metric().
    """
    #
    # Check the processes in the agent cgroup
    #
    error_message = None
    try:
        agent_processes = CGroupConfigurator.get_instance().get_processes_in_agent_cgroup()
        if agent_processes is not None:
            unexpected = sorted(
                command_line
                for _, command_line in agent_processes
                if not CGroupConfigurator.is_agent_process(command_line))
            if unexpected:
                error_message = "The agent's cgroup includes unexpected processes: {0}".format(ustr(unexpected))
    except Exception as exception:
        error_message = "Failed to check the processes in the agent's cgroup: {0}".format(ustr(exception))

    # Report a small sample of errors
    # NOTE(review): when a previously reported error clears, error_message is None
    # and still differs from _last_error, so None is logged and sent as the event
    # message (and counted against the limit) - confirm this is intended.
    changed = error_message != self._last_error
    if changed and self._error_count < 5:
        self._error_count += 1
        self._last_error = error_message
        logger.info(error_message)
        add_event(op=WALAEventOperation.CGroupsDebug, message=error_message)

    #
    # Report metrics
    #
    for sample in CGroupsTelemetry.poll_all_tracked():
        report_metric(sample.category, sample.counter, sample.instance, sample.value)
def send_cgroup_telemetry(self):
    """
    Report the metrics of the tracked cgroups once per CGROUP_TELEMETRY_PERIOD.

    When the period has elapsed, every metric returned by
    CGroupsTelemetry.collect_all_tracked() with a value above zero is sent
    through report_metric(); then the set of tracked extension cgroups is
    refreshed from the current handlers and the timestamp of the last report
    is updated.

    The first call only starts the timer (assuming the period is positive).
    Failures while collecting or while updating the tracked cgroups are
    logged (details at verbose level) and never propagated to the caller.
    """
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = datetime.datetime.utcnow()

    # Gate on this method's own timestamp. The previous code compared against
    # last_telemetry_heartbeat, which this method never sets or updates, so the
    # period was tied to an unrelated timer (and failed if it was still None).
    if datetime.datetime.utcnow() < (self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
        return

    try:
        for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked().items():
            for metric_group, metric_name, value in metrics:
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)
    except Exception as e:
        # Best effort: a collection failure must not stop the caller.
        logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
    except Exception as e:
        logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    # Restart the period only after the whole pass has been attempted.
    self.last_cgroup_telemetry = datetime.datetime.utcnow()
def poll_telemetry_metrics(self):
    """
    Poll the tracked cgroups and report the resulting metrics directly.

    Once CGROUP_TELEMETRY_POLLING_PERIOD has elapsed since the last poll,
    CGroupsTelemetry.poll_all_tracked() is called and every metric it returns
    is sent through report_metric(). The first call only records the current
    time (assuming the period is positive). Nothing is returned.

    Any exception is logged as a warning and swallowed, so a reporting
    problem does not take down the whole monitor thread.
    """
    try:
        now = datetime.datetime.utcnow()

        if not self.last_cgroup_polling_telemetry:
            self.last_cgroup_polling_telemetry = now

        next_poll = self.last_cgroup_polling_telemetry + MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD
        if now < next_poll:
            return

        polled = CGroupsTelemetry.poll_all_tracked()
        # Recorded only after a successful poll; a failing poll is retried on the next call.
        self.last_cgroup_polling_telemetry = now

        if not polled:
            return

        for sample in polled:
            report_metric(sample.category, sample.counter, sample.instance, sample.value)
    except Exception as e:
        logger.warn("Could not poll all the tracked telemetry due to {0}", ustr(e))
def send_cgroup_telemetry(self):
    """
    Report the metrics of the tracked cgroups once per CGROUP_TELEMETRY_PERIOD.

    When the period has elapsed, every metric returned by
    CGroupsTelemetry.collect_all_tracked() with a value above zero is sent
    through report_metric(). "Memory" and "Process" metrics are also compared
    with the thresholds of their cgroup; a value at or above the threshold is
    reported as a CGroupsLimitsCrossed event. Afterwards the set of tracked
    extension cgroups is refreshed from the current handlers and the
    timestamp of the last report is updated.

    The first call only starts the timer (assuming the period is positive).
    Failures while collecting or while updating the tracked cgroups are
    logged and never propagated to the caller.
    """
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = datetime.datetime.utcnow()

    # Gate on this method's own timestamp. The previous code compared against
    # last_telemetry_heartbeat, which this method never sets or updates, so the
    # period was tied to an unrelated timer (and failed if it was still None).
    next_report = self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD
    if datetime.datetime.utcnow() < next_report:
        return

    try:
        metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
        for cgroup_name, metrics in metric_reported.items():
            thresholds = metric_threshold[cgroup_name]

            for metric_group, metric_name, value in metrics:
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)

                if metric_group == "Memory":
                    # Memory is collected in bytes, and limit is set in megabytes.
                    if value >= CGroups._format_memory_value('megabytes', thresholds.memory_limit):
                        msg = ("CGroup {0}: Crossed the Memory Threshold. "
                               "Current Value:{1} bytes, Threshold:{2} megabytes."
                               ).format(cgroup_name, value, thresholds.memory_limit)
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)

                if metric_group == "Process":
                    if value >= thresholds.cpu_limit:
                        msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format(
                            cgroup_name, value, thresholds.cpu_limit)
                        add_event(name=AGENT_NAME,
                                  version=CURRENT_VERSION,
                                  op=WALAEventOperation.CGroupsLimitsCrossed,
                                  is_success=True,
                                  message=msg,
                                  log_event=True)
    except Exception as e:
        # Best effort: a collection failure must not stop the caller.
        logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
    except Exception as e:
        logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    # Restart the period only after the whole pass has been attempted.
    self.last_cgroup_telemetry = datetime.datetime.utcnow()