def test_chunk_samples(self):
    # Generate enough samples to overflow a single 0.01 MB gRPC message,
    # so the collector has to split them into two chunks
    test_collector = MetricsCollector(
        self._services, 5, 10, self.timeout,
        grpc_max_msg_size_mb=0.01,
        loop=asyncio.new_event_loop(),
    )
    samples = self._generate_samples(2000)
    chunked_samples = test_collector._chunk_samples(samples)
    self.assertEqual(len(list(chunked_samples)), 2)
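
# For context: a minimal sketch of what the private _chunk_samples helper
# exercised above plausibly does -- greedy packing of MetricFamily protos by
# serialized size. This is an assumption for illustration, not the actual
# magmad implementation; only the method name and the size limit come from
# the test above.
def _chunk_samples_sketch(samples, max_msg_size_bytes):
    """Yield lists of MetricFamily whose combined serialized size stays
    below max_msg_size_bytes."""
    chunk, chunk_size = [], 0
    for family in samples:
        size = family.ByteSize()  # serialized size of this protobuf message
        if chunk and chunk_size + size > max_msg_size_bytes:
            yield chunk
            chunk, chunk_size = [], 0
        chunk.append(family)
        chunk_size += size
    if chunk:
        yield chunk
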
def setUp(self):
    ServiceRegistry.add_service('test', '0.0.0.0', 0)
    ServiceRegistry._PROXY_CONFIG = {
        'local_port': 1234,
        'cloud_address': 'test',
        'proxy_cloud_connections': True,
    }
    self._services = ['test']
    self.gateway_id = "2876171d-bf38-4254-b4da-71a713952904"
    self.queue_length = 5
    self.timeout = 1
    self._collector = MetricsCollector(
        self._services, 5, 10, self.timeout,
        grpc_max_msg_size_mb=4,
        queue_length=self.queue_length,
        loop=asyncio.new_event_loop(),
    )

def main():
    """
    Main magmad function
    """
    service = MagmaService('magmad', mconfigs_pb2.MagmaD())

    # Optionally pipe errors to Sentry
    sentry_init(service_name=service.name)

    logging.info('Starting magmad for UUID: %s', snowflake.make_snowflake())

    # Create service manager
    services = service.config.get('magma_services')
    init_system = service.config.get('init_system', 'systemd')
    registered_dynamic_services = service.config.get(
        'registered_dynamic_services', [],
    )
    enabled_dynamic_services = []
    if service.mconfig is not None:
        enabled_dynamic_services = service.mconfig.dynamic_services

    # Poll the services' Service303 interface
    service_poller = ServicePoller(
        service.loop, service.config,
        enabled_dynamic_services,
    )
    service_poller.start()

    service_manager = ServiceManager(
        services, init_system, service_poller,
        registered_dynamic_services, enabled_dynamic_services,
    )

    # Get metrics service config
    metrics_config = service.config.get('metricsd')
    metrics_services = metrics_config['services']
    collect_interval = metrics_config['collect_interval']
    sync_interval = metrics_config['sync_interval']
    grpc_timeout = metrics_config['grpc_timeout']
    grpc_msg_size = metrics_config.get('max_grpc_msg_size_mb', 4)
    metrics_post_processor_fn = metrics_config.get('post_processing_fn')

    metric_scrape_targets = [
        ScrapeTarget(t['url'], t['name'], t['interval'])
        for t in metrics_config.get('metric_scrape_targets', [])
    ]

    # Create local metrics collector
    metrics_collector = MetricsCollector(
        services=metrics_services,
        collect_interval=collect_interval,
        sync_interval=sync_interval,
        grpc_timeout=grpc_timeout,
        grpc_max_msg_size_mb=grpc_msg_size,
        loop=service.loop,
        post_processing_fn=get_metrics_postprocessor_fn(
            metrics_post_processor_fn,
        ),
        scrape_targets=metric_scrape_targets,
    )

    # Poll and sync the metrics collector loops
    metrics_collector.run()

    # Start a background thread to stream updates from the cloud
    stream_client = None
    if service.config.get('enable_config_streamer', False):
        stream_client = StreamerClient(
            {
                CONFIG_STREAM_NAME: ConfigManager(
                    services, service_manager,
                    service, MconfigManagerImpl(),
                ),
            },
            service.loop,
        )

    # Create sync rpc client with a heartbeat of 30 seconds (timeout = 60s)
    sync_rpc_client = None
    if service.config.get('enable_sync_rpc', False):
        sync_rpc_client = SyncRPCClient(
            service.loop, 30,
            service.config.get('print_grpc_payload', False),
        )

    first_time_bootstrap = True

    # This is called when bootstrap succeeds and when _bootstrap_check is
    # invoked but bootstrap is not needed. If it's invoked right after certs
    # are generated, certs_generated is true and control_proxy will restart.
    async def bootstrap_success_cb(certs_generated: bool):
        nonlocal first_time_bootstrap
        if first_time_bootstrap:
            if stream_client:
                stream_client.start()
            if sync_rpc_client:
                sync_rpc_client.start()
            first_time_bootstrap = False
        if certs_generated:
            svcs_to_restart = []
            if 'control_proxy' in services:
                svcs_to_restart.append('control_proxy')

            # fluent-bit caches TLS client certs in memory, so we need to
            # restart it whenever the certs change
            fresh_mconfig = get_mconfig_manager().load_service_mconfig(
                'magmad', mconfigs_pb2.MagmaD(),
            )
            dynamic_svcs = fresh_mconfig.dynamic_services or []
            if 'td-agent-bit' in dynamic_svcs:
                svcs_to_restart.append('td-agent-bit')

            await service_manager.restart_services(services=svcs_to_restart)

    # Create bootstrap manager
    bootstrap_manager = BootstrapManager(service, bootstrap_success_cb)

    # Initialize kernel version poller if it is enabled
    kernel_version_poller = None
    if service.config.get('enable_kernel_version_checking', False):
        kernel_version_poller = KernelVersionsPoller(service)
        kernel_version_poller.start()

    # Gateway status factory to bundle various information about this
    # gateway into an object
    gateway_status_factory = GatewayStatusFactory(
        service=service,
        service_poller=service_poller,
        kernel_version_poller=kernel_version_poller,
    )

    # grpc_client_manager to manage gRPC client recycling
    grpc_client_manager = GRPCClientManager(
        service_name="state",
        service_stub=StateServiceStub,
        max_client_reuse=60,
    )

    # Initialize StateReporter
    state_reporter = StateReporter(
        config=service.config,
        mconfig=service.mconfig,
        loop=service.loop,
        bootstrap_manager=bootstrap_manager,
        gw_status_factory=gateway_status_factory,
        grpc_client_manager=grpc_client_manager,
    )

    # Initialize ServiceHealthWatchdog
    service_health_watchdog = ServiceHealthWatchdog(
        config=service.config,
        loop=service.loop,
        service_poller=service_poller,
        service_manager=service_manager,
    )

    # Start the bootstrap manager
    bootstrap_manager.start_bootstrap_manager()

    # Start all services when magmad comes up
    service.loop.create_task(service_manager.start_services())

    # Start state reporting loop
    state_reporter.start()

    # Start service timeout health check loop
    service_health_watchdog.start()

    # Start upgrade manager loop
    if service.config.get('enable_upgrade_manager', False):
        upgrader = _get_upgrader_impl(service)
        service.loop.create_task(start_upgrade_loop(service, upgrader))

    # Start network health metric collection
    if service.config.get('enable_network_monitor', False):
        service.loop.create_task(metrics_collection_loop(service.config))

    # Create generic command executor
    command_executor = None
    if service.config.get('generic_command_config', None):
        command_executor = get_command_executor_impl(service)

    # Start loop to monitor unattended upgrade status
    service.loop.create_task(monitor_unattended_upgrade_status())

    # Add all servicers to the server
    magmad_servicer = MagmadRpcServicer(
        service,
        services,
        service_manager,
        get_mconfig_manager(),
        command_executor,
        service.loop,
        service.config.get('print_grpc_payload', False),
    )
    magmad_servicer.add_to_server(service.rpc_server)

    if SDWatchdog.has_notify():
        # Create systemd watchdog
        sdwatchdog = SDWatchdog(
            tasks=[bootstrap_manager, state_reporter],
            update_status=True,
        )
        # Start watchdog loop
        service.loop.create_task(sdwatchdog.run())

    # Run the service loop
    service.run()

    # Cleanup the service
    service.close()
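
# For reference, the shape of the configuration that main() reads above,
# written out as the dict service.config would return. The key names come
# from the .get() calls in this function; the values are illustrative
# placeholders, not shipped defaults.
EXAMPLE_MAGMAD_CONFIG = {
    'magma_services': ['mobilityd', 'sessiond'],
    'init_system': 'systemd',
    'registered_dynamic_services': ['td-agent-bit'],
    'enable_config_streamer': True,
    'enable_sync_rpc': True,
    'enable_kernel_version_checking': False,
    'enable_upgrade_manager': False,
    'enable_network_monitor': False,
    'print_grpc_payload': False,
    'metricsd': {
        'services': ['magmad'],
        'collect_interval': 60,      # seconds between Service303 polls
        'sync_interval': 60,         # seconds between pushes to the cloud
        'grpc_timeout': 30,
        'max_grpc_msg_size_mb': 4,   # optional; main() defaults this to 4
        'post_processing_fn': None,  # optional; resolved by
                                     # get_metrics_postprocessor_fn()
        'metric_scrape_targets': [
            {'url': 'http://localhost:9090/metrics',
             'name': 'example_scrape_target',
             'interval': 60},
        ],
    },
}
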
class MetricsCollectorTests(unittest.TestCase):
    """
    Tests for the MetricsCollector collect and sync
    """

    @classmethod
    def setUpClass(cls):
        cls.queue_size = 5

    def setUp(self):
        ServiceRegistry.add_service('test', '0.0.0.0', 0)
        ServiceRegistry._PROXY_CONFIG = {
            'local_port': 1234,
            'cloud_address': 'test',
            'proxy_cloud_connections': True,
        }
        self._services = ['test']
        self.gateway_id = "2876171d-bf38-4254-b4da-71a713952904"
        self.timeout = 1
        self._collector = MetricsCollector(
            self._services, 5, 10, self.timeout,
            grpc_max_msg_size_mb=4,
            loop=asyncio.new_event_loop(),
        )

    @unittest.mock.patch(
        'magma.magmad.metrics_collector.MetricsControllerStub',
    )
    def test_sync(self, controller_mock):
        """
        Test if the collector syncs our sample.
        """
        # Mock out Collect.future
        mock = unittest.mock.Mock()
        mock.Collect.future.side_effect = [
            unittest.mock.Mock(),
            unittest.mock.Mock(),
            unittest.mock.Mock(),
        ]
        controller_mock.side_effect = [mock, mock, mock]

        # Call with no samples
        service_name = "test"
        self._collector.sync(service_name)
        controller_mock.Collect.future.assert_not_called()
        self._collector._loop.stop()

        # Call with new samples to send
        samples = [MetricFamily(name="1234")]
        self._collector._samples_for_service[service_name].extend(samples)
        with unittest.mock.patch('snowflake.snowflake') as mock_snowflake:
            mock_snowflake.side_effect = lambda: self.gateway_id
            self._collector.sync(service_name)
        mock.Collect.future.assert_called_once_with(
            MetricsContainer(gatewayId=self.gateway_id, family=samples),
            self.timeout,
        )
        self.assertCountEqual(
            self._collector._samples_for_service[service_name], [],
        )

        # Reduce max msg size to trigger msg chunking
        self._collector.grpc_max_msg_size_bytes = 1500
        samples = self._generate_samples(140)
        self._collector._samples_for_service[service_name].extend(samples)
        chunk1 = samples[:70]
        chunk2 = samples[70:140]
        with unittest.mock.patch('snowflake.snowflake') as mock_snowflake:
            mock_snowflake.side_effect = lambda: self.gateway_id
            self._collector.sync(service_name)
        mock.Collect.future.assert_any_call(
            MetricsContainer(gatewayId=self.gateway_id, family=chunk1),
            self.timeout,
        )
        mock.Collect.future.assert_any_call(
            MetricsContainer(gatewayId=self.gateway_id, family=chunk2),
            self.timeout,
        )
        self.assertCountEqual(
            self._collector._samples_for_service[service_name], [],
        )

    def test_collect(self):
        """
        Test that collect_done re-enqueues the collected samples.
        """
        mock = unittest.mock.MagicMock()
        service_name = "test"
        samples = [MetricFamily(name="2345")]
        self._collector._samples_for_service[service_name].clear()
        self._collector._samples_for_service[service_name].extend(samples)
        mock.result.side_effect = [MetricsContainer(family=samples)]
        mock.exception.side_effect = [False]
        self._collector.collect_done('test', mock)
        # Should dequeue sample from the left, and enqueue on right;
        # collector should add one more metric for collection success/failure
        self.assertEqual(
            len(self._collector._samples_for_service[service_name]),
            len(samples * 2) + 1,
        )

    def test_collect_start_time(self):
        """
        Test that collect_done derives process uptime from the reported
        process start time.
""" mock = unittest.mock.MagicMock() start_metric = Metric() start_metric.gauge.value = calendar.timegm(time.gmtime()) - 1 start_time = MetricFamily( name=str(metricsd_pb2.process_start_time_seconds), metric=[start_metric], ) samples = [start_time] service_name = "test" self._collector._samples_for_service[service_name].clear() mock.result.side_effect = [MetricsContainer(family=samples)] mock.exception.side_effect = [False] self._collector.collect_done('test', mock) # should have uptime, start time, and collection success self.assertEqual( len(self._collector._samples_for_service[service_name]), 3) uptime_list = [ fam for fam in self._collector._samples_for_service[service_name] if fam.name == str(metricsd_pb2.process_uptime_seconds) ] self.assertEqual(len(uptime_list), 1) self.assertEqual(len(uptime_list[0].metric), 1) self.assertGreater(uptime_list[0].metric[0].gauge.value, 0) # ensure no exceptions with empty metric empty = MetricFamily(name=str(metricsd_pb2.process_start_time_seconds)) samples = [empty] self._collector._samples_for_service[service_name].clear() mock.result.side_effect = [MetricsContainer(family=samples)] mock.exception.side_effect = [False] try: self._collector.collect_done('test', mock) except Exception: # pylint: disable=broad-except self.fail("Collection with empty metric should not have failed") def test_counter_to_proto(self): test_counter = prometheus_client.core.CounterMetricFamily( "test", "", labels=["testLabel"], ) test_counter.add_metric(["val"], 1.23) test_counter.add_metric(["val2"], 2.34) proto = _counter_to_proto(test_counter) self.assertEqual(proto.name, test_counter.name) self.assertEqual(proto.type, metrics_pb2.COUNTER) self.assertEqual(2, len(proto.metric)) self.assertEqual("val", proto.metric[0].label[0].value) self.assertEqual(1.23, proto.metric[0].counter.value) self.assertEqual("val2", proto.metric[1].label[0].value) self.assertEqual(2.34, proto.metric[1].counter.value) def test_gauge_to_proto(self): test_gauge = prometheus_client.core.GaugeMetricFamily( "test", "", labels=["testLabel"], ) test_gauge.add_metric(["val"], 1.23) test_gauge.add_metric(["val2"], 2.34) proto = _gauge_to_proto(test_gauge) self.assertEqual(proto.name, test_gauge.name) self.assertEqual(proto.type, metrics_pb2.GAUGE) self.assertEqual(2, len(proto.metric)) self.assertEqual("val", proto.metric[0].label[0].value) self.assertEqual(1.23, proto.metric[0].gauge.value) self.assertEqual("val2", proto.metric[1].label[0].value) self.assertEqual(2.34, proto.metric[1].gauge.value) def test_untyped_to_proto(self): test_untyped = prometheus_client.core.UntypedMetricFamily( "test", "", labels=["testLabel"], ) test_untyped.add_metric(["val"], 1.23) test_untyped.add_metric(["val2"], 2.34) proto = _untyped_to_proto(test_untyped) self.assertEqual(proto.name, test_untyped.name) self.assertEqual(proto.type, metrics_pb2.UNTYPED) self.assertEqual(2, len(proto.metric)) self.assertEqual("val", proto.metric[0].label[0].value) self.assertEqual(1.23, proto.metric[0].untyped.value) self.assertEqual("val2", proto.metric[1].label[0].value) self.assertEqual(2.34, proto.metric[1].untyped.value) def test_summary_to_proto(self): test_summary = prometheus_client.core.SummaryMetricFamily( "test", "", labels=["testLabel"], ) # Add first unique labelset metrics test_summary.add_metric(["val1"], 10, 0.1) test_summary.add_sample("test", { "quantile": "0.0", "testLabel": "val1" }, 0.01) test_summary.add_sample("test", { "quantile": "0.5", "testLabel": "val1" }, 0.02) test_summary.add_sample("test", { "quantile": 
"1.0", "testLabel": "val1" }, 0.03) # Add second unique labelset metrics test_summary.add_metric(["val2"], 20, 0.2) test_summary.add_sample("test", { "quantile": "0.0", "testLabel": "val2" }, 0.02) test_summary.add_sample("test", { "quantile": "0.5", "testLabel": "val2" }, 0.04) test_summary.add_sample("test", { "quantile": "1.0", "testLabel": "val2" }, 0.06) protos = _summary_to_proto(test_summary) self.assertEqual(2, len(protos)) for proto in protos: self.assertEqual(proto.name, test_summary.name) self.assertEqual(proto.type, metrics_pb2.SUMMARY) if proto.metric[0].label[0].value == "val1": self.assertEqual(1, len(proto.metric)) self.assertEqual(10, proto.metric[0].summary.sample_count) self.assertEqual(0.1, proto.metric[0].summary.sample_sum) self.assertEqual(3, len(proto.metric[0].summary.quantile)) self.assertEqual(0.01, proto.metric[0].summary.quantile[0].value) self.assertEqual(0.02, proto.metric[0].summary.quantile[1].value) self.assertEqual(0.03, proto.metric[0].summary.quantile[2].value) else: self.assertEqual(1, len(proto.metric)) self.assertEqual(20, proto.metric[0].summary.sample_count) self.assertEqual(0.2, proto.metric[0].summary.sample_sum) self.assertEqual(3, len(proto.metric[0].summary.quantile)) self.assertEqual(0.02, proto.metric[0].summary.quantile[0].value) self.assertEqual(0.04, proto.metric[0].summary.quantile[1].value) self.assertEqual(0.06, proto.metric[0].summary.quantile[2].value) def test_histogram_to_proto(self): test_hist = prometheus_client.core.HistogramMetricFamily( "test", "", labels=["testLabel"], ) # Add first unique labelset metrics test_hist.add_metric(["val1"], [(1, 1), (10, 2), (100, 3)], 6) # Add second unique labelset metrics test_hist.add_metric(["val2"], [(1, 2), (10, 3), (100, 4)], 9) protos = _histogram_to_proto(test_hist) self.assertEqual(2, len(protos)) for proto in protos: self.assertEqual(proto.name, test_hist.name) self.assertEqual(proto.type, metrics_pb2.HISTOGRAM) if proto.metric[0].label[0].value == "val1": self.assertEqual(1, len(proto.metric)) self.assertEqual(3, proto.metric[0].histogram.sample_count) self.assertEqual(6, proto.metric[0].histogram.sample_sum) self.assertEqual(3, len(proto.metric[0].histogram.bucket)) self.assertEqual( 1, proto.metric[0].histogram.bucket[0].cumulative_count) self.assertEqual( 2, proto.metric[0].histogram.bucket[1].cumulative_count) self.assertEqual( 3, proto.metric[0].histogram.bucket[2].cumulative_count) else: self.assertEqual(1, len(proto.metric)) self.assertEqual(4, proto.metric[0].histogram.sample_count) self.assertEqual(9, proto.metric[0].histogram.sample_sum) self.assertEqual(3, len(proto.metric[0].histogram.bucket)) self.assertEqual( 2, proto.metric[0].histogram.bucket[0].cumulative_count) self.assertEqual( 3, proto.metric[0].histogram.bucket[1].cumulative_count) self.assertEqual( 4, proto.metric[0].histogram.bucket[2].cumulative_count) def _generate_samples(self, number): samples = [] for _ in range(number): sample_name = randrange(10000) samples.append(MetricFamily(name=str(sample_name))) return samples
class MetricsCollectorTests(unittest.TestCase):
    """
    Tests for the MetricsCollector collect and sync
    """

    @classmethod
    def setUpClass(cls):
        cls.queue_size = 5

    def setUp(self):
        ServiceRegistry.add_service('test', '0.0.0.0', 0)
        ServiceRegistry._PROXY_CONFIG = {
            'local_port': 1234,
            'cloud_address': 'test',
            'proxy_cloud_connections': True,
        }
        self._services = ['test']
        self.gateway_id = "2876171d-bf38-4254-b4da-71a713952904"
        self.queue_length = 5
        self.timeout = 1
        self._collector = MetricsCollector(
            self._services, 5, 10, self.timeout,
            queue_length=self.queue_length,
            loop=asyncio.new_event_loop(),
        )

    @unittest.mock.patch(
        'magma.magmad.metrics_collector.MetricsControllerStub',
    )
    def test_sync(self, controller_mock):
        """
        Test if the collector syncs our sample.
        """
        # Mock out Collect.future
        mock = unittest.mock.Mock()
        mock.Collect.future.side_effect = [unittest.mock.Mock()]
        controller_mock.side_effect = [mock]

        # Call with no samples
        self._collector.sync()
        controller_mock.Collect.future.assert_not_called()
        self._collector._loop.stop()

        # Call with new samples to send and some to retry
        samples = [MetricFamily(name="1234")]
        self._collector._samples.extend(samples)
        self._collector._retry_queue.extend(samples)
        with unittest.mock.patch('snowflake.snowflake') as mock_snowflake:
            mock_snowflake.side_effect = lambda: self.gateway_id
            self._collector.sync()
        mock.Collect.future.assert_called_once_with(
            MetricsContainer(gatewayId=self.gateway_id, family=samples * 2),
            self.timeout,
        )
        self.assertCountEqual(self._collector._samples, [])
        self.assertCountEqual(self._collector._retry_queue, [])

    def test_sync_queue(self):
        """
        Test if the sync queues items on failure
        """
        # We should retry sending the newest samples
        samples = [
            MetricFamily(name=str(i))
            for i in range(self.queue_length + 1)
        ]
        mock_future = MockFuture(is_error=True)
        self._collector.sync_done(samples, mock_future)
        self.assertCountEqual(self._collector._samples, [])
        self.assertCountEqual(
            self._collector._retry_queue,
            samples[-self.queue_length:],
        )

        # On success, don't retry the send
        self._collector._retry_queue.clear()
        mock_future = MockFuture(is_error=False)
        self._collector.sync_done(samples, mock_future)
        self.assertCountEqual(self._collector._samples, [])
        self.assertCountEqual(self._collector._retry_queue, [])

    def test_collect(self):
        """
        Test that collect_done re-enqueues the collected samples.
        """
        mock = unittest.mock.MagicMock()
        samples = [MetricFamily(name="2345")]
        self._collector._samples.clear()
        self._collector._samples.extend(samples)
        mock.result.side_effect = [MetricsContainer(family=samples)]
        mock.exception.side_effect = [False]
        self._collector.collect_done('test', mock)
        # Should dequeue sample from the left, and enqueue on right;
        # collector should add one more metric for collection success/failure
        self.assertEqual(len(self._collector._samples), len(samples * 2) + 1)

    def test_collect_start_time(self):
        """
        Test that collect_done derives process uptime from the reported
        process start time.
""" mock = unittest.mock.MagicMock() start_metric = Metric() start_metric.gauge.value = calendar.timegm(time.gmtime()) - 1 start_time = MetricFamily( name=str(metricsd_pb2.process_start_time_seconds), metric=[start_metric], ) samples = [start_time] self._collector._samples.clear() mock.result.side_effect = [MetricsContainer(family=samples)] mock.exception.side_effect = [False] self._collector.collect_done('test', mock) # should have uptime, start time, and collection success self.assertEqual(len(self._collector._samples), 3) uptime_list = [ fam for fam in self._collector._samples if fam.name == str(metricsd_pb2.process_uptime_seconds) ] self.assertEqual(len(uptime_list), 1) self.assertEqual(len(uptime_list[0].metric), 1) self.assertGreater(uptime_list[0].metric[0].gauge.value, 0) # ensure no exceptions with empty metric empty = MetricFamily(name=str(metricsd_pb2.process_start_time_seconds)) samples = [empty] self._collector._samples.clear() mock.result.side_effect = [MetricsContainer(family=samples)] mock.exception.side_effect = [False] try: self._collector.collect_done('test', mock) except Exception: # pylint: disable=broad-except self.fail("Collection with empty metric should not have failed")