def test_service_check_tags(self):
    """Service-check packets parse optional tags, hostname and message.

    Covers: no tags, a single tag, tags combined with a hostname (h:) and
    message (m:) field, and key:value style tags.
    """
    stats = MetricsAggregator('myhost')
    stats.submit_packets('_sc|check.1|0')
    stats.submit_packets('_sc|check.2|0|#t1')
    # The '#t5' here is part of the message payload (m:), not a tag section.
    stats.submit_packets('_sc|check.3|0|h:i-abcd1234|#t1,t2|m:fakeout#t5')
    stats.submit_packets('_sc|check.4|0|#t1,t2:v2,t3,t4')

    service_checks = self.sort_service_checks(stats.flush_service_checks())

    assert len(service_checks) == 4
    first, second, third, fourth = service_checks

    nt.assert_equal(first['check'], 'check.1')
    # Fixed message: the original concatenation produced "bedefined" (missing
    # space) and used a double negative ("no tags aren't explicited").
    assert first.get('tags') is None, \
        "service_check['tags'] shouldn't be defined when no tags are given in the packet"

    nt.assert_equal(second['check'], 'check.2')
    nt.assert_equal(second['tags'], sorted(['t1']))

    nt.assert_equal(third['check'], 'check.3')
    nt.assert_equal(third['host_name'], 'i-abcd1234')
    nt.assert_equal(third['message'], 'fakeout#t5')
    nt.assert_equal(third['tags'], sorted(['t1', 't2']))

    nt.assert_equal(fourth['check'], 'check.4')
    nt.assert_equal(fourth['tags'], sorted(['t1', 't2:v2', 't3', 't4']))
def test_custom_aggregate(self): configstr = 'median, max' stats = MetricsAggregator( 'myhost', histogram_aggregates=get_histogram_aggregates(configstr) ) self.assertEquals( sorted(stats.metric_config[Histogram]['aggregates']), ['max', 'median'], stats.metric_config[Histogram] ) for i in xrange(20): stats.submit_packets('myhistogram:{0}|h'.format(i)) metrics = stats.flush() self.assertEquals(len(metrics), 3, metrics) value_by_type = {} for k in metrics: value_by_type[k['metric'][len('myhistogram')+1:]] = k['points'][0][1] self.assertEquals(value_by_type['median'], 9, value_by_type) self.assertEquals(value_by_type['max'], 19, value_by_type) self.assertEquals(value_by_type['95percentile'], 18, value_by_type)
def test_custom_single_percentile(self):
    """A single custom percentile ('0.40') replaces the default percentile.

    NOTE(review): this variant indexes flushed metrics as tuples (k[0] is
    the name, k[2] the value) — presumably an older aggregator output
    format; confirm against the MetricsAggregator version under test.
    """
    configstr = '0.40'
    stats = MetricsAggregator(
        'myhost',
        histogram_percentiles=get_histogram_percentiles(configstr)
    )
    self.assertEquals(
        stats.metric_config[Histogram]['percentiles'],
        [0.40],
        stats.metric_config[Histogram]
    )
    # Submit the values 0..19 as histogram samples.
    for i in xrange(20):
        stats.submit_packets('myhistogram:{0}|h'.format(i))
    metrics = stats.flush()
    # Default aggregates plus the single custom percentile => 5 series.
    self.assertEquals(len(metrics), 5, metrics)
    value_by_type = {}
    for k in metrics:
        value_by_type[k[0][len('myhistogram')+1:]] = k[2]
    self.assertEquals(value_by_type['40percentile'], 7, value_by_type)
def test_tags(self):
    """Counters with identical tag sets (in any order) aggregate together."""
    stats = MetricsAggregator("myhost")
    stats.submit_packets("gauge:1|c")
    stats.submit_packets("gauge:2|c|@1")   # sample rate of 1 is a no-op
    stats.submit_packets("gauge:4|c|#tag1,tag2")
    stats.submit_packets("gauge:8|c|#tag2,tag1")  # Should be the same as above
    stats.submit_packets("gauge:16|c|#tag3,tag4")

    metrics = self.sort_metrics(stats.flush())

    # Three distinct contexts: untagged, {tag1,tag2}, {tag3,tag4}.
    assert len(metrics) == 3
    first, second, third = metrics

    nt.assert_equal(first["metric"], "gauge")
    nt.assert_equal(first["tags"], None)
    nt.assert_equal(first["points"][0][1], 3)
    nt.assert_equal(first["host"], "myhost")

    nt.assert_equal(second["metric"], "gauge")
    nt.assert_equal(second["tags"], ("tag1", "tag2"))
    nt.assert_equal(second["points"][0][1], 12)
    nt.assert_equal(second["host"], "myhost")

    nt.assert_equal(third["metric"], "gauge")
    nt.assert_equal(third["tags"], ("tag3", "tag4"))
    nt.assert_equal(third["points"][0][1], 16)
    nt.assert_equal(third["host"], "myhost")
def test_custom_multiple_percentile(self):
    """Multiple custom percentiles are parsed from a comma-separated string.

    NOTE(review): the config says 0.999 but the expectation is 0.99 —
    presumably get_histogram_percentiles rounds/clamps to two decimal
    places; confirm before changing either side.
    """
    configstr = '0.4, 0.65, 0.999'
    stats = MetricsAggregator(
        'myhost',
        histogram_percentiles=get_histogram_percentiles(configstr)
    )
    self.assertEquals(
        stats.metric_config[Histogram]['percentiles'],
        [0.4, 0.65, 0.99],
        stats.metric_config[Histogram]
    )
    # Submit the values 0..19 as histogram samples.
    for i in xrange(20):
        stats.submit_packets('myhistogram:{0}|h'.format(i))
    metrics = stats.flush()
    # Default aggregates plus three custom percentiles => 7 series.
    self.assertEquals(len(metrics), 7, metrics)
    value_by_type = {}
    for k in metrics:
        value_by_type[k['metric'][len('myhistogram')+1:]] = k['points'][0][1]
    self.assertEquals(value_by_type['40percentile'], 7, value_by_type)
    self.assertEquals(value_by_type['65percentile'], 12, value_by_type)
    self.assertEquals(value_by_type['99percentile'], 19, value_by_type)
def test_event_tags(self):
    """Event packets parse optional tags, aggregation key and priority."""
    stats = MetricsAggregator("myhost")
    stats.submit_packets("_e{6,4}:title1|text")
    stats.submit_packets("_e{6,4}:title2|text|#t1")
    stats.submit_packets("_e{6,4}:title3|text|#t1,t2:v2,t3,t4")
    stats.submit_packets("_e{6,4}:title4|text|k:key|p:normal|#t1,t2")

    events = self.sort_events(stats.flush_events())

    assert len(events) == 4
    first, second, third, fourth = events

    # Simplified from a convoluted try/except/else with a no-op
    # `assert True`; also fixes the double-negative failure message.
    assert "tags" not in first, \
        "event['tags'] shouldn't be defined when no tags are given in the packet"
    nt.assert_equal(first["msg_title"], "title1")
    nt.assert_equal(first["msg_text"], "text")

    nt.assert_equal(second["msg_title"], "title2")
    nt.assert_equal(second["msg_text"], "text")
    nt.assert_equal(second["tags"], sorted(["t1"]))

    nt.assert_equal(third["msg_title"], "title3")
    nt.assert_equal(third["msg_text"], "text")
    nt.assert_equal(third["tags"], sorted(["t1", "t2:v2", "t3", "t4"]))

    nt.assert_equal(fourth["msg_title"], "title4")
    nt.assert_equal(fourth["msg_text"], "text")
    nt.assert_equal(fourth["aggregation_key"], "key")
    nt.assert_equal(fourth["priority"], "normal")
    nt.assert_equal(fourth["tags"], sorted(["t1", "t2"]))
def test_event_tags(self):
    """Event packets parse optional tags, aggregation key and priority."""
    stats = MetricsAggregator('myhost')
    stats.submit_packets('_e{6,4}:title1|text')
    stats.submit_packets('_e{6,4}:title2|text|#t1')
    stats.submit_packets('_e{6,4}:title3|text|#t1,t2:v2,t3,t4')
    stats.submit_packets('_e{6,4}:title4|text|k:key|p:normal|#t1,t2')

    events = self.sort_events(stats.flush_events())

    assert len(events) == 4
    first, second, third, fourth = events

    # Simplified from a convoluted try/except/else with a no-op
    # `assert True`; also fixes the double-negative failure message.
    assert 'tags' not in first, \
        "event['tags'] shouldn't be defined when no tags are given in the packet"
    nt.assert_equal(first['msg_title'], 'title1')
    nt.assert_equal(first['msg_text'], 'text')

    nt.assert_equal(second['msg_title'], 'title2')
    nt.assert_equal(second['msg_text'], 'text')
    nt.assert_equal(second['tags'], sorted(['t1']))

    nt.assert_equal(third['msg_title'], 'title3')
    nt.assert_equal(third['msg_text'], 'text')
    nt.assert_equal(third['tags'], sorted(['t1', 't2:v2', 't3', 't4']))

    nt.assert_equal(fourth['msg_title'], 'title4')
    nt.assert_equal(fourth['msg_text'], 'text')
    nt.assert_equal(fourth['aggregation_key'], 'key')
    nt.assert_equal(fourth['priority'], 'normal')
    nt.assert_equal(fourth['tags'], sorted(['t1', 't2']))
def test_histogram(self): stats = MetricsAggregator('myhost') # Sample all numbers between 1-100 many times. This # means our percentiles should be relatively close to themselves. percentiles = range(100) random.shuffle(percentiles) # in place for i in percentiles: for j in xrange(20): for type_ in ['h', 'ms']: m = 'my.p:%s|%s' % (i, type_) stats.submit_packets(m) metrics = self.sort_metrics(stats.flush()) nt.assert_equal(len(metrics), 5) p95, pavg, pcount, pmax, pmed = self.sort_metrics(metrics) nt.assert_equal(p95['metric'], 'my.p.95percentile') self.assert_almost_equal(p95['points'][0][1], 95, 10) self.assert_almost_equal(pmax['points'][0][1], 99, 1) self.assert_almost_equal(pmed['points'][0][1], 50, 2) self.assert_almost_equal(pavg['points'][0][1], 50, 2) self.assert_almost_equal(pcount['points'][0][1], 4000, 0) # 100 * 20 * 2 nt.assert_equals(p95['host'], 'myhost') # Ensure that histograms are reset. metrics = self.sort_metrics(stats.flush()) assert not metrics
def test_tags(self):
    """Counters with identical tag sets (in any order) aggregate together."""
    stats = MetricsAggregator('myhost')
    stats.submit_packets('gauge:1|c')
    stats.submit_packets('gauge:2|c|@1')   # sample rate of 1 is a no-op
    stats.submit_packets('gauge:4|c|#tag1,tag2')
    stats.submit_packets('gauge:8|c|#tag2,tag1')  # Should be the same as above
    stats.submit_packets('gauge:16|c|#tag3,tag4')

    metrics = self.sort_metrics(stats.flush())

    # Three distinct contexts: untagged, {tag1,tag2}, {tag3,tag4}.
    assert len(metrics) == 3
    first, second, third = metrics

    nt.assert_equal(first['metric'], 'gauge')
    nt.assert_equal(first['tags'], None)
    nt.assert_equal(first['points'][0][1], 3)
    nt.assert_equal(first['host'], 'myhost')

    nt.assert_equal(second['metric'], 'gauge')
    nt.assert_equal(second['tags'], ('tag1', 'tag2'))
    nt.assert_equal(second['points'][0][1], 12)
    nt.assert_equal(second['host'], 'myhost')

    nt.assert_equal(third['metric'], 'gauge')
    nt.assert_equal(third['tags'], ('tag3', 'tag4'))
    nt.assert_equal(third['points'][0][1], 16)
    nt.assert_equal(third['host'], 'myhost')
def test_magic_tags(self):
    """'host:' and 'device:' tags are lifted out of the tag set.

    A 'host:x' tag overrides the metric's host; a 'device:y' tag becomes
    device_name. Neither appears in the flushed 'tags' tuple.
    """
    stats = MetricsAggregator('myhost')
    stats.submit_packets('my.gauge.a:1|c|#host:test-a')
    stats.submit_packets('my.gauge.b:4|c|#tag1,tag2,host:test-b')
    stats.submit_packets('my.gauge.b:8|c|#host:test-b,tag2,tag1')
    stats.submit_packets('my.gauge.c:10|c|#tag3')
    stats.submit_packets('my.gauge.c:16|c|#device:floppy,tag3')

    metrics = self.sort_metrics(stats.flush())

    # gauge.c splits into two contexts (with/without device), so 4 series.
    nt.assert_equal(len(metrics), 4)
    first, second, third, fourth = metrics

    nt.assert_equal(first['metric'], 'my.gauge.a')
    nt.assert_equal(first['tags'], None)
    nt.assert_equal(first['points'][0][1], 1)
    nt.assert_equal(first['host'], 'test-a')

    nt.assert_equal(second['metric'], 'my.gauge.b')
    nt.assert_equal(second['tags'], ('tag1', 'tag2'))
    nt.assert_equal(second['points'][0][1], 12)
    nt.assert_equal(second['host'], 'test-b')

    nt.assert_equal(third['metric'], 'my.gauge.c')
    nt.assert_equal(third['tags'], ('tag3', ))
    nt.assert_equal(third['points'][0][1], 10)
    nt.assert_equal(third['device_name'], None)

    nt.assert_equal(fourth['metric'], 'my.gauge.c')
    nt.assert_equal(fourth['tags'], ('tag3', ))
    nt.assert_equal(fourth['points'][0][1], 16)
    nt.assert_equal(fourth['device_name'], 'floppy')
def test_histogram(self):
    """Histogram aggregates including the opt-in 'min' aggregate.

    NOTE(review): this variant indexes flushed metrics as tuples
    (p95[0] name, p95[2] value, p95[3] attribute dict) — presumably an
    older aggregator output format; confirm against the version under test.
    """
    # The min is not enabled by default
    stats = MetricsAggregator(
        'myhost',
        histogram_aggregates=DEFAULT_HISTOGRAM_AGGREGATES+['min']
    )

    # Sample all numbers between 1-100 many times. This
    # means our percentiles should be relatively close to themselves.
    percentiles = range(100)
    random.shuffle(percentiles)  # in place
    for i in percentiles:
        for j in xrange(20):
            for type_ in ['h', 'ms']:
                m = 'my.p:%s|%s' % (i, type_)
                stats.submit_packets(m)

    metrics = self.sort_metrics(stats.flush())

    nt.assert_equal(len(metrics), 6)
    p95, pavg, pcount, pmax, pmed, pmin = self.sort_metrics(metrics)
    nt.assert_equal(p95[0], 'my.p.95percentile')
    self.assert_almost_equal(p95[2], 95, 10)
    self.assert_almost_equal(pmax[2], 99, 1)
    self.assert_almost_equal(pmed[2], 50, 2)
    self.assert_almost_equal(pavg[2], 50, 2)
    self.assert_almost_equal(pmin[2], 1, 1)
    self.assert_almost_equal(pcount[2], 4000, 0)  # 100 * 20 * 2
    nt.assert_equals(p95[3]['hostname'], 'myhost')

    # Ensure that histograms are reset.
    metrics = self.sort_metrics(stats.flush())
    assert not metrics
def test_scientific_notation(self):
    """A gauge value written in scientific notation is parsed as a float."""
    aggregator = MetricsAggregator('myhost', interval=10)
    aggregator.submit_packets('test.scinot:9.512901e-05|g')

    flushed = self.sort_metrics(aggregator.flush())
    assert len(flushed) == 1

    # Each point is a (timestamp, value) pair; check only the value.
    point = flushed[0].get('points')[0]
    nt.assert_almost_equal(point[1], 9.512901e-05)
class TestAggregator(unittest.TestCase):
    """Direct (non-packet) API tests for MetricsAggregator."""

    def setUp(self):
        self.aggr = MetricsAggregator("test-aggr")

    def test_dupe_tags(self):
        """Duplicate tags in a submission collapse into one metric context."""
        self.aggr.increment("test-counter", 1, tags=["a", "b"])
        self.aggr.increment("test-counter", 1, tags=["a", "b", "b"])
        # Both calls land on the same context, so there is one metric...
        self.assertEquals(len(self.aggr.metrics), 1, self.aggr.metrics)
        metric = self.aggr.metrics.values()[0]
        # ...holding the sum of both increments.
        self.assertEquals(metric.value, 2)
class TestAggregator(unittest.TestCase):
    """Direct (non-packet) API tests for MetricsAggregator.

    NOTE(review): this class duplicates an identically-named class elsewhere
    in this chunk — presumably the two come from different source files;
    confirm they don't shadow each other in one module.
    """

    def setUp(self):
        self.aggr = MetricsAggregator('test-aggr')

    def test_dupe_tags(self):
        """Duplicate tags in a submission collapse into one metric context."""
        self.aggr.increment('test-counter', 1, tags=['a', 'b'])
        self.aggr.increment('test-counter', 1, tags=['a', 'b', 'b'])
        # Both calls land on the same context, so there is one metric...
        self.assertEquals(len(self.aggr.metrics), 1, self.aggr.metrics)
        metric = self.aggr.metrics.values()[0]
        # ...holding the sum of both increments.
        self.assertEquals(metric.value, 2)
def test_sampled_counter(self):
    """A counter sent at sample rate 0.5 is scaled up by 1/rate on flush."""
    aggregator = MetricsAggregator('myhost')
    aggregator.submit_packets('sampled.counter:1|c|@0.5')

    flushed = aggregator.flush()
    assert len(flushed) == 1

    series = flushed[0]
    assert series['metric'] == 'sampled.counter'
    # 1 sample at rate 0.5 represents 2 real events.
    nt.assert_equal(series['points'][0][1], 2)
def test_sampled_counter(self):
    """A counter sent at sample rate 0.5 is scaled up by 1/rate on flush."""
    aggregator = MetricsAggregator("myhost")
    aggregator.submit_packets("sampled.counter:1|c|@0.5")

    flushed = aggregator.flush()
    assert len(flushed) == 1

    series = flushed[0]
    assert series["metric"] == "sampled.counter"
    # 1 sample at rate 0.5 represents 2 real events.
    nt.assert_equal(series["points"][0][1], 2)
def test_counter(self): stats = MetricsAggregator('myhost') # Track some counters. stats.submit_packets('my.first.counter:1|c') stats.submit_packets('my.first.counter:5|c') stats.submit_packets('my.second.counter:1|c') stats.submit_packets('my.third.counter:3|c') # Ensure they roll up nicely. metrics = self.sort_metrics(stats.flush()) assert len(metrics) == 3 first, second, third = metrics nt.assert_equals(first['metric'], 'my.first.counter') nt.assert_equals(first['points'][0][1], 6) nt.assert_equals(first['host'], 'myhost') nt.assert_equals(second['metric'], 'my.second.counter') nt.assert_equals(second['points'][0][1], 1) nt.assert_equals(third['metric'], 'my.third.counter') nt.assert_equals(third['points'][0][1], 3) # Ensure that counters reset to zero. metrics = self.sort_metrics(stats.flush()) first, second, third = metrics nt.assert_equals(first['metric'], 'my.first.counter') nt.assert_equals(first['points'][0][1], 0) nt.assert_equals(second['metric'], 'my.second.counter') nt.assert_equals(second['points'][0][1], 0) nt.assert_equals(third['metric'], 'my.third.counter') nt.assert_equals(third['points'][0][1], 0)
def test_gauge_sample_rate(self):
    """A sample rate on a gauge is ignored: the raw value is reported."""
    aggregator = MetricsAggregator('myhost')

    # Submit a sampled gauge metric.
    aggregator.submit_packets('sampled.gauge:10|g|@0.1')

    # Assert that it's treated normally — no 1/rate scaling for gauges.
    flushed = aggregator.flush()
    nt.assert_equal(len(flushed), 1)

    series = flushed[0]
    nt.assert_equal(series['metric'], 'sampled.gauge')
    nt.assert_equal(series['points'][0][1], 10)
def test_gauge_sample_rate(self):
    """A sample rate on a gauge is ignored: the raw value is reported."""
    aggregator = MetricsAggregator("myhost")

    # Submit a sampled gauge metric.
    aggregator.submit_packets("sampled.gauge:10|g|@0.1")

    # Assert that it's treated normally — no 1/rate scaling for gauges.
    flushed = aggregator.flush()
    nt.assert_equal(len(flushed), 1)

    series = flushed[0]
    nt.assert_equal(series["metric"], "sampled.gauge")
    nt.assert_equal(series["points"][0][1], 10)
def test_batch_submission(self):
    """Several newline-separated packets in one payload are all processed."""
    stats = MetricsAggregator("myhost")
    metrics = ["counter:1|c", "counter:1|c", "gauge:1|g"]
    packet = "\n".join(metrics)
    stats.submit_packets(packet)

    metrics = self.sort_metrics(stats.flush())
    nt.assert_equal(2, len(metrics))
    counter, gauge = metrics
    # The two counter packets roll up; the gauge keeps its value.
    assert counter["points"][0][1] == 2
    assert gauge["points"][0][1] == 1
def test_sampled_histogram(self):
    """A sampled histogram scales its count by 1/rate but not its values."""
    # Submit a sampled histogram.
    stats = MetricsAggregator("myhost")
    stats.submit_packets("sampled.hist:5|h|@0.5")

    # Assert we scale up properly.
    metrics = self.sort_metrics(stats.flush())
    p95, pavg, pcount, pmax, pmed = self.sort_metrics(metrics)

    # Count is scaled by 1/0.5 = 2; the value-based aggregates stay at 5.
    nt.assert_equal(pcount["points"][0][1], 2)
    for p in [p95, pavg, pmed, pmax]:
        nt.assert_equal(p["points"][0][1], 5)
def test_monokey_batching_withtags_with_sampling(self):
    """A 'monokey' packet (one name, many value sections with tags and
    sample rates) is equivalent to submitting each section separately.

    The single packet below encodes four submissions of test_metric; the
    reference aggregator receives them as four newline-separated packets,
    and both flushes must match value-for-value and tag-for-tag.
    """
    # The min is not enabled by default
    stats = MetricsAggregator(
        'host',
        histogram_aggregates=DEFAULT_HISTOGRAM_AGGREGATES+['min']
    )
    stats.submit_packets('test_metric:1.5|c|#tag1:one,tag2:two:2.3|g|#tag3:three:3|g:42|h|#tag1:12,tag42:42|@0.22')

    stats_ref = MetricsAggregator(
        'host',
        histogram_aggregates=DEFAULT_HISTOGRAM_AGGREGATES+['min']
    )
    packets = [
        'test_metric:1.5|c|#tag1:one,tag2:two',
        'test_metric:2.3|g|#tag3:three',
        'test_metric:3|g',
        'test_metric:42|h|#tag1:12,tag42:42|@0.22'
    ]
    stats_ref.submit_packets("\n".join(packets))

    metrics = self.sort_metrics(stats.flush())
    metrics_ref = self.sort_metrics(stats_ref.flush())

    # counter + 2 gauge contexts + 6 histogram aggregates = 9 series.
    self.assertTrue(len(metrics) == len(metrics_ref) == 9, (metrics, metrics_ref))
    for i in range(len(metrics)):
        nt.assert_equal(metrics[i]['points'][0][1], metrics_ref[i]['points'][0][1])
        nt.assert_equal(metrics[i]['tags'], metrics_ref[i]['tags'])
def test_monokey_batching_notags(self):
    """A 'monokey' packet without tags equals separate per-value packets.

    'test_hist:0.3|ms:2.5|ms|@0.5:3|ms' encodes three timer submissions
    under one metric name; its flush must match submitting them one by one.
    """
    # The min is not enabled by default
    stats = MetricsAggregator(
        'host',
        histogram_aggregates=DEFAULT_HISTOGRAM_AGGREGATES+['min']
    )
    stats.submit_packets('test_hist:0.3|ms:2.5|ms|@0.5:3|ms')

    stats_ref = MetricsAggregator(
        'host',
        histogram_aggregates=DEFAULT_HISTOGRAM_AGGREGATES+['min']
    )
    packets = [
        'test_hist:0.3|ms',
        'test_hist:2.5|ms|@0.5',
        'test_hist:3|ms'
    ]
    stats_ref.submit_packets("\n".join(packets))

    metrics = stats.flush()
    metrics_ref = stats_ref.flush()

    # Six histogram aggregates (defaults + min) for the single context.
    self.assertTrue(len(metrics) == len(metrics_ref) == 6, (metrics, metrics_ref))
    for i in range(len(metrics)):
        nt.assert_equal(metrics[i]['points'][0][1], metrics_ref[i]['points'][0][1])
def test_sampled_histogram(self):
    """A sampled histogram scales its count by 1/rate but not its values
    (variant with the opt-in 'min' aggregate enabled)."""
    # Submit a sampled histogram.
    # The min is not enabled by default
    stats = MetricsAggregator(
        'myhost',
        histogram_aggregates=DEFAULT_HISTOGRAM_AGGREGATES+['min']
    )
    stats.submit_packets('sampled.hist:5|h|@0.5')

    # Assert we scale up properly.
    metrics = self.sort_metrics(stats.flush())
    p95, pavg, pcount, pmax, pmed, pmin = self.sort_metrics(metrics)

    # Count is scaled by 1/0.5 = 2; the value-based aggregates stay at 5.
    nt.assert_equal(pcount['points'][0][1], 2)
    for p in [p95, pavg, pmed, pmax, pmin]:
        nt.assert_equal(p['points'][0][1], 5)
def test_event_text(self):
    """Event text parsing: empty text, embedded pipes, escaped newlines,
    and non-ASCII (UTF-8) text."""
    stats = MetricsAggregator("myhost")
    stats.submit_packets("_e{2,0}:t1|")
    stats.submit_packets("_e{2,12}:t2|text|content")
    stats.submit_packets("_e{2,23}:t3|First line\\nSecond line")  # \n is a newline
    stats.submit_packets(u"_e{2,19}:t4|♬ †øU †øU ¥ºu T0µ ♪")  # utf-8 compliant

    events = self.sort_events(stats.flush_events())

    assert len(events) == 4
    first, second, third, fourth = events

    nt.assert_equal(first["msg_text"], "")
    # Pipes inside the text section are kept verbatim.
    nt.assert_equal(second["msg_text"], "text|content")
    # The literal backslash-n in the packet becomes a real newline.
    nt.assert_equal(third["msg_text"], "First line\nSecond line")
    nt.assert_equal(fourth["msg_text"], u"♬ †øU †øU ¥ºu T0µ ♪")
def __init__(self, name, init_config, agentConfig, instances=None):
    """
    Initialize a new check.

    :param name: The name of the check
    :param init_config: The config for initializing the check
    :param agentConfig: The global configuration for the agent
    :param instances: A list of configuration objects for each instance.
    """
    # Imported lazily to avoid a circular import at module load time
    # (presumably aggregator imports from this module's package — confirm).
    from aggregator import MetricsAggregator

    self.name = name
    self.init_config = init_config or {}
    self.agentConfig = agentConfig
    # Developer mode needs psutil for the profiling stats below.
    self.in_developer_mode = agentConfig.get('developer_mode') and psutil is not None
    self._internal_profiling_stats = None

    # 'checksd_hostname' (set by the collector) wins over re-resolving.
    self.hostname = agentConfig.get('checksd_hostname') or get_hostname(agentConfig)
    self.log = logging.getLogger('%s.%s' % (__name__, name))

    self.aggregator = MetricsAggregator(
        self.hostname,
        formatter=agent_formatter,
        recent_point_threshold=agentConfig.get('recent_point_threshold', None),
        histogram_aggregates=agentConfig.get('histogram_aggregates'),
        histogram_percentiles=agentConfig.get('histogram_percentiles')
    )

    self.events = []
    self.service_checks = []
    self.instances = instances or []
    self.warnings = []
    self.library_versions = None
    # Per-instance timestamp of the last run; defaults to 0 (never run).
    self.last_collection_time = defaultdict(int)
def test_event_text(self):
    """Event text parsing: empty text, embedded pipes, escaped newlines,
    and non-ASCII (UTF-8) text."""
    stats = MetricsAggregator('myhost')
    stats.submit_packets('_e{2,0}:t1|')
    stats.submit_packets('_e{2,12}:t2|text|content')
    stats.submit_packets('_e{2,23}:t3|First line\\nSecond line')  # \n is a newline
    stats.submit_packets(u'_e{2,19}:t4|♬ †øU †øU ¥ºu T0µ ♪')  # utf-8 compliant

    events = self.sort_events(stats.flush_events())

    assert len(events) == 4
    first, second, third, fourth = events

    nt.assert_equal(first['msg_text'], '')
    # Pipes inside the text section are kept verbatim.
    nt.assert_equal(second['msg_text'], 'text|content')
    # The literal backslash-n in the packet becomes a real newline.
    nt.assert_equal(third['msg_text'], 'First line\nSecond line')
    nt.assert_equal(fourth['msg_text'], u'♬ †øU †øU ¥ºu T0µ ♪')
def __init__(self, name, init_config, agentConfig, instances=None):
    """
    Initialize a new check.

    :param name: The name of the check
    :param init_config: The config for initializing the check
    :param agentConfig: The global configuration for the agent
    :param instances: A list of configuration objects for each instance.
    """
    # Imported lazily to avoid a circular import at module load time
    # (presumably aggregator imports from this module's package — confirm).
    from aggregator import MetricsAggregator

    self.name = name
    self.init_config = init_config
    self.agentConfig = agentConfig
    self.hostname = get_hostname(agentConfig)
    self.log = logging.getLogger('%s.%s' % (__name__, name))

    self.aggregator = MetricsAggregator(
        self.hostname,
        formatter=agent_formatter,
        recent_point_threshold=agentConfig.get('recent_point_threshold', None)
    )

    self.events = []
    self.service_checks = []
    self.instances = instances or []
    self.warnings = []
    self.library_versions = None
def test_batch_submission(self):
    """Several newline-separated packets in one payload are all processed.

    NOTE(review): this variant indexes flushed metrics as tuples
    (counter[2] is the value) — presumably an older aggregator output
    format; confirm against the MetricsAggregator version under test.
    """
    stats = MetricsAggregator('myhost')
    metrics = [
        'counter:1|c',
        'counter:1|c',
        'gauge:1|g'
    ]
    packet = "\n".join(metrics)
    stats.submit_packets(packet)

    metrics = self.sort_metrics(stats.flush())
    nt.assert_equal(2, len(metrics))
    counter, gauge = metrics
    # The two counter packets roll up; the gauge keeps its value.
    assert counter[2] == 2
    assert gauge[2] == 1
def __init__(self, name, init_config, agentConfig, instances=None):
    """
    Initialize a new check.

    :param name: The name of the check
    :param init_config: The config for initializing the check
    :param agentConfig: The global configuration for the agent
    :param instances: A list of configuration objects for each instance.
    """
    # Imported lazily to avoid a circular import at module load time
    # (presumably aggregator imports from this module's package — confirm).
    from aggregator import MetricsAggregator

    # NOTE(review): _enabled_checks looks like a class-level registry shared
    # by all checks; the list(set(...)) de-duplicates repeated inits.
    self._enabled_checks.append(name)
    self._enabled_checks = list(set(self._enabled_checks))

    self.name = name
    self.init_config = init_config or {}
    self.agentConfig = agentConfig
    # Developer mode needs psutil for profiling stats.
    self.in_developer_mode = agentConfig.get("developer_mode") and psutil
    self._internal_profiling_stats = None
    self.default_integration_http_timeout = float(agentConfig.get("default_integration_http_timeout", 9))

    # 'checksd_hostname' (set by the collector) wins over re-resolving.
    self.hostname = agentConfig.get("checksd_hostname") or get_hostname(agentConfig)
    self.log = logging.getLogger("%s.%s" % (__name__, name))

    self.min_collection_interval = self.init_config.get(
        "min_collection_interval", self.DEFAULT_MIN_COLLECTION_INTERVAL
    )

    self.aggregator = MetricsAggregator(
        self.hostname,
        # Contexts must outlive at least one collection cycle before expiry.
        expiry_seconds=self.min_collection_interval + self.DEFAULT_EXPIRY_SECONDS,
        formatter=agent_formatter,
        recent_point_threshold=agentConfig.get("recent_point_threshold", None),
        histogram_aggregates=agentConfig.get("histogram_aggregates"),
        histogram_percentiles=agentConfig.get("histogram_percentiles"),
    )

    self.events = []
    self.service_checks = []
    self.instances = instances or []
    self.warnings = []
    self.library_versions = None
    # Per-instance timestamp of the last run; defaults to 0 (never run).
    self.last_collection_time = defaultdict(int)
    self._instance_metadata = []
    self.svc_metadata = []
    self.historate_dict = {}

    # Set proxy settings
    self.proxy_settings = get_proxy(self.agentConfig)
    self._use_proxy = False if init_config is None else init_config.get("use_agent_proxy", True)
    self.proxies = {"http": None, "https": None}
    if self.proxy_settings and self._use_proxy:
        uri = "{host}:{port}".format(host=self.proxy_settings["host"], port=self.proxy_settings["port"])
        # Embed credentials in the proxy URI only when both are present.
        if self.proxy_settings["user"] and self.proxy_settings["password"]:
            uri = "{user}:{password}@{uri}".format(
                user=self.proxy_settings["user"], password=self.proxy_settings["password"], uri=uri
            )
        self.proxies["http"] = "http://{uri}".format(uri=uri)
        self.proxies["https"] = "https://{uri}".format(uri=uri)
def test_histogram_normalization(self):
    """Histogram counts are normalized by the flush interval (rate).

    With interval=10: 5 samples => count 0.5/s, 20 samples => count 2/s.
    """
    # The min is not enabled by default
    stats = MetricsAggregator(
        'myhost',
        interval=10,
        histogram_aggregates=DEFAULT_HISTOGRAM_AGGREGATES + ['min'])
    for i in range(5):
        stats.submit_packets('h1:1|h')
    for i in range(20):
        stats.submit_packets('h2:1|h')

    metrics = self.sort_metrics(stats.flush())
    # Six aggregates per histogram; only the 'count' series (3rd) matters here.
    _, _, h1count, _, _, _, \
        _, _, h2count, _, _, _ = metrics

    nt.assert_equal(h1count['points'][0][1], 0.5)
    nt.assert_equal(h2count['points'][0][1], 2)
def test_service_check_basic(self):
    """Service-check packets parse name and status (0=OK, 1=WARN, 2=CRIT)."""
    stats = MetricsAggregator('myhost')
    stats.submit_packets('_sc|check.1|0')
    stats.submit_packets('_sc|check.2|1')
    stats.submit_packets('_sc|check.3|2')

    service_checks = self.sort_service_checks(stats.flush_service_checks())

    assert len(service_checks) == 3
    first, second, third = service_checks
    assert first['check'] == 'check.1'
    assert first['status'] == 0
    assert second['check'] == 'check.2'
    assert second['status'] == 1
    assert third['check'] == 'check.3'
    assert third['status'] == 2
def test_service_check_basic(self):
    """Service-check packets parse name and status (0=OK, 1=WARN, 2=CRIT)."""
    stats = MetricsAggregator('myhost')
    stats.submit_packets('_sc|check.1|0')
    stats.submit_packets('_sc|check.2|1')
    stats.submit_packets('_sc|check.3|2')

    service_checks = self.sort_service_checks(stats.flush_service_checks())

    assert len(service_checks) == 3
    first, second, third = service_checks
    nt.assert_equal(first['check'], 'check.1')
    nt.assert_equal(first['status'], 0)
    nt.assert_equal(second['check'], 'check.2')
    nt.assert_equal(second['status'], 1)
    nt.assert_equal(third['check'], 'check.3')
    nt.assert_equal(third['status'], 2)
def test_event_text_utf8(self): stats = MetricsAggregator('myhost', utf8_decoding=True) # Should raise because content is not encoded self.assertRaises(Exception, stats.submit_packets, u'_e{2,19}:t4|♬ †øU †øU ¥ºu T0µ ♪') stats.submit_packets(u'_e{2,19}:t4|♬ †øU †øU ¥ºu T0µ ♪'.encode( 'utf-8')) # utf-8 compliant # Normal packet stats.submit_packets( '_e{2,23}:t3|First line\\nSecond line') # \n is a newline events = self.sort_events(stats.flush_events()) assert len(events) == 2 nt.assert_equal(events[0]['msg_text'], 'First line\nSecond line') nt.assert_equal(events[1]['msg_text'], u'♬ †øU †øU ¥ºu T0µ ♪')
def test_spurr(subprocess_patch):
    """LPARStats.collect_spurr emits exactly the expected SPURR metric names.

    `subprocess_patch` is a fixture that patches get_subprocess_output so
    the check reads mocked lparstat output instead of running the tool.
    """
    # defer import to test to avoid breaking get_subprocess_output
    # patching.
    from datadog_checks.lparstats import LPARStats

    hostname = 'foo'
    aggregator = MetricsAggregator(
        hostname,
        interval=1.0,
        histogram_aggregates=None,
        histogram_percentiles=None,
    )
    c = LPARStats("lparstats", {}, {}, aggregator)
    c.collect_spurr()
    metrics = c.aggregator.flush(
    )[:-1]  # we remove the datadog.agent.running metric

    expected_metrics = [
        'system.lpar.spurr.user',
        'system.lpar.spurr.sys',
        'system.lpar.spurr.wait',
        'system.lpar.spurr.idle',
        'system.lpar.spurr.user.norm',
        'system.lpar.spurr.sys.norm',
        'system.lpar.spurr.wait.norm',
        'system.lpar.spurr.idle.norm',
        'system.lpar.spurr.user.pct',
        'system.lpar.spurr.sys.pct',
        'system.lpar.spurr.wait.pct',
        'system.lpar.spurr.idle.pct',
        'system.lpar.spurr.user.norm.pct',
        'system.lpar.spurr.sys.norm.pct',
        'system.lpar.spurr.wait.norm.pct',
        'system.lpar.spurr.idle.norm.pct',
    ]

    # Every flushed metric must be one of the expected names, one each.
    assert len(metrics) == len(expected_metrics)
    for metric in metrics:
        assert metric['metric'] in expected_metrics
def init(config_path=None, use_watchdog=False, use_forwarder=False):
    """Configure the server and the reporting thread.

    :param config_path: optional path to the agent config file
    :param use_watchdog: enable the reporter's watchdog
    :param use_forwarder: send to the local forwarder instead of dd_url
    :return: (reporter, server, config) tuple
    """
    c = get_config(parse_args=False, cfg_path=config_path)
    log.debug("Configuration dogstatsd")

    port = c['dogstatsd_port']
    interval = int(c['dogstatsd_interval'])
    api_key = c['api_key']
    non_local_traffic = c['non_local_traffic']

    target = c['dd_url']
    if use_forwarder:
        target = c['dogstatsd_target']

    hostname = get_hostname(c)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads.
    assert 0 < interval
    aggregator = MetricsAggregator(hostname,
                                   interval,
                                   recent_point_threshold=c.get(
                                       'recent_point_threshold', None))

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog)

    # Start the server on an IPv4 stack
    # Default to loopback
    server_host = '127.0.0.1'
    # If specified, bind to all addressses
    if non_local_traffic:
        server_host = ''

    server = Server(aggregator, server_host, port)

    return reporter, server, c
def test_memory_entitlements(subprocess_patch):
    """collect_memory_entitlements emits one series per metric per entitlement.

    `subprocess_patch` is a fixture that patches get_subprocess_output so
    the check reads the mocked lparstat output.
    """
    # defer import to test to avoid breaking get_subprocess_output
    # patching.
    from datadog_checks.lparstats import LPARStats

    hostname = 'foo'
    aggregator = MetricsAggregator(
        hostname,
        interval=1.0,
        histogram_aggregates=None,
        histogram_percentiles=None,
    )
    c = LPARStats("lparstats", {}, {}, aggregator)
    c.collect_memory_entitlements()
    metrics = c.aggregator.flush(
    )[:-1]  # we remove the datadog.agent.running metric

    expected_metrics = [
        'system.lpar.memory.entitlement.iomin',
        'system.lpar.memory.entitlement.iodes',
        'system.lpar.memory.entitlement.iomu',
        'system.lpar.memory.entitlement.iores',
        'system.lpar.memory.entitlement.iohwm',
        'system.lpar.memory.entitlement.iomaf',
    ]

    # compile entitlements from mock output
    output = list(filter(None, AIX_LPARSTATS_MEMORY_ENTITLEMENTS.splitlines()))
    output = output[c.MEMORY_ENTITLEMENTS_START_IDX + 1:]
    entitlements = collect_column(output, 0)

    assert len(metrics) == (len(expected_metrics) * len(entitlements))
    for metric in metrics:
        for tag in metric['tags']:
            tag = tag.decode('utf-8')
            # The 'iompn:<name>' tag must reference a known entitlement.
            if 'iompn' in tag:
                assert tag.split(':')[1] in entitlements
def test_memory_page(subprocess_patch):
    """collect_memory(page_stats=True) emits the expected page-level metrics.

    `subprocess_patch` is a fixture that patches get_subprocess_output so
    the check reads the mocked lparstat output.
    """
    # defer import to test to avoid breaking get_subprocess_output
    # patching.
    from datadog_checks.lparstats import LPARStats

    hostname = 'foo'
    aggregator = MetricsAggregator(
        hostname,
        interval=1.0,
        histogram_aggregates=None,
        histogram_percentiles=None,
    )
    c = LPARStats("lparstats", {}, {}, aggregator)
    c.collect_memory(page_stats=True)
    metrics = c.aggregator.flush(
    )[:-1]  # we remove the datadog.agent.running metric

    # NOTE: iomf unavailable
    expected_metrics = [
        'system.lpar.memory.physb',
        'system.lpar.memory.hpi',
        'system.lpar.memory.hpit',
        'system.lpar.memory.pmem',
        'system.lpar.memory.iomu',
        'system.lpar.memory.iomin',
        'system.lpar.memory.iohwm',
        'system.lpar.memory.iomaf',
        'system.lpar.memory.pgcol',
        'system.lpar.memory.mpgcol',
        'system.lpar.memory.ccol',
        'system.lpar.memory.entc',
        'system.lpar.memory.vcsw',
    ]

    # Every flushed metric must be one of the expected names, one each.
    assert len(metrics) == len(expected_metrics)
    for metric in metrics:
        assert metric['metric'] in expected_metrics
def test_disk_basic(disk_io_counters, disk_usage, disk_partitions): from datadog_checks.disk import Disk # delayed import for good patching disk_partitions.return_value = MOCK_PARTITIONS aggregator = MetricsAggregator( HOSTNAME, interval=1.0, histogram_aggregates=None, histogram_percentiles=None, ) total_gauges, expected_gauges = generate_expected_gauges() total_rates, expected_rates = generate_expected_rates() c = Disk("disk", {}, {}, aggregator) c.check({}) metrics = c.aggregator.flush( )[:-1] # we remove the datadog.agent.running metric assert len(metrics) == total_gauges time.sleep(1) c.check({}) metrics = c.aggregator.flush( )[:-1] # we remove the datadog.agent.running metric assert len(metrics) == (total_gauges + total_rates) for metric in metrics: assert metric['metric'] in expected_gauges or metric[ 'metric'] in expected_rates assert len(metric['points']) == 1 assert metric['host'] == HOSTNAME assert metric['type'] == GAUGE assert is_metric_expected(expected_gauges, metric) or is_metric_expected( expected_rates, metric)
class AgentCheck(object): OK, WARNING, CRITICAL, UNKNOWN = (0, 1, 2, 3) SOURCE_TYPE_NAME = None DEFAULT_MIN_COLLECTION_INTERVAL = 0 def __init__(self, name, init_config, agentConfig, instances=None): """ Initialize a new check. :param name: The name of the check :param init_config: The config for initializing the check :param agentConfig: The global configuration for the agent :param instances: A list of configuration objects for each instance. """ from aggregator import MetricsAggregator self.name = name self.init_config = init_config or {} self.agentConfig = agentConfig self.in_developer_mode = agentConfig.get( 'developer_mode') and psutil is not None self._internal_profiling_stats = None self.hostname = agentConfig.get('checksd_hostname') or get_hostname( agentConfig) self.log = logging.getLogger('%s.%s' % (__name__, name)) self.aggregator = MetricsAggregator( self.hostname, formatter=agent_formatter, recent_point_threshold=agentConfig.get('recent_point_threshold', None), histogram_aggregates=agentConfig.get('histogram_aggregates'), histogram_percentiles=agentConfig.get('histogram_percentiles')) self.events = [] self.service_checks = [] self.instances = instances or [] self.warnings = [] self.library_versions = None self.last_collection_time = defaultdict(int) self._instance_metadata = [] self.svc_metadata = [] def instance_count(self): """ Return the number of instances that are configured for this check. """ return len(self.instances) def gauge(self, metric, value, tags=None, hostname=None, device_name=None, timestamp=None): """ Record the value of a gauge, with optional tags, hostname and device name. :param metric: The name of the metric :param value: The value of the gauge :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. 
:param device_name: (optional) The device name for this metric :param timestamp: (optional) The timestamp for this metric value """ self.aggregator.gauge(metric, value, tags, hostname, device_name, timestamp) def increment(self, metric, value=1, tags=None, hostname=None, device_name=None): """ Increment a counter with optional tags, hostname and device name. :param metric: The name of the metric :param value: The value to increment by :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.increment(metric, value, tags, hostname, device_name) def decrement(self, metric, value=-1, tags=None, hostname=None, device_name=None): """ Increment a counter with optional tags, hostname and device name. :param metric: The name of the metric :param value: The value to decrement by :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.decrement(metric, value, tags, hostname, device_name) def count(self, metric, value=0, tags=None, hostname=None, device_name=None): """ Submit a raw count with optional tags, hostname and device name :param metric: The name of the metric :param value: The value :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.submit_count(metric, value, tags, hostname, device_name) def monotonic_count(self, metric, value=0, tags=None, hostname=None, device_name=None): """ Submits a raw count with optional tags, hostname and device name based on increasing counter values. E.g. 1, 3, 5, 7 will submit 6 on flush. Note that reset counters are skipped. 
:param metric: The name of the metric :param value: The value of the rate :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.count_from_counter(metric, value, tags, hostname, device_name) def rate(self, metric, value, tags=None, hostname=None, device_name=None): """ Submit a point for a metric that will be calculated as a rate on flush. Values will persist across each call to `check` if there is not enough point to generate a rate on the flush. :param metric: The name of the metric :param value: The value of the rate :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.rate(metric, value, tags, hostname, device_name) def histogram(self, metric, value, tags=None, hostname=None, device_name=None): """ Sample a histogram value, with optional tags, hostname and device name. :param metric: The name of the metric :param value: The value to sample for the histogram :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.histogram(metric, value, tags, hostname, device_name) def set(self, metric, value, tags=None, hostname=None, device_name=None): """ Sample a set value, with optional tags, hostname and device name. :param metric: The name of the metric :param value: The value for the set :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. 
:param device_name: (optional) The device name for this metric """ self.aggregator.set(metric, value, tags, hostname, device_name) def event(self, event): """ Save an event. :param event: The event payload as a dictionary. Has the following structure: { "timestamp": int, the epoch timestamp for the event, "event_type": string, the event time name, "agent_key": string, the api key of the account to associate the event with, "msg_title": string, the title of the event, "msg_text": string, the text body of the event, "alert_type": (optional) string, one of ('error', 'warning', 'success', 'info'). Defaults to 'info'. "source_type_name": (optional) string, the source type name, "host": (optional) string, the name of the host, "tags": (optional) list, a list of tags to associate with this event } """ # Events are disabled. return if event.get('agent_key') is None: event['agent_key'] = self.agentConfig['agent_key'] self.events.append(event) def service_check(self, check_name, status, tags=None, timestamp=None, hostname=None, check_run_id=None, message=None): """ Save a service check. :param check_name: string, name of the service check :param status: int, describing the status. 0 for success, 1 for warning, 2 for failure :param tags: (optional) list of strings, a list of tags for this run :param timestamp: (optional) float, unix timestamp for when the run occurred :param hostname: (optional) str, host that generated the service check. Defaults to the host_name of the agent :param check_run_id: (optional) int, id used for logging and tracing purposes. Don't need to be unique. If not specified, one will be generated. """ if hostname is None: hostname = self.hostname if message is not None: message = str(message) self.service_checks.append( create_service_check(check_name, status, tags, timestamp, hostname, check_run_id, message)) def service_metadata(self, meta_name, value): """ Save metadata. 
:param meta_name: metadata key name :type meta_name: string :param value: metadata value :type value: string """ self._instance_metadata.append((meta_name, str(value))) def has_events(self): """ Check whether the check has saved any events @return whether or not the check has saved any events @rtype boolean """ return len(self.events) > 0 def get_metrics(self): """ Get all metrics, including the ones that are tagged. @return the list of samples @rtype [(metric_name, timestamp, value, {"tags": ["tag1", "tag2"]}), ...] """ return self.aggregator.flush() def get_events(self): """ Return a list of the events saved by the check, if any @return the list of events saved by this check @rtype list of event dictionaries """ events = self.events self.events = [] return events def get_service_checks(self): """ Return a list of the service checks saved by the check, if any and clears them out of the instance's service_checks list @return the list of service checks saved by this check @rtype list of service check dicts """ service_checks = self.service_checks self.service_checks = [] return service_checks def _roll_up_instance_metadata(self): """ Concatenate and flush instance metadata. """ self.svc_metadata.append( dict((k, v) for (k, v) in self._instance_metadata)) self._instance_metadata = [] def get_service_metadata(self): """ Return a list of the metadata dictionaries saved by the check -if any- and clears them out of the instance's service_checks list @return the list of metadata saved by this check @rtype list of metadata dicts """ if self._instance_metadata: self._roll_up_instance_metadata() service_metadata = self.svc_metadata self.svc_metadata = [] return service_metadata def has_warnings(self): """ Check whether the instance run created any warnings """ return len(self.warnings) > 0 def warning(self, warning_message): """ Add a warning message that will be printed in the info page :param warning_message: String. 
Warning message to be displayed """ self.warnings.append(str(warning_message)) def get_library_info(self): if self.library_versions is not None: return self.library_versions try: self.library_versions = self.get_library_versions() except NotImplementedError: pass def get_library_versions(self): """ Should return a string that shows which version of the needed libraries are used """ raise NotImplementedError def get_warnings(self): """ Return the list of warnings messages to be displayed in the info page """ warnings = self.warnings self.warnings = [] return warnings @staticmethod def _get_statistic_name_from_method(method_name): return method_name[4:] if method_name.startswith( 'get_') else method_name @staticmethod def _collect_internal_stats(methods=None): current_process = psutil.Process(os.getpid()) methods = methods or DEFAULT_PSUTIL_METHODS filtered_methods = [m for m in methods if hasattr(current_process, m)] stats = {} for method in filtered_methods: # Go from `get_memory_info` -> `memory_info` stat_name = AgentCheck._get_statistic_name_from_method(method) try: raw_stats = getattr(current_process, method)() try: stats[stat_name] = raw_stats._asdict() except AttributeError: if isinstance(raw_stats, numbers.Number): stats[stat_name] = raw_stats else: log.warn( "Could not serialize output of {0} to dict".format( method)) except psutil.AccessDenied: log.warn( "Cannot call psutil method {0} : Access Denied".format( method)) return stats def _set_internal_profiling_stats(self, before, after): self._internal_profiling_stats = {'before': before, 'after': after} def _get_internal_profiling_stats(self): """ If in developer mode, return a dictionary of statistics about the check run """ stats = self._internal_profiling_stats self._internal_profiling_stats = None return stats def run(self): """ Run all instances. 
""" # Store run statistics if needed before, after = None, None if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME: try: before = AgentCheck._collect_internal_stats() except Exception: # It's fine if we can't collect stats for the run, just log and proceed self.log.debug( "Failed to collect Agent Stats before check {0}".format( self.name)) instance_statuses = [] for i, instance in enumerate(self.instances): try: min_collection_interval = instance.get( 'min_collection_interval', self.init_config.get('min_collection_interval', self.DEFAULT_MIN_COLLECTION_INTERVAL)) now = time.time() if now - self.last_collection_time[i] < min_collection_interval: self.log.debug( "Not running instance #{0} of check {1} as it ran less than {2}s ago" .format(i, self.name, min_collection_interval)) continue self.last_collection_time[i] = now check_start_time = None if self.in_developer_mode: check_start_time = timeit.default_timer() self.check(copy.deepcopy(instance)) instance_check_stats = None if check_start_time is not None: instance_check_stats = { 'run_time': timeit.default_timer() - check_start_time } if self.has_warnings(): instance_status = check_status.InstanceStatus( i, check_status.STATUS_WARNING, warnings=self.get_warnings(), instance_check_stats=instance_check_stats) else: instance_status = check_status.InstanceStatus( i, check_status.STATUS_OK, instance_check_stats=instance_check_stats) except Exception, e: self.log.exception("Check '%s' instance #%s failed" % (self.name, i)) instance_status = check_status.InstanceStatus( i, check_status.STATUS_ERROR, error=str(e), tb=traceback.format_exc()) finally:
def test_gauge(self):
    """Gauges keep only the latest value and drop stale-timestamp points."""
    stats = MetricsAggregator('myhost')

    # Within one flush interval, the last write wins.
    stats.submit_packets('my.first.gauge:1|g')
    stats.submit_packets('my.first.gauge:5|g')
    stats.submit_packets('my.second.gauge:1.5|g')

    flushed = self.sort_metrics(stats.flush())
    assert len(flushed) == 2
    gauge_one, gauge_two = flushed
    nt.assert_equals(gauge_one['metric'], 'my.first.gauge')
    nt.assert_equals(gauge_one['points'][0][1], 5)
    nt.assert_equals(gauge_one['host'], 'myhost')
    nt.assert_equals(gauge_two['metric'], 'my.second.gauge')
    nt.assert_equals(gauge_two['points'][0][1], 1.5)

    # Points carrying a very old timestamp are discarded on flush;
    # only the fresh my.first.gauge sample survives.
    stats.gauge('my.first.gauge', 5)
    stats.gauge('my.first.gauge', 1, timestamp=1000000000)
    stats.gauge('my.second.gauge', 20, timestamp=1000000000)

    flushed = self.sort_metrics(stats.flush())
    assert len(flushed) == 1
    survivor = flushed[0]
    nt.assert_equals(survivor['metric'], 'my.first.gauge')
    nt.assert_equals(survivor['points'][0][1], 5)
    nt.assert_equals(survivor['host'], 'myhost')
def test_event_title(self):
    """Event titles parse correctly: empty, utf-8, spaces, pipes, escaped newlines."""
    stats = MetricsAggregator('myhost', utf8_decoding=True)
    packets = [
        '_e{0,4}:|text',
        u'_e{9,4}:2intitulé|text'.encode('utf-8'),  # comes from socket
        '_e{14,4}:3title content|text',
        '_e{14,4}:4title|content|text',
        '_e{13,4}:5title\\ntitle|text',  # \n stays escaped
    ]
    for packet in packets:
        stats.submit_packets(packet)

    events = self.sort_events(stats.flush_events())
    assert len(events) == 5

    expected_titles = [
        '',
        u'2intitulé',
        '3title content',
        '4title|content',
        '5title\\ntitle',
    ]
    for event, title in zip(events, expected_titles):
        nt.assert_equal(event['msg_title'], title)
def test_metrics_expiry(self):
    """Metrics eventually expire and stop being submitted after the expiry window."""
    ag_interval = 1
    expiry = ag_interval * 4 + 2

    # The min is not enabled by default
    stats = MetricsAggregator(
        'myhost',
        interval=ag_interval,
        expiry_seconds=expiry,
        histogram_aggregates=DEFAULT_HISTOGRAM_AGGREGATES + ['min'])

    for packet in ('test.counter:123|c', 'test.gauge:55|g',
                   'test.set:44|s', 'test.histogram:11|h'):
        stats.submit_packets(packet)

    # Within the expiry window points keep submitting.
    time.sleep(ag_interval)
    flushed = self.sort_metrics(stats.flush())
    nt.assert_equal(len(flushed), 9)
    nt.assert_equal(flushed[0]['metric'], 'test.counter')
    nt.assert_equal(flushed[0]['points'][0][1], 123)

    # After the first flush only the counter keeps reporting, as zero.
    time.sleep(ag_interval)
    flushed = self.sort_metrics(stats.flush())
    nt.assert_equal(len(flushed), 1)
    nt.assert_equal(flushed[0]['metric'], 'test.counter')
    nt.assert_equal(flushed[0]['points'][0][1], 0)

    time.sleep(ag_interval)
    time.sleep(0.5)
    flushed = self.sort_metrics(stats.flush())
    nt.assert_equal(len(flushed), 1)
    nt.assert_equal(flushed[0]['metric'], 'test.counter')
    nt.assert_equal(flushed[0]['points'][0][1], 0)

    # Now sleep for longer than the expiry window and ensure
    # no points are submitted
    time.sleep(ag_interval)
    time.sleep(2)
    leftover = stats.flush()
    assert not leftover, str(leftover)

    # If we submit again, we're all good.
    for packet in ('test.counter:123|c', 'test.gauge:55|g',
                   'test.set:44|s', 'test.histogram:11|h'):
        stats.submit_packets(packet)

    flushed = self.sort_metrics(stats.flush())
    nt.assert_equal(len(flushed), 9)
    nt.assert_equal(flushed[0]['metric'], 'test.counter')
    nt.assert_equal(flushed[0]['points'][0][1], 123)
def test_histogram_counter(self):
    """histogram.count must equal the matching increment, with and without sampling."""
    cnt = 100000
    for run in [1, 2]:
        stats = MetricsAggregator('myhost')
        for _ in xrange(cnt):
            if run == 2:
                # Sampled at 0.5, so each packet is weighted double.
                stats.submit_packets('test.counter:1|c|@0.5')
                stats.submit_packets('test.hist:1|ms|@0.5')
            else:
                stats.submit_packets('test.counter:1|c')
                stats.submit_packets('test.hist:1|ms')

        flushed = self.sort_metrics(stats.flush())
        assert len(flushed) > 0

        counter_values = [m['points'][0][1] for m in flushed
                          if m['metric'] == 'test.counter']
        hist_count_values = [m['points'][0][1] for m in flushed
                             if m['metric'] == 'test.hist.count']
        nt.assert_equal(counter_values, [cnt * run])
        nt.assert_equal(hist_count_values, [cnt * run])
class AgentCheck(object):
    """Base class for agent checks.

    Subclasses implement ``check(instance)`` and report data through the
    metric helpers (gauge/increment/decrement/rate/histogram/set) and
    ``event()``; the agent collects results via ``get_metrics()`` and
    ``get_events()``.
    """

    def __init__(self, name, init_config, agentConfig):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        """
        # Imported lazily to avoid a circular import at module load time.
        from aggregator import MetricsAggregator
        self.name = name
        self.init_config = init_config
        self.agentConfig = agentConfig
        self.hostname = gethostname(agentConfig)
        self.log = logging.getLogger('checks.%s' % name)
        # All metric samples funnel through this aggregator; agent_formatter
        # shapes the payload produced on flush.
        self.aggregator = MetricsAggregator(self.hostname,
                                            formatter=agent_formatter)
        self.events = []

    def gauge(self, metric, value, tags=None, hostname=None,
              device_name=None, timestamp=None):
        """
        Record the value of a gauge, with optional tags, hostname and
        device name.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults
            to the current hostname.
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, tags, hostname, device_name,
                              timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None,
                  device_name=None):
        """
        Increment a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to increment by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults
            to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None,
                  device_name=None):
        """
        Decrement a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to decrement by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults
            to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Submit a point for a metric that will be calculated as a rate on
        flush. Values will persist across each call to `check` if there is
        not enough point to generate a rate on the flush.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults
            to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None,
                  device_name=None):
        """
        Sample a histogram value, with optional tags, hostname and device
        name.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults
            to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    def set(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Sample a set value, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value for the set
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults
            to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.set(metric, value, tags, hostname, device_name)

    def event(self, event):
        """
        Save an event.

        :param event: The event payload as a dictionary. Has the following
        structure:

            {
                "timestamp": int, the epoch timestamp for the event,
                "event_type": string, the event time name,
                "api_key": string, the api key of the account to associate
                    the event with,
                "msg_title": string, the title of the event,
                "msg_text": string, the text body of the event,
                "alert_type": (optional) string, one of ('error', 'warning',
                    'success', 'info'). Defaults to 'info'.
                "source_type_name": (optional) string, the source type name,
                "host": (optional) string, the name of the host,
                "tags": (optional) list, a list of tags to associate with
                    this event
            }
        """
        self.events.append(event)

    def has_events(self):
        """
        Check whether the check has saved any events

        @return whether or not the check has saved any events
        @rtype boolean
        """
        return len(self.events) > 0

    def get_metrics(self):
        """
        Get all metrics, including the ones that are tagged.

        @return the list of samples
        @rtype [(metric_name, timestamp, value, {"tags": ["tag1", "tag2"]}), ...]
        """
        return self.aggregator.flush()

    def get_events(self):
        """
        Return a list of the events saved by the check, if any

        @return the list of events saved by this check
        @rtype list of event dictionaries
        """
        # Hand back the accumulated events and reset for the next run.
        events = self.events
        self.events = []
        return events

    def check(self, instance):
        """
        Overridden by the check class. This will be called to run the check.

        :param instance: A dict with the instance information. This will vary
        depending on your config structure.
        """
        raise NotImplementedError()

    @classmethod
    def from_yaml(cls, path_to_yaml=None, agentConfig=None, yaml_text=None,
                  check_name=None):
        """
        A method used for testing your check without running the agent.

        Either ``path_to_yaml`` or ``yaml_text`` must be provided; when a
        path is given the check name is derived from the file name.
        """
        from util import yaml, yLoader

        if path_to_yaml:
            check_name = os.path.basename(path_to_yaml).split('.')[0]
            try:
                f = open(path_to_yaml)
            except IOError:
                raise Exception('Unable to open yaml config: %s' %
                                path_to_yaml)
            yaml_text = f.read()
            f.close()

        config = yaml.load(yaml_text, Loader=yLoader)
        check = cls(check_name, config.get('init_config') or {},
                    agentConfig or {})

        return check, config.get('instances', [])

    def normalize(self, metric, prefix=None):
        """
        Turn a metric into a well-formed metric name prefix.b.c

        :param metric The metric name to normalize
        :param prefix A prefix to add to the normalized name, default None
        """
        # Replace separator-like punctuation with underscores.
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)

        if prefix is not None:
            return prefix + "." + name
        else:
            return name
def test_formatter(self):
    """The statsd_metric_namespace option prefixes flushed metric names."""
    cases = [
        ("datadog", "datadog.gauge"),     # plain namespace: joined with a dot
        ("datadoge.", "datadoge.gauge"),  # trailing dot is not doubled
        (None, "gauge"),                  # no namespace leaves the name alone
    ]
    for namespace, expected_name in cases:
        stats = MetricsAggregator(
            'myhost',
            interval=10,
            formatter=get_formatter({"statsd_metric_namespace": namespace}))
        stats.submit_packets('gauge:16|c|#tag3,tag4')
        flushed = self.sort_metrics(stats.flush())
        self.assertTrue(len(flushed) == 1)
        self.assertTrue(flushed[0]['metric'] == expected_name)
def test_recent_point_threshold(self):
    """Points older than the threshold are dropped; recent ones are kept."""
    threshold = 100
    # The min is not enabled by default
    stats = MetricsAggregator(
        'myhost',
        recent_point_threshold=threshold,
        histogram_aggregates=DEFAULT_HISTOGRAM_AGGREGATES + ['min'])

    timestamp_beyond_threshold = time.time() - threshold * 2
    timestamp_within_threshold = time.time() - threshold / 2

    # Gauges stamped beyond the threshold are dropped on flush.
    stats.submit_metric('my.first.gauge', 5, 'g')
    stats.submit_metric('my.first.gauge', 1, 'g',
                        timestamp=timestamp_beyond_threshold)
    stats.submit_metric('my.second.gauge', 20, 'g',
                        timestamp=timestamp_beyond_threshold)

    flushed = self.sort_metrics(stats.flush())
    assert len(flushed) == 1
    survivor = flushed[0]
    nt.assert_equals(survivor['metric'], 'my.first.gauge')
    nt.assert_equals(survivor['points'][0][1], 5)
    nt.assert_equals(survivor['host'], 'myhost')

    # Points stamped within the threshold are kept; each metric type
    # handles the submitted timestamp differently on flush.
    stats.submit_metric('my.1.gauge', 5, 'g')
    stats.submit_metric('my.1.gauge', 1, 'g',
                        timestamp=timestamp_within_threshold)
    stats.submit_metric('my.2.counter', 20, 'c',
                        timestamp=timestamp_within_threshold)
    stats.submit_metric('my.3.set', 20, 's',
                        timestamp=timestamp_within_threshold)
    stats.submit_metric('my.4.histogram', 20, 'h',
                        timestamp=timestamp_within_threshold)

    flush_timestamp = time.time()
    flushed = self.sort_metrics(stats.flush())
    nt.assert_equal(len(flushed), 9)

    first, second, third, h1, h2, h3, h4, h5, h6 = flushed

    # The gauge keeps the value and timestamp of its last recent point.
    nt.assert_equals(first['metric'], 'my.1.gauge')
    nt.assert_equals(first['points'][0][1], 1)
    nt.assert_equals(first['host'], 'myhost')
    self.assert_almost_equal(first['points'][0][0],
                             timestamp_within_threshold, 0.1)

    # Counters and sets are stamped with the flush time.
    nt.assert_equals(second['metric'], 'my.2.counter')
    nt.assert_equals(second['points'][0][1], 20)
    self.assert_almost_equal(second['points'][0][0], flush_timestamp, 0.1)

    nt.assert_equals(third['metric'], 'my.3.set')
    nt.assert_equals(third['points'][0][1], 1)
    self.assert_almost_equal(third['points'][0][0], flush_timestamp, 0.1)

    # All histogram aggregates share the same flush timestamp.
    nt.assert_equals(h1['metric'], 'my.4.histogram.95percentile')
    nt.assert_equals(h1['points'][0][1], 20)
    self.assert_almost_equal(h1['points'][0][0], flush_timestamp, 0.1)
    for other in (h2, h3, h4, h5):
        nt.assert_equal(h1['points'][0][0], other['points'][0][0])
def test_service_check_message(self):
    """Service check messages survive escapes, unicode and pipe characters."""
    stats = MetricsAggregator('myhost')
    stats.submit_packets('_sc|check.1|0|m:testing')
    stats.submit_packets('_sc|check.2|0|m:First line\\nSecond line')
    stats.submit_packets(u'_sc|check.3|0|m:♬ †øU †øU ¥ºu T0µ ♪')
    stats.submit_packets('_sc|check.4|0|m:|t:|m\:|d:')

    service_checks = self.sort_service_checks(stats.flush_service_checks())
    assert len(service_checks) == 4

    expected = [
        ('check.1', 'testing'),
        ('check.2', 'First line\nSecond line'),
        ('check.3', u'♬ †øU †øU ¥ºu T0µ ♪'),
        ('check.4', '|t:|m:|d:'),
    ]
    for sc, (check_name, message) in zip(service_checks, expected):
        nt.assert_equal(sc['check'], check_name)
        nt.assert_equal(sc['message'], message)
def test_tags(self):
    """Counters aggregate per tag set, regardless of tag ordering."""
    stats = MetricsAggregator('myhost')
    stats.submit_packets('gauge:1|c')
    stats.submit_packets('gauge:2|c|@1')
    stats.submit_packets('gauge:4|c|#tag1,tag2')
    # Same tag set as above in a different order: must aggregate together.
    stats.submit_packets('gauge:8|c|#tag2,tag1')
    stats.submit_packets('gauge:16|c|#tag3,tag4')

    flushed = self.sort_metrics(stats.flush())
    assert len(flushed) == 3

    expected = [
        (None, 3),
        (('tag1', 'tag2'), 12),
        (('tag3', 'tag4'), 16),
    ]
    for sample, (tags, value) in zip(flushed, expected):
        nt.assert_equal(sample['metric'], 'gauge')
        nt.assert_equal(sample['tags'], tags)
        nt.assert_equal(sample['points'][0][1], value)
        nt.assert_equal(sample['host'], 'myhost')
def test_event_text(self):
    """Event text bodies handle empty, piped, escaped-newline and utf-8 payloads."""
    stats = MetricsAggregator('myhost')
    stats.submit_packets('_e{2,0}:t1|')
    stats.submit_packets('_e{2,12}:t2|text|content')
    stats.submit_packets(
        '_e{2,23}:t3|First line\\nSecond line')  # \n is a newline
    stats.submit_packets(
        u'_e{2,19}:t4|♬ †øU †øU ¥ºu T0µ ♪')  # utf-8 compliant

    events = self.sort_events(stats.flush_events())
    assert len(events) == 4

    expected_texts = [
        '',
        'text|content',
        'First line\nSecond line',
        u'♬ †øU †øU ¥ºu T0µ ♪',
    ]
    for event, text in zip(events, expected_texts):
        nt.assert_equal(event['msg_text'], text)
class AgentCheck(object):
    """Base class for agent checks.

    Subclasses implement :meth:`check` and use the metric helpers
    (``gauge``, ``rate``, ``histogram``, ...) which forward samples to a
    per-check ``MetricsAggregator``.  The agent collector drives the
    lifecycle through :meth:`run` / :meth:`get_metrics` /
    :meth:`get_events` / :meth:`get_service_checks`.
    """

    # Service-check status codes.
    OK, WARNING, CRITICAL, UNKNOWN = (0, 1, 2, 3)

    SOURCE_TYPE_NAME = None

    DEFAULT_EXPIRY_SECONDS = 300
    DEFAULT_MIN_COLLECTION_INTERVAL = 0

    # Class-level registry of every check name that has been instantiated.
    _enabled_checks = []

    @classmethod
    def is_check_enabled(cls, name):
        """Return True if a check with this name has been instantiated."""
        return name in cls._enabled_checks

    def __init__(self, name, init_config, agentConfig, instances=None):
        """ Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        # Imported lazily — presumably to avoid a circular import at module
        # load time (TODO confirm against the aggregator module).
        from aggregator import MetricsAggregator

        # NOTE(review): the second statement rebinds _enabled_checks as an
        # *instance* attribute, so the shared class-level list is appended to
        # but never de-duplicated.  Kept as-is; is_check_enabled only tests
        # membership, so duplicates are harmless.
        self._enabled_checks.append(name)
        self._enabled_checks = list(set(self._enabled_checks))

        self.name = name
        self.init_config = init_config or {}
        self.agentConfig = agentConfig
        # Developer mode only makes sense when psutil is importable, since
        # the profiling statistics come from psutil.Process.
        self.in_developer_mode = agentConfig.get('developer_mode') and psutil
        self._internal_profiling_stats = None
        self.default_integration_http_timeout = float(
            agentConfig.get('default_integration_http_timeout', 9))

        self.hostname = agentConfig.get('checksd_hostname') or get_hostname(
            agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))

        self.min_collection_interval = self.init_config.get(
            'min_collection_interval', self.DEFAULT_MIN_COLLECTION_INTERVAL)

        # Points must outlive the collection interval, otherwise slow checks
        # would see their samples expire between two runs.
        self.aggregator = MetricsAggregator(
            self.hostname,
            expiry_seconds=self.min_collection_interval +
            self.DEFAULT_EXPIRY_SECONDS,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get('recent_point_threshold',
                                                   None),
            histogram_aggregates=agentConfig.get('histogram_aggregates'),
            histogram_percentiles=agentConfig.get('histogram_percentiles'))

        self.events = []
        self.service_checks = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
        # Per-instance-index timestamp of the last successful collection.
        self.last_collection_time = defaultdict(int)
        self._instance_metadata = []
        self.svc_metadata = []
        self.historate_dict = {}

        # Set proxy settings
        self.proxy_settings = get_proxy(self.agentConfig)
        self._use_proxy = False if init_config is None else init_config.get(
            "use_agent_proxy", True)
        self.proxies = {
            "http": None,
            "https": None,
        }
        if self.proxy_settings and self._use_proxy:
            uri = "{host}:{port}".format(host=self.proxy_settings['host'],
                                         port=self.proxy_settings['port'])
            # Embed credentials only when both user and password are set.
            if self.proxy_settings['user'] and self.proxy_settings['password']:
                uri = "{user}:{password}@{uri}".format(
                    user=self.proxy_settings['user'],
                    password=self.proxy_settings['password'],
                    uri=uri)
            self.proxies['http'] = "http://{uri}".format(uri=uri)
            self.proxies['https'] = "https://{uri}".format(uri=uri)

    def instance_count(self):
        """ Return the number of instances that are configured for this check. """
        return len(self.instances)

    def gauge(self, metric, value, tags=None, hostname=None, device_name=None,
              timestamp=None):
        """ Record the value of a gauge, with optional tags, hostname and
        device name.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric.
            Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, tags, hostname, device_name,
                              timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None,
                  device_name=None):
        """ Increment a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to increment by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric.
            Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None,
                  device_name=None):
        """ Decrement a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to decrement by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric.
            Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def count(self, metric, value=0, tags=None, hostname=None,
              device_name=None):
        """ Submit a raw count with optional tags, hostname and device name

        :param metric: The name of the metric
        :param value: The value
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric.
            Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.submit_count(metric, value, tags, hostname,
                                     device_name)

    def monotonic_count(self, metric, value=0, tags=None, hostname=None,
                        device_name=None):
        """ Submits a raw count with optional tags, hostname and device name
        based on increasing counter values. E.g. 1, 3, 5, 7 will submit 6
        on flush. Note that reset counters are skipped.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric.
            Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.count_from_counter(metric, value, tags, hostname,
                                           device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """ Submit a point for a metric that will be calculated as a rate on
        flush. Values will persist across each call to `check` if there is
        not enough point to generate a rate on the flush.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric.
            Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None,
                  device_name=None):
        """ Sample a histogram value, with optional tags, hostname and
        device name.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric.
            Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    @classmethod
    def generate_historate_func(cls, excluding_tags):
        """Return a histogram-like function that computes a rate (see
        historate) while stripping `excluding_tags` from the context."""
        def fct(self, metric, value, tags=None, hostname=None,
                device_name=None):
            cls.historate(self, metric, value, excluding_tags, tags=tags,
                          hostname=hostname, device_name=device_name)

        return fct

    @classmethod
    def generate_histogram_func(cls, excluding_tags):
        """Return a histogram function that drops any tag whose key appears
        in `excluding_tags` before sampling."""
        def fct(self, metric, value, tags=None, hostname=None,
                device_name=None):
            # Use a copy of the list to avoid removing tags from original
            tags = list(tags)
            for tag in list(tags):
                for exc_tag in excluding_tags:
                    if tag.startswith(exc_tag + ":"):
                        tags.remove(tag)
            cls.histogram(self, metric, value, tags=tags, hostname=hostname,
                          device_name=device_name)

        return fct

    def historate(self, metric, value, excluding_tags, tags=None,
                  hostname=None, device_name=None):
        """ Function to create a histogram metric for "rate" like metrics.
        Warning this doesn't use the harmonic mean, beware of what it means
        when using it.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param excluding_tags: A list of tags that will be removed when
            computing the histogram
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric.
            Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        # Use a copy of the list to avoid removing tags from original
        tags = list(tags)
        # The context key includes the *unfiltered* tags so that two series
        # differing only in an excluded tag still track separate rates.
        context = [metric]
        if tags is not None:
            context.append("-".join(sorted(tags)))
        if hostname is not None:
            context.append("host:" + hostname)
        if device_name is not None:
            context.append("device:" + device_name)

        now = time.time()
        context = tuple(context)

        if context in self.historate_dict:
            # Strip excluded tags only for the emitted sample.
            if tags is not None:
                for tag in list(tags):
                    for exc_tag in excluding_tags:
                        if tag.startswith("{0}:".format(exc_tag)):
                            tags.remove(tag)

            prev_value, prev_ts = self.historate_dict[context]
            rate = float(value - prev_value) / float(now - prev_ts)
            self.aggregator.histogram(metric, rate, tags, hostname,
                                      device_name)

        self.historate_dict[context] = (value, now)

    def set(self, metric, value, tags=None, hostname=None, device_name=None):
        """ Sample a set value, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value for the set
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric.
            Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.set(metric, value, tags, hostname, device_name)

    def event(self, event):
        """ Save an event.

        :param event: The event payload as a dictionary.
            Has the following structure:

            {
                "timestamp": int, the epoch timestamp for the event,
                "event_type": string, the event time name,
                "msg_title": string, the title of the event,
                "msg_text": string, the text body of the event,
                "alert_type": (optional) string, one of ('error', 'warning',
                    'success', 'info'). Defaults to 'info'.
                "source_type_name": (optional) string, the source type name,
                "host": (optional) string, the name of the host,
                "tags": (optional) list, a list of tags to associate with
                    this event
            }
        """
        self.events.append(event)

    def service_check(self, check_name, status, tags=None, timestamp=None,
                      hostname=None, check_run_id=None, message=None):
        """ Save a service check.

        :param check_name: string, name of the service check
        :param status: int, describing the status.
            0 for success, 1 for warning, 2 for failure
        :param tags: (optional) list of strings, a list of tags for this run
        :param timestamp: (optional) float, unix timestamp for when the run
            occurred
        :param hostname: (optional) str, host that generated the service
            check. Defaults to the host_name of the agent
        :param check_run_id: (optional) int, id used for logging and tracing
            purposes. Doesn't need to be unique. If not specified, one will
            be generated.
        """
        if hostname is None:
            hostname = self.hostname
        if message is not None:
            # ascii converts to unicode but not viceversa
            message = unicode(message)
        self.service_checks.append(
            create_service_check(check_name, status, tags, timestamp,
                                 hostname, check_run_id, message))

    def service_metadata(self, meta_name, value):
        """ Save metadata.

        :param meta_name: metadata key name
        :type meta_name: string

        :param value: metadata value
        :type value: string
        """
        self._instance_metadata.append((meta_name, unicode(value)))

    def has_events(self):
        """ Check whether the check has saved any events

        @return whether or not the check has saved any events
        @rtype boolean
        """
        return len(self.events) > 0

    def get_metrics(self):
        """ Get all metrics, including the ones that are tagged.

        @return the list of samples
        @rtype [(metric_name, timestamp, value, {"tags": ["tag1", "tag2"]}), ...]
        """
        return self.aggregator.flush()

    def get_events(self):
        """ Return a list of the events saved by the check, if any

        @return the list of events saved by this check
        @rtype list of event dictionaries
        """
        events = self.events
        self.events = []
        return events

    def get_service_checks(self):
        """ Return a list of the service checks saved by the check, if any
        and clears them out of the instance's service_checks list

        @return the list of service checks saved by this check
        @rtype list of service check dicts
        """
        service_checks = self.service_checks
        self.service_checks = []
        return service_checks

    def _roll_up_instance_metadata(self):
        """ Concatenate and flush instance metadata. """
        self.svc_metadata.append(
            dict((k, v) for (k, v) in self._instance_metadata))
        self._instance_metadata = []

    def get_service_metadata(self):
        """ Return a list of the metadata dictionaries saved by the check
        -if any- and clears them out of the instance's service_checks list

        @return the list of metadata saved by this check
        @rtype list of metadata dicts
        """
        if self._instance_metadata:
            self._roll_up_instance_metadata()
        service_metadata = self.svc_metadata
        self.svc_metadata = []
        return service_metadata

    def has_warnings(self):
        """ Check whether the instance run created any warnings """
        return len(self.warnings) > 0

    def warning(self, warning_message):
        """ Add a warning message that will be printed in the info page

        :param warning_message: String. Warning message to be displayed
        """
        warning_message = str(warning_message)
        self.log.warning(warning_message)
        self.warnings.append(warning_message)

    def get_library_info(self):
        """ Return (and lazily cache) the check's library version info,
        or None when the check doesn't implement get_library_versions. """
        if self.library_versions is not None:
            return self.library_versions
        try:
            self.library_versions = self.get_library_versions()
        except NotImplementedError:
            pass
        # BUG FIX: the freshly collected versions were previously cached but
        # never returned, so the first call always yielded None.
        return self.library_versions

    def get_library_versions(self):
        """ Should return a string that shows which version
        of the needed libraries are used """
        raise NotImplementedError

    def get_warnings(self):
        """ Return the list of warnings messages to be displayed in the
        info page """
        warnings = self.warnings
        self.warnings = []
        return warnings

    @staticmethod
    def _get_statistic_name_from_method(method_name):
        # Go from `get_memory_info` -> `memory_info`; pass through otherwise.
        return method_name[4:] if method_name.startswith(
            'get_') else method_name

    @staticmethod
    def _collect_internal_stats(methods=None):
        """Snapshot psutil statistics about the agent's own process.

        :param methods: (optional) list of psutil.Process method names to
            call; defaults to DEFAULT_PSUTIL_METHODS. Methods missing from
            the installed psutil version are silently skipped.
        @return dict mapping statistic name to a dict (or number) of values
        """
        current_process = psutil.Process(os.getpid())

        methods = methods or DEFAULT_PSUTIL_METHODS
        filtered_methods = [m for m in methods if hasattr(current_process, m)]

        stats = {}

        for method in filtered_methods:
            # Go from `get_memory_info` -> `memory_info`
            stat_name = AgentCheck._get_statistic_name_from_method(method)
            try:
                raw_stats = getattr(current_process, method)()
                try:
                    stats[stat_name] = raw_stats._asdict()
                except AttributeError:
                    if isinstance(raw_stats, numbers.Number):
                        stats[stat_name] = raw_stats
                    else:
                        log.warn(
                            "Could not serialize output of {0} to dict".format(
                                method))

            except psutil.AccessDenied:
                log.warn("Cannot call psutil method {} : Access Denied".format(
                    method))

        return stats

    def _set_internal_profiling_stats(self, before, after):
        self._internal_profiling_stats = {'before': before, 'after': after}

    def _get_internal_profiling_stats(self):
        """ If in developer mode, return a dictionary of statistics about
        the check run. The stored stats are cleared on read. """
        stats = self._internal_profiling_stats
        self._internal_profiling_stats = None
        return stats

    def run(self):
        """ Run all instances.

        @return list of check_status.InstanceStatus, one per instance run
        """
        # Store run statistics if needed
        before, after = None, None
        if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
            try:
                before = AgentCheck._collect_internal_stats()
            except Exception:
                # It's fine if we can't collect stats for the run, just log and proceed
                self.log.debug(
                    "Failed to collect Agent Stats before check {0}".format(
                        self.name))

        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                # Per-instance interval overrides the check-wide default.
                min_collection_interval = instance.get(
                    'min_collection_interval', self.min_collection_interval)
                now = time.time()
                if now - self.last_collection_time[i] < min_collection_interval:
                    self.log.debug(
                        "Not running instance #{0} of check {1} as it ran less than {2}s ago"
                        .format(i, self.name, min_collection_interval))
                    continue

                self.last_collection_time[i] = now

                check_start_time = None
                if self.in_developer_mode:
                    check_start_time = timeit.default_timer()
                # Deep-copy so a misbehaving check can't mutate the shared
                # instance configuration.
                self.check(copy.deepcopy(instance))

                instance_check_stats = None
                if check_start_time is not None:
                    instance_check_stats = {
                        'run_time': timeit.default_timer() - check_start_time
                    }

                if self.has_warnings():
                    instance_status = check_status.InstanceStatus(
                        i,
                        check_status.STATUS_WARNING,
                        warnings=self.get_warnings(),
                        instance_check_stats=instance_check_stats)
                else:
                    instance_status = check_status.InstanceStatus(
                        i,
                        check_status.STATUS_OK,
                        instance_check_stats=instance_check_stats)
            except Exception as e:
                self.log.exception("Check '%s' instance #%s failed" %
                                   (self.name, i))
                instance_status = check_status.InstanceStatus(
                    i,
                    check_status.STATUS_ERROR,
                    error=str(e),
                    tb=traceback.format_exc())
            finally:
                # Flush metadata even when the instance was skipped or failed.
                self._roll_up_instance_metadata()

            instance_statuses.append(instance_status)

        if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
            try:
                after = AgentCheck._collect_internal_stats()
                self._set_internal_profiling_stats(before, after)
                log.info("\n \t %s %s" %
                         (self.name,
                          pretty_statistics(self._internal_profiling_stats)))
            except Exception:
                # It's fine if we can't collect stats for the run, just log and proceed
                self.log.debug(
                    "Failed to collect Agent Stats after check {0}".format(
                        self.name))

        return instance_statuses

    def check(self, instance):
        """ Overriden by the check class. This will be called to run the
        check.

        :param instance: A dict with the instance information. This will vary
        depending on your config structure.
        """
        raise NotImplementedError()

    def stop(self):
        """ To be executed when the agent is being stopped to clean
        ressources """
        pass

    @classmethod
    def from_yaml(cls, path_to_yaml=None, agentConfig=None, yaml_text=None,
                  check_name=None):
        """ A method used for testing your check without running the agent. """
        if path_to_yaml:
            check_name = os.path.basename(path_to_yaml).split('.')[0]
            try:
                f = open(path_to_yaml)
            except IOError:
                raise Exception('Unable to open yaml config: %s' %
                                path_to_yaml)
            yaml_text = f.read()
            f.close()

        # NOTE: config files are trusted input here; yaml.load with the
        # project's yLoader is intentional.
        config = yaml.load(yaml_text, Loader=yLoader)
        try:
            check = cls(check_name, config.get('init_config') or {},
                        agentConfig or {}, config.get('instances'))
        except TypeError:
            # Compatibility for the check not supporting instances
            check = cls(check_name, config.get('init_config') or {},
                        agentConfig or {})

        return check, config.get('instances', [])

    def normalize(self, metric, prefix=None, fix_case=False):
        """ Turn a metric into a well-formed metric name prefix.b.c

        :param metric The metric name to normalize
        :param prefix A prefix to to add to the normalized name, default None
        :param fix_case A boolean, indicating whether to make sure that
            the metric name returned is in underscore_case
        """
        if isinstance(metric, unicode):
            metric_name = unicodedata.normalize('NFKD', metric).encode(
                'ascii', 'ignore')
        else:
            metric_name = metric

        if fix_case:
            name = self.convert_to_underscore_separated(metric_name)
            if prefix is not None:
                prefix = self.convert_to_underscore_separated(prefix)
        else:
            name = re.sub(r"[,\+\*\-/()\[\]{}\s]", "_", metric_name)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)

        if prefix is not None:
            return prefix + "." + name
        else:
            return name

    FIRST_CAP_RE = re.compile('(.)([A-Z][a-z]+)')
    ALL_CAP_RE = re.compile('([a-z0-9])([A-Z])')
    METRIC_REPLACEMENT = re.compile(r'([^a-zA-Z0-9_.]+)|(^[^a-zA-Z]+)')
    DOT_UNDERSCORE_CLEANUP = re.compile(r'_*\._*')

    def convert_to_underscore_separated(self, name):
        """
        Convert from CamelCase to camel_case
        And substitute illegal metric characters
        """
        metric_name = self.FIRST_CAP_RE.sub(r'\1_\2', name)
        metric_name = self.ALL_CAP_RE.sub(r'\1_\2', metric_name).lower()
        metric_name = self.METRIC_REPLACEMENT.sub('_', metric_name)
        return self.DOT_UNDERSCORE_CLEANUP.sub('.', metric_name).strip('_')

    @staticmethod
    def read_config(instance, key, message=None, cast=None):
        """Read a required key from an instance config, raising with
        `message` (or a default) when it is missing; optionally cast it."""
        val = instance.get(key)
        if val is None:
            message = message or 'Must provide `%s` value in instance config' % key
            raise Exception(message)

        if cast is None:
            return val
        else:
            return cast(val)
def test_string_sets(self):
    """A set metric fed string members counts unique values only."""
    stats = MetricsAggregator('myhost')
    packets = (['my.set:string|s']
               + ['my.set:sets|s'] * 2
               + ['my.set:test|s'] * 3)
    for packet in packets:
        stats.submit_packets(packet)

    # Assert that it's treated normally.
    metrics = stats.flush()
    nt.assert_equal(len(metrics), 1)
    m = metrics[0]
    nt.assert_equal(m['metric'], 'my.set')
    # Three distinct members: 'string', 'sets', 'test'.
    nt.assert_equal(m['points'][0][1], 3)

    # Assert there are no more sets
    assert not stats.flush()
class AgentCheck(object): OK, WARNING, CRITICAL, UNKNOWN = (0, 1, 2, 3) SOURCE_TYPE_NAME = None DEFAULT_MIN_COLLECTION_INTERVAL = 0 _enabled_checks = [] @classmethod def is_check_enabled(cls, name): return name in cls._enabled_checks def __init__(self, name, init_config, agentConfig, instances=None): from aggregator import MetricsAggregator self._enabled_checks.append(name) self._enabled_checks = list(set(self._enabled_checks)) self.name = name self.init_config = init_config or {} self.agentConfig = agentConfig self.in_developer_mode = agentConfig.get('developer_mode') and psutil self._internal_profiling_stats = None self.hostname = agentConfig.get('checksd_hostname') or get_hostname( agentConfig) self.log = logging.getLogger('%s.%s' % (__name__, name)) self.aggregator = MetricsAggregator( self.hostname, formatter=agent_formatter, recent_point_threshold=agentConfig.get('recent_point_threshold', None), histogram_aggregates=agentConfig.get('histogram_aggregates'), histogram_percentiles=agentConfig.get('histogram_percentiles')) self.events = [] self.service_checks = [] if instances: jsoned_instances = json.dumps(instances) encrypted_passwd_list = re.findall('>>>.*?<<<', jsoned_instances) if encrypted_passwd_list: for encrypted_passwd in encrypted_passwd_list: decrypted_passwd = decrypted(encrypted_passwd) jsoned_instances = jsoned_instances.replace( encrypted_passwd, decrypted_passwd) self.instances = convert_to_str( json.loads(jsoned_instances, encoding='utf-8')) else: self.instances = instances else: self.instances = [] self.warnings = [] self.library_versions = None self.last_collection_time = defaultdict(int) self._instance_metadata = [] self.svc_metadata = [] self.historate_dict = {} def instance_count(self): return len(self.instances) def gauge(self, metric, value, tags=None, hostname=None, device_name=None, timestamp=None): self.aggregator.gauge(metric, value, tags, hostname, device_name, timestamp) def increment(self, metric, value=1, tags=None, hostname=None, 
device_name=None): self.aggregator.increment(metric, value, tags, hostname, device_name) def decrement(self, metric, value=-1, tags=None, hostname=None, device_name=None): self.aggregator.decrement(metric, value, tags, hostname, device_name) def count(self, metric, value=0, tags=None, hostname=None, device_name=None): self.aggregator.submit_count(metric, value, tags, hostname, device_name) def monotonic_count(self, metric, value=0, tags=None, hostname=None, device_name=None): self.aggregator.count_from_counter(metric, value, tags, hostname, device_name) def rate(self, metric, value, tags=None, hostname=None, device_name=None): self.aggregator.rate(metric, value, tags, hostname, device_name) def histogram(self, metric, value, tags=None, hostname=None, device_name=None): self.aggregator.histogram(metric, value, tags, hostname, device_name) @classmethod def generate_historate_func(cls, excluding_tags): def fct(self, metric, value, tags=None, hostname=None, device_name=None): cls.historate(self, metric, value, excluding_tags, tags=tags, hostname=hostname, device_name=device_name) return fct @classmethod def generate_histogram_func(cls, excluding_tags): def fct(self, metric, value, tags=None, hostname=None, device_name=None): tags = list(tags) for tag in list(tags): for exc_tag in excluding_tags: if tag.startswith(exc_tag + ":"): tags.remove(tag) cls.histogram(self, metric, value, tags=tags, hostname=hostname, device_name=device_name) return fct def historate(self, metric, value, excluding_tags, tags=None, hostname=None, device_name=None): tags = list(tags) context = [metric] if tags is not None: context.append("-".join(sorted(tags))) if hostname is not None: context.append("host:" + hostname) if device_name is not None: context.append("device:" + device_name) now = time.time() context = tuple(context) if context in self.historate_dict: if tags is not None: for tag in list(tags): for exc_tag in excluding_tags: if tag.startswith("{0}:".format(exc_tag)): tags.remove(tag) 
prev_value, prev_ts = self.historate_dict[context] rate = float(value - prev_value) / float(now - prev_ts) self.aggregator.histogram(metric, rate, tags, hostname, device_name) self.historate_dict[context] = (value, now) def set(self, metric, value, tags=None, hostname=None, device_name=None): self.aggregator.set(metric, value, tags, hostname, device_name) def event(self, event): if event.get('api_key') is None: event['api_key'] = self.agentConfig['api_key'] self.events.append(event) def service_check(self, check_name, status, tags=None, timestamp=None, hostname=None, check_run_id=None, message=None): if hostname is None: hostname = self.hostname if message is not None: message = unicode(message) self.service_checks.append( create_service_check(check_name, status, tags, timestamp, hostname, check_run_id, message)) def service_metadata(self, meta_name, value): self._instance_metadata.append((meta_name, unicode(value))) def has_events(self): return len(self.events) > 0 def get_metrics(self): return self.aggregator.flush() def get_events(self): events = self.events self.events = [] return events def get_service_checks(self): service_checks = self.service_checks self.service_checks = [] return service_checks def _roll_up_instance_metadata(self): self.svc_metadata.append( dict((k, v) for (k, v) in self._instance_metadata)) self._instance_metadata = [] def get_service_metadata(self): if self._instance_metadata: self._roll_up_instance_metadata() service_metadata = self.svc_metadata self.svc_metadata = [] return service_metadata def has_warnings(self): return len(self.warnings) > 0 def warning(self, warning_message): warning_message = str(warning_message) self.log.warning(warning_message) self.warnings.append(warning_message) def get_library_info(self): if self.library_versions is not None: return self.library_versions try: self.library_versions = self.get_library_versions() except NotImplementedError: pass def get_library_versions(self): raise NotImplementedError def 
get_warnings(self): warnings = self.warnings self.warnings = [] return warnings @staticmethod def _get_statistic_name_from_method(method_name): return method_name[4:] if method_name.startswith( 'get_') else method_name @staticmethod def _collect_internal_stats(methods=None): current_process = psutil.Process(os.getpid()) methods = methods or DEFAULT_PSUTIL_METHODS filtered_methods = [m for m in methods if hasattr(current_process, m)] stats = {} for method in filtered_methods: stat_name = AgentCheck._get_statistic_name_from_method(method) try: raw_stats = getattr(current_process, method)() try: stats[stat_name] = raw_stats._asdict() except AttributeError: if isinstance(raw_stats, numbers.Number): stats[stat_name] = raw_stats else: log.warn( "Could not serialize output of {0} to dict".format( method)) except psutil.AccessDenied: log.warn("Cannot call psutil method {} : Access Denied".format( method)) return stats def _set_internal_profiling_stats(self, before, after): self._internal_profiling_stats = {'before': before, 'after': after} def _get_internal_profiling_stats(self): stats = self._internal_profiling_stats self._internal_profiling_stats = None return stats def run(self): before, after = None, None if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME: try: before = AgentCheck._collect_internal_stats() except Exception: self.log.debug( "Failed to collect Agent Stats before check {0}".format( self.name)) instance_statuses = [] for i, instance in enumerate(self.instances): try: min_collection_interval = instance.get( 'min_collection_interval', self.init_config.get('min_collection_interval', self.DEFAULT_MIN_COLLECTION_INTERVAL)) now = time.time() if now - self.last_collection_time[i] < min_collection_interval: self.log.debug( "Not running instance #{0} of check {1} as it ran less than {2}s ago" .format(i, self.name, min_collection_interval)) continue self.last_collection_time[i] = now check_start_time = None if self.in_developer_mode: 
check_start_time = timeit.default_timer() self.check(copy.deepcopy(instance)) instance_check_stats = None if check_start_time is not None: instance_check_stats = { 'run_time': timeit.default_timer() - check_start_time } if self.has_warnings(): instance_status = check_status.InstanceStatus( i, check_status.STATUS_WARNING, warnings=self.get_warnings(), instance_check_stats=instance_check_stats) else: instance_status = check_status.InstanceStatus( i, check_status.STATUS_OK, instance_check_stats=instance_check_stats) except Exception, e: self.log.exception("Check '%s' instance #%s failed" % (self.name, i)) instance_status = check_status.InstanceStatus( i, check_status.STATUS_ERROR, error=str(e), tb=str(e)) finally:
def __init__(self, name, init_config, agentConfig, instances=None):
    """ Initialize a new check.

    :param name: The name of the check
    :param init_config: The config for initializing the check
    :param agentConfig: The global configuration for the agent
    :param instances: A list of configuration objects for each instance.
    """
    # Imported lazily — presumably to avoid a circular import at module
    # load time (TODO confirm against the aggregator module).
    from aggregator import MetricsAggregator

    # NOTE(review): the second statement rebinds _enabled_checks as an
    # *instance* attribute, so the shared class-level list keeps any
    # duplicates appended by other instances.
    self._enabled_checks.append(name)
    self._enabled_checks = list(set(self._enabled_checks))

    self.name = name
    self.init_config = init_config or {}
    self.agentConfig = agentConfig
    # Developer mode is only enabled when psutil is importable, since the
    # profiling statistics are collected through psutil.
    self.in_developer_mode = agentConfig.get('developer_mode') and psutil
    self._internal_profiling_stats = None
    self.default_integration_http_timeout = float(
        agentConfig.get('default_integration_http_timeout', 9))

    self.hostname = agentConfig.get('checksd_hostname') or get_hostname(
        agentConfig)
    self.log = logging.getLogger('%s.%s' % (__name__, name))

    self.min_collection_interval = self.init_config.get(
        'min_collection_interval', self.DEFAULT_MIN_COLLECTION_INTERVAL)

    # Points must outlive the collection interval plus the default expiry
    # window, so slow checks don't lose samples between runs.
    self.aggregator = MetricsAggregator(
        self.hostname,
        expiry_seconds=self.min_collection_interval +
        self.DEFAULT_EXPIRY_SECONDS,
        formatter=agent_formatter,
        recent_point_threshold=agentConfig.get('recent_point_threshold',
                                               None),
        histogram_aggregates=agentConfig.get('histogram_aggregates'),
        histogram_percentiles=agentConfig.get('histogram_percentiles'))

    self.events = []
    self.service_checks = []
    self.instances = instances or []
    self.warnings = []
    self.library_versions = None
    # Per-instance-index timestamp of the last successful collection.
    self.last_collection_time = defaultdict(int)
    self._instance_metadata = []
    self.svc_metadata = []
    self.historate_dict = {}

    # Set proxy settings
    self.proxy_settings = get_proxy(self.agentConfig)
    self._use_proxy = False if init_config is None else init_config.get(
        "use_agent_proxy", True)
    self.proxies = {
        "http": None,
        "https": None,
    }

    if self.proxy_settings and self._use_proxy:
        uri = "{host}:{port}".format(host=self.proxy_settings['host'],
                                     port=self.proxy_settings['port'])
        # Embed credentials in the URI only when both user and password
        # are present.
        if self.proxy_settings['user'] and self.proxy_settings['password']:
            uri = "{user}:{password}@{uri}".format(
                user=self.proxy_settings['user'],
                password=self.proxy_settings['password'],
                uri=uri)
        self.proxies['http'] = "http://{uri}".format(uri=uri)
        self.proxies['https'] = "https://{uri}".format(uri=uri)
def test_rate_errors(self):
    """Rates produce no point when the value delta is negative or the
    time interval is zero."""
    stats = MetricsAggregator('myhost')
    stats.submit_packets('my.rate:10|_dd-r')
    # Sleep 1 second so the time interval > 0 (timestamp is converted to an int)
    time.sleep(1)
    stats.submit_packets('my.rate:9|_dd-r')
    # Since the difference < 0 we shouldn't get a value
    nt.assert_equal(len(stats.flush()), 0)

    # Trying to have the times be the same
    stats.submit_packets('my.rate:10|_dd-r')
    stats.submit_packets('my.rate:40|_dd-r')
    nt.assert_equal(len(stats.flush()), 0)
def setUp(self):
    # Fresh aggregator for every test so no state leaks between cases.
    self.aggr = MetricsAggregator('test-aggr')
def test_event_title(self):
    """Event titles: empty, unicode, embedded spaces/pipes, and escaped
    newlines (which stay escaped in titles)."""
    stats = MetricsAggregator('myhost')
    stats.submit_packets('_e{0,4}:|text')
    stats.submit_packets(u'_e{9,4}:2intitulé|text')
    stats.submit_packets('_e{14,4}:3title content|text')
    stats.submit_packets('_e{14,4}:4title|content|text')
    # \n stays escaped
    stats.submit_packets('_e{13,4}:5title\\ntitle|text')

    events = self.sort_events(stats.flush_events())
    assert len(events) == 5

    expected_titles = [
        '',
        u'2intitulé',
        '3title content',
        '4title|content',
        '5title\\ntitle',
    ]
    for event, title in zip(events, expected_titles):
        nt.assert_equal(event['msg_title'], title)