class TestSummary(unittest.TestCase):
    """Tests for the Summary metric type (count/sum samples and timers)."""

    def setUp(self):
        # Fresh registry per test so sample values never leak between tests.
        self.registry = CollectorRegistry()
        self.summary = Summary('s', 'help', registry=self.registry)

    def test_summary(self):
        """observe() increments _count by 1 and _sum by the observed value."""
        self.assertEqual(0, self.registry.get_sample_value('s_count'))
        self.assertEqual(0, self.registry.get_sample_value('s_sum'))
        self.summary.observe(10)
        self.assertEqual(1, self.registry.get_sample_value('s_count'))
        self.assertEqual(10, self.registry.get_sample_value('s_sum'))

    def test_function_decorator(self):
        """time() used as a decorator records one observation per call."""
        self.assertEqual(0, self.registry.get_sample_value('s_count'))

        @self.summary.time()
        def f():
            pass

        # The decorator must preserve the wrapped function's signature.
        # NOTE: inspect.getargspec was deprecated since 3.0 and removed in
        # Python 3.11 -- use getfullargspec (7-element result) instead.
        self.assertEqual(([], None, None, None, [], None, {}),
                         inspect.getfullargspec(f))

        f()
        self.assertEqual(1, self.registry.get_sample_value('s_count'))

    def test_block_decorator(self):
        """time() used as a context manager records one observation."""
        self.assertEqual(0, self.registry.get_sample_value('s_count'))
        with self.summary.time():
            pass
        self.assertEqual(1, self.registry.get_sample_value('s_count'))
def test_timer_not_observable(self):
    """time() on a labelled metric without label values must raise.

    The original try/except silently passed when no exception was raised
    at all; assertRaises makes the test fail in that case.
    """
    s = Summary('test', 'help', labelnames=('label',), registry=self.registry)
    with self.assertRaises(ValueError) as ctx:
        s.time()
    self.assertIn('missing label values', str(ctx.exception))
class TestSummary(unittest.TestCase):
    """Tests for the Summary metric type, including multi-threaded timers."""

    def setUp(self):
        # Fresh registry per test so sample values never leak between tests.
        self.registry = CollectorRegistry()
        self.summary = Summary('s', 'help', registry=self.registry)

    def test_summary(self):
        """observe() increments _count by 1 and _sum by the observed value."""
        self.assertEqual(0, self.registry.get_sample_value('s_count'))
        self.assertEqual(0, self.registry.get_sample_value('s_sum'))
        self.summary.observe(10)
        self.assertEqual(1, self.registry.get_sample_value('s_count'))
        self.assertEqual(10, self.registry.get_sample_value('s_sum'))

    def test_function_decorator(self):
        """time() used as a decorator records one observation per call."""
        self.assertEqual(0, self.registry.get_sample_value('s_count'))

        @self.summary.time()
        def f():
            pass

        # The decorator must preserve the wrapped function's signature.
        # NOTE: inspect.getargspec was deprecated since 3.0 and removed in
        # Python 3.11 -- use getfullargspec (7-element result) instead.
        self.assertEqual(([], None, None, None, [], None, {}),
                         inspect.getfullargspec(f))

        f()
        self.assertEqual(1, self.registry.get_sample_value('s_count'))

    def test_function_decorator_multithread(self):
        """Timers used concurrently from several threads must not interfere."""
        self.assertEqual(0, self.registry.get_sample_value('s_count'))
        summary2 = Summary('s2', 'help', registry=self.registry)
        workers = 3
        duration = 0.1
        pool = ThreadPoolExecutor(max_workers=workers)

        @self.summary.time()
        def f():
            time.sleep(duration / 2)
            # Testing that different instances of timer do not interfere
            summary2.time()(lambda: time.sleep(duration / 2))()

        jobs = workers * 3
        for _ in range(jobs):
            pool.submit(f)
        pool.shutdown(wait=True)

        self.assertEqual(jobs, self.registry.get_sample_value('s_count'))

        # Allow ~10% slack below the theoretical total for scheduler jitter.
        rounding_coefficient = 0.9
        total_expected_duration = jobs * duration * rounding_coefficient
        self.assertLess(total_expected_duration,
                        self.registry.get_sample_value('s_sum'))
        self.assertLess(total_expected_duration / 2,
                        self.registry.get_sample_value('s2_sum'))

    def test_block_decorator(self):
        """time() used as a context manager records one observation."""
        self.assertEqual(0, self.registry.get_sample_value('s_count'))
        with self.summary.time():
            pass
        self.assertEqual(1, self.registry.get_sample_value('s_count'))
def test_duplicate_metrics_raises(self):
    """Registering a metric whose name, or any of its generated sample
    names (_created, _sum, _count, _bucket, _info), collides with an
    already-registered metric must raise ValueError."""
    registry = CollectorRegistry()
    Counter('c_total', 'help', registry=registry)
    self.assertRaises(ValueError, Counter, 'c_total', 'help', registry=registry)
    self.assertRaises(ValueError, Gauge, 'c_total', 'help', registry=registry)
    # 'c_created' is a sample name generated by the counter above.
    self.assertRaises(ValueError, Gauge, 'c_created', 'help', registry=registry)

    Gauge('g_created', 'help', registry=registry)
    self.assertRaises(ValueError, Gauge, 'g_created', 'help', registry=registry)
    # A counter 'g' would generate 'g_created', clashing with the gauge.
    self.assertRaises(ValueError, Counter, 'g', 'help', registry=registry)

    Summary('s', 'help', registry=registry)
    self.assertRaises(ValueError, Summary, 's', 'help', registry=registry)
    self.assertRaises(ValueError, Gauge, 's_created', 'help', registry=registry)
    self.assertRaises(ValueError, Gauge, 's_sum', 'help', registry=registry)
    self.assertRaises(ValueError, Gauge, 's_count', 'help', registry=registry)
    # We don't currently expose quantiles, but let's prevent future
    # clashes anyway.
    self.assertRaises(ValueError, Gauge, 's', 'help', registry=registry)

    Histogram('h', 'help', registry=registry)
    self.assertRaises(ValueError, Histogram, 'h', 'help', registry=registry)
    # Clashes against various suffixes.
    self.assertRaises(ValueError, Summary, 'h', 'help', registry=registry)
    self.assertRaises(ValueError, Gauge, 'h_count', 'help', registry=registry)
    self.assertRaises(ValueError, Gauge, 'h_sum', 'help', registry=registry)
    self.assertRaises(ValueError, Gauge, 'h_bucket', 'help', registry=registry)
    self.assertRaises(ValueError, Gauge, 'h_created', 'help', registry=registry)
    # The name of the histogram itself is also taken.
    self.assertRaises(ValueError, Gauge, 'h', 'help', registry=registry)

    Info('i', 'help', registry=registry)
    # An info metric reserves its '_info' sample name.
    self.assertRaises(ValueError, Gauge, 'i_info', 'help', registry=registry)
def test_function_decorator_multithread(self):
    """Timers used from several threads at once must record every call."""
    self.assertEqual(0, self.registry.get_sample_value('s_count'))
    other_summary = Summary('s2', 'help', registry=self.registry)

    thread_count = 3
    sleep_time = 0.1
    executor = ThreadPoolExecutor(max_workers=thread_count)

    @self.summary.time()
    def timed_job():
        time.sleep(sleep_time / 2)
        # Distinct Timer instances must not interfere with each other.
        other_summary.time()(lambda: time.sleep(sleep_time / 2))()

    job_count = thread_count * 3
    for _ in range(job_count):
        executor.submit(timed_job)
    executor.shutdown(wait=True)

    self.assertEqual(job_count, self.registry.get_sample_value('s_count'))

    # Tolerate ~10% below the theoretical total for timer rounding/jitter.
    slack = 0.9
    expected_total = job_count * sleep_time * slack
    self.assertLess(expected_total, self.registry.get_sample_value('s_sum'))
    self.assertLess(expected_total / 2,
                    self.registry.get_sample_value('s2_sum'))
def test_summary_adds(self):
    """Two instances of the same summary contribute to shared samples."""
    first = Summary('s', 'help', registry=None)
    second = Summary('s', 'help', registry=None)

    # Nothing observed yet.
    self.assertEqual(0, self.registry.get_sample_value('s_count'))
    self.assertEqual(0, self.registry.get_sample_value('s_sum'))

    first.observe(1)
    second.observe(2)

    # Both observations are aggregated: 2 events totalling 3.
    self.assertEqual(2, self.registry.get_sample_value('s_count'))
    self.assertEqual(3, self.registry.get_sample_value('s_sum'))
def test_restricted_registry(self):
    """restricted_registry() exposes only the requested sample names."""
    registry = CollectorRegistry()
    Counter('c_total', 'help', registry=registry)
    Summary('s', 'help', registry=registry).observe(7)

    # Expect exactly the s_sum sample; the counter must be filtered out.
    expected = Metric('s', 'help', 'summary')
    expected.samples = [Sample('s_sum', {}, 7)]
    restricted = registry.restricted_registry(['s_sum'])
    self.assertEqual([expected], restricted.collect())
def test_timer_not_observable(self):
    """Timing a labelled metric without label values is rejected."""
    labelled = Summary('test', 'help', labelnames=('label',),
                       registry=self.registry)

    def use_timer_block():
        # Entering the context manager should fail: no labels() call yet.
        with labelled.time():
            pass

    assert_not_observable(use_timer_block)
def test_summary_adds(self):
    """Summaries from two (simulated) processes aggregate their samples."""
    first = Summary('s', 'help', registry=None)
    # Switch the value backend to a second simulated pid before creating
    # the second instance.
    values.ValueClass = MultiProcessValue(lambda: 456)
    second = Summary('s', 'help', registry=None)

    self.assertEqual(0, self.registry.get_sample_value('s_count'))
    self.assertEqual(0, self.registry.get_sample_value('s_sum'))

    first.observe(1)
    second.observe(2)

    # Observations from both "processes" are merged: 2 events, sum 3.
    self.assertEqual(2, self.registry.get_sample_value('s_count'))
    self.assertEqual(3, self.registry.get_sample_value('s_sum'))
def test_unregister_works(self):
    """Unregistering a metric frees its reserved sample names."""
    registry = CollectorRegistry()
    summary = Summary('s', 'help', registry=registry)

    # 's_count' is reserved while the summary is registered.
    self.assertRaises(ValueError, Gauge, 's_count', 'help', registry=registry)

    registry.unregister(summary)

    # After unregistering, the name is available again.
    Gauge('s_count', 'help', registry=registry)
def test_target_info_restricted_registry(self):
    """Restricted registries can select normal samples and target_info."""
    registry = CollectorRegistry(target_info={'foo': 'bar'})
    Summary('s', 'help', registry=registry).observe(7)

    # Restricting to s_sum yields only the summary's sum sample.
    expected_summary = Metric('s', 'help', 'summary')
    expected_summary.samples = [Sample('s_sum', {}, 7)]
    self.assertEqual([expected_summary],
                     registry.restricted_registry(['s_sum']).collect())

    # Restricting to target_info yields only the target metadata.
    expected_target = Metric('target', 'Target metadata', 'info')
    expected_target.samples = [Sample('target_info', {'foo': 'bar'}, 1)]
    self.assertEqual([expected_target],
                     registry.restricted_registry(['target_info']).collect())
def test_reset_registry_with_labels(self):
    """registry.reset() zeroes every sample of every labelled child.

    Populates one labelled child ('l'='a') of each metric type, verifies
    the expected sample values, resets, and verifies everything reads 0
    (metrics stay registered; only their values are cleared).
    """
    registry = CollectorRegistry()

    gauge = Gauge('g', 'help', ['l'], registry=registry)
    gauge.labels('a').inc()
    self.assertEqual(1, registry.get_sample_value('g', {'l': 'a'}))

    counter = Counter('c_total', 'help', ['l'], registry=registry)
    counter.labels('a').inc()
    self.assertEqual(1, registry.get_sample_value('c_total', {'l': 'a'}))

    summary = Summary('s', 'help', ['l'], registry=registry)
    summary.labels('a').observe(10)
    self.assertEqual(1, registry.get_sample_value('s_count', {'l': 'a'}))
    self.assertEqual(10, registry.get_sample_value('s_sum', {'l': 'a'}))

    histogram = Histogram('h', 'help', ['l'], registry=registry)
    histogram.labels('a').observe(2)
    # 2 lands in every bucket with le >= 2.5 (cumulative buckets).
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '1.0', 'l': 'a'}))
    self.assertEqual(1, registry.get_sample_value('h_bucket', {'le': '2.5', 'l': 'a'}))
    self.assertEqual(1, registry.get_sample_value('h_bucket', {'le': '5.0', 'l': 'a'}))
    self.assertEqual(1, registry.get_sample_value('h_bucket', {'le': '+Inf', 'l': 'a'}))
    self.assertEqual(1, registry.get_sample_value('h_count', {'l': 'a'}))
    self.assertEqual(2, registry.get_sample_value('h_sum', {'l': 'a'}))

    registry.reset()

    # After reset every sample (including all buckets) reads 0, not None.
    self.assertEqual(0, registry.get_sample_value('g', {'l': 'a'}))
    self.assertEqual(0, registry.get_sample_value('c_total', {'l': 'a'}))
    self.assertEqual(0, registry.get_sample_value('s_count', {'l': 'a'}))
    self.assertEqual(0, registry.get_sample_value('s_sum', {'l': 'a'}))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '1.0', 'l': 'a'}))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '2.5', 'l': 'a'}))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '5.0', 'l': 'a'}))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '+Inf', 'l': 'a'}))
    self.assertEqual(0, registry.get_sample_value('h_count', {'l': 'a'}))
    self.assertEqual(0, registry.get_sample_value('h_sum', {'l': 'a'}))
def test_restricted_registry_does_not_call_extra(self):
    """Collectors with no matching samples must not be collect()ed."""
    from unittest.mock import MagicMock

    registry = CollectorRegistry()
    unrelated_collector = MagicMock()
    unrelated_collector.describe.return_value = [
        Metric('foo', 'help', 'summary')]
    registry.register(unrelated_collector)

    Summary('s', 'help', registry=registry).observe(7)

    expected = Metric('s', 'help', 'summary')
    expected.samples = [Sample('s_sum', {}, 7)]
    collected = list(registry.restricted_registry(['s_sum']).collect())
    self.assertEqual([expected], collected)

    # The restriction is based on describe(); the mock's collect() must
    # never have been invoked.
    unrelated_collector.collect.assert_not_called()
def test_restricted_registry_does_not_yield_while_locked(self):
    """The registry lock must be released before collect() yields."""
    registry = CollectorRegistry(target_info={'foo': 'bar'})
    Summary('s', 'help', registry=registry).observe(7)

    expected = Metric('s', 'help', 'summary')
    expected.samples = [Sample('s_sum', {}, 7)]
    self.assertEqual(
        [expected],
        list(registry.restricted_registry(['s_sum']).collect()))

    expected = Metric('target', 'Target metadata', 'info')
    expected.samples = [Sample('target_info', {'foo': 'bar'}, 1)]

    # While iterating, the registry's internal lock must not be held,
    # otherwise a collector touching the registry would deadlock.
    restricted = registry.restricted_registry(['target_info', 's_sum'])
    for _ in restricted.collect():
        self.assertFalse(registry._lock.locked())
def setUp(self):
    # Fresh registry and summary per test so sample values never leak
    # between test methods.
    self.registry = CollectorRegistry()
    self.summary = Summary('s', 'help', registry=self.registry)
def test_reset_registry(self):
    """registry.reset() zeroes all samples and the metrics remain usable.

    Populates one metric of each type, verifies the samples, resets and
    verifies all samples read 0; then repeats the populate/reset cycle to
    prove the metrics still accumulate correctly after a reset.
    """
    registry = CollectorRegistry()

    gauge = Gauge('g', 'help', registry=registry)
    gauge.inc()
    self.assertEqual(1, registry.get_sample_value('g'))

    counter = Counter('c_total', 'help', registry=registry)
    counter.inc()
    self.assertEqual(1, registry.get_sample_value('c_total'))

    summary = Summary('s', 'help', registry=registry)
    summary.observe(10)
    self.assertEqual(1, registry.get_sample_value('s_count'))
    self.assertEqual(10, registry.get_sample_value('s_sum'))

    histogram = Histogram('h', 'help', registry=registry)
    histogram.observe(2)
    # 2 lands in every bucket with le >= 2.5 (buckets are cumulative).
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '1.0'}))
    self.assertEqual(1, registry.get_sample_value('h_bucket', {'le': '2.5'}))
    self.assertEqual(1, registry.get_sample_value('h_bucket', {'le': '5.0'}))
    self.assertEqual(1, registry.get_sample_value('h_bucket', {'le': '+Inf'}))
    self.assertEqual(1, registry.get_sample_value('h_count'))
    self.assertEqual(2, registry.get_sample_value('h_sum'))

    registry.reset()

    # After reset every sample reads 0 (metrics stay registered).
    self.assertEqual(0, registry.get_sample_value('g'))
    self.assertEqual(0, registry.get_sample_value('c_total'))
    self.assertEqual(0, registry.get_sample_value('s_count'))
    self.assertEqual(0, registry.get_sample_value('s_sum'))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '1.0'}))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '2.5'}))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '5.0'}))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '+Inf'}))
    self.assertEqual(0, registry.get_sample_value('h_count'))
    self.assertEqual(0, registry.get_sample_value('h_sum'))

    # --------------------------
    # Second round: the same metric objects keep working after reset.
    gauge.inc()
    gauge.inc()
    gauge.inc()
    gauge.dec()
    self.assertEqual(2, registry.get_sample_value('g'))

    counter.inc()
    counter.inc()
    counter.inc()
    self.assertEqual(3, registry.get_sample_value('c_total'))

    summary.observe(10)
    summary.observe(5)
    self.assertEqual(2, registry.get_sample_value('s_count'))
    self.assertEqual(15, registry.get_sample_value('s_sum'))

    histogram.observe(2)
    histogram.observe(6)
    histogram.observe(1)
    self.assertEqual(1, registry.get_sample_value('h_bucket', {'le': '1.0'}))
    self.assertEqual(2, registry.get_sample_value('h_bucket', {'le': '2.5'}))
    self.assertEqual(2, registry.get_sample_value('h_bucket', {'le': '5.0'}))
    self.assertEqual(3, registry.get_sample_value('h_bucket', {'le': '+Inf'}))
    self.assertEqual(3, registry.get_sample_value('h_count'))
    self.assertEqual(9, registry.get_sample_value('h_sum'))

    registry.reset()

    # And a second reset zeroes everything again.
    self.assertEqual(0, registry.get_sample_value('g'))
    self.assertEqual(0, registry.get_sample_value('c_total'))
    self.assertEqual(0, registry.get_sample_value('s_count'))
    self.assertEqual(0, registry.get_sample_value('s_sum'))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '1.0'}))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '2.5'}))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '5.0'}))
    self.assertEqual(0, registry.get_sample_value('h_bucket', {'le': '+Inf'}))
    self.assertEqual(0, registry.get_sample_value('h_count'))
    self.assertEqual(0, registry.get_sample_value('h_sum'))
def process_datapoint(self, datapoint):
    """Convert one Druid datapoint into a Prometheus metric observation.

    Skips datapoints that are not metrics, come from unsupported daemons,
    or name unsupported metrics. Lazily creates the Prometheus metric
    object on first use (cached in the config dict under '_metric_').
    """
    if datapoint['feed'] != 'metrics':
        log.debug(
            "'feed' field is not 'metrics' in datapoint, skipping: {}".
            format(datapoint))
        return

    daemon = str(datapoint['service']).replace('druid/', '').lower()
    if daemon not in self.supported_metrics:
        log.debug("daemon '{}' is not supported, skipping: {}".format(
            daemon, datapoint))
        return

    metric_name = str(datapoint['metric'])
    if metric_name not in self.supported_metrics[daemon]:
        log.debug("metric '{}' is not supported, skipping: {}".format(
            datapoint['metric'], datapoint))
        return

    config = self.supported_metrics[daemon][metric_name]
    config.setdefault('labels', [])
    config.setdefault('type', 'gauge')
    config.setdefault('suffix', '_count')

    metric_type = config['type']
    if metric_type == 'skip':
        return

    metric_name = self._get_metric_name(daemon, metric_name, config)
    metric_value = float(datapoint['value'])
    # Labels are sorted so the declaration order matches label_values below.
    metric_labels = tuple(sorted(config['labels'] + ['host']))
    label_values = tuple(
        datapoint[label_name] for label_name in metric_labels)

    # Create the Prometheus metric object lazily, once per config entry.
    # (Normalized to a single if/elif chain; the original mixed bare
    # 'if's into the chain, which only worked because the types are
    # mutually exclusive.)
    if '_metric_' not in config:
        if metric_type == 'counter':
            config['_metric_'] = Counter(
                metric_name, metric_name, metric_labels)
        elif metric_type == 'gauge':
            config['_metric_'] = Gauge(
                metric_name, metric_name, metric_labels)
        elif metric_type == 'summary':
            config['_metric_'] = Summary(
                metric_name, metric_name, metric_labels)
        elif metric_type == 'histogram':
            config['_metric_'] = Histogram(
                metric_name, metric_name, metric_labels,
                buckets=config['buckets'])

    metric = config['_metric_']
    if len(metric_labels) > 0:
        metric = metric.labels(*label_values)

    if metric_type == 'counter':
        metric.inc(metric_value)
    elif metric_type == 'gauge':
        metric.set(metric_value)
    elif metric_type == 'summary':
        metric.observe(metric_value)
    elif metric_type == 'histogram':
        metric.observe(metric_value)

    self.datapoints_processed.inc()
class DruidCollector(object):
    """Collects Druid daemon datapoints and exposes them as Prometheus metrics.

    Datapoints arrive via register_datapoint (HTTP emitter) or are pulled
    from a Kafka topic; both paths push onto a single queue consumed by one
    worker thread, so the internal dictionaries need no extra locking.
    """

    scrape_duration = Summary('druid_scrape_duration_seconds',
                              'Druid scrape duration')

    def __init__(self, metrics_config, kafka_config=None):
        # The ingestion of the datapoints is separated from their processing,
        # to separate concerns and avoid unnecessary slowdowns for Druid
        # daemons sending data.
        # Only one thread de-queues and processes datapoints; in this way we
        # don't really need any special locking to guarantee consistency.
        # Since this thread is not I/O bound it doesn't seem the case to
        # use a gevent's greenlet, but more tests might prove the contrary.
        self.datapoints_queue = queue.Queue()
        self.stop_threads = threading.Event()
        threading.Thread(target=self.process_queued_datapoints,
                         args=(self.stop_threads,)).start()

        # If a Kafka config is provided, create a dedicated thread that
        # pulls datapoints from a Kafka topic and pushes them onto the same
        # queue used by the HTTP server. In this way the exporter allows a
        # mixed configuration for Druid Brokers between HTTPEmitter and
        # KafkaEmitter (for daemons emitting too many datapoints/s).
        if kafka_config:
            if KafkaConsumer:
                threading.Thread(
                    target=self.pull_datapoints_from_kafka,
                    args=(kafka_config, self.stop_threads)).start()
            else:
                log.error('A Kafka configuration was provided, but it seems '
                          'that the Kafka client library is not available. '
                          'Please install the correct dependencies.')

        # Datapoints successfully registered.
        self.datapoints_registered = 0

        # Histogram data.
        # Format: {metric_name: {daemon: {label_tuple: {bucket: value, ...}}}}
        self.histograms = defaultdict(lambda: {})

        # Counter/gauge data.
        # Format: {metric_name: {daemon: {label_tuple: value}}}
        # The order of the labels listed in the metrics config is important
        # since it is reflected in this data structure. The layering is not
        # strictly important for the final prometheus metrics but it
        # simplifies the code that creates them (collect method).
        self.counters = defaultdict(lambda: {})

        # List of metrics to collect/expose via the exporter.
        self.metrics_config = metrics_config
        self.supported_daemons = list(self.metrics_config.keys())

    def stop_running_threads(self):
        """Signal the processing and Kafka threads to shut down."""
        self.stop_threads.set()

    @staticmethod
    def sanitize_field(datapoint_field):
        """Normalize a Druid service name, e.g. 'druid/broker' -> 'broker'."""
        return datapoint_field.replace('druid/', '').lower()

    def store_counter(self, datapoint):
        """
        This function adds data to the self.counters dictionary following
        its convention, creating on the fly the missing bits. For example,
        given:
        self.counters = {}
        datapoint = {'service': 'druid/broker', 'metric'='segment/size',
                     'datasource': 'test', 'value': 10}

        This function creates the following:
        self.counters = {'segment/size': {'broker': {('test'): 10}}}

        The algorithm is generic enough to support all metrics handled by
        self.counters without caring about the number of labels needed.
        """
        daemon = DruidCollector.sanitize_field(str(datapoint['service']))
        metric_name = str(datapoint['metric'])
        metric_value = float(datapoint['value'])
        metrics_storage = self.counters[metric_name]
        metric_labels = self.metrics_config[daemon][metric_name]['labels']
        metrics_storage.setdefault(daemon, {})
        label_values = []
        if metric_labels:
            for label in metric_labels:
                try:
                    label_values.append(str(datapoint[label]))
                except KeyError:
                    log.error(
                        'Missing label {} for datapoint {} (expected labels: {}), '
                        'dropping it. Please check your metric configuration file.'
                        .format(label, datapoint, metric_labels))
                    return
        # Convert the list of labels to a tuple to allow indexing.
        metrics_storage[daemon][tuple(label_values)] = metric_value
        log.debug("The datapoint {} modified the counters dictionary to: \n{}".
                  format(datapoint, self.counters))

    def store_histogram(self, datapoint):
        """
        Store datapoints that will end up in histogram buckets using a
        dictionary. This function is highly customized for the only
        histograms configured so far, rather than being generic like
        store_counter.

        Example of how it works:
        self.histograms = {}
        datapoint = {'service': 'druid/broker', 'metric'='query/time',
                     'datasource': 'test', 'value': 10}

        This function creates the following:
        self.histograms = {'query/time': {'broker': {
            ('test'): {'10': 1, '100': 1, etc.., 'sum': 10}}}}
        """
        daemon = DruidCollector.sanitize_field(str(datapoint['service']))
        metric_name = str(datapoint['metric'])
        metric_value = float(datapoint['value'])
        metric_labels = self.metrics_config[daemon][metric_name]['labels']
        metric_buckets = self.metrics_config[daemon][metric_name]['buckets']
        self.histograms.setdefault(metric_name, {daemon: {}})
        self.histograms[metric_name].setdefault(daemon, {})
        label_values = []
        if metric_labels:
            for label in metric_labels:
                try:
                    label_values.append(str(datapoint[label]))
                except KeyError:
                    # BUG FIX: the format arguments were passed in the order
                    # (label, metric_labels, datapoint), which did not match
                    # the message placeholders. Now matches store_counter.
                    log.error(
                        'Missing label {} for datapoint {} (expected labels: {}), '
                        'dropping it. Please check your metric configuration file.'
                        .format(label, datapoint, metric_labels))
                    return
        # Convert the list of labels to a tuple to allow indexing.
        self.histograms[metric_name][daemon].setdefault(
            tuple(label_values), {})
        stored_buckets = self.histograms[metric_name][daemon][tuple(
            label_values)]
        for bucket in metric_buckets:
            if bucket not in stored_buckets:
                stored_buckets[bucket] = 0
            # Buckets are cumulative: every bucket >= the value is bumped.
            if bucket != 'sum' and metric_value <= float(bucket):
                stored_buckets[bucket] += 1
        stored_buckets['sum'] += metric_value
        log.debug(
            "The datapoint {} modified the histograms dictionary to: \n{}".
            format(datapoint, self.histograms))

    @scrape_duration.time()
    def collect(self):
        # Loop through all metrics configured, and get datapoints
        # for them saved by the exporter.
        for daemon in self.metrics_config.keys():
            for druid_metric_name in self.metrics_config[daemon]:
                metric_type = self.metrics_config[daemon][
                    druid_metric_name]['type']

                if metric_type == 'gauge' or metric_type == 'counter':
                    # NOTE: self.counters is a defaultdict, so the first
                    # lookup never raises (it inserts {}); only the daemon
                    # lookup can raise KeyError.
                    try:
                        self.counters[druid_metric_name]
                        self.counters[druid_metric_name][daemon]
                    except KeyError:
                        continue

                    if metric_type == 'gauge':
                        metric_family_obj = GaugeMetricFamily
                    else:
                        metric_family_obj = CounterMetricFamily

                    prometheus_metric = metric_family_obj(
                        self.metrics_config[daemon][druid_metric_name]
                        ['prometheus_metric_name'],
                        self.metrics_config[daemon][druid_metric_name]
                        ['description'],
                        labels=[l.lower() for l in self.metrics_config[
                            daemon][druid_metric_name]['labels']])
                    label_values = list(
                        self.counters[druid_metric_name][daemon].keys())
                    for label_value in label_values:
                        value = self.counters[druid_metric_name][daemon][
                            label_value]
                        prometheus_metric.add_metric(label_value, value)

                elif metric_type == 'histogram':
                    try:
                        self.histograms[druid_metric_name]
                        self.histograms[druid_metric_name][daemon]
                    except KeyError:
                        continue

                    prometheus_metric = HistogramMetricFamily(
                        self.metrics_config[daemon][druid_metric_name]
                        ['prometheus_metric_name'],
                        self.metrics_config[daemon][druid_metric_name]
                        ['description'],
                        labels=[l.lower() for l in self.metrics_config[
                            daemon][druid_metric_name]['labels']])
                    label_values = list(
                        self.histograms[druid_metric_name][daemon].keys())
                    for label_value in label_values:
                        stored = self.histograms[druid_metric_name][daemon][
                            label_value]
                        # 'sum' is stored alongside the buckets; split it out.
                        buckets_without_sum = [
                            [bucket, count]
                            for bucket, count in stored.items()
                            if bucket != 'sum']
                        prometheus_metric.add_metric(
                            label_value, buckets=buckets_without_sum,
                            sum_value=stored['sum'])

                else:
                    log.info(
                        'metric type not supported: {}'.format(metric_type))
                    continue

                yield prometheus_metric

        registered = CounterMetricFamily(
            'druid_exporter_datapoints_registered',
            'Number of datapoints successfully registered '
            'by the exporter.')
        registered.add_metric([], self.datapoints_registered)
        yield registered

    def register_datapoint(self, datapoint):
        """Validate a datapoint and enqueue it for processing."""
        if datapoint['feed'] != 'metrics':
            log.debug("The following feed does not contain a datapoint, "
                      "dropping it: {}".format(datapoint))
            return
        daemon = DruidCollector.sanitize_field(str(datapoint['service']))
        # 'feed' was already verified above, so only daemon/metric remain.
        if (daemon not in self.supported_daemons or
                datapoint['metric'] not in self.metrics_config[daemon].keys()):
            log.debug(
                "The following datapoint is not supported, either "
                "because the 'feed' field is not 'metrics' or "
                "the daemon name ({}) is not listed in the supported ones ({}) or "
                "the metric itself is not listed in the exporter's config file: {}"
                .format(daemon, self.supported_daemons, datapoint))
            return
        self.datapoints_queue.put((daemon, datapoint))

    def process_queued_datapoints(self, stop_threads):
        """Worker loop: de-queue datapoints and store them (single thread)."""
        log.debug('Process datapoints thread starting..')
        # is_set() replaces the deprecated isSet() alias (removed in 3.12);
        # 'while True and not ...' simplified to 'while not ...'.
        while not stop_threads.is_set():
            (daemon, datapoint) = self.datapoints_queue.get()
            metric_name = str(datapoint['metric'])
            if self.metrics_config[daemon][metric_name]['type'] == 'histogram':
                self.store_histogram(datapoint)
            else:
                self.store_counter(datapoint)
            self.datapoints_registered += 1
        log.debug('Process datapoints thread shutting down..')

    def pull_datapoints_from_kafka(self, kafka_config, stop_threads):
        """Worker loop: pull datapoints from Kafka and register them."""
        log.debug('Kafka datapoints puller thread starting..')
        consumer = KafkaConsumer(
            kafka_config['topic'],
            group_id=kafka_config['group_id'],
            bootstrap_servers=kafka_config['bootstrap_servers'])
        while not stop_threads.is_set():
            consumer.poll()
            for message in consumer:
                try:
                    json_message = json.loads(message.value.decode())
                    log.debug('Datapoint from kafka: %s', json_message)
                    # A message may carry one datapoint or a batch.
                    if isinstance(json_message, list):
                        for datapoint in json_message:
                            self.register_datapoint(datapoint)
                    else:
                        self.register_datapoint(json_message)
                except json.JSONDecodeError:
                    log.exception(
                        "Failed to decode message from Kafka, skipping..")
                except Exception:
                    log.exception(
                        "Generic exception while pulling datapoints from Kafka")
        log.debug('Kafka datapoints puller thread shutting down..')
def process_datapoint(self, datapoint):
    """Convert one Druid datapoint into a Prometheus metric observation.

    Uses the module-level ``sep_config`` dict as a per-daemon cache of
    metric configuration (copied from self.supported_metrics) plus the
    lazily-created Prometheus metric object under '_metric_'.
    """
    global sep_config

    if datapoint['feed'] != 'metrics':
        log.debug(
            "'feed' field is not 'metrics' in datapoint, skipping: {}".
            format(datapoint))
        return

    daemon = str(datapoint['service']).replace('druid/', '').lower()
    if daemon not in self.supported_metrics:
        # log.warning replaces the deprecated log.warn alias.
        log.warning("daemon '{}' is not supported, skipping: {}".format(
            daemon, datapoint))
        return

    metric_name = str(datapoint['metric'])
    if metric_name not in self.supported_metrics[daemon]:
        log.warning("metric '{}' is not supported, skipping: {}".format(
            datapoint['metric'], datapoint))
        return

    # Cache a private copy of the metric config per daemon/metric so the
    # setdefault calls below don't mutate self.supported_metrics.
    if daemon not in sep_config:
        sep_config[daemon] = {}
    log.debug("Reverse Metric: {}".format(sep_config))
    if metric_name not in sep_config[daemon]:
        sep_config[daemon][metric_name] = copy.copy(
            self.supported_metrics[daemon][metric_name])
        log.debug("Reverse IFtrue: {}")
    else:
        # Entry already cached; nothing to do (original re-assigned it to
        # itself here, a no-op).
        log.debug("Reverse IFelse: {}")
    log.debug("Reverse Metric: {}".format(sep_config))

    config = sep_config[daemon][metric_name]
    config.setdefault('labels', [])
    config.setdefault('type', 'gauge')
    config.setdefault('suffix', '_count')

    metric_type = config['type']
    if metric_type == 'skip':
        return

    metric_name_full = self._get_metric_name(daemon, metric_name, config)
    metric_value = float(datapoint['value'])
    metric_labels = tuple(sorted(config['labels'] + ['host']))
    log.debug("Labels: {}".format(metric_labels))
    # NOTE(review): looks like label names containing '_' are mapped to
    # datapoint keys containing ' ' -- confirm this matches the datapoint
    # schema; it reads as a likely bug otherwise.
    label_values = tuple(
        datapoint[label_name.replace('_', ' ')]
        for label_name in metric_labels)
    log.debug("Labels value: {}".format(label_values))

    # Create the Prometheus metric object lazily, once per config entry.
    # (Normalized to a single if/elif chain; the original mixed bare
    # 'if's into the chain, which only worked because the types are
    # mutually exclusive.)
    if '_metric_' not in config:
        if metric_type == 'counter':
            config['_metric_'] = Counter(
                metric_name_full, metric_name_full, metric_labels)
        elif metric_type == 'gauge':
            config['_metric_'] = Gauge(
                metric_name_full, metric_name_full, metric_labels)
        elif metric_type == 'summary':
            config['_metric_'] = Summary(
                metric_name_full, metric_name_full, metric_labels)
        elif metric_type == 'histogram':
            config['_metric_'] = Histogram(
                metric_name_full, metric_name_full, metric_labels,
                buckets=config['buckets'])

    log.debug("final metric_name: {}".format(metric_name))
    log.debug("sep config : {}".format(sep_config[daemon]))

    metric = config['_metric_']
    if len(metric_labels) > 0:
        metric = metric.labels(*label_values)

    if metric_type == 'counter':
        metric.inc(metric_value)
    elif metric_type == 'gauge':
        metric.set(metric_value)
    elif metric_type == 'summary':
        metric.observe(metric_value)
    elif metric_type == 'histogram':
        metric.observe(metric_value)

    self.datapoints_processed.inc()
# Load environment configuration from a .env file next to this module.
dotenv_path = os.path.join(os.path.dirname(__file__), '.env')
load_dotenv(dotenv_path)

# Port for the exporter's HTTP server to listen on.
# NOTE(review): int()/float() below crash with TypeError if the variable
# is unset -- confirm the .env file always provides these.
HTTP_PORT = int(os.environ.get("HTTP_PORT"))

# Oracle connection string (user/password DSN).
DATA_SOURCE_NAME = os.environ.get("DATA_SOURCE_NAME")

# Logging level and format.
LOG_LEVEL = os.environ.get("LOG_LEVEL")
FORMAT = '%(asctime)-15s %(thread)-5d:%(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger('oracledb_exporter')
logger.setLevel(LOG_LEVEL)

# Time spent collecting metrics from Oracle, per scrape.
COLLECTION_TIME = Summary(
    'oracledb_collector_collect_seconds',
    'Time spent to collect metrics from Oracle'
)

# Interval between scrapes, in seconds.
SCRAPE_INTERVAL = float(os.environ.get("SCRAPE_INTERVAL"))


class OracleCollector(object):
    """Collects Oracle database metrics for the exporter."""

    def __init__(self):
        # Connection/cursor placeholders; presumably replaced by real
        # cx_Oracle objects when a connection is opened -- TODO confirm.
        self.db_connect, self.db_cursor = '', ''
        self.database_version_gauge = Gauge(
            "oracledb_version_info",
            ""
            "TYPE gauge.",
            ["version"]
        )
        # ["oracle", "plsql", "core", "tns", "nlsrtl"],
import json
import datetime
import sys
import logging
import asyncio
from concurrent import futures

from cachetools import cached, TTLCache, cachedmethod

from aliyunsdkcore.client import AcsClient
from aliyunsdkrds.request.v20140815.DescribeDBInstancesRequest import DescribeDBInstancesRequest
from aliyunsdkrds.request.v20140815.DescribeDBInstancePerformanceRequest import DescribeDBInstancePerformanceRequest
from aliyunsdkrds.request.v20140815.DescribeResourceUsageRequest import DescribeResourceUsageRequest

from prometheus_client.core import Summary, GaugeMetricFamily, InfoMetricFamily
from prometheus_client import Counter, Info

# Latency of successful Aliyun CloudMonitor API calls, labelled by API name.
api_request_summry = Summary('aliyun_api_request_latency_seconds',
                             'CloudMonitor request latency', ['api'])
# Latency of failed Aliyun API calls, labelled by API name.
api_request_failed_summry = Summary(
    'aliyun_api_failed_request_latency_seconds',
    'CloudMonitor failed request latency', ['api'])

# Total number of Aliyun API calls made.
api_request_count = Counter(
    'aliyun_api_request_counter',
    'Aliyun API request counter',
)


class CollectorConfig(object):
    # NOTE(review): the __init__ signature is truncated in this chunk;
    # the remaining parameters and body live outside this view.
    def __init__(
            self,
            file_opts,
            command_args,
class DruidCollector(object):
    """Prometheus collector translating Druid-emitted datapoints into metrics.

    Datapoints (JSON dicts from Druid's HTTP emitter) are pushed in via
    register_datapoint() and cached in dictionaries; collect() turns the
    cached state into Prometheus metric families. The supported daemons and
    metrics are driven entirely by the ``metrics_config`` mapping
    ({daemon: {metric_name: {'type': ..., 'labels': [...], ...}}}).
    """

    # Tracks how long each Prometheus scrape (collect) takes.
    scrape_duration = Summary('druid_scrape_duration_seconds',
                              'Druid scrape duration')

    def __init__(self, metrics_config):
        # Datapoints successfully registered
        self.datapoints_registered = 0

        # Histogram data.
        # Format: {metric_name: {daemon: {label_values: {bucket: count, ...,
        #                                                'sum': total}}}}
        self.histograms = defaultdict(lambda: {})

        # Counter/gauge data.
        # Format: {metric_name: {daemon: {label_values: value}}}
        # The order of the labels listed in the configuration is important
        # since it is reflected in this data structure.
        self.counters = defaultdict(lambda: {})

        # Metrics to collect/expose via the exporter.
        self.metrics_config = metrics_config
        self.supported_daemons = self.metrics_config.keys()

    @staticmethod
    def sanitize_field(datapoint_field):
        """Normalize a Druid service name, e.g. 'druid/Broker' -> 'broker'."""
        return datapoint_field.replace('druid/', '').lower()

    def _extract_label_values(self, datapoint, metric_labels):
        """Pull the configured label values out of a datapoint.

        Returns the label values as a tuple (usable as dictionary key), or
        None — after logging an error — when the datapoint is missing one of
        the expected labels.
        """
        label_values = []
        for label in metric_labels or ():
            try:
                label_values.append(str(datapoint[label]))
            except KeyError:
                # BUG FIX: the datapoint and the expected labels were
                # previously passed to format() in the wrong order.
                log.error(
                    'Missing label {} for datapoint {} (expected labels: {}), '
                    'dropping it. Please check your metric configuration file.'
                    .format(label, datapoint, metric_labels))
                return None
        # Convert the list of labels to a tuple to allow indexing.
        return tuple(label_values)

    def store_counter(self, datapoint):
        """Store a counter/gauge datapoint.

        Example: given
            self.counters = {}
            datapoint = {'service': 'druid/broker', 'metric': 'segment/size',
                         'datasource': 'test', 'value': 10}
        this function creates:
            self.counters = {'segment/size': {'broker': {('test',): 10}}}

        The algorithm is generic enough to support all metrics handled by
        self.counters without caring about the number of labels needed.
        """
        daemon = DruidCollector.sanitize_field(str(datapoint['service']))
        metric_name = str(datapoint['metric'])
        metric_value = float(datapoint['value'])
        metric_labels = self.metrics_config[daemon][metric_name]['labels']

        label_values = self._extract_label_values(datapoint, metric_labels)
        if label_values is None:
            return

        metrics_storage = self.counters[metric_name]
        metrics_storage.setdefault(daemon, {})
        metrics_storage[daemon][label_values] = metric_value

        log.debug("The datapoint {} modified the counters dictionary to: \n{}".
                  format(datapoint, self.counters))

    def store_histogram(self, datapoint):
        """Store a datapoint into its configured histogram buckets.

        Example: given
            self.histograms = {}
            datapoint = {'service': 'druid/broker', 'metric': 'query/time',
                         'datasource': 'test', 'value': 10}
        this function creates:
            self.histograms = {'query/time': {'broker': {('test',):
                {'10': 1, '100': 1, ..., 'sum': 10}}}}
        """
        daemon = DruidCollector.sanitize_field(str(datapoint['service']))
        metric_name = str(datapoint['metric'])
        metric_value = float(datapoint['value'])
        metric_labels = self.metrics_config[daemon][metric_name]['labels']
        metric_buckets = self.metrics_config[daemon][metric_name]['buckets']

        label_values = self._extract_label_values(datapoint, metric_labels)
        if label_values is None:
            return

        self.histograms.setdefault(metric_name, {daemon: {}})
        self.histograms[metric_name].setdefault(daemon, {})
        self.histograms[metric_name][daemon].setdefault(label_values, {})

        stored_buckets = self.histograms[metric_name][daemon][label_values]
        for bucket in metric_buckets:
            if bucket not in stored_buckets:
                stored_buckets[bucket] = 0
            # 'sum' is a pseudo-bucket accumulating the raw observed values.
            if bucket != 'sum' and metric_value <= float(bucket):
                stored_buckets[bucket] += 1
        stored_buckets['sum'] += metric_value

        log.debug(
            "The datapoint {} modified the histograms dictionary to: \n{}".
            format(datapoint, self.histograms))

    @scrape_duration.time()
    def collect(self):
        """Yield one metric family per configured metric that has data, plus
        a counter of datapoints registered by the exporter."""
        for daemon in self.metrics_config.keys():
            for druid_metric_name in self.metrics_config[daemon]:
                metric_conf = self.metrics_config[daemon][druid_metric_name]
                metric_type = metric_conf['type']

                if metric_type in ('gauge', 'counter'):
                    # Membership test instead of the previous try/except:
                    # indexing a defaultdict never raises and silently
                    # created empty entries for every configured metric.
                    if daemon not in self.counters.get(druid_metric_name, {}):
                        continue
                    family = (GaugeMetricFamily if metric_type == 'gauge'
                              else CounterMetricFamily)
                    prometheus_metric = family(
                        metric_conf['prometheus_metric_name'],
                        metric_conf['description'],
                        labels=[label.lower()
                                for label in metric_conf['labels']])
                    daemon_data = self.counters[druid_metric_name][daemon]
                    for label_values, value in list(daemon_data.items()):
                        prometheus_metric.add_metric(label_values, value)

                elif metric_type == 'histogram':
                    if daemon not in self.histograms.get(druid_metric_name,
                                                         {}):
                        continue
                    prometheus_metric = HistogramMetricFamily(
                        metric_conf['prometheus_metric_name'],
                        metric_conf['description'],
                        labels=[label.lower()
                                for label in metric_conf['labels']])
                    daemon_data = self.histograms[druid_metric_name][daemon]
                    for label_values, buckets in list(daemon_data.items()):
                        buckets_without_sum = [
                            [bucket, count] for bucket, count
                            in buckets.items() if bucket != 'sum']
                        prometheus_metric.add_metric(
                            label_values, buckets=buckets_without_sum,
                            sum_value=buckets['sum'])

                else:
                    log.info(
                        'metric type not supported: {}'.format(metric_type))
                    continue

                yield prometheus_metric

        registered = CounterMetricFamily(
            'druid_exporter_datapoints_registered',
            'Number of datapoints successfully registered '
            'by the exporter.')
        registered.add_metric([], self.datapoints_registered)
        yield registered

    def register_datapoint(self, datapoint):
        """Validate an incoming datapoint and store it.

        Datapoints are dropped (with a debug log) when the 'feed' field is
        not 'metrics' or when the daemon/metric pair is not configured.
        """
        if datapoint['feed'] != 'metrics':
            log.debug("The following feed does not contain a datapoint, "
                      "dropping it: {}".format(datapoint))
            return

        daemon = DruidCollector.sanitize_field(str(datapoint['service']))
        # The 'feed' field was already validated above, so only daemon and
        # metric support need checking here (the old code re-checked 'feed').
        if (daemon not in self.supported_daemons or
                datapoint['metric'] not in self.metrics_config[daemon].keys()):
            log.debug(
                "The following datapoint is not supported, either "
                "because the 'feed' field is not 'metrics' or "
                "the metric itself is not supported: {}".format(datapoint))
            return

        metric_name = str(datapoint['metric'])
        if self.metrics_config[daemon][metric_name]['type'] == 'histogram':
            self.store_histogram(datapoint)
        else:
            self.store_counter(datapoint)
        self.datapoints_registered += 1
class Logger: """Class used to display logs on the console. """ def __init__(self): logger = logging.getLogger() logger.setLevel(logging.INFO) formatter = logging.Formatter('%(asctime)s %(message)s') stream_handler = logging.StreamHandler() stream_handler.setFormatter(formatter) logger.addHandler(stream_handler) self.logger = logger # Create a metric to track time spent and requests made. REQUEST_TIME = Summary('request_processing_seconds', 'Time spent processing request') class CloudwatchLogsCollector: """Class used to get metrics from AWS Cloudwatch Logs. """ def __init__(self): self.client = boto3.client('logs') self.metric_prefix = "aws_logs_" self.logger = Logger().logger @REQUEST_TIME.time() def collect_log_groups(self): log_group_stored_byte = GaugeMetricFamily( self.metric_prefix + 'stored_bytes',
class TestSummary(unittest.TestCase):
    """Tests for prometheus_client Summary: observe, repr and the time()
    decorator/context-manager (including threading and reentrancy)."""

    def setUp(self):
        self.registry = CollectorRegistry()
        self.summary = Summary('s', 'help', registry=self.registry)

    def test_repr(self):
        self.assertEqual(repr(self.summary),
                         "prometheus_client.metrics.Summary(s)")

    def test_summary(self):
        """observe() bumps _count by one and _sum by the observed value."""
        self.assertEqual(0, self.registry.get_sample_value('s_count'))
        self.assertEqual(0, self.registry.get_sample_value('s_sum'))
        self.summary.observe(10)
        self.assertEqual(1, self.registry.get_sample_value('s_count'))
        self.assertEqual(10, self.registry.get_sample_value('s_sum'))

    def test_function_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('s_count'))

        @self.summary.time()
        def f():
            pass

        # The decorator must preserve the wrapped function's (empty) argspec.
        # NOTE(review): inspect.getargspec was removed in Python 3.11 —
        # migrate to inspect.signature when older interpreters are dropped.
        self.assertEqual(([], None, None, None), getargspec(f))

        f()
        self.assertEqual(1, self.registry.get_sample_value('s_count'))

    def test_function_decorator_multithread(self):
        self.assertEqual(0, self.registry.get_sample_value('s_count'))
        summary2 = Summary('s2', 'help', registry=self.registry)

        workers = 3
        duration = 0.1
        pool = ThreadPoolExecutor(max_workers=workers)

        @self.summary.time()
        def f():
            time.sleep(duration / 2)
            # Testing that different instances of timer do not interfere
            summary2.time()(lambda: time.sleep(duration / 2))()

        jobs = workers * 3
        for i in range(jobs):
            pool.submit(f)
        pool.shutdown(wait=True)

        self.assertEqual(jobs, self.registry.get_sample_value('s_count'))

        # Allow ~10% timing slack so the assertion is not flaky.
        rounding_coefficient = 0.9
        total_expected_duration = jobs * duration * rounding_coefficient
        self.assertLess(total_expected_duration,
                        self.registry.get_sample_value('s_sum'))
        self.assertLess(total_expected_duration / 2,
                        self.registry.get_sample_value('s2_sum'))

    def test_function_decorator_reentrancy(self):
        self.assertEqual(0, self.registry.get_sample_value('s_count'))

        iterations = 2
        sleep = 0.1

        @self.summary.time()
        def f(i=1):
            time.sleep(sleep)
            if i == iterations:
                return
            f(i + 1)

        f()

        self.assertEqual(iterations,
                         self.registry.get_sample_value('s_count'))
        # Arithmetic series with d == a_1
        total_expected_duration = sleep * (iterations ** 2 + iterations) / 2
        rounding_coefficient = 0.9
        total_expected_duration *= rounding_coefficient
        self.assertLess(total_expected_duration,
                        self.registry.get_sample_value('s_sum'))

    def test_block_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('s_count'))
        with self.summary.time():
            pass
        self.assertEqual(1, self.registry.get_sample_value('s_count'))

    def test_timer_not_observable(self):
        """time() on a labelled Summary without label values must raise.

        BUG FIX: the previous try/except silently passed when no exception
        was raised at all; assertRaises makes the expectation explicit.
        """
        s = Summary('test', 'help', labelnames=('label',),
                    registry=self.registry)
        with self.assertRaises(ValueError) as ctx:
            s.time()
        self.assertIn('missing label values', str(ctx.exception))
class DruidCollector(object):
    """Prometheus collector exposing Druid daemon metrics.

    Druid datapoints (JSON dictionaries from the Druid HTTP emitter) are
    pushed in via register_datapoint() and cached in plain dictionaries;
    collect() converts the cached state into Prometheus metric families.
    The set of supported daemons and metrics is hardcoded in __init__.
    """

    # Summary tracking how long each Prometheus scrape (collect) takes.
    scrape_duration = Summary('druid_scrape_duration_seconds',
                              'Druid scrape duration')

    def __init__(self):
        # Datapoints successfully registered
        self.datapoints_registered = 0

        # List of supported metrics and their fields of the JSON dictionary
        # sent by a Druid daemon. These fields will be added as labels
        # when returning the available metrics in @collect.
        # Due to the fact that metric names are not unique (like segment/count),
        # it is necessary to split the data structure by daemon.
        self.supported_metric_names = {
            'middlemanager': {
                'jetty/numOpenConnections': None,
                'jvm/pool/committed': ['poolKind', 'poolName'],
                'jvm/pool/init': ['poolKind', 'poolName'],
                'jvm/pool/max': ['poolKind', 'poolName'],
                'jvm/pool/used': ['poolKind', 'poolName'],
                'jvm/mem/init': ['memKind'],
                'jvm/mem/max': ['memKind'],
                'jvm/mem/used': ['memKind'],
                'jvm/mem/committed': ['memKind'],
                'jvm/gc/count': ['gcName'],
                'jvm/gc/time': ['gcName'],
            },
            'broker': {
                'jetty/numOpenConnections': None,
                'query/time': ['dataSource'],
                'query/bytes': ['dataSource'],
                'query/node/time': None,
                'query/node/bytes': None,
                'query/node/ttfb': None,
                'query/intervalChunk/time': None,
                'query/success/count': None,
                'query/failed/count': None,
                'query/interrupted/count': None,
                'query/cache/total/numEntries': None,
                'query/cache/total/sizeBytes': None,
                'query/cache/total/hits': None,
                'query/cache/total/misses': None,
                'query/cache/total/evictions': None,
                'query/cache/total/timeouts': None,
                'query/cache/total/errors': None,
                'jvm/pool/committed': ['poolKind', 'poolName'],
                'jvm/pool/init': ['poolKind', 'poolName'],
                'jvm/pool/max': ['poolKind', 'poolName'],
                'jvm/pool/used': ['poolKind', 'poolName'],
                'jvm/mem/init': ['memKind'],
                'jvm/mem/max': ['memKind'],
                'jvm/mem/used': ['memKind'],
                'jvm/mem/committed': ['memKind'],
                'jvm/gc/count': ['gcName'],
                'jvm/gc/time': ['gcName'],
            },
            'historical': {
                'jetty/numOpenConnections': None,
                'query/time': ['dataSource'],
                'query/bytes': ['dataSource'],
                'query/cpu/time': ['dataSource'],
                'query/segment/time': None,
                'query/wait/time': None,
                'query/success/count': None,
                'query/failed/count': None,
                'query/interrupted/count': None,
                'query/segmentAndCache/time': None,
                'query/cache/total/numEntries': None,
                'query/cache/total/sizeBytes': None,
                'query/cache/total/hits': None,
                'query/cache/total/misses': None,
                'query/cache/total/evictions': None,
                'query/cache/total/timeouts': None,
                'query/cache/total/errors': None,
                'segment/count': ['tier', 'dataSource'],
                'segment/max': None,
                'segment/used': ['tier', 'dataSource'],
                'segment/usedPercent': ['tier', 'dataSource'],
                'segment/scan/pending': None,
                'jvm/pool/committed': None,
                'jvm/pool/init': None,
                'jvm/pool/max': None,
                'jvm/pool/used': None,
                'jvm/mem/init': None,
                'jvm/mem/max': None,
                'jvm/mem/used': None,
                'jvm/mem/committed': None,
                'jvm/gc/count': None,
                'jvm/gc/time': None,
            },
            'coordinator': {
                'jetty/numOpenConnections': None,
                'segment/count': ['dataSource'],
                'segment/assigned/count': ['tier'],
                'segment/moved/count': ['tier'],
                'segment/dropped/count': ['tier'],
                'segment/deleted/count': ['tier'],
                'segment/unneeded/count': ['tier'],
                'segment/overShadowed/count': None,
                'segment/loadQueue/failed': ['server'],
                'segment/loadQueue/count': ['server'],
                'segment/dropQueue/count': ['server'],
                'segment/size': ['dataSource'],
                'segment/unavailable/count': ['dataSource'],
                'segment/underReplicated/count': ['tier', 'dataSource'],
                'jvm/pool/committed': ['poolKind', 'poolName'],
                'jvm/pool/init': ['poolKind', 'poolName'],
                'jvm/pool/max': ['poolKind', 'poolName'],
                'jvm/pool/used': ['poolKind', 'poolName'],
                'jvm/mem/init': ['memKind'],
                'jvm/mem/max': ['memKind'],
                'jvm/mem/used': ['memKind'],
                'jvm/mem/committed': ['memKind'],
                'jvm/gc/count': ['gcName'],
                'jvm/gc/time': ['gcName'],
            },
            'peon': {
                'jetty/numOpenConnections': None,
                'query/time': ['dataSource'],
                'query/bytes': ['dataSource'],
                'segment/scan/pending': None,
                'query/wait/time': None,
                'query/success/count': None,
                'query/failed/count': None,
                'query/interrupted/count': None,
                'ingest/events/thrownAway': ['dataSource'],
                'ingest/events/unparseable': ['dataSource'],
                'ingest/events/processed': ['dataSource'],
                'ingest/rows/output': ['dataSource'],
                'ingest/persists/count': ['dataSource'],
                'ingest/persists/failed': ['dataSource'],
                'ingest/handoff/failed': ['dataSource'],
                'ingest/handoff/count': ['dataSource'],
            },
        }

        # Buckets used when storing histogram metrics.
        # 'sum' is a special bucket that will be used to collect the sum
        # of all values ending up in the various buckets.
        self.metric_buckets = {
            'query/time': ['10', '100', '500', '1000', '10000', 'inf', 'sum'],
            'query/bytes': ['10', '100', '500', '1000', '10000', 'inf',
                            'sum'],
        }

        # Data structure holding histogram data
        # Format: {daemon: {metric_name: {bucket2: value, bucket2: value, ...}}
        self.histograms = defaultdict(lambda: {})
        # Names of the metrics stored as histograms (see store_histogram).
        self.histograms_metrics = set([
            'query/time',
            'query/bytes',
        ])

        # Data structure holding counters data
        # Format: {daemon: {label_name: {label2_name: value}}
        # The order of the labels listed in supported_metric_names is important
        # since it is reflected in this data structure. The layering is not
        # strictly important for the final prometheus metrics but it is simplifies
        # the code that creates them (collect method).
        self.counters = defaultdict(lambda: {})
        # Names of the metrics stored as plain values (see store_counter).
        self.counters_metrics = set([
            'query/cache/total/numEntries',
            'query/cache/total/sizeBytes',
            'query/cache/total/hits',
            'query/cache/total/misses',
            'query/cache/total/evictions',
            'query/cache/total/timeouts',
            'query/cache/total/errors',
            'segment/max',
            'segment/count',
            'segment/used',
            'segment/scan/pending',
            'segment/assigned/count',
            'segment/moved/count',
            'segment/dropped/count',
            'segment/deleted/count',
            'segment/unneeded/count',
            'segment/overShadowed/count',
            'segment/loadQueue/failed',
            'segment/loadQueue/count',
            'segment/dropQueue/count',
            'segment/size',
            'segment/unavailable/count',
            'segment/underReplicated/count',
            'ingest/events/thrownAway',
            'ingest/events/unparseable',
            'ingest/events/processed',
            'ingest/rows/output',
            'ingest/persists/count',
            'ingest/persists/failed',
            'ingest/handoff/failed',
            'ingest/handoff/count',
            'jvm/pool/committed',
            'jvm/pool/init',
            'jvm/pool/max',
            'jvm/pool/used',
            'jvm/mem/init',
            'jvm/mem/max',
            'jvm/mem/used',
            'jvm/mem/committed',
            'jvm/gc/count',
            'jvm/gc/time',
            'jetty/numOpenConnections',
        ])

    @staticmethod
    def sanitize_field(datapoint_field):
        """Normalize a Druid service name, e.g. 'druid/Broker' -> 'broker'."""
        return datapoint_field.replace('druid/', '').lower()

    def _get_general_counters(self, daemon):
        """Return fresh JVM/jetty metric families common to every daemon."""
        return {
            'jetty/numOpenConnections':
                GaugeMetricFamily('druid_' + daemon + '_jetty_num_connections',
                                  'Number of open connections.'),
            'jvm/pool/committed':
                GaugeMetricFamily('druid_' + daemon + '_jvm_pool_committed',
                                  'Number of Committed pool.',
                                  labels=['poolKind', 'poolName']),
            'jvm/pool/init':
                GaugeMetricFamily('druid_' + daemon + '_jvm_pool_init',
                                  'Number of Initial pool.',
                                  labels=['poolKind', 'poolName']),
            'jvm/pool/max':
                GaugeMetricFamily('druid_' + daemon + '_jvm_pool_max',
                                  'Number of Max pool.',
                                  labels=['poolKind', 'poolName']),
            'jvm/pool/used':
                GaugeMetricFamily('druid_' + daemon + '_jvm_pool_used',
                                  'Number of Pool used.',
                                  labels=['poolKind', 'poolName']),
            'jvm/mem/init':
                GaugeMetricFamily('druid_' + daemon + '_jvm_mem_init',
                                  'Number of Initial memory.',
                                  labels=['memKind']),
            'jvm/mem/max':
                GaugeMetricFamily('druid_' + daemon + '_jvm_mem_max',
                                  'Number of Max memory.',
                                  labels=['memKind']),
            'jvm/mem/used':
                GaugeMetricFamily('druid_' + daemon + '_jvm_mem_used',
                                  'Number of Used memory.',
                                  labels=['memKind']),
            'jvm/mem/committed':
                GaugeMetricFamily('druid_' + daemon + '_jvm_mem_committed',
                                  'Number of Committed memory.',
                                  labels=['memKind']),
            'jvm/gc/count':
                GaugeMetricFamily('druid_' + daemon + '_jvm_gc_count',
                                  'Number of Garbage collection count.',
                                  labels=['gcName']),
            'jvm/gc/time':
                GaugeMetricFamily('druid_' + daemon + '_jvm_gc_time',
                                  'Number of Garbage collection time.',
                                  labels=['gcName']),
        }

    def _get_realtime_counters(self):
        """Return fresh ingestion metric families (peon/realtime daemons)."""
        return {
            'ingest/events/thrownAway':
                GaugeMetricFamily(
                    'druid_realtime_ingest_events_thrown_away_count',
                    'Number of events rejected because '
                    'they are outside the windowPeriod.',
                    labels=['datasource']),
            'ingest/events/unparseable':
                GaugeMetricFamily(
                    'druid_realtime_ingest_events_unparseable_count',
                    'Number of events rejected because the events are unparseable.',
                    labels=['datasource']),
            'ingest/events/processed':
                GaugeMetricFamily(
                    'druid_realtime_ingest_events_processed_count',
                    'Number of events successfully processed per emission period.',
                    labels=['datasource']),
            'ingest/rows/output':
                GaugeMetricFamily('druid_realtime_ingest_rows_output_count',
                                  'Number of Druid rows persisted.',
                                  labels=['datasource']),
            'ingest/persists/count':
                GaugeMetricFamily('druid_realtime_ingest_persists_count',
                                  'Number of times persist occurred.',
                                  labels=['datasource']),
            'ingest/persists/failed':
                GaugeMetricFamily('druid_realtime_ingest_persists_failed_count',
                                  'Number of times persist failed.',
                                  labels=['datasource']),
            'ingest/handoff/failed':
                GaugeMetricFamily('druid_realtime_ingest_handoff_failed_count',
                                  'Number of times handoff failed.',
                                  labels=['datasource']),
            'ingest/handoff/count':
                GaugeMetricFamily('druid_realtime_ingest_handoff_count',
                                  'Number of times handoff has happened.',
                                  labels=['datasource']),
        }

    def _get_query_histograms(self, daemon):
        """Return fresh query-latency/size histogram families for a daemon."""
        return {
            'query/time':
                HistogramMetricFamily(
                    'druid_' + daemon + '_query_time_ms',
                    'Milliseconds taken to complete a query.',
                    labels=['datasource']),
            'query/bytes':
                HistogramMetricFamily(
                    'druid_' + daemon + '_query_bytes',
                    'Number of bytes returned in query response.',
                    labels=['datasource']),
        }

    def _get_query_counters(self):
        """Return fresh broker query-outcome metric families."""
        return {
            'query/success/count':
                GaugeMetricFamily('druid_broker_query_success_count',
                                  'Number of success queries.'),
            'query/failed/count':
                GaugeMetricFamily('druid_broker_query_failed_count',
                                  'Number of failed queries'),
            'query/interrupted/count':
                GaugeMetricFamily('druid_broker_query_interrupted_count',
                                  'Number of interrupted queries.'),
            'query/node/time':
                GaugeMetricFamily('druid_broker_query_node_time_ms',
                                  'Number of query time.'),
            'query/node/bytes':
                GaugeMetricFamily('druid_broker_query_node_bytes',
                                  'Number of query bytes'),
            'query/node/ttfb':
                GaugeMetricFamily('druid_broker_query_node_ttfb_ms',
                                  'Time to first byte..'),
        }

    def _get_cache_counters(self, daemon):
        """Return fresh query-cache metric families for a daemon."""
        return {
            'query/cache/total/numEntries':
                GaugeMetricFamily(
                    'druid_' + daemon + '_query_cache_numentries_count',
                    'Number of cache entries.'),
            'query/cache/total/sizeBytes':
                GaugeMetricFamily('druid_' + daemon + '_query_cache_size_bytes',
                                  'Size in bytes of cache entries.'),
            'query/cache/total/hits':
                GaugeMetricFamily('druid_' + daemon + '_query_cache_hits_count',
                                  'Number of cache hits.'),
            'query/cache/total/misses':
                GaugeMetricFamily('druid_' + daemon + '_query_cache_misses_count',
                                  'Number of cache misses.'),
            'query/cache/total/evictions':
                GaugeMetricFamily(
                    'druid_' + daemon + '_query_cache_evictions_count',
                    'Number of cache evictions.'),
            'query/cache/total/timeouts':
                GaugeMetricFamily(
                    'druid_' + daemon + '_query_cache_timeouts_count',
                    'Number of cache timeouts.'),
            'query/cache/total/errors':
                GaugeMetricFamily('druid_' + daemon + '_query_cache_errors_count',
                                  'Number of cache errors.'),
        }

    def _get_historical_counters(self):
        """Return fresh historical-daemon health metric families."""
        return {
            'segment/max':
                GaugeMetricFamily('druid_historical_max_segment_bytes',
                                  'Maximum byte limit available for segments.'),
            'segment/count':
                GaugeMetricFamily('druid_historical_segment_count',
                                  'Number of served segments.',
                                  labels=['tier', 'datasource']),
            'segment/used':
                GaugeMetricFamily('druid_historical_segment_used_bytes',
                                  'Bytes used for served segments.',
                                  labels=['tier', 'datasource']),
            'segment/scan/pending':
                GaugeMetricFamily(
                    'druid_historical_segment_scan_pending',
                    'Number of segments in queue waiting to be scanned.'),
            'query/success/count':
                GaugeMetricFamily('druid_historical_query_success_count',
                                  'Number of success queries.'),
            'query/failed/count':
                GaugeMetricFamily('druid_historical_query_failed_count',
                                  'Number of failed queries'),
            'query/interrupted/count':
                GaugeMetricFamily('druid_historical_query_interrupted_count',
                                  'Number of interrupted queries.'),
        }

    def _get_coordinator_counters(self):
        """Return fresh coordinator segment-management metric families."""
        return {
            'segment/assigned/count':
                GaugeMetricFamily(
                    'druid_coordinator_segment_assigned_count',
                    'Number of segments assigned to be loaded in the cluster.',
                    labels=['tier']),
            # NOTE(review): description duplicates the 'assigned' one —
            # looks like a copy-paste; verify against Druid metric docs.
            'segment/moved/count':
                GaugeMetricFamily(
                    'druid_coordinator_segment_moved_count',
                    'Number of segments assigned to be loaded in the cluster.',
                    labels=['tier']),
            'segment/dropped/count':
                GaugeMetricFamily(
                    'druid_coordinator_segment_dropped_count',
                    'Number of segments dropped due to being overshadowed.',
                    labels=['tier']),
            'segment/deleted/count':
                GaugeMetricFamily('druid_coordinator_segment_deleted_count',
                                  'Number of segments dropped due to rules.',
                                  labels=['tier']),
            'segment/unneeded/count':
                GaugeMetricFamily(
                    'druid_coordinator_segment_unneeded_count',
                    'Number of segments dropped due to being marked as unused.',
                    labels=['tier']),
            'segment/overShadowed/count':
                GaugeMetricFamily(
                    'druid_coordinator_segment_overshadowed_count',
                    'Number of overShadowed segments.'),
            'segment/loadQueue/failed':
                GaugeMetricFamily(
                    'druid_coordinator_segment_loadqueue_failed_count',
                    'Number of segments that failed to load.',
                    labels=['server']),
            'segment/loadQueue/count':
                GaugeMetricFamily('druid_coordinator_segment_loadqueue_count',
                                  'Number of segments to load.',
                                  labels=['server']),
            'segment/dropQueue/count':
                GaugeMetricFamily('druid_coordinator_segment_dropqueue_count',
                                  'Number of segments to drop.',
                                  labels=['server']),
            'segment/size':
                GaugeMetricFamily('druid_coordinator_segment_size_bytes',
                                  'Size in bytes of available segments.',
                                  labels=['datasource']),
            'segment/count':
                GaugeMetricFamily('druid_coordinator_segment_count',
                                  'Number of served segments.',
                                  labels=['datasource']),
            'segment/unavailable/count':
                GaugeMetricFamily(
                    'druid_coordinator_segment_unavailable_count',
                    'Number of segments (not including replicas) left to load '
                    'until segments that should be loaded in the cluster '
                    'are available for queries.',
                    labels=['datasource']),
            'segment/underReplicated/count':
                GaugeMetricFamily(
                    'druid_coordinator_segment_under_replicated_count',
                    'Number of segments (including replicas) left to load until '
                    'segments that should be loaded in the cluster are '
                    'available for queries.',
                    labels=['tier', 'datasource']),
        }

    def store_counter(self, datapoint):
        """ This function adds data to the self.counters dictiorary
            following its convention, creating on the fly
            the missing bits. For example, given:
            self.counters = {}
            datapoint = {'service': 'druid/broker', 'metric'='segment/size',
                         'datasource': 'test', 'value': 10}

            This function will creates the following:
            self.counters = {'segment/size': {'broker': {'test': 10}}}

            The algorithm is generic enough to support all metrics handled by
            self.counters without caring about the number of labels needed.
        """
        daemon = DruidCollector.sanitize_field(str(datapoint['service']))
        metric_name = str(datapoint['metric'])
        metric_value = float(datapoint['value'])
        metrics_storage = self.counters[metric_name]
        metric_labels = self.supported_metric_names[daemon][metric_name]
        metrics_storage.setdefault(daemon, {})

        if metric_labels:
            # Walk/build one nested dict level per configured label; the
            # value is stored under the last label's value.
            metrics_storage_cursor = metrics_storage[daemon]
            for label in metric_labels:
                label_value = str(datapoint[label])
                if metric_labels[-1] != label:
                    metrics_storage_cursor.setdefault(label_value, {})
                    metrics_storage_cursor = metrics_storage_cursor[
                        label_value]
                else:
                    metrics_storage_cursor[label_value] = metric_value
        else:
            # Unlabelled metric: the daemon entry holds the bare value.
            metrics_storage[daemon] = metric_value

        log.debug("The datapoint {} modified the counters dictionary to: \n{}".
                  format(datapoint, self.counters))

    def store_histogram(self, datapoint):
        """ Store datapoints that will end up in histogram buckets using
            a dictiorary. This function is highly customized for
            the only histograms configured so far, rather than being generic
            like store_counter.
            Example of how it works:
            self.histograms = {}
            datapoint = {'service': 'druid/broker', 'metric'='query/time',
                         'datasource': 'test', 'value': 10}

            This function will creates the following:
            self.counters = {'query/time': {'broker':
                {'test': {'10': 1, '100': 1, etc.., 'sum': 10}}}}}
        """
        daemon = DruidCollector.sanitize_field(str(datapoint['service']))
        metric_name = str(datapoint['metric'])
        metric_value = float(datapoint['value'])
        datasource = str(datapoint['dataSource'])

        self.histograms.setdefault(metric_name, {daemon: {datasource: {}}})
        self.histograms[metric_name].setdefault(daemon, {datasource: {}})
        self.histograms[metric_name][daemon].setdefault(datasource, {})

        # Cumulative buckets: a value increments every bucket whose upper
        # bound it fits under; 'sum' accumulates the raw values.
        for bucket in self.metric_buckets[metric_name]:
            stored_buckets = self.histograms[metric_name][daemon][datasource]
            if bucket not in stored_buckets:
                stored_buckets[bucket] = 0
            if bucket != 'sum' and metric_value <= float(bucket):
                stored_buckets[bucket] += 1
        stored_buckets['sum'] += metric_value

        log.debug(
            "The datapoint {} modified the histograms dictionary to: \n{}".
            format(datapoint, self.histograms))

    @scrape_duration.time()
    def collect(self):
        """Yield Prometheus metric families built from the cached datapoints.

        Metrics with no stored data and no labels are emitted as NaN so the
        time series exists; labelled metrics with no data are skipped.
        """
        # Metrics common to Broker, Historical and Peon
        for daemon in ['broker', 'historical', 'peon']:
            query_metrics = self._get_query_histograms(daemon)
            # NOTE(review): cache_metrics is unused in this loop (it is
            # recomputed in the next one) — dead assignment.
            cache_metrics = self._get_cache_counters(daemon)
            for metric in query_metrics:
                if not self.histograms[metric]:
                    continue
                if daemon in self.histograms[metric]:
                    for datasource in self.histograms[metric][daemon]:
                        buckets = self.histograms[metric][daemon][datasource]
                        buckets_without_sum = [(k, v)
                                               for k, v in buckets.items()
                                               if k != 'sum']
                        query_metrics[metric].add_metric(
                            [datasource], buckets=buckets_without_sum,
                            sum_value=self.histograms[metric][daemon]
                            [datasource]['sum'])
                    yield query_metrics[metric]

        # Metrics common to Broker and Historical
        for daemon in ['broker', 'historical']:
            cache_metrics = self._get_cache_counters(daemon)
            for metric in cache_metrics:
                if not self.counters[metric] or daemon not in self.counters[
                        metric]:
                    if not self.supported_metric_names[daemon][metric]:
                        cache_metrics[metric].add_metric([], float('nan'))
                    else:
                        continue
                else:
                    cache_metrics[metric].add_metric(
                        [], self.counters[metric][daemon])
                yield cache_metrics[metric]

        # Metrics common to all
        for daemon in ['middlemanager', 'broker', 'historical',
                       'coordinator']:
            generic_metrics = self._get_general_counters(daemon)
            for metric in generic_metrics:
                if not self.counters[metric] or daemon not in self.counters[
                        metric]:
                    if not self.supported_metric_names[daemon][metric]:
                        generic_metrics[metric].add_metric([], float('nan'))
                    else:
                        continue
                else:
                    labels = self.supported_metric_names[daemon][metric]
                    if not labels:
                        generic_metrics[metric].add_metric(
                            [], self.counters[metric][daemon])
                    elif len(labels) == 1:
                        for label in self.counters[metric][daemon]:
                            generic_metrics[metric].add_metric(
                                [label], self.counters[metric][daemon][label])
                    else:
                        # Two labels: stored as a two-level nested dict.
                        for outer_label in self.counters[metric][daemon]:
                            for inner_label in self.counters[metric][daemon][
                                    outer_label]:
                                generic_metrics[metric].add_metric(
                                    [outer_label, inner_label],
                                    self.counters[metric][daemon][outer_label]
                                    [inner_label])
                yield generic_metrics[metric]

        historical_health_metrics = self._get_historical_counters()
        coordinator_metrics = self._get_coordinator_counters()
        realtime_metrics = self._get_realtime_counters()
        broker_metrics = self._get_query_counters()

        # Daemon-specific metric families, all following the same
        # no-data/NaN/labelled pattern as the generic metrics above.
        for daemon, metrics in [('coordinator', coordinator_metrics),
                                ('historical', historical_health_metrics),
                                ('peon', realtime_metrics),
                                ('broker', broker_metrics)]:
            for metric in metrics:
                if not self.counters[metric] or daemon not in self.counters[
                        metric]:
                    if not self.supported_metric_names[daemon][metric]:
                        metrics[metric].add_metric([], float('nan'))
                    else:
                        continue
                else:
                    labels = self.supported_metric_names[daemon][metric]
                    if not labels:
                        metrics[metric].add_metric(
                            [], self.counters[metric][daemon])
                    elif len(labels) == 1:
                        for label in self.counters[metric][daemon]:
                            metrics[metric].add_metric(
                                [label], self.counters[metric][daemon][label])
                    else:
                        for outer_label in self.counters[metric][daemon]:
                            for inner_label in self.counters[metric][daemon][
                                    outer_label]:
                                metrics[metric].add_metric(
                                    [outer_label, inner_label],
                                    self.counters[metric][daemon][outer_label]
                                    [inner_label])
                yield metrics[metric]

        registered = CounterMetricFamily(
            'druid_exporter_datapoints_registered_count',
            'Number of datapoints successfully registered '
            'by the exporter.')
        registered.add_metric([], self.datapoints_registered)
        yield registered

    def register_datapoint(self, datapoint):
        """Validate an incoming datapoint and dispatch it to the histogram
        or counter storage; unsupported datapoints are dropped with a
        debug log."""
        if datapoint['feed'] != 'metrics':
            log.debug("The following feed does not contain a datapoint, "
                      "dropping it: {}".format(datapoint))
            return

        daemon = DruidCollector.sanitize_field(str(datapoint['service']))
        # NOTE(review): the 'feed' test below is redundant (already
        # checked above).
        if (datapoint['feed'] != 'metrics' or
                daemon not in self.supported_metric_names or
                datapoint['metric'] not in self.supported_metric_names[daemon]):
            log.debug(
                "The following datapoint is not supported, either "
                "because the 'feed' field is not 'metrics' or "
                "the metric itself is not supported: {}".format(datapoint))
            return

        metric_name = str(datapoint['metric'])
        if metric_name in self.histograms_metrics:
            self.store_histogram(datapoint)
        elif metric_name in self.counters_metrics:
            self.store_counter(datapoint)
        self.datapoints_registered += 1
def setUp(self):
    """Create a fresh registry with one plain and one labelled Summary."""
    registry = CollectorRegistry()
    self.registry = registry
    self.summary = Summary('s', 'help', registry=registry)
    self.summary_with_labels = Summary(
        's_with_labels', 'help', labelnames=("label1",), registry=registry)
def test_timer_not_observable(self):
    """A labelled Summary without label values cannot be timed directly."""
    labelled = Summary(
        'test', 'help', labelnames=('label',), registry=self.registry)
    assert_not_observable(labelled.time)