def test_hierarchical_sensors(metrics):
    """A record() on any sensor must also increment all of its ancestors."""

    def make_counting_sensor(sensor_name, count_metric, parents=None):
        # Register a sensor carrying a single Count metric.
        s = metrics.sensor(sensor_name, parents=parents)
        s.add(metrics.metric_name(count_metric, 'grp1'), Count())
        return s

    parent1 = make_counting_sensor('test.parent1', 'test.parent1.count')
    parent2 = make_counting_sensor('test.parent2', 'test.parent2.count')
    child1 = make_counting_sensor('test.child1', 'test.child1.count',
                                  parents=[parent1, parent2])
    child2 = make_counting_sensor('test.child2', 'test.child2.count',
                                  parents=[parent1])
    grandchild = make_counting_sensor('test.grandchild', 'test.grandchild.count',
                                      parents=[child1])

    # increment each sensor one time
    for s in (parent1, parent2, child1, child2, grandchild):
        s.record()

    p1_value = parent1.metrics[0].value()
    p2_value = parent2.metrics[0].value()
    c1_value = child1.metrics[0].value()
    c2_value = child2.metrics[0].value()
    gc_value = grandchild.metrics[0].value()

    # each metric should have a count equal to one + its children's count
    assert 1.0 == gc_value
    assert 1.0 + gc_value == c1_value
    assert 1.0 == c2_value
    assert 1.0 + c1_value == p2_value
    assert 1.0 + c1_value + c2_value == p1_value

    # parent -> children bookkeeping
    assert [child1, child2] == metrics._children_sensors.get(parent1)
    assert [child1] == metrics._children_sensors.get(parent2)
    assert metrics._children_sensors.get(grandchild) is None
def test_remove_inactive_metrics(mocker, time_keeper, metrics):
    """Sensors idle past their expiration window are purged by ExpireSensorTask.

    Also verifies that record() resets a sensor's inactivity clock, and that
    a purged sensor name can be re-registered afterwards.
    """
    mocker.patch('time.time', side_effect=time_keeper.time)

    # s1 expires after 1 second of inactivity, s2 after 3 seconds
    s1 = metrics.sensor('test.s1', None, 1)
    s1.add(metrics.metric_name('test.s1.count', 'grp1'), Count())

    s2 = metrics.sensor('test.s2', None, 3)
    s2.add(metrics.metric_name('test.s2.count', 'grp1'), Count())

    purger = Metrics.ExpireSensorTask
    purger.run(metrics)
    # no time has passed yet: nothing should be purged
    assert metrics.get_sensor('test.s1') is not None, \
        'Sensor test.s1 must be present'
    assert metrics.metrics.get(metrics.metric_name('test.s1.count', 'grp1')) is not None, \
        'MetricName test.s1.count must be present'
    assert metrics.get_sensor('test.s2') is not None, \
        'Sensor test.s2 must be present'
    assert metrics.metrics.get(metrics.metric_name('test.s2.count', 'grp1')) is not None, \
        'MetricName test.s2.count must be present'

    # just past s1's 1-second expiration: s1 goes, s2 stays
    time_keeper.sleep(1.001)
    purger.run(metrics)
    assert metrics.get_sensor('test.s1') is None, \
        'Sensor test.s1 should have been purged'
    assert metrics.metrics.get(metrics.metric_name('test.s1.count', 'grp1')) is None, \
        'MetricName test.s1.count should have been purged'
    assert metrics.get_sensor('test.s2') is not None, \
        'Sensor test.s2 must be present'
    assert metrics.metrics.get(metrics.metric_name('test.s2.count', 'grp1')) is not None, \
        'MetricName test.s2.count must be present'

    # record a value in sensor s2. This should reset the clock for that sensor.
    # It should not get purged at the 3 second mark after creation
    s2.record()
    time_keeper.sleep(2)
    purger.run(metrics)
    assert metrics.get_sensor('test.s2') is not None, \
        'Sensor test.s2 must be present'
    assert metrics.metrics.get(metrics.metric_name('test.s2.count', 'grp1')) is not None, \
        'MetricName test.s2.count must be present'

    # After another 1 second sleep, the metric should be purged.
    # BUG FIX: these assertions previously re-checked 'test.s1' (already
    # purged above, so trivially true) while the messages referred to s2;
    # they must verify that s2 itself has now expired.
    time_keeper.sleep(1)
    purger.run(metrics)
    assert metrics.get_sensor('test.s2') is None, \
        'Sensor test.s2 should have been purged'
    assert metrics.metrics.get(metrics.metric_name('test.s2.count', 'grp1')) is None, \
        'MetricName test.s2.count should have been purged'

    # After purging, it should be possible to recreate a metric
    s1 = metrics.sensor('test.s1', None, 1)
    s1.add(metrics.metric_name('test.s1.count', 'grp1'), Count())
    assert metrics.get_sensor('test.s1') is not None, \
        'Sensor test.s1 must be present'
    assert metrics.metrics.get(metrics.metric_name('test.s1.count', 'grp1')) is not None, \
        'MetricName test.s1.count must be present'
def test_simple_stats(mocker, time_keeper, config, metrics):
    """Exercise the basic stat types (Avg/Max/Min/Rate/Count/Percentiles/Total)
    against a mocked clock and verify their windowed values.

    NOTE: the metric name 'test.occurences' is intentionally kept with its
    original (misspelled) spelling — it is a runtime identifier.
    """
    mocker.patch('time.time', side_effect=time_keeper.time)

    measurable = ConstantMeasurable()

    # a directly-registered (non-sensor) measurable metric
    metrics.add_metric(metrics.metric_name('direct.measurable', 'grp1',
                                           'The fraction of time an appender waits for space allocation.'),
                       measurable)

    # one sensor carrying every simple stat type plus a compound percentile
    sensor = metrics.sensor('test.sensor')
    sensor.add(metrics.metric_name('test.avg', 'grp1'), Avg())
    sensor.add(metrics.metric_name('test.max', 'grp1'), Max())
    sensor.add(metrics.metric_name('test.min', 'grp1'), Min())
    sensor.add(metrics.metric_name('test.rate', 'grp1'), Rate(TimeUnit.SECONDS))
    sensor.add(metrics.metric_name('test.occurences', 'grp1'),
               Rate(TimeUnit.SECONDS, Count()))
    sensor.add(metrics.metric_name('test.count', 'grp1'), Count())
    percentiles = [Percentile(metrics.metric_name('test.median', 'grp1'), 50.0),
                   Percentile(metrics.metric_name('test.perc99_9', 'grp1'), 99.9)]
    sensor.add_compound(Percentiles(100, BucketSizing.CONSTANT, 100, -100,
                                    percentiles=percentiles))

    # a second sensor with a running Total
    sensor2 = metrics.sensor('test.sensor2')
    sensor2.add(metrics.metric_name('s2.total', 'grp1'), Total())
    sensor2.record(5.0)

    sum_val = 0
    count = 10
    for i in range(count):
        sensor.record(i)
        sum_val += i

    # prior to any time passing
    elapsed_secs = (config.time_window_ms * (config.samples - 1)) / 1000.0
    assert abs(count / elapsed_secs -
               metrics.metrics.get(metrics.metric_name('test.occurences', 'grp1')).value()) \
        < EPS, 'Occurrences(0...%d) = %f' % (count, count / elapsed_secs)

    # pretend 2 seconds passed...
    sleep_time_seconds = 2.0
    time_keeper.sleep(sleep_time_seconds)
    elapsed_secs += sleep_time_seconds

    assert abs(5.0 - metrics.metrics.get(metrics.metric_name('s2.total', 'grp1')).value()) \
        < EPS, 's2 reflects the constant value'
    assert abs(4.5 - metrics.metrics.get(metrics.metric_name('test.avg', 'grp1')).value()) \
        < EPS, 'Avg(0...9) = 4.5'
    assert abs((count - 1) - metrics.metrics.get(metrics.metric_name('test.max', 'grp1')).value()) \
        < EPS, 'Max(0...9) = 9'
    assert abs(0.0 - metrics.metrics.get(metrics.metric_name('test.min', 'grp1')).value()) \
        < EPS, 'Min(0...9) = 0'
    assert abs((sum_val / elapsed_secs) -
               metrics.metrics.get(metrics.metric_name('test.rate', 'grp1')).value()) \
        < EPS, 'Rate(0...9) = 1.40625'
    assert abs((count / elapsed_secs) -
               metrics.metrics.get(metrics.metric_name('test.occurences', 'grp1')).value()) \
        < EPS, 'Occurrences(0...%d) = %f' % (count, count / elapsed_secs)
    assert abs(count - metrics.metrics.get(metrics.metric_name('test.count', 'grp1')).value()) \
        < EPS, 'Count(0...9) = 10'
def test_time_windowing(mocker, time_keeper):
    """A Count with short time windows drops samples that age out."""
    mocker.patch('time.time', side_effect=time_keeper.time)
    stat = Count()
    cfg = MetricConfig(time_window_ms=1, samples=2)

    def tick_and_record():
        # advance the mocked clock past one window, then record an event
        time_keeper.sleep(0.001)
        stat.record(cfg, 1.0, time_keeper.ms())

    stat.record(cfg, 1.0, time_keeper.ms())
    tick_and_record()
    # both events still within the two retained samples
    assert 2.0 == stat.measure(cfg, time_keeper.ms())

    # oldest event times out once a third window opens
    tick_and_record()
    assert 2.0 == stat.measure(cfg, time_keeper.ms())
def test_remove_metric(metrics):
    """Metrics removed by name disappear from the registry, which ends up
    back at its original size."""
    initial_size = len(metrics.metrics)
    name1 = metrics.metric_name('test1', 'grp1')
    name2 = metrics.metric_name('test2', 'grp1')
    metrics.add_metric(name1, Count())
    metrics.add_metric(name2, Count())

    # removing the first metric leaves the second untouched
    assert metrics.remove_metric(name1) is not None
    assert metrics.metrics.get(name1) is None
    assert metrics.metrics.get(name2) is not None

    assert metrics.remove_metric(name2) is not None
    assert metrics.metrics.get(name2) is None

    assert initial_size == len(metrics.metrics)
def __init__(self, heartbeat, metrics, prefix, tags=None):
    """Register group-coordinator sensors (heartbeat, join, sync) on `metrics`.

    Arguments:
        heartbeat: heartbeat state object; its `last_send` timestamp feeds
            the 'last-heartbeat-seconds-ago' gauge registered below.
        metrics: the metrics registry to register sensors and metrics on.
        prefix (str): prefix used to build the metric group name.
        tags (dict, optional): tags applied to every metric name.
    """
    self.heartbeat = heartbeat
    self.metrics = metrics
    self.metric_group_name = prefix + "-coordinator-metrics"

    # Round-trip latency / rate of heartbeat requests
    self.heartbeat_latency = metrics.sensor('heartbeat-latency')
    self.heartbeat_latency.add(
        metrics.metric_name(
            'heartbeat-response-time-max', self.metric_group_name,
            'The max time taken to receive a response to a heartbeat request',
            tags), Max())
    self.heartbeat_latency.add(
        metrics.metric_name(
            'heartbeat-rate', self.metric_group_name,
            'The average number of heartbeats per second',
            tags), Rate(sampled_stat=Count()))

    # Time spent in group rejoin round-trips
    self.join_latency = metrics.sensor('join-latency')
    self.join_latency.add(
        metrics.metric_name(
            'join-time-avg', self.metric_group_name,
            'The average time taken for a group rejoin',
            tags), Avg())
    self.join_latency.add(
        metrics.metric_name(
            'join-time-max', self.metric_group_name,
            'The max time taken for a group rejoin',
            tags), Max())
    self.join_latency.add(
        metrics.metric_name(
            'join-rate', self.metric_group_name,
            'The number of group joins per second',
            tags), Rate(sampled_stat=Count()))

    # Time spent in group sync round-trips
    self.sync_latency = metrics.sensor('sync-latency')
    self.sync_latency.add(
        metrics.metric_name(
            'sync-time-avg', self.metric_group_name,
            'The average time taken for a group sync',
            tags), Avg())
    self.sync_latency.add(
        metrics.metric_name(
            'sync-time-max', self.metric_group_name,
            'The max time taken for a group sync',
            tags), Max())
    self.sync_latency.add(
        metrics.metric_name(
            'sync-rate', self.metric_group_name,
            'The number of group syncs per second',
            tags), Rate(sampled_stat=Count()))

    # Gauge: seconds since the last heartbeat was sent, computed lazily
    # from self.heartbeat.last_send each time the metric is measured
    # ('now' is in milliseconds, hence the /1000).
    metrics.add_metric(
        metrics.metric_name(
            'last-heartbeat-seconds-ago', self.metric_group_name,
            'The number of seconds since the last controller heartbeat was sent',
            tags),
        AnonMeasurable(lambda _, now: (now / 1000) - self.heartbeat.last_send))
def test_remove_sensor(metrics):
    """Removing a sensor deletes its metrics, its descendant (child)
    sensors, and its _children_sensors bookkeeping; after all removals the
    registry returns to its original size."""
    size = len(metrics.metrics)
    parent1 = metrics.sensor('test.parent1')
    parent1.add(metrics.metric_name('test.parent1.count', 'grp1'), Count())
    parent2 = metrics.sensor('test.parent2')
    parent2.add(metrics.metric_name('test.parent2.count', 'grp1'), Count())
    child1 = metrics.sensor('test.child1', parents=[parent1, parent2])
    child1.add(metrics.metric_name('test.child1.count', 'grp1'), Count())
    child2 = metrics.sensor('test.child2', parents=[parent2])
    child2.add(metrics.metric_name('test.child2.count', 'grp1'), Count())
    grandchild1 = metrics.sensor('test.gchild2', parents=[child2])
    grandchild1.add(metrics.metric_name('test.gchild2.count', 'grp1'), Count())

    # removing a parent also removes its children (child1 hangs off parent1)
    sensor = metrics.get_sensor('test.parent1')
    assert sensor is not None
    metrics.remove_sensor('test.parent1')
    assert metrics.get_sensor('test.parent1') is None
    assert metrics.metrics.get(
        metrics.metric_name('test.parent1.count', 'grp1')) is None
    assert metrics.get_sensor('test.child1') is None
    assert metrics._children_sensors.get(sensor) is None
    assert metrics.metrics.get(metrics.metric_name('test.child1.count',
                                                   'grp1')) is None

    # a leaf sensor can be removed on its own
    sensor = metrics.get_sensor('test.gchild2')
    assert sensor is not None
    metrics.remove_sensor('test.gchild2')
    assert metrics.get_sensor('test.gchild2') is None
    assert metrics._children_sensors.get(sensor) is None
    assert metrics.metrics.get(
        metrics.metric_name('test.gchild2.count', 'grp1')) is None

    sensor = metrics.get_sensor('test.child2')
    assert sensor is not None
    metrics.remove_sensor('test.child2')
    assert metrics.get_sensor('test.child2') is None
    assert metrics._children_sensors.get(sensor) is None
    assert metrics.metrics.get(metrics.metric_name('test.child2.count',
                                                   'grp1')) is None

    sensor = metrics.get_sensor('test.parent2')
    assert sensor is not None
    metrics.remove_sensor('test.parent2')
    assert metrics.get_sensor('test.parent2') is None
    assert metrics._children_sensors.get(sensor) is None
    assert metrics.metrics.get(
        metrics.metric_name('test.parent2.count',
                            'grp1')) is None

    # registry back to baseline after all removals
    assert size == len(metrics.metrics)
def __init__(self, metrics, prefix):
    """Register fetch-manager sensors (fetch sizes, record rates, latency,
    consumer lag, throttle time) on the given metrics registry.

    Arguments:
        metrics: the metrics registry to register sensors on.
        prefix (str): prefix used to build the metric group name.
    """
    self.metrics = metrics
    self.group_name = '%s-fetch-manager-metrics' % prefix

    # bytes fetched per request / consumed per second
    self.bytes_fetched = metrics.sensor('bytes-fetched')
    self.bytes_fetched.add(metrics.metric_name('fetch-size-avg', self.group_name,
        'The average number of bytes fetched per request'), Avg())
    self.bytes_fetched.add(metrics.metric_name('fetch-size-max', self.group_name,
        'The maximum number of bytes fetched per request'), Max())
    self.bytes_fetched.add(metrics.metric_name('bytes-consumed-rate', self.group_name,
        'The average number of bytes consumed per second'), Rate())

    # records fetched per request / consumed per second
    # (fixed: use the local `metrics` argument like every other sensor
    # here, instead of the redundant self.metrics indirection)
    self.records_fetched = metrics.sensor('records-fetched')
    self.records_fetched.add(metrics.metric_name('records-per-request-avg', self.group_name,
        'The average number of records in each request'), Avg())
    self.records_fetched.add(metrics.metric_name('records-consumed-rate', self.group_name,
        'The average number of records consumed per second'), Rate())

    # fetch request latency and request rate
    self.fetch_latency = metrics.sensor('fetch-latency')
    self.fetch_latency.add(metrics.metric_name('fetch-latency-avg', self.group_name,
        'The average time taken for a fetch request.'), Avg())
    self.fetch_latency.add(metrics.metric_name('fetch-latency-max', self.group_name,
        'The max time taken for any fetch request.'), Max())
    self.fetch_latency.add(metrics.metric_name('fetch-rate', self.group_name,
        'The number of fetch requests per second.'), Rate(sampled_stat=Count()))

    # worst-case per-partition consumer lag
    # (fixed description: 'self window' was a garbled 'this window')
    self.records_fetch_lag = metrics.sensor('records-lag')
    self.records_fetch_lag.add(metrics.metric_name('records-lag-max', self.group_name,
        'The maximum lag in terms of number of records for any partition in this window'), Max())

    # broker-imposed fetch throttling
    self.fetch_throttle_time_sensor = metrics.sensor('fetch-throttle-time')
    self.fetch_throttle_time_sensor.add(metrics.metric_name('fetch-throttle-time-avg', self.group_name,
        'The average throttle time in ms'), Avg())
    self.fetch_throttle_time_sensor.add(metrics.metric_name('fetch-throttle-time-max', self.group_name,
        'The maximum throttle time in ms'), Max())
def __init__(self, metrics, metric_group_prefix, subscription):
    """Register commit-latency sensors and an assigned-partitions gauge."""
    self.metrics = metrics
    self.metric_group_name = '%s-coordinator-metrics' % (
        metric_group_prefix,)

    # commit request latency / rate, registered table-style
    self.commit_latency = metrics.sensor('commit-latency')
    commit_stats = (
        ('commit-latency-avg',
         'The average time taken for a commit request', Avg()),
        ('commit-latency-max',
         'The max time taken for a commit request', Max()),
        ('commit-rate',
         'The number of commit calls per second', Rate(sampled_stat=Count())),
    )
    for stat_name, description, stat in commit_stats:
        self.commit_latency.add(
            metrics.metric_name(stat_name, self.metric_group_name,
                                description),
            stat)

    # gauge: partitions currently assigned, evaluated on each measurement
    num_parts = AnonMeasurable(
        lambda config, now: len(subscription.assigned_partitions()))
    metrics.add_metric(
        metrics.metric_name(
            'assigned-partitions', self.metric_group_name,
            'The number of partitions currently assigned to this consumer'
        ), num_parts)
def test_event_windowing(mocker, time_keeper):
    """With event_window=1 and samples=2, only the two newest events count."""
    mocker.patch('time.time', side_effect=time_keeper.time)
    stat = Count()
    cfg = MetricConfig(event_window=1, samples=2)

    # two events fill both samples
    for _ in range(2):
        stat.record(cfg, 1.0, time_keeper.ms())
    assert 2.0 == stat.measure(cfg, time_keeper.ms())

    # first event times out when a third arrives
    stat.record(cfg, 1.0, time_keeper.ms())
    assert 2.0 == stat.measure(cfg, time_keeper.ms())
def test_old_data_has_no_effect(mocker, time_keeper):
    """After every sample window expires, each stat reverts to its identity
    value (-inf for Max, maxsize for Min, 0 for Avg/Count)."""
    mocker.patch('time.time', side_effect=time_keeper.time)
    window_ms = 100
    samples = 2
    config = MetricConfig(time_window_ms=window_ms, samples=samples)

    # record the same value into one instance of each stat type
    stats = {'max': Max(), 'min': Min(), 'avg': Avg(), 'count': Count()}
    for stat in stats.values():
        stat.record(config, 50, time_keeper.ms())

    # advance past the full sample horizon so every recorded value expires
    time_keeper.sleep(samples * window_ms / 1000.0)

    now_ms = time_keeper.ms()
    assert float('-inf') == stats['max'].measure(config, now_ms)
    assert float(sys.maxsize) == stats['min'].measure(config, now_ms)
    assert 0.0 == stats['avg'].measure(config, now_ms)
    assert 0 == stats['count'].measure(config, now_ms)
def __init__(self, metrics, metric_group_prefix, conns):
    """Register selector/network I/O sensors on `metrics`.

    Arguments:
        metrics: the metrics registry to register sensors on.
        metric_group_prefix (str): prefix for the metric group name.
        conns: connection collection; its len() backs the
            'connection-count' gauge below.
    """
    self.metrics = metrics
    self.metric_group_name = metric_group_prefix + '-metrics'

    self.connection_closed = metrics.sensor('connections-closed')
    self.connection_closed.add(
        metrics.metric_name(
            'connection-close-rate', self.metric_group_name,
            'Connections closed per second in the window.'), Rate())

    self.connection_created = metrics.sensor('connections-created')
    self.connection_created.add(
        metrics.metric_name(
            'connection-creation-rate', self.metric_group_name,
            'New connections established per second in the window.'), Rate())

    # select() loop statistics: poll rate and time spent waiting for I/O
    self.select_time = metrics.sensor('select-time')
    self.select_time.add(
        metrics.metric_name(
            'select-rate', self.metric_group_name,
            'Number of times the I/O layer checked for new I/O to perform per'
            ' second'), Rate(sampled_stat=Count()))
    self.select_time.add(
        metrics.metric_name(
            'io-wait-time-ns-avg', self.metric_group_name,
            'The average length of time the I/O thread spent waiting for a'
            ' socket ready for reads or writes in nanoseconds.'), Avg())
    self.select_time.add(
        metrics.metric_name(
            'io-wait-ratio', self.metric_group_name,
            'The fraction of time the I/O thread spent waiting.'),
        Rate(time_unit=TimeUnit.NANOSECONDS))

    # time actually spent performing I/O per select call
    self.io_time = metrics.sensor('io-time')
    self.io_time.add(
        metrics.metric_name(
            'io-time-ns-avg', self.metric_group_name,
            'The average length of time for I/O per select call in nanoseconds.'
        ), Avg())
    self.io_time.add(
        metrics.metric_name(
            'io-ratio', self.metric_group_name,
            'The fraction of time the I/O thread spent doing I/O'),
        Rate(time_unit=TimeUnit.NANOSECONDS))

    # gauge: current number of active connections
    metrics.add_metric(
        metrics.metric_name('connection-count', self.metric_group_name,
                            'The current number of active connections.'),
        AnonMeasurable(lambda config, now: len(conns)))
def __init__(self, metrics, metric_group_prefix, node_id): self.metrics = metrics # Any broker may have registered summary metrics already # but if not, we need to create them so we can set as parents below all_conns_transferred = metrics.get_sensor('bytes-sent-received') if not all_conns_transferred: metric_group_name = metric_group_prefix + '-metrics' bytes_transferred = metrics.sensor('bytes-sent-received') bytes_transferred.add(metrics.metric_name( 'network-io-rate', metric_group_name, 'The average number of network operations (reads or writes) on all' ' connections per second.'), Rate(sampled_stat=Count())) bytes_sent = metrics.sensor('bytes-sent', parents=[bytes_transferred]) bytes_sent.add(metrics.metric_name( 'outgoing-byte-rate', metric_group_name, 'The average number of outgoing bytes sent per second to all' ' servers.'), Rate()) bytes_sent.add(metrics.metric_name( 'request-rate', metric_group_name, 'The average number of requests sent per second.'), Rate(sampled_stat=Count())) bytes_sent.add(metrics.metric_name( 'request-size-avg', metric_group_name, 'The average size of all requests in the window.'), Avg()) bytes_sent.add(metrics.metric_name( 'request-size-max', metric_group_name, 'The maximum size of any request sent in the window.'), Max()) bytes_received = metrics.sensor('bytes-received', parents=[bytes_transferred]) bytes_received.add(metrics.metric_name( 'incoming-byte-rate', metric_group_name, 'Bytes/second read off all sockets'), Rate()) bytes_received.add(metrics.metric_name( 'response-rate', metric_group_name, 'Responses received sent per second.'), Rate(sampled_stat=Count())) request_latency = metrics.sensor('request-latency') request_latency.add(metrics.metric_name( 'request-latency-avg', metric_group_name, 'The average request latency in ms.'), Avg()) request_latency.add(metrics.metric_name( 'request-latency-max', metric_group_name, 'The maximum request latency in ms.'), Max()) # if one sensor of the metrics has been registered for the 
connection, # then all other sensors should have been registered; and vice versa node_str = 'node-{0}'.format(node_id) node_sensor = metrics.get_sensor(node_str + '.bytes-sent') if not node_sensor: metric_group_name = metric_group_prefix + '-node-metrics.' + node_str bytes_sent = metrics.sensor( node_str + '.bytes-sent', parents=[metrics.get_sensor('bytes-sent')]) bytes_sent.add(metrics.metric_name( 'outgoing-byte-rate', metric_group_name, 'The average number of outgoing bytes sent per second.'), Rate()) bytes_sent.add(metrics.metric_name( 'request-rate', metric_group_name, 'The average number of requests sent per second.'), Rate(sampled_stat=Count())) bytes_sent.add(metrics.metric_name( 'request-size-avg', metric_group_name, 'The average size of all requests in the window.'), Avg()) bytes_sent.add(metrics.metric_name( 'request-size-max', metric_group_name, 'The maximum size of any request sent in the window.'), Max()) bytes_received = metrics.sensor( node_str + '.bytes-received', parents=[metrics.get_sensor('bytes-received')]) bytes_received.add(metrics.metric_name( 'incoming-byte-rate', metric_group_name, 'Bytes/second read off node-connection socket'), Rate()) bytes_received.add(metrics.metric_name( 'response-rate', metric_group_name, 'The average number of responses received per second.'), Rate(sampled_stat=Count())) request_time = metrics.sensor( node_str + '.latency', parents=[metrics.get_sensor('request-latency')]) request_time.add(metrics.metric_name( 'request-latency-avg', metric_group_name, 'The average request latency in ms.'), Avg()) request_time.add(metrics.metric_name( 'request-latency-max', metric_group_name, 'The maximum request latency in ms.'), Max()) self.bytes_sent = metrics.sensor(node_str + '.bytes-sent') self.bytes_received = metrics.sensor(node_str + '.bytes-received') self.request_time = metrics.sensor(node_str + '.latency')
def __init__(self, metrics, metric_group_prefix, conns):
    """Register selector/network I/O sensors on `metrics`.

    Arguments:
        metrics: the metrics registry to register sensors on.
        metric_group_prefix (str): prefix for the metric group name.
        conns: connection collection; its len() backs the
            "connection-count" gauge below.
    """
    self.metrics = metrics
    self.metric_group_name = metric_group_prefix + "-metrics"

    self.connection_closed = metrics.sensor("connections-closed")
    self.connection_closed.add(
        metrics.metric_name(
            "connection-close-rate",
            self.metric_group_name,
            "Connections closed per second in the window.",
        ),
        Rate(),
    )

    self.connection_created = metrics.sensor("connections-created")
    self.connection_created.add(
        metrics.metric_name(
            "connection-creation-rate",
            self.metric_group_name,
            "New connections established per second in the window.",
        ),
        Rate(),
    )

    # select() loop statistics: poll rate and time spent waiting for I/O
    self.select_time = metrics.sensor("select-time")
    self.select_time.add(
        metrics.metric_name(
            "select-rate",
            self.metric_group_name,
            "Number of times the I/O layer checked for new I/O to perform per"
            " second",
        ),
        Rate(sampled_stat=Count()),
    )
    self.select_time.add(
        metrics.metric_name(
            "io-wait-time-ns-avg",
            self.metric_group_name,
            "The average length of time the I/O thread spent waiting for a"
            " socket ready for reads or writes in nanoseconds.",
        ),
        Avg(),
    )
    self.select_time.add(
        metrics.metric_name(
            "io-wait-ratio",
            self.metric_group_name,
            "The fraction of time the I/O thread spent waiting.",
        ),
        Rate(time_unit=TimeUnit.NANOSECONDS),
    )

    # time actually spent performing I/O per select call
    self.io_time = metrics.sensor("io-time")
    self.io_time.add(
        metrics.metric_name(
            "io-time-ns-avg",
            self.metric_group_name,
            "The average length of time for I/O per select call in nanoseconds.",
        ),
        Avg(),
    )
    self.io_time.add(
        metrics.metric_name(
            "io-ratio",
            self.metric_group_name,
            "The fraction of time the I/O thread spent doing I/O",
        ),
        Rate(time_unit=TimeUnit.NANOSECONDS),
    )

    # gauge: current number of active connections
    metrics.add_metric(
        metrics.metric_name(
            "connection-count",
            self.metric_group_name,
            "The current number of active connections.",
        ),
        AnonMeasurable(lambda config, now: len(conns)),
    )