def api_call_latency(title, metric, verb, scope, threshold):
    """Build a latency graph panel with a constant threshold line.

    The returned ``d.Graph`` carries two targets, in this order:

    1. ``str(threshold)`` used directly as the expression, with the
       legend ``"threshold"``.
    2. A ``quantile_over_time(0.99, ...[12h])`` expression over the
       ``quantile="0.99"`` series of ``metric``, filtered by ``verb`` and
       ``scope``. This target has no explicit legend format.

    The y axis is formatted with ``g.SECONDS_FORMAT``.

    Args:
        title: Panel title, passed through to ``d.Graph``.
        metric: Metric name interpolated into the query expression.
        verb: Value interpolated into the ``verb=~"..."`` label matcher.
        scope: Value interpolated into the ``scope=~"..."`` label matcher.
        threshold: Value whose ``str()`` form becomes the threshold
            target's expression.

    Returns:
        The object produced by ``d.Graph(...)``.
    """
    # Constant expression, drawn alongside the measured latency.
    threshold_target = d.Target(expr=str(threshold), legendFormat="threshold")

    query_params = {"metric": metric, "verb": verb, "scope": scope}
    latency_expr = (
        'quantile_over_time(0.99, %(metric)s{quantile="0.99", verb=~"%(verb)s", scope=~"%(scope)s"}[12h])'
        % query_params
    )
    latency_target = d.Target(expr=latency_expr)

    return d.Graph(
        title=title,
        targets=[threshold_target, latency_target],
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    )
expr= 'sum(rate(probes_in_cluster_dns_lookup_count{namespace="probes", job="dns"}[1m]))', legendFormat="lookup rate", ), g.Target( expr= 'sum(rate(probes_in_cluster_network_latency_error{namespace="probes", job="dns"}[1m]))', legendFormat="error rate", ), ], ), d.Graph( title="probe: # running", targets=[ d.Target( expr= 'count(container_memory_usage_bytes{namespace="probes", container="dns"}) by (container, namespace)' ) ], nullPointMode="null", ), d.Graph( title="probe: memory usage", targets=d.min_max_avg( base='process_resident_memory_bytes{namespace="probes", job="dns"}', by=["job", "namespace"], legend="{{job}}", ), nullPointMode="null", yAxes=g.single_y_axis(format=g.BYTES_FORMAT), ), ]
"fs bytes writes by container", "sum(rate(container_fs_writes_bytes_total[1m])) by (container, instance)", legend="{{instance}}: {{container}}", yAxes=g.single_y_axis(format=g.BYTES_FORMAT), ), d.simple_graph( "fs writes by container", "sum(rate(container_fs_writes_total[1m])) by (container, instance)", legend="{{instance}}: {{container}}", ), d.Graph( title="CPU usage by container", targets=[ d.Target( expr= 'sum(rate(container_cpu_usage_seconds_total{container!=""}[1m])) by (container, instance)', legendFormat="{{instance}}: {{container}}", ), d.Target(expr="machine_cpu_cores", legendFormat="limit"), ], ), d.Graph( title="memory usage by container", targets=[ d.Target( expr= 'sum(container_memory_usage_bytes{container!=""}) by (container, instance)', legendFormat="{{instance}}: {{container}}", ), d.Target(expr="machine_memory_bytes", legendFormat="limit"), ],
NETWORK_LATENCY_PANEL = [ d.Graph( title="Network latency", targets=d.show_quantiles( 'probes:in_cluster_network_latency:histogram_quantile{{quantile="{quantile}"}}', legend="{{quantile}}", ), yAxes=g.single_y_axis(format=g.SECONDS_FORMAT), nullPointMode="null", ), d.Graph( title="probes: ping rate", targets=[ d.Target( expr='sum(rate(probes_in_cluster_network_latency_ping_count{namespace="probes", job="ping-client"}[1m])) by (job)', legendFormat="rate", ), d.Target( expr='sum(rate(probes_in_cluster_network_latency_error{namespace="probes", job="ping-client"}[1m])) by (job)', legendFormat="error rate", ), ], nullPointMode="null", ), d.Graph( title="probe: # running", targets=[ d.TargetWithInterval( expr='count(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container, namespace)' ) ],
g.Target( expr='sum(rate(probes_in_cluster_network_latency_ping_count{namespace="probes", job="ping-client"}[1m])) by (job)', legendFormat="rate", ), g.Target( expr='sum(rate(probes_in_cluster_network_latency_error{namespace="probes", job="ping-client"}[1m])) by (job)', legendFormat="error rate", ), ], nullPointMode="null", ), d.Graph( title="probe: # running", targets=[ d.Target( expr='count(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container, namespace)' ) ], nullPointMode="null", ), d.Graph( title="probes: memory usage", targets=[ g.Target( expr='min(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container)', legendFormat="min {{container}}", ), g.Target( expr='avg(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container)', legendFormat="avg {{container}}", ),
), d.Graph( title="DNS latency", targets=d.show_quantiles( 'probes:dns_lookup_latency:histogram_quantile{{quantile="{quantile}"}}', legend="{{quantile}}", ), yAxes=g.single_y_axis(format=g.SECONDS_FORMAT), nullPointMode="null", ), d.Graph( title="probe: lookup rate", targets=[ d.Target( expr= 'sum(rate(probes_in_cluster_dns_lookup_count{namespace="probes", job="dns"}[1m]))', legendFormat="lookup rate", ), d.Target( expr= 'sum(rate(probes_in_cluster_network_latency_error{namespace="probes", job="dns"}[1m]))', legendFormat="error rate", ), ], ), d.Graph( title="probe: # running", targets=[ d.TargetWithInterval( expr= 'count(container_memory_usage_bytes{namespace="probes", container="dns"}) by (container, namespace)'