d.Graph(
    title="coredns: # running",
    # Counts the container_memory_usage_bytes series reported for the
    # "coredns" container in kube-system, grouped by container and namespace.
    targets=[
        d.Target(
            expr=
            'count(container_memory_usage_bytes{namespace="kube-system", container="coredns"}) by (container, namespace)'
        )
    ],
    nullPointMode="null",
),
d.Graph(
    title="coredns: memory usage",
    # NOTE(review): d.min_max_avg is defined outside this view; from its name
    # and arguments it presumably emits min/max/avg targets of `base`
    # aggregated by the `by` labels -- confirm against the helper.
    targets=d.min_max_avg(
        base=
        'process_resident_memory_bytes{namespace="kube-system", job="kube-dns"}',
        by=["job", "namespace"],
        legend="{{job}}",
    ),
    nullPointMode="null",
    yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
),
]

# The final dashboard must be named 'dashboard' so that grafanalib will find it.
# It has two rows: the in-cluster DNS latency panels and the CoreDNS panels.
# NOTE(review): auto_panel_ids() is presumably grafanalib's helper that assigns
# ids to panels lacking one -- confirm against d.Dashboard.
dashboard = d.Dashboard(
    title="DNS",
    rows=[
        d.Row(title="In-cluster DNS latency", panels=DNS_LATENCY_PANEL),
        d.Row(title="CoreDNS", panels=COREDNS_PANELS),
    ],
).auto_panel_ids()
g.Target( expr= "sum(rate(node_netstat_Tcp_RetransSegs[1m])) by (instance)", legendFormat="RetransSegs {{instance}}", ), ], yAxes=g.single_y_axis(format=g.SHORT_FORMAT, logBase=10), ), ] # The final dashboard must be named 'dashboard' so that grafanalib will find it. dashboard = d.Dashboard( title="Master dashboard", refresh="", rows=[ d.Row(title="Clusterloader", panels=CLUSTERLOADER_PANELS), d.Row(title="Overall cluster health", panels=HEALTH_PANELS, collapse=True), d.Row(title="etcd", panels=ETCD_PANELS, collapse=True), d.Row(title="kube-apiserver", panels=APISERVER_PANELS, collapse=True), d.Row( title="kube-controller-manager", panels=[ d.simple_graph( "Workqueue depths", 'workqueue_depth{endpoint="kube-controller-manager"}', legend="{{name}}", ) ], collapse=True,
d.Graph(
    title="Service: # running",
    # Counts the process_resident_memory_bytes series reported by the
    # "kube-dns" job in kube-system, grouped by job and namespace.
    targets=[
        d.Target(
            expr=
            'count(process_resident_memory_bytes{namespace="kube-system", job="kube-dns"}) by (job, namespace)'
        )
    ],
    nullPointMode="null",
),
d.Graph(
    title="Service: memory usage",
    # Same metric and label selector as the "# running" panel above.
    # NOTE(review): d.min_max_avg is defined outside this view; from its name
    # and arguments it presumably emits min/max/avg targets of `base`
    # aggregated by the `by` labels -- confirm against the helper.
    targets=d.min_max_avg(
        base=
        'process_resident_memory_bytes{namespace="kube-system", job="kube-dns"}',
        by=["job", "namespace"],
        legend="{{job}}",
    ),
    nullPointMode="null",
    yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
),
]

# The final dashboard must be named 'dashboard' so that grafanalib will find it.
# It has two rows: the in-cluster DNS prober panels and the DNS service panels.
# NOTE(review): auto_panel_ids() is presumably grafanalib's helper that assigns
# ids to panels lacking one -- confirm against d.Dashboard.
dashboard = d.Dashboard(
    title="DNS",
    rows=[
        d.Row(title="In-cluster DNS prober", panels=PROBER_PANEL),
        d.Row(title="In-cluster DNS service", panels=SERVICE_PANELS),
    ],
).auto_panel_ids()
), g.Target( expr= "sum(rate(node_netstat_Tcp_RetransSegs[1m])) by (instance)", legendFormat="RetransSegs {{instance}}", ), ], yAxes=g.single_y_axis(format=g.SHORT_FORMAT, logBase=10), ), ] # The final dashboard must be named 'dashboard' so that grafanalib will find it. dashboard = d.Dashboard( title="Master dashboard", rows=[ d.Row(title="Clusterloader", panels=CLUSTERLOADER_PANELS), d.Row(title="Overall cluster health", panels=HEALTH_PANELS), d.Row(title="etcd", panels=ETCD_PANELS), d.Row(title="kube-apiserver", panels=APISERVER_PANELS), d.Row( title="kube-controller-manager", panels=[ d.simple_graph( "Workqueue depths", 'workqueue_depth{endpoint="kube-controller-manager"}', legend="{{name}}", ) ], ), d.Row(title="Master VM", panels=VM_PANELS), d.Row(
# secondary panel # same criteria, different data source and starting point panel.title = "[SECONDARY] " + panel.title panel.dataSource = "$secondary_source" panel.timeShift = "$timeshift" extended_panels.append(panel) return extended_panels dashboard = d.Dashboard( title="Comparison Master dashboard", refresh="", rows=[ d.Row(title="API call latency", panels=extended_copy(API_CALL_LATENCY_PANELS)), d.Row(title="API call latency aggregated with quantile", panels=extended_copy(QUANTILE_API_CALL_LATENCY_PANELS), collapse=True), d.Row(title="P&F metrics", panels=extended_copy(PAF_PANELS), collapse=True), d.Row(title="Overall cluster health", panels=extended_copy(HEALTH_PANELS), collapse=True), d.Row(title="etcd", panels=extended_copy(ETCD_PANELS), collapse=True), d.Row(title="kube-apiserver", panels=extended_copy(APISERVER_PANELS), collapse=True), d.Row(title="kube-controller-manager", panels=extended_copy(CONTROLLER_MANAGER_PANELS), collapse=True), d.Row(title="Master VM", panels=extended_copy(VM_PANELS), collapse=True), ], templating=g.Templating( list=[ d.SOURCE_TEMPLATE, g.Template( name="secondary_source", type="datasource", query="prometheus",
g.Target( expr= "sum(rate(node_netstat_Tcp_RetransSegs[1m])) by (instance)", legendFormat="RetransSegs {{instance}}", ), ], yAxes=g.single_y_axis(format=g.SHORT_FORMAT, logBase=10), ), ] # The final dashboard must be named 'dashboard' so that grafanalib will find it. dashboard = d.Dashboard( title="Master dashboard", refresh="", rows=[ d.Row(title="API call latency", panels=API_CALL_LATENCY_PANELS), d.Row(title="API call latency aggregated with quantile", panels=QUANTILE_API_CALL_LATENCY_PANELS, collapse=True), d.Row(title="Overall cluster health", panels=HEALTH_PANELS, collapse=True), d.Row(title="etcd", panels=ETCD_PANELS, collapse=True), d.Row(title="kube-apiserver", panels=APISERVER_PANELS, collapse=True), d.Row( title="kube-controller-manager", panels=[ d.simple_graph( "Workqueue depths", 'workqueue_depth{endpoint="kube-controller-manager"}', legend="{{name}}",
api_call_latency(
    # Cluster-scoped LIST calls; the numeric threshold matches the 30s
    # quoted in the panel title.
    title="Read-only API call latency (scope=cluster, threshold=30s)",
    metric=metric,
    verb="LIST",
    scope="cluster",
    threshold=30,
),
api_call_latency(
    # All mutating verbs at either scope; the numeric threshold matches the
    # 1s quoted in the panel title.
    title="Mutating API call latency (threshold=1s)",
    metric=metric,
    verb=d.any_of("CREATE", "DELETE", "PATCH", "POST", "PUT"),
    scope=d.any_of("namespace", "cluster"),
    threshold=1,
),
]

# The final dashboard must be named 'dashboard' so that grafanalib will find it.
# The first row calls create_slo_panel() with no arguments, so it uses that
# function's default metric (declared outside this view); the second row
# builds the same panels from the 1m-window recording of the latency metric.
dashboard = d.Dashboard(
    title="SLO",
    rows=[
        d.Row(title="SLO", panels=create_slo_panel()),
        d.Row(
            title="Experimental: SLO (window 1m)",
            panels=create_slo_panel(
                metric="apiserver:apiserver_request_latency_1m:histogram_quantile"
            ),
        ),
    ],
).auto_panel_ids()
nullPointMode="null", ), d.Graph( title="probes: memory usage", targets=[ d.Target( expr='min(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container)', legendFormat="min {{container}}", ), d.Target( expr='avg(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container)', legendFormat="avg {{container}}", ), d.Target( expr='max(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container)', legendFormat="max {{container}}", ), ], nullPointMode="null", ), ] dashboard = d.Dashboard( title="Network", rows=[ d.Row(title="Network progamming latency", panels=NETWORK_PROGRAMMING_PANEL), d.Row(title="In-cluster network latency", panels=NETWORK_LATENCY_PANEL), ], ).auto_panel_ids()