Ejemplo n.º 1
0
    d.Graph(
        title="coredns: # running",
        targets=[
            d.Target(
                expr=
                'count(container_memory_usage_bytes{namespace="kube-system", container="coredns"}) by (container, namespace)'
            )
        ],
        nullPointMode="null",
    ),
    d.Graph(
        title="coredns: memory usage",
        targets=d.min_max_avg(
            base=
            'process_resident_memory_bytes{namespace="kube-system", job="kube-dns"}',
            by=["job", "namespace"],
            legend="{{job}}",
        ),
        nullPointMode="null",
        yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
    ),
]

# DNS dashboard: the first row shows the in-cluster DNS latency panels, the
# second row shows the CoreDNS panels. auto_panel_ids() is called on the
# assembled dashboard before it is bound to the module-level name.
dashboard = d.Dashboard(
    title="DNS",
    rows=[
        d.Row(
            title="In-cluster DNS latency",
            panels=DNS_LATENCY_PANEL,
        ),
        d.Row(
            title="CoreDNS",
            panels=COREDNS_PANELS,
        ),
    ],
).auto_panel_ids()
Ejemplo n.º 2
0
        ],
        yAxes=g.single_y_axis(format=g.SHORT_FORMAT, logBase=10),
    ),
]

# grafanalib locates the result by name, so the final dashboard object must
# be bound to the module-level variable 'dashboard'.
#
# Only the first row ("Clusterloader") is created without collapse=True;
# every other row passes collapse=True.
dashboard = d.Dashboard(
    title="Master dashboard",
    refresh="",
    rows=[
        d.Row(
            title="Clusterloader",
            panels=CLUSTERLOADER_PANELS,
        ),
        d.Row(
            title="Overall cluster health",
            panels=HEALTH_PANELS,
            collapse=True,
        ),
        d.Row(
            title="etcd",
            panels=ETCD_PANELS,
            collapse=True,
        ),
        d.Row(
            title="kube-apiserver",
            panels=APISERVER_PANELS,
            collapse=True,
        ),
        # The controller-manager row has a single inline panel rather than a
        # shared panel list.
        d.Row(
            title="kube-controller-manager",
            panels=[
                d.simple_graph(
                    "Workqueue depths",
                    'workqueue_depth{endpoint="kube-controller-manager"}',
                    legend="{{name}}",
                ),
            ],
            collapse=True,
        ),
        d.Row(
            title="Master VM",
            panels=VM_PANELS,
            collapse=True,
        ),
    ],
).auto_panel_ids()
Ejemplo n.º 3
0
    d.Graph(
        title="Service: # running",
        targets=[
            d.Target(
                expr=
                'count(process_resident_memory_bytes{namespace="kube-system", job="kube-dns"}) by (job, namespace)'
            )
        ],
        nullPointMode="null",
    ),
    d.Graph(
        title="Service: memory usage",
        targets=d.min_max_avg(
            base=
            'process_resident_memory_bytes{namespace="kube-system", job="kube-dns"}',
            by=["job", "namespace"],
            legend="{{job}}",
        ),
        nullPointMode="null",
        yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
    ),
]

# DNS dashboard with two rows: the in-cluster DNS prober panels followed by
# the in-cluster DNS service panels. auto_panel_ids() is called on the
# assembled dashboard.
dashboard = d.Dashboard(
    title="DNS",
    rows=[
        d.Row(
            title="In-cluster DNS prober",
            panels=PROBER_PANEL,
        ),
        d.Row(
            title="In-cluster DNS service",
            panels=SERVICE_PANELS,
        ),
    ],
).auto_panel_ids()
# Master dashboard. Rows are listed in display order; the last row ("Addons")
# defines its single CoreDNS memory graph inline.
dashboard = d.Dashboard(
    title="Master dashboard",
    rows=[
        d.Row(title="Clusterloader", panels=CLUSTERLOADER_PANELS),
        d.Row(title="Overall cluster health", panels=HEALTH_PANELS),
        d.Row(title="etcd", panels=ETCD_PANELS),
        d.Row(title="kube-apiserver", panels=APISERVER_PANELS),
        d.Row(
            title="kube-controller-manager",
            panels=[
                d.simple_graph(
                    "Workqueue depths",
                    'workqueue_depth{endpoint="kube-controller-manager"}',
                    legend="{{name}}",
                )
            ],
        ),
        d.Row(title="Master VM", panels=VM_PANELS),
        d.Row(
            title="Addons",
            panels=[
                # One series per quantile (1, 0.99, 0.90, 0.50) taken over the
                # per-pod sum of process_resident_memory_bytes for the
                # kube-dns job. Every legend uses the same
                # "coredns-mem-<N>pctl" pattern; the 99/90/50 series used to
                # be labelled "...ctl" (missing the "p").
                d.Graph(
                    title="Coredns memory",
                    dataSource="$source",
                    targets=[
                        g.Target(
                            expr=
                            'quantile(1, sum(process_resident_memory_bytes{job="kube-dns"}) by (pod))',
                            legendFormat="coredns-mem-100pctl",
                        ),
                        g.Target(
                            expr=
                            'quantile(0.99, sum(process_resident_memory_bytes{job="kube-dns"}) by (pod))',
                            legendFormat="coredns-mem-99pctl",
                        ),
                        g.Target(
                            expr=
                            'quantile(0.90, sum(process_resident_memory_bytes{job="kube-dns"}) by (pod))',
                            legendFormat="coredns-mem-90pctl",
                        ),
                        g.Target(
                            expr=
                            'quantile(0.50, sum(process_resident_memory_bytes{job="kube-dns"}) by (pod))',
                            legendFormat="coredns-mem-50pctl",
                        ),
                    ],
                    # Y axis is formatted with the bytes unit constant.
                    yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
                )
            ],
        ),
    ],
).auto_panel_ids()
# Comparison dashboard. Every row wraps a shared panel list in
# extended_copy(); only the first row is created without collapse=True.
dashboard = d.Dashboard(
    title="Comparison Master dashboard",
    refresh="",
    rows=[
        d.Row(
            title="API call latency",
            panels=extended_copy(API_CALL_LATENCY_PANELS),
        ),
        d.Row(
            title="API call latency aggregated with quantile",
            panels=extended_copy(QUANTILE_API_CALL_LATENCY_PANELS),
            collapse=True,
        ),
        d.Row(
            title="P&F metrics",
            panels=extended_copy(PAF_PANELS),
            collapse=True,
        ),
        d.Row(
            title="Overall cluster health",
            panels=extended_copy(HEALTH_PANELS),
            collapse=True,
        ),
        d.Row(
            title="etcd",
            panels=extended_copy(ETCD_PANELS),
            collapse=True,
        ),
        d.Row(
            title="kube-apiserver",
            panels=extended_copy(APISERVER_PANELS),
            collapse=True,
        ),
        d.Row(
            title="kube-controller-manager",
            panels=extended_copy(CONTROLLER_MANAGER_PANELS),
            collapse=True,
        ),
        d.Row(
            title="Master VM",
            panels=extended_copy(VM_PANELS),
            collapse=True,
        ),
    ],
    templating=g.Templating(list=[
        # Primary data source template shared with the other dashboards.
        d.SOURCE_TEMPLATE,
        # Second data source variable, selected from "prometheus" sources.
        g.Template(
            name="secondary_source",
            type="datasource",
            query="prometheus",
        ),
        # Interval-type variable with an empty query string.
        g.Template(
            name="timeshift",
            type="interval",
            query="",
        ),
        # The remaining variables are all multi-select query templates
        # (with an "All" option) filled from label_values() on $source.
        g.Template(
            name="etcd_type",
            type="query",
            dataSource="$source",
            regex=r"\*\[+\]+(.*)",
            query=
            "label_values(etcd_request_duration_seconds_count, type)",
            multi=True,
            includeAll=True,
            refresh=g.REFRESH_ON_TIME_RANGE_CHANGE,
        ),
        g.Template(
            name="etcd_operation",
            type="query",
            dataSource="$source",
            query=
            "label_values(etcd_request_duration_seconds_count, operation)",
            multi=True,
            includeAll=True,
            refresh=g.REFRESH_ON_TIME_RANGE_CHANGE,
        ),
        g.Template(
            name="verb",
            type="query",
            dataSource="$source",
            query=
            "label_values(apiserver_request_duration_seconds_count, verb)",
            multi=True,
            includeAll=True,
            refresh=g.REFRESH_ON_TIME_RANGE_CHANGE,
        ),
        g.Template(
            name="resource",
            type="query",
            dataSource="$source",
            regex="(.*)s",
            query=
            "label_values(apiserver_request_duration_seconds_count, resource)",
            multi=True,
            includeAll=True,
            refresh=g.REFRESH_ON_TIME_RANGE_CHANGE,
        ),
    ]),
).auto_panel_ids()
Ejemplo n.º 6
0
# Master dashboard with template variables. Only the first row is created
# without collapse=True; the kube-controller-manager row defines its single
# panel inline.
dashboard = d.Dashboard(
    title="Master dashboard",
    refresh="",
    rows=[
        d.Row(
            title="API call latency",
            panels=API_CALL_LATENCY_PANELS,
        ),
        d.Row(
            title="API call latency aggregated with quantile",
            panels=QUANTILE_API_CALL_LATENCY_PANELS,
            collapse=True,
        ),
        d.Row(
            title="Overall cluster health",
            panels=HEALTH_PANELS,
            collapse=True,
        ),
        d.Row(
            title="etcd",
            panels=ETCD_PANELS,
            collapse=True,
        ),
        d.Row(
            title="kube-apiserver",
            panels=APISERVER_PANELS,
            collapse=True,
        ),
        d.Row(
            title="kube-controller-manager",
            panels=[
                d.simple_graph(
                    "Workqueue depths",
                    'workqueue_depth{endpoint="kube-controller-manager"}',
                    legend="{{name}}",
                ),
            ],
            collapse=True,
        ),
        d.Row(
            title="Master VM",
            panels=VM_PANELS,
            collapse=True,
        ),
    ],
    templating=g.Templating(
        list=[
            # Data source template shared with the other dashboards.
            d.SOURCE_TEMPLATE,
            # Each variable below is a multi-select query template (with an
            # "All" option) filled from label_values() on $source.
            g.Template(
                name="etcd_type",
                type="query",
                dataSource="$source",
                regex=r"\*\[+\]+(.*)",
                query="label_values(etcd_request_duration_seconds_count, type)",
                multi=True,
                includeAll=True,
                refresh=g.REFRESH_ON_TIME_RANGE_CHANGE,
            ),
            g.Template(
                name="etcd_operation",
                type="query",
                dataSource="$source",
                query="label_values(etcd_request_duration_seconds_count, operation)",
                multi=True,
                includeAll=True,
                refresh=g.REFRESH_ON_TIME_RANGE_CHANGE,
            ),
            g.Template(
                name="verb",
                type="query",
                dataSource="$source",
                query="label_values(apiserver_request_duration_seconds_count, verb)",
                multi=True,
                includeAll=True,
                refresh=g.REFRESH_ON_TIME_RANGE_CHANGE,
            ),
            g.Template(
                name="resource",
                type="query",
                dataSource="$source",
                regex="(.*)s",
                query="label_values(apiserver_request_duration_seconds_count, resource)",
                multi=True,
                includeAll=True,
                refresh=g.REFRESH_ON_TIME_RANGE_CHANGE,
            ),
        ]
    ),
).auto_panel_ids()
Ejemplo n.º 7
0
        api_call_latency(
            title="Read-only API call latency (scope=cluster, threshold=30s)",
            metric=metric,
            verb="LIST",
            scope="cluster",
            threshold=30,
        ),
        api_call_latency(
            title="Mutating API call latency (threshold=1s)",
            metric=metric,
            verb=d.any_of("CREATE", "DELETE", "PATCH", "POST", "PUT"),
            scope=d.any_of("namespace", "cluster"),
            threshold=1,
        ),
    ]


# grafanalib locates the result by name, so the final dashboard object must
# be bound to the module-level variable 'dashboard'.
#
# Two rows are built from create_slo_panel(): the first with its default
# arguments, the second with an explicit 1m-window metric name.
dashboard = d.Dashboard(
    title="SLO",
    rows=[
        d.Row(
            title="SLO",
            panels=create_slo_panel(),
        ),
        d.Row(
            title="Experimental: SLO (window 1m)",
            panels=create_slo_panel(
                metric=
                "apiserver:apiserver_request_latency_1m:histogram_quantile",
            ),
        ),
    ],
).auto_panel_ids()
Ejemplo n.º 8
0
        nullPointMode="null",
    ),
    d.Graph(
        title="probes: memory usage",
        targets=[
            d.Target(
                expr='min(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container)',
                legendFormat="min {{container}}",
            ),
            d.Target(
                expr='avg(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container)',
                legendFormat="avg {{container}}",
            ),
            d.Target(
                expr='max(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container)',
                legendFormat="max {{container}}",
            ),
        ],
        nullPointMode="null",
    ),
]


# Network dashboard: one row for the network programming latency panels and
# one for the in-cluster network latency panels. auto_panel_ids() is called
# on the assembled dashboard.
dashboard = d.Dashboard(
    title="Network",
    rows=[
        # Row title previously read "progamming" (typo in the visible label).
        d.Row(title="Network programming latency", panels=NETWORK_PROGRAMMING_PANEL),
        d.Row(title="In-cluster network latency", panels=NETWORK_LATENCY_PANEL),
    ],
).auto_panel_ids()