def process_open_fds(datasource):
    """Build the "Open File Descriptors" graph panel.

    Plots the ``process_open_fds`` series for every pod whose ``pod`` label
    is non-empty, with one legend entry per pod.

    :param datasource: Passed through as the graph's ``dataSource``.
    :return: The ``G.Graph`` describing the panel.
    """
    # Left axis carries the "Count" label; the second axis is not shown.
    count_axis = G.YAxis(format="none", label="Count")
    hidden_axis = G.YAxis(show=False)

    per_pod_fds = G.Target(
        refId="A",
        legendFormat="{{pod}}",
        expr="""
                process_open_fds{pod=~".+"}
                """,
    )

    return G.Graph(
        title="Open File Descriptors",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[count_axis, hidden_axis],
        targets=[per_pod_fds],
    )
Ejemplo n.º 2
0
def QPSGraph(title, expressions, id, **kwargs):
    """Build a stacked QPS graph with one series per response-code bucket.

    Each expression is paired, in order, with one of the sorted keys of
    ``ALIAS_COLORS``; that key is used as the legend for the expression.

    :param title: Title of the graph.
    :param expressions: Prometheus expressions; exactly 5 are required.
    :param id: The id for the graph, unique within the dashboard.
    :param kwargs: Forwarded unchanged to ``PromGraph``.
    :raises ValueError: If the number of expressions is not 5.
    :return: Whatever ``stacked`` returns for the constructed graph.
    """
    if len(expressions) != 5:
        message = 'Expected 5 expressions, got {}: {}'.format(
            len(expressions), expressions)
        raise ValueError(message)

    # Sorting the legend names makes the pairing with the expressions
    # deterministic.
    legend_names = list(ALIAS_COLORS.keys())
    legend_names.sort()
    labelled_expressions = zip(legend_names, expressions)

    axes = [
        G.YAxis(format=G.OPS_FORMAT),
        G.YAxis(format=G.SHORT_FORMAT),
    ]
    graph = PromGraph(
        title=title,
        expressions=labelled_expressions,
        aliasColors=ALIAS_COLORS,
        id=id,
        yAxes=axes,
        **kwargs
    )
    return stacked(graph)
def filesystem_usage(datasource):
    """Build the "Filesystem Usage" graph panel.

    Plots used space as a percentage of total size for every series whose
    ``volume`` label starts with ``pvc-``.

    :param datasource: Passed through as the graph's ``dataSource``.
    :return: The ``G.Graph`` describing the panel.
    """
    percent_axis = G.YAxis(format="percent")
    hidden_axis = G.YAxis(show=False)

    # Proportion used of each filesystem on a volume that comes from a
    # PersistentVolumeClaim.  All the legend can show is the volume (PVC)
    # name, so the role a given filesystem plays is hard to tell from the
    # graph.  Better than nothing; hopefully this can be improved later.
    used_percentage = G.Target(
        refId="A",
        legendFormat="{{volume}}",
        expr="""
                100
                * filesystem_used_bytes{volume=~"pvc-.*"}
                / filesystem_size_bytes{volume=~"pvc-.*"}
                """,
    )

    return G.Graph(
        title="Filesystem Usage",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[percent_axis, hidden_axis],
        targets=[used_percentage],
    )
def unhandled_errors(datasource):
    """Build the "Unhandled Errors" graph panel.

    Plots a single series: the sum of ``s4_unhandled_error_counter`` across
    all of its label sets.

    :param datasource: Passed through as the graph's ``dataSource``.
    :return: The ``G.Graph`` describing the panel.
    """
    count_axis = G.YAxis(format="none", label="Count")
    hidden_axis = G.YAxis(show=False)

    total_errors = G.Target(
        legendFormat="Total Unhandled Errors",
        refId="A",
        expr="""
                sum(s4_unhandled_error_counter)
                """,
    )

    return G.Graph(
        title="Unhandled Errors",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[count_axis, hidden_axis],
        targets=[total_errors],
    )
def last_convergence(datasource):
    """Build the "Since Last Convergence" graph panel.

    Plots the difference between the current time and the largest
    ``s4_last_convergence_succeeded`` value reported by any pod whose name
    starts with ``subscription-converger-``.

    :param datasource: Passed through as the graph's ``dataSource``.
    :return: The ``G.Graph`` describing the panel.
    """
    period_axis = G.YAxis(format="none", label="Period")
    hidden_axis = G.YAxis(show=False)

    # max() picks the most recent success across all converger pods, so the
    # plotted value is the time elapsed since that success.
    elapsed_since_success = G.Target(
        legendFormat="Time Since Last Convergence Success",
        refId="A",
        expr="""
                time()
                - max(
                    s4_last_convergence_succeeded{
                        pod=~"subscription-converger-.*"
                    }
                )
                """,
    )

    return G.Graph(
        title="Since Last Convergence",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[period_axis, hidden_axis],
        targets=[elapsed_since_success],
    )
def memory_usage(datasource):
    """Build the "Memory Usage" graph panel.

    Plots two series, both divided by 2 ^ 30: the sum of
    ``machine_memory_bytes`` and the ``rss:container_memory:total`` series.

    :param datasource: Passed through as the graph's ``dataSource``.
    :return: The ``G.Graph`` describing the panel.
    """
    # The queries divide by 2 ^ 30 bytes to match the "gbytes" axis format.
    memory_axis = G.YAxis(format="gbytes", label="Memory")
    hidden_axis = G.YAxis(show=False)

    physical_total = G.Target(
        refId="A",
        legendFormat="Total Physical Memory",
        expr="""
                sum(machine_memory_bytes) / 2 ^ 30
                """,
    )
    container_rss = G.Target(
        refId="B",
        legendFormat="Total Container RSS",
        expr="""
                rss:container_memory:total / 2 ^ 30
                """,
    )

    return G.Graph(
        title="Memory Usage",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[memory_axis, hidden_axis],
        targets=[physical_total, container_rss],
    )
Ejemplo n.º 7
0
def test_auto_refids():
    """auto_ref_ids() assigns refIds only to targets that do not have one."""
    # First and last targets have no refId; the middle one is preset to 'Q'.
    missing_first = G.Target(
        expr='whatever #Q',
        legendFormat='{{namespace}}',
    )
    preset_hidden = G.Target(
        expr='hidden whatever',
        legendFormat='{{namespace}}',
        refId='Q',
        hide=True
    )
    missing_last = G.Target(
        expr='another target'
    )

    graph = G.Graph(
        title="CPU Usage by Namespace (rate[5m])",
        dataSource="My data source",
        targets=[missing_first, preset_hidden, missing_last],
        yAxes=[
            G.YAxis(format=G.SHORT_FORMAT, label="CPU seconds"),
            G.YAxis(format=G.SHORT_FORMAT),
        ],
    ).auto_ref_ids()

    dashboard = G.Dashboard(
        title="Test dashboard",
        rows=[G.Row(panels=[graph])],
    )

    targets = dashboard.rows[0].panels[0].targets
    assert targets[0].refId == 'A'
    # The explicitly supplied refId must be left untouched.
    assert targets[1].refId == 'Q'
    assert targets[2].refId == 'B'
Ejemplo n.º 8
0
def QPSGraph(data_source, title, expressions, **kwargs):
    """Build a stacked QPS graph with one series per response-code bucket.

    Each expression is paired, in order, with one of the sorted keys of
    ``ALIAS_COLORS``; that key is used as the legend for the expression.

    :param data_source: Forwarded to ``prometheus.PromGraph``.
    :param title: Title of the graph.
    :param expressions: Prometheus expressions; there must be 5 or 7.
    :param kwargs: Forwarded unchanged to ``prometheus.PromGraph``.
    :raises ValueError: If the number of expressions is neither 5 nor 7.
    :return: Whatever ``stacked`` returns for the constructed graph.
    """
    if len(expressions) not in (5, 7):
        message = 'Expected 5 or 7 expressions, got {}: {}'.format(
            len(expressions), expressions)
        raise ValueError(message)

    legend_names = list(ALIAS_COLORS.keys())
    legend_names.sort()
    # NOTE(review): zip() stops at the shorter input, so 7 expressions are
    # only all plotted if ALIAS_COLORS has at least 7 keys -- confirm.
    labelled_expressions = zip(legend_names, expressions)

    axes = [
        G.YAxis(format=G.OPS_FORMAT),
        G.YAxis(format=G.SHORT_FORMAT),
    ]
    graph = prometheus.PromGraph(
        data_source=data_source,
        title=title,
        expressions=labelled_expressions,
        aliasColors=ALIAS_COLORS,
        yAxes=axes,
        **kwargs
    )
    return stacked(graph)
Ejemplo n.º 9
0
def test_auto_id():
    """auto_panel_ids() assigns an id to every panel that does not have one."""
    # The graph is deliberately created without an ``id``.
    graph = G.Graph(
        title="CPU Usage by Namespace (rate[5m])",
        dataSource="My data source",
        targets=[
            G.Target(
                expr='whatever',
                legendFormat='{{namespace}}',
                refId='A',
            ),
        ],
        yAxes=[
            G.YAxis(format=G.SHORT_FORMAT, label="CPU seconds"),
            G.YAxis(format=G.SHORT_FORMAT),
        ],
    )

    dashboard = G.Dashboard(
        title="Test dashboard",
        rows=[G.Row(panels=[graph])],
    ).auto_panel_ids()

    only_panel = dashboard.rows[0].panels[0]
    assert only_panel.id == 1
def s4_customer_deployments(datasource):
    """Build the "Customer Deployments" graph panel.

    Plots ``s4_deployment_gauge`` and ``s4_running_pod_gauge`` for pods whose
    name starts with ``subscription-converger-``.  Each series gets a
    ``shortpod`` label, derived from the pod name, which is used as legend.

    :param datasource: Passed through as the graph's ``dataSource``.
    :return: The ``G.Graph`` describing the panel.
    """
    # The visible axis is pinned to the range 0..100.
    deployments_axis = G.YAxis(
        format="none",
        label="Total Customer Deployments",
        min=0,
        max=100,
    )
    hidden_axis = G.YAxis(show=False)

    # Each replicaset and pod ends up with its own series.  label_replace
    # gives them a more succinct label but keeps them distinct, in case it
    # is interesting to see where restarts have happened.
    deployments = G.Target(
        refId="A",
        legendFormat="{{shortpod}}",
        expr="""
                label_replace(
                    s4_deployment_gauge{pod=~"subscription-converger-.*"},
                    "shortpod",
                    "# Deploys ($1)",
                    "pod",
                    "subscription-converger-(.*)"
                )
                """,
    )
    # Same relabelling, applied to the running-pod gauge.
    running_pods = G.Target(
        refId="B",
        legendFormat="{{shortpod}}",
        expr="""
                label_replace(
                    s4_running_pod_gauge{pod=~"subscription-converger-.*"},
                    "shortpod",
                    "# Running ($1)",
                    "pod",
                    "subscription-converger-(.*)"
                )
                """,
    )

    return G.Graph(
        title="Customer Deployments",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[deployments_axis, hidden_axis],
        targets=[deployments, running_pods],
    )
Ejemplo n.º 11
0
def _row(title):
    """Return a row holding a single graph built from ``title``.

    ``title`` is used both as the graph's title and as the expression of
    its only target.

    :param title: Graph title, also used as the target expression.
    :return: A ``core.Row`` containing the one graph.
    """
    query = core.Target(
        expr=title,
        legendFormat='{{namespace}}',
    )
    axes = [
        core.YAxis(format=core.NO_FORMAT),
        core.YAxis(format=core.SHORT_FORMAT),
    ]
    graph = core.Graph(
        title=title,
        dataSource='prometheus',
        targets=[query],
        yAxes=axes,
    )
    return core.Row(panels=[graph])
Ejemplo n.º 12
0
def test_serialization_cloudwatch_metrics_target():
    """A graph with a default Cloudwatch metrics target serializes to text."""
    axes = G.YAxes(
        G.YAxis(format=G.SHORT_FORMAT, label="ms"),
        G.YAxis(format=G.SHORT_FORMAT),
    )
    graph = G.Graph(
        title="Lambda Duration",
        dataSource="Cloudwatch data source",
        targets=[C.CloudwatchMetricsTarget()],
        id=1,
        yAxes=axes,
    )

    buffer = StringIO()
    _gen.write_dashboard(graph, buffer)

    # Only checks that something was written, not what was written.
    serialized = buffer.getvalue()
    assert serialized != ''
Ejemplo n.º 13
0
def test_serialization_humio_metrics_target():
    """A graph with a default Humio target serializes to non-empty text."""
    axes = G.YAxes(
        G.YAxis(format=G.SHORT_FORMAT, label="ms"),
        G.YAxis(format=G.SHORT_FORMAT),
    )
    graph = G.Graph(
        title="Humio Logs",
        dataSource="Humio data source",
        targets=[H.HumioTarget()],
        id=1,
        yAxes=axes,
    )

    buffer = StringIO()
    _gen.write_dashboard(graph, buffer)

    # Only checks that something was written, not what was written.
    serialized = buffer.getvalue()
    assert serialized != ''
Ejemplo n.º 14
0
def PercentUnitAxis(label=None):
    """Return a Y axis that shows a unit value (0 to 1) as a percentage.

    :param label: Optional axis label; passed through to ``G.YAxis``.
    :return: A ``G.YAxis`` using ``G.PERCENT_UNIT_FORMAT``, bounded to
        the range 0..1 with ``logBase`` 1.
    """
    axis_options = {
        "format": G.PERCENT_UNIT_FORMAT,
        "label": label,
        "logBase": 1,
        "min": 0,
        "max": 1,
    }
    return G.YAxis(**axis_options)
def cpu_usage(datasource, intervals):
    """Build the "CPU usage" graph panel, one series per averaging interval.

    :param datasource: Passed through as the graph's ``dataSource``.
    :param intervals: Iterable of interval suffixes (for example ``"1m"``).
        Each one is appended to ``cpu:container_usage_seconds:rate`` to name
        the series to query and is also shown in that series' legend.
    :return: The ``G.Graph`` describing the panel.
    """
    # A list comprehension is used instead of ``list(<genexpr>,)``: an
    # unparenthesized generator expression followed by a trailing comma is a
    # SyntaxError on Python 3.7 and later.
    targets = [
        G.Target(
            # CPU usage (as a percentage of maximum possible) averaged
            # over a period is given as 100 times the sum (over all
            # containers) of the rate of increase (in seconds) divided by
            # the maximum possible increase (1 second per CPU).
            #
            # The sums are taken from recording rules because recomputing
            # them for every point on the graph for every graph request
            # becomes prohibitively expensive.  Only a few specific rates
            # are "recorded" and the ``interval`` parameter must match one
            # of those. :(
            #
            # See prometheus.yaml for the recording rules.
            expr="""
                  100
                * cpu:container_usage_seconds:rate{}
                / cores:machine_cpu:total
                """.format(interval),
            legendFormat="CPU Usage ({} avg)".format(interval),
            # ``refId`` is a helper defined outside this block; it is given
            # the zero-based position of the interval.
            refId=refId(n),
        )
        for n, interval in enumerate(intervals)
    ]

    return G.Graph(
        title="CPU usage",
        dataSource=datasource,

        xAxis=X_TIME,
        yAxes=[
            G.YAxis(
                format="percent",
                label="Average",
                min=0,
                max=100,
            ),
            G.YAxis(
                format="percent",
                label="Average",
            ),
        ],
        targets=targets,
    )
Ejemplo n.º 16
0
 def AddGraphPanel(self, title: Text, raw_sql: Text, y_axis_title: Text):
     """Create a graph panel backed by one SQL query and add it via AddPanel.

     Args:
       title: Title of the graph.
       raw_sql: Raw SQL used for the panel's single target.
       y_axis_title: Value passed as the ``format`` of the Y axis.
     """
     sql_target = core.SqlTarget(
         rawSql=raw_sql,
         format=core.TABLE_TARGET_FORMAT,
     )
     # NOTE(review): the "title" is passed as the axis ``format`` rather
     # than as a label -- confirm this is what callers expect.
     y_axes = core.YAxes(core.YAxis(format=y_axis_title), )
     panel = core.Graph(
         title=title,
         targets=[sql_target],
         yAxes=y_axes,
     )
     self.AddPanel(panel)
Ejemplo n.º 17
0
def test_serialization():
    """A graph with a single Prometheus target serializes to non-empty text."""
    cpu_by_namespace = G.Target(
        expr='namespace:container_cpu_usage_seconds_total:sum_rate',
        legendFormat='{{namespace}}',
        refId='A',
    )
    axes = [
        G.YAxis(format=G.SHORT_FORMAT, label="CPU seconds / second"),
        G.YAxis(format=G.SHORT_FORMAT),
    ]
    graph = G.Graph(
        title="CPU Usage by Namespace (rate[5m])",
        dataSource="My data source",
        targets=[cpu_by_namespace],
        id=1,
        yAxes=axes,
    )

    buffer = StringIO()
    _gen.write_dashboard(graph, buffer)

    # Only checks that something was written, not what was written.
    serialized = buffer.getvalue()
    assert serialized != ''
Ejemplo n.º 18
0
def StatusQPSGraph(data_source, title, expression, **kwargs):
    """Build a stacked QPS graph with one series per ``status_code``.

    :param data_source: Forwarded to ``prometheus.PromGraph``.
    :param title: Title of the graph.
    :param expression: PromQL expression; it is wrapped in
                       ``sum by (status_code)(...)``, so it must carry a
                       ``status_code`` label (an HTTP code like 404, or
                       "success" and "error").
    :param kwargs: Forwarded unchanged to ``prometheus.PromGraph``.
    :return: Whatever ``W.stacked`` returns for the constructed graph.
    """
    # One legend entry per distinct status_code value.
    summed_by_status = 'sum by (status_code)(%s)' % (expression)
    labelled_expressions = [('{{status_code}}', summed_by_status)]

    axes = [
        G.YAxis(format=G.OPS_FORMAT),
        G.YAxis(format=G.SHORT_FORMAT),
    ]
    graph = prometheus.PromGraph(
        data_source=data_source,
        title=title,
        expressions=labelled_expressions,
        seriesOverrides=QPS_SERIES_OVERRIDES,
        legend=G.Legend(hideZero=True),
        yAxes=axes,
        **kwargs
    )
    return W.stacked(graph)
Ejemplo n.º 19
0
def test_serialization_zabbix_target():
    """A graph with a Zabbix metric target serializes to non-empty text."""
    alias_functions = [
        Z.ZabbixSetAliasFunction("View alias"),
    ]
    zabbix_target = Z.zabbixMetricTarget(
        group="Zabbix Group",
        host="Zabbix Host",
        application="CPU",
        item="/CPU (load)/",
        functions=alias_functions,
    )
    axes = [
        G.YAxis(format=G.SHORT_FORMAT, label="CPU seconds / second"),
        G.YAxis(format=G.SHORT_FORMAT),
    ]
    graph = G.Graph(
        title="CPU Usage",
        dataSource="Zabbix data source",
        targets=[zabbix_target],
        id=1,
        yAxes=axes,
    )

    buffer = StringIO()
    _gen.write_dashboard(graph, buffer)

    # Only checks that something was written, not what was written.
    serialized = buffer.getvalue()
    assert serialized != ''
Ejemplo n.º 20
0
def test_serialization_opentsdb_target():
    """A graph with an OpenTSDB target serializes to non-empty text."""
    instance_filter = O.OpenTSDBFilter(
        value='*',
        tag='instance',
        type='wildcard',
        groupBy=True,
    )
    opentsdb_target = O.OpenTSDBTarget(
        metric='cpu',
        alias='$tag_instance',
        filters=[instance_filter],
    )
    axes = [
        G.YAxis(format=G.SHORT_FORMAT, label="CPU seconds / second"),
        G.YAxis(format=G.SHORT_FORMAT),
    ]
    graph = G.Graph(
        title="CPU Usage",
        dataSource="OpenTSDB data source",
        targets=[opentsdb_target],
        id=1,
        yAxes=axes,
    )

    buffer = StringIO()
    _gen.write_dashboard(graph, buffer)

    # Only checks that something was written, not what was written.
    serialized = buffer.getvalue()
    assert serialized != ''
def network_usage(datasource):
    """Build the "Network Usage" graph panel.

    Plots the ``receive:container_network_bytes:rate1m`` and
    ``transmit:container_network_bytes:rate1m`` series, each divided by
    2 ^ 20.

    :param datasource: Passed through as the graph's ``dataSource``.
    :return: The ``G.Graph`` describing the panel.
    """
    # The queries divide by 2 ^ 20 to match the "MBs" axis format.
    transfer_axis = G.YAxis(format="MBs", label="Transferred")
    hidden_axis = G.YAxis(show=False)

    # Rate of data received over the last minute.  Per the original note,
    # this is for the public interface (eth0) of each entire node (id="/");
    # that filtering is not visible in this query itself.
    received = G.Target(
        refId="A",
        legendFormat="receive",
        expr="""
                receive:container_network_bytes:rate1m / 2 ^ 20
                """,
    )
    # The matching rate of data sent.
    transmitted = G.Target(
        refId="B",
        legendFormat="transmit",
        expr="""
                transmit:container_network_bytes:rate1m / 2 ^ 20
                """,
    )

    return G.Graph(
        title="Network Usage",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[transfer_axis, hidden_axis],
        targets=[received, transmitted],
    )
Ejemplo n.º 22
0
 def yaxis(self, **kw):
     """Replace ``self.yAxes`` with axes whose left axis is built from ``kw``.

     :param kw: Keyword arguments forwarded unchanged to ``gf.YAxis``.
     """
     left_axis = gf.YAxis(**kw)
     self.yAxes = gf.YAxes(left=left_axis)
Ejemplo n.º 23
0
def YAxis(format='none', label='', min=0, show=True):
    """Return a ``core.YAxis`` using this module's preferred defaults.

    :param format: Axis format; defaults to ``'none'``.
    :param label: Axis label; defaults to the empty string.
    :param min: Lower bound of the axis; defaults to 0.
    :param show: Whether the axis is shown; defaults to True.
    """
    axis_options = {
        'format': format,
        'label': label,
        'min': min,
        'show': show,
    }
    return core.YAxis(**axis_options)
Ejemplo n.º 24
0
import grafanalib.core as G

# Module-level dashboard definition: a single graph panel placed at the
# top-left of the grid.  Panel ids are filled in by auto_panel_ids().
dashboard = G.Dashboard(
    title='Test dashboard',
    panels=[
        G.Graph(
            title='CPU Usage by Namespace',
            dataSource='metricbeat',
            # 12 columns wide and 8 rows tall, anchored at the grid origin.
            gridPos=G.GridPos(h=8, w=12, x=0, y=0),
            yAxes=[
                G.YAxis(format=G.SHORT_FORMAT, label='CPU seconds'),
                G.YAxis(format=G.SHORT_FORMAT),
            ],
            targets=[
                G.Target(
                    refId='A',
                    expr='whatever',
                    legendFormat='{{namespace}}',
                ),
            ],
        ),
    ],
).auto_panel_ids()
Ejemplo n.º 25
0
         ),
         (
             "50th Percentile",
             'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m]))  by (le))'
         ),
         (
             "10th Percentile",
             'histogram_quantile(0.1, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m]))  by (le))'
         ),
         (
             "Mean",
             'sum(rate(cortex_ingester_chunk_size_bytes_sum{job="cortex/ingester"}[2m])) / sum(rate(cortex_ingester_chunk_size_bytes_count{job="cortex/ingester"}[2m]))'
         ),
     ],
     yAxes=[
         G.YAxis(format=G.BYTES_FORMAT),
         G.YAxis(format=G.SHORT_FORMAT),
     ],
 ),
 common.PromGraph(
     title="Chunk Age (on flush)",
     expressions=[
         (
             "99th Percentile",
             'histogram_quantile(0.99, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))'
         ),
         (
             "50th Percentile",
             'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))'
         ),
         (
def dashboard():
    """Assemble the "S4" dashboard.

    The dashboard has three rows: signup and usage graphs, a "Cluster" row
    of resource graphs produced by the sibling helper functions, and a row
    of single-stat panels.  Panel ids are assigned by ``auto_panel_ids()``.

    :return: The ``G.Dashboard`` with panel ids filled in.
    """
    prometheus_source = "prometheus"

    def count_axes():
        # Both axes of the first-row graphs are identical "Count" axes.
        return [
            G.YAxis(format="none", label="Count"),
            G.YAxis(format="none", label="Count"),
        ]

    def current_stat(title, expr, ref_id):
        # Single-stat panel showing the current value plus a sparkline.
        return G.SingleStat(
            title=title,
            dataSource=prometheus_source,
            valueName='current',
            sparkline=G.SparkLine(show=True),
            targets=[G.Target(expr=expr, refId=ref_id)],
        )

    signups_graph = G.Graph(
        title="Signups",
        dataSource=prometheus_source,
        xAxis=X_TIME,
        yAxes=count_axes(),
        targets=[
            G.Target(
                expr='wormhole_signup_started{pod=~"s4-signup.*"}',
                legendFormat="Wormhole Signups Started",
                refId="A",
            ),
            G.Target(
                expr='wormhole_signup_success{pod=~"s4-signup.*"}',
                legendFormat="Wormhole Signups Completed",
                refId="B",
            ),
            G.Target(
                expr='wormhole_signup_failure{pod=~"s4-signup.*"}',
                legendFormat="Wormhole Signups Failed",
                refId="C",
            ),
        ],
    )

    usage_graph = G.Graph(
        title="Usage",
        dataSource=prometheus_source,
        # Stack the connection graphs on each other, revealing both a
        # total and a distribution across different grid router instances.
        stack=True,
        tooltip=G.Tooltip(valueType=G.INDIVIDUAL),
        xAxis=X_TIME,
        yAxes=count_axes(),
        targets=[
            G.Target(
                expr="grid_router_connections",
                legendFormat="Tahoe-LAFS Connections",
                refId="D",
            ),
        ],
    )

    activity_row = G.Row(panels=[signups_graph, usage_graph])

    cluster_row = G.Row(
        title="Cluster",
        panels=[
            cpu_usage(prometheus_source, ["1m", "5m", "10m"]),
            memory_usage(prometheus_source),
            network_usage(prometheus_source),
            filesystem_usage(prometheus_source),
        ],
    )

    stats_row = G.Row(panels=[
        current_stat(
            'Current Customer Deployments', 's4_deployment_gauge', "E",
        ),
        current_stat(
            'Unhandled Errors', 's4_unhandled_error_counter', "F",
        ),
    ])

    s4_dashboard = G.Dashboard(
        title="S4",
        rows=[activity_row, cluster_row, stats_row],
    )
    return s4_dashboard.auto_panel_ids()
def tahoe_lafs_transfer_rate(datasource):
    """Build the "Tahoe-LAFS Benchmarked Transfer Rate" graph panel.

    Four series are plotted: the average upload and download rates, then
    the rate that 90% of uploads and of downloads meet or exceed.  All are
    computed over a 60 minute window.

    :param datasource: Passed through as the graph's ``dataSource``.
    :return: The ``G.Graph`` describing the panel.
    """
    # refIds are the decimal strings "0", "1", "2", ... handed out in target
    # order.  ``u"{}".format`` produces text on both Python 2 and Python 3;
    # the ``unicode`` builtin used previously only exists on Python 2.
    refid = (u"{}".format(i) for i in count())

    # (legend suffix, metric name fragment) for each benchmarked direction.
    directions = [("upload", "write"), ("download", "read")]

    average_targets = [
        G.Target(
            # The metric is a Histogram.  The _sum goes up by the number of
            # bytes/second observed by each sample taken.  For example, if
            # the first benchmark run observes 100 bytes/sec transfer
            # rate, the _sum is 100.  If the second benchmark run observes
            # 75 bytes/sec transfer rate, the _sum is then 175.  The
            # _count gives the total number of samples present in the
            # _sum.
            #
            # The rate() of the _sum over a recent interval is
            # bytes/sec/sec.  The rate() of the _count over the same
            # interval is 1/sec.  The quotient is bytes/sec and gives an
            # average for the metric over the recent interval.
            #
            # Take the average of all such results to squash series from
            # different pods into a single result.  There should be
            # minimal overlap but whenever the pod gets recreated (because
            # the deployment is updated, for example) there's a little.
            expr="""
                avg without (pod,instance) (
                    rate(tahoe_lafs_roundtrip_benchmark_{metric}_bytes_per_second_sum{{service="tahoe-lafs-transfer-rate-monitor"}}[60m])
                  / rate(tahoe_lafs_roundtrip_benchmark_{metric}_bytes_per_second_count{{service="tahoe-lafs-transfer-rate-monitor"}}[60m])
                )
                """.format(metric=metric),
            legendFormat="avg " + legend_format,
            refId=next(refid),
        )
        for (legend_format, metric) in directions
    ]

    # Built after the averages so the refIds continue from where they left
    # off.
    slowest_decile_targets = [
        G.Target(
            # The average above is nice, I suppose.  It doesn't give the
            # full picture, though.  So also compute the rate which is
            # slower than 90% of the results (faster than 10% of the
            # results).  This is basically what a 90% transfer speed SLA
            # would talk about.  Put another way, 90% of uploads should
            # occur at a rate equal to or greater than the one plotted by
            # this expression.
            expr="""
                avg without (pod,instance) (
                    histogram_quantile(
                        0.10,
                        rate(
                            tahoe_lafs_roundtrip_benchmark_{metric}_bytes_per_second_bucket{{service="tahoe-lafs-transfer-rate-monitor"}}[60m]
                        )
                    )
                )
                """.format(metric=metric),
            legendFormat="90% " + legend_format,
            refId=next(refid),
        )
        for (legend_format, metric) in directions
    ]

    return G.Graph(
        title="Tahoe-LAFS Benchmarked Transfer Rate",
        dataSource=datasource,

        xAxis=X_TIME,
        yAxes=[
            G.YAxis(
                format="Bps",
                label="Transfer Rate",
            ),
            G.YAxis(
                show=False,
            ),
        ],

        targets=average_targets + slowest_decile_targets,
    )
Ejemplo n.º 28
0
             legendFormat='3xx',
             refId='C'),
         G.Target(
             expr=
             'service_status:http_request_duration_seconds_count:irate{service="ucdapi",status_code=~"4.."}',
             legendFormat='4xx',
             refId='D'),
         G.Target(
             expr=
             'service_status:http_request_duration_seconds_count:irate{service="ucdapi",status_code=~"5.."}',
             legendFormat='5xx',
             refId='E'),
     ],
     aliasColors=ALIAS_COLORS,
     yAxes=[
         G.YAxis(format=G.OPS_FORMAT),
         G.YAxis(format=G.SHORT_FORMAT, show=False)
     ],
     nullPointMode=G.NULL_AS_ZERO,
     stack=True,
     lineWidth=0,
     fill=10,
     tooltip=G.Tooltip(valueType=G.INDIVIDUAL)),
 G.Graph(
     title='RPS',
     dataSource='prometheus',
     targets=[
         G.Target(
             expr=
             'sum(irate(http_request_duration_seconds_count{service="ucdapi"}[1m])) by (status_code, method)',
         ),
Ejemplo n.º 29
0
def make(prefix, title):
    """Build the Isaacranks Grafana dashboard for one deployment.

    :param prefix: Substituted into the ``service`` label matcher of every
        query, selecting ``<prefix>-isaacranks-web`` or
        ``<prefix>-isaacranks-rebuild``.
    :param title: Title of the dashboard.
    :return: The ``G.Dashboard`` returned by ``auto_panel_ids()``.
    """
    source = 'prometheus'
    # Label matcher shared by most web queries.  It is a ``str.format``
    # template: ``{}`` receives the prefix, doubled braces become literal.
    web_selector = '{{service="{}-isaacranks-web"}}'

    def query(template, **options):
        # Only the expression is formatted; options such as
        # ``legendFormat='{{version}}'`` are handed to Grafana untouched.
        return G.Target(expr=template.format(prefix), **options)

    def axes(left_format):
        # Left axis in the requested unit, right axis hidden.  New objects
        # are created on every call so no two graphs share an axis.
        return [
            G.YAxis(format=left_format),
            G.YAxis(format=G.SHORT_FORMAT, show=False),
        ]

    def pods_up(panel_title, template):
        # Single-stat panel showing the current value with a sparkline.
        return G.SingleStat(
            title=panel_title,
            dataSource=source,
            valueName='current',
            sparkline=G.SparkLine(show=True),
            targets=[query(template)],
        )

    def stacked_rate(panel_title, queries, **extra):
        # Filled, stacked ops/sec graph with per-series tooltips and nulls
        # drawn as zero.
        return G.Graph(
            title=panel_title,
            dataSource=source,
            targets=queries,
            yAxes=axes(G.OPS_FORMAT),
            nullPointMode=G.NULL_AS_ZERO,
            stack=True,
            lineWidth=0,
            fill=10,
            tooltip=G.Tooltip(valueType=G.INDIVIDUAL),
            **extra)

    def status_queries():
        # One series per HTTP status class (1xx..5xx), refIds A..E.
        return [
            query(
                'service_status:http_request_duration_seconds_count:irate'
                '{{service="{}-isaacranks-web",status_code=~"'
                + digit + '.."}}',
                legendFormat=digit + 'xx',
                refId=ref)
            for digit, ref in zip('12345', 'ABCDE')
        ]

    def per_version_rate(panel_title, metric):
        # Stacked rate of ``<metric>_count``, one series per ``version``.
        series = query(
            'service_version:' + metric + '_count:irate' + web_selector,
            legendFormat='{{version}}',
            refId='A')
        return stacked_rate(panel_title, [series])

    def quantile_queries(metric):
        # 50th/90th/99th percentile series, each multiplied by 1000 to suit
        # the millisecond axis (the metric names suggest seconds).
        percentiles = (
            ('50p', '0.5q', 'A'),
            ('90p', '0.9q', 'B'),
            ('99p', '0.99q', 'C'),
        )
        return [
            query(
                'service:' + metric + ':' + suffix + web_selector + ' * 1000',
                legendFormat=legend,
                refId=ref)
            for suffix, legend, ref in percentiles
        ]

    def latency(panel_title, metric, **extra):
        # Line graph of the three latency quantiles for ``metric``.
        return G.Graph(
            title=panel_title,
            dataSource=source,
            targets=quantile_queries(metric),
            yAxes=axes(G.MILLISECONDS_FORMAT),
            **extra)

    def rebuild_graph(panel_title, template, legend):
        # Single-series graph in seconds, with the current value shown in
        # the legend.
        return G.Graph(
            title=panel_title,
            dataSource=source,
            targets=[query(template, legendFormat=legend)],
            legend=G.Legend(current=True),
            yAxes=axes(G.SECONDS_FORMAT))

    rows = [
        G.Row(panels=[
            pods_up(
                'Pods up (web)',
                'count by(service) (up{{service="{}-isaacranks-web"}} == 1)'),
            pods_up(
                'Pods up (rebuild)',
                'count by(service) (up{{service="{}-isaacranks-rebuild"}} == 1)'),
        ]),
        G.Row(panels=[
            stacked_rate(
                'HTTP RPS', status_queries(), aliasColors=ALIAS_COLORS),
            latency(
                'HTTP latency',
                'http_request_duration_seconds',
                aliasColors=ALIAS_COLORS),
        ]),
        G.Row(panels=[
            per_version_rate(
                'Ballots', 'isaacranks_ballot_generation_seconds'),
            latency(
                'Ballot latency', 'isaacranks_ballot_generation_seconds'),
        ]),
        G.Row(panels=[
            per_version_rate('Votes', 'isaacranks_vote_casting_seconds'),
            latency('Vote latency', 'isaacranks_vote_casting_seconds'),
        ]),
        G.Row(panels=[
            rebuild_graph(
                'Time since last rebuild',
                'time() - (isaacranks_last_rebuild_timestamp{{service="{}-isaacranks-rebuild"}} != 0)',
                'Age'),
            rebuild_graph(
                'Rebuild duration',
                'isaacranks_last_rebuild_duration_seconds{{service="{}-isaacranks-rebuild"}} != 0',
                'Duration'),
        ]),
    ]

    board = G.Dashboard(title=title, rows=rows)
    return board.auto_panel_ids()
def dashboard():
    """Assemble the "S4" Grafana dashboard.

    Every panel reads from the data source named ``"prometheus"``.  The
    first row holds the signup and usage graphs defined inline here; the
    remaining panels come from the panel-builder functions called below.

    :return: The ``G.Dashboard`` returned by ``auto_panel_ids()``.
    """
    datasource = "prometheus"

    def count_axes():
        # Two identical unit-less axes labelled "Count", created anew for
        # each graph.
        return [G.YAxis(format="none", label="Count") for _ in range(2)]

    # Each query is restricted to the signup pods because that is the only
    # place these metrics carry meaning: other pods report 0 simply because
    # they import the Python code that defines the metric object.  The
    # matching series are summed to account for pod replacement.
    signup_series = (
        ('sum(wormhole_signup_started{pod=~"s4-signup.*"})',
         "Wormhole Signups Started", "A"),
        ('sum(wormhole_signup_success{pod=~"s4-signup.*"})',
         "Wormhole Signups Completed", "B"),
        ('sum(wormhole_signup_failure{pod=~"s4-signup.*"})',
         "Wormhole Signups Failed", "C"),
    )
    signups = G.Graph(
        title="Signups",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=count_axes(),
        targets=[
            G.Target(expr=expression, legendFormat=legend, refId=ref)
            for expression, legend, ref in signup_series
        ],
    )

    # The connection series are stacked so the graph shows both the total
    # and how it is distributed across grid router instances.
    usage = G.Graph(
        title="Usage",
        dataSource=datasource,
        stack=True,
        tooltip=G.Tooltip(valueType=G.INDIVIDUAL),
        xAxis=X_TIME,
        yAxes=count_axes(),
        targets=[
            G.Target(
                expr="grid_router_connections",
                legendFormat="Tahoe-LAFS Connections",
                refId="D",
            ),
        ],
    )

    overview_row = G.Row(
        panels=[signups, usage, last_convergence(datasource)],
    )
    cluster_row = G.Row(
        title="Cluster",
        panels=[
            cpu_usage(datasource, ["1m", "5m", "10m"]),
            memory_usage(datasource),
            network_usage(datasource),
            filesystem_usage(datasource),
        ],
    )
    cluster2_row = G.Row(
        title="Cluster2",
        panels=[process_open_fds(datasource)],
    )
    service_row = G.Row(
        panels=[
            tahoe_lafs_transfer_rate(datasource),
            s4_customer_deployments(datasource),
            unhandled_errors(datasource),
        ],
    )

    board = G.Dashboard(
        title="S4",
        rows=[overview_row, cluster_row, cluster2_row, service_row],
    )
    return board.auto_panel_ids()