def process_open_fds(datasource):
    """Graph the number of open file descriptors reported by each pod."""
    # Match any pod name so every pod contributes its own series.
    per_pod = G.Target(
        expr=""" process_open_fds{pod=~".+"} """,
        refId="A",
        legendFormat="{{pod}}",
    )
    return G.Graph(
        title="Open File Descriptors",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[
            G.YAxis(format="none", label="Count"),
            G.YAxis(show=False),
        ],
        targets=[per_pod],
    )
def QPSGraph(title, expressions, id, **kwargs):
    """Create a graph of QPS, broken up by response code.

    Data is drawn from Prometheus.

    :param title: Title of the graph.
    :param expressions: List of Prometheus expressions. Must be 5.
    :param id: The id for the graph, unique within the dashboard.
    :param kwargs: Passed on to Graph.
    """
    if len(expressions) != 5:
        raise ValueError('Expected 5 expressions, got {}: {}'.format(
            len(expressions), expressions))
    # Pair each expression with a legend label, in sorted legend order.
    legends = sorted(ALIAS_COLORS)
    graph = PromGraph(
        title=title,
        expressions=zip(legends, expressions),
        aliasColors=ALIAS_COLORS,
        id=id,
        yAxes=[
            G.YAxis(format=G.OPS_FORMAT),
            G.YAxis(format=G.SHORT_FORMAT),
        ],
        **kwargs)
    return stacked(graph)
def filesystem_usage(datasource):
    """Graph the percent-used of each PersistentVolumeClaim-backed filesystem."""
    # Get the proportion used of each filesystem on a volume from a
    # PersistentVolumeClaim on each node of the cluster.  It's hard to
    # figure out the role each filesystem serves from this graph (since all
    # we get is the PVC name).  Better than nothing, though.  Hopefully
    # later we can do better.
    usage = G.Target(
        expr=""" 100 * filesystem_used_bytes{volume=~"pvc-.*"} / filesystem_size_bytes{volume=~"pvc-.*"} """,
        legendFormat="{{volume}}",
        refId="A",
    )
    return G.Graph(
        title="Filesystem Usage",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[
            G.YAxis(format="percent"),
            G.YAxis(show=False),
        ],
        targets=[usage],
    )
def unhandled_errors(datasource):
    """Graph the running total of unhandled errors (s4_unhandled_error_counter)."""
    total = G.Target(
        expr=""" sum(s4_unhandled_error_counter) """,
        refId="A",
        legendFormat="Total Unhandled Errors",
    )
    return G.Graph(
        title="Unhandled Errors",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[
            G.YAxis(format="none", label="Count"),
            G.YAxis(show=False),
        ],
        targets=[total],
    )
def last_convergence(datasource):
    """Graph the elapsed time since the subscription converger last succeeded."""
    elapsed = G.Target(
        expr=""" time() - max( s4_last_convergence_succeeded{ pod=~"subscription-converger-.*" } ) """,
        refId="A",
        legendFormat="Time Since Last Convergence Success",
    )
    return G.Graph(
        title="Since Last Convergence",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[
            G.YAxis(format="none", label="Period"),
            G.YAxis(show=False),
        ],
        targets=[elapsed],
    )
def memory_usage(datasource):
    """Graph total physical memory alongside total container RSS."""
    # Both expressions divide by 2 ^ 30 to convert bytes to the axis unit
    # ("gbytes").
    physical = G.Target(
        expr=""" sum(machine_memory_bytes) / 2 ^ 30 """,
        legendFormat="Total Physical Memory",
        refId="A",
    )
    rss = G.Target(
        expr=""" rss:container_memory:total / 2 ^ 30 """,
        legendFormat="Total Container RSS",
        refId="B",
    )
    return G.Graph(
        title="Memory Usage",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[
            # 2 ^ 30 bytes
            G.YAxis(format="gbytes", label="Memory"),
            G.YAxis(show=False),
        ],
        targets=[physical, rss],
    )
def test_auto_refids():
    """auto_ref_ids() provides refIds for all targets without refIds already set."""
    graph = G.Graph(
        title="CPU Usage by Namespace (rate[5m])",
        dataSource="My data source",
        targets=[
            G.Target(expr='whatever #Q', legendFormat='{{namespace}}'),
            G.Target(
                expr='hidden whatever',
                legendFormat='{{namespace}}',
                refId='Q',
                hide=True,
            ),
            G.Target(expr='another target'),
        ],
        yAxes=[
            G.YAxis(format=G.SHORT_FORMAT, label="CPU seconds"),
            G.YAxis(format=G.SHORT_FORMAT),
        ],
    ).auto_ref_ids()
    dashboard = G.Dashboard(
        title="Test dashboard",
        rows=[G.Row(panels=[graph])],
    )
    targets = dashboard.rows[0].panels[0].targets
    # Unassigned targets get the next free letters; the explicit 'Q' is kept.
    assert targets[0].refId == 'A'
    assert targets[1].refId == 'Q'
    assert targets[2].refId == 'B'
def QPSGraph(data_source, title, expressions, **kwargs):
    """Create a graph of QPS, broken up by response code.

    Data is drawn from Prometheus.

    :param data_source: Name of the Prometheus data source to query.
    :param title: Title of the graph.
    :param expressions: List of Prometheus expressions. Must be 5 or 7,
        one per legend entry in ``sorted(ALIAS_COLORS)`` order.
    :param kwargs: Passed on to Graph.
    """
    # The original docstring claimed exactly 5 expressions; the check below
    # has always accepted 5 or 7, so document that instead.
    if len(expressions) not in (5, 7):
        raise ValueError('Expected 5 or 7 expressions, got {}: {}'.format(
            len(expressions), expressions))
    legends = sorted(ALIAS_COLORS.keys())
    exprs = zip(legends, expressions)
    return stacked(prometheus.PromGraph(
        data_source=data_source,
        title=title,
        expressions=exprs,
        aliasColors=ALIAS_COLORS,
        yAxes=[
            G.YAxis(format=G.OPS_FORMAT),
            G.YAxis(format=G.SHORT_FORMAT),
        ],
        **kwargs
    ))
def test_auto_id():
    """auto_panel_ids() provides IDs for all panels without IDs already set."""
    panel = G.Graph(
        title="CPU Usage by Namespace (rate[5m])",
        dataSource="My data source",
        targets=[
            G.Target(
                expr='whatever',
                legendFormat='{{namespace}}',
                refId='A',
            ),
        ],
        yAxes=[
            G.YAxis(format=G.SHORT_FORMAT, label="CPU seconds"),
            G.YAxis(format=G.SHORT_FORMAT),
        ],
    )
    dashboard = G.Dashboard(
        title="Test dashboard",
        rows=[G.Row(panels=[panel])],
    ).auto_panel_ids()
    # Panel ids are assigned starting from 1.
    assert dashboard.rows[0].panels[0].id == 1
def s4_customer_deployments(datasource):
    """Graph deployed vs running customer deployment counts from the converger."""
    # Each replicaset and pod end up with their own series.  Label these
    # more succinctly.  Leave them distinct in case it is interesting to see
    # where restarts have happened.
    deploys = G.Target(
        expr=""" label_replace( s4_deployment_gauge{pod=~"subscription-converger-.*"}, "shortpod", "# Deploys ($1)", "pod", "subscription-converger-(.*)" ) """,
        refId="A",
        legendFormat="{{shortpod}}",
    )
    # Same relabelling, applied to the running-pod gauge.
    running = G.Target(
        expr=""" label_replace( s4_running_pod_gauge{pod=~"subscription-converger-.*"}, "shortpod", "# Running ($1)", "pod", "subscription-converger-(.*)" ) """,
        refId="B",
        legendFormat="{{shortpod}}",
    )
    return G.Graph(
        title="Customer Deployments",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[
            G.YAxis(
                format="none",
                label="Total Customer Deployments",
                min=0,
                max=100,
            ),
            G.YAxis(show=False),
        ],
        targets=[deploys, running],
    )
def _row(title):
    """Build a single-panel Row whose graph uses *title* as both title and query."""
    graph = core.Graph(
        title=title,
        dataSource='prometheus',
        targets=[
            core.Target(expr=title, legendFormat='{{namespace}}'),
        ],
        yAxes=[
            core.YAxis(format=core.NO_FORMAT),
            core.YAxis(format=core.SHORT_FORMAT),
        ],
    )
    return core.Row(panels=[graph])
def test_serialization_cloudwatch_metrics_target():
    """Serializing a graph doesn't explode."""
    stream = StringIO()
    graph = G.Graph(
        title="Lambda Duration",
        dataSource="Cloudwatch data source",
        targets=[C.CloudwatchMetricsTarget()],
        id=1,
        yAxes=G.YAxes(
            G.YAxis(format=G.SHORT_FORMAT, label="ms"),
            G.YAxis(format=G.SHORT_FORMAT),
        ),
    )
    _gen.write_dashboard(graph, stream)
    # Any output at all means serialization succeeded.
    assert stream.getvalue() != ''
def test_serialization_humio_metrics_target():
    """Serializing a graph doesn't explode."""
    stream = StringIO()
    graph = G.Graph(
        title="Humio Logs",
        dataSource="Humio data source",
        targets=[H.HumioTarget()],
        id=1,
        yAxes=G.YAxes(
            G.YAxis(format=G.SHORT_FORMAT, label="ms"),
            G.YAxis(format=G.SHORT_FORMAT),
        ),
    )
    _gen.write_dashboard(graph, stream)
    # Any output at all means serialization succeeded.
    assert stream.getvalue() != ''
def PercentUnitAxis(label=None):
    """A Y axis that shows a percentage based on a unit value."""
    # Fixed 0..1 range with a linear scale (logBase=1).
    return G.YAxis(
        format=G.PERCENT_UNIT_FORMAT,
        logBase=1,
        min=0,
        max=1,
        label=label,
    )
def cpu_usage(datasource, intervals):
    """Graph average CPU usage as a percentage of cluster capacity.

    :param datasource: Grafana data source name to query.
    :param intervals: Iterable of averaging-interval strings (e.g. "1m",
        "5m"); each becomes its own series.  Each interval must match one of
        the recorded rates (see prometheus.yaml).
    """
    return G.Graph(
        title="CPU usage",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[
            G.YAxis(
                format="percent",
                label="Average",
                min=0,
                max=100,
            ),
            G.YAxis(
                format="percent",
                label="Average",
            ),
        ],
        # CPU usage (as a percentage of maximum possible) averaged over a
        # period is given as 100 times the sum (over all containers) of the
        # rate of increase (in seconds) divided by the maximum possible
        # increase (1 second per CPU).
        #
        # The sums are taken from recording rules because recomputing them
        # for every point on the graph for every graph request becomes
        # prohitively expensive.  Only a few specific rates are "recorded"
        # and the ``interval`` parameter must match one of those. :(
        #
        # See prometheus.yaml for the recording rules.
        #
        # NOTE: this was ``list(<genexpr>,)`` which is a SyntaxError in
        # Python (a bare generator expression inside a call may not be
        # followed by a trailing comma); a list comprehension is equivalent
        # and valid.
        targets=[
            G.Target(
                expr=""" 100 * cpu:container_usage_seconds:rate{} / cores:machine_cpu:total """.format(interval),
                legendFormat="CPU Usage ({} avg)".format(interval),
                refId=refId(n),
            )
            for n, interval in enumerate(intervals)
        ],
    )
def AddGraphPanel(self, title: Text, raw_sql: Text, y_axis_title: Text):
    """Add a graph panel driven by a single raw-SQL target.

    :param title: Panel title.
    :param raw_sql: SQL issued by the panel's target.
    :param y_axis_title: Passed as the Y axis ``format`` value.
    """
    sql_target = core.SqlTarget(
        rawSql=raw_sql,
        format=core.TABLE_TARGET_FORMAT,
    )
    panel = core.Graph(
        title=title,
        targets=[sql_target],
        yAxes=core.YAxes(core.YAxis(format=y_axis_title)),
    )
    self.AddPanel(panel)
def test_serialization():
    """Serializing a graph doesn't explode."""
    stream = StringIO()
    graph = G.Graph(
        title="CPU Usage by Namespace (rate[5m])",
        dataSource="My data source",
        targets=[
            G.Target(
                expr='namespace:container_cpu_usage_seconds_total:sum_rate',
                legendFormat='{{namespace}}',
                refId='A',
            ),
        ],
        id=1,
        yAxes=[
            G.YAxis(format=G.SHORT_FORMAT, label="CPU seconds / second"),
            G.YAxis(format=G.SHORT_FORMAT),
        ],
    )
    _gen.write_dashboard(graph, stream)
    # Any output at all means serialization succeeded.
    assert stream.getvalue() != ''
def StatusQPSGraph(data_source, title, expression, **kwargs):
    """Create a graph of QPS, coloured by status code.

    :param title: Title of the graph.
    :param expression: Format and PromQL expression; must sum by label which
        is http code like 404 or "success" and "error"
    :param kwargs: Passed on to Graph.
    """
    summed = 'sum by (status_code)(%s)' % (expression)
    graph = prometheus.PromGraph(
        data_source=data_source,
        title=title,
        expressions=[('{{status_code}}', summed)],
        seriesOverrides=QPS_SERIES_OVERRIDES,
        legend=G.Legend(hideZero=True),
        yAxes=[
            G.YAxis(format=G.OPS_FORMAT),
            G.YAxis(format=G.SHORT_FORMAT),
        ],
        **kwargs
    )
    return W.stacked(graph)
def test_serialization_zabbix_target():
    """Serializing a graph doesn't explode."""
    stream = StringIO()
    target = Z.zabbixMetricTarget(
        group="Zabbix Group",
        host="Zabbix Host",
        application="CPU",
        item="/CPU (load)/",
        functions=[Z.ZabbixSetAliasFunction("View alias")],
    )
    graph = G.Graph(
        title="CPU Usage",
        dataSource="Zabbix data source",
        targets=[target],
        id=1,
        yAxes=[
            G.YAxis(format=G.SHORT_FORMAT, label="CPU seconds / second"),
            G.YAxis(format=G.SHORT_FORMAT),
        ],
    )
    _gen.write_dashboard(graph, stream)
    # Any output at all means serialization succeeded.
    assert stream.getvalue() != ''
def test_serialization_opentsdb_target():
    """Serializing a graph doesn't explode."""
    stream = StringIO()
    target = O.OpenTSDBTarget(
        metric='cpu',
        alias='$tag_instance',
        filters=[
            O.OpenTSDBFilter(
                value='*',
                tag='instance',
                type='wildcard',
                groupBy=True,
            ),
        ],
    )
    graph = G.Graph(
        title="CPU Usage",
        dataSource="OpenTSDB data source",
        targets=[target],
        id=1,
        yAxes=[
            G.YAxis(format=G.SHORT_FORMAT, label="CPU seconds / second"),
            G.YAxis(format=G.SHORT_FORMAT),
        ],
    )
    _gen.write_dashboard(graph, stream)
    # Any output at all means serialization succeeded.
    assert stream.getvalue() != ''
def network_usage(datasource):
    """Graph node network receive/transmit rates (in 2^20 bytes/second)."""
    # Get the rate of data received on the public interface (eth0) for each
    # entire node (id="/") over the last minute.
    received = G.Target(
        expr=""" receive:container_network_bytes:rate1m / 2 ^ 20 """,
        legendFormat="receive",
        refId="A",
    )
    # And rate of data sent.
    transmitted = G.Target(
        expr=""" transmit:container_network_bytes:rate1m / 2 ^ 20 """,
        legendFormat="transmit",
        refId="B",
    )
    return G.Graph(
        title="Network Usage",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[
            # 2^20 bytes / second
            G.YAxis(format="MBs", label="Transferred"),
            G.YAxis(show=False),
        ],
        targets=[received, transmitted],
    )
def yaxis(self, **kw):
    """Replace this panel's Y axes with a single left axis built from *kw*."""
    left_axis = gf.YAxis(**kw)
    self.yAxes = gf.YAxes(left=left_axis)
def YAxis(format='none', label='', min=0, show=True):
    """Thin wrapper over core.YAxis supplying this project's default axis settings."""
    return core.YAxis(
        format=format,
        label=label,
        min=min,
        show=show,
    )
import grafanalib.core as G

# Minimal example dashboard: a single graph panel querying the "metricbeat"
# data source.  ``auto_panel_ids()`` assigns ids to panels that lack them.
dashboard = G.Dashboard(
    title='Test dashboard',
    panels=[
        G.Graph(
            title='CPU Usage by Namespace',
            dataSource='metricbeat',
            targets=[
                G.Target(
                    expr='whatever',
                    legendFormat='{{namespace}}',
                    refId='A',
                ),
            ],
            yAxes=[
                G.YAxis(format=G.SHORT_FORMAT, label='CPU seconds'),
                G.YAxis(format=G.SHORT_FORMAT),
            ],
            # Panel placement on the dashboard grid.
            gridPos=G.GridPos(h=8, w=12, x=0, y=0))
    ],
).auto_panel_ids()
), ( "50th Percentile", 'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "10th Percentile", 'histogram_quantile(0.1, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "Mean", 'sum(rate(cortex_ingester_chunk_size_bytes_sum{job="cortex/ingester"}[2m])) / sum(rate(cortex_ingester_chunk_size_bytes_count{job="cortex/ingester"}[2m]))' ), ], yAxes=[ G.YAxis(format=G.BYTES_FORMAT), G.YAxis(format=G.SHORT_FORMAT), ], ), common.PromGraph( title="Chunk Age (on flush)", expressions=[ ( "99th Percentile", 'histogram_quantile(0.99, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "50th Percentile", 'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))' ), (
def dashboard():
    """Build the S4 dashboard: signup/usage graphs, cluster resource rows,
    and single-stat panels for deployments and unhandled errors.

    Returns a ``G.Dashboard`` with panel ids auto-assigned.
    """
    PROMETHEUS = "prometheus"
    return G.Dashboard(
        title="S4",
        rows=[
            G.Row(panels=[
                G.Graph(
                    title="Signups",
                    dataSource=PROMETHEUS,
                    xAxis=X_TIME,
                    yAxes=[
                        G.YAxis(
                            format="none",
                            label="Count",
                        ),
                        G.YAxis(
                            format="none",
                            label="Count",
                        ),
                    ],
                    # Started / completed / failed wormhole signups as
                    # separate series.
                    targets=[
                        G.Target(
                            expr='wormhole_signup_started{pod=~"s4-signup.*"}',
                            legendFormat="Wormhole Signups Started",
                            refId="A",
                        ),
                        G.Target(
                            expr='wormhole_signup_success{pod=~"s4-signup.*"}',
                            legendFormat="Wormhole Signups Completed",
                            refId="B",
                        ),
                        G.Target(
                            expr='wormhole_signup_failure{pod=~"s4-signup.*"}',
                            legendFormat="Wormhole Signups Failed",
                            refId="C",
                        ),
                    ],
                ),
                G.Graph(
                    title="Usage",
                    dataSource=PROMETHEUS,
                    # Stack the connection graphs on each other, revealing
                    # both a total and a distribution across different grid
                    # router instances.
                    stack=True,
                    tooltip=G.Tooltip(
                        valueType=G.INDIVIDUAL,
                    ),
                    xAxis=X_TIME,
                    yAxes=[
                        G.YAxis(
                            format="none",
                            label="Count",
                        ),
                        G.YAxis(
                            format="none",
                            label="Count",
                        ),
                    ],
                    targets=[
                        G.Target(
                            expr="grid_router_connections",
                            legendFormat="Tahoe-LAFS Connections",
                            refId="D",
                        ),
                    ],
                ),
            ]),
            # Cluster-level resource graphs defined elsewhere in this file.
            G.Row(
                title="Cluster",
                panels=[
                    cpu_usage(PROMETHEUS, ["1m", "5m", "10m"]),
                    memory_usage(PROMETHEUS),
                    network_usage(PROMETHEUS),
                    filesystem_usage(PROMETHEUS),
                ],
            ),
            G.Row(panels=[
                G.SingleStat(
                    title='Current Customer Deployments',
                    dataSource='prometheus',
                    valueName='current',
                    sparkline=G.SparkLine(show=True),
                    targets=[
                        G.Target(
                            expr='s4_deployment_gauge',
                            refId="E",
                        ),
                    ],
                ),
                G.SingleStat(
                    title='Unhandled Errors',
                    dataSource='prometheus',
                    valueName='current',
                    sparkline=G.SparkLine(show=True),
                    targets=[
                        G.Target(
                            expr='s4_unhandled_error_counter',
                            refId="F",
                        ),
                    ],
                ),
            ]),
        ],
    ).auto_panel_ids()
def tahoe_lafs_transfer_rate(datasource):
    """Graph benchmarked Tahoe-LAFS transfer rates.

    Plots, for both upload and download, the recent average rate and the
    rate that 90% of benchmark runs meet or exceed.

    :param datasource: Grafana data source name to query.
    """
    def refidgen():
        # Yield "0", "1", "2", ... as target refIds.  The original used
        # ``unicode(i)`` which only exists on Python 2; ``str(i)`` produces
        # the same text there and also works on Python 3.
        for i in count():
            yield str(i)
    refid = refidgen()

    return G.Graph(
        title="Tahoe-LAFS Benchmarked Transfer Rate",
        dataSource=datasource,
        xAxis=X_TIME,
        yAxes=[
            G.YAxis(
                format="Bps",
                label="Transfer Rate",
            ),
            G.YAxis(
                show=False,
            ),
        ],
        targets=list(
            G.Target(
                # The metric is a Histogram.  The _sum goes up by the number
                # of bytes/second observed by each sample taken.  For
                # example, if the first benchmark run observes 100
                # bytes/sec transfer rate, the _sum is 100.  If the second
                # benchmark run observes 75 bytes/sec transfer rate, the
                # _sum is then 175.  The _count gives the total number of
                # samples present in the _sum.
                #
                # The rate() of the _sum over a recent interval is
                # bytes/sec/sec.  The rate() of the _count over the same
                # interval is 1/sec.  The quotient is bytes/sec and gives
                # an average for the metric over the recent interval.
                #
                # Take the average of all such results to squash series
                # from different pods into a single result.  There should
                # be minimal overlap but whenever the pod gets recreated
                # (because the deploying is updated, for example) there's a
                # little.
                expr=""" avg without (pod,instance) ( rate(tahoe_lafs_roundtrip_benchmark_{metric}_bytes_per_second_sum{{service="tahoe-lafs-transfer-rate-monitor"}}[60m]) / rate(tahoe_lafs_roundtrip_benchmark_{metric}_bytes_per_second_count{{service="tahoe-lafs-transfer-rate-monitor"}}[60m]) ) """.format(metric=metric),
                legendFormat="avg " + legend_format,
                refId=next(refid),
            )
            for (legend_format, metric)
            in [("upload", "write"), ("download", "read")]
        ) + list(
            G.Target(
                # The average above is nice, I suppose.  It doesn't give
                # the full picture, though.  So also compute the rate which
                # is slower than 90% of the results (faster than 10% of the
                # results).  This is basically what a 90% transfer speed
                # SLA would talk about.  Put another way, 90% of uploads
                # should occur at a rate equal to or greater than the one
                # plotted by this expression.
                expr=""" avg without (pod,instance) ( histogram_quantile( 0.10, rate( tahoe_lafs_roundtrip_benchmark_{metric}_bytes_per_second_bucket{{service="tahoe-lafs-transfer-rate-monitor"}}[60m] ) ) ) """.format(metric=metric),
                legendFormat="90% " + legend_format,
                refId=next(refid),
            )
            for (legend_format, metric)
            in [("upload", "write"), ("download", "read")]
        ),
    )
legendFormat='3xx', refId='C'), G.Target( expr= 'service_status:http_request_duration_seconds_count:irate{service="ucdapi",status_code=~"4.."}', legendFormat='4xx', refId='D'), G.Target( expr= 'service_status:http_request_duration_seconds_count:irate{service="ucdapi",status_code=~"5.."}', legendFormat='5xx', refId='E'), ], aliasColors=ALIAS_COLORS, yAxes=[ G.YAxis(format=G.OPS_FORMAT), G.YAxis(format=G.SHORT_FORMAT, show=False) ], nullPointMode=G.NULL_AS_ZERO, stack=True, lineWidth=0, fill=10, tooltip=G.Tooltip(valueType=G.INDIVIDUAL)), G.Graph( title='RPS', dataSource='prometheus', targets=[ G.Target( expr= 'sum(irate(http_request_duration_seconds_count{service="ucdapi"}[1m])) by (status_code, method)', ),
def make(prefix, title):
    """Build the isaacranks dashboard for one deployment.

    :param prefix: Service-name prefix interpolated into every Prometheus
        expression (e.g. producing ``<prefix>-isaacranks-web``).
    :param title: Dashboard title.
    """
    def target(expr, **kw):
        # Helper: substitute *prefix* into the expression template.
        return G.Target(expr=expr.format(prefix), **kw)

    return G.Dashboard(
        title=title,
        rows=[
            # Liveness: pod counts for the web and rebuild services.
            G.Row(panels=[
                G.SingleStat(
                    title='Pods up (web)',
                    dataSource='prometheus',
                    valueName='current',
                    sparkline=G.SparkLine(show=True),
                    targets=[
                        target(
                            expr='count by(service) (up{{service="{}-isaacranks-web"}} == 1)'
                        )
                    ]),
                G.SingleStat(
                    title='Pods up (rebuild)',
                    dataSource='prometheus',
                    valueName='current',
                    sparkline=G.SparkLine(show=True),
                    targets=[
                        target(
                            expr='count by(service) (up{{service="{}-isaacranks-rebuild"}} == 1)'
                        )
                    ]),
            ]),
            # HTTP traffic: request rate by status class, and latency
            # quantiles (recorded in seconds; * 1000 converts to ms).
            G.Row(panels=[
                G.Graph(
                    title='HTTP RPS',
                    dataSource='prometheus',
                    targets=[
                        target(
                            expr='service_status:http_request_duration_seconds_count:irate{{service="{}-isaacranks-web",status_code=~"1.."}}',
                            legendFormat='1xx',
                            refId='A'),
                        target(
                            expr='service_status:http_request_duration_seconds_count:irate{{service="{}-isaacranks-web",status_code=~"2.."}}',
                            legendFormat='2xx',
                            refId='B'),
                        target(
                            expr='service_status:http_request_duration_seconds_count:irate{{service="{}-isaacranks-web",status_code=~"3.."}}',
                            legendFormat='3xx',
                            refId='C'),
                        target(
                            expr='service_status:http_request_duration_seconds_count:irate{{service="{}-isaacranks-web",status_code=~"4.."}}',
                            legendFormat='4xx',
                            refId='D'),
                        target(
                            expr='service_status:http_request_duration_seconds_count:irate{{service="{}-isaacranks-web",status_code=~"5.."}}',
                            legendFormat='5xx',
                            refId='E'),
                    ],
                    aliasColors=ALIAS_COLORS,
                    yAxes=[
                        G.YAxis(format=G.OPS_FORMAT),
                        G.YAxis(format=G.SHORT_FORMAT, show=False)
                    ],
                    nullPointMode=G.NULL_AS_ZERO,
                    stack=True,
                    lineWidth=0,
                    fill=10,
                    tooltip=G.Tooltip(valueType=G.INDIVIDUAL)),
                G.Graph(
                    title='HTTP latency',
                    dataSource='prometheus',
                    targets=[
                        target(
                            expr='service:http_request_duration_seconds:50p{{service="{}-isaacranks-web"}} * 1000',
                            legendFormat='0.5q',
                            refId='A'),
                        target(
                            expr='service:http_request_duration_seconds:90p{{service="{}-isaacranks-web"}} * 1000',
                            legendFormat='0.9q',
                            refId='B'),
                        target(
                            expr='service:http_request_duration_seconds:99p{{service="{}-isaacranks-web"}} * 1000',
                            legendFormat='0.99q',
                            refId='C'),
                    ],
                    aliasColors=ALIAS_COLORS,
                    yAxes=[
                        G.YAxis(format=G.MILLISECONDS_FORMAT),
                        G.YAxis(format=G.SHORT_FORMAT, show=False)
                    ]),
            ]),
            # Ballot generation: throughput (per version) and latency.
            G.Row(panels=[
                G.Graph(
                    title='Ballots',
                    dataSource='prometheus',
                    targets=[
                        target(
                            expr='service_version:isaacranks_ballot_generation_seconds_count:irate{{service="{}-isaacranks-web"}}',
                            legendFormat='{{version}}',
                            refId='A')
                    ],
                    yAxes=[
                        G.YAxis(format=G.OPS_FORMAT),
                        G.YAxis(format=G.SHORT_FORMAT, show=False),
                    ],
                    nullPointMode=G.NULL_AS_ZERO,
                    stack=True,
                    lineWidth=0,
                    fill=10,
                    tooltip=G.Tooltip(valueType=G.INDIVIDUAL)),
                G.Graph(
                    title='Ballot latency',
                    dataSource='prometheus',
                    targets=[
                        target(
                            expr='service:isaacranks_ballot_generation_seconds:50p{{service="{}-isaacranks-web"}} * 1000',
                            legendFormat='0.5q',
                            refId='A'),
                        target(
                            expr='service:isaacranks_ballot_generation_seconds:90p{{service="{}-isaacranks-web"}} * 1000',
                            legendFormat='0.9q',
                            refId='B'),
                        target(
                            expr='service:isaacranks_ballot_generation_seconds:99p{{service="{}-isaacranks-web"}} * 1000',
                            legendFormat='0.99q',
                            refId='C'),
                    ],
                    yAxes=[
                        G.YAxis(format=G.MILLISECONDS_FORMAT),
                        G.YAxis(format=G.SHORT_FORMAT, show=False)
                    ]),
            ]),
            # Vote casting: throughput (per version) and latency.
            G.Row(panels=[
                G.Graph(
                    title='Votes',
                    dataSource='prometheus',
                    targets=[
                        target(
                            expr='service_version:isaacranks_vote_casting_seconds_count:irate{{service="{}-isaacranks-web"}}',
                            legendFormat='{{version}}',
                            refId='A')
                    ],
                    yAxes=[
                        G.YAxis(format=G.OPS_FORMAT),
                        G.YAxis(format=G.SHORT_FORMAT, show=False)
                    ],
                    nullPointMode=G.NULL_AS_ZERO,
                    stack=True,
                    lineWidth=0,
                    fill=10,
                    tooltip=G.Tooltip(valueType=G.INDIVIDUAL)),
                G.Graph(
                    title='Vote latency',
                    dataSource='prometheus',
                    targets=[
                        target(
                            expr='service:isaacranks_vote_casting_seconds:50p{{service="{}-isaacranks-web"}} * 1000',
                            legendFormat='0.5q',
                            refId='A'),
                        target(
                            expr='service:isaacranks_vote_casting_seconds:90p{{service="{}-isaacranks-web"}} * 1000',
                            legendFormat='0.9q',
                            refId='B'),
                        target(
                            expr='service:isaacranks_vote_casting_seconds:99p{{service="{}-isaacranks-web"}} * 1000',
                            legendFormat='0.99q',
                            refId='C'),
                    ],
                    yAxes=[
                        G.YAxis(format=G.MILLISECONDS_FORMAT),
                        G.YAxis(format=G.SHORT_FORMAT, show=False)
                    ]),
            ]),
            # Rebuild health: age of the last rebuild and its duration.
            # The ``!= 0`` filters drop series where the metric is unset.
            G.Row(panels=[
                G.Graph(
                    title='Time since last rebuild',
                    dataSource='prometheus',
                    targets=[
                        target(
                            expr='time() - (isaacranks_last_rebuild_timestamp{{service="{}-isaacranks-rebuild"}} != 0)',
                            legendFormat='Age')
                    ],
                    legend=G.Legend(current=True),
                    yAxes=[
                        G.YAxis(format=G.SECONDS_FORMAT),
                        G.YAxis(format=G.SHORT_FORMAT, show=False)
                    ]),
                G.Graph(
                    title='Rebuild duration',
                    dataSource='prometheus',
                    targets=[
                        target(
                            expr='isaacranks_last_rebuild_duration_seconds{{service="{}-isaacranks-rebuild"}} != 0',
                            legendFormat='Duration')
                    ],
                    legend=G.Legend(current=True),
                    yAxes=[
                        G.YAxis(format=G.SECONDS_FORMAT),
                        G.YAxis(format=G.SHORT_FORMAT, show=False)
                    ]),
            ])
        ]).auto_panel_ids()
def dashboard():
    """Build the S4 dashboard (signups, usage, convergence, cluster
    resources, transfer rate, deployments, and unhandled errors).

    Returns a ``G.Dashboard`` with panel ids auto-assigned.
    """
    PROMETHEUS = "prometheus"
    return G.Dashboard(
        title="S4",
        rows=[
            G.Row(panels=[
                G.Graph(
                    title="Signups",
                    dataSource=PROMETHEUS,
                    xAxis=X_TIME,
                    yAxes=[
                        G.YAxis(
                            format="none",
                            label="Count",
                        ),
                        G.YAxis(
                            format="none",
                            label="Count",
                        ),
                    ],
                    targets=[
                        G.Target(
                            # Filter down to just the signup pod since
                            # that's the only one where this metric value
                            # is meaningful.  Some other pods report a 0
                            # value for this metric because they happen to
                            # import the Python code that defines the
                            # object representing it.
                            #
                            # Also, sum over the selected series to account
                            # for pod replacement.
                            expr='sum(wormhole_signup_started{pod=~"s4-signup.*"})',
                            legendFormat="Wormhole Signups Started",
                            refId="A",
                        ),
                        G.Target(
                            expr='sum(wormhole_signup_success{pod=~"s4-signup.*"})',
                            legendFormat="Wormhole Signups Completed",
                            refId="B",
                        ),
                        G.Target(
                            expr='sum(wormhole_signup_failure{pod=~"s4-signup.*"})',
                            legendFormat="Wormhole Signups Failed",
                            refId="C",
                        ),
                    ],
                ),
                G.Graph(
                    title="Usage",
                    dataSource=PROMETHEUS,
                    # Stack the connection graphs on each other, revealing
                    # both a total and a distribution across different grid
                    # router instances.
                    stack=True,
                    tooltip=G.Tooltip(
                        valueType=G.INDIVIDUAL,
                    ),
                    xAxis=X_TIME,
                    yAxes=[
                        G.YAxis(
                            format="none",
                            label="Count",
                        ),
                        G.YAxis(
                            format="none",
                            label="Count",
                        ),
                    ],
                    targets=[
                        G.Target(
                            expr="grid_router_connections",
                            legendFormat="Tahoe-LAFS Connections",
                            refId="D",
                        ),
                    ],
                ),
                last_convergence(PROMETHEUS),
            ]),
            # Cluster-level resource graphs defined elsewhere in this file.
            G.Row(
                title="Cluster",
                panels=[
                    cpu_usage(PROMETHEUS, ["1m", "5m", "10m"]),
                    memory_usage(PROMETHEUS),
                    network_usage(PROMETHEUS),
                    filesystem_usage(PROMETHEUS),
                ],
            ),
            G.Row(
                title="Cluster2",
                panels=[
                    process_open_fds(PROMETHEUS),
                ],
            ),
            G.Row(panels=[
                tahoe_lafs_transfer_rate(PROMETHEUS),
                s4_customer_deployments(PROMETHEUS),
                unhandled_errors(PROMETHEUS),
            ]),
        ],
    ).auto_panel_ids()