# Grafana dashboard (grafanalib): Cortex write path.
# Rows cover Prometheus remote-write retrieval stats, the distributor (QPS,
# latency, per-instance breakdown, sends to ingesters), sample ingest/discard
# rates, the ingester write path, DynamoDB writes (QPS, latency, capacity,
# errors) and ingester memcache. Built from `common` helpers (PromGraph,
# StatusQPSGraph, LatencyGraph) and grafanalib `G` primitives; latencies are
# histogram_quantile over *_bucket series scaled to ms via `* 1e3`.
dashboard = common.Dashboard( uid='writes', title="Cortex > Services (Writes)", rows=[ G.Row( title="Retrieval Stats", collapse=True, panels=[ common.PromGraph( title="Retrieval sent batches", expressions=[ ('{{url}}', 'sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count[1m])) by (url)' ), ], ), common.PromGraph( title="Retrieval batch latency", expressions=[ ('{{url}} 99th', 'histogram_quantile(0.99, sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket[2m])) by (url, le)) * 1e3' ), ('{{url}} 50th', 'histogram_quantile(0.50, sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket[2m])) by (url, le)) * 1e3' ), ('{{url}} mean', '(sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum[2m])) by (url) / sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count[2m])) by (url)) * 1e3' ), ], yAxes=common.LATENCY_AXES, ), common.PromGraph( title="Retrieval sent samples", expressions=[ ('{{url}} success', 'sum(rate(prometheus_remote_storage_succeeded_samples_total[1m])) by (url)' ), ('{{url}} dropped', 'sum(rate(prometheus_remote_storage_dropped_samples_total[1m])) by (url)' ), ('{{url}} retried', 'sum(rate(prometheus_remote_storage_retried_samples_total[1m])) by (url)' ), ('{{url}} failure', 'sum(rate(prometheus_remote_storage_failed_samples_total[1m])) by (url)' ), ], ), common.PromGraph( title="Queue", expressions=[ ('{{url}}: queue length', 'sum(prometheus_remote_storage_pending_samples) by (url)' ), ('{{url}}: lag', 'max(time()-prometheus_remote_storage_queue_highest_sent_timestamp_seconds) by (url)' ), ('{{url}}: shards', 'max(prometheus_remote_storage_shards) by (url)'), ], ), ], ), G.Row( title="Distributor", panels=[ common.StatusQPSGraph( common.PROMETHEUS, "Distributor write QPS", 'rate(cortex_request_duration_seconds_count{job="cortex/distributor"}[1m])' ), common.LatencyGraph("cortex", "Distributor Write", "cortex/distributor"), ], ), G.Row( title="Distributor breakdown", collapse=True, 
panels=[ common.PromGraph( title="Distributor Error Rate", expressions=[ ('{{instance}}', 'sum by (instance)(rate(cortex_request_duration_seconds_count{job="cortex/distributor", status_code =~ "5.."}[1m]))' ), ], ), common.PromGraph( title="Distributor write latency", expressions=[ ('99th centile {{instance}}', 'histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket{job="cortex/distributor"}[2m])) by (instance,le)) * 1e3' ), ], yAxes=common.LATENCY_AXES, ), ], ), G.Row( title="Distributor sends", collapse=True, panels=[ common.StatusQPSGraph( common.PROMETHEUS, "Distributor send QPS", 'rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/distributor",operation="/cortex.Ingester/Push"}[1m])' ), common.PromGraph( title="Distributor send latency", expressions=[ ('99th centile', 'histogram_quantile(0.99, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m])) by (le)) * 1e3' ), ('50th centile', 'histogram_quantile(0.50, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m])) by (le)) * 1e3' ), ('Mean', 'sum(rate(cortex_ingester_client_request_duration_seconds_sum{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m])) * 1e3 / sum(rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/distributor",operation="/cortex.Ingester/Push"}[2m]))' ), ], yAxes=common.LATENCY_AXES, ), ], ), G.Row( title="Samples", collapse=True, panels=[ common.PromGraph( title="Push sample ingest rate by instance (>1%)", expressions=[ ('{{user}}', 'sum by (user)(rate(cortex_distributor_received_samples_total{job="cortex/distributor"}[1m])) > ignoring(user) group_left() (sum(rate(cortex_distributor_received_samples_total{job="cortex/distributor"}[1m]))/100)' ), ], legend=G.Legend(show=False), yAxes=common.OPS_AXIS, ), common.PromGraph( title="Rule sample ingest rate by instance", 
expressions=[ ( '{{user}}', # '> 1' is to exclude instances which are not connected and simply alerting on absent metrics 'sum by (user)(rate(cortex_distributor_received_samples_total{job="cortex/ruler"}[1m])) > 1' ), ], legend=G.Legend(show=False), yAxes=common.OPS_AXIS, ), common.PromGraph( title="Sample discard rate by instance ID & reason", expressions=[ ('{{user}} - {{reason}} ', 'sum by (user, reason) (rate(cortex_discarded_samples_total{reason!="duplicate-sample"}[1m])) > 0' ), ], yAxes=common.OPS_AXIS, ), ], ), G.Row( title="Ingester", panels=[ common.StatusQPSGraph( common.PROMETHEUS, "Ingester write QPS", 'rate(cortex_request_duration_seconds_count{job="cortex/ingester"}[1m])' ), common.PromGraph( title="Ingester write latency", expressions=[ ('99th centile', 'job_route:cortex_request_duration_seconds:99quantile{job="cortex/ingester", route="/cortex.Ingester/Push"} * 1e3' ), ('50th centile', 'job_route:cortex_request_duration_seconds:50quantile{job="cortex/ingester", route="/cortex.Ingester/Push"} * 1e3' ), ('Mean', 'sum(rate(cortex_request_duration_seconds_sum{job="cortex/ingester", route="/cortex.Ingester/Push"}[2m])) * 1e3 / sum(rate(cortex_request_duration_seconds_count{job="cortex/ingester", route="/cortex.Ingester/Push"}[2m]))' ), ], yAxes=common.LATENCY_AXES, ), ], ), G.Row( title="DynamoDB", panels=[ common.PromGraph( title="DynamoDB write QPS", expressions=[ ('BatchWriteItem {{job}}: {{status_code}}', 'sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[1m])) by (job, status_code)' ), ], ), common.PromGraph( title="DynamoDB write latency", expressions= [('BatchWriteItem: 99th', 'histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[2m])) by (le)) * 1e3' ), ('BatchWriteItem: 50th', 'histogram_quantile(0.5, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", 
operation="DynamoDB.BatchWriteItem"}[2m])) by (le)) * 1e3' ), ('BatchWriteItem: Mean', 'sum(rate(cortex_dynamo_request_duration_seconds_sum{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[2m])) * 1e3 / sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[2m]))' )], yAxes=common.LATENCY_AXES, ), ], ), G.Row( title="DynamoDB", panels=[ common.PromGraph( title="DynamoDB write capacity consumed [rate1m]", expressions=[ ('{{table}} consumed', 'sum(rate(cortex_dynamo_consumed_capacity_total{job=~"cortex/.*", operation="DynamoDB.BatchWriteItem"}[1m])) by (table) > 0' ), ('{{table}} provisioned', 'max(cortex_dynamo_table_capacity_units{job="cortex/table-manager", op="write"}) by (table) > 0' ), ('{{table}} provisioned', 'max(cortex_table_capacity_units{job="cortex/table-manager", op="write"}) by (table) > 0' ), ], yAxes=common.OPS_AXIS, ), common.PromGraph( title="DynamoDB write errors", expressions=[ ('{{table}} - {{error}}', 'sum(rate(cortex_dynamo_failures_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' ), ('{{table}} - Throttled', 'sum(rate(cortex_dynamo_throttled_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' ), ], yAxes=common.OPS_AXIS, ), ], ), G.Row( title="Memcache", panels=[ common.PromGraph( title="Ingester hit rate", expressions=[ ('{{name}}', 'sum(rate(cortex_cache_hits{job="cortex/ingester"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/ingester"}[2m])) by (name)' ), ], yAxes=common.PercentageAxes(), ), common.PromGraph( title="Memcache QPS", expressions=[ ('{{method}} {{status_code}}', 'sum(rate(cortex_memcache_request_duration_seconds_count{job="cortex/ingester"}[1m])) by (method,status_code)' ), ], ), common.PromGraph( title="Memcache latency", expressions=[ ('{{method}} 99th centile', 'histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ingester"}[2m])) by 
(le,method)) * 1e3' ), ('{{method}} 50th centile', 'histogram_quantile(0.5, sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ingester"}[2m])) by (le,method)) * 1e3' ), ('{{method}} Mean', 'sum by (method)(rate(cortex_memcache_request_duration_seconds_sum{job="cortex/ingester"}[2m])) * 1e3 / sum by (method)(rate(cortex_memcache_request_duration_seconds_count{job="cortex/ingester"}[2m]))' ), ], yAxes=common.LATENCY_AXES, ), ], ), ], )
# Grafana dashboard (grafanalib): Cortex read path.
# Rows cover the query-frontend and querier (RED rows), ingester reads (all
# routes except /cortex.Ingester/Push), DynamoDB reads (QPS, latency, capacity,
# errors), memcache for blocks and chunks, cache hit rates, and S3 GetObject.
# FIX: the "S3 read latency" Mean expression previously divided
# cortex_s3_request_duration_seconds_sum filtered on operation="S3.PutObject"
# by ..._count filtered on operation="S3.GetObject", mixing write time with
# read counts; the numerator now also filters S3.GetObject so the mean matches
# the 99th/50th centile series in the same panel.
dashboard = common.Dashboard( uid='reads', title="Cortex > Services (Reads)", rows=[ common.REDRow( 'cortex', 'Query Frontend read', 'cortex/query-frontend', rule_root="job_route:", extra_conditions=",route=\"api_prom_api_v1_query_range\""), common.REDRow('cortex', 'Querier read', 'cortex/querier'), G.Row( title="Ingester", panels=[ common.PromGraph( title="Ingester read QPS", expressions=[ ('{{route}}: {{status_code}}', 'sum(rate(cortex_request_duration_seconds_count{job="cortex/ingester", route!="/cortex.Ingester/Push"}[1m])) by (route, status_code)' ), ], yAxes=common.OPS_AXIS, ), common.PromGraph( title="Ingester read latency", expressions=[ ('{{route}}: 99th centile', 'job_route:cortex_request_duration_seconds:99quantile{job="cortex/ingester", route!="/cortex.Ingester/Push"} * 1e3' ), ('{{route}}: 50th centile', 'job_route:cortex_request_duration_seconds:50quantile{job="cortex/ingester", route!="/cortex.Ingester/Push"} * 1e3' ), ('{{route}}: Mean', 'sum(rate(cortex_request_duration_seconds_sum{job="cortex/ingester", route!="/cortex.Ingester/Push"}[2m])) by (route) * 1e3 / sum(rate(cortex_request_duration_seconds_count{job="cortex/ingester", route!="/cortex.Ingester/Push"}[2m])) by (route)' ), ], yAxes=common.LATENCY_AXES, ), ], ), G.Row( title="DynamoDB", panels=[ common.PromGraph( title="DynamoDB read QPS", expressions=[ ('QueryPages {{job}}: {{status_code}}', 'sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[1m])) by (job, status_code)' ), ], ), common.PromGraph( title="DynamoDB read latency", expressions= [('QueryPages: 99th', 'histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m])) by (le)) * 1e3' ), ('QueryPages: 50th', 'histogram_quantile(0.5, sum(rate(cortex_dynamo_request_duration_seconds_bucket{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m])) by (le)) * 1e3' ), ('QueryPages: Mean', 
'sum(rate(cortex_dynamo_request_duration_seconds_sum{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m])) * 1e3 / sum(rate(cortex_dynamo_request_duration_seconds_count{job=~"cortex/.*", operation="DynamoDB.QueryPages"}[2m]))' )], yAxes=common.LATENCY_AXES, ), ], ), G.Row( title="DynamoDB", panels=[ common.PromGraph( title="DynamoDB read capacity consumed [rate1m]", expressions=[ ('{{table}} consumed', 'sum(rate(cortex_dynamo_consumed_capacity_total{job=~"cortex/.*",operation!~".*Write.*"}[1m])) by (table) > 0' ), ('{{table}} provisioned', 'max(cortex_dynamo_table_capacity_units{job="cortex/table-manager", op="read"}) by (table) > 0' ), ('{{table}} provisioned', 'max(cortex_table_capacity_units{job="cortex/table-manager", op="read"}) by (table) > 0' ), ], yAxes=common.OPS_AXIS, ), common.PromGraph( title="DynamoDB read errors", expressions=[ ('{{job}} - {{table}} - {{error}}', 'sum(rate(cortex_dynamo_failures_total{job=~"cortex/.*", operation!~".*Write.*"}[1m])) by (job, error, table) > 0' ), ('{{job}} - {{table}} - Throttled', 'sum(rate(cortex_dynamo_throttled_total{job=~"cortex/.*", operation!~".*Write.*"}[1m])) by (job, error, table) > 0' ), ], yAxes=common.OPS_AXIS, ), ], ), G.Row( title="Memcache (blocks)", panels=[ common.PromGraph( title="Memcache read QPS (blocks)", expressions=[ ('{{name}} {{operation}}', 'sum(rate(thanos_memcached_operation_duration_seconds_count{kubernetes_namespace="cortex"}[1m])) by (name, operation)' ), ('{{name}} {{operation}} {{reason}}', 'sum(rate(thanos_memcached_operation_failures_total{kubernetes_namespace="cortex"}[1m])) by (name, operation, reason) > 0' ), ], yAxes=G.single_y_axis(format=G.OPS_FORMAT), ), common.PromGraph( title="Memcache read latency (blocks)", expressions=[ ('99% {{name}}', 'histogram_quantile(0.99, sum(rate(thanos_memcached_operation_duration_seconds_bucket{job=~"cortex/querier|cortex/store-gateway",operation="getmulti"}[2m])) by (le, name))' ), ('Mean', 
'sum(rate(thanos_memcached_operation_duration_seconds_sum{job=~"cortex/querier|cortex/store-gateway",operation="getmulti"}[2m])) / sum(rate(thanos_memcached_operation_duration_seconds_count{job=~"cortex/querier|cortex/store-gateway",operation="getmulti"}[2m]))' ), ], yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), ), ], ), G.Row( title="Memcache (chunks)", panels=[ common.StatusQPSGraph( common.PROMETHEUS, "Memcache read QPS (chunks)", 'sum by (job,status_code)(rate(cortex_memcache_request_duration_seconds_count{method="Memcache.GetMulti", job=~"cortex/querier|cortex/query-frontend"}[1m]))' ), common.PromGraph( title="Memcache read latency (chunks)", expressions=[ ('99% {{name}}', 'histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket{job=~"cortex/querier|cortex/query-frontend",method="Memcache.GetMulti"}[2m])) by (le, name))' ), ('Mean', 'sum(rate(cortex_memcache_request_duration_seconds_sum{job=~"cortex/querier|cortex/query-frontend",method="Memcache.GetMulti"}[2m])) / sum(rate(cortex_memcache_request_duration_seconds_count{job=~"cortex/querier|cortex/query-frontend",method="Memcache.GetMulti"}[2m]))' ), ], yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), ), ], ), G.Row( title="Cache", panels=[ common.PromGraph( title="Querier Cache hit rate", expressions=[ ('{{name}}', 'sum(rate(cortex_cache_hits{job="cortex/querier"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/querier"}[2m])) by (name)' ), ], yAxes=common.PercentageAxes(), ), common.PromGraph( title="Query-frontend cache hit rate", expressions=[ ('{{name}}', 'sum(rate(cortex_cache_hits{job="cortex/query-frontend"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/query-frontend"}[2m])) by (name)' ), ], yAxes=common.PercentageAxes(), ), ], ), G.Row( title="S3", collapse=True, panels=[ common.StatusQPSGraph( common.PROMETHEUS, "S3 read QPS", 'rate(cortex_s3_request_duration_seconds_count{operation="S3.GetObject", job=~"cortex/.*"}[1m])' ), 
# Mean below uses S3.GetObject in both numerator and denominator (was S3.PutObject in the numerator).
common.PromGraph( title="S3 read latency", expressions=[ ('99th centile', 'histogram_quantile(0.99, sum(rate(cortex_s3_request_duration_seconds_bucket{job=~"cortex/.*", operation="S3.GetObject"}[2m])) by (le)) * 1e3' ), ('50th centile', 'histogram_quantile(0.5, sum(rate(cortex_s3_request_duration_seconds_bucket{job=~"cortex/.*", operation="S3.GetObject"}[2m])) by (le)) * 1e3' ), ('Mean', 'sum(rate(cortex_s3_request_duration_seconds_sum{job=~"cortex/.*", operation="S3.GetObject"}[2m])) * 1e3/ sum(rate(cortex_s3_request_duration_seconds_count{job=~"cortex/.*", operation="S3.GetObject"}[2m]))' ), ], yAxes=common.LATENCY_AXES, ), ], ), ], )
# Grafana dashboard (grafanalib): Cortex chunk lifecycle.
# Rows cover in-memory chunk counts and chunks-per-series, chunk size/age/
# length distributions observed at flush time, the flush queue and flush/drop
# rates (with Flushed/Dropped overlaid as lines via seriesOverrides rather
# than stacked), DynamoDB write capacity and errors, ingester ring ownership
# and membership (old and new metric names OR-ed together), and index-entry/
# cache stats. `W.stacked` wraps graphs whose series should be stacked.
dashboard = common.Dashboard( uid='chunks', title="Cortex > Chunks", rows=[ G.Row( panels=[ common.PromGraph( title="Number of chunks (in memory, in ingesters)", expressions=[ ('', 'sum(cortex_ingester_memory_chunks{job="cortex/ingester"})'), ], ), common.PromGraph( title="Chunks per series", expressions=[ ( '', 'sum(cortex_ingester_memory_chunks{job="cortex/ingester"}) / sum(cortex_ingester_memory_series{job="cortex/ingester"})' ), ], ), ] ), G.Row( panels=[ common.PromGraph( title="Chunk Size Bytes (on flush)", expressions=[ ( "99th Percentile", 'histogram_quantile(0.99, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "50th Percentile", 'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "10th Percentile", 'histogram_quantile(0.1, sum(rate(cortex_ingester_chunk_size_bytes_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "Mean", 'sum(rate(cortex_ingester_chunk_size_bytes_sum{job="cortex/ingester"}[2m])) / sum(rate(cortex_ingester_chunk_size_bytes_count{job="cortex/ingester"}[2m]))' ), ], yAxes=[ G.YAxis(format=G.BYTES_FORMAT), G.YAxis(format=G.SHORT_FORMAT), ], ), common.PromGraph( title="Chunk Age (on flush)", expressions=[ ( "99th Percentile", 'histogram_quantile(0.99, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "50th Percentile", 'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "10th Percentile", 'histogram_quantile(0.1, sum(rate(cortex_ingester_chunk_age_seconds_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "Mean", 'sum(rate(cortex_ingester_chunk_age_seconds_sum{job="cortex/ingester"}[2m])) / sum(rate(cortex_ingester_chunk_age_seconds_count{job="cortex/ingester"}[2m]))' ), ], yAxes=[ G.YAxis(format=G.DURATION_FORMAT), G.YAxis(format=G.SHORT_FORMAT), ], ), common.PromGraph( title="Chunk Length (on flush)", expressions=[ ( "99th 
Percentile", 'histogram_quantile(0.99, sum(rate(cortex_ingester_chunk_length_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "50th Percentile", 'histogram_quantile(0.5, sum(rate(cortex_ingester_chunk_length_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "10th Percentile", 'histogram_quantile(0.1, sum(rate(cortex_ingester_chunk_length_bucket{job="cortex/ingester"}[2m])) by (le))' ), ( "Mean", 'sum(rate(cortex_ingester_chunk_length_sum{job=\"cortex/ingester\"}[2m])) / sum(rate(cortex_ingester_chunk_length_count{job=\"cortex/ingester\"}[2m]))' ), ], ), ] ), G.Row( panels=[ W.stacked( common.PromGraph( title="Series Flush Queue Length", expressions=[ ("{{instance}}", 'cortex_ingester_flush_queue_length{job="cortex/ingester"}'), ], ) ), W.stacked( common.PromGraph( title="Chunk Flush Rate (rate[1m])", expressions=[ # This is the rate at which chunks are added to the flush queue ( "{{reason}}", 'sum by (reason)(rate(cortex_ingester_flush_reasons[1m]) or rate(cortex_ingester_series_flushed_total[1m]) or rate(cortex_ingester_flushing_enqueued_series_total[1m]))' ), # This is the rate at which chunks are removed from the flush queue ("Flushed", 'sum(rate(cortex_ingester_chunks_stored_total[1m]))'), # Chunks dropped for being too small ("Dropped", 'sum(rate(cortex_ingester_dropped_chunks_total[1m]))'), ], # Show flush and dropped rates as a line overlayed on enqueue rates, not stacked and not filled seriesOverrides=[ { "alias": "Flushed", "fill": 1, "linewidth": 1, "stack": False }, { "alias": "Dropped", "fill": 1, "linewidth": 1, "stack": False } ], ) ), ] ), G.Row( title="DynamoDB", collapse=True, panels=[ common.PromGraph( title="DynamoDB write capacity consumed [rate1m]", expressions=[ ( '{{table}} consumed', 'sum(rate(cortex_dynamo_consumed_capacity_total{operation="DynamoDB.BatchWriteItem"}[1m])) by (table) > 0' ), ( '{{table}} provisioned', 'max(cortex_table_capacity_units{job="cortex/table-manager", op="write"}) by (table) > 0' ), ], yAxes=common.OPS_AXIS, ), 
common.PromGraph( title="DynamoDB write errors", expressions=[ ( '{{table}} - {{error}}', 'sum(rate(cortex_dynamo_failures_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' ), ( '{{table}} - Throttled', 'sum(rate(cortex_dynamo_throttled_total{job=~"cortex/.*", operation=~".*Write.*"}[1m])) by (job, error, table) > 0' ), ], yAxes=common.OPS_AXIS, ), ], ), G.Row( title="Ring", collapse=True, panels=[ W.stacked( common.PromGraph( title="Ingester Ring Ownership", expressions=[ ( '{{ingester}}', 'max(cortex_ring_ingester_ownership_percent{job="cortex/distributor"}) by (ingester) or label_replace(max(cortex_ring_member_ownership_percent{job="cortex/distributor"}) by (member), "ingester", "$1", "member", "(.*)")' ), ], # Show y-axis slightly above 100% in case series overlap yAxes=common.PercentageAxes(max=1.2), ) ), W.stacked( common.PromGraph( title="Ingesters In Ring", expressions=[ ( '{{state}}', 'max(cortex_ring_ingesters{job="cortex/distributor"}) by (state) or max(cortex_ring_members{job="cortex/distributor"}) by (state)' ), ], yAxes=[ G.YAxis(format=G.NO_FORMAT), G.YAxis(format=G.SHORT_FORMAT), ], ) ), ] ), G.Row( title="Index and Cache", panels=[ common.PromGraph( title="Index entries per chunk", expressions=[ ( '', 'sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{job="cortex/ingester"}[2m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{job="cortex/ingester"}[2m]))' ), ], ), common.PromGraph( title="Ingester hit rate", expressions=[ ( '{{name}}', 'sum(rate(cortex_cache_hits{job="cortex/ingester"}[2m])) by (name) / sum(rate(cortex_cache_fetched_keys{job="cortex/ingester"}[2m])) by (name)' ), ], yAxes=common.PercentageAxes(), ), ] ), ] )
# Grafana dashboard (grafanalib): Cortex ruler (recording rules).
# Rows cover config fetching, the ruler RED row, rule-group evaluation rate/
# duration/latency, ingester queries and pushes from the ruler, rules/sec,
# DynamoDB errors and memcache reads. Several panels deliberately pair an
# old metric name with its replacement under the same legend (e.g.
# cortex_group_evaluation_* vs cortex_prometheus_rule_group_*,
# cortex_rules_processed_total vs cortex_prometheus_rule_evaluations_total)
# so the graph keeps working across Cortex versions — the duplicate legends
# are intentional, not a copy-paste error.
dashboard = common.Dashboard( uid='ruler', title="Cortex > Recording Rules", rows=[ G.Row( title="Configs", collapse=True, panels=[ common.PromGraph( title="Known Configurations", expressions=[ ("Configurations", 'max(cortex_configs{job="cortex/ruler"})'), ("{{status}}", 'max by(status)(cortex_alertmanager_configs{job="cortex/alertmanager"})' ), ], ), common.QPSGraph('cortex_configs', 'Configs', 'cortex/ruler'), common.PromGraph( title="Configs Latency", expressions=[ ("99th centile", 'histogram_quantile(0.99, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' ), ("50th centile", 'histogram_quantile(0.50, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' ), ("Mean", 'sum(rate(cortex_configs_request_duration_seconds_sum{job="cortex/ruler"}[2m])) / sum(rate(cortex_configs_request_duration_seconds_count{job="cortex/ruler"}[2m])) * 1e3' ), ], yAxes=common.LATENCY_AXES, ), ]), common.REDRow('cortex', 'Ruler service', 'cortex/ruler', collapse=True), G.Row([ common.PromGraph( title="Group Evaluations per Second", expressions= [("Groups per second", 'sum(rate(cortex_group_evaluation_duration_seconds_count{job="cortex/ruler"}[1m]))' ), ("Groups per second", 'sum(rate(cortex_prometheus_rule_group_duration_seconds_count{job="cortex/ruler"}[1m]))' )], yAxes=common.OPS_AXIS, ), common.PromGraph( title="Group Evaluation Durations", expressions=[ ("99th centile", 'histogram_quantile(0.99, sum(rate(cortex_group_evaluation_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' ), ("50th centile", 'histogram_quantile(0.50, sum(rate(cortex_group_evaluation_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' ), ("Mean", 'sum(rate(cortex_group_evaluation_duration_seconds_sum{job="cortex/ruler"}[2m])) / sum(rate(cortex_group_evaluation_duration_seconds_count{job="cortex/ruler"}[2m])) * 1e3' ), ("Mean", 'avg(cortex_prometheus_rule_group_last_duration_seconds)*1e3' ), 
("{{rule_group}}", 'max by (rule_group)(cortex_prometheus_rule_group_last_duration_seconds)*1e3 > 500' ), ], yAxes=common.LATENCY_AXES, ), common.PromGraph( title="Group Evaluation Latency", expressions=[ ("99th centile", 'histogram_quantile(0.99, sum(rate(cortex_group_evaluation_latency_seconds_bucket[2m])) by (le)) * 1e3' ), ("50th centile", 'histogram_quantile(0.50, sum(rate(cortex_group_evaluation_latency_seconds_bucket[2m])) by (le)) * 1e3' ), ("Mean", 'sum(rate(cortex_group_evaluation_latency_seconds_sum[2m])) / sum(rate(cortex_group_evaluation_latency_seconds_count[2m])) * 1e3' ), ("Mean", 'avg(time()-(cortex_prometheus_rule_group_last_evaluation_timestamp_seconds>0))*1000' ), ("Max", 'max(time()-(cortex_prometheus_rule_group_last_evaluation_timestamp_seconds>0))*1e3' ), ], yAxes=common.LATENCY_AXES, ), ]), G.Row( title="Ingester Queries", panels=[ common.QPSGraph('cortex_distributor', 'Ingester Query', 'cortex/ruler', metric_root="query"), common.PromGraph( title="Ingester Query Latency", expressions=[ ("99th centile", 'histogram_quantile(0.99, sum(rate(cortex_distributor_query_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' ), ("50th centile", 'histogram_quantile(0.50, sum(rate(cortex_distributor_query_duration_seconds_bucket{job="cortex/ruler"}[2m])) by (le)) * 1e3' ), ("Mean", 'sum(rate(cortex_distributor_query_duration_seconds_sum{job="cortex/ruler"}[2m])) / sum(rate(cortex_distributor_query_duration_seconds_count{job="cortex/ruler"}[2m])) * 1e3' ), ], yAxes=common.LATENCY_AXES, ), ]), G.Row( title="Ingester Push", panels=[ common.StatusQPSGraph( common.PROMETHEUS, "Ingester Push", 'rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/ruler",operation="/cortex.Ingester/Push"}[1m])' ), common.PromGraph( title="Ingester Push Latency", expressions=[ ("99.7th centile", 'histogram_quantile(0.997, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) by 
(le)) * 1e3' ), ("50th centile", 'histogram_quantile(0.50, sum(rate(cortex_ingester_client_request_duration_seconds_bucket{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) by (le)) * 1e3' ), ("Mean", 'sum(rate(cortex_ingester_client_request_duration_seconds_sum{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) / sum(rate(cortex_ingester_client_request_duration_seconds_count{job="cortex/ruler",operation="/cortex.Ingester/Push"}[2m])) * 1e3' ), ], yAxes=common.LATENCY_AXES, ), ]), G.Row([ common.PromGraph( title="Rules per Second", expressions=[ ("Rules", 'sum(rate(cortex_rules_processed_total{job="cortex/ruler"}[1m]))' ), ("Rules/sec", 'sum(rate(cortex_prometheus_rule_evaluations_total{job="cortex/ruler"}[1m]))' ), ], yAxes=common.OPS_AXIS, ), common.PromGraph( title="Ruler DynamoDB errors", expressions=[ ('{{table}} - {{error}}', 'sum(rate(cortex_dynamo_failures_total{job="cortex/ruler"}[1m])) by (error, table) > 0' ), ], yAxes=common.OPS_AXIS, ), ]), G.Row( title="Memcache", panels=[ common.StatusQPSGraph( common.PROMETHEUS, "Memcache read QPS", 'sum by (job,status_code)(rate(cortex_memcache_request_duration_seconds_count{method="Memcache.GetMulti", job="cortex/ruler"}[1m]))' ), common.PromGraph( title="Memcache read latency", expressions=[ ('99th centile', 'histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ruler",method="Memcache.GetMulti"}[2m])) by (le)) * 1e3' ), ('50th centile', 'histogram_quantile(0.5, sum(rate(cortex_memcache_request_duration_seconds_bucket{job="cortex/ruler",method="Memcache.GetMulti"}[2m])) by (le)) * 1e3' ), ('Mean', 'sum(rate(cortex_memcache_request_duration_seconds_sum{job="cortex/ruler",method="Memcache.GetMulti"}[2m])) * 1e3 / sum(rate(cortex_memcache_request_duration_seconds_count{job="cortex/ruler",method="Memcache.GetMulti"}[2m]))' ), ], yAxes=common.LATENCY_AXES, ), ], ), ], )
# Grafana dashboard (grafanalib): Cortex alertmanager.
# Rows cover alert receive/invalid rates and notification rates by
# integration, config-fetch QPS and latency (collapsed), the alertmanager
# RED row, and per-instance known configurations plus gossip cluster size.
dashboard = common.Dashboard( uid='am', title="Cortex > Alertmanager", rows=[ G.Row( title='Operations', panels=[ common.PromGraph( title="Alerts", expressions=[ ("{{instance}} {{status}}", 'sum by (instance, status)(rate(alertmanager_alerts_received_total{job="cortex/alertmanager"}[2m]))' ), ("{{instance}} invalid", 'sum by (instance, status)(rate(alertmanager_alerts_invalid_total{job="cortex/alertmanager"}[2m]))' ), ], yAxes=common.OPS_AXIS, ), common.PromGraph( title="Notifications", expressions=[ ("{{integration}}", 'sum by (integration)(rate(alertmanager_notifications_total{job="cortex/alertmanager"}[2m]))' ), ], yAxes=common.OPS_AXIS, ), ]), G.Row( title='Alertmanager fetching configs', collapse=True, panels=[ common.QPSGraph('cortex_configs', 'Configs', 'cortex/alertmanager'), common.PromGraph( title="Configs Latency", expressions=[ ("99th centile", 'histogram_quantile(0.99, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/alertmanager"}[2m])) by (le)) * 1e3' ), ("50th centile", 'histogram_quantile(0.50, sum(rate(cortex_configs_request_duration_seconds_bucket{job="cortex/alertmanager"}[2m])) by (le)) * 1e3' ), ("Mean", 'sum(rate(cortex_configs_request_duration_seconds_sum{job="cortex/alertmanager"}[2m])) / sum(rate(cortex_configs_request_duration_seconds_count{job="cortex/alertmanager"}[2m])) * 1e3' ), ], yAxes=common.LATENCY_AXES, ), ]), common.REDRow('cortex', 'Alertmanager', 'cortex/alertmanager'), G.Row([ common.PromGraph( title="Known Configurations", expressions=[ ("{{instance}}", 'cortex_alertmanager_configs_total{job="cortex/alertmanager"}' ), ], ), common.PromGraph( title="Cluster Members", expressions=[ ("{{instance}}", 'sum(alertmanager_cluster_members{job="cortex/alertmanager"}) by (instance)' ), ], ), ]), ], )
# Grafana dashboard (grafanalib): Cortex blocks (TSDB) storage.
# Rows cover in-memory series and head chunks, pod memory and PVC disk usage,
# minutes-since-last-success singlestats for the compactor and bucket-index
# update, block load/upload/dir-sync rates and upload latency, TSDB
# compactions, WAL truncations/checkpoints/corruptions, Thanos object-store
# bucket operations, and the ingester ring (same ownership/membership panels
# as the chunks dashboard, old and new ring metric names OR-ed together).
dashboard = common.Dashboard( uid='cortex-blocks', title="Cortex > Blocks", rows=[ G.Row( title="Data", panels=[ common.PromGraph( title="Number of series in memory, in ingesters", expressions=[ ('', 'sum(cortex_ingester_memory_series{job="cortex/ingester"})' ), ], ), common.PromGraph( title="Head chunks", expressions=[ ('{{instance}}', 'cortex_ingester_tsdb_head_chunks'), ], ), ]), G.Row( title="Resources", panels=[ common.PromGraph( title="Memory Usage", expressions=[ ('{{pod}}', 'sum by(pod)(container_memory_usage_bytes{namespace="cortex",container!="POD",container!=""})' ), ], yAxes=[ G.YAxis(format=G.BYTES_FORMAT), G.YAxis(format=G.SHORT_FORMAT), ], ), common.PromGraph( title="Disk space usage", expressions=[ ('{{persistentvolumeclaim}}', 'kubelet_volume_stats_used_bytes{namespace="cortex"} / kubelet_volume_stats_capacity_bytes{namespace="cortex"}' ), ], yAxes=common.PercentageAxes(), ), ], ), G.Row( title="Last runs", panels=[ G.SingleStat( dataSource=common.PROMETHEUS, title="Last Successful Compactor Run", targets=[ G.Target( '(time()-cortex_compactor_last_successful_run_timestamp_seconds) / 60', refId='A', ), ], format='m', # TODO: Add 'MINUTES_FORMAT' to grafanalib ), G.SingleStat( dataSource=common.PROMETHEUS, title="Last Successful Bucket Index Update", targets=[ G.Target( '(time()-max(cortex_bucket_index_last_successful_update_timestamp_seconds)) / 60', refId='A', ), ], format='m', # TODO: Add 'MINUTES_FORMAT' to grafanalib ), ], ), G.Row( title="Block Operations", panels=[ common.PromGraph( title="Rates", expressions=[ ('{{component}} loads', 'sum by(component)(rate(cortex_bucket_store_block_loads_total{}[1m]))' ), ('{{component}} errors', 'sum by(component)(rate(cortex_bucket_store_block_load_failures_total{}[1m])) > 0' ), ('Uploads', 'sum(rate(cortex_ingester_shipper_uploads_total[5m]))' ), ('Upload errors', 'sum(rate(cortex_ingester_shipper_upload_failures_total[5m]))' ), ('Dir syncs', 'sum(rate(cortex_ingester_shipper_dir_syncs_total[5m]))' ), 
('Dir sync errors', 'sum(rate(cortex_ingester_shipper_dir_sync_failures_total[5m]))' ), ], yAxes=[ G.YAxis(format=G.OPS_FORMAT), G.YAxis(format=G.SHORT_FORMAT), ], ), common.PromGraph( title="Latency", expressions=[ ('99th centile', 'histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{operation="upload"}[5m])) by (le))' ), ('50th centile', 'histogram_quantile(0.5, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{operation="upload"}[5m])) by (le))' ), ('Mean', 'sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{operation="upload"}[5m])) / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{operation="upload"}[5m]))' ), ], yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), ), ], ), G.Row( title="Compactions", panels=[ common.PromGraph( title="Operations", expressions=[ ('Compactions', 'sum(rate(cortex_ingester_tsdb_compactions_total[5m]))' ), ('errors', 'sum(rate(cortex_ingester_tsdb_compactions_failed_total[5m]))' ), ], ), common.PromGraph( title="Latency", expressions=[ ('99th centile', 'histogram_quantile(0.99, sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_bucket{}[5m])) by (le))' ), ('50th centile', 'histogram_quantile(0.5, sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_bucket{}[5m])) by (le))' ), ('Mean', 'sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_sum{}[5m])) / sum(rate(cortex_ingester_tsdb_compaction_duration_seconds_count{}[5m]))' ), ], yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), ), ], ), G.Row( title="WAL", panels=[ common.PromGraph( title="Operations", expressions=[ ('Truncations', 'sum(rate(cortex_ingester_tsdb_wal_truncations_total[5m]))' ), ('Truncation errors', 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]))' ), ('Checkpoint', 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total[5m]))' ), ('Checkpoint errors', 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]))' ), ('WAL corruptions', 
'sum(rate(cortex_ingester_wal_corruptions_total[5m]))' ), ('mmap corruptions', 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total[5m]))' ), ], ), common.PromGraph( title="Latency", expressions=[ ('99th centile', 'histogram_quantile(0.99, sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_bucket{}[5m])) by (le))' ), ('50th centile', 'histogram_quantile(0.5, sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_bucket{}[5m])) by (le))' ), ('Mean', 'sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{}[5m])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{}[5m]))' ), ], yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), ), ], ), G.Row( title="Bucket Operations", panels=[ common.PromGraph( title="Operations", expressions=[ ('{{component}}-{{operation}}', 'sum by(component,operation) (rate(thanos_objstore_bucket_operations_total[5m]))' ), ('errors {{component}}-{{operation}}', 'sum by(component,operation) (rate(thanos_objstore_bucket_operation_failures_total[5m]))' ), ], ), common.PromGraph( title="99% Latency", expressions=[ ('{{component}}-{{operation}}', 'histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket[5m])) by (le, component, operation)) > 0' ), ], yAxes=G.single_y_axis(format=G.SECONDS_FORMAT), ), ], ), G.Row( title="Ring", collapse=True, panels=[ W.stacked( common.PromGraph( title="Ingester Ring Ownership", expressions=[ ('{{ingester}}', 'max(cortex_ring_ingester_ownership_percent{job="cortex/distributor"}) by (ingester) or label_replace(max(cortex_ring_member_ownership_percent{job="cortex/distributor"}) by (member), "ingester", "$1", "member", "(.*)")' ), ], # Show y-axis slightly above 100% in case series overlap yAxes=common.PercentageAxes(max=1.2), )), W.stacked( common.PromGraph( title="Ingesters In Ring", expressions=[ ('{{state}}', 'max(cortex_ring_ingesters{job="cortex/distributor"}) by (state) or max(cortex_ring_members{job="cortex/distributor"}) by (state)' ), 
], yAxes=[ G.YAxis(format=G.NO_FORMAT), G.YAxis(format=G.SHORT_FORMAT), ], )), ]), ], )