def error_codes(api: str, interval: int = 1) -> Graph:
    """Graph of per-gRPC-status return-code rates for one admin API endpoint.

    :param api: admin API name used in the metric path.
    :param interval: irate() range window in minutes.
    """
    # One target per status code; refIds are assigned A, B, C, ... in order.
    status_codes = [
        ("OK", "ok"),
        ("InvalidArgument", "invalid-args"),
        ("AlreadyExists", "already-exists"),
        ("FailedPrecondition", "failed-precondition"),
    ]
    targets = [
        Target(
            expr=f'sum(irate(flyte:admin:{api}:codes:{code}[{interval}m]))',
            legendFormat=legend,
            refId=chr(ord('A') + idx),
        )
        for idx, (code, legend) in enumerate(status_codes)
    ]
    return Graph(
        title=f"{api} return codes",
        dataSource=DATASOURCE,
        targets=targets,
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def generate_elasticsearch_status_red_alert_graph(
        name: str, client_id: str, cloudwatch_data_source: str,
        notifications: List[str]) -> Graph:
    """Graph (bars) of the ClusterStatus.red CloudWatch metric, with an
    optional alert that fires when the cluster reports status red.

    :param name: Elasticsearch domain name.
    :param client_id: AWS account/client id owning the domain.
    :param cloudwatch_data_source: Grafana data source name.
    :param notifications: alert notification channels; empty disables alerting.
    """
    red_status_target = CloudwatchMetricsTarget(
        alias="Red status",
        namespace=NAMESPACE,
        period="1m",
        statistics=["Maximum"],
        dimensions={
            "DomainName": name,
            "ClientId": client_id
        },
        metricName="ClusterStatus.red",
    )

    # Only attach an alert when there is somewhere to deliver it.
    alert = (
        Alert(
            name="Elasticsearch is in status red",
            message="Elasticsearch is in status red",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )
        if notifications else None
    )

    return Graph(
        title="Status RED alerts",
        dataSource=cloudwatch_data_source,
        targets=[red_status_target],
        yAxes=YAxes(
            YAxis(format=SHORT_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
        alert=alert,
    ).auto_ref_ids()
def _quota_usage_graph(title: str, resource: str, unit: str) -> Graph:
    """Build one hard-vs-used kube_resourcequota Graph for *resource*.

    :param title: panel title.
    :param resource: kube_resourcequota resource label (e.g. "limits.cpu").
    :param unit: short label used in the legends ("cpu" / "mem").
    """
    selector = f'resource="{resource}", namespace="$project-$domain"'
    return Graph(
        title=title,
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=f'kube_resourcequota{{{selector}, type="hard"}}',
                refId='A',
                legendFormat=f"max {unit}",
            ),
            Target(
                expr=f'kube_resourcequota{{{selector}, type="used"}}',
                refId='B',
                legendFormat=f"used {unit}",
            ),
        ],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )


def quota_stats(collapse: bool) -> Row:
    """Row comparing hard Kubernetes quota limits against current usage
    for CPU and memory in the $project-$domain namespace.

    :param collapse: whether the row starts collapsed in the dashboard.
    """
    # The two panels were previously duplicated inline; they differ only in
    # title, resource label, and legend unit, so build both via the helper.
    return Row(
        title="Kubernetes Quota Usage stats",
        collapse=collapse,
        panels=[
            _quota_usage_graph("CPU Limits vs usage", "limits.cpu", "cpu"),
            _quota_usage_graph("Mem Limits vs usage", "limits.memory", "mem"),
        ])
def system_errors() -> Graph:
    """Graph of the propeller round system-error rate.

    deriv() over a 5m window is multiplied by 300 (seconds) to express the
    change per 5-minute window.
    """
    error_target = Target(
        expr='sum(deriv(flyte:propeller:all:round:system_error_unlabeled[5m]))*300',
        refId='A',
    )
    return Graph(
        title="System errors",
        dataSource=DATASOURCE,
        targets=[error_target],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def workflows_per_project() -> Graph:
    """Graph of currently running workflows, one series per project."""
    # NOTE: the original strings carried an f-prefix with no placeholders
    # (lint F541); plain literals produce identical values.
    return Graph(
        title="Running Workflows per project",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr='sum(flyte:propeller:all:collector:flyteworkflow) by (project)',
                refId='A',
            ),
        ],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def create_free_workers() -> Graph:
    """Graph of free propeller workers, grouped per pod."""
    free_workers = Target(
        expr='sum(flyte:propeller:all:free_workers_count) by (kubernetes_pod_name)',
        refId='A',
    )
    return Graph(
        title="Free workers count",
        dataSource=DATASOURCE,
        targets=[free_workers],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def round_panic() -> Graph:
    """Graph of the propeller round panic rate (5m rate)."""
    panic_target = Target(
        expr='sum(rate(flyte:propeller:all:round:panic_unlabeled[5m]))',
        refId='A',
    )
    return Graph(
        title="Round panic",
        dataSource=DATASOURCE,
        targets=[panic_target],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def abort_errors() -> Graph:
    """Graph of the propeller workflow-abort error rate (5m rate)."""
    return Graph(
        # Fixed copy-paste title: this panel plots abort errors, but it was
        # titled "System errors", colliding with the system_errors() panel.
        title="Abort errors",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr='sum(rate(flyte:propeller:all:round:abort_error[5m]))',
                refId='A',
            ),
        ],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def streak_length() -> Graph:
    """Graph of the average propeller round streak length."""
    streak_target = Target(
        expr='avg(flyte:propeller:all:round:streak_length_unlabeled)',
        refId='A',
    )
    return Graph(
        title="Avg streak length",
        dataSource=DATASOURCE,
        targets=[streak_target],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def db_count(entity: str, op: str, interval: int = 1) -> Graph:
    """Graph of the rate of one postgres repository operation.

    :param entity: repository entity name in the metric path.
    :param op: operation name in the metric path (also used in the title).
    :param interval: rate() range window in minutes.
    """
    metric = f'flyte:admin:database:postgres:repositories:{entity}:{op}_ms_count'
    return Graph(
        title=f"{op} Count Ops",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=f'sum(rate({metric}[{interval}m]))',
                refId='A',
            ),
        ],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def lambda_generate_memory_utilization_graph(
    name: str,
    cloudwatch_data_source: str,
    lambda_insights_namespace: str,
    *args,
    **kwargs,
) -> Graph:
    """Graph of Lambda max used memory against total allocated memory.

    :param name: Lambda function name (CloudWatch dimension).
    :param cloudwatch_data_source: Grafana data source name.
    :param lambda_insights_namespace: CloudWatch Lambda Insights namespace.

    Extra *args/**kwargs are accepted for call-site compatibility and ignored.
    """
    targets = [
        CloudwatchMetricsTarget(
            alias="used_memory_max",
            namespace=lambda_insights_namespace,
            statistics=["Maximum"],
            metricName="used_memory_max",
            dimensions={"function_name": name},
        ),
        CloudwatchMetricsTarget(
            alias="allocated_memory",
            namespace=lambda_insights_namespace,
            statistics=["Maximum"],
            metricName="total_memory",
            dimensions={"function_name": name},
        ),
    ]
    yAxes = YAxes(YAxis(format="decmbytes"))
    seriesOverrides = [
        {
            "alias": "used_memory_max",
            "points": False,
            "color": colors.GREEN,
        },
        {
            "alias": "allocated_memory",
            "points": False,
            "color": colors.RED,
            "fill": 0,
        },
    ]
    # No alert for this panel: pass None explicitly (the dead `alert = None`
    # local and the commented-out gridPos line were removed).
    return Graph(
        title="Lambda Memory Utilization",
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=None,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
def metastore_cache_hit_percentage(interval: int) -> Graph:
    """Graph of the metastore cache hit percentage: hits / (hits + misses).

    TODO replace with metric math maybe?

    :param interval: rate() range window in minutes.
    """
    hit_pct = Target(
        expr=f'(sum(rate(flyte:propeller:all:metastore:cache_hit[{interval}m])) * 100) / (sum(rate(flyte:propeller:all:metastore:cache_miss[{interval}m])) + sum(rate(flyte:propeller:all:metastore:cache_hit[{interval}m])))',
        refId='A',
    )
    return Graph(
        title="cache hit percentage",
        dataSource=DATASOURCE,
        targets=[hit_pct],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def metastore_failures() -> Graph:
    """Graph of metastore failure rates, one series per failure mode."""
    # Copy counts sum(rate(flyte:propeller:all:metastore:copy:overall_unlabeled_ms_count[5m]))
    # Each target now gets a distinct refId — the original assigned 'A' to
    # every target, which Grafana treats as duplicate query identifiers.
    failure_modes = [
        ("head_failure_unlabeled", "head-failure"),
        ("bad_container_unlabeled", "bad-container"),
        ("bad_key_unlabeled", "bad-key"),
        ("read_failure_unlabeled", "read-failure"),
        ("write_failure_unlabeled", "write-failure"),
    ]
    return Graph(
        title="Failures from metastore",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=f'sum(rate(flyte:propeller:all:metastore:{metric}[5m]))',
                legendFormat=legend,
                refId=chr(ord('A') + idx),
            )
            for idx, (metric, legend) in enumerate(failure_modes)
        ],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def error_vs_success(api: str, interval: int = 1) -> Graph:
    """Graph comparing error rate vs success rate for one admin API.

    :param api: admin API name used in the metric path.
    :param interval: irate() range window in minutes.
    """
    errors = Target(
        expr=f'sum(irate(flyte:admin:{api}:errors[{interval}m]))',
        legendFormat="errors",
        refId='A',
    )
    successes = Target(
        expr=f'sum(irate(flyte:admin:{api}:success[{interval}m]))',
        legendFormat="success",
        refId='B',
    )
    return Graph(
        title=f"{api} success vs errors",
        dataSource=DATASOURCE,
        targets=[errors, successes],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def sum_process_memory_bytes(grr_component):
    """Graph of total resident memory summed across all instances of a
    GRR component's job."""
    expr = (
        'sum(process_resident_memory_bytes{{job="grr_{}"}})'
        .format(grr_component)
    )
    return Graph(
        title="Sum of Process Memory Bytes (across all instances)",
        targets=[
            Target(expr=expr, legendFormat="Resident Memory"),
        ],
        yAxes=YAxes(left=YAxis(format=BYTES_FORMAT)),
    )
def db_operations_errors(grr_component):
    """Graph of the database error rate for a GRR component, grouped by
    database call."""
    expr = (
        'sum by (call) (rate(db_request_errors_total{{job="grr_{0}"}}[10m]))'
        .format(grr_component)
    )
    return Graph(
        title="Database Operations Errors Rate by Call",
        targets=[
            Target(expr=expr, legendFormat="{{call}}"),
        ],
        yAxes=YAxes(left=YAxis(format=OPS_FORMAT)),
    )
def plugin_success_vs_failures() -> Graph:
    """Graph of per-plugin failure and success counters.

    TODO We need to convert the plugin names to be labels, so that
    prometheus can perform queries correctly
    """
    return Graph(
        title="Plugin Failures",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr='{__name__=~"flyte:propeller:all:node:plugin:.*_failure_unlabeled"}',
                refId='A',
            ),
            Target(
                # refId fixed to 'B' — both targets previously shared 'A',
                # which Grafana treats as duplicate query identifiers.
                expr='{__name__=~"flyte:propeller:all:node:plugin:.*_success_unlabeled"}',
                refId='B',
            ),
        ],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def db_operations_latency(grr_component):
    """Graph of mean database latency for a GRR component, grouped by
    database call (sum / count over a 10m window)."""
    expr = (
        'sum by (call) (rate(db_request_latency_sum{{job="grr_{0}"}}[10m]) / rate(db_request_latency_count{{job="grr_{0}"}}[10m]))'
        .format(grr_component)
    )
    return Graph(
        title="Database Operations Latency by Call",
        targets=[
            Target(expr=expr, legendFormat="{{call}}"),
        ],
        yAxes=YAxes(left=YAxis(format=SECONDS_FORMAT)),
    )
def avg_cpu_usage_percentage(grr_component):
    """Graph of average process CPU usage (percent) for a GRR component."""
    expr = (
        'avg(rate(process_cpu_seconds_total{{job="grr_{}"}}[30s])) * 100'
        .format(grr_component)
    )
    return Graph(
        title="CPU Usage",
        targets=[
            Target(
                expr=expr,
                legendFormat="Average Process CPU Usage in %",
            ),
        ],
        # Cap slightly above 100% so the line is not clipped at the top.
        yAxes=YAxes(left=YAxis(max=105)),
    )
def threadpool_cpu_usage(grr_component):
    """Graph of average threadpool CPU usage (percent) for a GRR component."""
    expr = (
        'avg(rate(threadpool_cpu_use{{job="grr_{}"}}[30s])) * 100'
        .format(grr_component)
    )
    return Graph(
        title="Threadpool Average CPU Usage",
        targets=[
            Target(
                expr=expr,
                legendFormat="Average Process CPU Usage in % (over all jobs & pools)",
            ),
        ],
        # Cap slightly above 100% so the line is not clipped at the top.
        yAxes=YAxes(left=YAxis(max=105)),
    )
import io

# Query EC2 for instance data (printed for operator inspection only; the
# dashboard below is independent of the result).
client = boto3.client('ec2')
instances = client.describe_instances()
print(instances)  # fixed: Python 2 `print instances` is a SyntaxError on Python 3

GRAFANA_API_URL = 'http://statsd:3000/api/dashboards/db/'

# Single Graphite target: disk usage on every prod MySQL main-DB host.
target = Target(target='servers.prod-mysql-maindb*.df-data.df_complex-used',
                datasource='default')
panel = Graph(
    title='mysql-maindb disk consumption',
    dataSource='default',
    targets=[target],
    yAxes=[
        YAxis(format=BYTES_FORMAT),
        YAxis(format=SHORT_FORMAT),
    ],
)
row = Row(panels=[panel])
db = Dashboard(
    title='Autogenerated MySQL Disk Consumption',
    rows=[row],
    time=Time('now-6M', 'now'),
)

# Serialize the dashboard to JSON in memory.
# fixed: the Python 2 StringIO module was replaced by io.StringIO in Python 3.
s = io.StringIO()
write_dashboard(db, s)
dashboard_json = s.getvalue()
title="{}s Dashboard".format(GRR_COMPONENT).title().replace("_", " "), rows=[ Row(panels=[panel(GRR_COMPONENT) for panel in row]) for row in GENERAL_PANELS ] + [ Row(panels=[ Graph( title="API Method Latency Rate by Method", targets=[ Target( expr= 'sum by (method_name) (rate(api_method_latency_sum[10m]) / rate(api_method_latency_count[10m]))', legendFormat="{{method_name}}", ), ], yAxes=YAxes(left=YAxis(format=SECONDS_FORMAT)), ), Graph( title="API Calls Count Rate with Status SUCCESS", targets=[ Target( expr= 'sum(rate(api_method_latency_count{status="SUCCESS"}[10m]))', legendFormat="Successful Calls Rate", ), ], yAxes=YAxes(left=YAxis(format=OPS_FORMAT)), ), Graph( title= "API Calls Count Rate with other statuses (not SUCCESS) by Method",
Target( expr= 'sum(process_resident_memory_bytes{job="fleetspeak"})', legendFormat="Resident Memory", ), ]), Graph( title="CPU Usage", targets=[ Target( expr= 'avg(rate(process_cpu_seconds_total{job="fleetspeak"}[30s])) * 100', legendFormat="Average Process CPU Usage", ), ], yAxes=YAxes(left=YAxis(max=105, format="percent")), ), ]), Row(panels=[ Graph( title="Datastore Latency per Operation", targets=[ Target( expr= 'sum by (operation) (rate(fleetspeak_server_datastore_operations_completed_latency_sum[10m]) / rate(fleetspeak_server_datastore_operations_completed_latency_count[10m]))', legendFormat="{{operation}}", ), ], yAxes=YAxes(left=YAxis(format=SECONDS_FORMAT)), ), Heatmap(
def generate_elasticsearch_requests_graph(
        name: str, client_id: str, cloudwatch_data_source: str) -> Graph:
    """Graph of Elasticsearch request counts broken down by HTTP status class.

    :param name: Elasticsearch domain name.
    :param client_id: AWS account/client id owning the domain.
    :param cloudwatch_data_source: Grafana data source name.
    """
    # status class -> series color. 5xx is now RED: it was duplicated as
    # ORANGE, making 4xx and 5xx indistinguishable (and inconsistent with
    # the API Gateway panel, which colors 5XX red).
    status_colors = [
        ("2xx", colors.GREEN),
        ("3xx", colors.YELLOW),
        ("4xx", colors.ORANGE),
        ("5xx", colors.RED),
    ]
    targets = [
        CloudwatchMetricsTarget(
            alias=status,
            namespace=NAMESPACE,
            period="1m",
            statistics=["Sum"],
            dimensions={
                "DomainName": name,
                "ClientId": client_id
            },
            metricName=status,
        )
        for status, _ in status_colors
    ]
    series_overrides = [
        {
            "alias": status,
            "color": color,
            "lines": True,
            "bars": False,
        }
        for status, color in status_colors
    ]
    return Graph(
        title="Requests",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=YAxes(
            YAxis(format=SHORT_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
    ).auto_ref_ids()
def generate_elasticsearch_storage_graph(name: str, client_id: str,
                                         cloudwatch_data_source: str,
                                         notifications: List[str]) -> Graph:
    """Graph of Elasticsearch free storage vs used space, with an optional
    low-free-storage alert.

    :param name: Elasticsearch domain name.
    :param client_id: AWS account/client id owning the domain.
    :param cloudwatch_data_source: Grafana data source name.
    :param notifications: alert notification channels; empty disables alerting.
    """
    free_storage_alias = "Free storage"
    cluster_used_space_alias = "Used space"

    free_storage_target = CloudwatchMetricsTarget(
        alias=free_storage_alias,
        namespace=NAMESPACE,
        period="1m",
        statistics=["Minimum"],
        dimensions={
            "DomainName": name,
            "ClientId": client_id
        },
        metricName="FreeStorageSpace",
        refId=ALERT_REF_ID,
    )
    used_space_target = CloudwatchMetricsTarget(
        alias=cluster_used_space_alias,
        namespace=NAMESPACE,
        period="1m",
        statistics=["Maximum"],
        dimensions={
            "DomainName": name,
            "ClientId": client_id
        },
        metricName="ClusterUsedSpace",
    )

    # Fire when minimum free storage over 5m drops below 10240 MB.
    alert = (
        Alert(
            name="Elasticsearch storage alert",
            message="Elasticsearch might be low on storage",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=LowerThan(10240),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )
        if notifications else None
    )

    series_overrides = [
        {
            "alias": free_storage_alias,
            "color": colors.GREEN,
            "lines": True,
            "bars": False,
        },
        {
            # Used space is plotted on the right-hand axis.
            "alias": cluster_used_space_alias,
            "color": colors.ORANGE,
            "lines": True,
            "bars": False,
            "yaxis": 2,
        },
    ]

    return Graph(
        title="Storage",
        dataSource=cloudwatch_data_source,
        targets=[free_storage_target, used_space_target],
        yAxes=YAxes(
            YAxis(format=MEGA_BYTES),
            YAxis(format=MEGA_BYTES),
        ),
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
        alert=alert,
    ).auto_ref_ids()
def generate_elasticsearch_documents_graph(
        name: str, client_id: str, cloudwatch_data_source: str) -> Graph:
    """Graph of searchable vs deleted Elasticsearch documents.

    :param name: Elasticsearch domain name.
    :param client_id: AWS account/client id owning the domain.
    :param cloudwatch_data_source: Grafana data source name.
    """
    searchable_documents_alias = "Searchable documents"
    deleted_documents_alias = "Deleted documents"

    # (alias, CloudWatch metric name) pairs; both sampled the same way.
    metrics = [
        (searchable_documents_alias, "SearchableDocuments"),
        (deleted_documents_alias, "DeletedDocuments"),
    ]
    targets = [
        CloudwatchMetricsTarget(
            alias=alias,
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={
                "DomainName": name,
                "ClientId": client_id
            },
            metricName=metric_name,
        )
        for alias, metric_name in metrics
    ]

    series_overrides = [
        {
            "alias": searchable_documents_alias,
            "color": colors.GREEN,
            "lines": True,
            "bars": False,
        },
        {
            # Deleted documents are plotted on the right-hand axis.
            "alias": deleted_documents_alias,
            "color": colors.ORANGE,
            "lines": True,
            "bars": False,
            "yaxis": 2,
        },
    ]

    return Graph(
        title="Documents",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=YAxes(
            YAxis(format=SHORT_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
    ).auto_ref_ids()
query="label_values(instance)"), chip_template, ] dashboard = Dashboard( title="Temperature", templating=Templating(template_list), panels=[ RowPanel( title="New Row", gridPos=GridPos(h=1, w=24, x=0, y=8), ), Graph( title="$chip", dataSource="Prometheus", targets=[ Target(expr=("sensors_temp_input{" + 'instance="$instance",chip="$chip"}'), legendFormat="{{feature}}", refId="A"), ], repeat=Repeat("v", "chip"), yAxes=YAxes( YAxis(format=CELSIUS_FORMAT), YAxis(format=SHORT_FORMAT), ), gridPos=GridPos(h=10, w=24, x=0, y=9), ), ], ).auto_panel_ids()
dashboard = Dashboard( title="{}s Dashboard".format(GRR_COMPONENT).title(), rows=[ Row(panels=[panel(GRR_COMPONENT) for panel in row]) for row in GENERAL_PANELS ] + [ Row(panels=[ Graph( title="QPS", targets=[ Target( expr='sum(rate(frontend_request_count_total[1m]))', legendFormat="Requests", ), ], yAxes=YAxes(left=YAxis(format="reqps")), ), Graph( title="Request Latency", targets=[ Target( expr= 'sum(rate(frontend_request_latency_sum[10m])) / sum(rate(frontend_request_latency_count[10m]))', legendFormat="Latency", ), ], yAxes=YAxes(left=YAxis(format=SECONDS_FORMAT)), ), Graph( title="Well Known Flows Requests Rate by Flow", targets=[
def generate_api_gateway_requests_graph(name: str, cloudwatch_data_source: str,
                                        notifications: List[str], *args,
                                        **kwargs):
    """Graph of API Gateway request counts and 4XX/5XX errors, with an
    optional alert on any 5XX errors.

    :param name: API Gateway API name (CloudWatch dimension).
    :param cloudwatch_data_source: Grafana data source name.
    :param notifications: alert notification channels; empty disables alerting.

    Extra *args/**kwargs are accepted for call-site compatibility and ignored.
    """
    # (alias, metric name, refId) triples; 5XX comes first so its refId is
    # the alert reference.
    metric_specs = [
        (API_GATEWAY_5XX_ALIAS, "5XXError", ALERT_REF_ID),
        (API_GATEWAY_REQUESTS_ALIAS, "Count", API_GATEWAY_REQUESTS_REF_ID),
        (API_GATEWAY_4XX_ALIAS, "4XXError", API_GATEWAY_4XX_REF_ID),
    ]
    targets = [
        CloudwatchMetricsTarget(
            alias=alias,
            namespace=NAMESPACE,
            statistics=["Sum"],
            metricName=metric_name,
            dimensions={"ApiName": name},
            refId=ref_id,
        )
        for alias, metric_name, ref_id in metric_specs
    ]

    seriesOverrides = [
        {
            "alias": API_GATEWAY_REQUESTS_ALIAS,
            "points": False,
            "color": colors.GREEN,
        },
        {
            "alias": API_GATEWAY_4XX_ALIAS,
            "color": colors.YELLOW,
        },
        {
            "alias": API_GATEWAY_5XX_ALIAS,
            "color": colors.RED,
        },
    ]

    # https://docs.aws.amazon.com/lambda/latest/dg/monitoring-metrics.html
    alert = (
        Alert(
            name="{} API Gateway 5XX Errors".format(name),
            message="{} is having 5XX errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("15m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )
        if notifications else None
    )

    return Graph(
        title="API Gateway Requests: {}".format(name),
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        yAxes=YAxes(
            YAxis(format=SHORT_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
templating=Templating(list=[Environment, Cluster]), rows=[ Row(panels=[ Graph( title="Freeable Memory", dataSource=DATASOURCE, targets=[ Target( expr= 'aws_ec_freeable_memory_average{environment="${environment}", dimension_cache_cluster_id=~"${cluster}.*"}', legendFormat="{{dimension_cache_cluster_id}}", refId='A', ), ], yAxes=YAxes( YAxis(format=BYTES_FORMAT), YAxis(format=SHORT_FORMAT), ), ), Graph( title="Bytes Used for Cache", dataSource=DATASOURCE, targets=[ Target( expr= 'aws_ec_bytes_used_for_cache_average{environment="${environment}", dimension_cache_cluster_id=~"${cluster}.*"}', legendFormat="{{dimension_cache_cluster_id}}", refId='A', ), ], yAxes=YAxes(