Example #1
    def export(self, output_file=''):
        from baskerville.models.metrics.registry import metrics_registry
        panels = []
        for i, (metric_name,
                value) in enumerate(metrics_registry.registry.items()):
            # Group panels four per row; only flush once at least one
            # panel has been collected, so no empty leading row is added.
            if i % 4 == 0 and panels:
                self.rows.append(Row(panels=panels))
                panels = []

            if 'timer' in metric_name:
                g = Gauge()
                g.minValue = 0
                g.maxValue = 100
                g.show = True
                g.thresholdMarkers = True
                panels.append(
                    SingleStat(
                        title=metric_name,
                        dataSource=self.ds,
                        gauge=g,
                        targets=[
                            Target(
                                expr=
                                f'({metric_name}_sum / {metric_name}_count)',
                                target=metric_name,
                                refId='A',
                                metric=metric_name,
                                datasource=self.ds,
                            )
                        ]))
            else:
                panels.append(
                    Graph(title=metric_name,
                          dataSource=self.ds,
                          targets=[
                              Target(expr=f'{metric_name}_total' if 'total'
                                     in metric_name else metric_name,
                                     target=metric_name,
                                     refId='A',
                                     metric=metric_name,
                                     datasource=self.ds)
                          ]))

        for panel in panels:
            self.rows.append(Row(panels=[panel]))

        self.dashboard = Dashboard(title=self.dash_title,
                                   rows=self.rows).auto_panel_ids()

        with open(output_file, 'w') as f:
            write_dashboard(self.dashboard, f)
Example #2
 def errors(collapse: bool) -> Row:
     return Row(
         title="Error (System vs user)",
         collapse=collapse,
         panels=[
             Graph(
                 title="User errors",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         'sum(rate(flyte:propeller:all:node:user_error_duration_ms_count{project=~"$project",domain=~"$domain",wf=~"$project:$domain:$workflow"}[5m]))',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=SHORT_FORMAT),
             ),
             Graph(
                 title="System errors",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         'sum(rate(flyte:propeller:all:node:system_error_duration_ms_count{project=~"$project",domain=~"$domain",wf=~"$project:$domain:$workflow"}[5m]))',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=SHORT_FORMAT),
             ),
         ])
Example #3
 def row(self, elements=None, **kw):
     elements = elements or []
     panels = []
     for element in elements:
         element.dataSource = getattr(element, 'dataSource', '') or self.dataSource
         panels += [element]
     self.rows += [Row(panels=panels, **kw)]
     return self
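A hypothetical chained usage of this builder method follows; the MyDashboardBuilder class, its constructor, and the PromQL expressions are illustrative assumptions, and only the .row() behaviour shown above comes from the example.

# Hypothetical builder instance; .row() copies the builder's default
# dataSource onto panels that do not set one and returns self for chaining.
builder = MyDashboardBuilder(dataSource='Prometheus')
builder.row([
    Graph(title="QPS",
          targets=[Target(expr='sum(rate(http_requests_total[5m]))')]),
]).row([
    Graph(title="Errors",
          targets=[Target(expr='sum(rate(http_requests_errors_total[5m]))')]),
])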
Example #4
 def create_api_row(api: str, collapse: bool, interval: int = 1) -> Row:
     return Row(title=f"{api} stats",
                collapse=collapse,
                panels=[
                    FlyteAdmin.error_codes(api, interval),
                    FlyteAdmin.error_vs_success(api, interval),
                    FlyteAdmin.api_latency(api, interval),
                ])
Example #5
 def metastore_metrics(interval: int, collapse: bool) -> Row:
     return Row(
         title="Metastore failures and cache",
         collapse=collapse,
         panels=[
             FlytePropeller.metastore_cache_hit_percentage(interval),
             FlytePropeller.metastore_failures(),
         ],
     )
Example #6
 def node_metrics(collapse: bool) -> Row:
     return Row(
         title="Node Metrics",
         collapse=collapse,
         panels=[
             FlytePropeller.node_exec_latency(),
             FlytePropeller.node_input_latency(),
             FlytePropeller.node_event_recording_latency(),
         ],
     )
Example #7
 def wf_store_latency(collapse: bool) -> Row:
     return Row(
         title="etcD write metrics",
         collapse=collapse,
         panels=[
             Graph(
                 title="wf update etcD latency",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         f'sum(flyte:propeller:all:wf_update_latency_ms) by (quantile)',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
             ),
             Graph(
                 title="etcD writes",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         f'sum(rate(flyte:propeller:all:wf_update_latency_ms_count[5m]))',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=NO_FORMAT),
             ),
             Graph(
                 title="etcD write conflicts",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         f'sum(rate(flyte:propeller:all:wf_update_conflict[5m]))',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=NO_FORMAT),
             ),
             Graph(
                 title="etcD write fail",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         f'sum(rate(flyte:propeller:all:wf_update_failed[5m]))',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=NO_FORMAT),
             ),
         ])
Example #8
 def perf_metrics(collapse: bool) -> Row:
     r = Row(
         title="Perf metrics",
         collapse=collapse,
         panels=[],
     )
     r.panels.extend(FlytePropeller.wf_event_recording())
     r.panels.extend(FlytePropeller.node_event_recording())
     r.panels.extend(FlytePropeller.task_event_recording())
     r.panels.extend(FlytePropeller.dynamic_wf_build())
     r.panels.append(FlytePropeller.admin_launcher_cache())
     return r
Example #9
 def create_entity_db_count(entity: str,
                            collapse: bool,
                            interval: int = 1) -> Row:
     r = Row(
         title=f"DB {entity} ops stats",
         collapse=collapse,
         panels=[],
     )
     for op in FlyteAdmin.DB_OPS:
         r.panels.append(
             FlyteAdmin.db_count(entity, op=op, interval=interval))
     return r
Example #10
def create_lambda_only_dashboard(
    tags: List[str],
    name: str,
    cloudwatch_data_source: str,
    lambda_insights_namespace: str,
    notifications: List[str],
    environment: str,
    *args,
    **kwargs,
):
    """Create a dashboard with just the lambda"""

    return Dashboard(
        title="{}{}".format(LAMBDA_DASHBOARD_PREFIX, name),
        editable=EDITABLE,
        tags=tags + ["lambda", environment],
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        rows=[
            Row(panels=[
                lambda_generate_invocations_graph(
                    name, cloudwatch_data_source, notifications=notifications),
                lambda_generate_duration_graph(name, cloudwatch_data_source),
            ]),
            Row(panels=[
                lambda_generate_memory_utilization_percentage_graph(
                    name,
                    cloudwatch_data_source,
                    lambda_insights_namespace,
                    notifications=notifications,
                ),
                lambda_generate_memory_utilization_graph(
                    name, cloudwatch_data_source, lambda_insights_namespace),
            ]),
            Row(panels=[
                lambda_generate_logs_panel(name, cloudwatch_data_source),
            ]),
        ],
    ).auto_panel_ids()
Example #11
 def quota_stats(collapse: bool) -> Row:
     return Row(
         title="Kubernetes Quota Usage stats",
         collapse=collapse,
         panels=[
             Graph(
                 title="CPU Limits vs usage",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         'kube_resourcequota{resource="limits.cpu", namespace="$project-$domain", type="hard"}',
                         refId='A',
                         legendFormat="max cpu",
                     ),
                     Target(
                         expr=
                         'kube_resourcequota{resource="limits.cpu", namespace="$project-$domain", type="used"}',
                         refId='B',
                         legendFormat="used cpu",
                     ),
                 ],
                 yAxes=YAxes(
                     YAxis(format=OPS_FORMAT),
                     YAxis(format=SHORT_FORMAT),
                 ),
             ),
             Graph(
                 title="Mem Limits vs usage",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         'kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="hard"}',
                         refId='A',
                         legendFormat="max mem",
                     ),
                     Target(
                         expr=
                         'kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="used"}',
                         refId='B',
                         legendFormat="used mem",
                     ),
                 ],
                 yAxes=YAxes(
                     YAxis(format=OPS_FORMAT),
                     YAxis(format=SHORT_FORMAT),
                 ),
             ),
         ])
Example #12
 def core_metrics(interval: int, collapse: bool) -> Row:
     return Row(
         title="Core metrics",
         collapse=collapse,
         panels=[
             FlytePropeller.create_free_workers(),
             FlytePropeller.abort_errors(),
             FlytePropeller.system_errors(),
             FlytePropeller.plugin_success_vs_failures(),
             FlytePropeller.round_latency(interval),
             FlytePropeller.round_latency_per_wf(interval),
             FlytePropeller.round_panic(),
             FlytePropeller.workflows_per_project(),
         ],
     )
Example #13
def create_dashboard(
    title: str,
    datasource_name: str,
    queries: List[RawInfluxDbQuery],
    start: datetime,
    end: datetime,
    timezone: str,
    yaxe_types: List[YAxis],
    thresholds: List[Threshold],
    grafana_graph_params: Dict[str, Any],
) -> Dashboard:
    """
    Create a dashboard object that can be serialized to JSON and sent to Grafana.
    """
    targets = []
    series_overrides = []
    for query in queries:
        targets.append(InfluxDBTarget(query=query.query, alias=query.alias))
        if query.yaxis == "right":
            series_overrides.append(SeriesOverride(alias=query.alias, yaxis=2))

    left = yaxe_types[0]
    right = yaxe_types[1] if len(yaxe_types) > 1 else None
    yaxes = YAxes(left, right) if right else YAxes(left=left)

    return Dashboard(
        title=title,
        time=Time(start.isoformat(), end.isoformat()),
        timezone=timezone,
        rows=[
            Row(panels=[
                CustomGraph(
                    title=title,
                    dataSource=datasource_name,
                    targets=targets,
                    thresholds=thresholds,
                    seriesOverrides=series_overrides,
                    yAxes=yaxes,
                    transparent=True,
                    **grafana_graph_params,
                ),
            ], ),
        ],
    ).auto_panel_ids()
Example #14
 def resource_stats(collapse: bool) -> Row:
     return Row(
         title="Task stats",
         collapse=collapse,
         panels=[
             Graph(
                 title="Pending tasks",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         'sum(kube_pod_container_status_waiting * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"}) by (namespace, label_execution_id, label_task_name, label_node_id, label_workflow_name) > 0',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=SHORT_FORMAT),
             ),
             Graph(
                 title="Memory Usage Percentage",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         '(max(container_memory_rss{image!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=SHORT_FORMAT),
             ),
             Graph(
                 title="CPU Usage Percentage",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         '(sum(rate(container_cpu_usage_seconds_total{image!=""}[2m]) * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / sum(kube_pod_container_resource_limits_cpu_cores{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=SHORT_FORMAT),
             ),
         ])
Example #15
def generate_firehose_dashboard(
    influxdb_data_source: str, environment: str, *args, **kwargs
) -> Dashboard:
    """Generate Firehose dashboard"""
    tags = ["firehose", environment]

    rows = [
        Row(
            panels=[generate_firehose_graph(influxdb_data_source=influxdb_data_source)],
            editable=EDITABLE,
            repeat="firehose",
            title="$firehose",
        )
    ]

    return Dashboard(
        title="Firehose",
        editable=EDITABLE,
        tags=tags,
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        rows=rows,
    ).auto_panel_ids()
Example #16
    def metastore_latencies(collapse: bool) -> Row:
        return Row(
            title=f"Metastore latencies",
            collapse=collapse,
            panels=[
                Graph(
                    title=f"Metastore copy latency",
                    dataSource=DATASOURCE,
                    targets=[
                        Target(
                            expr=f'sum(flyte:propeller:all:metastore:copy:overall_unlabeled_ms) by (quantile)',
                            refId='A',
                        ),
                    ],
                    yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
                ),
                Graph(
                    title=f"Metastore write latency by workflow",
                    dataSource=DATASOURCE,
                    targets=[
                        Target(
                            expr='sum(flyte:propeller:all:metastore:write_ms) by (quantile, wf)',
                            refId='A',
                        ),
                    ],
                    yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
                ),
                Graph(
                    title=f"Metastore read open latency by workflow",
                    dataSource=DATASOURCE,
                    targets=[
                        Target(
                            expr='sum(flyte:propeller:all:metastore:read_open_ms) by (quantile, wf)',
                            refId='A',
                        ),
                    ],
                    yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
                ),
                Graph(
                    title=f"Metastore head latency by workflow",
                    dataSource=DATASOURCE,
                    targets=[
                        Target(
                            expr='sum(flyte:propeller:all:metastore:head_ms) by (quantile, wf)',
                            refId='A',
                        ),
                    ],
                    yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
                ),
                Graph(
                    title=f"Metastore fetch latency by workflow",
                    dataSource=DATASOURCE,
                    targets=[
                        Target(
                            expr='sum(flyte:propeller:all:metastore:proto_fetch_ms) by (quantile, wf)',
                            legendFormat="proto-fetch",
                            refId='A',
                        ),

                        Target(
                            expr='sum(flyte:propeller:all:metastore:remote_fetch_ms) by (quantile, wf)',
                            legendFormat="remote-fetch",
                            refId='B',
                        ),
                    ],
                    yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
                ),
            ]
        )
Example #17
def generate_elasticsearch_dashboard(
    name: str,
    client_id: str,
    influxdb_data_source: str,
    cloudwatch_data_source: str,
    environment: str,
    notifications: List[str],
    *args,
    **kwargs,
):
    """Generate Elasticsearch dashboard"""
    tags = ["elasticsearch", environment]

    rows = [
        Row(
            panels=[
                generate_elasticsearch_cpu_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                ),
                generate_elasticsearch_jvm_memory_pressure_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                ),
            ],
            editable=EDITABLE,
        ),
        Row(
            panels=[
                generate_elasticsearch_documents_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                )
            ],
            editable=EDITABLE,
        ),
        Row(
            panels=[
                generate_elasticsearch_storage_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                )
            ],
            editable=EDITABLE,
        ),
        Row(
            panels=[
                generate_elasticsearch_requests_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                )
            ],
            editable=EDITABLE,
        ),
        Row(
            panels=[
                generate_elasticsearch_status_red_alert_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                ),
                generate_elasticsearch_nodes_alert_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                ),
                generate_elasticsearch_writes_blocked_alert_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                ),
                generate_elasticsearch_automated_snapshot_failure_alert_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                ),
            ],
            editable=EDITABLE,
        ),
    ]

    return Dashboard(
        title="Elasticsearch: {}".format(name),
        editable=EDITABLE,
        tags=tags,
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        rows=rows,
        links=[DOCUMENTATION_LINK],
        refresh=DEFAULT_REFRESH,
    ).auto_panel_ids()
Example #18
 Row(panels=[
     Graph(
         title="Freeable Memory",
         dataSource=DATASOURCE,
         targets=[
             Target(
                 expr=
                 'aws_ec_freeable_memory_average{environment="${environment}", dimension_cache_cluster_id=~"${cluster}.*"}',
                 legendFormat="{{dimension_cache_cluster_id}}",
                 refId='A',
             ),
         ],
         yAxes=YAxes(
             YAxis(format=BYTES_FORMAT),
             YAxis(format=SHORT_FORMAT),
         ),
     ),
     Graph(
         title="Bytes Used for Cache",
         dataSource=DATASOURCE,
         targets=[
             Target(
                 expr=
                 'aws_ec_bytes_used_for_cache_average{environment="${environment}", dimension_cache_cluster_id=~"${cluster}.*"}',
                 legendFormat="{{dimension_cache_cluster_id}}",
                 refId='A',
             ),
         ],
         yAxes=YAxes(
             YAxis(format=BYTES_FORMAT),
             YAxis(format=SHORT_FORMAT),
         ),
     )
 ]),
Example #19
from grafanalib.core import Dashboard, Graph, Row, Target
from grr_grafanalib_dashboards.util import add_data_source
from grr_grafanalib_dashboards.reusable_panels import GENERAL_PANELS
from grr_grafanalib_dashboards.config import GRAFANA_DATA_SOURCE

GRR_COMPONENT = "admin_ui"

dashboard = Dashboard(
    title="{}s Dashboard".format(GRR_COMPONENT).title().replace("_", " "),
    rows=[
        Row(panels=[panel(GRR_COMPONENT) for panel in row])
        for row in GENERAL_PANELS
    ] + [
        Row(panels=[
            Graph(
                title="API Method Latency Rate",
                targets=[
                    Target(
                        expr=
                        'rate(api_method_latency_sum[10m]) / rate(api_method_latency_count[10m])',
                        legendFormat="Latency - Method: {{method_name}}",
                    ),
                ],
            ),
            Graph(
                title="API Calls Count Rate by Status SUCCESS",
                targets=[
                    Target(
                        expr=
                        'sum(rate(api_method_latency_count{status="SUCCESS"}[10m]))',
                        legendFormat="Successful Calls Rate",
Example #20
import io
import json

from grafanalib.core import (Dashboard, Graph, Row, Target, Time, YAxis,
                             BYTES_FORMAT, SHORT_FORMAT)
# write_dashboard is provided by grafanalib (grafanalib._gen in older releases).
from grafanalib._gen import write_dashboard

GRAFANA_API_URL = 'http://statsd:3000/api/dashboards/db/'

target = Target(target='servers.prod-mysql-maindb*.df-data.df_complex-used',
                datasource='default')

panel = Graph(
    title='mysql-maindb disk consumption',
    dataSource='default',
    targets=[target],
    yAxes=[
        YAxis(format=BYTES_FORMAT),
        YAxis(format=SHORT_FORMAT),
    ],
)

row = Row(panels=[panel])

db = Dashboard(
    title='Autogenerated MySQL Disk Consumption',
    rows=[row],
    time=Time('now-6M', 'now'),
)

s = io.StringIO()
write_dashboard(db, s)
dashboard_json = s.getvalue()
print(dashboard_json)

payload = {
    "dashboard": json.loads(dashboard_json),
    "overwrite": True
Example #21

from grafanalib.core import (Dashboard, Graph, Row, XAxis,
                             BYTES_FORMAT, OPS_FORMAT, single_y_axis, Target)

dashboard = Dashboard(
    title="Test Resolver dashboard",
    rows=[
        Row(panels=[
            Graph(
                title="gRPC Rate",
                dataSource='Prometheus',
                targets=[
                    Target(
                        expr=
                        'rate(grpc_server_handled_total{grpc_service="ResolverService"}[1m])',
                        legendFormat="Total-{{pod}}",
                        refId='A',
                    ),
                    Target(
                        expr=
                        'rate(grpc_server_handled_total{grpc_method="Resolve", grpc_service="ResolverService"}[1m])',
                        legendFormat="Resolve-{{pod}}",
                        refId='B',
                    )
                ],
                xAxis=XAxis(mode="time"),
                yAxes=single_y_axis(format=OPS_FORMAT, min=None),
            ),
        ]),
        Row(panels=[
            Graph(
                title="gRPC latency",
                dataSource='Prometheus',
                targets=[
Example #22
def generate_rds_dashboard(
    name: str,
    environment: str,
    influxdb_data_source: str,
    cloudwatch_data_source: str,
    engine: str,
    notifications: List[str],
    **kwargs,
):

    tags = [environment, engine, "rds", "database"]

    cpu_graph = generate_rds_cpu_graph(
        name=name,
        cloudwatch_data_source=cloudwatch_data_source,
        notifications=notifications,
    )
    burst_graph = generate_rds_burst_balance_graph(
        name=name,
        cloudwatch_data_source=cloudwatch_data_source,
        notifications=notifications,
    )
    connections_graph = generate_rds_database_connections_graph(
        name=name, cloudwatch_data_source=cloudwatch_data_source)
    freeable_memory_graph = generate_rds_freeable_memory_graph(
        name=name, cloudwatch_data_source=cloudwatch_data_source)
    free_storage_graph = generate_rds_free_storage_space_graph(
        name=name, cloudwatch_data_source=cloudwatch_data_source)

    rows = [
        Row(panels=[cpu_graph, burst_graph]),
        Row(panels=[
            connections_graph, freeable_memory_graph, free_storage_graph
        ]),
        Row(panels=[
            generate_rds_disk_latency_graph(
                name=name, cloudwatch_data_source=cloudwatch_data_source),
            generate_rds_disk_ops_graph(
                name=name, cloudwatch_data_source=cloudwatch_data_source),
            generate_rds_network_throughput_graph(
                name=name, cloudwatch_data_source=cloudwatch_data_source),
        ]),
    ]

    if engine == "postgres":
        rows += [
            Row(panels=[
                generate_rds_transaction_id_graph(
                    name=name,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                )
            ])
        ]

    return Dashboard(
        title="RDS: {}".format(name),
        editable=EDITABLE,
        tags=tags,
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        rows=rows,
        links=[
            get_documentation_link(
                "https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/MonitoringOverview.html"
            )
        ],
        refresh=DEFAULT_REFRESH,
    ).auto_panel_ids()
Example #23
from grafanalib.core import Dashboard, Graph, Row, Target
from grr_grafanalib_dashboards.util import add_data_source
from grr_grafanalib_dashboards.reusable_panels import GENERAL_PANELS
from grr_grafanalib_dashboards.config import GRAFANA_DATA_SOURCE

GRR_COMPONENT = "frontend"

dashboard = Dashboard(
    title="{}s Dashboard".format(GRR_COMPONENT).title(),
    rows=[
        Row(panels=[panel(GRR_COMPONENT) for panel in row])
        for row in GENERAL_PANELS
    ] + [
        Row(panels=[
            Graph(
                title="QPS",
                targets=[
                    Target(
                        expr='sum(rate(frontend_request_count_total[1m]))',
                        legendFormat="Requests",
                    ),
                ],
            ),
            Graph(
                title="Request Latency Rate",
                targets=[
                    Target(
                        expr=
                        'sum(rate(frontend_request_latency_sum[10m])) / sum(rate(frontend_request_latency_count[10m]))',
                        legendFormat="Latency",
                    ),
Example #24
def generate_elasticache_redis_dashboard(
    name: str,
    cache_cluster_id: str,
    influxdb_data_source: str,
    cloudwatch_data_source: str,
    environment: str,
    notifications: List[str],
    *args,
    **kwargs,
):
    """Generate ElastiCache Redis dashboard"""
    tags = ["elasticache", "redis", environment]

    rows = [
        Row(
            panels=[
                generate_elasticache_redis_cpu_usage_graph(
                    cache_cluster_id=cache_cluster_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                ),
                generate_elasticache_redis_cpu_credit_usage_graph(
                    cache_cluster_id=cache_cluster_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                ),
                generate_elasticache_redis_swap_and_memory_usage_graph(
                    cache_cluster_id=cache_cluster_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                ),
            ],
            editable=EDITABLE,
        ),
        Row(
            panels=[
                generate_elasticache_redis_network_in_graph(
                    cache_cluster_id=cache_cluster_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                ),
                generate_elasticache_redis_connections_graph(
                    cache_cluster_id=cache_cluster_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                ),
                generate_elasticache_redis_db_memory_usage_and_evicitons_graph(
                    cache_cluster_id=cache_cluster_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                ),
            ],
            editable=EDITABLE,
        ),
        Row(
            panels=[
                generate_elasticache_redis_network_out_graph(
                    cache_cluster_id=cache_cluster_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                ),
                generate_elasticache_redis_replication_graph(
                    cache_cluster_id=cache_cluster_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                ),
                generate_elasticache_redis_latency_graph(
                    cache_cluster_id=cache_cluster_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                ),
            ],
            editable=EDITABLE,
        ),
    ]

    return Dashboard(
        title="ElastiCache Redis: {}".format(name),
        editable=EDITABLE,
        tags=tags,
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        rows=rows,
        links=[DOCUMENTATION_LINK],
        refresh=DEFAULT_REFRESH,
    ).auto_panel_ids()
Example #25
 Row(panels=[
     Graph(
         title="Number of Active Processes",
         targets=[
             Target(
                 expr='sum(up{job="fleetspeak"})',
                 legendFormat="Active Processes",
             ),
         ],
         alert=Alert(
             name="Number of Active Processes alert",
             message=
             "The number of active Fleetspeak Server processes is below {}"
             .format(ACTIVE_PROCESSES_ALERTING_CONDITION),
             alertConditions=[
                 AlertCondition(
                     Target(
                         expr='sum(up{job="fleetspeak"})',
                         legendFormat="Active Processes",
                     ),
                     timeRange=TimeRange("10s", "now"),
                     evaluator=LowerThan(
                         ACTIVE_PROCESSES_ALERTING_CONDITION),
                     operator=OP_AND,
                     reducerType=RTYPE_SUM)
             ],
         )),
     Graph(
         title="Sum of Process Memory Bytes (across all instances)",
         targets=[
             Target(
                 expr=
                 'sum(process_resident_memory_bytes{job="fleetspeak"})',
                 legendFormat="Resident Memory",
             ),
         ]),
     Graph(
         title="CPU Usage",
         targets=[
             Target(
                 expr=
                 'avg(rate(process_cpu_seconds_total{job="fleetspeak"}[30s])) * 100',
                 legendFormat="Average Process CPU Usage",
             ),
         ],
         yAxes=YAxes(left=YAxis(max=105, format="percent")),
     ),
 ]),
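
The alerting helpers used in this fragment all come from grafanalib.core; a typical import line for it would look like the following (ACTIVE_PROCESSES_ALERTING_CONDITION is assumed to be a constant defined elsewhere in the original module).

from grafanalib.core import (Alert, AlertCondition, Graph, LowerThan, Row,
                             Target, TimeRange, YAxes, YAxis, OP_AND,
                             RTYPE_SUM)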
Example #26
from grafanalib.core import Dashboard, Graph, Row, Target, YAxes, YAxis, SECONDS_FORMAT, OPS_FORMAT
from grr_grafanalib_dashboards.util import add_data_source
from grr_grafanalib_dashboards.reusable_panels import GENERAL_PANELS
from grr_grafanalib_dashboards.config import GRAFANA_DATA_SOURCE

GRR_COMPONENT = "admin_ui"

dashboard = Dashboard(
    title="{}s Dashboard".format(GRR_COMPONENT).title().replace("_", " "),
    rows=[
        Row(panels=[panel(GRR_COMPONENT) for panel in row])
        for row in GENERAL_PANELS
    ] + [
        Row(panels=[
            Graph(
                title="API Method Latency Rate by Method",
                targets=[
                    Target(
                        expr=
                        'sum by (method_name) (rate(api_method_latency_sum[10m]) / rate(api_method_latency_count[10m]))',
                        legendFormat="{{method_name}}",
                    ),
                ],
                yAxes=YAxes(left=YAxis(format=SECONDS_FORMAT)),
            ),
            Graph(
                title="API Calls Count Rate with Status SUCCESS",
                targets=[
                    Target(
                        expr=
                        'sum(rate(api_method_latency_count{status="SUCCESS"}[10m]))',
Example #27
from grafanalib.core import Dashboard, Graph, Row, Target, YAxes, YAxis, SECONDS_FORMAT
from grr_grafanalib_dashboards.util import add_data_source
from grr_grafanalib_dashboards.reusable_panels import GENERAL_PANELS
from grr_grafanalib_dashboards.config import GRAFANA_DATA_SOURCE

GRR_COMPONENT = "frontend"

dashboard = Dashboard(
    title="{}s Dashboard".format(GRR_COMPONENT).title(),
    rows=[
        Row(panels=[panel(GRR_COMPONENT) for panel in row])
        for row in GENERAL_PANELS
    ] + [
        Row(panels=[
            Graph(
                title="QPS",
                targets=[
                    Target(
                        expr='sum(rate(frontend_request_count_total[1m]))',
                        legendFormat="Requests",
                    ),
                ],
                yAxes=YAxes(left=YAxis(format="reqps")),
            ),
            Graph(
                title="Request Latency",
                targets=[
                    Target(
                        expr=
                        'sum(rate(frontend_request_latency_sum[10m])) / sum(rate(frontend_request_latency_count[10m]))',
                        legendFormat="Latency",
Example #28
 def queue_metrics(collapse: bool) -> Row:
     return Row(
         title="FlytePropeller Queue metrics",
         collapse=collapse,
         panels=[
             Graph(
                 title="WF Adds to main queue",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         f'sum(rate(flyte:propeller:all:main_adds[5m]))',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=SHORT_FORMAT),
             ),
             Graph(
                 title="Unprocessed Queue depth",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         f'sum(rate(flyte:propeller:all:main_depth[5m]))',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=SHORT_FORMAT),
             ),
             Graph(
                 title="Item retries",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         f'sum(rate(flyte:propeller:all:main_retries[5m]))',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=SHORT_FORMAT),
             ),
             Graph(
                 title="Seconds of unfinished work in progress",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=f'flyte:propeller:all:main_unfinished_work_s',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=SECONDS_FORMAT),
             ),
             Graph(
                 title="Workqueue work average duration",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         f'sum(rate(flyte:propeller:all:main_work_duration_us_sum[5m]) / rate(flyte:propeller:all:main_work_duration_us_count[5m]))',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=SECONDS_FORMAT),
             ),
             Graph(
                 title="Duration for which an item stays in queue - avg",
                 dataSource=DATASOURCE,
                 targets=[
                     Target(
                         expr=
                         f'sum(rate(flyte:propeller:all:main_queue_latency_us_sum[5m]) / rate(flyte:propeller:all:main_queue_latency_us_count[5m]))',
                         refId='A',
                     ),
                 ],
                 yAxes=single_y_axis(format=SECONDS_FORMAT),
             ),
         ],
     )
Example #29
def generate_api_gateways_dashboard(
    name: str,
    cloudwatch_data_source: str,
    lambda_insights_namespace: str,
    notifications: List[str],
    environment: str,
    lambdas: List[str],
    *args,
    **kwargs,
):
    tags = ["api-gateway", environment]

    if lambdas:
        tags = tags + ["lambda"]

    api_gateway_graph = generate_api_gateway_requests_graph(
        name, cloudwatch_data_source, notifications)

    rows = [
        Row(title="API Gateway Metrics",
            showTitle=True,
            panels=[api_gateway_graph])
    ]

    if lambdas:
        for lambda_fn in lambdas:
            lambda_metrics_row = Row(
                title="{} Lambda Metrics".format(lambda_fn),
                showTitle=True,
                collapse=False,
                panels=[
                    lambda_generate_invocations_graph(name,
                                                      cloudwatch_data_source,
                                                      notifications=[]),
                    lambda_generate_duration_graph(name,
                                                   cloudwatch_data_source),
                    lambda_generate_memory_utilization_percentage_graph(
                        name,
                        cloudwatch_data_source,
                        lambda_insights_namespace,
                        notifications=notifications,
                    ),
                    lambda_generate_memory_utilization_graph(
                        name, cloudwatch_data_source,
                        lambda_insights_namespace),
                ],
            )
            lambda_logs_row = Row(
                title="{} Lambda Logs".format(lambda_fn),
                showTitle=True,
                collapse=True,
                panels=[
                    lambda_generate_logs_panel(name, cloudwatch_data_source),
                ],
            )

            rows.append(lambda_metrics_row)
            rows.append(lambda_logs_row)

    return Dashboard(
        title="{} {}".format("API Gateway:", name),
        editable=EDITABLE,
        tags=tags,
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        refresh=DEFAULT_REFRESH,
        rows=rows,
    ).auto_panel_ids()
Example #30
from grafanalib.core import Dashboard, Graph, Row, Target
from grr_grafanalib_dashboards.util import add_data_source
from grr_grafanalib_dashboards.reusable_panels import GENERAL_PANELS
from grr_grafanalib_dashboards.config import GRAFANA_DATA_SOURCE

GRR_COMPONENT = "worker"

dashboard = Dashboard(
    title="{}s Dashboard".format(GRR_COMPONENT).title(),
    rows=[
        Row(panels=[panel(GRR_COMPONENT) for panel in row])
        for row in GENERAL_PANELS
    ] + [
        Row(panels=[
            Graph(
                title="Successful Flows Rate vs. Failed Flows Rate",
                targets=[
                    Target(
                        expr=
                        'sum(rate(flow_completions_total{job="grr_worker"}[10m]))',
                        legendFormat="Successes",
                    ),
                    Target(
                        expr=
                        'sum(rate(flow_errors_total{job="grr_worker"}[10m]))',
                        legendFormat="Failures",
                    ),
                ],
            ),
            Graph(
                title="Threadpool Latency Rate vs. Queuing Time Rate",