Example #1
def errors(collapse: bool) -> Row:
    return Row(
        title="Error (System vs user)",
        collapse=collapse,
        panels=[
            Graph(
                title="User errors",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=
                        'sum(rate(flyte:propeller:all:node:user_error_duration_ms_count{project=~"$project",domain=~"$domain",wf=~"$project:$domain:$workflow"}[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
            Graph(
                title="System errors",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=
                        'sum(rate(flyte:propeller:all:node:system_error_duration_ms_count{project=~"$project",domain=~"$domain",wf=~"$project:$domain:$workflow"}[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
        ])
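These Row and Graph helpers are meant to be composed into a grafanalib Dashboard. A minimal sketch, assuming the same imports as above; the dashboard title is hypothetical:

from grafanalib.core import Dashboard

# Hypothetical composition; auto_panel_ids() assigns unique panel ids.
dashboard = Dashboard(
    title="FlytePropeller workflow health",  # hypothetical title
    rows=[
        errors(collapse=False),
        wf_store_latency(collapse=True),
    ],
).auto_panel_ids()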
Example #2
def dynamic_wf_build() -> typing.List[Graph]:
    return [
        Graph(
            title="Dynamic workflow build latency",
            dataSource=DATASOURCE,
            targets=[
                Target(
                    # The _us metric is in microseconds; divide by 1000 to plot milliseconds.
                    expr=
                    'sum(flyte:propeller:all:node:build_dynamic_workflow_us) by (quantile, wf) / 1000',
                    refId='A',
                ),
            ],
            yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
        ),
        Graph(
            title="Dynamic workflow build count",
            dataSource=DATASOURCE,
            targets=[
                Target(
                    expr=
                    'sum(rate(flyte:propeller:all:node:build_dynamic_workflow_us_count[5m])) by (wf)',
                    refId='A',
                ),
            ],
            yAxes=single_y_axis(format=NO_FORMAT),
        ),
    ]
Example #3
def wf_event_recording() -> typing.List[Graph]:
    return [
        Graph(
            title="wf event recording latency success",
            dataSource=DATASOURCE,
            targets=[
                Target(
                    expr=
                    'sum(flyte:propeller:all:workflow:event_recording:success_duration_ms) by (quantile, wf)',
                    refId='A',
                ),
            ],
            yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
        ),
        Graph(
            title="wf event recording count",
            dataSource=DATASOURCE,
            targets=[
                Target(
                    expr=
                    'sum(rate(flyte:propeller:all:workflow:event_recording:success_duration_ms_count[5m])) by (wf)',
                    legendFormat="success",
                    refId='A',
                ),
                Target(
                    expr=
                    'sum(rate(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count[5m])) by (wf)',
                    legendFormat="failure",
                    # Each target in a panel needs a unique refId.
                    refId='B',
                ),
            ],
            yAxes=single_y_axis(format=NO_FORMAT),
        ),
    ]
Example #4
def wf_store_latency(collapse: bool) -> Row:
    return Row(
        title="etcd write metrics",
        collapse=collapse,
        panels=[
            Graph(
                title="wf update etcd latency",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=
                        'sum(flyte:propeller:all:wf_update_latency_ms) by (quantile)',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
            ),
            Graph(
                title="etcd writes",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=
                        'sum(rate(flyte:propeller:all:wf_update_latency_ms_count[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=NO_FORMAT),
            ),
            Graph(
                title="etcd write conflicts",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=
                        'sum(rate(flyte:propeller:all:wf_update_conflict[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=NO_FORMAT),
            ),
            Graph(
                title="etcd write failures",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=
                        'sum(rate(flyte:propeller:all:wf_update_failed[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=NO_FORMAT),
            ),
        ])
Example #5
def generate_rds_free_storage_space_graph(name: str,
                                          cloudwatch_data_source: str):
    """
    Generate rds graph
    """

    y_axes = single_y_axis(format=BYTES)

    targets = [
        CloudwatchMetricsTarget(
            alias="Free storage",
            metricName="FreeStorageSpace",
            statistics=["Minimum"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
            refId=ALERT_REF_ID,
        ),
    ]

    return Graph(
        title="Free storage",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=False,
        lines=True,
    ).auto_ref_ids()
Example #6
def node_errors() -> Graph:
    return Graph(
        title="node error count",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=
                'sum(rate(flyte:propeller:all:node:perma_system_error_duration_unlabeled_ms_count[5m]))',
                legendFormat="system error",
                refId='A',
            ),
            Target(
                expr=
                'sum(rate(flyte:propeller:all:node:perma_user_error_duration_unlabeled_ms[5m]))',
                legendFormat="user error",
                refId='B',
            ),
            Target(
                expr=
                'sum(rate(flyte:propeller:all:node:perma_unknown_error_duration_unlabeled_ms[5m]))',
                legendFormat="unknown error",
                refId='C',
            ),
        ],
        yAxes=single_y_axis(format=NO_FORMAT),
    )
Example #7
def create_lambda_sqs_graph(name: str, cloudwatch_data_source: str,
                            fifo: bool):
    """Create SQS graph"""

    if fifo:
        name += ".fifo"

    targets = [
        CloudwatchMetricsTarget(
            alias="Number of messages sent to the queue",
            namespace="AWS/SQS",
            statistics=["Sum"],
            metricName="NumberOfMessagesSent",
            dimensions={"QueueName": name},
        )
    ]

    yAxes = single_y_axis(format=SHORT_FORMAT)

    return Graph(
        title="SQS: {}".format(name),
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
    ).auto_ref_ids()
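Several examples on this page end with .auto_ref_ids(); this grafanalib Graph helper assigns sequential refIds (A, B, C, ...) to any target that lacks one, which is why the CloudWatch targets above can omit refId. A minimal sketch with hypothetical arguments:

graph = create_lambda_sqs_graph(
    name="payments",                      # hypothetical queue name
    cloudwatch_data_source="cloudwatch",  # hypothetical data source name
    fifo=True,                            # the queue is addressed as "payments.fifo"
)
# graph.targets[0].refId is now "A", assigned by auto_ref_ids().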
Example #8
def generate_firehose_graph(influxdb_data_source: str) -> Graph:
    """
    Generate Firehose graph
    """

    y_axes = single_y_axis(format=SHORT_FORMAT)

    targets = [
        InfluxDBTarget(
            alias=FIREHOSE_INCOMING_RECORDS_ALIAS,
            query='SELECT sum("incoming_records_sum") FROM "{}"."{}" WHERE ("delivery_stream_name" =~ /^$firehose$/) AND $timeFilter GROUP BY time(5m), "delivery_stream_name" fill(0)'.format(  # noqa: E501
                RETENTION_POLICY, FIREHOSE_MEASUREMENT
            ),
            rawQuery=RAW_QUERY,
        ),
        InfluxDBTarget(
            alias=FIREHOSE_DELIVERY_TO_S3_SUCCESS_ALIAS,
            query='SELECT sum("delivery_to_s3._success_sum") FROM "{}"."{}" WHERE ("delivery_stream_name" =~ /^$firehose$/) AND $timeFilter GROUP BY time(5m), "delivery_stream_name" fill(0)'.format(  # noqa: E501
                RETENTION_POLICY, FIREHOSE_MEASUREMENT
            ),
            rawQuery=RAW_QUERY,
        ),
        InfluxDBTarget(
            alias=FIREHOSE_DELIVERY_TO_S3_ALIAS,
            query='SELECT sum("delivery_to_s3._records_sum") FROM "{}"."{}" WHERE ("delivery_stream_name" =~ /^$firehose$/) AND $timeFilter GROUP BY time(5m), "delivery_stream_name" fill(0)'.format(  # noqa: E501
                RETENTION_POLICY, FIREHOSE_MEASUREMENT
            ),
            rawQuery=RAW_QUERY,
        ),
    ]

    series_overrides = [
        {
            "alias": FIREHOSE_INCOMING_RECORDS_ALIAS,
            "color": colors.ORANGE,
        },
        {
            "alias": FIREHOSE_DELIVERY_TO_S3_ALIAS,
            "color": colors.YELLOW,
        },
        {
            "alias": FIREHOSE_DELIVERY_TO_S3_SUCCESS_ALIAS,
            "color": colors.GREEN,
            "zindex": 1,
        },
    ]

    return Graph(
        title="Firehose: $firehose",
        dataSource=influxdb_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
    ).auto_ref_ids()
Example #9
def generate_elasticache_redis_swap_and_memory_usage_graph(
        cache_cluster_id: str, cloudwatch_data_source: str) -> Graph:
    """
    Generate ElastiCache Redis graph
    """

    y_axes = single_y_axis(format=BYTES)
    aliases = {
        "bytes": "Bytes used for cache",
        "swap": "Swap Usage",
    }

    targets = [
        CloudwatchMetricsTarget(
            alias=aliases["bytes"],
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={"CacheClusterId": cache_cluster_id},
            metricName="BytesUsedForCache",
        ),
        CloudwatchMetricsTarget(
            alias=aliases["swap"],
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={"CacheClusterId": cache_cluster_id},
            metricName="SwapUsage",
        ),
    ]

    series_overrides = [
        {
            "alias": aliases["swap"],
            "color": colors.BLUE,
            "lines": True,
            "bars": False,
        },
        {
            "alias": aliases["bytes"],
            "color": colors.GREEN,
            "lines": True,
            "bars": False,
        },
    ]

    return Graph(
        title="Memory and Swap usage",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
    ).auto_ref_ids()
Example #10
def PercentageAxes(label=None, max=1):
    """Y axes that show a percentage based on a unit value."""
    return G.single_y_axis(
        format=G.PERCENT_UNIT_FORMAT,
        label=label,
        logBase=1,
        max=max,
        min=0,
    )
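A minimal usage sketch for PercentageAxes; the panel title and query are hypothetical, and G is grafanalib.core as in the function above:

utilization_graph = G.Graph(
    title="CPU utilization",  # hypothetical panel title
    dataSource="default",
    targets=[
        G.Target(
            # Example query producing values in the 0..1 range.
            expr='avg(rate(container_cpu_usage_seconds_total[5m]))',
            refId='A',
        ),
    ],
    # Unit values render as percentages, pinned to the 0-100% range.
    yAxes=PercentageAxes(label="utilization"),
)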
Example #11
def generate_elasticsearch_automated_snapshot_failure_alert_graph(
        name: str, client_id: str, cloudwatch_data_source: str,
        notifications: List[str]):
    """
    Generate Elasticsearch graph
    """

    y_axes = single_y_axis(format=SHORT_FORMAT)

    targets = [
        CloudwatchMetricsTarget(
            alias="Automated snapshot failure",
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={
                "DomainName": name,
                "ClientId": client_id
            },
            metricName="AutomatedSnapshotFailure",
            refId=ALERT_REF_ID,
        ),
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="Elasticsearch automated snapshot failure alert",
            message="Elasticsearch automated snapshot failure alert",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    return Graph(
        title="Elasticsearch automated snapshot failure alerts",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=False,
        lines=True,
        alert=alert,
    ).auto_ref_ids()
Example #12
def resource_stats(collapse: bool) -> Row:
    return Row(
        title="Task stats",
        collapse=collapse,
        panels=[
            Graph(
                title="Pending tasks",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=
                        'sum(kube_pod_container_status_waiting * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"}) by (namespace, label_execution_id, label_task_name, label_node_id, label_workflow_name) > 0',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
            Graph(
                title="Memory Usage Percentage",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=
                        '(max(container_memory_rss{image!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
            Graph(
                title="CPU Usage Percentage",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=
                        '(sum(rate(container_cpu_usage_seconds_total{image!=""}[2m]) * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / sum(kube_pod_container_resource_limits_cpu_cores{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
        ])
Example #13
def round_latency(interval: int = 1) -> Graph:
    return Graph(
        title="round Latency by quantile",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=f'sum(rate(flyte:propeller:all:round:raw_unlabeled_ms[{interval}m])) by (quantile)',
                refId='A',
            ),
        ],
        yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
    )
Example #14
def api_call_latency(title, metric, verb, scope, threshold):
    return d.Graph(
        title=title,
        targets=[
            d.Target(expr=str(threshold), legendFormat="threshold"),
            d.Target(
                expr='quantile_over_time(0.99, %(metric)s{quantile="0.99", verb=~"%(verb)s", scope=~"%(scope)s"}[12h])'
                % {"metric": metric, "verb": verb, "scope": scope}
            ),
        ],
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    )
Example #15
def api_latency(api: str, interval: int = 1) -> Graph:
    return Graph(
        title=f"{api} Latency",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=f'sum(rate(flyte:admin:{api}:duration_ms[{interval}m])) by (quantile)',
                refId='A',
            ),
        ],
        # The duration_ms metric is in milliseconds, so use the milliseconds axis format.
        yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
    )
Example #16
def node_input_latency() -> Graph:
    return Graph(
        title="Node input latency by quantile and workflow",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr='sum(flyte:propeller:all:node:node_input_latency_ms) by (quantile, wf)',
                refId='A',
            ),
        ],
        yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
    )
Example #17
def round_latency_per_wf(interval: int = 1) -> Graph:
    return Graph(
        title="round Latency per workflow",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=f'sum(rate(flyte:propeller:all:round:raw_ms[{interval}m])) by (wf)',
                refId='A',
            ),
        ],
        yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
    )
Example #18
def generate_rds_network_throughput_graph(name: str,
                                          cloudwatch_data_source: str):
    """
    Generate rds graph
    """

    y_axes = single_y_axis(format=BYTES_SEC, min=None)

    targets = [
        CloudwatchMetricsTarget(
            alias="RX",
            metricName="NetworkReceiveThroughput",
            statistics=["Maximum"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
        ),
        CloudwatchMetricsTarget(
            alias="TX",
            metricName="NetworkTransmitThroughput",
            statistics=["Maximum"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
        ),
    ]

    series_overrides = [
        {
            "alias": "TX",
            "color": colors.GREEN,
            "transform": "negative-Y",
            "fillGradient": 10,
        },
        {
            "alias": "RX",
            "color": colors.YELLOW,
            "fillGradient": 10
        },
    ]

    return Graph(
        title="Network throughput",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=False,
        lines=True,
        seriesOverrides=series_overrides,
    ).auto_ref_ids()
Example #19
def generate_rds_transaction_id_graph(name: str, cloudwatch_data_source: str,
                                      notifications: List[str]):
    """
    Generate rds graph
    """

    y_axes = single_y_axis(format=SHORT_FORMAT)

    targets = [
        CloudwatchMetricsTarget(
            alias="Transaction ids used",
            metricName="MaximumUsedTransactionIDs",
            statistics=["Maximum"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
            refId=ALERT_REF_ID,
        ),
    ]

    alert = None

    if notifications:
        alert = Alert(
            name="{} transaction ids used Errors".format(name),
            message="{} is having transaction ids used errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    # Roughly half of PostgreSQL's ~2^31 transaction ID wraparound limit.
                    evaluator=GreaterThan(1000000000),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="2m",
            frequency="2m",
            notifications=notifications,
        )

    return Graph(
        title="Transaction ids used",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=False,
        lines=True,
        alert=alert,
    ).auto_ref_ids()
Example #20
def generate_rds_database_connections_graph(name: str,
                                            cloudwatch_data_source: str):
    """
    Generate rds graph
    """

    y_axes = single_y_axis(format=SHORT_FORMAT)
    min_alias = "min"
    max_alias = "max"
    mean_alias = "mean"

    targets = [
        CloudwatchMetricsTarget(
            alias=max_alias,
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            metricName="DatabaseConnections",
            statistics=["Maximum"],
            period="1m",
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=mean_alias,
            metricName="DatabaseConnections",
            statistics=["Average"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
        ),
        CloudwatchMetricsTarget(
            alias=min_alias,
            metricName="DatabaseConnections",
            statistics=["Minimum"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
        ),
    ]

    series_overrides = get_series_overrides(min_alias, mean_alias, max_alias)

    return Graph(
        title="Database connections",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=False,
        lines=True,
    ).auto_ref_ids()
Example #21
def db_latency(entity: str, op: str, interval: int = 1) -> Graph:
    return Graph(
        title=f"{op} Latency",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=
                f'sum(flyte:admin:database:postgres:repositories:{entity}:{op}_ms) by (quantile)',
                refId='A',
            ),
        ],
        yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
    )
Example #22
def api_call_latency(title, verb, scope, threshold):
    return d.Graph(
        title=title,
        targets=[
            g.Target(expr=str(threshold), legendFormat="threshold"),
            g.Target(
                expr='apiserver:apiserver_request_latency_1m:histogram_quantile{quantile="0.99", verb=~"%(verb)s", scope=~"%(scope)s"}'
                % {"verb": verb, "scope": scope},
                legendFormat="{{verb}} {{scope}}/{{resource}}",
            ),
        ],
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    )
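A hypothetical call, plotting the 99th-percentile latency of namespace-scoped GET requests against a constant one-second threshold series:

panel = api_call_latency(
    title="Read latency: GET (namespace scope)",  # hypothetical title
    verb="GET",
    scope="namespace",
    threshold=1,
)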
Example #23
def create_lambda_sqs_dlq_graph(name: str, cloudwatch_data_source: str,
                                fifo: bool, notifications: List[str]):
    """Create SQS Deadletter graph"""

    if fifo:
        name += ".fifo"

    targets = [
        CloudwatchMetricsTarget(
            alias="Approximate number of messages available",
            namespace="AWS/SQS",
            statistics=["Maximum"],
            metricName="ApproximateNumberOfMessagesVisible",
            dimensions={"QueueName": name},
            refId=ALERT_REF_ID if notifications else None,
        )
    ]

    yAxes = single_y_axis(format=SHORT_FORMAT)
    alert = None

    # https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-monitoring-using-cloudwatch.html
    # https://aws.amazon.com/about-aws/whats-new/2019/12/amazon-sqs-now-supports-1-minute-cloudwatch-metrics/
    if notifications:
        alert = Alert(
            name="{} messages".format(name),
            message="{} is having messages".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                ),
            ],
            gracePeriod="5m",
            notifications=notifications,
        )

    return Graph(
        title="SQS Dead Letter Queue: {}".format(name),
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
Example #24
    def run(self):
        templateList = [
            G.Template(default="",
                       dataSource="default",
                       name="serverid",
                       label="ServerID",
                       query="label_values(serverid)")
        ]

        dashboard = G.Dashboard(title=self.options.title,
                                templating=G.Templating(list=templateList))

        # Simple table processing - could be enhanced to use GridPos etc.
        for metric in metrics:
            if 'section' in metric:
                dashboard.rows.append(
                    G.Row(title=metric['section'], showTitle=True))
                continue
            if 'row' in metric:
                dashboard.rows.append(G.Row(title='', showTitle=False))
                continue
            graph = G.Graph(title=metric['title'],
                            dataSource='default',
                            maxDataPoints=1000,
                            legend=G.Legend(show=True,
                                            alignAsTable=True,
                                            min=True,
                                            max=True,
                                            avg=True,
                                            current=True,
                                            total=True,
                                            sort='max',
                                            sortDesc=True),
                            yAxes=G.single_y_axis())
            ref_id = 'A'
            for texp in metric['expr']:
                graph.targets.append(G.Target(expr=texp, refId=ref_id))
                ref_id = chr(ord(ref_id) + 1)
            dashboard.rows[-1].panels.append(graph)

        # Auto-number panels - returns new dashboard
        dashboard = dashboard.auto_panel_ids()

        s = io.StringIO()
        write_dashboard(dashboard, s)
        print("""{
        "dashboard": %s
        }
        """ % s.getvalue())
Example #25
def node_event_recording_latency() -> Graph:
    return Graph(
        title="Node event recording latency by quantile and workflow",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr='sum(flyte:propeller:all:node:event_recording:success_duration_ms) by (quantile, wf)',
                refId='A',
            ),
            Target(
                expr='sum(flyte:propeller:all:node:event_recording:failure_duration_ms) by (quantile, wf)',
                refId='B',
            ),
        ],
        yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
    )
Example #26
def api_call_latency(title, verb, scope, threshold):
    return d.Graph(
        title=title,
        targets=[
            g.Target(expr=str(threshold), legendFormat="threshold"),
            g.Target(
                # `expression` is a PromQL template assumed to be defined at module level.
                expr=d.one_line(expression % {
                    "verb": verb,
                    "scope": scope
                }),
                # TODO(github.com/grafana/grafana/issues/19410): uncomment once fixed
                # legendFormat="{{verb}} {{scope}}/{{resource}}",
            ),
        ],
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    )
Example #27
def metastore_cache_hit_percentage(interval: int) -> Graph:
    """
    TODO replace with metric math maybe?
    """
    return Graph(
        title="cache hit percentage",
        dataSource=DATASOURCE,
        targets=[
            Target(
                # hits * 100 / (hits + misses)
                expr=
                f'(sum(rate(flyte:propeller:all:metastore:cache_hit[{interval}m])) * 100) / (sum(rate(flyte:propeller:all:metastore:cache_miss[{interval}m])) + sum(rate(flyte:propeller:all:metastore:cache_hit[{interval}m])))',
                refId='A',
            ),
        ],
        yAxes=single_y_axis(format=PERCENT_FORMAT),
    )
Example #28
def generate_rds_disk_ops_graph(name: str, cloudwatch_data_source: str):
    """
    Generate rds graph
    """

    y_axes = single_y_axis(format=SHORT_FORMAT, min=None)

    targets = [
        CloudwatchMetricsTarget(
            alias="write iops",
            metricName="WriteIOPS",
            statistics=["Maximum"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
        ),
        CloudwatchMetricsTarget(
            alias="read iops",
            metricName="ReadIOPS",
            statistics=["Maximum"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
        ),
        CloudwatchMetricsTarget(
            alias="disk queue depth",
            metricName="DiskQueueDepth",
            statistics=["Maximum"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
        ),
    ]

    return Graph(
        title="Disk iops",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=False,
        lines=True,
    ).auto_ref_ids()
Example #29
def generate_elasticache_redis_latency_graph(
        cache_cluster_id: str, cloudwatch_data_source: str) -> Graph:
    """
    Generate ElastiCache Redis graph
    """

    y_axes = single_y_axis(format=MILLISECONDS_FORMAT)

    aliases = {
        "latency": "String based CMDs latency",
    }

    targets = [
        CloudwatchMetricsTarget(
            alias=aliases["latency"],
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={"CacheClusterId": cache_cluster_id},
            metricName="StringBasedCmdsLatency",
        ),
    ]

    series_overrides = [
        {
            "alias": aliases["latency"],
            "color": colors.GREEN,
            "lines": True,
            "bars": False,
        },
    ]

    return Graph(
        title="Latency",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
    ).auto_ref_ids()
Example #30
def generate_elasticache_redis_connections_graph(
        cache_cluster_id: str, cloudwatch_data_source: str) -> Graph:
    """
    Generate ElastiCache Redis graph
    """

    y_axes = single_y_axis(format=SHORT_FORMAT)
    aliases = {
        "current": "Current connections",
    }

    targets = [
        CloudwatchMetricsTarget(
            alias=aliases["current"],
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={"CacheClusterId": cache_cluster_id},
            metricName="CurrConnections",
            refId=ALERT_REF_ID,
        ),
    ]

    series_overrides = [
        {
            "alias": aliases["current"],
            "color": colors.GREEN,
            "lines": True,
            "bars": False,
        },
    ]

    return Graph(
        title="Current connections",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
    ).auto_ref_ids()