def test_should_generate_pending_count_with_alerts_graph(self):
    name = "service-1"
    cloudwatch_data_source = "prod"
    cluster_name = "cluster-1"
    grid_pos = GridPos(1, 2, 3, 4)
    notifications = ["foo", "bar"]

    panel = generate_pending_count_graph(
        name=name,
        cloudwatch_data_source=cloudwatch_data_source,
        cluster_name=cluster_name,
        grid_pos=grid_pos,
        notifications=notifications,
    )

    panel.alert.should.be.a(Alert)
    panel.alert.gracePeriod.should.eql("15m")
    panel.alert.alertConditions.should.have.length_of(1)
    panel.alert.alertConditions[0].should.eql(
        AlertCondition(
            Target(refId="A"),
            timeRange=TimeRange("5m", "now"),
            evaluator=GreaterThan(0),
            reducerType=RTYPE_MAX,
            operator=OP_AND,
        ))
def test_should_generate_mem_utilization_percentage_with_alerts_graph(self):
    name = "service-1"
    cloudwatch_data_source = "prod"
    cluster_name = "cluster-1"
    grid_pos = GridPos(1, 2, 3, 4)
    notifications = ["foo", "bar", "baz"]
    expected_alert_condition = AlertCondition(
        Target(refId="A"),
        timeRange=TimeRange("15m", "now"),
        evaluator=GreaterThan(85),
        reducerType=RTYPE_MAX,
        operator=OP_AND,
    )

    panel = generate_mem_utilization_percentage_graph(
        name=name,
        cloudwatch_data_source=cloudwatch_data_source,
        cluster_name=cluster_name,
        grid_pos=grid_pos,
        notifications=notifications,
    )

    panel.alert.should.be.a(Alert)
    panel.alert.alertConditions.should.have.length_of(1)
    panel.alert.alertConditions[0].should.eql(expected_alert_condition)
    panel.alert.notifications.should.eql(notifications)
def test_should_generate_res_count_graph_with_alert(self):
    name = "service-1"
    cloudwatch_data_source = "prod"
    loadbalancer = "loadbalancer-1"
    target_group = "target-group-1"
    grid_pos = GridPos(1, 2, 3, 4)
    notifications = ["foo", "bar", "baz"]

    panel = generate_res_count_graph(
        name=name,
        cloudwatch_data_source=cloudwatch_data_source,
        grid_pos=grid_pos,
        loadbalancer=loadbalancer,
        target_group=target_group,
        notifications=notifications,
    )

    panel.alert.should.be.a(Alert)
    panel.alert.message.should.eql("{} has 5XX errors".format(name))
    panel.alert.alertConditions.should.have.length_of(1)
    panel.alert.alertConditions.should.eql([
        AlertCondition(
            Target(refId="A"),
            timeRange=TimeRange("15m", "now"),
            evaluator=GreaterThan(0),
            reducerType=RTYPE_MAX,
            operator=OP_AND,
        ),
    ])
def number_of_active_processes_graph(grr_component):
    return Graph(
        title="Number of Active Processes",
        targets=[
            Target(
                expr='sum(up{{job="grr_{}"}})'.format(grr_component),
                legendFormat="Active Processes",
            ),
        ],
        alert=Alert(
            name="Number of Active Processes alert",
            message="The number of active {} processes is below {}".format(
                grr_component.capitalize(),
                config.ACTIVE_PROCESSES_ALERTING_CONDITION),
            alertConditions=[
                AlertCondition(
                    Target(
                        expr='sum(up{{job="grr_{}"}})'.format(grr_component),
                        legendFormat="Active Processes",
                    ),
                    timeRange=TimeRange("10s", "now"),
                    evaluator=LowerThan(
                        config.ACTIVE_PROCESSES_ALERTING_CONDITION),
                    operator=OP_AND,
                    reducerType=RTYPE_SUM,
                )
            ],
        ),
    )
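
# Hypothetical usage sketch (not from the source): the component name is interpolated
# into the Prometheus job label, so "frontend" selects series with job="grr_frontend".
# The variable name below is a placeholder.
frontend_processes_panel = number_of_active_processes_graph("frontend")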
def generate_elasticsearch_status_red_alert_graph(
        name: str, client_id: str, cloudwatch_data_source: str,
        notifications: List[str]) -> Graph:
    """
    Generate Elasticsearch graph
    """

    y_axes = YAxes(
        YAxis(format=SHORT_FORMAT),
        YAxis(format=SHORT_FORMAT),
    )

    targets = [
        CloudwatchMetricsTarget(
            alias="Red status",
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={"DomainName": name, "ClientId": client_id},
            metricName="ClusterStatus.red",
        ),
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="Elasticsearch is in status red",
            message="Elasticsearch is in status red",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    return Graph(
        title="Status RED alerts",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
        alert=alert,
    ).auto_ref_ids()
def generate_desired_count_graph(
    name: str,
    cluster_name: str,
    max: int,
    cloudwatch_data_source: str,
    notifications: List[str],
    grid_pos: GridPos,
):
    targets = [
        CloudwatchMetricsTarget(
            alias="Containers",
            namespace=CONTAINER_INSIGHTS_NAMESPACE,
            statistics=["Maximum"],
            metricName="DesiredTaskCount",
            dimensions={"ServiceName": name, "ClusterName": cluster_name},
            refId=ALERT_REF_ID,
        ),
    ]

    alert = None
    if notifications and max > 1:
        alert = Alert(
            name="{} Desired count of containers nearing the max".format(name),
            message="{} is having Desired count of containers nearing the max".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("15m", "now"),
                    evaluator=GreaterThan(0.9 * max),  # 90% of max
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="1m",
            notifications=notifications,
        )

    return Graph(
        title="Desired Tasks",
        dataSource=cloudwatch_data_source,
        targets=targets,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        gridPos=grid_pos,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
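
# Hypothetical usage sketch (placeholder values, not from the source): with max=10 the
# alert threshold evaluates to GreaterThan(9.0), i.e. 90% of max; with max=1 or an empty
# notifications list no alert is attached at all.
desired_count_panel = generate_desired_count_graph(
    name="my-service",
    cluster_name="my-cluster",
    max=10,
    cloudwatch_data_source="cloudwatch",
    notifications=["slack-channel-uid"],
    grid_pos=GridPos(8, 12, 0, 0),
)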
def generate_rds_transaction_id_graph(name: str, cloudwatch_data_source: str,
                                      notifications: List[str]):
    """
    Generate rds graph
    """

    y_axes = single_y_axis(format=SHORT_FORMAT)

    targets = [
        CloudwatchMetricsTarget(
            alias="Transaction ids used",
            metricName="MaximumUsedTransactionIDs",
            statistics=["Maximum"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
            refId=ALERT_REF_ID,
        ),
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="{} transaction ids used Errors".format(name),
            message="{} is having transaction ids used errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(1000000000),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="2m",
            frequency="2m",
            notifications=notifications,
        )

    return Graph(
        title="Transaction ids used",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=False,
        lines=True,
        alert=alert,
    ).auto_ref_ids()
def create_lambda_sqs_dlq_graph(name: str, cloudwatch_data_source: str,
                                fifo: bool, notifications: List[str]):
    """Create SQS Deadletter graph"""

    if fifo:
        name += ".fifo"

    targets = [
        CloudwatchMetricsTarget(
            alias="Approximate number of messages available",
            namespace="AWS/SQS",
            statistics=["Maximum"],
            metricName="ApproximateNumberOfMessagesVisible",
            dimensions={"QueueName": name},
            refId=ALERT_REF_ID if notifications else None,
        )
    ]

    yAxes = single_y_axis(format=SHORT_FORMAT)

    alert = None
    # https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-monitoring-using-cloudwatch.html
    # https://aws.amazon.com/about-aws/whats-new/2019/12/amazon-sqs-now-supports-1-minute-cloudwatch-metrics/
    if notifications:
        alert = Alert(
            name="{} messages".format(name),
            message="{} is having messages".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                ),
            ],
            gracePeriod="5m",
            notifications=notifications,
        )

    return Graph(
        title="SQS Dead Letter Queue: {}".format(name),
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
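
# Hypothetical usage sketch (placeholder values, not from the source): for a FIFO queue
# the function appends ".fifo" to the queue name before building the CloudWatch
# dimensions and the panel title; the alert fires as soon as any message is visible.
dlq_panel = create_lambda_sqs_dlq_graph(
    name="my-function-dlq",
    cloudwatch_data_source="cloudwatch",
    fifo=True,
    notifications=["slack-channel-uid"],
)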
def generate_elasticsearch_storage_graph(name: str, client_id: str,
                                         cloudwatch_data_source: str,
                                         notifications: List[str]) -> Graph:
    """
    Generate Elasticsearch graph
    """

    y_axes = YAxes(
        YAxis(format=MEGA_BYTES),
        YAxis(format=MEGA_BYTES),
    )
    free_storage_alias = "Free storage"
    cluster_used_space_alias = "Used space"

    targets = [
        CloudwatchMetricsTarget(
            alias=free_storage_alias,
            namespace=NAMESPACE,
            period="1m",
            statistics=["Minimum"],
            dimensions={"DomainName": name, "ClientId": client_id},
            metricName="FreeStorageSpace",
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=cluster_used_space_alias,
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={"DomainName": name, "ClientId": client_id},
            metricName="ClusterUsedSpace",
        ),
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="Elasticsearch storage alert",
            message="Elasticsearch might be low on storage",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=LowerThan(10240),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    series_overrides = [
        {
            "alias": free_storage_alias,
            "color": colors.GREEN,
            "lines": True,
            "bars": False,
        },
        {
            "alias": cluster_used_space_alias,
            "color": colors.ORANGE,
            "lines": True,
            "bars": False,
            "yaxis": 2,
        },
    ]

    return Graph(
        title="Storage",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
        alert=alert,
    ).auto_ref_ids()
def generate_res_count_graph(
    name: str,
    cloudwatch_data_source: str,
    loadbalancer: str,
    target_group: str,
    grid_pos: GridPos,
    notifications: List[str],
) -> Graph:
    """
    Generate res graph
    """

    xx2_alias = "2xx"
    xx3_alias = "3xx"
    xx4_alias = "4xx"
    xx5_alias = "5xx"

    targets = [
        CloudwatchMetricsTarget(
            alias=xx2_alias,
            namespace="AWS/ApplicationELB",
            statistics=["Sum"],
            metricName="HTTPCode_Target_2XX_Count",
            dimensions={"LoadBalancer": loadbalancer, "TargetGroup": target_group},
        ),
        CloudwatchMetricsTarget(
            alias=xx3_alias,
            namespace="AWS/ApplicationELB",
            statistics=["Sum"],
            metricName="HTTPCode_Target_3XX_Count",
            dimensions={"LoadBalancer": loadbalancer, "TargetGroup": target_group},
        ),
        CloudwatchMetricsTarget(
            alias=xx4_alias,
            namespace="AWS/ApplicationELB",
            statistics=["Sum"],
            metricName="HTTPCode_Target_4XX_Count",
            dimensions={"LoadBalancer": loadbalancer, "TargetGroup": target_group},
        ),
        CloudwatchMetricsTarget(
            alias=xx5_alias,
            namespace="AWS/ApplicationELB",
            statistics=["Sum"],
            metricName="HTTPCode_Target_5XX_Count",
            dimensions={"LoadBalancer": loadbalancer, "TargetGroup": target_group},
            refId=ALERT_REF_ID,
        ),
    ]

    seriesOverrides = [
        {"alias": xx2_alias, "color": colors.GREEN, "fill": 0},
        {"alias": xx3_alias, "color": colors.YELLOW, "fill": 0},
        {"alias": xx4_alias, "color": colors.ORANGE, "fill": 0},
        {"alias": xx5_alias, "color": colors.RED, "fill": 0},
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="{} has 5XX errors".format(name),
            message="{} has 5XX errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("15m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="1m",
            notifications=notifications,
        )

    return Graph(
        title="Responses",
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        gridPos=grid_pos,
        alert=alert,
    )
def generate_mem_utilization_percentage_graph(
    name: str,
    cloudwatch_data_source: str,
    cluster_name: str,
    notifications: List[str],
    grid_pos: GridPos,
) -> Graph:
    """
    Generate Mem Percentage graph
    """

    y_axes = single_y_axis(format=PERCENT_FORMAT)

    targets = [
        CloudwatchMetricsTarget(
            alias=MINIMUM_ALIAS,
            namespace=ECS_NAMESPACE,
            statistics=["Minimum"],
            metricName="MemoryUtilization",
            dimensions={"ServiceName": name, "ClusterName": cluster_name},
        ),
        CloudwatchMetricsTarget(
            alias=AVERAGE_ALIAS,
            namespace=ECS_NAMESPACE,
            statistics=["Average"],
            metricName="MemoryUtilization",
            dimensions={"ServiceName": name, "ClusterName": cluster_name},
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=MAXIMUM_ALIAS,
            namespace=ECS_NAMESPACE,
            statistics=["Maximum"],
            metricName="MemoryUtilization",
            dimensions={"ServiceName": name, "ClusterName": cluster_name},
        ),
    ]

    seriesOverrides = [
        {"alias": MINIMUM_ALIAS, "color": colors.GREEN, "lines": False},
        {"alias": AVERAGE_ALIAS, "color": colors.YELLOW, "fill": 0},
        {
            "alias": MAXIMUM_ALIAS,
            "color": colors.GREEN,
            "fillBelowTo": MINIMUM_ALIAS,
            "lines": False,
        },
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="{} Memory utilization Errors".format(name),
            message="{} is having Memory utilization errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("15m", "now"),
                    evaluator=GreaterThan(85),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="1m",
            notifications=notifications,
        )

    return Graph(
        title="Memory Utilization Percentage",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=seriesOverrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        gridPos=grid_pos,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
def generate_sfn_execution_metrics_graph(name: str, cloudwatch_data_source: str,
                                         notifications: List[str], *args,
                                         **kwargs):
    """
    Generate step function graph
    """

    targets = [
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_STARTED_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsStarted",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
        ),
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_SUCCEEDED_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsSucceeded",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
        ),
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_ABORTED_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsAborted",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
            refId=SFN_EXECUTIONS_ABORTED_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_FAILED_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsFailed",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
            refId=SFN_EXECUTIONS_FAILED_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_THROTTLED_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsThrottled",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
            refId=SFN_EXECUTIONS_THROTTLED_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_TIMEDOUT_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsTimedOut",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
            refId=SFN_EXECUTIONS_TIMEDOUT_REF_ID,
        ),
    ]

    yAxes = YAxes(
        YAxis(format=SHORT_FORMAT, decimals=2),
        YAxis(format=SHORT_FORMAT, decimals=2),
    )

    seriesOverrides = [
        {"alias": SFN_EXECUTIONS_STARTED_ALIAS, "points": False, "color": colors.BLUE},
        {"alias": SFN_EXECUTIONS_SUCCEEDED_ALIAS, "points": False, "color": colors.GREEN},
        {"alias": SFN_EXECUTIONS_ABORTED_ALIAS, "points": False, "color": colors.RED},
        {"alias": SFN_EXECUTIONS_FAILED_ALIAS, "points": False, "color": colors.RED},
        {"alias": SFN_EXECUTIONS_THROTTLED_ALIAS, "points": False, "color": colors.ORANGE},
        {"alias": SFN_EXECUTIONS_TIMEDOUT_ALIAS, "points": False, "color": colors.RED},
    ]

    alert = None
    # https://docs.aws.amazon.com/lambda/latest/dg/monitoring-metrics.html
    if notifications:
        alert = Alert(
            name="{} execution issues".format(name),
            message="{} might have failed, aborted, throttled or timed out".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=SFN_EXECUTIONS_ABORTED_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
                AlertCondition(
                    Target(refId=SFN_EXECUTIONS_FAILED_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
                AlertCondition(
                    Target(refId=SFN_EXECUTIONS_THROTTLED_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
                AlertCondition(
                    Target(refId=SFN_EXECUTIONS_TIMEDOUT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            gracePeriod="2m",
            frequency="2m",
            notifications=notifications,
        )

    return Graph(
        title="Step function execution metrics",
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
def generate_elasticsearch_jvm_memory_pressure_graph(
        name: str, client_id: str, cloudwatch_data_source: str,
        notifications: List[str]) -> Graph:
    """
    Generate Elasticsearch graph
    """

    y_axes = single_y_axis(format=PERCENT_FORMAT)
    alias = "JVM memory pressure"

    targets = [
        CloudwatchMetricsTarget(
            alias=alias,
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={"DomainName": name, "ClientId": client_id},
            metricName="JVMMemoryPressure",
            refId=ALERT_REF_ID,
        )
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="Elasticsearch JVM memory pressure alert",
            message="Elasticsearch JVM memory pressure alert",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(80),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    series_overrides = [{
        "alias": alias,
        "color": colors.GREEN,
        "lines": True,
        "bars": False,
    }]

    return Graph(
        title=alias,
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
        alert=alert,
    ).auto_ref_ids()
                expr='sum(up{job="fleetspeak"})',
                legendFormat="Active Processes",
            ),
        ],
        alert=Alert(
            name="Number of Active Processes alert",
            message="The number of active Fleetspeak Server processes is below {}"
            .format(ACTIVE_PROCESSES_ALERTING_CONDITION),
            alertConditions=[
                AlertCondition(
                    Target(
                        expr='sum(up{job="fleetspeak"})',
                        legendFormat="Active Processes",
                    ),
                    timeRange=TimeRange("10s", "now"),
                    evaluator=LowerThan(ACTIVE_PROCESSES_ALERTING_CONDITION),
                    operator=OP_AND,
                    reducerType=RTYPE_SUM,
                )
            ],
        )),
    Graph(
        title="Sum of Process Memory Bytes (across all instances)",
        targets=[
            Target(
                expr='sum(process_resident_memory_bytes{job="fleetspeak"})',
                legendFormat="Resident Memory",
            ),
        ]),
def generate_elasticache_redis_cpu_credit_usage_graph(
        cache_cluster_id: str, cloudwatch_data_source: str,
        notifications: List[str]) -> Graph:
    """
    Generate ElastiCache Redis graph
    """

    y_axes = single_y_axis(format=SHORT_FORMAT)
    aliases = {
        "credit balance": "CPU credit balance",
        "credit usage": "CPU credit usage",
    }

    targets = [
        CloudwatchMetricsTarget(
            alias=aliases["credit balance"],
            namespace=NAMESPACE,
            period="1m",
            statistics=["Minimum"],
            dimensions={"CacheClusterId": cache_cluster_id},
            metricName="CPUCreditBalance",
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=aliases["credit usage"],
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={"CacheClusterId": cache_cluster_id},
            metricName="CPUCreditUsage",
        ),
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="ElastiCache Redis CPU credit balance alert",
            message="ElastiCache Redis CPU credit balance alert",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=LowerThan(250),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    series_overrides = [
        {
            "alias": aliases["credit balance"],
            "color": colors.GREEN,
            "lines": True,
            "bars": False,
        },
        {
            "alias": aliases["credit usage"],
            "color": colors.YELLOW,
            "lines": True,
            "bars": False,
        },
    ]

    return Graph(
        title="CPU credit utilization",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
        alert=alert,
    ).auto_ref_ids()
def generate_rds_cpu_graph(name: str, cloudwatch_data_source: str,
                           notifications: List[str]):
    """
    Generate rds graph
    """

    y_axes = single_y_axis(format=PERCENT_FORMAT)
    min_alias = "min"
    max_alias = "max"
    mean_alias = "mean"

    targets = [
        CloudwatchMetricsTarget(
            alias=max_alias,
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
            statistics=["Maximum"],
            metricName="CPUUtilization",
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=mean_alias,
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            statistics=["Average"],
            metricName="CPUUtilization",
            period="1m",
        ),
        CloudwatchMetricsTarget(
            alias=min_alias,
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            statistics=["Minimum"],
            metricName="CPUUtilization",
            period="1m",
        ),
    ]

    series_overrides = get_series_overrides(min_alias, mean_alias, max_alias)

    alert = None
    if notifications:
        alert = Alert(
            name="{} CPU utilization Errors".format(name),
            message="{} is having CPU utilization errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(80),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="2m",
            frequency="2m",
            notifications=notifications,
        )

    return Graph(
        title="CPU utilization",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=False,
        lines=True,
        alert=alert,
    ).auto_ref_ids()
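
# Hypothetical usage sketch (placeholder values, not from the source): the alert fires
# when the maximum of the Maximum-statistic CPUUtilization series (refId ALERT_REF_ID)
# over the last 5 minutes exceeds 80; an empty notifications list yields no alert.
rds_cpu_panel = generate_rds_cpu_graph(
    name="my-db-instance",
    cloudwatch_data_source="cloudwatch",
    notifications=["pagerduty-channel-uid"],
)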
def lambda_generate_memory_utilization_percentage_graph(
    name: str,
    cloudwatch_data_source: str,
    lambda_insights_namespace: str,
    notifications: List[str],
    *args,
    **kwargs,
) -> Graph:
    """
    Generate lambda graph
    """

    targets = [
        CloudwatchMetricsTarget(
            alias=MINIMUM_ALIAS,
            namespace=lambda_insights_namespace,
            statistics=["Minimum"],
            metricName="memory_utilization",
            dimensions={"function_name": name},
        ),
        CloudwatchMetricsTarget(
            alias=AVERAGE_ALIAS,
            namespace=lambda_insights_namespace,
            statistics=["Average"],
            metricName="memory_utilization",
            dimensions={"function_name": name},
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=MAXIMUM_ALIAS,
            namespace=lambda_insights_namespace,
            statistics=["Maximum"],
            metricName="memory_utilization",
            dimensions={"function_name": name},
        ),
    ]

    yAxes = YAxes(
        YAxis(format=SHORT_FORMAT, decimals=2),
        YAxis(format=SHORT_FORMAT, decimals=2),
    )

    seriesOverrides = [
        {"alias": MINIMUM_ALIAS, "color": "#C8F2C2", "lines": False},
        {"alias": AVERAGE_ALIAS, "color": "#FADE2A", "fill": 0},
        {
            "alias": MAXIMUM_ALIAS,
            "color": "rgb(77, 159, 179)",
            "fillBelowTo": MINIMUM_ALIAS,
            "lines": False,
        },
    ]

    alert = None
    # https://docs.aws.amazon.com/lambda/latest/dg/monitoring-metrics.html
    if notifications:
        alert = Alert(
            name="{} Memory utilization Errors".format(name),
            message="{} is having Memory utilization errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(90),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="1m",
            notifications=notifications,
        )

    return Graph(
        title="Lambda Memory Utilization Percentage",
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
        # gridPos=GridPos(8, 12, 0, 0)
    ).auto_ref_ids()
def lambda_generate_invocations_graph(name: str, cloudwatch_data_source: str,
                                      notifications: List[str], *args,
                                      **kwargs) -> Graph:
    """
    Generate lambda graph
    """

    targets = [
        CloudwatchMetricsTarget(
            alias=LAMBDA_INVOCATIONS_ALIAS,
            namespace=NAMESPACE,
            statistics=["Sum"],
            metricName="Invocations",
            dimensions={"FunctionName": name},
        ),
        CloudwatchMetricsTarget(
            alias=LAMBDA_ERRORS_ALIAS,
            namespace=NAMESPACE,
            statistics=["Sum"],
            metricName="Errors",
            dimensions={"FunctionName": name},
            refId=ALERT_REF_ID,
        ),
    ]

    yAxes = YAxes(
        YAxis(format=SHORT_FORMAT, decimals=2),
        YAxis(format=SHORT_FORMAT, decimals=2),
    )

    seriesOverrides = [
        {"alias": LAMBDA_INVOCATIONS_ALIAS, "points": False, "color": colors.GREEN},
        {"alias": LAMBDA_ERRORS_ALIAS, "points": False, "color": colors.RED},
    ]

    alert = None
    # https://docs.aws.amazon.com/lambda/latest/dg/monitoring-metrics.html
    if notifications:
        alert = Alert(
            name="{} Invocation Errors".format(name),
            message="{} is having invocation errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="1m",
            notifications=notifications,
        )

    return Graph(
        title="Lambda Invocations and Errors",
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
        # gridPos=GridPos(8, 12, 0, 0)
    ).auto_ref_ids()
def generate_api_gateway_requests_graph(name: str, cloudwatch_data_source: str,
                                        notifications: List[str], *args,
                                        **kwargs):
    targets = [
        CloudwatchMetricsTarget(
            alias=API_GATEWAY_5XX_ALIAS,
            namespace=NAMESPACE,
            statistics=["Sum"],
            metricName="5XXError",
            dimensions={"ApiName": name},
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=API_GATEWAY_REQUESTS_ALIAS,
            namespace=NAMESPACE,
            statistics=["Sum"],
            metricName="Count",
            dimensions={"ApiName": name},
            refId=API_GATEWAY_REQUESTS_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=API_GATEWAY_4XX_ALIAS,
            namespace=NAMESPACE,
            statistics=["Sum"],
            metricName="4XXError",
            dimensions={"ApiName": name},
            refId=API_GATEWAY_4XX_REF_ID,
        ),
    ]

    yAxes = YAxes(
        YAxis(format=SHORT_FORMAT),
        YAxis(format=SHORT_FORMAT),
    )

    seriesOverrides = [
        {"alias": API_GATEWAY_REQUESTS_ALIAS, "points": False, "color": colors.GREEN},
        {"alias": API_GATEWAY_4XX_ALIAS, "color": colors.YELLOW},
        {"alias": API_GATEWAY_5XX_ALIAS, "color": colors.RED},
    ]

    alert = None
    # https://docs.aws.amazon.com/lambda/latest/dg/monitoring-metrics.html
    if notifications:
        alert = Alert(
            name="{} API Gateway 5XX Errors".format(name),
            message="{} is having 5XX errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("15m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    return Graph(
        title="API Gateway Requests: {}".format(name),
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
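
# Hypothetical usage sketch (placeholder values, not from the source): passing an empty
# notifications list keeps the requests/4XX/5XX panel but skips alert creation, since
# the Alert is only built inside the `if notifications:` branch above.
api_requests_panel = generate_api_gateway_requests_graph(
    name="my-api",
    cloudwatch_data_source="cloudwatch",
    notifications=[],
)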
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
        alert=Alert(
            name="Too many 500s on Nginx",
            message="More than 5 QPS of 500s on Nginx for 5 minutes",
            alertConditions=[
                AlertCondition(
                    Target(
                        expr='sum(irate(nginx_http_requests_total{job="default/frontend",status=~"5.."}[1m]))',
                        legendFormat="5xx",
                        refId='A',
                    ),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(5),
                    operator=OP_AND,
                    reducerType=RTYPE_SUM,
                ),
            ],
            notifications=[
                Notification("notification_channel_uid"),
            ],
        )),
    Graph(
        title="Frontend latency",
        dataSource='My Prometheus',
        targets=[
            Target(
                expr=