def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # -----------------------------------------------------------------------------------------------------------
    # The Simple Webservice Logic - This is what we will be monitoring
    #
    # API GW HTTP API, Lambda Fn and DynamoDB
    # https://github.com/cdk-patterns/serverless/tree/master/the-simple-webservice
    # -----------------------------------------------------------------------------------------------------------

    # DynamoDB Table
    table = dynamo_db.Table(self, "Hits",
                            partition_key=dynamo_db.Attribute(name="path",
                                                              type=dynamo_db.AttributeType.STRING),
                            billing_mode=dynamo_db.BillingMode.PAY_PER_REQUEST)

    # Defines an AWS Lambda resource
    dynamo_lambda = _lambda.Function(self, "DynamoLambdaHandler",
                                     runtime=_lambda.Runtime.NODEJS_12_X,  # execution environment
                                     handler="lambda.handler",  # file is "lambda", function is "handler"
                                     code=_lambda.Code.from_asset("lambda_fns"),  # code loaded from the lambda_fns dir
                                     environment={'HITS_TABLE_NAME': table.table_name})

    # Grant the Lambda role read/write permissions to our table
    table.grant_read_write_data(dynamo_lambda)

    # Defines an API Gateway HTTP API resource backed by our "dynamo_lambda" function
    api = api_gw.HttpApi(self, 'HttpAPI',
                         default_integration=api_gw.LambdaProxyIntegration(handler=dynamo_lambda))

    core.CfnOutput(self, 'HTTP API Url', value=api.url)

    # -----------------------------------------------------------------------------------------------------------
    # Monitoring Logic Starts Here
    #
    # This is everything we need to understand the state of our system:
    # - custom metrics
    # - CloudWatch alarms
    # - custom CloudWatch dashboard
    # -----------------------------------------------------------------------------------------------------------

    # SNS Topic so we can hook things into our alerts, e.g. email
    error_topic = sns.Topic(self, 'theBigFanTopic')

    ###
    # Custom Metrics
    ###

    api_gw_4xx_error_percentage = cloud_watch.MathExpression(
        expression="m1/m2*100",
        label="% API Gateway 4xx Errors",
        using_metrics={
            "m1": self.metric_for_api_gw(api.http_api_id, '4XXError', '4XX Errors', 'sum'),
            "m2": self.metric_for_api_gw(api.http_api_id, 'Count', '# Requests', 'sum'),
        },
        period=core.Duration.minutes(5))

    # Gather the % of Lambda invocations that errored in the past 5 mins
    lambda_error_perc = cloud_watch.MathExpression(
        expression="e / i * 100",
        label="% of invocations that errored, last 5 mins",
        using_metrics={
            "i": dynamo_lambda.metric(metric_name="Invocations", statistic="sum"),
            "e": dynamo_lambda.metric(metric_name="Errors", statistic="sum"),
        },
        period=core.Duration.minutes(5))

    # Note: throttled requests are not counted in the total number of invocations
    lambda_throttled_perc = cloud_watch.MathExpression(
        expression="t / (i + t) * 100",
        label="% of throttled requests, last 30 mins",
        using_metrics={
            "i": dynamo_lambda.metric(metric_name="Invocations", statistic="sum"),
            "t": dynamo_lambda.metric(metric_name="Throttles", statistic="sum"),
        },
        period=core.Duration.minutes(5))

    # UserErrors appear to be reported at the account level rather than the table level, so these two
    # metrics are merged until there is a definitive answer. Scoped to a table, UserErrors should
    # always show as 0, so this is still effectively a count of system errors.
    dynamo_db_total_errors = cloud_watch.MathExpression(
        expression="m1 + m2",
        label="DynamoDB Errors",
        using_metrics={
            "m1": table.metric_user_errors(),
            "m2": table.metric_system_errors(),
        },
        period=core.Duration.minutes(5))

    # Rather than have 2 alerts, let's create one aggregate metric
    dynamo_db_throttles = cloud_watch.MathExpression(
        expression="m1 + m2",
        label="DynamoDB Throttles",
        using_metrics={
            "m1": table.metric(metric_name="ReadThrottleEvents", statistic="sum"),
            "m2": table.metric(metric_name="WriteThrottleEvents", statistic="sum"),
        },
        period=core.Duration.minutes(5))

    ###
    # Alarms
    ###

    # API Gateway
    # 4xx are user errors, so a large volume indicates a problem
    cloud_watch.Alarm(self,
                      id="API Gateway 4XX Errors > 1%",
                      metric=api_gw_4xx_error_percentage,
                      threshold=1,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # 5xx are internal server errors, so we want 0 of these
    cloud_watch.Alarm(self,
                      id="API Gateway 5XX Errors > 0",
                      metric=self.metric_for_api_gw(api_id=api.http_api_id,
                                                    metric_name="5XXError",
                                                    label="5XX Errors",
                                                    stat="p99"),
                      threshold=0,
                      period=core.Duration.minutes(5),
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    cloud_watch.Alarm(self,
                      id="API p99 latency alarm >= 1s",
                      metric=self.metric_for_api_gw(api_id=api.http_api_id,
                                                    metric_name="Latency",
                                                    label="API GW Latency",
                                                    stat="p99"),
                      threshold=1000,
                      period=core.Duration.minutes(5),
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # Lambda
    # 2% of Dynamo Lambda invocations erroring
    cloud_watch.Alarm(self,
                      id="Dynamo Lambda 2% Error",
                      metric=lambda_error_perc,
                      threshold=2,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # 1% of Lambda invocations taking longer than 1 second (p99 duration > 1s)
    cloud_watch.Alarm(self,
                      id="Dynamo Lambda p99 Long Duration (>1s)",
                      metric=dynamo_lambda.metric_duration(),
                      period=core.Duration.minutes(5),
                      threshold=1000,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      statistic="p99",
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # 2% of our Lambda invocations are throttled
    cloud_watch.Alarm(self,
                      id="Dynamo Lambda 2% Throttled",
                      metric=lambda_throttled_perc,
                      threshold=2,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # DynamoDB
    # DynamoDB reads/writes are throttled - indicates under-provisioned capacity
    cloud_watch.Alarm(self,
                      id="DynamoDB Table Reads/Writes Throttled",
                      metric=dynamo_db_throttles,
                      threshold=1,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    # There should be 0 DynamoDB errors
    cloud_watch.Alarm(self,
                      id="DynamoDB Errors > 0",
                      metric=dynamo_db_total_errors,
                      threshold=0,
                      evaluation_periods=6,
                      datapoints_to_alarm=1,
                      treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
        .add_alarm_action(actions.SnsAction(error_topic))

    dashboard = cloud_watch.Dashboard(self, id="CloudWatchDashBoard")
    dashboard.add_widgets(
        cloud_watch.GraphWidget(title="Requests",
                                width=8,
                                left=[self.metric_for_api_gw(api_id=api.http_api_id,
                                                             metric_name="Count",
                                                             label="# Requests",
                                                             stat="sum")]),
        cloud_watch.GraphWidget(title="API GW Latency",
                                width=8,
                                stacked=True,
                                left=[self.metric_for_api_gw(api_id=api.http_api_id,
                                                             metric_name="Latency",
                                                             label="API Latency p50",
                                                             stat="p50"),
                                      self.metric_for_api_gw(api_id=api.http_api_id,
                                                             metric_name="Latency",
                                                             label="API Latency p90",
                                                             stat="p90"),
                                      self.metric_for_api_gw(api_id=api.http_api_id,
                                                             metric_name="Latency",
                                                             label="API Latency p99",
                                                             stat="p99")]),
        cloud_watch.GraphWidget(title="API GW Errors",
                                width=8,
                                stacked=True,
                                left=[self.metric_for_api_gw(api_id=api.http_api_id,
                                                             metric_name="4XXError",
                                                             label="4XX Errors",
                                                             stat="sum"),
                                      self.metric_for_api_gw(api_id=api.http_api_id,
                                                             metric_name="5XXError",
                                                             label="5XX Errors",
                                                             stat="sum")]),
        cloud_watch.GraphWidget(title="Dynamo Lambda Error %",
                                width=8,
                                left=[lambda_error_perc]),
        cloud_watch.GraphWidget(title="Dynamo Lambda Duration",
                                width=8,
                                stacked=True,
                                left=[dynamo_lambda.metric_duration(statistic="p50"),
                                      dynamo_lambda.metric_duration(statistic="p90"),
                                      dynamo_lambda.metric_duration(statistic="p99")]),
        cloud_watch.GraphWidget(title="Dynamo Lambda Throttle %",
                                width=8,
                                left=[lambda_throttled_perc]),
        cloud_watch.GraphWidget(title="DynamoDB Latency",
                                width=8,
                                stacked=True,
                                left=[table.metric_successful_request_latency(
                                          dimensions={"TableName": table.table_name, "Operation": "GetItem"}),
                                      table.metric_successful_request_latency(
                                          dimensions={"TableName": table.table_name, "Operation": "UpdateItem"}),
                                      table.metric_successful_request_latency(
                                          dimensions={"TableName": table.table_name, "Operation": "PutItem"}),
                                      table.metric_successful_request_latency(
                                          dimensions={"TableName": table.table_name, "Operation": "DeleteItem"}),
                                      table.metric_successful_request_latency(
                                          dimensions={"TableName": table.table_name, "Operation": "Query"})]),
        cloud_watch.GraphWidget(title="DynamoDB Consumed Read/Write Units",
                                width=8,
                                stacked=False,
                                left=[table.metric(metric_name="ConsumedReadCapacityUnits"),
                                      table.metric(metric_name="ConsumedWriteCapacityUnits")]),
        cloud_watch.GraphWidget(title="DynamoDB Throttles",
                                width=8,
                                stacked=True,
                                left=[table.metric(metric_name="ReadThrottleEvents", statistic="sum"),
                                      table.metric(metric_name="WriteThrottleEvents", statistic="sum")]),
    )
def __init__(self, scope: core.Construct, _id: str, vpc, bucket_para,
             # key_name,
             ddb_file_list, sqs_queue, sqs_queue_DLQ,
             ssm_bucket_para, ssm_credential_para,
             # s3bucket,
             **kwargs) -> None:
    super().__init__(scope, _id, **kwargs)

    # Create the jobsender EC2 node
    jobsender = ec2.Instance(self, "jobsender",
                             instance_name="s3_migrate_cluster_jobsender",
                             instance_type=ec2.InstanceType(instance_type_identifier=jobsender_type),
                             machine_image=linux_ami,
                             # key_name=key_name,
                             user_data=ec2.UserData.custom(user_data_jobsender),
                             vpc=vpc,
                             vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC))
    # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
    # No SSH access needed since we use Session Manager

    # Assign EC2 policies to use SSM and the CloudWatch agent
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSSMManagedInstanceCore"))
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("CloudWatchAgentServerPolicy"))
    # jobsender.role.add_managed_policy(
    #     iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"))
    # Don't give the EC2 role full S3 access; that violates the least-privilege security rule

    # Create the worker Auto Scaling Group
    worker_asg = autoscaling.AutoScalingGroup(
        self, "worker-asg",
        vpc=vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
        instance_type=ec2.InstanceType(instance_type_identifier=worker_type),
        machine_image=linux_ami,
        # key_name=key_name,  # Optional when using SSM Session Manager
        user_data=ec2.UserData.custom(user_data_worker),
        desired_capacity=1,
        min_capacity=1,
        max_capacity=10,
        spot_price="0.5")
    # TODO: There is no MetricsCollection in the CDK Auto Scaling Group high-level API yet.
    # You need to enable "Group Metrics Collection" in the EC2 console (Auto Scaling Group - Monitoring tab)
    # for metrics such as GroupDesiredCapacity, GroupInServiceInstances, GroupPendingInstances, etc.

    # worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
    # No SSH access needed since we use Session Manager

    # Assign EC2 policies to use SSM and the CloudWatch agent
    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSSMManagedInstanceCore"))
    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("CloudWatchAgentServerPolicy"))

    # Allow EC2 to access the new DynamoDB table
    ddb_file_list.grant_full_access(jobsender)
    ddb_file_list.grant_full_access(worker_asg)

    # Allow EC2 to access the new SQS queue and its DLQ
    sqs_queue.grant_consume_messages(jobsender)
    sqs_queue.grant_send_messages(jobsender)
    sqs_queue.grant_consume_messages(worker_asg)
    sqs_queue_DLQ.grant_consume_messages(jobsender)

    # Allow EC2 to read SSM Parameter Store for bucket info and credentials
    ssm_bucket_para.grant_read(jobsender)
    ssm_credential_para.grant_read(jobsender)
    ssm_credential_para.grant_read(worker_asg)

    # Allow EC2 to access the new S3 bucket
    # s3bucket.grant_read(jobsender)
    # s3bucket.grant_read(worker_asg)

    # Allow EC2 to read the existing source S3 buckets
    bucket_name = ''
    for b in bucket_para:
        if bucket_name != b['src_bucket']:  # skip duplicates when the same bucket is listed more than once
            bucket_name = b['src_bucket']
            s3exist_bucket = s3.Bucket.from_bucket_name(self,
                                                        bucket_name,  # use the bucket name as the construct id
                                                        bucket_name=bucket_name)
            s3exist_bucket.grant_read(jobsender)
            s3exist_bucket.grant_read(worker_asg)

    # Dashboard to monitor SQS and EC2
    board = cw.Dashboard(self, "s3_migrate", dashboard_name="s3_migrate_cluster")

    ec2_metric_net = cw.Metric(namespace="AWS/EC2",
                               metric_name="NetworkOut",
                               # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
                               period=core.Duration.minutes(1),
                               statistic="Sum")
    ec2_metric_cpu_max = cw.Metric(namespace="AWS/EC2",
                                   metric_name="CPUUtilization",
                                   # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
                                   period=core.Duration.minutes(1),
                                   statistic="Maximum")
    ec2_metric_cpu_avg = cw.Metric(namespace="AWS/EC2",
                                   metric_name="CPUUtilization",
                                   # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
                                   period=core.Duration.minutes(1))

    autoscaling_GroupDesiredCapacity = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupDesiredCapacity",
        dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1))
    autoscaling_GroupInServiceInstances = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupInServiceInstances",
        dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1))
    autoscaling_GroupMinSize = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupMinSize",
        dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1))
    autoscaling_GroupMaxSize = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupMaxSize",
        dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1))

    # CloudWatch agent collected metrics
    cwagent_mem_avg = cw.Metric(namespace="CWAgent",
                                metric_name="mem_used_percent",
                                dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
                                statistic="Average",
                                period=core.Duration.minutes(1))
    cwagent_mem_max = cw.Metric(namespace="CWAgent",
                                metric_name="mem_used_percent",
                                dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
                                statistic="Maximum",
                                period=core.Duration.minutes(1))

    # CloudWatch agent collected application logs - metric filters
    s3_migrate_log = logs.LogGroup(self, "applog", log_group_name="s3_migration_log")
    s3_migrate_log.add_metric_filter("ERROR",
                                     metric_name="ERROR-Logs",
                                     metric_namespace="s3_migrate",
                                     metric_value="1",
                                     filter_pattern=logs.FilterPattern.literal('"ERROR"'))
    s3_migrate_log.add_metric_filter("WARNING",
                                     metric_name="WARNING-Logs",
                                     metric_namespace="s3_migrate",
                                     metric_value="1",
                                     filter_pattern=logs.FilterPattern.literal('"WARNING"'))
    log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                 metric_name="ERROR-Logs",
                                 statistic="Sum",
                                 period=core.Duration.minutes(1))
    log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                   metric_name="WARNING-Logs",
                                   statistic="Sum",
                                   period=core.Duration.minutes(1))

    board.add_widgets(
        cw.GraphWidget(title="EC2-ALL-NETWORK", left=[ec2_metric_net]),
        cw.GraphWidget(title="EC2-ALL-CPU", left=[ec2_metric_cpu_avg, ec2_metric_cpu_max]),
        cw.GraphWidget(title="EC2-AutoscalingGroup-MEMORY", left=[cwagent_mem_max, cwagent_mem_avg]),
        cw.SingleValueWidget(title="EC2-AutoscalingGroup-Capacity",
                             metrics=[autoscaling_GroupDesiredCapacity,
                                      autoscaling_GroupInServiceInstances,
                                      autoscaling_GroupMinSize,
                                      autoscaling_GroupMaxSize],
                             height=6),
    )
    board.add_widgets(
        cw.GraphWidget(title="SQS-Jobs",
                       left=[sqs_queue.metric_approximate_number_of_messages_visible(
                                 period=core.Duration.minutes(1)),
                             sqs_queue.metric_approximate_number_of_messages_not_visible(
                                 period=core.Duration.minutes(1))]),
        cw.GraphWidget(title="SQS-DeadLetterQueue",
                       left=[sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                                 period=core.Duration.minutes(1)),
                             sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                                 period=core.Duration.minutes(1))]),
        cw.GraphWidget(title="ERROR/WARNING Logs",
                       left=[log_metric_ERROR],
                       right=[log_metric_WARNING],
                       height=6),
        cw.SingleValueWidget(title="Running/Waiting and Dead Jobs",
                             metrics=[sqs_queue.metric_approximate_number_of_messages_not_visible(
                                          period=core.Duration.minutes(1)),
                                      sqs_queue.metric_approximate_number_of_messages_visible(
                                          period=core.Duration.minutes(1)),
                                      sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                                          period=core.Duration.minutes(1)),
                                      sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                                          period=core.Duration.minutes(1))],
                             height=6))

    # Scale up when visible messages > 100, evaluated over 3 of 3 x 5-minute periods
    worker_asg.scale_on_metric(
        "scaleup",
        metric=sqs_queue.metric_approximate_number_of_messages_visible(),
        scaling_steps=[autoscaling.ScalingInterval(change=1, lower=100, upper=500),
                       autoscaling.ScalingInterval(change=2, lower=500),
                       autoscaling.ScalingInterval(change=0, upper=100, lower=0)],
        adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY)

    # Alarm for queue empty and EC2 > 1
    # When the queue is empty (no visible or in-flight messages) and more than one EC2 instance is
    # running, raise the alarm and scale the group down to one instance. Adjust to your scenario:
    # if the jobsender also transfers data, the Auto Scaling Group can be set to 0 when there are no jobs.
    metric_all_message = cw.MathExpression(
        expression="IF(((a+b) == 0) AND (c >1), 0, 1)",  # 0 (alarm) when a+b == 0 and c > 1
        label="empty_queue_expression",
        using_metrics={
            "a": sqs_queue.metric_approximate_number_of_messages_visible(),
            "b": sqs_queue.metric_approximate_number_of_messages_not_visible(),
            "c": autoscaling_GroupInServiceInstances
        })
    alarm_0 = cw.Alarm(
        self, "SQSempty",
        alarm_name="s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster",
        metric=metric_all_message,
        threshold=0,
        comparison_operator=cw.ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.NOT_BREACHING)
    alarm_topic_empty = sns.Topic(self, "SQS queue empty and ec2 more than 1 in Cluster")
    # This alarm doubles as the notification that the batch transfer has finished,
    # and it notifies only once instead of repeatedly.
    alarm_topic_empty.add_subscription(subscription=sub.EmailSubscription(alarm_email))
    alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty))

    # If the queue is empty, scale down to 1 EC2 instance
    action_shutdown = autoscaling.StepScalingAction(
        self, "shutdown",
        auto_scaling_group=worker_asg,
        adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY)
    action_shutdown.add_adjustment(adjustment=1, upper_bound=0)
    alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown))

    # While there are messages in the SQS DLQ, alarm to SNS
    alarm_DLQ = cw.Alarm(
        self, "SQS_DLQ",
        alarm_name="s3-migration-cluster-SQS DLQ more than 1 message-Cluster",
        metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(),
        threshold=0,
        comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.IGNORE)
    alarm_topic_DLQ = sns.Topic(self, "SQS DLQ more than 1 message-Cluster")
    alarm_topic_DLQ.add_subscription(subscription=sub.EmailSubscription(alarm_email))
    alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ))

    # Output
    core.CfnOutput(self, "JobSenderEC2", value=jobsender.instance_id)
    core.CfnOutput(self, "WorkerEC2AutoscalingGroup", value=worker_asg.auto_scaling_group_name)
    core.CfnOutput(self, "Dashboard", value="CloudWatch Dashboard name s3_migrate_cluster")
    core.CfnOutput(self, "Alarm", value="CloudWatch SQS queue empty Alarm for cluster: " + alarm_email)
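# The two cluster stacks in this file reference module-level settings (jobsender_type, worker_type,
# linux_ami, user_data_*, alarm_email) that are defined elsewhere in the CDK app. A purely
# illustrative sketch of those settings, assuming they live in the same module; every name, value
# and file path below is a hypothetical placeholder, not taken from this excerpt:
jobsender_type = "t3.micro"
worker_type = "c5.large"
linux_ami = ec2.AmazonLinuxImage(generation=ec2.AmazonLinuxGeneration.AMAZON_LINUX_2,
                                 edition=ec2.AmazonLinuxEdition.STANDARD,
                                 virtualization=ec2.AmazonLinuxVirt.HVM,
                                 storage=ec2.AmazonLinuxStorage.GENERAL_PURPOSE)
alarm_email = "alarm@example.com"
# Userdata shell scripts installed on the instances (hypothetical file names)
with open("./cdk/user_data_jobsender.sh") as f:
    user_data_jobsender = f.read()
with open("./cdk/user_data_worker.sh") as f:
    user_data_worker = f.read()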
def __init__(self, scope: core.Construct, _id: str, vpc, bucket_para,
             # key_name,
             ddb_file_list, sqs_queue, sqs_queue_DLQ,
             ssm_bucket_para, ssm_credential_para,
             s3bucket, s3_deploy,
             **kwargs) -> None:
    super().__init__(scope, _id, **kwargs)

    # Create environment variables for the userdata
    env_var = f'export table_queue_name={ddb_file_list.table_name}\n' \
              f'export sqs_queue_name={sqs_queue.queue_name}\n' \
              f'export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\n'
    env_var_st = f'echo \"export table_queue_name={ddb_file_list.table_name}\" >> /etc/rc.local\n' \
                 f'echo \"export sqs_queue_name={sqs_queue.queue_name}\" >> /etc/rc.local\n' \
                 f'echo \"export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\" >> /etc/rc.local\n'

    # Create the log group and put the group name into the userdata
    s3_migrate_log = logs.LogGroup(self, "applog")
    cw_agent_config['logs']['logs_collected']['files']['collect_list'][0]['log_group_name'] = s3_migrate_log.log_group_name
    cw_agent_config['logs']['logs_collected']['files']['collect_list'][1]['log_group_name'] = s3_migrate_log.log_group_name
    cw_agent_config['metrics']['append_dimensions']['AutoScalingGroupName'] = "\\${aws:AutoScalingGroupName}"
    cw_agent_config['metrics']['append_dimensions']['InstanceId'] = "\\${aws:InstanceId}"
    cw_agent_config_str = json.dumps(cw_agent_config, indent=4).replace("\\\\", "\\")
    userdata_head = user_data_part1 + cw_agent_config_str + user_data_part2 + \
                    s3_deploy.bucket_name + " .\n" + env_var + env_var_st
    jobsender_userdata = userdata_head + user_data_jobsender_p
    worker_userdata = userdata_head + user_data_worker_p

    # Create the jobsender EC2 node (an Auto Scaling Group of at most one instance)
    jobsender = autoscaling.AutoScalingGroup(
        self, "jobsender",
        instance_type=ec2.InstanceType(instance_type_identifier=jobsender_type),
        machine_image=linux_ami,
        # key_name=key_name,
        user_data=ec2.UserData.custom(jobsender_userdata),
        vpc=vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
        desired_capacity=1,
        min_capacity=0,
        max_capacity=1)
    # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
    # No SSH access needed since we use Session Manager

    # Assign EC2 policies to use SSM and the CloudWatch agent
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSSMManagedInstanceCore"))
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("CloudWatchAgentServerPolicy"))
    # jobsender.role.add_managed_policy(
    #     iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"))
    # Don't give the EC2 role full S3 access; that violates the least-privilege security rule

    # Create the worker Auto Scaling Group with a fixed minimum of 2 EC2 hosts
    worker_asg = autoscaling.AutoScalingGroup(
        self, "worker-asg",
        vpc=vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
        instance_type=ec2.InstanceType(instance_type_identifier=worker_type),
        machine_image=linux_ami,
        # key_name=key_name,  # Optional when using SSM Session Manager
        user_data=ec2.UserData.custom(worker_userdata),
        desired_capacity=2,
        min_capacity=2,
        max_capacity=10,
        spot_price="0.5")
    # TODO: There is no MetricsCollection in the CDK Auto Scaling Group high-level API yet.
    # You need to enable "Group Metrics Collection" in the EC2 console (Auto Scaling Group - Monitoring tab)
    # for metrics such as GroupDesiredCapacity, GroupInServiceInstances, GroupPendingInstances, etc.

    # worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
    # No SSH access needed since we use Session Manager

    # Assign EC2 policies to use SSM and the CloudWatch agent
    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSSMManagedInstanceCore"))
    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("CloudWatchAgentServerPolicy"))

    # Allow EC2 to access the new DynamoDB table
    ddb_file_list.grant_full_access(jobsender)
    ddb_file_list.grant_full_access(worker_asg)

    # Allow EC2 to access the new SQS queue and its DLQ
    sqs_queue.grant_consume_messages(jobsender)
    sqs_queue.grant_send_messages(jobsender)
    sqs_queue.grant_consume_messages(worker_asg)
    sqs_queue_DLQ.grant_consume_messages(jobsender)

    # Allow EC2 to read SSM Parameter Store for bucket info and credentials
    ssm_bucket_para.grant_read(jobsender)
    ssm_credential_para.grant_read(jobsender)
    ssm_credential_para.grant_read(worker_asg)

    # Allow EC2 to read the source code from the s3_deploy bucket
    s3_deploy.grant_read(jobsender)
    s3_deploy.grant_read(worker_asg)

    # Allow EC2 to access the new S3 bucket
    s3bucket.grant_read(jobsender)
    s3bucket.grant_read(worker_asg)

    # Allow EC2 to access existing S3 buckets for PUT mode: read-only access to the source buckets
    bucket_name = ''
    for b in bucket_para:
        if bucket_name != b['src_bucket']:  # skip duplicates when the same bucket is listed more than once
            bucket_name = b['src_bucket']
            s3exist_bucket = s3.Bucket.from_bucket_name(self,
                                                        bucket_name,  # use the bucket name as the construct id
                                                        bucket_name=bucket_name)
            s3exist_bucket.grant_read(jobsender)
            s3exist_bucket.grant_read(worker_asg)

    # Allow EC2 to access existing S3 buckets for GET mode: read/write access to the destination buckets
    # bucket_name = ''
    # for b in bucket_para:
    #     if bucket_name != b['des_bucket']:  # skip duplicates when the same bucket is listed more than once
    #         bucket_name = b['des_bucket']
    #         s3exist_bucket = s3.Bucket.from_bucket_name(self,
    #                                                     bucket_name,  # use the bucket name as the construct id
    #                                                     bucket_name=bucket_name)
    #         s3exist_bucket.grant_read_write(jobsender)
    #         s3exist_bucket.grant_read_write(worker_asg)

    # Dashboard to monitor SQS and EC2
    board = cw.Dashboard(self, "s3_migrate")

    ec2_metric_cpu_avg = cw.Metric(namespace="AWS/EC2",
                                   metric_name="CPUUtilization",
                                   dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
                                   period=core.Duration.minutes(1))
    ec2_metric_net_out = cw.MathExpression(
        expression="SEARCH('{AWS/EC2, InstanceId} NetworkOut', 'Average', 60)",
        label="EC2-NetworkOut",
        using_metrics={})

    autoscaling_GroupDesiredCapacity = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupDesiredCapacity",
        dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1))
    autoscaling_GroupInServiceInstances = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupInServiceInstances",
        dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1))
    autoscaling_GroupMinSize = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupMinSize",
        dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1))
    autoscaling_GroupMaxSize = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupMaxSize",
        dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1))

    # CloudWatch agent collected metrics
    cwagent_mem_avg = cw.MathExpression(
        expression="SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName=" +
                   worker_asg.auto_scaling_group_name +
                   " AND MetricName=mem_used_percent)', 'Average', 60)",
        label="mem_avg",
        using_metrics={})
    cwagent_disk_avg = cw.MathExpression(
        expression="SEARCH('{CWAgent, path, InstanceId, AutoScalingGroupName, device, fstype} "
                   "(AutoScalingGroupName=" + worker_asg.auto_scaling_group_name +
                   " AND MetricName=disk_used_percent AND path=\"/\")', 'Average', 60)",
        label="disk_avg",
        using_metrics={})
    cwagent_net_tcp = cw.MathExpression(
        expression="SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName=" +
                   worker_asg.auto_scaling_group_name +
                   " AND MetricName=tcp_established)', 'Average', 60)",
        label="tcp_conn",
        using_metrics={})

    # CloudWatch agent collected application logs - metric filters
    s3_migrate_log.add_metric_filter("Completed-bytes",
                                     metric_name="Completed-bytes",
                                     metric_namespace="s3_migrate",
                                     metric_value="$bytes",
                                     filter_pattern=logs.FilterPattern.literal(
                                         '[date, time, info, hs, p="--->Complete", bytes, key]'))
    s3_migrate_log.add_metric_filter("Uploading-bytes",
                                     metric_name="Uploading-bytes",
                                     metric_namespace="s3_migrate",
                                     metric_value="$bytes",
                                     filter_pattern=logs.FilterPattern.literal(
                                         '[date, time, info, hs, p="--->Uploading", bytes, key]'))
    s3_migrate_log.add_metric_filter("Downloading-bytes",
                                     metric_name="Downloading-bytes",
                                     metric_namespace="s3_migrate",
                                     metric_value="$bytes",
                                     filter_pattern=logs.FilterPattern.literal(
                                         '[date, time, info, hs, p="--->Downloading", bytes, key]'))
    traffic_metric_Complete = cw.Metric(namespace="s3_migrate",
                                        metric_name="Completed-bytes",
                                        statistic="Sum",
                                        period=core.Duration.minutes(1))
    traffic_metric_Upload = cw.Metric(namespace="s3_migrate",
                                      metric_name="Uploading-bytes",
                                      statistic="Sum",
                                      period=core.Duration.minutes(1))
    traffic_metric_Download = cw.Metric(namespace="s3_migrate",
                                        metric_name="Downloading-bytes",
                                        statistic="Sum",
                                        period=core.Duration.minutes(1))
    s3_migrate_log.add_metric_filter("ERROR",
                                     metric_name="ERROR-Logs",
                                     metric_namespace="s3_migrate",
                                     metric_value="1",
                                     filter_pattern=logs.FilterPattern.literal('"ERROR"'))
    s3_migrate_log.add_metric_filter("WARNING",
                                     metric_name="WARNING-Logs",
                                     metric_namespace="s3_migrate",
                                     metric_value="1",
                                     filter_pattern=logs.FilterPattern.literal('"WARNING"'))
    log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                 metric_name="ERROR-Logs",
                                 statistic="Sum",
                                 period=core.Duration.minutes(1))
    log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                   metric_name="WARNING-Logs",
                                   statistic="Sum",
                                   period=core.Duration.minutes(1))

    board.add_widgets(
        cw.GraphWidget(title="S3-MIGRATION-TOTAL-TRAFFIC",
                       left=[traffic_metric_Complete, traffic_metric_Upload, traffic_metric_Download],
                       left_y_axis=cw.YAxisProps(label="Bytes/min", show_units=False)),
        cw.GraphWidget(title="ERROR/WARNING LOGS",
                       left=[log_metric_ERROR],
                       left_y_axis=cw.YAxisProps(label="Count", show_units=False),
                       right=[log_metric_WARNING],
                       right_y_axis=cw.YAxisProps(label="Count", show_units=False)),
        cw.GraphWidget(title="SQS-JOBS",
                       left=[sqs_queue.metric_approximate_number_of_messages_visible(
                                 period=core.Duration.minutes(1)),
                             sqs_queue.metric_approximate_number_of_messages_not_visible(
                                 period=core.Duration.minutes(1))]),
        cw.SingleValueWidget(title="RUNNING, WAITING & DEAD JOBS",
                             metrics=[sqs_queue.metric_approximate_number_of_messages_not_visible(
                                          period=core.Duration.minutes(1)),
                                      sqs_queue.metric_approximate_number_of_messages_visible(
                                          period=core.Duration.minutes(1)),
                                      sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                                          period=core.Duration.minutes(1)),
                                      sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                                          period=core.Duration.minutes(1))],
                             height=6))
    board.add_widgets(
        cw.GraphWidget(title="EC2-AutoscalingGroup-TCP",
                       left=[cwagent_net_tcp],
                       left_y_axis=cw.YAxisProps(label="Count", show_units=False)),
        cw.GraphWidget(title="EC2-AutoscalingGroup-CPU/MEMORY",
                       left=[ec2_metric_cpu_avg, cwagent_mem_avg],
                       left_y_axis=cw.YAxisProps(max=100, min=0, label="%", show_units=False)),
        cw.GraphWidget(title="EC2-AutoscalingGroup-DISK",
                       left=[cwagent_disk_avg],
                       left_y_axis=cw.YAxisProps(max=100, min=0, label="%", show_units=False)),
        cw.SingleValueWidget(title="EC2-AutoscalingGroup-CAPACITY",
                             metrics=[autoscaling_GroupDesiredCapacity,
                                      autoscaling_GroupInServiceInstances,
                                      autoscaling_GroupMinSize,
                                      autoscaling_GroupMaxSize],
                             height=6))
    board.add_widgets(
        cw.GraphWidget(title="EC2-NetworkOut",
                       left=[ec2_metric_net_out],
                       left_y_axis=cw.YAxisProps(label="Bytes/min", show_units=False)))

    # Scale up when visible messages > 100 in 5 mins
    worker_asg.scale_on_metric(
        "scaleup",
        metric=sqs_queue.metric_approximate_number_of_messages_visible(),
        scaling_steps=[autoscaling.ScalingInterval(change=1, lower=100, upper=500),
                       autoscaling.ScalingInterval(change=2, lower=500),
                       autoscaling.ScalingInterval(change=0, upper=100, lower=0)],
        adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY)

    # Alarm for queue empty and EC2 > 1
    # When the queue is empty (no visible or in-flight messages) and more than one EC2 instance is
    # running, raise the alarm and scale the group down to one instance. Adjust to your scenario:
    # if the jobsender also transfers data, the Auto Scaling Group can be set to 0 when there are no jobs.
    metric_all_message = cw.MathExpression(
        expression="IF(((a+b) == 0) AND (c >1), 0, 1)",  # 0 (alarm) when a+b == 0 and c > 1
        label="empty_queue_expression",
        using_metrics={
            "a": sqs_queue.metric_approximate_number_of_messages_visible(),
            "b": sqs_queue.metric_approximate_number_of_messages_not_visible(),
            "c": autoscaling_GroupInServiceInstances
        })
    alarm_0 = cw.Alarm(
        self, "SQSempty",
        alarm_name="s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster",
        metric=metric_all_message,
        threshold=0,
        comparison_operator=cw.ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.NOT_BREACHING)
    alarm_topic_empty = sns.Topic(self, "SQS queue empty and ec2 more than 1 in Cluster")
    # This alarm doubles as the notification that the batch transfer has finished,
    # and it notifies only once instead of repeatedly.
    alarm_topic_empty.add_subscription(subscription=sub.EmailSubscription(alarm_email))
    alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty))

    # If the queue is empty, scale down to 1 EC2 instance
    action_shutdown = autoscaling.StepScalingAction(
        self, "shutdown",
        auto_scaling_group=worker_asg,
        adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY)
    action_shutdown.add_adjustment(adjustment=1, upper_bound=0)
    alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown))

    # While there are messages in the SQS DLQ, alarm to SNS
    alarm_DLQ = cw.Alarm(
        self, "SQS_DLQ",
        alarm_name="s3-migration-cluster-SQS DLQ more than 1 message-Cluster",
        metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(),
        threshold=0,
        comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.IGNORE)
    alarm_topic_DLQ = sns.Topic(self, "SQS DLQ more than 1 message-Cluster")
    alarm_topic_DLQ.add_subscription(subscription=sub.EmailSubscription(alarm_email))
    alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ))

    # Output
    core.CfnOutput(self, "LogGroup", value=s3_migrate_log.log_group_name)
    core.CfnOutput(self, "Dashboard", value="CloudWatch Dashboard name s3_migrate_cluster")
    core.CfnOutput(self, "Alarm", value="CloudWatch SQS queue empty Alarm for cluster: " + alarm_email)
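# cw_agent_config, user_data_part1/part2 and the user_data_*_p userdata tails are defined elsewhere
# in the app. An illustrative minimal shape for cw_agent_config that matches the keys patched above
# and the CWAgent metrics graphed on the dashboard; the file paths and interval are hypothetical
# placeholders, not taken from this excerpt:
cw_agent_config = {
    "agent": {"metrics_collection_interval": 60},
    "metrics": {
        "append_dimensions": {},  # AutoScalingGroupName / InstanceId are patched in at synth time
        "metrics_collected": {
            "mem": {"measurement": ["mem_used_percent"]},
            "disk": {"measurement": ["disk_used_percent"], "resources": ["/"]},
            "netstat": {"measurement": ["tcp_established"]}
        }
    },
    "logs": {
        "logs_collected": {
            "files": {
                "collect_list": [
                    {"file_path": "/home/ec2-user/s3_migration.log", "log_group_name": ""},
                    {"file_path": "/home/ec2-user/s3_migration_error.log", "log_group_name": ""}
                ]
            }
        }
    }
}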
def __init__(self, scope: core.Construct, id: str, stage: str, api: _api_gw.IRestApi,
             fn: _lambda.IFunction, table: _ddb.ITable, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    gw = dict(self.node.try_get_context("gateway"))

    ###
    # Custom Metrics
    ###

    # Gather the % of Lambda invocations that errored in the past 5 mins
    lambda_error_perc = cloud_watch.MathExpression(
        expression="e / i * 100",
        label="% of invocations that errored, last 5 mins",
        using_metrics={
            "i": fn.metric(metric_name="Invocations", statistic="sum"),
            "e": fn.metric(metric_name="Errors", statistic="sum"),
        },
        period=core.Duration.minutes(5))

    # Note: throttled requests are not counted in the total number of invocations
    lambda_throttled_perc = cloud_watch.MathExpression(
        expression="t / (i + t) * 100",
        label="% of throttled requests, last 30 mins",
        using_metrics={
            "i": fn.metric(metric_name="Invocations", statistic="sum"),
            "t": fn.metric(metric_name="Throttles", statistic="sum"),
        },
        period=core.Duration.minutes(5))

    dashboard = cloud_watch.Dashboard(self, id="CloudWatchDashBoard", dashboard_name="Serverlesslens")
    dashboard.add_widgets(
        cloud_watch.GraphWidget(title="Requests",
                                width=8,
                                left=[self.metric_for_api_gw(api_name=gw["gw_name"],
                                                             stage=stage,
                                                             metric_name="Count",
                                                             label="# Requests",
                                                             stat="sum")]),
        cloud_watch.GraphWidget(title="API GW Latency",
                                width=8,
                                stacked=True,
                                left=[self.metric_for_api_gw(api_name=gw["gw_name"], stage=stage,
                                                             metric_name="Latency",
                                                             label="API Latency p50", stat="p50"),
                                      self.metric_for_api_gw(api_name=gw["gw_name"], stage=stage,
                                                             metric_name="Latency",
                                                             label="API Latency p90", stat="p90"),
                                      self.metric_for_api_gw(api_name=gw["gw_name"], stage=stage,
                                                             metric_name="Latency",
                                                             label="API Latency p99", stat="p99")]),
        cloud_watch.GraphWidget(title="API GW Errors",
                                width=8,
                                stacked=True,
                                left=[self.metric_for_api_gw(api_name=gw["gw_name"], stage=stage,
                                                             metric_name="4XXError",
                                                             label="4XX Errors", stat="sum"),
                                      self.metric_for_api_gw(api_name=gw["gw_name"], stage=stage,
                                                             metric_name="5XXError",
                                                             label="5XX Errors", stat="sum")]),
        cloud_watch.GraphWidget(title="Dynamo Lambda Error %",
                                width=8,
                                left=[lambda_error_perc]),
        cloud_watch.GraphWidget(title="Dynamo Lambda Duration",
                                width=8,
                                stacked=True,
                                left=[fn.metric_duration(statistic="p50"),
                                      fn.metric_duration(statistic="p90"),
                                      fn.metric_duration(statistic="p99")]),
        cloud_watch.GraphWidget(title="Dynamo Lambda Throttle %",
                                width=8,
                                left=[lambda_throttled_perc]),
        cloud_watch.GraphWidget(title="DynamoDB Latency",
                                width=8,
                                stacked=True,
                                left=[table.metric_successful_request_latency(
                                          dimensions={"TableName": table.table_name, "Operation": "GetItem"}),
                                      table.metric_successful_request_latency(
                                          dimensions={"TableName": table.table_name, "Operation": "UpdateItem"}),
                                      table.metric_successful_request_latency(
                                          dimensions={"TableName": table.table_name, "Operation": "PutItem"}),
                                      table.metric_successful_request_latency(
                                          dimensions={"TableName": table.table_name, "Operation": "DeleteItem"}),
                                      table.metric_successful_request_latency(
                                          dimensions={"TableName": table.table_name, "Operation": "Query"})]),
        cloud_watch.GraphWidget(title="DynamoDB Consumed Read/Write Units",
                                width=8,
                                stacked=False,
                                left=[table.metric(metric_name="ConsumedReadCapacityUnits"),
                                      table.metric(metric_name="ConsumedWriteCapacityUnits")]),
        cloud_watch.GraphWidget(title="DynamoDB Throttles",
                                width=8,
                                stacked=True,
                                left=[table.metric(metric_name="ReadThrottleEvents", statistic="sum"),
                                      table.metric(metric_name="WriteThrottleEvents", statistic="sum")]),
    )
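# This dashboard stack also calls self.metric_for_api_gw(), which is not defined in this excerpt and
# here takes an API name and stage rather than an API id. A minimal sketch of such a helper, assuming
# REST API metrics in the "AWS/ApiGateway" namespace dimensioned by ApiName and Stage (an assumption,
# not confirmed by this excerpt):
def metric_for_api_gw(self, api_name: str, stage: str, metric_name: str, label: str,
                      stat: str = 'avg') -> cloud_watch.Metric:
    # Build a CloudWatch metric for the given REST API name/stage, display label and statistic
    return cloud_watch.Metric(metric_name=metric_name,
                              namespace="AWS/ApiGateway",
                              dimensions={"ApiName": api_name, "Stage": stage},
                              label=label,
                              statistic=stat,
                              period=core.Duration.minutes(5))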