    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        # -----------------------------------------------------------------------------------------------------------
        # The Simple Webservice Logic - This is what we will be monitoring
        #
        # API GW HTTP API, Lambda Fn and DynamoDB
        # https://github.com/cdk-patterns/serverless/tree/master/the-simple-webservice
        # -----------------------------------------------------------------------------------------------------------

        # DynamoDB Table
        table = dynamo_db.Table(
            self,
            "Hits",
            partition_key=dynamo_db.Attribute(
                name="path", type=dynamo_db.AttributeType.STRING),
            billing_mode=dynamo_db.BillingMode.PAY_PER_REQUEST)

        # defines an AWS Lambda resource
        dynamo_lambda = _lambda.Function(
            self,
            "DynamoLambdaHandler",
            runtime=_lambda.Runtime.NODEJS_12_X,  # execution environment
            handler="lambda.handler",  # file is "lambda", function is "handler"
            code=_lambda.Code.from_asset(
                "lambda_fns"),  # Code loaded from the lambda dir
            environment={'HITS_TABLE_NAME': table.table_name})

        # grant the lambda role read/write permissions to our table
        table.grant_read_write_data(dynamo_lambda)

        # defines an API Gateway Http API resource backed by our "dynamoLambda" function.
        api = api_gw.HttpApi(self,
                             'HttpAPI',
                             default_integration=api_gw.LambdaProxyIntegration(
                                 handler=dynamo_lambda))

        core.CfnOutput(self, 'HTTP API Url', value=api.url)

        # -----------------------------------------------------------------------------------------------------------
        # Monitoring Logic Starts Here
        #
        # This is everything we need to understand the state of our system:
        # - custom metrics
        # - cloudwatch alarms
        # - custom cloudwatch dashboard
        # -----------------------------------------------------------------------------------------------------------

        # SNS Topic so we can hook things into our alerts e.g. email
        error_topic = sns.Topic(self, 'theBigFanTopic')
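        # For example, to receive these alerts by email you could subscribe an address to the topic.
        # (Sketch only - assumes the aws_sns_subscriptions module is imported, e.g. as sns_subscriptions,
        # and uses a placeholder address.)
        # error_topic.add_subscription(sns_subscriptions.EmailSubscription("you@example.com"))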

        ###
        # Custom Metrics
        ###

        api_gw_4xx_error_percentage = cloud_watch.MathExpression(
            expression="m1/m2*100",
            label="% API Gateway 4xx Errors",
            using_metrics={
                "m1":
                self.metric_for_api_gw(api.http_api_id, '4XXError',
                                       '4XX Errors', 'sum'),
                "m2":
                self.metric_for_api_gw(api.http_api_id, 'Count', '# Requests',
                                       'sum'),
            },
            period=core.Duration.minutes(5))

        # Gather the % of lambda invocations that error in past 5 mins
        lambda_error_perc = cloud_watch.MathExpression(
            expression="e / i * 100",
            label="% of invocations that errored, last 5 mins",
            using_metrics={
                "i":
                dynamo_lambda.metric(metric_name="Invocations",
                                     statistic="sum"),
                "e":
                dynamo_lambda.metric(metric_name="Errors", statistic="sum"),
            },
            period=core.Duration.minutes(5))

        # note: throttled requests are not counted in total num of invocations
        lambda_throttled_perc = cloud_watch.MathExpression(
            expression="t / (i + t) * 100",
            label="% of throttled requests, last 30 mins",
            using_metrics={
                "i":
                dynamo_lambda.metric(metric_name="Invocations",
                                     statistic="sum"),
                "t":
                dynamo_lambda.metric(metric_name="Throttles", statistic="sum"),
            },
            period=core.Duration.minutes(5))

        # UserErrors is reported at the account level rather than per table, so when scoped to this
        # table it will effectively always be 0. Merging it with SystemErrors keeps a single aggregate
        # metric that still behaves as a system-errors count.
        dynamo_db_total_errors = cloud_watch.MathExpression(
            expression="m1 + m2",
            label="DynamoDB Errors",
            using_metrics={
                "m1": table.metric_user_errors(),
                "m2": table.metric_system_errors(),
            },
            period=core.Duration.minutes(5))

        # Rather than have 2 alerts, let's create one aggregate metric
        dynamo_db_throttles = cloud_watch.MathExpression(
            expression="m1 + m2",
            label="DynamoDB Throttles",
            using_metrics={
                "m1":
                table.metric(metric_name="ReadThrottleEvents",
                             statistic="sum"),
                "m2":
                table.metric(metric_name="WriteThrottleEvents",
                             statistic="sum"),
            },
            period=core.Duration.minutes(5))
        ###
        # Alarms
        ###

        # Api Gateway

        # 4xx are user errors so a large volume indicates a problem
        cloud_watch.Alarm(self,
                          id="API Gateway 4XX Errors > 1%",
                          metric=api_gw_4xx_error_percentage,
                          threshold=1,
                          evaluation_periods=6,
                          datapoints_to_alarm=1,
                          treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
            .add_alarm_action(actions.SnsAction(error_topic))

        # 5xx are internal server errors so we want 0 of these
        cloud_watch.Alarm(self,
                          id="API Gateway 5XX Errors > 0",
                          metric=self.metric_for_api_gw(api_id=api.http_api_id,
                                                        metric_name="5XXError",
                                                        label="5XX Errors",
                                                        stat="p99"),
                          threshold=0,
                          period=core.Duration.minutes(5),
                          evaluation_periods=6,
                          datapoints_to_alarm=1,
                          treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
            .add_alarm_action(actions.SnsAction(error_topic))

        cloud_watch.Alarm(self,
                          id="API p99 latency alarm >= 1s",
                          metric=self.metric_for_api_gw(api_id=api.http_api_id,
                                                        metric_name="Latency",
                                                        label="API GW Latency",
                                                        stat="p99"),
                          threshold=1000,
                          period=core.Duration.minutes(5),
                          evaluation_periods=6,
                          datapoints_to_alarm=1,
                          treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
            .add_alarm_action(actions.SnsAction(error_topic))

        # Lambda

        # 2% of Dynamo Lambda invocations erroring
        cloud_watch.Alarm(self,
                          id="Dynamo Lambda 2% Error",
                          metric=lambda_error_perc,
                          threshold=2,
                          evaluation_periods=6,
                          datapoints_to_alarm=1,
                          treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
            .add_alarm_action(actions.SnsAction(error_topic))

        # 1% of Lambda invocations taking longer than 1 second
        cloud_watch.Alarm(self,
                          id="Dynamo Lambda p99 Long Duration (>1s)",
                          metric=dynamo_lambda.metric_duration(),
                          period=core.Duration.minutes(5),
                          threshold=1000,
                          evaluation_periods=6,
                          datapoints_to_alarm=1,
                          statistic="p99",
                          treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
            .add_alarm_action(actions.SnsAction(error_topic))

        # 2% of our lambda invocations are throttled
        cloud_watch.Alarm(self,
                          id="Dynamo Lambda 2% Throttled",
                          metric=lambda_throttled_perc,
                          threshold=2,
                          evaluation_periods=6,
                          datapoints_to_alarm=1,
                          treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
            .add_alarm_action(actions.SnsAction(error_topic))

        # DynamoDB

        # DynamoDB reads/writes are being throttled - indicates under-provisioned capacity
        cloud_watch.Alarm(self,
                          id="DynamoDB Table Reads/Writes Throttled",
                          metric=dynamo_db_throttles,
                          threshold=1,
                          evaluation_periods=6,
                          datapoints_to_alarm=1,
                          treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
            .add_alarm_action(actions.SnsAction(error_topic))

        # There should be 0 DynamoDB errors
        cloud_watch.Alarm(self,
                          id="DynamoDB Errors > 0",
                          metric=dynamo_db_total_errors,
                          threshold=0,
                          evaluation_periods=6,
                          datapoints_to_alarm=1,
                          treat_missing_data=cloud_watch.TreatMissingData.NOT_BREACHING) \
            .add_alarm_action(actions.SnsAction(error_topic))

        dashboard = cloud_watch.Dashboard(self, id="CloudWatchDashBoard")
        dashboard.add_widgets(
            cloud_watch.GraphWidget(title="Requests",
                                    width=8,
                                    left=[
                                        self.metric_for_api_gw(
                                            api_id=api.http_api_id,
                                            metric_name="Count",
                                            label="# Requests",
                                            stat="sum")
                                    ]),
            cloud_watch.GraphWidget(
                title="API GW Latency",
                width=8,
                stacked=True,
                left=[
                    self.metric_for_api_gw(api_id=api.http_api_id,
                                           metric_name="Latency",
                                           label="API Latency p50",
                                           stat="p50"),
                    self.metric_for_api_gw(api_id=api.http_api_id,
                                           metric_name="Latency",
                                           label="API Latency p90",
                                           stat="p90"),
                    self.metric_for_api_gw(api_id=api.http_api_id,
                                           metric_name="Latency",
                                           label="API Latency p99",
                                           stat="p99")
                ]),
            cloud_watch.GraphWidget(
                title="API GW Errors",
                width=8,
                stacked=True,
                left=[
                    self.metric_for_api_gw(api_id=api.http_api_id,
                                           metric_name="4XXError",
                                           label="4XX Errors",
                                           stat="sum"),
                    self.metric_for_api_gw(api_id=api.http_api_id,
                                           metric_name="5XXError",
                                           label="5XX Errors",
                                           stat="sum")
                ]),
            cloud_watch.GraphWidget(title="Dynamo Lambda Error %",
                                    width=8,
                                    left=[lambda_error_perc]),
            cloud_watch.GraphWidget(
                title="Dynamo Lambda Duration",
                width=8,
                stacked=True,
                left=[
                    dynamo_lambda.metric_duration(statistic="p50"),
                    dynamo_lambda.metric_duration(statistic="p90"),
                    dynamo_lambda.metric_duration(statistic="p99")
                ]),
            cloud_watch.GraphWidget(title="Dynamo Lambda Throttle %",
                                    width=8,
                                    left=[lambda_throttled_perc]),
            cloud_watch.GraphWidget(
                title="DynamoDB Latency",
                width=8,
                stacked=True,
                left=[
                    table.metric_successful_request_latency(
                        dimensions={
                            "TableName": table.table_name,
                            "Operation": "GetItem"
                        }),
                    table.metric_successful_request_latency(
                        dimensions={
                            "TableName": table.table_name,
                            "Operation": "UpdateItem"
                        }),
                    table.metric_successful_request_latency(
                        dimensions={
                            "TableName": table.table_name,
                            "Operation": "PutItem"
                        }),
                    table.metric_successful_request_latency(
                        dimensions={
                            "TableName": table.table_name,
                            "Operation": "DeleteItem"
                        }),
                    table.metric_successful_request_latency(
                        dimensions={
                            "TableName": table.table_name,
                            "Operation": "Query"
                        }),
                ]),
            cloud_watch.GraphWidget(
                title="DynamoDB Consumed Read/Write Units",
                width=8,
                stacked=False,
                left=[
                    table.metric(metric_name="ConsumedReadCapacityUnits"),
                    table.metric(metric_name="ConsumedWriteCapacityUnits")
                ]),
            cloud_watch.GraphWidget(
                title="DynamoDB Throttles",
                width=8,
                stacked=True,
                left=[
                    table.metric(metric_name="ReadThrottleEvents",
                                 statistic="sum"),
                    table.metric(metric_name="WriteThrottleEvents",
                                 statistic="sum")
                ]),
        )
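
    # The alarms and dashboard above rely on a small helper that is not part of this snippet. A minimal
    # sketch is shown below, assuming HTTP API metrics are reported in the "AWS/ApiGateway" namespace
    # with an "ApiId" dimension; adjust the namespace or dimensions if your account reports them differently.
    def metric_for_api_gw(self, api_id: str, metric_name: str, label: str, stat: str = 'avg') -> cloud_watch.Metric:
        return cloud_watch.Metric(namespace="AWS/ApiGateway",
                                  metric_name=metric_name,
                                  dimensions={"ApiId": api_id},
                                  label=label,
                                  statistic=stat,
                                  period=core.Duration.minutes(5))

# Example #2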
    def __init__(
            self,
            scope: core.Construct,
            _id: str,
            vpc,
            bucket_para,
            # key_name,
            ddb_file_list,
            sqs_queue,
            sqs_queue_DLQ,
            ssm_bucket_para,
            ssm_credential_para,
            # s3bucket,
            **kwargs) -> None:
        super().__init__(scope, _id, **kwargs)
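
        # Note: this construct references module-level values defined elsewhere in the stack file
        # (linux_ami, jobsender_type, worker_type, user_data_jobsender, user_data_worker, alarm_email).
        # A plausible sketch of the AMI and instance-type values, purely as an assumption for readability:
        # linux_ami = ec2.AmazonLinuxImage(generation=ec2.AmazonLinuxGeneration.AMAZON_LINUX_2)
        # jobsender_type, worker_type = "t3.micro", "c5.large"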

        # Create jobsender ec2 node
        jobsender = ec2.Instance(
            self,
            "jobsender",
            instance_name="s3_migrate_cluster_jobsender",
            instance_type=ec2.InstanceType(
                instance_type_identifier=jobsender_type),
            machine_image=linux_ami,
            # key_name=key_name,
            user_data=ec2.UserData.custom(user_data_jobsender),
            vpc=vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC))

        # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
        # Don't need SSH since we use Session Manager

        # Assign EC2 Policy to use SSM and CWAgent
        jobsender.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "AmazonSSMManagedInstanceCore"))
        jobsender.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "CloudWatchAgentServerPolicy"))

        # jobsender.role.add_managed_policy(
        #     iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"))
        # Don't grant the EC2 role full S3 access; that would violate least-privilege

        # Create Auto Scaling Group of worker nodes (desired/min 1, scaling up to 10)
        worker_asg = autoscaling.AutoScalingGroup(
            self,
            "worker-asg",
            vpc=vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
            instance_type=ec2.InstanceType(
                instance_type_identifier=worker_type),
            machine_image=linux_ami,
            # key_name=key_name,  # Optional if use SSM-SessionManager
            user_data=ec2.UserData.custom(user_data_worker),
            desired_capacity=1,
            min_capacity=1,
            max_capacity=10,
            spot_price="0.5")

        # TODO: The CDK autoscaling high-level API does not expose MetricsCollection yet.
        # You need to enable "Group Metrics Collection" in the EC2 Console (Auto Scaling Group - Monitoring tab)
        # for metrics such as GroupDesiredCapacity, GroupInServiceInstances, GroupPendingInstances, etc.
        # A possible L1-level workaround is sketched just below.
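
        # Workaround sketch (not part of the original code): drop down to the L1 CfnAutoScalingGroup and
        # enable group metrics there. Assumes the ASG construct's default child is the CfnAutoScalingGroup.
        # cfn_asg = worker_asg.node.default_child  # autoscaling.CfnAutoScalingGroup
        # cfn_asg.metrics_collection = [
        #     autoscaling.CfnAutoScalingGroup.MetricsCollectionProperty(
        #         granularity="1Minute",
        #         metrics=["GroupDesiredCapacity", "GroupInServiceInstances", "GroupPendingInstances"])]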

        # worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
        # Don't need SSH since we use Session Manager

        # Assign EC2 Policy to use SSM and CWAgent
        worker_asg.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "AmazonSSMManagedInstanceCore"))
        worker_asg.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "CloudWatchAgentServerPolicy"))

        # Allow EC2 access new DynamoDB Table
        ddb_file_list.grant_full_access(jobsender)
        ddb_file_list.grant_full_access(worker_asg)

        # Allow EC2 access new sqs and its DLQ
        sqs_queue.grant_consume_messages(jobsender)
        sqs_queue.grant_send_messages(jobsender)
        sqs_queue.grant_consume_messages(worker_asg)
        sqs_queue_DLQ.grant_consume_messages(jobsender)

        # Allow EC2 to access SSM Parameter Store to read the bucket info and credentials
        ssm_bucket_para.grant_read(jobsender)
        ssm_credential_para.grant_read(jobsender)
        ssm_credential_para.grant_read(worker_asg)

        # Allow EC2 access new s3 bucket
        # s3bucket.grant_read(jobsender)
        # s3bucket.grant_read(worker_asg)

        # Allow EC2 to read the existing source S3 buckets
        bucket_name = ''
        for b in bucket_para:
            if bucket_name != b['src_bucket']:  # skip if the same bucket is listed more than once
                bucket_name = b['src_bucket']
                s3exist_bucket = s3.Bucket.from_bucket_name(
                    self,
                    bucket_name,  # use the bucket name as the construct id
                    bucket_name=bucket_name)
                s3exist_bucket.grant_read(jobsender)
                s3exist_bucket.grant_read(worker_asg)

        # Dashboard to monitor SQS and EC2
        board = cw.Dashboard(self,
                             "s3_migrate",
                             dashboard_name="s3_migrate_cluster")

        ec2_metric_net = cw.Metric(
            namespace="AWS/EC2",
            metric_name="NetworkOut",
            # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
            period=core.Duration.minutes(1),
            statistic="Sum")
        ec2_metric_cpu_max = cw.Metric(
            namespace="AWS/EC2",
            metric_name="CPUUtilization",
            # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
            period=core.Duration.minutes(1),
            statistic="Maximum")
        ec2_metric_cpu_avg = cw.Metric(
            namespace="AWS/EC2",
            metric_name="CPUUtilization",
            # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
            period=core.Duration.minutes(1))

        autoscaling_GroupDesiredCapacity = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupDesiredCapacity",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupInServiceInstances = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupInServiceInstances",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupMinSize = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupMinSize",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupMaxSize = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupMaxSize",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        # CWAgent collected metric
        cwagent_mem_avg = cw.Metric(namespace="CWAgent",
                                    metric_name="mem_used_percent",
                                    dimensions={
                                        "AutoScalingGroupName":
                                        worker_asg.auto_scaling_group_name
                                    },
                                    statistic="Average",
                                    period=core.Duration.minutes(1))
        cwagent_mem_max = cw.Metric(namespace="CWAgent",
                                    metric_name="mem_used_percent",
                                    dimensions={
                                        "AutoScalingGroupName":
                                        worker_asg.auto_scaling_group_name
                                    },
                                    statistic="Maximum",
                                    period=core.Duration.minutes(1))

        # CWAgent collected application logs - filter metric
        s3_migrate_log = logs.LogGroup(self,
                                       "applog",
                                       log_group_name="s3_migration_log")
        s3_migrate_log.add_metric_filter(
            "ERROR",
            metric_name="ERROR-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal('"ERROR"'))
        s3_migrate_log.add_metric_filter(
            "WARNING",
            metric_name="WARNING-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal('"WARNING"'))
        log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                     metric_name="ERROR-Logs",
                                     statistic="Sum",
                                     period=core.Duration.minutes(1))
        log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                       metric_name="WARNING-Logs",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))

        board.add_widgets(
            cw.GraphWidget(title="EC2-ALL-NETWORK", left=[ec2_metric_net]),
            cw.GraphWidget(title="EC2-ALL-CPU",
                           left=[ec2_metric_cpu_avg, ec2_metric_cpu_max]),
            cw.GraphWidget(title="EC2-AutoscalingGroup-MEMORY",
                           left=[cwagent_mem_max, cwagent_mem_avg]),
            cw.SingleValueWidget(title="EC2-AutoscalingGroup-Capacity",
                                 metrics=[
                                     autoscaling_GroupDesiredCapacity,
                                     autoscaling_GroupInServiceInstances,
                                     autoscaling_GroupMinSize,
                                     autoscaling_GroupMaxSize
                                 ],
                                 height=6),
        )

        board.add_widgets(
            cw.GraphWidget(
                title="SQS-Jobs",
                left=[
                    sqs_queue.metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1))
                ]),
            cw.GraphWidget(
                title="SQS-DeadLetterQueue",
                left=[
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1))
                ]),
            cw.GraphWidget(title="ERROR/WARNING Logs",
                           left=[log_metric_ERROR],
                           right=[log_metric_WARNING],
                           height=6),
            cw.SingleValueWidget(
                title="Running/Waiting and Death Jobs",
                metrics=[
                    sqs_queue.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue.metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1))
                ],
                height=6))

        # Scale up when the number of visible messages > 100, evaluated over 3 of 3 x 5-minute periods
        worker_asg.scale_on_metric(
            "scaleup",
            metric=sqs_queue.metric_approximate_number_of_messages_visible(),
            scaling_steps=[
                autoscaling.ScalingInterval(change=1, lower=100, upper=500),
                autoscaling.ScalingInterval(change=2, lower=500),
                autoscaling.ScalingInterval(change=0, upper=100, lower=0)
            ],
            adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY)

        # Alarm for queue empty and ec2 > 1
        # Alarm when the queue is empty (no visible or in-flight messages) and more than one EC2 instance
        # is running, then scale the fleet down to 1 instance.
        # This can be adapted to your scenario: if the jobsender also transfers data, you could instead
        # set the Auto Scaling Group to 0 when there are no jobs.
        metric_all_message = cw.MathExpression(
            expression="IF(((a+b) == 0) AND (c >1), 0, 1)",  # a+b且c>1则设置为0,告警
            label="empty_queue_expression",
            using_metrics={
                "a": sqs_queue.metric_approximate_number_of_messages_visible(),
                "b":
                sqs_queue.metric_approximate_number_of_messages_not_visible(),
                "c": autoscaling_GroupInServiceInstances
            })
        alarm_0 = cw.Alarm(
            self,
            "SQSempty",
            alarm_name=
            "s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster",
            metric=metric_all_message,
            threshold=0,
            comparison_operator=cw.ComparisonOperator.
            LESS_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=3,
            datapoints_to_alarm=3,
            treat_missing_data=cw.TreatMissingData.NOT_BREACHING)
        alarm_topic_empty = sns.Topic(
            self, "SQS queue empty and ec2 more than 1 in Cluster")
        # This alarm can double as the notification that a batch transfer has finished; it fires only once
        # instead of notifying repeatedly.
        alarm_topic_empty.add_subscription(
            subscription=sub.EmailSubscription(alarm_email))
        alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty))

        # If queue empty, set autoscale down to 1 EC2
        action_shutdown = autoscaling.StepScalingAction(
            self,
            "shutdown",
            auto_scaling_group=worker_asg,
            adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY)
        action_shutdown.add_adjustment(adjustment=1, upper_bound=0)
        alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown))

        # When messages appear in the SQS DLQ, alarm to SNS
        alarm_DLQ = cw.Alarm(
            self,
            "SQS_DLQ",
            alarm_name=
            "s3-migration-cluster-SQS DLQ more than 1 message-Cluster",
            metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
            ),
            threshold=0,
            comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=3,
            datapoints_to_alarm=3,
            treat_missing_data=cw.TreatMissingData.IGNORE)
        alarm_topic_DLQ = sns.Topic(self,
                                    "SQS DLQ more than 1 message-Cluster")
        alarm_topic_DLQ.add_subscription(
            subscription=sub.EmailSubscription(alarm_email))
        alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ))

        # Output
        core.CfnOutput(self, "JobSenderEC2", value=jobsender.instance_id)
        core.CfnOutput(self,
                       "WorkerEC2AutoscalingGroup",
                       value=worker_asg.auto_scaling_group_name)
        core.CfnOutput(self,
                       "Dashboard",
                       value="CloudWatch Dashboard name s3_migrate_cluster")
        core.CfnOutput(self,
                       "Alarm",
                       value="CloudWatch SQS queue empty Alarm for cluster: " +
                       alarm_email)
# Example #3
    def __init__(
            self,
            scope: core.Construct,
            _id: str,
            vpc,
            bucket_para,
            # key_name,
            ddb_file_list,
            sqs_queue,
            sqs_queue_DLQ,
            ssm_bucket_para,
            ssm_credential_para,
            s3bucket,
            s3_deploy,
            **kwargs) -> None:
        super().__init__(scope, _id, **kwargs)

        # Create environment variables to inject into the userdata
        env_var = f'export table_queue_name={ddb_file_list.table_name}\n' \
                  f'export sqs_queue_name={sqs_queue.queue_name}\n' \
                  f'export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\n'
        env_var_st = f'echo \"export table_queue_name={ddb_file_list.table_name}\" >> /etc/rc.local\n' \
                     f'echo \"export sqs_queue_name={sqs_queue.queue_name}\" >> /etc/rc.local\n' \
                     f'echo \"export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\" >> /etc/rc.local\n'
        # Create log group and put group name into userdata
        s3_migrate_log = logs.LogGroup(self, "applog")
        cw_agent_config['logs']['logs_collected']['files']['collect_list'][0][
            'log_group_name'] = s3_migrate_log.log_group_name
        cw_agent_config['logs']['logs_collected']['files']['collect_list'][1][
            'log_group_name'] = s3_migrate_log.log_group_name
        cw_agent_config['metrics']['append_dimensions'][
            'AutoScalingGroupName'] = "\\${aws:AutoScalingGroupName}"
        cw_agent_config['metrics']['append_dimensions'][
            'InstanceId'] = "\\${aws:InstanceId}"
        cw_agent_config_str = json.dumps(cw_agent_config,
                                         indent=4).replace("\\\\", "\\")
        userdata_head = user_data_part1 + cw_agent_config_str + user_data_part2 + \
                        s3_deploy.bucket_name + " .\n" + env_var + env_var_st
        jobsender_userdata = userdata_head + user_data_jobsender_p
        worker_userdata = userdata_head + user_data_worker_p
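
        # For reference, the module-level cw_agent_config dict mutated above is the CloudWatch Agent JSON
        # configuration loaded elsewhere in the project. A rough sketch of the shape assumed here (the file
        # paths are placeholders, not the original values):
        # cw_agent_config = {
        #     "logs": {"logs_collected": {"files": {"collect_list": [
        #         {"file_path": "/path/to/s3_migration_application.log", "log_group_name": ""},
        #         {"file_path": "/path/to/s3_migration_error.log", "log_group_name": ""}]}}},
        #     "metrics": {"append_dimensions": {}, "metrics_collected": {"mem": {}, "disk": {}}}
        # }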

        # Create jobsender node (an Auto Scaling Group with a single instance)
        jobsender = autoscaling.AutoScalingGroup(
            self,
            "jobsender",
            instance_type=ec2.InstanceType(
                instance_type_identifier=jobsender_type),
            machine_image=linux_ami,
            # key_name=key_name,
            user_data=ec2.UserData.custom(jobsender_userdata),
            vpc=vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
            desired_capacity=1,
            min_capacity=0,
            max_capacity=1)

        # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
        # Don't need SSH since we use Session Manager

        # Assign EC2 Policy to use SSM and CWAgent
        jobsender.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "AmazonSSMManagedInstanceCore"))
        jobsender.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "CloudWatchAgentServerPolicy"))

        # jobsender.role.add_managed_policy(
        #     iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"))
        # Don't grant the EC2 role full S3 access; that would violate least-privilege

        # Create Auto Scaling Group of worker nodes (starts with 2 hosts, scales up to 10)
        worker_asg = autoscaling.AutoScalingGroup(
            self,
            "worker-asg",
            vpc=vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
            instance_type=ec2.InstanceType(
                instance_type_identifier=worker_type),
            machine_image=linux_ami,
            # key_name=key_name,  # Optional if use SSM-SessionManager
            user_data=ec2.UserData.custom(worker_userdata),
            desired_capacity=2,
            min_capacity=2,
            max_capacity=10,
            spot_price="0.5")

        # TODO: The CDK autoscaling high-level API does not expose MetricsCollection yet.
        # You need to enable "Group Metrics Collection" in the EC2 Console (Auto Scaling Group - Monitoring tab)
        # for metrics such as GroupDesiredCapacity, GroupInServiceInstances, GroupPendingInstances, etc.

        # worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
        # Don't need SSH since we use Session Manager

        # Assign EC2 Policy to use SSM and CWAgent
        worker_asg.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "AmazonSSMManagedInstanceCore"))
        worker_asg.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "CloudWatchAgentServerPolicy"))

        # Allow EC2 access new DynamoDB Table
        ddb_file_list.grant_full_access(jobsender)
        ddb_file_list.grant_full_access(worker_asg)

        # Allow EC2 access new sqs and its DLQ
        sqs_queue.grant_consume_messages(jobsender)
        sqs_queue.grant_send_messages(jobsender)
        sqs_queue.grant_consume_messages(worker_asg)
        sqs_queue_DLQ.grant_consume_messages(jobsender)

        # Allow EC2 to access SSM Parameter Store to read the bucket info and credentials
        ssm_bucket_para.grant_read(jobsender)
        ssm_credential_para.grant_read(jobsender)
        ssm_credential_para.grant_read(worker_asg)

        # Allow EC2 access source code on s3_deploy bucket
        s3_deploy.grant_read(jobsender)
        s3_deploy.grant_read(worker_asg)

        # Allow EC2 access new s3 bucket
        s3bucket.grant_read(jobsender)
        s3bucket.grant_read(worker_asg)

        # Allow EC2 to access existing S3 buckets for PUT mode: read-only access to the source buckets
        bucket_name = ''
        for b in bucket_para:
            if bucket_name != b['src_bucket']:  # skip if the same bucket is listed more than once
                bucket_name = b['src_bucket']
                s3exist_bucket = s3.Bucket.from_bucket_name(
                    self,
                    bucket_name,  # use the bucket name as the construct id
                    bucket_name=bucket_name)
                s3exist_bucket.grant_read(jobsender)
                s3exist_bucket.grant_read(worker_asg)
        # Allow EC2 to access existing S3 buckets for GET mode: read/write access to the destination buckets
        # bucket_name = ''
        # for b in bucket_para:
        #     if bucket_name != b['des_bucket']:  # skip if the same bucket is listed more than once
        #         bucket_name = b['des_bucket']
        #         s3exist_bucket = s3.Bucket.from_bucket_name(self,
        #                                                     bucket_name,  # use the bucket name as the construct id
        #                                                     bucket_name=bucket_name)
        #         s3exist_bucket.grant_read_write(jobsender)
        #         s3exist_bucket.grant_read_write(worker_asg)

        # Dashboard to monitor SQS and EC2
        board = cw.Dashboard(self, "s3_migrate")

        ec2_metric_cpu_avg = cw.Metric(namespace="AWS/EC2",
                                       metric_name="CPUUtilization",
                                       dimensions={
                                           "AutoScalingGroupName":
                                           worker_asg.auto_scaling_group_name
                                       },
                                       period=core.Duration.minutes(1))

        ec2_metric_net_out = cw.MathExpression(
            expression=
            "SEARCH('{AWS/EC2, InstanceId} NetworkOut', 'Average', 60)",
            label="EC2-NetworkOut",
            using_metrics={})

        autoscaling_GroupDesiredCapacity = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupDesiredCapacity",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupInServiceInstances = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupInServiceInstances",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupMinSize = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupMinSize",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupMaxSize = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupMaxSize",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))

        # CWAgent collected metric
        cwagent_mem_avg = cw.MathExpression(
            expression=
            "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName="
            + worker_asg.auto_scaling_group_name +
            " AND MetricName=mem_used_percent)', 'Average', 60)",
            label="mem_avg",
            using_metrics={})
        cwagent_disk_avg = cw.MathExpression(
            expression=
            "SEARCH('{CWAgent, path, InstanceId, AutoScalingGroupName, device, fstype} "
            "(AutoScalingGroupName=" + worker_asg.auto_scaling_group_name +
            " AND MetricName=disk_used_percent AND path=\"/\")', 'Average', 60)",
            label="disk_avg",
            using_metrics={})
        cwagent_net_tcp = cw.MathExpression(
            expression=
            "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName="
            + worker_asg.auto_scaling_group_name +
            " AND MetricName=tcp_established)', 'Average', 60)",
            label="tcp_conn",
            using_metrics={})

        # CWAgent collected application logs - filter metric
        s3_migrate_log.add_metric_filter(
            "Completed-bytes",
            metric_name="Completed-bytes",
            metric_namespace="s3_migrate",
            metric_value="$bytes",
            filter_pattern=logs.FilterPattern.literal(
                '[date, time, info, hs, p="--->Complete", bytes, key]'))
        s3_migrate_log.add_metric_filter(
            "Uploading-bytes",
            metric_name="Uploading-bytes",
            metric_namespace="s3_migrate",
            metric_value="$bytes",
            filter_pattern=logs.FilterPattern.literal(
                '[date, time, info, hs, p="--->Uploading", bytes, key]'))
        s3_migrate_log.add_metric_filter(
            "Downloading-bytes",
            metric_name="Downloading-bytes",
            metric_namespace="s3_migrate",
            metric_value="$bytes",
            filter_pattern=logs.FilterPattern.literal(
                '[date, time, info, hs, p="--->Downloading", bytes, key]'))
        traffic_metric_Complete = cw.Metric(namespace="s3_migrate",
                                            metric_name="Completed-bytes",
                                            statistic="Sum",
                                            period=core.Duration.minutes(1))
        traffic_metric_Upload = cw.Metric(namespace="s3_migrate",
                                          metric_name="Uploading-bytes",
                                          statistic="Sum",
                                          period=core.Duration.minutes(1))
        traffic_metric_Download = cw.Metric(namespace="s3_migrate",
                                            metric_name="Downloading-bytes",
                                            statistic="Sum",
                                            period=core.Duration.minutes(1))
        s3_migrate_log.add_metric_filter(
            "ERROR",
            metric_name="ERROR-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal('"ERROR"'))
        s3_migrate_log.add_metric_filter(
            "WARNING",
            metric_name="WARNING-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal('"WARNING"'))
        log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                     metric_name="ERROR-Logs",
                                     statistic="Sum",
                                     period=core.Duration.minutes(1))
        log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                       metric_name="WARNING-Logs",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))

        board.add_widgets(
            cw.GraphWidget(title="S3-MIGRATION-TOTAL-TRAFFIC",
                           left=[
                               traffic_metric_Complete, traffic_metric_Upload,
                               traffic_metric_Download
                           ],
                           left_y_axis=cw.YAxisProps(label="Bytes/min",
                                                     show_units=False)),
            cw.GraphWidget(title="ERROR/WARNING LOGS",
                           left=[log_metric_ERROR],
                           left_y_axis=cw.YAxisProps(label="Count",
                                                     show_units=False),
                           right=[log_metric_WARNING],
                           right_y_axis=cw.YAxisProps(label="Count",
                                                      show_units=False)),
            cw.GraphWidget(
                title="SQS-JOBS",
                left=[
                    sqs_queue.metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1))
                ]),
            cw.SingleValueWidget(
                title="RUNNING, WAITING & DEATH JOBS",
                metrics=[
                    sqs_queue.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue.metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1))
                ],
                height=6))

        board.add_widgets(
            cw.GraphWidget(title="EC2-AutoscalingGroup-TCP",
                           left=[cwagent_net_tcp],
                           left_y_axis=cw.YAxisProps(label="Count",
                                                     show_units=False)),
            cw.GraphWidget(title="EC2-AutoscalingGroup-CPU/MEMORY",
                           left=[ec2_metric_cpu_avg, cwagent_mem_avg],
                           left_y_axis=cw.YAxisProps(max=100,
                                                     min=0,
                                                     label="%",
                                                     show_units=False)),
            cw.GraphWidget(title="EC2-AutoscalingGroup-DISK",
                           left=[cwagent_disk_avg],
                           left_y_axis=cw.YAxisProps(max=100,
                                                     min=0,
                                                     label="%",
                                                     show_units=False)),
            cw.SingleValueWidget(title="EC2-AutoscalingGroup-CAPACITY",
                                 metrics=[
                                     autoscaling_GroupDesiredCapacity,
                                     autoscaling_GroupInServiceInstances,
                                     autoscaling_GroupMinSize,
                                     autoscaling_GroupMaxSize
                                 ],
                                 height=6))
        board.add_widgets(
            cw.GraphWidget(title="EC2-NetworkOut",
                           left=[ec2_metric_net_out],
                           left_y_axis=cw.YAxisProps(label="Bytes/min",
                                                     show_units=False)))

        # Scale up when the number of visible messages > 100 in 5 mins
        worker_asg.scale_on_metric(
            "scaleup",
            metric=sqs_queue.metric_approximate_number_of_messages_visible(),
            scaling_steps=[
                autoscaling.ScalingInterval(change=1, lower=100, upper=500),
                autoscaling.ScalingInterval(change=2, lower=500),
                autoscaling.ScalingInterval(change=0, upper=100, lower=0)
            ],
            adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY)

        # Alarm for queue empty and ec2 > 1
        # Alarm when the queue is empty (no visible or in-flight messages) and more than one EC2 instance
        # is running, then scale the fleet down to 1 instance.
        # This can be adapted to your scenario: if the jobsender also transfers data, you could instead
        # set the Auto Scaling Group to 0 when there are no jobs.
        metric_all_message = cw.MathExpression(
            expression="IF(((a+b) == 0) AND (c >1), 0, 1)",  # a+b且c>1则设置为0,告警
            label="empty_queue_expression",
            using_metrics={
                "a": sqs_queue.metric_approximate_number_of_messages_visible(),
                "b":
                sqs_queue.metric_approximate_number_of_messages_not_visible(),
                "c": autoscaling_GroupInServiceInstances
            })
        alarm_0 = cw.Alarm(
            self,
            "SQSempty",
            alarm_name=
            "s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster",
            metric=metric_all_message,
            threshold=0,
            comparison_operator=cw.ComparisonOperator.
            LESS_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=3,
            datapoints_to_alarm=3,
            treat_missing_data=cw.TreatMissingData.NOT_BREACHING)
        alarm_topic_empty = sns.Topic(
            self, "SQS queue empty and ec2 more than 1 in Cluster")
        # This alarm can double as the notification that a batch transfer has finished; it fires only once
        # instead of notifying repeatedly.
        alarm_topic_empty.add_subscription(
            subscription=sub.EmailSubscription(alarm_email))
        alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty))

        # If queue empty, set autoscale down to 1 EC2
        action_shutdown = autoscaling.StepScalingAction(
            self,
            "shutdown",
            auto_scaling_group=worker_asg,
            adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY)
        action_shutdown.add_adjustment(adjustment=1, upper_bound=0)
        alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown))

        # When messages appear in the SQS DLQ, alarm to SNS
        alarm_DLQ = cw.Alarm(
            self,
            "SQS_DLQ",
            alarm_name=
            "s3-migration-cluster-SQS DLQ more than 1 message-Cluster",
            metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
            ),
            threshold=0,
            comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=3,
            datapoints_to_alarm=3,
            treat_missing_data=cw.TreatMissingData.IGNORE)
        alarm_topic_DLQ = sns.Topic(self,
                                    "SQS DLQ more than 1 message-Cluster")
        alarm_topic_DLQ.add_subscription(
            subscription=sub.EmailSubscription(alarm_email))
        alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ))

        # Output
        core.CfnOutput(self, "LogGroup", value=s3_migrate_log.log_group_name)
        core.CfnOutput(self,
                       "Dashboard",
                       value="CloudWatch Dashboard name s3_migrate_cluster")
        core.CfnOutput(self,
                       "Alarm",
                       value="CloudWatch SQS queue empty Alarm for cluster: " +
                       alarm_email)
# Example #4
    def __init__(self, scope: core.Construct, id: str, stage: str,
                 api: _api_gw.IRestApi, fn: _lambda.IFunction,
                 table: _ddb.ITable, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        gw = dict(self.node.try_get_context("gateway"))

        ###
        # Custom Metrics
        ###

        # Gather the % of lambda invocations that error in past 5 mins
        lambda_error_perc = cloud_watch.MathExpression(
            expression="e / i * 100",
            label="% of invocations that errored, last 5 mins",
            using_metrics={
                "i": fn.metric(metric_name="Invocations", statistic="sum"),
                "e": fn.metric(metric_name="Errors", statistic="sum"),
            },
            period=core.Duration.minutes(5))

        # note: throttled requests are not counted in total num of invocations
        lambda_throttled_perc = cloud_watch.MathExpression(
            expression="t / (i + t) * 100",
            label="% of throttled requests, last 30 mins",
            using_metrics={
                "i": fn.metric(metric_name="Invocations", statistic="sum"),
                "t": fn.metric(metric_name="Throttles", statistic="sum"),
            },
            period=core.Duration.minutes(5))

        dashboard = cloud_watch.Dashboard(self,
                                          id="CloudWatchDashBoard",
                                          dashboard_name="Serverlesslens")

        dashboard.add_widgets(
            cloud_watch.GraphWidget(title="Requests",
                                    width=8,
                                    left=[
                                        self.metric_for_api_gw(
                                            api_name=gw["gw_name"],
                                            stage=stage,
                                            metric_name="Count",
                                            label="# Requests",
                                            stat="sum")
                                    ]),
            cloud_watch.GraphWidget(
                title="API GW Latency",
                width=8,
                stacked=True,
                left=[
                    self.metric_for_api_gw(api_name=gw["gw_name"],
                                           stage=stage,
                                           metric_name="Latency",
                                           label="API Latency p50",
                                           stat="p50"),
                    self.metric_for_api_gw(api_name=gw["gw_name"],
                                           stage=stage,
                                           metric_name="Latency",
                                           label="API Latency p90",
                                           stat="p90"),
                    self.metric_for_api_gw(api_name=gw["gw_name"],
                                           stage=stage,
                                           metric_name="Latency",
                                           label="API Latency p99",
                                           stat="p99")
                ]),
            cloud_watch.GraphWidget(
                title="API GW Errors",
                width=8,
                stacked=True,
                left=[
                    self.metric_for_api_gw(api_name=gw["gw_name"],
                                           stage=stage,
                                           metric_name="4XXError",
                                           label="4XX Errors",
                                           stat="sum"),
                    self.metric_for_api_gw(api_name=gw["gw_name"],
                                           stage=stage,
                                           metric_name="5XXError",
                                           label="5XX Errors",
                                           stat="sum")
                ]),
            cloud_watch.GraphWidget(title="Dynamo Lambda Error %",
                                    width=8,
                                    left=[lambda_error_perc]),
            cloud_watch.GraphWidget(title="Dynamo Lambda Duration",
                                    width=8,
                                    stacked=True,
                                    left=[
                                        fn.metric_duration(statistic="p50"),
                                        fn.metric_duration(statistic="p90"),
                                        fn.metric_duration(statistic="p99")
                                    ]),
            cloud_watch.GraphWidget(title="Dynamo Lambda Throttle %",
                                    width=8,
                                    left=[lambda_throttled_perc]),
            cloud_watch.GraphWidget(
                title="DynamoDB Latency",
                width=8,
                stacked=True,
                left=[
                    table.metric_successful_request_latency(
                        dimensions={
                            "TableName": table.table_name,
                            "Operation": "GetItem"
                        }),
                    table.metric_successful_request_latency(
                        dimensions={
                            "TableName": table.table_name,
                            "Operation": "UpdateItem"
                        }),
                    table.metric_successful_request_latency(
                        dimensions={
                            "TableName": table.table_name,
                            "Operation": "PutItem"
                        }),
                    table.metric_successful_request_latency(
                        dimensions={
                            "TableName": table.table_name,
                            "Operation": "DeleteItem"
                        }),
                    table.metric_successful_request_latency(
                        dimensions={
                            "TableName": table.table_name,
                            "Operation": "Query"
                        }),
                ]),
            cloud_watch.GraphWidget(
                title="DynamoDB Consumed Read/Write Units",
                width=8,
                stacked=False,
                left=[
                    table.metric(metric_name="ConsumedReadCapacityUnits"),
                    table.metric(metric_name="ConsumedWriteCapacityUnits")
                ]),
            cloud_watch.GraphWidget(
                title="DynamoDB Throttles",
                width=8,
                stacked=True,
                left=[
                    table.metric(metric_name="ReadThrottleEvents",
                                 statistic="sum"),
                    table.metric(metric_name="WriteThrottleEvents",
                                 statistic="sum")
                ]),
        )
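
    # As in the earlier examples, the dashboard above relies on a metric_for_api_gw helper that is not
    # shown in this snippet. A minimal sketch for a REST API, assuming metrics are reported in the
    # "AWS/ApiGateway" namespace with "ApiName" and "Stage" dimensions:
    def metric_for_api_gw(self, api_name: str, stage: str, metric_name: str, label: str,
                          stat: str = 'avg') -> cloud_watch.Metric:
        return cloud_watch.Metric(namespace="AWS/ApiGateway",
                                  metric_name=metric_name,
                                  dimensions={"ApiName": api_name, "Stage": stage},
                                  label=label,
                                  statistic=stat,
                                  period=core.Duration.minutes(5))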