    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        # Read the Lambda function code
        try:
            with open("serverless_stacks/lambda_src/konstone_custom_metric_log_generator.py", mode="r") as f:
                konstone_custom_metric_fn_code = f.read()
        except OSError:
            print("Unable to read Lambda Function Code")

        konstone_custom_metric_fn = _lambda.Function(
            self,
            "konstoneFunction",
            function_name="konstone_custom_metric_fn",
            runtime=_lambda.Runtime.PYTHON_3_7,
            handler="index.lambda_handler",
            code=_lambda.InlineCode(
                konstone_custom_metric_fn_code),
            timeout=core.Duration.seconds(
                3),
            reserved_concurrent_executions=1,
            environment={
                "LOG_LEVEL": "INFO",
                "PERCENTAGE_ERRORS": "75"
            }
        )
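        # Note: InlineCode embeds the source in the CloudFormation template (the
        # ZipFile property), which is limited to roughly 4 KB, so this pattern only
        # suits small handlers like the one read above.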

        # Create Custom Loggroup
        # /aws/lambda/function-name
        konstone_custom_metric_lg = _logs.LogGroup(
            self,
            "konstoneLoggroup",
            log_group_name=f"/aws/lambda/{konstone_custom_metric_fn.function_name}",
            removal_policy=core.RemovalPolicy.DESTROY,
            retention=_logs.RetentionDays.ONE_DAY,
        )

        # Create Custom Metric Namespace
        third_party_error_metric = _cloudwatch.Metric(
            namespace=f"third-party-error-metric",
            metric_name="third_party_error_metric",
            label="Total No. of Third Party API Errors",
            period=core.Duration.minutes(1),
            statistic="Sum"
        )

        # Create Custom Metric Log Filter
        third_party_error_metric_filter = _logs.MetricFilter(
            self,
            "thirdPartyApiErrorMetricFilter",
            filter_pattern=_logs.FilterPattern.boolean_value(
                "$.third_party_api_error", True),
            log_group=konstone_custom_metric_lg,
            metric_namespace=third_party_error_metric.namespace,
            metric_name=third_party_error_metric.metric_name,
            default_value=0,
            metric_value="1"
        )
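        # For reference: FilterPattern.boolean_value matches structured JSON log
        # events, so the Lambda is assumed to emit lines shaped roughly like this
        # (hypothetical example):
        #   {"level": "ERROR", "third_party_api_error": true, "status_code": 502}
        # Only the "third_party_api_error": true key/value is required for a match.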

        # Create Third Party Error Alarm
        third_party_error_alarm = _cloudwatch.Alarm(
            self,
            "thirdPartyApiErrorAlarm",
            alarm_description="Alert if 3rd party API has more than 2 errors in the last two minutes",
            alarm_name="third-party-api-alarm",
            metric=third_party_error_metric,
            comparison_operator=_cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
            threshold=2,
            evaluation_periods=2,
            datapoints_to_alarm=1,
            period=core.Duration.minutes(1),
            treat_missing_data=_cloudwatch.TreatMissingData.NOT_BREACHING
        )
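        # The alarm has no action wired up in this snippet. A minimal sketch of
        # notifying an SNS topic, assuming aws_sns and aws_cloudwatch_actions are
        # imported as _sns and _cw_actions (not shown here):
        #   alarm_topic = _sns.Topic(self, "thirdPartyApiErrorTopic")
        #   third_party_error_alarm.add_alarm_action(_cw_actions.SnsAction(alarm_topic))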

        # Create CloudWatch Dashboard
        konstone_dashboard = _cloudwatch.Dashboard(
            self,
            id="konstoneDashboard",
            dashboard_name="Konstone-App-Live-Dashboard"
        )

        # Add Lambda Function Metrics to Dashboard
        konstone_dashboard.add_widgets(
            _cloudwatch.Row(
                _cloudwatch.GraphWidget(
                    title="Backend-Invocations",
                    left=[
                        konstone_custom_metric_fn.metric_invocations(
                            statistic="Sum",
                            period=core.Duration.minutes(1)
                        )
                    ]
                ),
                _cloudwatch.GraphWidget(
                    title="Backend-Errors",
                    left=[
                        konstone_custom_metric_fn.metric_errors(
                            statistic="Sum",
                            period=core.Duration.minutes(1)
                        )
                    ]
                )
            )
        )

        # Add 3rd Party API Error to Dashboard
        konstone_dashboard.add_widgets(
            _cloudwatch.Row(
                _cloudwatch.SingleValueWidget(
                    title="Third Party API Errors",
                    metrics=[third_party_error_metric]
                )
            )
        )
Example #2
    def __init__(
            self,
            scope: core.Construct,
            _id: str,
            vpc,
            bucket_para,
            # key_name,
            ddb_file_list,
            sqs_queue,
            sqs_queue_DLQ,
            ssm_bucket_para,
            ssm_credential_para,
            s3bucket,
            s3_deploy,
            **kwargs) -> None:
        super().__init__(scope, _id, **kwargs)

        # Create environment variables for the userdata
        env_var = f'export table_queue_name={ddb_file_list.table_name}\n' \
                  f'export sqs_queue_name={sqs_queue.queue_name}\n' \
                  f'export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\n'
        env_var_st = f'echo \"export table_queue_name={ddb_file_list.table_name}\" >> /etc/rc.local\n' \
                     f'echo \"export sqs_queue_name={sqs_queue.queue_name}\" >> /etc/rc.local\n' \
                     f'echo \"export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\" >> /etc/rc.local\n'
        # Create log group and put group name into userdata
        s3_migrate_log = logs.LogGroup(self, "applog")
        cw_agent_config['logs']['logs_collected']['files']['collect_list'][0][
            'log_group_name'] = s3_migrate_log.log_group_name
        cw_agent_config['logs']['logs_collected']['files']['collect_list'][1][
            'log_group_name'] = s3_migrate_log.log_group_name
        cw_agent_config['metrics']['append_dimensions'][
            'AutoScalingGroupName'] = "\\${aws:AutoScalingGroupName}"
        cw_agent_config['metrics']['append_dimensions'][
            'InstanceId'] = "\\${aws:InstanceId}"
        cw_agent_config_str = json.dumps(cw_agent_config,
                                         indent=4).replace("\\\\", "\\")
        userdata_head = user_data_part1 + cw_agent_config_str + user_data_part2 + \
                        s3_deploy.bucket_name + " .\n" + env_var + env_var_st
        jobsender_userdata = userdata_head + user_data_jobsender_p
        worker_userdata = userdata_head + user_data_worker_p
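        # For reference, cw_agent_config (assumed to be defined elsewhere in this
        # project) mirrors the CloudWatch Agent JSON config, roughly:
        #   {"logs": {"logs_collected": {"files": {"collect_list": [{...}, {...}]}}},
        #    "metrics": {"append_dimensions": {}}}
        # so the assignments above patch in the log group name and the ASG/instance
        # dimensions before the config is serialized into the userdata.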

        # Create jobsender EC2 node (single-instance Auto Scaling Group)
        jobsender = autoscaling.AutoScalingGroup(
            self,
            "jobsender",
            instance_type=ec2.InstanceType(
                instance_type_identifier=jobsender_type),
            machine_image=linux_ami,
            # key_name=key_name,
            user_data=ec2.UserData.custom(jobsender_userdata),
            vpc=vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
            desired_capacity=1,
            min_capacity=0,
            max_capacity=1)

        # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
        # Don't need SSH since we use Session Manager

        # Assign EC2 Policy to use SSM and CWAgent
        jobsender.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "AmazonSSMManagedInstanceCore"))
        jobsender.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "CloudWatchAgentServerPolicy"))

        # jobsender.role.add_managed_policy(
        #     iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"))
        # Don't give EC2 full access to S3; it violates the least-privilege security rule

        # Create Auto Scaling Group with 2 EC2 hosts initially (scales up to 10)
        worker_asg = autoscaling.AutoScalingGroup(
            self,
            "worker-asg",
            vpc=vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
            instance_type=ec2.InstanceType(
                instance_type_identifier=worker_type),
            machine_image=linux_ami,
            # key_name=key_name,  # Optional if use SSM-SessionManager
            user_data=ec2.UserData.custom(worker_userdata),
            desired_capacity=2,
            min_capacity=2,
            max_capacity=10,
            spot_price="0.5")

        # TODO: There is no MetricsCollection in the CDK Auto Scaling Group high-level API yet.
        # You need to enable "Group Metrics Collection" in the EC2 Console, Auto Scaling Group - Monitoring tab,
        # for metrics such as GroupDesiredCapacity, GroupInServiceInstances, GroupPendingInstances, etc.
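        # A possible workaround (sketch, not part of the original stack) is the L1
        # escape hatch on the underlying CfnAutoScalingGroup:
        #   cfn_asg = worker_asg.node.default_child
        #   cfn_asg.metrics_collection = [
        #       autoscaling.CfnAutoScalingGroup.MetricsCollectionProperty(
        #           granularity="1Minute")]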

        # worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
        # Don't need SSH since we use Session Manager

        # Assign EC2 Policy to use SSM and CWAgent
        worker_asg.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "AmazonSSMManagedInstanceCore"))
        worker_asg.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "CloudWatchAgentServerPolicy"))

        # Allow EC2 to access the new DynamoDB table
        ddb_file_list.grant_full_access(jobsender)
        ddb_file_list.grant_full_access(worker_asg)

        # Allow EC2 to access the new SQS queue and its DLQ
        sqs_queue.grant_consume_messages(jobsender)
        sqs_queue.grant_send_messages(jobsender)
        sqs_queue.grant_consume_messages(worker_asg)
        sqs_queue_DLQ.grant_consume_messages(jobsender)

        # Allow EC2 to access the SSM Parameter Store to get the bucket info and credentials
        ssm_bucket_para.grant_read(jobsender)
        ssm_credential_para.grant_read(jobsender)
        ssm_credential_para.grant_read(worker_asg)

        # Allow EC2 to access the source code in the s3_deploy bucket
        s3_deploy.grant_read(jobsender)
        s3_deploy.grant_read(worker_asg)

        # Allow EC2 to access the new S3 bucket
        s3bucket.grant_read(jobsender)
        s3bucket.grant_read(worker_asg)

        # Allow EC2 to access existing S3 buckets in PUT mode: read-only access to the source buckets
        bucket_name = ''
        for b in bucket_para:
            if bucket_name != b['src_bucket']:  # skip duplicates when the same bucket is listed multiple times
                bucket_name = b['src_bucket']
                s3exist_bucket = s3.Bucket.from_bucket_name(
                    self,
                    bucket_name,  # use the bucket name as the construct id
                    bucket_name=bucket_name)
                s3exist_bucket.grant_read(jobsender)
                s3exist_bucket.grant_read(worker_asg)
        # Allow EC2 to access existing S3 buckets in GET mode: read and write access to the destination buckets
        # bucket_name = ''
        # for b in bucket_para:
        #     if bucket_name != b['des_bucket']:  # skip duplicates when the same bucket is listed multiple times
        #         bucket_name = b['des_bucket']
        #         s3exist_bucket = s3.Bucket.from_bucket_name(self,
        #                                                     bucket_name,  # use the bucket name as the construct id
        #                                                     bucket_name=bucket_name)
        #         s3exist_bucket.grant_read_write(jobsender)
        #         s3exist_bucket.grant_read_write(worker_asg)

        # Dashboard to monitor SQS and EC2
        board = cw.Dashboard(self, "s3_migrate")

        ec2_metric_cpu_avg = cw.Metric(namespace="AWS/EC2",
                                       metric_name="CPUUtilization",
                                       dimensions={
                                           "AutoScalingGroupName":
                                           worker_asg.auto_scaling_group_name
                                       },
                                       period=core.Duration.minutes(1))

        ec2_metric_net_out = cw.MathExpression(
            expression=
            "SEARCH('{AWS/EC2, InstanceId} NetworkOut', 'Average', 60)",
            label="EC2-NetworkOut",
            using_metrics={})

        autoscaling_GroupDesiredCapacity = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupDesiredCapacity",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupInServiceInstances = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupInServiceInstances",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupMinSize = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupMinSize",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupMaxSize = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupMaxSize",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))

        # CWAgent collected metric
        cwagent_mem_avg = cw.MathExpression(
            expression=
            "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName="
            + worker_asg.auto_scaling_group_name +
            " AND MetricName=mem_used_percent)', 'Average', 60)",
            label="mem_avg",
            using_metrics={})
        cwagent_disk_avg = cw.MathExpression(
            expression=
            "SEARCH('{CWAgent, path, InstanceId, AutoScalingGroupName, device, fstype} "
            "(AutoScalingGroupName=" + worker_asg.auto_scaling_group_name +
            " AND MetricName=disk_used_percent AND path=\"/\")', 'Average', 60)",
            label="disk_avg",
            using_metrics={})
        cwagent_net_tcp = cw.MathExpression(
            expression=
            "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName="
            + worker_asg.auto_scaling_group_name +
            " AND MetricName=tcp_established)', 'Average', 60)",
            label="tcp_conn",
            using_metrics={})

        # CWAgent collected application logs - filter metric
        s3_migrate_log.add_metric_filter(
            "Completed-bytes",
            metric_name="Completed-bytes",
            metric_namespace="s3_migrate",
            metric_value="$bytes",
            filter_pattern=logs.FilterPattern.literal(
                '[date, time, info, hs, p="--->Complete", bytes, key]'))
        s3_migrate_log.add_metric_filter(
            "Uploading-bytes",
            metric_name="Uploading-bytes",
            metric_namespace="s3_migrate",
            metric_value="$bytes",
            filter_pattern=logs.FilterPattern.literal(
                '[date, time, info, hs, p="--->Uploading", bytes, key]'))
        s3_migrate_log.add_metric_filter(
            "Downloading-bytes",
            metric_name="Downloading-bytes",
            metric_namespace="s3_migrate",
            metric_value="$bytes",
            filter_pattern=logs.FilterPattern.literal(
                '[date, time, info, hs, p="--->Downloading", bytes, key]'))
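        # The space-delimited patterns above are assumed to match application log
        # lines shaped roughly like (hypothetical example):
        #   2020-01-01 12:00:00 INFO host-1 --->Complete 1048576 my/object/key
        # where the 6th field is captured as "$bytes" and published as the metric value.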
        traffic_metric_Complete = cw.Metric(namespace="s3_migrate",
                                            metric_name="Completed-bytes",
                                            statistic="Sum",
                                            period=core.Duration.minutes(1))
        traffic_metric_Upload = cw.Metric(namespace="s3_migrate",
                                          metric_name="Uploading-bytes",
                                          statistic="Sum",
                                          period=core.Duration.minutes(1))
        traffic_metric_Download = cw.Metric(namespace="s3_migrate",
                                            metric_name="Downloading-bytes",
                                            statistic="Sum",
                                            period=core.Duration.minutes(1))
        s3_migrate_log.add_metric_filter(
            "ERROR",
            metric_name="ERROR-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal('"ERROR"'))
        s3_migrate_log.add_metric_filter(
            "WARNING",
            metric_name="WARNING-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal('"WARNING"'))
        log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                     metric_name="ERROR-Logs",
                                     statistic="Sum",
                                     period=core.Duration.minutes(1))
        log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                       metric_name="WARNING-Logs",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))

        board.add_widgets(
            cw.GraphWidget(title="S3-MIGRATION-TOTAL-TRAFFIC",
                           left=[
                               traffic_metric_Complete, traffic_metric_Upload,
                               traffic_metric_Download
                           ],
                           left_y_axis=cw.YAxisProps(label="Bytes/min",
                                                     show_units=False)),
            cw.GraphWidget(title="ERROR/WARNING LOGS",
                           left=[log_metric_ERROR],
                           left_y_axis=cw.YAxisProps(label="Count",
                                                     show_units=False),
                           right=[log_metric_WARNING],
                           right_y_axis=cw.YAxisProps(label="Count",
                                                      show_units=False)),
            cw.GraphWidget(
                title="SQS-JOBS",
                left=[
                    sqs_queue.metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1))
                ]),
            cw.SingleValueWidget(
                title="RUNNING, WAITING & DEATH JOBS",
                metrics=[
                    sqs_queue.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue.metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1))
                ],
                height=6))

        board.add_widgets(
            cw.GraphWidget(title="EC2-AutoscalingGroup-TCP",
                           left=[cwagent_net_tcp],
                           left_y_axis=cw.YAxisProps(label="Count",
                                                     show_units=False)),
            cw.GraphWidget(title="EC2-AutoscalingGroup-CPU/MEMORY",
                           left=[ec2_metric_cpu_avg, cwagent_mem_avg],
                           left_y_axis=cw.YAxisProps(max=100,
                                                     min=0,
                                                     label="%",
                                                     show_units=False)),
            cw.GraphWidget(title="EC2-AutoscalingGroup-DISK",
                           left=[cwagent_disk_avg],
                           left_y_axis=cw.YAxisProps(max=100,
                                                     min=0,
                                                     label="%",
                                                     show_units=False)),
            cw.SingleValueWidget(title="EC2-AutoscalingGroup-CAPACITY",
                                 metrics=[
                                     autoscaling_GroupDesiredCapacity,
                                     autoscaling_GroupInServiceInstances,
                                     autoscaling_GroupMinSize,
                                     autoscaling_GroupMaxSize
                                 ],
                                 height=6))
        board.add_widgets(
            cw.GraphWidget(title="EC2-NetworkOut",
                           left=[ec2_metric_net_out],
                           left_y_axis=cw.YAxisProps(label="Bytes/min",
                                                     show_units=False)))

        # Scale up when there are more than 100 visible messages over 5 minutes
        worker_asg.scale_on_metric(
            "scaleup",
            metric=sqs_queue.metric_approximate_number_of_messages_visible(),
            scaling_steps=[
                autoscaling.ScalingInterval(change=1, lower=100, upper=500),
                autoscaling.ScalingInterval(change=2, lower=500),
                autoscaling.ScalingInterval(change=0, upper=100, lower=0)
            ],
            adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY)
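        # With CHANGE_IN_CAPACITY these steps mean: 0-100 visible messages -> no
        # change, 100-500 -> add 1 instance, above 500 -> add 2 instances.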

        # Alarm for queue empty and ec2 > 1
        # Alarm when the queue is empty (no visible or in-flight messages) while more than one EC2 instance is running, then scale the group down to 1 instance.
        # This can be tuned per scenario; if the jobsender is also used for transfers, the Auto Scaling Group could be set to 0 here when there are no jobs.
        metric_all_message = cw.MathExpression(
            expression="IF(((a+b) == 0) AND (c >1), 0, 1)",  # a+b且c>1则设置为0,告警
            label="empty_queue_expression",
            using_metrics={
                "a": sqs_queue.metric_approximate_number_of_messages_visible(),
                "b":
                sqs_queue.metric_approximate_number_of_messages_not_visible(),
                "c": autoscaling_GroupInServiceInstances
            })
        alarm_0 = cw.Alarm(
            self,
            "SQSempty",
            alarm_name=
            "s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster",
            metric=metric_all_message,
            threshold=0,
            comparison_operator=cw.ComparisonOperator.
            LESS_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=3,
            datapoints_to_alarm=3,
            treat_missing_data=cw.TreatMissingData.NOT_BREACHING)
        alarm_topic_empty = sns.Topic(
            self, "SQS queue empty and ec2 more than 1 in Cluster")
        # This alarm also acts as a one-time notification that the batch transfer has completed, instead of notifying repeatedly.
        alarm_topic_empty.add_subscription(
            subscription=sub.EmailSubscription(alarm_email))
        alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty))

        # If the queue is empty, scale down to 1 EC2 instance
        action_shutdown = autoscaling.StepScalingAction(
            self,
            "shutdown",
            auto_scaling_group=worker_asg,
            adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY)
        action_shutdown.add_adjustment(adjustment=1, upper_bound=0)
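        # With EXACT_CAPACITY, adjustment=1 and upper_bound=0 sets the group to
        # exactly 1 instance whenever the alarm metric is at or below the threshold (0).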
        alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown))

        # When there are messages in the SQS DLQ, send an alarm to SNS
        alarm_DLQ = cw.Alarm(
            self,
            "SQS_DLQ",
            alarm_name=
            "s3-migration-cluster-SQS DLQ more than 1 message-Cluster",
            metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
            ),
            threshold=0,
            comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=3,
            datapoints_to_alarm=3,
            treat_missing_data=cw.TreatMissingData.IGNORE)
        alarm_topic_DLQ = sns.Topic(self,
                                    "SQS DLQ more than 1 message-Cluster")
        alarm_topic_DLQ.add_subscription(
            subscription=sub.EmailSubscription(alarm_email))
        alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ))

        # Output
        core.CfnOutput(self, "LogGroup", value=s3_migrate_log.log_group_name)
        core.CfnOutput(self,
                       "Dashboard",
                       value="CloudWatch Dashboard name s3_migrate_cluster")
        core.CfnOutput(self,
                       "Alarm",
                       value="CloudWatch SQS queue empty Alarm for cluster: " +
                       alarm_email)
Example #3
    def __init__(self, scope: core.Construct, _id: str, **kwargs) -> None:
        super().__init__(scope, _id, **kwargs)

        # Set up SSM parameters for credentials, bucket_para and ignore_list
        ssm_credential_para = ssm.StringParameter.from_secure_string_parameter_attributes(
            self,
            "ssm_parameter_credentials",
            parameter_name=ssm_parameter_credentials,
            version=1)

        ssm_bucket_para = ssm.StringParameter(self,
                                              "s3bucket_serverless",
                                              string_value=json.dumps(
                                                  bucket_para, indent=4))

        ssm_parameter_ignore_list = ssm.StringParameter(
            self, "s3_migrate_ignore_list", string_value=ignore_list)

        # Setup DynamoDB
        ddb_file_list = ddb.Table(self,
                                  "s3migrate_serverless",
                                  partition_key=ddb.Attribute(
                                      name="Key",
                                      type=ddb.AttributeType.STRING),
                                  billing_mode=ddb.BillingMode.PAY_PER_REQUEST)
        ddb_file_list.add_global_secondary_index(
            partition_key=ddb.Attribute(name="desBucket",
                                        type=ddb.AttributeType.STRING),
            index_name="desBucket-index",
            projection_type=ddb.ProjectionType.INCLUDE,
            non_key_attributes=["desKey", "versionId"])

        # Setup SQS
        sqs_queue_DLQ = sqs.Queue(self,
                                  "s3migrate_serverless_Q_DLQ",
                                  visibility_timeout=core.Duration.minutes(15),
                                  retention_period=core.Duration.days(14))
        sqs_queue = sqs.Queue(self,
                              "s3migrate_serverless_Q",
                              visibility_timeout=core.Duration.minutes(15),
                              retention_period=core.Duration.days(14),
                              dead_letter_queue=sqs.DeadLetterQueue(
                                  max_receive_count=60, queue=sqs_queue_DLQ))
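        # A message received 60 times without being deleted (i.e. a job that keeps
        # failing) is moved to the DLQ for inspection and for the alarm defined later.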

        # Set up an API for Lambda to get its IP address (for debugging network routing)
        checkip = api.RestApi(
            self,
            "lambda-checkip-api",
            cloud_watch_role=True,
            deploy=True,
            description="For Lambda get IP address",
            default_integration=api.MockIntegration(
                integration_responses=[
                    api.IntegrationResponse(status_code="200",
                                            response_templates={
                                                "application/json":
                                                "$context.identity.sourceIp"
                                            })
                ],
                request_templates={"application/json": '{"statusCode": 200}'}),
            endpoint_types=[api.EndpointType.REGIONAL])
        checkip.root.add_method("GET",
                                method_responses=[
                                    api.MethodResponse(
                                        status_code="200",
                                        response_models={
                                            "application/json":
                                            api.Model.EMPTY_MODEL
                                        })
                                ])
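        # The mock integration simply echoes $context.identity.sourceIp, so a Lambda
        # calling this endpoint sees its own egress IP, which is useful for verifying
        # NAT/routing without deploying any backend.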

        # Setup Lambda functions
        handler = lam.Function(self,
                               "s3-migrate-worker",
                               code=lam.Code.asset("./lambda"),
                               handler="lambda_function_worker.lambda_handler",
                               runtime=lam.Runtime.PYTHON_3_8,
                               memory_size=1024,
                               timeout=core.Duration.minutes(15),
                               tracing=lam.Tracing.ACTIVE,
                               environment={
                                   'table_queue_name':
                                   ddb_file_list.table_name,
                                   'Des_bucket_default': Des_bucket_default,
                                   'Des_prefix_default': Des_prefix_default,
                                   'StorageClass': StorageClass,
                                   'checkip_url': checkip.url,
                                   'ssm_parameter_credentials':
                                   ssm_parameter_credentials,
                                   'JobType': JobType,
                                   'MaxRetry': MaxRetry,
                                   'MaxThread': MaxThread,
                                   'MaxParallelFile': MaxParallelFile,
                                   'JobTimeout': JobTimeout,
                                   'UpdateVersionId': UpdateVersionId,
                                   'GetObjectWithVersionId':
                                   GetObjectWithVersionId
                               })

        handler_jobsender = lam.Function(
            self,
            "s3-migrate-jobsender",
            code=lam.Code.asset("./lambda"),
            handler="lambda_function_jobsender.lambda_handler",
            runtime=lam.Runtime.PYTHON_3_8,
            memory_size=1024,
            timeout=core.Duration.minutes(15),
            tracing=lam.Tracing.ACTIVE,
            environment={
                'table_queue_name': ddb_file_list.table_name,
                'StorageClass': StorageClass,
                'checkip_url': checkip.url,
                'sqs_queue': sqs_queue.queue_name,
                'ssm_parameter_credentials': ssm_parameter_credentials,
                'ssm_parameter_ignore_list':
                ssm_parameter_ignore_list.parameter_name,
                'ssm_parameter_bucket': ssm_bucket_para.parameter_name,
                'JobType': JobType,
                'MaxRetry': MaxRetry,
                'JobsenderCompareVersionId': JobsenderCompareVersionId
            })

        # Allow Lambda to read/write DDB and send to SQS
        ddb_file_list.grant_read_write_data(handler)
        ddb_file_list.grant_read_write_data(handler_jobsender)
        sqs_queue.grant_send_messages(handler_jobsender)
        # SQS trigger Lambda worker
        handler.add_event_source(SqsEventSource(sqs_queue, batch_size=1))
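        # batch_size=1 means each worker invocation receives a single SQS message,
        # i.e. one transfer job per Lambda execution.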

        # Option 1: Create an S3 bucket; all new objects in this bucket will be transmitted by the Lambda worker
        s3bucket = s3.Bucket(self, "s3_new_migrate")
        s3bucket.grant_read(handler)
        s3bucket.add_event_notification(s3.EventType.OBJECT_CREATED,
                                        s3n.SqsDestination(sqs_queue))

        # Option 2: Allow existing S3 buckets to be read by the Lambda functions.
        # The Lambda jobsender will scan and compare these buckets and trigger Lambda workers to transmit
        bucket_name = ''
        for b in bucket_para:
            if bucket_name != b['src_bucket']:  # skip duplicates when the same bucket is listed multiple times
                bucket_name = b['src_bucket']
                s3exist_bucket = s3.Bucket.from_bucket_name(
                    self,
                    bucket_name,  # use the bucket name as the construct id
                    bucket_name=bucket_name)
                if JobType == 'PUT':
                    s3exist_bucket.grant_read(handler_jobsender)
                    s3exist_bucket.grant_read(handler)
                else:  # 'GET' mode
                    s3exist_bucket.grant_read_write(handler_jobsender)
                    s3exist_bucket.grant_read_write(handler)

        # Allow Lambda to read SSM parameters
        ssm_bucket_para.grant_read(handler_jobsender)
        ssm_credential_para.grant_read(handler)
        ssm_credential_para.grant_read(handler_jobsender)
        ssm_parameter_ignore_list.grant_read(handler_jobsender)

        # Schedule cron event to trigger Lambda Jobsender per hour:
        event.Rule(self,
                   'cron_trigger_jobsender',
                   schedule=event.Schedule.rate(core.Duration.hours(1)),
                   targets=[target.LambdaFunction(handler_jobsender)])

        # TODO: Trigger the event immediately; add a custom resource Lambda to invoke handler_jobsender

        # Create Lambda logs filter to create network traffic metric
        handler.log_group.add_metric_filter(
            "Completed-bytes",
            metric_name="Completed-bytes",
            metric_namespace="s3_migrate",
            metric_value="$bytes",
            filter_pattern=logs.FilterPattern.literal(
                '[info, date, sn, p="--->Complete", bytes, key]'))
        handler.log_group.add_metric_filter(
            "Uploading-bytes",
            metric_name="Uploading-bytes",
            metric_namespace="s3_migrate",
            metric_value="$bytes",
            filter_pattern=logs.FilterPattern.literal(
                '[info, date, sn, p="--->Uploading", bytes, key]'))
        handler.log_group.add_metric_filter(
            "Downloading-bytes",
            metric_name="Downloading-bytes",
            metric_namespace="s3_migrate",
            metric_value="$bytes",
            filter_pattern=logs.FilterPattern.literal(
                '[info, date, sn, p="--->Downloading", bytes, key]'))
        handler.log_group.add_metric_filter(
            "MaxMemoryUsed",
            metric_name="MaxMemoryUsed",
            metric_namespace="s3_migrate",
            metric_value="$memory",
            filter_pattern=logs.FilterPattern.literal(
                '[head="REPORT", a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, '
                'a13, a14, a15, a16, memory, MB="MB", rest]'))
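        # This filter parses the Lambda platform "REPORT" log line (REPORT RequestId: ...
        # Max Memory Used: NNN MB ...) and captures the memory figure as "$memory".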
        lambda_metric_Complete = cw.Metric(namespace="s3_migrate",
                                           metric_name="Completed-bytes",
                                           statistic="Sum",
                                           period=core.Duration.minutes(1))
        lambda_metric_Upload = cw.Metric(namespace="s3_migrate",
                                         metric_name="Uploading-bytes",
                                         statistic="Sum",
                                         period=core.Duration.minutes(1))
        lambda_metric_Download = cw.Metric(namespace="s3_migrate",
                                           metric_name="Downloading-bytes",
                                           statistic="Sum",
                                           period=core.Duration.minutes(1))
        lambda_metric_MaxMemoryUsed = cw.Metric(
            namespace="s3_migrate",
            metric_name="MaxMemoryUsed",
            statistic="Maximum",
            period=core.Duration.minutes(1))
        handler.log_group.add_metric_filter(
            "ERROR",
            metric_name="ERROR-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal('"ERROR"'))
        handler.log_group.add_metric_filter(
            "WARNING",
            metric_name="WARNING-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal('"WARNING"'))
        # Task timed out
        handler.log_group.add_metric_filter(
            "TIMEOUT",
            metric_name="TIMEOUT-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal('"Task timed out"'))
        log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                     metric_name="ERROR-Logs",
                                     statistic="Sum",
                                     period=core.Duration.minutes(1))
        log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                       metric_name="WARNING-Logs",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
        log_metric_TIMEOUT = cw.Metric(namespace="s3_migrate",
                                       metric_name="TIMEOUT-Logs",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))

        # Dashboard to monitor SQS and Lambda
        board = cw.Dashboard(self, "s3_migrate_serverless")

        board.add_widgets(
            cw.GraphWidget(title="Lambda-NETWORK",
                           left=[
                               lambda_metric_Download, lambda_metric_Upload,
                               lambda_metric_Complete
                           ]),
            cw.GraphWidget(title="Lambda-concurrent",
                           left=[
                               handler.metric(
                                   metric_name="ConcurrentExecutions",
                                   period=core.Duration.minutes(1))
                           ]),
            cw.GraphWidget(
                title="Lambda-invocations/errors/throttles",
                left=[
                    handler.metric_invocations(
                        period=core.Duration.minutes(1)),
                    handler.metric_errors(period=core.Duration.minutes(1)),
                    handler.metric_throttles(period=core.Duration.minutes(1))
                ]),
            cw.GraphWidget(
                title="Lambda-duration",
                left=[
                    handler.metric_duration(period=core.Duration.minutes(1))
                ]),
        )

        board.add_widgets(
            cw.GraphWidget(title="Lambda_MaxMemoryUsed(MB)",
                           left=[lambda_metric_MaxMemoryUsed]),
            cw.GraphWidget(title="ERROR/WARNING Logs",
                           left=[log_metric_ERROR],
                           right=[log_metric_WARNING, log_metric_TIMEOUT]),
            cw.GraphWidget(
                title="SQS-Jobs",
                left=[
                    sqs_queue.metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1))
                ]),
            cw.SingleValueWidget(
                title="Running/Waiting and Dead Jobs",
                metrics=[
                    sqs_queue.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue.metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1))
                ],
                height=6))
        # Alarm for queue - DLQ
        alarm_DLQ = cw.Alarm(
            self,
            "SQS_DLQ",
            metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
            ),
            threshold=0,
            comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=1,
            datapoints_to_alarm=1)
        alarm_topic = sns.Topic(self, "SQS queue-DLQ has dead letter")
        alarm_topic.add_subscription(
            subscription=sub.EmailSubscription(alarm_email))
        alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic))

        core.CfnOutput(self,
                       "Dashboard",
                       value="CloudWatch Dashboard name s3_migrate_serverless")
Example #4
    def __init__(self, scope: core.Construct, construct_id: str,
                 **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        # Import the Lambda function code
        try:
            with open("serverless_stack/functions/metric_logs_generator.py",
                      mode="r") as file:
                function_body = file.read()
        except OSError:
            print("Unable to read the Lambda function code")

        #function
        function_01 = aws_lambda.Function(
            self,
            "lambdafunction01",
            function_name="LambdaTestCustomMEtric",
            runtime=aws_lambda.Runtime.PYTHON_3_6,
            handler="index.lambda_handler",
            code=aws_lambda.InlineCode(function_body),
            timeout=core.Duration.seconds(5),
            reserved_concurrent_executions=1,
            environment={
                'LOG_LEVEL': 'INFO',
                'PERCENTAGE_ERRORS': '75'
            })

        # Attach the CloudWatch log group
        custom_metric_log_group01 = aws_logs.LogGroup(
            self,
            "cloudwatchlog01",
            log_group_name=f"/aws/lambda/{function_01.function_name}",
            removal_policy=core.RemovalPolicy.DESTROY,
            retention=aws_logs.RetentionDays.ONE_DAY)

        #Custom metric namespace
        custom_metric_namespace01 = aws_cw.Metric(
            namespace=f"custom-error-metric",
            metric_name="custom-error-metric",
            label="Amount of Custom API errors",
            period=core.Duration.minutes(1),
            statistic="Sum")

        #Custom metric logs filter
        custom_metric_filter01 = aws_logs.MetricFilter(
            self,
            "customMetricFilter",
            filter_pattern=aws_logs.FilterPattern.boolean_value(
                "$.custom_api_error", True),
            log_group=custom_metric_log_group01,
            metric_namespace=custom_metric_namespace01.namespace,
            metric_name=custom_metric_namespace01.metric_name,
            default_value=0,
            metric_value="1")

        #create custom alarm
        custom_metric_alarm01 = aws_cw.Alarm(
            self,
            "customMetricAlarm",
            alarm_description="Custom API errors",
            alarm_name="Custom-API-alarm",
            metric=custom_metric_namespace01,
            comparison_operator=aws_cw.ComparisonOperator.
            GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
            threshold=2,
            evaluation_periods=2,
            datapoints_to_alarm=1,
            period=core.Duration.minutes(1),
            treat_missing_data=aws_cw.TreatMissingData.NOT_BREACHING)

        #cloudwatch dashboard
        custom_dashboard01 = aws_cw.Dashboard(
            self, id="CustomDashBoard", dashboard_name="CDK-custom-DashBoard")

        #lambda metrics to dashboard
        custom_dashboard01.add_widgets(
            aws_cw.Row(
                aws_cw.GraphWidget(title="Lambda-invoke",
                                   left=[
                                       function_01.metric_invocations(
                                           statistic="Sum",
                                           period=core.Duration.minutes(1))
                                   ]),
                aws_cw.GraphWidget(title="Lambda-errors",
                                   left=[
                                       function_01.metric_errors(
                                           statistic="Sum",
                                           period=core.Duration.minutes(1))
                                   ])))

        #custom api errors to dashboard
        custom_dashboard01.add_widgets(
            aws_cw.Row(
                aws_cw.SingleValueWidget(title="Custom-API-errors",
                                         metrics=[custom_metric_namespace01])))
Example #5
    def __init__(self, scope: core.Construct, _id: str, **kwargs) -> None:
        super().__init__(scope, _id, **kwargs)

        ddb_file_list = ddb.Table(self, "ddb",
                                  partition_key=ddb.Attribute(name="Key", type=ddb.AttributeType.STRING),
                                  billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

        sqs_queue_DLQ = sqs.Queue(self, "sqs_DLQ",
                                  visibility_timeout=core.Duration.minutes(15),
                                  retention_period=core.Duration.days(14)
                                  )
        sqs_queue = sqs.Queue(self, "sqs_queue",
                              visibility_timeout=core.Duration.minutes(15),
                              retention_period=core.Duration.days(14),
                              dead_letter_queue=sqs.DeadLetterQueue(
                                  max_receive_count=100,
                                  queue=sqs_queue_DLQ
                              )
                              )
        handler = lam.Function(self, "lambdaFunction",
                               code=lam.Code.asset("./lambda"),
                               handler="lambda_function.lambda_handler",
                               runtime=lam.Runtime.PYTHON_3_8,
                               memory_size=1024,
                               timeout=core.Duration.minutes(15),
                               tracing=lam.Tracing.ACTIVE,
                               environment={
                                   'table_queue_name': ddb_file_list.table_name,
                                   'Des_bucket_default': Des_bucket_default,
                                   'Des_prefix_default': Des_prefix_default,
                                   'StorageClass': StorageClass,
                                   'aws_access_key_id': aws_access_key_id,
                                   'aws_secret_access_key': aws_secret_access_key,
                                   'aws_access_key_region': aws_access_key_region
                               })

        ddb_file_list.grant_read_write_data(handler)
        handler.add_event_source(SqsEventSource(sqs_queue))

        s3bucket = s3.Bucket(self, "s3bucket")
        s3bucket.grant_read(handler)
        s3bucket.add_event_notification(s3.EventType.OBJECT_CREATED,
                                        s3n.SqsDestination(sqs_queue))

        # You can import an existing bucket and grant access to lambda
        # exist_s3bucket = s3.Bucket.from_bucket_name(self, "import_bucket",
        #                                             bucket_name="you_bucket_name")
        # exist_s3bucket.grant_read(handler)

        # But you have to add SQS as the imported bucket's event notification manually; it is not supported by CloudFormation.
        # A workaround is to add on_cloud_trail_event for the bucket, but that requires CloudTrail to be set up first.
        # Because the bucket is imported, the S3 event trigger to SQS and the SQS permission allowing that bucket to send messages must be created manually.

        core.CfnOutput(self, "DynamoDB_Table", value=ddb_file_list.table_name)
        core.CfnOutput(self, "SQS_Job_Queue", value=sqs_queue.queue_name)
        core.CfnOutput(self, "SQS_Job_Queue_DLQ", value=sqs_queue_DLQ.queue_name)
        core.CfnOutput(self, "Worker_Lambda_Function", value=handler.function_name)
        core.CfnOutput(self, "New_S3_Bucket", value=s3bucket.bucket_name)

        # Create Lambda logs filter to create network traffic metric
        handler.log_group.add_metric_filter("Complete-bytes",
                                            metric_name="Complete-bytes",
                                            metric_namespace="s3_migrate",
                                            metric_value="$bytes",
                                            filter_pattern=logs.FilterPattern.literal(
                                                '[info, date, sn, p="--->Complete", bytes, key]'))
        handler.log_group.add_metric_filter("Uploading-bytes",
                                            metric_name="Uploading-bytes",
                                            metric_namespace="s3_migrate",
                                            metric_value="$bytes",
                                            filter_pattern=logs.FilterPattern.literal(
                                                '[info, date, sn, p="--->Uploading", bytes, key]'))
        handler.log_group.add_metric_filter("Downloading-bytes",
                                            metric_name="Downloading-bytes",
                                            metric_namespace="s3_migrate",
                                            metric_value="$bytes",
                                            filter_pattern=logs.FilterPattern.literal(
                                                '[info, date, sn, p="--->Downloading", bytes, key]'))
        lambda_metric_Complete = cw.Metric(namespace="s3_migrate",
                                           metric_name="Complete-bytes",
                                           statistic="Sum",
                                           period=core.Duration.minutes(1))
        lambda_metric_Upload = cw.Metric(namespace="s3_migrate",
                                         metric_name="Uploading-bytes",
                                         statistic="Sum",
                                         period=core.Duration.minutes(1))
        lambda_metric_Download = cw.Metric(namespace="s3_migrate",
                                           metric_name="Downloading-bytes",
                                           statistic="Sum",
                                           period=core.Duration.minutes(1))
        handler.log_group.add_metric_filter("ERROR",
                                            metric_name="ERROR-Logs",
                                            metric_namespace="s3_migrate",
                                            metric_value="1",
                                            filter_pattern=logs.FilterPattern.literal(
                                                '"ERROR"'))
        handler.log_group.add_metric_filter("WARNING",
                                            metric_name="WARNING-Logs",
                                            metric_namespace="s3_migrate",
                                            metric_value="1",
                                            filter_pattern=logs.FilterPattern.literal(
                                                '"WARNING"'))
        log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                     metric_name="ERROR-Logs",
                                     statistic="Sum",
                                     period=core.Duration.minutes(1))
        log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                       metric_name="WARNING-Logs",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))

        # Dashboard to monitor SQS and Lambda
        board = cw.Dashboard(self, "s3_migrate", dashboard_name="s3_migrate_serverless")

        board.add_widgets(cw.GraphWidget(title="Lambda-NETWORK",
                                         left=[lambda_metric_Download, lambda_metric_Upload, lambda_metric_Complete]),
                          # TODO: this monitors concurrent executions across all Lambda functions, not just this one (CDK limitation).
                          # Lambda now supports per-function concurrency metrics; change this once CDK supports it.
                          cw.GraphWidget(title="Lambda-all-concurrent",
                                         left=[handler.metric_all_concurrent_executions(period=core.Duration.minutes(1))]),

                          cw.GraphWidget(title="Lambda-invocations/errors/throttles",
                                         left=[handler.metric_invocations(period=core.Duration.minutes(1)),
                                               handler.metric_errors(period=core.Duration.minutes(1)),
                                               handler.metric_throttles(period=core.Duration.minutes(1))]),
                          cw.GraphWidget(title="Lambda-duration",
                                         left=[handler.metric_duration(period=core.Duration.minutes(1))]),
                          )

        board.add_widgets(cw.GraphWidget(title="SQS-Jobs",
                                         left=[sqs_queue.metric_approximate_number_of_messages_visible(
                                             period=core.Duration.minutes(1)
                                         ),
                                               sqs_queue.metric_approximate_number_of_messages_not_visible(
                                                   period=core.Duration.minutes(1)
                                               )]),
                          cw.GraphWidget(title="SQS-DeadLetterQueue",
                                         left=[sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                                             period=core.Duration.minutes(1)
                                         ),
                                               sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                                                   period=core.Duration.minutes(1)
                                               )]),
                          cw.GraphWidget(title="ERROR/WARNING Logs",
                                         left=[log_metric_ERROR],
                                         right=[log_metric_WARNING]),
                          cw.SingleValueWidget(title="Running/Waiting and Dead Jobs",
                                               metrics=[sqs_queue.metric_approximate_number_of_messages_not_visible(
                                                   period=core.Duration.minutes(1)
                                               ),
                                                        sqs_queue.metric_approximate_number_of_messages_visible(
                                                            period=core.Duration.minutes(1)
                                                        ),
                                                        sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                                                            period=core.Duration.minutes(1)
                                                        ),
                                                        sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                                                            period=core.Duration.minutes(1)
                                                        )],
                                               height=6)
                          )
        # Alarm for queue - DLQ
        alarm_DLQ = cw.Alarm(self, "SQS_DLQ",
                             alarm_name="s3-migration-serverless-SQS Dead Letter Queue",
                             metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(),
                             threshold=0,
                             comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
                             evaluation_periods=1,
                             datapoints_to_alarm=1)
        alarm_topic = sns.Topic(self, "SQS queue-DLQ has dead letter")
        alarm_topic.add_subscription(subscription=sub.EmailSubscription(alarm_email))
        alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic))

        # Alarm for an empty queue, i.e. no visible messages and no in-flight (not-visible) messages
        # metric_all_message = cw.MathExpression(
        #     expression="a + b",
        #     label="empty_queue_expression",
        #     using_metrics={
        #         "a": sqs_queue.metric_approximate_number_of_messages_visible(),
        #         "b": sqs_queue.metric_approximate_number_of_messages_not_visible()
        #     }
        # )
        # alarm_0 = cw.Alarm(self, "SQSempty",
        #                    alarm_name="SQS queue empty-Serverless",
        #                    metric=metric_all_message,
        #                    threshold=0,
        #                    comparison_operator=cw.ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD,
        #                    evaluation_periods=3,
        #                    datapoints_to_alarm=3,
        #                    treat_missing_data=cw.TreatMissingData.IGNORE
        #                    )
        # alarm_topic = sns.Topic(self, "SQS queue empty-Serverless")
        # alarm_topic.add_subscription(subscription=sub.EmailSubscription(alarm_email))
        # alarm_0.add_alarm_action(action.SnsAction(alarm_topic))

        # core.CfnOutput(self, "Alarm", value="CloudWatch SQS queue empty Alarm for Serverless: " + alarm_email)
        core.CfnOutput(self, "Dashboard", value="CloudWatch Dashboard name s3_migrate_serverless")
    def __init__(
            self,
            scope: core.Construct,
            _id: str,
            vpc,
            bucket_para,
            # key_name,
            ddb_file_list,
            sqs_queue,
            sqs_queue_DLQ,
            ssm_bucket_para,
            ssm_credential_para,
            # s3bucket,
            **kwargs) -> None:
        super().__init__(scope, _id, **kwargs)

        # Create jobsender ec2 node
        jobsender = ec2.Instance(
            self,
            "jobsender",
            instance_name="s3_migrate_cluster_jobsender",
            instance_type=ec2.InstanceType(
                instance_type_identifier=jobsender_type),
            machine_image=linux_ami,
            # key_name=key_name,
            user_data=ec2.UserData.custom(user_data_jobsender),
            vpc=vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC))

        # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
        # Don't need SSH since we use Session Manager

        # Assign EC2 Policy to use SSM and CWAgent
        jobsender.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "AmazonSSMManagedInstanceCore"))
        jobsender.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "CloudWatchAgentServerPolicy"))

        # jobsender.role.add_managed_policy(
        #     iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"))
        # Don't grant full S3 access to EC2; it would violate least-privilege security rules

        # Create Autoscaling Group with fixed 2*EC2 hosts
        worker_asg = autoscaling.AutoScalingGroup(
            self,
            "worker-asg",
            vpc=vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
            instance_type=ec2.InstanceType(
                instance_type_identifier=worker_type),
            machine_image=linux_ami,
            # key_name=key_name,  # Optional if use SSM-SessionManager
            user_data=ec2.UserData.custom(user_data_worker),
            desired_capacity=1,
            min_capacity=1,
            max_capacity=10,
            spot_price="0.5")

        # TODO: There is no MetricsCollection in the CDK Auto Scaling Group high-level API yet.
        # You need to enable "Group Metrics Collection" in the EC2 Console (Auto Scaling Group - Monitoring tab)
        # for metrics such as GroupDesiredCapacity, GroupInServiceInstances, GroupPendingInstances, etc.,
        # or use the low-level escape hatch sketched below.
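        # Sketch of a possible workaround (not part of the original stack): reach the
        # underlying L1 CfnAutoScalingGroup through the CDK escape hatch and enable group
        # metrics collection there. Property names follow the CloudFormation
        # AWS::AutoScaling::AutoScalingGroup MetricsCollection schema.
        # cfn_worker_asg = worker_asg.node.default_child  # autoscaling.CfnAutoScalingGroup
        # cfn_worker_asg.metrics_collection = [
        #     autoscaling.CfnAutoScalingGroup.MetricsCollectionProperty(
        #         granularity="1Minute",
        #         metrics=["GroupDesiredCapacity",
        #                  "GroupInServiceInstances",
        #                  "GroupPendingInstances"])
        # ]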

        # worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
        # Don't need SSH since we use Session Manager

        # Assign EC2 Policy to use SSM and CWAgent
        worker_asg.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "AmazonSSMManagedInstanceCore"))
        worker_asg.role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "CloudWatchAgentServerPolicy"))

        # Allow EC2 access new DynamoDB Table
        ddb_file_list.grant_full_access(jobsender)
        ddb_file_list.grant_full_access(worker_asg)

        # Allow EC2 access new sqs and its DLQ
        sqs_queue.grant_consume_messages(jobsender)
        sqs_queue.grant_send_messages(jobsender)
        sqs_queue.grant_consume_messages(worker_asg)
        sqs_queue_DLQ.grant_consume_messages(jobsender)

        # Allow EC2 to access SSM Parameter Store to get bucket info and credentials
        ssm_bucket_para.grant_read(jobsender)
        ssm_credential_para.grant_read(jobsender)
        ssm_credential_para.grant_read(worker_asg)

        # Allow EC2 access new s3 bucket
        # s3bucket.grant_read(jobsender)
        # s3bucket.grant_read(worker_asg)

        # Allow EC2 to access the existing S3 buckets
        bucket_name = ''
        for b in bucket_para:
            if bucket_name != b['src_bucket']:  # skip duplicates if the same bucket is listed more than once
                bucket_name = b['src_bucket']
                s3exist_bucket = s3.Bucket.from_bucket_name(
                    self,
                    bucket_name,  # use the bucket name as the construct id
                    bucket_name=bucket_name)
                s3exist_bucket.grant_read(jobsender)
                s3exist_bucket.grant_read(worker_asg)

        # Dashboard to monitor SQS and EC2
        board = cw.Dashboard(self,
                             "s3_migrate",
                             dashboard_name="s3_migrate_cluster")

        ec2_metric_net = cw.Metric(
            namespace="AWS/EC2",
            metric_name="NetworkOut",
            # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
            period=core.Duration.minutes(1),
            statistic="Sum")
        ec2_metric_cpu_max = cw.Metric(
            namespace="AWS/EC2",
            metric_name="CPUUtilization",
            # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
            period=core.Duration.minutes(1),
            statistic="Maximum")
        ec2_metric_cpu_avg = cw.Metric(
            namespace="AWS/EC2",
            metric_name="CPUUtilization",
            # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
            period=core.Duration.minutes(1))

        autoscaling_GroupDesiredCapacity = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupDesiredCapacity",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupInServiceInstances = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupInServiceInstances",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupMinSize = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupMinSize",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        autoscaling_GroupMaxSize = cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name="GroupMaxSize",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))
        # CWAgent collected metric
        cwagent_mem_avg = cw.Metric(namespace="CWAgent",
                                    metric_name="mem_used_percent",
                                    dimensions={
                                        "AutoScalingGroupName":
                                        worker_asg.auto_scaling_group_name
                                    },
                                    statistic="Average",
                                    period=core.Duration.minutes(1))
        cwagent_mem_max = cw.Metric(namespace="CWAgent",
                                    metric_name="mem_used_percent",
                                    dimensions={
                                        "AutoScalingGroupName":
                                        worker_asg.auto_scaling_group_name
                                    },
                                    statistic="Maximum",
                                    period=core.Duration.minutes(1))

        # CWAgent collected application logs - filter metric
        s3_migrate_log = logs.LogGroup(self,
                                       "applog",
                                       log_group_name="s3_migration_log")
        s3_migrate_log.add_metric_filter(
            "ERROR",
            metric_name="ERROR-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal('"ERROR"'))
        s3_migrate_log.add_metric_filter(
            "WARNING",
            metric_name="WARNING-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal('"WARNING"'))
        log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                     metric_name="ERROR-Logs",
                                     statistic="Sum",
                                     period=core.Duration.minutes(1))
        log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                       metric_name="WARNING-Logs",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))

        board.add_widgets(
            cw.GraphWidget(title="EC2-ALL-NETWORK", left=[ec2_metric_net]),
            cw.GraphWidget(title="EC2-ALL-CPU",
                           left=[ec2_metric_cpu_avg, ec2_metric_cpu_max]),
            cw.GraphWidget(title="EC2-AutoscalingGroup-MEMORY",
                           left=[cwagent_mem_max, cwagent_mem_avg]),
            cw.SingleValueWidget(title="EC2-AutoscalingGroup-Capacity",
                                 metrics=[
                                     autoscaling_GroupDesiredCapacity,
                                     autoscaling_GroupInServiceInstances,
                                     autoscaling_GroupMinSize,
                                     autoscaling_GroupMaxSize
                                 ],
                                 height=6),
        )

        board.add_widgets(
            cw.GraphWidget(
                title="SQS-Jobs",
                left=[
                    sqs_queue.metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1))
                ]),
            cw.GraphWidget(
                title="SQS-DeadLetterQueue",
                left=[
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1))
                ]),
            cw.GraphWidget(title="ERROR/WARNING Logs",
                           left=[log_metric_ERROR],
                           right=[log_metric_WARNING],
                           height=6),
            cw.SingleValueWidget(
                title="Running/Waiting and Death Jobs",
                metrics=[
                    sqs_queue.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue.metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_not_visible(
                        period=core.Duration.minutes(1)),
                    sqs_queue_DLQ.
                    metric_approximate_number_of_messages_visible(
                        period=core.Duration.minutes(1))
                ],
                height=6))

        # Scale up when visible messages > 100, evaluated over 3 of 3 five-minute periods
        worker_asg.scale_on_metric(
            "scaleup",
            metric=sqs_queue.metric_approximate_number_of_messages_visible(),
            scaling_steps=[
                autoscaling.ScalingInterval(change=1, lower=100, upper=500),
                autoscaling.ScalingInterval(change=2, lower=500),
                autoscaling.ScalingInterval(change=0, upper=100, lower=0)
            ],
            adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY)

        # Alarm for queue empty and ec2 > 1
        # If the queue is empty (no visible + invisible messages) and more than one EC2 instance
        # is running, raise the alarm and scale EC2 down to 1 instance.
        # This can be adjusted to your scenario; if the jobsender also handles transfers,
        # you could set the Auto Scaling Group to 0 when there are no tasks.
        metric_all_message = cw.MathExpression(
            expression="IF(((a+b) == 0) AND (c >1), 0, 1)",  # 0 (alarm) when a+b == 0 and c > 1
            label="empty_queue_expression",
            using_metrics={
                "a": sqs_queue.metric_approximate_number_of_messages_visible(),
                "b":
                sqs_queue.metric_approximate_number_of_messages_not_visible(),
                "c": autoscaling_GroupInServiceInstances
            })
        alarm_0 = cw.Alarm(
            self,
            "SQSempty",
            alarm_name=
            "s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster",
            metric=metric_all_message,
            threshold=0,
            comparison_operator=cw.ComparisonOperator.
            LESS_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=3,
            datapoints_to_alarm=3,
            treat_missing_data=cw.TreatMissingData.NOT_BREACHING)
        alarm_topic_empty = sns.Topic(
            self, "SQS queue empty and ec2 more than 1 in Cluster")
        # This alarm can serve as the notification that the batch transfer has finished,
        # and it fires only once instead of notifying repeatedly
        alarm_topic_empty.add_subscription(
            subscription=sub.EmailSubscription(alarm_email))
        alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty))

        # If queue empty, set autoscale down to 1 EC2
        action_shutdown = autoscaling.StepScalingAction(
            self,
            "shutdown",
            auto_scaling_group=worker_asg,
            adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY)
        action_shutdown.add_adjustment(adjustment=1, upper_bound=0)
        alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown))

        # While message in SQS-DLQ, alarm to sns
        alarm_DLQ = cw.Alarm(
            self,
            "SQS_DLQ",
            alarm_name=
            "s3-migration-cluster-SQS DLQ more than 1 message-Cluster",
            metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
            ),
            threshold=0,
            comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=3,
            datapoints_to_alarm=3,
            treat_missing_data=cw.TreatMissingData.IGNORE)
        alarm_topic_DLQ = sns.Topic(self,
                                    "SQS DLQ more than 1 message-Cluster")
        alarm_topic_DLQ.add_subscription(
            subscription=sub.EmailSubscription(alarm_email))
        alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ))

        # Output
        core.CfnOutput(self, "JobSenderEC2", value=jobsender.instance_id)
        core.CfnOutput(self,
                       "WorkerEC2AutoscalingGroup",
                       value=worker_asg.auto_scaling_group_name)
        core.CfnOutput(self,
                       "Dashboard",
                       value="CloudWatch Dashboard name s3_migrate_cluster")
        core.CfnOutput(self,
                       "Alarm",
                       value="CloudWatch SQS queue empty Alarm for cluster: " +
                       alarm_email)
Example #7
    def __init__(self, scope: core.Construct, id: str,
                 stream_producer_lg,
                 stream_pipe,
                 py_stream_record_processor_fn,
                 node_stream_record_processor_fn,
                 ** kwargs
                 ) -> None:
        super().__init__(scope, id, **kwargs)

        ##### MONITORING ######

        ##################################################
        ##########        STREAM  METRICS        #########
        ##################################################

        # Shows you the ingestion rate into the shard.
        stream_in_bytes_metric = _cloudwatch.Metric(
            namespace="AWS/Kinesis",
            metric_name="IncomingBytes",
            dimensions={
                "StreamName": f"{stream_pipe.stream_name}"
            },
            label="IncomingBytes",
            period=core.Duration.minutes(30),
            statistic="Sum"
        )
        stream_in_records_metric = _cloudwatch.Metric(
            namespace="AWS/Kinesis",
            metric_name="IncomingRecords",
            dimensions={
                "StreamName": f"{stream_pipe.stream_name}"
            },
            label="IncomingRecords",
            period=core.Duration.minutes(30),
            statistic="Sum"
        )
        stream_w_throttle_metric = _cloudwatch.Metric(
            namespace="AWS/Kinesis",
            metric_name="WriteProvisionedThroughputExceeded",
            dimensions={
                "StreamName": f"{stream_pipe.stream_name}"
            },
            label="WriteProvisionedThroughputExceeded",
            period=core.Duration.minutes(30),
            statistic="Sum"
        )
        stream_r_throttle_metric = _cloudwatch.Metric(
            namespace="AWS/Kinesis",
            metric_name="ReadProvisionedThroughputExceeded",
            dimensions={
                "StreamName": f"{stream_pipe.stream_name}"
            },
            label="ReadProvisionedThroughputExceeded",
            period=core.Duration.minutes(30),
            statistic="Sum"
        )
        stream_put_success_metric = _cloudwatch.Metric(
            namespace="AWS/Kinesis",
            metric_name="PutRecords.Success",
            dimensions={
                "StreamName": f"{stream_pipe.stream_name}"
            },
            label="PutRecords.LatSuccessency",
            period=core.Duration.minutes(30),
            statistic="Sum"
        )
        stream_put_latency_metric = _cloudwatch.Metric(
            namespace="AWS/Kinesis",
            metric_name="PutRecords.Latency",
            dimensions={
                "StreamName": f"{stream_pipe.stream_name}"
            },
            label="PutRecords.Latency",
            period=core.Duration.minutes(30),
            statistic="Sum"
        )
        stream_get_latency_metric = _cloudwatch.Metric(
            namespace="AWS/Kinesis",
            metric_name="GetRecords.Latency",
            dimensions={
                "StreamName": f"{stream_pipe.stream_name}"
            },
            label="GetRecords.Latency",
            period=core.Duration.minutes(30),
            statistic="Sum"
        )

        ##################################################
        ##########    STREAM PRODUCER METRICS    #########
        ##################################################
        # JSON Metric Filter - https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/FilterAndPatternSyntax.html
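        # For reference, the producer is expected to emit structured JSON log events, so a
        # line such as {"records_produced": 25} would match the filter below and its value
        # would be published as the custom metric (the payload shape shown here is only an
        # illustrative assumption, not taken from the producer code).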

        records_produced_metric = _cloudwatch.Metric(
            namespace=f"{global_args.OWNER}-stream-data-processor",
            metric_name="recordsProducedCount",
            label="Total No. Of Records Produced",
            period=core.Duration.minutes(30),
            statistic="Sum"
        )

        records_produced_metric_filter = _logs.MetricFilter(self, "recordsProducedCountFilter",
                                                            filter_pattern=_logs.FilterPattern.exists(
                                                                "$.records_produced"),
                                                            log_group=stream_producer_lg,
                                                            metric_namespace=records_produced_metric.namespace,
                                                            metric_name=records_produced_metric.metric_name,
                                                            default_value=0,
                                                            metric_value="$.records_produced",
                                                            )

        ##################################################
        ##########    STREAM CONSUMER METRICS    #########
        ##################################################

        py_records_processed_metric = _cloudwatch.Metric(
            namespace=f"{global_args.OWNER}-stream-data-processor",
            # dimensions={
            #     "RecordsProcessed": "py_processor"
            # },
            metric_name="pyRecordsProcessedCount",
            label="Total No. Of Records Processed",
            period=core.Duration.minutes(30),
            statistic="Sum"
        )

        py_stream_record_processor = _logs.MetricFilter(self, "processedRecordCountFilter01",
                                                        filter_pattern=_logs.FilterPattern.exists(
                                                            "$.records_processed"),
                                                        log_group=py_stream_record_processor_fn.log_group,
                                                        metric_namespace=py_records_processed_metric.namespace,
                                                        metric_name=py_records_processed_metric.metric_name,
                                                        default_value=0,
                                                        metric_value="$.records_processed",
                                                        )
        node_records_processed_metric = _cloudwatch.Metric(
            namespace=f"{global_args.OWNER}-stream-data-processor",
            metric_name="nodeRecordsProcessedCount",
            label="Total No. Of Records Processed",
            period=core.Duration.minutes(30),
            statistic="Sum"
        )
        node_stream_record_processor = _logs.MetricFilter(self, "processedRecordCountFilter02",
                                                          filter_pattern=_logs.FilterPattern.exists(
                                                              "$.records_processed"),
                                                          log_group=node_stream_record_processor_fn.log_group,
                                                          metric_namespace=node_records_processed_metric.namespace,
                                                          metric_name=node_records_processed_metric.metric_name,
                                                          default_value=0,
                                                          metric_value="$.records_processed",
                                                          )

        # Create CloudWatch Dashboard for Streams
        stream_processor_dashboard = _cloudwatch.Dashboard(self,
                                                           id="streamProcessorDashboard",
                                                           dashboard_name="Stream-Processor"
                                                           )

        stream_processor_dashboard.add_widgets(
            _cloudwatch.SingleValueWidget(
                title="TotalRecordsProduced",
                metrics=[records_produced_metric]
            ),
            _cloudwatch.SingleValueWidget(
                title="RecordsProcessed-by-Python-Consumer",
                metrics=[py_records_processed_metric]
            ),
            _cloudwatch.SingleValueWidget(
                title="RecordsProcessed-by-Node-Consumer",
                metrics=[node_records_processed_metric]
            )
        )

        # Stream Incoming bytes Graph
        stream_processor_dashboard.add_widgets(
            _cloudwatch.Row(
                _cloudwatch.GraphWidget(
                    title="Shard Ingestion Metrics",
                    left=[stream_in_bytes_metric],
                    right=[stream_in_records_metric]
                ),
                _cloudwatch.GraphWidget(
                    title="Shard Throttle Metrics",
                    left=[stream_w_throttle_metric],
                    right=[stream_r_throttle_metric]
                )
            )
        )

        stream_processor_dashboard.add_widgets(
            _cloudwatch.Row(
                _cloudwatch.GraphWidget(
                    title="Stream Put Latency",
                    left=[stream_put_latency_metric]
                ),
                _cloudwatch.GraphWidget(
                    title="Stream Get Latency",
                    left=[stream_get_latency_metric]
                ),
                _cloudwatch.GraphWidget(
                    title="Stream Put Success",
                    left=[stream_put_success_metric]
                )
            )
        )

        ###########################################
        ################# OUTPUTS #################
        ###########################################

        output_0 = core.CfnOutput(self,
                                  "SecuirtyAutomationFrom",
                                  value=f"{global_args.SOURCE_INFO}",
                                  description="To know more about this automation stack, check out our github page."
                                  )
    def __init__(self, scope: core.Construct, construct_id: str,
                 **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        bucket_name = 'devassoc-monitored'
        bucket = s3.Bucket(self,
                           'bucket-monitored',
                           bucket_name=bucket_name,
                           removal_policy=core.RemovalPolicy.DESTROY,
                           auto_delete_objects=True)

        core.CfnOutput(self, 'monitored-bucket', value=bucket.bucket_name)

        size_metric = cw.Metric(namespace='AWS/S3',
                                metric_name='BucketSizeBytes',
                                dimensions={
                                    'BucketName': bucket.bucket_name,
                                    'StorageType': 'StandardStorage'
                                },
                                period=core.Duration.days(1))
        size_alarm = size_metric.create_alarm(
            self,
            'bucket-alarm',
            alarm_name='S3 Storage Alarm',
            comparison_operator=cw.ComparisonOperator.
            GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=1,
            period=core.Duration.days(1),
            threshold=1000,
            actions_enabled=True)
        size_topic = sns.Topic(self,
                               'size-topic',
                               display_name='My S3 Alarm List')
        email_param = ssm.StringParameter.from_string_parameter_name(
            self, 'email-param', 'notification-email')
        size_topic_sub = sns.Subscription(
            self,
            'size-topic-sub',
            topic=size_topic,
            protocol=sns.SubscriptionProtocol.EMAIL,
            endpoint=email_param.string_value)
        size_action = cwa.SnsAction(size_topic)
        size_alarm.add_alarm_action(size_action)

        bucket_name = 'devassoc-s3-logs'
        log_bucket = s3.Bucket(self,
                               'bucket-s3-logs',
                               bucket_name=bucket_name,
                               removal_policy=core.RemovalPolicy.DESTROY,
                               auto_delete_objects=True)
        s3_trail = ct.Trail(self,
                            'bucket-trail',
                            bucket=log_bucket,
                            trail_name='s3_logs')
        s3_trail.add_s3_event_selector([ct.S3EventSelector(bucket=bucket)])
        s3_trail.log_all_s3_data_events()

        single_value_widget = cw.SingleValueWidget(metrics=[size_metric])
        graph_widget = cw.GraphWidget(left=[size_metric])
        cw.Dashboard(self,
                     'cloudwatch-dashboard',
                     dashboard_name='S3Dashboard',
                     widgets=[[single_value_widget, graph_widget]])