def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    """Provision a Lambda function that emits structured logs, plus the
    custom CloudWatch metric, log metric filter, alarm and dashboard that
    track its third-party API errors.

    :param scope: parent construct.
    :param id: logical id of this stack.
    :raises OSError: if the Lambda source file cannot be read.
    """
    super().__init__(scope, id, **kwargs)

    # Read Lambda Code. Fail fast: every resource below needs the code body,
    # so swallowing the error would only defer the crash to a NameError on
    # konstone_custom_metric_fn_code.
    try:
        with open(
                "serverless_stacks/lambda_src/konstone_custom_metric_log_generator.py",
                mode="r") as f:
            konstone_custom_metric_fn_code = f.read()
    except OSError:
        print("Unable to read Lambda Function Code")
        raise

    konstone_custom_metric_fn = _lambda.Function(
        self,
        "konstoneFunction",
        function_name="konstone_custom_metric_fn",
        runtime=_lambda.Runtime.PYTHON_3_7,
        handler="index.lambda_handler",
        code=_lambda.InlineCode(konstone_custom_metric_fn_code),
        timeout=core.Duration.seconds(3),
        reserved_concurrent_executions=1,
        environment={
            "LOG_LEVEL": "INFO",
            "PERCENTAGE_ERRORS": "75"
        })

    # Create Custom Loggroup.
    # The name must match the group Lambda writes to (/aws/lambda/<fn-name>)
    # so the metric filter below sees the function's log events.
    konstone_custom_metric_lg = _logs.LogGroup(
        self,
        "konstoneLoggroup",
        log_group_name=f"/aws/lambda/{konstone_custom_metric_fn.function_name}",
        removal_policy=core.RemovalPolicy.DESTROY,
        retention=_logs.RetentionDays.ONE_DAY,
    )

    # Create Custom Metric Namespace (plain string: the original used an
    # f-string with no placeholders).
    third_party_error_metric = _cloudwatch.Metric(
        namespace="third-party-error-metric",
        metric_name="third_party_error_metric",
        label="Total No. of Third Party API Errors",
        period=core.Duration.minutes(1),
        statistic="Sum")

    # Create Custom Metric Log Filter: count log events whose JSON field
    # $.third_party_api_error is true; emit 0 when no event matches.
    third_party_error_metric_filter = _logs.MetricFilter(
        self,
        "thirdPartyApiErrorMetricFilter",
        filter_pattern=_logs.FilterPattern.boolean_value(
            "$.third_party_api_error", True),
        log_group=konstone_custom_metric_lg,
        metric_namespace=third_party_error_metric.namespace,
        metric_name=third_party_error_metric.metric_name,
        default_value=0,
        metric_value="1")

    # Create Third Party Error Alarm: >= 2 errors in one of the last two
    # 1-minute periods triggers; missing data is treated as healthy.
    third_party_error_alarm = _cloudwatch.Alarm(
        self,
        "thirdPartyApiErrorAlarm",
        alarm_description="Alert if 3rd party API has more than 2 errors in the last two minutes",
        alarm_name="third-party-api-alarm",
        metric=third_party_error_metric,
        comparison_operator=_cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        threshold=2,
        evaluation_periods=2,
        datapoints_to_alarm=1,
        period=core.Duration.minutes(1),
        treat_missing_data=_cloudwatch.TreatMissingData.NOT_BREACHING)

    # Create CloudWatch Dashboard
    konstone_dashboard = _cloudwatch.Dashboard(
        self,
        id="konstoneDashboard",
        dashboard_name="Konstone-App-Live-Dashboard")

    # Add Lambda Function Metrics to Dashboard
    konstone_dashboard.add_widgets(
        _cloudwatch.Row(
            _cloudwatch.GraphWidget(
                title="Backend-Invocations",
                left=[
                    konstone_custom_metric_fn.metric_invocations(
                        statistic="Sum", period=core.Duration.minutes(1))
                ]),
            _cloudwatch.GraphWidget(
                title="Backend-Errors",
                left=[
                    konstone_custom_metric_fn.metric_errors(
                        statistic="Sum", period=core.Duration.minutes(1))
                ])))

    # Add 3rd Party API Error to Dashboard
    konstone_dashboard.add_widgets(
        _cloudwatch.Row(
            _cloudwatch.SingleValueWidget(
                title="Third Party API Errors",
                metrics=[third_party_error_metric])))
def __init__(
        self,
        scope: core.Construct,
        _id: str,
        vpc,
        bucket_para,
        # key_name,
        ddb_file_list,
        sqs_queue,
        sqs_queue_DLQ,
        ssm_bucket_para,
        ssm_credential_para,
        s3bucket,
        s3_deploy,
        **kwargs) -> None:
    """EC2-cluster stack for s3 migration: a single jobsender node plus an
    autoscaling worker fleet, wired to the shared DDB table / SQS queues,
    with a CloudWatch dashboard, scale-up/scale-down policies and alarms.

    NOTE(review): relies on module-level names defined elsewhere in the file
    (cw_agent_config, user_data_part1/2, user_data_jobsender_p,
    user_data_worker_p, jobsender_type, worker_type, linux_ami, alarm_email)
    — verify against the full module.
    """
    super().__init__(scope, _id, **kwargs)

    # Create environment variable into userdata: exported for the current
    # boot (env_var) and persisted into /etc/rc.local for reboots (env_var_st).
    env_var = f'export table_queue_name={ddb_file_list.table_name}\n' \
              f'export sqs_queue_name={sqs_queue.queue_name}\n' \
              f'export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\n'
    env_var_st = f'echo \"export table_queue_name={ddb_file_list.table_name}\" >> /etc/rc.local\n' \
                 f'echo \"export sqs_queue_name={sqs_queue.queue_name}\" >> /etc/rc.local\n' \
                 f'echo \"export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\" >> /etc/rc.local\n'

    # Create log group and put group name into userdata.
    # Both collect_list entries of the CloudWatch-Agent config are pointed at
    # this group; the escaped ${aws:...} placeholders are resolved by the
    # agent on the instance, not by Python/CloudFormation.
    s3_migrate_log = logs.LogGroup(self, "applog")
    cw_agent_config['logs']['logs_collected']['files']['collect_list'][0][
        'log_group_name'] = s3_migrate_log.log_group_name
    cw_agent_config['logs']['logs_collected']['files']['collect_list'][1][
        'log_group_name'] = s3_migrate_log.log_group_name
    cw_agent_config['metrics']['append_dimensions'][
        'AutoScalingGroupName'] = "\\${aws:AutoScalingGroupName}"
    cw_agent_config['metrics']['append_dimensions'][
        'InstanceId'] = "\\${aws:InstanceId}"
    # json.dumps doubles the backslashes; collapse them back so the agent
    # sees a single escape.
    cw_agent_config_str = json.dumps(cw_agent_config,
                                     indent=4).replace("\\\\", "\\")
    userdata_head = user_data_part1 + cw_agent_config_str + user_data_part2 + \
                    s3_deploy.bucket_name + " .\n" + env_var + env_var_st
    jobsender_userdata = userdata_head + user_data_jobsender_p
    worker_userdata = userdata_head + user_data_worker_p

    # Create jobsender ec2 node — a min-0/max-1 ASG so the single node is
    # replaced automatically if it dies.
    jobsender = autoscaling.AutoScalingGroup(
        self,
        "jobsender",
        instance_type=ec2.InstanceType(
            instance_type_identifier=jobsender_type),
        machine_image=linux_ami,
        # key_name=key_name,
        user_data=ec2.UserData.custom(jobsender_userdata),
        vpc=vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
        desired_capacity=1,
        min_capacity=0,
        max_capacity=1)
    # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
    # Don't need SSH since we use Session Manager

    # Assign EC2 Policy to use SSM and CWAgent
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "AmazonSSMManagedInstanceCore"))
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "CloudWatchAgentServerPolicy"))
    # jobsender.role.add_managed_policy(
    #     iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"))
    # Don't give full access s3 to ec2, violate security rule

    # Create Autoscaling Group with fixed 2*EC2 hosts (spot instances,
    # capped bid of $0.5/hr, scales 2..10 via the policies added below)
    worker_asg = autoscaling.AutoScalingGroup(
        self,
        "worker-asg",
        vpc=vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
        instance_type=ec2.InstanceType(
            instance_type_identifier=worker_type),
        machine_image=linux_ami,
        # key_name=key_name,  # Optional if use SSM-SessionManager
        user_data=ec2.UserData.custom(worker_userdata),
        desired_capacity=2,
        min_capacity=2,
        max_capacity=10,
        spot_price="0.5")
    # TODO: There is no MetricsCollection in CDK autoscaling group high level API yet.
    # You need to enable "Group Metrics Collection" in EC2 Console Autoscaling Group - Monitoring tab for metric:
    # GroupDesiredCapacity, GroupInServiceInstances, GroupPendingInstances and etc.
    # worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH")
    # Don't need SSH since we use Session Manager

    # Assign EC2 Policy to use SSM and CWAgent
    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "AmazonSSMManagedInstanceCore"))
    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "CloudWatchAgentServerPolicy"))

    # Allow EC2 access new DynamoDB Table
    ddb_file_list.grant_full_access(jobsender)
    ddb_file_list.grant_full_access(worker_asg)

    # Allow EC2 access new sqs and its DLQ
    sqs_queue.grant_consume_messages(jobsender)
    sqs_queue.grant_send_messages(jobsender)
    sqs_queue.grant_consume_messages(worker_asg)
    sqs_queue_DLQ.grant_consume_messages(jobsender)

    # Allow EC2 access SSM Parameter Store, get bucket infor and get credential
    ssm_bucket_para.grant_read(jobsender)
    ssm_credential_para.grant_read(jobsender)
    ssm_credential_para.grant_read(worker_asg)

    # Allow EC2 access source code on s3_deploy bucket
    s3_deploy.grant_read(jobsender)
    s3_deploy.grant_read(worker_asg)

    # Allow EC2 access new s3 bucket
    s3bucket.grant_read(jobsender)
    s3bucket.grant_read(worker_asg)

    # Allow EC2 access exist s3 bucket for PUT mode: readonly access the source buckets
    bucket_name = ''
    for b in bucket_para:
        if bucket_name != b['src_bucket']:  # skip if the same bucket is listed more than once
            bucket_name = b['src_bucket']
            s3exist_bucket = s3.Bucket.from_bucket_name(
                self,
                bucket_name,  # use the bucket name as the construct id
                bucket_name=bucket_name)
            s3exist_bucket.grant_read(jobsender)
            s3exist_bucket.grant_read(worker_asg)
    # Allow EC2 access exist s3 bucket for GET mode: read and write access the destination buckets
    # bucket_name = ''
    # for b in bucket_para:
    #     if bucket_name != b['des_bucket']:  # skip if the same bucket is listed more than once
    #         bucket_name = b['des_bucket']
    #         s3exist_bucket = s3.Bucket.from_bucket_name(self,
    #                                                     bucket_name,  # use the bucket name as the construct id
    #                                                     bucket_name=bucket_name)
    #         s3exist_bucket.grant_read_write(jobsender)
    #         s3exist_bucket.grant_read_write(worker_asg)

    # Dashboard to monitor SQS and EC2
    board = cw.Dashboard(self, "s3_migrate")
    ec2_metric_cpu_avg = cw.Metric(namespace="AWS/EC2",
                                   metric_name="CPUUtilization",
                                   dimensions={
                                       "AutoScalingGroupName":
                                       worker_asg.auto_scaling_group_name
                                   },
                                   period=core.Duration.minutes(1))
    # SEARCH expression: per-instance NetworkOut, since the plain AWS/EC2
    # metric is per-instance rather than per-ASG.
    ec2_metric_net_out = cw.MathExpression(
        expression=
        "SEARCH('{AWS/EC2, InstanceId} NetworkOut', 'Average', 60)",
        label="EC2-NetworkOut",
        using_metrics={})
    autoscaling_GroupDesiredCapacity = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupDesiredCapacity",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))
    autoscaling_GroupInServiceInstances = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupInServiceInstances",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))
    autoscaling_GroupMinSize = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupMinSize",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))
    autoscaling_GroupMaxSize = cw.Metric(
        namespace="AWS/AutoScaling",
        metric_name="GroupMaxSize",
        dimensions={
            "AutoScalingGroupName": worker_asg.auto_scaling_group_name
        },
        period=core.Duration.minutes(1))

    # CWAgent collected metric (memory / disk / tcp connections per ASG)
    cwagent_mem_avg = cw.MathExpression(
        expression=
        "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName=" +
        worker_asg.auto_scaling_group_name +
        " AND MetricName=mem_used_percent)', 'Average', 60)",
        label="mem_avg",
        using_metrics={})
    cwagent_disk_avg = cw.MathExpression(
        expression=
        "SEARCH('{CWAgent, path, InstanceId, AutoScalingGroupName, device, fstype} "
        "(AutoScalingGroupName=" + worker_asg.auto_scaling_group_name +
        " AND MetricName=disk_used_percent AND path=\"/\")', 'Average', 60)",
        label="disk_avg",
        using_metrics={})
    cwagent_net_tcp = cw.MathExpression(
        expression=
        "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName=" +
        worker_asg.auto_scaling_group_name +
        " AND MetricName=tcp_established)', 'Average', 60)",
        label="tcp_conn",
        using_metrics={})

    # CWAgent collected application logs - filter metric
    # (space-delimited app log format: date time info host --->Verb bytes key)
    s3_migrate_log.add_metric_filter(
        "Completed-bytes",
        metric_name="Completed-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[date, time, info, hs, p="--->Complete", bytes, key]'))
    s3_migrate_log.add_metric_filter(
        "Uploading-bytes",
        metric_name="Uploading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[date, time, info, hs, p="--->Uploading", bytes, key]'))
    s3_migrate_log.add_metric_filter(
        "Downloading-bytes",
        metric_name="Downloading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[date, time, info, hs, p="--->Downloading", bytes, key]'))
    traffic_metric_Complete = cw.Metric(namespace="s3_migrate",
                                        metric_name="Completed-bytes",
                                        statistic="Sum",
                                        period=core.Duration.minutes(1))
    traffic_metric_Upload = cw.Metric(namespace="s3_migrate",
                                      metric_name="Uploading-bytes",
                                      statistic="Sum",
                                      period=core.Duration.minutes(1))
    traffic_metric_Download = cw.Metric(namespace="s3_migrate",
                                        metric_name="Downloading-bytes",
                                        statistic="Sum",
                                        period=core.Duration.minutes(1))
    # Count ERROR / WARNING log lines
    s3_migrate_log.add_metric_filter(
        "ERROR",
        metric_name="ERROR-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"ERROR"'))
    s3_migrate_log.add_metric_filter(
        "WARNING",
        metric_name="WARNING-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"WARNING"'))
    log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                 metric_name="ERROR-Logs",
                                 statistic="Sum",
                                 period=core.Duration.minutes(1))
    log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                   metric_name="WARNING-Logs",
                                   statistic="Sum",
                                   period=core.Duration.minutes(1))

    # Row 1: traffic, logs and SQS job state
    board.add_widgets(
        cw.GraphWidget(title="S3-MIGRATION-TOTAL-TRAFFIC",
                       left=[
                           traffic_metric_Complete, traffic_metric_Upload,
                           traffic_metric_Download
                       ],
                       left_y_axis=cw.YAxisProps(label="Bytes/min",
                                                 show_units=False)),
        cw.GraphWidget(title="ERROR/WARNING LOGS",
                       left=[log_metric_ERROR],
                       left_y_axis=cw.YAxisProps(label="Count",
                                                 show_units=False),
                       right=[log_metric_WARNING],
                       right_y_axis=cw.YAxisProps(label="Count",
                                                  show_units=False)),
        cw.GraphWidget(
            title="SQS-JOBS",
            left=[
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1))
            ]),
        cw.SingleValueWidget(
            title="RUNNING, WAITING & DEATH JOBS",
            metrics=[
                sqs_queue.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1))
            ],
            height=6))
    # Row 2: worker fleet health
    board.add_widgets(
        cw.GraphWidget(title="EC2-AutoscalingGroup-TCP",
                       left=[cwagent_net_tcp],
                       left_y_axis=cw.YAxisProps(label="Count",
                                                 show_units=False)),
        cw.GraphWidget(title="EC2-AutoscalingGroup-CPU/MEMORY",
                       left=[ec2_metric_cpu_avg, cwagent_mem_avg],
                       left_y_axis=cw.YAxisProps(max=100,
                                                 min=0,
                                                 label="%",
                                                 show_units=False)),
        cw.GraphWidget(title="EC2-AutoscalingGroup-DISK",
                       left=[cwagent_disk_avg],
                       left_y_axis=cw.YAxisProps(max=100,
                                                 min=0,
                                                 label="%",
                                                 show_units=False)),
        cw.SingleValueWidget(title="EC2-AutoscalingGroup-CAPACITY",
                             metrics=[
                                 autoscaling_GroupDesiredCapacity,
                                 autoscaling_GroupInServiceInstances,
                                 autoscaling_GroupMinSize,
                                 autoscaling_GroupMaxSize
                             ],
                             height=6))
    # Row 3: network out
    board.add_widgets(
        cw.GraphWidget(title="EC2-NetworkOut",
                       left=[ec2_metric_net_out],
                       left_y_axis=cw.YAxisProps(label="Bytes/min",
                                                 show_units=False)))

    # Autoscaling up when visible message > 100 in 5 mins
    worker_asg.scale_on_metric(
        "scaleup",
        metric=sqs_queue.metric_approximate_number_of_messages_visible(),
        scaling_steps=[
            autoscaling.ScalingInterval(change=1, lower=100, upper=500),
            autoscaling.ScalingInterval(change=2, lower=500),
            autoscaling.ScalingInterval(change=0, upper=100, lower=0)
        ],
        adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY)

    # Alarm for queue empty and ec2 > 1
    # Alarm when the queue is empty (no Visible + Invisible messages) and
    # more than one EC2 instance is in service; the alarm then shrinks the
    # fleet to one instance. If the jobsender also participates in the
    # transfer you could instead scale the group to 0 when there is no job.
    metric_all_message = cw.MathExpression(
        # Evaluates to 0 (alarm) when a+b == 0 AND c > 1, otherwise 1.
        expression="IF(((a+b) == 0) AND (c >1), 0, 1)",
        label="empty_queue_expression",
        using_metrics={
            "a": sqs_queue.metric_approximate_number_of_messages_visible(),
            "b": sqs_queue.metric_approximate_number_of_messages_not_visible(),
            "c": autoscaling_GroupInServiceInstances
        })
    alarm_0 = cw.Alarm(
        self,
        "SQSempty",
        alarm_name=
        "s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster",
        metric=metric_all_message,
        threshold=0,
        comparison_operator=cw.ComparisonOperator.
        LESS_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.NOT_BREACHING)
    alarm_topic_empty = sns.Topic(
        self, "SQS queue empty and ec2 more than 1 in Cluster")
    # This alarm also serves as the "batch transfer finished" notification;
    # firing once means subscribers are not notified repeatedly.
    alarm_topic_empty.add_subscription(
        subscription=sub.EmailSubscription(alarm_email))
    alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty))

    # If queue empty, set autoscale down to 1 EC2
    action_shutdown = autoscaling.StepScalingAction(
        self,
        "shutdown",
        auto_scaling_group=worker_asg,
        adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY)
    action_shutdown.add_adjustment(adjustment=1, upper_bound=0)
    alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown))

    # While message in SQS-DLQ, alarm to sns
    alarm_DLQ = cw.Alarm(
        self,
        "SQS_DLQ",
        alarm_name=
        "s3-migration-cluster-SQS DLQ more than 1 message-Cluster",
        metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
        ),
        threshold=0,
        comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.IGNORE)
    alarm_topic_DLQ = sns.Topic(self,
                                "SQS DLQ more than 1 message-Cluster")
    alarm_topic_DLQ.add_subscription(
        subscription=sub.EmailSubscription(alarm_email))
    alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ))

    # Output
    core.CfnOutput(self, "LogGroup", value=s3_migrate_log.log_group_name)
    core.CfnOutput(self,
                   "Dashboard",
                   value="CloudWatch Dashboard name s3_migrate_cluster")
    core.CfnOutput(self,
                   "Alarm",
                   value="CloudWatch SQS queue empty Alarm for cluster: " +
                   alarm_email)
def __init__(self, scope: core.Construct, _id: str, **kwargs) -> None:
    """Serverless stack for s3 migration: SSM parameters, DynamoDB job
    table, SQS queue + DLQ, a check-IP REST API, worker/jobsender Lambdas,
    log metric filters, a dashboard and a DLQ alarm.

    NOTE(review): relies on module-level config names defined elsewhere in
    the file (ssm_parameter_credentials, bucket_para, ignore_list, JobType,
    Des_bucket_default, alarm_email, etc.) — verify against the full module.
    """
    super().__init__(scope, _id, **kwargs)

    # Setup SSM parameter of credentials, bucket_para, ignore_list
    # (credentials are an existing SecureString; the other two are created here)
    ssm_credential_para = ssm.StringParameter.from_secure_string_parameter_attributes(
        self,
        "ssm_parameter_credentials",
        parameter_name=ssm_parameter_credentials,
        version=1)
    ssm_bucket_para = ssm.StringParameter(self,
                                          "s3bucket_serverless",
                                          string_value=json.dumps(
                                              bucket_para, indent=4))
    ssm_parameter_ignore_list = ssm.StringParameter(
        self, "s3_migrate_ignore_list", string_value=ignore_list)

    # Setup DynamoDB (job list keyed on object Key, plus a GSI on the
    # destination bucket for comparing transfers)
    ddb_file_list = ddb.Table(self,
                              "s3migrate_serverless",
                              partition_key=ddb.Attribute(
                                  name="Key",
                                  type=ddb.AttributeType.STRING),
                              billing_mode=ddb.BillingMode.PAY_PER_REQUEST)
    ddb_file_list.add_global_secondary_index(
        partition_key=ddb.Attribute(name="desBucket",
                                    type=ddb.AttributeType.STRING),
        index_name="desBucket-index",
        projection_type=ddb.ProjectionType.INCLUDE,
        non_key_attributes=["desKey", "versionId"])

    # Setup SQS — visibility timeout matches the 15-minute Lambda timeout;
    # jobs failing 60 deliveries land in the DLQ.
    sqs_queue_DLQ = sqs.Queue(self,
                              "s3migrate_serverless_Q_DLQ",
                              visibility_timeout=core.Duration.minutes(15),
                              retention_period=core.Duration.days(14))
    sqs_queue = sqs.Queue(self,
                          "s3migrate_serverless_Q",
                          visibility_timeout=core.Duration.minutes(15),
                          retention_period=core.Duration.days(14),
                          dead_letter_queue=sqs.DeadLetterQueue(
                              max_receive_count=60, queue=sqs_queue_DLQ))

    # Setup API for Lambda to get IP address (for debug networking routing purpose)
    # Mock integration: returns the caller's source IP without any backend.
    checkip = api.RestApi(
        self,
        "lambda-checkip-api",
        cloud_watch_role=True,
        deploy=True,
        description="For Lambda get IP address",
        default_integration=api.MockIntegration(
            integration_responses=[
                api.IntegrationResponse(status_code="200",
                                        response_templates={
                                            "application/json":
                                            "$context.identity.sourceIp"
                                        })
            ],
            request_templates={"application/json": '{"statusCode": 200}'}),
        endpoint_types=[api.EndpointType.REGIONAL])
    checkip.root.add_method("GET",
                            method_responses=[
                                api.MethodResponse(
                                    status_code="200",
                                    response_models={
                                        "application/json":
                                        api.Model.EMPTY_MODEL
                                    })
                            ])

    # Setup Lambda functions — worker does the per-object transfer,
    # jobsender scans/compares buckets and enqueues jobs.
    handler = lam.Function(self,
                           "s3-migrate-worker",
                           code=lam.Code.asset("./lambda"),
                           handler="lambda_function_worker.lambda_handler",
                           runtime=lam.Runtime.PYTHON_3_8,
                           memory_size=1024,
                           timeout=core.Duration.minutes(15),
                           tracing=lam.Tracing.ACTIVE,
                           environment={
                               'table_queue_name': ddb_file_list.table_name,
                               'Des_bucket_default': Des_bucket_default,
                               'Des_prefix_default': Des_prefix_default,
                               'StorageClass': StorageClass,
                               'checkip_url': checkip.url,
                               'ssm_parameter_credentials':
                               ssm_parameter_credentials,
                               'JobType': JobType,
                               'MaxRetry': MaxRetry,
                               'MaxThread': MaxThread,
                               'MaxParallelFile': MaxParallelFile,
                               'JobTimeout': JobTimeout,
                               'UpdateVersionId': UpdateVersionId,
                               'GetObjectWithVersionId':
                               GetObjectWithVersionId
                           })

    handler_jobsender = lam.Function(
        self,
        "s3-migrate-jobsender",
        code=lam.Code.asset("./lambda"),
        handler="lambda_function_jobsender.lambda_handler",
        runtime=lam.Runtime.PYTHON_3_8,
        memory_size=1024,
        timeout=core.Duration.minutes(15),
        tracing=lam.Tracing.ACTIVE,
        environment={
            'table_queue_name': ddb_file_list.table_name,
            'StorageClass': StorageClass,
            'checkip_url': checkip.url,
            'sqs_queue': sqs_queue.queue_name,
            'ssm_parameter_credentials': ssm_parameter_credentials,
            'ssm_parameter_ignore_list':
            ssm_parameter_ignore_list.parameter_name,
            'ssm_parameter_bucket': ssm_bucket_para.parameter_name,
            'JobType': JobType,
            'MaxRetry': MaxRetry,
            'JobsenderCompareVersionId': JobsenderCompareVersionId
        })

    # Allow lambda read/write DDB, SQS
    ddb_file_list.grant_read_write_data(handler)
    ddb_file_list.grant_read_write_data(handler_jobsender)
    sqs_queue.grant_send_messages(handler_jobsender)
    # SQS trigger Lambda worker (one job per invocation)
    handler.add_event_source(SqsEventSource(sqs_queue, batch_size=1))

    # Option1: Create S3 Bucket, all new objects in this bucket will be transmitted by Lambda Worker
    s3bucket = s3.Bucket(self, "s3_new_migrate")
    s3bucket.grant_read(handler)
    s3bucket.add_event_notification(s3.EventType.OBJECT_CREATED,
                                    s3n.SqsDestination(sqs_queue))

    # Option2: Allow Exist S3 Buckets to be read by Lambda functions.
    # Lambda Jobsender will scan and compare the these buckets and trigger Lambda Workers to transmit
    bucket_name = ''
    for b in bucket_para:
        if bucket_name != b['src_bucket']:  # skip if the same bucket is listed more than once
            bucket_name = b['src_bucket']
            s3exist_bucket = s3.Bucket.from_bucket_name(
                self,
                bucket_name,  # use the bucket name as the construct id
                bucket_name=bucket_name)
            if JobType == 'PUT':
                s3exist_bucket.grant_read(handler_jobsender)
                s3exist_bucket.grant_read(handler)
            else:  # 'GET' mode
                s3exist_bucket.grant_read_write(handler_jobsender)
                s3exist_bucket.grant_read_write(handler)

    # Allow Lambda read ssm parameters
    ssm_bucket_para.grant_read(handler_jobsender)
    ssm_credential_para.grant_read(handler)
    ssm_credential_para.grant_read(handler_jobsender)
    ssm_parameter_ignore_list.grant_read(handler_jobsender)

    # Schedule cron event to trigger Lambda Jobsender per hour:
    event.Rule(self,
               'cron_trigger_jobsender',
               schedule=event.Schedule.rate(core.Duration.hours(1)),
               targets=[target.LambdaFunction(handler_jobsender)])
    # TODO: Trigger event imediately, add custom resource lambda to invoke handler_jobsender

    # Create Lambda logs filter to create network traffic metric
    # (space-delimited worker log format: info date sn --->Verb bytes key)
    handler.log_group.add_metric_filter(
        "Completed-bytes",
        metric_name="Completed-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[info, date, sn, p="--->Complete", bytes, key]'))
    handler.log_group.add_metric_filter(
        "Uploading-bytes",
        metric_name="Uploading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[info, date, sn, p="--->Uploading", bytes, key]'))
    handler.log_group.add_metric_filter(
        "Downloading-bytes",
        metric_name="Downloading-bytes",
        metric_namespace="s3_migrate",
        metric_value="$bytes",
        filter_pattern=logs.FilterPattern.literal(
            '[info, date, sn, p="--->Downloading", bytes, key]'))
    # Parse the memory column out of the Lambda runtime REPORT line
    handler.log_group.add_metric_filter(
        "MaxMemoryUsed",
        metric_name="MaxMemoryUsed",
        metric_namespace="s3_migrate",
        metric_value="$memory",
        filter_pattern=logs.FilterPattern.literal(
            '[head="REPORT", a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, '
            'a13, a14, a15, a16, memory, MB="MB", rest]'))
    lambda_metric_Complete = cw.Metric(namespace="s3_migrate",
                                       metric_name="Completed-bytes",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
    lambda_metric_Upload = cw.Metric(namespace="s3_migrate",
                                     metric_name="Uploading-bytes",
                                     statistic="Sum",
                                     period=core.Duration.minutes(1))
    lambda_metric_Download = cw.Metric(namespace="s3_migrate",
                                       metric_name="Downloading-bytes",
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
    lambda_metric_MaxMemoryUsed = cw.Metric(
        namespace="s3_migrate",
        metric_name="MaxMemoryUsed",
        statistic="Maximum",
        period=core.Duration.minutes(1))
    # Count ERROR / WARNING / timeout log lines
    handler.log_group.add_metric_filter(
        "ERROR",
        metric_name="ERROR-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"ERROR"'))
    handler.log_group.add_metric_filter(
        "WARNING",
        metric_name="WARNING-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"WARNING"'))
    # Task timed out
    handler.log_group.add_metric_filter(
        "TIMEOUT",
        metric_name="TIMEOUT-Logs",
        metric_namespace="s3_migrate",
        metric_value="1",
        filter_pattern=logs.FilterPattern.literal('"Task timed out"'))
    log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                 metric_name="ERROR-Logs",
                                 statistic="Sum",
                                 period=core.Duration.minutes(1))
    log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                   metric_name="WARNING-Logs",
                                   statistic="Sum",
                                   period=core.Duration.minutes(1))
    log_metric_TIMEOUT = cw.Metric(namespace="s3_migrate",
                                   metric_name="TIMEOUT-Logs",
                                   statistic="Sum",
                                   period=core.Duration.minutes(1))

    # Dashboard to monitor SQS and Lambda
    board = cw.Dashboard(self, "s3_migrate_serverless")
    board.add_widgets(
        cw.GraphWidget(title="Lambda-NETWORK",
                       left=[
                           lambda_metric_Download, lambda_metric_Upload,
                           lambda_metric_Complete
                       ]),
        cw.GraphWidget(title="Lambda-concurrent",
                       left=[
                           handler.metric(
                               metric_name="ConcurrentExecutions",
                               period=core.Duration.minutes(1))
                       ]),
        cw.GraphWidget(
            title="Lambda-invocations/errors/throttles",
            left=[
                handler.metric_invocations(
                    period=core.Duration.minutes(1)),
                handler.metric_errors(period=core.Duration.minutes(1)),
                handler.metric_throttles(period=core.Duration.minutes(1))
            ]),
        cw.GraphWidget(
            title="Lambda-duration",
            left=[
                handler.metric_duration(period=core.Duration.minutes(1))
            ]),
    )
    board.add_widgets(
        cw.GraphWidget(title="Lambda_MaxMemoryUsed(MB)",
                       left=[lambda_metric_MaxMemoryUsed]),
        cw.GraphWidget(title="ERROR/WARNING Logs",
                       left=[log_metric_ERROR],
                       right=[log_metric_WARNING, log_metric_TIMEOUT]),
        cw.GraphWidget(
            title="SQS-Jobs",
            left=[
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1))
            ]),
        cw.SingleValueWidget(
            title="Running/Waiting and Dead Jobs",
            metrics=[
                sqs_queue.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1))
            ],
            height=6))

    # Alarm for queue - DLQ: any visible message in the DLQ notifies by email
    alarm_DLQ = cw.Alarm(
        self,
        "SQS_DLQ",
        metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
        ),
        threshold=0,
        comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=1,
        datapoints_to_alarm=1)
    alarm_topic = sns.Topic(self, "SQS queue-DLQ has dead letter")
    alarm_topic.add_subscription(
        subscription=sub.EmailSubscription(alarm_email))
    alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic))

    core.CfnOutput(self,
                   "Dashboard",
                   value="CloudWatch Dashboard name s3_migrate_serverless")
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    """Provision a Lambda function with an inlined code body, plus the
    custom CloudWatch metric, log metric filter, alarm and dashboard that
    track its custom API errors.

    :param scope: parent construct.
    :param construct_id: logical id of this stack.
    :raises OSError: if the Lambda source file cannot be read.
    """
    super().__init__(scope, construct_id, **kwargs)

    # import function code; fail fast: every resource below needs the body,
    # so swallowing the error would only defer the crash to a NameError on
    # function_body.
    try:
        with open("serverless_stack/functions/metric_logs_generator.py",
                  mode="r") as file:
            function_body = file.read()
    except OSError:
        print('File can not read')
        raise

    # function
    function_01 = aws_lambda.Function(
        self,
        "lambdafunction01",
        function_name="LambdaTestCustomMEtric",
        runtime=aws_lambda.Runtime.PYTHON_3_6,
        handler="index.lambda_handler",
        code=aws_lambda.InlineCode(function_body),
        timeout=core.Duration.seconds(5),
        reserved_concurrent_executions=1,
        environment={
            'LOG_LEVEL': 'INFO',
            'PERCENTAGE_ERRORS': '75'
        })

    # attached cloudwatch log group — name must match the group the Lambda
    # writes to (/aws/lambda/<fn-name>) so the metric filter sees its logs
    custom_metric_log_group01 = aws_logs.LogGroup(
        self,
        "cloudwatchlog01",
        log_group_name=f"/aws/lambda/{function_01.function_name}",
        removal_policy=core.RemovalPolicy.DESTROY,
        retention=aws_logs.RetentionDays.ONE_DAY)

    # Custom metric namespace (plain string: the original used an f-string
    # with no placeholders)
    custom_metric_namespace01 = aws_cw.Metric(
        namespace="custom-error-metric",
        metric_name="custom-error-metric",
        label="Amount of Custom API errors",
        period=core.Duration.minutes(1),
        statistic="Sum")

    # Custom metric logs filter: count events whose JSON field
    # $.custom_api_error is true; emit 0 when nothing matches
    custom_metric_filter01 = aws_logs.MetricFilter(
        self,
        "customMetricFilter",
        filter_pattern=aws_logs.FilterPattern.boolean_value(
            "$.custom_api_error", True),
        log_group=custom_metric_log_group01,
        metric_namespace=custom_metric_namespace01.namespace,
        metric_name=custom_metric_namespace01.metric_name,
        default_value=0,
        metric_value="1")

    # create custom alarm: >= 2 errors in one of the last two 1-minute
    # periods triggers; missing data is treated as healthy
    custom_metric_alarm01 = aws_cw.Alarm(
        self,
        "customMetricAlarm",
        alarm_description="Custom API errors",
        alarm_name="Custom-API-alarm",
        metric=custom_metric_namespace01,
        comparison_operator=aws_cw.ComparisonOperator.
        GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        threshold=2,
        evaluation_periods=2,
        datapoints_to_alarm=1,
        period=core.Duration.minutes(1),
        treat_missing_data=aws_cw.TreatMissingData.NOT_BREACHING)

    # cloudwatch dashboard
    custom_dashboard01 = aws_cw.Dashboard(
        self, id="CustomDashBoard", dashboard_name="CDK-custom-DashBoard")

    # lambda metrics to dashboard
    custom_dashboard01.add_widgets(
        aws_cw.Row(
            aws_cw.GraphWidget(title="Lambda-invoke",
                               left=[
                                   function_01.metric_invocations(
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
                               ]),
            aws_cw.GraphWidget(title="Lambda-errors",
                               left=[
                                   function_01.metric_errors(
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
                               ])))

    # custom api errors to dashboard
    custom_dashboard01.add_widgets(
        aws_cw.Row(
            aws_cw.SingleValueWidget(title="Custom-API-errors",
                                     metrics=[custom_metric_namespace01])))
def __init__(self, scope: core.Construct, _id: str, **kwargs) -> None:
    """Serverless S3-migration stack.

    Wires together: a DynamoDB job table, an SQS job queue backed by a
    dead-letter queue, a worker Lambda triggered by the queue, an S3
    bucket whose new objects enqueue jobs, log metric filters, a
    CloudWatch dashboard, and a DLQ alarm with SNS e-mail notification.
    """
    super().__init__(scope, _id, **kwargs)

    # Job-tracking table keyed by object key; on-demand billing.
    ddb_file_list = ddb.Table(
        self,
        "ddb",
        partition_key=ddb.Attribute(name="Key",
                                    type=ddb.AttributeType.STRING),
        billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

    # DLQ is created first so the main queue can reference it.
    sqs_queue_DLQ = sqs.Queue(
        self,
        "sqs_DLQ",
        visibility_timeout=core.Duration.minutes(15),
        retention_period=core.Duration.days(14))
    sqs_queue = sqs.Queue(
        self,
        "sqs_queue",
        visibility_timeout=core.Duration.minutes(15),
        retention_period=core.Duration.days(14),
        dead_letter_queue=sqs.DeadLetterQueue(max_receive_count=100,
                                              queue=sqs_queue_DLQ))

    # Worker Lambda; destination/credential settings come from
    # module-level configuration.
    handler = lam.Function(
        self,
        "lambdaFunction",
        code=lam.Code.asset("./lambda"),
        handler="lambda_function.lambda_handler",
        runtime=lam.Runtime.PYTHON_3_8,
        memory_size=1024,
        timeout=core.Duration.minutes(15),
        tracing=lam.Tracing.ACTIVE,
        environment={
            'table_queue_name': ddb_file_list.table_name,
            'Des_bucket_default': Des_bucket_default,
            'Des_prefix_default': Des_prefix_default,
            'StorageClass': StorageClass,
            'aws_access_key_id': aws_access_key_id,
            'aws_secret_access_key': aws_secret_access_key,
            'aws_access_key_region': aws_access_key_region
        })

    ddb_file_list.grant_read_write_data(handler)
    handler.add_event_source(SqsEventSource(sqs_queue))

    # New objects landing in this bucket enqueue a migration job.
    s3bucket = s3.Bucket(self, "s3bucket")
    s3bucket.grant_read(handler)
    s3bucket.add_event_notification(s3.EventType.OBJECT_CREATED,
                                    s3n.SqsDestination(sqs_queue))

    # An existing bucket could be imported via s3.Bucket.from_bucket_name
    # and granted to the Lambda, but CloudFormation cannot attach an SQS
    # event notification to an imported bucket — that trigger and the SQS
    # permission would have to be configured manually (or worked around
    # with an on_cloud_trail_event for the bucket).

    core.CfnOutput(self, "DynamoDB_Table", value=ddb_file_list.table_name)
    core.CfnOutput(self, "SQS_Job_Queue", value=sqs_queue.queue_name)
    core.CfnOutput(self, "SQS_Job_Queue_DLQ", value=sqs_queue_DLQ.queue_name)
    core.CfnOutput(self, "Worker_Lambda_Function", value=handler.function_name)
    core.CfnOutput(self, "New_S3_Bucket", value=s3bucket.bucket_name)

    # Lambda log filters: turn "--->Complete/Uploading/Downloading" log
    # lines into byte-count metrics for network-traffic monitoring.
    for phase in ("Complete", "Uploading", "Downloading"):
        handler.log_group.add_metric_filter(
            f"{phase}-bytes",
            metric_name=f"{phase}-bytes",
            metric_namespace="s3_migrate",
            metric_value="$bytes",
            filter_pattern=logs.FilterPattern.literal(
                f'[info, date, sn, p="--->{phase}", bytes, key]'))

    def _sum_metric(metric_name):
        # 1-minute Sum over the shared "s3_migrate" namespace.
        return cw.Metric(namespace="s3_migrate",
                         metric_name=metric_name,
                         statistic="Sum",
                         period=core.Duration.minutes(1))

    lambda_metric_Complete = _sum_metric("Complete-bytes")
    lambda_metric_Upload = _sum_metric("Uploading-bytes")
    lambda_metric_Download = _sum_metric("Downloading-bytes")

    # Count ERROR / WARNING log lines as metrics.
    for level in ("ERROR", "WARNING"):
        handler.log_group.add_metric_filter(
            level,
            metric_name=f"{level}-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal(f'"{level}"'))

    log_metric_ERROR = _sum_metric("ERROR-Logs")
    log_metric_WARNING = _sum_metric("WARNING-Logs")

    # Dashboard to monitor SQS and Lambda.
    board = cw.Dashboard(self, "s3_migrate",
                         dashboard_name="s3_migrate_serverless")
    board.add_widgets(
        cw.GraphWidget(title="Lambda-NETWORK",
                       left=[lambda_metric_Download, lambda_metric_Upload,
                             lambda_metric_Complete]),
        # TODO: this charts concurrency across ALL lambdas, not just the
        # worker — CDK limitation; switch to per-function concurrency
        # once CDK supports it.
        cw.GraphWidget(title="Lambda-all-concurrent",
                       left=[
                           handler.metric_all_concurrent_executions(
                               period=core.Duration.minutes(1))
                       ]),
        cw.GraphWidget(
            title="Lambda-invocations/errors/throttles",
            left=[
                handler.metric_invocations(period=core.Duration.minutes(1)),
                handler.metric_errors(period=core.Duration.minutes(1)),
                handler.metric_throttles(period=core.Duration.minutes(1))
            ]),
        cw.GraphWidget(
            title="Lambda-duration",
            left=[handler.metric_duration(period=core.Duration.minutes(1))]),
    )
    board.add_widgets(
        cw.GraphWidget(
            title="SQS-Jobs",
            left=[
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1))
            ]),
        cw.GraphWidget(
            title="SQS-DeadLetterQueue",
            left=[
                sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1))
            ]),
        cw.GraphWidget(title="ERROR/WARNING Logs",
                       left=[log_metric_ERROR],
                       right=[log_metric_WARNING]),
        cw.SingleValueWidget(
            title="Running/Waiting and Dead Jobs",
            metrics=[
                sqs_queue.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1))
            ],
            height=6))

    # Alarm + SNS e-mail as soon as anything lands in the DLQ.
    alarm_DLQ = cw.Alarm(
        self,
        "SQS_DLQ",
        alarm_name="s3-migration-serverless-SQS Dead Letter Queue",
        metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(),
        threshold=0,
        comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=1,
        datapoints_to_alarm=1)
    alarm_topic = sns.Topic(self, "SQS queue-DLQ has dead letter")
    alarm_topic.add_subscription(
        subscription=sub.EmailSubscription(alarm_email))
    alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic))

    # A "queue fully drained" alarm (MathExpression over visible +
    # in-flight messages) was sketched here but never enabled; the
    # cluster stack implements that pattern.

    core.CfnOutput(self, "Dashboard",
                   value="CloudWatch Dashboard name s3_migrate_serverless")
def __init__(
        self,
        scope: core.Construct,
        _id: str,
        vpc,
        bucket_para,
        # key_name,
        ddb_file_list,
        sqs_queue,
        sqs_queue_DLQ,
        ssm_bucket_para,
        ssm_credential_para,
        # s3bucket,
        **kwargs) -> None:
    """EC2-cluster S3-migration stack.

    Creates a jobsender EC2 instance plus an autoscaling worker fleet,
    grants them DynamoDB/SQS/SSM/S3 access, builds a CloudWatch
    dashboard, and adds queue-driven scaling policies and alarms.
    """
    super().__init__(scope, _id, **kwargs)

    # Jobsender EC2 node: feeds migration jobs into SQS.
    jobsender = ec2.Instance(
        self,
        "jobsender",
        instance_name="s3_migrate_cluster_jobsender",
        instance_type=ec2.InstanceType(
            instance_type_identifier=jobsender_type),
        machine_image=linux_ami,
        # key_name=key_name,  # no SSH key: access goes via Session Manager
        user_data=ec2.UserData.custom(user_data_jobsender),
        vpc=vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC))

    # SSH ingress is intentionally NOT opened — Session Manager is used.
    # Attach SSM + CloudWatch-agent managed policies.
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "AmazonSSMManagedInstanceCore"))
    jobsender.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "CloudWatchAgentServerPolicy"))
    # Deliberately no AmazonS3FullAccess (least privilege); per-bucket
    # read grants are added further down.

    # Worker autoscaling group: spot instances, 1..10 hosts.
    worker_asg = autoscaling.AutoScalingGroup(
        self,
        "worker-asg",
        vpc=vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
        instance_type=ec2.InstanceType(
            instance_type_identifier=worker_type),
        machine_image=linux_ami,
        # key_name=key_name,  # optional when using SSM Session Manager
        user_data=ec2.UserData.custom(user_data_worker),
        desired_capacity=1,
        min_capacity=1,
        max_capacity=10,
        spot_price="0.5")
    # TODO: CDK's high-level ASG API has no MetricsCollection yet —
    # enable "Group Metrics Collection" in the EC2 console (Autoscaling
    # Group -> Monitoring tab) to publish GroupDesiredCapacity,
    # GroupInServiceInstances, GroupPendingInstances, etc.

    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "AmazonSSMManagedInstanceCore"))
    worker_asg.role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name(
            "CloudWatchAgentServerPolicy"))

    # DynamoDB job table access.
    ddb_file_list.grant_full_access(jobsender)
    ddb_file_list.grant_full_access(worker_asg)

    # SQS queue and DLQ access.
    sqs_queue.grant_consume_messages(jobsender)
    sqs_queue.grant_send_messages(jobsender)
    sqs_queue.grant_consume_messages(worker_asg)
    sqs_queue_DLQ.grant_consume_messages(jobsender)

    # SSM Parameter Store: bucket info and credentials.
    ssm_bucket_para.grant_read(jobsender)
    ssm_credential_para.grant_read(jobsender)
    ssm_credential_para.grant_read(worker_asg)

    # Grant read on each existing source bucket.
    # NOTE(review): only *consecutive* duplicate bucket names are
    # skipped — assumes bucket_para groups identical buckets together.
    bucket_name = ''
    for b in bucket_para:
        if bucket_name != b['src_bucket']:
            bucket_name = b['src_bucket']
            s3exist_bucket = s3.Bucket.from_bucket_name(
                self,
                bucket_name,  # the bucket name doubles as construct id
                bucket_name=bucket_name)
            s3exist_bucket.grant_read(jobsender)
            s3exist_bucket.grant_read(worker_asg)

    # Dashboard to monitor SQS and EC2.
    board = cw.Dashboard(self, "s3_migrate",
                         dashboard_name="s3_migrate_cluster")

    # Account-wide EC2 metrics (the per-ASG dimension is left disabled).
    ec2_metric_net = cw.Metric(
        namespace="AWS/EC2",
        metric_name="NetworkOut",
        # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1),
        statistic="Sum")
    ec2_metric_cpu_max = cw.Metric(
        namespace="AWS/EC2",
        metric_name="CPUUtilization",
        # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1),
        statistic="Maximum")
    ec2_metric_cpu_avg = cw.Metric(
        namespace="AWS/EC2",
        metric_name="CPUUtilization",
        # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name},
        period=core.Duration.minutes(1))

    def _asg_metric(metric_name):
        # Autoscaling group metric scoped to the worker fleet.
        return cw.Metric(
            namespace="AWS/AutoScaling",
            metric_name=metric_name,
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            period=core.Duration.minutes(1))

    autoscaling_GroupDesiredCapacity = _asg_metric("GroupDesiredCapacity")
    autoscaling_GroupInServiceInstances = _asg_metric(
        "GroupInServiceInstances")
    autoscaling_GroupMinSize = _asg_metric("GroupMinSize")
    autoscaling_GroupMaxSize = _asg_metric("GroupMaxSize")

    def _mem_metric(statistic):
        # CloudWatch-agent collected memory usage for the worker fleet.
        return cw.Metric(
            namespace="CWAgent",
            metric_name="mem_used_percent",
            dimensions={
                "AutoScalingGroupName": worker_asg.auto_scaling_group_name
            },
            statistic=statistic,
            period=core.Duration.minutes(1))

    cwagent_mem_avg = _mem_metric("Average")
    cwagent_mem_max = _mem_metric("Maximum")

    # Application log group (filled by the CloudWatch agent); count
    # ERROR/WARNING lines as metrics.
    s3_migrate_log = logs.LogGroup(self, "applog",
                                   log_group_name="s3_migration_log")
    for level in ("ERROR", "WARNING"):
        s3_migrate_log.add_metric_filter(
            level,
            metric_name=f"{level}-Logs",
            metric_namespace="s3_migrate",
            metric_value="1",
            filter_pattern=logs.FilterPattern.literal(f'"{level}"'))

    log_metric_ERROR = cw.Metric(namespace="s3_migrate",
                                 metric_name="ERROR-Logs",
                                 statistic="Sum",
                                 period=core.Duration.minutes(1))
    log_metric_WARNING = cw.Metric(namespace="s3_migrate",
                                   metric_name="WARNING-Logs",
                                   statistic="Sum",
                                   period=core.Duration.minutes(1))

    board.add_widgets(
        cw.GraphWidget(title="EC2-ALL-NETWORK", left=[ec2_metric_net]),
        cw.GraphWidget(title="EC2-ALL-CPU",
                       left=[ec2_metric_cpu_avg, ec2_metric_cpu_max]),
        cw.GraphWidget(title="EC2-AutoscalingGroup-MEMORY",
                       left=[cwagent_mem_max, cwagent_mem_avg]),
        cw.SingleValueWidget(title="EC2-AutoscalingGroup-Capacity",
                             metrics=[
                                 autoscaling_GroupDesiredCapacity,
                                 autoscaling_GroupInServiceInstances,
                                 autoscaling_GroupMinSize,
                                 autoscaling_GroupMaxSize
                             ],
                             height=6),
    )
    board.add_widgets(
        cw.GraphWidget(
            title="SQS-Jobs",
            left=[
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1))
            ]),
        cw.GraphWidget(
            title="SQS-DeadLetterQueue",
            left=[
                sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1))
            ]),
        cw.GraphWidget(title="ERROR/WARNING Logs",
                       left=[log_metric_ERROR],
                       right=[log_metric_WARNING],
                       height=6),
        cw.SingleValueWidget(
            title="Running/Waiting and Death Jobs",
            metrics=[
                sqs_queue.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible(
                    period=core.Duration.minutes(1)),
                sqs_queue_DLQ.metric_approximate_number_of_messages_visible(
                    period=core.Duration.minutes(1))
            ],
            height=6))

    # Scale up on backlog: +1 worker at 100..500 visible messages,
    # +2 above 500, no change below 100.
    worker_asg.scale_on_metric(
        "scaleup",
        metric=sqs_queue.metric_approximate_number_of_messages_visible(),
        scaling_steps=[
            autoscaling.ScalingInterval(change=1, lower=100, upper=500),
            autoscaling.ScalingInterval(change=2, lower=500),
            autoscaling.ScalingInterval(change=0, upper=100, lower=0)
        ],
        adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY)

    # Alarm when the queue is fully drained (no visible + no in-flight
    # messages) while more than one worker is still in service. The
    # expression yields 0 (breaching) only in that state. This also acts
    # as a one-shot "batch finished" notification. If the jobsender is
    # used for transfers too, the group could be scaled to 0 instead.
    metric_all_message = cw.MathExpression(
        expression="IF(((a+b) == 0) AND (c >1), 0, 1)",
        label="empty_queue_expression",
        using_metrics={
            "a": sqs_queue.metric_approximate_number_of_messages_visible(),
            "b": sqs_queue.metric_approximate_number_of_messages_not_visible(),
            "c": autoscaling_GroupInServiceInstances
        })
    alarm_0 = cw.Alarm(
        self,
        "SQSempty",
        alarm_name=
        "s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster",
        metric=metric_all_message,
        threshold=0,
        comparison_operator=cw.ComparisonOperator.
        LESS_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.NOT_BREACHING)
    alarm_topic_empty = sns.Topic(
        self, "SQS queue empty and ec2 more than 1 in Cluster")
    alarm_topic_empty.add_subscription(
        subscription=sub.EmailSubscription(alarm_email))
    alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty))

    # When the queue is empty, step-scale the fleet down to exactly 1.
    action_shutdown = autoscaling.StepScalingAction(
        self,
        "shutdown",
        auto_scaling_group=worker_asg,
        adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY)
    action_shutdown.add_adjustment(adjustment=1, upper_bound=0)
    alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown))

    # Any message visible in the DLQ -> notify via SNS.
    alarm_DLQ = cw.Alarm(
        self,
        "SQS_DLQ",
        alarm_name=
        "s3-migration-cluster-SQS DLQ more than 1 message-Cluster",
        metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(),
        threshold=0,
        comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=3,
        treat_missing_data=cw.TreatMissingData.IGNORE)
    alarm_topic_DLQ = sns.Topic(self,
                                "SQS DLQ more than 1 message-Cluster")
    alarm_topic_DLQ.add_subscription(
        subscription=sub.EmailSubscription(alarm_email))
    alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ))

    # Outputs.
    core.CfnOutput(self, "JobSenderEC2", value=jobsender.instance_id)
    core.CfnOutput(self, "WorkerEC2AutoscalingGroup",
                   value=worker_asg.auto_scaling_group_name)
    core.CfnOutput(self, "Dashboard",
                   value="CloudWatch Dashboard name s3_migrate_cluster")
    core.CfnOutput(self, "Alarm",
                   value="CloudWatch SQS queue empty Alarm for cluster: " +
                   alarm_email)
def __init__(self,
             scope: core.Construct,
             id: str,
             stream_producer_lg,
             stream_pipe,
             py_stream_record_processor_fn,
             node_stream_record_processor_fn,
             **kwargs) -> None:
    """Monitoring stack for the Kinesis stream pipeline.

    Builds AWS/Kinesis stream metrics, custom producer/consumer metrics
    fed by log metric filters, and a CloudWatch dashboard tying them
    together.

    :param stream_producer_lg: log group of the record producer.
    :param stream_pipe: the Kinesis stream being monitored.
    :param py_stream_record_processor_fn: Python consumer Lambda.
    :param node_stream_record_processor_fn: Node.js consumer Lambda.
    """
    super().__init__(scope, id, **kwargs)

    ##### MONITORING ######

    ##################################################
    ##########        STREAM METRICS         #########
    ##################################################

    def _stream_metric(metric_name):
        # AWS/Kinesis metric for this stream; label mirrors the metric
        # name. NOTE(review): 30-minute "Sum" is dubious for the
        # *.Latency metrics (Average would be more meaningful) — kept
        # as-is to avoid changing existing dashboards; confirm intent.
        return _cloudwatch.Metric(
            namespace="AWS/Kinesis",
            metric_name=metric_name,
            dimensions={"StreamName": f"{stream_pipe.stream_name}"},
            label=metric_name,
            period=core.Duration.minutes(30),
            statistic="Sum")

    # Shard ingestion rate.
    stream_in_bytes_metric = _stream_metric("IncomingBytes")
    stream_in_records_metric = _stream_metric("IncomingRecords")
    # Throttling indicators.
    stream_w_throttle_metric = _stream_metric(
        "WriteProvisionedThroughputExceeded")
    stream_r_throttle_metric = _stream_metric(
        "ReadProvisionedThroughputExceeded")
    # BUG FIX: the label of this metric was the garbled string
    # "PutRecords.LatSuccessency"; it now matches the metric name.
    stream_put_success_metric = _stream_metric("PutRecords.Success")
    stream_put_latency_metric = _stream_metric("PutRecords.Latency")
    stream_get_latency_metric = _stream_metric("GetRecords.Latency")

    ##################################################
    ##########   STREAM PRODUCER METRICS     #########
    ##################################################
    # JSON metric filter syntax:
    # https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/FilterAndPatternSyntax.html
    records_produced_metric = _cloudwatch.Metric(
        namespace=f"{global_args.OWNER}-stream-data-processor",
        metric_name="recordsProducedCount",
        label="Total No. Of Records Produced",
        period=core.Duration.minutes(30),
        statistic="Sum")

    # Sum $.records_produced from the producer's JSON log lines.
    records_produced_metric_filter = _logs.MetricFilter(
        self,
        "recordsProducedCountFilter",
        filter_pattern=_logs.FilterPattern.exists("$.records_produced"),
        log_group=stream_producer_lg,
        metric_namespace=records_produced_metric.namespace,
        metric_name=records_produced_metric.metric_name,
        default_value=0,
        metric_value="$.records_produced",
    )

    ##################################################
    ##########   STREAM CONSUMER METRICS     #########
    ##################################################
    py_records_processed_metric = _cloudwatch.Metric(
        namespace=f"{global_args.OWNER}-stream-data-processor",
        metric_name="pyRecordsProcessedCount",
        label="Total No. Of Records Processed",
        period=core.Duration.minutes(30),
        statistic="Sum")

    py_stream_record_processor = _logs.MetricFilter(
        self,
        "processedRecordCountFilter01",
        filter_pattern=_logs.FilterPattern.exists("$.records_processed"),
        log_group=py_stream_record_processor_fn.log_group,
        metric_namespace=py_records_processed_metric.namespace,
        metric_name=py_records_processed_metric.metric_name,
        default_value=0,
        metric_value="$.records_processed",
    )

    node_records_processed_metric = _cloudwatch.Metric(
        namespace=f"{global_args.OWNER}-stream-data-processor",
        metric_name="nodeRecordsProcessedCount",
        label="Total No. Of Records Processed",
        period=core.Duration.minutes(30),
        statistic="Sum")

    node_stream_record_processor = _logs.MetricFilter(
        self,
        "processedRecordCountFilter02",
        filter_pattern=_logs.FilterPattern.exists("$.records_processed"),
        log_group=node_stream_record_processor_fn.log_group,
        metric_namespace=node_records_processed_metric.namespace,
        metric_name=node_records_processed_metric.metric_name,
        default_value=0,
        metric_value="$.records_processed",
    )

    # CloudWatch dashboard for the stream pipeline.
    stream_processor_dashboard = _cloudwatch.Dashboard(
        self,
        id="streamProcessorDashboard",
        dashboard_name="Stream-Processor")

    # Headline counters: produced vs processed (per consumer).
    stream_processor_dashboard.add_widgets(
        _cloudwatch.SingleValueWidget(title="TotalRecordsProduced",
                                      metrics=[records_produced_metric]),
        _cloudwatch.SingleValueWidget(
            title="RecordsProcessed-by-Python-Consumer",
            metrics=[py_records_processed_metric]),
        _cloudwatch.SingleValueWidget(
            title="RecordsProcessed-by-Node-Consumer",
            metrics=[node_records_processed_metric]))

    # Shard ingestion and throttle graphs.
    stream_processor_dashboard.add_widgets(
        _cloudwatch.Row(
            _cloudwatch.GraphWidget(title="Shard Ingestion Metrics",
                                    left=[stream_in_bytes_metric],
                                    right=[stream_in_records_metric]),
            _cloudwatch.GraphWidget(title="Shard Throttle Metrics",
                                    left=[stream_w_throttle_metric],
                                    right=[stream_r_throttle_metric])))

    # Put/Get latency and Put success graphs.
    stream_processor_dashboard.add_widgets(
        _cloudwatch.Row(
            _cloudwatch.GraphWidget(title="Stream Put Latency",
                                    left=[stream_put_latency_metric]),
            _cloudwatch.GraphWidget(title="Stream Get Latency",
                                    left=[stream_get_latency_metric]),
            _cloudwatch.GraphWidget(title="Stream Put Success",
                                    left=[stream_put_success_metric])))

    ###########################################
    ################# OUTPUTS #################
    ###########################################
    # NOTE(review): the id "SecuirtyAutomationFrom" is misspelled, but
    # renaming it would change the CloudFormation logical id / output
    # name, so it is intentionally left as-is.
    output_0 = core.CfnOutput(
        self,
        "SecuirtyAutomationFrom",
        value=f"{global_args.SOURCE_INFO}",
        description=
        "To know more about this automation stack, check out our github page."
    )
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    """Monitored-bucket demo stack.

    Creates an S3 bucket with a daily storage-size alarm wired to an SNS
    e-mail list, logs the bucket's data events via CloudTrail, and shows
    the size metric on a small CloudWatch dashboard.
    """
    super().__init__(scope, construct_id, **kwargs)

    # Bucket under observation; auto-cleaned on stack deletion.
    bucket = s3.Bucket(self,
                       'bucket-monitored',
                       bucket_name='devassoc-monitored',
                       removal_policy=core.RemovalPolicy.DESTROY,
                       auto_delete_objects=True)
    core.CfnOutput(self, 'monitored-bucket', value=bucket.bucket_name)

    # Daily bucket-size metric (AWS/S3 BucketSizeBytes, StandardStorage).
    size_metric = cw.Metric(namespace='AWS/S3',
                            metric_name='BucketSizeBytes',
                            dimensions={
                                'BucketName': bucket.bucket_name,
                                'StorageType': 'StandardStorage'
                            },
                            period=core.Duration.days(1))

    # Fire once stored bytes reach 1000 (demo-sized threshold).
    size_alarm = size_metric.create_alarm(
        self,
        'bucket-alarm',
        alarm_name='S3 Storage Alarm',
        comparison_operator=cw.ComparisonOperator.
        GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=1,
        period=core.Duration.days(1),
        threshold=1000,
        actions_enabled=True)

    # Alarm notifications go to an e-mail list; the address is read from
    # the 'notification-email' SSM parameter.
    size_topic = sns.Topic(self,
                           'size-topic',
                           display_name='My S3 Alarm List')
    email_param = ssm.StringParameter.from_string_parameter_name(
        self, 'email-param', 'notification-email')
    size_topic_sub = sns.Subscription(
        self,
        'size-topic-sub',
        topic=size_topic,
        protocol=sns.SubscriptionProtocol.EMAIL,
        endpoint=email_param.string_value)
    size_alarm.add_alarm_action(cwa.SnsAction(size_topic))

    # CloudTrail data-event logging for the monitored bucket, delivered
    # to a dedicated log bucket.
    log_bucket = s3.Bucket(self,
                           'bucket-s3-logs',
                           bucket_name='devassoc-s3-logs',
                           removal_policy=core.RemovalPolicy.DESTROY,
                           auto_delete_objects=True)
    s3_trail = ct.Trail(self,
                        'bucket-trail',
                        bucket=log_bucket,
                        trail_name='s3_logs')
    s3_trail.add_s3_event_selector([ct.S3EventSelector(bucket=bucket)])
    s3_trail.log_all_s3_data_events()

    # Dashboard: current size value plus its history graph.
    cw.Dashboard(self,
                 'cloudwatch-dashboard',
                 dashboard_name='S3Dashboard',
                 widgets=[[
                     cw.SingleValueWidget(metrics=[size_metric]),
                     cw.GraphWidget(left=[size_metric])
                 ]])