def add_monitoring(self, monitoring):
    resource_metrics = [
        (self.new_recording_resource, "4XXError"),
        (self.new_recording_resource, "5XXError"),
        (self.ingest_resource, "5XXError"),
    ]
    for resource, metric_name in resource_metrics:
        construct_id = (
            f"{metric_name}-{resource.path.replace('/', '_')}-alarm"
        )
        alarm = cloudwatch.Alarm(
            self,
            construct_id,
            metric=cloudwatch.Metric(
                metric_name=metric_name,
                namespace="AWS/ApiGateway",
                dimensions={
                    "ApiName": self.rest_api_name,
                    "Stage": names.API_STAGE,
                    "Method": "POST",
                    "Resource": resource.path,
                },
                period=core.Duration.minutes(1),
            ),
            statistic="sum",
            threshold=1,
            evaluation_periods=1,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        )
        monitoring.add_alarm_action(alarm)

    webhook_latency_alarm = cloudwatch.Alarm(
        self,
        "WebhookLatencyAlarm",
        metric=cloudwatch.Metric(
            metric_name="Latency",
            namespace="AWS/ApiGateway",
            dimensions={
                "ApiName": self.rest_api_name,
                "Stage": names.API_STAGE,
                "Method": "POST",
                "Resource": self.new_recording_resource.path,
            },
            period=core.Duration.minutes(1),
        ),
        statistic="avg",
        threshold=10000,
        evaluation_periods=3,
        comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
    )
    monitoring.add_alarm_action(webhook_latency_alarm)
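The `monitoring` argument is assumed to expose an `add_alarm_action` hook; a minimal sketch of such a wrapper, assuming a shared SNS topic as the alarm target (the class and names below are hypothetical, not part of the snippet above):

# Hypothetical wrapper assumed by add_monitoring(); wires every alarm to one SNS topic.
from aws_cdk import aws_cloudwatch_actions as cloudwatch_actions, aws_sns as sns


class Monitoring:
    def __init__(self, topic: sns.ITopic):
        self._action = cloudwatch_actions.SnsAction(topic)

    def add_alarm_action(self, alarm):
        # Notify the shared topic whenever the alarm fires.
        alarm.add_alarm_action(self._action)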
def createOps(self):
    alarmTopic = sns.Topic(
        self,
        'TipBotAlarmTopic',
        display_name='TipBotAlarmTopic',
        fifo=False,
    )
    alarmTopic.add_subscription(
        snss.EmailSubscription(self.getEmail(), json=True))

    cw.CompositeAlarm(
        self,
        'TipBotCompositeAlarm',
        alarm_rule=cw.AlarmRule.any_of(
            cw.Alarm(
                self,
                "LNDAlarm",
                metric=cw.Metric(
                    metric_name='LndUp',
                    namespace='LNTipBot',
                    period=cdk.Duration.minutes(1),
                    statistic='sum',
                    unit=cw.Unit.NONE,
                ),
                threshold=1,
                actions_enabled=False,
                alarm_description='Alarm for when the LND service has gone down',
                alarm_name='LND Alarm',
                comparison_operator=cw.ComparisonOperator.LESS_THAN_THRESHOLD,
                datapoints_to_alarm=5,
                evaluation_periods=5,
                treat_missing_data=cw.TreatMissingData.BREACHING
            ),
            cw.Alarm(
                self,
                "BTCAlarm",
                metric=cw.Metric(
                    metric_name='BtcUp',
                    namespace='LNTipBot',
                    period=cdk.Duration.minutes(1),
                    statistic='sum',
                    unit=cw.Unit.NONE,
                ),
                threshold=1,
                actions_enabled=False,
                alarm_description='Alarm for when the BTC service has gone down',
                alarm_name='BTC Alarm',
                comparison_operator=cw.ComparisonOperator.LESS_THAN_THRESHOLD,
                datapoints_to_alarm=5,
                evaluation_periods=5,
                treat_missing_data=cw.TreatMissingData.BREACHING
            )
        ),
        actions_enabled=True,
        alarm_description='TipBot Composite Alarm',
        composite_alarm_name='TipBot Composite Alarm',
    ).add_alarm_action(cwa.SnsAction(alarmTopic))
def _add_conditional_storage_widgets(
    self,
    conditional_metrics,
    volumes_list,
    namespace,
    dimension_vol_name,
    vol_attribute_name,
):
    """Add widgets for conditional metrics for EBS, RAID and EFS."""
    widgets_list = []
    for metric_condition_params in conditional_metrics:
        metric_list = []
        for volume in volumes_list:
            if getattr(volume.config, vol_attribute_name) in metric_condition_params.supported_vol_types:
                cloudwatch_metric = cloudwatch.Metric(
                    namespace=namespace,
                    metric_name=metric_condition_params.metrics,
                    dimensions={dimension_vol_name: volume.id},
                )
                metric_list.append(cloudwatch_metric)
        if len(metric_list) > 0:  # Add the metrics only if there are supported volumes for them
            graph_widget = self._generate_graph_widget(
                metric_condition_params.title, metric_list)
            widgets_list.append(graph_widget)
    return widgets_list
def create_cw_alarm_with_action(
    self,
    metric_name,
    threshold,
    comparison_operator,
    period,
    evaluation_periods,
    statistic,
    sns_topic_list=None,  # avoid a mutable default argument
) -> None:
    # Create a CW Alarm for the provided metric
    self._cw_alarm = cloudwatch.Alarm(
        self,
        self._domain_name + f"-{metric_name}Alarm",
        metric=cloudwatch.Metric(
            metric_name=metric_name,
            namespace="AWS/ES",
            dimensions={
                "DomainName": self._domain_name,
                "ClientId": self._account
            },
        ),
        threshold=threshold,
        comparison_operator=comparison_operator,
        period=core.Duration.minutes(period),
        evaluation_periods=evaluation_periods,
        statistic=statistic,
        treat_missing_data=cloudwatch.TreatMissingData.MISSING,
    )
    # If an SNS topic list is provided by the user, set the alarm action to the topic(s)
    if sns_topic_list:
        self._cw_alarm.add_alarm_action(
            *[cloudwatch_actions.SnsAction(topic) for topic in sns_topic_list])
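A hedged usage sketch of the helper above; the topic, threshold and metric choices are illustrative assumptions, only the parameter names come from the snippet:

# Hypothetical call site: alarm when average CPU on the ES domain stays above 80%.
self.create_cw_alarm_with_action(
    metric_name="CPUUtilization",
    threshold=80,
    comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
    period=5,                       # minutes; wrapped in core.Duration.minutes() by the helper
    evaluation_periods=3,
    statistic="Average",
    sns_topic_list=[alerts_topic],  # assumed sns.Topic created elsewhere
)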
def __create_asg_scaling_policy(asg):
    cpu_utilization = cloudwatch.Metric(
        namespace="AWS/EC2",
        metric_name="CPUUtilization",
        dimensions={"AutoScalingGroupName": asg.auto_scaling_group_name},
        period=core.Duration.minutes(15))

    asg.scale_on_metric(
        "ImagizerClusterCpuTarget",
        metric=cpu_utilization,
        adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY,
        estimated_instance_warmup=core.Duration.seconds(400),
        scaling_steps=[
            autoscaling.ScalingInterval(
                change=variables.ASG_CAPACITY_INCREASE,
                lower=variables.ASG_CPU_HIGH_THRESHOLD),
            autoscaling.ScalingInterval(
                change=-variables.ASG_CAPACITY_DECREASE,
                upper=variables.ASG_CPU_LOW_THRESHOLD)
        ])

    asg.scale_on_request_count(
        "ImagizerClusterRpsTarget",
        target_requests_per_second=variables.ASG_RPS_THRESHOLD,
        disable_scale_in=True)
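The `variables` module referenced above is not shown; a purely illustrative sketch of the constants it is assumed to provide (all values hypothetical):

# Hypothetical variables module assumed by __create_asg_scaling_policy().
ASG_CAPACITY_INCREASE = 2    # instances added when CPU crosses the high threshold
ASG_CAPACITY_DECREASE = 1    # instances removed when CPU falls below the low threshold
ASG_CPU_HIGH_THRESHOLD = 70  # percent
ASG_CPU_LOW_THRESHOLD = 30   # percent
ASG_RPS_THRESHOLD = 1000     # target request rate per instance, per the parameter name above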
def _generate_ec2_metrics_list(self, metrics):
    metric_list = []
    for metric in metrics:
        cloudwatch_metric = cloudwatch.Metric(
            namespace="AWS/EC2",
            metric_name=metric,
            dimensions={"InstanceId": self.head_node_instance.ref})
        metric_list.append(cloudwatch_metric)
    return metric_list
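A hedged usage sketch; the metric names and the widget it feeds are assumptions, not taken from the snippet:

# Hypothetical call site: plot head-node CPU and network metrics on one graph widget.
head_node_metrics = self._generate_ec2_metrics_list(
    ["CPUUtilization", "NetworkIn", "NetworkOut"])
head_node_widget = cloudwatch.GraphWidget(title="Head Node EC2 Metrics",
                                          left=head_node_metrics)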
def get_yelp_cleaner_graph():
    return YelpOrchestratorStack.graph_widget(
        "YelpCleanerDeletions",
        *[
            aws_cloudwatch.Metric(
                namespace="YelpOrchestrator",
                metric_name=metric_name,
                statistic="Sum",
                period=core.Duration.minutes(5),
            ) for metric_name in ("UrlTableRecordsDeleted",
                                  "YelpTableRecordsDeleted")
        ],
    )
def get_s3_graphs(bucket):
    return (
        YelpOrchestratorStack.graph_widget(
            "ObjectCount",
            aws_cloudwatch.Metric(
                namespace="AWS/S3",
                metric_name="NumberOfObjects",
                dimensions={
                    "StorageType": "AllStorageTypes",
                    "BucketName": bucket.bucket_name,
                },
                statistic="Sum",
                period=core.Duration.minutes(5),
            ),
        ),
    )
def build_metric(metric_name: str,
                 name_space: str,
                 dimensions,
                 unit: cloud_watch.Unit,
                 label: str,
                 stat: str = 'avg',
                 period: int = 900):
    return cloud_watch.Metric(metric_name=metric_name,
                              namespace=name_space,
                              dimensions=dimensions,
                              unit=unit,
                              label=label,
                              statistic=stat,
                              period=core.Duration.seconds(period))
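A hedged usage sketch of the factory above; the metric, namespace and dimension values are illustrative only:

# Hypothetical call: average ALB target response time over the default 900-second period.
latency_metric = build_metric(
    metric_name='TargetResponseTime',
    name_space='AWS/ApplicationELB',
    dimensions={'LoadBalancer': load_balancer_full_name},  # assumed to be defined elsewhere
    unit=cloud_watch.Unit.SECONDS,
    label='ALB target response time',
)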
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    role_arn = 'arn:aws:iam::315207712355:role/lbrole'
    role = iam.Role.from_role_arn(self, id='role_id', role_arn=role_arn)

    # The code that defines your stack goes here
    this_dir = path.dirname(__file__)

    handler = lmb.Function(self, 'Handler',
                           runtime=lmb.Runtime.PYTHON_3_7,
                           role=role,
                           handler='handler.handler',
                           code=lmb.Code.from_asset(path.join(this_dir, 'lambda')))
    alias = lmb.Alias(self, 'HandlerAlias',
                      alias_name='Current',
                      version=handler.current_version)
    gw = apigw.LambdaRestApi(
        self, 'Gateway',
        description='Endpoint for a simple Lambda-powered web service',
        handler=alias)

    failure_alarm = cloudwatch.Alarm(
        self, 'FailureAlarm',
        metric=cloudwatch.Metric(
            metric_name='5XXError',
            namespace='AWS/ApiGateway',
            dimensions={
                'ApiName': 'Gateway',
            },
            statistic='Sum',
            period=core.Duration.minutes(1)),
        threshold=1,
        evaluation_periods=1)

    codedeploy.LambdaDeploymentGroup(
        self, 'DeploymentGroup',
        alias=alias,
        deployment_config=codedeploy.LambdaDeploymentConfig.CANARY_10_PERCENT_10_MINUTES,
        alarms=[failure_alarm])

    self.url_output = core.CfnOutput(self, 'Url', value=gw.url)
def _add_storage_widgets(self, metrics, storages_list, namespace, dimension_name):
    widgets_list = []
    for metrics_param in metrics:
        metric_list = []
        for metric in metrics_param.metrics:
            for storage in storages_list:
                cloudwatch_metric = cloudwatch.Metric(
                    namespace=namespace,
                    metric_name=metric,
                    dimensions={dimension_name: storage.id},
                )
                metric_list.append(cloudwatch_metric)
        graph_widget = self._generate_graph_widget(metrics_param.title, metric_list)
        widgets_list.append(graph_widget)
    return widgets_list
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    this_dir = path.dirname(__file__)

    handler = lmb.Function(self, 'Handler',
                           runtime=lmb.Runtime.PYTHON_3_7,
                           handler='handler.handler',
                           code=lmb.Code.from_asset(path.join(this_dir, 'lambda')))
    # For canary deployments
    alias = lmb.Alias(self, 'HandlerAlias',
                      alias_name='Current',
                      version=handler.current_version)
    gw = apigw.LambdaRestApi(self, 'Gateway',
                             description='Endpoint for app',
                             handler=alias)
    failure_alarm = cloudwatch.Alarm(
        self, 'FailureAlarm',
        metric=cloudwatch.Metric(
            metric_name='5XXError',
            namespace='AWS/ApiGateway',
            dimensions={'ApiName': 'Gateway'},
            statistic='Sum',
            period=core.Duration.minutes(1)),
        threshold=1,
        evaluation_periods=1)
    codedeploy.LambdaDeploymentGroup(
        self, 'Deploy',
        alias=alias,
        deployment_config=codedeploy.LambdaDeploymentConfig.CANARY_10_PERCENT_10_MINUTES,
        alarms=[failure_alarm])
    self.url_output = core.CfnOutput(self, 'Url', value=gw.url)
def get_alarm(self, params):
    description = params['description'].format(params['name'], self.account)
    metric = cloudwatch.Metric(
        metric_name=params['metric_name'],
        namespace=params['namespace'],
        dimensions={params['dimension']: params['name']})
    alarm = cloudwatch.Alarm(
        self,
        "{}Alarm".format(params['name']),
        alarm_description=description,
        alarm_name=description,
        comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        metric=metric,
        evaluation_periods=params['evaluation_periods'],
        period=core.Duration.seconds(params['period']),
        statistic=params['statistic'],
        threshold=params['threshold'],
        treat_missing_data=cloudwatch.TreatMissingData.MISSING)
    return alarm
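A hypothetical params dict this helper could accept; the keys are inferred from the lookups above, while every value is a placeholder:

# Illustrative input for get_alarm(); all values are placeholders.
queue_alarm_params = {
    'name': 'orders-queue',
    'description': 'Backlog alarm for {} in account {}',
    'metric_name': 'ApproximateNumberOfMessagesVisible',
    'namespace': 'AWS/SQS',
    'dimension': 'QueueName',
    'evaluation_periods': 3,
    'period': 300,
    'statistic': 'Maximum',
    'threshold': 1000,
}
alarm = self.get_alarm(queue_alarm_params)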
def __init__(self, scope: core.Construct, id: str, squid_asgs: list) -> None:
    super().__init__(scope, id)

    # SNS Topic for alarm
    self.squid_alarm_topic = sns.Topic(
        self, "squid-asg-alarm-topic",
        display_name='Squid ASG Alarm topic')

    # Create metric to use for triggering alarm when there is no CPU usage from the squid process
    for count, asg in enumerate(squid_asgs, start=1):
        squid_metric = cloudwatch.Metric(
            metric_name="procstat_cpu_usage",
            namespace='CWAgent',
            dimensions=dict(
                AutoScalingGroupName=asg.auto_scaling_group_name,
                pidfile="/var/run/squid.pid",
                process_name="squid"))

        # CloudWatch alarms to alert on Squid ASG issue
        squid_alarm = cloudwatch.Alarm(
            self, f"squid-alarm-{count}",
            alarm_description=f"Heart beat for Squid instance {count}",
            alarm_name=f"squid-alarm_{asg.auto_scaling_group_name}",
            comparison_operator=cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
            metric=squid_metric,
            period=core.Duration.seconds(10),
            evaluation_periods=1,
            threshold=0.0,
            statistic='Average',
            treat_missing_data=cloudwatch.TreatMissingData.BREACHING)

        squid_alarm.add_alarm_action(
            cw_actions.SnsAction(self.squid_alarm_topic))
        squid_alarm.add_ok_action(
            cw_actions.SnsAction(self.squid_alarm_topic))
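The `procstat_cpu_usage` metric in the `CWAgent` namespace is emitted by the CloudWatch agent's procstat plugin on the instances; a minimal agent-config sketch, shown here as a Python dict, with field values assumed rather than taken from the snippet:

# Hypothetical CloudWatch agent config fragment that would produce procstat_cpu_usage
# with the pidfile dimension used by the alarm above.
cw_agent_procstat_config = {
    "metrics": {
        "namespace": "CWAgent",
        "append_dimensions": {"AutoScalingGroupName": "${aws:AutoScalingGroupName}"},
        "metrics_collected": {
            "procstat": [
                {"pid_file": "/var/run/squid.pid", "measurement": ["cpu_usage"]}
            ]
        },
    }
}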
def get_dashboard(self, params):
    with open(params['dashboard_file']) as json_file:
        params['dashboard_widgets'] = json.load(json_file)
    graph_widgets = []
    for widget in params['dashboard_widgets']:
        metric = [
            cloudwatch.Metric(
                namespace=widget['properties']['metrics'][0][0],
                metric_name=widget['properties']['metrics'][0][1],
                dimensions={
                    widget['properties']['metrics'][0][2]: params['name']
                })
        ]
        graph_widget = cloudwatch.GraphWidget(height=widget['height'],
                                              width=widget['width'],
                                              left=metric)
        graph_widget.position(widget['x'], widget['y'])
        graph_widgets.append(graph_widget)
    dashboard = cloudwatch.Dashboard(self,
                                     "{}Dashboard".format(params['name']),
                                     dashboard_name=params['name'],
                                     widgets=[graph_widgets])
    return dashboard
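A hypothetical entry from the dashboard_file that the indexing above expects; only the key paths are inferred from the snippet, the concrete values are placeholders:

# Illustrative dashboard_widgets entry matching the lookups in get_dashboard().
example_widget = {
    "height": 6,
    "width": 12,
    "x": 0,
    "y": 0,
    "properties": {
        # [namespace, metric name, dimension name]; the dimension value is filled
        # in from params['name'] by get_dashboard().
        "metrics": [["AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName"]],
    },
}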
def __init__( self, scope: core.Construct, _id: str, vpc, bucket_para, # key_name, ddb_file_list, sqs_queue, sqs_queue_DLQ, ssm_bucket_para, ssm_credential_para, # s3bucket, **kwargs) -> None: super().__init__(scope, _id, **kwargs) # Create jobsender ec2 node jobsender = ec2.Instance( self, "jobsender", instance_name="s3_migrate_cluster_jobsender", instance_type=ec2.InstanceType( instance_type_identifier=jobsender_type), machine_image=linux_ami, # key_name=key_name, user_data=ec2.UserData.custom(user_data_jobsender), vpc=vpc, vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC)) # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH") # Don't need SSH since we use Session Manager # Assign EC2 Policy to use SSM and CWAgent jobsender.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "AmazonSSMManagedInstanceCore")) jobsender.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "CloudWatchAgentServerPolicy")) # jobsender.role.add_managed_policy( # iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess")) # Don't give full access s3 to ec2, violate security rule # Create Autoscaling Group with fixed 2*EC2 hosts worker_asg = autoscaling.AutoScalingGroup( self, "worker-asg", vpc=vpc, vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC), instance_type=ec2.InstanceType( instance_type_identifier=worker_type), machine_image=linux_ami, # key_name=key_name, # Optional if use SSM-SessionManager user_data=ec2.UserData.custom(user_data_worker), desired_capacity=1, min_capacity=1, max_capacity=10, spot_price="0.5") # TODO: There is no MetricsCollection in CDK autoscaling group high level API yet. # You need to enable "Group Metrics Collection" in EC2 Console Autoscaling Group - Monitoring tab for metric: # GroupDesiredCapacity, GroupInServiceInstances, GroupPendingInstances and etc. 
# worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH") # Don't need SSH since we use Session Manager # Assign EC2 Policy to use SSM and CWAgent worker_asg.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "AmazonSSMManagedInstanceCore")) worker_asg.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "CloudWatchAgentServerPolicy")) # Allow EC2 access new DynamoDB Table ddb_file_list.grant_full_access(jobsender) ddb_file_list.grant_full_access(worker_asg) # Allow EC2 access new sqs and its DLQ sqs_queue.grant_consume_messages(jobsender) sqs_queue.grant_send_messages(jobsender) sqs_queue.grant_consume_messages(worker_asg) sqs_queue_DLQ.grant_consume_messages(jobsender) # Allow EC2 access SSM Parameter Store, get bucket infor and get credential ssm_bucket_para.grant_read(jobsender) ssm_credential_para.grant_read(jobsender) ssm_credential_para.grant_read(worker_asg) # Allow EC2 access new s3 bucket # s3bucket.grant_read(jobsender) # s3bucket.grant_read(worker_asg) # Allow EC2 access exist s3 bucket bucket_name = '' for b in bucket_para: if bucket_name != b['src_bucket']: # 如果列了多个相同的Bucket,就跳过 bucket_name = b['src_bucket'] s3exist_bucket = s3.Bucket.from_bucket_name( self, bucket_name, # 用这个做id bucket_name=bucket_name) s3exist_bucket.grant_read(jobsender) s3exist_bucket.grant_read(worker_asg) # Dashboard to monitor SQS and EC2 board = cw.Dashboard(self, "s3_migrate", dashboard_name="s3_migrate_cluster") ec2_metric_net = cw.Metric( namespace="AWS/EC2", metric_name="NetworkOut", # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name}, period=core.Duration.minutes(1), statistic="Sum") ec2_metric_cpu_max = cw.Metric( namespace="AWS/EC2", metric_name="CPUUtilization", # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name}, period=core.Duration.minutes(1), statistic="Maximum") ec2_metric_cpu_avg = cw.Metric( namespace="AWS/EC2", metric_name="CPUUtilization", # dimensions={"AutoScalingGroupName": worker_asg.auto_scaling_group_name}, period=core.Duration.minutes(1)) autoscaling_GroupDesiredCapacity = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupDesiredCapacity", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) autoscaling_GroupInServiceInstances = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupInServiceInstances", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) autoscaling_GroupMinSize = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupMinSize", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) autoscaling_GroupMaxSize = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupMaxSize", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) # CWAgent collected metric cwagent_mem_avg = cw.Metric(namespace="CWAgent", metric_name="mem_used_percent", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, statistic="Average", period=core.Duration.minutes(1)) cwagent_mem_max = cw.Metric(namespace="CWAgent", metric_name="mem_used_percent", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, statistic="Maximum", period=core.Duration.minutes(1)) # CWAgent collected application logs - filter metric s3_migrate_log = logs.LogGroup(self, "applog", log_group_name="s3_migration_log") 
s3_migrate_log.add_metric_filter( "ERROR", metric_name="ERROR-Logs", metric_namespace="s3_migrate", metric_value="1", filter_pattern=logs.FilterPattern.literal('"ERROR"')) s3_migrate_log.add_metric_filter( "WARNING", metric_name="WARNING-Logs", metric_namespace="s3_migrate", metric_value="1", filter_pattern=logs.FilterPattern.literal('"WARNING"')) log_metric_ERROR = cw.Metric(namespace="s3_migrate", metric_name="ERROR-Logs", statistic="Sum", period=core.Duration.minutes(1)) log_metric_WARNING = cw.Metric(namespace="s3_migrate", metric_name="WARNING-Logs", statistic="Sum", period=core.Duration.minutes(1)) board.add_widgets( cw.GraphWidget(title="EC2-ALL-NETWORK", left=[ec2_metric_net]), cw.GraphWidget(title="EC2-ALL-CPU", left=[ec2_metric_cpu_avg, ec2_metric_cpu_max]), cw.GraphWidget(title="EC2-AutoscalingGroup-MEMORY", left=[cwagent_mem_max, cwagent_mem_avg]), cw.SingleValueWidget(title="EC2-AutoscalingGroup-Capacity", metrics=[ autoscaling_GroupDesiredCapacity, autoscaling_GroupInServiceInstances, autoscaling_GroupMinSize, autoscaling_GroupMaxSize ], height=6), ) board.add_widgets( cw.GraphWidget( title="SQS-Jobs", left=[ sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)) ]), cw.GraphWidget( title="SQS-DeadLetterQueue", left=[ sqs_queue_DLQ. metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)) ]), cw.GraphWidget(title="ERROR/WARNING Logs", left=[log_metric_ERROR], right=[log_metric_WARNING], height=6), cw.SingleValueWidget( title="Running/Waiting and Death Jobs", metrics=[ sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)) ], height=6)) # Autoscaling up when visible message > 100 every 3 of 3 x 5 mins worker_asg.scale_on_metric( "scaleup", metric=sqs_queue.metric_approximate_number_of_messages_visible(), scaling_steps=[ autoscaling.ScalingInterval(change=1, lower=100, upper=500), autoscaling.ScalingInterval(change=2, lower=500), autoscaling.ScalingInterval(change=0, upper=100, lower=0) ], adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY) # Alarm for queue empty and ec2 > 1 # 消息队列空(没有Visible+Invisible),并且EC2不止一台,则告警,并设置EC2为1台 # 这里还可以根据场景调整,如果Jobsender也用来做传输,则可以在这里设置没有任务的时候,Autoscaling Group为0 metric_all_message = cw.MathExpression( expression="IF(((a+b) == 0) AND (c >1), 0, 1)", # a+b且c>1则设置为0,告警 label="empty_queue_expression", using_metrics={ "a": sqs_queue.metric_approximate_number_of_messages_visible(), "b": sqs_queue.metric_approximate_number_of_messages_not_visible(), "c": autoscaling_GroupInServiceInstances }) alarm_0 = cw.Alarm( self, "SQSempty", alarm_name= "s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster", metric=metric_all_message, threshold=0, comparison_operator=cw.ComparisonOperator. 
LESS_THAN_OR_EQUAL_TO_THRESHOLD, evaluation_periods=3, datapoints_to_alarm=3, treat_missing_data=cw.TreatMissingData.NOT_BREACHING) alarm_topic_empty = sns.Topic( self, "SQS queue empty and ec2 more than 1 in Cluster") # 这个告警可以作为批量传输完成后的通知,而且这样做可以只通知一次,而不会不停地通知 alarm_topic_empty.add_subscription( subscription=sub.EmailSubscription(alarm_email)) alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty)) # If queue empty, set autoscale down to 1 EC2 action_shutdown = autoscaling.StepScalingAction( self, "shutdown", auto_scaling_group=worker_asg, adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY) action_shutdown.add_adjustment(adjustment=1, upper_bound=0) alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown)) # While message in SQS-DLQ, alarm to sns alarm_DLQ = cw.Alarm( self, "SQS_DLQ", alarm_name= "s3-migration-cluster-SQS DLQ more than 1 message-Cluster", metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible( ), threshold=0, comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD, evaluation_periods=3, datapoints_to_alarm=3, treat_missing_data=cw.TreatMissingData.IGNORE) alarm_topic_DLQ = sns.Topic(self, "SQS DLQ more than 1 message-Cluster") alarm_topic_DLQ.add_subscription( subscription=sub.EmailSubscription(alarm_email)) alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ)) # Output core.CfnOutput(self, "JobSenderEC2", value=jobsender.instance_id) core.CfnOutput(self, "WorkerEC2AutoscalingGroup", value=worker_asg.auto_scaling_group_name) core.CfnOutput(self, "Dashboard", value="CloudWatch Dashboard name s3_migrate_cluster") core.CfnOutput(self, "Alarm", value="CloudWatch SQS queue empty Alarm for cluster: " + alarm_email)
def __init__( self, scope: core.Construct, _id: str, vpc, bucket_para, # key_name, ddb_file_list, sqs_queue, sqs_queue_DLQ, ssm_bucket_para, ssm_credential_para, s3bucket, s3_deploy, **kwargs) -> None: super().__init__(scope, _id, **kwargs) # Create environment variable into userdata env_var = f'export table_queue_name={ddb_file_list.table_name}\n' \ f'export sqs_queue_name={sqs_queue.queue_name}\n' \ f'export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\n' env_var_st = f'echo \"export table_queue_name={ddb_file_list.table_name}\" >> /etc/rc.local\n' \ f'echo \"export sqs_queue_name={sqs_queue.queue_name}\" >> /etc/rc.local\n' \ f'echo \"export ssm_parameter_bucket={ssm_bucket_para.parameter_name}\" >> /etc/rc.local\n' # Create log group and put group name into userdata s3_migrate_log = logs.LogGroup(self, "applog") cw_agent_config['logs']['logs_collected']['files']['collect_list'][0][ 'log_group_name'] = s3_migrate_log.log_group_name cw_agent_config['logs']['logs_collected']['files']['collect_list'][1][ 'log_group_name'] = s3_migrate_log.log_group_name cw_agent_config['metrics']['append_dimensions'][ 'AutoScalingGroupName'] = "\\${aws:AutoScalingGroupName}" cw_agent_config['metrics']['append_dimensions'][ 'InstanceId'] = "\\${aws:InstanceId}" cw_agent_config_str = json.dumps(cw_agent_config, indent=4).replace("\\\\", "\\") userdata_head = user_data_part1 + cw_agent_config_str + user_data_part2 + \ s3_deploy.bucket_name + " .\n" + env_var + env_var_st jobsender_userdata = userdata_head + user_data_jobsender_p worker_userdata = userdata_head + user_data_worker_p # Create jobsender ec2 node jobsender = autoscaling.AutoScalingGroup( self, "jobsender", instance_type=ec2.InstanceType( instance_type_identifier=jobsender_type), machine_image=linux_ami, # key_name=key_name, user_data=ec2.UserData.custom(jobsender_userdata), vpc=vpc, vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC), desired_capacity=1, min_capacity=0, max_capacity=1) # jobsender.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH") # Don't need SSH since we use Session Manager # Assign EC2 Policy to use SSM and CWAgent jobsender.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "AmazonSSMManagedInstanceCore")) jobsender.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "CloudWatchAgentServerPolicy")) # jobsender.role.add_managed_policy( # iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess")) # Don't give full access s3 to ec2, violate security rule # Create Autoscaling Group with fixed 2*EC2 hosts worker_asg = autoscaling.AutoScalingGroup( self, "worker-asg", vpc=vpc, vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC), instance_type=ec2.InstanceType( instance_type_identifier=worker_type), machine_image=linux_ami, # key_name=key_name, # Optional if use SSM-SessionManager user_data=ec2.UserData.custom(worker_userdata), desired_capacity=2, min_capacity=2, max_capacity=10, spot_price="0.5") # TODO: There is no MetricsCollection in CDK autoscaling group high level API yet. # You need to enable "Group Metrics Collection" in EC2 Console Autoscaling Group - Monitoring tab for metric: # GroupDesiredCapacity, GroupInServiceInstances, GroupPendingInstances and etc. 
# worker_asg.connections.allow_from_any_ipv4(ec2.Port.tcp(22), "Internet access SSH") # Don't need SSH since we use Session Manager # Assign EC2 Policy to use SSM and CWAgent worker_asg.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "AmazonSSMManagedInstanceCore")) worker_asg.role.add_managed_policy( iam.ManagedPolicy.from_aws_managed_policy_name( "CloudWatchAgentServerPolicy")) # Allow EC2 access new DynamoDB Table ddb_file_list.grant_full_access(jobsender) ddb_file_list.grant_full_access(worker_asg) # Allow EC2 access new sqs and its DLQ sqs_queue.grant_consume_messages(jobsender) sqs_queue.grant_send_messages(jobsender) sqs_queue.grant_consume_messages(worker_asg) sqs_queue_DLQ.grant_consume_messages(jobsender) # Allow EC2 access SSM Parameter Store, get bucket infor and get credential ssm_bucket_para.grant_read(jobsender) ssm_credential_para.grant_read(jobsender) ssm_credential_para.grant_read(worker_asg) # Allow EC2 access source code on s3_deploy bucket s3_deploy.grant_read(jobsender) s3_deploy.grant_read(worker_asg) # Allow EC2 access new s3 bucket s3bucket.grant_read(jobsender) s3bucket.grant_read(worker_asg) # Allow EC2 access exist s3 bucket for PUT mode: readonly access the source buckets bucket_name = '' for b in bucket_para: if bucket_name != b['src_bucket']: # 如果列了多个相同的Bucket,就跳过 bucket_name = b['src_bucket'] s3exist_bucket = s3.Bucket.from_bucket_name( self, bucket_name, # 用这个做id bucket_name=bucket_name) s3exist_bucket.grant_read(jobsender) s3exist_bucket.grant_read(worker_asg) # Allow EC2 access exist s3 bucket for GET mode: read and write access the destination buckets # bucket_name = '' # for b in bucket_para: # if bucket_name != b['des_bucket']: # 如果列了多个相同的Bucket,就跳过 # bucket_name = b['des_bucket'] # s3exist_bucket = s3.Bucket.from_bucket_name(self, # bucket_name, # 用这个做id # bucket_name=bucket_name) # s3exist_bucket.grant_read_write(jobsender) # s3exist_bucket.grant_read_write(worker_asg) # Dashboard to monitor SQS and EC2 board = cw.Dashboard(self, "s3_migrate") ec2_metric_cpu_avg = cw.Metric(namespace="AWS/EC2", metric_name="CPUUtilization", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) ec2_metric_net_out = cw.MathExpression( expression= "SEARCH('{AWS/EC2, InstanceId} NetworkOut', 'Average', 60)", label="EC2-NetworkOut", using_metrics={}) autoscaling_GroupDesiredCapacity = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupDesiredCapacity", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) autoscaling_GroupInServiceInstances = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupInServiceInstances", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) autoscaling_GroupMinSize = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupMinSize", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) autoscaling_GroupMaxSize = cw.Metric( namespace="AWS/AutoScaling", metric_name="GroupMaxSize", dimensions={ "AutoScalingGroupName": worker_asg.auto_scaling_group_name }, period=core.Duration.minutes(1)) # CWAgent collected metric cwagent_mem_avg = cw.MathExpression( expression= "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName=" + worker_asg.auto_scaling_group_name + " AND MetricName=mem_used_percent)', 'Average', 60)", label="mem_avg", using_metrics={}) cwagent_disk_avg = 
cw.MathExpression( expression= "SEARCH('{CWAgent, path, InstanceId, AutoScalingGroupName, device, fstype} " "(AutoScalingGroupName=" + worker_asg.auto_scaling_group_name + " AND MetricName=disk_used_percent AND path=\"/\")', 'Average', 60)", label="disk_avg", using_metrics={}) cwagent_net_tcp = cw.MathExpression( expression= "SEARCH('{CWAgent, AutoScalingGroupName, InstanceId} (AutoScalingGroupName=" + worker_asg.auto_scaling_group_name + " AND MetricName=tcp_established)', 'Average', 60)", label="tcp_conn", using_metrics={}) # CWAgent collected application logs - filter metric s3_migrate_log.add_metric_filter( "Completed-bytes", metric_name="Completed-bytes", metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[date, time, info, hs, p="--->Complete", bytes, key]')) s3_migrate_log.add_metric_filter( "Uploading-bytes", metric_name="Uploading-bytes", metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[date, time, info, hs, p="--->Uploading", bytes, key]')) s3_migrate_log.add_metric_filter( "Downloading-bytes", metric_name="Downloading-bytes", metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[date, time, info, hs, p="--->Downloading", bytes, key]')) traffic_metric_Complete = cw.Metric(namespace="s3_migrate", metric_name="Completed-bytes", statistic="Sum", period=core.Duration.minutes(1)) traffic_metric_Upload = cw.Metric(namespace="s3_migrate", metric_name="Uploading-bytes", statistic="Sum", period=core.Duration.minutes(1)) traffic_metric_Download = cw.Metric(namespace="s3_migrate", metric_name="Downloading-bytes", statistic="Sum", period=core.Duration.minutes(1)) s3_migrate_log.add_metric_filter( "ERROR", metric_name="ERROR-Logs", metric_namespace="s3_migrate", metric_value="1", filter_pattern=logs.FilterPattern.literal('"ERROR"')) s3_migrate_log.add_metric_filter( "WARNING", metric_name="WARNING-Logs", metric_namespace="s3_migrate", metric_value="1", filter_pattern=logs.FilterPattern.literal('"WARNING"')) log_metric_ERROR = cw.Metric(namespace="s3_migrate", metric_name="ERROR-Logs", statistic="Sum", period=core.Duration.minutes(1)) log_metric_WARNING = cw.Metric(namespace="s3_migrate", metric_name="WARNING-Logs", statistic="Sum", period=core.Duration.minutes(1)) board.add_widgets( cw.GraphWidget(title="S3-MIGRATION-TOTAL-TRAFFIC", left=[ traffic_metric_Complete, traffic_metric_Upload, traffic_metric_Download ], left_y_axis=cw.YAxisProps(label="Bytes/min", show_units=False)), cw.GraphWidget(title="ERROR/WARNING LOGS", left=[log_metric_ERROR], left_y_axis=cw.YAxisProps(label="Count", show_units=False), right=[log_metric_WARNING], right_y_axis=cw.YAxisProps(label="Count", show_units=False)), cw.GraphWidget( title="SQS-JOBS", left=[ sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)) ]), cw.SingleValueWidget( title="RUNNING, WAITING & DEATH JOBS", metrics=[ sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. 
metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)) ], height=6)) board.add_widgets( cw.GraphWidget(title="EC2-AutoscalingGroup-TCP", left=[cwagent_net_tcp], left_y_axis=cw.YAxisProps(label="Count", show_units=False)), cw.GraphWidget(title="EC2-AutoscalingGroup-CPU/MEMORY", left=[ec2_metric_cpu_avg, cwagent_mem_avg], left_y_axis=cw.YAxisProps(max=100, min=0, label="%", show_units=False)), cw.GraphWidget(title="EC2-AutoscalingGroup-DISK", left=[cwagent_disk_avg], left_y_axis=cw.YAxisProps(max=100, min=0, label="%", show_units=False)), cw.SingleValueWidget(title="EC2-AutoscalingGroup-CAPACITY", metrics=[ autoscaling_GroupDesiredCapacity, autoscaling_GroupInServiceInstances, autoscaling_GroupMinSize, autoscaling_GroupMaxSize ], height=6)) board.add_widgets( cw.GraphWidget(title="EC2-NetworkOut", left=[ec2_metric_net_out], left_y_axis=cw.YAxisProps(label="Bytes/min", show_units=False))) # Autoscaling up when visible message > 100 in 5 mins worker_asg.scale_on_metric( "scaleup", metric=sqs_queue.metric_approximate_number_of_messages_visible(), scaling_steps=[ autoscaling.ScalingInterval(change=1, lower=100, upper=500), autoscaling.ScalingInterval(change=2, lower=500), autoscaling.ScalingInterval(change=0, upper=100, lower=0) ], adjustment_type=autoscaling.AdjustmentType.CHANGE_IN_CAPACITY) # Alarm for queue empty and ec2 > 1 # 消息队列空(没有Visible+Invisible),并且EC2不止一台,则告警,并设置EC2为1台 # 这里还可以根据场景调整,如果Jobsender也用来做传输,则可以在这里设置没有任务的时候,Autoscaling Group为0 metric_all_message = cw.MathExpression( expression="IF(((a+b) == 0) AND (c >1), 0, 1)", # a+b且c>1则设置为0,告警 label="empty_queue_expression", using_metrics={ "a": sqs_queue.metric_approximate_number_of_messages_visible(), "b": sqs_queue.metric_approximate_number_of_messages_not_visible(), "c": autoscaling_GroupInServiceInstances }) alarm_0 = cw.Alarm( self, "SQSempty", alarm_name= "s3-migration-cluster-SQS queue empty and ec2 more than 1 in Cluster", metric=metric_all_message, threshold=0, comparison_operator=cw.ComparisonOperator. 
LESS_THAN_OR_EQUAL_TO_THRESHOLD, evaluation_periods=3, datapoints_to_alarm=3, treat_missing_data=cw.TreatMissingData.NOT_BREACHING) alarm_topic_empty = sns.Topic( self, "SQS queue empty and ec2 more than 1 in Cluster") # 这个告警可以作为批量传输完成后的通知,而且这样做可以只通知一次,而不会不停地通知 alarm_topic_empty.add_subscription( subscription=sub.EmailSubscription(alarm_email)) alarm_0.add_alarm_action(action.SnsAction(alarm_topic_empty)) # If queue empty, set autoscale down to 1 EC2 action_shutdown = autoscaling.StepScalingAction( self, "shutdown", auto_scaling_group=worker_asg, adjustment_type=autoscaling.AdjustmentType.EXACT_CAPACITY) action_shutdown.add_adjustment(adjustment=1, upper_bound=0) alarm_0.add_alarm_action(action.AutoScalingAction(action_shutdown)) # While message in SQS-DLQ, alarm to sns alarm_DLQ = cw.Alarm( self, "SQS_DLQ", alarm_name= "s3-migration-cluster-SQS DLQ more than 1 message-Cluster", metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible( ), threshold=0, comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD, evaluation_periods=3, datapoints_to_alarm=3, treat_missing_data=cw.TreatMissingData.IGNORE) alarm_topic_DLQ = sns.Topic(self, "SQS DLQ more than 1 message-Cluster") alarm_topic_DLQ.add_subscription( subscription=sub.EmailSubscription(alarm_email)) alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic_DLQ)) # Output core.CfnOutput(self, "LogGroup", value=s3_migrate_log.log_group_name) core.CfnOutput(self, "Dashboard", value="CloudWatch Dashboard name s3_migrate_cluster") core.CfnOutput(self, "Alarm", value="CloudWatch SQS queue empty Alarm for cluster: " + alarm_email)
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None: super().__init__(scope, construct_id, **kwargs) high_cpu_topic = sns.Topic(self, 'high-cpu-topic', display_name='myHighCpuAlarm') # phone number format must be 12225558888 for US phone_param = ssm.StringParameter.from_string_parameter_name(self, 'phone-param', 'notification-phone') high_cpu_topic_sub = sns.Subscription(self, 'high-cpu-topic-sub', topic=high_cpu_topic, protocol=sns.SubscriptionProtocol.SMS, endpoint=phone_param.string_value) default_vpc = ec2.Vpc.from_lookup(self, 'default-vpc', is_default=True) monitored_instance = ec2.Instance(self, 'monitored-instance', instance_name='devassoc-monitored', instance_type=type.R3_XLARGE, machine_image=ec2.MachineImage.generic_linux( ami_map=ami_map ), vpc=default_vpc) high_cpu_metric = cw.Metric(namespace='AWS/EC2', metric_name='CPUUtilization', dimensions={ 'InstanceId': monitored_instance.instance_id }, statistic='Average', unit=cw.Unit.PERCENT, period=core.Duration.seconds(300)) high_cpu_alarm = high_cpu_metric.create_alarm(self, 'high-cpu-alarm', alarm_name='cpu-mon', alarm_description='Alarm when CPU exceeds 70%', comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD, evaluation_periods=2, period=core.Duration.seconds(300), threshold=70, actions_enabled=True) high_cpu_action = cwa.SnsAction(high_cpu_topic) high_cpu_alarm.add_alarm_action(high_cpu_action) ec2.CfnEIP(self, 'devassoc-elastic-ip') # not really a service role, but there are problems with that, per # https://github.com/aws/aws-cdk/issues/3492 config_service_role = iam.Role(self, 'devassoc-config-service-role', assumed_by=iam.ServicePrincipal('config.amazonaws.com'), managed_policies=[ iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AWSConfigRole') ]) config_recorder = config.CfnConfigurationRecorder(self, 'devassoc-recorder', name='ConfigRecorder', role_arn=config_service_role.role_arn, recording_group=config.CfnConfigurationRecorder.RecordingGroupProperty( all_supported=True) ) config_bucket = s3.Bucket(self, 'config-bucket', bucket_name='devassoc-config', removal_policy=core.RemovalPolicy.DESTROY, auto_delete_objects=True) config_bucket.add_to_resource_policy(iam.PolicyStatement(effect=iam.Effect.ALLOW, principals=[iam.ServicePrincipal('config.amazonaws.com')], resources=[config_bucket.bucket_arn], actions=['s3:GetBucketAcl'])) config_bucket.add_to_resource_policy(iam.PolicyStatement(effect=iam.Effect.ALLOW, principals=[iam.ServicePrincipal('config.amazonaws.com')], resources=[config_bucket.arn_for_objects( f"AWSLogs/{core.Stack.of(self).account}/Config/*")], actions=['s3:PutObject'], conditions={'StringEquals': { 's3:x-amz-acl': 'bucket-owner-full-control'}})) eip_rule = config.ManagedRule(self, 'devassoc-managed-rule', identifier=config.ManagedRuleIdentifiers.EIP_ATTACHED, config_rule_name='devassoc-eip-rule') eip_rule.node.add_dependency(config_recorder) eip_compliance_topic = sns.Topic(self, 'eip-compliance-topic', display_name='EIP Compliance Topic') eip_compliance_topic_sub = sns.Subscription(self, 'eip-compliance-topic-sub', topic=eip_compliance_topic, protocol=sns.SubscriptionProtocol.SMS, endpoint=phone_param.string_value) eip_rule.on_compliance_change('eip-compliance-change', target=targets.SnsTopic(eip_compliance_topic)) config.CfnDeliveryChannel(self, 'devassoc-config-delivery', s3_bucket_name=config_bucket.bucket_name, sns_topic_arn=eip_compliance_topic.topic_arn)
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None: super().__init__(scope, id, **kwargs) security_distribution_list_email = '*****@*****.**' # securityhub_instance = securityhub.CfnHub(self, 'SecurityHub') # Ensure AWS Config is enabled / Ensure CloudTrail is enabled in all Regions 2.1 - 2.8 cloudtrail_bucket_accesslogs = s3.Bucket( self, "CloudTrailS3Accesslogs", block_public_access=s3.BlockPublicAccess.BLOCK_ALL, encryption=s3.BucketEncryption.S3_MANAGED, removal_policy=core.RemovalPolicy.RETAIN) cloudtrail_bucket = s3.Bucket( self, "CloudTrailS3", block_public_access=s3.BlockPublicAccess.BLOCK_ALL, encryption=s3.BucketEncryption.S3_MANAGED, removal_policy=core.RemovalPolicy.RETAIN, server_access_logs_bucket=cloudtrail_bucket_accesslogs, ) cloudtrail_kms = kms.Key(self, "CloudTrailKey", enable_key_rotation=True) # CloudTrail - single account, not Organization trail = cloudtrail.Trail( self, "CloudTrail", enable_file_validation=True, is_multi_region_trail=True, include_global_service_events=True, send_to_cloud_watch_logs=True, cloud_watch_logs_retention=logs.RetentionDays.FOUR_MONTHS, bucket=cloudtrail_bucket, kms_key=cloudtrail_kms) cloudtrail_kms.grant(iam.ServicePrincipal('cloudtrail.amazonaws.com'), 'kms:DescribeKey') cloudtrail_kms.grant( iam.ServicePrincipal( 'cloudtrail.amazonaws.com', conditions={ 'StringLike': { 'kms:EncryptionContext:aws:cloudtrail:arn': 'arn:aws:cloudtrail:*:' + core.Stack.of(self).account + ':trail/*' } }), 'kms:GenerateDataKey*') cloudtrail_kms.add_to_resource_policy( iam.PolicyStatement( actions=["kms:Decrypt", "kms:ReEncryptFrom"], conditions={ 'StringEquals': { 'kms:CallerAccount': core.Stack.of(self).account }, 'StringLike': { 'kms:EncryptionContext:aws:cloudtrail:arn': 'arn:aws:cloudtrail:*:' + core.Stack.of(self).account + ':trail/*' } }, effect=iam.Effect.ALLOW, principals=[iam.AnyPrincipal()], resources=['*'])) cloudtrail_kms.add_to_resource_policy( iam.PolicyStatement(actions=["kms:CreateAlias"], conditions={ 'StringEquals': { 'kms:CallerAccount': core.Stack.of(self).account, 'kms:ViaService': 'ec2.' 
+ core.Stack.of(self).region + '.amazonaws.com' } }, effect=iam.Effect.ALLOW, principals=[iam.AnyPrincipal()], resources=['*'])) cloudtrail_kms.add_to_resource_policy( iam.PolicyStatement( actions=["kms:Decrypt", "kms:ReEncryptFrom"], conditions={ 'StringEquals': { 'kms:CallerAccount': core.Stack.of(self).account }, 'StringLike': { 'kms:EncryptionContext:aws:cloudtrail:arn': 'arn:aws:cloudtrail:*:' + core.Stack.of(self).account + ':trail/*' } }, effect=iam.Effect.ALLOW, principals=[iam.AnyPrincipal()], resources=['*'])) config_role = iam.CfnServiceLinkedRole( self, id='ServiceLinkedRoleConfig', aws_service_name='config.amazonaws.com') global_config = config.CfnConfigurationRecorder(self, 'ConfigRecorder', name='default', # role_arn=config_role.role_arn, role_arn="arn:aws:iam::" + \ core.Stack.of( self).account+":role/aws-service-role/config.amazonaws.com/AWSServiceRoleForConfig", # role_arn=config_role.get_att( # attribute_name='resource.arn').to_string(), recording_group=config.CfnConfigurationRecorder.RecordingGroupProperty( all_supported=True, include_global_resource_types=True ) ) config_bucket = s3.Bucket( self, "ConfigS3", block_public_access=s3.BlockPublicAccess.BLOCK_ALL, encryption=s3.BucketEncryption.S3_MANAGED, removal_policy=core.RemovalPolicy.RETAIN, ) config_bucket.add_to_resource_policy( iam.PolicyStatement( actions=['s3:GetBucketAcl'], effect=iam.Effect.ALLOW, principals=[iam.ServicePrincipal('config.amazonaws.com')], resources=[config_bucket.bucket_arn])) config_bucket.add_to_resource_policy( iam.PolicyStatement( actions=['s3:PutObject'], effect=iam.Effect.ALLOW, principals=[iam.ServicePrincipal('config.amazonaws.com')], resources=[ config_bucket.arn_for_objects('AWSLogs/' + core.Stack.of(self).account + '/Config/*') ], conditions={ "StringEquals": { 's3:x-amz-acl': 'bucket-owner-full-control', } })) config_delivery_stream = config.CfnDeliveryChannel( self, "ConfigDeliveryChannel", s3_bucket_name=config_bucket.bucket_name) # Config Aggregator in Organizations account # config_aggregator = config.CfnConfigurationAggregator(self, 'ConfigAggregator', # configuration_aggregator_name='ConfigAggregator', # organization_aggregation_source=config.CfnConfigurationAggregator.OrganizationAggregationSourceProperty( # role_arn=iam.Role(self, "AWSConfigRoleForOrganizations", # assumed_by=iam.ServicePrincipal( # 'config.amazonaws.com'), # managed_policies=[iam.ManagedPolicy.from_aws_managed_policy_name( # 'service-role/AWSConfigRoleForOrganizations')] # ).role_arn, # all_aws_regions=True # ) # ) # 2.9 – Ensure VPC flow logging is enabled in all VPCs # vpc = ec2.Vpc.from_lookup(self, "VPC", # is_default=True, # ) # S3 for VPC flow logs # vpc_flow_logs_bucket = s3.Bucket(self, "VPCFlowLogsBucket", # block_public_access=s3.BlockPublicAccess.BLOCK_ALL, # encryption=s3.BucketEncryption.S3_MANAGED, # removal_policy=core.RemovalPolicy.RETAIN # ) # Ensure a log metric filter and alarm exist for 3.1 – 3.14 security_notifications_topic = sns.Topic(self, 'CIS_Topic', display_name='CIS_Topic', topic_name='CIS_Topic') sns.Subscription(self, 'CIS_Subscription', topic=security_notifications_topic, protocol=sns.SubscriptionProtocol.EMAIL, endpoint=security_distribution_list_email) cloudwatch_actions_cis = cloudwatch_actions.SnsAction( security_notifications_topic) cis_metricfilter_alarms = { 'CIS-3.1-UnauthorizedAPICalls': '($.errorCode="*UnauthorizedOperation") || ($.errorCode="AccessDenied*")', 'CIS-3.2-ConsoleSigninWithoutMFA': '($.eventName="ConsoleLogin") && ($.additionalEventData.MFAUsed 
!="Yes")', 'RootAccountUsageAlarm': '$.userIdentity.type="Root" && $.userIdentity.invokedBy NOT EXISTS && $.eventType !="AwsServiceEvent"', 'CIS-3.4-IAMPolicyChanges': '($.eventName=DeleteGroupPolicy) || ($.eventName=DeleteRolePolicy) || ($.eventName=DeleteUserPolicy) || ($.eventName=PutGroupPolicy) || ($.eventName=PutRolePolicy) || ($.eventName=PutUserPolicy) || ($.eventName=CreatePolicy) || ($.eventName=DeletePolicy) || ($.eventName=CreatePolicyVersion) || ($.eventName=DeletePolicyVersion) || ($.eventName=AttachRolePolicy) || ($.eventName=DetachRolePolicy) || ($.eventName=AttachUserPolicy) || ($.eventName=DetachUserPolicy) || ($.eventName=AttachGroupPolicy) || ($.eventName=DetachGroupPolicy)', 'CIS-3.5-CloudTrailChanges': '($.eventName=CreateTrail) || ($.eventName=UpdateTrail) || ($.eventName=DeleteTrail) || ($.eventName=StartLogging) || ($.eventName=StopLogging)', 'CIS-3.6-ConsoleAuthenticationFailure': '($.eventName=ConsoleLogin) && ($.errorMessage="Failed authentication")', 'CIS-3.7-DisableOrDeleteCMK': '($.eventSource=kms.amazonaws.com) && (($.eventName=DisableKey) || ($.eventName=ScheduleKeyDeletion))', 'CIS-3.8-S3BucketPolicyChanges': '($.eventSource=s3.amazonaws.com) && (($.eventName=PutBucketAcl) || ($.eventName=PutBucketPolicy) || ($.eventName=PutBucketCors) || ($.eventName=PutBucketLifecycle) || ($.eventName=PutBucketReplication) || ($.eventName=DeleteBucketPolicy) || ($.eventName=DeleteBucketCors) || ($.eventName=DeleteBucketLifecycle) || ($.eventName=DeleteBucketReplication))', 'CIS-3.9-AWSConfigChanges': '($.eventSource=config.amazonaws.com) && (($.eventName=StopConfigurationRecorder) || ($.eventName=DeleteDeliveryChannel) || ($.eventName=PutDeliveryChannel) || ($.eventName=PutConfigurationRecorder))', 'CIS-3.10-SecurityGroupChanges': '($.eventName=AuthorizeSecurityGroupIngress) || ($.eventName=AuthorizeSecurityGroupEgress) || ($.eventName=RevokeSecurityGroupIngress) || ($.eventName=RevokeSecurityGroupEgress) || ($.eventName=CreateSecurityGroup) || ($.eventName=DeleteSecurityGroup)', 'CIS-3.11-NetworkACLChanges': '($.eventName=CreateNetworkAcl) || ($.eventName=CreateNetworkAclEntry) || ($.eventName=DeleteNetworkAcl) || ($.eventName=DeleteNetworkAclEntry) || ($.eventName=ReplaceNetworkAclEntry) || ($.eventName=ReplaceNetworkAclAssociation)', 'CIS-3.12-NetworkGatewayChanges': '($.eventName=CreateCustomerGateway) || ($.eventName=DeleteCustomerGateway) || ($.eventName=AttachInternetGateway) || ($.eventName=CreateInternetGateway) || ($.eventName=DeleteInternetGateway) || ($.eventName=DetachInternetGateway)', 'CIS-3.13-RouteTableChanges': '($.eventName=CreateRoute) || ($.eventName=CreateRouteTable) || ($.eventName=ReplaceRoute) || ($.eventName=ReplaceRouteTableAssociation) || ($.eventName=DeleteRouteTable) || ($.eventName=DeleteRoute) || ($.eventName=DisassociateRouteTable)', 'CIS-3.14-VPCChanges': '($.eventName=CreateVpc) || ($.eventName=DeleteVpc) || ($.eventName=ModifyVpcAttribute) || ($.eventName=AcceptVpcPeeringConnection) || ($.eventName=CreateVpcPeeringConnection) || ($.eventName=DeleteVpcPeeringConnection) || ($.eventName=RejectVpcPeeringConnection) || ($.eventName=AttachClassicLinkVpc) || ($.eventName=DetachClassicLinkVpc) || ($.eventName=DisableVpcClassicLink) || ($.eventName=EnableVpcClassicLink)', } for x, y in cis_metricfilter_alarms.items(): str_x = str(x) str_y = str(y) logs.MetricFilter( self, "MetricFilter_" + str_x, log_group=trail.log_group, filter_pattern=logs.JsonPattern(json_pattern_string=str_y), metric_name=str_x, metric_namespace="LogMetrics", 
metric_value='1') cloudwatch.Alarm( self, "Alarm_" + str_x, alarm_name=str_x, alarm_description=str_x, statistic='Sum', period=core.Duration.minutes(5), comparison_operator=cloudwatch.ComparisonOperator. GREATER_THAN_OR_EQUAL_TO_THRESHOLD, evaluation_periods=1, threshold=1, metric=cloudwatch.Metric(metric_name=str_x, namespace="LogMetrics"), ).add_alarm_action(cloudwatch_actions_cis) # IAM Password Policy custom resource CIS 1.5 - 1.11 cfn_template = cfn_inc.CfnInclude( self, "includeTemplate", template_file="account-password-policy.yaml", parameters={ "MaxPasswordAge": 90, "MinimumPasswordLength": 14, "PasswordReusePrevention": 24, "RequireLowercaseCharacters": True, "RequireNumbers": True, "RequireSymbols": True, "RequireUppercaseCharacters": True, }) # CIS 1.20 support_role = iam.Role( self, "SupportRole", assumed_by=iam.AccountPrincipal( account_id=core.Stack.of(self).account), managed_policies=[ iam.ManagedPolicy.from_aws_managed_policy_name( 'AWSSupportAccess') ], role_name='AWSSupportAccess') guardduty_detector = guardduty.CfnDetector(self, 'GuardDutyDetector', enable=True) guardduty_event = events.Rule( self, 'GuardDutyEvent', rule_name='guardduty-notification', description='GuardDuty Notification', event_pattern=events.EventPattern( source=['aws.guardduty'], detail_type=['GuardDuty Finding']), targets=[events_targets.SnsTopic(security_notifications_topic)])
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    bucket_name = 'devassoc-monitored'
    bucket = s3.Bucket(self, 'bucket-monitored',
                       bucket_name=bucket_name,
                       removal_policy=core.RemovalPolicy.DESTROY,
                       auto_delete_objects=True)
    core.CfnOutput(self, 'monitored-bucket', value=bucket.bucket_name)

    size_metric = cw.Metric(namespace='AWS/S3',
                            metric_name='BucketSizeBytes',
                            dimensions={
                                'BucketName': bucket.bucket_name,
                                'StorageType': 'StandardStorage'
                            },
                            period=core.Duration.days(1))
    size_alarm = size_metric.create_alarm(
        self, 'bucket-alarm',
        alarm_name='S3 Storage Alarm',
        comparison_operator=cw.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=1,
        period=core.Duration.days(1),
        threshold=1000,
        actions_enabled=True)
    size_topic = sns.Topic(self, 'size-topic',
                           display_name='My S3 Alarm List')
    email_param = ssm.StringParameter.from_string_parameter_name(
        self, 'email-param', 'notification-email')
    size_topic_sub = sns.Subscription(
        self, 'size-topic-sub',
        topic=size_topic,
        protocol=sns.SubscriptionProtocol.EMAIL,
        endpoint=email_param.string_value)
    size_action = cwa.SnsAction(size_topic)
    size_alarm.add_alarm_action(size_action)

    bucket_name = 'devassoc-s3-logs'
    log_bucket = s3.Bucket(self, 'bucket-s3-logs',
                           bucket_name=bucket_name,
                           removal_policy=core.RemovalPolicy.DESTROY,
                           auto_delete_objects=True)
    s3_trail = ct.Trail(self, 'bucket-trail',
                        bucket=log_bucket,
                        trail_name='s3_logs')
    s3_trail.add_s3_event_selector([ct.S3EventSelector(bucket=bucket)])
    s3_trail.log_all_s3_data_events()

    single_value_widget = cw.SingleValueWidget(metrics=[size_metric])
    graph_widget = cw.GraphWidget(left=[size_metric])
    cw.Dashboard(self, 'cloudwatch-dashboard',
                 dashboard_name='S3Dashboard',
                 widgets=[[single_value_widget, graph_widget]])
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None: super().__init__(scope, construct_id, **kwargs) #import function code try: with open("serverless_stack/functions/metric_logs_generator.py", mode="r") as file: function_body = file.read() except OSError: print('File can not read') #function function_01 = aws_lambda.Function( self, "lambdafunction01", function_name="LambdaTestCustomMEtric", runtime=aws_lambda.Runtime.PYTHON_3_6, handler="index.lambda_handler", code=aws_lambda.InlineCode(function_body), timeout=core.Duration.seconds(5), reserved_concurrent_executions=1, environment={ 'LOG_LEVEL': 'INFO', 'PERCENTAGE_ERRORS': '75' }) #attached cloudwatch log group custom_metric_log_group01 = aws_logs.LogGroup( self, "cloudwatchlog01", log_group_name=f"/aws/lambda/{function_01.function_name}", removal_policy=core.RemovalPolicy.DESTROY, retention=aws_logs.RetentionDays.ONE_DAY) #Custom metric namespace custom_metric_namespace01 = aws_cw.Metric( namespace=f"custom-error-metric", metric_name="custom-error-metric", label="Amount of Custom API errors", period=core.Duration.minutes(1), statistic="Sum") #Custom metric logs filter custom_metric_filter01 = aws_logs.MetricFilter( self, "customMetricFilter", filter_pattern=aws_logs.FilterPattern.boolean_value( "$.custom_api_error", True), log_group=custom_metric_log_group01, metric_namespace=custom_metric_namespace01.namespace, metric_name=custom_metric_namespace01.metric_name, default_value=0, metric_value="1") #create custom alarm custom_metric_alarm01 = aws_cw.Alarm( self, "customMetricAlarm", alarm_description="Custom API errors", alarm_name="Custom-API-alarm", metric=custom_metric_namespace01, comparison_operator=aws_cw.ComparisonOperator. GREATER_THAN_OR_EQUAL_TO_THRESHOLD, threshold=2, evaluation_periods=2, datapoints_to_alarm=1, period=core.Duration.minutes(1), treat_missing_data=aws_cw.TreatMissingData.NOT_BREACHING) #cloudwatch dashboard custom_dashboard01 = aws_cw.Dashboard( self, id="CustomDashBoard", dashboard_name="CDK-custom-DashBoard") #lambda metrics to dashboard custom_dashboard01.add_widgets( aws_cw.Row( aws_cw.GraphWidget(title="Lambda-invoke", left=[ function_01.metric_invocations( statistic="Sum", period=core.Duration.minutes(1)) ]), aws_cw.GraphWidget(title="Lambda-errors", left=[ function_01.metric_errors( statistic="Sum", period=core.Duration.minutes(1)) ]))) #custom api errors to dashboard custom_dashboard01.add_widgets( aws_cw.Row( aws_cw.SingleValueWidget(title="Custom-API-errors", metrics=[custom_metric_namespace01])))
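The MetricFilter above matches structured JSON log lines via `$.custom_api_error`; a minimal sketch of what the Lambda in serverless_stack/functions/metric_logs_generator.py might log for the filter to count an error (the function and extra fields are assumptions, only the custom_api_error key comes from the snippet):

# Hypothetical log statement that the boolean_value("$.custom_api_error", True) filter would match.
import json
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)


def record_api_error(status_code: int) -> None:
    # Emit one JSON line per failed call; the metric filter increments the custom metric by 1.
    logger.info(json.dumps({"custom_api_error": True, "status_code": status_code}))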
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None: super().__init__(scope, construct_id, **kwargs) # load configs from "./comfigurations/config.json" configs = {} with open("./configurations/config.json") as json_file: configs = json.load(json_file) # Default lambdas for testing mem_list = configs['MemorySizeList'] cold_start_lambdas = {} for mem in mem_list: python38_lambda = lambda_.Function( self, id="coldstart_python38_" + str(mem) + "_", runtime=lambda_.Runtime.PYTHON_3_8, handler="lambda_function.lambda_handler", memory_size=mem, tracing=lambda_.Tracing.ACTIVE, code=lambda_.Code.asset("./cold_start_lambdas/python38")) cold_start_lambdas['PYTHON38_' + str(mem)] = python38_lambda for mem in mem_list: nodejs12x_lambda = lambda_.Function( self, id="coldstart_nodejs12x" + str(mem) + "_", runtime=lambda_.Runtime.NODEJS_12_X, handler="index.handler", memory_size=mem, tracing=lambda_.Tracing.ACTIVE, code=lambda_.Code.asset("./cold_start_lambdas/nodejs12x")) cold_start_lambdas['NODEJS12X_' + str(mem)] = nodejs12x_lambda for mem in mem_list: go1x_lambda = lambda_.Function( self, id="coldstart_go1x" + str(mem) + "_", runtime=lambda_.Runtime.GO_1_X, handler="hello", memory_size=mem, tracing=lambda_.Tracing.ACTIVE, code=lambda_.Code.asset("./cold_start_lambdas/go1x")) cold_start_lambdas['GO1X_' + str(mem)] = go1x_lambda for mem in mem_list: netcore31_lambda = lambda_.Function( self, id="coldstart_netcore31" + str(mem) + "_", runtime=lambda_.Runtime.DOTNET_CORE_3_1, handler="LambdaTest::LambdaTest.LambdaHandler::handleRequest", tracing=lambda_.Tracing.ACTIVE, code=lambda_.Code.asset("./cold_start_lambdas/netcore31"), memory_size=mem, ) cold_start_lambdas['NETCORE31_' + str(mem)] = netcore31_lambda for mem in mem_list: java11corretto_lambda = lambda_.Function( self, id="coldstart_java11corretto" + str(mem) + "_", runtime=lambda_.Runtime.JAVA_11, handler="example.Hello::handleRequest", memory_size=mem, tracing=lambda_.Tracing.ACTIVE, code=lambda_.Code.asset("./cold_start_lambdas/java11corretto")) cold_start_lambdas['JAVA11_' + str(mem)] = java11corretto_lambda for mem in mem_list: ruby27_lambda = lambda_.Function( self, id="coldstart_ruby27" + str(mem) + "_", runtime=lambda_.Runtime.RUBY_2_7, handler="lambda_function.lambda_handler", memory_size=mem, tracing=lambda_.Tracing.ACTIVE, code=lambda_.Code.asset("./cold_start_lambdas/ruby27")) cold_start_lambdas['RUBY27_' + str(mem)] = ruby27_lambda # Caller cold_start_caller = lambda_.Function( self, id="cold_start_caller", runtime=lambda_.Runtime.PYTHON_3_8, handler="ColdStartCaller.lambda_handler", code=lambda_.Code.asset("./cold_start_lambdas/cold_start_caller"), timeout=core.Duration.seconds(180)) cold_start_caller.role.add_managed_policy( iam_.ManagedPolicy.from_aws_managed_policy_name( "AWSXrayReadOnlyAccess")) cold_start_caller.role.add_to_policy( iam_.PolicyStatement(effect=iam_.Effect.ALLOW, actions=['lambda:GetFunctionConfiguration'], resources=["*"])) for lambda_name in cold_start_lambdas: cold_start_caller.add_environment( lambda_name, cold_start_lambdas[lambda_name].function_arn) cold_start_lambdas[lambda_name].grant_invoke(cold_start_caller) # DynamoDB cold_start_table = dynamodb_.Table( self, id="cold_start_benchmark_table", partition_key=dynamodb_.Attribute( name="PK", type=dynamodb_.AttributeType.STRING), sort_key=dynamodb_.Attribute(name="SK", type=dynamodb_.AttributeType.NUMBER), time_to_live_attribute="TTL") cold_start_table.grant_write_data(cold_start_caller) cold_start_caller.add_environment('TABLE_NAME', 
cold_start_table.table_name) # S3 life_cycle_rule = s3_.LifecycleRule(transitions=[ s3_.Transition(storage_class=s3_.StorageClass.INFREQUENT_ACCESS, transition_after=core.Duration.days(30)) ]) cold_start_backup_s3 = s3_.Bucket(self, "cold_start_benchmark_backup", lifecycle_rules=[life_cycle_rule]) cold_start_backup_s3.grant_write(cold_start_caller) cold_start_caller.add_environment('BACKUP_BUCKET_NAME', cold_start_backup_s3.bucket_name) # CW event cron_job = events_.Rule( self, "cold_start_caller_cron_job", description="Run cold start caller twice every 1 hour", schedule=events_.Schedule.cron(minute="0,1"), targets=[targets_.LambdaFunction(cold_start_caller)]) # alarm when caller failed, send email for notification errorAlarm = cloudwatch_.Alarm( self, "cold_start_caller_error_alarm", metric=cloudwatch_.Metric( metric_name="Errors", namespace="AWS/Lambda", period=core.Duration.minutes(5), statistic="Maximum", dimensions={"FunctionName": cold_start_caller.function_name}), evaluation_periods=1, datapoints_to_alarm=1, threshold=1, actions_enabled=True, alarm_description="Alarm when cold start caller failed", alarm_name="cold_start_caller_error_alarm", comparison_operator=cloudwatch_.ComparisonOperator. GREATER_THAN_OR_EQUAL_TO_THRESHOLD, treat_missing_data=cloudwatch_.TreatMissingData.MISSING) cold_start_caller_error_alarm_topic = sns_.Topic( self, "cold_start_caller_error_alarm_topic", display_name="ColdStartCallerErrorAlarmTopic", topic_name="ColdStartCallerErrorAlarmTopic") cold_start_caller_error_alarm_topic.add_subscription( sns_subs_.EmailSubscription( configs['AlarmNotificationEmailAddress'])) errorAlarm.add_alarm_action( cloudwatch_actions_.SnsAction(cold_start_caller_error_alarm_topic)) # Summarizer cold_start_summarizer = lambda_.Function( self, id="cold_start_summarizer", runtime=lambda_.Runtime.PYTHON_3_8, handler="ColdStartSummarizer.lambda_handler", code=lambda_.Code.asset( "./cold_start_lambdas/cold_start_summarizer"), timeout=core.Duration.seconds(10)) cold_start_table.grant_read_write_data(cold_start_summarizer) cold_start_summarizer.add_environment('TABLE_NAME', cold_start_table.table_name) # setup CW event for summarizer cron_job_summarizer = events_.Rule( self, "cold_start_summarizer_cron_job", description="Run cold start summarizer once every day", schedule=events_.Schedule.cron(minute='30', hour='0'), targets=[targets_.LambdaFunction(cold_start_summarizer)]) # error alarm for summarizer errorAlarm_summarizer = cloudwatch_.Alarm( self, "cold_start_summarizer_error_alarm", metric=cloudwatch_.Metric(metric_name='Errors', namespace='AWS/Lambda', period=core.Duration.minutes(5), statistic='Maximum', dimensions={ 'FunctionName': cold_start_summarizer.function_name }), evaluation_periods=1, datapoints_to_alarm=1, threshold=1, actions_enabled=True, alarm_description="Alarm when cold start summarizer failed", alarm_name="cold_start_summarizer_error_alarm", comparison_operator=cloudwatch_.ComparisonOperator. 
GREATER_THAN_OR_EQUAL_TO_THRESHOLD, treat_missing_data=cloudwatch_.TreatMissingData.MISSING) cold_start_summarizer_error_alarm_topic = sns_.Topic( self, "cold_start_summarizer_error_alarm_topic", display_name="ColdStartSummarizerErrorAlarmTopic", topic_name="ColdStartSummarizerErrorAlarmTopic") cold_start_summarizer_error_alarm_topic.add_subscription( sns_subs_.EmailSubscription( configs['AlarmNotificationEmailAddress'])) errorAlarm_summarizer.add_alarm_action( cloudwatch_actions_.SnsAction( cold_start_summarizer_error_alarm_topic)) # GraphQL API graphql_api = appsync_.GraphqlApi( self, "cold_start_benchmark_graphql_api", name="cold_start_benchmark_graphql_api", authorization_config=appsync_.AuthorizationConfig( default_authorization=appsync_.AuthorizationMode( authorization_type=appsync_.AuthorizationType.API_KEY, api_key_config=appsync_.ApiKeyConfig( description="cold_start_benchmark_graphql_api_key", expires=core.Expiration.after(core.Duration.days(365)), name="cold_start_benchmark_graphql_api_key"))), schema=appsync_.Schema.from_asset( './cold_start_benchmark/graphql_schema/schema.graphql'), xray_enabled=True) dynamodb_data_source = graphql_api.add_dynamo_db_data_source( id="cold_start_dynamodb_data_source", table=cold_start_table) dynamodb_data_source.create_resolver( field_name="listColdStartSummariesAfterTimestamp", type_name="Query", request_mapping_template=appsync_.MappingTemplate.from_file( './cold_start_benchmark/graphql_schema/request_mapping_template' ), response_mapping_template=appsync_.MappingTemplate.from_file( './cold_start_benchmark/graphql_schema/response_mapping_template' )) front_end_amplify_app = amplify_.App( self, "cold-start-front-end", app_name="cold_start_front_end", source_code_provider=amplify_.GitHubSourceCodeProvider( owner="ZzzGin", repository="cold-start-frontend-website", oauth_token=core.SecretValue.secrets_manager( "zzzgin/github/token", json_field="zzzgin-github-token"))) master_Branch = front_end_amplify_app.add_branch("master") domain = front_end_amplify_app.add_domain('zzzgin.com') domain.map_sub_domain(master_Branch, 'coldstart')
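# A minimal sketch of the config file the benchmark stack above expects. Only the two
# keys actually read in the code (configs['MemorySizeList'] and
# configs['AlarmNotificationEmailAddress']) are grounded; the concrete values below are
# illustrative assumptions, not the project's real settings.
import json

example_config = {
    # Memory sizes (MB) to provision for every runtime being benchmarked.
    "MemorySizeList": [128, 256, 512, 1024],
    # Address subscribed to the caller/summarizer error-alarm SNS topics.
    "AlarmNotificationEmailAddress": "ops@example.com",
}

with open("./configurations/config.json", "w") as json_file:
    json.dump(example_config, json_file, indent=4)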
def __init__(self, app: core.App, id: str) -> None: super().__init__(app, id) with open("config.json") as f: self.config = json.load(f) assert ( "SECRET_KEY" in self.config), "Need random SECRET_KEY specified in config.json" assert ( "CERTIFICATE_ARN" in self.config), "Need CERTIFICATE_ARN specified in config.json" self.lambda_dir = "assets/lambda" os.makedirs(os.path.join(self.lambda_dir, "templates", "generated"), exist_ok=True) r = requests.get( "https://api.github.com/repos/sumpfork/dominiontabs/releases") changelog = r.json() changelog = [{ "url": ch["html_url"], "date": dt.datetime.strptime(ch["published_at"][:10], "%Y-%m-%d").date(), "name": ch["name"], "tag": ch["tag_name"], "description": ch["body"], } for ch in changelog] env = Environment(loader=FileSystemLoader("templates"), autoescape=select_autoescape(["html"])) t = env.get_template("changelog.html.j2") generated_template_path = os.path.join(self.lambda_dir, "templates", "generated") shutil.rmtree(generated_template_path) os.mkdir(generated_template_path) with open( os.path.join(generated_template_path, "changelog.html"), "w", ) as f: f.write(t.render(changelog=changelog)) static_website_bucket = s3.Bucket( self, "Dominion Divider Generator Site", ) cf_static_dist = cloudfront.Distribution( self, "StaticCloudfrontDist", default_behavior=cloudfront.BehaviorOptions( origin=cloudfront_origins.S3Origin(static_website_bucket)), ) s3_deployment.BucketDeployment( self, "Static Files Deployment", sources=[s3_deployment.Source.asset("./static")], destination_bucket=static_website_bucket, destination_key_prefix="static", ) flask_app = lambda_python.PythonFunction( self, "DominionDividersFlaskApp", entry=self.lambda_dir, index="lambda-handlers.py", handler="apig_wsgi_handler", environment={ "STATIC_WEB_URL": f"https://{cf_static_dist.domain_name}", "FLASK_SECRET_KEY": self.config["SECRET_KEY"], "GA_CONFIG": self.config.get("GA_CONFIG", ""), }, timeout=core.Duration.seconds(60), memory_size=512, runtime=lambda_.Runtime.PYTHON_3_8, ) api = apig.LambdaRestApi( self, "bgtools-api", handler=flask_app, binary_media_types=["*/*"], minimum_compression_size=10e4, deploy_options={ "method_options": { "/*/*": apig.MethodDeploymentOptions(throttling_rate_limit=10, throttling_burst_limit=20) } }, ) cloudfront.Distribution( self, "BGToolsCloudfrontDist", default_behavior=cloudfront.BehaviorOptions( origin=cloudfront_origins.HttpOrigin( core.Fn.select(2, core.Fn.split("/", api.url)), origin_path=core.Fn.join( "", ["/", core.Fn.select(3, core.Fn.split("/", api.url))]), ), origin_request_policy=cloudfront.OriginRequestPolicy( self, "OriginRequestPolicy", cookie_behavior=cloudfront.OriginRequestCookieBehavior.all( ), ), allowed_methods=cloudfront.AllowedMethods.ALLOW_ALL, ), domain_names=["domdiv.bgtools.net"], certificate=acm.Certificate.from_certificate_arn( self, "cert", self.config["CERTIFICATE_ARN"], ), ) dashboard = aws_cloudwatch.Dashboard( self, f"bgtools-dashboard", dashboard_name=f"bgtools-prod", start="-P1D", period_override=aws_cloudwatch.PeriodOverride.INHERIT, ) dashboard.add_widgets( aws_cloudwatch.GraphWidget( title="API Gateway Counts", width=6, height=6, left=[ aws_cloudwatch.Metric( namespace="AWS/ApiGateway", metric_name="5XXError", dimensions={ "ApiName": "bgtools-api", "Stage": api.deployment_stage.stage_name, }, period=core.Duration.minutes(amount=30), statistic="Sum", color="#d62728", ), aws_cloudwatch.Metric( namespace="AWS/ApiGateway", metric_name="4XXError", dimensions={ "ApiName": "bgtools-api", "Stage": 
api.deployment_stage.stage_name, }, period=core.Duration.minutes(amount=30), statistic="Sum", color="#8c564b", ), aws_cloudwatch.Metric( namespace="AWS/ApiGateway", metric_name="Count", dimensions={ "ApiName": "bgtools-api", "Stage": api.deployment_stage.stage_name, }, period=core.Duration.minutes(amount=30), statistic="Sum", color="#2ca02c", ), ], ), aws_cloudwatch.GraphWidget( title="API Gateway Latencies", width=6, height=6, left=[ aws_cloudwatch.Metric( namespace="AWS/ApiGateway", metric_name="Latency", dimensions={ "ApiName": "bgtools-api", "Stage": api.deployment_stage.stage_name, }, period=core.Duration.minutes(amount=30), statistic="Average", ), aws_cloudwatch.Metric( namespace="AWS/ApiGateway", metric_name="IntegrationLatency", dimensions={ "ApiName": "bgtools-api", "Stage": api.deployment_stage.stage_name, }, period=core.Duration.minutes(amount=30), statistic="Average", ), ], ), )
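# A minimal sketch of what assets/lambda/lambda-handlers.py could contain so that the
# PythonFunction above (index="lambda-handlers.py", handler="apig_wsgi_handler") resolves.
# Only the module name, handler name, and environment variables come from the stack code;
# the Flask route below is an illustrative assumption, not the real bgtools application.
import os

from apig_wsgi import make_lambda_handler
from flask import Flask

app = Flask(__name__)
app.secret_key = os.environ["FLASK_SECRET_KEY"]  # injected via the stack's environment block

@app.route("/")
def index():
    # Placeholder route standing in for the divider-generator UI.
    return f"Static assets are served from {os.environ.get('STATIC_WEB_URL', '')}"

# binary_support=True matches binary_media_types=["*/*"] on the LambdaRestApi above.
apig_wsgi_handler = make_lambda_handler(app, binary_support=True)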
def __init__(self, scope: core.Construct, id: str, ** kwargs) -> None: super().__init__(scope, id, **kwargs) # Create SNS Topic for Operations Team): konstone_ops_team = _sns.Topic(self, "konstoneOpsTeam", display_name="KonStone 24x7 On Watsapp? Support", topic_name="konstoneOpsTeam" ) # Add Subscription to SNS Topic konstone_ops_team.add_subscription( _subs.EmailSubscription("*****@*****.**") ) # Create a MultiAZ VPC): vpc = _ec2.Vpc( self, "konstoneVpcId", cidr="10.111.0.0/16", max_azs=2, nat_gateways=0, subnet_configuration=[ _ec2.SubnetConfiguration( name="public", subnet_type=_ec2.SubnetType.PUBLIC ) ] ) # Read EC2 BootStrap Script try: with open("bootstrap_scripts/install_httpd.sh", mode="r") as file: user_data = file.read() except OSError: print('Unable to read UserData script') # Get the latest ami amzn_linux_ami = _ec2.MachineImage.latest_amazon_linux( generation=_ec2.AmazonLinuxGeneration.AMAZON_LINUX_2, edition=_ec2.AmazonLinuxEdition.STANDARD, storage=_ec2.AmazonLinuxStorage.EBS, virtualization=_ec2.AmazonLinuxVirt.HVM ) # WebServer Instance web_server = _ec2.Instance(self, "WebServer004Id", instance_type=_ec2.InstanceType( instance_type_identifier="t2.micro"), instance_name="WebServer004", machine_image=amzn_linux_ami, vpc=vpc, vpc_subnets=_ec2.SubnetSelection( subnet_type=_ec2.SubnetType.PUBLIC ), user_data=_ec2.UserData.custom(user_data) ) # Allow Web Traffic to WebServer web_server.connections.allow_from_any_ipv4( _ec2.Port.tcp(80), description="Allow Web Traffic" ) # Add permission to web server instance profile web_server.role.add_managed_policy( _iam.ManagedPolicy.from_aws_managed_policy_name( "AmazonSSMManagedInstanceCore") ) # Read Lambda Code try: with open("serverless_stacks/lambda_src/konstone_processor.py", mode="r") as f: konstone_fn_code = f.read() except OSError: print("Unable to read Lambda Function Code") # Simple Lambda Function to return event konstone_fn = _lambda.Function(self, "konstoneFunction", function_name="konstone_function", runtime=_lambda.Runtime.PYTHON_3_7, handler="index.lambda_handler", code=_lambda.InlineCode( konstone_fn_code), timeout=core.Duration.seconds(3), reserved_concurrent_executions=1, environment={ "LOG_LEVEL": "INFO", "AUTOMATION": "SKON" } ) # EC2 Metric for Avg. CPU ec2_metric_for_avg_cpu = _cloudwatch.Metric( namespace="AWS/EC2", metric_name="CPUUtilization", dimensions={ "InstanceId": web_server.instance_id }, period=core.Duration.minutes(5) ) # Low CPU Alarm for Web Server low_cpu_alarm = _cloudwatch.Alarm( self, "lowCPUAlarm", alarm_description="Alert if CPU is less than 10%", alarm_name="low-cpu-alarm", actions_enabled=True, metric=ec2_metric_for_avg_cpu, threshold=10, comparison_operator=_cloudwatch.ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD, evaluation_periods=1, datapoints_to_alarm=1, period=core.Duration.minutes(5), treat_missing_data=_cloudwatch.TreatMissingData.NOT_BREACHING ) # Inform SNS on EC2 Alarm State low_cpu_alarm.add_alarm_action( _cloudwatch_actions.SnsAction( konstone_ops_team ) ) # Create Lambda Alarm konstone_fn_error_alarm = _cloudwatch.Alarm( self, "konstoneFunctionErrorAlarm", metric=konstone_fn.metric_errors(), threshold=2, evaluation_periods=1, datapoints_to_alarm=1, period=core.Duration.minutes(5) ) # Inform SNS on Lambda Alarm State konstone_fn_error_alarm.add_alarm_action( _cloudwatch_actions.SnsAction( konstone_ops_team ) )
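# A minimal sketch of serverless_stacks/lambda_src/konstone_processor.py, consistent with
# the stack comment above ("Simple Lambda Function to return event") and the LOG_LEVEL
# environment variable it injects. The original file is not shown in this document, so
# treat this as an assumption rather than the actual source.
import json
import logging
import os

logger = logging.getLogger()
logger.setLevel(os.environ.get("LOG_LEVEL", "INFO"))

def lambda_handler(event, context):
    # Log and echo the incoming event; any unhandled exception here would surface
    # through konstone_fn.metric_errors() and trip the Lambda error alarm above.
    logger.info(json.dumps(event))
    return event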
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None: super().__init__(scope, id, **kwargs) # Lets create couple of instances to test): vpc = _ec2.Vpc(self, "abacVPC", cidr="10.13.0.0/21", max_azs=2, nat_gateways=0, subnet_configuration=[ _ec2.SubnetConfiguration( name="pubSubnet", cidr_mask=24, subnet_type=_ec2.SubnetType.PUBLIC) ]) core.Tag.add(vpc, key="ServiceProvider", value="KonStone", include_resource_types=[]) weak_sg = _ec2.SecurityGroup( self, "web_sec_grp", vpc=vpc, description="Allow internet access from the world", allow_all_outbound=True) # vpc_cidr_block # weak_sg.add_ingress_rule(_ec2.Peer.any_ipv4(), weak_sg.add_ingress_rule(_ec2.Peer.ipv4(vpc.vpc_cidr_block), _ec2.Port.tcp(22), "Allow SSH access from the VPC Only.") # We are using the latest AMAZON LINUX AMI # Benefit of having SSM Agent pre-installed ami_id = _ec2.AmazonLinuxImage(generation=_ec2.AmazonLinuxGeneration. AMAZON_LINUX_2).get_image(self).image_id # https://docs.aws.amazon.com/cdk/api/latest/python/aws_cdk.aws_iam/Role.html instace_profile_role = _iam.Role( self, 'ec2ssmroleid', assumed_by=_iam.ServicePrincipal('ec2.amazonaws.com'), role_name="instace_profile_role") instace_profile_role.add_managed_policy( _iam.ManagedPolicy.from_aws_managed_policy_name( 'AmazonSSMManagedInstanceCore')) instance_profile_role_additional_perms = _iam.PolicyStatement( effect=_iam.Effect.ALLOW, resources=[ "arn:aws:logs:*:*:*", ], actions=["logs:Create*", "logs:PutLogEvents"]) instance_profile_role_additional_perms.sid = "PutBucketPolicy" instace_profile_role.add_to_policy( instance_profile_role_additional_perms) inst_profile_01 = _iam.CfnInstanceProfile( self, "instProfile01Id", roles=[instace_profile_role.role_name], ) # Let us bootstrap the server with the required agents try: with open("./bootstrap_scripts/install_agents.sh", mode='rb') as file: bootstrap_data = file.read() except OSError: print('Failed to get UserData script') install_agents = _ec2.UserData.for_linux() install_agents.add_commands(str(bootstrap_data, 'utf-8')) # The EC2 Instance to monitor for failed SSH Logins ssh_monitored_inst_01 = _ec2.CfnInstance( self, "sshMonitoredInstance01", image_id=ami_id, instance_type="t2.micro", monitoring=False, tags=[{ "key": "ServiceProvider", "value": "KonStone" }], iam_instance_profile=inst_profile_01.ref, network_interfaces=[{ "deviceIndex": "0", "associatePublicIpAddress": True, "subnetId": vpc.public_subnets[0].subnet_id, "groupSet": [weak_sg.security_group_id] }], #https: //github.com/aws/aws-cdk/issues/3419 user_data=core.Fn.base64(install_agents.render()), ) """ linux_ami = _ec2.GenericLinuxImage({ "cn-northwest-1": "ami-0f62e91915e16cfc2","eu-west-1": "ami-12345678"}) ssh_monitored_inst_01_02 = _ec2.Instance(self, "monitoredInstance02", instance_type=_ec2.InstanceType(instance_type_identifier="t2.micro"), instance_name="monitoredInstance02", machine_image=linux_ami, vpc=vpc, security_group=[weak_sg.security_group_id], # vpc_subnets=_ec2.SubnetSelection(subnet_type=_ec2.SubnetType.PUBLIC) vpc_subnets=vpc.public_subnets[0].subnet_id, # user_data=_ec2.UserData.custom(t_user_data) ) """ # The log group name to store logs info_sec_ops_log_group = _logs.LogGroup( self, "infoSecOpsLogGroupId", log_group_name=(f"/Mystique/InfoSec/Automation/" f"{ssh_monitored_inst_01.ref}"), retention=_logs.RetentionDays.ONE_WEEK) # Defines an AWS Lambda resource with open("lambda_src/quarantine_ec2_instance.py", encoding="utf8") as fp: quarantine_ec2_instance_fn_handler_code = fp.read() quarantine_ec2_instance_fn = 
_lambda.Function( self, id='quarantineEc2InstanceFnId', function_name="quarantine_ec2_instance", runtime=_lambda.Runtime.PYTHON_3_7, code=_lambda.InlineCode(quarantine_ec2_instance_fn_handler_code), handler='index.lambda_handler', timeout=core.Duration.seconds(5)) quarantine_ec2_instance_fn_perms = _iam.PolicyStatement( effect=_iam.Effect.ALLOW, resources=[ "*", ], actions=[ "ec2:RevokeSecurityGroupIngress", "ec2:DescribeSecurityGroupReferences", "ec2:RevokeSecurityGroupEgress", "ec2:ApplySecurityGroupsToClientVpnTargetNetwork", "ec2:DescribeSecurityGroups", "ec2:CreateSecurityGroup", "ec2:DescribeInstances", "ec2:CreateTags", "ec2:StopInstances", "ec2:CreateVolume", "ec2:CreateSnapshots", "ec2:CreateSnapshot", "ec2:DescribeSnapshots", "ec2:ModifyInstanceAttribute" ]) quarantine_ec2_instance_fn_perms.sid = "AllowLambdaToQuarantineEC2" quarantine_ec2_instance_fn.add_to_role_policy( quarantine_ec2_instance_fn_perms) info_sec_ops_topic = _sns.Topic(self, "infoSecOpsTopicId", display_name="InfoSecTopic", topic_name="InfoSecOpsTopic") # Ref: https://docs.aws.amazon.com/cdk/api/latest/docs/aws-stepfunctions-readme.html ############################################################################### ################# STEP FUNCTIONS EXPERIMENTAL CODE - UNSTABLE ################# ############################################################################### quarantine_ec2_instance_task = _sfn.Task( self, "Quarantine EC2 Instance", task=_tasks.InvokeFunction(quarantine_ec2_instance_fn), result_path="$") notify_secops_task = _sfn.Task( self, "Notify InfoSecOps", task=_tasks.PublishToTopic( info_sec_ops_topic, integration_pattern=_sfn.ServiceIntegrationPattern. FIRE_AND_FORGET, message=_sfn.TaskInput.from_data_at("$.message"), subject="SSH Error Response Notification")) ssh_error_response_failure = _sfn.Fail( self, "SSH Error Response Actions Failed", cause="All Response Actions were NOT completed", error="Check Logs") ssh_error_response_success = _sfn.Succeed( self, "SSH Error Response Actions Succeeded", comment="All Response Action Completed Successfully", ) ssh_error_response_sfn_definition = quarantine_ec2_instance_task\ .next(notify_secops_task\ .next(_sfn.Choice(self, "SSH Errors Response Complete?")\ .when(_sfn.Condition.number_equals("$.SdkHttpMetadata.HttpStatusCode", 200),ssh_error_response_success)\ .when(_sfn.Condition.not_( _sfn.Condition.number_equals("$.SdkHttpMetadata.HttpStatusCode", 200)), ssh_error_response_failure)\ .otherwise(ssh_error_response_failure) ) ) ssh_error_response_statemachine = _sfn.StateMachine( self, "stateMachineId", definition=ssh_error_response_sfn_definition, timeout=core.Duration.minutes(5)) ############################################################################### ################# STEP FUNCTIONS EXPERIMENTAL CODE - UNSTABLE ################# ############################################################################### # LAMBDA TO TRIGGER STATE MACHINE - since state cannot be invoked by SNS with open("lambda_src/trigger_state_machine.py", encoding="utf8") as fp: trigger_state_machine_fn_handler_code = fp.read() trigger_state_machine_fn = _lambda.Function( self, id='sshErrorResponseFnId', function_name="trigger_ssh_error_response_state_machine_fn", runtime=_lambda.Runtime.PYTHON_3_7, code=_lambda.InlineCode(trigger_state_machine_fn_handler_code), # code=_lambda.Code.asset("lambda_src/is_policy_permissive.py"), # code=_lambda.Code.asset('lambda_src'), # code=_lambda.InlineCode(code_body), handler='index.lambda_handler', timeout=core.Duration.seconds(5), 
environment={ "STATE_MACHINE_ARN": f"{ssh_error_response_statemachine.state_machine_arn}", }) trigger_state_machine_fn_perms = _iam.PolicyStatement( effect=_iam.Effect.ALLOW, resources=[ f"{ssh_error_response_statemachine.state_machine_arn}", ], actions=["states:StartExecution"]) trigger_state_machine_fn_perms.sid = "PutBucketPolicy" trigger_state_machine_fn.add_to_role_policy( trigger_state_machine_fn_perms) """ version = trigger_state_machine_fn.add_version(name=datetime.now().isoformat()) trigger_state_machine_fn_alias = _lambda.Alias(self, 'lmdaAliasId', alias_name='MystiqueTestAlias', version=version ) """ # Lets add permission to SNS to trigger our lambda function trigger_lambda_perms = _iam.PolicyStatement( effect=_iam.Effect.ALLOW, resources=[ trigger_state_machine_fn.function_arn, ], actions=[ "lambda:InvokeFunction", ]) trigger_lambda_perms.sid = "TriggerLambaFunction" # info_sec_ops_topic.add_to_resource_policy( trigger_lambda_perms ) # Subscribe InfoSecOps Email to topic info_sec_ops_topic.add_subscription( _subs.EmailSubscription(global_args.INFO_SEC_OPS_EMAIL)) # info_sec_ops_topic.add_subscription(_subs.LambdaSubscription(trigger_state_machine_fn)) trigger_state_machine_fn_alarm = trigger_state_machine_fn.metric_all_errors( ).create_alarm( self, "fn-error-alarm", threshold=5, alarm_name="trigger_state_machine_fn_error_alarm", evaluation_periods=5, period=core.Duration.minutes(1), ) subscribe_trigger_state_machine_fn_to_logs = _logs.SubscriptionFilter( self, "sshErrorLogSubscriptionId", log_group=info_sec_ops_log_group, destination=_logs_destination.LambdaDestination( trigger_state_machine_fn), filter_pattern=_logs.FilterPattern.space_delimited( "Mon", "day", "timestamp", "ip", "id", "status", "...").where_string("status", "=", "Invalid"), ) # https://pypi.org/project/aws-cdk.aws-logs/ # We are creating three filter # tooManySshDisconnects, invalidSshUser and invalidSshKey: # When a user tries to SSH with invalid username the next line is logged in the SSH log file: # Apr 20 02:39:35 ip-172-31-63-56 sshd[17136]: Received disconnect from xxx.xxx.xxx.xxx: 11: [preauth] too_many_ssh_disconnects_metric = _cloudwatch.Metric( namespace=f"{global_args.OWNER}", metric_name="tooManySshDisconnects") too_many_ssh_disconnects_filter = _logs.MetricFilter( self, "tooManySshDisconnectsFilterId", log_group=info_sec_ops_log_group, metric_namespace=too_many_ssh_disconnects_metric.namespace, metric_name=too_many_ssh_disconnects_metric.metric_name, filter_pattern=_logs.FilterPattern.space_delimited( "Mon", "day", "timestamp", "ip", "id", "msg1", "msg2", "...").where_string("msg2", "=", "disconnect"), metric_value="1") invalid_ssh_user_metric = _cloudwatch.Metric( namespace=f"{global_args.OWNER}", metric_name="invalidSshUser", ) invalid_ssh_user_filter = _logs.MetricFilter( self, "invalidSshUserFilterId", log_group=info_sec_ops_log_group, metric_namespace=invalid_ssh_user_metric.namespace, metric_name=invalid_ssh_user_metric.metric_name, filter_pattern=_logs.FilterPattern.space_delimited( "Mon", "day", "timestamp", "ip", "id", "status", "...").where_string("status", "=", "Invalid"), metric_value="1") invalid_ssh_key_metric = _cloudwatch.Metric( namespace=f"{global_args.OWNER}", metric_name="invalidSshKey") invalid_ssh_key_filter = _logs.MetricFilter( self, "invalidSshKeyFilterId", log_group=info_sec_ops_log_group, metric_namespace=invalid_ssh_key_metric.namespace, metric_name=invalid_ssh_key_metric.metric_name, filter_pattern=_logs.FilterPattern.space_delimited( "Mon", "day", "timestamp", "ip", 
"id", "msg1", "msg2", "...").where_string("msg1", "=", "Connection").where_string( "msg2", "=", "closed"), metric_value="1") # Now let us create alarms # alarm is raised there are more than 5(threshold) of the measured metrics in two(datapoint) of the last three seconds(evaluation): # Period=60Seconds, Eval=3, Threshold=5 too_many_ssh_disconnects_alarm = _cloudwatch.Alarm( self, "tooManySshDisconnectsAlarmId", alarm_name="too_many_ssh_disconnects_alarm", alarm_description= "The number disconnect requests is greater then 5, even 1 time in 3 minutes", metric=too_many_ssh_disconnects_metric, actions_enabled=True, period=core.Duration.minutes(1), threshold=5, evaluation_periods=3, datapoints_to_alarm=1, statistic="sum", comparison_operator=_cloudwatch.ComparisonOperator. GREATER_THAN_OR_EQUAL_TO_THRESHOLD) invalid_ssh_user_alarm = _cloudwatch.Alarm( self, "invalidSshUserAlarmId", alarm_name="too_many_invalid_ssh_users_alarm", alarm_description= "The number of invalid ssh users connecting is greater then 5, even 1 time in 3 minutes", metric=invalid_ssh_user_metric, actions_enabled=True, period=core.Duration.minutes(1), threshold=5, evaluation_periods=3, datapoints_to_alarm=1, statistic="sum", comparison_operator=_cloudwatch.ComparisonOperator. GREATER_THAN_THRESHOLD) invalid_ssh_user_alarm.add_alarm_action( _cloudwatch_actions.SnsAction(info_sec_ops_topic)) invalid_ssh_key_alarm = _cloudwatch.Alarm( self, "invalidSshKeyAlarmId", alarm_name="too_many_invalid_ssh_key_alarm", alarm_description= "The number of invalid ssh keys connecting is greater then 5, even 1 time in 3 minutes", metric=invalid_ssh_key_metric, actions_enabled=True, period=core.Duration.minutes(1), threshold=5, evaluation_periods=3, datapoints_to_alarm=1, statistic="sum", comparison_operator=_cloudwatch.ComparisonOperator. GREATER_THAN_OR_EQUAL_TO_THRESHOLD) invalid_ssh_key_alarm.add_alarm_action( _cloudwatch_actions.SnsAction(info_sec_ops_topic)) ########################################### ################# OUTPUTS ################# ########################################### output0 = core.CfnOutput( self, "SecuirtyAutomationFrom", value=f"{global_args.SOURCE_INFO}", description= "To know more about this automation stack, check out our github page." ) output1_1 = core.Fn.get_att( logical_name_of_resource="sshMonitoredInstance01", attribute_name="PublicIp") output1 = core.CfnOutput(self, "MonitoredInstance", value=output1_1.to_string(), description="Web Server Public IP to attack") output2 = core.CfnOutput( self, "SSHAlarms", value= (f"https://console.aws.amazon.com/cloudwatch/home?region=" f"{core.Aws.REGION}" f"#/configuration/" f"#alarmsV2:?search=ssh&alarmStateFilter=ALL&alarmTypeFilter=ALL" ), description="Check out the cloudwatch Alarms") output3 = core.CfnOutput( self, "SubscribeToNotificationTopic", value=(f"https://console.aws.amazon.com/sns/v3/home?" f"{core.Aws.REGION}" f"#/topic/" f"{info_sec_ops_topic.topic_arn}"), description= "Add your email to subscription and confirm subscription") output_test_1 = core.CfnOutput( self, "ToGenInvalidKeyErrors", value= (f"for i in {{1..30}}; do ssh -i $RANDOM ec2-user@{output1_1.to_string()}; sleep 2; done &" ), description= "Generates random key names and connects to server 30 times over 60 seconds" ) output_test_2 = core.CfnOutput( self, "ToGenInvalidUserErrors", value= (f"for i in {{1..30}}; do ssh ec2-user$RANDOM@{output1_1.to_string()}; sleep 2; done &" ), description= "Generates random user names and connects to server 30 times over 60 seconds" ) """
def __init__(self, scope: core.Construct, id: str, stream_producer_lg, stream_pipe, py_stream_record_processor_fn, node_stream_record_processor_fn, ** kwargs ) -> None: super().__init__(scope, id, **kwargs) # ): ##### MONITORING ###### ################################################## ########## STREAM METRICS ######### ################################################## # Shows you the ingestion rate into the shard. stream_in_bytes_metric = _cloudwatch.Metric( namespace="AWS/Kinesis", metric_name="IncomingBytes", dimensions={ "StreamName": f"{stream_pipe.stream_name}" }, label="IncomingBytes", period=core.Duration.minutes(30), statistic="Sum" ) stream_in_records_metric = _cloudwatch.Metric( namespace="AWS/Kinesis", metric_name="IncomingRecords", dimensions={ "StreamName": f"{stream_pipe.stream_name}" }, label="IncomingRecords", period=core.Duration.minutes(30), statistic="Sum" ) stream_w_throttle_metric = _cloudwatch.Metric( namespace="AWS/Kinesis", metric_name="WriteProvisionedThroughputExceeded", dimensions={ "StreamName": f"{stream_pipe.stream_name}" }, label="WriteProvisionedThroughputExceeded", period=core.Duration.minutes(30), statistic="Sum" ) stream_r_throttle_metric = _cloudwatch.Metric( namespace="AWS/Kinesis", metric_name="ReadProvisionedThroughputExceeded", dimensions={ "StreamName": f"{stream_pipe.stream_name}" }, label="ReadProvisionedThroughputExceeded", period=core.Duration.minutes(30), statistic="Sum" ) stream_put_success_metric = _cloudwatch.Metric( namespace="AWS/Kinesis", metric_name="PutRecords.Success", dimensions={ "StreamName": f"{stream_pipe.stream_name}" }, label="PutRecords.Success", period=core.Duration.minutes(30), statistic="Sum" ) stream_put_latency_metric = _cloudwatch.Metric( namespace="AWS/Kinesis", metric_name="PutRecords.Latency", dimensions={ "StreamName": f"{stream_pipe.stream_name}" }, label="PutRecords.Latency", period=core.Duration.minutes(30), statistic="Sum" ) stream_get_latency_metric = _cloudwatch.Metric( namespace="AWS/Kinesis", metric_name="GetRecords.Latency", dimensions={ "StreamName": f"{stream_pipe.stream_name}" }, label="GetRecords.Latency", period=core.Duration.minutes(30), statistic="Sum" ) ################################################## ########## STREAM PRODUCER METRICS ######### ################################################## # JSON Metric Filter - https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/FilterAndPatternSyntax.html records_produced_metric = _cloudwatch.Metric( namespace=f"{global_args.OWNER}-stream-data-processor", metric_name="recordsProducedCount", label="Total No. Of Records Produced", period=core.Duration.minutes(30), statistic="Sum" ) records_produced_metric_filter = _logs.MetricFilter(self, "recordsProducedCountFilter", filter_pattern=_logs.FilterPattern.exists( "$.records_produced"), log_group=stream_producer_lg, metric_namespace=records_produced_metric.namespace, metric_name=records_produced_metric.metric_name, default_value=0, metric_value="$.records_produced", ) ################################################## ########## STREAM CONSUMER METRICS ######### ################################################## py_records_processed_metric = _cloudwatch.Metric( namespace=f"{global_args.OWNER}-stream-data-processor", # dimensions={ # "RecordsProcessed": "py_processor" # }, metric_name="pyRecordsProcessedCount", label="Total No. 
Of Records Processed", period=core.Duration.minutes(30), statistic="Sum" ) py_stream_record_processor = _logs.MetricFilter(self, "processedRecordCountFilter01", filter_pattern=_logs.FilterPattern.exists( "$.records_processed"), log_group=py_stream_record_processor_fn.log_group, metric_namespace=py_records_processed_metric.namespace, metric_name=py_records_processed_metric.metric_name, default_value=0, metric_value="$.records_processed", ) node_records_processed_metric = _cloudwatch.Metric( namespace=f"{global_args.OWNER}-stream-data-processor", metric_name="nodeRecordsProcessedCount", label="Total No. Of Records Processed", period=core.Duration.minutes(30), statistic="Sum" ) node_stream_record_processor = _logs.MetricFilter(self, "processedRecordCountFilter02", filter_pattern=_logs.FilterPattern.exists( "$.records_processed"), log_group=node_stream_record_processor_fn.log_group, metric_namespace=node_records_processed_metric.namespace, metric_name=node_records_processed_metric.metric_name, default_value=0, metric_value="$.records_processed", ) # Create CloudWatch Dashboard for Streams stream_processor_dashboard = _cloudwatch.Dashboard(self, id="streamProcessorDashboard", dashboard_name="Stream-Processor" ) stream_processor_dashboard.add_widgets( _cloudwatch.SingleValueWidget( title="TotalRecordsProduced", metrics=[records_produced_metric] ), _cloudwatch.SingleValueWidget( title="RecordsProcessed-by-Python-Consumer", metrics=[py_records_processed_metric] ), _cloudwatch.SingleValueWidget( title="RecordsProcessed-by-Node-Consumer", metrics=[node_records_processed_metric] ) ) # Stream Incoming bytes Graph stream_processor_dashboard.add_widgets( _cloudwatch.Row( _cloudwatch.GraphWidget( title="Shard Ingestion Metrics", left=[stream_in_bytes_metric], right=[stream_in_records_metric] ), _cloudwatch.GraphWidget( title="Shard Throttle Metrics", left=[stream_w_throttle_metric], right=[stream_r_throttle_metric] ) ) ) stream_processor_dashboard.add_widgets( _cloudwatch.Row( _cloudwatch.GraphWidget( title="Stream Put Latency", left=[stream_put_latency_metric] ), _cloudwatch.GraphWidget( title="Stream Get Latency", left=[stream_get_latency_metric] ), _cloudwatch.GraphWidget( title="Stream Put Success", left=[stream_put_success_metric] ) ) ) ########################################### ################# OUTPUTS ################# ########################################### output_0 = core.CfnOutput(self, "SecuirtyAutomationFrom", value=f"{global_args.SOURCE_INFO}", description="To know more about this automation stack, check out our github page." )
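# The MetricFilters above use JSON patterns (FilterPattern.exists("$.records_produced")
# and "$.records_processed") and read those same fields as metric_value, so the producer
# and consumer functions only need to emit one JSON document per batch. A minimal sketch
# of such a log statement; the real producer/consumer code is not shown in this document.
import json

def log_records_produced(count: int) -> None:
    # Anything printed by the producer Lambda lands in stream_producer_lg, where
    # recordsProducedCountFilter extracts $.records_produced as the metric value.
    print(json.dumps({"records_produced": count}))

log_records_produced(25)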
def __init__(self, scope: core.Construct, _id: str, **kwargs) -> None: super().__init__(scope, _id, **kwargs) # Setup SSM parameter of credentials, bucket_para, ignore_list ssm_credential_para = ssm.StringParameter.from_secure_string_parameter_attributes( self, "ssm_parameter_credentials", parameter_name=ssm_parameter_credentials, version=1) ssm_bucket_para = ssm.StringParameter(self, "s3bucket_serverless", string_value=json.dumps( bucket_para, indent=4)) ssm_parameter_ignore_list = ssm.StringParameter( self, "s3_migrate_ignore_list", string_value=ignore_list) # Setup DynamoDB ddb_file_list = ddb.Table(self, "s3migrate_serverless", partition_key=ddb.Attribute( name="Key", type=ddb.AttributeType.STRING), billing_mode=ddb.BillingMode.PAY_PER_REQUEST) ddb_file_list.add_global_secondary_index( partition_key=ddb.Attribute(name="desBucket", type=ddb.AttributeType.STRING), index_name="desBucket-index", projection_type=ddb.ProjectionType.INCLUDE, non_key_attributes=["desKey", "versionId"]) # Setup SQS sqs_queue_DLQ = sqs.Queue(self, "s3migrate_serverless_Q_DLQ", visibility_timeout=core.Duration.minutes(15), retention_period=core.Duration.days(14)) sqs_queue = sqs.Queue(self, "s3migrate_serverless_Q", visibility_timeout=core.Duration.minutes(15), retention_period=core.Duration.days(14), dead_letter_queue=sqs.DeadLetterQueue( max_receive_count=60, queue=sqs_queue_DLQ)) # Setup API for Lambda to get IP address (for debug networking routing purpose) checkip = api.RestApi( self, "lambda-checkip-api", cloud_watch_role=True, deploy=True, description="For Lambda get IP address", default_integration=api.MockIntegration( integration_responses=[ api.IntegrationResponse(status_code="200", response_templates={ "application/json": "$context.identity.sourceIp" }) ], request_templates={"application/json": '{"statusCode": 200}'}), endpoint_types=[api.EndpointType.REGIONAL]) checkip.root.add_method("GET", method_responses=[ api.MethodResponse( status_code="200", response_models={ "application/json": api.Model.EMPTY_MODEL }) ]) # Setup Lambda functions handler = lam.Function(self, "s3-migrate-worker", code=lam.Code.asset("./lambda"), handler="lambda_function_worker.lambda_handler", runtime=lam.Runtime.PYTHON_3_8, memory_size=1024, timeout=core.Duration.minutes(15), tracing=lam.Tracing.ACTIVE, environment={ 'table_queue_name': ddb_file_list.table_name, 'Des_bucket_default': Des_bucket_default, 'Des_prefix_default': Des_prefix_default, 'StorageClass': StorageClass, 'checkip_url': checkip.url, 'ssm_parameter_credentials': ssm_parameter_credentials, 'JobType': JobType, 'MaxRetry': MaxRetry, 'MaxThread': MaxThread, 'MaxParallelFile': MaxParallelFile, 'JobTimeout': JobTimeout, 'UpdateVersionId': UpdateVersionId, 'GetObjectWithVersionId': GetObjectWithVersionId }) handler_jobsender = lam.Function( self, "s3-migrate-jobsender", code=lam.Code.asset("./lambda"), handler="lambda_function_jobsender.lambda_handler", runtime=lam.Runtime.PYTHON_3_8, memory_size=1024, timeout=core.Duration.minutes(15), tracing=lam.Tracing.ACTIVE, environment={ 'table_queue_name': ddb_file_list.table_name, 'StorageClass': StorageClass, 'checkip_url': checkip.url, 'sqs_queue': sqs_queue.queue_name, 'ssm_parameter_credentials': ssm_parameter_credentials, 'ssm_parameter_ignore_list': ssm_parameter_ignore_list.parameter_name, 'ssm_parameter_bucket': ssm_bucket_para.parameter_name, 'JobType': JobType, 'MaxRetry': MaxRetry, 'JobsenderCompareVersionId': JobsenderCompareVersionId }) # Allow lambda read/write DDB, SQS 
ddb_file_list.grant_read_write_data(handler) ddb_file_list.grant_read_write_data(handler_jobsender) sqs_queue.grant_send_messages(handler_jobsender) # SQS trigger Lambda worker handler.add_event_source(SqsEventSource(sqs_queue, batch_size=1)) # Option1: Create S3 Bucket, all new objects in this bucket will be transmitted by Lambda Worker s3bucket = s3.Bucket(self, "s3_new_migrate") s3bucket.grant_read(handler) s3bucket.add_event_notification(s3.EventType.OBJECT_CREATED, s3n.SqsDestination(sqs_queue)) # Option2: Allow existing S3 Buckets to be read by Lambda functions. # Lambda Jobsender will scan and compare these buckets and trigger Lambda Workers to transmit bucket_name = '' for b in bucket_para: if bucket_name != b['src_bucket']: # Skip if the same bucket is listed more than once bucket_name = b['src_bucket'] s3exist_bucket = s3.Bucket.from_bucket_name( self, bucket_name, # use the bucket name as the construct id bucket_name=bucket_name) if JobType == 'PUT': s3exist_bucket.grant_read(handler_jobsender) s3exist_bucket.grant_read(handler) else: # 'GET' mode s3exist_bucket.grant_read_write(handler_jobsender) s3exist_bucket.grant_read_write(handler) # Allow Lambda read ssm parameters ssm_bucket_para.grant_read(handler_jobsender) ssm_credential_para.grant_read(handler) ssm_credential_para.grant_read(handler_jobsender) ssm_parameter_ignore_list.grant_read(handler_jobsender) # Schedule cron event to trigger Lambda Jobsender per hour: event.Rule(self, 'cron_trigger_jobsender', schedule=event.Schedule.rate(core.Duration.hours(1)), targets=[target.LambdaFunction(handler_jobsender)]) # TODO: Trigger event immediately, add custom resource lambda to invoke handler_jobsender # Create Lambda logs filter to create network traffic metric handler.log_group.add_metric_filter( "Completed-bytes", metric_name="Completed-bytes", metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[info, date, sn, p="--->Complete", bytes, key]')) handler.log_group.add_metric_filter( "Uploading-bytes", metric_name="Uploading-bytes", metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[info, date, sn, p="--->Uploading", bytes, key]')) handler.log_group.add_metric_filter( "Downloading-bytes", metric_name="Downloading-bytes", metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[info, date, sn, p="--->Downloading", bytes, key]')) handler.log_group.add_metric_filter( "MaxMemoryUsed", metric_name="MaxMemoryUsed", metric_namespace="s3_migrate", metric_value="$memory", filter_pattern=logs.FilterPattern.literal( '[head="REPORT", a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, ' 'a13, a14, a15, a16, memory, MB="MB", rest]')) lambda_metric_Complete = cw.Metric(namespace="s3_migrate", metric_name="Completed-bytes", statistic="Sum", period=core.Duration.minutes(1)) lambda_metric_Upload = cw.Metric(namespace="s3_migrate", metric_name="Uploading-bytes", statistic="Sum", period=core.Duration.minutes(1)) lambda_metric_Download = cw.Metric(namespace="s3_migrate", metric_name="Downloading-bytes", statistic="Sum", period=core.Duration.minutes(1)) lambda_metric_MaxMemoryUsed = cw.Metric( namespace="s3_migrate", metric_name="MaxMemoryUsed", statistic="Maximum", period=core.Duration.minutes(1)) handler.log_group.add_metric_filter( "ERROR", metric_name="ERROR-Logs", metric_namespace="s3_migrate", metric_value="1", filter_pattern=logs.FilterPattern.literal('"ERROR"')) handler.log_group.add_metric_filter( "WARNING", metric_name="WARNING-Logs", metric_namespace="s3_migrate", 
metric_value="1", filter_pattern=logs.FilterPattern.literal('"WARNING"')) # Task timed out handler.log_group.add_metric_filter( "TIMEOUT", metric_name="TIMEOUT-Logs", metric_namespace="s3_migrate", metric_value="1", filter_pattern=logs.FilterPattern.literal('"Task timed out"')) log_metric_ERROR = cw.Metric(namespace="s3_migrate", metric_name="ERROR-Logs", statistic="Sum", period=core.Duration.minutes(1)) log_metric_WARNING = cw.Metric(namespace="s3_migrate", metric_name="WARNING-Logs", statistic="Sum", period=core.Duration.minutes(1)) log_metric_TIMEOUT = cw.Metric(namespace="s3_migrate", metric_name="TIMEOUT-Logs", statistic="Sum", period=core.Duration.minutes(1)) # Dashboard to monitor SQS and Lambda board = cw.Dashboard(self, "s3_migrate_serverless") board.add_widgets( cw.GraphWidget(title="Lambda-NETWORK", left=[ lambda_metric_Download, lambda_metric_Upload, lambda_metric_Complete ]), cw.GraphWidget(title="Lambda-concurrent", left=[ handler.metric( metric_name="ConcurrentExecutions", period=core.Duration.minutes(1)) ]), cw.GraphWidget( title="Lambda-invocations/errors/throttles", left=[ handler.metric_invocations( period=core.Duration.minutes(1)), handler.metric_errors(period=core.Duration.minutes(1)), handler.metric_throttles(period=core.Duration.minutes(1)) ]), cw.GraphWidget( title="Lambda-duration", left=[ handler.metric_duration(period=core.Duration.minutes(1)) ]), ) board.add_widgets( cw.GraphWidget(title="Lambda_MaxMemoryUsed(MB)", left=[lambda_metric_MaxMemoryUsed]), cw.GraphWidget(title="ERROR/WARNING Logs", left=[log_metric_ERROR], right=[log_metric_WARNING, log_metric_TIMEOUT]), cw.GraphWidget( title="SQS-Jobs", left=[ sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)) ]), cw.SingleValueWidget( title="Running/Waiting and Dead Jobs", metrics=[ sqs_queue. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1)), sqs_queue_DLQ. metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1)) ], height=6)) # Alarm for queue - DLQ alarm_DLQ = cw.Alarm( self, "SQS_DLQ", metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible( ), threshold=0, comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD, evaluation_periods=1, datapoints_to_alarm=1) alarm_topic = sns.Topic(self, "SQS queue-DLQ has dead letter") alarm_topic.add_subscription( subscription=sub.EmailSubscription(alarm_email)) alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic)) core.CfnOutput(self, "Dashboard", value="CloudWatch Dashboard name s3_migrate_serverless")
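# The metric filters above use space-delimited patterns such as
# '[info, date, sn, p="--->Complete", bytes, key]', i.e. they expect the worker's log lines
# to carry the transferred byte count and object key as separate fields after a
# "--->Complete"/"--->Uploading"/"--->Downloading" marker. A minimal sketch of a matching
# log call; how the Lambda runtime's level/timestamp/request-id prefix lines up with the
# leading "info, date, sn" fields is an assumption, and the real worker code in ./lambda is
# not shown in this document.
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def log_transfer_complete(transferred_bytes: int, key: str) -> None:
    # Emits e.g. "--->Complete 1048576 my-prefix/my-object", which the "Completed-bytes"
    # filter reads to publish $bytes into the s3_migrate namespace.
    logger.info("--->Complete %s %s", transferred_bytes, key)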
def __init__(self, scope: core.Construct, _id: str, **kwargs) -> None: super().__init__(scope, _id, **kwargs) ddb_file_list = ddb.Table(self, "ddb", partition_key=ddb.Attribute(name="Key", type=ddb.AttributeType.STRING), billing_mode=ddb.BillingMode.PAY_PER_REQUEST) sqs_queue_DLQ = sqs.Queue(self, "sqs_DLQ", visibility_timeout=core.Duration.minutes(15), retention_period=core.Duration.days(14) ) sqs_queue = sqs.Queue(self, "sqs_queue", visibility_timeout=core.Duration.minutes(15), retention_period=core.Duration.days(14), dead_letter_queue=sqs.DeadLetterQueue( max_receive_count=100, queue=sqs_queue_DLQ ) ) handler = lam.Function(self, "lambdaFunction", code=lam.Code.asset("./lambda"), handler="lambda_function.lambda_handler", runtime=lam.Runtime.PYTHON_3_8, memory_size=1024, timeout=core.Duration.minutes(15), tracing=lam.Tracing.ACTIVE, environment={ 'table_queue_name': ddb_file_list.table_name, 'Des_bucket_default': Des_bucket_default, 'Des_prefix_default': Des_prefix_default, 'StorageClass': StorageClass, 'aws_access_key_id': aws_access_key_id, 'aws_secret_access_key': aws_secret_access_key, 'aws_access_key_region': aws_access_key_region }) ddb_file_list.grant_read_write_data(handler) handler.add_event_source(SqsEventSource(sqs_queue)) s3bucket = s3.Bucket(self, "s3bucket") s3bucket.grant_read(handler) s3bucket.add_event_notification(s3.EventType.OBJECT_CREATED, s3n.SqsDestination(sqs_queue)) # You can import an existing bucket and grant access to lambda # exist_s3bucket = s3.Bucket.from_bucket_name(self, "import_bucket", # bucket_name="you_bucket_name") # exist_s3bucket.grant_read(handler) # But you have to add SQS as the imported bucket's event notification manually; it is not supported by CloudFormation # A workaround is to add on_cloud_trail_event for the bucket, but that goes through CloudTrail first # Because the bucket is imported, the bucket event trigger to SQS has to be created manually, along with an SQS policy that allows that bucket to send notifications core.CfnOutput(self, "DynamoDB_Table", value=ddb_file_list.table_name) core.CfnOutput(self, "SQS_Job_Queue", value=sqs_queue.queue_name) core.CfnOutput(self, "SQS_Job_Queue_DLQ", value=sqs_queue_DLQ.queue_name) core.CfnOutput(self, "Worker_Lambda_Function", value=handler.function_name) core.CfnOutput(self, "New_S3_Bucket", value=s3bucket.bucket_name) # Create Lambda logs filter to create network traffic metric handler.log_group.add_metric_filter("Complete-bytes", metric_name="Complete-bytes", metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[info, date, sn, p="--->Complete", bytes, key]')) handler.log_group.add_metric_filter("Uploading-bytes", metric_name="Uploading-bytes", metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[info, date, sn, p="--->Uploading", bytes, key]')) handler.log_group.add_metric_filter("Downloading-bytes", metric_name="Downloading-bytes", metric_namespace="s3_migrate", metric_value="$bytes", filter_pattern=logs.FilterPattern.literal( '[info, date, sn, p="--->Downloading", bytes, key]')) lambda_metric_Complete = cw.Metric(namespace="s3_migrate", metric_name="Complete-bytes", statistic="Sum", period=core.Duration.minutes(1)) lambda_metric_Upload = cw.Metric(namespace="s3_migrate", metric_name="Uploading-bytes", statistic="Sum", period=core.Duration.minutes(1)) lambda_metric_Download = cw.Metric(namespace="s3_migrate", metric_name="Downloading-bytes", statistic="Sum", period=core.Duration.minutes(1)) handler.log_group.add_metric_filter("ERROR", metric_name="ERROR-Logs", metric_namespace="s3_migrate", 
metric_value="1", filter_pattern=logs.FilterPattern.literal( '"ERROR"')) handler.log_group.add_metric_filter("WARNING", metric_name="WARNING-Logs", metric_namespace="s3_migrate", metric_value="1", filter_pattern=logs.FilterPattern.literal( '"WARNING"')) log_metric_ERROR = cw.Metric(namespace="s3_migrate", metric_name="ERROR-Logs", statistic="Sum", period=core.Duration.minutes(1)) log_metric_WARNING = cw.Metric(namespace="s3_migrate", metric_name="WARNING-Logs", statistic="Sum", period=core.Duration.minutes(1)) # Dashboard to monitor SQS and Lambda board = cw.Dashboard(self, "s3_migrate", dashboard_name="s3_migrate_serverless") board.add_widgets(cw.GraphWidget(title="Lambda-NETWORK", left=[lambda_metric_Download, lambda_metric_Upload, lambda_metric_Complete]), # TODO: here monitor all lambda concurrency not just the working one. Limitation from CDK # Lambda now supports monitor single lambda concurrency, will change this after CDK support cw.GraphWidget(title="Lambda-all-concurrent", left=[handler.metric_all_concurrent_executions(period=core.Duration.minutes(1))]), cw.GraphWidget(title="Lambda-invocations/errors/throttles", left=[handler.metric_invocations(period=core.Duration.minutes(1)), handler.metric_errors(period=core.Duration.minutes(1)), handler.metric_throttles(period=core.Duration.minutes(1))]), cw.GraphWidget(title="Lambda-duration", left=[handler.metric_duration(period=core.Duration.minutes(1))]), ) board.add_widgets(cw.GraphWidget(title="SQS-Jobs", left=[sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1) ), sqs_queue.metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1) )]), cw.GraphWidget(title="SQS-DeadLetterQueue", left=[sqs_queue_DLQ.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1) ), sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1) )]), cw.GraphWidget(title="ERROR/WARNING Logs", left=[log_metric_ERROR], right=[log_metric_WARNING]), cw.SingleValueWidget(title="Running/Waiting and Dead Jobs", metrics=[sqs_queue.metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1) ), sqs_queue.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1) ), sqs_queue_DLQ.metric_approximate_number_of_messages_not_visible( period=core.Duration.minutes(1) ), sqs_queue_DLQ.metric_approximate_number_of_messages_visible( period=core.Duration.minutes(1) )], height=6) ) # Alarm for queue - DLQ alarm_DLQ = cw.Alarm(self, "SQS_DLQ", alarm_name="s3-migration-serverless-SQS Dead Letter Queue", metric=sqs_queue_DLQ.metric_approximate_number_of_messages_visible(), threshold=0, comparison_operator=cw.ComparisonOperator.GREATER_THAN_THRESHOLD, evaluation_periods=1, datapoints_to_alarm=1) alarm_topic = sns.Topic(self, "SQS queue-DLQ has dead letter") alarm_topic.add_subscription(subscription=sub.EmailSubscription(alarm_email)) alarm_DLQ.add_alarm_action(action.SnsAction(alarm_topic)) # Alarm for queue empty, i.e. 
no visible message and no in-visible message # metric_all_message = cw.MathExpression( # expression="a + b", # label="empty_queue_expression", # using_metrics={ # "a": sqs_queue.metric_approximate_number_of_messages_visible(), # "b": sqs_queue.metric_approximate_number_of_messages_not_visible() # } # ) # alarm_0 = cw.Alarm(self, "SQSempty", # alarm_name="SQS queue empty-Serverless", # metric=metric_all_message, # threshold=0, # comparison_operator=cw.ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD, # evaluation_periods=3, # datapoints_to_alarm=3, # treat_missing_data=cw.TreatMissingData.IGNORE # ) # alarm_topic = sns.Topic(self, "SQS queue empty-Serverless") # alarm_topic.add_subscription(subscription=sub.EmailSubscription(alarm_email)) # alarm_0.add_alarm_action(action.SnsAction(alarm_topic)) # core.CfnOutput(self, "Alarm", value="CloudWatch SQS queue empty Alarm for Serverless: " + alarm_email) core.CfnOutput(self, "Dashboard", value="CloudWatch Dashboard name s3_migrate_serverless")
def __init__(self, scope: core.Construct, id: str, ** kwargs) -> None: super().__init__(scope, id, **kwargs) # Read Lambda Code): try: with open("serverless_stacks/lambda_src/konstone_custom_metric_log_generator.py", mode="r") as f: konstone_custom_metric_fn_code = f.read() except OSError: print("Unable to read Lambda Function Code") konstone_custom_metric_fn = _lambda.Function( self, "konstoneFunction", function_name="konstone_custom_metric_fn", runtime=_lambda.Runtime.PYTHON_3_7, handler="index.lambda_handler", code=_lambda.InlineCode( konstone_custom_metric_fn_code), timeout=core.Duration.seconds( 3), reserved_concurrent_executions=1, environment={ "LOG_LEVEL": "INFO", "PERCENTAGE_ERRORS": "75" } ) # Create Custom Loggroup # /aws/lambda/function-name konstone_custom_metric_lg = _logs.LogGroup( self, "konstoneLoggroup", log_group_name=f"/aws/lambda/{konstone_custom_metric_fn.function_name}", removal_policy=core.RemovalPolicy.DESTROY, retention=_logs.RetentionDays.ONE_DAY, ) # Create Custom Metric Namespace third_party_error_metric = _cloudwatch.Metric( namespace=f"third-party-error-metric", metric_name="third_party_error_metric", label="Total No. of Third Party API Errors", period=core.Duration.minutes(1), statistic="Sum" ) # Create Custom Metric Log Filter third_party_error_metric_filter = _logs.MetricFilter( self, "thirdPartyApiErrorMetricFilter", filter_pattern=_logs.FilterPattern.boolean_value( "$.third_party_api_error", True), log_group=konstone_custom_metric_lg, metric_namespace=third_party_error_metric.namespace, metric_name=third_party_error_metric.metric_name, default_value=0, metric_value="1" ) # Create Third Party Error Alarm third_party_error_alarm = _cloudwatch.Alarm( self, "thirdPartyApiErrorAlarm", alarm_description="Alert if 3rd party API has more than 2 errors in the last two minutes", alarm_name="third-party-api-alarm", metric=third_party_error_metric, comparison_operator=_cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD, threshold=2, evaluation_periods=2, datapoints_to_alarm=1, period=core.Duration.minutes(1), treat_missing_data=_cloudwatch.TreatMissingData.NOT_BREACHING ) # Create CloudWatch Dashboard konstone_dashboard = _cloudwatch.Dashboard( self, id="konstoneDashboard", dashboard_name="Konstone-App-Live-Dashboard" ) # Add Lambda Function Metrics to Dashboard konstone_dashboard.add_widgets( _cloudwatch.Row( _cloudwatch.GraphWidget( title="Backend-Invocations", left=[ konstone_custom_metric_fn.metric_invocations( statistic="Sum", period=core.Duration.minutes(1) ) ] ), _cloudwatch.GraphWidget( title="Backend-Errors", left=[ konstone_custom_metric_fn.metric_errors( statistic="Sum", period=core.Duration.minutes(1) ) ] ) ) ) # Add 3rd Party API Error to Dashboard konstone_dashboard.add_widgets( _cloudwatch.Row( _cloudwatch.SingleValueWidget( title="Third Party API Errors", metrics=[third_party_error_metric] ) ) )
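# A minimal sketch of the log generator behind konstone_custom_metric_log_generator.py:
# the metric filter above counts JSON log lines where $.third_party_api_error is true, and
# the stack injects PERCENTAGE_ERRORS, so the generator only has to emit that boolean
# field. The original handler body is not shown in this document, so this is an assumption.
import json
import os
import random

def lambda_handler(event, context):
    error_rate = int(os.environ.get("PERCENTAGE_ERRORS", "75"))
    third_party_api_error = random.randint(1, 100) <= error_rate
    # The printed JSON lands in the /aws/lambda/<function-name> log group created above,
    # where FilterPattern.boolean_value("$.third_party_api_error", True) picks it up.
    print(json.dumps({"third_party_api_error": third_party_api_error}))
    return {"third_party_api_error": third_party_api_error}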
def __init__(self, scope: core.Construct, id: str, vpc: aws_ec2.Vpc,
             ecs_cluster: aws_ecs.Cluster,
             alb: elbv2.ApplicationLoadBalancer,
             albTestListener: elbv2.ApplicationListener,
             albProdListener: elbv2.ApplicationListener,
             blueGroup: elbv2.ApplicationTargetGroup,
             greenGroup: elbv2.ApplicationTargetGroup,
             **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    ECS_DEPLOYMENT_GROUP_NAME = "NginxAppECSBlueGreen"
    ECS_DEPLOYMENT_CONFIG_NAME = "CodeDeployDefault.ECSLinear10PercentEvery1Minutes"
    ECS_TASKSET_TERMINATION_WAIT_TIME = 10
    ECS_TASK_FAMILY_NAME = "Nginx-microservice"
    ECS_APP_NAME = "Nginx-microservice"
    ECS_APP_LOG_GROUP_NAME = "/ecs/Nginx-microservice"
    DUMMY_TASK_FAMILY_NAME = "sample-Nginx-microservice"
    DUMMY_APP_NAME = "sample-Nginx-microservice"
    DUMMY_APP_LOG_GROUP_NAME = "/ecs/sample-Nginx-microservice"
    DUMMY_CONTAINER_IMAGE = "smuralee/nginx"

    # =============================================================================
    # ECR and CodeCommit repositories for the Blue/Green deployment
    # =============================================================================

    # ECR repository for the docker images
    NginxecrRepo = aws_ecr.Repository(self,
                                      "NginxRepo",
                                      image_scan_on_push=True)

    NginxCodeCommitrepo = aws_codecommit.Repository(
        self,
        "NginxRepository",
        repository_name=ECS_APP_NAME,
        description="Oussama application hosted on NGINX")

    # =============================================================================
    # CODE BUILD and ECS TASK ROLES for the Blue/Green deployment
    # =============================================================================

    # IAM role for the Code Build project
    codeBuildServiceRole = aws_iam.Role(
        self,
        "codeBuildServiceRole",
        assumed_by=aws_iam.ServicePrincipal('codebuild.amazonaws.com'))

    inlinePolicyForCodeBuild = aws_iam.PolicyStatement(
        effect=aws_iam.Effect.ALLOW,
        actions=[
            "ecr:GetAuthorizationToken", "ecr:BatchCheckLayerAvailability",
            "ecr:InitiateLayerUpload", "ecr:UploadLayerPart",
            "ecr:CompleteLayerUpload", "ecr:PutImage"
        ],
        resources=["*"])

    codeBuildServiceRole.add_to_policy(inlinePolicyForCodeBuild)

    # ECS task role
    ecsTaskRole = aws_iam.Role(
        self,
        "ecsTaskRoleForWorkshop",
        assumed_by=aws_iam.ServicePrincipal('ecs-tasks.amazonaws.com'))

    ecsTaskRole.add_managed_policy(
        aws_iam.ManagedPolicy.from_aws_managed_policy_name(
            "service-role/AmazonECSTaskExecutionRolePolicy"))

    # =============================================================================
    # CODE DEPLOY APPLICATION for the Blue/Green deployment
    # =============================================================================

    # Creating the code deploy application
    codeDeployApplication = codedeploy.EcsApplication(self, "NginxAppCodeDeploy")

    # Creating the code deploy service role
    codeDeployServiceRole = aws_iam.Role(
        self,
        "codeDeployServiceRole",
        assumed_by=aws_iam.ServicePrincipal('codedeploy.amazonaws.com'))
    codeDeployServiceRole.add_managed_policy(
        aws_iam.ManagedPolicy.from_aws_managed_policy_name(
            "AWSCodeDeployRoleForECS"))

    # IAM role for custom lambda function
    customLambdaServiceRole = aws_iam.Role(
        self,
        "codeDeployCustomLambda",
        assumed_by=aws_iam.ServicePrincipal('lambda.amazonaws.com'))

    inlinePolicyForLambda = aws_iam.PolicyStatement(
        effect=aws_iam.Effect.ALLOW,
        actions=[
            "iam:PassRole", "sts:AssumeRole", "codedeploy:List*",
            "codedeploy:Get*", "codedeploy:UpdateDeploymentGroup",
            "codedeploy:CreateDeploymentGroup", "codedeploy:DeleteDeploymentGroup"
        ],
        resources=["*"])

    customLambdaServiceRole.add_managed_policy(
        aws_iam.ManagedPolicy.from_aws_managed_policy_name(
            'service-role/AWSLambdaBasicExecutionRole'))
    customLambdaServiceRole.add_to_policy(inlinePolicyForLambda)

    # Custom resource to create the deployment group
    createDeploymentGroupLambda = aws_lambda.Function(
        self,
        'createDeploymentGroupLambda',
        code=aws_lambda.Code.from_asset("custom_resources"),
        runtime=aws_lambda.Runtime.PYTHON_3_8,
        handler='create_deployment_group.handler',
        role=customLambdaServiceRole,
        description="Custom resource to create deployment group",
        memory_size=128,
        timeout=core.Duration.seconds(60))

    # ================================================================================================
    # CloudWatch Alarms for 4XX errors
    # ================================================================================================
    blue4xxMetric = aws_cloudwatch.Metric(
        namespace='AWS/ApplicationELB',
        metric_name='HTTPCode_Target_4XX_Count',
        dimensions={
            "TargetGroup": blueGroup.target_group_full_name,
            "LoadBalancer": alb.load_balancer_full_name
        },
        statistic="sum",
        period=core.Duration.minutes(1))

    blueGroupAlarm = aws_cloudwatch.Alarm(
        self,
        "blue4xxErrors",
        alarm_name="Blue_4xx_Alarm",
        alarm_description="CloudWatch Alarm for the 4xx errors of Blue target group",
        metric=blue4xxMetric,
        threshold=1,
        evaluation_periods=1)

    green4xxMetric = aws_cloudwatch.Metric(
        namespace='AWS/ApplicationELB',
        metric_name='HTTPCode_Target_4XX_Count',
        dimensions={
            "TargetGroup": greenGroup.target_group_full_name,
            "LoadBalancer": alb.load_balancer_full_name
        },
        statistic="sum",
        period=core.Duration.minutes(1))

    greenGroupAlarm = aws_cloudwatch.Alarm(
        self,
        "green4xxErrors",
        alarm_name="Green_4xx_Alarm",
        alarm_description="CloudWatch Alarm for the 4xx errors of Green target group",
        metric=green4xxMetric,
        threshold=1,
        evaluation_periods=1)

    # ================================================================================================
    # DUMMY TASK DEFINITION for the initial service creation
    # This is required for the service being made available to create the CodeDeploy Deployment Group
    # ================================================================================================
    sampleTaskDefinition = aws_ecs.FargateTaskDefinition(
        self,
        "sampleTaskDefn",
        family=DUMMY_TASK_FAMILY_NAME,
        cpu=256,
        memory_limit_mib=1024,
        task_role=ecsTaskRole,
        execution_role=ecsTaskRole)

    sampleContainerDefn = sampleTaskDefinition.add_container(
        "sampleAppContainer",
        image=aws_ecs.ContainerImage.from_registry(DUMMY_CONTAINER_IMAGE),
        logging=aws_ecs.AwsLogDriver(
            log_group=aws_logs.LogGroup(
                self,
                "sampleAppLogGroup",
                log_group_name=DUMMY_APP_LOG_GROUP_NAME,
                removal_policy=core.RemovalPolicy.DESTROY),
            stream_prefix=DUMMY_APP_NAME),
        docker_labels={"name": DUMMY_APP_NAME})

    port_mapping = aws_ecs.PortMapping(container_port=80,
                                       protocol=aws_ecs.Protocol.TCP)
    sampleContainerDefn.add_port_mappings(port_mapping)

    # ================================================================================================
    # ECS task definition using ECR image
    # Will be used by CODE DEPLOY for the Blue/Green deployment
    # ================================================================================================
    NginxTaskDefinition = aws_ecs.FargateTaskDefinition(
        self,
        "appTaskDefn",
        family=ECS_TASK_FAMILY_NAME,
        cpu=256,
        memory_limit_mib=1024,
        task_role=ecsTaskRole,
        execution_role=ecsTaskRole)

    NginxcontainerDefinition = NginxTaskDefinition.add_container(
        "NginxAppContainer",
        image=aws_ecs.ContainerImage.from_ecr_repository(NginxecrRepo, "latest"),
        logging=aws_ecs.AwsLogDriver(
            log_group=aws_logs.LogGroup(
                self,
                "NginxAppLogGroup",
                log_group_name=ECS_APP_LOG_GROUP_NAME,
                removal_policy=core.RemovalPolicy.DESTROY),
            stream_prefix=ECS_APP_NAME),
        docker_labels={"name": ECS_APP_NAME})
    NginxcontainerDefinition.add_port_mappings(port_mapping)

    # =============================================================================
    # ECS SERVICE for the Blue/Green deployment
    # =============================================================================
    NginxAppService = aws_ecs.FargateService(
        self,
        "NginxAppService",
        cluster=ecs_cluster,
        task_definition=NginxTaskDefinition,
        health_check_grace_period=core.Duration.seconds(10),
        desired_count=3,
        deployment_controller={
            "type": aws_ecs.DeploymentControllerType.CODE_DEPLOY
        },
        service_name=ECS_APP_NAME)

    NginxAppService.connections.allow_from(alb, aws_ec2.Port.tcp(80))
    NginxAppService.connections.allow_from(alb, aws_ec2.Port.tcp(8080))
    NginxAppService.attach_to_application_target_group(blueGroup)

    # =============================================================================
    # CODE DEPLOY - Deployment Group CUSTOM RESOURCE for the Blue/Green deployment
    # =============================================================================
    core.CustomResource(
        self,
        'customEcsDeploymentGroup',
        service_token=createDeploymentGroupLambda.function_arn,
        properties={
            "ApplicationName": codeDeployApplication.application_name,
            "DeploymentGroupName": ECS_DEPLOYMENT_GROUP_NAME,
            "DeploymentConfigName": ECS_DEPLOYMENT_CONFIG_NAME,
            "ServiceRoleArn": codeDeployServiceRole.role_arn,
            "BlueTargetGroup": blueGroup.target_group_name,
            "GreenTargetGroup": greenGroup.target_group_name,
            "ProdListenerArn": albProdListener.listener_arn,
            "TestListenerArn": albTestListener.listener_arn,
            "EcsClusterName": ecs_cluster.cluster_name,
            "EcsServiceName": NginxAppService.service_name,
            "TerminationWaitTime": ECS_TASKSET_TERMINATION_WAIT_TIME,
            "BlueGroupAlarm": blueGroupAlarm.alarm_name,
            "GreenGroupAlarm": greenGroupAlarm.alarm_name,
        })

    ecsDeploymentGroup = codedeploy.EcsDeploymentGroup.from_ecs_deployment_group_attributes(
        self,
        "ecsDeploymentGroup",
        application=codeDeployApplication,
        deployment_group_name=ECS_DEPLOYMENT_GROUP_NAME,
        deployment_config=codedeploy.EcsDeploymentConfig.from_ecs_deployment_config_name(
            self, "ecsDeploymentConfig", ECS_DEPLOYMENT_CONFIG_NAME))

    # =============================================================================
    # CODE BUILD PROJECT for the Blue/Green deployment
    # =============================================================================

    # Creating the code build project
    NginxAppcodebuild = aws_codebuild.Project(
        self,
        "NginxAppCodeBuild",
        role=codeBuildServiceRole,
        environment=aws_codebuild.BuildEnvironment(
            build_image=aws_codebuild.LinuxBuildImage.STANDARD_4_0,
            compute_type=aws_codebuild.ComputeType.SMALL,
            privileged=True,
            environment_variables={
                'REPOSITORY_URI': {
                    'value': NginxecrRepo.repository_uri,
                    'type': aws_codebuild.BuildEnvironmentVariableType.PLAINTEXT
                },
                'TASK_EXECUTION_ARN': {
                    'value': ecsTaskRole.role_arn,
                    'type': aws_codebuild.BuildEnvironmentVariableType.PLAINTEXT
                },
                'TASK_FAMILY': {
                    'value': ECS_TASK_FAMILY_NAME,
                    'type': aws_codebuild.BuildEnvironmentVariableType.PLAINTEXT
                }
            }),
        source=aws_codebuild.Source.code_commit(repository=NginxCodeCommitrepo))

    # =============================================================================
    # CODE PIPELINE for the Blue/Green ECS deployment
    # =============================================================================
    codePipelineServiceRole = aws_iam.Role(
        self,
        "codePipelineServiceRole",
        assumed_by=aws_iam.ServicePrincipal('codepipeline.amazonaws.com'))

    inlinePolicyForCodePipeline = aws_iam.PolicyStatement(
        effect=aws_iam.Effect.ALLOW,
        actions=[
            "iam:PassRole", "sts:AssumeRole", "codecommit:Get*",
            "codecommit:List*", "codecommit:GitPull",
            "codecommit:UploadArchive", "codecommit:CancelUploadArchive",
            "codebuild:BatchGetBuilds", "codebuild:StartBuild",
            "codedeploy:CreateDeployment", "codedeploy:Get*",
            "codedeploy:RegisterApplicationRevision", "s3:Get*",
            "s3:List*", "s3:PutObject"
        ],
        resources=["*"])

    codePipelineServiceRole.add_to_policy(inlinePolicyForCodePipeline)

    sourceArtifact = codepipeline.Artifact('sourceArtifact')
    buildArtifact = codepipeline.Artifact('buildArtifact')

    # S3 bucket for storing the code pipeline artifacts
    NginxAppArtifactsBucket = s3.Bucket(
        self,
        "NginxAppArtifactsBucket",
        encryption=s3.BucketEncryption.S3_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL)

    # S3 bucket policy for the code pipeline artifacts
    denyUnEncryptedObjectUploads = aws_iam.PolicyStatement(
        effect=aws_iam.Effect.DENY,
        actions=["s3:PutObject"],
        principals=[aws_iam.AnyPrincipal()],
        resources=[NginxAppArtifactsBucket.bucket_arn + "/*"],
        conditions={
            "StringNotEquals": {
                "s3:x-amz-server-side-encryption": "aws:kms"
            }
        })

    denyInsecureConnections = aws_iam.PolicyStatement(
        effect=aws_iam.Effect.DENY,
        actions=["s3:*"],
        principals=[aws_iam.AnyPrincipal()],
        resources=[NginxAppArtifactsBucket.bucket_arn + "/*"],
        conditions={"Bool": {
            "aws:SecureTransport": "false"
        }})

    NginxAppArtifactsBucket.add_to_resource_policy(denyUnEncryptedObjectUploads)
    NginxAppArtifactsBucket.add_to_resource_policy(denyInsecureConnections)

    # Code Pipeline - CloudWatch trigger event is created by CDK
    codepipeline.Pipeline(
        self,
        "ecsBlueGreen",
        role=codePipelineServiceRole,
        artifact_bucket=NginxAppArtifactsBucket,
        stages=[
            codepipeline.StageProps(
                stage_name='Source',
                actions=[
                    aws_codepipeline_actions.CodeCommitSourceAction(
                        action_name='Source',
                        repository=NginxCodeCommitrepo,
                        output=sourceArtifact,
                    )
                ]),
            codepipeline.StageProps(
                stage_name='Build',
                actions=[
                    aws_codepipeline_actions.CodeBuildAction(
                        action_name='Build',
                        project=NginxAppcodebuild,
                        input=sourceArtifact,
                        outputs=[buildArtifact])
                ]),
            codepipeline.StageProps(
                stage_name='Deploy',
                actions=[
                    aws_codepipeline_actions.CodeDeployEcsDeployAction(
                        action_name='Deploy',
                        deployment_group=ecsDeploymentGroup,
                        app_spec_template_input=buildArtifact,
                        task_definition_template_input=buildArtifact,
                    )
                ])
        ])

    # =============================================================================
    # Export the outputs
    # =============================================================================
    core.CfnOutput(self,
                   "ecsBlueGreenCodeRepo",
                   description="Demo app code commit repository",
                   export_name="ecsBlueGreenDemoAppRepo",
                   value=NginxCodeCommitrepo.repository_clone_url_http)

    core.CfnOutput(self,
                   "ecsBlueGreenLBDns",
                   description="Load balancer DNS",
                   export_name="ecsBlueGreenLBDns",
                   value=alb.load_balancer_dns_name)
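The stack above packages createDeploymentGroupLambda from the "custom_resources" asset, but the handler itself is not shown in this document. The sketch below is only an illustration of how such a create_deployment_group.handler could turn the properties passed to core.CustomResource into a CodeDeploy ECS blue/green deployment group with boto3; the property keys mirror the CustomResource properties above, while the response plumbing, error handling, and parameter defaults are assumptions, not the actual handler.

# Hypothetical sketch of custom_resources/create_deployment_group.py (illustrative only).
import json
import urllib.request

import boto3

codedeploy = boto3.client("codedeploy")


def send_response(event, context, status, reason=""):
    # CloudFormation custom resources signal success/failure by PUTting a JSON
    # document to the pre-signed ResponseURL included in the event.
    body = json.dumps({
        "Status": status,
        "Reason": reason or f"See CloudWatch log stream {context.log_stream_name}",
        "PhysicalResourceId": event["ResourceProperties"]["DeploymentGroupName"],
        "StackId": event["StackId"],
        "RequestId": event["RequestId"],
        "LogicalResourceId": event["LogicalResourceId"],
    }).encode()
    request = urllib.request.Request(
        event["ResponseURL"], data=body, method="PUT",
        headers={"Content-Type": ""})
    urllib.request.urlopen(request)


def handler(event, context):
    props = event["ResourceProperties"]
    try:
        if event["RequestType"] == "Create":
            codedeploy.create_deployment_group(
                applicationName=props["ApplicationName"],
                deploymentGroupName=props["DeploymentGroupName"],
                deploymentConfigName=props["DeploymentConfigName"],
                serviceRoleArn=props["ServiceRoleArn"],
                ecsServices=[{
                    "serviceName": props["EcsServiceName"],
                    "clusterName": props["EcsClusterName"],
                }],
                deploymentStyle={
                    "deploymentType": "BLUE_GREEN",
                    "deploymentOption": "WITH_TRAFFIC_CONTROL",
                },
                blueGreenDeploymentConfiguration={
                    "terminateBlueInstancesOnDeploymentSuccess": {
                        "action": "TERMINATE",
                        "terminationWaitTimeInMinutes": int(props["TerminationWaitTime"]),
                    },
                    "deploymentReadyOption": {"actionOnTimeout": "CONTINUE_DEPLOYMENT"},
                },
                loadBalancerInfo={
                    "targetGroupPairInfoList": [{
                        "targetGroups": [
                            {"name": props["BlueTargetGroup"]},
                            {"name": props["GreenTargetGroup"]},
                        ],
                        "prodTrafficRoute": {"listenerArns": [props["ProdListenerArn"]]},
                        "testTrafficRoute": {"listenerArns": [props["TestListenerArn"]]},
                    }]
                },
                alarmConfiguration={
                    "enabled": True,
                    "alarms": [
                        {"name": props["BlueGroupAlarm"]},
                        {"name": props["GreenGroupAlarm"]},
                    ],
                },
            )
        elif event["RequestType"] == "Delete":
            codedeploy.delete_deployment_group(
                applicationName=props["ApplicationName"],
                deploymentGroupName=props["DeploymentGroupName"])
        send_response(event, context, "SUCCESS")
    except Exception as exc:
        send_response(event, context, "FAILED", reason=str(exc))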