def add_monitoring(self, monitoring):
    super().add_monitoring(monitoring)

    invocations_alarm = cloudwatch.Alarm(
        self,
        "InvocationsAlarm",
        metric=self.function.metric_invocations(),
        alarm_name=f"{self.function.function_name}-invocations",
        statistic="sum",
        comparison_operator=cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
        threshold=1,
        period=core.Duration.minutes(1440),
        evaluation_periods=1,
    )
    monitoring.add_alarm_action(invocations_alarm)

    recording_duration = logs.MetricFilter(
        self,
        "RecordingDurationLogMetric",
        log_group=self.function.log_group,
        metric_name="RecordingDuration",
        metric_value="$.message.duration",
        metric_namespace=monitoring.custom_metric_namespace,
        filter_pattern=logs.FilterPattern.all(
            logs.JsonPattern("$.message.duration > 0")))

    recording_skipped = logs.MetricFilter(
        self,
        "RecordingSkippedLogMetric",
        log_group=self.function.log_group,
        metric_name="SkippedForDuration",
        metric_value="1",
        metric_namespace=monitoring.custom_metric_namespace,
        filter_pattern=logs.FilterPattern.literal("Skipping"))
def add_monitoring(self, monitoring):
    super().add_monitoring(monitoring)

    recording_completed = logs.MetricFilter(
        self,
        "RecordingCompletedLogMetric",
        log_group=self.function.log_group,
        metric_name="RecordingCompleted",
        metric_value="1",
        metric_namespace=monitoring.custom_metric_namespace,
        filter_pattern=logs.FilterPattern.all(
            logs.JsonPattern(
                '$.message.payload.status = "RECORDING_MEETING_COMPLETED"')))

    meeting_started = logs.MetricFilter(
        self,
        "MeetingStartedLogMetric",
        log_group=self.function.log_group,
        metric_name="MeetingStarted",
        metric_value="1",
        metric_namespace=monitoring.custom_metric_namespace,
        filter_pattern=logs.FilterPattern.all(
            logs.JsonPattern('$.message.payload.status = "STARTED"')))

    meeting_ended = logs.MetricFilter(
        self,
        "MeetingEndedLogMetric",
        log_group=self.function.log_group,
        metric_name="MeetingEnded",
        metric_value="1",
        metric_namespace=monitoring.custom_metric_namespace,
        filter_pattern=logs.FilterPattern.all(
            logs.JsonPattern('$.message.payload.status = "ENDED"')))
def add_monitoring(self, monitoring):
    super().add_monitoring(monitoring)

    minutes_in_pipeline = logs.MetricFilter(
        self,
        "MinutesInPipelineLogMetric",
        log_group=self.function.log_group,
        metric_name="MinutesInPipeline",
        metric_value="$.message.minutes_in_pipeline",
        metric_namespace=monitoring.custom_metric_namespace,
        filter_pattern=logs.FilterPattern.all(
            logs.JsonPattern("$.message.minutes_in_pipeline > 0")))

    workflow_initiated = logs.MetricFilter(
        self,
        "WorkflowInitiatedLogMetric",
        log_group=self.function.log_group,
        metric_name="WorkflowInitiated",
        metric_value="1",
        metric_namespace=monitoring.custom_metric_namespace,
        filter_pattern=logs.FilterPattern.literal("Workflow"))
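# --- Illustrative only: the JSON metric filters above assume the function
# --- emits structured log events with a nested "message" object. The exact
# --- shape is inferred from the filter patterns ($.message.duration,
# --- $.message.payload.status, $.message.minutes_in_pipeline), not from the
# --- original source; a matching event might look like this:
import json

sample_event = {
    "message": {
        "duration": 42.5,           # matched by "$.message.duration > 0"
        "minutes_in_pipeline": 7,   # matched by "$.message.minutes_in_pipeline > 0"
        "payload": {"status": "RECORDING_MEETING_COMPLETED"},
    }
}
print(json.dumps(sample_event))  # one JSON document per log line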
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # Read Lambda Function Code
    try:
        with open(
                "serverless_stacks/lambda_src/konstone_custom_metric_log_generator.py",
                mode="r") as f:
            konstone_custom_metric_fn_code = f.read()
    except OSError:
        print("Unable to read Lambda Function Code")

    konstone_custom_metric_fn = _lambda.Function(
        self,
        "konstoneFunction",
        function_name="konstone_custom_metric_fn",
        runtime=_lambda.Runtime.PYTHON_3_7,
        handler="index.lambda_handler",
        code=_lambda.InlineCode(konstone_custom_metric_fn_code),
        timeout=core.Duration.seconds(3),
        reserved_concurrent_executions=1,
        environment={
            "LOG_LEVEL": "INFO",
            "PERCENTAGE_ERRORS": "75"
        })

    # Create Custom Log Group
    # /aws/lambda/function-name
    konstone_custom_metric_lg = _logs.LogGroup(
        self,
        "konstoneLoggroup",
        log_group_name=f"/aws/lambda/{konstone_custom_metric_fn.function_name}",
        removal_policy=core.RemovalPolicy.DESTROY,
        retention=_logs.RetentionDays.ONE_DAY,
    )

    # Create Custom Metric Namespace
    third_party_error_metric = _cloudwatch.Metric(
        namespace="third-party-error-metric",
        metric_name="third_party_error_metric",
        label="Total No. of Third Party API Errors",
        period=core.Duration.minutes(1),
        statistic="Sum")

    # Create Custom Metric Log Filter
    third_party_error_metric_filter = _logs.MetricFilter(
        self,
        "thirdPartyApiErrorMetricFilter",
        filter_pattern=_logs.FilterPattern.boolean_value(
            "$.third_party_api_error", True),
        log_group=konstone_custom_metric_lg,
        metric_namespace=third_party_error_metric.namespace,
        metric_name=third_party_error_metric.metric_name,
        default_value=0,
        metric_value="1")

    # Create Third Party Error Alarm
    third_party_error_alarm = _cloudwatch.Alarm(
        self,
        "thirdPartyApiErrorAlarm",
        alarm_description="Alert if 3rd party API has more than 2 errors in the last two minutes",
        alarm_name="third-party-api-alarm",
        metric=third_party_error_metric,
        comparison_operator=_cloudwatch.ComparisonOperator.
        GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        threshold=2,
        evaluation_periods=2,
        datapoints_to_alarm=1,
        period=core.Duration.minutes(1),
        treat_missing_data=_cloudwatch.TreatMissingData.NOT_BREACHING)

    # Create CloudWatch Dashboard
    konstone_dashboard = _cloudwatch.Dashboard(
        self,
        id="konstoneDashboard",
        dashboard_name="Konstone-App-Live-Dashboard")

    # Add Lambda Function Metrics to Dashboard
    konstone_dashboard.add_widgets(
        _cloudwatch.Row(
            _cloudwatch.GraphWidget(
                title="Backend-Invocations",
                left=[
                    konstone_custom_metric_fn.metric_invocations(
                        statistic="Sum", period=core.Duration.minutes(1))
                ]),
            _cloudwatch.GraphWidget(
                title="Backend-Errors",
                left=[
                    konstone_custom_metric_fn.metric_errors(
                        statistic="Sum", period=core.Duration.minutes(1))
                ])))

    # Add 3rd Party API Error to Dashboard
    konstone_dashboard.add_widgets(
        _cloudwatch.Row(
            _cloudwatch.SingleValueWidget(
                title="Third Party API Errors",
                metrics=[third_party_error_metric])))
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    # Import function code
    try:
        with open("serverless_stack/functions/metric_logs_generator.py",
                  mode="r") as file:
            function_body = file.read()
    except OSError:
        print('File could not be read')

    # Function
    function_01 = aws_lambda.Function(
        self,
        "lambdafunction01",
        function_name="LambdaTestCustomMetric",
        runtime=aws_lambda.Runtime.PYTHON_3_6,
        handler="index.lambda_handler",
        code=aws_lambda.InlineCode(function_body),
        timeout=core.Duration.seconds(5),
        reserved_concurrent_executions=1,
        environment={
            'LOG_LEVEL': 'INFO',
            'PERCENTAGE_ERRORS': '75'
        })

    # Attached CloudWatch log group
    custom_metric_log_group01 = aws_logs.LogGroup(
        self,
        "cloudwatchlog01",
        log_group_name=f"/aws/lambda/{function_01.function_name}",
        removal_policy=core.RemovalPolicy.DESTROY,
        retention=aws_logs.RetentionDays.ONE_DAY)

    # Custom metric namespace
    custom_metric_namespace01 = aws_cw.Metric(
        namespace="custom-error-metric",
        metric_name="custom-error-metric",
        label="Amount of Custom API errors",
        period=core.Duration.minutes(1),
        statistic="Sum")

    # Custom metric logs filter
    custom_metric_filter01 = aws_logs.MetricFilter(
        self,
        "customMetricFilter",
        filter_pattern=aws_logs.FilterPattern.boolean_value(
            "$.custom_api_error", True),
        log_group=custom_metric_log_group01,
        metric_namespace=custom_metric_namespace01.namespace,
        metric_name=custom_metric_namespace01.metric_name,
        default_value=0,
        metric_value="1")

    # Create custom alarm
    custom_metric_alarm01 = aws_cw.Alarm(
        self,
        "customMetricAlarm",
        alarm_description="Custom API errors",
        alarm_name="Custom-API-alarm",
        metric=custom_metric_namespace01,
        comparison_operator=aws_cw.ComparisonOperator.
        GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        threshold=2,
        evaluation_periods=2,
        datapoints_to_alarm=1,
        period=core.Duration.minutes(1),
        treat_missing_data=aws_cw.TreatMissingData.NOT_BREACHING)

    # CloudWatch dashboard
    custom_dashboard01 = aws_cw.Dashboard(
        self, id="CustomDashBoard", dashboard_name="CDK-custom-DashBoard")

    # Lambda metrics to dashboard
    custom_dashboard01.add_widgets(
        aws_cw.Row(
            aws_cw.GraphWidget(title="Lambda-invoke",
                               left=[
                                   function_01.metric_invocations(
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
                               ]),
            aws_cw.GraphWidget(title="Lambda-errors",
                               left=[
                                   function_01.metric_errors(
                                       statistic="Sum",
                                       period=core.Duration.minutes(1))
                               ])))

    # Custom API errors to dashboard
    custom_dashboard01.add_widgets(
        aws_cw.Row(
            aws_cw.SingleValueWidget(title="Custom-API-errors",
                                     metrics=[custom_metric_namespace01])))
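# --- Illustrative only: the referenced source files
# --- (konstone_custom_metric_log_generator.py / metric_logs_generator.py)
# --- are not shown in this section, so this is a hypothetical sketch of a
# --- handler whose JSON log lines the boolean_value() filters above would
# --- match ("$.third_party_api_error" in the first stack,
# --- "$.custom_api_error" in the second). PERCENTAGE_ERRORS comes from the
# --- Lambda environment defined above.
import json
import os
import random


def lambda_handler(event, context):
    error_rate = int(os.environ.get("PERCENTAGE_ERRORS", "75"))
    # Emit one structured JSON log line per invocation; the metric filter
    # counts lines where the boolean field is true.
    log_entry = {"third_party_api_error": random.randint(1, 100) <= error_rate}
    print(json.dumps(log_entry))
    return log_entry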
def __init__(self, scope: cdk.Construct, id: str, name: str, vpc_name: str,
             security_group_name: str, secrets_path: str = "/ibc/paper/",
             trading_mode: str = "paper", **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # TODO: Create Log Group

    # Create a cluster
    vpc = ec2.Vpc.from_lookup(self, "vpc", vpc_name=vpc_name)
    privateSubnets = vpc.private_subnets
    cluster = ecs.Cluster(self, "cluster", vpc=vpc)
    # TODO: check for namespace before adding below. This is failing on stack updates.
    cluster.add_default_cloud_map_namespace(name="private")

    task = ecs.FargateTaskDefinition(self,
                                     "task",
                                     cpu="512",
                                     memory_mi_b="1024")

    # Add SSM Permissions to IAM Role
    SSM_ACTIONS = ["ssm:GetParametersByPath", "kms:Decrypt"]
    SSM_RESOURCES = [
        "arn:aws:kms:*:*:alias/aws/ssm",
        "arn:aws:ssm:*:*:parameter{}*".format(secrets_path),
    ]
    ssmPolicy = iam.PolicyStatement(iam.PolicyStatementEffect.Allow)
    for action in SSM_ACTIONS:
        ssmPolicy.add_action(action)
    for resource in SSM_RESOURCES:
        ssmPolicy.add_resource(resource)
    task.add_to_task_role_policy(ssmPolicy)

    ibcRepo = ecr.Repository.from_repository_name(self, "container_repo",
                                                  "ibc")
    ibcImage = ecs.ContainerImage.from_ecr_repository(ibcRepo, "latest")

    # TODO: Add to Existing Hierarchal Logger, add log_group argument with ref to it
    ibcLogger = ecs.AwsLogDriver(self, "logger", stream_prefix=name)

    connectionLossMetric = logs.MetricFilter(
        self,
        "connectionLossMetric",
        filter_pattern=logs.FilterPattern.literal("ERROR ?110 ?130"),
        log_group=ibcLogger.log_group,
        metric_name="ib_connection_loss",
        metric_namespace=name,
    )
    newContainerMetric = logs.MetricFilter(
        self,
        "newContainerMetric",
        filter_pattern=logs.FilterPattern.literal(
            "Starting virtual X frame buffer"),
        log_group=ibcLogger.log_group,
        metric_name="new_container",
        metric_namespace=name,
    )

    kinesisFirehoseBucketActions = [
        "s3:AbortMultipartUpload",
        "s3:GetBucketLocation",
        "s3:GetObject",
        "s3:ListBucket",
        "s3:ListBucketMultipartUploads",
    ]
    kinesisFirehoseBucket = s3.Bucket(self, "firehoseBucket")
    kinesisFirehoseBucketPolicy = iam.PolicyStatement(
        iam.PolicyStatementEffect.Allow)
    for action in kinesisFirehoseBucketActions:
        kinesisFirehoseBucketPolicy.add_action(action)
    for resource in [
            kinesisFirehoseBucket.bucket_arn,
            kinesisFirehoseBucket.bucket_arn + "/*",
    ]:
        kinesisFirehoseBucketPolicy.add_resource(resource)

    kinesisFirehoseBucketRole = iam.Role(
        self,
        "kinesisFirehoseBucketRole",
        assumed_by=iam.ServicePrincipal("firehose.amazonaws.com"),
        path="/service/" + name + "/",
    )
    kinesisFirehoseBucketRole.add_to_policy(kinesisFirehoseBucketPolicy)

    kinesisFirehose = firehose.CfnDeliveryStream(
        self,
        "firehose",
        delivery_stream_name=name,
        delivery_stream_type="DirectPut",
        s3_destination_configuration={
            "bucketArn": kinesisFirehoseBucket.bucket_arn,
            "bufferingHints": {
                "intervalInSeconds": 10 * 60,
                "sizeInMBs": 16
            },
            "compressionFormat": "GZIP",
            "roleArn": kinesisFirehoseBucketRole.role_arn,
        },
    )

    # Add Firehose Permissions to Task IAM Role
    FIREHOSE_ACTIONS = ["firehose:PutRecord", "firehose:PutRecordBatch"]
    firehosePolicy = iam.PolicyStatement(iam.PolicyStatementEffect.Allow)
    for action in FIREHOSE_ACTIONS:
        firehosePolicy.add_action(action)
    firehosePolicy.add_resource(kinesisFirehose.delivery_stream_arn)
    task.add_to_task_role_policy(firehosePolicy)

    environment = {
        "SECRETS_PATH": secrets_path,
        "TWS_LIVE_PAPER": trading_mode,
        "FIREHOSE_STREAM_NAME": kinesisFirehose.delivery_stream_name,
    }
    ibcContainer = ecs.ContainerDefinition(
        self,
        "container",
        task_definition=task,
        image=ibcImage,
        environment=environment,
        logging=ibcLogger,
        essential=True,
    )

    securityGroup = ec2.SecurityGroup.from_security_group_id(
        self, "task_security_group", security_group_id=security_group_name)
    ibcService = ecs.FargateService(
        self,
        "fargate_service",
        cluster=cluster,
        task_definition=task,
        assign_public_ip=False,
        desired_count=1,
        security_group=securityGroup,
        service_discovery_options=ecs.ServiceDiscoveryOptions(name=name),
        service_name=name,
        vpc_subnets=privateSubnets,
    )
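# --- Illustrative only: the task above injects FIREHOSE_STREAM_NAME into the
# --- container environment and grants firehose:PutRecord, so the container
# --- presumably ships its records roughly like this. This is a sketch, not
# --- code from the original source; put_log_record and its payload are
# --- invented names.
import json
import os

import boto3

firehose_client = boto3.client("firehose")


def put_log_record(payload: dict) -> None:
    # Firehose buffers records (10 minutes / 16 MB per the bufferingHints
    # above) before writing GZIP-compressed objects to the S3 bucket.
    firehose_client.put_record(
        DeliveryStreamName=os.environ["FIREHOSE_STREAM_NAME"],
        Record={"Data": (json.dumps(payload) + "\n").encode("utf-8")},
    )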
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # Let's create a couple of instances to test
    vpc = _ec2.Vpc(self,
                   "abacVPC",
                   cidr="10.13.0.0/21",
                   max_azs=2,
                   nat_gateways=0,
                   subnet_configuration=[
                       _ec2.SubnetConfiguration(
                           name="pubSubnet",
                           cidr_mask=24,
                           subnet_type=_ec2.SubnetType.PUBLIC)
                   ])
    core.Tag.add(vpc,
                 key="ServiceProvider",
                 value="KonStone",
                 include_resource_types=[])

    weak_sg = _ec2.SecurityGroup(
        self,
        "web_sec_grp",
        vpc=vpc,
        description="Allow internet access from the world",
        allow_all_outbound=True)
    # vpc_cidr_block
    # weak_sg.add_ingress_rule(_ec2.Peer.any_ipv4(),
    weak_sg.add_ingress_rule(_ec2.Peer.ipv4(vpc.vpc_cidr_block),
                             _ec2.Port.tcp(22),
                             "Allow SSH access from the VPC Only.")

    # We are using the latest AMAZON LINUX AMI
    # Benefit of having SSM Agent pre-installed
    ami_id = _ec2.AmazonLinuxImage(
        generation=_ec2.AmazonLinuxGeneration.AMAZON_LINUX_2).get_image(
            self).image_id

    # https://docs.aws.amazon.com/cdk/api/latest/python/aws_cdk.aws_iam/Role.html
    instance_profile_role = _iam.Role(
        self,
        'ec2ssmroleid',
        assumed_by=_iam.ServicePrincipal('ec2.amazonaws.com'),
        role_name="instance_profile_role")
    instance_profile_role.add_managed_policy(
        _iam.ManagedPolicy.from_aws_managed_policy_name(
            'AmazonSSMManagedInstanceCore'))

    instance_profile_role_additional_perms = _iam.PolicyStatement(
        effect=_iam.Effect.ALLOW,
        resources=[
            "arn:aws:logs:*:*:*",
        ],
        actions=["logs:Create*", "logs:PutLogEvents"])
    instance_profile_role_additional_perms.sid = "PutLogEvents"
    instance_profile_role.add_to_policy(
        instance_profile_role_additional_perms)

    inst_profile_01 = _iam.CfnInstanceProfile(
        self,
        "instProfile01Id",
        roles=[instance_profile_role.role_name],
    )

    # Let us bootstrap the server with the required agents
    try:
        with open("./bootstrap_scripts/install_agents.sh", mode='rb') as file:
            bootstrap_data = file.read()
    except OSError:
        print('Failed to get UserData script')

    install_agents = _ec2.UserData.for_linux()
    install_agents.add_commands(str(bootstrap_data, 'utf-8'))

    # The EC2 Instance to monitor for failed SSH Logins
    ssh_monitored_inst_01 = _ec2.CfnInstance(
        self,
        "sshMonitoredInstance01",
        image_id=ami_id,
        instance_type="t2.micro",
        monitoring=False,
        tags=[{
            "key": "ServiceProvider",
            "value": "KonStone"
        }],
        iam_instance_profile=inst_profile_01.ref,
        network_interfaces=[{
            "deviceIndex": "0",
            "associatePublicIpAddress": True,
            "subnetId": vpc.public_subnets[0].subnet_id,
            "groupSet": [weak_sg.security_group_id]
        }],
        # https://github.com/aws/aws-cdk/issues/3419
        user_data=core.Fn.base64(install_agents.render()),
    )
    """
    linux_ami = _ec2.GenericLinuxImage({
        "cn-northwest-1": "ami-0f62e91915e16cfc2",
        "eu-west-1": "ami-12345678"})
    ssh_monitored_inst_01_02 = _ec2.Instance(self,
        "monitoredInstance02",
        instance_type=_ec2.InstanceType(instance_type_identifier="t2.micro"),
        instance_name="monitoredInstance02",
        machine_image=linux_ami,
        vpc=vpc,
        security_group=[weak_sg.security_group_id],
        # vpc_subnets=_ec2.SubnetSelection(subnet_type=_ec2.SubnetType.PUBLIC)
        vpc_subnets=vpc.public_subnets[0].subnet_id,
        # user_data=_ec2.UserData.custom(t_user_data)
        )
    """

    # The log group name to store logs
    info_sec_ops_log_group = _logs.LogGroup(
        self,
        "infoSecOpsLogGroupId",
        log_group_name=(f"/Mystique/InfoSec/Automation/"
                        f"{ssh_monitored_inst_01.ref}"),
        retention=_logs.RetentionDays.ONE_WEEK)

    # Defines an AWS Lambda resource
    with open("lambda_src/quarantine_ec2_instance.py", encoding="utf8") as fp:
        quarantine_ec2_instance_fn_handler_code = fp.read()

    quarantine_ec2_instance_fn = _lambda.Function(
        self,
        id='quarantineEc2InstanceFnId',
        function_name="quarantine_ec2_instance",
        runtime=_lambda.Runtime.PYTHON_3_7,
        code=_lambda.InlineCode(quarantine_ec2_instance_fn_handler_code),
        handler='index.lambda_handler',
        timeout=core.Duration.seconds(5))

    quarantine_ec2_instance_fn_perms = _iam.PolicyStatement(
        effect=_iam.Effect.ALLOW,
        resources=[
            "*",
        ],
        actions=[
            "ec2:RevokeSecurityGroupIngress",
            "ec2:DescribeSecurityGroupReferences",
            "ec2:RevokeSecurityGroupEgress",
            "ec2:ApplySecurityGroupsToClientVpnTargetNetwork",
            "ec2:DescribeSecurityGroups", "ec2:CreateSecurityGroup",
            "ec2:DescribeInstances", "ec2:CreateTags", "ec2:StopInstances",
            "ec2:CreateVolume", "ec2:CreateSnapshots", "ec2:CreateSnapshot",
            "ec2:DescribeSnapshots", "ec2:ModifyInstanceAttribute"
        ])
    quarantine_ec2_instance_fn_perms.sid = "AllowLambdaToQuarantineEC2"
    quarantine_ec2_instance_fn.add_to_role_policy(
        quarantine_ec2_instance_fn_perms)

    info_sec_ops_topic = _sns.Topic(self,
                                    "infoSecOpsTopicId",
                                    display_name="InfoSecTopic",
                                    topic_name="InfoSecOpsTopic")

    # Ref: https://docs.aws.amazon.com/cdk/api/latest/docs/aws-stepfunctions-readme.html
    ###############################################################################
    ################# STEP FUNCTIONS EXPERIMENTAL CODE - UNSTABLE #################
    ###############################################################################
    quarantine_ec2_instance_task = _sfn.Task(
        self,
        "Quarantine EC2 Instance",
        task=_tasks.InvokeFunction(quarantine_ec2_instance_fn),
        result_path="$")

    notify_secops_task = _sfn.Task(
        self,
        "Notify InfoSecOps",
        task=_tasks.PublishToTopic(
            info_sec_ops_topic,
            integration_pattern=_sfn.ServiceIntegrationPattern.
            FIRE_AND_FORGET,
            message=_sfn.TaskInput.from_data_at("$.message"),
            subject="SSH Error Response Notification"))

    ssh_error_response_failure = _sfn.Fail(
        self,
        "SSH Error Response Actions Failed",
        cause="All Response Actions were NOT completed",
        error="Check Logs")

    ssh_error_response_success = _sfn.Succeed(
        self,
        "SSH Error Response Actions Succeeded",
        comment="All Response Action Completed Successfully",
    )

    ssh_error_response_sfn_definition = quarantine_ec2_instance_task.next(
        notify_secops_task.next(
            _sfn.Choice(self, "SSH Errors Response Complete?").when(
                _sfn.Condition.number_equals(
                    "$.SdkHttpMetadata.HttpStatusCode", 200),
                ssh_error_response_success).when(
                    _sfn.Condition.not_(
                        _sfn.Condition.number_equals(
                            "$.SdkHttpMetadata.HttpStatusCode", 200)),
                    ssh_error_response_failure).otherwise(
                        ssh_error_response_failure)))

    ssh_error_response_statemachine = _sfn.StateMachine(
        self,
        "stateMachineId",
        definition=ssh_error_response_sfn_definition,
        timeout=core.Duration.minutes(5))
    ###############################################################################
    ################# STEP FUNCTIONS EXPERIMENTAL CODE - UNSTABLE #################
    ###############################################################################

    # LAMBDA TO TRIGGER STATE MACHINE - since the state machine cannot be
    # invoked by SNS directly
    with open("lambda_src/trigger_state_machine.py", encoding="utf8") as fp:
        trigger_state_machine_fn_handler_code = fp.read()

    trigger_state_machine_fn = _lambda.Function(
        self,
        id='sshErrorResponseFnId',
        function_name="trigger_ssh_error_response_state_machine_fn",
        runtime=_lambda.Runtime.PYTHON_3_7,
        code=_lambda.InlineCode(trigger_state_machine_fn_handler_code),
        # code=_lambda.Code.asset("lambda_src/is_policy_permissive.py"),
        # code=_lambda.Code.asset('lambda_src'),
        # code=_lambda.InlineCode(code_body),
        handler='index.lambda_handler',
        timeout=core.Duration.seconds(5),
environment={ "STATE_MACHINE_ARN": f"{ssh_error_response_statemachine.state_machine_arn}", }) trigger_state_machine_fn_perms = _iam.PolicyStatement( effect=_iam.Effect.ALLOW, resources=[ f"{ssh_error_response_statemachine.state_machine_arn}", ], actions=["states:StartExecution"]) trigger_state_machine_fn_perms.sid = "PutBucketPolicy" trigger_state_machine_fn.add_to_role_policy( trigger_state_machine_fn_perms) """ version = trigger_state_machine_fn.add_version(name=datetime.now().isoformat()) trigger_state_machine_fn_alias = _lambda.Alias(self, 'lmdaAliasId', alias_name='MystiqueTestAlias', version=version ) """ # Lets add permission to SNS to trigger our lambda function trigger_lambda_perms = _iam.PolicyStatement( effect=_iam.Effect.ALLOW, resources=[ trigger_state_machine_fn.function_arn, ], actions=[ "lambda:InvokeFunction", ]) trigger_lambda_perms.sid = "TriggerLambaFunction" # info_sec_ops_topic.add_to_resource_policy( trigger_lambda_perms ) # Subscribe InfoSecOps Email to topic info_sec_ops_topic.add_subscription( _subs.EmailSubscription(global_args.INFO_SEC_OPS_EMAIL)) # info_sec_ops_topic.add_subscription(_subs.LambdaSubscription(trigger_state_machine_fn)) trigger_state_machine_fn_alarm = trigger_state_machine_fn.metric_all_errors( ).create_alarm( self, "fn-error-alarm", threshold=5, alarm_name="trigger_state_machine_fn_error_alarm", evaluation_periods=5, period=core.Duration.minutes(1), ) subscribe_trigger_state_machine_fn_to_logs = _logs.SubscriptionFilter( self, "sshErrorLogSubscriptionId", log_group=info_sec_ops_log_group, destination=_logs_destination.LambdaDestination( trigger_state_machine_fn), filter_pattern=_logs.FilterPattern.space_delimited( "Mon", "day", "timestamp", "ip", "id", "status", "...").where_string("status", "=", "Invalid"), ) # https://pypi.org/project/aws-cdk.aws-logs/ # We are creating three filter # tooManySshDisconnects, invalidSshUser and invalidSshKey: # When a user tries to SSH with invalid username the next line is logged in the SSH log file: # Apr 20 02:39:35 ip-172-31-63-56 sshd[17136]: Received disconnect from xxx.xxx.xxx.xxx: 11: [preauth] too_many_ssh_disconnects_metric = _cloudwatch.Metric( namespace=f"{global_args.OWNER}", metric_name="tooManySshDisconnects") too_many_ssh_disconnects_filter = _logs.MetricFilter( self, "tooManySshDisconnectsFilterId", log_group=info_sec_ops_log_group, metric_namespace=too_many_ssh_disconnects_metric.namespace, metric_name=too_many_ssh_disconnects_metric.metric_name, filter_pattern=_logs.FilterPattern.space_delimited( "Mon", "day", "timestamp", "ip", "id", "msg1", "msg2", "...").where_string("msg2", "=", "disconnect"), metric_value="1") invalid_ssh_user_metric = _cloudwatch.Metric( namespace=f"{global_args.OWNER}", metric_name="invalidSshUser", ) invalid_ssh_user_filter = _logs.MetricFilter( self, "invalidSshUserFilterId", log_group=info_sec_ops_log_group, metric_namespace=invalid_ssh_user_metric.namespace, metric_name=invalid_ssh_user_metric.metric_name, filter_pattern=_logs.FilterPattern.space_delimited( "Mon", "day", "timestamp", "ip", "id", "status", "...").where_string("status", "=", "Invalid"), metric_value="1") invalid_ssh_key_metric = _cloudwatch.Metric( namespace=f"{global_args.OWNER}", metric_name="invalidSshKey") invalid_ssh_key_filter = _logs.MetricFilter( self, "invalidSshKeyFilterId", log_group=info_sec_ops_log_group, metric_namespace=invalid_ssh_key_metric.namespace, metric_name=invalid_ssh_key_metric.metric_name, filter_pattern=_logs.FilterPattern.space_delimited( "Mon", "day", "timestamp", "ip", 
"id", "msg1", "msg2", "...").where_string("msg1", "=", "Connection").where_string( "msg2", "=", "closed"), metric_value="1") # Now let us create alarms # alarm is raised there are more than 5(threshold) of the measured metrics in two(datapoint) of the last three seconds(evaluation): # Period=60Seconds, Eval=3, Threshold=5 too_many_ssh_disconnects_alarm = _cloudwatch.Alarm( self, "tooManySshDisconnectsAlarmId", alarm_name="too_many_ssh_disconnects_alarm", alarm_description= "The number disconnect requests is greater then 5, even 1 time in 3 minutes", metric=too_many_ssh_disconnects_metric, actions_enabled=True, period=core.Duration.minutes(1), threshold=5, evaluation_periods=3, datapoints_to_alarm=1, statistic="sum", comparison_operator=_cloudwatch.ComparisonOperator. GREATER_THAN_OR_EQUAL_TO_THRESHOLD) invalid_ssh_user_alarm = _cloudwatch.Alarm( self, "invalidSshUserAlarmId", alarm_name="too_many_invalid_ssh_users_alarm", alarm_description= "The number of invalid ssh users connecting is greater then 5, even 1 time in 3 minutes", metric=invalid_ssh_user_metric, actions_enabled=True, period=core.Duration.minutes(1), threshold=5, evaluation_periods=3, datapoints_to_alarm=1, statistic="sum", comparison_operator=_cloudwatch.ComparisonOperator. GREATER_THAN_THRESHOLD) invalid_ssh_user_alarm.add_alarm_action( _cloudwatch_actions.SnsAction(info_sec_ops_topic)) invalid_ssh_key_alarm = _cloudwatch.Alarm( self, "invalidSshKeyAlarmId", alarm_name="too_many_invalid_ssh_key_alarm", alarm_description= "The number of invalid ssh keys connecting is greater then 5, even 1 time in 3 minutes", metric=invalid_ssh_key_metric, actions_enabled=True, period=core.Duration.minutes(1), threshold=5, evaluation_periods=3, datapoints_to_alarm=1, statistic="sum", comparison_operator=_cloudwatch.ComparisonOperator. GREATER_THAN_OR_EQUAL_TO_THRESHOLD) invalid_ssh_key_alarm.add_alarm_action( _cloudwatch_actions.SnsAction(info_sec_ops_topic)) ########################################### ################# OUTPUTS ################# ########################################### output0 = core.CfnOutput( self, "SecuirtyAutomationFrom", value=f"{global_args.SOURCE_INFO}", description= "To know more about this automation stack, check out our github page." ) output1_1 = core.Fn.get_att( logical_name_of_resource="sshMonitoredInstance01", attribute_name="PublicIp") output1 = core.CfnOutput(self, "MonitoredInstance", value=output1_1.to_string(), description="Web Server Public IP to attack") output2 = core.CfnOutput( self, "SSHAlarms", value= (f"https://console.aws.amazon.com/cloudwatch/home?region=" f"{core.Aws.REGION}" f"#/configuration/" f"#alarmsV2:?search=ssh&alarmStateFilter=ALL&alarmTypeFilter=ALL" ), description="Check out the cloudwatch Alarms") output3 = core.CfnOutput( self, "SubscribeToNotificationTopic", value=(f"https://console.aws.amazon.com/sns/v3/home?" f"{core.Aws.REGION}" f"#/topic/" f"{info_sec_ops_topic.topic_arn}"), description= "Add your email to subscription and confirm subscription") output_test_1 = core.CfnOutput( self, "ToGenInvalidKeyErrors", value= (f"for i in {{1..30}}; do ssh -i $RANDOM ec2-user@{output1_1.to_string()}; sleep 2; done &" ), description= "Generates random key names and connects to server 30 times over 60 seconds" ) output_test_2 = core.CfnOutput( self, "ToGenInvalidUserErrors", value= (f"for i in {{1..30}}; do ssh ec2-user$RANDOM@{output1_1.to_string()}; sleep 2; done &" ), description= "Generates random user names and connects to server 30 times over 60 seconds" ) """
def __init__(self, scope: core.Construct, id: str, stream_producer_lg,
             stream_pipe, py_stream_record_processor_fn,
             node_stream_record_processor_fn, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    ##### MONITORING ######

    ##################################################
    ##########        STREAM METRICS        ##########
    ##################################################
    # Shows you the ingestion rate into the shard.
    stream_in_bytes_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="IncomingBytes",
        dimensions={"StreamName": f"{stream_pipe.stream_name}"},
        label="IncomingBytes",
        period=core.Duration.minutes(30),
        statistic="Sum")
    stream_in_records_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="IncomingRecords",
        dimensions={"StreamName": f"{stream_pipe.stream_name}"},
        label="IncomingRecords",
        period=core.Duration.minutes(30),
        statistic="Sum")
    stream_w_throttle_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="WriteProvisionedThroughputExceeded",
        dimensions={"StreamName": f"{stream_pipe.stream_name}"},
        label="WriteProvisionedThroughputExceeded",
        period=core.Duration.minutes(30),
        statistic="Sum")
    stream_r_throttle_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="ReadProvisionedThroughputExceeded",
        dimensions={"StreamName": f"{stream_pipe.stream_name}"},
        label="ReadProvisionedThroughputExceeded",
        period=core.Duration.minutes(30),
        statistic="Sum")
    stream_put_success_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="PutRecords.Success",
        dimensions={"StreamName": f"{stream_pipe.stream_name}"},
        label="PutRecords.Success",
        period=core.Duration.minutes(30),
        statistic="Sum")
    stream_put_latency_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="PutRecords.Latency",
        dimensions={"StreamName": f"{stream_pipe.stream_name}"},
        label="PutRecords.Latency",
        period=core.Duration.minutes(30),
        statistic="Sum")
    stream_get_latency_metric = _cloudwatch.Metric(
        namespace="AWS/Kinesis",
        metric_name="GetRecords.Latency",
        dimensions={"StreamName": f"{stream_pipe.stream_name}"},
        label="GetRecords.Latency",
        period=core.Duration.minutes(30),
        statistic="Sum")

    ##################################################
    ##########    STREAM PRODUCER METRICS    #########
    ##################################################
    # JSON Metric Filter - https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/FilterAndPatternSyntax.html
    records_produced_metric = _cloudwatch.Metric(
        namespace=f"{global_args.OWNER}-stream-data-processor",
        metric_name="recordsProducedCount",
        label="Total No. Of Records Produced",
        period=core.Duration.minutes(30),
        statistic="Sum")

    records_produced_metric_filter = _logs.MetricFilter(
        self,
        "recordsProducedCountFilter",
        filter_pattern=_logs.FilterPattern.exists("$.records_produced"),
        log_group=stream_producer_lg,
        metric_namespace=records_produced_metric.namespace,
        metric_name=records_produced_metric.metric_name,
        default_value=0,
        metric_value="$.records_produced",
    )

    ##################################################
    ##########    STREAM CONSUMER METRICS    #########
    ##################################################
    py_records_processed_metric = _cloudwatch.Metric(
        namespace=f"{global_args.OWNER}-stream-data-processor",
        # dimensions={
        #     "RecordsProcessed": "py_processor"
        # },
        metric_name="pyRecordsProcessedCount",
        label="Total No. Of Records Processed",
Of Records Processed", period=core.Duration.minutes(30), statistic="Sum" ) py_stream_record_processor = _logs.MetricFilter(self, "processedRecordCountFilter01", filter_pattern=_logs.FilterPattern.exists( "$.records_processed"), log_group=py_stream_record_processor_fn.log_group, metric_namespace=py_records_processed_metric.namespace, metric_name=py_records_processed_metric.metric_name, default_value=0, metric_value="$.records_processed", ) node_records_processed_metric = _cloudwatch.Metric( namespace=f"{global_args.OWNER}-stream-data-processor", metric_name="nodeRecordsProcessedCount", label="Total No. Of Records Processed", period=core.Duration.minutes(30), statistic="Sum" ) node_stream_record_processor = _logs.MetricFilter(self, "processedRecordCountFilter02", filter_pattern=_logs.FilterPattern.exists( "$.records_processed"), log_group=node_stream_record_processor_fn.log_group, metric_namespace=node_records_processed_metric.namespace, metric_name=node_records_processed_metric.metric_name, default_value=0, metric_value="$.records_processed", ) # Create CloudWatch Dashboard for Streams stream_processor_dashboard = _cloudwatch.Dashboard(self, id="streamProcessorDashboard", dashboard_name="Stream-Processor" ) stream_processor_dashboard.add_widgets( _cloudwatch.SingleValueWidget( title="TotalRecordsProduced", metrics=[records_produced_metric] ), _cloudwatch.SingleValueWidget( title="RecordsProcessed-by-Python-Consumer", metrics=[py_records_processed_metric] ), _cloudwatch.SingleValueWidget( title="RecordsProcessed-by-Node-Consumer", metrics=[node_records_processed_metric] ) ) # Stream Incoming bytes Graph stream_processor_dashboard.add_widgets( _cloudwatch.Row( _cloudwatch.GraphWidget( title="Shard Ingestion Metrics", left=[stream_in_bytes_metric], right=[stream_in_records_metric] ), _cloudwatch.GraphWidget( title="Shard Throttle Metrics", left=[stream_w_throttle_metric], right=[stream_r_throttle_metric] ) ) ) stream_processor_dashboard.add_widgets( _cloudwatch.Row( _cloudwatch.GraphWidget( title="Stream Put Latency", left=[stream_put_latency_metric] ), _cloudwatch.GraphWidget( title="Stream Get Latency", left=[stream_get_latency_metric] ), _cloudwatch.GraphWidget( title="Stream Put Success", left=[stream_put_success_metric] ) ) ) ########################################### ################# OUTPUTS ################# ########################################### output_0 = core.CfnOutput(self, "SecuirtyAutomationFrom", value=f"{global_args.SOURCE_INFO}", description="To know more about this automation stack, check out our github page." )
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    security_distribution_list_email = '*****@*****.**'

    # securityhub_instance = securityhub.CfnHub(self, 'SecurityHub')

    # Ensure AWS Config is enabled / Ensure CloudTrail is enabled in all Regions 2.1 - 2.8
    cloudtrail_bucket_accesslogs = s3.Bucket(
        self,
        "CloudTrailS3Accesslogs",
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        encryption=s3.BucketEncryption.S3_MANAGED,
        removal_policy=core.RemovalPolicy.RETAIN)

    cloudtrail_bucket = s3.Bucket(
        self,
        "CloudTrailS3",
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        encryption=s3.BucketEncryption.S3_MANAGED,
        removal_policy=core.RemovalPolicy.RETAIN,
        server_access_logs_bucket=cloudtrail_bucket_accesslogs,
    )

    cloudtrail_kms = kms.Key(self, "CloudTrailKey", enable_key_rotation=True)

    # CloudTrail - single account, not Organization
    trail = cloudtrail.Trail(
        self,
        "CloudTrail",
        enable_file_validation=True,
        is_multi_region_trail=True,
        include_global_service_events=True,
        send_to_cloud_watch_logs=True,
        cloud_watch_logs_retention=logs.RetentionDays.FOUR_MONTHS,
        bucket=cloudtrail_bucket,
        kms_key=cloudtrail_kms)

    cloudtrail_kms.grant(iam.ServicePrincipal('cloudtrail.amazonaws.com'),
                         'kms:DescribeKey')
    cloudtrail_kms.grant(
        iam.ServicePrincipal(
            'cloudtrail.amazonaws.com',
            conditions={
                'StringLike': {
                    'kms:EncryptionContext:aws:cloudtrail:arn':
                    'arn:aws:cloudtrail:*:' + core.Stack.of(self).account +
                    ':trail/*'
                }
            }), 'kms:GenerateDataKey*')

    cloudtrail_kms.add_to_resource_policy(
        iam.PolicyStatement(
            actions=["kms:Decrypt", "kms:ReEncryptFrom"],
            conditions={
                'StringEquals': {
                    'kms:CallerAccount': core.Stack.of(self).account
                },
                'StringLike': {
                    'kms:EncryptionContext:aws:cloudtrail:arn':
                    'arn:aws:cloudtrail:*:' + core.Stack.of(self).account +
                    ':trail/*'
                }
            },
            effect=iam.Effect.ALLOW,
            principals=[iam.AnyPrincipal()],
            resources=['*']))

    cloudtrail_kms.add_to_resource_policy(
        iam.PolicyStatement(
            actions=["kms:CreateAlias"],
            conditions={
                'StringEquals': {
                    'kms:CallerAccount': core.Stack.of(self).account,
                    'kms:ViaService': 'ec2.'
                    + core.Stack.of(self).region + '.amazonaws.com'
                }
            },
            effect=iam.Effect.ALLOW,
            principals=[iam.AnyPrincipal()],
            resources=['*']))

    cloudtrail_kms.add_to_resource_policy(
        iam.PolicyStatement(
            actions=["kms:Decrypt", "kms:ReEncryptFrom"],
            conditions={
                'StringEquals': {
                    'kms:CallerAccount': core.Stack.of(self).account
                },
                'StringLike': {
                    'kms:EncryptionContext:aws:cloudtrail:arn':
                    'arn:aws:cloudtrail:*:' + core.Stack.of(self).account +
                    ':trail/*'
                }
            },
            effect=iam.Effect.ALLOW,
            principals=[iam.AnyPrincipal()],
            resources=['*']))

    config_role = iam.CfnServiceLinkedRole(
        self,
        id='ServiceLinkedRoleConfig',
        aws_service_name='config.amazonaws.com')

    global_config = config.CfnConfigurationRecorder(
        self,
        'ConfigRecorder',
        name='default',
        # role_arn=config_role.role_arn,
        role_arn="arn:aws:iam::" + core.Stack.of(self).account +
        ":role/aws-service-role/config.amazonaws.com/AWSServiceRoleForConfig",
        # role_arn=config_role.get_att(
        #     attribute_name='resource.arn').to_string(),
        recording_group=config.CfnConfigurationRecorder.RecordingGroupProperty(
            all_supported=True,
            include_global_resource_types=True))

    config_bucket = s3.Bucket(
        self,
        "ConfigS3",
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        encryption=s3.BucketEncryption.S3_MANAGED,
        removal_policy=core.RemovalPolicy.RETAIN,
    )

    config_bucket.add_to_resource_policy(
        iam.PolicyStatement(
            actions=['s3:GetBucketAcl'],
            effect=iam.Effect.ALLOW,
            principals=[iam.ServicePrincipal('config.amazonaws.com')],
            resources=[config_bucket.bucket_arn]))

    config_bucket.add_to_resource_policy(
        iam.PolicyStatement(
            actions=['s3:PutObject'],
            effect=iam.Effect.ALLOW,
            principals=[iam.ServicePrincipal('config.amazonaws.com')],
            resources=[
                config_bucket.arn_for_objects(
                    'AWSLogs/' + core.Stack.of(self).account + '/Config/*')
            ],
            conditions={
                "StringEquals": {
                    's3:x-amz-acl': 'bucket-owner-full-control',
                }
            }))

    config_delivery_stream = config.CfnDeliveryChannel(
        self,
        "ConfigDeliveryChannel",
        s3_bucket_name=config_bucket.bucket_name)

    # Config Aggregator in Organizations account
    # config_aggregator = config.CfnConfigurationAggregator(self, 'ConfigAggregator',
    #     configuration_aggregator_name='ConfigAggregator',
    #     organization_aggregation_source=config.CfnConfigurationAggregator.OrganizationAggregationSourceProperty(
    #         role_arn=iam.Role(self, "AWSConfigRoleForOrganizations",
    #             assumed_by=iam.ServicePrincipal('config.amazonaws.com'),
    #             managed_policies=[iam.ManagedPolicy.from_aws_managed_policy_name(
    #                 'service-role/AWSConfigRoleForOrganizations')]
    #         ).role_arn,
    #         all_aws_regions=True
    #     )
    # )

    # 2.9 – Ensure VPC flow logging is enabled in all VPCs
    # vpc = ec2.Vpc.from_lookup(self, "VPC",
    #     is_default=True,
    # )
    # S3 for VPC flow logs
    # vpc_flow_logs_bucket = s3.Bucket(self, "VPCFlowLogsBucket",
    #     block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
    #     encryption=s3.BucketEncryption.S3_MANAGED,
    #     removal_policy=core.RemovalPolicy.RETAIN
    # )

    # Ensure a log metric filter and alarm exist for 3.1 – 3.14
    security_notifications_topic = sns.Topic(self,
                                             'CIS_Topic',
                                             display_name='CIS_Topic',
                                             topic_name='CIS_Topic')
    sns.Subscription(self,
                     'CIS_Subscription',
                     topic=security_notifications_topic,
                     protocol=sns.SubscriptionProtocol.EMAIL,
                     endpoint=security_distribution_list_email)

    cloudwatch_actions_cis = cloudwatch_actions.SnsAction(
        security_notifications_topic)

    cis_metricfilter_alarms = {
        'CIS-3.1-UnauthorizedAPICalls':
        '($.errorCode="*UnauthorizedOperation") || ($.errorCode="AccessDenied*")',
        'CIS-3.2-ConsoleSigninWithoutMFA':
        '($.eventName="ConsoleLogin") && ($.additionalEventData.MFAUsed '
!="Yes")', 'RootAccountUsageAlarm': '$.userIdentity.type="Root" && $.userIdentity.invokedBy NOT EXISTS && $.eventType !="AwsServiceEvent"', 'CIS-3.4-IAMPolicyChanges': '($.eventName=DeleteGroupPolicy) || ($.eventName=DeleteRolePolicy) || ($.eventName=DeleteUserPolicy) || ($.eventName=PutGroupPolicy) || ($.eventName=PutRolePolicy) || ($.eventName=PutUserPolicy) || ($.eventName=CreatePolicy) || ($.eventName=DeletePolicy) || ($.eventName=CreatePolicyVersion) || ($.eventName=DeletePolicyVersion) || ($.eventName=AttachRolePolicy) || ($.eventName=DetachRolePolicy) || ($.eventName=AttachUserPolicy) || ($.eventName=DetachUserPolicy) || ($.eventName=AttachGroupPolicy) || ($.eventName=DetachGroupPolicy)', 'CIS-3.5-CloudTrailChanges': '($.eventName=CreateTrail) || ($.eventName=UpdateTrail) || ($.eventName=DeleteTrail) || ($.eventName=StartLogging) || ($.eventName=StopLogging)', 'CIS-3.6-ConsoleAuthenticationFailure': '($.eventName=ConsoleLogin) && ($.errorMessage="Failed authentication")', 'CIS-3.7-DisableOrDeleteCMK': '($.eventSource=kms.amazonaws.com) && (($.eventName=DisableKey) || ($.eventName=ScheduleKeyDeletion))', 'CIS-3.8-S3BucketPolicyChanges': '($.eventSource=s3.amazonaws.com) && (($.eventName=PutBucketAcl) || ($.eventName=PutBucketPolicy) || ($.eventName=PutBucketCors) || ($.eventName=PutBucketLifecycle) || ($.eventName=PutBucketReplication) || ($.eventName=DeleteBucketPolicy) || ($.eventName=DeleteBucketCors) || ($.eventName=DeleteBucketLifecycle) || ($.eventName=DeleteBucketReplication))', 'CIS-3.9-AWSConfigChanges': '($.eventSource=config.amazonaws.com) && (($.eventName=StopConfigurationRecorder) || ($.eventName=DeleteDeliveryChannel) || ($.eventName=PutDeliveryChannel) || ($.eventName=PutConfigurationRecorder))', 'CIS-3.10-SecurityGroupChanges': '($.eventName=AuthorizeSecurityGroupIngress) || ($.eventName=AuthorizeSecurityGroupEgress) || ($.eventName=RevokeSecurityGroupIngress) || ($.eventName=RevokeSecurityGroupEgress) || ($.eventName=CreateSecurityGroup) || ($.eventName=DeleteSecurityGroup)', 'CIS-3.11-NetworkACLChanges': '($.eventName=CreateNetworkAcl) || ($.eventName=CreateNetworkAclEntry) || ($.eventName=DeleteNetworkAcl) || ($.eventName=DeleteNetworkAclEntry) || ($.eventName=ReplaceNetworkAclEntry) || ($.eventName=ReplaceNetworkAclAssociation)', 'CIS-3.12-NetworkGatewayChanges': '($.eventName=CreateCustomerGateway) || ($.eventName=DeleteCustomerGateway) || ($.eventName=AttachInternetGateway) || ($.eventName=CreateInternetGateway) || ($.eventName=DeleteInternetGateway) || ($.eventName=DetachInternetGateway)', 'CIS-3.13-RouteTableChanges': '($.eventName=CreateRoute) || ($.eventName=CreateRouteTable) || ($.eventName=ReplaceRoute) || ($.eventName=ReplaceRouteTableAssociation) || ($.eventName=DeleteRouteTable) || ($.eventName=DeleteRoute) || ($.eventName=DisassociateRouteTable)', 'CIS-3.14-VPCChanges': '($.eventName=CreateVpc) || ($.eventName=DeleteVpc) || ($.eventName=ModifyVpcAttribute) || ($.eventName=AcceptVpcPeeringConnection) || ($.eventName=CreateVpcPeeringConnection) || ($.eventName=DeleteVpcPeeringConnection) || ($.eventName=RejectVpcPeeringConnection) || ($.eventName=AttachClassicLinkVpc) || ($.eventName=DetachClassicLinkVpc) || ($.eventName=DisableVpcClassicLink) || ($.eventName=EnableVpcClassicLink)', } for x, y in cis_metricfilter_alarms.items(): str_x = str(x) str_y = str(y) logs.MetricFilter( self, "MetricFilter_" + str_x, log_group=trail.log_group, filter_pattern=logs.JsonPattern(json_pattern_string=str_y), metric_name=str_x, metric_namespace="LogMetrics", 
            metric_value='1')
        cloudwatch.Alarm(
            self,
            "Alarm_" + str_x,
            alarm_name=str_x,
            alarm_description=str_x,
            statistic='Sum',
            period=core.Duration.minutes(5),
            comparison_operator=cloudwatch.ComparisonOperator.
            GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=1,
            threshold=1,
            metric=cloudwatch.Metric(metric_name=str_x,
                                     namespace="LogMetrics"),
        ).add_alarm_action(cloudwatch_actions_cis)

    # IAM Password Policy custom resource CIS 1.5 - 1.11
    cfn_template = cfn_inc.CfnInclude(
        self,
        "includeTemplate",
        template_file="account-password-policy.yaml",
        parameters={
            "MaxPasswordAge": 90,
            "MinimumPasswordLength": 14,
            "PasswordReusePrevention": 24,
            "RequireLowercaseCharacters": True,
            "RequireNumbers": True,
            "RequireSymbols": True,
            "RequireUppercaseCharacters": True,
        })

    # CIS 1.20
    support_role = iam.Role(
        self,
        "SupportRole",
        assumed_by=iam.AccountPrincipal(
            account_id=core.Stack.of(self).account),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name('AWSSupportAccess')
        ],
        role_name='AWSSupportAccess')

    guardduty_detector = guardduty.CfnDetector(self,
                                               'GuardDutyDetector',
                                               enable=True)

    guardduty_event = events.Rule(
        self,
        'GuardDutyEvent',
        rule_name='guardduty-notification',
        description='GuardDuty Notification',
        event_pattern=events.EventPattern(
            source=['aws.guardduty'],
            detail_type=['GuardDuty Finding']),
        targets=[events_targets.SnsTopic(security_notifications_topic)])
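# --- Illustrative only: a trimmed, hypothetical CloudTrail event of the kind
# --- the CIS-3.1-UnauthorizedAPICalls pattern above would match (the pattern
# --- tests $.errorCode against "*UnauthorizedOperation" / "AccessDenied*";
# --- real CloudTrail events carry many more fields):
#
#   {
#       "eventSource": "ec2.amazonaws.com",
#       "eventName": "StopInstances",
#       "errorCode": "Client.UnauthorizedOperation",
#       "userIdentity": {"type": "IAMUser"}
#   }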
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # Get config value for alert email
    email = self.node.try_get_context("email")
    if email == 'changeme@localhost':
        exit(
            'ERROR: Change the email in cdk.json or pass it with -c email=changeme@localhost'
        )

    # Create SNS for alarms to be sent to
    alarm_topic = sns.Topic(self, "backup_alarm", display_name="backup_alarm")

    # Subscribe my email so the alarms go to me
    alarm_topic.add_subscription(subscriptions.EmailSubscription(email))

    # Create VPC to run everything in. We make this public just because we don't
    # want to spend $30/mo on a NAT gateway.
    vpc = ec2.Vpc(
        self,
        "VPC",
        nat_gateways=0,
        subnet_configuration=[
            ec2.SubnetConfiguration(name="public",
                                    subnet_type=ec2.SubnetType.PUBLIC)
        ],
    )

    ecs_sg = ec2.SecurityGroup(self, "ecs_sg", vpc=vpc)
    efs_sg = ec2.SecurityGroup(self, "efs_sg", vpc=vpc)
    efs_sg.add_ingress_rule(
        peer=ecs_sg,
        connection=ec2.Port.tcp(2049),
        description="Allow backup runner access",
    )
    # Open this to the VPC
    efs_sg.add_ingress_rule(
        peer=ec2.Peer.ipv4("10.0.0.0/8"),
        connection=ec2.Port.tcp(2049),
        description="Allow backup runner access",
    )

    # Define the EFS
    fileSystem = efs.FileSystem(
        self,
        "MyEfsFileSystem",
        vpc=vpc,
        encrypted=True,
        lifecycle_policy=efs.LifecyclePolicy.AFTER_7_DAYS,
        performance_mode=efs.PerformanceMode.GENERAL_PURPOSE,
        throughput_mode=efs.ThroughputMode.BURSTING,
        security_group=efs_sg,
    )

    # Define the ECS task
    cluster = ecs.Cluster(self, "Cluster", vpc=vpc)
    taskDefinition = ecs.FargateTaskDefinition(
        self,
        "taskDefinition",
        volumes=[
            ecs.Volume(
                name="efsvolume",
                efs_volume_configuration=ecs.EfsVolumeConfiguration(
                    file_system_id=fileSystem.file_system_id,
                    root_directory="/",
                    transit_encryption="ENABLED",
                ),
            )
        ],
        memory_limit_mib=8192,
        cpu=2048,
    )

    log_driver = ecs.AwsLogDriver(
        stream_prefix="backup_runner",
        log_retention=logs.RetentionDays.TWO_WEEKS,
    )

    taskDefinition.add_container(
        "backup-runner",
        image=ecs.ContainerImage.from_asset("./resources/backup_runner"),
        memory_limit_mib=8192,
        cpu=2048,
        logging=log_driver,
    )

    # The previous method to add the container doesn't let us specify the
    # mount point for the EFS, so we have to do it here, referencing the
    # container that was just added.
    taskDefinition.default_container.add_mount_points(
        ecs.MountPoint(container_path="/mnt/efs",
                       read_only=False,
                       source_volume="efsvolume"))

    # Create rule to trigger this task to be run every 24 hours
    events.Rule(
        self,
        "scheduled_run",
        rule_name="backup_runner",
        # Run at midnight UTC every night
        schedule=events.Schedule.expression("cron(0 0 * * ? *)"),
        description="Starts the backup runner task every night",
        targets=[
            targets.EcsTask(
                cluster=cluster,
                task_definition=taskDefinition,
                subnet_selection=ec2.SubnetSelection(
                    subnet_type=ec2.SubnetType.PUBLIC),
                platform_version=ecs.FargatePlatformVersion.
                VERSION1_4,  # Required to use EFS; "Latest" does not yet support EFS
                security_groups=[ecs_sg],
            )
        ],
    )

    # Create notification topic for backups
    backup_topic = sns.Topic(self,
                             "backup_topic",
                             display_name="Backup status")

    # Create AWS Backup
    vault = backup.BackupVault(
        self,
        "Vault",
        access_policy=iam.PolicyDocument(statements=[
            iam.PolicyStatement(
                effect=iam.Effect.DENY,
                actions=[
                    "backup:DeleteBackupVault",
                    "backup:DeleteRecoveryPoint",
                    "backup:UpdateRecoveryPointLifecycle",
                    # "backup:PutBackupVaultAccessPolicy",  # This results in "Failed putting policy for Backup vault backuprunnerVaultXXX as it will lock down from further policy changes"
                    "backup:DeleteBackupVaultAccessPolicy",
                    "backup:DeleteBackupVaultNotifications",
                    # "backup:PutBackupVaultNotifications",  # This causes another part of this app to fail.
                ],
                resources=["*"],
                principals=[iam.AnyPrincipal()],
            )
        ]),
        notification_topic=alarm_topic,
        notification_events=[
            # Monitor for some failures or access to the backups
            backup.BackupVaultEvents.BACKUP_JOB_EXPIRED,
            backup.BackupVaultEvents.BACKUP_JOB_FAILED,
            backup.BackupVaultEvents.COPY_JOB_FAILED,
            backup.BackupVaultEvents.COPY_JOB_STARTED,
            backup.BackupVaultEvents.RESTORE_JOB_COMPLETED,
            backup.BackupVaultEvents.RESTORE_JOB_FAILED,
            backup.BackupVaultEvents.RESTORE_JOB_STARTED,
            backup.BackupVaultEvents.RESTORE_JOB_SUCCESSFUL,
        ],
    )

    plan = backup.BackupPlan.daily35_day_retention(self, "backup")
    plan.add_selection(
        "Selection",
        resources=[backup.BackupResource.from_efs_file_system(fileSystem)],
    )

    #
    # Create metric filter for errors in the CloudWatch Logs from the ECS
    #
    METRIC_NAME = "log_errors"
    METRIC_NAMESPACE = "backup_runner"

    metric = cloudwatch.Metric(namespace=METRIC_NAMESPACE,
                               metric_name=METRIC_NAME)

    error_metric = logs.MetricFilter(
        self,
        "MetricFilterId",
        metric_name=METRIC_NAME,
        metric_namespace=METRIC_NAMESPACE,
        log_group=log_driver.log_group,
        filter_pattern=logs.FilterPattern.any_term("ERROR"),
        metric_value="1",
    )

    error_alarm = cloudwatch.Alarm(
        self,
        "AlarmId",
        metric=metric,
        evaluation_periods=1,
        actions_enabled=True,
        alarm_name="backup_runner_alarm",
        alarm_description="Errors in backup runner",
        comparison_operator=cloudwatch.ComparisonOperator.
        GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        period=core.Duration.hours(1),
        threshold=1,
        statistic="sum",
    )

    # Connect the alarm to the SNS
    error_alarm.add_alarm_action(cloudwatch_actions.SnsAction(alarm_topic))

    # The above doesn't give it privileges, so add them to the alarm topic
    # resource policy.
    alarm_topic.add_to_resource_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["sns:Publish"],
            resources=[alarm_topic.topic_arn],
            principals=[iam.ServicePrincipal("cloudwatch.amazonaws.com")],
        ))
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    CLUSTER_NAME = self.node.try_get_context("cluster_name")
    NOTIFY_EMAIL = self.node.try_get_context("notify_email")
    SLACK_WEBHOOK_URL = self.node.try_get_context("webhook_url")

    if not CLUSTER_NAME or not NOTIFY_EMAIL or not SLACK_WEBHOOK_URL:
        logger.error(
            f"Required context variables for {id} were not provided!")
    else:
        # Get the log group of our postgres instance
        log_group = logs.LogGroup.from_log_group_name(
            self,
            "InAur01DetectionLogGroup",
            f"/aws/rds/cluster/{CLUSTER_NAME}/postgresql",
        )

        # Create new metric
        metric = cloudwatch.Metric(
            namespace="LogMetrics",
            metric_name="InAur01DetectionFailedDbLoginAttempts",
        )

        # Apply metric filter
        # Filter all metrics of failed login attempts in log
        logs.MetricFilter(
            self,
            "InAur01DetectionMetricFilter",
            log_group=log_group,
            metric_namespace=metric.namespace,
            metric_name=metric.metric_name,
            filter_pattern=logs.FilterPattern.all_terms(
                "FATAL: password authentication failed for user"),
            metric_value="1",
        )

        # Create new SNS topic
        topic = sns.Topic(self, "InAur01DetectionTopic")

        # Add email subscription
        topic.add_subscription(subs.EmailSubscription(NOTIFY_EMAIL))

        # Create new alarm for metric
        # Alarm will trigger if there are >= 10 failed login attempts
        # over a period of 30 seconds.
        alarm = cloudwatch.Alarm(
            self,
            "InAur01DetectionAlarm",
            metric=metric,
            threshold=10,
            evaluation_periods=1,
            period=core.Duration.seconds(30),
            datapoints_to_alarm=1,
            statistic="sum",
        )

        # Add SNS action to alarm
        alarm.add_alarm_action(cw_actions.SnsAction(topic))

        # Create unban lambda
        lambda_dir_path = os.path.join(os.getcwd(), "ir_cdk_stacks",
                                       "in_aur_01")
        unban_lambda = _lambda.Function(
            self,
            "InAur01ResponseUnbanFunction",
            runtime=_lambda.Runtime.PYTHON_3_8,
            handler="unban_lambda.lambda_handler",
            code=_lambda.Code.from_asset(lambda_dir_path),
        )
        # Assign EC2 permissions to lambda
        unban_lambda.add_to_role_policy(
            iam.PolicyStatement(
                actions=["ec2:DeleteNetworkAclEntry"],
                effect=iam.Effect.ALLOW,
                resources=["*"],
            ))

        # Create step function
        # Define a second state machine to unban the blacklisted IP after 1 hour
        wait_step = sfn.Wait(
            self,
            "InAur01ResponseStepWait",
            time=sfn.WaitTime.duration(core.Duration.hours(1)),
        )
        unban_step = sfn.Task(
            self,
            "InAur01ResponseStepUnban",
            task=tasks.RunLambdaTask(
                unban_lambda,
                integration_pattern=sfn.ServiceIntegrationPattern.
                FIRE_AND_FORGET,
            ),
            parameters={"Payload.$": "$"},
        )

        statemachine = sfn.StateMachine(
            self,
            "InAur01ResponseUnbanStateMachine",
            definition=wait_step.next(unban_step),
            timeout=core.Duration.hours(1.5),
        )

        # Create lambda function
        lambda_func = _lambda.Function(
            self,
            "InAur01ResponseFunction",
            runtime=_lambda.Runtime.PYTHON_3_8,
            handler="response_lambda.lambda_handler",
            code=_lambda.Code.from_asset(lambda_dir_path),
            environment={
                "webhook_url": SLACK_WEBHOOK_URL,
                "unban_sm_arn": statemachine.state_machine_arn,
                "cluster_name": CLUSTER_NAME,
            },
        )

        # AWS CDK has a bug where it would not add the correct permission
        # to the lambda for the CloudWatch log subscription to invoke it.
        # Hence, we need to manually add permission to the lambda.
        lambda_func.add_permission(
            "InAur01ResponseFunctionInvokePermission",
            principal=iam.ServicePrincipal("logs.amazonaws.com"),
            action="lambda:InvokeFunction",
            source_arn=log_group.log_group_arn + ":*",
        )

        # Assign permissions to response lambda
        lambda_func.add_to_role_policy(
            iam.PolicyStatement(
                actions=[
                    "states:StartExecution",
                ],
                effect=iam.Effect.ALLOW,
                resources=[statemachine.state_machine_arn],
            ))

        # Assign RDS read-only permissions to lambda
        lambda_func.add_to_role_policy(
            iam.PolicyStatement(
                actions=["rds:Describe*"],
                effect=iam.Effect.ALLOW,
                resources=["*"],
            ))

        # Assign EC2 permissions to lambda
        lambda_func.add_to_role_policy(
            iam.PolicyStatement(
                actions=[
                    "ec2:Describe*",
                    "ec2:CreateNetworkAclEntry",
                    "ec2:DeleteNetworkAclEntry",
                ],
                effect=iam.Effect.ALLOW,
                resources=["*"],
            ))

        # Assign CloudWatch logs permissions to lambda
        lambda_func.add_to_role_policy(
            iam.PolicyStatement(
                actions=[
                    "cloudwatch:Get*",
                    "cloudwatch:Describe*",
                    "logs:FilterLogEvents",
                    "logs:DescribeMetricFilters",
                ],
                effect=iam.Effect.ALLOW,
                resources=["*"],
            ))

        sns_event_source = lambda_event_sources.SnsEventSource(topic)
        lambda_func.add_event_source(sns_event_source)
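# --- Illustrative only: the unban Lambda source above is loaded from
# --- ir_cdk_stacks/in_aur_01/unban_lambda.py, which is not shown in this
# --- section. Given the ec2:DeleteNetworkAclEntry permission it is granted,
# --- a hypothetical handler might look roughly like this (the field names in
# --- the input event are assumptions):
import boto3

ec2 = boto3.client("ec2")


def lambda_handler(event, context):
    # Remove the deny rule that banned the offending IP, identified by the
    # NACL id and rule number passed along by the response lambda.
    ec2.delete_network_acl_entry(
        NetworkAclId=event["nacl_id"],
        RuleNumber=event["rule_number"],
        Egress=False,
    )
    return {"unbanned": event["nacl_id"], "rule": event["rule_number"]}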