def __init__(self, scope: core.Construct, id: str, config: Dict,
                 vpc: ec2.Vpc, es_sg: ec2.SecurityGroup) -> None:
        super().__init__(scope, id)

        es_config = config['data']['elasticsearch']

        # Build the ES domain construct parameters
        capacity_config = es.CapacityConfig(
            master_node_instance_type=es_config['capacity']['masterNodes']['instanceType'],
            master_nodes=es_config['capacity']['masterNodes']['count'],
            data_node_instance_type=es_config['capacity']['dataNodes']['instanceType'],
            data_nodes=es_config['capacity']['dataNodes']['count'],
        )

        vpc_options = es.VpcOptions(
            security_groups=[es_sg],
            subnets=vpc.select_subnets(
                subnet_group_name=es_config['subnetGroupName']).subnets,
        )

        ebs_options = es.EbsOptions(volume_size=es_config['ebs']['volumeSize'])

        zone_awareness = es.ZoneAwarenessConfig(
            availability_zone_count=es_config['zoneAwareness']['count'],
            enabled=es_config['zoneAwareness']['enabled'],
        )

        logging_options = es.LoggingOptions(
            app_log_enabled=es_config['logging']['appLogEnabled'],
            audit_log_enabled=es_config['logging']['auditLogEnabled'],
            slow_index_log_enabled=es_config['logging']['slowIndexLogEnabled'],
            slow_search_log_enabled=es_config['logging']['slowSearchLogEnabled'],
        )

        access_policy = iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            principals=[iam.AnyPrincipal()],
            actions=['es:*'],
            resources=[
                "arn:aws:es:" + config['awsRegion'] + ":" +
                config['awsAccount'] + ":domain/" + es_config['domainName'] +
                "/*"
            ])

        # Create ES domain
        es.Domain(
            self,
            'Domain',
            domain_name=es_config['domainName'],
            version=es.ElasticsearchVersion.of(es_config['version']),
            capacity=capacity_config,
            ebs=ebs_options,
            zone_awareness=zone_awareness,
            vpc_options=vpc_options,
            logging=logging_options,
            access_policies=[access_policy],
        )
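
# A minimal sketch of the configuration dict this construct expects. The key
# names are taken from the lookups above; all values below are illustrative
# assumptions, not taken from the original project.
example_config = {
    'awsRegion': 'eu-west-1',
    'awsAccount': '111111111111',
    'data': {
        'elasticsearch': {
            'domainName': 'search-domain',
            'version': '7.9',
            'subnetGroupName': 'Private',
            'capacity': {
                'masterNodes': {'instanceType': 'r5.large.elasticsearch', 'count': 3},
                'dataNodes': {'instanceType': 'r5.large.elasticsearch', 'count': 2},
            },
            'ebs': {'volumeSize': 100},
            'zoneAwareness': {'enabled': True, 'count': 2},
            'logging': {
                'appLogEnabled': True,
                'auditLogEnabled': False,
                'slowIndexLogEnabled': True,
                'slowSearchLogEnabled': True,
            },
        }
    }
}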
# Example 2
    def __init__(self, scope: core.Construct, id: str, vpc: ec2.Vpc,
                 region: str) -> None:
        super().__init__(scope, id)

        # create an IAM role to attach to the squid instances
        squid_iam_role = iam.Role(
            self,
            "squid-role",
            assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "CloudWatchAgentServerPolicy"),
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AmazonEC2RoleforSSM")
            ])

        # Add a policy allowing the instances to modify EC2 instance attributes
        squid_iam_role.add_to_policy(
            statement=iam.PolicyStatement(effect=iam.Effect.ALLOW,
                                          actions=[
                                              'ec2:ModifyInstanceAttribute',
                                          ],
                                          resources=['*']))

        # Create bucket to hold Squid config and whitelist files
        squid_config_bucket = s3.Bucket(
            self, "squid-config", encryption=s3.BucketEncryption.KMS_MANAGED)

        # Upload config and whitelist files to the S3 bucket
        s3_deployment.BucketDeployment(
            self,
            "config",
            destination_bucket=squid_config_bucket,
            sources=[
                s3_deployment.Source.asset(
                    path='./squid_app/squid_config_files/config_files_s3')
            ])

        # Allow the EC2 instance role to read and write to the bucket
        squid_config_bucket.grant_read_write(identity=squid_iam_role)

        # Set the AMI to the latest Amazon Linux 2
        amazon_linux_2_ami = ec2.MachineImage.latest_amazon_linux(
            generation=ec2.AmazonLinuxGeneration.AMAZON_LINUX_2,
            edition=ec2.AmazonLinuxEdition.STANDARD,
            virtualization=ec2.AmazonLinuxVirt.HVM,
            storage=ec2.AmazonLinuxStorage.GENERAL_PURPOSE)

        if vpc.public_subnets:
            # Squid ASGs with a desired capacity of 1 instance in each AZ
            self.squid_asgs = []
            for count, az in enumerate(vpc.availability_zones, start=1):
                asg = autoscaling.AutoScalingGroup(
                    self,
                    f"asg-{count}",
                    vpc=vpc,
                    instance_type=ec2.InstanceType("t3.nano"),
                    desired_capacity=1,
                    max_capacity=1,
                    min_capacity=1,
                    machine_image=amazon_linux_2_ami,
                    role=squid_iam_role,
                    vpc_subnets=ec2.SubnetSelection(
                        availability_zones=[az],
                        one_per_az=True,
                        subnet_type=ec2.SubnetType.PUBLIC),
                    health_check=autoscaling.HealthCheck.ec2(
                        grace=core.Duration.minutes(5)),
                    resource_signal_count=1,
                    resource_signal_timeout=core.Duration.minutes(10))

                cfn_asg: autoscaling.CfnAutoScalingGroup = asg.node.default_child
                asg_logical_id = cfn_asg.logical_id

                # User data: parameters required by the user data script
                user_data_mappings = {
                    "__S3BUCKET__": squid_config_bucket.bucket_name,
                    "__ASG__": asg_logical_id,
                    "__CW_ASG__": "${aws:AutoScalingGroupName}"
                }
                # Replace parameters with values in the user data
                with open(
                        "./squid_app/squid_config_files/user_data/squid_user_data.sh",
                        'r') as user_data_h:
                    # Use a substitution
                    user_data_sub = core.Fn.sub(user_data_h.read(),
                                                user_data_mappings)

                # Add User data to Launch Config of the autoscaling group
                asg.add_user_data(user_data_sub)

                # Security group attached to the ASG Squid instances
                # Outbound: all traffic allowed
                # Inbound: allowed from the VPC CIDR on ports 80 and 443

                asg.connections.allow_from(
                    other=ec2.Peer.ipv4(vpc.vpc_cidr_block),
                    port_range=ec2.Port(protocol=ec2.Protocol.TCP,
                                        string_representation="HTTP from VPC",
                                        from_port=80,
                                        to_port=80))

                asg.connections.allow_from(
                    other=ec2.Peer.ipv4(vpc.vpc_cidr_block),
                    port_range=ec2.Port(protocol=ec2.Protocol.TCP,
                                        string_representation="HTTPS from VPC",
                                        from_port=443,
                                        to_port=443))

                # Create an ASG lifecycle hook so a Lambda function can update the
                # route tables when an instance launches and is marked healthy

                autoscaling.LifecycleHook(
                    self,
                    f"asg-hook-{count}",
                    auto_scaling_group=asg,
                    lifecycle_transition=autoscaling.LifecycleTransition.
                    INSTANCE_LAUNCHING,
                    notification_target=hooktargets.TopicHook(
                        sns.Topic(self,
                                  f"squid-asg-{count}-lifecycle-hook-topic",
                                  display_name=
                                  f"Squid ASG {count} Lifecycle Hook topic")),
                    default_result=autoscaling.DefaultResult.ABANDON,
                    heartbeat_timeout=core.Duration.minutes(5))

                # Tag the ASG with the route table IDs used by the isolated and/or private subnets in this AZ.
                # The Squid Lambda function uses this tag to identify which route tables to update when the alarm changes from ALARM to OK.

                private_subnets_in_az = []
                isolated_subnets_in_az = []
                route_table_ids = ''

                if vpc.private_subnets:
                    private_subnets_in_az = vpc.select_subnets(
                        availability_zones=[az],
                        subnet_type=ec2.SubnetType.PRIVATE).subnets
                if vpc.isolated_subnets:
                    isolated_subnets_in_az = vpc.select_subnets(
                        availability_zones=[az],
                        subnet_type=ec2.SubnetType.ISOLATED).subnets

                non_public_subnets_in_az = isolated_subnets_in_az + private_subnets_in_az

                # Loop through all non-public subnets in the AZ to collect their route table IDs into a comma-separated tag value
                for subnet in non_public_subnets_in_az:
                    if route_table_ids:
                        route_table_ids = f"{route_table_ids},{subnet.route_table.route_table_id}"
                    else:
                        route_table_ids = subnet.route_table.route_table_id

                # Tag the ASG with route table ids
                core.Tag.add(asg,
                             key='RouteTableIds',
                             value=route_table_ids,
                             apply_to_launched_instances=False)

                self.squid_asgs.append(asg)

        else:
            raise ValueError("No public subnets in VPC")
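
# A minimal sketch of a VPC this construct can work with. The CDK v1 import
# below is an assumption based on the identifiers used in these snippets; the
# construct requires public subnets for the Squid instances and tags the route
# tables of any private and/or isolated subnets in each AZ.
from aws_cdk import core, aws_ec2 as ec2

def example_squid_vpc(scope: core.Construct) -> ec2.Vpc:
    # Two AZs and /24 subnet masks are illustrative choices.
    return ec2.Vpc(
        scope, 'SquidVpc',
        max_azs=2,
        nat_gateways=0,
        subnet_configuration=[
            ec2.SubnetConfiguration(
                name='public', subnet_type=ec2.SubnetType.PUBLIC, cidr_mask=24),
            ec2.SubnetConfiguration(
                name='isolated', subnet_type=ec2.SubnetType.ISOLATED, cidr_mask=24),
        ])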
# Example 3
class VPCConstruct(core.Construct):
    def __init__(self, scope: core.Construct, id_: str,
                 num_of_azs: int) -> None:
        super().__init__(scope, id_)

        self.audit_vpc = Vpc(
            self,
            id_,
            max_azs=num_of_azs,
            subnet_configuration=[
                # Currently IoT, AppConfig & Cloud Map are not accessible via VPC endpoints, so we use a NAT gateway to access them
                SubnetConfiguration(name=PRIVATE_SUBNET_GROUP,
                                    subnet_type=SubnetType.PRIVATE,
                                    cidr_mask=24),
                SubnetConfiguration(name=PUBLIC_NAT_GWS_SUBNET_GROUP,
                                    subnet_type=SubnetType.PUBLIC,
                                    cidr_mask=24)
            ],
            gateway_endpoints={
                'S3':
                GatewayVpcEndpointOptions(
                    service=GatewayVpcEndpointAwsService.S3,
                    subnets=[
                        SubnetSelection(subnet_group_name=PRIVATE_SUBNET_GROUP)
                    ]),
                'DynamoDb':
                GatewayVpcEndpointOptions(
                    service=GatewayVpcEndpointAwsService.DYNAMODB,
                    subnets=[
                        SubnetSelection(subnet_group_name=PRIVATE_SUBNET_GROUP)
                    ]),
            },
            enable_dns_support=True,  # For the Elasticsearch public domain
            enable_dns_hostnames=True)

        self.audit_vpc.add_interface_endpoint(
            'SsmVpcEndpoint',
            service=InterfaceVpcEndpointAwsService.SSM,
            subnets=SubnetSelection(one_per_az=True))

        self.audit_vpc.add_interface_endpoint(
            'SqsVpcEndpoint',
            service=InterfaceVpcEndpointAwsService.SQS,
            subnets=SubnetSelection(one_per_az=True))
        self.audit_vpc.add_interface_endpoint(
            'Ec2VpcEndpoint',
            service=InterfaceVpcEndpointAwsService.EC2,
            subnets=SubnetSelection(one_per_az=True))

        self.audit_vpc.add_interface_endpoint(
            'LambdaVpcEndpoint',
            service=InterfaceVpcEndpointAwsService.LAMBDA_,
            subnets=SubnetSelection(one_per_az=True))

        self.lambdas_sg = SecurityGroup(self,
                                        id='LambdaSg',
                                        vpc=self.audit_vpc,
                                        security_group_name='Audit-Lambda')

    def _get_subnets(self, subnet_group: str) -> List[ISubnet]:
        return self.audit_vpc.select_subnets(
            subnet_group_name=subnet_group).subnets
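
# A minimal usage sketch for the VPCConstruct defined above (assumption: called
# from within an existing Stack or other Construct; two AZs is an illustrative
# choice).
def example_vpc_construct(scope: core.Construct) -> VPCConstruct:
    return VPCConstruct(scope, 'AuditVpc', num_of_azs=2)

# Example 4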
    def __init__(self, scope: core.Construct, id: str,
                 log_bucket: _s3.Bucket,
                 config_table: _dynamodb.Table,
                 tshirt_size: str,
                 sink_bucket: _s3.Bucket,
                 vpc: _ec2.Vpc,
                 **kwargs) -> None:

        super().__init__(scope, id, **kwargs)

        service_role = _iam.Role(
            self, 'BatchEmrServiceRole',
            assumed_by=_iam.ServicePrincipal('elasticmapreduce.amazonaws.com')
        )

        service_role.add_managed_policy(_iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AmazonElasticMapReduceRole'))

        cluster_role = _iam.Role(
            self, 'BatchEmrClusterRole',
            assumed_by=_iam.ServicePrincipal("ec2.amazonaws.com")
        )

        _iam.Policy(
            self, 'BatchEmrClusterPolicy',
            statements=[
                _iam.PolicyStatement(
                    actions=[
                        "glue:CreateDatabase",
                        "glue:UpdateDatabase",
                        "glue:DeleteDatabase",
                        "glue:GetDatabase",
                        "glue:GetDatabases",
                        "glue:CreateTable",
                        "glue:UpdateTable",
                        "glue:DeleteTable",
                        "glue:GetTable",
                        "glue:GetTables",
                        "glue:GetTableVersions",
                        "glue:CreatePartition",
                        "glue:BatchCreatePartition",
                        "glue:UpdatePartition",
                        "glue:DeletePartition",
                        "glue:BatchDeletePartition",
                        "glue:GetPartition",
                        "glue:GetPartitions",
                        "glue:BatchGetPartition",
                        "glue:CreateUserDefinedFunction",
                        "glue:UpdateUserDefinedFunction",
                        "glue:DeleteUserDefinedFunction",
                        "glue:GetUserDefinedFunction",
                        "glue:GetUserDefinedFunctions",
                        "cloudwatch:PutMetricData",
                        "dynamodb:ListTables",
                        "s3:HeadBucket",
                        "ec2:Describe*",
                    ],
                    resources=['*']
                ),
                _iam.PolicyStatement(
                    actions=['s3:GetObject'],
                    resources=[
                        'arn:aws:s3:::' + ARA_BUCKET_NAME + BINARIES + DataGenConfig.DSDGEN_INSTALL_SCRIPT,
                        'arn:aws:s3:::' + ARA_BUCKET_NAME + BINARIES + DataGenConfig.JAR_FILE
                    ]
                ),
                _iam.PolicyStatement(
                    actions=['s3:PutObject'],
                    resources=[log_bucket.bucket_arn + "/data-generator/*"]
                ),
                _iam.PolicyStatement(
                    actions=[
                        "s3:AbortMultipartUpload",
                        "s3:CreateBucket",
                        "s3:DeleteObject",
                        "s3:GetBucketVersioning",
                        "s3:GetObject",
                        "s3:GetObjectTagging",
                        "s3:GetObjectVersion",
                        "s3:ListBucket",
                        "s3:ListBucketMultipartUploads",
                        "s3:ListBucketVersions",
                        "s3:ListMultipartUploadParts",
                        "s3:PutBucketVersioning",
                        "s3:PutObject",
                        "s3:PutObjectTagging"
                    ],
                    resources=[
                        sink_bucket.bucket_arn + '/*',
                        sink_bucket.bucket_arn

                    ]
                )
            ],
            roles=[cluster_role]
        )

        cluster_role.add_managed_policy(_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonSSMManagedInstanceCore'))

        _iam.CfnInstanceProfile(
            self, 'BatchEmrClusterInstanceProfile',
            roles=[cluster_role.role_name],
            instance_profile_name=cluster_role.role_name
        )

        # Security Groups for the EMR cluster (private subnet)
        # https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html#emr-sg-elasticmapreduce-master-private
        master_sg = _ec2.SecurityGroup(self, 'ElasticMapReduce-Master-Private', vpc=vpc)
        slave_sg = _ec2.SecurityGroup(self, 'ElasticMapReduce-Slave-Private', vpc=vpc)
        service_sg = _ec2.SecurityGroup(self, 'ElasticMapReduce-ServiceAccess', vpc=vpc, allow_all_outbound=False)

        # Service SG used by the proxy instance
        service_sg.add_ingress_rule(master_sg, _ec2.Port.tcp(9443))
        service_sg.add_egress_rule(master_sg, _ec2.Port.tcp(8443))
        service_sg.add_egress_rule(slave_sg, _ec2.Port.tcp(8443))

        # EMR Master
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_icmp())
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_tcp())
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_udp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_icmp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_tcp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_udp())
        master_sg.add_ingress_rule(service_sg, _ec2.Port.tcp(8443))

        # EMR Slave
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_icmp())
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_tcp())
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_udp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_icmp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_tcp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_udp())
        slave_sg.add_ingress_rule(service_sg, _ec2.Port.tcp(8443))

        with open('common/common_cdk/lambda/datagen_config.py', 'r') as f:
            lambda_source = f.read()

        configure_datagen_function = _lambda.SingletonFunction(
            self, 'BatchConfigureDatagenLambda',
            uuid="58a9a222-ff07-11ea-adc1-0242ac120002",
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.Code.inline(lambda_source),
            handler='index.handler',
            function_name='datagen-config',
            environment={
                'TABLE_NAME': config_table.table_name,
                'JAR_LOCATION': BINARIES_LOCATION + DataGenConfig.JAR_FILE,
            },
            timeout=core.Duration.seconds(10)
        )

        configure_datagen_function.role.add_to_policy(
            _iam.PolicyStatement(
                actions=[
                    'dynamodb:GetItem',
                    'dynamodb:PutItem',
                ],
                resources=[config_table.table_arn]
            )
        )

        terminate_cluster = _sfn_tasks.EmrTerminateCluster(
            self, 'BatchDeleteCluster',
            cluster_id=_sfn.TaskInput.from_data_at("$.Emr.Cluster.Id").value,
            integration_pattern=_sfn.IntegrationPattern.RUN_JOB,
        )

        terminate_cluster_error = _sfn_tasks.EmrTerminateCluster(
            self, 'BatchDeleteClusterError',
            cluster_id=_sfn.TaskInput.from_data_at("$.Emr.Cluster.Id").value,
            integration_pattern=_sfn.IntegrationPattern.RUN_JOB,
        ).next(_sfn.Fail(self, 'StepFailure'))

        create_cluster = _sfn_tasks.EmrCreateCluster(
            self, "BatchCreateEMRCluster",
            name="BatchDatagenCluster",
            result_path="$.Emr",
            release_label='emr-5.30.1',
            log_uri=log_bucket.s3_url_for_object() + "/data-generator",
            cluster_role=cluster_role,
            service_role=service_role,
            bootstrap_actions=[
                _sfn_tasks.EmrCreateCluster.BootstrapActionConfigProperty(
                    name="dsdgen-install",
                    script_bootstrap_action=_sfn_tasks.EmrCreateCluster.ScriptBootstrapActionConfigProperty(
                        path=BINARIES_LOCATION + DataGenConfig.DSDGEN_INSTALL_SCRIPT,
                    )
                )
            ],
            applications=[
                _sfn_tasks.EmrCreateCluster.ApplicationConfigProperty(
                    name="spark"
                ),
                _sfn_tasks.EmrCreateCluster.ApplicationConfigProperty(
                    name="hadoop"
                )
            ],
            instances=_sfn_tasks.EmrCreateCluster.InstancesConfigProperty(
                emr_managed_master_security_group=master_sg.security_group_id,
                emr_managed_slave_security_group=slave_sg.security_group_id,
                service_access_security_group=service_sg.security_group_id,
                ec2_subnet_ids=vpc.select_subnets().subnet_ids,
                instance_fleets=[
                    _sfn_tasks.EmrCreateCluster.InstanceFleetConfigProperty(
                        instance_fleet_type=_sfn_tasks.EmrCreateCluster.InstanceRoleType.MASTER,
                        instance_type_configs=[
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5.xlarge',
                                weighted_capacity=1
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5a.xlarge',
                                weighted_capacity=1
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m4.xlarge',
                                weighted_capacity=1
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5d.xlarge',
                                weighted_capacity=1
                            ),
                        ],
                        launch_specifications=_sfn_tasks.EmrCreateCluster.InstanceFleetProvisioningSpecificationsProperty(
                            spot_specification=_sfn_tasks.EmrCreateCluster.SpotProvisioningSpecificationProperty(
                                timeout_action=_sfn_tasks.EmrCreateCluster.SpotTimeoutAction.SWITCH_TO_ON_DEMAND,
                                timeout_duration_minutes=5
                            )
                        ),
                        target_on_demand_capacity=0,
                        target_spot_capacity=1
                    ),
                    _sfn_tasks.EmrCreateCluster.InstanceFleetConfigProperty(
                        instance_fleet_type=_sfn_tasks.EmrCreateCluster.InstanceRoleType.CORE,
                        instance_type_configs=[
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5.xlarge',
                                weighted_capacity=1
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5.2xlarge',
                                weighted_capacity=2
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5a.xlarge',
                                weighted_capacity=1
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5a.2xlarge',
                                weighted_capacity=2
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m4.xlarge',
                                weighted_capacity=1
                            )
                        ],
                        launch_specifications=_sfn_tasks.EmrCreateCluster.InstanceFleetProvisioningSpecificationsProperty(
                            spot_specification=_sfn_tasks.EmrCreateCluster.SpotProvisioningSpecificationProperty(
                                timeout_action=_sfn_tasks.EmrCreateCluster.SpotTimeoutAction.SWITCH_TO_ON_DEMAND,
                                timeout_duration_minutes=5
                            )
                        ),
                        target_on_demand_capacity=0,
                        target_spot_capacity=DataGenConfig.BATCH_CLUSTER_SIZE[tshirt_size]

                    )
                ]
            )
        ).add_catch(handler=terminate_cluster_error, result_path="$.error")

        configure_datagen = _sfn_tasks.LambdaInvoke(
            self, "BatchConfigureDatagenTask",
            lambda_function=configure_datagen_function,
            payload=_sfn.TaskInput.from_text('{'
                                             '"Param": "batch_iterator",'
                                             '"Module": "batch",'
                                             '"SinkBucket": "'+sink_bucket.s3_url_for_object()+'",'
                                             '"Parallelism": "'+str(int(DataGenConfig.BATCH_DATA_SIZE[tshirt_size])*2)+'",'
                                             '"DataSize": "'+DataGenConfig.BATCH_DATA_SIZE[tshirt_size]+'",'
                                             '"TmpBucket": "fake-bucket"'
                                             '}'),
            result_path='$.Config'
        ).add_catch(handler=terminate_cluster_error, result_path="$.error")
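
        # Note: DataGenConfig is imported from elsewhere in the original project
        # and is not shown here. Based on how it is used above, it is assumed to
        # expose constants roughly like the hypothetical sketch below (values are
        # illustrative only):
        #   DSDGEN_INSTALL_SCRIPT = '...'   # S3 key of the dsdgen bootstrap script
        #   JAR_FILE = '...'                # S3 key of the data generator jar
        #   BATCH_DATA_SIZE = {'small': '100', 'large': '1000'}   # data size per t-shirt size
        #   BATCH_CLUSTER_SIZE = {'small': 1, 'large': 5}         # target spot capacity per t-shirt size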

        add_datagen_step = _sfn.CustomState(
            self, 'BatchAddDataGenStep',
            state_json={
                "Type": "Task",
                "Resource": "arn:aws:states:::elasticmapreduce:addStep.sync",
                "Parameters": {
                    "ClusterId.$": "$.Emr.Cluster.Id",
                    "Step": {
                        "Name": "DatagenStep",
                        "ActionOnFailure": "CONTINUE",
                        "HadoopJarStep": {
                            "Jar": "command-runner.jar",
                            "Args.$": "$.Config.Payload.StepParam"
                        }
                    }
                },
                "ResultPath": "$.Step",
                "Next": "BatchUpdateIterator",
                "Catch": [
                    {
                        "ErrorEquals": ["States.ALL"],
                        "Next": "BatchDeleteClusterError",
                        "ResultPath": "$.error"
                    }
                ]
            }
        )

        update_iterator = _sfn_tasks.DynamoUpdateItem(
            self, 'BatchUpdateIterator',
            table=config_table,
            key={
                'param': _sfn_tasks.DynamoAttributeValue.from_string('batch_iterator')
            },
            update_expression='SET iterator = if_not_exists(iterator, :start) + :inc',
            expression_attribute_values={
                ":inc": _sfn_tasks.DynamoAttributeValue.from_number(1),
                ":start": _sfn_tasks.DynamoAttributeValue.from_number(0)
            },
            result_path=_sfn.JsonPath.DISCARD
        )

        definition = configure_datagen \
            .next(create_cluster) \
            .next(add_datagen_step) \
            .next(update_iterator) \
            .next(terminate_cluster)

        datagen_stepfunctions = _sfn.StateMachine(
            self, "BatchDataGenStepFunctions",
            definition=definition,
            timeout=core.Duration.minutes(30)
        )

        datagen_stepfunctions.add_to_role_policy(
            _iam.PolicyStatement(
                actions=[
                    'elasticmapreduce:AddJobFlowSteps',
                    'elasticmapreduce:DescribeStep'
                ],
                resources=['*']
            )
        )
        datagen_stepfunctions.add_to_role_policy(
            _iam.PolicyStatement(
                actions= [
                    "iam:CreateServiceLinkedRole",
                    "iam:PutRolePolicy"
                ],
                resources=["arn:aws:iam::*:role/aws-service-role/elasticmapreduce.amazonaws.com*/AWSServiceRoleForEMRCleanup*"],
                conditions= {
                    "StringLike": {
                        "iam:AWSServiceName": [
                            "elasticmapreduce.amazonaws.com",
                            "elasticmapreduce.amazonaws.com.cn"
                        ]
                    }
                }
            )
        )

        step_trigger = _events.Rule(
            self, 'BatchSteptrigger',
            schedule=_events.Schedule.cron(minute='0/30',
                                           hour='*',
                                           month='*',
                                           week_day='*',
                                           year='*')
        )

        step_trigger.add_target(_events_targets.SfnStateMachine(machine=datagen_stepfunctions))

        with open('common/common_cdk/lambda/stepfunctions_trigger.py', 'r') as f:
            lambda_source = f.read()

        stepfunctions_trigger_lambda = _lambda.SingletonFunction(
            self, 'BatchStepFunctionsTriggerLambda',
            uuid="9597f6f2-f840-11ea-adc1-0242ac120002",
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.Code.inline(lambda_source),
            handler='index.handler',
            function_name='stepfunctions-batch-datagen-trigger'
        )

        stepfunctions_trigger_lambda.role.add_to_policy(
            _iam.PolicyStatement(
                actions=["states:StartExecution"],
                resources=['*']
            )
        )

        trigger_step_lambda_provider = _custom_resources.Provider(
            self, 'StepFunctionsTriggerLambdaProvider',
            on_event_handler=stepfunctions_trigger_lambda
        )

        core.CustomResource(
            self, 'StepFunctionsTrigger',
            service_token=trigger_step_lambda_provider.service_token,
            properties={
                "stepArn": datagen_stepfunctions.state_machine_arn
            }
        )

        # Custom resource backed by a Lambda that can terminate EMR clusters and stop Step Functions executions
        with open('common/common_cdk/lambda/stepfunctions_terminate_emr.py', 'r') as f:
            lambda_source = f.read()

        sfn_terminate = _lambda.SingletonFunction(
            self, 'StepFuncTerminateBatch',
            uuid='58a9a422-ff07-11ea-adc1-0242ac120002',
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.Code.inline(lambda_source),
            handler='index.handler',
            timeout=core.Duration.minutes(5)
        )

        sfn_terminate.role.add_to_policy(
            _iam.PolicyStatement(
                actions=[
                    'elasticmapreduce:ListClusters',
                    'elasticmapreduce:TerminateJobFlows',
                    'states:ListStateMachines',
                    'states:ListExecutions',
                    'states:StopExecution'
                ],
                resources=['*']
            )
        )

        sfn_terminate_provider = _custom_resources.Provider(
            self, 'StepFuncTerminateBatchLambdaProvider',
            on_event_handler=sfn_terminate
        )

        core.CustomResource(
            self, 'StepFuncTerminateBatchCustomResource',
            service_token=sfn_terminate_provider.service_token,
            properties={
                "state_machine": 'BatchDatagen'
            })
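
# A minimal usage sketch for the construct above (the class name is not shown in
# this snippet, so `BatchDataGenerator` is a placeholder; the keyword arguments
# mirror the __init__ signature, and the variable values are assumptions).
# BatchDataGenerator(
#     self, 'BatchDataGen',
#     log_bucket=logs_bucket,
#     config_table=datagen_config_table,
#     tshirt_size='small',
#     sink_bucket=raw_bucket,
#     vpc=data_lake_vpc)

# Example 5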
class DataLakeFoundations(NestedStack):

    @property
    def raw_s3_bucket(self):
        return self.__raw_s3_bucket

    @property
    def clean_s3_bucket(self):
        return self.__clean_s3_bucket

    @property
    def curated_s3_bucket(self):
        return self.__curated_s3_bucket

    @property
    def raw_glue_db(self):
        return self.__raw_glue_db

    @property
    def clean_glue_db(self):
        return self.__clean_glue_db

    @property
    def curated_glue_db(self):
        return self.__curated_glue_db

    @property
    def audit_glue_db(self):
        return self.__audit_glue_db

    @property
    def logs_s3_bucket(self):
        return self.__logs_s3_bucket

    @property
    def vpc(self):
        return self.__vpc

    @property
    def private_subnets_selection(self):
        return self.__private_subnets

    @property
    def public_subnets_selection(self):
        return self.__public_subnets

    @property
    def admin_group(self):
        return self.__admin_group

    @property
    def analysts_group(self):
        return self.__analysts_group

    @property
    def developers_group(self):
        return self.__developers_group

    def __init__(self, scope: Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        # implement the glue data catalog databases used in the data lake
        catalog = DataLakeCatalog(self, 'DataLakeCatalog')
        self.__raw_glue_db = catalog.raw_database
        self.__clean_glue_db = catalog.clean_database
        self.__curated_glue_db = catalog.transform_database
        self.__audit_glue_db = Database(self, 'AuditGlueDB', database_name='ara_audit_data_' + self.account)


        # implement the S3 buckets for the data lake
        storage = DataLakeStorage(self, 'DataLakeStorage')
        self.__logs_s3_bucket = AutoEmptyBucket(
            self, 'Logs',
            bucket_name='ara-logs-' + self.account,
            uuid=AutoEmptyConfig.FOUNDATIONS_UUID
        ).bucket

        self.__raw_s3_bucket = storage.raw_bucket
        self.__clean_s3_bucket = storage.clean_bucket
        self.__curated_s3_bucket = storage.transform_bucket

        AuditTrailGlue(self, 'GlueAudit',
            log_bucket=self.__logs_s3_bucket,
            audit_bucket=self.__curated_s3_bucket,
            audit_db=self.__audit_glue_db,
            audit_table=self.__curated_s3_bucket.bucket_name
        )

        # the VPC used for the overall data lake (same VPC, different subnets for the different modules)
        self.__vpc = Vpc(self, 'Vpc')
        self.__public_subnets = self.__vpc.select_subnets(subnet_type=SubnetType.PUBLIC)
        self.__private_subnets = self.__vpc.select_subnets(subnet_type=SubnetType.PRIVATE)
        self.__vpc.add_gateway_endpoint("S3GatewayEndpoint",
                                        service=GatewayVpcEndpointAwsService.S3,
                                        subnets=[SubnetSelection(subnet_type=SubnetType.PUBLIC),
                                                 SubnetSelection(subnet_type=SubnetType.PRIVATE)])

        # IAM groups
        self.__admin_group = Group(self, 'GroupAdmins', group_name='ara-admins')
        self.__analysts_group = Group(self, 'GroupAnalysts', group_name='ara-analysts')
        self.__developers_group = Group(self, 'GroupDevelopers', group_name='ara-developers')
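
# A minimal usage sketch (assumption: instantiated from a parent Stack in the
# same CDK app; the nested stack then exposes its resources through the
# read-only properties defined above).
# foundations = DataLakeFoundations(self, 'Foundations')
# raw_bucket = foundations.raw_s3_bucket
# private_subnets = foundations.private_subnets_selection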