def __init__(self, scope: core.Construct, id: str,
                 log_bucket: _s3.Bucket,
                 config_table: _dynamodb.Table,
                 tshirt_size: str,
                 sink_bucket: _s3.Bucket,
                 vpc: _ec2.Vpc,
                 **kwargs) -> None:

        super().__init__(scope, id, **kwargs)

        service_role = _iam.Role(
            self, 'BatchEmrServiceRole',
            assumed_by=_iam.ServicePrincipal('elasticmapreduce.amazonaws.com')
        )

        service_role.add_managed_policy(_iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AmazonElasticMapReduceRole'))

        cluster_role = _iam.Role(
            self, 'BatchEmrClusterRole',
            assumed_by=_iam.ServicePrincipal("ec2.amazonaws.com")
        )

        _iam.Policy(
            self, 'BatchEmrClusterPolicy',
            statements=[
                _iam.PolicyStatement(
                    actions=[
                        "glue:CreateDatabase",
                        "glue:UpdateDatabase",
                        "glue:DeleteDatabase",
                        "glue:GetDatabase",
                        "glue:GetDatabases",
                        "glue:CreateTable",
                        "glue:UpdateTable",
                        "glue:DeleteTable",
                        "glue:GetTable",
                        "glue:GetTables",
                        "glue:GetTableVersions",
                        "glue:CreatePartition",
                        "glue:BatchCreatePartition",
                        "glue:UpdatePartition",
                        "glue:DeletePartition",
                        "glue:BatchDeletePartition",
                        "glue:GetPartition",
                        "glue:GetPartitions",
                        "glue:BatchGetPartition",
                        "glue:CreateUserDefinedFunction",
                        "glue:UpdateUserDefinedFunction",
                        "glue:DeleteUserDefinedFunction",
                        "glue:GetUserDefinedFunction",
                        "glue:GetUserDefinedFunctions",
                        "cloudwatch:PutMetricData",
                        "dynamodb:ListTables",
                        "s3:HeadBucket",
                        "ec2:Describe*",
                    ],
                    resources=['*']
                ),
                _iam.PolicyStatement(
                    actions=['s3:GetObject'],
                    resources=[
                        'arn:aws:s3:::' + ARA_BUCKET_NAME + BINARIES + DataGenConfig.DSDGEN_INSTALL_SCRIPT,
                        'arn:aws:s3:::' + ARA_BUCKET_NAME + BINARIES + DataGenConfig.JAR_FILE
                    ]
                ),
                _iam.PolicyStatement(
                    actions=['s3:PutObject'],
                    resources=[log_bucket.bucket_arn + "/data-generator/*"]
                ),
                _iam.PolicyStatement(
                    actions=[
                        "s3:AbortMultipartUpload",
                        "s3:CreateBucket",
                        "s3:DeleteObject",
                        "s3:GetBucketVersioning",
                        "s3:GetObject",
                        "s3:GetObjectTagging",
                        "s3:GetObjectVersion",
                        "s3:ListBucket",
                        "s3:ListBucketMultipartUploads",
                        "s3:ListBucketVersions",
                        "s3:ListMultipartUploadParts",
                        "s3:PutBucketVersioning",
                        "s3:PutObject",
                        "s3:PutObjectTagging"
                    ],
                    resources=[
                        sink_bucket.bucket_arn + '/*',
                        sink_bucket.bucket_arn

                    ]
                )
            ],
            roles=[cluster_role]
        )

        cluster_role.add_managed_policy(_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonSSMManagedInstanceCore'))

        _iam.CfnInstanceProfile(
            self, 'BatchEmrClusterInstanceProfile',
            roles=[cluster_role.role_name],
            instance_profile_name=cluster_role.role_name
        )

        # Security Groups for the EMR cluster (private subnet)
        # https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html#emr-sg-elasticmapreduce-master-private
        master_sg = _ec2.SecurityGroup(self, 'ElasticMapReduce-Master-Private', vpc=vpc)
        slave_sg = _ec2.SecurityGroup(self, 'ElasticMapReduce-Slave-Private', vpc=vpc)
        service_sg = _ec2.SecurityGroup(self, 'ElasticMapReduce-ServiceAccess', vpc=vpc, allow_all_outbound=False)

        # Service SG used by the proxy instance
        service_sg.add_ingress_rule(master_sg, _ec2.Port.tcp(9443))
        service_sg.add_egress_rule(master_sg, _ec2.Port.tcp(8443))
        service_sg.add_egress_rule(slave_sg, _ec2.Port.tcp(8443))

        # EMR Master
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_icmp())
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_tcp())
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_udp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_icmp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_tcp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_udp())
        master_sg.add_ingress_rule(service_sg, _ec2.Port.tcp(8443))

        # EMR Slave
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_icmp())
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_tcp())
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_udp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_icmp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_tcp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_udp())
        slave_sg.add_ingress_rule(service_sg, _ec2.Port.tcp(8443))

        with open('common/common_cdk/lambda/datagen_config.py', 'r') as f:
            lambda_source = f.read()

        configure_datagen_function = _lambda.SingletonFunction(
            self, 'BatchConfigureDatagenLambda',
            uuid="58a9a222-ff07-11ea-adc1-0242ac120002",
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.Code.inline(lambda_source),
            handler='index.handler',
            function_name='datagen-config',
            environment={
                'TABLE_NAME': config_table.table_name,
                'JAR_LOCATION': BINARIES_LOCATION + DataGenConfig.JAR_FILE,
            },
            timeout=core.Duration.seconds(10)
        )

        configure_datagen_function.role.add_to_policy(
            _iam.PolicyStatement(
                actions=[
                    'dynamodb:GetItem',
                    'dynamodb:PutItem',
                ],
                resources=[config_table.table_arn]
            )
        )

        terminate_cluster = _sfn_tasks.EmrTerminateCluster(
            self, 'BatchDeleteCluster',
            cluster_id=_sfn.TaskInput.from_data_at("$.Emr.Cluster.Id").value,
            integration_pattern=_sfn.IntegrationPattern.RUN_JOB,
        )

        terminate_cluster_error = _sfn_tasks.EmrTerminateCluster(
            self, 'BatchDeleteClusterError',
            cluster_id=_sfn.TaskInput.from_data_at("$.Emr.Cluster.Id").value,
            integration_pattern=_sfn.IntegrationPattern.RUN_JOB,
        ).next(_sfn.Fail(self, 'StepFailure'))

        create_cluster = _sfn_tasks.EmrCreateCluster(
            self, "BatchCreateEMRCluster",
            name="BatchDatagenCluster",
            result_path="$.Emr",
            release_label='emr-5.30.1',
            log_uri=log_bucket.s3_url_for_object() + "/data-generator",
            cluster_role=cluster_role,
            service_role=service_role,
            bootstrap_actions=[
                _sfn_tasks.EmrCreateCluster.BootstrapActionConfigProperty(
                    name="dsdgen-install",
                    script_bootstrap_action=_sfn_tasks.EmrCreateCluster.ScriptBootstrapActionConfigProperty(
                        path=BINARIES_LOCATION + DataGenConfig.DSDGEN_INSTALL_SCRIPT,
                    )
                )
            ],
            applications=[
                _sfn_tasks.EmrCreateCluster.ApplicationConfigProperty(
                    name="spark"
                ),
                _sfn_tasks.EmrCreateCluster.ApplicationConfigProperty(
                    name="hadoop"
                )
            ],
            instances=_sfn_tasks.EmrCreateCluster.InstancesConfigProperty(
                emr_managed_master_security_group=master_sg.security_group_id,
                emr_managed_slave_security_group=slave_sg.security_group_id,
                service_access_security_group=service_sg.security_group_id,
                ec2_subnet_ids=vpc.select_subnets().subnet_ids,
                instance_fleets=[
                    _sfn_tasks.EmrCreateCluster.InstanceFleetConfigProperty(
                        instance_fleet_type=_sfn_tasks.EmrCreateCluster.InstanceRoleType.MASTER,
                        instance_type_configs=[
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5.xlarge',
                                weighted_capacity=1
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5a.xlarge',
                                weighted_capacity=1
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m4.xlarge',
                                weighted_capacity=1
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5d.xlarge',
                                weighted_capacity=1
                            ),
                        ],
                        launch_specifications=_sfn_tasks.EmrCreateCluster.InstanceFleetProvisioningSpecificationsProperty(
                            spot_specification=_sfn_tasks.EmrCreateCluster.SpotProvisioningSpecificationProperty(
                                timeout_action=_sfn_tasks.EmrCreateCluster.SpotTimeoutAction.SWITCH_TO_ON_DEMAND,
                                timeout_duration_minutes=5
                            )
                        ),
                        target_on_demand_capacity=0,
                        target_spot_capacity=1
                    ),
                    _sfn_tasks.EmrCreateCluster.InstanceFleetConfigProperty(
                        instance_fleet_type=_sfn_tasks.EmrCreateCluster.InstanceRoleType.CORE,
                        instance_type_configs=[
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5.xlarge',
                                weighted_capacity=1
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5.2xlarge',
                                weighted_capacity=2
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5a.xlarge',
                                weighted_capacity=1
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m5a.2xlarge',
                                weighted_capacity=2
                            ),
                            _sfn_tasks.EmrCreateCluster.InstanceTypeConfigProperty(
                                instance_type='m4.xlarge',
                                weighted_capacity=1
                            )
                        ],
                        launch_specifications=_sfn_tasks.EmrCreateCluster.InstanceFleetProvisioningSpecificationsProperty(
                            spot_specification=_sfn_tasks.EmrCreateCluster.SpotProvisioningSpecificationProperty(
                                timeout_action=_sfn_tasks.EmrCreateCluster.SpotTimeoutAction.SWITCH_TO_ON_DEMAND,
                                timeout_duration_minutes=5
                            )
                        ),
                        target_on_demand_capacity=0,
                        target_spot_capacity=DataGenConfig.BATCH_CLUSTER_SIZE[tshirt_size]

                    )
                ]
            )
        ).add_catch(handler=terminate_cluster_error, result_path="$.error")

        configure_datagen = _sfn_tasks.LambdaInvoke(
            self, "BatchConfigureDatagenTask",
            lambda_function=configure_datagen_function,
            payload=_sfn.TaskInput.from_text('{'
                                             '"Param": "batch_iterator",'
                                             '"Module": "batch",'
                                             '"SinkBucket": "'+sink_bucket.s3_url_for_object()+'",'
                                             '"Parallelism": "'+str(int(DataGenConfig.BATCH_DATA_SIZE[tshirt_size])*2)+'",'
                                             '"DataSize": "'+DataGenConfig.BATCH_DATA_SIZE[tshirt_size]+'",'
                                             '"TmpBucket": "fake-bucket"'
                                             '}'),
            result_path='$.Config'
        ).add_catch(handler=terminate_cluster_error, result_path="$.error")

        add_datagen_step = _sfn.CustomState(
            self, 'BatchAddDataGenStep',
            state_json={
                "Type": "Task",
                "Resource": "arn:aws:states:::elasticmapreduce:addStep.sync",
                "Parameters": {
                    "ClusterId.$": "$.Emr.Cluster.Id",
                    "Step": {
                        "Name": "DatagenStep",
                        "ActionOnFailure": "CONTINUE",
                        "HadoopJarStep": {
                            "Jar": "command-runner.jar",
                            "Args.$": "$.Config.Payload.StepParam"
                        }
                    }
                },
                "ResultPath": "$.Step",
                "Next": "BatchUpdateIterator",
                "Catch": [
                    {
                        "ErrorEquals": ["States.ALL"],
                        "Next": "BatchDeleteClusterError",
                        "ResultPath": "$.error"
                    }
                ]
            }
        )

        update_iterator = _sfn_tasks.DynamoUpdateItem(
            self, 'BatchUpdateIterator',
            table=config_table,
            key={
                'param': _sfn_tasks.DynamoAttributeValue.from_string('batch_iterator')
            },
            update_expression='SET iterator = if_not_exists(iterator, :start) + :inc',
            expression_attribute_values={
                ":inc": _sfn_tasks.DynamoAttributeValue.from_number(1),
                ":start": _sfn_tasks.DynamoAttributeValue.from_number(0)
            },
            result_path=_sfn.JsonPath.DISCARD
        )

        definition = configure_datagen \
            .next(create_cluster) \
            .next(add_datagen_step) \
            .next(update_iterator) \
            .next(terminate_cluster)

        datagen_stepfunctions = _sfn.StateMachine(
            self, "BatchDataGenStepFunctions",
            definition=definition,
            timeout=core.Duration.minutes(30)
        )

        datagen_stepfunctions.add_to_role_policy(
            _iam.PolicyStatement(
                actions=[
                    'elasticmapreduce:AddJobFlowSteps',
                    'elasticmapreduce:DescribeStep'
                ],
                resources=['*']
            )
        )
        datagen_stepfunctions.add_to_role_policy(
            _iam.PolicyStatement(
                actions= [
                    "iam:CreateServiceLinkedRole",
                    "iam:PutRolePolicy"
                ],
                resources=["arn:aws:iam::*:role/aws-service-role/elasticmapreduce.amazonaws.com*/AWSServiceRoleForEMRCleanup*"],
                conditions= {
                    "StringLike": {
                        "iam:AWSServiceName": [
                            "elasticmapreduce.amazonaws.com",
                            "elasticmapreduce.amazonaws.com.cn"
                        ]
                    }
                }
            )
        )

        step_trigger = _events.Rule(
            self, 'BatchSteptrigger',
            schedule=_events.Schedule.cron(minute='0/30',
                                           hour='*',
                                           month='*',
                                           week_day='*',
                                           year='*')
        )

        step_trigger.add_target(_events_targets.SfnStateMachine(machine=datagen_stepfunctions))

        with open('common/common_cdk/lambda/stepfunctions_trigger.py', 'r') as f:
            lambda_source = f.read()

        stepfunctions_trigger_lambda = _lambda.SingletonFunction(
            self, 'BatchStepFunctionsTriggerLambda',
            uuid="9597f6f2-f840-11ea-adc1-0242ac120002",
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.Code.inline(lambda_source),
            handler='index.handler',
            function_name='stepfunctions-batch-datagen-trigger'
        )

        stepfunctions_trigger_lambda.role.add_to_policy(
            _iam.PolicyStatement(
                actions=["states:StartExecution"],
                resources=['*']
            )
        )

        trigger_step_lambda_provider = _custom_resources.Provider(
            self, 'StepFunctionsTriggerLambdaProvider',
            on_event_handler=stepfunctions_trigger_lambda
        )

        core.CustomResource(
            self, 'StepFunctionsTrigger',
            service_token=trigger_step_lambda_provider.service_token,
            properties={
                "stepArn": datagen_stepfunctions.state_machine_arn
            }
        )

        # terminate clusters
        with open('common/common_cdk/lambda/stepfunctions_terminate_emr.py', 'r') as f:
            lambda_source = f.read()

        sfn_terminate = _lambda.SingletonFunction(
            self, 'StepFuncTerminateBatch',
            uuid='58a9a422-ff07-11ea-adc1-0242ac120002',
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.Code.inline(lambda_source),
            handler='index.handler',
            timeout=core.Duration.minutes(5)
        )

        sfn_terminate.role.add_to_policy(
            _iam.PolicyStatement(
                actions=[
                    'elasticmapreduce:ListClusters',
                    'elasticmapreduce:TerminateJobFlows',
                    'states:ListStateMachines',
                    'states:ListExecutions',
                    'states:StopExecution'
                ],
                resources=['*']
            )
        )

        sfn_terminate_provider = _custom_resources.Provider(
            self, 'StepFuncTerminateBatchLambdaProvider',
            on_event_handler=sfn_terminate
        )

        core.CustomResource(
            self, 'StepFuncTerminateBatchCustomResource',
            service_token=sfn_terminate_provider.service_token,
            properties={
                "state_machine": 'BatchDatagen'
            })
Example #2
0
    def __init__(self, scope: core.Construct, id: str, es_domain: CfnDomain, kda_role: iam.Role,
                 source_bucket: s3.Bucket, dest_bucket: s3.Bucket, **kwargs):
        super().__init__(scope, id, **kwargs)

        stack = Stack.of(self)

        kda_role.add_to_policy(PolicyStatement(actions=['cloudwatch:PutMetricData'],
                                               resources=['*']))

        artifacts_bucket_arn = 'arn:aws:s3:::' + _config.ARA_BUCKET.replace("s3://", "")
        kda_role.add_to_policy(PolicyStatement(actions=['s3:GetObject', 's3:GetObjectVersion'],
                                               resources=[artifacts_bucket_arn, artifacts_bucket_arn + '/binaries/*']))
        log_group = logs.LogGroup(scope=self,
                                  id='KdaLogGroup',
                                  retention=logs.RetentionDays.ONE_WEEK,
                                  removal_policy=RemovalPolicy.DESTROY)

        log_stream = logs.LogStream(scope=self,
                                    id='KdaLogStream',
                                    log_group=log_group,
                                    removal_policy=RemovalPolicy.DESTROY)

        log_stream_arn = stack.format_arn(service='logs',
                                          resource='log-group',
                                          resource_name=log_group.log_group_name + ':log-stream:' +
                                                        log_stream.log_stream_name,
                                          sep=':')

        # TODO: restrict
        kda_role.add_to_policy(PolicyStatement(actions=['logs:*'],
                                               resources=[stack.format_arn(service='logs', resource='*')]))

        kda_role.add_to_policy(PolicyStatement(actions=['logs:DescribeLogStreams', 'logs:DescribeLogGroups'],
                                               resources=[log_group.log_group_arn,
                                                          stack.format_arn(service='logs', resource='log-group',
                                                                           resource_name='*')]))

        kda_role.add_to_policy(PolicyStatement(actions=['logs:PutLogEvents'],
                                               resources=[log_stream_arn]))

        kda_role.add_to_policy(PolicyStatement(actions=['es:ESHttp*'],
                                               resources=[stack.format_arn(service='es', resource='domain',
                                                                           resource_name=es_domain.domain_name + '/*')]))

        # TODO: restrict
        kda_role.add_to_policy(PolicyStatement(actions=['s3:*'],
                                               resources=['arn:aws:s3::::*']))

        # Define delivery stream
        # delivery_stream_name = 'clean_delivery_stream'
        #
        # s3_configuration = {
        #     'bucketArn': '',
        #     'compressionFormat': 'Snappy',
        #     'dataFormatConversionConfiguration': {
        #         'enabled': True,
        #         'inputFormatConfiguration': {'deserializer': },
        #         'outputFormatConfiguration': {'serializer': {'parquetSerDe': }},
        #         'schemaConfiguration': {}
        #     },
        #     'prefix': 'streaming'
        # }
        #
        # delivery_stream = CfnDeliveryStream(scope=self,
        #                                     id='Firehose Delivery Stream',
        #                                     delivery_stream_name=delivery_stream_name,
        #                                     delivery_stream_type='DirectPut',
        #                                     extended_s3_destination_configuration=s3_configuration
        #                                     )

        # Define KDA application
        application_configuration = {
            'environmentProperties': {
                'propertyGroups': [
                    {
                        'propertyGroupId': 'ConsumerConfigProperties',
                        'propertyMap': {
                            'CustomerStream': scope.customer_stream.stream_name,
                            'AddressStream': scope.address_stream.stream_name,
                            'SaleStream': scope.sale_stream.stream_name,
                            'PromoDataPath': source_bucket.s3_url_for_object('promo'),
                            'ItemDataPath': source_bucket.s3_url_for_object('item'),
                            'aws.region': scope.region
                        }
                    },
                    {
                        'propertyGroupId': 'ProducerConfigProperties',
                        'propertyMap': {
                            'ElasticsearchHost': 'https://' + es_domain.attr_domain_endpoint + ':443',
                            'Region': scope.region,
                            'DenormalizedSalesS3Path': dest_bucket.s3_url_for_object() + '/',
                            'IndexName': 'ara-write'
                        }
                    }
                ]
            },
            'applicationCodeConfiguration': {
                'codeContent': {
                    's3ContentLocation': {
                        'bucketArn': artifacts_bucket_arn,
                        'fileKey': 'binaries/stream-processing-1.1.jar'
                    }
                },
                'codeContentType': 'ZIPFILE'
            },
            'flinkApplicationConfiguration': {
                'parallelismConfiguration': {
                    'configurationType': 'DEFAULT'
                },
                'checkpointConfiguration': {
                    'configurationType': 'DEFAULT'
                },
                'monitoringConfiguration': {
                    'logLevel': 'DEBUG',
                    'metricsLevel': 'TASK',
                    'configurationType': 'CUSTOM'
                }
            },
            'applicationSnapshotConfiguration': {
                'snapshotsEnabled': False
            }
        }

        self.__app = CfnApplicationV2(scope=self,
                                      id='KDA application',
                                      runtime_environment='FLINK-1_11',
                                      application_name='KDA-application',
                                      service_execution_role=kda_role.role_arn,
                                      application_configuration=application_configuration)

        logging = CfnApplicationCloudWatchLoggingOptionV2(scope=self, id='KDA application logging',
                                                          application_name=self.__app.ref,
                                                          cloud_watch_logging_option={'logStreamArn': log_stream_arn})

        logging.apply_removal_policy(policy=RemovalPolicy.RETAIN, apply_to_update_replace_policy=True,
                                     default=RemovalPolicy.RETAIN)

        # Use a custom resource to start the application
        create_params = {
            'ApplicationName': self.__app.ref,
            'RunConfiguration': {
                'ApplicationRestoreConfiguration': {
                    'ApplicationRestoreType': 'SKIP_RESTORE_FROM_SNAPSHOT'
                },
                'FlinkRunConfiguration': {
                    'AllowNonRestoredState': True
                }
            }
        }

        # See https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/ for service name, actions and parameters
        create_action = AwsSdkCall(service='KinesisAnalyticsV2',
                                   action='startApplication',
                                   parameters=create_params,
                                   physical_resource_id=PhysicalResourceId.of(self.__app.ref + '-start'))

        delete_action = AwsSdkCall(service='KinesisAnalyticsV2',
                                   action='stopApplication',
                                   parameters={'ApplicationName': self.__app.ref, 'Force': True})

        custom_resource = AwsCustomResource(scope=self,
                                            id='KdaStartAndStop',
                                            on_create=create_action,
                                            on_delete=delete_action,
                                            policy=AwsCustomResourcePolicy.from_statements([PolicyStatement(
                                                actions=['kinesisanalytics:StartApplication',
                                                         'kinesisanalytics:StopApplication',
                                                         'kinesisanalytics:DescribeApplication',
                                                         'kinesisanalytics:UpdateApplication'], resources=[
                                                    stack.format_arn(service='kinesisanalytics', resource='application',
                                                                     resource_name=self.app.application_name)])]))

        custom_resource.node.add_dependency(self.app)
Example #3
0
    def __init__(self, scope: core.Construct, id: str, log_bucket: _s3.Bucket,
                 config_table: _dynamodb.Table, tshirt_size: str,
                 sink_bucket: _s3.Bucket, web_sale_stream: str,
                 web_customer_stream: str, web_customer_address_stream: str,
                 kinesis_key: _kms.Key, vpc: _ec2.Vpc, **kwargs) -> None:

        super().__init__(scope, id, **kwargs)

        stack = core.Stack.of(self)

        stream_source_bucket = AutoEmptyBucket(
            self,
            'StreamSource',
            bucket_name='ara-stream-source-' + core.Aws.ACCOUNT_ID,
            uuid='95505f50-0276-11eb-adc1-0242ac120002')

        service_role = _iam.Role(
            self,
            'StreamEmrServiceRole',
            assumed_by=_iam.ServicePrincipal('elasticmapreduce.amazonaws.com'))

        service_role.add_managed_policy(
            _iam.ManagedPolicy.from_aws_managed_policy_name(
                'service-role/AmazonElasticMapReduceRole'))

        cluster_role = _iam.Role(
            self,
            'StreamEmrClusterRole',
            assumed_by=_iam.ServicePrincipal("ec2.amazonaws.com"))

        _iam.Policy(
            self,
            'StreamEmrClusterPolicy',
            statements=[
                _iam.PolicyStatement(actions=[
                    "glue:CreateDatabase",
                    "glue:UpdateDatabase",
                    "glue:DeleteDatabase",
                    "glue:GetDatabase",
                    "glue:GetDatabases",
                    "glue:CreateTable",
                    "glue:UpdateTable",
                    "glue:DeleteTable",
                    "glue:GetTable",
                    "glue:GetTables",
                    "glue:GetTableVersions",
                    "glue:CreatePartition",
                    "glue:BatchCreatePartition",
                    "glue:UpdatePartition",
                    "glue:DeletePartition",
                    "glue:BatchDeletePartition",
                    "glue:GetPartition",
                    "glue:GetPartitions",
                    "glue:BatchGetPartition",
                    "glue:CreateUserDefinedFunction",
                    "glue:UpdateUserDefinedFunction",
                    "glue:DeleteUserDefinedFunction",
                    "glue:GetUserDefinedFunction",
                    "glue:GetUserDefinedFunctions",
                    "cloudwatch:PutMetricData",
                    "dynamodb:ListTables",
                    "s3:HeadBucket",
                    "ec2:Describe*",
                ],
                                     resources=['*']),
                _iam.PolicyStatement(
                    actions=['s3:GetObject'],
                    resources=[
                        'arn:aws:s3:::' + ARA_BUCKET_NAME + BINARIES +
                        DataGenConfig.DSDGEN_INSTALL_SCRIPT, 'arn:aws:s3:::' +
                        ARA_BUCKET_NAME + BINARIES + DataGenConfig.JAR_FILE
                    ]),
                _iam.PolicyStatement(
                    actions=['s3:PutObject'],
                    resources=[log_bucket.bucket_arn + "/data-generator/*"]),
                _iam.PolicyStatement(
                    actions=[
                        "s3:AbortMultipartUpload", "s3:CreateBucket",
                        "s3:DeleteObject", "s3:GetBucketVersioning",
                        "s3:GetObject", "s3:GetObjectTagging",
                        "s3:GetObjectVersion", "s3:ListBucket",
                        "s3:ListBucketMultipartUploads",
                        "s3:ListBucketVersions", "s3:ListMultipartUploadParts",
                        "s3:PutBucketVersioning", "s3:PutObject",
                        "s3:PutObjectTagging"
                    ],
                    resources=[
                        sink_bucket.bucket_arn + '/*', sink_bucket.bucket_arn,
                        stream_source_bucket.bucket.bucket_arn + '/*',
                        stream_source_bucket.bucket.bucket_arn
                    ])
            ],
            roles=[cluster_role])

        cluster_role.add_managed_policy(
            _iam.ManagedPolicy.from_aws_managed_policy_name(
                'AmazonSSMManagedInstanceCore'))

        _iam.CfnInstanceProfile(self,
                                'StreamEmrClusterInstanceProfile',
                                roles=[cluster_role.role_name],
                                instance_profile_name=cluster_role.role_name)

        # Security Groups for the EMR cluster (private subnet)
        # https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html#emr-sg-elasticmapreduce-master-private
        master_sg = _ec2.SecurityGroup(self,
                                       'ElasticMapReduce-Master-Private',
                                       vpc=vpc)
        slave_sg = _ec2.SecurityGroup(self,
                                      'ElasticMapReduce-Slave-Private',
                                      vpc=vpc)
        service_sg = _ec2.SecurityGroup(self,
                                        'ElasticMapReduce-ServiceAccess',
                                        vpc=vpc,
                                        allow_all_outbound=False)

        # Service SG used by the proxy instance
        service_sg.add_ingress_rule(master_sg, _ec2.Port.tcp(9443))
        service_sg.add_egress_rule(master_sg, _ec2.Port.tcp(8443))
        service_sg.add_egress_rule(slave_sg, _ec2.Port.tcp(8443))

        # EMR Master
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_icmp())
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_tcp())
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_udp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_icmp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_tcp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_udp())
        master_sg.add_ingress_rule(service_sg, _ec2.Port.tcp(8443))

        # EMR Slave
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_icmp())
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_tcp())
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_udp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_icmp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_tcp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_udp())
        slave_sg.add_ingress_rule(service_sg, _ec2.Port.tcp(8443))

        with open('common/common_cdk/lambda/datagen_config.py', 'r') as f:
            lambda_source = f.read()

        configure_datagen_function = _lambda.SingletonFunction(
            self,
            'StreamConfigureDatagenLambda',
            uuid="a9904dec-01cf-11eb-adc1-0242ac120002",
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.Code.inline(lambda_source),
            handler='index.handler',
            function_name='stream-datagen-config',
            environment={
                'TABLE_NAME': config_table.table_name,
                'JAR_LOCATION': BINARIES_LOCATION + DataGenConfig.JAR_FILE,
            },
            timeout=core.Duration.seconds(10))

        configure_datagen_function.role.add_to_policy(
            _iam.PolicyStatement(actions=[
                'dynamodb:GetItem',
                'dynamodb:PutItem',
            ],
                                 resources=[config_table.table_arn]))

        emr_cluster = _emr.CfnCluster(
            self,
            'StreamEmrCluster',
            name="StreamDatagenCluster",
            job_flow_role=cluster_role.role_name,
            service_role=service_role.role_name,
            release_label='emr-5.30.1',
            visible_to_all_users=True,
            log_uri=log_bucket.s3_url_for_object() + "/data-generator",
            applications=[
                _emr.CfnCluster.ApplicationProperty(name='hadoop'),
                _emr.CfnCluster.ApplicationProperty(name='spark')
            ],
            bootstrap_actions=[
                _emr.CfnCluster.BootstrapActionConfigProperty(
                    name="dsdgen-install",
                    script_bootstrap_action=_emr.CfnCluster.
                    ScriptBootstrapActionConfigProperty(
                        path=BINARIES_LOCATION +
                        DataGenConfig.DSDGEN_INSTALL_SCRIPT))
            ],
            instances=_emr.CfnCluster.JobFlowInstancesConfigProperty(
                emr_managed_master_security_group=master_sg.security_group_id,
                emr_managed_slave_security_group=slave_sg.security_group_id,
                service_access_security_group=service_sg.security_group_id,
                ec2_subnet_id=vpc.private_subnets[0].subnet_id,
                core_instance_group=_emr.CfnCluster.
                InstanceGroupConfigProperty(instance_count=DataGenConfig.
                                            BATCH_CLUSTER_SIZE[tshirt_size],
                                            instance_type='m5.xlarge'),
                master_instance_group=_emr.CfnCluster.
                InstanceGroupConfigProperty(instance_count=1,
                                            instance_type='m4.large')))

        configure_datagen = _sfn_tasks.LambdaInvoke(
            self,
            "ConfigureDatagenTask",
            lambda_function=configure_datagen_function,
            payload=_sfn.TaskInput.from_text(
                '{'
                '"Param": "stream_iterator",'
                '"Module": "stream",'
                '"SinkBucket": "' + sink_bucket.s3_url_for_object() + '",'
                '"Parallelism": "' +
                str(int(DataGenConfig.STREAM_DATA_SIZE[tshirt_size]) * 2) +
                '",'
                '"DataSize": "' + DataGenConfig.STREAM_DATA_SIZE[tshirt_size] +
                '",'
                '"TmpBucket": "' +
                str(stream_source_bucket.bucket.s3_url_for_object()) + '"'
                '}'),
            result_path='$.Config')

        add_datagen_step = _sfn.CustomState(
            self,
            'StreamAddDataGenStep',
            state_json={
                "Type": "Task",
                "Resource": "arn:aws:states:::elasticmapreduce:addStep.sync",
                "Parameters": {
                    "ClusterId.$": "$.Emr.Cluster.Id",
                    "Step": {
                        "Name": "DatagenStep",
                        "ActionOnFailure": "CONTINUE",
                        "HadoopJarStep": {
                            "Jar": "command-runner.jar",
                            "Args.$": "$.Config.Payload.StepParam"
                        }
                    }
                },
                "ResultPath": "$.Step",
                "Next": "StreamUpdateIterator"
            })

        update_iterator = _sfn_tasks.DynamoUpdateItem(
            self,
            'StreamUpdateIterator',
            table=config_table,
            key={
                'param':
                _sfn_tasks.DynamoAttributeValue.from_string('stream_iterator')
            },
            update_expression=
            'SET iterator = if_not_exists(iterator, :start) + :inc',
            expression_attribute_values={
                ":inc": _sfn_tasks.DynamoAttributeValue.from_number(1),
                ":start": _sfn_tasks.DynamoAttributeValue.from_number(0)
            },
            result_path=_sfn.JsonPath.DISCARD)

        definition = configure_datagen \
            .next(add_datagen_step) \
            .next(update_iterator)

        datagen_stepfunctions = _sfn.StateMachine(
            self,
            "StreamDataGenStepFunctions",
            definition=definition,
            timeout=core.Duration.minutes(30))

        datagen_stepfunctions.add_to_role_policy(
            _iam.PolicyStatement(actions=[
                'elasticmapreduce:AddJobFlowSteps',
                'elasticmapreduce:DescribeStep'
            ],
                                 resources=['*']))

        step_trigger = _events.Rule(self,
                                    'StreamStepTrigger',
                                    schedule=_events.Schedule.cron(
                                        minute='0/10',
                                        hour='*',
                                        month='*',
                                        week_day='*',
                                        year='*'))

        step_trigger.add_target(
            _events_targets.SfnStateMachine(
                machine=datagen_stepfunctions,
                input=_events.RuleTargetInput.from_object({
                    "Emr": {
                        "Cluster": {
                            "Id": core.Fn.ref(emr_cluster.logical_id)
                        }
                    }
                })))

        with open('common/common_cdk/lambda/stepfunctions_trigger.py',
                  'r') as f:
            lambda_source = f.read()

        stepfunctions_trigger_lambda = _lambda.SingletonFunction(
            self,
            'StreamStepFunctionsTriggerLambda',
            uuid="cf042246-01d0-11eb-adc1-0242ac120002",
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.Code.inline(lambda_source),
            handler='index.handler',
            function_name='stepfunctions-stream-datagen-trigger')

        stepfunctions_trigger_lambda.role.add_to_policy(
            _iam.PolicyStatement(actions=["states:StartExecution"],
                                 resources=['*']))

        trigger_step_lambda_provider = _custom_resources.Provider(
            self,
            'StreamStepFunctionsTriggerLambdaProvider',
            on_event_handler=stepfunctions_trigger_lambda)

        core.CustomResource(
            self,
            'StreamStepFunctionsTrigger',
            service_token=trigger_step_lambda_provider.service_token,
            properties={"stepArn": datagen_stepfunctions.state_machine_arn})

        with open('common/common_cdk/lambda/stream_generator.py', 'r') as f:
            lambda_source = f.read()

        sale_stream_generator_lambda = _lambda.Function(
            scope=self,
            id='WebSaleStreamGenerator',
            runtime=_lambda.Runtime.PYTHON_3_7,
            memory_size=2048,
            timeout=core.Duration.minutes(15),
            code=_lambda.Code.inline(lambda_source),
            handler='index.lambda_handler',
            environment={
                'REGION': core.Aws.REGION,
                'STREAM_NAME': web_sale_stream
            })

        stream_source_bucket.bucket.add_event_notification(
            _s3.EventType.OBJECT_CREATED,
            _s3_notifications.LambdaDestination(sale_stream_generator_lambda),
            _s3.NotificationKeyFilter(prefix='sale', suffix='csv'))

        sale_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(
                actions=[
                    "s3:DeleteObject",
                    "s3:GetObject",
                    "s3:ListBucket",
                ],
                resources=[
                    stream_source_bucket.bucket.bucket_arn + '/*',
                    stream_source_bucket.bucket.bucket_arn
                ]))

        sale_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(actions=["kinesis:PutRecords"],
                                 resources=[
                                     stack.format_arn(
                                         service='kinesis',
                                         resource='stream',
                                         resource_name=web_sale_stream)
                                 ]))

        sale_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(actions=['kms:GenerateDataKey'],
                                 resources=[
                                     stack.format_arn(
                                         service='kms',
                                         resource='key',
                                         sep='/',
                                         resource_name=kinesis_key.key_id)
                                 ]))

        customer_stream_generator_lambda = _lambda.Function(
            scope=self,
            id='WebCustomerStreamGenerator',
            runtime=_lambda.Runtime.PYTHON_3_7,
            memory_size=2048,
            timeout=core.Duration.minutes(15),
            code=_lambda.Code.inline(lambda_source),
            handler='index.lambda_handler',
            environment={
                'REGION': core.Aws.REGION,
                'STREAM_NAME': web_customer_stream
            })

        stream_source_bucket.bucket.add_event_notification(
            _s3.EventType.OBJECT_CREATED,
            _s3_notifications.LambdaDestination(
                customer_stream_generator_lambda),
            _s3.NotificationKeyFilter(prefix='customer', suffix='csv'))

        customer_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(
                actions=[
                    "s3:DeleteObject",
                    "s3:GetObject",
                    "s3:ListBucket",
                ],
                resources=[
                    stream_source_bucket.bucket.bucket_arn + '/*',
                    stream_source_bucket.bucket.bucket_arn
                ]))

        customer_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(actions=["kinesis:PutRecords"],
                                 resources=[
                                     stack.format_arn(
                                         service='kinesis',
                                         resource='stream',
                                         resource_name=web_customer_stream)
                                 ]))

        customer_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(actions=['kms:GenerateDataKey'],
                                 resources=[
                                     stack.format_arn(
                                         service='kms',
                                         resource='key',
                                         sep='/',
                                         resource_name=kinesis_key.key_id)
                                 ]))

        address_stream_generator_lambda = _lambda.Function(
            scope=self,
            id='WebCustomerAddressStreamGenerator',
            runtime=_lambda.Runtime.PYTHON_3_7,
            memory_size=2048,
            timeout=core.Duration.minutes(15),
            code=_lambda.Code.inline(lambda_source),
            handler='index.lambda_handler',
            environment={
                'REGION': core.Aws.REGION,
                'STREAM_NAME': web_customer_address_stream
            })

        stream_source_bucket.bucket.add_event_notification(
            _s3.EventType.OBJECT_CREATED,
            _s3_notifications.LambdaDestination(
                address_stream_generator_lambda),
            _s3.NotificationKeyFilter(prefix='address', suffix='csv'))

        address_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(
                actions=[
                    "s3:DeleteObject",
                    "s3:GetObject",
                    "s3:ListBucket",
                ],
                resources=[
                    stream_source_bucket.bucket.bucket_arn + '/*',
                    stream_source_bucket.bucket.bucket_arn
                ]))

        address_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(
                actions=["kinesis:PutRecords"],
                resources=[
                    stack.format_arn(service='kinesis',
                                     resource='stream',
                                     resource_name=web_customer_address_stream)
                ]))

        address_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(actions=['kms:GenerateDataKey'],
                                 resources=[
                                     stack.format_arn(
                                         service='kms',
                                         resource='key',
                                         sep='/',
                                         resource_name=kinesis_key.key_id)
                                 ]))