Example #1
    def __init__(self, scope: core.Construct, id: str,
                 landing_zone: ILandingZone,
                 directory: DirectoryServicesConstruct, group_names: List[str],
                 **kwargs) -> None:
        super().__init__(scope, id, **kwargs)
        self.__landing_zone = landing_zone
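        # Note: ILandingZone, DirectoryServicesConstruct and EmrfsConstruct are
        # project-specific constructs from the source repository; they are not
        # defined in this snippet.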

        # Configure the security groups
        self.security_group = ec2.SecurityGroup(
            self,
            'SecurityGroup',
            vpc=landing_zone.networking.vpc,
            allow_all_outbound=True,
            description='HadoopConstruct Security Group',
            security_group_name='hadoop-mapreduce-group')

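        # `services` is assumed to be a module-level dict mapping a TCP port
        # number to a human-readable description; it is not shown in this snippet.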
        for port in services.keys():
            self.security_group.add_ingress_rule(
                peer=ec2.Peer.any_ipv4(),
                connection=ec2.Port(protocol=ec2.Protocol.TCP,
                                    from_port=port,
                                    to_port=port,
                                    string_representation=services[port]))

        self.security_group.add_ingress_rule(
            peer=ec2.Peer.any_ipv4(),
            connection=ec2.Port(protocol=ec2.Protocol.UDP,
                                from_port=0,
                                to_port=65535,
                                string_representation='Allow All UDP Traffic'))

        self.security_group.add_ingress_rule(
            peer=ec2.Peer.any_ipv4(),
            connection=ec2.Port(protocol=ec2.Protocol.TCP,
                                from_port=0,
                                to_port=65535,
                                string_representation='Allow All TCP Traffic'))

        # Setup roles...
        self.jobFlowRole = iam.Role(
            self,
            'JobFlowRole',
            assumed_by=iam.ServicePrincipal(service='ec2.amazonaws.com'),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    'AmazonSSMManagedInstanceCore'),
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AmazonElasticMapReduceforEC2Role'),
            ])

        profile_name = 'jobflowprofile@{}-{}'.format(
            landing_zone.zone_name,
            core.Stack.of(self).region)
        job_flow_instance_profile = iam.CfnInstanceProfile(
            self,
            'JobFlowInstanceProfile',
            instance_profile_name=profile_name,
            roles=[self.jobFlowRole.role_name])

        serviceRole = iam.Role(
            self,
            'ServiceRole',
            assumed_by=iam.ServicePrincipal(
                service='elasticmapreduce.amazonaws.com'),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AmazonElasticMapReduceRole')
            ])

        self.database = g.Database(self,
                                   'GlueStore',
                                   database_name='demo-database')

        self.bucket = s3.Bucket(self,
                                'LogBucket',
                                removal_policy=core.RemovalPolicy.DESTROY)

        emr_fs = EmrfsConstruct(self,
                                'Emrfs',
                                landing_zone=landing_zone,
                                directory=directory,
                                group_names=group_names,
                                job_flow_role=self.jobFlowRole)

        # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-elasticmapreduce-instancefleetconfig.html
        self.cluster = emr.CfnCluster(
            self,
            'Hadoop',
            name='HadoopCluster',
            job_flow_role=profile_name,  #'EMR_EC2_DefaultRole',
            service_role=serviceRole.role_name,
            log_uri='s3://' + self.bucket.bucket_name + '/logs',
            release_label='emr-6.2.0',
            applications=[
                emr.CfnCluster.ApplicationProperty(name='Spark'),
                emr.CfnCluster.ApplicationProperty(name='Presto'),
                emr.CfnCluster.ApplicationProperty(name='Hue'),
                emr.CfnCluster.ApplicationProperty(name='Hive'),
                emr.CfnCluster.ApplicationProperty(name='JupyterHub'),
            ],
            configurations=[
                emr.CfnCluster.ConfigurationProperty(
                    classification='spark-hive-site',
                    configuration_properties={
                        'hive.metastore.client.factory.class':
                        'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory'
                    }),
                emr.CfnCluster.ConfigurationProperty(
                    classification='hive-site',
                    configuration_properties={
                        'hive.metastore.client.factory.class':
                        'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory',
                        'aws.glue.partition.num.segments':
                        '10',  #1 to 10; (default=5)
                        'hive.metastore.schema.verification': 'false',
                    })
            ],
            security_configuration=emr_fs.security_configuration.ref,
            # kerberos_attributes= emr.CfnCluster.KerberosAttributesProperty(
            #   kdc_admin_password=directory.password,
            #   realm= directory.mad.name.upper(),
            #   ad_domain_join_password=directory.password,
            #   ad_domain_join_user= directory.admin
            # ),
            managed_scaling_policy=emr.CfnCluster.ManagedScalingPolicyProperty(
                compute_limits=emr.CfnCluster.ComputeLimitsProperty(
                    minimum_capacity_units=1,
                    maximum_capacity_units=25,
                    unit_type='InstanceFleetUnits')),
            instances=emr.CfnCluster.JobFlowInstancesConfigProperty(
                #hadoop_version='2.4.0',
                termination_protected=False,
                master_instance_fleet=emr.CfnCluster.
                InstanceFleetConfigProperty(
                    target_spot_capacity=1,
                    instance_type_configs=[
                        emr.CfnCluster.InstanceTypeConfigProperty(
                            instance_type='m5.xlarge', )
                    ]),
                core_instance_fleet=emr.CfnCluster.InstanceFleetConfigProperty(
                    target_spot_capacity=1,
                    instance_type_configs=[
                        emr.CfnCluster.InstanceTypeConfigProperty(
                            instance_type='m5.xlarge',
                            ebs_configuration=emr.CfnCluster.
                            EbsConfigurationProperty(ebs_block_device_configs=[
                                emr.CfnCluster.EbsBlockDeviceConfigProperty(
                                    volume_specification=emr.CfnCluster.
                                    VolumeSpecificationProperty(
                                        size_in_gb=50, volume_type='gp2'))
                            ]))
                    ]),
                additional_master_security_groups=[
                    self.security_group.security_group_id
                ],
                additional_slave_security_groups=[
                    self.security_group.security_group_id
                ],
                ec2_subnet_ids=landing_zone.networking.vpc.select_subnets(
                    subnet_group_name='Hadoop').subnet_ids,
            ))

        self.cluster.add_depends_on(job_flow_instance_profile)
Example #2
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        vpc = ec2.Vpc(self, "VPC")

        sg = ec2.SecurityGroup(self,
                               id="sg_ssh",
                               vpc=vpc,
                               security_group_name="sg_ssh")

        sg.add_ingress_rule(peer=ec2.Peer.any_ipv4(),
                            connection=ec2.Port.tcp(22))

        s3_bucket = s3.Bucket(
            self,
            "Bucket",
            bucket_name=f"emr-example-bucket",
            versioned=False,
            removal_policy=core.RemovalPolicy.
            DESTROY  # NOT recommended for production code
        )

        role = iam.Role(
            self,
            "EMRJobFlowRole",
            assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AmazonElasticMapReduceforEC2Role")
            ])

        profile = iam.CfnInstanceProfile(self,
                                         'InstanceProfile',
                                         roles=[role.role_name])

        emr_cluster = emr.CfnCluster(
            self,
            "EMRCluster",
            name="SparkStreamingCluster",
            instances=emr.CfnCluster.JobFlowInstancesConfigProperty(
                master_instance_group=emr.CfnCluster.
                InstanceGroupConfigProperty(instance_count=1,
                                            instance_type='c5.xlarge',
                                            name='Master'),
                core_instance_group=emr.CfnCluster.InstanceGroupConfigProperty(
                    instance_count=2, instance_type='r5.xlarge', name='Core'),
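                # Replace the <ssh_key> placeholder with the name of an existing EC2 key pair.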
                ec2_key_name="<ssh_key>",
                additional_master_security_groups=[sg.security_group_id],  # expects security group IDs, not names
                ec2_subnet_id=vpc.public_subnets[0].subnet_id),
            job_flow_role=profile.ref,
            service_role='EMR_DefaultRole',
            release_label='emr-5.29.0',
            applications=[
                emr.CfnCluster.ApplicationProperty(name='Spark'),
                emr.CfnCluster.ApplicationProperty(name='Ganglia'),
                emr.CfnCluster.ApplicationProperty(name='Hive'),
                emr.CfnCluster.ApplicationProperty(name='Livy')
            ],
            configurations=[
                emr.CfnCluster.ConfigurationProperty(
                    classification='emrfs-site',
                    configuration_properties={"fs.s3.maxConnections": "1000"}),
                emr.CfnCluster.ConfigurationProperty(
                    classification='hive-site',
                    configuration_properties={
                        "hive.metastore.client.factory.class":
                        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
                    }),
                emr.CfnCluster.ConfigurationProperty(
                    classification="spark-hive-site",
                    configuration_properties={
                        "hive.metastore.client.factory.class":
                        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
                    }),
                emr.CfnCluster.ConfigurationProperty(
                    classification="spark-defaults",
                    configuration_properties={
                        "spark.dynamicAllocation.enabled": "false",
                        "spark.executor.cores": "2",
                        "spark.executor.memory": "3g",
                        "spark.executor.instances": "16"
                    }),
                emr.CfnCluster.ConfigurationProperty(
                    classification="core-site",
                    configuration_properties={
                        "hadoop.proxyuser.livy.groups": "*",
                        "hadoop.proxyuser.livy.hosts": "*"
                    }),
                emr.CfnCluster.ConfigurationProperty(
                    classification="livy-conf",
                    configuration_properties={
                        "livy.impersonation.enabled": "true"
                    })
            ],
            log_uri='s3://' + s3_bucket.bucket_name + '/emr-logs',
        )
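
Each example shows only the constructor body of a CDK stack; in the source projects the code lives inside a core.Stack subclass. A minimal sketch of how such a stack could be wrapped and synthesized, assuming AWS CDK v1 and a hypothetical class name EmrClusterStack for the stack in Example #2:

from aws_cdk import core
from aws_cdk import aws_ec2 as ec2, aws_emr as emr, aws_iam as iam, aws_s3 as s3


class EmrClusterStack(core.Stack):
    # Hypothetical wrapper class; the constructor body is the code shown in Example #2.
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)
        # ... constructor body from Example #2 goes here ...


app = core.App()
EmrClusterStack(app, 'emr-example-stack')
app.synth()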
Example #3
    def __init__(self, scope: core.Construct, data_lake: DataLake,
                 common: Common, **kwargs) -> None:
        self.env = data_lake.env.value
        super().__init__(scope, id=f'{self.env}-emr-transform', **kwargs)

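        # DataLake and Common are project-specific constructs that expose the
        # data lake buckets and the custom VPC referenced below; they are not
        # defined in this snippet.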
        self.logs_bucket = s3.Bucket(
            self,
            f'{self.env}-emr-logs-bucket',
            bucket_name=f's3-belisco-{self.env}-emr-logs-bucket',
            removal_policy=core.RemovalPolicy.DESTROY)

        buckets_arns = [
            data_lake.data_lake_raw_bucket.bucket_arn,
            data_lake.data_lake_processed_bucket.bucket_arn,
            data_lake.data_lake_curated_bucket.bucket_arn
        ]

        self.datalake_emr_policy = iam.Policy(
            self,
            id=f'iam-{self.env}-emr-data-lake',
            policy_name=f'iam-{self.env}-emr-data-lake',
            statements=[
                iam.PolicyStatement(actions=[
                    's3:*',
                ],
                                    resources=buckets_arns +
                                    [f'{arn}/*' for arn in buckets_arns])
            ])

        self.emr_role = iam.Role(
            self,
            f'{self.env}-emr-cluster-role',
            assumed_by=iam.ServicePrincipal('elasticmapreduce.amazonaws.com'),
            description='Role to allow EMR to process data',
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AmazonElasticMapReduceRole')
            ])

        self.emr_role.attach_inline_policy(self.datalake_emr_policy)

        self.emr_ec2_role = iam.Role(
            self,
            f'{self.env}-emr-ec2-role',
            assumed_by=iam.ServicePrincipal('ec2.amazonaws.com'),
            description='Role to allow EMR to process data',
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AmazonElasticMapReduceforEC2Role')
            ])

        self.emr_ec2_role.attach_inline_policy(self.datalake_emr_policy)

        self.emr_ec2_instance_profile = iam.CfnInstanceProfile(
            self,
            f'{self.env}-emr-instance_profile',
            instance_profile_name=f'{self.env}-emr-instance_profile',
            roles=[self.emr_ec2_role.role_name])

        self.cluster = emr.CfnCluster(
            self,
            f'{self.env}-emr-cluster',
            name=f'{self.env}-emr-cluster',
            instances=emr.CfnCluster.JobFlowInstancesConfigProperty(
                master_instance_group=emr.CfnCluster.
                InstanceGroupConfigProperty(instance_count=1,
                                            instance_type='m4.large',
                                            market='ON_DEMAND',
                                            name='Master'),
                core_instance_group=emr.CfnCluster.InstanceGroupConfigProperty(
                    instance_count=2,
                    instance_type='m4.large',
                    market='ON_DEMAND',
                    name='Core'),
                termination_protected=False,
                ec2_subnet_id=common.custom_vpc.private_subnets[0].subnet_id),
            applications=[emr.CfnCluster.ApplicationProperty(name='Spark')],
            log_uri=f's3://{self.logs_bucket.bucket_name}/logs',
            job_flow_role=self.emr_ec2_instance_profile.get_att(
                'Arn').to_string(),
            service_role=self.emr_role.role_arn,
            release_label='emr-5.30.1',
            visible_to_all_users=True,
            configurations=[
                emr.CfnCluster.ConfigurationProperty(
                    classification='spark-hive-site',
                    configuration_properties={
                        "hive.metastore.client.factory.class":
                        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
                    })
            ])
Example #4
  def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    EMR_EC2_KEY_PAIR_NAME = cdk.CfnParameter(self, 'EMREC2KeyPairName',
      type='String',
      description='Amazon EMR EC2 Instance KeyPair name',
      default='emr'
    )

    EMR_CLUSTER_NAME = cdk.CfnParameter(self, 'EMRClusterName',
      type='String',
      description='Amazon EMR Cluster name',
      default='my-emr-cluster'
    )

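    # "vpc_name" is read from the CDK context (cdk.json, cdk.context.json or the
    # --context CLI option).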
    vpc_name = self.node.try_get_context("vpc_name")
    vpc = aws_ec2.Vpc.from_lookup(self, "ExistingVPC",
      is_default=True,
      vpc_name=vpc_name)

    # vpc = aws_ec2.Vpc(self, "EMRStackVPC",
    #   max_azs=2,
    #   gateway_endpoints={
    #     "S3": aws_ec2.GatewayVpcEndpointOptions(
    #       service=aws_ec2.GatewayVpcEndpointAwsService.S3
    #     )
    #   }
    # )

    emr_instances = aws_emr.CfnCluster.JobFlowInstancesConfigProperty(
      core_instance_group=aws_emr.CfnCluster.InstanceGroupConfigProperty(
        instance_count=2,
        instance_type="m5.xlarge",
        market="ON_DEMAND"
      ),
      ec2_subnet_id=vpc.public_subnets[0].subnet_id,
      keep_job_flow_alive_when_no_steps=True, # After last step completes: Cluster waits
      master_instance_group=aws_emr.CfnCluster.InstanceGroupConfigProperty(
        instance_count=1,
        instance_type="m5.xlarge",
        market="ON_DEMAND"
      ),
      termination_protected=True
    )

    emr_cfn_cluster = aws_emr.CfnCluster(self, "MyEMRCluster",
      instances=emr_instances,
      # In order to use the default role for `job_flow_role`, you must have already created it using the CLI or console
      job_flow_role="EMR_EC2_DefaultRole",
      name=EMR_CLUSTER_NAME.value_as_string,
      # service_role="EMR_DefaultRole_V2",
      service_role="EMR_DefaultRole",
      applications=[
        aws_emr.CfnCluster.ApplicationProperty(name="Hadoop"),
        aws_emr.CfnCluster.ApplicationProperty(name="Hive"),
        aws_emr.CfnCluster.ApplicationProperty(name="JupyterHub"),
        aws_emr.CfnCluster.ApplicationProperty(name="Livy"),
        aws_emr.CfnCluster.ApplicationProperty(name="Spark"),
        aws_emr.CfnCluster.ApplicationProperty(name="JupyterEnterpriseGateway")
      ],
      bootstrap_actions=None,
      configurations=[
        aws_emr.CfnCluster.ConfigurationProperty(
          classification="hive-site",
          configuration_properties={
            "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
          }),
        aws_emr.CfnCluster.ConfigurationProperty(
          classification="spark-hive-site",
          configuration_properties={
            "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
          })
      ],
      ebs_root_volume_size=10,
      log_uri="s3n://aws-logs-{account}-{region}/elasticmapreduce/".format(account=cdk.Aws.ACCOUNT_ID, region=cdk.Aws.REGION),
      release_label="emr-6.5.0",
      scale_down_behavior="TERMINATE_AT_TASK_COMPLETION",
      # tags=[cdk.CfnTag(
      #   key="for-use-with-amazon-emr-managed-policies",
      #   value="true"
      # )],
      visible_to_all_users=True
    )
Example #5
    def __init__(self, scope: core.Construct, data_lake: DataLake, common: Common, **kwargs) -> None:
        self.env = data_lake.env.value
        super().__init__(scope, id=f'{self.env}-emr-transform', **kwargs)

        self.logs_bucket = s3.Bucket(
            self,
            f'{self.env}-emr-logs-bucket',
            bucket_name=f's3-belisco-{self.env}-emr-logs-bucket',
            removal_policy=core.RemovalPolicy.DESTROY
        )

        self.emr_role = iam.Role(
            self,
            f'{self.env}-emr-cluster-role',
            assumed_by=iam.ServicePrincipal('elasticmapreduce.amazonaws.com'),
            description='Role to allow EMR to process data',
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AmazonElasticMapReduceRole')
            ]
        )

        self.emr_ec2_role = iam.Role(
            self,
            f'{self.env}-emr-ec2-role',
            assumed_by=iam.ServicePrincipal('ec2.amazonaws.com'),
            description='Role to allow EMR to process data',
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name('service-role/AmazonElasticMapReduceforEC2Role')
            ]
        )

        self.emr_ec2_instance_profile = iam.CfnInstanceProfile(
            self,
            f'{self.env}-emr-instance_profile',
            instance_profile_name=f'{self.env}-emr-instance_profile',
            roles=[
                self.emr_ec2_role.role_name
            ]
        )

        self.cluster = emr.CfnCluster(
            self,
            f'{self.env}-emr-cluster',
            name=f'{self.env}-emr-cluster',
            instances=emr.CfnCluster.JobFlowInstancesConfigProperty(
                master_instance_group=emr.CfnCluster.InstanceGroupConfigProperty(
                    instance_count=1,
                    instance_type='m4.large',
                    market='ON_DEMAND',
                    name='Master'
                ),
                core_instance_group=emr.CfnCluster.InstanceGroupConfigProperty(
                    instance_count=2,
                    instance_type='m4.large',
                    market='ON_DEMAND',
                    name='Core'
                ),
                termination_protected=True,
                ec2_subnet_id=common.custom_vpc.private_subnets[0].subnet_id
            ),
            applications=[
                emr.CfnCluster.ApplicationProperty(name='Spark')
            ],
            log_uri=f's3://{self.logs_bucket.bucket_name}/logs',
            job_flow_role=self.emr_ec2_instance_profile.get_att('Arn').to_string(),
            service_role=self.emr_role.role_arn,
            release_label='emr-5.30.1',
            visible_to_all_users=True
        )
Example #6
    def __init__(self, scope: core.Construct, id: str, log_bucket: _s3.Bucket,
                 config_table: _dynamodb.Table, tshirt_size: str,
                 sink_bucket: _s3.Bucket, web_sale_stream: str,
                 web_customer_stream: str, web_customer_address_stream: str,
                 kinesis_key: _kms.Key, vpc: _ec2.Vpc, **kwargs) -> None:

        super().__init__(scope, id, **kwargs)

        stack = core.Stack.of(self)

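        # AutoEmptyBucket, ARA_BUCKET_NAME, BINARIES, BINARIES_LOCATION and
        # DataGenConfig are helpers and constants imported elsewhere in the
        # source project; they are not defined in this snippet.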
        stream_source_bucket = AutoEmptyBucket(
            self,
            'StreamSource',
            bucket_name='ara-stream-source-' + core.Aws.ACCOUNT_ID,
            uuid='95505f50-0276-11eb-adc1-0242ac120002')

        service_role = _iam.Role(
            self,
            'StreamEmrServiceRole',
            assumed_by=_iam.ServicePrincipal('elasticmapreduce.amazonaws.com'))

        service_role.add_managed_policy(
            _iam.ManagedPolicy.from_aws_managed_policy_name(
                'service-role/AmazonElasticMapReduceRole'))

        cluster_role = _iam.Role(
            self,
            'StreamEmrClusterRole',
            assumed_by=_iam.ServicePrincipal("ec2.amazonaws.com"))

        _iam.Policy(
            self,
            'StreamEmrClusterPolicy',
            statements=[
                _iam.PolicyStatement(actions=[
                    "glue:CreateDatabase",
                    "glue:UpdateDatabase",
                    "glue:DeleteDatabase",
                    "glue:GetDatabase",
                    "glue:GetDatabases",
                    "glue:CreateTable",
                    "glue:UpdateTable",
                    "glue:DeleteTable",
                    "glue:GetTable",
                    "glue:GetTables",
                    "glue:GetTableVersions",
                    "glue:CreatePartition",
                    "glue:BatchCreatePartition",
                    "glue:UpdatePartition",
                    "glue:DeletePartition",
                    "glue:BatchDeletePartition",
                    "glue:GetPartition",
                    "glue:GetPartitions",
                    "glue:BatchGetPartition",
                    "glue:CreateUserDefinedFunction",
                    "glue:UpdateUserDefinedFunction",
                    "glue:DeleteUserDefinedFunction",
                    "glue:GetUserDefinedFunction",
                    "glue:GetUserDefinedFunctions",
                    "cloudwatch:PutMetricData",
                    "dynamodb:ListTables",
                    "s3:HeadBucket",
                    "ec2:Describe*",
                ],
                                     resources=['*']),
                _iam.PolicyStatement(
                    actions=['s3:GetObject'],
                    resources=[
                        'arn:aws:s3:::' + ARA_BUCKET_NAME + BINARIES +
                        DataGenConfig.DSDGEN_INSTALL_SCRIPT, 'arn:aws:s3:::' +
                        ARA_BUCKET_NAME + BINARIES + DataGenConfig.JAR_FILE
                    ]),
                _iam.PolicyStatement(
                    actions=['s3:PutObject'],
                    resources=[log_bucket.bucket_arn + "/data-generator/*"]),
                _iam.PolicyStatement(
                    actions=[
                        "s3:AbortMultipartUpload", "s3:CreateBucket",
                        "s3:DeleteObject", "s3:GetBucketVersioning",
                        "s3:GetObject", "s3:GetObjectTagging",
                        "s3:GetObjectVersion", "s3:ListBucket",
                        "s3:ListBucketMultipartUploads",
                        "s3:ListBucketVersions", "s3:ListMultipartUploadParts",
                        "s3:PutBucketVersioning", "s3:PutObject",
                        "s3:PutObjectTagging"
                    ],
                    resources=[
                        sink_bucket.bucket_arn + '/*', sink_bucket.bucket_arn,
                        stream_source_bucket.bucket.bucket_arn + '/*',
                        stream_source_bucket.bucket.bucket_arn
                    ])
            ],
            roles=[cluster_role])

        cluster_role.add_managed_policy(
            _iam.ManagedPolicy.from_aws_managed_policy_name(
                'AmazonSSMManagedInstanceCore'))

        _iam.CfnInstanceProfile(self,
                                'StreamEmrClusterInstanceProfile',
                                roles=[cluster_role.role_name],
                                instance_profile_name=cluster_role.role_name)

        # Security Groups for the EMR cluster (private subnet)
        # https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html#emr-sg-elasticmapreduce-master-private
        master_sg = _ec2.SecurityGroup(self,
                                       'ElasticMapReduce-Master-Private',
                                       vpc=vpc)
        slave_sg = _ec2.SecurityGroup(self,
                                      'ElasticMapReduce-Slave-Private',
                                      vpc=vpc)
        service_sg = _ec2.SecurityGroup(self,
                                        'ElasticMapReduce-ServiceAccess',
                                        vpc=vpc,
                                        allow_all_outbound=False)

        # Service SG used by the proxy instance
        service_sg.add_ingress_rule(master_sg, _ec2.Port.tcp(9443))
        service_sg.add_egress_rule(master_sg, _ec2.Port.tcp(8443))
        service_sg.add_egress_rule(slave_sg, _ec2.Port.tcp(8443))

        # EMR Master
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_icmp())
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_tcp())
        master_sg.add_ingress_rule(master_sg, _ec2.Port.all_udp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_icmp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_tcp())
        master_sg.add_ingress_rule(slave_sg, _ec2.Port.all_udp())
        master_sg.add_ingress_rule(service_sg, _ec2.Port.tcp(8443))

        # EMR Slave
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_icmp())
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_tcp())
        slave_sg.add_ingress_rule(master_sg, _ec2.Port.all_udp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_icmp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_tcp())
        slave_sg.add_ingress_rule(slave_sg, _ec2.Port.all_udp())
        slave_sg.add_ingress_rule(service_sg, _ec2.Port.tcp(8443))

        with open('common/common_cdk/lambda/datagen_config.py', 'r') as f:
            lambda_source = f.read()

        configure_datagen_function = _lambda.SingletonFunction(
            self,
            'StreamConfigureDatagenLambda',
            uuid="a9904dec-01cf-11eb-adc1-0242ac120002",
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.Code.inline(lambda_source),
            handler='index.handler',
            function_name='stream-datagen-config',
            environment={
                'TABLE_NAME': config_table.table_name,
                'JAR_LOCATION': BINARIES_LOCATION + DataGenConfig.JAR_FILE,
            },
            timeout=core.Duration.seconds(10))

        configure_datagen_function.role.add_to_policy(
            _iam.PolicyStatement(actions=[
                'dynamodb:GetItem',
                'dynamodb:PutItem',
            ],
                                 resources=[config_table.table_arn]))

        emr_cluster = _emr.CfnCluster(
            self,
            'StreamEmrCluster',
            name="StreamDatagenCluster",
            job_flow_role=cluster_role.role_name,
            service_role=service_role.role_name,
            release_label='emr-5.30.1',
            visible_to_all_users=True,
            log_uri=log_bucket.s3_url_for_object() + "/data-generator",
            applications=[
                _emr.CfnCluster.ApplicationProperty(name='hadoop'),
                _emr.CfnCluster.ApplicationProperty(name='spark')
            ],
            bootstrap_actions=[
                _emr.CfnCluster.BootstrapActionConfigProperty(
                    name="dsdgen-install",
                    script_bootstrap_action=_emr.CfnCluster.
                    ScriptBootstrapActionConfigProperty(
                        path=BINARIES_LOCATION +
                        DataGenConfig.DSDGEN_INSTALL_SCRIPT))
            ],
            instances=_emr.CfnCluster.JobFlowInstancesConfigProperty(
                emr_managed_master_security_group=master_sg.security_group_id,
                emr_managed_slave_security_group=slave_sg.security_group_id,
                service_access_security_group=service_sg.security_group_id,
                ec2_subnet_id=vpc.private_subnets[0].subnet_id,
                core_instance_group=_emr.CfnCluster.
                InstanceGroupConfigProperty(instance_count=DataGenConfig.
                                            BATCH_CLUSTER_SIZE[tshirt_size],
                                            instance_type='m5.xlarge'),
                master_instance_group=_emr.CfnCluster.
                InstanceGroupConfigProperty(instance_count=1,
                                            instance_type='m4.large')))

        configure_datagen = _sfn_tasks.LambdaInvoke(
            self,
            "ConfigureDatagenTask",
            lambda_function=configure_datagen_function,
            payload=_sfn.TaskInput.from_text(
                '{'
                '"Param": "stream_iterator",'
                '"Module": "stream",'
                '"SinkBucket": "' + sink_bucket.s3_url_for_object() + '",'
                '"Parallelism": "' +
                str(int(DataGenConfig.STREAM_DATA_SIZE[tshirt_size]) * 2) +
                '",'
                '"DataSize": "' + DataGenConfig.STREAM_DATA_SIZE[tshirt_size] +
                '",'
                '"TmpBucket": "' +
                str(stream_source_bucket.bucket.s3_url_for_object()) + '"'
                '}'),
            result_path='$.Config')

        add_datagen_step = _sfn.CustomState(
            self,
            'StreamAddDataGenStep',
            state_json={
                "Type": "Task",
                "Resource": "arn:aws:states:::elasticmapreduce:addStep.sync",
                "Parameters": {
                    "ClusterId.$": "$.Emr.Cluster.Id",
                    "Step": {
                        "Name": "DatagenStep",
                        "ActionOnFailure": "CONTINUE",
                        "HadoopJarStep": {
                            "Jar": "command-runner.jar",
                            "Args.$": "$.Config.Payload.StepParam"
                        }
                    }
                },
                "ResultPath": "$.Step",
                "Next": "StreamUpdateIterator"
            })

        update_iterator = _sfn_tasks.DynamoUpdateItem(
            self,
            'StreamUpdateIterator',
            table=config_table,
            key={
                'param':
                _sfn_tasks.DynamoAttributeValue.from_string('stream_iterator')
            },
            update_expression=
            'SET iterator = if_not_exists(iterator, :start) + :inc',
            expression_attribute_values={
                ":inc": _sfn_tasks.DynamoAttributeValue.from_number(1),
                ":start": _sfn_tasks.DynamoAttributeValue.from_number(0)
            },
            result_path=_sfn.JsonPath.DISCARD)

        definition = configure_datagen \
            .next(add_datagen_step) \
            .next(update_iterator)

        datagen_stepfunctions = _sfn.StateMachine(
            self,
            "StreamDataGenStepFunctions",
            definition=definition,
            timeout=core.Duration.minutes(30))

        datagen_stepfunctions.add_to_role_policy(
            _iam.PolicyStatement(actions=[
                'elasticmapreduce:AddJobFlowSteps',
                'elasticmapreduce:DescribeStep'
            ],
                                 resources=['*']))

        step_trigger = _events.Rule(self,
                                    'StreamStepTrigger',
                                    schedule=_events.Schedule.cron(
                                        minute='0/10',
                                        hour='*',
                                        month='*',
                                        week_day='*',
                                        year='*'))

        step_trigger.add_target(
            _events_targets.SfnStateMachine(
                machine=datagen_stepfunctions,
                input=_events.RuleTargetInput.from_object({
                    "Emr": {
                        "Cluster": {
                            "Id": core.Fn.ref(emr_cluster.logical_id)
                        }
                    }
                })))

        with open('common/common_cdk/lambda/stepfunctions_trigger.py',
                  'r') as f:
            lambda_source = f.read()

        stepfunctions_trigger_lambda = _lambda.SingletonFunction(
            self,
            'StreamStepFunctionsTriggerLambda',
            uuid="cf042246-01d0-11eb-adc1-0242ac120002",
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.Code.inline(lambda_source),
            handler='index.handler',
            function_name='stepfunctions-stream-datagen-trigger')

        stepfunctions_trigger_lambda.role.add_to_policy(
            _iam.PolicyStatement(actions=["states:StartExecution"],
                                 resources=['*']))

        trigger_step_lambda_provider = _custom_resources.Provider(
            self,
            'StreamStepFunctionsTriggerLambdaProvider',
            on_event_handler=stepfunctions_trigger_lambda)

        core.CustomResource(
            self,
            'StreamStepFunctionsTrigger',
            service_token=trigger_step_lambda_provider.service_token,
            properties={"stepArn": datagen_stepfunctions.state_machine_arn})

        with open('common/common_cdk/lambda/stream_generator.py', 'r') as f:
            lambda_source = f.read()

        sale_stream_generator_lambda = _lambda.Function(
            scope=self,
            id='WebSaleStreamGenerator',
            runtime=_lambda.Runtime.PYTHON_3_7,
            memory_size=2048,
            timeout=core.Duration.minutes(15),
            code=_lambda.Code.inline(lambda_source),
            handler='index.lambda_handler',
            environment={
                'REGION': core.Aws.REGION,
                'STREAM_NAME': web_sale_stream
            })

        stream_source_bucket.bucket.add_event_notification(
            _s3.EventType.OBJECT_CREATED,
            _s3_notifications.LambdaDestination(sale_stream_generator_lambda),
            _s3.NotificationKeyFilter(prefix='sale', suffix='csv'))

        sale_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(
                actions=[
                    "s3:DeleteObject",
                    "s3:GetObject",
                    "s3:ListBucket",
                ],
                resources=[
                    stream_source_bucket.bucket.bucket_arn + '/*',
                    stream_source_bucket.bucket.bucket_arn
                ]))

        sale_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(actions=["kinesis:PutRecords"],
                                 resources=[
                                     stack.format_arn(
                                         service='kinesis',
                                         resource='stream',
                                         resource_name=web_sale_stream)
                                 ]))

        sale_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(actions=['kms:GenerateDataKey'],
                                 resources=[
                                     stack.format_arn(
                                         service='kms',
                                         resource='key',
                                         sep='/',
                                         resource_name=kinesis_key.key_id)
                                 ]))

        customer_stream_generator_lambda = _lambda.Function(
            scope=self,
            id='WebCustomerStreamGenerator',
            runtime=_lambda.Runtime.PYTHON_3_7,
            memory_size=2048,
            timeout=core.Duration.minutes(15),
            code=_lambda.Code.inline(lambda_source),
            handler='index.lambda_handler',
            environment={
                'REGION': core.Aws.REGION,
                'STREAM_NAME': web_customer_stream
            })

        stream_source_bucket.bucket.add_event_notification(
            _s3.EventType.OBJECT_CREATED,
            _s3_notifications.LambdaDestination(
                customer_stream_generator_lambda),
            _s3.NotificationKeyFilter(prefix='customer', suffix='csv'))

        customer_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(
                actions=[
                    "s3:DeleteObject",
                    "s3:GetObject",
                    "s3:ListBucket",
                ],
                resources=[
                    stream_source_bucket.bucket.bucket_arn + '/*',
                    stream_source_bucket.bucket.bucket_arn
                ]))

        customer_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(actions=["kinesis:PutRecords"],
                                 resources=[
                                     stack.format_arn(
                                         service='kinesis',
                                         resource='stream',
                                         resource_name=web_customer_stream)
                                 ]))

        customer_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(actions=['kms:GenerateDataKey'],
                                 resources=[
                                     stack.format_arn(
                                         service='kms',
                                         resource='key',
                                         sep='/',
                                         resource_name=kinesis_key.key_id)
                                 ]))

        address_stream_generator_lambda = _lambda.Function(
            scope=self,
            id='WebCustomerAddressStreamGenerator',
            runtime=_lambda.Runtime.PYTHON_3_7,
            memory_size=2048,
            timeout=core.Duration.minutes(15),
            code=_lambda.Code.inline(lambda_source),
            handler='index.lambda_handler',
            environment={
                'REGION': core.Aws.REGION,
                'STREAM_NAME': web_customer_address_stream
            })

        stream_source_bucket.bucket.add_event_notification(
            _s3.EventType.OBJECT_CREATED,
            _s3_notifications.LambdaDestination(
                address_stream_generator_lambda),
            _s3.NotificationKeyFilter(prefix='address', suffix='csv'))

        address_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(
                actions=[
                    "s3:DeleteObject",
                    "s3:GetObject",
                    "s3:ListBucket",
                ],
                resources=[
                    stream_source_bucket.bucket.bucket_arn + '/*',
                    stream_source_bucket.bucket.bucket_arn
                ]))

        address_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(
                actions=["kinesis:PutRecords"],
                resources=[
                    stack.format_arn(service='kinesis',
                                     resource='stream',
                                     resource_name=web_customer_address_stream)
                ]))

        address_stream_generator_lambda.add_to_role_policy(
            _iam.PolicyStatement(actions=['kms:GenerateDataKey'],
                                 resources=[
                                     stack.format_arn(
                                         service='kms',
                                         resource='key',
                                         sep='/',
                                         resource_name=kinesis_key.key_id)
                                 ]))
Example #7
    def __init__(
        self,
        scope: cdk.Construct,
        construct_id: str,
        vpc: ec2.IVpc,
        name: str,
        release_label: str,
        rds_secret: secrets.Secret,
        rds_connections: ec2.Connections,
        log_bucket_name: str = None,
        ssh_key_name: str = None,
        **kwargs,
    ) -> None:
        super().__init__(scope, construct_id, **kwargs)

        self.tag_vpc(vpc)

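        # get_or_create_bucket and the helper methods tag_vpc, get_job_role,
        # get_service_role, create_instance_profile and add_rds_ingres are
        # defined elsewhere in the source project and are not shown here.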
        job_role = self.get_job_role()
        service_role = self.get_service_role()
        instance_profile = self.create_instance_profile(job_role)
        log_bucket = get_or_create_bucket(self, "emr_logs", log_bucket_name)

        # Assign necessary permissions
        # EMR needs to be able to PutObject to the log bucket
        log_bucket.grant_put(job_role)

        # EMR needs to be able to PassRole to the instance profile role
        # https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-role-for-ec2.html#emr-ec2-role-least-privilege
        # https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-role.html
        service_role.add_to_policy(
            iam.PolicyStatement(
                actions=["iam:PassRole"],
                resources=[job_role.role_arn],
                conditions={
                    "StringEquals": {"iam:PassedToService": "ec2.amazonaws.com"}
                },
            )
        )

        # Database configuration variables
        rds_hostname = rds_secret.secret_value_from_json("host").to_string()
        rds_port = rds_secret.secret_value_from_json("port").to_string()
        rds_dbname = rds_secret.secret_value_from_json("dbname").to_string()

        # Desired subnet for the EMR cluster
        emr_subnet = vpc.public_subnets[0]

        self.cluster = emr.CfnCluster(
            self,
            construct_id,
            instances=emr.CfnCluster.JobFlowInstancesConfigProperty(
                master_instance_group=emr.CfnCluster.InstanceGroupConfigProperty(
                    instance_count=1, instance_type="m5.xlarge"
                ),
                core_instance_group=emr.CfnCluster.InstanceGroupConfigProperty(
                    instance_count=2, instance_type="m5.xlarge"
                ),
                ec2_subnet_id=emr_subnet.subnet_id,
            ),
            name=name,
            release_label=release_label,
            log_uri=f"s3://{log_bucket.bucket_name}/elasticmapreduce/",
            job_flow_role=job_role.role_name,
            service_role=service_role.role_name,
            applications=[
                emr.CfnCluster.ApplicationProperty(name=n)
                for n in [
                    "Spark",
                    "Hive",
                    "Zeppelin",
                    "Livy",
                    "JupyterEnterpriseGateway",
                ]
            ],
            visible_to_all_users=True, # Required for EMR Notebooks
            configurations=[
                emr.CfnCluster.ConfigurationProperty(
                    classification="hive-site",
                    configuration_properties={
                        "javax.jdo.option.ConnectionURL": f"jdbc:mysql://{rds_hostname}:{rds_port}/{rds_dbname}?createDatabaseIfNotExist=true",
                        "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver",
                        "javax.jdo.option.ConnectionUserName": rds_secret.secret_value_from_json(
                            "username"
                        ).to_string(),
                        "javax.jdo.option.ConnectionPassword": rds_secret.secret_value_from_json(
                            "password"
                        ).to_string(),
                    },
                ),
            ],
            tags=[
                cdk.CfnTag(
                    key="for-use-with-amazon-emr-managed-policies", value="true"
                ),
            ],
        )

        # Wait for the instance profile to be created
        self.cluster.add_depends_on(instance_profile)

        # Allow EMR to connect to the RDS database
        self.add_rds_ingres(emr_subnet.ipv4_cidr_block, rds_connections)
Example #8
    def __init__(
        self,
        scope: Construct,
        id: str,
        s3_log_bucket: str,
        s3_script_bucket: str,
        spark_script: str,
        **kwargs,
    ) -> None:
        super().__init__(scope, id, **kwargs)

        # VPC
        vpc = ec2.Vpc(
            self,
            "vpc",
            nat_gateways=0,
            subnet_configuration=[
                ec2.SubnetConfiguration(name="public",
                                        subnet_type=ec2.SubnetType.PUBLIC)
            ],
        )

        # enable reading scripts from s3 bucket
        read_scripts_policy = iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "s3:GetObject",
            ],
            resources=[f"arn:aws:s3:::{s3_script_bucket}/*"],
        )
        read_scripts_document = iam.PolicyDocument()
        read_scripts_document.add_statements(read_scripts_policy)

        # emr service role
        emr_service_role = iam.Role(
            self,
            "emr_service_role",
            assumed_by=iam.ServicePrincipal("elasticmapreduce.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AmazonElasticMapReduceRole")
            ],
            inline_policies=[read_scripts_document],
        )

        # emr job flow role
        emr_job_flow_role = iam.Role(
            self,
            "emr_job_flow_role",
            assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AmazonElasticMapReduceforEC2Role")
            ],
        )
        # emr job flow profile
        emr_job_flow_profile = iam.CfnInstanceProfile(
            self,
            "emr_job_flow_profile",
            roles=[emr_job_flow_role.role_name],
            instance_profile_name="emrJobFlowProfile_",
        )

        # create emr cluster
        emr.CfnCluster(
            self,
            "emr_cluster",
            instances=emr.CfnCluster.JobFlowInstancesConfigProperty(
                core_instance_group=emr.CfnCluster.InstanceGroupConfigProperty(
                    instance_count=3, instance_type="m4.large", market="SPOT"),
                ec2_subnet_id=vpc.public_subnets[0].subnet_id,
                hadoop_version="Amazon",
                keep_job_flow_alive_when_no_steps=False,
                master_instance_group=emr.CfnCluster.
                InstanceGroupConfigProperty(instance_count=1,
                                            instance_type="m4.large",
                                            market="SPOT"),
            ),
            # note job_flow_role is an instance profile (not an iam role)
            job_flow_role=emr_job_flow_profile.instance_profile_name,
            name="cluster_name",
            applications=[emr.CfnCluster.ApplicationProperty(name="Spark")],
            service_role=emr_service_role.role_name,
            configurations=[
                # use python3 for pyspark
                emr.CfnCluster.ConfigurationProperty(
                    classification="spark-env",
                    configurations=[
                        emr.CfnCluster.ConfigurationProperty(
                            classification="export",
                            configuration_properties={
                                "PYSPARK_PYTHON": "/usr/bin/python3",
                                "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
                            },
                        )
                    ],
                ),
                # enable apache arrow
                emr.CfnCluster.ConfigurationProperty(
                    classification="spark-defaults",
                    configuration_properties={
                        "spark.sql.execution.arrow.enabled": "true"
                    },
                ),
                # dedicate cluster to single jobs
                emr.CfnCluster.ConfigurationProperty(
                    classification="spark",
                    configuration_properties={
                        "maximizeResourceAllocation": "true"
                    },
                ),
            ],
            log_uri=f"s3://{s3_log_bucket}/{Aws.REGION}/elasticmapreduce/",
            release_label="emr-6.0.0",
            visible_to_all_users=False,
            # the job to be done
            steps=[
                emr.CfnCluster.StepConfigProperty(
                    hadoop_jar_step=emr.CfnCluster.HadoopJarStepConfigProperty(
                        jar="command-runner.jar",
                        args=[
                            "spark-submit",
                            "--deploy-mode",
                            "cluster",
                            f"s3://{s3_script_bucket}/scripts/{spark_script}",
                        ],
                    ),
                    name="step_name",
                    action_on_failure="CONTINUE",
                ),
            ],
        )