Example #1
    def create_database(self):
        """Create the data lake database in Glue."""
        id_suffix = self.database_name.replace("_", "-")
        glue.Database(
            scope=self,
            id=f"oedi-data-lake-database--{id_suffix}",
            database_name=self.database_name,
        )
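
A minimal usage sketch (not part of the original source) showing how a construct with this method might be wired into a CDK v1 app; the DataLakeConstruct class name, the stack id, and the database name are illustrative assumptions:

from aws_cdk import core
from aws_cdk import aws_glue as glue


class DataLakeConstruct(core.Construct):
    # Hypothetical wrapper; only create_database() mirrors the example above.
    def __init__(self, scope: core.Construct, id: str, database_name: str) -> None:
        super().__init__(scope, id)
        self.database_name = database_name
        self.create_database()

    def create_database(self):
        """Create the data lake database in Glue."""
        id_suffix = self.database_name.replace("_", "-")
        glue.Database(
            scope=self,
            id=f"oedi-data-lake-database--{id_suffix}",
            database_name=self.database_name,
        )


app = core.App()
stack = core.Stack(app, "oedi-data-lake")
DataLakeConstruct(stack, "DataLake", database_name="oedi_data_lake")
app.synth()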
Example #2
    def _create_glue_db(self):
        """
        Create a glue database that will be visible in Athena
        """
        db_name = self.glue_db_name
        db = glue.Database(
            self,
            f"{db_name}-id",
            database_name=db_name,
            location_uri=f"s3://{self.data_bucket.bucket_name}/",
        )

        return db
Example #3
    def __init__(self, scope: cdk.Stack, id: str, base_module, stream_module,
                 **kwargs):
        super().__init__(scope, id, **kwargs)
        self.base_module = base_module
        self.stream_module = stream_module

        self.glue_service_iam_role = aws_iam.Role(
            self,
            "GlueIAMRole",
            role_name="GlueCrawler-{}".format(self.stack_name),
            assumed_by=aws_iam.ServicePrincipal(service='glue.amazonaws.com'),
        )

        # Attach the default AWS managed policy, plus an S3 access policy scoped to the curated bucket path
        self.glue_service_iam_role.attach_managed_policy(
            'arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole')
        self.glue_s3_iam_policy_statement = aws_iam.PolicyStatement()
        actions = ["s3:GetObject", "s3:PutObject"]
        for action in actions:
            self.glue_s3_iam_policy_statement.add_action(action)
        self.glue_s3_iam_policy_statement.add_resource(
            self.stream_module.output_bucket.bucket_arn + '/twitter-curated/*')

        self.glue_iam_policy = aws_iam.Policy(
            self,
            "GlueIAMPolicy",
            statements=[self.glue_s3_iam_policy_statement],
        )

        self.glue_iam_policy.attach_to_role(self.glue_service_iam_role)

        self.glue_database = aws_glue.Database(
            self,
            "GlueDatabaseTwitterData",
            database_name=self.stack_name,
        )

        self.glue_crawler = aws_glue.CfnCrawler(
            self,
            "GlueCrawlerTwitterDB",
            database_name=self.glue_database.database_name,
            role=self.glue_service_iam_role.role_arn,
            targets={
                "s3Targets": [{
                    "path":
                    "s3://{}/twitter-curated/".format(
                        self.stream_module.output_bucket.bucket_name)
                }]
            },
            table_prefix=self.stack_name)
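
Example #3 targets an older CDK release in which Role.attach_managed_policy() takes a raw policy ARN. A minimal sketch of the equivalent role on more recent CDK versions, where the managed policy is passed via managed_policies (the GlueRoleStack name is illustrative):

from aws_cdk import core
from aws_cdk import aws_iam


class GlueRoleStack(core.Stack):
    # Hypothetical stack; shows only the managed-policy attachment from the example above.
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)
        self.glue_service_iam_role = aws_iam.Role(
            self,
            "GlueIAMRole",
            role_name="GlueCrawler-{}".format(self.stack_name),
            assumed_by=aws_iam.ServicePrincipal("glue.amazonaws.com"),
            managed_policies=[
                aws_iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AWSGlueServiceRole")
            ],
        )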
Example #4
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        self._region = 'aws_region'
        self._account_id = 'aws_account_id'

        bucket = s3.Bucket.from_bucket_name(self, 'my_bucket_id', 'my_bucket')

        database = glue.Database(self,
                                 id='my_database_id',
                                 database_name='poc')

        table = glue.Table(
            self,
            id='my_table_id',
            database=database,
            table_name='my_table',
            columns=[
                glue.Column(name='col1',
                            type=glue.Type(input_string='string',
                                           is_primitive=True)),
                glue.Column(name='col2',
                            type=glue.Type(input_string='int',
                                           is_primitive=True))
            ],
            partition_keys=[
                glue.Column(name='dt',
                            type=glue.Type(input_string='string',
                                           is_primitive=True))
            ],
            bucket=bucket,
            s3_prefix='test_data',
            data_format=glue.DataFormat(
                input_format=glue.InputFormat(
                    'org.apache.hadoop.mapred.TextInputFormat'),
                output_format=glue.OutputFormat(
                    'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
                ),
                serialization_library=glue.SerializationLibrary(
                    'org.openx.data.jsonserde.JsonSerDe')))
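
If the installed aws_cdk.aws_glue version exposes the DataFormat.JSON constant (an assumption), the input format, output format, and SerDe triple above can be collapsed; this fragment reuses the self, database, and bucket objects from the example:

        # Sketch only: DataFormat.JSON bundles TextInputFormat,
        # HiveIgnoreKeyTextOutputFormat and the OpenX JsonSerDe.
        table = glue.Table(
            self,
            id='my_table_id',
            database=database,
            table_name='my_table',
            columns=[
                glue.Column(name='col1',
                            type=glue.Type(input_string='string',
                                           is_primitive=True)),
            ],
            bucket=bucket,
            s3_prefix='test_data',
            data_format=glue.DataFormat.JSON)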
Example #5
    def __init__(self, scope: core.Construct, construct_id: str,
                 **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        # The code that defines your stack goes here
        dbdemo = glue.Database(self, "nikhildb", database_name="nike")
Example #6
    def __init__(self, scope: core.Construct, id: str,
                 landing_zone: ILandingZone,
                 directory: DirectoryServicesConstruct, group_names: List[str],
                 **kwargs) -> None:
        super().__init__(scope, id, **kwargs)
        self.__landing_zone = landing_zone

        # Configure the security groups
        self.security_group = ec2.SecurityGroup(
            self,
            'SecurityGroup',
            vpc=landing_zone.networking.vpc,
            allow_all_outbound=True,
            description='HadoopConstruct Security Group',
            security_group_name='hadoop-mapreduce-group')

        for port in services.keys():
            self.security_group.add_ingress_rule(
                peer=ec2.Peer.any_ipv4(),
                connection=ec2.Port(protocol=ec2.Protocol.TCP,
                                    from_port=port,
                                    to_port=port,
                                    string_representation=services[port]))

        self.security_group.add_ingress_rule(
            peer=ec2.Peer.any_ipv4(),
            connection=ec2.Port(protocol=ec2.Protocol.UDP,
                                from_port=0,
                                to_port=65535,
                                string_representation='Allow All UDP Traffic'))

        self.security_group.add_ingress_rule(
            peer=ec2.Peer.any_ipv4(),
            connection=ec2.Port(protocol=ec2.Protocol.TCP,
                                from_port=0,
                                to_port=65535,
                                string_representation='Allow All TCP Traffic'))

        # Setup roles...
        self.jobFlowRole = iam.Role(
            self,
            'JobFlowRole',
            assumed_by=iam.ServicePrincipal(service='ec2.amazonaws.com'),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    'AmazonSSMManagedInstanceCore'),
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AmazonElasticMapReduceforEC2Role'),
            ])

        profile_name = 'jobflowprofile@{}-{}'.format(
            landing_zone.zone_name,
            core.Stack.of(self).region)
        job_flow_instance_profile = iam.CfnInstanceProfile(
            self,
            'JobFlowInstanceProfile',
            instance_profile_name=profile_name,
            roles=[self.jobFlowRole.role_name])

        serviceRole = iam.Role(
            self,
            'ServiceRole',
            assumed_by=iam.ServicePrincipal(
                service='elasticmapreduce.amazonaws.com'),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AmazonElasticMapReduceRole')
            ])

        self.database = g.Database(self,
                                   'GlueStore',
                                   database_name='demo-database')

        self.bucket = s3.Bucket(self,
                                'LogBucket',
                                removal_policy=core.RemovalPolicy.DESTROY)

        emr_fs = EmrfsConstruct(self,
                                'Emrfs',
                                landing_zone=landing_zone,
                                directory=directory,
                                group_names=group_names,
                                job_flow_role=self.jobFlowRole)

        # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-elasticmapreduce-instancefleetconfig.html
        self.cluster = emr.CfnCluster(
            self,
            'Hadoop',
            name='HadoopCluster',
            job_flow_role=profile_name,  #'EMR_EC2_DefaultRole',
            service_role=serviceRole.role_name,
            log_uri='s3://' + self.bucket.bucket_name + '/logs',
            release_label='emr-6.2.0',
            applications=[
                emr.CfnCluster.ApplicationProperty(name='Spark'),
                emr.CfnCluster.ApplicationProperty(name='Presto'),
                emr.CfnCluster.ApplicationProperty(name='Hue'),
                emr.CfnCluster.ApplicationProperty(name='Hive'),
                emr.CfnCluster.ApplicationProperty(name='JupyterHub'),
            ],
            configurations=[
                emr.CfnCluster.ConfigurationProperty(
                    classification='spark-hive-site',
                    configuration_properties={
                        'hive.metastore.client.factory.class':
                        'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory'
                    }),
                emr.CfnCluster.ConfigurationProperty(
                    classification='hive-site',
                    configuration_properties={
                        'hive.metastore.client.factory.class':
                        'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory',
                        'aws.glue.partition.num.segments':
                        '10',  #1 to 10; (default=5)
                        'hive.metastore.schema.verification': 'false',
                    })
            ],
            security_configuration=emr_fs.security_configuration.ref,
            # kerberos_attributes= emr.CfnCluster.KerberosAttributesProperty(
            #   kdc_admin_password=directory.password,
            #   realm= directory.mad.name.upper(),
            #   ad_domain_join_password=directory.password,
            #   ad_domain_join_user= directory.admin
            # ),
            managed_scaling_policy=emr.CfnCluster.ManagedScalingPolicyProperty(
                compute_limits=emr.CfnCluster.ComputeLimitsProperty(
                    minimum_capacity_units=1,
                    maximum_capacity_units=25,
                    unit_type='InstanceFleetUnits')),
            instances=emr.CfnCluster.JobFlowInstancesConfigProperty(
                #hadoop_version='2.4.0',
                termination_protected=False,
                master_instance_fleet=emr.CfnCluster.
                InstanceFleetConfigProperty(
                    target_spot_capacity=1,
                    instance_type_configs=[
                        emr.CfnCluster.InstanceTypeConfigProperty(
                            instance_type='m5.xlarge', )
                    ]),
                core_instance_fleet=emr.CfnCluster.InstanceFleetConfigProperty(
                    target_spot_capacity=1,
                    instance_type_configs=[
                        emr.CfnCluster.InstanceTypeConfigProperty(
                            instance_type='m5.xlarge',
                            ebs_configuration=emr.CfnCluster.
                            EbsConfigurationProperty(ebs_block_device_configs=[
                                emr.CfnCluster.EbsBlockDeviceConfigProperty(
                                    volume_specification=emr.CfnCluster.
                                    VolumeSpecificationProperty(
                                        size_in_gb=50, volume_type='gp2'))
                            ]))
                    ]),
                additional_master_security_groups=[
                    self.security_group.security_group_id
                ],
                additional_slave_security_groups=[
                    self.security_group.security_group_id
                ],
                ec2_subnet_ids=[
                    net.subnet_id for net in landing_zone.networking.vpc.
                    _select_subnet_objects(subnet_group_name='Hadoop')
                ],
            ))

        self.cluster.add_depends_on(job_flow_instance_profile)
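
The ingress loop in Example #6 reads a module-level services mapping that is not shown in the snippet; a hypothetical definition with the shape the loop expects (port numbers mapped to rule descriptions; the real ports live in the original project):

# Hypothetical example of the module-level `services` mapping used above.
services = {
    8088: 'YARN ResourceManager UI',
    9870: 'HDFS NameNode UI',
    18080: 'Spark History Server',
    8888: 'Hue',
}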
Example #7
    def __init__(self, scope: cdk.Construct, construct_id: str, **kwargs: str) -> None:
        super().__init__(scope, construct_id, **kwargs)

        self.vpc = ec2.Vpc(
            self,
            "aws-data-wrangler-vpc",
            cidr="11.19.224.0/19",
            enable_dns_hostnames=True,
            enable_dns_support=True,
        )
        cdk.Tags.of(self.vpc).add("Name", "aws-data-wrangler")
        self.key = kms.Key(
            self,
            id="aws-data-wrangler-key",
            description="Aws Data Wrangler Test Key.",
            policy=iam.PolicyDocument(
                statements=[
                    iam.PolicyStatement(
                        sid="Enable IAM User Permissions",
                        effect=iam.Effect.ALLOW,
                        actions=["kms:*"],
                        principals=[iam.AccountRootPrincipal()],
                        resources=["*"],
                    )
                ]
            ),
        )
        kms.Alias(
            self,
            "aws-data-wrangler-key-alias",
            alias_name="alias/aws-data-wrangler-key",
            target_key=self.key,
        )
        self.bucket = s3.Bucket(
            self,
            id="aws-data-wrangler",
            block_public_access=s3.BlockPublicAccess(
                block_public_acls=True,
                block_public_policy=True,
                ignore_public_acls=True,
                restrict_public_buckets=True,
            ),
            lifecycle_rules=[
                s3.LifecycleRule(
                    id="CleaningUp",
                    enabled=True,
                    expiration=cdk.Duration.days(1),
                    abort_incomplete_multipart_upload_after=cdk.Duration.days(1),
                ),
            ],
            versioned=True,
        )
        glue_db = glue.Database(
            self,
            id="aws_data_wrangler_glue_database",
            database_name="aws_data_wrangler",
            location_uri=f"s3://{self.bucket.bucket_name}",
        )
        log_group = logs.LogGroup(
            self,
            id="aws_data_wrangler_log_group",
            retention=logs.RetentionDays.ONE_MONTH,
        )
        log_stream = logs.LogStream(
            self,
            id="aws_data_wrangler_log_stream",
            log_group=log_group,
        )
        cdk.CfnOutput(self, "Region", value=self.region)
        cdk.CfnOutput(
            self,
            "VPC",
            value=self.vpc.vpc_id,
            export_name="aws-data-wrangler-base-VPC",
        )
        cdk.CfnOutput(
            self,
            "PublicSubnet1",
            value=self.vpc.public_subnets[0].subnet_id,
            export_name="aws-data-wrangler-base-PublicSubnet1",
        )
        cdk.CfnOutput(
            self,
            "PublicSubnet2",
            value=self.vpc.public_subnets[1].subnet_id,
            export_name="aws-data-wrangler-base-PublicSubnet2",
        )
        cdk.CfnOutput(
            self,
            "PrivateSubnet",
            value=self.vpc.private_subnets[0].subnet_id,
            export_name="aws-data-wrangler-base-PrivateSubnet",
        )
        cdk.CfnOutput(
            self,
            "KmsKeyArn",
            value=self.key.key_arn,
            export_name="aws-data-wrangler-base-KmsKeyArn",
        )
        cdk.CfnOutput(
            self,
            "BucketName",
            value=self.bucket.bucket_name,
            export_name="aws-data-wrangler-base-BucketName",
        )
        cdk.CfnOutput(self, "GlueDatabaseName", value=glue_db.database_name)
        cdk.CfnOutput(self, "LogGroupName", value=log_group.log_group_name)
        cdk.CfnOutput(self, "LogStream", value=log_stream.log_stream_name)
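A sketch (not in the original source) of how another stack could consume the CloudFormation exports created by the CfnOutput export_name values above; the ConsumerStack name is illustrative, and the cdk alias is assumed to refer to aws_cdk.core (CDK v1):

from aws_cdk import core as cdk  # assumption: same alias as the example above


class ConsumerStack(cdk.Stack):
    def __init__(self, scope: cdk.Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)
        # Fn.import_value resolves the exports at deploy time.
        bucket_name = cdk.Fn.import_value("aws-data-wrangler-base-BucketName")
        kms_key_arn = cdk.Fn.import_value("aws-data-wrangler-base-KmsKeyArn")
        vpc_id = cdk.Fn.import_value("aws-data-wrangler-base-VPC")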
Example #8
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        with open('./props/tasksetting.json', 'r') as f1:
            py_json1 = json.load(f1)
            ts = json.dumps(py_json1)

        # with open('./props/mappingrule.json', 'r') as f2:
        #     py_json2 = json.load(f2)
        #     mr = json.dumps(py_json2)

        with open('./props/config.json', 'r') as f2:
            configuration = json.load(f2)

        def getMappingrules(self, table_list):
            rules = []
            for index, value in enumerate(table_list, 1):
                rules.append({
                    "rule-type": "selection",
                    "rule-id": str(index),
                    "rule-name": str(index),
                    "object-locator": {
                        "schema-name": value['schemaName'],
                        "table-name": value['tableName']
                    },
                    "rule-action": "include",
                    "filters": []
                })
            mapping_rules = {"rules": rules}
            return json.dumps(mapping_rules)

        # The code that defines your stack goes here
        S3Accessrole = _iam.Role(
            self,
            'dmsrole',
            assumed_by=_iam.ServicePrincipal('dms.amazonaws.com'),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    'AmazonS3FullAccess')
            ])

        raw_bucket = s3.Bucket(self,
                               'rawbucket',
                               bucket_name='rawbucket-datalake-cdk-oregon')
        raw_bucket.add_lifecycle_rule(
            enabled=configuration['s3LifecycleRule']['enabled'],
            expiration=core.Duration.days(
                configuration['s3LifecycleRule']['expiration']))

        #my_table = ddb.Table(self, id ='dunamoTable', table_name = 'testcdktable',
        #partition_key = ddb.Attribute(name ='lastname',type = ddb.AttributeType.STRING) )

        dl_dms = _dms.CfnReplicationInstance(
            self,
            'dmsreplication',
            replication_instance_class=configuration['DMS_instance_setting']
            ['instance_class'],
            replication_instance_identifier='datalake-instance-cdk',
            allocated_storage=configuration['DMS_instance_setting']
            ['allocated_storage'])

        source_endpoint = _dms.CfnEndpoint(
            self,
            'sourceendpoint',
            endpoint_type='source',
            engine_name=configuration['engineName'],
            database_name=configuration['databaseName'],
            username=configuration['username'],
            password=configuration['password'],
            port=configuration['port'],
            server_name=configuration['serverName'],
        )

        target_endpoint = _dms.CfnEndpoint(
            self,
            'targetendpoint',
            endpoint_type='target',
            engine_name='s3',
            s3_settings={
                'bucketName': raw_bucket.bucket_name,
                'serviceAccessRoleArn': S3Accessrole.role_arn
            },
            extra_connection_attributes='dataFormat=parquet')

        dms_task = _dms.CfnReplicationTask(
            self,
            'data2lake-task',
            migration_type='full-load-and-cdc',
            replication_instance_arn=dl_dms.ref,
            source_endpoint_arn=source_endpoint.ref,
            target_endpoint_arn=target_endpoint.ref,
            replication_task_settings=ts,
            table_mappings=getMappingrules(self, configuration['tableList']))

        my_table = ddb.Table(self,
                             id='dynamoTable',
                             table_name='ControllerTable',
                             partition_key=ddb.Attribute(
                                 name='path', type=ddb.AttributeType.STRING),
                             billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

        datalake_bucket = s3.Bucket(self,
                                    'datalakebucket',
                                    bucket_name='datalake-bucket-cdk-oregon')

        glue_role = _iam.Role(
            self,
            'gluerole',
            assumed_by=_iam.ServicePrincipal('glue.amazonaws.com'),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AWSGlueServiceRole')
            ])

        raw_bucket.grant_read(glue_role)
        datalake_bucket.grant_read_write(glue_role)

        # Lake Formation settings
        # If you have attached the managed policy 'AWSLakeFormationDataAdmin' to your own IAM user, extend that policy to allow "lakeformation:PutDataLakeSettings"
        # so that the data lake settings below can be applied by the CDK.
        lake_admin_setting = _lakeformation.CfnDataLakeSettings(
            self,
            'data-lake-GrantAdmin',
            admins=[
                _lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
                    data_lake_principal_identifier=configuration[
                        'executiveArn'])
            ])

        glue_database = _glue.Database(self,
                                       'gluedatabase',
                                       database_name='data_lake_gluedb')

        glue_database.node.add_dependency(lake_admin_setting)

        glue_role_permission_inLakeFormation = _lakeformation.CfnPermissions(
            self,
            'permission-glueRole',
            data_lake_principal=_lakeformation.CfnPermissions.
            DataLakePrincipalProperty(
                data_lake_principal_identifier=glue_role.role_arn),
            resource=_lakeformation.CfnPermissions.ResourceProperty(
                database_resource=_lakeformation.CfnPermissions.
                DatabaseResourceProperty(name=glue_database.database_name)),
            permissions=['ALL'])

        crawler = _glue.CfnCrawler(
            self,
            'datalakecrawler',
            name='Crawler-datalake-cdk',
            role=glue_role.role_arn,
            targets={
                's3Targets': [{
                    'path':
                    's3://' + datalake_bucket.bucket_name + '/datalake/'
                }]
            },
            database_name='data_lake_gluedb',
            configuration=
            "{\"Version\":1.0,\"CrawlerOutput\":{\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"},\"Tables\":{\"AddOrUpdateBehavior\":\"MergeNewColumns\"}}}"
        )

        initialload_script = S3Assets.Asset(self,
                                            'initial-load-code',
                                            path='./Gluejob/InitialLoad.py')
        incrementalload_script = S3Assets.Asset(
            self, 'incremental-load-code', path='./Gluejob/IncrementalLoad.py')

        initialload_script.grant_read(glue_role)
        incrementalload_script.grant_read(glue_role)
        my_table.grant_full_access(glue_role)

        initial_load_job = _glue.CfnJob(
            self,
            'initial-job',
            name='InitialLoad-cdk',
            command=_glue.CfnJob.JobCommandProperty(
                name='glueetl',
                python_version='3',
                script_location='s3://' + initialload_script.s3_bucket_name +
                '/' + initialload_script.s3_object_key),
            role=glue_role.role_arn,
            default_arguments={
                '--prefix': str(configuration['tableList']),
                '--bucket': raw_bucket.bucket_name,
                '--datalake_bucket': datalake_bucket.bucket_name,
                '--datalake_prefix': 'datalake/',
                '--region': CdkpyStack.of(self).region,
                '--controller_table_name': my_table.table_name
            },
            allocated_capacity=configuration['glue_job_setting']
            ['job_capacity'],
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=configuration['glue_job_setting']
                ['max_concurrent_run_JobExecution']))

        incremental_load_job = _glue.CfnJob(
            self,
            'increment-job',
            name='IncrementalLoad-cdk',
            command=_glue.CfnJob.JobCommandProperty(
                name='glueetl',
                script_location='s3://' +
                incrementalload_script.s3_bucket_name + '/' +
                incrementalload_script.s3_object_key,
                python_version='3'),
            role=glue_role.role_arn,
            default_arguments={
                '--prefix': str(configuration['tableList']),
                '--bucket': raw_bucket.bucket_name,
                '--datalake_bucket': datalake_bucket.bucket_name,
                '--datalake_prefix': 'datalake/',
                '--region': CdkpyStack.of(self).region,
                '--controller_table_name': my_table.table_name
            },
            allocated_capacity=2,
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=1))

        job_trigger = _glue.CfnTrigger(
            self,
            'datalake-glue-trigger',
            type='SCHEDULED',
            schedule=configuration['job_trigger_schedule'],
            start_on_creation=False,
            actions=[
                _glue.CfnTrigger.ActionProperty(job_name='IncrementalLoad-cdk')
            ])

        dl_sns = _sns.Topic(self, 'datalake_sns', display_name='data-lake-sns')

        endpoint_email = configuration['emailSubscriptionList']

        for emails in endpoint_email:
            dl_sns.add_subscription(_subscrption.EmailSubscription(emails))

        #Another way to subscribe: dl_subscription = _sns.Subscription(self,'email-subscrption',topic = dl_sns,endpoint='*****@*****.**',protocol= _sns.SubscriptionProtocol.EMAIL)

        glue_events_target = _events_targets.SnsTopic(dl_sns)

        glue_events_rule = _events.Rule(
            self,
            'gluejobevents-datalake',
            description='Using for tracking the failed glue job of data lake',
            rule_name='dl-gluejob-event',
            event_pattern=_events.EventPattern(
                source=['aws.glue'],
                detail_type=['Glue Job State Change'],
                detail={
                    "jobName": [initial_load_job.name],
                    "state": ["FAILED"]
                }),
            targets=[glue_events_target])

        dms_subscription = _dms.CfnEventSubscription(
            self,
            'dmsevents-datalake',
            sns_topic_arn=dl_sns.topic_arn,
            subscription_name='datalake-dmsevents',
            source_type='replication-task',
            event_categories=['failure'])
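
The stack above reads ./props/config.json at synth time; a hypothetical minimal configuration covering the keys the code accesses (all values are placeholders, not from the original project):

# Placeholder contents of ./props/config.json, shown as the equivalent Python dict.
configuration = {
    "s3LifecycleRule": {"enabled": True, "expiration": 30},
    "DMS_instance_setting": {"instance_class": "dms.t3.medium",
                             "allocated_storage": 50},
    "engineName": "mysql",
    "databaseName": "sourcedb",
    "username": "admin",
    "password": "change-me",
    "port": 3306,
    "serverName": "source-db.example.com",
    "tableList": [{"schemaName": "sourcedb", "tableName": "orders"}],
    "executiveArn": "arn:aws:iam::123456789012:user/data-lake-admin",
    "glue_job_setting": {"job_capacity": 5,
                         "max_concurrent_run_JobExecution": 1},
    "job_trigger_schedule": "cron(0 1 * * ? *)",
    "emailSubscriptionList": ["user@example.com"],
}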
Example #9
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        s3_logs_bucket = s3.Bucket(
            self,
            "LogsBucket",
            encryption=s3.BucketEncryption.KMS_MANAGED,
            block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
            lifecycle_rules=[
                s3.LifecycleRule(
                    abort_incomplete_multipart_upload_after=core.Duration.days(
                        7),
                    expiration=core.Duration.days(30))
            ])

        s3_data_bucket = s3.Bucket(
            self,
            "DataBucket",
            encryption=s3.BucketEncryption.KMS_MANAGED,
            block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
            server_access_logs_bucket=s3_logs_bucket,
            server_access_logs_prefix=f"s3accesslogs/{PROJECT_NAME}/")

        glue_database = glue.Database(self,
                                      "GlueDatabase",
                                      database_name=PROJECT_NAME)

        glue_table = glue.Table(
            self,
            "GlueTable",
            columns=[
                glue.Column(name="timestamp",
                            type=glue.Type(input_string="int",
                                           is_primitive=True)),
                glue.Column(name="celcius",
                            type=glue.Type(input_string="double",
                                           is_primitive=True)),
                glue.Column(name="fahrenheit",
                            type=glue.Type(input_string="double",
                                           is_primitive=True))
            ],
            database=glue_database,
            data_format=glue.DataFormat(
                input_format=glue.InputFormat(
                    "org.apache.hadoop.mapred.TextInputFormat"),
                output_format=glue.OutputFormat(
                    "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
                ),
                serialization_library=glue.SerializationLibrary(
                    "org.openx.data.jsonserde.JsonSerDe")),
            table_name=PROJECT_NAME,
            encryption=glue.TableEncryption.S3_MANAGED,
            partition_keys=[
                glue.Column(name="year",
                            type=glue.Type(input_string="int",
                                           is_primitive=True)),
                glue.Column(name="month",
                            type=glue.Type(input_string="int",
                                           is_primitive=True)),
                glue.Column(name="day",
                            type=glue.Type(input_string="int",
                                           is_primitive=True))
            ])

        glue_crawler_role = iam.Role(
            self,
            "GlueCrawlerRole",
            assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "AWSGlueServiceRole")
            ])

        s3_data_bucket.grant_read(glue_crawler_role,
                                  objects_key_pattern=f"{PROJECT_PREFIX}/")
        s3_data_bucket.grant_put(glue_crawler_role,
                                 objects_key_pattern=f"{PROJECT_PREFIX}/")

        glue_crawler = glue.CfnCrawler(
            self,
            "GlueCrawler",
            role=glue_crawler_role.role_arn,
            database_name=glue_database.database_name,
            targets={
                "s3Targets": [{
                    "path":
                    f"{s3_data_bucket.bucket_name}/{PROJECT_PREFIX}/"
                }]
            },
            schedule={"scheduleExpression": "cron(30 04 * * ? *)"})
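
Example #9 depends on module-level PROJECT_NAME and PROJECT_PREFIX constants that the snippet does not show; hypothetical placeholders with the shape the code expects:

# Placeholder values; the originals come from the project's configuration.
PROJECT_NAME = "temperature-telemetry"
PROJECT_PREFIX = "temperature-telemetry"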
Example #10
    def __init__(self, scope: core.Construct, id: str, config_dict,
                 **kwargs) -> None:
        super().__init__(scope, id, **kwargs)
        """ Create the datalake database """
        createDatalakeDB = glue.Database(
            self,
            "createDatalakeDB",
            database_name=config_dict['datalake_db_name'])

        core.CfnOutput(self,
                       "createDatalakeDBName",
                       value=createDatalakeDB.database_name)
        """ Create Comp Reg Table """

        createDatalakeCompRegTable = glue.Table(
            self,
            "createDatalakeCompRegTable",
            columns=[
                glue.Column(name="lot_compound_id",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="version_id",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="parent_id",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="smiles",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="parent_mw",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="salt_multiplicity",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="salt_name",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="formula_weight",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="parent_alias",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="stereochemistry",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="stereocomment",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="geometric_isomerism",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="parent_comment",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="parent_project",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="elnref",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="msmethod",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="msmass",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="provider",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="purity",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="puritymethod",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="nmrshifts",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="lotalias",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="lot_comment",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="lot_project",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="molfile",
                            type=glue.Type(input_string="string",
                                           is_primitive=True)),
                glue.Column(name="checksum",
                            type=glue.Type(input_string="string",
                                           is_primitive=True))
            ],
            database=createDatalakeDB.from_database_arn(
                self, "GetDBArn", database_arn=createDatalakeDB.database_arn),
            data_format=glue.DataFormat(
                input_format=glue.InputFormat.PARQUET,
                output_format=glue.OutputFormat.PARQUET,
                serialization_library=glue.SerializationLibrary.PARQUET),
            table_name="tbl_compound_data",
            bucket=s3.Bucket.from_bucket_name(
                self,
                "getIBucket",
                bucket_name=config_dict['datalake_bucket_name']),
            compressed=True,
            description=
            "This table contains data regarding compound registration coming from  RDS",
            partition_keys=[
                glue.Column(name="dt",
                            type=glue.Type(input_string="string",
                                           is_primitive=True))
            ],
            s3_prefix="compound_reg/compound_data/")

        core.CfnOutput(self,
                       "createDatalakeCompRegTableName",
                       value=createDatalakeCompRegTable.table_name)
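
The long run of identical string columns in Example #10 could also be built in a loop; a sketch that produces the same columns list (column names copied from the example, glue assumed to be aws_cdk.aws_glue):

        # Build the all-string column list programmatically instead of repeating
        # glue.Column/glue.Type for every field.
        string_type = glue.Type(input_string="string", is_primitive=True)
        column_names = [
            "lot_compound_id", "version_id", "parent_id", "smiles", "parent_mw",
            "salt_multiplicity", "salt_name", "formula_weight", "parent_alias",
            "stereochemistry", "stereocomment", "geometric_isomerism",
            "parent_comment", "parent_project", "elnref", "msmethod", "msmass",
            "provider", "purity", "puritymethod", "nmrshifts", "lotalias",
            "lot_comment", "lot_project", "molfile", "checksum",
        ]
        columns = [glue.Column(name=name, type=string_type) for name in column_names]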
Example #11
    def __init__(self, scope: core.Construct, construct_id: str,
                 **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)
        # create db for glue schema
        glue_db = glue.Database(
            self,
            'GlueDB',
            database_name='reddit_data',
        )

        # data schema
        glue_table = glue.Table(
            self,
            'GlueTable',
            table_name='sentiment',
            columns=[
                glue.Column(name='@timestamp', type=glue.Schema.TIMESTAMP),
                glue.Column(name='id', type=glue.Schema.STRING),
                glue.Column(name='subreddit', type=glue.Schema.STRING),
                glue.Column(name='body', type=glue.Schema.STRING),
                glue.Column(name='is_submitter', type=glue.Schema.BOOLEAN),
                glue.Column(name='polarity', type=glue.Schema.FLOAT),
                glue.Column(name='subjectivity', type=glue.Schema.FLOAT),
                glue.Column(name='author', type=glue.Schema.STRING),
            ],
            database=glue_db,
            data_format=glue.DataFormat.PARQUET,
            bucket=s3.Bucket.from_bucket_arn(self, 'DataBucket', BUCKET_ARN),
            s3_prefix='reddit/',
        )

        # role assumed by firehose
        stream_role = iam.Role(
            self,
            'FirehoseRole',
            assumed_by=iam.ServicePrincipal('firehose.amazonaws.com'),
            description='role used by Firehose to access s3 bucket',
        )

        # add s3 statement
        stream_role.add_to_policy(
            iam.PolicyStatement(
                resources=[BUCKET_ARN, f'{BUCKET_ARN}/*'],
                actions=[
                    's3:AbortMultipartUpload',
                    's3:GetBucketLocation',
                    's3:GetObject',
                    's3:ListBucket',
                    's3:ListBucketMultipartUploads',
                    's3:PutObject',
                ],
            ))

        # add glue statement
        stream_role.add_to_policy(
            iam.PolicyStatement(
                resources=[
                    glue_table.table_arn,
                    glue_db.database_arn,
                    glue_db.catalog_arn,
                ],
                actions=[
                    'glue:GetTable',
                    'glue:GetTableVersion',
                    'glue:GetTableVersions',
                ],
            ))

        # cloudwatch statement
        stream_role.add_to_policy(
            iam.PolicyStatement(
                resources=['*'],
                actions=[
                    'logs:PutLogEvents',
                ],
            ))

        data_format_conversion_configuration = kf.CfnDeliveryStream.DataFormatConversionConfigurationProperty(
            enabled=True,
            input_format_configuration=kf.CfnDeliveryStream.
            InputFormatConfigurationProperty(
                deserializer=kf.CfnDeliveryStream.DeserializerProperty(
                    hive_json_ser_de=kf.CfnDeliveryStream.
                    HiveJsonSerDeProperty(), ), ),
            output_format_configuration=kf.CfnDeliveryStream.
            OutputFormatConfigurationProperty(
                serializer=kf.CfnDeliveryStream.SerializerProperty(
                    parquet_ser_de=kf.CfnDeliveryStream.ParquetSerDeProperty(),
                ), ),
            schema_configuration=kf.CfnDeliveryStream.
            SchemaConfigurationProperty(
                database_name=glue_db.database_name,
                table_name=glue_table.table_name,
                role_arn=stream_role.role_arn,
                region='us-east-2',
            ),
        )

        s3_config = kf.CfnDeliveryStream.ExtendedS3DestinationConfigurationProperty(
            bucket_arn=BUCKET_ARN,  # temporary, will replace with env variable
            role_arn=stream_role.role_arn,
            data_format_conversion_configuration=
            data_format_conversion_configuration,
            prefix='reddit/',
            buffering_hints=kf.CfnDeliveryStream.BufferingHintsProperty(
                size_in_m_bs=64, ),
        )

        firehose = kf.CfnDeliveryStream(
            self,
            'FirehoseStream',
            delivery_stream_name='RedditDataStream',
            extended_s3_destination_configuration=s3_config,
        )

        # add role dependency
        firehose.node.add_dependency(stream_role)

        # add ECS Fargate instance
        app_role = iam.Role(
            self,
            'RedditStreamingAppRole',
            assumed_by=iam.ServicePrincipal('ecs-tasks.amazonaws.com'),
            description=
            'Role used by the Reddit Streaming Application Fargate Task',
        )

        # add firehose permissions
        app_role.add_to_policy(
            iam.PolicyStatement(
                resources=[firehose.attr_arn],
                actions=[
                    'firehose:DeleteDeliveryStream',
                    'firehose:PutRecord',
                    'firehose:PutRecordBatch',
                    'firehose:UpdateDestination',
                ],
            ))

        # add ecs and cloudwatch permissions
        app_role.add_to_policy(
            iam.PolicyStatement(
                resources=['*'],
                actions=[
                    'ecr:GetAuthorizationToken',
                    'ecr:BatchCheckLayerAvailability',
                    'ecr:GetDownloadUrlForLayer',
                    'ecr:BatchGetImage',
                    'logs:CreateLogStream',
                    'logs:PutLogEvents',
                ],
            ))

        vpc = ec2.Vpc(self, 'RedditVpc', max_azs=3)

        cluster = ecs.Cluster(self, 'RedditCluster', vpc=vpc)

        task_definition = ecs.FargateTaskDefinition(
            self,
            'TaskDefinition',
            memory_limit_mib=512,
            cpu=256,
            task_role=app_role,
        )

        task_definition.add_container(
            id='RedditStreamingApp',
            image=ecs.ContainerImage.from_asset('./sentiment_analysis'),
            command=['all'],
            environment={
                'FIREHOSE_STREAM_NAME': firehose.delivery_stream_name,
                'PRAW_CLIENT_SECRET': os.environ['PRAW_CLIENT_SECRET'],
                'PRAW_CLIENT_ID': os.environ['PRAW_CLIENT_ID'],
                'PRAW_USER_AGENT': os.environ['PRAW_USER_AGENT'],
            },
            logging=ecs.LogDriver.aws_logs(stream_prefix='reddit'),
        )

        container = ecs.FargateService(
            self,
            'StreamingApplication',
            desired_count=1,
            task_definition=task_definition,
            cluster=cluster,
            assign_public_ip=True,
        )
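
The Reddit streaming example above relies on a module-level BUCKET_ARN constant and an os import (for the PRAW environment variables) that are not shown; hypothetical placeholders:

import os  # needed for the os.environ lookups above

# Placeholder; the real bucket ARN comes from the original project's configuration.
BUCKET_ARN = 'arn:aws:s3:::my-reddit-data-bucket'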
Example #12
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        # Kinesis to lambda
        self.stream_lambda = kinesis_lambda.KinesisStreamsToLambda(
            self,
            'clickstream',
            lambda_function_props=_lambda.FunctionProps(
                runtime=_lambda.Runtime.PYTHON_3_7,
                handler='index.lambda_handler',
                code=_lambda.Code.inline(
                    get_code('send_data_to_firehose.py'))),
            kinesis_stream_props=kinesis.StreamProps(
                stream_name='clickstream',
                retention_period=core.Duration.days(1),
                shard_count=4),
            kinesis_event_source_props=lambda_sources.KinesisEventSourceProps(
                starting_position=_lambda.StartingPosition.TRIM_HORIZON,
                batch_size=1))

        # Lambda to produce data
        self.produce_fake_data = _lambda.Function(
            self,
            'produce_data',
            runtime=_lambda.Runtime.PYTHON_3_7,
            timeout=core.Duration.seconds(90),
            handler='index.lambda_handler',
            code=_lambda.Code.inline(get_code('produce_data.py')),
            environment={
                'STREAM_NAME': self.stream_lambda.kinesis_stream.stream_name
            })
        self.stream_lambda.kinesis_stream.grant_read_write(
            self.produce_fake_data)

        # EventBridge to activate my function above
        self.event_rule = events.Rule(
            self,
            'scheduledRule',
            schedule=events.Schedule.expression('rate(1 minute)'))
        self.event_rule.add_target(
            targets.LambdaFunction(self.produce_fake_data))

        # S3 Bucket
        self.bucket = s3.Bucket(self,
                                'data-clicks-lake',
                                removal_policy=core.RemovalPolicy.DESTROY,
                                auto_delete_objects=True)

        # Glue
        self.glue_db_analytical = glue.Database(
            self,
            'analytic_clickstream',
            database_name='clickstream_db',
            location_uri=None,
        )

        self.glue_table_analytical = glue.Table(
            self,
            'analytical-table',
            table_name='analytical-table',
            columns=[
                glue_column('custid', 'int'),
                glue_column('trafficfrom', 'string'),
                glue_column('url', 'string'),
                glue_column('device', 'string'),
                glue_column('touchproduct', 'int'),
                glue_column('trans_timestamp', 'string')
            ],
            database=self.glue_db_analytical,
            data_format=glue.DataFormat.PARQUET,
            bucket=self.bucket,
            s3_prefix='kinesis/',
        )

        # Firehose
        iam_role_firehose_analytical = self.create_firehose_role()
        self.bucket.grant_read_write(iam_role_firehose_analytical)

        firehose_props = FirehoseProps(
            bucket=self.bucket,
            role=iam_role_firehose_analytical,
            stream=self.stream_lambda.kinesis_stream,
            glue_db=self.glue_db_analytical,
            glue_table=self.glue_table_analytical)

        self.firehose = FirehoseLib(self, 'firehose_clickstream',
                                    firehose_props)

        # Elasticsearch
        self.es_domain = ElasticsearchLib(self,
                                          'ES-clickstream-domain').es_domain

        # Lambda to send data to Elasticsearch
        self.send_data_to_elasticsearch = lambda_python.PythonFunction(
            self,
            'clickstream_to_es',
            entry='./analytics_ml_flow/lambda/lambda_with_requirements/',
            handler='handler',
            timeout=core.Duration.seconds(180),
            index='Kinesis_ES.py',
            environment={
                'ES_HOST_HTTP': self.es_domain.domain_endpoint,
                'ES_INDEX': 'clickstream',
                'ES_IND_TYPE': 'transactions',
                'ES_REGION': 'us-west-2',
            })
        self.es_domain.grant_index_read_write('clickstream',
                                              self.send_data_to_elasticsearch)
        self.es_domain.grant_read_write(self.send_data_to_elasticsearch)

        stream_source = lambda_sources.KinesisEventSource(
            self.stream_lambda.kinesis_stream,
            starting_position=_lambda.StartingPosition.TRIM_HORIZON,
            batch_size=1)

        self.stream_lambda.kinesis_stream.grant_read(
            self.send_data_to_elasticsearch)
        self.send_data_to_elasticsearch.add_event_source(stream_source)

        # Glue Crawler
        crawler_role = self.create_crawler_permissions()
        glue_props = GlueCrawlerProps(bucket=self.bucket, role=crawler_role)
        self.glue_crawler = GlueCrawlerLib(self, 'glueCrawler', glue_props)
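
glue_column, FirehoseProps/FirehoseLib, ElasticsearchLib, and GlueCrawlerLib are project-local helpers not shown in the snippet. A plausible sketch of glue_column, assuming it simply wraps glue.Column with a primitive type (the original implementation may differ):

def glue_column(name: str, type_name: str) -> glue.Column:
    # Matches calls like glue_column('custid', 'int') in the example above.
    return glue.Column(
        name=name,
        type=glue.Type(input_string=type_name, is_primitive=True),
    )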
Example #13
    def __init__(self, scope: core.Construct, id: str, source_bucket_name: str,
                 glue_database_name: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        # get the source bucket - this object is an IBucketProxy interface, not a Bucket construct.
        # It cannot be used to add an event directly. Instead, use a custom resource to add an event trigger later
        source_bucket = s3.Bucket.from_bucket_name(
            self, "MySourceBucket", bucket_name=source_bucket_name)

        # create the new destination bucket - this bucket holds the csv file that contains the FITS header information
        # the name of the bucket will be <stack-id>-fitsstorebucketXXXXXXXX-YYYYYYYYYYYYY
        # e.g. my-fits-datalake-fitsstorebucket1234567f-098765432d
        target_bucket = s3.Bucket(self, "FITSSTORE_BUCKET")

        # Add the astropy and numpy layers for the lambda function that is used as the event trigger on the source_bucket
        layer_astropy = lambda_.LayerVersion(
            self,
            'AstroFitsioLayer',
            code=lambda_.Code.from_asset("resources_layer/astropy.zip"),
            compatible_runtimes=[lambda_.Runtime.PYTHON_3_7])
        # use an AWS provided layer for numpy
        layer_numpy = lambda_.LayerVersion.from_layer_version_arn(
            self, "NumpyLayer",
            "arn:aws:lambda:us-east-1:668099181075:layer:AWSLambda-Python37-SciPy1x:22"
        )

        # create the FITS header extractor lambda function
        # pass the FITSSTORE_BUCKET to the lambda function as an environment variable
        handler = lambda_.Function(
            self,
            "FITSHeaderExtractorHandler",
            runtime=lambda_.Runtime.PYTHON_3_7,
            code=lambda_.Code.asset("resources"),
            handler="fits_header_extractor.fits_header_extractor_handler",
            environment=dict(FITSSTORE_BUCKET=target_bucket.bucket_name),
            layers=[layer_astropy, layer_numpy])

        # grant read access to handler on source bucket
        source_bucket.grant_read(handler)

        # Give the lambda a resource-based policy;
        # both source_arn and source_account are needed for security reasons
        handler.add_permission(
            's3-trigger-lambda-s3-invoke-function',
            principal=iam_.ServicePrincipal('s3.amazonaws.com'),
            action='lambda:InvokeFunction',
            source_arn=source_bucket.bucket_arn,
            source_account=self.account)

        # grant access to the handler
        # - this is a lot easier than adding policies, but not all constructs support this
        target_bucket.grant_read_write(handler)

        # map the put event to handler - this doesn't work as source_bucket is not really a Bucket object (IBucketProxy)
        # You can use this approach if the bucket is created as a new Bucket object
        #notification = s3_notifications.LambdaDestination(handler)
        #source_bucket.add_object_created_notification(self, notification )

        # use a custom resource to add an event trigger on the source bucket -
        # the custom resource creation makes an SDK call to create the event notification on the source bucket
        # Action reference https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/S3.html
        # Events reference https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html
        custom_s3_resource = custom_resources_.AwsCustomResource(
            self,
            's3-putobject-custom-notification-resource',
            policy=custom_resources_.AwsCustomResourcePolicy.from_statements([
                iam_.PolicyStatement(effect=iam_.Effect.ALLOW,
                                     resources=['*'],
                                     actions=['s3:PutBucketNotification'])
            ]),
            on_create=custom_resources_.AwsSdkCall(
                service="S3",
                action="putBucketNotificationConfiguration",
                parameters={
                    "Bucket": source_bucket.bucket_name,
                    "NotificationConfiguration": {
                        "LambdaFunctionConfigurations": [{
                            "Events":
                            ['s3:ObjectCreated:*', 's3:ObjectRemoved:*'],
                            "LambdaFunctionArn":
                            handler.function_arn,
                            "Filter": {
                                "Key": {
                                    "FilterRules": [{
                                        'Name': 'suffix',
                                        'Value': 'fits'
                                    }]
                                }
                            }
                        }]
                    }
                },
                physical_resource_id=custom_resources_.PhysicalResourceId.of(
                    f's3-notification-resource-{str(uuid.uuid1())}'),
                region=self.region))

        # Make sure the lambda function is created first
        custom_s3_resource.node.add_dependency(
            handler.permissions_node.find_child(
                's3-trigger-lambda-s3-invoke-function'))

        # create a glue crawler to build the data catalog
        # Step 1 . create a role for AWS Glue
        glue_role = iam_.Role(
            self,
            "glue_role",
            assumed_by=iam_.ServicePrincipal('glue.amazonaws.com'),
            managed_policies=[
                iam_.ManagedPolicy.from_managed_policy_arn(
                    self,
                    'MyFitsCrawlerGlueRole',
                    managed_policy_arn=
                    'arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole')
            ])
        # glue role needs "*" read/write - otherwise crawler will not be able to create tables (and no error messages in crawler logs)
        glue_role.add_to_policy(
            iam_.PolicyStatement(actions=[
                's3:GetObject', 's3:PutObject', 'lakeformation:GetDataAccess'
            ],
                                 effect=iam_.Effect.ALLOW,
                                 resources=['*']))

        # Step 2. create a database in data catalog
        db = glue_.Database(self,
                            "MyFitsDatabase",
                            database_name=glue_database_name)

        # Step 3. create a crawler named "fitsdatalakecrawler-<hex>", and schedule to run every 15 mins
        # You can change the frequency based on your needs
        # cron schedule format cron(Minutes Hours Day-of-month Month Day-of-week Year)
        glue_.CfnCrawler(
            self,
            "fits-datalake-crawler",
            database_name=glue_database_name,
            role=glue_role.role_arn,
            schedule={"scheduleExpression": "cron(0/15 * * * ? *)"},
            targets={"s3Targets": [{
                "path": target_bucket.bucket_name
            }]},
        )

        # When your AWS Lake Formation Data Catalog settings are not set to
        # "Use only IAM access control for new databases" or
        # "Use only IAM access control for new tables in new databases",
        # you need to grant additional permissions on the data catalog database.
        # In order for the crawler to run, we need to add some permissions to Lake Formation

        location_resource = lakeformation_.CfnResource(
            self,
            "MyFitsDatalakeLocationResource",
            resource_arn=target_bucket.bucket_arn,
            use_service_linked_role=True)
        lakeformation_.CfnPermissions(
            self,
            "MyFitsDatalakeDatabasePermission",
            data_lake_principal=lakeformation_.CfnPermissions.
            DataLakePrincipalProperty(
                data_lake_principal_identifier=glue_role.role_arn),
            resource=lakeformation_.CfnPermissions.ResourceProperty(
                database_resource=lakeformation_.CfnPermissions.
                DatabaseResourceProperty(name=db.database_name)),
            permissions=["ALTER", "DROP", "CREATE_TABLE"],
        )
        location_permission = lakeformation_.CfnPermissions(
            self,
            "MyFitsDatalakeLocationPermission",
            data_lake_principal=lakeformation_.CfnPermissions.
            DataLakePrincipalProperty(
                data_lake_principal_identifier=glue_role.role_arn),
            resource=lakeformation_.CfnPermissions.ResourceProperty(
                data_location_resource=lakeformation_.CfnPermissions.
                DataLocationResourceProperty(
                    s3_resource=target_bucket.bucket_arn)),
            permissions=["DATA_LOCATION_ACCESS"],
        )
        # make sure the location resource is created first
        location_permission.node.add_dependency(location_resource)
    def __init__(self, scope: core.Construct, id: str, region_name: str,
                 db_name: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        # CloudTrail
        bucket = s3.Bucket(self, 'TrailBucket', versioned=True)
        trail = cloudtrail.Trail(self, 'CloudTrail', bucket=bucket)

        db = glue.Database(self, 'cloudtrail', database_name=db_name)

        awg = core.CfnResource(
            self,
            'AthenaWorkGroup',
            type="AWS::Athena::WorkGroup",
            properties={
                "Name": f"{db_name}",
                "State": "ENABLED",
                "WorkGroupConfiguration": {
                    "ResultConfiguration": {
                        "OutputLocation":
                        f"s3://{bucket.bucket_name}/athena_output/"
                    }
                }
            })
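        # Note: the typed aws_athena.CfnWorkGroup construct (used in a later example
        # below) is an alternative to this raw CfnResource.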

        # Pipeline for Working on Data
        project = codebuild.Project(
            self,
            'learner_build',
            build_spec=codebuild.BuildSpec.from_source_filename(
                'buildspec.yml'),
            environment_variables={
                'arn': {
                    'value': '-- Put ARN Here --'
                },
                'athena_database': {
                    'value': db_name
                },
                'region_name': {
                    'value': region_name
                },
                'bucket': {
                    'value': bucket.bucket_name
                }
            },
            source=codebuild.Source.s3(bucket=bucket,
                                       path='pipeline/learner.zip'))
        project.add_to_role_policy(
            iam.PolicyStatement(actions=['athena:*'], resources=['*']))

        project.add_to_role_policy(
            iam.PolicyStatement(actions=['iam:*'], resources=['*']))

        project.add_to_role_policy(
            iam.PolicyStatement(actions=['glue:*'], resources=['*']))

        project.add_to_role_policy(
            iam.PolicyStatement(actions=['s3:*'], resources=['*']))
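        # The build project is granted broad Athena, IAM, Glue and S3 access for
        # whatever the buildspec needs to do with the data catalog and roles.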

        # Lambdas and Api GW
        api = agw.RestApi(self,
                          "learner-api",
                          rest_api_name="Learner Service",
                          description="System to learn roles")

        switcher = lambda_.Function(
            self,
            "Switcher",
            runtime=lambda_.Runtime.PYTHON_3_8,
            code=lambda_.Code.from_asset("lambdas/switcher"),
            handler="main.handler",
        )
        switcher.add_to_role_policy(
            iam.PolicyStatement(actions=['iam:*'], resources=['*']))

        frontend = lambda_.Function(
            self,
            "Frontend",
            runtime=lambda_.Runtime.PYTHON_3_8,
            code=lambda_.Code.from_asset("lambdas/frontend"),
            handler="main.handler",
        )

        learner = lambda_.Function(
            self,
            "Learner",
            runtime=lambda_.Runtime.PYTHON_3_8,
            code=lambda_.Code.from_asset("lambdas/learner"),
            handler="main.handler",
            environment={
                'codebuild': project.project_name,
                'region_name': region_name
            })

        learner.add_to_role_policy(
            iam.PolicyStatement(actions=['codebuild:StartBuild'],
                                resources=[project.project_arn]))

        get_switcher_integration = agw.LambdaIntegration(
            switcher,
            request_templates={"application/json": '{ "statusCode": "200" }'})

        get_frontend_integration = agw.LambdaIntegration(
            frontend,
            request_templates={"application/json": '{ "statusCode": "200" }'})

        get_learner_integration = agw.LambdaIntegration(
            learner,
            request_templates={"application/json": '{ "statusCode": "200" }'})

        api.root.add_method("GET", get_frontend_integration)

        switch = api.root.add_resource('switch')
        switch.add_method("GET", get_switcher_integration)

        learn = api.root.add_resource('learn')
        learn.add_method("GET", get_learner_integration)

        # Outputs
        core.CfnOutput(self, 'BucketName', value=bucket.bucket_name)
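The Lambda assets above (lambdas/switcher, lambdas/frontend, lambdas/learner) each expose main.handler; that code is not part of this example. A minimal sketch of a handler compatible with the API Gateway proxy integration used here (the body is an assumption, not the original code):

import json


def handler(event, context):
    # Return an API Gateway proxy response; real routing/logic would go here
    return {
        'statusCode': 200,
        'headers': {'Content-Type': 'application/json'},
        'body': json.dumps({'message': 'ok', 'path': event.get('path', '/')}),
    }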
Exemple #15
0
    def create_glue_resources(self) -> None:
        '''Creates Glue Database and Tables
        '''
        if not hasattr(self, 'glue_attr'):
            self.prepare_glue_attr_types()
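        # prepare_glue_attr_types() is not shown here; a sketch of the attribute-type
        # helpers it is assumed to define follows after this method.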

        col = aws_glue.Column

        # Kinesis and Athena depend on data schema declarations that should
        # live in a Database and Tables in AWS Glue
        self.glue_db_analytical = aws_glue.Database(
            self,
            'sls-blog-analytical-db',
            database_name='sls-blog-analytical',
            location_uri=None,
        )

        self.glue_table_analytical = aws_glue.Table(
            self,
            'analytical-table',
            table_name='analytical-table',
            columns=[
                col(name='id', type=self.glue_attr_string),
                col(name='publish_timestamp', type=self.glue_attr_timestamp),
                col(name='publisher_email', type=self.glue_attr_string),
                col(name='publisher_name', type=self.glue_attr_string),
                col(name='item_type', type=self.glue_attr_string),
                col(name='title', type=self.glue_attr_string),
                col(name='body', type=self.glue_attr_string),
            ],
            database=self.glue_db_analytical,
            data_format=aws_glue.DataFormat.PARQUET,
            bucket=self.bucket_analytical,
            s3_prefix='kinesis/',
        )

        self.glue_table_likes = aws_glue.Table(
            self,
            'likes-table',
            table_name='likes-table',
            columns=[
                col(name='id', type=self.glue_attr_string),
                col(name='like', type=self.glue_attr_integer),
            ],
            database=self.glue_db_analytical,
            data_format=aws_glue.DataFormat.PARQUET,
            bucket=self.bucket_likes,
            s3_prefix='kinesis/',
        )

        self.glue_table_apirequests = aws_glue.Table(
            self,
            'apirequests-table',
            table_name='apirequests-table',
            columns=[
                col(name='id', type=self.glue_attr_string),
                col(name='item_type', type=self.glue_attr_string),
                col(name='http_method', type=self.glue_attr_string),
                col(name='timestamp', type=self.glue_attr_timestamp),
                col(name='datetime', type=self.glue_attr_date),
                col(name='ip_address', type=self.glue_attr_string),
                col(name='user_agent', type=self.glue_attr_string),
                col(name='origin', type=self.glue_attr_string),
                col(name='country_code', type=self.glue_attr_string),
                col(name='device_type', type=self.glue_attr_string),
                col(name='action', type=self.glue_attr_string),
                col(name='article_id', type=self.glue_attr_string),
            ],
            database=self.glue_db_analytical,
            data_format=aws_glue.DataFormat.PARQUET,
            bucket=self.bucket_apirequests,
            s3_prefix='kinesis/',
        )
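A minimal sketch of the attribute-type helpers assumed above (names inferred from their usage in create_glue_resources; the real prepare_glue_attr_types may differ):

    def prepare_glue_attr_types(self) -> None:
        '''Map shorthand attribute names to aws_glue.Schema types (sketch)'''
        self.glue_attr_string = aws_glue.Schema.STRING
        self.glue_attr_integer = aws_glue.Schema.INTEGER
        self.glue_attr_timestamp = aws_glue.Schema.TIMESTAMP
        self.glue_attr_date = aws_glue.Schema.DATE
        # Marker checked by create_glue_resources via hasattr()
        self.glue_attr = True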
Exemple #16
0
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        s3_org_data = _s3.Bucket(self,
                                 ORIGINAL_DATA_BUCKET_NAME,
                                 bucket_name=ORIGINAL_DATA_BUCKET_NAME,
                                 removal_policy=core.RemovalPolicy.RETAIN)
        s3_transformed_data = _s3.Bucket(
            self,
            TRANSFORMED_DATA_BUCKET_NAME,
            bucket_name=TRANSFORMED_DATA_BUCKET_NAME,
            removal_policy=core.RemovalPolicy.RETAIN)

        # title-read
        s3_deployment.BucketDeployment(
            self,
            "s3-deployment-{}".format(TITLE_READ),
            sources=[
                s3_deployment.Source.asset("data/{}/".format(TITLE_READ))
            ],
            destination_bucket=s3_org_data,
            destination_key_prefix="{}/".format(TITLE_READ))
        # title
        s3_deployment.BucketDeployment(
            self,
            "s3-deployment-{}".format(TITLE),
            sources=[s3_deployment.Source.asset("data/{}/".format(TITLE))],
            destination_bucket=s3_org_data,
            destination_key_prefix="{}/".format(TITLE))
        # user
        s3_deployment.BucketDeployment(
            self,
            "s3-deployment-{}".format(USER),
            sources=[s3_deployment.Source.asset("data/{}/".format(USER))],
            destination_bucket=s3_org_data,
            destination_key_prefix="{}/".format(USER))
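        # The three BucketDeployments above copy the local data/<prefix>/ folders
        # into the original-data bucket at deploy time.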

        statement = iam.PolicyStatement(actions=[
            "s3:*", "glue:*", "iam:ListRolePolicies", "iam:GetRole",
            "iam:GetRolePolicy"
        ],
                                        resources=["*"])
        write_to_s3_policy = iam.PolicyDocument(statements=[statement])

        glue_role = iam.Role(
            self,
            'GlueCrawlerRole-dna',
            role_name='GlueCrawlerRole-dna',
            inline_policies=[write_to_s3_policy],
            assumed_by=iam.ServicePrincipal('glue.amazonaws.com'),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AWSGlueServiceRole')
            ])

        # TODO: add IAM role for the CTAS lambda

        dna_database = glue.Database(self,
                                     "dna-glue-database-id",
                                     database_name=GLUE_DATABASE_NAME)

        # create glue tables
        title_read_table = glue.Table(
            self,
            "{}-table-id".format(TITLE_READ),
            table_name="{}_table".format(TITLE_READ).replace("-", "_"),
            database=dna_database,
            columns=[{
                "name": "USER_ID",
                "type": glue.Schema.STRING
            }, {
                "name": "ITEM_ID",
                "type": glue.Schema.STRING
            }, {
                "name": "TIMESTAMP",
                "type": glue.Schema.BIG_INT
            }, {
                "name": "TITLE",
                "type": glue.Schema.STRING
            }, {
                "name": "EVENT_TYPE",
                "type": glue.Schema.STRING
            }],
            data_format=glue.DataFormat.CSV,
            bucket=s3_org_data,
            s3_prefix=TITLE_READ)

        title_table = glue.Table(self,
                                 "{}-table-id".format(TITLE),
                                 table_name="{}_table".format(TITLE).replace(
                                     "-", "_"),
                                 database=dna_database,
                                 columns=[{
                                     "name": "ITEM_ID",
                                     "type": glue.Schema.STRING
                                 }, {
                                     "name": "CREATION_TIMESTAMP",
                                     "type": glue.Schema.BIG_INT
                                 }, {
                                     "name": "TITLE",
                                     "type": glue.Schema.STRING
                                 }, {
                                     "name": "TAG",
                                     "type": glue.Schema.STRING
                                 }],
                                 data_format=glue.DataFormat.CSV,
                                 bucket=s3_org_data,
                                 s3_prefix=TITLE)

        user_table = glue.Table(self,
                                "{}-table-id".format(USER),
                                table_name="{}_table".format(USER).replace(
                                    "-", "_"),
                                database=dna_database,
                                columns=[
                                    {
                                        "name": "USER_ID",
                                        "type": glue.Schema.STRING
                                    },
                                    {
                                        "name": "NAME",
                                        "type": glue.Schema.STRING
                                    },
                                    {
                                        "name": "EMAIL",
                                        "type": glue.Schema.STRING
                                    },
                                    {
                                        "name": "GENDER",
                                        "type": glue.Schema.STRING,
                                        "categorical": True
                                    },
                                    {
                                        "name": "AGE",
                                        "type": glue.Schema.BIG_INT,
                                        "categorical": True
                                    },
                                ],
                                data_format=glue.DataFormat.CSV,
                                bucket=s3_org_data,
                                s3_prefix=USER)

        _athena.CfnWorkGroup(self,
                             "athena_workgroup_id",
                             name=ATHENA_WORKGROUP)

        ctas_lambda_trigger = _event.Rule(
            self,
            "ctas-lambda-trigger-event-id",
            rule_name="ctas-lambda-trigger-event",
            schedule=_event.Schedule.cron(minute="10", hour="*"))
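        # This rule fires at minute 10 of every hour.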

        s3_statement = iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            # resources = [s3_bucket.bucket_arn, "{}/*".format(s3_bucket.bucket_arn)],
            resources=["*"],
            actions=["s3:*"])
        athena_statement = iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            resources=["*"],
            actions=["athena:StartQueryExecution", "glue:*"])

        ctas_lambda_func = _lambda.Function(
            self,
            "CTAS_query",
            function_name="CTAS_query",
            runtime=_lambda.Runtime.PYTHON_3_7,
            code=_lambda.Code.asset("src/lambda"),
            handler="ctas_lambda.lambda_handler",
            description="CTAS query to transform AVRO file, batch execution",
            environment={
                "BUCKET_NAME": s3_transformed_data.bucket_name,
                "DATABASE_NAME": GLUE_DATABASE_NAME,
                "ATHENA_WORKGROUP": ATHENA_WORKGROUP
            },
            timeout=core.Duration.minutes(3))
        ctas_lambda_func.add_to_role_policy(s3_statement)
        ctas_lambda_func.add_to_role_policy(athena_statement)

        ctas_lambda_trigger.add_target(
            _target.LambdaFunction(ctas_lambda_func))
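The ctas_lambda handler itself is not included in this example. A minimal sketch under the assumption that it issues an Athena CTAS query through boto3 (the table names, format and output prefixes below are placeholders, not the original code):

import os

import boto3

athena = boto3.client('athena')


def lambda_handler(event, context):
    # Placeholder CTAS statement; the real table names and options live in the project
    # (a real handler would also drop or rename the target table between runs)
    bucket = os.environ['BUCKET_NAME']
    query = (
        "CREATE TABLE title_read_transformed "
        f"WITH (format = 'AVRO', external_location = 's3://{bucket}/title-read/') AS "
        "SELECT * FROM title_read_table"
    )
    response = athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': os.environ['DATABASE_NAME']},
        ResultConfiguration={'OutputLocation': f's3://{bucket}/athena-results/'},
        WorkGroup=os.environ['ATHENA_WORKGROUP'],
    )
    return {'QueryExecutionId': response['QueryExecutionId']}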