def __init__(self, scope: core.Construct, data_lake: DataLake,
                 **kwargs) -> None:
        """Create the raw-zone Glue crawler and the ``orders`` table.

        Args:
            scope: Construct scope handed to the parent constructor.
            data_lake: Source of the environment name (``env.value``), the
                crawler role (``data_lake_role``), the raw bucket
                (``data_lake_raw_bucket``) and the raw database
                (``data_lake_raw_database``).
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        self.env = data_lake.env.value
        super().__init__(scope, id=f'{self.env}-glue-catalog', **kwargs)

        crawler_name = f'{self.env}-atomic-events-crawler'
        raw_bucket = data_lake.data_lake_raw_bucket
        raw_database = data_lake.data_lake_raw_database

        # Crawler that infers the schema of the atomic events prefix; the
        # cron expression fires every 15 minutes.
        self.atomic_events_crawler = glue.CfnCrawler(
            self,
            crawler_name,
            name=crawler_name,
            description=
            'Crawler to detect schema of data stored in data lake raw, atomic events',
            schedule=glue.CfnCrawler.ScheduleProperty(
                schedule_expression='cron(0/15 * * * ? *)'),
            role=data_lake.data_lake_role.role_arn,
            targets=glue.CfnCrawler.TargetsProperty(s3_targets=[
                glue.CfnCrawler.S3TargetProperty(
                    path=f's3://{raw_bucket.bucket_name}/atomic_events')
            ]),
            database_name=raw_database.database_name)

        # Column types must be Glue/Hive type names: 'timestamp' and 'int'
        # replace the previously declared 'datetime' and 'integer', which are
        # not valid catalog types.
        # NOTE(review): confirm the DMS output writes created_at in a form the
        # OpenCSV SerDe can read as a timestamp.
        self.orders_table = glue.Table(
            self,
            f'{self.env}-orders-table',
            table_name='orders',
            description='orders captured from Postgres using DMS CDC',
            database=raw_database,
            compressed=True,
            data_format=glue.DataFormat(
                input_format=glue.InputFormat.TEXT,
                output_format=glue.OutputFormat.HIVE_IGNORE_KEY_TEXT,
                serialization_library=glue.SerializationLibrary.OPEN_CSV),
            s3_prefix='orders',
            bucket=raw_bucket,
            columns=[
                glue.Column(name='created_at',
                            type=glue.Type(input_string='timestamp',
                                           is_primitive=True)),
                glue.Column(name='order_id',
                            type=glue.Type(input_string='int',
                                           is_primitive=True)),
                glue.Column(name='product_name',
                            type=glue.Type(input_string='string',
                                           is_primitive=True)),
                glue.Column(name='value',
                            type=glue.Type(input_string='float',
                                           is_primitive=True)),
            ])
Beispiel #2
0
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        """Define the ``poc`` Glue database and its JSON-backed ``my_table``.

        Args:
            scope: Construct scope handed to the parent constructor.
            id: Construct id handed to the parent constructor.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(scope, id, **kwargs)

        self._region = 'aws_region'
        self._account_id = 'aws_account_id'

        def primitive(type_name):
            # Shorthand for a primitive Glue column type.
            return glue.Type(input_string=type_name, is_primitive=True)

        # Existing bucket referenced by name; the table data lives under it.
        source_bucket = s3.Bucket.from_bucket_name(self, 'my_bucket_id',
                                                   'my_bucket')

        poc_database = glue.Database(self,
                                     id='my_database_id',
                                     database_name='poc')

        # Text input/output classes combined with the OpenX JSON SerDe.
        json_text_format = glue.DataFormat(
            input_format=glue.InputFormat(
                'org.apache.hadoop.mapred.TextInputFormat'),
            output_format=glue.OutputFormat(
                'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'),
            serialization_library=glue.SerializationLibrary(
                'org.openx.data.jsonserde.JsonSerDe'))

        data_columns = [
            glue.Column(name='col1', type=primitive('string')),
            glue.Column(name='col2', type=primitive('int')),
        ]
        partition_columns = [
            glue.Column(name='dt', type=primitive('string')),
        ]

        glue.Table(self,
                   id='my_table_id',
                   database=poc_database,
                   table_name='my_table',
                   columns=data_columns,
                   partition_keys=partition_columns,
                   bucket=source_bucket,
                   s3_prefix='test_data',
                   data_format=json_text_format)
Beispiel #3
0
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        """Provision the buckets, Glue database/table and crawler.

        Creates a logs bucket and a data bucket (both KMS-managed encryption,
        public access blocked), a Glue database and partitioned table named
        after ``PROJECT_NAME``, and a scheduled crawler whose role may read
        and put objects under ``PROJECT_PREFIX`` in the data bucket.

        Args:
            scope: Construct scope handed to the parent constructor.
            id: Construct id handed to the parent constructor.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(scope, id, **kwargs)

        # Access-log bucket: objects expire after 30 days and incomplete
        # multipart uploads are aborted after 7 days.
        s3_logs_bucket = s3.Bucket(
            self,
            "LogsBucket",
            encryption=s3.BucketEncryption.KMS_MANAGED,
            block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
            lifecycle_rules=[
                s3.LifecycleRule(
                    abort_incomplete_multipart_upload_after=core.Duration.days(
                        7),
                    expiration=core.Duration.days(30))
            ])

        # Data bucket; its server access logs go to the logs bucket.
        s3_data_bucket = s3.Bucket(
            self,
            "DataBucket",
            encryption=s3.BucketEncryption.KMS_MANAGED,
            block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
            server_access_logs_bucket=s3_logs_bucket,
            server_access_logs_prefix=f"s3accesslogs/{PROJECT_NAME}/")

        glue_database = glue.Database(self,
                                      "GlueDatabase",
                                      database_name=PROJECT_NAME)

        glue.Table(
            self,
            "GlueTable",
            columns=[
                glue.Column(name="timestamp",
                            type=glue.Type(input_string="int",
                                           is_primitive=True)),
                glue.Column(name="celcius",
                            type=glue.Type(input_string="double",
                                           is_primitive=True)),
                glue.Column(name="fahrenheit",
                            type=glue.Type(input_string="double",
                                           is_primitive=True))
            ],
            database=glue_database,
            data_format=glue.DataFormat(
                input_format=glue.InputFormat(
                    "org.apache.hadoop.mapred.TextInputFormat"),
                output_format=glue.OutputFormat(
                    "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
                ),
                serialization_library=glue.SerializationLibrary(
                    "org.openx.data.jsonserde.JsonSerDe")),
            table_name=PROJECT_NAME,
            encryption=glue.TableEncryption.S3_MANAGED,
            partition_keys=[
                glue.Column(name="year",
                            type=glue.Type(input_string="int",
                                           is_primitive=True)),
                glue.Column(name="month",
                            type=glue.Type(input_string="int",
                                           is_primitive=True)),
                glue.Column(name="day",
                            type=glue.Type(input_string="int",
                                           is_primitive=True))
            ])

        glue_crawler_role = iam.Role(
            self,
            "GlueCrawlerRole",
            assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "AWSGlueServiceRole")
            ])

        # The key pattern needs a trailing wildcard: without it the grant
        # covers only the single key "<prefix>/" and none of the objects
        # stored beneath the prefix.
        crawler_key_pattern = f"{PROJECT_PREFIX}/*"
        s3_data_bucket.grant_read(glue_crawler_role,
                                  objects_key_pattern=crawler_key_pattern)
        s3_data_bucket.grant_put(glue_crawler_role,
                                 objects_key_pattern=crawler_key_pattern)

        # Scheduled crawler over the project prefix of the data bucket.
        glue.CfnCrawler(
            self,
            "GlueCrawler",
            role=glue_crawler_role.role_arn,
            database_name=glue_database.database_name,
            targets={
                "s3Targets": [{
                    "path":
                    f"{s3_data_bucket.bucket_name}/{PROJECT_PREFIX}/"
                }]
            },
            schedule={"scheduleExpression": "cron(30 04 * * ? *)"})
Beispiel #4
0
    def __init__(self, scope: core.Construct, id: str, config_dict,
                 **kwargs) -> None:
        """Create the datalake Glue database and the compound-data table.

        Args:
            scope: Construct scope handed to the parent constructor.
            id: Construct id handed to the parent constructor.
            config_dict: Mapping read for ``'datalake_db_name'`` (database
                name) and ``'datalake_bucket_name'`` (existing bucket that
                holds the table data).
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(scope, id, **kwargs)

        # Create the datalake database and export its name.
        datalake_db = glue.Database(
            self,
            "createDatalakeDB",
            database_name=config_dict['datalake_db_name'])

        core.CfnOutput(self,
                       "createDatalakeDBName",
                       value=datalake_db.database_name)

        # Create the compound registration table.
        string_type = {"input_string": "string", "is_primitive": True}

        # Every data column is a plain string; order here is the table's
        # column order.
        column_names = (
            "lot_compound_id",
            "version_id",
            "parent_id",
            "smiles",
            "parent_mw",
            "salt_multiplicity",
            "salt_name",
            "formula_weight",
            "parent_alias",
            "stereochemistry",
            "stereocomment",
            "geometric_isomerism",
            "parent_comment",
            "parent_project",
            "elnref",
            "msmethod",
            "msmass",
            "provider",
            "purity",
            "puritymethod",
            "nmrshifts",
            "lotalias",
            "lot_comment",
            "lot_project",
            "molfile",
            "checksum",
        )
        data_columns = [
            glue.Column(name=column_name, type=glue.Type(**string_type))
            for column_name in column_names
        ]

        # The database is re-imported from its own ARN before being handed
        # to the table.
        # NOTE(review): passing the database construct directly looks
        # equivalent — confirm before simplifying.
        imported_db = datalake_db.from_database_arn(
            self, "GetDBArn", database_arn=datalake_db.database_arn)

        parquet_format = glue.DataFormat(
            input_format=glue.InputFormat.PARQUET,
            output_format=glue.OutputFormat.PARQUET,
            serialization_library=glue.SerializationLibrary.PARQUET)

        datalake_bucket = s3.Bucket.from_bucket_name(
            self,
            "getIBucket",
            bucket_name=config_dict['datalake_bucket_name'])

        comp_reg_table = glue.Table(
            self,
            "createDatalakeCompRegTable",
            columns=data_columns,
            database=imported_db,
            data_format=parquet_format,
            table_name="tbl_compound_data",
            bucket=datalake_bucket,
            compressed=True,
            description=
            "This table contains data regarding compound registration coming from  RDS",
            partition_keys=[
                glue.Column(name="dt", type=glue.Type(**string_type))
            ],
            s3_prefix="compound_reg/compound_data/")

        core.CfnOutput(self,
                       "createDatalakeCompRegTableName",
                       value=comp_reg_table.table_name)