def __init__(self, scope: core.Construct, data_lake: DataLake, **kwargs) -> None:
    """Glue catalog resources for the data lake.

    Creates a crawler over the raw atomic-events prefix and an ``orders``
    table pointing at the raw bucket.

    :param scope: parent CDK construct.
    :param data_lake: data-lake wrapper supplying env, IAM role, raw bucket
        and raw database.
    """
    # Resolve the environment into a local first so the jsii base class is
    # initialised before any attribute is stored on self.
    env = data_lake.env.value
    super().__init__(scope, id=f'{env}-glue-catalog', **kwargs)
    self.env = env

    # Crawler that infers the schema of raw atomic events every 15 minutes.
    self.atomic_events_crawler = glue.CfnCrawler(
        self,
        f'{self.env}-atomic-events-crawler',
        name=f'{self.env}-atomic-events-crawler',
        # BUG FIX: "sored" -> "stored" in the human-readable description.
        description='Crawler to detect schema of data stored in data lake raw, atomic events',
        schedule=glue.CfnCrawler.ScheduleProperty(
            schedule_expression='cron(0/15 * * * ? *)'),
        role=data_lake.data_lake_role.role_arn,
        targets=glue.CfnCrawler.TargetsProperty(s3_targets=[
            glue.CfnCrawler.S3TargetProperty(
                path=f's3://{data_lake.data_lake_raw_bucket.bucket_name}/atomic_events'
            )
        ]),
        database_name=data_lake.data_lake_raw_database.database_name)

    # Orders captured from Postgres via DMS CDC, landed as CSV text.
    self.orders_table = glue.Table(
        self,
        f'{self.env}-orders-table',
        table_name='orders',
        description='orders captured from Postgres using DMS CDC',
        database=data_lake.data_lake_raw_database,
        compressed=True,
        data_format=glue.DataFormat(
            input_format=glue.InputFormat.TEXT,
            output_format=glue.OutputFormat.HIVE_IGNORE_KEY_TEXT,
            serialization_library=glue.SerializationLibrary.OPEN_CSV),
        s3_prefix='orders',
        bucket=data_lake.data_lake_raw_bucket,
        columns=[
            # BUG FIX: 'datetime' is not a valid Glue/Hive primitive type;
            # 'timestamp' is the correct type for this column.
            glue.Column(name='created_at',
                        type=glue.Type(input_string='timestamp',
                                       is_primitive=True)),
            # NOTE(review): Hive DDL normally spells these 'int' and
            # 'double'; left as-is to avoid changing the deployed schema.
            glue.Column(name='order_id',
                        type=glue.Type(input_string='integer',
                                       is_primitive=True)),
            glue.Column(name='product_name',
                        type=glue.Type(input_string='string',
                                       is_primitive=True)),
            glue.Column(name='value',
                        type=glue.Type(input_string='float',
                                       is_primitive=True))
        ])
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    """Proof-of-concept stack: a Glue database and a JSON-on-S3 table.

    The table reads line-delimited JSON from an existing bucket under the
    ``test_data`` prefix, partitioned by ``dt``.
    """
    super().__init__(scope, id, **kwargs)

    # Placeholder deployment coordinates.
    self._region = 'aws_region'
    self._account_id = 'aws_account_id'

    # Import the pre-existing bucket holding the raw files.
    source_bucket = s3.Bucket.from_bucket_name(self, 'my_bucket_id', 'my_bucket')

    poc_database = glue.Database(self, id='my_database_id', database_name='poc')

    data_columns = [
        glue.Column(name='col1',
                    type=glue.Type(input_string='string', is_primitive=True)),
        glue.Column(name='col2',
                    type=glue.Type(input_string='int', is_primitive=True)),
    ]
    partition_columns = [
        glue.Column(name='dt',
                    type=glue.Type(input_string='string', is_primitive=True)),
    ]

    # Hive text input/output paired with the OpenX JSON serde:
    # one JSON object per line.
    json_lines_format = glue.DataFormat(
        input_format=glue.InputFormat(
            'org.apache.hadoop.mapred.TextInputFormat'),
        output_format=glue.OutputFormat(
            'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'),
        serialization_library=glue.SerializationLibrary(
            'org.openx.data.jsonserde.JsonSerDe'))

    table = glue.Table(
        self,
        id='my_table_id',
        database=poc_database,
        table_name='my_table',
        columns=data_columns,
        partition_keys=partition_columns,
        bucket=source_bucket,
        s3_prefix='test_data',
        data_format=json_lines_format)
def __init__(
    self,
    scope: core.Construct,
    glue_database: BaseDataLakeGlueDatabase,
    glue_role: BaseDataLakeGlueRole,
    **kwargs,
) -> None:
    """Glue table over the DMS CDC output of the Postgres ``orders`` table.

    Parquet data is read from ``orders/public/orders`` in the data-lake
    bucket supplied by *glue_database*.
    """
    self.glue_role = glue_role
    self.glue_database = glue_database
    self.deploy_env = self.glue_database.deploy_env
    self.data_lake_bucket = self.glue_database.data_lake_bucket
    self.obj_name = f"glue-{self.deploy_env.value}-orders-table"

    # Column spec as (name, glue primitive type); `op` and `extracted_at`
    # are DMS bookkeeping fields, the rest mirror the source table.
    schema = [
        ("op", "string"),
        ("extracted_at", "string"),
        ("created_at", "timestamp"),
        ("order_id", "int"),
        ("product_name", "string"),
        ("value", "double"),
    ]
    super().__init__(
        scope,
        self.obj_name,
        table_name="orders",
        description="orders captured from Postgres using DMS CDC",
        database=self.glue_database,
        compressed=True,
        data_format=glue.DataFormat.PARQUET,
        s3_prefix="orders/public/orders",
        bucket=self.data_lake_bucket,
        columns=[
            glue.Column(name=col_name,
                        type=glue.Type(input_string=col_type, is_primitive=True))
            for col_name, col_type in schema
        ],
        **kwargs,
    )
def glue_column(name, col_type, is_primitive=True):
    """Build a ``glue.Column`` called *name* with type *col_type*.

    :param name: column name.
    :param col_type: Glue type string, e.g. ``"string"`` or ``"int"``.
    :param is_primitive: whether *col_type* is a primitive Glue type.
    :return: the constructed ``glue.Column``.
    """
    column_type = glue.Type(input_string=col_type, is_primitive=is_primitive)
    return glue.Column(name=name, type=column_type)
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    """Temperature data-lake stack: S3 buckets, Glue database/table, crawler.

    Creates a KMS-encrypted data bucket (access-logged to a separate logs
    bucket), a partitioned Glue table for temperature readings, and a
    nightly crawler with read/put access to the data prefix.
    """
    super().__init__(scope, id, **kwargs)

    # Access-log bucket: incomplete multipart uploads aborted after 7 days,
    # log objects expired after 30 days.
    s3_logs_bucket = s3.Bucket(
        self,
        "LogsBucket",
        encryption=s3.BucketEncryption.KMS_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        lifecycle_rules=[
            s3.LifecycleRule(
                abort_incomplete_multipart_upload_after=core.Duration.days(7),
                expiration=core.Duration.days(30))
        ])
    s3_data_bucket = s3.Bucket(
        self,
        "DataBucket",
        encryption=s3.BucketEncryption.KMS_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        server_access_logs_bucket=s3_logs_bucket,
        server_access_logs_prefix=f"s3accesslogs/{PROJECT_NAME}/")

    glue_database = glue.Database(self, "GlueDatabase",
                                  database_name=PROJECT_NAME)

    # NOTE(review): "celcius" is a misspelling of "celsius", but renaming
    # the column would change the deployed table schema and break existing
    # queries — confirm with consumers before fixing.
    glue_table = glue.Table(
        self,
        "GlueTable",
        columns=[
            glue.Column(name="timestamp",
                        type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="celcius",
                        type=glue.Type(input_string="double", is_primitive=True)),
            glue.Column(name="fahrenheit",
                        type=glue.Type(input_string="double", is_primitive=True))
        ],
        database=glue_database,
        # Hive text formats with the OpenX JSON serde (JSON-lines data).
        data_format=glue.DataFormat(
            input_format=glue.InputFormat(
                "org.apache.hadoop.mapred.TextInputFormat"),
            output_format=glue.OutputFormat(
                "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"),
            serialization_library=glue.SerializationLibrary(
                "org.openx.data.jsonserde.JsonSerDe")),
        table_name=PROJECT_NAME,
        encryption=glue.TableEncryption.S3_MANAGED,
        partition_keys=[
            glue.Column(name="year",
                        type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="month",
                        type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="day",
                        type=glue.Type(input_string="int", is_primitive=True))
        ])

    glue_crawler_role = iam.Role(
        self,
        "GlueCrawlerRole",
        assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "AWSGlueServiceRole")
        ])
    # BUG FIX: the previous pattern f"{PROJECT_PREFIX}/" matched only the
    # literal prefix key itself, not the objects beneath it; "/*" is needed
    # for the crawler to read/write the actual data objects.
    s3_data_bucket.grant_read(glue_crawler_role,
                              objects_key_pattern=f"{PROJECT_PREFIX}/*")
    s3_data_bucket.grant_put(glue_crawler_role,
                             objects_key_pattern=f"{PROJECT_PREFIX}/*")

    glue_crawler = glue.CfnCrawler(
        self,
        "GlueCrawler",
        role=glue_crawler_role.role_arn,
        database_name=glue_database.database_name,
        targets={
            "s3Targets": [{
                # BUG FIX: Glue S3 targets expect an s3:// URI, not a bare
                # "bucket/prefix" string.
                "path": f"s3://{s3_data_bucket.bucket_name}/{PROJECT_PREFIX}/"
            }]
        },
        # Daily at 04:30 UTC.
        schedule={"scheduleExpression": "cron(30 04 * * ? *)"})
def __init__(self, scope: core.Construct, id: str, config_dict, **kwargs) -> None:
    """Create the data-lake Glue database and the compound-registration table.

    :param config_dict: configuration mapping; must provide
        ``datalake_db_name`` and ``datalake_bucket_name``.
    """
    super().__init__(scope, id, **kwargs)

    # Create the datalake database.
    createDatalakeDB = glue.Database(
        self,
        "createDatalakeDB",
        database_name=config_dict['datalake_db_name'])
    core.CfnOutput(self, "createDatalakeDBName",
                   value=createDatalakeDB.database_name)

    def _string_column(name):
        # Every compound-registration field is ingested as a raw string.
        return glue.Column(
            name=name,
            type=glue.Type(input_string="string", is_primitive=True))

    # Column names in their original (on-disk) order.
    comp_reg_fields = [
        "lot_compound_id", "version_id", "parent_id", "smiles", "parent_mw",
        "salt_multiplicity", "salt_name", "formula_weight", "parent_alias",
        "stereochemistry", "stereocomment", "geometric_isomerism",
        "parent_comment", "parent_project", "elnref", "msmethod", "msmass",
        "provider", "purity", "puritymethod", "nmrshifts", "lotalias",
        "lot_comment", "lot_project", "molfile", "checksum",
    ]

    # Create Comp Reg Table.
    createDatalakeCompRegTable = glue.Table(
        self,
        "createDatalakeCompRegTable",
        columns=[_string_column(field) for field in comp_reg_fields],
        # BUG FIX: pass the database construct directly. The previous code
        # round-tripped it through from_database_arn(), creating a redundant
        # imported copy of a database defined in this very stack.
        database=createDatalakeDB,
        data_format=glue.DataFormat(
            input_format=glue.InputFormat.PARQUET,
            output_format=glue.OutputFormat.PARQUET,
            serialization_library=glue.SerializationLibrary.PARQUET),
        table_name="tbl_compound_data",
        bucket=s3.Bucket.from_bucket_name(
            self,
            "getIBucket",
            bucket_name=config_dict['datalake_bucket_name']),
        compressed=True,
        description="This table contains data regarding compound registration coming from RDS",
        partition_keys=[_string_column("dt")],
        s3_prefix="compound_reg/compound_data/")
    core.CfnOutput(self, "createDatalakeCompRegTableName",
                   value=createDatalakeCompRegTable.table_name)