def __init__(self, scope: core.Construct, data_lake: DataLake, **kwargs) -> None:
    self.env = data_lake.env.value
    super().__init__(scope, id=f'{self.env}-glue-catalog', **kwargs)

    # Crawler that re-detects the schema of raw atomic events every 15 minutes
    self.atomic_events_crawler = glue.CfnCrawler(
        self,
        f'{self.env}-atomic-events-crawler',
        name=f'{self.env}-atomic-events-crawler',
        description='Crawler to detect schema of data stored in data lake raw, atomic events',
        schedule=glue.CfnCrawler.ScheduleProperty(
            schedule_expression='cron(0/15 * * * ? *)'),
        role=data_lake.data_lake_role.role_arn,
        targets=glue.CfnCrawler.TargetsProperty(s3_targets=[
            glue.CfnCrawler.S3TargetProperty(
                path=f's3://{data_lake.data_lake_raw_bucket.bucket_name}/atomic_events')
        ]),
        database_name=data_lake.data_lake_raw_database.database_name)

    # Orders table: CDC rows captured from Postgres by DMS, stored as Parquet
    self.orders_table = glue.Table(
        self,
        f'{self.env}-orders-table',
        table_name='orders',
        description='orders captured from Postgres using DMS CDC',
        database=data_lake.data_lake_raw_database,
        compressed=True,
        data_format=glue.DataFormat.PARQUET,
        s3_prefix='orders/public/orders',
        bucket=data_lake.data_lake_raw_bucket,
        columns=[
            glue.Column(name='op', type=glue.Type(input_string='string', is_primitive=True)),
            glue.Column(name='extracted_at', type=glue.Type(input_string='string', is_primitive=True)),
            glue.Column(name='created_at', type=glue.Type(input_string='timestamp', is_primitive=True)),
            glue.Column(name='order_id', type=glue.Type(input_string='int', is_primitive=True)),
            glue.Column(name='product_name', type=glue.Type(input_string='string', is_primitive=True)),
            glue.Column(name='value', type=glue.Type(input_string='double', is_primitive=True)),
        ])
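# Context the snippet above assumes (hedged sketch; each snippet in this
# collection is an independent CDK v1 stack with its own imports):
from aws_cdk import core
from aws_cdk import aws_glue as glue

# The DataLake construct is defined elsewhere and must expose at least:
#   data_lake.env                     - Enum whose .value names the stage
#   data_lake.data_lake_role          - iam.Role the crawler runs as
#   data_lake.data_lake_raw_bucket    - s3.Bucket holding the raw data
#   data_lake.data_lake_raw_database  - glue.Database for the raw layer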
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)
    self._region = 'aws_region'
    self._account_id = 'aws_account_id'

    bucket = s3.Bucket.from_bucket_name(self, 'my_bucket_id', 'my_bucket')
    database = glue.Database(self, id='my_database_id', database_name='poc')
    table = glue.Table(
        self,
        id='my_table_id',
        database=database,
        table_name='my_table',
        columns=[
            glue.Column(name='col1', type=glue.Type(input_string='string', is_primitive=True)),
            glue.Column(name='col2', type=glue.Type(input_string='int', is_primitive=True)),
        ],
        partition_keys=[
            glue.Column(name='dt', type=glue.Type(input_string='string', is_primitive=True)),
        ],
        bucket=bucket,
        s3_prefix='test_data',
        data_format=glue.DataFormat(
            input_format=glue.InputFormat('org.apache.hadoop.mapred.TextInputFormat'),
            output_format=glue.OutputFormat('org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'),
            serialization_library=glue.SerializationLibrary('org.openx.data.jsonserde.JsonSerDe')))
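# Side note (hedged): CDK v1's aws_glue module ships a preset that bundles the
# same three classes (TextInputFormat, HiveIgnoreKeyTextOutputFormat, and the
# OpenX JSON SerDe), so the hand-wired DataFormat above can also be written as:
#
#     data_format=glue.DataFormat.JSON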
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    s3_logs_bucket = s3.Bucket(
        self,
        "LogsBucket",
        encryption=s3.BucketEncryption.KMS_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        lifecycle_rules=[
            s3.LifecycleRule(
                abort_incomplete_multipart_upload_after=core.Duration.days(7),
                expiration=core.Duration.days(30))
        ])

    s3_data_bucket = s3.Bucket(
        self,
        "DataBucket",
        encryption=s3.BucketEncryption.KMS_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        server_access_logs_bucket=s3_logs_bucket,
        server_access_logs_prefix=f"s3accesslogs/{PROJECT_NAME}/")

    glue_database = glue.Database(self, "GlueDatabase", database_name=PROJECT_NAME)

    glue_table = glue.Table(
        self,
        "GlueTable",
        columns=[
            glue.Column(name="timestamp", type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="celcius", type=glue.Type(input_string="double", is_primitive=True)),
            glue.Column(name="fahrenheit", type=glue.Type(input_string="double", is_primitive=True)),
        ],
        database=glue_database,
        data_format=glue.DataFormat(
            input_format=glue.InputFormat("org.apache.hadoop.mapred.TextInputFormat"),
            output_format=glue.OutputFormat("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"),
            serialization_library=glue.SerializationLibrary("org.openx.data.jsonserde.JsonSerDe")),
        table_name=PROJECT_NAME,
        encryption=glue.TableEncryption.S3_MANAGED,
        partition_keys=[
            glue.Column(name="year", type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="month", type=glue.Type(input_string="int", is_primitive=True)),
            glue.Column(name="day", type=glue.Type(input_string="int", is_primitive=True)),
        ])

    # The AWS-managed Glue policy lives under the service-role/ path
    glue_crawler_role = iam.Role(
        self,
        "GlueCrawlerRole",
        assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSGlueServiceRole")
        ])
    s3_data_bucket.grant_read(glue_crawler_role, objects_key_pattern=f"{PROJECT_PREFIX}/")
    s3_data_bucket.grant_put(glue_crawler_role, objects_key_pattern=f"{PROJECT_PREFIX}/")

    glue_crawler = glue.CfnCrawler(
        self,
        "GlueCrawler",
        role=glue_crawler_role.role_arn,
        database_name=glue_database.database_name,
        targets={
            "s3Targets": [{
                "path": f"{s3_data_bucket.bucket_name}/{PROJECT_PREFIX}/"
            }]
        },
        schedule={"scheduleExpression": "cron(30 04 * * ? *)"})
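# The snippet above references PROJECT_NAME and PROJECT_PREFIX without defining
# them; a minimal sketch with placeholder values (only the names appear in the
# original):
PROJECT_NAME = "weather-poc"    # used as the Glue database and table name
PROJECT_PREFIX = PROJECT_NAME   # S3 key prefix crawled and granted to the role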
def __init__(self, scope: core.Construct, id: str, config_dict, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    """ Create the datalake database """
    createDatalakeDB = glue.Database(
        self,
        "createDatalakeDB",
        database_name=config_dict['datalake_db_name'])

    core.CfnOutput(self, "createDatalakeDBName", value=createDatalakeDB.database_name)

    """ Create Comp Reg Table """
    createDatalakeCompRegTable = glue.Table(
        self,
        "createDatalakeCompRegTable",
        columns=[
            glue.Column(name="lot_compound_id", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="version_id", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="parent_id", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="smiles", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="parent_mw", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="salt_multiplicity", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="salt_name", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="formula_weight", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="parent_alias", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="stereochemistry", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="stereocomment", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="geometric_isomerism", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="parent_comment", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="parent_project", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="elnref", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="msmethod", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="msmass", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="provider", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="purity", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="puritymethod", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="nmrshifts", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="lotalias", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="lot_comment", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="lot_project", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="molfile", type=glue.Type(input_string="string", is_primitive=True)),
            glue.Column(name="checksum", type=glue.Type(input_string="string", is_primitive=True)),
        ],
        database=glue.Database.from_database_arn(
            self, "GetDBArn", database_arn=createDatalakeDB.database_arn),
        data_format=glue.DataFormat(
            input_format=glue.InputFormat.PARQUET,
            output_format=glue.OutputFormat.PARQUET,
            serialization_library=glue.SerializationLibrary.PARQUET),
        table_name="tbl_compound_data",
        bucket=s3.Bucket.from_bucket_name(
            self, "getIBucket", bucket_name=config_dict['datalake_bucket_name']),
        compressed=True,
        description="This table contains data regarding compound registration coming from RDS",
        partition_keys=[
            glue.Column(name="dt", type=glue.Type(input_string="string", is_primitive=True))
        ],
        s3_prefix="compound_reg/compound_data/")

    core.CfnOutput(self, "createDatalakeCompRegTableName",
                   value=createDatalakeCompRegTable.table_name)
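# Hedged sketch of the config_dict the constructor above expects; the two keys
# appear in the original, the values here are placeholders:
config_dict = {
    'datalake_db_name': 'datalake_db',
    'datalake_bucket_name': 'my-datalake-bucket',
}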
def __init__(self, scope: core.Construct, construct_id: str, **kwargs) -> None:
    super().__init__(scope, construct_id, **kwargs)

    # create db for glue schema
    glue_db = glue.Database(
        self,
        'GlueDB',
        database_name='reddit_data',
    )

    # data schema
    glue_table = glue.Table(
        self,
        'GlueTable',
        table_name='sentiment',
        columns=[
            glue.Column(name='@timestamp', type=glue.Schema.TIMESTAMP),
            glue.Column(name='id', type=glue.Schema.STRING),
            glue.Column(name='subreddit', type=glue.Schema.STRING),
            glue.Column(name='body', type=glue.Schema.STRING),
            glue.Column(name='is_submitter', type=glue.Schema.BOOLEAN),
            glue.Column(name='polarity', type=glue.Schema.FLOAT),
            glue.Column(name='subjectivity', type=glue.Schema.FLOAT),
            glue.Column(name='author', type=glue.Schema.STRING),
        ],
        database=glue_db,
        data_format=glue.DataFormat.PARQUET,
        bucket=s3.Bucket.from_bucket_arn(self, 'DataBucket', BUCKET_ARN),
        s3_prefix='reddit/',
    )

    # role assumed by firehose
    stream_role = iam.Role(
        self,
        'FirehoseRole',
        assumed_by=iam.ServicePrincipal('firehose.amazonaws.com'),
        description='role used by Firehose to access s3 bucket',
    )

    # add s3 statement
    stream_role.add_to_policy(
        iam.PolicyStatement(
            resources=[BUCKET_ARN, f'{BUCKET_ARN}/*'],
            actions=[
                's3:AbortMultipartUpload',
                's3:GetBucketLocation',
                's3:GetObject',
                's3:ListBucket',
                's3:ListBucketMultipartUploads',
                's3:PutObject',
            ],
        ))

    # add glue statement
    stream_role.add_to_policy(
        iam.PolicyStatement(
            resources=[
                glue_table.table_arn,
                glue_db.database_arn,
                glue_db.catalog_arn,
            ],
            actions=[
                'glue:GetTable',
                'glue:GetTableVersion',
                'glue:GetTableVersions',
            ],
        ))

    # cloudwatch statement
    stream_role.add_to_policy(
        iam.PolicyStatement(
            resources=['*'],
            actions=[
                'logs:PutLogEvents',
            ],
        ))

    data_format_conversion_configuration = kf.CfnDeliveryStream.DataFormatConversionConfigurationProperty(
        enabled=True,
        input_format_configuration=kf.CfnDeliveryStream.InputFormatConfigurationProperty(
            deserializer=kf.CfnDeliveryStream.DeserializerProperty(
                hive_json_ser_de=kf.CfnDeliveryStream.HiveJsonSerDeProperty(),
            ),
        ),
        output_format_configuration=kf.CfnDeliveryStream.OutputFormatConfigurationProperty(
            serializer=kf.CfnDeliveryStream.SerializerProperty(
                parquet_ser_de=kf.CfnDeliveryStream.ParquetSerDeProperty(),
            ),
        ),
        schema_configuration=kf.CfnDeliveryStream.SchemaConfigurationProperty(
            database_name=glue_db.database_name,
            table_name=glue_table.table_name,
            role_arn=stream_role.role_arn,
            region='us-east-2',
        ),
    )

    s3_config = kf.CfnDeliveryStream.ExtendedS3DestinationConfigurationProperty(
        bucket_arn=BUCKET_ARN,  # temporary, will replace with env variable
        role_arn=stream_role.role_arn,
        data_format_conversion_configuration=data_format_conversion_configuration,
        prefix='reddit/',
        buffering_hints=kf.CfnDeliveryStream.BufferingHintsProperty(
            size_in_m_bs=64,
        ),
    )

    firehose = kf.CfnDeliveryStream(
        self,
        'FirehoseStream',
        delivery_stream_name='RedditDataStream',
        extended_s3_destination_configuration=s3_config,
    )

    # add role dependency
    firehose.node.add_dependency(stream_role)

    # add ECS Fargate instance
    app_role = iam.Role(
        self,
        'RedditStreamingAppRole',
        assumed_by=iam.ServicePrincipal('ecs-tasks.amazonaws.com'),
        description='Role used by the Reddit Streaming Application Fargate Task',
    )

    # add firehose permissions
    app_role.add_to_policy(
        iam.PolicyStatement(
            resources=[firehose.attr_arn],
            actions=[
                'firehose:DeleteDeliveryStream',
                'firehose:PutRecord',
                'firehose:PutRecordBatch',
                'firehose:UpdateDestination',
            ],
        ))

    # add ecs and cloudwatch permissions
    app_role.add_to_policy(
        iam.PolicyStatement(
            resources=['*'],
            actions=[
                'ecr:GetAuthorizationToken',
                'ecr:BatchCheckLayerAvailability',
                'ecr:GetDownloadUrlForLayer',
                'ecr:BatchGetImage',
                'logs:CreateLogStream',
                'logs:PutLogEvents',
            ],
        ))

    vpc = ec2.Vpc(self, 'RedditVpc', max_azs=3)
    cluster = ecs.Cluster(self, 'RedditCluster', vpc=vpc)

    task_definition = ecs.FargateTaskDefinition(
        self,
        'TaskDefinition',
        memory_limit_mib=512,
        cpu=256,
        task_role=app_role,
    )

    task_definition.add_container(
        id='RedditStreamingApp',
        image=ecs.ContainerImage.from_asset('./sentiment_analysis'),
        command=['all'],
        environment={
            'FIREHOSE_STREAM_NAME': firehose.delivery_stream_name,
            'PRAW_CLIENT_SECRET': os.environ['PRAW_CLIENT_SECRET'],
            'PRAW_CLIENT_ID': os.environ['PRAW_CLIENT_ID'],
            'PRAW_USER_AGENT': os.environ['PRAW_USER_AGENT'],
        },
        logging=ecs.LogDriver.aws_logs(stream_prefix='reddit'),
    )

    container = ecs.FargateService(
        self,
        'StreamingApplication',
        desired_count=1,
        task_definition=task_definition,
        cluster=cluster,
        assign_public_ip=True,
    )
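# Module-level context assumed by the snippet above: `os` backs the PRAW_*
# environment lookups, and BUCKET_ARN names the destination bucket (only the
# identifier appears in the original; the value here is a placeholder):
import os

BUCKET_ARN = 'arn:aws:s3:::my-reddit-data-bucket'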
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    # Kinesis to lambda
    self.stream_lambda = kinesis_lambda.KinesisStreamsToLambda(
        self,
        'clickstream',
        lambda_function_props=_lambda.FunctionProps(
            runtime=_lambda.Runtime.PYTHON_3_7,
            handler='index.lambda_handler',
            code=_lambda.Code.inline(get_code('send_data_to_firehose.py'))),
        kinesis_stream_props=kinesis.StreamProps(
            stream_name='clickstream',
            retention_period=core.Duration.days(1),
            shard_count=4),
        kinesis_event_source_props=lambda_sources.KinesisEventSourceProps(
            starting_position=_lambda.StartingPosition.TRIM_HORIZON,
            batch_size=1))

    # Lambda to produce data
    self.produce_fake_data = _lambda.Function(
        self,
        'produce_data',
        runtime=_lambda.Runtime.PYTHON_3_7,
        timeout=core.Duration.seconds(90),
        handler='index.lambda_handler',
        code=_lambda.Code.inline(get_code('produce_data.py')),
        environment={
            'STREAM_NAME': self.stream_lambda.kinesis_stream.stream_name
        })
    self.stream_lambda.kinesis_stream.grant_read_write(self.produce_fake_data)

    # EventBridge rule to trigger the producer function every minute
    self.event_rule = events.Rule(
        self,
        'scheduledRule',
        schedule=events.Schedule.expression('rate(1 minute)'))
    self.event_rule.add_target(targets.LambdaFunction(self.produce_fake_data))

    # S3 Bucket
    self.bucket = s3.Bucket(
        self,
        'data-clicks-lake',
        removal_policy=core.RemovalPolicy.DESTROY,
        auto_delete_objects=True)

    # Glue
    self.glue_db_analytical = glue.Database(
        self,
        'analytic_clickstream',
        database_name='clickstream_db',
        location_uri=None,
    )
    self.glue_table_analytical = glue.Table(
        self,
        'analytical-table',
        table_name='analytical-table',
        columns=[
            glue_column('custid', 'int'),
            glue_column('trafficfrom', 'string'),
            glue_column('url', 'string'),
            glue_column('device', 'string'),
            glue_column('touchproduct', 'int'),
            glue_column('trans_timestamp', 'string'),
        ],
        database=self.glue_db_analytical,
        data_format=glue.DataFormat.PARQUET,
        bucket=self.bucket,
        s3_prefix='kinesis/',
    )

    # Firehose
    iam_role_firehose_analytical = self.create_firehose_role()
    self.bucket.grant_read_write(iam_role_firehose_analytical)
    firehose_props = FirehoseProps(
        bucket=self.bucket,
        role=iam_role_firehose_analytical,
        stream=self.stream_lambda.kinesis_stream,
        glue_db=self.glue_db_analytical,
        glue_table=self.glue_table_analytical)
    self.firehose = FirehoseLib(self, 'firehose_clickstream', firehose_props)

    # Elasticsearch
    self.es_domain = ElasticsearchLib(self, 'ES-clickstream-domain').es_domain

    # Lambda to send data to Elasticsearch
    self.send_data_to_elasticsearch = lambda_python.PythonFunction(
        self,
        'clickstream_to_es',
        entry='./analytics_ml_flow/lambda/lambda_with_requirements/',
        handler='handler',
        timeout=core.Duration.seconds(180),
        index='Kinesis_ES.py',
        environment={
            'ES_HOST_HTTP': self.es_domain.domain_endpoint,
            'ES_INDEX': 'clickstream',
            'ES_IND_TYPE': 'transactions',
            'ES_REGION': 'us-west-2',
        })
    self.es_domain.grant_index_read_write('clickstream', self.send_data_to_elasticsearch)
    self.es_domain.grant_read_write(self.send_data_to_elasticsearch)

    stream_source = lambda_sources.KinesisEventSource(
        self.stream_lambda.kinesis_stream,
        starting_position=_lambda.StartingPosition.TRIM_HORIZON,
        batch_size=1)
    self.stream_lambda.kinesis_stream.grant_read(self.send_data_to_elasticsearch)
    self.send_data_to_elasticsearch.add_event_source(stream_source)

    # Glue Crawler
    crawler_role = self.create_crawler_permissions()
    glue_props = GlueCrawlerProps(bucket=self.bucket, role=crawler_role)
    self.glue_crawler = GlueCrawlerLib(self, 'glueCrawler', glue_props)
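# Two helpers are used above but not defined here; a plausible minimal sketch
# (the names and call sites are from the original, the bodies are assumptions):
def glue_column(name: str, type_str: str) -> glue.Column:
    # Shorthand for a primitive Glue column
    return glue.Column(name=name, type=glue.Type(input_string=type_str, is_primitive=True))


def get_code(file_name: str) -> str:
    # Load an inline Lambda handler's source from disk (hypothetical path)
    with open(f'./lambdas/{file_name}') as source:
        return source.read()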
def create_glue_resources(self) -> None:
    '''Creates Glue Database and Tables'''
    if not hasattr(self, 'glue_attr'):
        self.prepare_glue_attr_types()

    col = aws_glue.Column

    # Kinesis and Athena depend on data schema declarations that should
    # be in a Database and Table in AWS Glue
    self.glue_db_analytical = aws_glue.Database(
        self,
        'sls-blog-analytical-db',
        database_name='sls-blog-analytical',
        location_uri=None,
    )

    self.glue_table_analytical = aws_glue.Table(
        self,
        'analytical-table',
        table_name='analytical-table',
        columns=[
            col(name='id', type=self.glue_attr_string),
            col(name='publish_timestamp', type=self.glue_attr_timestamp),
            col(name='publisher_email', type=self.glue_attr_string),
            col(name='publisher_name', type=self.glue_attr_string),
            col(name='item_type', type=self.glue_attr_string),
            col(name='title', type=self.glue_attr_string),
            col(name='body', type=self.glue_attr_string),
        ],
        database=self.glue_db_analytical,
        data_format=aws_glue.DataFormat.PARQUET,
        bucket=self.bucket_analytical,
        s3_prefix='kinesis/',
    )

    self.glue_table_likes = aws_glue.Table(
        self,
        'likes-table',
        table_name='likes-table',
        columns=[
            col(name='id', type=self.glue_attr_string),
            col(name='like', type=self.glue_attr_integer),
        ],
        database=self.glue_db_analytical,
        data_format=aws_glue.DataFormat.PARQUET,
        bucket=self.bucket_likes,
        s3_prefix='kinesis/',
    )

    self.glue_table_apirequests = aws_glue.Table(
        self,
        'apirequests-table',
        table_name='apirequests-table',
        columns=[
            col(name='id', type=self.glue_attr_string),
            col(name='item_type', type=self.glue_attr_string),
            col(name='http_method', type=self.glue_attr_string),
            col(name='timestamp', type=self.glue_attr_timestamp),
            col(name='datetime', type=self.glue_attr_date),
            col(name='ip_address', type=self.glue_attr_string),
            col(name='user_agent', type=self.glue_attr_string),
            col(name='origin', type=self.glue_attr_string),
            col(name='country_code', type=self.glue_attr_string),
            col(name='device_type', type=self.glue_attr_string),
            col(name='action', type=self.glue_attr_string),
            col(name='article_id', type=self.glue_attr_string),
        ],
        database=self.glue_db_analytical,
        data_format=aws_glue.DataFormat.PARQUET,
        bucket=self.bucket_apirequests,
        s3_prefix='kinesis/',
    )
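# `prepare_glue_attr_types` is called above but not shown; a plausible sketch,
# assuming the glue_attr_* members map straight onto aws_glue.Schema presets
# (the body is an assumption, only the attribute names are from the original):
def prepare_glue_attr_types(self) -> None:
    self.glue_attr = True  # sentinel checked by create_glue_resources
    self.glue_attr_string = aws_glue.Schema.STRING
    self.glue_attr_integer = aws_glue.Schema.INTEGER
    self.glue_attr_timestamp = aws_glue.Schema.TIMESTAMP
    self.glue_attr_date = aws_glue.Schema.DATE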
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)

    s3_org_data = _s3.Bucket(
        self,
        ORIGINAL_DATA_BUCKET_NAME,
        bucket_name=ORIGINAL_DATA_BUCKET_NAME,
        removal_policy=core.RemovalPolicy.RETAIN)
    s3_transformed_data = _s3.Bucket(
        self,
        TRANSFORMED_DATA_BUCKET_NAME,
        bucket_name=TRANSFORMED_DATA_BUCKET_NAME,
        removal_policy=core.RemovalPolicy.RETAIN)

    # title-read
    s3_deployment.BucketDeployment(
        self,
        "s3-deployment-{}".format(TITLE_READ),
        sources=[s3_deployment.Source.asset("data/{}/".format(TITLE_READ))],
        destination_bucket=s3_org_data,
        destination_key_prefix="{}/".format(TITLE_READ))

    # title
    s3_deployment.BucketDeployment(
        self,
        "s3-deployment-{}".format(TITLE),
        sources=[s3_deployment.Source.asset("data/{}/".format(TITLE))],
        destination_bucket=s3_org_data,
        destination_key_prefix="{}/".format(TITLE))

    # user
    s3_deployment.BucketDeployment(
        self,
        "s3-deployment-{}".format(USER),
        sources=[s3_deployment.Source.asset("data/{}/".format(USER))],
        destination_bucket=s3_org_data,
        destination_key_prefix="{}/".format(USER))

    statement = iam.PolicyStatement(
        actions=[
            "s3:*", "glue:*", "iam:ListRolePolicies", "iam:GetRole",
            "iam:GetRolePolicy"
        ],
        resources=["*"])
    write_to_s3_policy = iam.PolicyDocument(statements=[statement])

    # inline_policies takes a mapping of policy name to document
    glue_role = iam.Role(
        self,
        'GlueCrawlerRole-dna',
        role_name='GlueCrawlerRole-dna',
        inline_policies={'write_to_s3_policy': write_to_s3_policy},
        assumed_by=iam.ServicePrincipal('glue.amazonaws.com'),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                'service-role/AWSGlueServiceRole')
        ])

    # TODO add IAM role for ctas lambda
    dna_database = glue.Database(
        self, "dna-glue-database-id", database_name=GLUE_DATABASE_NAME)

    # create glue table
    title_read_table = glue.Table(
        self,
        "{}-table-id".format(TITLE_READ),
        table_name="{}_table".format(TITLE_READ).replace("-", "_"),
        database=dna_database,
        columns=[
            {"name": "USER_ID", "type": glue.Schema.STRING},
            {"name": "ITEM_ID", "type": glue.Schema.STRING},
            {"name": "TIMESTAMP", "type": glue.Schema.BIG_INT},
            {"name": "TITLE", "type": glue.Schema.STRING},
            {"name": "EVENT_TYPE", "type": glue.Schema.STRING},
        ],
        data_format=glue.DataFormat.CSV,
        bucket=s3_org_data,
        s3_prefix=TITLE_READ)

    title_table = glue.Table(
        self,
        "{}-table-id".format(TITLE),
        table_name="{}_table".format(TITLE).replace("-", "_"),
        database=dna_database,
        columns=[
            {"name": "ITEM_ID", "type": glue.Schema.STRING},
            {"name": "CREATION_TIMESTAMP", "type": glue.Schema.BIG_INT},
            {"name": "TITLE", "type": glue.Schema.STRING},
            {"name": "TAG", "type": glue.Schema.STRING},
        ],
        data_format=glue.DataFormat.CSV,
        bucket=s3_org_data,
        s3_prefix=TITLE)

    user_table = glue.Table(
        self,
        "{}-table-id".format(USER),
        table_name="{}_table".format(USER).replace("-", "_"),
        database=dna_database,
        columns=[
            {"name": "USER_ID", "type": glue.Schema.STRING},
            {"name": "NAME", "type": glue.Schema.STRING},
            {"name": "EMAIL", "type": glue.Schema.STRING},
            {"name": "GENDER", "type": glue.Schema.STRING, "categorical": True},
            {"name": "AGE", "type": glue.Schema.BIG_INT, "categorical": True},
        ],
        data_format=glue.DataFormat.CSV,
        bucket=s3_org_data,
        s3_prefix=USER)

    _athena.CfnWorkGroup(self, "athena_workgroup_id", name=ATHENA_WORKGROUP)

    ctas_lambda_trigger = _event.Rule(
        self,
        "ctas-lambda-trigger-event-id",
        rule_name="ctas-lambda-trigger-event",
        schedule=_event.Schedule.cron(minute="10", hour="*"))

    s3_statement = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        # resources=[s3_bucket.bucket_arn, "{}/*".format(s3_bucket.bucket_arn)],
        resources=["*"],
        actions=["s3:*"])
    athena_statement = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        resources=["*"],
        actions=["athena:StartQueryExecution", "glue:*"])

    ctas_lambda_func = _lambda.Function(
        self,
        "CTAS_query",
        function_name="CTAS_query",
        runtime=_lambda.Runtime.PYTHON_3_7,
        code=_lambda.Code.asset("src/lambda"),
        handler="ctas_lambda.lambda_handler",
        description="CTAS query to transform AVRO file, batch execution",
        environment={
            "BUCKET_NAME": s3_transformed_data.bucket_name,
            "DATABASE_NAME": GLUE_DATABASE_NAME,
            "ATHENA_WORKGROUP": ATHENA_WORKGROUP
        },
        timeout=core.Duration.minutes(3))
    ctas_lambda_func.add_to_role_policy(s3_statement)
    ctas_lambda_func.add_to_role_policy(athena_statement)

    ctas_lambda_trigger.add_target(_target.LambdaFunction(ctas_lambda_func))
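# Module-level constants assumed by the snippet above (identifiers from the
# original; the values are placeholders):
ORIGINAL_DATA_BUCKET_NAME = 'dna-original-data'
TRANSFORMED_DATA_BUCKET_NAME = 'dna-transformed-data'
TITLE_READ = 'title-read'
TITLE = 'title'
USER = 'user'
GLUE_DATABASE_NAME = 'dna_database'
ATHENA_WORKGROUP = 'dna-athena-workgroup'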