def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    """Deploy the NYC TLC sample dataset.

    Copies the public dataset into a local S3 bucket via a Lambda-backed
    custom resource, creates a Glue database and table over the copy, and
    registers the bucket location with AWS Lake Formation.

    :param scope: parent construct.
    :param id: logical id of this stack within the scope.
    :param kwargs: forwarded to the base stack constructor.
    """
    super().__init__(scope, id, **kwargs)

    # CloudFormation parameters.
    # allowed_pattern is a regex: use raw strings so "\w" is a regex
    # escape, not an invalid Python string escape (DeprecationWarning
    # since 3.6, SyntaxWarning from 3.12).
    glue_db_name = core.CfnParameter(
        self, "GlueDatabaseNameNycTlc",
        type="String",
        description="Name of Glue Database to be created for NYC TLC.",
        allowed_pattern=r"[\w-]+",
        default="nyc_tlc_db",
    )
    glue_table_name = core.CfnParameter(
        self, "GlueTableNameNycTlc",
        type="String",
        description="Name of Glue Table to be created for NYC TLC.",
        allowed_pattern=r"[\w-]+",
        default="nyc_tlc_table",
    )

    self.template_options.description = "\
This template deploys the dataset containing New York City Taxi and Limousine Commission (TLC) Trip Record Data.\n \
Sample data is copied from the public dataset into a local S3 bucket, a database and table are created in AWS Glue, \
and the S3 location is registered with AWS Lake Formation."
    self.template_options.metadata = {
        "AWS::CloudFormation::Interface": {
            "License": "MIT-0"
        }
    }

    # S3 bucket holding the local copy of the dataset. Public access is
    # fully blocked; the bucket is removed when the stack is destroyed.
    local_dataset_bucket = s3.Bucket(
        self, "LocalNycTlcBucket",
        block_public_access=s3.BlockPublicAccess(
            block_public_acls=True,
            block_public_policy=True,
            ignore_public_acls=True,
            restrict_public_buckets=True),
        removal_policy=core.RemovalPolicy.DESTROY)

    # Reference (not create) the public source bucket by its ARN.
    public_dataset_bucket = s3.Bucket.from_bucket_arn(
        self, "PublicDatasetBucket", BUCKET_ARN)

    # Handler source is inlined from a local file at synth time.
    with open("lambda/s3_copy.py", encoding="utf8") as fp:
        s3_copy_code = fp.read()

    # Execution role for the copy Lambda: read the public bucket,
    # read/write/delete objects in the local bucket, basic CW logging.
    s3_copy_execution_role = iam.Role(
        self, "S3CopyHandlerServiceRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSLambdaBasicExecutionRole"),
        ],
        inline_policies={
            "S3CopyHandlerRoleInlinePolicy": iam.PolicyDocument(
                statements=[
                    iam.PolicyStatement(
                        effect=iam.Effect.ALLOW,
                        actions=["s3:Get*"],
                        resources=[
                            public_dataset_bucket.bucket_arn,
                            public_dataset_bucket.arn_for_objects("*"),
                        ]),
                    iam.PolicyStatement(
                        effect=iam.Effect.ALLOW,
                        actions=[
                            "s3:PutObject",
                            "s3:GetObject",
                            "s3:DeleteObject",
                        ],
                        resources=[
                            local_dataset_bucket.arn_for_objects("*"),
                        ]),
                ]
            )
        })

    s3_copy_fn = _lambda.Function(
        self, "S3CopyHandler",
        runtime=_lambda.Runtime.PYTHON_3_7,
        code=_lambda.InlineCode.from_inline(s3_copy_code),
        handler="index.handler",
        role=s3_copy_execution_role,
        timeout=core.Duration.seconds(600))

    # Custom resource that performs the copy on stack create/update.
    s3_copy = core.CustomResource(
        self, "S3Copy",
        service_token=s3_copy_fn.function_arn,
        resource_type="Custom::S3Copy",
        properties={
            "PublicDatasetBucket": public_dataset_bucket.bucket_name,
            "LocalDatasetBucket": local_dataset_bucket.bucket_name,
            "PublicDatasetObject": OBJECT,
            "LocalDatasetPrefix": glue_table_name.value_as_string,
        })

    # Register the local bucket with Lake Formation only after the data
    # has actually been copied.
    lakeformation_resource = lf.CfnResource(
        self, "LakeFormationResource",
        resource_arn=local_dataset_bucket.bucket_arn,
        use_service_linked_role=True)
    lakeformation_resource.node.add_dependency(s3_copy)

    # Glue database and external CSV table describing the copied data.
    cfn_glue_db = glue.CfnDatabase(
        self, "GlueDatabase",
        catalog_id=core.Aws.ACCOUNT_ID,
        database_input=glue.CfnDatabase.DatabaseInputProperty(
            name=glue_db_name.value_as_string,
            location_uri=local_dataset_bucket.s3_url_for_object(),
        ))

    nyc_tlc_table = glue.CfnTable(
        self, "GlueTableNycTlc",
        catalog_id=cfn_glue_db.catalog_id,
        database_name=glue_db_name.value_as_string,
        table_input=glue.CfnTable.TableInputProperty(
            description="New York City Taxi and Limousine Commission (TLC) Trip Record Data",
            name=glue_table_name.value_as_string,
            parameters={
                "skip.header.line.count": "1",
                "compressionType": "none",
                "classification": "csv",
                "delimiter": ",",
                "typeOfData": "file",
            },
            storage_descriptor=glue.CfnTable.StorageDescriptorProperty(
                columns=[
                    {"name": "vendorid", "type": "bigint"},
                    {"name": "lpep_pickup_datetime", "type": "string"},
                    {"name": "lpep_dropoff_datetime", "type": "string"},
                    {"name": "store_and_fwd_flag", "type": "string"},
                    {"name": "ratecodeid", "type": "bigint"},
                    {"name": "pulocationid", "type": "bigint"},
                    {"name": "dolocationid", "type": "bigint"},
                    {"name": "passenger_count", "type": "bigint"},
                    {"name": "trip_distance", "type": "double"},
                    {"name": "fare_amount", "type": "double"},
                    {"name": "extra", "type": "double"},
                    {"name": "mta_tax", "type": "double"},
                    {"name": "tip_amount", "type": "double"},
                    {"name": "tolls_amount", "type": "double"},
                    {"name": "ehail_fee", "type": "string"},
                    {"name": "improvement_surcharge", "type": "double"},
                    {"name": "total_amount", "type": "double"},
                    {"name": "payment_type", "type": "bigint"},
                    {"name": "trip_type", "type": "bigint"},
                    {"name": "congestion_surcharge", "type": "double"},
                ],
                location=local_dataset_bucket.s3_url_for_object() + "/" + glue_table_name.value_as_string + "/",
                input_format="org.apache.hadoop.mapred.TextInputFormat",
                output_format="org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
                compressed=False,
                serde_info=glue.CfnTable.SerdeInfoProperty(
                    serialization_library="org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
                    parameters={"field.delim": ","},
                ),
            ),
            table_type="EXTERNAL_TABLE",
        ))
    # The table references the database only by name, so CloudFormation
    # needs an explicit dependency to order creation correctly.
    nyc_tlc_table.node.add_dependency(cfn_glue_db)

    # Stack outputs.
    core.CfnOutput(self, "LocalNycTlcBucketOutput",
                   value=local_dataset_bucket.bucket_name,
                   description="S3 Bucket created to store the dataset")
    core.CfnOutput(self, "GlueDatabaseOutput",
                   value=cfn_glue_db.ref,
                   description="Glue DB created to host the dataset table")
    core.CfnOutput(self, "GlueTableNycTlcOutput",
                   value=nyc_tlc_table.ref,
                   description="Glue Table created to host the dataset")
def __init__(
    self,
    scope: cdk.Construct,
    construct_id: str,
    stack_log_level: str,
    src_stream,
    **kwargs,
) -> None:
    """Create a Glue database and a streaming (Kinesis-backed) JSON table
    for sales transactions.

    :param scope: parent construct.
    :param construct_id: logical id of this stack within the scope.
    :param stack_log_level: log level for the stack (kept for interface
        compatibility; not read in this constructor).
    :param src_stream: Kinesis stream the Glue table reads from — assumed
        to expose ``stream_name`` (TODO confirm against caller).
    :param kwargs: forwarded to the base stack constructor.
    """
    super().__init__(scope, construct_id, **kwargs)

    self.template_options.description = "Miztiik Automation: Sales Transactions Table Stack."
    self.template_options.metadata = {"License": "Miztiik Corp."}

    # CloudFormation parameters.
    # allowed_pattern is a regex: use raw strings so "\w" is a regex
    # escape, not an invalid Python string escape (DeprecationWarning
    # since 3.6, SyntaxWarning from 3.12).
    self.glue_db_name = cdk.CfnParameter(
        self,
        "GlueTxnsDbName",
        type="String",
        description="Name of Glue Database to be created for Sales Transactions.",
        allowed_pattern=r"[\w-]+",
        default="miztiik_sales_db",
    )
    self.glue_table_name = cdk.CfnParameter(
        self,
        "GlueTxnsTableName",
        type="String",
        description="Name of Glue Table to be created for Sales Transactions (JSON).",
        allowed_pattern=r"[\w-]+",
        default="sales_txns_tbl",
    )

    cfn_txn_db = _glue.CfnDatabase(
        self,
        "GlueTxnsDb",
        catalog_id=cdk.Aws.ACCOUNT_ID,
        database_input=_glue.CfnDatabase.DatabaseInputProperty(
            name=self.glue_db_name.value_as_string,
            description="Database for Sales Transactions."
        ),
    )

    # Streaming table over the Kinesis source: the storage descriptor's
    # "location" is the stream name and the endpoint/stream parameters
    # tell Glue to read from Kinesis.
    # Ref: https://docs.aws.amazon.com/glue/latest/dg/add-job-streaming.html
    cfn_txn_table = _glue.CfnTable(
        self,
        "glueTxnsTable01",
        catalog_id=cfn_txn_db.catalog_id,
        database_name=self.glue_db_name.value_as_string,
        table_input=_glue.CfnTable.TableInputProperty(
            description="Sales Transactions Table",
            name=self.glue_table_name.value_as_string,
            parameters={
                "classification": "json",
            },
            table_type="EXTERNAL_TABLE",
            storage_descriptor=_glue.CfnTable.StorageDescriptorProperty(
                location=f"{src_stream.stream_name}",
                parameters={
                    "endpointUrl": f"https://kinesis.{cdk.Aws.REGION}.amazonaws.com",
                    "streamName": f"{src_stream.stream_name}",
                    "typeOfData": "kinesis"
                },
                input_format="org.apache.hadoop.mapred.TextInputFormat",
                output_format="org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
                serde_info=_glue.CfnTable.SerdeInfoProperty(
                    name="miztiikAutomationSerDeConfig",
                    serialization_library="org.openx.data.jsonserde.JsonSerDe",
                    parameters={
                        "paths": "",
                    },
                ),
            ),
        ),
    )
    # The table references the database only by name, so the dependency
    # must be made explicit for correct creation order.
    cfn_txn_table.add_depends_on(cfn_txn_db)

    ###########################################
    ################# OUTPUTS #################
    ###########################################
    cdk.CfnOutput(
        self,
        "AutomationFrom",
        value=f"{GlobalArgs.SOURCE_INFO}",
        description="To know more about this automation stack, check out our github page.",
    )
    cdk.CfnOutput(
        self,
        "GlueTxnsTable",
        value=f"https://console.aws.amazon.com/glue/home?region={cdk.Aws.REGION}#table:catalog={cdk.Aws.ACCOUNT_ID};name={self.glue_table_name.value_as_string};namespace={self.glue_db_name.value_as_string}",
        description="Glue Transactions Table.",
    )
def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
    """Provision CloudFront access-log analytics: a log bucket, a Glue
    database/table matching the CloudFront log format, and a Step Functions
    workflow that runs an Athena query and pages through its results.

    :param scope: parent construct.
    :param construct_id: logical id of this stack within the scope.
    :param kwargs: forwarded to the base stack constructor.
    """
    super().__init__(scope, construct_id, **kwargs)

    # S3 bucket where CloudFront access logs will be stored.
    cf_access_logs = s3.Bucket(self, "LogBucket")

    # S3 bucket where Athena will put query results.
    athena_results = s3.Bucket(self, "AthenaResultsBucket")

    # Athena (Glue) database.
    # NOTE(review): `account` is defined outside this constructor —
    # presumably the stack's AWS account id; verify at module level.
    glue_database_name = "serverlessland_database"
    my_database = glue.CfnDatabase(
        self,
        id=glue_database_name,
        catalog_id=account,
        database_input=glue.CfnDatabase.DatabaseInputProperty(
            description=f"Glue database '{glue_database_name}'",
            name=glue_database_name,
        )
    )

    # Table with the structure of CloudFront access logs:
    # https://docs.aws.amazon.com/athena/latest/ug/cloudfront-logs.html
    athena_table = glue.CfnTable(
        self,
        id='cfaccesslogs',
        catalog_id=account,
        database_name=glue_database_name,
        table_input=glue.CfnTable.TableInputProperty(
            name='cf_access_logs',
            description='CloudFront access logs',
            table_type='EXTERNAL_TABLE',
            parameters={
                # CloudFront log files start with two header lines.
                'skip.header.line.count': '2',
            },
            storage_descriptor=glue.CfnTable.StorageDescriptorProperty(
                location="s3://" + cf_access_logs.bucket_name + "/",
                input_format='org.apache.hadoop.mapred.TextInputFormat',
                output_format='org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
                compressed=False,
                serde_info=glue.CfnTable.SerdeInfoProperty(
                    serialization_library='org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
                    parameters={
                        # CloudFront logs are tab/space-delimited text.
                        'field.delim': '	'
                    }
                ),
                columns=[
                    glue.CfnTable.ColumnProperty(name='date', type='date'),
                    glue.CfnTable.ColumnProperty(name='time', type='string'),
                    glue.CfnTable.ColumnProperty(name='location', type='string'),
                    glue.CfnTable.ColumnProperty(name='bytes', type='bigint'),
                    glue.CfnTable.ColumnProperty(name='request_ip', type='string'),
                    glue.CfnTable.ColumnProperty(name='method', type='string'),
                    glue.CfnTable.ColumnProperty(name='host', type='string'),
                    glue.CfnTable.ColumnProperty(name='uri', type='string'),
                    glue.CfnTable.ColumnProperty(name='status', type='string'),
                    glue.CfnTable.ColumnProperty(name='referer', type='string'),
                    glue.CfnTable.ColumnProperty(name='user_agent', type='string'),
                    glue.CfnTable.ColumnProperty(name='query_string', type='string'),
                    glue.CfnTable.ColumnProperty(name='cookie', type='string'),
                    glue.CfnTable.ColumnProperty(name='result_type', type='string'),
                    glue.CfnTable.ColumnProperty(name='request_id', type='string'),
                    glue.CfnTable.ColumnProperty(name='host_header', type='string'),
                    glue.CfnTable.ColumnProperty(name='request_protocol', type='string'),
                    glue.CfnTable.ColumnProperty(name='request_bytes', type='bigint'),
                    glue.CfnTable.ColumnProperty(name='time_taken', type='float'),
                    glue.CfnTable.ColumnProperty(name='xforwarded_for', type='string'),
                    glue.CfnTable.ColumnProperty(name='ssl_protocol', type='string'),
                    glue.CfnTable.ColumnProperty(name='ssl_cipher', type='string'),
                    glue.CfnTable.ColumnProperty(name='response_result_type', type='string'),
                    glue.CfnTable.ColumnProperty(name='http_version', type='string'),
                    glue.CfnTable.ColumnProperty(name='fle_status', type='string'),
                    glue.CfnTable.ColumnProperty(name='fle_encrypted_fields', type='int'),
                    glue.CfnTable.ColumnProperty(name='c_port', type='int'),
                    glue.CfnTable.ColumnProperty(name='time_to_first_byte', type='float'),
                    glue.CfnTable.ColumnProperty(name='x_edge_detailed_result_type', type='string'),
                    glue.CfnTable.ColumnProperty(name='sc_content_type', type='string'),
                    glue.CfnTable.ColumnProperty(name='sc_content_len', type='string'),
                    glue.CfnTable.ColumnProperty(name='sc_range_start', type='bigint'),
                    glue.CfnTable.ColumnProperty(name='sc_range_end', type='bigint')
                ]
            ),
        )
    )
    # FIX: the table references the database only by name (a plain
    # string), so CloudFormation sees no implicit dependency — make it
    # explicit so the table is never created before the database.
    athena_table.node.add_dependency(my_database)

    # Submit the query and wait for the results (SYNC / RUN_JOB pattern).
    start_query_execution_job = tasks.AthenaStartQueryExecution(
        self, "Start Athena Query",
        query_string="SELECT uri FROM cf_access_logs limit 10",
        integration_pattern=sf.IntegrationPattern.RUN_JOB,
        query_execution_context=tasks.QueryExecutionContext(
            database_name=glue_database_name
        ),
        result_configuration=tasks.ResultConfiguration(
            output_location=s3.Location(
                bucket_name=athena_results.bucket_name,
                object_key="results"
            )
        )
    )

    # Fetch one page of results.
    get_query_results_job = tasks.AthenaGetQueryResults(
        self, "Get Query Results",
        query_execution_id=sf.JsonPath.string_at("$.QueryExecution.QueryExecutionId"),
        result_path=sf.JsonPath.string_at("$.GetQueryResults"),
    )

    # Prepare the next-page query params (up to 1000 rows can be
    # retrieved via NextToken paging).
    prepare_next_params = sf.Pass(
        self, "Prepare Next Query Params",
        parameters={
            "QueryExecutionId.$": "$.StartQueryParams.QueryExecutionId",
            "NextToken.$": "$.GetQueryResults.NextToken"
        },
        result_path=sf.JsonPath.string_at("$.StartQueryParams")
    )

    # Loop back while more result pages are available.
    has_more_results = sf.Choice(self, "Has More Results?").when(
        sf.Condition.is_present("$.GetQueryResults.NextToken"),
        prepare_next_params.next(get_query_results_job)
    ).otherwise(sf.Succeed(self, "Done"))

    # Process each result row (skipping the header row via [1:]);
    # replace the Pass state with real per-row logic.
    map_state = sf.Map(
        self, "Map State",
        max_concurrency=1,
        input_path=sf.JsonPath.string_at("$.GetQueryResults.ResultSet.Rows[1:]"),
        result_path=sf.JsonPath.DISCARD
    )
    map_state.iterator(sf.Pass(self, "DoSomething"))

    # State machine orchestrating the Athena query and result paging.
    workflow = sf.StateMachine(
        self, "AthenaQuery",
        definition=start_query_execution_job.next(get_query_results_job).next(map_state).next(has_more_results),
        timeout=Duration.minutes(60)
    )

    CfnOutput(self, "Logs",
              value=cf_access_logs.bucket_name, export_name='LogsBucket')
    CfnOutput(self, "SFName",
              value=workflow.state_machine_name, export_name='SFName')
    CfnOutput(self, "SFArn",
              value=workflow.state_machine_arn,
              export_name='StepFunctionArn',
              description='Step Function arn')