("manufacturer", "string", "manufacturer", "string")],\ transformation_ctx = "apply_mapping") dynamic_frame.printSchema() # Write to S3 Sink s3path = s3_target + "/ingest_year=" + "{:0>4}".format(str(year)) + "/ingest_month=" + "{:0>2}".format(str(month)) + "/ingest_day=" + "{:0>2}".format(str(day)) + "/ingest_hour=" + "{:0>2}".format(str(hour)) + "/" s3sink = glueContext.write_dynamic_frame.from_options(frame = apply_mapping, connection_type = "s3", connection_options = {"path": s3path}, format = "parquet", transformation_ctx = "s3sink") # Read from Kinesis Data Stream sourceData1 = glueContext.create_data_frame.from_catalog( \ database = "ventilatordb", \ table_name = "ventilators_table", \ transformation_ctx = "datasource1", \ additional_options = {"startingPosition": "TRIM_HORIZON", "inferSchema": "true"}) sourceData1.printSchema() sourceData2 = glueContext.create_data_frame.from_catalog( \ database = "ventilatordb", \ table_name = "ventilators_table2", \ transformation_ctx = "datasource2", \ additional_options = {"startingPosition": "TRIM_HORIZON", "inferSchema": "true"}) sourceData2.printSchema() glueContext.forEachBatch(frame = sourceData, batch_function = processBatch, options = {"windowSize": "100 seconds", "checkpointLocation": checkpoint_location}) job.commit()
("filter_level", "string", "filter_level", "string"), ("lang", "string", "lang", "string"), ("timestamp_ms", "timestamp", "timestamp_ms", "timestamp") ], transformation_ctx="apply_mapping") #datasink1 = glueContext.write_dynamic_frame.from_options( # frame = apply_mapping, # connection_type = "s3", # connection_options = { # "path": dest_s3_path, # "partitionKeys": ["year", "month", "day"], # 'compression': 'gzip' # }, # format = "json", # transformation_ctx = "datasink1") drop_duplicates_df = apply_mapping.toDF().dropDuplicates( subset=["id_str"]) drop_duplicates_df.repartition( "year", "month", "day").write.partitionBy( ["year", "month", "day"]).mode('append').json(dest_s3_path, compression='gzip') glueContext.forEachBatch(frame=datasource0, batch_function=processBatch, options={ "windowSize": "600 seconds", "checkpointLocation": dest_s3_path + "/checkpoint" }) job.commit()
month = now.month
day = now.day
hour = now.hour
minute = now.minute
path_datasink1 = f"s3://{args['datalake_bkt_name']}/{args['datalake_bkt_prefix']}" \
    + "/ingest_year=" + "{:0>4}".format(str(year)) \
    + "/ingest_month=" + "{:0>2}".format(str(month)) \
    + "/ingest_day=" + "{:0>2}".format(str(day)) \
    + "/ingest_hour=" + "{:0>2}".format(str(hour)) + "/"
datasink1 = glueContext.write_dynamic_frame.from_options(
    frame=datasource0,
    connection_type="s3",
    connection_options={"path": path_datasink1},
    format="parquet",
    transformation_ctx="datasink1")
logger.info(f'{{"batch_process_successful":True}}')

glueContext.forEachBatch(
    frame=data_frame_datasource0,
    batch_function=processBatch,
    options={
        "windowSize": "100 seconds",
        "checkpointLocation": f"s3://{args['datalake_bkt_name']}/{args['datalake_bkt_prefix']}/checkpoint"
        # "s3://raw-data-bkt-010/stream-etl/checkpoint/"
    })
job.commit()
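# A minimal, self-contained sketch showing the same zero-padded ingest path built
# with strftime instead of "{:0>4}"/"{:0>2}" formatting. The ingest_path helper is
# illustrative; the bucket and prefix come from the commented example above.
import datetime

def ingest_path(bucket, prefix, now):
    # %Y/%m/%d/%H are already zero-padded, so the result matches path_datasink1.
    return (f"s3://{bucket}/{prefix}"
            f"/ingest_year={now.strftime('%Y')}"
            f"/ingest_month={now.strftime('%m')}"
            f"/ingest_day={now.strftime('%d')}"
            f"/ingest_hour={now.strftime('%H')}/")

# ingest_path("raw-data-bkt-010", "stream-etl", datetime.datetime(2021, 3, 7, 9))
# -> "s3://raw-data-bkt-010/stream-etl/ingest_year=2021/ingest_month=03/ingest_day=07/ingest_hour=09/"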
hour = now.hour
minute = now.minute
path_datasink1 = f"s3://{args['datalake_bkt_name']}/{args['datalake_bkt_prefix']}" \
    + "/ingest_year=" + "{:0>4}".format(str(year)) \
    + "/ingest_month=" + "{:0>2}".format(str(month)) \
    + "/ingest_day=" + "{:0>2}".format(str(day)) \
    + "/ingest_hour=" + "{:0>2}".format(str(hour)) + "/"
# path_DataSink0 = "s3://sale-events-bkt-with-access-points-010" + "/ingest_year=" + "{:0>4}".format(str(year)) + "/ingest_month=" + "{:0>2}".format(str(month)) + "/ingest_day=" + "{:0>2}".format(str(day)) + "/ingest_hour=" + "{:0>2}".format(str(hour)) + "/"
datasink1 = glueContext.write_dynamic_frame.from_options(
    frame=datasource0,
    connection_type="s3",
    connection_options={"path": path_datasink1},
    format="parquet",
    transformation_ctx="datasink1"
)
logger.info(f'{{"batch_process_successful":True}}')

glueContext.forEachBatch(
    frame=data_frame_datasource0,
    batch_function=processBatch,
    options={
        "windowSize": "100 seconds",
        "checkpointLocation": f"s3://{args['datalake_bkt_name']}/{args['datalake_bkt_prefix']}/checkpoint"
        # arn:aws:s3:us-east-1:230023004178:accesspoint/glue-consumer
        # "checkpointLocation": f"s3://{args['datalake_bkt_name']}/{args['datalake_bkt_prefix']}/checkpoint"
    }
)
job.commit()
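# A minimal sketch of how the args dictionary used above would be resolved at job
# start; the parameter names are taken from the args[...] lookups plus the standard
# JOB_NAME argument that Glue jobs receive.
import sys
from awsglue.utils import getResolvedOptions

args = getResolvedOptions(sys.argv,
                          ['JOB_NAME', 'datalake_bkt_name', 'datalake_bkt_prefix'])
# The values arrive as the job run parameters --datalake_bkt_name and
# --datalake_bkt_prefix, and feed the f-strings used for path_datasink1 and the
# checkpoint location.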