Code Example #1
            ("manufacturer", "string", "manufacturer", "string")],\
            transformation_ctx = "apply_mapping")

        dynamic_frame.printSchema()

        # Write to S3 Sink
        s3path = s3_target \
            + "/ingest_year=" + "{:0>4}".format(str(year)) \
            + "/ingest_month=" + "{:0>2}".format(str(month)) \
            + "/ingest_day=" + "{:0>2}".format(str(day)) \
            + "/ingest_hour=" + "{:0>2}".format(str(hour)) + "/"
        s3sink = glueContext.write_dynamic_frame.from_options(
            frame = apply_mapping,
            connection_type = "s3",
            connection_options = {"path": s3path},
            format = "parquet",
            transformation_ctx = "s3sink")

# Read from Kinesis Data Stream
sourceData1 = glueContext.create_data_frame.from_catalog( \
    database = "ventilatordb", \
    table_name = "ventilators_table", \
    transformation_ctx = "datasource1", \
    additional_options = {"startingPosition": "TRIM_HORIZON", "inferSchema": "true"})

sourceData1.printSchema()

sourceData2 = glueContext.create_data_frame.from_catalog( \
    database = "ventilatordb", \
    table_name = "ventilators_table2", \
    transformation_ctx = "datasource2", \
    additional_options = {"startingPosition": "TRIM_HORIZON", "inferSchema": "true"})

sourceData2.printSchema()



# The excerpt passes an undefined sourceData to forEachBatch; unioning the two
# catalog streams (assumed to share the same schema) is one way to make it runnable.
sourceData = sourceData1.union(sourceData2)

glueContext.forEachBatch(
    frame = sourceData,
    batch_function = processBatch,
    options = {"windowSize": "100 seconds", "checkpointLocation": checkpoint_location})
job.commit()
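The excerpt above begins inside the processBatch callback, so the setup it relies on (imports, GlueContext/Job initialization, s3_target, checkpoint_location, and the top of processBatch) is not shown. Below is a minimal sketch of that preamble under the usual AWS Glue streaming-job conventions; the bucket path and the trimmed mapping list are placeholders, not values from the original script.

import sys
import datetime
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from awsglue.transforms import ApplyMapping
from awsglue.dynamicframe import DynamicFrame

args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Placeholder locations; the real bucket and prefix are not part of the excerpt.
s3_target = "s3://example-bucket/ventilators"
checkpoint_location = s3_target + "/checkpoint/"

def processBatch(data_frame, batchId):
    if data_frame.count() > 0:
        now = datetime.datetime.now()
        year, month, day, hour = now.year, now.month, now.day, now.hour
        # Convert the micro-batch DataFrame into a DynamicFrame for Glue transforms.
        dynamic_frame = DynamicFrame.fromDF(data_frame, glueContext, "from_data_frame")
        apply_mapping = ApplyMapping.apply(
            frame=dynamic_frame,
            mappings=[  # only one mapping survives in the excerpt; add the rest as needed
                ("manufacturer", "string", "manufacturer", "string")],
            transformation_ctx="apply_mapping")
        # ...continues as in the excerpt: printSchema() and the S3 parquet sink.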
Code Example #2
                ("filter_level", "string", "filter_level", "string"),
                ("lang", "string", "lang", "string"),
                ("timestamp_ms", "timestamp", "timestamp_ms", "timestamp")
            ],
            transformation_ctx="apply_mapping")
        #datasink1 = glueContext.write_dynamic_frame.from_options(
        #    frame = apply_mapping,
        #    connection_type = "s3",
        #    connection_options = {
        #        "path": dest_s3_path,
        #        "partitionKeys": ["year", "month", "day"],
        #        'compression': 'gzip'
        #    },
        #    format = "json",
        #    transformation_ctx = "datasink1")
        # Deduplicate on id_str within this micro-batch, then write gzip JSON
        # partitioned by year/month/day.
        drop_duplicates_df = apply_mapping.toDF().dropDuplicates(subset=["id_str"])
        drop_duplicates_df.repartition("year", "month", "day") \
            .write.partitionBy("year", "month", "day") \
            .mode('append') \
            .json(dest_s3_path, compression='gzip')


glueContext.forEachBatch(frame=datasource0,
                         batch_function=processBatch,
                         options={
                             "windowSize": "600 seconds",
                             "checkpointLocation": dest_s3_path + "/checkpoint"
                         })
job.commit()
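This example also starts mid-script: datasource0 and dest_s3_path are created earlier and are not shown. A streaming source of that shape can be created from the Data Catalog in the same way as in Code Example #1; the database and table names below are placeholders, not values from the original script. Note also that because dropDuplicates runs inside processBatch, it only removes duplicate id_str values within each 600-second micro-batch, not across batches.

# Sketch of the missing source/destination setup; names are illustrative only.
datasource0 = glueContext.create_data_frame.from_catalog(
    database="tweets_db",              # assumed catalog database
    table_name="tweets_stream_table",  # assumed catalog table backed by a stream
    transformation_ctx="datasource0",
    additional_options={"startingPosition": "TRIM_HORIZON",
                        "inferSchema": "true"})

dest_s3_path = "s3://example-bucket/tweets"  # assumed output prefix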
Code Example #3
        month = now.month
        day = now.day
        hour = now.hour
        minute = now.minute
        path_datasink1 = f"s3://{args['datalake_bkt_name']}/{args['datalake_bkt_prefix']}" + "/ingest_year=" + "{:0>4}".format(
            str(year)) + "/ingest_month=" + "{:0>2}".format(
                str(month)) + "/ingest_day=" + "{:0>2}".format(
                    str(day)) + "/ingest_hour=" + "{:0>2}".format(
                        str(hour)) + "/"
        datasink1 = glueContext.write_dynamic_frame.from_options(
            frame=datasource0,
            connection_type="s3",
            connection_options={"path": path_datasink1},
            format="parquet",
            transformation_ctx="datasink1")
        logger.info(f'{{"batch_process_successful":True}}')


glueContext.forEachBatch(
    frame=data_frame_datasource0,
    batch_function=processBatch,
    options={
        "windowSize": "100 seconds",
        "checkpointLocation": f"s3://{args['datalake_bkt_name']}/{args['datalake_bkt_prefix']}/checkpoint"
        # "s3://raw-data-bkt-010/stream-etl/checkpoint/"
    })

job.commit()
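The bucket and prefix in this example come from job arguments, and the ingest partition path is rebuilt by string concatenation on every batch. Below is a short sketch of the argument resolution this assumes, plus a hypothetical partition_path helper that produces the same string more compactly; only the argument keys are taken from the excerpt, the helper itself is illustrative.

import sys
from datetime import datetime
from awsglue.utils import getResolvedOptions

# The argument keys match those referenced above; everything else is illustrative.
args = getResolvedOptions(
    sys.argv, ["JOB_NAME", "datalake_bkt_name", "datalake_bkt_prefix"])

def partition_path(base: str, now: datetime) -> str:
    # Hypothetical helper: equivalent to the concatenation used in processBatch.
    return (f"{base}/ingest_year={now.year:04d}/ingest_month={now.month:02d}"
            f"/ingest_day={now.day:02d}/ingest_hour={now.hour:02d}/")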