# Imports used across the snippets below
import sys
import hashlib

from pyspark.context import SparkContext
from pyspark.sql.functions import (col, lit, udf, regexp_replace, to_date, to_timestamp,
                                   weekofyear, dayofmonth, month, year, hour, minute)
from pyspark.sql.types import StringType, TimestampType

from awsglue.transforms import ApplyMapping, ResolveChoice, DropNullFields
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame


def writeCsvFile(datasource, path):
    # Collapse to a single partition so S3 receives one CSV file
    dataframe = DynamicFrame.toDF(datasource).repartition(1)
    datasource = DynamicFrame.fromDF(dataframe, glueContext, 'write-csv')
    glueContext.write_dynamic_frame.from_options(
        frame=datasource,
        connection_type="s3",
        connection_options={"path": path},
        format="csv",
        transformation_ctx="write-csv")
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'db_name', 'table_name', 'temp_workflow_bucket'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

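# cleanup_temp_folder is a helper (not shown in this excerpt) that clears earlier output from the temp workflow bucket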
cleanup_temp_folder(args['temp_workflow_bucket'], 'glue_workflow_distinct_dates')

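# Normalize the table-name argument to match the catalog naming (hyphens become underscores)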
tableName = args['table_name'].replace("-", "_")
datasource = glueContext.create_dynamic_frame.from_catalog(database = args['db_name'], table_name = tableName, transformation_ctx = "datasource")

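# Map the raw source columns onto the target schema: meter_id, reading_time, reading_value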
mapped_readings = ApplyMapping.apply(frame = datasource, mappings = [("lclid", "string", "meter_id", "string"), \
                                                                     ("datetime", "string", "reading_time", "string"), \
                                                                     ("KWH/hh (per half hour)", "double", "reading_value", "double")], \
                                     transformation_ctx = "mapped_readings")

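# Switch to a Spark DataFrame for column-level transformations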
mapped_readings_df = DynamicFrame.toDF(mapped_readings)

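# Add constant columns expected by the output schema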
mapped_readings_df = mapped_readings_df.withColumn("obis_code", lit(""))
mapped_readings_df = mapped_readings_df.withColumn("reading_type", lit("INT"))

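# Parse the reading timestamp and derive separate date and time fields from it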
reading_time = to_timestamp(col("reading_time"), "yyyy-MM-dd HH:mm:ss")
mapped_readings_df = mapped_readings_df \
    .withColumn("week_of_year", weekofyear(reading_time)) \
    .withColumn("date_str", regexp_replace(col("reading_time").substr(1,10), "-", "")) \
    .withColumn("day_of_month", dayofmonth(reading_time)) \
    .withColumn("month", month(reading_time)) \
    .withColumn("year", year(reading_time)) \
    .withColumn("hour", hour(reading_time)) \
    .withColumn("minute", minute(reading_time)) \
    .withColumn("reading_date_time", reading_time) \
    .drop("reading_time")
Example #3
def hash_cc(s):
    # sha256 expects bytes, so encode the incoming string before hashing
    return hashlib.sha256(s.encode('utf-8')).hexdigest()

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

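# Load the user-profile table from the serverless-datalake catalog database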
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "serverless-datalake", table_name = "user-profile", transformation_ctx = "datasource0")


## @convert glue DynamicFrame to DataFrame to manipulate the columns
dataframe0 = DynamicFrame.toDF(datasource0)

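# Register the hashing function as a Spark UDF that returns a string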
hash_cc_f = udf(lambda x: hash_cc(x), StringType())

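# Replace cc and ssn with hashed versions, then drop the sensitive source columns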
dataframe0 = dataframe0.withColumn("hash_cc", hash_cc_f(dataframe0["cc"])) \
                       .withColumn("hash_ssn", hash_cc_f(dataframe0["ssn"]))
dataframe0 = dataframe0.drop('cc').drop('ssn').drop('password')

## @convert dataframe to glue DynamicFrame and write the output in parquet format
datasource1 = DynamicFrame.fromDF(dataframe0, glueContext, "name1")


datasink4 = glueContext.write_dynamic_frame.from_options(
    frame = datasource1,
    connection_type = "s3",
    connection_options = {"path": "s3://serverless-datalake-ingestionbucket-1jiyskijz5i03/prepared/userprofile-secure"},
    format = "parquet",
    transformation_ctx = "datasink4")

job.commit()
applymapping1 = ApplyMapping.apply(frame = datasource, mappings = [\
    ("col0", "long", "meter_id", "string"), \
    ("col1", "string", "obis_code", "string"), \
    ("col2", "long", "reading_time", "string"), \
    ("col3", "long", "reading_value", "double"), \
    ("col4", "string", "reading_type", "string") \
    ], transformation_ctx = "applymapping1")

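# Resolve ambiguous column types by packing the conflicting choices into a struct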
resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                     choice="make_struct",
                                     transformation_ctx="resolvechoice2")

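# Remove fields whose type resolved to null (columns with no values)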
dropnullfields = DropNullFields.apply(frame=resolvechoice2,
                                      transformation_ctx="dropnullfields")

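# Switch to a Spark DataFrame for the date/time derivations below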
mappedReadings = DynamicFrame.toDF(dropnullfields)

# reading_time could not be parsed directly, so split the date and time fields manually
mappedReadings = mappedReadings.withColumn("date_str",
                                           col("reading_time").substr(1, 8))

timeStr = regexp_replace(col("reading_time").substr(9, 16), "24", "")
time = to_timestamp(timeStr, "HHmmss")
date = to_date(col("date_str"), "yyyyMMdd")

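# parseDateString is a helper defined elsewhere in the full script (not shown in this excerpt)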
udfParseDateString = udf(parseDateString, TimestampType())

# add separate date and time fields
mappedReadings = mappedReadings.withColumn("week_of_year", weekofyear(date)) \
          .withColumn("day_of_month", dayofmonth(date)) \
          .withColumn("month", month(date)) \