def writeCsvFile(datasource, path):
    """Write a DynamicFrame to S3 in CSV format from a single partition.

    Args:
        datasource: DynamicFrame holding the rows to write.
        path: value used as the ``path`` connection option of the S3 sink.
    """
    # repartition(1) moves every row into one Spark partition before the
    # write (presumably so a single CSV object is produced -- confirm).
    single_partition_df = DynamicFrame.toDF(datasource).repartition(1)
    single_partition_frame = DynamicFrame.fromDF(
        single_partition_df, glueContext, 'write-csv'
    )

    sink_options = {"path": path}
    glueContext.write_dynamic_frame.from_options(
        frame=single_partition_frame,
        connection_type="s3",
        connection_options=sink_options,
        format="csv",
        transformation_ctx="write-csv"
    )
# Create the Glue/Spark handles and start the job run.
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

cleanup_temp_folder(args['temp_workflow_bucket'], 'glue_workflow_distinct_dates')

# The catalog lookup uses the table name argument with "-" replaced by "_".
tableName = args['table_name'].replace("-", "_")
datasource = glueContext.create_dynamic_frame.from_catalog(
    database=args['db_name'],
    table_name=tableName,
    transformation_ctx="datasource"
)

# Rename the source columns to meter_id / reading_time / reading_value.
mapped_readings = ApplyMapping.apply(
    frame=datasource,
    mappings=[
        ("lclid", "string", "meter_id", "string"),
        ("datetime", "string", "reading_time", "string"),
        ("KWH/hh (per half hour)", "double", "reading_value", "double"),
    ],
    transformation_ctx="mapped_readings"
)

# Column expression that parses the textual reading_time column; it is reused
# below for every derived date/time field.
reading_time = to_timestamp(col("reading_time"), "yyyy-MM-dd HH:mm:ss")

# Add the constant columns, derive the calendar fields, and finally replace
# the string reading_time column with the parsed reading_date_time column.
mapped_readings_df = (
    DynamicFrame.toDF(mapped_readings)
    .withColumn("obis_code", lit(""))
    .withColumn("reading_type", lit("INT"))
    .withColumn("week_of_year", weekofyear(reading_time))
    # date_str: first 10 characters of reading_time with the dashes removed
    .withColumn("date_str", regexp_replace(col("reading_time").substr(1, 10), "-", ""))
    .withColumn("day_of_month", dayofmonth(reading_time))
    .withColumn("month", month(reading_time))
    .withColumn("year", year(reading_time))
    .withColumn("hour", hour(reading_time))
    .withColumn("minute", minute(reading_time))
    .withColumn("reading_date_time", reading_time)
    .drop("reading_time")
)
def hash_cc(s):
    """Return the hexadecimal SHA-256 digest of ``s``.

    Args:
        s: value to hash. ``bytes``/``bytearray`` are hashed as-is; any other
            non-null value is converted with ``str()`` and encoded as UTF-8.
            ``None`` (a null column value) is passed through.

    Returns:
        The 64-character hex digest, or ``None`` when ``s`` is ``None``.
    """
    if s is None:
        # Null inputs stay null instead of raising inside the UDF.
        return None
    if not isinstance(s, (bytes, bytearray)):
        # hashlib only accepts bytes-like input; on Python 3 passing a str
        # raises TypeError, so text must be encoded first.
        s = str(s).encode("utf-8")
    return hashlib.sha256(s).hexdigest()


## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="serverless-datalake",
    table_name="user-profile",
    transformation_ctx="datasource0"
)

## @convert glue DynamicFrame to DataFrame to manipulate the columns
dataframe0 = DynamicFrame.toDF(datasource0)

# hash_cc is registered directly as the UDF; no lambda wrapper is needed.
hash_cc_f = udf(hash_cc, StringType())

# Add hashed copies of the sensitive columns, then drop the clear-text
# originals together with the password column.
dataframe0 = dataframe0.withColumn("hash_cc", hash_cc_f(dataframe0["cc"])).withColumn("hash_ssn", hash_cc_f(dataframe0["ssn"]))
dataframe0 = dataframe0.drop('cc').drop('ssn').drop('password')

## @convert dataframe to glue DynamicFrame and write the output in parquet format
datasource1 = DynamicFrame.fromDF(dataframe0, glueContext, "name1")
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=datasource1,
    connection_type="s3",
    connection_options={"path": "s3://serverless-datalake-ingestionbucket-1jiyskijz5i03/prepared/userprofile-secure"},
    format="parquet",
    transformation_ctx="datasink4"
)

job.commit()
applymapping1 = ApplyMapping.apply(frame = datasource, mappings = [\ ("col0", "long", "meter_id", "string"), \ ("col1", "string", "obis_code", "string"), \ ("col2", "long", "reading_time", "string"), \ ("col3", "long", "reading_value", "double"), \ ("col4", "string", "reading_type", "string") \ ], transformation_ctx = "applymapping1") resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_struct", transformation_ctx="resolvechoice2") dropnullfields = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields") mappedReadings = DynamicFrame.toDF(dropnullfields) # reading_time could not be passed, so splitting date and time fields manually mappedReadings = mappedReadings.withColumn("date_str", col("reading_time").substr(1, 8)) timeStr = regexp_replace(col("reading_time").substr(9, 16), "24", "") time = to_timestamp(timeStr, "HHmmss") date = to_date(col("date_str"), "yyyyMMdd") udfParseDateString = udf(parseDateString, TimestampType()) # add separate date and time fields mappedReadings = mappedReadings.withColumn("week_of_year", weekofyear(date)) \ .withColumn("day_of_month", dayofmonth(date)) \ .withColumn("month", month(date)) \