def transform_df_to_catalog_import_schema(sql_context, glue_context, df_databases, df_tables, df_partitions):
    """Convert the flat database/table/partition DataFrames into the nested
    DynamicFrames expected by the Glue catalog import.

    Each entity row is wrapped into a single-element ``items`` array;
    partitions are additionally batched via ``batch_metastore_partitions``.
    Returns a (databases, tables, partitions) tuple of DynamicFrames.
    """
    databases_nested = df_databases.select(
        df_databases['type'],
        array(df_databases['item']).alias('items'))
    tables_nested = df_tables.select(
        df_tables['type'],
        df_tables['database'],
        array(df_tables['item']).alias('items'))
    # Partitions are grouped into batches before conversion.
    partitions_batched = batch_metastore_partitions(
        sql_context=sql_context, df_parts=df_partitions)

    def as_dyf(frame, label):
        # Small local helper: wrap a DataFrame as a named DynamicFrame.
        return DynamicFrame.fromDF(dataframe=frame, glue_ctx=glue_context, name=label)

    return (as_dyf(databases_nested, 'dyf_databases'),
            as_dyf(tables_nested, 'dyf_tables'),
            as_dyf(partitions_batched, 'dyf_partitions'))
def write_df_to_catalog(data_frame, entity_type, glue_context, options):
    """Nest *data_frame* for the given catalog entity type and write it
    through the Glue 'catalog' sink.

    Does nothing when the frame is empty. ``options`` must carry the target
    database under the ``catalog.database`` key and is forwarded to the sink.
    """
    # DataFrame has no "empty" predicate; probing the underlying RDD is the
    # closest equivalent.
    if data_frame.rdd.isEmpty():
        return
    db_name = options['catalog.database']
    nested = nest_data_frame(data_frame, db_name, entity_type)
    dyf = DynamicFrame.fromDF(nested, glue_context, entity_type)
    glue_context.getSink('catalog', **options).write(dyf)
# Glue job: copy a configured list of MongoDB collections to S3 as CSV.
glueContext = GlueContext(sparkContext)
sparkSession = glueContext.spark_session
glueJob = Job(glueContext)
glueJob.init(args['JOB_NAME'], args)

# Comma-separated collection list; the placeholder is substituted at deploy time.
collections_input = "COLLECTIONS_REPLACE"
collections = collections_input.split(",")


def _read_collection(name):
    # Pull one collection over JDBC and wrap it as a DynamicFrame.
    frame = sparkSession.read.format("jdbc") \
        .option("url", jdbc_url) \
        .option("dbtable", name) \
        .option("driver", "cdata.jdbc.mongodb.MongoDBDriver") \
        .load()
    return DynamicFrame.fromDF(frame, glueContext, "dynamic_df_{}".format(name))


# Phase 1: read every collection into the dataframes list.
dfs = [{"dynamic_frame": _read_collection(name), "collection": name}
       for name in collections]

# Phase 2: write each collection out to S3 as CSV.
for entry in dfs:
    glueContext.write_dynamic_frame.from_options(
        frame=entry["dynamic_frame"],
        connection_type="s3",
        connection_options={
            "path": "TARGET_BUCKET{}".format(entry["collection"])
        },
        format="csv",
        transformation_ctx="datasink4")

glueJob.commit()
def main():
    """Aggregate per-day learning-object scores from mapping_lo_student_history
    and load the result into the Redshift staging table mapping_lo_student_used.

    Flow: read the history table from the Glue catalog, drop rows without a
    date_id or with all-zero scores, sum the six score dimensions per
    (date_id, student_id, learning_object_id), then write to Redshift. The
    post-action SQL then folds the staged rows back into the history table.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    # date_now = datetime.now()
    # preday = date_now + timedelta(days=-1)
    # d1 = preday.strftime("%Y%m%d")
    # print("d1 =", d1)
    #
    # now = datetime.now()  # current date and time
    # year = now.strftime("%Y%m%d")
    # print("year:", year)

    dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="mapping_lo_student_history"
    )
    print('Count:', dyf_mapping_lo_student_history.count())

    # # Filter records of the previous day; keep only records whose score != 0
    # dyf_mapping_lo_student_history = Filter.apply(frame=dyf_mapping_lo_student_history, f=lambda x: x['date_id'] is not None)

    # Keep rows that have a date_id and at least one non-zero score dimension.
    dyf_mapping_lo_student_history = Filter.apply(
        frame=dyf_mapping_lo_student_history,
        f=lambda x: x['date_id'] is not None and (x['knowledge'] != 0
                                                  or x['comprehension'] != 0
                                                  or x['application'] != 0
                                                  or x['analysis'] != 0
                                                  or x['synthesis'] != 0
                                                  or x['evaluation'] != 0))

    if dyf_mapping_lo_student_history.count() > 0:
        print('START JOB---------------')
        df_mapping_lo_student_history = dyf_mapping_lo_student_history.toDF()
        # Sum each score dimension per (date, student, learning object).
        df_mapping_lo_student_history = df_mapping_lo_student_history.groupby(
            'date_id', 'student_id', 'learning_object_id').agg(
            f.sum("knowledge").alias("knowledge"),
            f.sum("comprehension").alias("comprehension"),
            f.sum("application").alias("application"),
            f.sum("analysis").alias("analysis"),
            f.sum("synthesis").alias("synthesis"),
            f.sum("evaluation").alias("evaluation"))
        df_mapping_lo_student_history.printSchema()
        df_mapping_lo_student_history.show()
        print('END JOB---------------')
        dyf_mapping_lo_student_used = DynamicFrame.fromDF(
            df_mapping_lo_student_history, glueContext, "dyf_student_lo_init")
        # print('COUNT:', dyf_student_lo_init.count())
        # dyf_student_lo_init.printSchema()
        # dyf_student_lo_init.show()

        # Cast columns to the Redshift target types.
        dyf_mapping_lo_student_used = ApplyMapping.apply(
            frame=dyf_mapping_lo_student_used,
            mappings=[("student_id", "long", "student_id", "long"),
                      ("learning_object_id", "long", "learning_object_id", "long"),
                      ("date_id", "int", "date_id", "long"),
                      ("knowledge", 'long', 'knowledge', 'long'),
                      ("comprehension", 'long', 'comprehension', 'long'),
                      ("application", 'long', 'application', 'long'),
                      ("analysis", 'long', 'analysis', 'long'),
                      ("synthesis", 'long', 'synthesis', 'long'),
                      ("evaluation", 'long', 'evaluation', 'long')])
        dyf_mapping_lo_student_used = ResolveChoice.apply(
            frame=dyf_mapping_lo_student_used,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dyf_mapping_lo_student_used = DropNullFields.apply(
            frame=dyf_mapping_lo_student_used,
            transformation_ctx="dyf_mapping_lo_student_used")

        # Load into the staging table; the post-actions call the TBHV stored
        # procedure, merge the staged rows into the history table, and drop
        # the staging table.
        datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dyf_mapping_lo_student_used,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "mapping_lo_student_used",
                "database": "dts_odin",
                "postactions": """ call proc_insert_tbhv(); INSERT INTO mapping_lo_student_history SELECT * FROM mapping_lo_student_used; DROP TABLE IF EXISTS mapping_lo_student_used """
            },
            redshift_tmp_dir="s3n://dts-odin/temp1/dyf_student_lo_init",
            transformation_ctx="datasink5")
def main():
    """ETL job: convert raw ai_study_step events into per-student
    learning-object scores and load them into Redshift staging tables.

    Events are processed per *tag*:
      * 'aip'   -> scored against the student's answer  -> mapping_lo_student_starter_1
      * 'micro' -> scored on the share of fully-correct steps -> mapping_lo_student_starter_2
      * 'ait'   -> scored on step completion and received points -> mapping_lo_student_starter
    Finally the max '_key' seen is written back to S3 as a watermark.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # TBHV per day.
    # LO_TYPE codes: 1: vocabulary; 2: pronunciation; 3: listening; 4: grammar

    # Custom function: number of elements in a JSON-encoded array string.
    def get_length(array_str):
        json_obj = json.loads(array_str)
        # index = 0;
        # for item in json_obj:
        #     index += 1
        length = 0
        if json_obj is not None:
            length = len(json_obj)
        return length

    udf_get_length = udf(get_length, IntegerType())

    # Learning-object code groups (codes are compared as strings).
    arr_aip_tu_vung = ['3', '4', '5', '17']              # vocabulary
    arr_aip_ngu_phap = ['6', '7', '8', '9', '18']        # grammar
    arr_aip_ngu_am = ['16']                              # pronunciation
    arr_aip_nghe = ['10', '11', '12', '13', '14', '15']  # listening
    arr_knowledge = ['3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18']
    arr_comprehension = ['8', '9', '14']

    def do_add_lo_type(code):
        # Map a code to its LO_TYPE (see the table above); -1 when unknown.
        lo_type = -1
        code = str(code)
        for x in arr_aip_tu_vung:
            if x == code:
                lo_type = 1
        for x in arr_aip_ngu_am:
            if x == code:
                lo_type = 2
        for x in arr_aip_nghe:
            if x == code:
                lo_type = 3
        for x in arr_aip_ngu_phap:
            if x == code:
                lo_type = 4
        return lo_type

    add_lo_type = udf(do_add_lo_type, IntegerType())

    def do_add_score_aip(code, type, lo_type, correct_answer, student_answer):
        # Score an 'aip' answer for one dimension; 0 when the code is not in
        # the dimension's code group.
        code = str(code)
        score = 0
        arr = []
        # pronunciation score
        if lo_type == 2 and correct_answer == student_answer:
            score = 2
        if lo_type == 2 and correct_answer != student_answer:
            score = -1
        # all other (non-pronunciation) cases
        if lo_type != 2 and correct_answer == student_answer:
            score = 10
        if lo_type != 2 and correct_answer != student_answer:
            score = -5
        if type == 'knowledge':
            arr = arr_knowledge
        for x in arr:
            if x == code:
                return score
        return 0

    add_score_aip = udf(do_add_score_aip, IntegerType())

    def do_add_score_micro(code, type, lo_type, total_step, count_step):
        # Score a 'micro' session: +10 when >= 70% of steps were fully
        # correct, otherwise -5; gated by the dimension's code group.
        code = str(code)
        score = 0
        arr = []
        percent_success = 0.7
        # NOTE(review): on a Python 2 runtime count_step / total_step is
        # integer division when both are ints — confirm the inputs' types.
        if count_step / total_step >= percent_success:
            score = 10
        else:
            score = -5
        if type == 'knowledge':
            arr = arr_knowledge
        if type == 'comprehension':
            arr = arr_comprehension
        for x in arr:
            if x == code:
                return score
        return 0

    add_score_micro = udf(do_add_score_micro, IntegerType())

    def do_add_score_ait(total_step, max_step, received_point, length_answer):
        # Score an 'ait' lesson; only lessons where the last step was reached
        # get a non-zero score.
        score = 0
        if total_step == max_step:
            if length_answer <= 3 and received_point >= 3:
                score = 30
            if length_answer <= 3 and received_point <= 2:
                score = 10
            if length_answer >= 4 and received_point <= 2:
                score = -15
        return score

    add_score_ait = udf(do_add_score_ait, IntegerType())

    ########## dyf_ai_study_step
    dyf_ai_study_step = glueContext.create_dynamic_frame.from_catalog(
        database="moodlestarter",
        table_name="ai_study_step"
    )
    dyf_ai_study_step = dyf_ai_study_step.select_fields(
        ['_key', 'user_id', 'lesson_id', 'tag', 'current_step', 'total_step',
         'learning_object', 'learning_object_type', 'correct_answer',
         'student_answer', 'student_answer_details', 'max_point',
         'received_point', 'created_at', 'page_style', 'session_id'])

    try:
        # # read the flag (watermark) from S3
        df_flag = spark.read.parquet("s3://dts-odin/flag/flag_ai_study_step.parquet")
        max_key = df_flag.collect()[0]['flag']
        print('read from index: ', max_key)
        # compare _key of the datasource with the flag; keep values with key > flag
        # dyf_ai_study_step = Filter.apply(frame=dyf_ai_study_step, f=lambda x: x['_key'] > max_key)
    except:
        # Best-effort: a missing/unreadable flag file means "process everything".
        print('read flag error ')

    if dyf_ai_study_step.count() > 0:
        try:
            ## Handle records with tag: aip
            dyf_aip = Filter.apply(frame=dyf_ai_study_step, f=lambda x: x['tag'] == 'aip')
            df_aip = dyf_aip.toDF()

            # Assign each row a random learning-object code in [1, 16].
            def random_code():
                return random.randint(1, 16)

            add_code = udf(random_code, IntegerType())
            df_aip = df_aip.withColumn("code", add_code())
            df_aip.printSchema()
            df_aip = df_aip.withColumn("lo_type", add_lo_type(df_aip.code))
            # Score every dimension; date_id is derived from created_at (yyyyMMdd).
            df_aip = df_aip.withColumn("knowledge",
                                       add_score_aip(df_aip.code, f.lit('knowledge'), df_aip.lo_type,
                                                     df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("comprehension",
                            add_score_aip(df_aip.code, f.lit('comprehension'), df_aip.lo_type,
                                          df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("application",
                            add_score_aip(df_aip.code, f.lit('application'), df_aip.lo_type,
                                          df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("analysis",
                            add_score_aip(df_aip.code, f.lit('analysis'), df_aip.lo_type,
                                          df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("synthesis",
                            add_score_aip(df_aip.code, f.lit('synthesis'), df_aip.lo_type,
                                          df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("evaluation",
                            add_score_aip(df_aip.code, f.lit('evaluation'), df_aip.lo_type,
                                          df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("date_id",
                            from_unixtime(unix_timestamp(df_aip.created_at, "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))
            df_aip.printSchema()
            df_aip.show()
            dyf_aip = DynamicFrame.fromDF(df_aip, glueContext, "dyf_aip")
            applymapping = ApplyMapping.apply(frame=dyf_aip,
                                              mappings=[("created_at", "string", "created_at", "timestamp"),
                                                        ("user_id", 'string', 'student_id', 'long'),
                                                        ("correct_answer", "string", "learning_object", "string"),
                                                        ("date_id", "string", "date_id", "int"),
                                                        ("knowledge", "int", "knowledge", "int"),
                                                        ("comprehension", "int", "comprehension", "int"),
                                                        ("application", "int", "application", "int"),
                                                        ("analysis", "int", "analysis", "int"),
                                                        ("synthesis", "int", "synthesis", "int"),
                                                        ("evaluation", "int", "evaluation", "int")])
            resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                                transformation_ctx="resolvechoice2")
            dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
            dropnullfields.printSchema()
            dropnullfields.show()
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                                                                       catalog_connection="glue_redshift",
                                                                       connection_options={
                                                                           "dbtable": "mapping_lo_student_starter_1",
                                                                           "database": "dts_odin"
                                                                       },
                                                                       redshift_tmp_dir="s3n://dts-odin/ai_study_step/",
                                                                       transformation_ctx="datasink5")
        except Exception as e:
            print("###################### Exception ##########################")
            print(e)

        try:
            ## Handle records with tag: micro
            dyf_micro = Filter.apply(frame=dyf_ai_study_step, f=lambda x: x['tag'] == 'micro')
            df_micro = dyf_micro.toDF()
            # Highest step reached per (user, lesson, session); keep sessions
            # that progressed to at least step 4.
            df_micro_max_step = df_micro.groupby('user_id', 'lesson_id', 'session_id').agg(
                f.max('current_step').alias("max_step"))
            df_micro_max_step = df_micro_max_step.where("max_step >= 4")
            df_micro_max_step = df_micro_max_step.withColumnRenamed('user_id', 'user_id1') \
                .withColumnRenamed('lesson_id', 'lesson_id1') \
                .withColumnRenamed('session_id', 'session_id1')
            # Number of fully-correct answers per (user, lesson, session).
            df_micro_received_point = df_micro.where("max_point = received_point")
            df_micro_received_point = df_micro_received_point.groupby('user_id', 'lesson_id', 'session_id').agg(
                f.count('received_point').alias("count_received_point"))
            df_micro_received_point = df_micro_received_point.withColumnRenamed('user_id', 'user_id2') \
                .withColumnRenamed('lesson_id', 'lesson_id2') \
                .withColumnRenamed('session_id', 'session_id2')
            df_micro = df_micro.join(df_micro_max_step,
                                     (df_micro['user_id'] == df_micro_max_step['user_id1']) &
                                     (df_micro['lesson_id'] == df_micro_max_step['lesson_id1']) &
                                     (df_micro['session_id'] == df_micro_max_step['session_id1']))
            df_micro = df_micro.join(df_micro_received_point,
                                     (df_micro['user_id'] == df_micro_received_point['user_id2']) &
                                     (df_micro['lesson_id'] == df_micro_received_point['lesson_id2']) &
                                     (df_micro['session_id'] == df_micro_received_point['session_id2']))

            # Assign each row a random learning-object code in [17, 18].
            def random_code1():
                return random.randint(17, 18)

            add_code1 = udf(random_code1, IntegerType())
            df_micro = df_micro.withColumn("code", add_code1())
            df_micro = df_micro.withColumn("lo_type", add_lo_type(df_micro.code))
            df_micro = df_micro.withColumn("knowledge", add_score_micro(df_micro.code, f.lit('knowledge'),
                                                                        df_micro.lo_type, df_micro.total_step,
                                                                        df_micro.count_received_point)) \
                .withColumn("comprehension", add_score_micro(df_micro.code, f.lit('comprehension'),
                                                             df_micro.lo_type, df_micro.total_step,
                                                             df_micro.count_received_point)) \
                .withColumn("application", add_score_micro(df_micro.code, f.lit('application'),
                                                           df_micro.lo_type, df_micro.total_step,
                                                           df_micro.count_received_point)) \
                .withColumn("analysis", add_score_micro(df_micro.code, f.lit('analysis'),
                                                        df_micro.lo_type, df_micro.total_step,
                                                        df_micro.count_received_point)) \
                .withColumn("synthesis", add_score_micro(df_micro.code, f.lit('synthesis'),
                                                         df_micro.lo_type, df_micro.total_step,
                                                         df_micro.count_received_point)) \
                .withColumn("evaluation", add_score_micro(df_micro.code, f.lit('evaluation'),
                                                          df_micro.lo_type, df_micro.total_step,
                                                          df_micro.count_received_point)) \
                .withColumn("date_id",
                            from_unixtime(unix_timestamp(df_micro.created_at, "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))
            df_micro.printSchema()
            df_micro.show()
            dyf_micro = DynamicFrame.fromDF(df_micro, glueContext, "dyf_micro")
            applymapping = ApplyMapping.apply(frame=dyf_micro,
                                              mappings=[("created_at", "string", "created_at", "timestamp"),
                                                        ("user_id", 'string', 'student_id', 'long'),
                                                        ("learning_object", "string", "learning_object", "string"),
                                                        ("date_id", "string", "date_id", "int"),
                                                        ("knowledge", "int", "knowledge", "int"),
                                                        ("comprehension", "int", "comprehension", "int"),
                                                        ("application", "int", "application", "int"),
                                                        ("analysis", "int", "analysis", "int"),
                                                        ("synthesis", "int", "synthesis", "int"),
                                                        ("evaluation", "int", "evaluation", "int")])
            resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                                transformation_ctx="resolvechoice2")
            dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
            dropnullfields.printSchema()
            dropnullfields.show()
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                                                                       catalog_connection="glue_redshift",
                                                                       connection_options={
                                                                           "dbtable": "mapping_lo_student_starter_2",
                                                                           "database": "dts_odin"
                                                                       },
                                                                       redshift_tmp_dir="s3n://dts-odin/ai_study_step/",
                                                                       transformation_ctx="datasink5")
        except Exception as e:
            print("###################### Exception ##########################")
            print(e)
        except Exception as e:
            # NOTE(review): this second handler is unreachable (the first
            # `except Exception` always matches) — likely a copy-paste
            # remnant; confirm and remove.
            print("###################### Exception ##########################")
            print(e)

        try:
            ## Handle records with tag: ait
            # dyf_ai_study_step.show(5)
            dyf_ait = Filter.apply(frame=dyf_ai_study_step, f=lambda x: x['tag'] == 'ait')
            # dyf_ait = Filter.apply(frame=dyf_ai_study_step,
            #                        f=lambda x: x['tag'] == 'ait'
            #                                    and x['student_answer_details'] is not None
            #                                    and x['student_answer_details'] != 'null'
            #                                    and x['correct_answer'] is not None)
            df_ait = dyf_ait.toDF()
            # udf_parse_json = udf(lambda str: parse_json(str), json_schema)
            # age_list = df_ait["student_answer_details"].tolist()
            # print ('list', age_list)
            # Number of answers in the JSON answer-details array.
            df_ait = df_ait.withColumn('len_answer', udf_get_length(df_ait["student_answer_details"]))
            # df_ait.printSchema()
            # df_ait.show()
            df_ait_max_step = df_ait.groupby('user_id', 'lesson_id', 'total_step').agg(
                f.max('current_step').alias("max_step"))
            # Keep only lessons the learner finished (reached the last step).
            df_ait_max_step = df_ait_max_step.where('total_step = max_step')
            df_ait_max_step = df_ait_max_step.withColumnRenamed('user_id', 'user_id1').withColumnRenamed(
                'lesson_id', 'lesson_id1').withColumnRenamed('total_step', 'total_step1')
            # df_ait_max_step.printSchema()
            # df_ait_max_step.show()
            df_ait_received_point = df_ait.where(
                "student_answer_details IS NOT NULL AND max_point = received_point AND page_style like '%ait_practice%'")
            df_ait_received_point = df_ait_received_point.groupby('user_id', 'lesson_id').agg(
                f.count('received_point').alias("count_received_point"))
            df_ait_received_point = df_ait_received_point.withColumnRenamed('user_id', 'user_id2').withColumnRenamed(
                'lesson_id', 'lesson_id2')
            # df_ait_received_point.printSchema()
            # df_ait_received_point.show()
            # ait_pronunciation
            df_ait = df_ait.where("max_point = received_point AND page_style like '%ait_pronunciation%'")
            df_ait = df_ait.join(df_ait_received_point,
                                 (df_ait['user_id'] == df_ait_received_point['user_id2']) &
                                 (df_ait['lesson_id'] == df_ait_received_point['lesson_id2']))
            df_ait = df_ait.join(df_ait_max_step,
                                 (df_ait['user_id'] == df_ait_max_step['user_id1']) &
                                 (df_ait['lesson_id'] == df_ait_max_step['lesson_id1']))
            # print('SCHEMA:::')
            # df_ait.printSchema()
            # df_ait.show()
            df_ait = df_ait.withColumn("knowledge", add_score_ait(df_ait.total_step, df_ait.max_step,
                                                                  df_ait.count_received_point, df_ait.len_answer)) \
                .withColumn("comprehension", add_score_ait(df_ait.total_step, df_ait.max_step,
                                                           df_ait.count_received_point, df_ait.len_answer)) \
                .withColumn("application", add_score_ait(df_ait.total_step, df_ait.max_step,
                                                         df_ait.count_received_point, df_ait.len_answer)) \
                .withColumn("analysis", f.lit(0)) \
                .withColumn("synthesis", f.lit(0)) \
                .withColumn("evaluation", f.lit(0)) \
                .withColumn("lo_type", f.lit(1)) \
                .withColumn("date_id",
                            from_unixtime(unix_timestamp(df_ait.created_at, "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))
            # df_ait.printSchema()
            # df_ait.show()
            dyf_ait = DynamicFrame.fromDF(df_ait, glueContext, "dyf_ait")
            applymapping = ApplyMapping.apply(frame=dyf_ait,
                                              mappings=[("created_at", "string", "created_at", "timestamp"),
                                                        ("user_id", 'string', 'student_id', 'long'),
                                                        ("correct_answer", "string", "learning_object", "string"),
                                                        ("date_id", "string", "date_id", "int"),
                                                        ("knowledge", "int", "knowledge", "int"),
                                                        ("comprehension", "int", "comprehension", "int"),
                                                        ("application", "int", "application", "int"),
                                                        ("analysis", "int", "analysis", "int"),
                                                        ("synthesis", "int", "synthesis", "int"),
                                                        ("evaluation", "int", "evaluation", "int")])
            resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                                transformation_ctx="resolvechoice2")
            dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
            dropnullfields.printSchema()
            dropnullfields.show()
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                                                                       catalog_connection="glue_redshift",
                                                                       connection_options={
                                                                           "dbtable": "mapping_lo_student_starter",
                                                                           "database": "dts_odin"
                                                                       },
                                                                       redshift_tmp_dir="s3n://dts-odin/ai_study_step/",
                                                                       transformation_ctx="datasink5")
        except Exception as e:
            print("###################### Exception ##########################")
            print(e)

        # Persist the new watermark: the max _key processed in this run.
        df_temp = dyf_ai_study_step.toDF()
        flag = df_temp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')

        # overwrite the _key watermark in S3
        df.write.parquet("s3a://dts-odin/flag/flag_ai_study_step.parquet", mode="overwrite")
def main():
    """Load advisor accounts into the Redshift dimension table dim_advisor,
    then load advisor/EG assignments into dim_advisor_eg and merge them into
    dim_advisor via post-action SQL.

    Incremental behaviour: only advisor_account rows whose _key is greater
    than the watermark stored in s3://dts-odin/flag/flag_CVHD.parquet are
    processed, and the watermark is rewritten afterwards.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor",
        table_name="advisor_account",
        transformation_ctx="datasource0")
    datasource0 = datasource0.select_fields([
        '_key', 'user_id', 'user_name', 'user_display_name', 'user_email',
        'user_phone', 'ip_phone_number', 'level', 'advisor_deleted'
    ]).rename_field('user_id', 'id').rename_field('user_name', 'ten').rename_field(
        'advisor_deleted', 'advisor_deleted_tmp')

    # read the flag (watermark) from s3
    df_flag = spark.read.parquet("s3://dts-odin/flag/flag_CVHD.parquet")

    # compare _key of the datasource with the flag; keep rows with key > flag
    data = datasource0.toDF()
    data = data.where(data['_key'] > df_flag.collect()[0]['flag'])
    # Placeholder columns; filled later from dim_advisor_eg by the post-action.
    data = data.withColumn('type_eg', f.lit(None))
    data = data.withColumn('advisor_type', f.lit(None))
    # Normalise the deleted flag to 1/0.
    data = data.withColumn(
        'advisor_deleted',
        when(data.advisor_deleted_tmp, f.lit(1)).otherwise(f.lit(0)))
    data.printSchema()
    datasource0 = DynamicFrame.fromDF(data, glueContext, "datasource0")
    # datasource0.show()
    if (datasource0.count() > 0):
        try:
            # select the desired fields
            applymapping1 = ApplyMapping.apply(
                frame=datasource0,
                mappings=[("id", "int", "id", "bigint"),
                          ("ten", "string", "username", "string"),
                          ("user_display_name", "string", "name", "string"),
                          ("user_email", "string", "email", "string"),
                          ("level", "int", "level", "int"),
                          ("advisor_deleted", "int", "advisor_deleted", "int"),
                          ("type_eg", "int", "type_eg", "string"),
                          ("advisor_type", "int", "advisor_type", "string")],
                transformation_ctx="applymapping1")
            resolvechoice2 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice2")
            dropnullfields3 = DropNullFields.apply(
                frame=resolvechoice2, transformation_ctx="dropnullfields3")

            # write the data to redshift
            datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields3,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "dim_advisor",
                    "database": "dts_odin"
                },
                redshift_tmp_dir="s3n://dts-odin/backup/advisor_account/",
                transformation_ctx="datasink4")

            # take the max _key of the datasource
            datasource = datasource0.toDF()
            flag = datasource.agg({"_key": "max"}).collect()[0][0]

            # build a one-row data frame holding the new watermark
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')

            # overwrite the flag in s3
            df.write.parquet("s3a://dts-odin/flag/flag_CVHD.parquet", mode="overwrite")
        except:
            # Exception path (datasource exists but no rows satisfied the
            # filter): still advance the flag so the run is not repeated.
            # NOTE(review): the bare except also hides real write errors.
            datasource = datasource0.toDF()
            flag = datasource.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            df.write.parquet("s3a://dts-odin/flag/flag_CVHD.parquet", mode="overwrite")

    # EG assignments
    datasource = glueContext.create_dynamic_frame.from_catalog(
        database="dm_toa", table_name="advisor_eg")

    # select only the fields we need
    datasource = datasource.select_fields(
        ['_key', 'advisor_id', 'bo_phan', 'eg'])
    datasource = datasource.resolveChoice(specs=[('_key', 'cast:long')])
    data = datasource.toDF()
    # data = data.where(data['_key'] > df_flag.collect()[0]['flag'])
    # data = data.where(data['_key'] < 276961)
    datasource = DynamicFrame.fromDF(data, glueContext, "datasource")
    if (datasource.count() > 0):
        applymapping1 = ApplyMapping.apply(
            frame=datasource,
            mappings=[("advisor_id", "string", "advisor_id", "string"),
                      ("bo_phan", "string", "bo_phan", "string"),
                      ("eg", "string", "eg", "string")])
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")
        # Stage into dim_advisor_eg; the post-action then copies eg/bo_phan
        # into dim_advisor and drops the staging table.
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "dim_advisor_eg",
                "database": "dts_odin",
                "postactions": """update dim_advisor set type_eg = eg, advisor_type = bo_phan from dim_advisor_eg where id=advisor_id; DROP TABLE IF EXISTS public.dim_advisor_eg"""
            },
            redshift_tmp_dir="s3n://dts-odin/backup/advisor_account/",
            transformation_ctx="datasink4")
# Log the job configuration.  The password is deliberately masked: Glue job
# logs are persisted (e.g. to CloudWatch) and must never contain credentials.
print('database_name is: ', database_name)
print('driver is: ', driver)
print('username is: ', username)
print('password is: ', '********')
print('bucket_name is: ', bucket_name)
print('partition_Keys is: ', partition_Keys)

# Destination layout: s3://<bucket>/<database>/<table>
path = 's3://' + str(bucket_name) + "/" + str(database_name) + "/" + str(
    table_name)
print('Path is: ', path)

# Read data from the database using the JDBC driver into a DataFrame.
source_df = spark.read.format("jdbc").option("url", url).option(
    "dbtable", db_table_name).option("driver", driver).option(
    "user", username).option("password", password).load()

job.init(args['JOB_NAME'], args)

# Convert the DataFrame to an AWS Glue DynamicFrame.
dynamic_dframe = DynamicFrame.fromDF(source_df, glueContext, "dynamic_df")

# Write to S3 as Parquet, partitioned by the configured keys.
glueContext.write_dynamic_frame.from_options(frame=dynamic_dframe,
                                             connection_type="s3",
                                             connection_options={
                                                 "path": path,
                                                 "partitionKeys": partition_Keys
                                             },
                                             format="parquet")
job.commit()
def main():
    """Derive student status-change events from product purchases.

    Joins invoice product details (TAAM*/TENUP* categories) with product
    usage and the student contact map, then loads the rows into the Redshift
    staging table temp_mapping_status; post-action SQL merges them into
    mapping_changed_status_student_v1 and backfills user_id.

    NOTE: uses Python-2 long literals (e.g. 999L) — this script targets the
    Python 2 Glue runtime.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # ----------------------------------------------DYF-----------------------------------------------------------------#
    dyf_tpe_enduser_used_product = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market",
        table_name="tpe_enduser_used_product")
    dyf_tpe_enduser_used_product = dyf_tpe_enduser_used_product.select_fields(
        ['contact_id', 'product_id', 'timecreated'])

    # -----------------------------------------DYF-----------------------------------------------------------------------#
    dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market",
        table_name="tpe_invoice_product_details")
    dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
        ['id', 'cat_code'])

    # ----------------------------------------------DYF-----------------------------------------------------------------#
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")
    dyf_student_contact = dyf_student_contact.select_fields(
        ['contact_id', 'student_id']).rename_field('contact_id', 'ct_id')
    # dyf_student_contact = Filter.apply(frame=dyf_student_contact,
    #                                    f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
    #                                                and x["student_id"] is not None and x["student_id"] != ''
    #                                    )
    df_student_contact = dyf_student_contact.toDF()
    print('df_student_contact')
    df_student_contact.show()

    #-------------------------------------------------------------------------------------------------------------------#
    df_tpe_invoice_product_details = dyf_tpe_invoice_product_details.toDF()
    # Only TAAM* and TENUP* product categories are relevant for this mapping.
    df_tpe_invoice_product_details = df_tpe_invoice_product_details.\
        where("cat_code like 'TAAM%' OR cat_code like 'TENUP%' ")
    # Category -> target status id: TAAM% -> 999, TENUP% -> 998, else 999999999.
    df_tpe_invoice_product_details = df_tpe_invoice_product_details.withColumn(
        'to_status_id',
        f.when(df_tpe_invoice_product_details.cat_code.like('TAAM%'), f.lit(999L)).when(
            df_tpe_invoice_product_details.cat_code.like('TENUP%'),
            f.lit(998L)).otherwise(f.lit(999999999L)))
    df_tpe_invoice_product_details.show(2)
    df_tpe_enduser_used_product = dyf_tpe_enduser_used_product.toDF()

    #-----------------------------------------------____JOIN______------------------------------------------------------#
    df_join = df_tpe_invoice_product_details.join(
        df_tpe_enduser_used_product,
        df_tpe_invoice_product_details.id == df_tpe_enduser_used_product.product_id)
    df_join.printSchema()
    print('df_join ::', df_join.count())

    df_join1 = df_join.join(df_student_contact,
                            df_student_contact.ct_id == df_join.contact_id)
    # timecreated is a unix epoch; render it as a yyyyMMdd date id.
    df_join1 = df_join1.withColumn(
        'change_status_date_id',
        from_unixtime(df_join1.timecreated, "yyyyMMdd"))
    df_join1.printSchema()
    print('df_join1 ::', df_join1.count())

    #-------------------------------------------------------------------------------------------------------------------#
    df_result = df_join1.select('student_id', 'change_status_date_id',
                                'to_status_id', 'contact_id')
    df_result.printSchema()
    df_result.show(3)
    df_result = df_result.drop_duplicates()
    df_result.cache()
    print('count df_result::', df_result.count())
    dyf_result = DynamicFrame.fromDF(df_result, glueContext, "dyf_result")
    # Drop rows with any missing key field.
    dyf_result = Filter.apply(
        frame=dyf_result,
        f=lambda x: x["student_id"] is not None and x[
            "change_status_date_id"] is not None and x[
            "to_status_id"] is not None and x["contact_id"] is not None)

    apply_output = ApplyMapping.apply(
        frame=dyf_result,
        mappings=[
            ("student_id", "string", "student_id", "long"),
            # ("user_id", "long", "user_id", "long"),
            ("change_status_date_id", "string", "change_status_date_id", "long"
             ),
            # ("from_status_id", "long", "from_status_id", "long"),
            ("to_status_id", "long", "to_status_id", "long"),
            # ("measure1", "double", "measure1", "double"),
            # ("measure2", "double", "measure2", "double"),
            # ("description", "string", "description", "string"),
            # ("timestamp1", "string", "timestamp1", "string"),
            ("contact_id", "string", "contact_id", "string"),
            # ("teacher_id", "long", "teacher_id", "long"),
            # ("contact_id1", "string", "contact_id1", "string"),
            # ("measure1_int", "int", "measure1_int", "int"),
            # ("measure2_int", "int", "measure2_int", "int"),
            # ("contact_id_str", "string", "contact_id_str", "string"),
            # ("lc", "string", "lc", "string"),
            # ("student_id_string", "string", "student_id_string", "string")
        ])
    df_apply_output = apply_output.toDF()
    # NOTE(review): the result of drop_duplicates() is discarded — it does not
    # mutate in place; confirm whether deduplication was intended here.
    df_apply_output.drop_duplicates()
    print('df_apply_output.count', df_apply_output.count())
    dyf_apply_output = DynamicFrame.fromDF(df_apply_output, glueContext,
                                           "dyf_apply_output")
    resolve_choice = ResolveChoice.apply(frame=dyf_apply_output,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dropnullfields = DropNullFields.apply(frame=resolve_choice,
                                          transformation_ctx="dropnullfields")

    # Stage into temp_mapping_status; the post-actions merge the rows into
    # mapping_changed_status_student_v1, backfill user_id from user_map, and
    # drop the staging table.
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields,
        catalog_connection="glue_redshift",
        connection_options={
            "dbtable": "temp_mapping_status",
            "database": "dts_odin",
            "postactions": """ insert into mapping_changed_status_student_v1(student_id, change_status_date_id, to_status_id, contact_id) select student_id, change_status_date_id, to_status_id, contact_id from temp_mapping_status; update mapping_changed_status_student_v1 set user_id = (select user_id from user_map where source_type = 2 and source_id = student_id) where user_id is null; DROP TABLE IF EXISTS temp_mapping_status """
        },
        redshift_tmp_dir="s3n://datashine-dwh/temp1/",
        transformation_ctx="datasink4")
    df_result.unpersist()
    df_student_contact.unpersist()
    print(
        '------------------------>___complete__________------------------------------>'
    )
#Note show function is as action . Action fources the execution of the data frame plan. # With big data he slowdown would be significant without cacching data_frame_aggerafated.show(10) ################################################################### ########## LOAD (WRITE DATA) ################################################################### # CREATE JUST 1 PARTATION , BECAUSE there is little data data_frame_aggerafated = data_frame_aggerafated.repartition(10) #Convert back to dynamic frame dynamic_frame_write = DynamicFrame.fromDF(data_frame_aggerafated,glue_context,"dynamic_frame_write") #write data back to s3 glue_context.write_dynamic_frame.from_options( frame = dynamic_frame_write, connection_type = "s3", connection_options = { "path" : s3_write_path, #hrere you cloud create s3 prefic according to a value in seperate cloums #"partationKeys":["decate"] }, format = "csv" ) #log end time dt_end = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def writeCsvFile(datasource, path):
    """Write *datasource* to S3 at *path* as CSV, collapsed to one partition
    so the output lands in a single part file."""
    single_part = DynamicFrame.toDF(datasource).repartition(1)
    dyf = DynamicFrame.fromDF(single_part, glueContext, 'write-csv')
    glueContext.write_dynamic_frame.from_options(
        frame=dyf,
        connection_type="s3",
        connection_options={"path": path},
        format="csv",
        transformation_ctx="write-csv")
def main():
    """Glue job: build the "A1" status-change fact from successful care calls.

    Pipeline: read care_call from the catalog; keep successful calls longer
    than 30s; map phone numbers to contact ids; join with the payment
    history (lich_su_dong_tien, whose creation date becomes ngay_a0); keep
    calls made after ngay_a0; take the earliest such call per (contact,
    makh) as ngay_a1; load the result into Redshift via a temp table whose
    postactions move rows into mapping_changed_status_student.
    """
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # Vietnam local time for all timestamp handling in this session.
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # Source: care calls; _key is the incremental cursor, cast to long.
    dyf_care_call = glueContext.create_dynamic_frame.from_catalog(
        database='tig_advisor', table_name='care_call')
    dyf_care_call = dyf_care_call.resolveChoice(specs=[('_key', 'cast:long')])

    # print schema and select fields
    print('original schema')
    dyf_care_call.printSchema()
    dyf_care_call.show(10)

    dyf_care_call = dyf_care_call.select_fields(
        ['_key', 'id', 'phone', 'duration', 'call_status',
         'time_created']).rename_field('time_created', 'call_date')

    # Deduplicate by call id and cache; the cached DataFrame is reused for
    # the flag (max _key) computation at the end of the job.
    dy_source_care_call_cache = dyf_care_call.toDF()
    dy_source_care_call_cache = dy_source_care_call_cache.dropDuplicates(
        ['id'])
    dy_source_care_call_cache = dy_source_care_call_cache.cache()
    dyf_care_call = DynamicFrame.fromDF(dy_source_care_call_cache,
                                        glueContext, 'dyf_care_call')

    # Keep only successful calls with a phone, a call date and > 30s duration.
    dyf_care_call = Filter.apply(
        frame=dyf_care_call,
        f=lambda x: x["phone"] is not None and x["phone"] != '' and
        (x["call_status"] == 'success' or x["call_status"] == 'call_success')
        and x["call_date"] is not None and x["call_date"] != '' and
        x["duration"] is not None and x["duration"] > 30)

    print('dyf_care_call number', dyf_care_call.count())
    if (dyf_care_call.count() > 0):
        # Phone -> contact_id mapping table.
        dyf_ad_contact_phone = glueContext.create_dynamic_frame.from_catalog(
            database='tig_advisor', table_name='student_contact_phone')
        dyf_ad_contact_phone = dyf_ad_contact_phone.select_fields(
            ['phone', 'contact_id'])
        dyf_ad_contact_phone = Filter.apply(
            frame=dyf_ad_contact_phone,
            f=lambda x: x["phone"] is not None and x["phone"] != '' and x[
                "contact_id"] is not None and x["contact_id"] != '')
        print('dyf_ad_contact_phone::schema')
        dyf_ad_contact_phone.printSchema()

        # Attach contact ids to the calls.
        join_call_contact = Join.apply(dyf_care_call, dyf_ad_contact_phone,
                                       'phone', 'phone')
        join_call_contact.printSchema()
        join_call_contact.show(2)
        print('join: ', join_call_contact.count())

        # Payment history; rename ngay_tao (creation date) to ngay_a0.
        dyf_source_ls_dong_tien = glueContext.create_dynamic_frame.from_catalog(
            database='poss', table_name='nvn_poss_lich_su_dong_tien')
        dyf_source_ls_dong_tien = Filter.apply(
            frame=dyf_source_ls_dong_tien,
            f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
            and x["ngay_thanhtoan"] is not None and x["ngay_thanhtoan"
                                                     ] != '')
        dyf_source_ls_dong_tien = dyf_source_ls_dong_tien.select_fields([
            '_key', 'id', 'contact_id', 'ngay_thanhtoan', 'ngay_tao', 'makh'
        ]).rename_field('ngay_tao', 'ngay_a0')
        dy_source_ls_dt_cache = dyf_source_ls_dong_tien.toDF()
        dy_source_ls_dt_cache = dy_source_ls_dt_cache.dropDuplicates(
            ['id'])
        dy_source_ls_dt_cache = dy_source_ls_dt_cache.cache()
        dyf_source_ls_dong_tien = DynamicFrame.fromDF(
            dy_source_ls_dt_cache, glueContext, 'dyf_source_ls_dong_tien')

        join_call_contact_ao = Join.apply(join_call_contact,
                                          dyf_source_ls_dong_tien,
                                          'contact_id', 'contact_id')
        join_call_contact_ao.printSchema()
        join_call_contact_ao.show(2)
        print('join: ', join_call_contact_ao.count())

        # Keep calls made after the A0 (payment-history creation) date.
        # NOTE(review): call_date and ngay_a0 are compared directly —
        # presumably both are "yyyy-MM-dd HH:mm:ss" strings; confirm.
        join_call_contact_ao = Filter.apply(
            frame=join_call_contact_ao,
            f=lambda x: x["call_date"] is not None and x[
                "ngay_a0"] is not None and x["call_date"] > x["ngay_a0"])

        print(
            'join_call_contact_ao::after filter calldate > ngay_a0------------'
        )
        join_call_contact_ao.show(2)
        print('join_call_contact_ao: ', join_call_contact_ao.count())

        # Earliest successful "welcome" call per (contact_id, makh) = A1 date.
        df_join_call_contact_ao = join_call_contact_ao.toDF()
        df_join_call_contact_ao = df_join_call_contact_ao.groupby(
            'contact_id', 'makh').agg(f.min('call_date').alias("ngay_a1"))
        df_join_call_contact_ao = df_join_call_contact_ao.withColumn(
            'id_time',
            from_unixtime(
                unix_timestamp(df_join_call_contact_ao.ngay_a1,
                               "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))
        dyf_result = DynamicFrame.fromDF(df_join_call_contact_ao,
                                         glueContext, 'dyf_result')
        dyf_result.show(2)
        print('dyf_result: ', dyf_result.count())

        # Select and type the output fields.
        applymapping1 = ApplyMapping.apply(
            frame=dyf_result,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("id_time", "string", "id_time", "bigint"),
                      ("makh", "int", "makh", "int"),
                      ("ngay_a1", "string", "ngay_a1", "timestamp")])
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")

        # Load into Redshift via a temp table; postactions move rows into
        # mapping_changed_status_student and drop the temp table.
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "temp_ls_dong_tien_a1_v3",
                "database": "dts_odin",
                "postactions": """ INSERT into mapping_changed_status_student(description, user_id, change_status_date_id, to_status_id, timestamp1) SELECT 'contact_id: ' + temp_a1.contact_id +' - makh: ' + temp_a1.makh, um.user_id ,temp_a1.id_time, 2, temp_a1.ngay_a1 FROM temp_ls_dong_tien_a1_v3 temp_a1 LEFT JOIN user_map um ON um.source_type = 1 AND um.source_id = temp_a1.contact_id ; DROP TABLE IF EXISTS public.temp_ls_dong_tien_a1_v3; CALL update_a1_exception_from_eg() """
            },
            redshift_tmp_dir="s3n://dts-odin/temp/temp_ls_dong_tien/v2",
            transformation_ctx="datasink4")

        # Persist the incremental cursor (max _key seen) for the next run.
        df_datasource = dyf_care_call.toDF()
        flag = df_datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        df.write.parquet(
            "s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet",
            mode="overwrite")
        dy_source_care_call_cache.unpersist()
# FIX: boto3 was referenced below (boto3.Session) without being imported,
# which raises NameError at runtime.
import boto3

from awsglue.transforms import *
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Glue / Spark session setup.
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

# NOTE(review): `session` and `glue_client` are not used in this fragment —
# they may be used further down the file; left in place.
session = boto3.Session(region_name='ap-northeast-2')
glue_client = session.client(service_name='glue')

# Target location for the forecast input data.
s3Bucket = "s3://forecast-demogo-bucket"
s3Folder = "/forecast_data"

# Set source data with raw_data in S3 (via the Glue Data Catalog).
datasource = glueContext.create_dynamic_frame.from_catalog(
    database="forecast_raw_db", table_name="raw_data")
df1 = datasource.toDF()

# Rename the sales column to the name the downstream consumer expects.
# Equivalent to: RenameField.apply(frame=df, old_name="sales_quantity", new_name="target_value")
df2 = df1.withColumnRenamed("sales_quantity", "target_value")
data_frame = DynamicFrame.fromDF(df2, glueContext, "data_frame")

# Write the renamed data set to S3 as CSV.
glueContext.write_dynamic_frame.from_options(
    frame=data_frame,
    connection_type="s3",
    connection_options={"path": s3Bucket + s3Folder},
    format="csv")
# Persist the raw features and labels used for model training.
logger.info(f'Dumping features and labels for training...')
dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')

# Pack every feature column except the transaction id into one JSON string
# column named 'props_values:String' (Gremlin bulk-load property syntax).
featurs_graph_df = features_df.withColumn(
    'props_values:String',
    to_json(
        struct(
            list(
                filter(lambda x: (x != TRANSACTION_ID),
                       features_df.schema.names)))))
featurs_graph_df = featurs_graph_df.select('TransactionID',
                                           'props_values:String')

logger.info(f'Creating glue dynamic frame from spark dataframe...')
features_graph_dynamic_df = DynamicFrame.fromDF(featurs_graph_df,
                                                glueContext, 'FeaturesDF')

# Build the Gremlin CSV columns: '~id' derived from the transaction id with
# a 't' prefix (presumably "t<TransactionID>" — confirm against
# GlueGremlinCsvTransforms), and '~label' fixed to 'Transaction'.
features_graph_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    features_graph_dynamic_df, [('~id', TRANSACTION_ID, 't')])
features_graph_dynamic_df = GlueGremlinCsvTransforms.addLabel(
    features_graph_dynamic_df, 'Transaction')
# Keep only the graph-loader columns.
features_graph_dynamic_df = SelectFields.apply(
    frame=features_graph_dynamic_df,
    paths=["~id", '~label', 'props_values:String'])

logger.info(f'Dumping transaction data as graph data...')
dump_df_to_s3(features_graph_dynamic_df.toDF(), f'transaction', graph=True)

# Derive edge lists linking transactions to identity attributes.
relational_edges = get_relations_and_edgelist(transactions.toDF(),
                                              identities.toDF(), id_cols)
for name, df in relational_edges.items():
    if name != TRANSACTION_ID:
        logger.info(f'Dumping edge {name} for training...')
def main():
    """Glue job: compute native_talk self-study time per contact per day.

    Reads the native_talk API history log incrementally (flag parquet file
    stores the last processed id), joins usernames to contact ids, credits
    each qualifying session with 5 minutes (0.083333 h), aggregates session
    count and hours per (contact_id, id_time), and loads the result into
    Redshift via a temp table whose postactions insert into
    mapping_changed_status_student.
    """
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # Vietnam local time for all timestamp handling in this session.
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # Source: native_talk history log; id is the incremental cursor.
    dyf_native_talk = glueContext.create_dynamic_frame.from_catalog(database='native_talk',
                                                                    table_name='native_talk_history_log_api')
    dyf_native_talk = dyf_native_talk.resolveChoice(specs=[('id', 'cast:long')])

    # Incremental read: keep only records after the stored flag. A missing
    # or unreadable flag file means a full read (best-effort by design).
    try:
        df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                       f=lambda x: x["id"] > read_from_index)
    except:
        print('read flag file error ')

    dyf_native_talk = dyf_native_talk.select_fields(
        ['id', 'learning_date', 'speaking_dialog_score', 'username', 'updated_time'])

    # Cache the source; reused at the end to compute the new flag value.
    dy_cache = dyf_native_talk.toDF()
    dy_cache = dy_cache.cache()
    dyf_native_talk = DynamicFrame.fromDF(dy_cache, glueContext, 'dyf_native_talk')

    print('dy_cache------------')
    dy_cache.printSchema()
    print('dy_cache: ', dy_cache.count())
    dy_cache.show(2)

    if (dyf_native_talk.count() > 0):
        # Keep rows with a username, a speaking score and a learning date.
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                       f=lambda x: x["username"] is not None and x["username"] != ''
                                                   and x["speaking_dialog_score"] is not None
                                                   and x["learning_date"] is not None and x["learning_date"] != '')

        if (dyf_native_talk.count() > 0):
            # username -> contact_id mapping table.
            dyf_nt_account_mapping = glueContext.create_dynamic_frame.from_catalog(database='native_talk',
                                                                                   table_name='native_talk_account_mapping')
            dyf_nt_account_mapping = dyf_nt_account_mapping.select_fields(['contact_id', 'username']).rename_field('username', 'nativetalk_user')
            dy_cache_2 = dyf_nt_account_mapping.toDF()
            dy_cache_2 = dy_cache_2.cache()
            dyf_nt_account_mapping = DynamicFrame.fromDF(dy_cache_2, glueContext, 'dyf_nt_account_mapping')
            dyf_nt_account_mapping = Filter.apply(frame=dyf_nt_account_mapping,
                                                  f=lambda x: x["nativetalk_user"] is not None and x["nativetalk_user"] != '')

            # Attach contact ids to the history-log rows.
            join = Join.apply(dyf_native_talk, dyf_nt_account_mapping, 'username', 'nativetalk_user')
            if(join.count() > 0):
                df_nativetalk = join.toDF()
                # Each qualifying session counts as 5 minutes (0.083333 h).
                df_nativetalk = df_nativetalk.withColumn('sogio', f.lit(0.083333))
                # id_time = learning date rendered as yyyyMMdd.
                df_nativetalk = df_nativetalk.withColumn('id_time',
                                                         from_unixtime(
                                                             unix_timestamp(df_nativetalk.learning_date, "yyyy-MM-dd"),
                                                             "yyyyMMdd"))
                df_nativetalk = df_nativetalk.where("contact_id IS NOT NULL")
                data_nativetalk = DynamicFrame.fromDF(df_nativetalk, glueContext, 'data_nativetalk')
                data_nativetalk = data_nativetalk.resolveChoice(specs=[('sogio', 'cast:float')])

                print('data_nativetalk----------')
                data_nativetalk.printSchema()

                # Build "fact_hieusuathoctap" (study-performance fact):
                # session count and study hours per student per day.
                df_hieusuathoctap = data_nativetalk.toDF()
                df_hieusuathoctap = df_hieusuathoctap.groupby('contact_id', 'id_time').agg(f.sum('sogio'),
                                                                                           f.count('contact_id'))
                # 400 = self-study type id for native_talk.
                df_hieusuathoctap = df_hieusuathoctap.withColumn('tu_hoc_type_id', f.lit(400))
                data_hieusuathoctap = DynamicFrame.fromDF(df_hieusuathoctap, glueContext, 'data_hieusuathoctap')
                data_hieusuathoctap = data_hieusuathoctap.resolveChoice(specs=[('sum(sogio)', 'cast:double')])

                print('data_hieusuathoctap::data_hieusuathoctap::data_hieusuathoctap------------------------------------------')
                data_hieusuathoctap.printSchema();

                # Select and type the output fields.
                applymapping2 = ApplyMapping.apply(frame=data_hieusuathoctap,
                                                   mappings=[("contact_id", "string", "contact_id", "string"),
                                                             ("id_time", 'string', 'id_time', 'bigint'),
                                                             ("count(contact_id)", 'long', 'soca', 'int'),
                                                             ("sum(sogio)", 'double', 'sogio', 'double'),
                                                             ("tu_hoc_type_id", 'int', "tu_hoc_type_id", "int")])
                resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                                     transformation_ctx="resolvechoice2")
                dropnullfields2 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
                print('dropnullfields2 number: ', dropnullfields2.count())

                # Load into Redshift via a temp table; postactions insert the
                # aggregates into mapping_changed_status_student and drop it.
                datasink2 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields2,
                                                                           catalog_connection="glue_redshift",
                                                                           connection_options={"dbtable": "temp_staging_lich_su_tu_hoc_native_talk",
                                                                                               "database": "dts_odin",
                                                                                               "postactions": """INSERT into mapping_changed_status_student(user_id, change_status_date_id, to_status_id, measure1, measure2) SELECT um.user_id, hwb.id_time, 53, hwb.soca, round(hwb.sogio, 4) FROM temp_staging_lich_su_tu_hoc_native_talk hwb LEFT JOIN user_map um ON um.source_type = 1 AND um.source_id = hwb.contact_id; DROP TABLE IF EXISTS public.temp_staging_lich_su_tu_hoc_native_talk """
                                                                                               },
                                                                           redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/",
                                                                           transformation_ctx="datasink2")

                # Persist the incremental cursor (max id seen) for the next run.
                df_datasource = dyf_native_talk.toDF()
                flag = df_datasource.agg({"id": "max"}).collect()[0][0]
                print('flag: ', flag)
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')
                df.write.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet", mode="overwrite")
                dy_cache.unpersist()
                dy_cache_2.unpersist()
ch.last_pos_orientation, \ ch.last_pos_name, \ ch.last_pos_bin, \ ch.last_pos_tier, \ ch.last_pos_anchor, \ ch.last_pos_orientation_degrees, \ ch.last_ops_pos_id, \ ch.last_pos_slot_on_carriage, \ ch.deleted_dt, \ ch.is_deleted \ FROM distxpsche ch \ INNER JOIN maxche mc ON ch.gkey = mc.gkey \ and coalesce(ch.last_time,cast('1900-01-01' as timestamp)) = mc.last_time \ and coalesce(ch.time_dispatch,cast('1900-01-01' as timestamp)) = mc.time_dispatch \ where status = 1") xpsche_dynDF = DynamicFrame.fromDF(xpsche_distDF, glueContext, "nested") ## xps_ecevent connection xpsecevent_DS = glueContext.create_dynamic_frame.from_catalog(database = "staging_combined", table_name = "xps_ecevent", transformation_ctx = "xpsecevent_DS") xpsecevent_regDF = xpsecevent_DS.toDF() xpsecevent_regDF.createOrReplaceTempView("distxpsecevent") xpsecevent_distDF = spark.sql("SELECT sourcesystem, \ gkey, \ yard, \ pkey, \ max(timestamp) ectimestamp, \ type, \ che_id, \ che_name, \ operator_name, \ sub_type, \
# The `provider id` field will be choice between long and string # Cast choices into integers, those values that cannot cast result in null medicare_res = medicare_dyf.resolveChoice(specs = [('provider id','cast:long')]) # Remove erroneous records medicare_df = medicare_res.toDF() medicare_df = medicare_df.where("`provider id` is NOT NULL") # Apply a lambda to remove the '$' chop_f = udf(lambda x: x[1:], StringType()) medicare_df = medicare_df.withColumn("ACC", chop_f(medicare_df["average covered charges"])).withColumn("ATP", chop_f(medicare_df["average total payments"])).withColumn("AMP", chop_f(medicare_df["average medicare payments"])) # Turn it back to a dynamic frame medicare_tmp = DynamicFrame.fromDF(medicare_df, glueContext, "nested") # Rename, cast, and nest with apply_mapping medicare_nest = medicare_tmp.apply_mapping([('drg definition', 'string', 'drg', 'string'), ('provider id', 'long', 'provider.id', 'long'), ('provider name', 'string', 'provider.name', 'string'), ('provider city', 'string', 'provider.city', 'string'), ('provider state', 'string', 'provider.state', 'string'), ('provider zip code', 'long', 'provider.zip', 'long'), ('hospital referral region description', 'string','rr', 'string'), ('ACC', 'string', 'charges.covered', 'double'), ('ATP', 'string', 'charges.total_pay', 'double'), ('AMP', 'string', 'charges.medicare_pay', 'double')]) # Write it out in Parquet glueContext.write_dynamic_frame.from_options(frame = medicare_nest, connection_type = "s3", connection_options = {"path": output_dir}, format = "parquet")
# Cast choices into integers, those values that cannot cast result in null medicare_res = medicare_dyf.resolveChoice(specs=[('provider id', 'cast:long')]) # Remove erroneous records medicare_df = medicare_res.toDF() medicare_df = medicare_df.where("`provider id` is NOT NULL") # Apply a lambda to remove the '$' chop_f = udf(lambda x: x[1:], StringType()) medicare_df = medicare_df.withColumn( "ACC", chop_f(medicare_df["average covered charges"])).withColumn( "ATP", chop_f(medicare_df["average total payments"])).withColumn( "AMP", chop_f(medicare_df["average medicare payments"])) # Turn it back to a dynamic frame medicare_tmp = DynamicFrame.fromDF(medicare_df, glueContext, "nested") # Rename, cast, and nest with apply_mapping medicare_nest = medicare_tmp.apply_mapping([ ('drg definition', 'string', 'drg', 'string'), ('provider id', 'long', 'provider.id', 'long'), ('provider name', 'string', 'provider.name', 'string'), ('provider city', 'string', 'provider.city', 'string'), ('provider state', 'string', 'provider.state', 'string'), ('provider zip code', 'long', 'provider.zip', 'long'), ('hospital referral region description', 'string', 'rr', 'string'), ('ACC', 'string', 'charges.covered', 'double'), ('ATP', 'string', 'charges.total_pay', 'double'), ('AMP', 'string', 'charges.medicare_pay', 'double') ])
def hash_cc(s):
    """Return the SHA-256 hex digest of *s*, or None for null input.

    FIX: hashlib.sha256 requires bytes, but the Spark UDF below feeds it
    string column values — the original raised TypeError on every row.
    Text input is UTF-8 encoded first; bytes pass through unchanged.
    None is returned as-is so null columns stay null instead of crashing
    the executor.
    """
    if s is None:
        return None
    if not isinstance(s, bytes):
        s = s.encode('utf-8')
    return hashlib.sha256(s).hexdigest()


## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read the raw user profiles from the Data Catalog.
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "serverless-datalake", table_name = "user-profile", transformation_ctx = "datasource0")

## @convert glue DynamicFrame to DataFrame to manipulate the columns
dataframe0 = DynamicFrame.toDF(datasource0)

# Replace the sensitive cc / ssn values with their hashes, then drop the
# plaintext columns (and password) entirely.
hash_cc_f = udf(lambda x: hash_cc(x), StringType())
dataframe0 = dataframe0.withColumn("hash_cc", hash_cc_f(dataframe0["cc"])).withColumn("hash_ssn", hash_cc_f(dataframe0["ssn"]))
dataframe0 = dataframe0.drop('cc').drop('ssn').drop('password')

## @convert dataframe to glue DynamicFrame and write the output in parquet format
datasource1 = DynamicFrame.fromDF(dataframe0, glueContext, "name1")
datasink4 = glueContext.write_dynamic_frame.from_options(frame = datasource1, connection_type = "s3", connection_options = {"path": "s3://serverless-datalake-ingestionbucket-1jiyskijz5i03/prepared/userprofile-secure"}, format = "parquet", transformation_ctx = "datasink4")
job.commit()
def main():
    """Glue job (Python 2): score right/wrong vocabulary answers from quiz AI results.

    Joins moodle quiz-attempt AI results with questions, attempts and
    users, explodes the right/wrong word lists, matches each word against
    the learning-object catalog, computes Bloom-style score aggregates
    (knowledge/comprehension/application/analysis/synthesis/evaluation)
    per (student, date, learning object), outer-joins the plus and minus
    sides, and writes the result to S3 as partitioned parquet. Finally the
    incremental cursor (max _key) is persisted to a flag parquet file.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # Only process attempts started on/after 2019-10-01 (epoch seconds).
    timestamp = 1569888000

    # ETL TBHV
    # Custom function: split a word into all 2-char and 1-char slices.
    # NOTE(review): only used via the (unused) splitWord UDF below.
    def doSplitWord(word):
        size = len(word)
        rs = [word[i:i + 2] for i in range(0, size, 1)]
        rs1 = [word[i:i + 1] for i in range(0, size, 1)]
        rs.extend(rs1)
        return rs

    state_right = 'state_right'
    state_wrong = 'state_wrong'

    # Knowledge points are always granted (P1_D1..P4_D2); no name filter.
    knowledge = ''
    # Question-name prefixes that earn comprehension points:
    comprehension = [
        'P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2',
        'P4_D1', 'P4_D2'
    ]
    # ...application points:
    application = [
        'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2', 'P4_D1', 'P4_D2'
    ]
    # ...analysis points:
    analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2']
    # ...synthesis points:
    synthesis = ['P4_D1', 'P4_D2']
    # No names currently earn evaluation points.
    evaluation = ''

    # Score a question for one Bloom category: +10 if answered right,
    # -5 if wrong, 0 if the question name matches none of the category's
    # prefixes.
    def doAddScore(name, state, type):
        arr = ['']
        score = 0
        if type == 'comprehension':
            arr = comprehension
        if type == 'application':
            arr = application
        if type == 'analysis':
            arr = analysis
        if type == 'synthesis':
            arr = synthesis
        name = name.lower()
        if state == state_right:
            score = 10
        if state == state_wrong:
            score = -5
        if name is not None:
            for x in arr:
                if x.lower() in name:
                    return score
        return 0

    addScore = udf(doAddScore, IntegerType())

    # Sum two nullable partial scores. NOTE(review): currently unused.
    def doAddScoreAll(plus, minus):
        if plus is None and minus is not None:
            return minus
        if minus is None and plus is not None:
            return plus
        if minus is not None and plus is not None:
            return plus + minus
        return 0

    addScoreAll = udf(doAddScoreAll, IntegerType())

    # Coalesce two nullable values (first non-null wins, else 0).
    def do_check_null(val1, val2):
        if val1 is None and val2 is not None:
            return val2
        if val2 is None and val1 is not None:
            return val1
        if val1 is not None and val2 is not None:
            return val1
        return 0

    check_data_null = udf(do_check_null, StringType())

    # Characters to strip from the raw word-list strings, e.g.
    # ["house","her"] -> house,her
    special_str = '["] ;'

    # NOTE(review): unused UDF.
    splitWord = udf(lambda x: doSplitWord(x))

    ########## top_quiz_attempts
    dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_quiz_attempts")
    dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(
        ['_key', 'id', 'timestart', 'quiz'])
    dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(
        specs=[('_key', 'cast:long')])
    print dyf_top_quiz_attempts.count()
    dyf_top_quiz_attempts.show(2)

    # Keep only attempts started on/after the cutoff timestamp.
    dyf_top_quiz_attempts = Filter.apply(
        frame=dyf_top_quiz_attempts,
        f=lambda x: x["timestart"] >= timestamp)
    print dyf_top_quiz_attempts.count()
    dyf_top_quiz_attempts.show()

    if dyf_top_quiz_attempts.count() > 0:
        ########## dyf_top_user: moodle user id -> student id
        dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="do_top_user")
        dyf_top_user = dyf_top_user.select_fields(['id',
                                                   'student_id']).rename_field(
                                                       'id', 'top_user_id')
        ######### top_question: id -> question name
        dyf_top_question = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_question")
        dyf_top_question = dyf_top_question.select_fields(
            ['id', 'name']).rename_field('id', 'quest_id')
        ######### top_result_ai: per-question AI scoring results
        dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_result_ai")
        dyf_top_result_ai = dyf_top_result_ai.select_fields([
            'question_id', 'attempt_id', 'user_id', 'ratio', 'right_word',
            'wrong_word'
        ])

        # JOIN and FILTER the tables: results -> questions -> attempts
        # (excluding quizzes 7, 9, 918) -> users.
        dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question,
                                'question_id', 'quest_id')
        dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts,
                                'attempt_id', 'id')
        dyf_join02 = Filter.apply(frame=dyf_join02,
                                  f=lambda x: x["quiz"] not in [7, 9, 918])
        dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id',
                                'top_user_id')

        df_study = dyf_join02.toDF()
        df_study.cache()
        if (df_study.count() > 0):
            try:
                # Strip the special characters [ ] " ; and spaces.
                # Raw data looks like ["house","her","to"] or "environmental".
                df_study = df_study.select('quiz', 'name', 'student_id',
                                           'timestart', 'right_word',
                                           'wrong_word')
                df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \
                    .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, ''))

                # Split the cleaned string into a word array:
                # house,her -> [house, her]
                # --- correct words ---
                df_study_right = df_study.withColumn(
                    "right_word_list", f.split(df_study.right_word_new, ','))
                # Explode the array into one row per word:
                # [house, her] -> row1: house / row2: her
                df_study_right = df_study_right.withColumn(
                    "right", f.explode(df_study_right.right_word_list))
                df_study_right = df_study_right.select('quiz', 'name',
                                                       'student_id',
                                                       'timestart', 'right')
                df_study_right = df_study_right.withColumn(
                    "right", f.lower(f.col("right")))
                dyf_study_right = DynamicFrame.fromDF(df_study_right,
                                                      glueContext,
                                                      "dyf_study_right")

                ## Learning Object catalog (lower-cased names for the join).
                dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                    database="nvn_knowledge", table_name="learning_object")
                dyf_learning_object = dyf_learning_object.select_fields(
                    ['learning_object_id', 'learning_object_name'])
                df_learning_object = dyf_learning_object.toDF()
                # convert to lowercase
                df_learning_object = df_learning_object.withColumn(
                    "learning_object_name",
                    f.lower(f.col("learning_object_name")))
                dyf_learning_object = DynamicFrame.fromDF(
                    df_learning_object, glueContext, "dyf_learning_object")

                dyf_knowledge_right = Join.apply(dyf_study_right,
                                                 dyf_learning_object, 'right',
                                                 'learning_object_name')

                # Add scores for the correctly answered words (+10 knowledge,
                # category scores via addScore) and aggregate per
                # (student, date, learning object).
                df_knowledge_right = dyf_knowledge_right.toDF()
                df_knowledge_right.cache()
                df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(10)) \
                    .withColumn("comprehension", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('comprehension'))) \
                    .withColumn("application", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \
                    .withColumn("analysis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \
                    .withColumn("synthesis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd'))
                df_knowledge_right = df_knowledge_right.groupby(
                    'student_id', 'date_id', 'learning_object_id').agg(
                        f.count('knowledge').alias("count_plus"),
                        f.sum('knowledge').alias("knowledge_plus"),
                        f.sum('comprehension').alias("comprehension_plus"),
                        f.sum('application').alias("application_plus"),
                        f.sum('analysis').alias("analysis_plus"),
                        f.sum('synthesis').alias("synthesis_plus"),
                        f.sum('evaluation').alias("evaluation_plus"))
                df_knowledge_right = df_knowledge_right.where(
                    'student_id is not null')
                # END scoring of correct words

                #################################################
                # Subtract points for wrong words (same flow as above;
                # rule: -5 knowledge per wrong word).
                df_study_wrong = df_study.withColumn(
                    "wrong_word_list", f.split(df_study.wrong_word_new, ','))
                # Explode: [house, her] -> row1: house / row2: her
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.explode(df_study_wrong.wrong_word_list))
                # convert to lowercase
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.lower(f.col("wrong")))
                df_study_wrong = df_study_wrong.select('quiz', 'name',
                                                       'student_id',
                                                       'timestart', 'wrong')
                dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong,
                                                      glueContext,
                                                      "dyf_study_wrong")

                ## Match wrong words against the Learning Object catalog.
                dyf_knowledge_wrong = Join.apply(dyf_study_wrong,
                                                 dyf_learning_object, 'wrong',
                                                 'learning_object_name')

                # Score and aggregate the wrong words; columns are renamed
                # with a _wrong suffix for the outer join below.
                df_knowledge_wrong = dyf_knowledge_wrong.toDF()
                df_knowledge_wrong.cache()
                df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-5)) \
                    .withColumn("comprehension", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \
                    .withColumn("application", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \
                    .withColumn("analysis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \
                    .withColumn("synthesis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd'))
                df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id', 'learning_object_id').agg(
                    f.count('knowledge').alias("count_minus"),
                    f.sum('knowledge').alias("knowledge_minus"),
                    f.sum('comprehension').alias("comprehension_minus"),
                    f.sum('application').alias("application_minus"),
                    f.sum('analysis').alias("analysis_minus"),
                    f.sum('synthesis').alias("synthesis_minus"),
                    f.sum('evaluation').alias("evaluation_minus"))\
                    .withColumnRenamed('student_id', 'student_id_wrong') \
                    .withColumnRenamed('date_id', 'date_id_wrong') \
                    .withColumnRenamed('learning_object_id', 'learning_object_id_wrong')
                df_knowledge_wrong = df_knowledge_wrong.where(
                    'student_id_wrong is not null')

                # Full outer join of plus and minus aggregates, coalescing
                # the key columns with check_data_null.
                df_knowledge = df_knowledge_right.join(
                    df_knowledge_wrong,
                    (df_knowledge_right['student_id'] == df_knowledge_wrong['student_id_wrong'])
                    & (df_knowledge_right['date_id'] == df_knowledge_wrong['date_id_wrong'])
                    & (df_knowledge_right['learning_object_id'] == df_knowledge_wrong['learning_object_id_wrong']),
                    'outer')
                df_knowledge = df_knowledge.withColumn("user_id", check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \
                    .withColumn("learning_object_id", check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \
                    .withColumn("created_date_id", check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \
                    .withColumn("source_system", f.lit('top_result_ai')) \
                    .withColumn("lu_id", f.lit(0))
                dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext,
                                                    "df_knowledge")

                # Select and type the output fields.
                applymapping2 = ApplyMapping.apply(
                    frame=dyf_knowledge,
                    mappings=[
                        ("user_id", 'string', 'student_id', 'long'),
                        ("learning_object_id", "string",
                         "learning_object_id", "long"),
                        ("knowledge_plus", "long", "knowledge_plus", "long"),
                        ("comprehension_plus", "long", "comprehension_plus",
                         "long"),
                        ("application_plus", "long", "application_plus",
                         "long"),
                        ("analysis_plus", "long", "analysis_plus", "long"),
                        ("synthesis_plus", "long", "synthesis_plus", "long"),
                        ("evaluation_plus", "long", "evaluation_plus",
                         "long"),
                        ("knowledge_minus", "long", "knowledge_minus",
                         "long"),
                        ("comprehension_minus", "long", "comprehension_minus",
                         "long"),
                        ("application_minus", "long", "application_minus",
                         "long"),
                        ("analysis_minus", "long", "analysis_minus", "long"),
                        ("synthesis_minus", "long", "synthesis_minus",
                         "long"),
                        ("evaluation_minus", "long", "evaluation_minus",
                         "long"),
                        ("count_plus", "long", "plus_number", "long"),
                        ("count_minus", "long", "minus_number", "long"),
                        ("source_system", "string", "source_system",
                         "string"),
                        ("created_date_id", "string", "created_date_id",
                         "long"),
                        ("lu_id", "int", "lu_type", "long")
                    ])
                applymapping2.printSchema()
                applymapping2.show(20)
                resolvechoice2 = ResolveChoice.apply(
                    frame=applymapping2,
                    choice="make_cols",
                    transformation_ctx="resolvechoice3")
                dropnullfields2 = DropNullFields.apply(
                    frame=resolvechoice2, transformation_ctx="dropnullfields2")
                print('COUNT df_knowledge: ', dropnullfields2.count())
                dropnullfields2.printSchema()
                dropnullfields2.show(2)

                # Write the scored history to S3 as partitioned parquet.
                print('START WRITE TO S3-------------------------')
                datasink6 = glueContext.write_dynamic_frame.from_options(
                    frame=dropnullfields2,
                    connection_type="s3",
                    connection_options={
                        "path":
                        "s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/",
                        "partitionKeys": ["created_date_id", "source_system"]
                    },
                    format="parquet",
                    transformation_ctx="datasink6")
                print('END WRITE TO S3-------------------------')

                # Release the cached DataFrames.
                df_study.unpersist()
                df_knowledge_right.unpersist()
                df_knowledge_wrong.unpersist()
            except Exception as e:
                print(
                    "###################### Exception ##########################"
                )
                print(e)

        # Persist the incremental cursor: max _key seen in the source.
        mdl_dyf_top_quiz_attempts = dyf_top_quiz_attempts.toDF()
        flag = mdl_dyf_top_quiz_attempts.agg({
            "_key": "max"
        }).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # Overwrite the _key flag file on S3.
        df.write.parquet(
            "s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet",
            mode="overwrite")
job = Job(glueContext) job.init(args['JOB_NAME'], args) datasource = glueContext.create_dynamic_frame.from_catalog( database=args['GLUE_DB_NAME'], table_name=args['GLUE_TABLE_NAME']) sourcedata = datasource.toDF() split_col = split(sourcedata["quarter"], " ") sourcedata = sourcedata.withColumn("quarter new", split_col.getItem(0)) sourcedata = sourcedata.withColumn("profit", col("revenue") * col("gross margin")) sourcedata = sourcedata.withColumn("current date", current_date()) # Convert back to Glue Dynamic Frame datasource = DynamicFrame.fromDF(sourcedata, glueContext, "datasource") applymapping = ApplyMapping.apply( frame=datasource, mappings=[ ("retailer country", "string", "retailer_country", "varchar(20)"), ("order method type", "string", "order_method_type", "varchar(15)"), ("retailer type", "string", "retailer_type", "varchar(30)"), ("product line", "string", "product_line", "varchar(30)"), ("product type", "string", "product_type", "varchar(30)"), ("product", "string", "product", "varchar(50)"), ("year", "bigint", "year", "varchar(4)"), ("quarter new", "string", "quarter", "varchar(2)"), ("revenue", "double", "revenue", "numeric"), ("quantity", "bigint", "quantity", "integer"), ("gross margin", "double", "gross_margin", "decimal(15,10)"),
## @inputs: [frame = applymapping1] resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_struct", transformation_ctx="resolvechoice2") ## @type: DropNullFields ## @args: [transformation_ctx = "dropnullfields3"] ## @return: dropnullfields3 ## @inputs: [frame = resolvechoice2] dropnullfields3 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields3") ## @type: DataSink ## @args: [connection_type = "s3", connection_options = {"path": "s3://go-lambda-bucket/Taxi_Data"}, format = "parquet", transformation_ctx = "datasink4"] ## @return: datasink4 ## @inputs: [frame = dropnullfields3] ##---------------------------------- #convert to a Spark DataFrame... customDF = datasource0.toDF() #add a new column for "type" customDF = customDF.withColumn("type", lit('yellow')) # Convert back to a DynamicFrame for further processing. customDynamicFrame = DynamicFrame.fromDF(customDF, glueContext, "customDF_df") ##---------------------------------- datasink4 = glueContext.write_dynamic_frame.from_options( frame=customDynamicFrame, connection_type="s3", connection_options={"path": "s3://go-lambda-bucket"}, format="parquet", transformation_ctx="datasink4") job.commit()
None).otherwise(df['Province-State']).alias('Province-State'), 'Country-Region', 'Lat', 'Long', when(df['Recovered_int'].isNull(), 0).otherwise(df['Recovered_int']).alias('Recovered'), when(df['Confirmed_int'].isNull(), 0).otherwise(df['Confirmed_int']).alias('Confirmed'), when(df['Deaths_int'].isNull(), 0).otherwise(df['Deaths_int']).alias('Deaths'), when( to_date(col("Date"), "yyyy-MM-dd").isNotNull(), to_date(col("Date"), "yyyy-MM-dd")).when( to_date(col("Date"), "yyyy/MM/dd").isNotNull(), to_date(col("Date"), "yyyy/MM/dd")).when( to_date(col("Date"), "yyyy-MMM-dd").isNotNull(), to_date(col("Date"), "yyyy-MMM-dd")).when( to_date(col("Date"), "yyyy/MMMM/dd").isNotNull(), to_date(col("Date"), "yyyy/MMMM/dd")).when( to_date(col("Date"), "yyyy, MMMM, dd").isNotNull(), to_date(col("Date"), "yyyy, MMMM, dd")).otherwise( "Unknown Format").alias("Date"), 'id') datasource_transformed = DynamicFrame.fromDF(df, glueContext, "ds0") datasink2 = glueContext.write_dynamic_frame.from_options( frame=datasource_transformed, connection_type="s3", connection_options={"path": "s3://pochetti-covid-19-output"}, format="json", transformation_ctx="datasink2") job.commit()
def main(): ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh') today = datetime.now(ho_chi_minh_timezone) today_second = long(today.strftime("%s")) print('today_id: ', today_second) start_date_id = 20200101 end_date_id = 20200305 print('start_date_id: ', start_date_id) print('end_date_id: ', end_date_id) # start_year_month_id, end_year_month_id = get_year_month_id_from_date(start_date_id, end_date_id) start_year_week_id, end_year_week_id = get_year_week_id_from_date(start_date_id, end_date_id) # print('start_year_month_id: ', start_year_month_id) print('end_year_month_id: ', end_year_month_id) print('start_year_week_id: ', start_year_week_id) print('end_year_week_id: ', end_year_week_id) print('start_year_week_id: ', start_year_week_id) print('end_year_week_id: ', end_year_week_id) # ------------------------------------------------------------------------------------------------------------------# df_student_package_status_by_date = get_student_package_adivsor_level(start_date_id, end_date_id) df_student_package_status_by_date.cache() df_student_learning_and_duration_by_date = get_total_student_lerning_and_duration_by_date(glueContext, start_year_month_id, end_year_month_id) df_student_package_status_by_date_learning = df_student_package_status_by_date\ .join(df_student_learning_and_duration_by_date, on=['contact_id', 'date_id'], how='left') df_student_package_status_by_date_learning = df_student_package_status_by_date_learning.na.fill({ 'total_learning_ls_sc_lt_le2': 0L, 'total_learning_ls_sc_lt_le2_success': 0L, 'total_learning_ls_sc_lt': 0L, 'total_learning_ls_sc_lt_success': 0L, 'total_learning_ls_success': 0L, 'total_learning_sc_success': 0L, 'total_learning_lt_success': 0L, 'total_duration_ls_sc_lt': 0L, 'total_learning_le2': 0L, 'total_learning_le2_success': 0L, 'total_learning_voxy_success': 0L, 'total_learning_native_talk_success': 0L, 'total_learning_home_work_success': 0L, 'total_learning_ncsbasic_success': 0L, 'total_duration_le2': 0L, 
'total_duration_voxy': 0L, 'total_duration_native_talk': 0L, 'total_duration_home_work': 0L, 'total_duration_ncsbasic': 0L }) df_student_package_status_by_date_learning.cache() print('df_student_package_status_by_date_learning') df_student_package_status_by_date_learning.printSchema() df_student_package_status_by_date_learning.show(3) if is_dev: dyf_student_package_status_by_date_learning = DynamicFrame \ .fromDF(df_student_package_status_by_date_learning, glueContext, 'dyf_student_package_status_by_date_learning') atasink4 = glueContext.write_dynamic_frame \ .from_jdbc_conf(frame=dyf_student_package_status_by_date_learning, catalog_connection="glue_redshift", connection_options={ "dbtable": "dev.df_student_package_status_by_date_learning", "database": "student_native_report" }, redshift_tmp_dir="s3://dts-odin/temp/nvn/knowledge/student/df_student_package_status_by_date_learning", transformation_ctx="datasink4") #-------------- save to bc200_fact df_student_package_status_by_date_learning = df_student_package_status_by_date_learning \ .select('date_id', 'package_id', 'student_level_id', 'contact_id', 'advisor_id', 'is_activated', f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_le2'] > 0L, 1L) .otherwise(0L).alias('is_ls_sc_lt_le2'), f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_le2_success'] > 0L, 1L) .otherwise(0L).alias('is_ls_sc_lt_le2_success'), f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt'] > 0L, 1L) .otherwise(0L).alias('is_ls_sc_lt'), f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_success'] > 0L, 1L) .otherwise(0L).alias('is_ls_sc_lt_success'), f.when(df_student_package_status_by_date_learning['total_learning_ls_success'] > 0L, 1L) .otherwise(0L).alias('is_ls_success'), f.when(df_student_package_status_by_date_learning['total_learning_sc_success'] > 0L, 1L) .otherwise(0L).alias('is_sc_success'), 
f.when(df_student_package_status_by_date_learning['total_learning_lt_success'] > 0L, 1L) .otherwise(0L).alias('is_lt_success'), f.when(df_student_package_status_by_date_learning['total_learning_le2'] > 0L, 1L) .otherwise(0L).alias('is_le2'), f.when(df_student_package_status_by_date_learning['total_learning_le2_success'] > 0L, 1L) .otherwise(0L).alias('is_le2_success'), f.when(df_student_package_status_by_date_learning['total_learning_voxy_success'] > 0L, 1L) .otherwise(0L).alias('is_voxy_success'), f.when(df_student_package_status_by_date_learning['total_learning_native_talk_success'] > 0L, 1L) .otherwise(0L).alias('is_native_talk_success'), f.when(df_student_package_status_by_date_learning['total_learning_home_work_success'] > 0L, 1L) .otherwise(0L).alias('is_home_work_success'), f.when(df_student_package_status_by_date_learning['total_learning_ncsbasic_success'] > 0L, 1L) .otherwise(0L).alias('is_ncsbasic_success'), 'total_learning_ls_sc_lt_le2', 'total_learning_ls_sc_lt_le2_success', 'total_learning_ls_sc_lt', 'total_learning_ls_sc_lt_success', 'total_learning_ls_success', 'total_learning_sc_success', 'total_learning_lt_success', 'total_duration_ls_sc_lt', 'total_learning_le2', 'total_learning_le2_success', 'total_learning_voxy_success', 'total_learning_native_talk_success', 'total_learning_home_work_success', 'total_learning_ncsbasic_success', 'total_duration_le2', 'total_duration_voxy', 'total_duration_native_talk', 'total_duration_home_work', 'total_duration_ncsbasic' ) df_student_package_status_group_week = df_student_package_status_by_date_learning \ .groupBy('date_id', 'package_id', 'student_level_id', 'advisor_id') \ .agg(f.count('contact_id').alias('total_student'), f.sum('is_activated').alias('total_student_active'), f.sum('is_ls_sc_lt_le2').alias('total_student_ls_sc_lt_le2'), f.sum('is_ls_sc_lt_le2_success').alias('total_student_ls_sc_lt_le2_success'), f.sum('total_learning_ls_sc_lt_le2').alias('total_learning_ls_sc_lt_le2'), 
f.sum('total_learning_ls_sc_lt_le2_success').alias('total_learning_ls_sc_lt_le2_success'), f.sum('is_ls_sc_lt').alias('total_student_ls_sc_lt'), f.sum('is_ls_sc_lt_success').alias('total_student_ls_sc_lt_success'), f.sum('is_ls_success').alias('total_student_ls_success'), f.sum('is_sc_success').alias('total_student_sc_success'), f.sum('is_lt_success').alias('total_student_lt_success'), f.sum('total_learning_ls_sc_lt').alias('total_learning_ls_sc_lt'), f.sum('total_learning_ls_sc_lt').alias('total_learning_ls_sc_lt_success'), f.sum('total_learning_ls_success').alias('total_learning_ls_success'), f.sum('total_learning_sc_success').alias('total_learning_sc_success'), f.sum('total_learning_lt_success').alias('total_learning_lt_success'), f.sum('total_duration_ls_sc_lt').alias('total_duration_ls_sc_lt'), f.sum('is_le2').alias('total_student_le2'), f.sum('is_le2_success').alias('total_student_le2_success'), f.sum('is_voxy_success').alias('total_student_voxy_success'), f.sum('is_native_talk_success').alias('total_student_native_talk_success'), f.sum('is_home_work_success').alias('total_student_home_work_success'), f.sum('is_ncsbasic_success').alias('total_student_ncsbasic_success'), f.sum('total_learning_le2').alias('total_learning_le2'), f.sum('total_learning_le2_success').alias('total_learning_le2_success'), f.sum('total_learning_voxy_success').alias('total_learning_voxy__success'), f.sum('total_learning_native_talk_success').alias('total_learning_native_talk_success'), f.sum('total_learning_home_work_success').alias('total_learning_home_work_success'), f.sum('total_learning_ncsbasic_success').alias('total_learning_ncsbasic_success'), f.sum('total_duration_le2').alias('total_duration_le2'), f.sum('total_duration_voxy').alias('total_duration_voxy'), f.sum('total_duration_native_talk').alias('total_duration_native_talk'), f.sum('total_duration_home_work').alias('total_duration_home_work'), f.sum('total_duration_ncsbasic').alias('total_duration_ncsbasic') ) \ 
.withColumn('period_id', f.lit(DAILY_PERIOD_ID)) \ .withColumn('report_role_id', f.lit(REPORT_ROLE_MANAGER_ID)) # display(df_student_package_status_group_week, "df_student_package_status_group_week") dyf_student_package_status_group_week = DynamicFrame.fromDF(df_student_package_status_group_week, glueContext, 'dyf_student_package_status_group_week') apply_ouput = ApplyMapping \ .apply(frame=dyf_student_package_status_group_week, mappings=[("report_role_id", "long", "report_role_id", "long"), ("period_id", "long", "period_id", "long"), ("date_id", "long", "time_id", "long"), ("package_id", "long", "package_id", "long"), ("student_level_id", "long", "student_level_id", "long"), ("advisor_id", "long", "advisor_id", "long"), ("total_student", "long", "total_student", "long"), ("total_student_active", "long", "total_student_active", "long"), ("total_student_ls_sc_lt_le2", "long", "total_student_ls_sc_lt_le2", "long"), ("total_student_ls_sc_lt_le2_success", "long", "total_student_ls_sc_lt_le2_success", "long"), ("total_learning_ls_sc_lt_le2", "long", "total_learning_ls_sc_lt_le2", "long"), ("total_learning_ls_sc_lt_le2_success", "long", "total_learning_ls_sc_lt_le2_success", "long"), ("total_student_ls_sc_lt", "long", "total_student_ls_sc_lt", "long"), ("total_student_ls_sc_lt_success", "long", "total_student_ls_sc_lt_success", "long"), ("total_student_ls_success", "long", "total_student_ls_success", "long"), ("total_student_sc_success", "long", "total_student_sc_success", "long"), ("total_student_lt_success", "long", "total_student_lt_success", "long"), ("total_learning_ls_sc_lt", "long", "total_learning_ls_sc_lt", "long"), ("total_learning_ls_sc_lt_success", "long", "total_learning_ls_sc_lt_success", "long"), ("total_learning_ls_success", "long", "total_learning_ls_success", "long"), ("total_learning_sc_success", "long", "total_learning_sc_success", "long"), ("total_learning_lt_success", "long", "total_learning_lt_success", "long"), ("total_duration_ls_sc_lt", "long", 
"total_duration_ls_sc_lt", "long"), ("total_student_le2", "long", "total_student_le2", "long"), ("total_student_le2_success", "long", "total_student_le2_success", "long"), ("total_student_voxy_success", "long", "total_student_voxy_success", "long"), ("total_student_native_talk_success", "long", "total_student_native_talk_success", "long"), ("total_student_home_work_success", "long", "total_student_home_work_success", "long"), ("total_student_ncsbasic_success", "long", "total_student_ncsbasic_success", "long"), ("total_learning_le2", "long", "total_learning_le2", "long"), ("total_learning_le2_success", "long", "total_learning_le2_success", "long"), ("total_learning_voxy__success", "long", "total_learning_voxy__success", "long"), ("total_learning_native_talk_success", "long", "total_learning_native_talk_success", "long"), ("total_learning_home_work_success", "long", "total_learning_home_work_success", "long"), ("total_learning_ncsbasic_success", "long", "total_learning_ncsbasic_success", "long"), ("total_duration_le2", "long", "total_duration_le2", "long"), ("total_duration_voxy", "long", "total_duration_voxy", "long"), ("total_duration_native_talk", "long", "total_duration_native_talk", "long"), ("total_duration_home_work", "long", "total_duration_home_work", "long"), ("total_duration_ncsbasic", "long", "total_duration_ncsbasic", "long") ]) dfy_output = ResolveChoice.apply(frame=apply_ouput, choice="make_cols", transformation_ctx="resolvechoice2") display(dfy_output, "dfy_output") # save_data_to_redshift( # glueContext, # dfy_output, # 'student_native_report', # 'bc200.bc200_fact_v2_1', # "s3n://dts-odin/temp/bc200/bc200_fact_v2_1", # "datasink4") preactions = "DELETE from bc200.bc200_fact_v2_1 WHERE period_id = " + str(DAILY_PERIOD_ID) + " and time_id >= " + str(start_date_id) glueContext.write_dynamic_frame.from_jdbc_conf(frame=dfy_output, catalog_connection="glue_redshift", connection_options={ "preactions": preactions, "dbtable": "bc200.bc200_fact_v2_1", 
"database": "student_native_report" }, redshift_tmp_dir="s3n://dts-odin/temp/bc200/bc200_fact_v2", transformation_ctx="datasink4") #------------------------------------------------------- df_student_package_status_by_date_learning.unpersist() df_student_package_status_by_date.unpersist()
("body", "string", "body", "string"), ("stocktwitssentiment", "string", "stocktwitssentiment", "string")]) #convert aws glue dynamicframes to spark dataframes stw = dynframe_stocktwits.toDF() #transform time format #from e.g. 2019-10-25T00:11:11Z to 2019-10-25 00:11:11 stw = stw.withColumn("createdat", f.regexp_replace(f.col("createdat"), "[T]", " ")) stw = stw.withColumn("createdat", f.regexp_replace(f.col("createdat"), "[Z]", "")) #remove [\\n\\t\$#] stw = stw.withColumn("body", f.regexp_replace(f.col("body"), "[\\n\\t\$#]", "")) #convert spark dataframes back to aws glue dynamicframes dynframe_stocktwits = DynamicFrame.fromDF(stw, glueContext, "nested") #partition to 1 to get a single s3 file as output dynframe_output = dynframe_stocktwits.repartition(1) datasink = glueContext.write_dynamic_frame.from_options( frame=dynframe_output, connection_type="s3", connection_options={"path": "s3://541304926041-stocktwits"}, format="csv") job.commit()
return True else: return False # Apply filter function to dynamic frame interactions = Filter.apply(frame = datasource0, f = filter_function, transformation_ctx = "interactions") print("Filtered record count: ", interactions.count()) # Map only the fields we want in the output CSV, changing names to match target schema. applymapping1 = ApplyMapping.apply(frame = interactions, mappings = [ \ ("anonymousId", "string", "ANONYMOUS_ID", "string"), \ ("userId", "string", "USER_ID", "string"), \ ("properties.sku", "string", "ITEM_ID", "string"), \ ("event", "string", "EVENT_TYPE", "string"), \ ("timestamp", "string", "TIMESTAMP_ISO", "string")], \ transformation_ctx = "applymapping1") # Repartition to a single file since that is what is required by Personalize onepartitionDF = applymapping1.toDF().repartition(1) # Coalesce timestamp into unix timestamp onepartitionDF = onepartitionDF.withColumn("TIMESTAMP", \ unix_timestamp(onepartitionDF['TIMESTAMP_ISO'], "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")) # Convert back to dynamic frame onepartition = DynamicFrame.fromDF(onepartitionDF, glueContext, "onepartition_df") # Write output back to S3 as a CSV glueContext.write_dynamic_frame.from_options(frame = onepartition, connection_type = "s3", \ connection_options = {"path": args['S3_CSV_OUTPUT_PATH']}, \ format = "csv", transformation_ctx = "datasink2") job.commit()
## @return: resolvechoice2 ## @inputs: [frame = applymapping1] resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_struct", transformation_ctx="resolvechoice2") filtered_dyDF = Filter.apply( frame=resolvechoice2, f=lambda x: x["pickup_longitude"] != 0 and x["pickup_latitude"] != 0 and x[ "dropoff_longitude"] != 0 and x["dropoff_latitude"] != 0 and x[ "tpep_dropoff_datetime"] > x["tpep_pickup_datetime"]) yellow_DF = filtered_dyDF.toDF() yellow_DF = yellow_DF.withColumn('cab_type', lit('yellow').astype('string')) \ .withColumn('pickup_location_id', lit(None).astype('byte')) \ .withColumn('dropoff_location_id', lit(None).astype('byte')) target_df = DynamicFrame.fromDF(yellow_DF, glueContext, "target_df") ## @type: DropNullFields ## @args: [transformation_ctx = "dropnullfields3"] ## @return: dropnullfields3 ## @inputs: [frame = resolvechoice2] # dropnullfields3 = DropNullFields.apply(frame = resolvechoice2, transformation_ctx = "dropnullfields3") ## @type: DataSink ## @args: [connection_type = "s3", connection_options = {"path": "s3://taxi-data-etl/staging/yellow"}, format = "parquet", transformation_ctx = "datasink4"] ## @return: datasink4 ## @inputs: [frame = dropnullfields3] sink = glueContext.write_dynamic_frame.from_options( frame=target_df, connection_type="s3", connection_options={"path": "s3://taxi-data-etl/staging/trips"}, format="parquet", transformation_ctx="sink")
def main(): sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session spark.conf.set("spark.sql.session.timeZone", "GMT+07:00") # get dynamic frame source ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh') today = datetime.now(ho_chi_minh_timezone) today_second = long(today.strftime("%s")) print('today_id: ', today_second) #------------------------------------------------------------------------------------------------------------------# def getSolanBaoLuu(solan_baoluu, songay_baoluu): if solan_baoluu is None: solan_baoluu = 0 if songay_baoluu is None: songay_baoluu = 0 if solan_baoluu > songay_baoluu: return songay_baoluu return solan_baoluu getSolanBaoLuu = udf(getSolanBaoLuu, LongType()) def getSoNgayBaoLuu(solan_baoluu, songay_baoluu): if solan_baoluu is None: solan_baoluu = 0 if songay_baoluu is None: songay_baoluu = 0 if songay_baoluu > solan_baoluu: return songay_baoluu return solan_baoluu getSoNgayBaoLuu = udf(getSoNgayBaoLuu, LongType()) def getContactId(code, contact_id_advisor): if code is not None: return code return contact_id_advisor getContactId = udf(getContactId, StringType()) def concaText(student_behavior_date, behavior_id, student_id, contact_id, package_code, package_endtime, package_starttime, student_level_code, student_status_code, transformed_at): text_concat = "" if student_behavior_date is not None: text_concat += str(student_behavior_date) if behavior_id is not None: text_concat += str(behavior_id) if student_id is not None: text_concat += str(student_id) if contact_id is not None: text_concat += str(contact_id) if package_code is not None: text_concat += str(package_code) if package_endtime is not None: text_concat += str(package_endtime) if package_starttime is not None: text_concat += str(package_starttime) if student_level_code is not None: text_concat += str(student_level_code) if student_status_code is not None: text_concat += str(student_status_code) if transformed_at is not None: text_concat += 
str(transformed_at) return text_concat concaText = udf(concaText, StringType()) # ------------------------------------------------------------------------------------------------------------------# #------------------------------------------------------------------------------------------------------------------# dyf_poss_ghinhan_hp = glueContext.create_dynamic_frame.from_catalog( database='poss', table_name='ghinhan_hp') dyf_poss_ghinhan_hp = dyf_poss_ghinhan_hp.select_fields([ '_key', 'id', 'ngay_thanhtoan', 'so_tien', 'khoa_hoc_makh', 'trang_thai' ]) dyf_poss_ghinhan_hp = dyf_poss_ghinhan_hp.resolveChoice( specs=[('_key', 'cast:long')]) try: df_flag = spark.read.parquet( "s3a://dtsodin/flag/student_behavior/sb1_dong_tien.parquet") read_from_index = df_flag.collect()[0]['flag'] print('read from index: ', read_from_index) dyf_poss_ghinhan_hp = Filter.apply( frame=dyf_poss_ghinhan_hp, f=lambda x: x["_key"] > read_from_index) except: print('read flag file error ') dyf_poss_ghinhan_hp_number = dyf_poss_ghinhan_hp.count() print('dyf_poss_ghinhan_hp_number: ', dyf_poss_ghinhan_hp_number) if dyf_poss_ghinhan_hp_number < 1: return #-------------------------------------------------------------------------------------------------------------------# dyf_poss_khoa_hoc = glueContext.create_dynamic_frame.from_catalog( database='poss', table_name='khoa_hoc') dyf_poss_khoa_hoc = dyf_poss_khoa_hoc.select_fields( ['makh', 'mahv', 'goi_sanpham_id', 'trang_thai']) # -------------------------------------------------------------------------------------------------------------------# dyf_poss_hoc_vien = glueContext.create_dynamic_frame.from_catalog( database='poss', table_name='hoc_vien') dyf_poss_hoc_vien = dyf_poss_hoc_vien.select_fields( ['mahv', 'crm_id', 'trang_thai']).rename_field('mahv', 'mahv_hv') # -------------------------------------------------------------------------------------------------------------------# dyf_poss_goi_sanpham = 
glueContext.create_dynamic_frame.from_catalog( database='poss', table_name='goi_sanpham') dyf_poss_goi_sanpham = dyf_poss_goi_sanpham.select_fields( ['ma', 'id', 'solan_baoluu', 'songay_baoluu', 'trang_thai']) # -------------------------------------------------------------------------------------------------------------------# # -------------------------------------------------------------------------------------------------------------------# dyf_crm_goi_contacts = glueContext.create_dynamic_frame.from_catalog( database='crm_native', table_name='contacts') # print('dyf_crm_goi_contacts::full') # # dyf_crm_goi_contacts.printSchema() dyf_crm_goi_contacts = dyf_crm_goi_contacts.select_fields( ['Code']).rename_field('Code', 'code') dyf_crm_goi_contacts = Filter.apply( frame=dyf_crm_goi_contacts, f=lambda x: x["code"] is not None and x["code"] != '') dy_crm_goi_contacts = dyf_crm_goi_contacts.toDF() dy_crm_goi_contacts = dy_crm_goi_contacts.dropDuplicates() # print('dy_crm_goi_contacts') # dy_crm_goi_contacts.printSchema() # -------------------------------------------------------------------------------------------------------------------# dyf_advisor_student_contact = glueContext.create_dynamic_frame.from_catalog( database='tig_advisor', table_name='student_contact') dyf_advisor_student_contact = dyf_advisor_student_contact.select_fields( ['student_id', 'contact_id']) dyf_advisor_student_contact = Filter.apply(frame=dyf_advisor_student_contact, f=lambda x: x["student_id"] is not None and x["student_id"] != '' and x["contact_id"] is not None and x["contact_id"] != '')\ .rename_field('student_id', 'student_id_advisor')\ .rename_field('contact_id', 'contact_id_advisor') dy_advisor_student_contact = dyf_advisor_student_contact.toDF() dy_advisor_student_contact = dy_advisor_student_contact.dropDuplicates( ['student_id_advisor']) # print('dy_advisor_student_contact') # dy_advisor_student_contact.printSchema() # 
# ------------------------------------------------------------------------------
# Body of a payment-behavior ETL (enclosing `def` is above this chunk — note the
# bare `return` below). Joins POSS payment records (ghinhan_hp) with courses
# (khoa_hoc), students (hoc_vien) and product packages (goi_sanpham), builds a
# student_behavior fact keyed by an MD5 hash, and writes two parquet datasets
# plus a `_key` watermark flag to S3.
# NOTE(review): table/column names are Vietnamese; semantics above inferred from
# names (ghinhan_hp ~ fee records, hoc_vien ~ student) — confirm with the owner.
# ------------------------------------------------------------------------------

# Convert each DynamicFrame to a DataFrame and de-duplicate on its natural key.
dy_poss_ghinhan_hp = dyf_poss_ghinhan_hp.toDF()
dy_poss_ghinhan_hp = dy_poss_ghinhan_hp.dropDuplicates(['id'])
dy_poss_khoa_hoc = dyf_poss_khoa_hoc.toDF()
dy_poss_khoa_hoc = dy_poss_khoa_hoc.dropDuplicates(['makh', 'mahv'])
dy_poss_hoc_vien = dyf_poss_hoc_vien.toDF()
dy_poss_hoc_vien = dy_poss_hoc_vien.dropDuplicates(['mahv_hv'])
dy_poss_goi_sanpham = dyf_poss_goi_sanpham.toDF()
# NOTE(review): likely copy-paste bug — every other table is deduplicated right
# after its own toDF(), but this line re-dedupes dy_poss_hoc_vien (on 'crm_id')
# and leaves dy_poss_goi_sanpham with duplicates. Probably intended:
# dy_poss_goi_sanpham = dy_poss_goi_sanpham.dropDuplicates([...]). Confirm.
dy_poss_hoc_vien = dy_poss_hoc_vien.dropDuplicates(['crm_id'])

poss_ghinhan_hp_number = dy_poss_ghinhan_hp.count()
# Nothing to process: bail out of the enclosing function.
if poss_ghinhan_hp_number < 1:
    return

# Payment -> course -> student -> package, all left joins from the payment side.
df_dong_tien = dy_poss_ghinhan_hp.join(dy_poss_khoa_hoc, dy_poss_ghinhan_hp.khoa_hoc_makh == dy_poss_khoa_hoc.makh, 'left')\
    .join(dy_poss_hoc_vien, dy_poss_hoc_vien.mahv_hv == dy_poss_khoa_hoc.mahv, 'left')\
    .join(dy_poss_goi_sanpham, dy_poss_goi_sanpham.id == dy_poss_khoa_hoc.goi_sanpham_id, 'left')

# getSolanBaoLuu / getSoNgayBaoLuu are UDFs defined elsewhere in this file
# (appear to derive reservation counts/days — confirm against their definitions).
df_dong_tien = df_dong_tien.select(
    'ngay_thanhtoan', 'ma', 'crm_id', 'so_tien',
    getSolanBaoLuu(df_dong_tien['solan_baoluu'], df_dong_tien['songay_baoluu']).alias('solan_baoluu_t'),
    getSoNgayBaoLuu(df_dong_tien['solan_baoluu'], df_dong_tien['songay_baoluu']).alias('songay_baoluu_t'))

# Attach lms_id / contact_id from the two contact-mapping tables.
df_dong_tien_student = df_dong_tien.join(dy_crm_goi_contacts, df_dong_tien.crm_id == dy_crm_goi_contacts.code, 'left')\
    .join(dy_advisor_student_contact, df_dong_tien.crm_id == dy_advisor_student_contact.student_id_advisor, 'left')

# Keep only rows that resolved a contact through at least one mapping.
df_dong_tien_student = df_dong_tien_student.filter(
    df_dong_tien_student.code.isNotNull() | (df_dong_tien_student.contact_id_advisor.isNotNull()))
# NOTE(review): hard limit of 100 rows per run — looks like a debugging leftover;
# confirm whether this cap is intentional in production.
df_dong_tien_student = df_dong_tien_student.limit(100)

# Placeholder values for dimensions not available from this source.
# NOTE: 0L literals are Python 2 longs — this script is Python 2 only.
student_id_unavailable = 0L
package_endtime_unavailable = 0L
package_starttime_unavailable = 0L
student_level_code_unavailable = 'UNAVAILABLE'
student_status_code_unavailable = 'UNAVAILABLE'
measure1_unavailable = 0
measure2_unavailable = 0
measure3_unavailable = 0
measure4_unavailable = float(0.0)

# Project into the student_behavior fact schema (behavior_id 1 = payment event).
df_dong_tien_student = df_dong_tien_student.select(
    f.unix_timestamp(df_dong_tien_student.ngay_thanhtoan, 'yyyy-MM-dd').alias('student_behavior_date'),
    f.lit(1L).alias('behavior_id'),
    f.lit(student_id_unavailable).cast('long').alias('student_id'),
    getContactId(df_dong_tien_student.code, df_dong_tien_student.contact_id_advisor).alias('contact_id'),
    df_dong_tien_student.ma.alias('package_code'),
    f.lit(package_endtime_unavailable).cast('long').alias('package_endtime'),
    f.lit(package_starttime_unavailable).cast('long').alias('package_starttime'),
    f.lit(student_level_code_unavailable).cast('string').alias('student_level_code'),
    f.lit(student_status_code_unavailable).cast('string').alias('student_status_code'),
    f.lit(today_second).alias('transformed_at'),
    'so_tien', 'solan_baoluu_t', 'songay_baoluu_t',
    f.lit(measure4_unavailable).alias('measure4'))

print('df_dong_tien_student--1')
df_dong_tien_student.printSchema()
df_dong_tien_student.show(1)

# Surrogate key: MD5 over the concatenated identifying columns (concaText is a
# helper defined elsewhere in this file).
df_dong_tien_student2 = df_dong_tien_student.withColumn(
    'student_behavior_id',
    f.md5(concaText(df_dong_tien_student.student_behavior_date,
                    df_dong_tien_student.behavior_id,
                    df_dong_tien_student.student_id,
                    df_dong_tien_student.contact_id,
                    df_dong_tien_student.package_code,
                    df_dong_tien_student.package_endtime,
                    df_dong_tien_student.package_starttime,
                    df_dong_tien_student.student_level_code,
                    df_dong_tien_student.student_status_code,
                    df_dong_tien_student.transformed_at)))

print('df_dong_tien_student2--2')
df_dong_tien_student2.printSchema()
df_dong_tien_student2.show(5)

dyf_dong_tien_student = DynamicFrame.fromDF(df_dong_tien_student2, glueContext, 'dyf_dong_tien_student')

# Drop rows whose contact could not be resolved.
dyf_dong_tien_student = Filter.apply(
    frame=dyf_dong_tien_student,
    f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

# NOTE(review): package_code is declared "long" on the source side here although
# it comes from column `ma` (a code) — confirm the actual inferred type.
apply_ouput = ApplyMapping.apply(
    frame=dyf_dong_tien_student,
    mappings=[
        ("student_behavior_id", "string", "student_behavior_id", "string"),
        ("student_behavior_date", "long", "student_behavior_date", "long"),
        ("behavior_id", "long", "behavior_id", "long"),
        ("student_id", "long", "student_id", "long"),
        ("contact_id", "string", "contact_id", "string"),
        ("package_code", "long", "package_code", "string"),
        ("package_endtime", "long", "package_endtime", "long"),
        ("package_starttime", "long", "package_starttime", "long"),
        ("student_level_code", "string", "student_level_code", "string"),
        ("student_status_code", "string", "student_status_code", "string"),
        ("transformed_at", "long", "transformed_at", "long")
    ])
dfy_output = ResolveChoice.apply(frame=apply_ouput, choice="make_cols",
                                 transformation_ctx="resolvechoice2")

# Fact table, partitioned by behavior type.
glueContext.write_dynamic_frame.from_options(
    frame=dfy_output,
    connection_type="s3",
    connection_options={
        "path": "s3://dtsodin/student_behavior/student_behavior",
        "partitionKeys": ["behavior_id"]
    },
    format="parquet")

# Measures dataset: amount paid and reservation counters as measure1..4.
apply_general = ApplyMapping.apply(
    frame=dyf_dong_tien_student,
    mappings=[("student_behavior_id", "string", "student_behavior_id", "string"),
              ("so_tien", "double", "measure1", "float"),
              ("solan_baoluu_t", "long", "measure2", "float"),
              ("songay_baoluu_t", "long", "measure3", "float"),
              ("measure4", "float", "measure4", "float"),
              ("behavior_id", "long", "behavior_id", "long")])
dfy_output2 = ResolveChoice.apply(frame=apply_general, choice="make_cols",
                                  transformation_ctx="resolvechoice2")
print('dfy_output2::')
dfy_output2.show(5)
glueContext.write_dynamic_frame.from_options(
    frame=dfy_output2,
    connection_type="s3",
    connection_options={
        "path": "s3://dtsodin/student_behavior/student_general_behavior",
        "partitionKeys": ["behavior_id"]
    },
    format="parquet")

# Persist the incremental-load watermark: max `_key` seen in this run.
flag = dy_poss_ghinhan_hp.agg({"_key": "max"}).collect()[0][0]
flag_data = [flag]
df = spark.createDataFrame(flag_data, "long").toDF('flag')
df.write.parquet(
    "s3a://dtsodin/flag/student_behavior/sb1_dong_tien.parquet",
    mode="overwrite")
table_columns = table['Table']['StorageDescriptor']['Columns'] s3_destination = str(table['Table']['StorageDescriptor']['Location']) # Create Dynamic Frame from S3 CSV Object dynamicFrame = glueContext.create_dynamic_frame_from_options(connection_type = "s3", connection_options = {"paths": [s3_source_path]}, format_options={"withHeader": True,"separator": ","}, format = "csv") # Convert to Spark Data Frame dataFrame = dynamicFrame.toDF() # Cast Column types from Glue Table into Spark Data Frame for column in table_columns: dataFrame = dataFrame.withColumn(column['Name'], dataFrame[column['Name']].cast(column['Type'])) # Convert back to Glue Dynamic Frame for S3 upload final_dynamicFrame = DynamicFrame.fromDF(dataFrame, glueContext, "final_dynamicFrame") # Delete any unnecessary columns final_dynamicFrame = final_dynamicFrame.drop_fields(['col4', 'col5', 'col6']) # Send dynamic frame to S3 as parquet files. S3 location specified by the given Glue table glueContext.write_dynamic_frame.from_options(frame = final_dynamicFrame, connection_type = "s3", connection_options = {"path":s3_destination}, format = "parquet") # Successfully converted CSV file. Move CSV file to processed folder. s3_resource.Object(bucket, "processed/"+key).copy_from( CopySource=bucket+"/"+key) s3_resource.Object(bucket, key).delete() except Exception as e: print("Conversion failed. Moving object to error folder. error message: "+str(e)) s3_resource.Object(bucket, "error/"+key).copy_from( CopySource=bucket+"/"+key) s3_resource.Object(bucket, key).delete()
endpoint_url='https://glue.us-west-2.amazonaws.com') ###################################### #### CONNECTION BLOCK #### ###################################### ## ref_bizunit_scoped connection refBizScopedCon_ds = glueContext.create_dynamic_frame.from_catalog( database="staging_incremental", table_name="ref_bizunit_scoped", transformation_ctx="refBizScopedCon_ds") refBizScopedCon_regDF = refBizScopedCon_ds.toDF() refBizScopedCon_regDF = refBizScopedCon_regDF.withColumn( "sourcesystem", lit("PNCT")).withColumn("audtdateadded", lit(current_timestamp)) refBizScopedCon_dynDF = DynamicFrame.fromDF(refBizScopedCon_regDF, glueContext, "nested") ## ref_carrier_itinerary connection refCarItinCon_ds = glueContext.create_dynamic_frame.from_catalog( database="staging_incremental", table_name="ref_carrier_itinerary", transformation_ctx="refCarItinCon_ds") refCarItinCon_regDF = refCarItinCon_ds.toDF() refCarItinCon_regDF = refCarItinCon_regDF.withColumn( "sourcesystem", lit("PNCT")).withColumn("audtdateadded", lit(current_timestamp)) refCarItinCon_dynDF = DynamicFrame.fromDF(refCarItinCon_regDF, glueContext, "nested") ## ref_carrier_service connection refCarServCon_ds = glueContext.create_dynamic_frame.from_catalog(
# Returns a List['Natural_Key'] NATURAL_KEY = FINAL_TUPLE_WITH_DF_AND_MD5[1] # Taking the natual key that passed in Json File. NATURAL_KEY_1 = NATURAL_KEY[0] # Taking the value from SOURCE_NAME column (example : "HR PERSON") from # FINAL_MD5_DF POST_QUERY_SOURCE_NAME = FINAL_MD5_DF.select( "source_name").limit(1).rdd.map(lambda a: a[0]).collect()[0] print('#######>>>>>>>POST_QUERY_SOURCE_NAME', POST_QUERY_SOURCE_NAME) # Final Data frame is converted to Dynamic frame # Final Dynamic Frame will be written to Stage Table FINAL_DYNAMIC_FRAME = DynamicFrame.fromDF(FINAL_MD5_DF, GLUECONTEXT, "Final_dynamic_frame") # Updates,Inserts and Deletes counts logic here # 1. Create a DF with counts and op_val, Group by JobId,op_val # 2. Extract inserts, updates and deletes # 3. Add it to Cloud Watch Logs. COUNT_DF = FINAL_MD5_DF.withColumn('JobRunId', F.lit(str(RUN_ID)))\ .withColumn('JobName', F.lit(str(RUN_ID))) # Truncating the stage table PRE_QUERY = """begin; truncate table {stage_database_name}.{stage_table}; end;""".format(stage_database_name=STAGE_DATABASE_NAME, stage_table=STAGE_TABLE)
.withColumn("queued_time", expr("CAST(qtime AS LONG)")) \ .withColumn("start_time", expr("CAST(start AS LONG)")) \ .withColumn("created_time", expr("CAST(ctime AS LONG)")) \ .withColumn("etime", expr("CAST(etime AS LONG)")) \ .withColumn("end_time", expr("CAST(end AS LONG)")) \ .withColumn("exit_status", expr("CAST(exit_status AS INTEGER)")) \ .withColumnRenamed("group", "group_name") \ .withColumnRenamed("jobname", "job_name") \ .withColumnRenamed("resource_list_gpu_type", "gpu_type") \ .withColumn("num_cores", expr("CAST(node_ct as LONG) * CAST(num_cpus as INTEGER)")) \ .withColumn("walltime_hrs", expr("cast(round((walltime_secs / 60.00 / 60.00), 3) as float)")) \ .withColumn("cpu_time_hrs", expr("cast(round((cpu_time / 60.00 / 60.00), 3) as float)")) \ .drop('resources_used_vmem', 'kvs', 'session', 'exec_host', 'resource_list_neednodes', 'resource_list_walltime', 'detail', 'resources_used_walltime', 'resources_used_cput', 'resources_used_mem', 'resource_list_nodect', 'resource_list_cpu', 'resource_list_gpu', 'qtime', 'start', 'ctime', 'etime', 'end', 'o_dt', 'date', 'resource_list_mem', 'resource_list_nodes') # eventually drop detail and the asked resources to only use actually used torq = DynamicFrame.fromDF(with_map, glueContext, "joined") datasink5 = glueContext.write_dynamic_frame.from_options( frame=torq, connection_type="s3", connection_options={ "path": args['S3_OUTPUT_PATH'], "partitionKeys": ["year", "month", "day"] }, format="parquet", transformation_ctx="datasink5") job.commit()
#convert to spark dataframe df = dynamic_frame.toDF() df.show() # convert date columns to day & month df = df.withColumn("date_added", to_date(split(df["date"], " ").getItem(0).cast("string"), 'MM/dd/yyyy')) \ .withColumn("month", split(col("date_added"), "-").getItem(1)) \ .withColumn("day", split(col("date_added"), "-").getItem(2)) \ .orderBy('date_added') print("Dataframe sorted") partitioned_dataframe = df.repartition("day") # Convert back to dynamic frame dynamic_frame2 = DynamicFrame.fromDF(partitioned_dataframe, glue_context, "dynamic_frame_write", transformation_ctx="applymapping1") #resolve discrepency in columns data types resolvechoice = ResolveChoice.apply(frame=dynamic_frame2, choice="make_struct", transformation_ctx="resolvechoice2") #transformation function def ReplaceValue(rec): for field in rec: if rec[field] == '999' or rec[field] == 999.0 or rec[ field] == 'nan' or rec[field] == 0 or rec[field] == '0': rec[field] = None rec["category_a"] = False rec["category_b"] = False
collect_list("tag").alias("tags")) tags_dataset_agg.printSchema() tedx_dataset_agg = tedx_dataset.join( tags_dataset_agg, tedx_dataset.idx == tags_dataset_agg.idx_ref, "left").drop("idx_ref").select(col("idx").alias("_id"), col("*")).drop("idx") tedx_dataset_agg.printSchema() ##### CONNECT TO MONGODB ATLAS # change uri mongo_uri = "xxxx" # change username and password write_mongo_options = { "uri": mongo_uri, "database": "unibg_tedx", "collection": "tedx_data", "username": "******", "password": "******", "ssl": "true", "ssl.domain_match": "false" } from awsglue.dynamicframe import DynamicFrame tedx_dataset_dynamic_frame = DynamicFrame.fromDF(tedx_dataset_agg, glueContext, "nested") glueContext.write_dynamic_frame.from_options( tedx_dataset_dynamic_frame, connection_type="mongodb", connection_options=write_mongo_options)
# S3 output directories — one per choice-resolution strategy demonstrated below.
medicare_cast = "s3://glue-sample-target/output-dir/medicare_json_cast"
medicare_project = "s3://glue-sample-target/output-dir/medicare_json_project"
medicare_cols = "s3://glue-sample-target/output-dir/medicare_json_make_cols"
medicare_struct = "s3://glue-sample-target/output-dir/medicare_json_make_struct"
medicare_sql = "s3://glue-sample-target/output-dir/medicare_json_sql"

# Read the catalog table into a dynamic frame.
medicare_dyf = glueContext.create_dynamic_frame.from_catalog(database=db_name, table_name=tbl_name)

# The `provider id` field is a choice between long and string. Resolve it with
# each available strategy: cast (non-castable values become null), project
# (keep only one type), make_cols / make_struct (keep both representations).
def _resolve_provider_id(strategy):
    # One resolveChoice call on the shared source frame.
    return medicare_dyf.resolveChoice(specs=[('provider id', strategy)])

medicare_res_cast = _resolve_provider_id('cast:long')
medicare_res_project = _resolve_provider_id('project:long')
medicare_res_make_cols = _resolve_provider_id('make_cols')
medicare_res_make_struct = _resolve_provider_id('make_struct')

# Spark SQL on a Spark dataframe: filter high-discharge rows, then wrap the
# result back into a dynamic frame for the Glue sink.
medicare_df = medicare_dyf.toDF()
medicare_df.createOrReplaceTempView("medicareTable")
medicare_sql_df = spark.sql("SELECT * FROM medicareTable WHERE `total discharges` > 30")
medicare_sql_dyf = DynamicFrame.fromDF(medicare_sql_df, glueContext, "medicare_sql_dyf")

# Write every variant out as JSON, in the same order as before.
_outputs = (
    (medicare_res_cast, medicare_cast),
    (medicare_res_project, medicare_project),
    (medicare_res_make_cols, medicare_cols),
    (medicare_res_make_struct, medicare_struct),
    (medicare_sql_dyf, medicare_sql),
)
for _frame, _path in _outputs:
    glueContext.write_dynamic_frame.from_options(
        frame=_frame, connection_type="s3",
        connection_options={"path": _path}, format="json")
def main():
    """ETL for the 'suspended' status-change fact.

    Reads tpe_enduser_used_product_history from the tig_market catalog,
    keeps only rows newer than the stored S3 watermark whose status moved
    ACTIVED -> SUSPENDED, writes the mapping rows to Redshift (with a
    post-action that backfills user_id) and a parquet backup to S3, then
    advances the watermark.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    mdl_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(database="tig_market",
                                                                                         table_name="tpe_enduser_used_product_history")
    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.select_fields(
        ['_key', 'id', 'used_product_id', 'contact_id', 'status_new', 'status_old', 'timecreated'])
    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(specs=[('_key', 'cast:long')])

    # Handle the case where start_read (the watermark) is missing.
    try:
        # Read the incremental-load flag (watermark) from S3.
        df_flag = spark.read.parquet("s3a://dts-odin/flag/fact_flag_suspended.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)

        # Compare the datasource _key with the flag; keep only rows with key > flag.
        mdl_tpe_enduser_used_product_history = Filter.apply(frame=mdl_tpe_enduser_used_product_history,
                                                            f=lambda x: x['_key'] > start_read)
    except:
        # NOTE(review): bare except hides every failure mode (including typos in
        # the code above) behind "first run" semantics — narrow to Exception and
        # log the error.
        print('read flag file error ')

    print('the number of new contacts: ', mdl_tpe_enduser_used_product_history.count())

    if (mdl_tpe_enduser_used_product_history.count() > 0):
        # Keep complete rows whose status changed ACTIVED -> SUSPENDED.
        # 'SUPPENDED' is matched too — evidently a known misspelling in the
        # source data, kept deliberately.
        mdl_tpe_enduser_used_product_history = Filter.apply(frame=mdl_tpe_enduser_used_product_history,
                                                            f=lambda x: x["timecreated"] is not None and x[
                                                                "contact_id"] is not None and x[
                                                                "used_product_id"] is not None and (x["status_old"] == 'ACTIVED' and x["status_new"] in ['SUSPENDED','SUPPENDED']))
        mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(
            specs=[('timecreated', 'cast:long')])
        df_mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.toDF()
        # Derive: yyyyMMdd day key, the fixed "suspended" status id (55), and a
        # millisecond timestamp from the epoch-seconds timecreated.
        df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.withColumn('change_status_date_id',
                                                                                                     from_unixtime(
                                                                                                         df_mdl_tpe_enduser_used_product_history[
                                                                                                             'timecreated'], "yyyyMMdd"))\
            .withColumn('to_status_id', f.lit(55))\
            .withColumn('timestamp1', df_mdl_tpe_enduser_used_product_history['timecreated'] * f.lit(1000))

        data_mdl_tpe_enduser_used_product_history = DynamicFrame.fromDF(df_mdl_tpe_enduser_used_product_history,
                                                                        glueContext,
                                                                        "data_mdl_tpe_enduser_used_product_history")
        data_mdl_tpe_enduser_used_product_history.printSchema()
        data_mdl_tpe_enduser_used_product_history.show(3)

        applymapping1 = ApplyMapping.apply(frame=data_mdl_tpe_enduser_used_product_history,
                                           mappings=[("contact_id", "string", "contact_id", "string"),
                                                     ("change_status_date_id", "string", "change_status_date_id", "long"),
                                                     ("timestamp1", "long", "timestamp1", "timestamp"),
                                                     ('to_status_id', 'int', 'to_status_id', 'long')])
        resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                             transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields3")

        # Load into Redshift; the post-action backfills user_id from user_map
        # for the rows this run just inserted.
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields3,
                                                                   catalog_connection="glue_redshift",
                                                                   connection_options={
                                                                       "dbtable": "mapping_changed_status_student",
                                                                       "database": "dts_odin",
                                                                       "postactions": """UPDATE mapping_changed_status_student SET user_id = ( SELECT user_id FROM user_map WHERE source_type = 1 AND source_id = mapping_changed_status_student.contact_id LIMIT 1 ) WHERE user_id IS NULL and to_status_id=55"""
                                                                   },
                                                                   redshift_tmp_dir="s3n://datashine-dwh/temp1/",
                                                                   transformation_ctx="datasink4")
        # Write a parquet backup of the same rows to S3.
        datasink5 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields3, connection_type="s3",
                                                                 connection_options={
                                                                     "path": "s3://datashine-dev-redshift-backup/A_55_tam_dung_goi"},
                                                                 format="parquet",
                                                                 transformation_ctx="datasink5")

        # Advance the watermark: take the max _key in the data source...
        datasourceTmp = mdl_tpe_enduser_used_product_history.toDF()
        flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # ...and overwrite the flag file in S3.
        df.write.parquet("s3a://dts-odin/flag/fact_flag_suspended.parquet", mode="overwrite")
def write_df_to_s3(glue_context, data_frame, backup_location): dynamic_frame = DynamicFrame.fromDF(data_frame, glue_context, "toS3") sink = glue_context.getSink("s3", path=backup_location) sink.setFormat("json") sink.write(dynamic_frame)
trimmedLEOriginRequestLogs = DropFields.apply(frame = labdaEdgeOriginRequestLogs, paths=["executionregion", "distributionid", "distributionname", "requestdata", "customtraceid", "eventtype", "year", "month", "date", "hour"], transformation_ctx ="trimmedLEOriginRequestLogs") ## Rename the requestid field for Lambda@Edge origin request logs to origin requestid modifiedLEOriginRequestLogs = RenameField.apply(frame = trimmedLEOriginRequestLogs, old_name = "requestid", new_name = "origin_requestid", transformation_ctx ="modifiedLEOriginRequestLogs" ) ## Convert to DataFrame modifiedLEOriginRequestLogsDF = modifiedLEOriginRequestLogs.toDF() ## Convert to DataFrame modifiedLEViewerRequestLogsDF = modifiedLEViewerRequestLogs.toDF() ## Join(left outer join) the Lambda@Edge viewer-request logs with the origin-request logs based on the requestid combinedLambdaEdgeLogsDF = modifiedLEViewerRequestLogsDF.join(modifiedLEOriginRequestLogsDF, modifiedLEViewerRequestLogsDF["requestid"] == modifiedLEOriginRequestLogsDF["origin_requestid"], "left_outer") ## Convert to DynamicFrame combinedLambdaEdgeLogs = DynamicFrame.fromDF(combinedLambdaEdgeLogsDF, glueContext, "combinedLambdaEdgeLogs") ## Join the Lambda@Edge viewer-request logs with the origin-request logs based on the requestid #combinedLambdaEdgeLogs = Join.apply(modifiedLEViewerRequestLogs, modifiedLEOriginRequestLogs, 'requestid', 'origin_requestid') ## Drop the origin_requestid field lambdaEdgeLogs = DropFields.apply(frame = combinedLambdaEdgeLogs, paths=["origin_requestid"], transformation_ctx ="lambdaEdgeLogs") ## Drop the "year", "month", "date", "hour" fields trimmedLambdaEdgeLogs = DropFields.apply(frame =lambdaEdgeLogs, paths=["year", "month", "date", "hour", "useragentstring"], transformation_ctx ="trimmedLambdaEdgeLogs") ## Convert to DataFrame trimmedLambdaEdgeLogsDF = trimmedLambdaEdgeLogs.toDF() #Destnation S3 loaction for combine Lambda@Edge logs leLogDestPath = "s3://" + args['target_s3_bucket'] + 
"/combined/lelogs"