def cara_rating():
    dyf_mdl_rating_class_key = connectGlue(
        database="topicalms",
        table_name="mdl_rating_class",
        select_fields=["_key", "points", "vote", "student_id", "room_id",
                       "opinion", "timecreated", "timemodified"],
        fillter=["points", "student_id", "opinion", "timecreated"]
    ).rename_field("points", "value_rating") \
        .rename_field("opinion", "comment") \
        .rename_field("vote", "rating_about") \
        .rename_field("timecreated", "student_behavior_date")

    dyf_mdl_rating_class_key = dyf_mdl_rating_class_key.resolveChoice(specs=[("_key", "cast:long")])
    try:
        df_flag_1 = spark.read.parquet("s3://toxd-olap/transaction_log/flag/flag_cara_rating.parquet")
        max_key = df_flag_1.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_mdl_rating_class_key = Filter.apply(frame=dyf_mdl_rating_class_key,
                                                f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    dyf_mdl_rating_class_key = Filter.apply(frame=dyf_mdl_rating_class_key,
                                            f=lambda x: x["value_rating"] in points)
    if dyf_mdl_rating_class_key.count() > 0:
        df_mdl_rating_class_key = dyf_mdl_rating_class_key.toDF()
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["contact_id", "student_id"],
            fillter=["contact_id", "student_id"],
            duplicates=["contact_id", "student_id"]
        ).rename_field("student_id", "student_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_mdl_rating_class = df_mdl_rating_class_key.join(
            df_student_contact,
            df_mdl_rating_class_key["student_id"] == df_student_contact["student_id_contact"])
        df_mdl_rating_class = df_mdl_rating_class.drop("student_id_contact")
        df_mdl_rating_class = df_mdl_rating_class.withColumn("number_rating", f.lit(1)) \
            .withColumn("rating_type", f.lit("rating_cara"))
        if is_dev:
            df_mdl_rating_class.show(10)
        # free run
        df_mdl_rating_class = df_mdl_rating_class \
            .withColumn("behavior_id", f.lit(24)) \
            .withColumn("transformed_at", f.lit(d4))
        df_mdl_rating_class = set_package_advisor_level(df_mdl_rating_class)
        convertAndSaveS3(df_mdl_rating_class)

        flag = df_mdl_rating_class_key.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_cara_rating.parquet",
                         mode="overwrite")
def get_df_student_advisor():
    dyf_student_package = glueContext.create_dynamic_frame.from_options(
        connection_type="redshift",
        connection_options={
            "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/transaction_log",
            "user": REDSHIFT_USERNAME,
            "password": REDSHIFT_PASSWORD,
            "dbtable": "ad_student_advisor",
            "redshiftTmpDir": "s3n://datashine-dev-redshift-backup/translation_log/user_advisor/ad_student_level"
        })
    dyf_student_package = Filter.apply(
        frame=dyf_student_package,
        f=lambda x: x["contact_id"] is not None and x["contact_id"] != "")
    if is_dev:
        dyf_student_package.printSchema()
        dyf_student_package.show(10)
    dyf_student_package = dyf_student_package.select_fields(
        ["contact_id", "advisor_id", "end_date", "start_date"]) \
        .rename_field("contact_id", "contact_id_advisor")
    df_student_package = dyf_student_package.toDF()
    prinDev(df_student_package, "dyf_student_package")
    return df_student_package
def get_voxy():
    dyf_voxy = connectGlue(
        database="voxy",
        table_name="voxy_api",
        select_fields=["email", "time_created", "total_hours_studied"],
        fillter=["email"],
        duplicates=["email", "time_created"]).rename_field("email", "email_voxy")
    dyf_voxy = Filter.apply(frame=dyf_voxy, f=lambda x: x["total_hours_studied"] > 0)
    df_voxy = dyf_voxy.toDF()
    df_voxy = df_voxy.withColumn(
        "time_created_new",
        f.unix_timestamp(df_voxy.time_created, "yyyy-MM-dd"))
    dyf_voxy = DynamicFrame.fromDF(df_voxy, glueContext, "dyf_voxy")
    dyf_voxy = dyf_voxy.resolveChoice(specs=[("time_created_new", "cast:long")])
    df_voxy = dyf_voxy.toDF()
    # drop returns a new DataFrame; the original discarded this result
    df_voxy = df_voxy.drop("time_created")
    df_voxy = df_voxy.groupby("email_voxy").agg(
        f.min(df_voxy.time_created_new).alias("student_behavior_date_voxy"))
    return df_voxy
def get_native_talk():
    dyf_native_talk_history_log = connectGlue(
        database="native_talk",
        table_name="native_talk_history_log_api",
        select_fields=["learning_date", "username", "speaking_dialog_score"],
        fillter=["username"],
        duplicates=["username", "learning_date"])
    dyf_native_talk_history_log = Filter.apply(
        frame=dyf_native_talk_history_log,
        f=lambda x: x["speaking_dialog_score"] > 0)
    df_native_talk_history_log = dyf_native_talk_history_log.toDF()
    df_native_talk_history_log = df_native_talk_history_log.drop("speaking_dialog_score")
    df_native_talk_history_log = df_native_talk_history_log.withColumn(
        "learning_date_int",
        f.unix_timestamp(df_native_talk_history_log.learning_date, "yyyy-MM-dd"))
    dyf_native_talk_history_log = DynamicFrame.fromDF(
        df_native_talk_history_log, glueContext, "dyf_native_talk_history_log")
    dyf_native_talk_history_log = dyf_native_talk_history_log.resolveChoice(
        specs=[("learning_date_int", "cast:long")])
    df_native_talk_history_log = dyf_native_talk_history_log.toDF()
    df_native_talk_history_log = df_native_talk_history_log.groupby("username").agg(
        f.min(df_native_talk_history_log.learning_date_int).alias("student_behavior_date_nt"))

    dyf_native_talk_account_mapping = connectGlue(
        database="native_talk",
        table_name="native_talk_account_mapping",
        select_fields=["username", "contact_id"],
        fillter=["username", "contact_id"],
        duplicates=["username", "contact_id"]
    ).rename_field("username", "username_mapping") \
        .rename_field("contact_id", "contact_id_nt")
    df_native_talk_account_mapping = dyf_native_talk_account_mapping.toDF()
    join = df_native_talk_account_mapping.join(
        df_native_talk_history_log,
        df_native_talk_account_mapping.username_mapping == df_native_talk_history_log.username)
    join = join.drop("username_mapping", "username")
    return join
def filter_flag(dyf, config_file):
    try:
        flag_smile_care = spark.read.parquet(config_file)
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf = Filter.apply(frame=dyf, f=lambda x: x["_key"] > max_key)
        return dyf
    except Exception:
        print("read flag file error")
        return dyf
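# Every ETL below closes by overwriting a one-row "flag" parquet with the max
# _key it processed; filter_flag above is the read side of that checkpoint.
# A hypothetical save_flag helper (not part of the original module) mirroring
# the write that each function inlines, assuming the module-level spark
# session and a long-typed key, would be a minimal sketch like:
def save_flag(flag, config_file):
    # one-row DataFrame holding the last processed _key
    df = spark.createDataFrame([flag], "long").toDF("flag")
    # overwrite so the next incremental run resumes after this key
    df.write.parquet(config_file, mode="overwrite")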
def etl_dang_ki_tai_khoan():
    dyf_lms_user = connectGlue(
        database="topicalms",
        table_name="mdl_user",
        select_fields=["_key", "id", "timecreated"],
        fillter=["id", "timecreated"],
        duplicates=["id", "timecreated"]
    ).rename_field("id", "student_id") \
        .rename_field("timecreated", "student_behavior_date")
    dyf_lms_user = dyf_lms_user.resolveChoice(specs=[("_key", "cast:long")])
    try:
        flag_behavior_hoc_vien_nhap_hoc = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_hoc_vien_nhap_hoc.parquet")
        max_key = flag_behavior_hoc_vien_nhap_hoc.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_lms_user = Filter.apply(frame=dyf_lms_user, f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    # the original compared the DynamicFrame itself to 0; count() is what was meant
    if dyf_lms_user.count() > 0:
        df_lms_user = dyf_lms_user.toDF()
        flag = df_lms_user.agg({"_key": "max"}).collect()[0][0]
        print("flag: ", flag)
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id", "contact_id"]).rename_field("student_id", "student_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_lms_user_contact = df_lms_user.join(
            df_student_contact,
            df_lms_user["student_id"] == df_student_contact["student_id_contact"],
            "left")
        # ------------------------------------------------------------------#
        df_join_level_code = set_package_advisor_level(df_lms_user_contact)
        for k, v in ADD_COLLUM.items():
            df_join_level_code = df_join_level_code.withColumn(k, v)
        prinDev(df_join_level_code, "end data")
        convertAndSaveS3(df_join_level_code)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_hoc_vien_nhap_hoc.parquet",
            mode="overwrite")
def filter_latest(spark, dynamic_frame, config_file):
    try:
        df_flag = spark.read.parquet(config_file)
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)
        result = Filter.apply(frame=dynamic_frame, f=lambda x: x['_key'] > start_read)
        return result
    # the original "except ():" matched nothing, so a missing flag file crashed the job
    except Exception:
        print('read flag file error')
        return dynamic_frame
def main():
    dyf_student_care_advisor = connectGlue(
        database="callcenter",
        table_name="student_care_advisor",
        select_fields=[
            "transformed_at", "idcall", "student_behavior_date", "student_id",
            "answer_duration", "total_duration", "requested_rating",
            "value_rating", "ip_phone", "call_status"
        ],
        duplicates=[
            "student_behavior_date", "student_id", "answer_duration",
            "total_duration", "requested_rating", "value_rating"
        ],
        fillter=["student_id", "ip_phone"]).rename_field("transformed_at", "_key")
    dyf_student_care_advisor = dyf_student_care_advisor.resolveChoice(
        specs=[("_key", "cast:long")])
    try:
        df_flag_phone_rating = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_student_care_advisor_fact.parquet")
        max_key = df_flag_phone_rating.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_student_care_advisor = Filter.apply(
            frame=dyf_student_care_advisor, f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    count = dyf_student_care_advisor.count()
    print(count)
    if count > 0:
        df_student_care_advisor = dyf_student_care_advisor.toDF()
        df_student_care_advisor = df_student_care_advisor.withColumn(
            "transformed_at", f.lit(d4))
        prinDev(df_student_care_advisor)
        flag = df_student_care_advisor.agg({"_key": "max"}).collect()[0][0]
        convertAndSaveS3(df_student_care_advisor)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_student_care_advisor_fact.parquet",
            mode="overwrite")
def get_df_student_contact(glueContext):
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")
    dyf_student_contact = dyf_student_contact.select_fields(
        ['contact_id', 'student_id', 'advisor_id'])
    # dyf_student_contact = dyf_student_contact.resolveChoice(specs=[('time_lms_created', 'cast:long')])
    dyf_student_contact = Filter.apply(
        frame=dyf_student_contact,
        f=lambda x: x['student_id'] is not None and x['contact_id'] is not None
        and x['advisor_id'] is not None and x['advisor_id'] != '')
    df_student_contact = dyf_student_contact.toDF()
    return df_student_contact
def get_ls_sc():
    mdl_logsservice_in_out = connectGlue(
        database="topicalms",
        table_name="mdl_logsservice_in_out_cutoff",
        select_fields=["userid", "time_in", "time_out"],
        fillter=["userid"],
        duplicates=["userid", "time_in"]
    ).rename_field("userid", "student_id_ls_sc")
    # keep sessions after epoch 1483203600 (2017-01-01, UTC+7) lasting over 2100s (35 min)
    mdl_logsservice_in_out = Filter.apply(
        frame=mdl_logsservice_in_out,
        f=lambda x: x["time_in"] >= 1483203600 and x["time_out"] > 1483203600
        and (x["time_out"] - x["time_in"]) > 2100)
    mdl_logsservice_in_out = mdl_logsservice_in_out.toDF()
    mdl_logsservice_in_out = mdl_logsservice_in_out.drop("time_out")
    mdl_logsservice_in_out = mdl_logsservice_in_out.groupby("student_id_ls_sc").agg(
        f.min(mdl_logsservice_in_out.time_in).cast("long").alias("student_behavior_date_ls_sc"))
    return mdl_logsservice_in_out
def get_lt():
    dyf_native_livestream = connectGlue(
        database="native_livestream",
        table_name="log_in_out",
        select_fields=["student_id", "time_in", "thoigianhoc"],
        fillter=["student_id", "thoigianhoc", "time_in"],
        duplicates=["student_id", "time_in"]
    ).rename_field("student_id", "student_id_lt")
    dyf_native_livestream = dyf_native_livestream.resolveChoice(specs=[("time_in", "cast:long")])
    dyf_native_livestream = dyf_native_livestream.resolveChoice(specs=[("thoigianhoc", "cast:int")])
    # thoigianhoc = study duration; keep sessions after 2017-01-01 longer than 59s
    dyf_native_livestream = Filter.apply(
        frame=dyf_native_livestream,
        f=lambda x: x["time_in"] > 1483203600 and x["thoigianhoc"] > 59)
    df_native_livestream = dyf_native_livestream.toDF()
    df_native_livestream = df_native_livestream.drop("thoigianhoc")
    # time_in appears to be stored in milliseconds, hence the div 1000
    df_native_livestream = df_native_livestream.groupby("student_id_lt").agg(
        f.min(expr("time_in div 1000")).cast("long").alias("student_behavior_date_lt"))
    return df_native_livestream
def get_dyf_student_advisor(glueContext):
    dyf_student_advisor = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_change_assignment_advisor")
    dyf_student_advisor = dyf_student_advisor.select_fields([
        "id", "contact_id", "advisor_id_old", "advisor_id_new",
        "created_at", "updated_at"
    ])
    dyf_student_advisor = Filter.apply(
        frame=dyf_student_advisor,
        f=lambda x: x['contact_id'] is not None and x['contact_id'] != ''
        and x['advisor_id_new'] is not None and x['advisor_id_new'] != ''
        and x['created_at'] is not None)
    df_student_advisor = dyf_student_advisor.toDF()
    return df_student_advisor
def fillterOutNull(dynamicFrame, fields):
    for field in fields:
        # bind field as a default argument so each filter keeps its own value;
        # a plain closure over the loop variable is fragile if the lambdas are
        # serialized after the loop has moved on
        dynamicFrame = Filter.apply(
            frame=dynamicFrame,
            f=lambda x, field=field: x[field] is not None and x[field] != "")
    return dynamicFrame
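# connectGlue is defined elsewhere in this repo. Reconstructed from its call
# sites, a minimal sketch would read a Glue catalog table, project
# select_fields, null-filter the "fillter" fields (via fillterOutNull above),
# and de-duplicate on the "duplicates" fields. The body below is an assumption
# about the helper, not its actual definition, and is named *Sketch to avoid
# shadowing the real one:
def connectGlueSketch(database, table_name, select_fields=None, fillter=None, duplicates=None):
    dyf = glueContext.create_dynamic_frame.from_catalog(
        database=database, table_name=table_name)
    if select_fields:
        dyf = dyf.select_fields(select_fields)
    if fillter:
        dyf = fillterOutNull(dyf, fillter)
    if duplicates:
        # DynamicFrame lacks drop_duplicates, so round-trip through a DataFrame
        df = dyf.toDF().drop_duplicates(duplicates)
        dyf = DynamicFrame.fromDF(df, glueContext, table_name)
    return dyf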
def etl_native_talk(df_student_level=None, df_student_package=None, df_student_advisor=None):
    dyf_native_talk_history = connectGlue(
        database="native_talk",
        table_name="native_talk_history_log_api_cutoff_2020",
        select_fields=["_key", "username", "learning_date",
                       "speaking_completed_dialog_name", "speaking_dialog_score",
                       "time_of_completingadialog"],
    )
    dyf_native_talk_history = Filter.apply(
        frame=dyf_native_talk_history,
        f=lambda x: x["username"] is not None and x["username"] != ""
        and x["learning_date"] is not None and x["learning_date"] != ""
        and x["speaking_completed_dialog_name"] != ""
        and x["speaking_dialog_score"] > 0)
    dyf_native_talk_history = dyf_native_talk_history.resolveChoice(specs=[("_key", "cast:float")])

    if not is_load_full:
        # this checkpoint stores the last learning_date processed, not a _key,
        # so filter_flag is not reused here
        try:
            df_flag = spark.read.parquet(FLAG_NATIVE_TALK_FILE)
            read_from_index = df_flag.collect()[0]['flag']
            print('read from index: ', read_from_index)
            dyf_native_talk_history = Filter.apply(
                frame=dyf_native_talk_history,
                f=lambda x: x["learning_date"] > read_from_index)
        except Exception:
            print('read flag file error')

    df_native_talk_history = dyf_native_talk_history.toDF()
    number_native_talk_history = df_native_talk_history.count()
    print("native_talk_history")
    print("number_native_talk_history_number: ", number_native_talk_history)
    prinDev(df_native_talk_history)
    if number_native_talk_history < 1:
        return

    df_native_talk_history = df_native_talk_history.drop_duplicates(
        ["username", "learning_date", "speaking_completed_dialog_name",
         "speaking_dialog_score", "time_of_completingadialog"])
    # ---------------------
    flag = df_native_talk_history.agg({"learning_date": "max"}).collect()[0][0]
    # -----------------------------------------------------------------------
    df_native_talk_history = df_native_talk_history \
        .groupby("username", "learning_date") \
        .agg(f.count("speaking_dialog_score").alias("total"))
    df_native_talk_history = df_native_talk_history.select("username", "learning_date", "total")
    # ------------- process duplicates ------------------------
    dyf_native_talk_mapping = connectGlue(
        database="native_talk",
        table_name="native_talk_account_mapping",
        select_fields=["username", "contact_id"],
    ).rename_field("username", "user_name_mapping")
    dyf_native_talk_mapping = Filter.apply(
        frame=dyf_native_talk_mapping,
        f=lambda x: x["user_name_mapping"] is not None and x["user_name_mapping"] != ""
        and x["contact_id"] is not None and x["contact_id"] != "")
    dyf_student_contact = connectGlue(
        database="tig_advisor",
        table_name="student_contact",
        select_fields=["student_id", "contact_id"],
    ).rename_field("contact_id", "contact_id_contact")
    df_native_talk_mapping = dyf_native_talk_mapping.toDF()
    df_native_talk_mapping = df_native_talk_mapping.drop_duplicates(["user_name_mapping", "contact_id"])
    df_student_contact = dyf_student_contact.toDF()
    df_student_contact = df_student_contact.drop_duplicates(["student_id", "contact_id_contact"])
    df_result = df_native_talk_history \
        .join(df_native_talk_mapping,
              df_native_talk_history["username"] == df_native_talk_mapping["user_name_mapping"]) \
        .join(df_student_contact,
              df_native_talk_mapping["contact_id"] == df_student_contact["contact_id_contact"])
    # each completed dialog counts as 300 seconds of study time
    df_result = df_result \
        .withColumn("total_duration", f.col('total') * f.lit(300)) \
        .withColumn("transformed_at", f.lit(d4)) \
        .withColumn("class_type", f.lit("NATIVE_TALK"))
    prinDev(df_result)
    # df_result = set_package_advisor_level(df_result, df_student_level=df_student_level,
    #                                       df_student_package=df_student_package,
    #                                       df_student_advisor=df_student_advisor)
    convertAndSaveS3(df_result)

    flag_data = [flag]
    df = spark.createDataFrame(flag_data, "string").toDF("flag")
    # overwrite the learning_date checkpoint on S3
    df.write.parquet(FLAG_NATIVE_TALK_SAVE, mode="overwrite")
def etl_voxy(df_student_level=None, df_student_package=None, df_student_advisor=None):
    dyf_voxy = connectGlue(
        database="voxy",
        table_name="voxy_api_cutoff_2020",
        select_fields=["_key", "email", "last_login",
                       "total_activities_completed", "total_hours_studied"],
    )
    dyf_voxy = Filter.apply(
        frame=dyf_voxy,
        # f=lambda x: x["total_activities_completed"] > 0
        f=lambda x: x["email"] is not None and x["email"] != ""
        and x['last_login'] is not None)
    if is_dev:
        print('dyf_voxy')
        dyf_voxy.printSchema()
        dyf_voxy.show(3)
    dyf_voxy = dyf_voxy.resolveChoice(specs=[("_key", "cast:long")])
    print('is_load_full___: ' + str(is_load_full))
    print('dyf_voxy before filter: ' + str(dyf_voxy.count()))
    if not is_load_full:
        print('not load full-----------------')
        dyf_voxy = filter_flag(dyf_voxy, FLAG_VOXY_FILE)
    number_dyf_voxy = dyf_voxy.count()
    print("number_dyf_voxy after filter :", number_dyf_voxy)
    if number_dyf_voxy > 0:
        df_voxy = dyf_voxy.toDF()
        flag = df_voxy.agg({"_key": "max"}).collect()[0][0]
        df_voxy = df_voxy.filter(~df_voxy.email.startswith("vip_"))
        # df_voxy = df_voxy.dropDuplicates(['email', 'last_login'])
        df_voxy = df_voxy.withColumn(
            'learning_date',
            f.from_unixtime(timestamp=f.unix_timestamp(f.col('last_login'),
                                                       format="yyyy-MM-dd'T'HH:mm:ss"),
                            format="yyyy-MM-dd")) \
            .withColumn(
            'learning_date_id',
            f.from_unixtime(timestamp=f.unix_timestamp(f.col('last_login'),
                                                       format="yyyy-MM-dd'T'HH:mm:ss"),
                            format="yyyyMMdd").cast('long'))
        df_voxy = df_voxy.filter(f.col('learning_date_id') < today_id)
        if is_dev:
            print('dyf_voxy__2')
            dyf_voxy.printSchema()
            dyf_voxy.show(3)
        df_voxy = df_voxy \
            .groupby("email", "learning_date") \
            .agg(f.sum("total_activities_completed").alias("total"),
                 f.sum("total_hours_studied").cast('double').alias("total_duration"))
        # convert hours studied to whole seconds
        df_voxy = df_voxy.select(
            "email", "learning_date", "total",
            f.round(f.col('total_duration') * 3600).cast('long').alias('total_duration'))
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id", "user_name"],
        )
        dyf_student_contact = Filter.apply(
            frame=dyf_student_contact,
            f=lambda x: x["student_id"] is not None and x["student_id"] != ""
            and x["contact_id"] is not None and x["contact_id"] != ""
            and x["user_name"] is not None and x["user_name"] != "")
        df_student_contact = dyf_student_contact.toDF()
        df_student_contact = df_student_contact.drop_duplicates(["student_id", "contact_id", "user_name"])
        # voxy emails embed the LMS user_name as a suffix
        df_result = df_voxy.join(
            df_student_contact,
            df_voxy["email"].endswith(df_student_contact["user_name"]))
        df_result = df_result \
            .withColumn("transformed_at", f.lit(d4)) \
            .withColumn("class_type", f.lit("VOXY"))
        prinDev(df_result)
        df_result = set_package_advisor_level(
            df_result,
            df_student_level=df_student_level,
            df_student_package=df_student_package,
            df_student_advisor=df_student_advisor)
        convertAndSaveS3(df_result)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF("flag")
        print('flag_data : ---')
        df.show()
        # overwrite the _key checkpoint on S3
        df.write.parquet(FLAG_VOXY_SAVE, mode="overwrite")
def etl_hoc_vien_trai_nghiem_mot_trong_cac_thanh_to_hoc():
    dyf_student_contact = connectGlue(
        database="tig_advisor",
        table_name="student_contact",
        select_fields=["_key", "contact_id", "student_id", "user_name"],
        fillter=["contact_id", "student_id", "user_name"],
        duplicates=["contact_id", "user_name"]
    ).rename_field("user_name", "email")
    dyf_student_contact = dyf_student_contact.resolveChoice(specs=[("_key", "cast:long")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_trai_nghiem_1_trong_7_thanh_to.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_student_contact = Filter.apply(frame=dyf_student_contact,
                                           f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    # the original compared the DynamicFrame itself to 0; count() is what was meant
    if dyf_student_contact.count() > 0:
        df_student_contact = dyf_student_contact.toDF()
        flag = df_student_contact.agg({"_key": "max"}).collect()[0][0]
        print(flag)
        prinDev(df_student_contact)
        df_ls_sc = get_ls_sc()
        df_lt = get_lt()
        df_hw = get_hw_basic()
        df_nt = get_native_talk()
        df_voxy = get_voxy()
        df_ncsb = get_ncsbasic()
        df_join = df_student_contact \
            .join(df_ls_sc, df_ls_sc["student_id_ls_sc"] == df_student_contact["student_id"], "left") \
            .join(df_lt, df_lt["student_id_lt"] == df_student_contact["student_id"], "left") \
            .join(df_hw, df_hw["email_hw"] == df_student_contact["email"], "left") \
            .join(df_nt, df_nt["contact_id_nt"] == df_student_contact["contact_id"], "left") \
            .join(df_voxy, df_voxy["email_voxy"] == df_student_contact["email"], "left") \
            .join(df_ncsb, df_ncsb["email_ncbs"] == df_student_contact["email"], "left")
        prinDev(df_join)
        df_fillter = df_join.select(
            "contact_id", "student_id",
            get_behavior_date("student_behavior_date_ls_sc",
                              "student_behavior_date_lt",
                              "student_behavior_date_voxy",
                              "student_behavior_date_hw",
                              "student_behavior_date_nt",
                              "student_behavior_date_ncsb").cast("long").alias("student_behavior_date"))
        df_fillter = df_fillter.filter(df_fillter.student_behavior_date < 1999999999)
        prinDev(df_fillter)
        # ------------------------------------------------------------------#
        df_join_level_code = set_package_advisor_level(df_fillter)
        for k, v in ADD_COLLUM.items():
            df_join_level_code = df_join_level_code.withColumn(k, v)
        prinDev(df_join_level_code, "end data")
        convertAndSaveS3(df_join_level_code)
        # (student_behavior_date, behavior_id, student_id, user_id, contact_id,
        #  package_code, student_level_code, package_status_code, transformed)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_trai_nghiem_1_trong_7_thanh_to.parquet",
            mode="overwrite")
def smile_care_rating():
    dyf_smile_care_key = connectGlue(
        database="native_smile",
        table_name="ticket_log_5450ed3d8cb5a34974310b6b26e451fa",
        select_fields=["_key", "requester_email", "satisfaction", "satisfaction_at", "created_at"],
        fillter=['requester_email']
    ).rename_field("satisfaction", "value_rating")
    dyf_smile_care_key = Filter.apply(frame=dyf_smile_care_key,
                                      f=lambda x: x["value_rating"] in satisfaction)
    dyf_smile_care_key = dyf_smile_care_key.resolveChoice(specs=[("_key", "cast:long")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_smile_care_rating.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_smile_care_key = Filter.apply(frame=dyf_smile_care_key,
                                          f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_smile_care_key.count() > 0:
        df_smile_care_key = dyf_smile_care_key.toDF()
        flag = df_smile_care_key.agg({"_key": "max"}).collect()[0][0]
        df_smile_care = df_smile_care_key \
            .withColumn("student_behavior_date",
                        f.unix_timestamp(df_smile_care_key.created_at, "yyyy-MM-dd HH:mm:ss"))
        dyf_smile_care = DynamicFrame.fromDF(df_smile_care, glueContext, "dyf_smile_care")
        dyf_smile_care = dyf_smile_care.resolveChoice(specs=[("student_behavior_date", "cast:int")])
        dyf_smile_care = dyf_smile_care.select_fields(
            ["_key", "requester_email", "value_rating", "satisfaction_at", "student_behavior_date"])
        df_smile_care = dyf_smile_care.toDF()
        dyf_student_contact_email = connectGlue(
            database="tig_advisor",
            table_name="student_contact_email",
            select_fields=["email", "contact_id", "user_id"]
        )
        dyf_student_contact_ = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"]
        ).rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact_.toDF()
        df_student_contact_email = dyf_student_contact_email.toDF()
        df_smile_care = df_smile_care.join(
            df_student_contact_email,
            df_smile_care["requester_email"] == df_student_contact_email["email"])
        df_smile_care = df_smile_care.join(
            df_student_contact,
            df_smile_care["contact_id"] == df_student_contact["contact_id_contact"])
        # drop returns a new DataFrame; the original discarded this result
        df_smile_care = df_smile_care.drop("email", "requester_email", "contact_id_contact")
        df_smile_care = df_smile_care.withColumn("rating_type", f.lit("rating_native_smile_caresoft")) \
            .withColumn("comment", f.lit("")) \
            .withColumn("rating_about", f.lit(None)) \
            .withColumn("number_rating", f.lit(1)) \
            .withColumn("behavior_id", f.lit(26)) \
            .withColumn("transformed_at", f.lit(d4))
        df_smile_care = set_package_advisor_level(df_smile_care)
        convertAndSaveS3(df_smile_care)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_smile_care_rating.parquet",
                         mode="overwrite")
def main():
    dyf_advisorcall_key = connectGlue(
        database="callcenter",
        table_name="advisorcall",
        select_fields=["_key", "calldate", "idcall", "rating", "device",
                       "hanguphv", "totaltime", "answertime"],
        duplicates=["calldate", "idcall", "rating", "device", "hanguphv",
                    "totaltime", "answertime"],
        fillter=["idcall"]).rename_field("rating", "value_rating")
    dyf_advisorcall_key = dyf_advisorcall_key.resolveChoice(specs=[("_key", "cast:long")])
    try:
        df_flag_phone_rating = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_phone_care_answertime.parquet")
        max_key = df_flag_phone_rating.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_advisorcall_key = Filter.apply(frame=dyf_advisorcall_key,
                                           f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    count = dyf_advisorcall_key.count()
    print(count)
    if count > 0:
        df_advisorcall_key = dyf_advisorcall_key.toDF()
        df_advisorcall = df_advisorcall_key \
            .withColumn("student_behavior_date",
                        f.unix_timestamp(df_advisorcall_key.calldate, "yyyy-MM-dd HH:mm:ss"))
        dyf_advisorcall = DynamicFrame.fromDF(df_advisorcall, glueContext, "dyf_advisorcall")
        dyf_advisorcall = dyf_advisorcall.resolveChoice(specs=[("student_behavior_date", "cast:int")])
        df_advisorcall = dyf_advisorcall.toDF()
        dyf_cdr = connectGlue(
            database="callcenter",
            table_name="cdr",
            select_fields=["ip_phone", "call_id", "student_phone", "status"],
            fillter=["call_id"])
        df_cdr = dyf_cdr.toDF()
        df_advisorcall = df_advisorcall.join(
            df_cdr, df_advisorcall["idcall"] == df_cdr["call_id"], "right")
        dyf_student_contact_phone = connectGlue(
            database="tig_advisor",
            table_name="student_contact_phone",
            select_fields=["phone", "contact_id"],
            fillter=["phone", "contact_id"],
            duplicates=["phone", "contact_id"])
        df_student_contact_phone = dyf_student_contact_phone.toDF()
        df_advisorcall = df_advisorcall.join(
            df_student_contact_phone,
            df_advisorcall["student_phone"] == df_student_contact_phone["phone"])
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            fillter=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"]).rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_advisorcall = df_advisorcall.join(
            df_student_contact,
            df_advisorcall["contact_id"] == df_student_contact["contact_id_contact"])
        df_advisorcall = df_advisorcall.drop("contact_id_contact", "phone", "call_id")
        df_rating_phone = df_advisorcall.withColumn("transformed_at", f.lit(d4))
        flag = df_rating_phone.agg({"_key": "max"}).collect()[0][0]
        prinDev(df_rating_phone)
        convertAndSaveS3(df_rating_phone)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_phone_care_answertime.parquet",
            mode="overwrite")
def etl_ncsbasic(df_student_level=None, df_student_package=None, df_student_advisor=None):
    dyf_results = connectGlue(
        database="ncsbasic",
        table_name="results_cutoff_2020",
        select_fields=["_key", "user_id", "time_created", "time_end"],
    )
    dyf_results = Filter.apply(
        frame=dyf_results,
        f=lambda x: x["time_created"] is not None and x["time_created"] != ""
        and x["time_end"] is not None and x["time_end"] != ""
        and x["user_id"] is not None and x["user_id"] != "")
    dyf_results = dyf_results.resolveChoice(specs=[("time_created", "cast:long")])
    dyf_results = dyf_results.resolveChoice(specs=[("time_end", "cast:long")])
    dyf_results = dyf_results.resolveChoice(specs=[("_key", "cast:long")])
    if not is_load_full:
        dyf_results = filter_flag(dyf_results, FLAG_NCSB_FILE)
    if dyf_results.count() > 0:
        df_results = dyf_results.toDF()
        flag = df_results.agg({"_key": "max"}).collect()[0][0]
        df_results = df_results.drop_duplicates(["user_id", "time_created"])
        df_results = df_results.withColumn(
            "learning_date",
            f.from_unixtime('time_created', format="yyyy-MM-dd").cast("string")) \
            .withColumn('total_duration',
                        f.round(f.col('time_end') - f.col('time_created')).cast('long'))
        df_results = df_results \
            .groupby("user_id", "learning_date") \
            .agg(f.count("time_end").alias("total"),
                 f.sum('total_duration').alias('total_duration'))
        # despite the name, this maps ncsbasic user ids to emails
        dyf_native_talk_account_mapping = connectGlue(
            database="ncsbasic",
            table_name="users",
            select_fields=["_id", "email"],
        )
        dyf_native_talk_account_mapping = Filter.apply(
            frame=dyf_native_talk_account_mapping,
            f=lambda x: x["_id"] is not None and x["_id"] != ""
            and x["email"] is not None and x["email"] != "")
        df_native_talk_account_mapping = dyf_native_talk_account_mapping.toDF()
        df_native_talk_account_mapping = df_native_talk_account_mapping.drop_duplicates(["_id"])
        df_join = df_results.join(df_native_talk_account_mapping,
                                  df_results.user_id == df_native_talk_account_mapping._id)
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id", "user_name"],
        )
        dyf_student_contact = Filter.apply(
            frame=dyf_student_contact,
            f=lambda x: x["student_id"] is not None and x["student_id"] != ""
            and x["contact_id"] is not None and x["contact_id"] != ""
            and x["user_name"] is not None and x["user_name"] != "")
        df_student_contact = dyf_student_contact.toDF()
        df_student_contact = df_student_contact.drop_duplicates(["student_id", "contact_id", "user_name"])
        df_result = df_join.join(df_student_contact,
                                 df_join["email"] == df_student_contact["user_name"])
        df_result = df_result.filter(df_result.total > 0)
        df_result = df_result \
            .withColumn("transformed_at", f.lit(d4)) \
            .withColumn("class_type", f.lit("NCSBASIC"))
        prinDev(df_result)
        # df_result = set_package_advisor_level(df_result, df_student_level=df_student_level,
        #                                       df_student_package=df_student_package,
        #                                       df_student_advisor=df_student_advisor)
        convertAndSaveS3(df_result)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF("flag")
        # overwrite the _key checkpoint on S3
        df.write.parquet(FLAG_NCSB_SAVE, mode="overwrite")
def etl_gia_han_goi_hoc():
    dyf_ghi_nhan_hoc_phi = connectGlue(
        database="poss",
        table_name="ghinhan_hp",
        select_fields=["_key", "khoa_hoc_makh", "ngay_tao"],
        fillter=["khoa_hoc_makh", "ngay_tao"])
    dyf_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.resolveChoice(specs=[("_key", "cast:long")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_ghi_nhan_hoc_phi.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_ghi_nhan_hoc_phi = Filter.apply(frame=dyf_ghi_nhan_hoc_phi,
                                            f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_ghi_nhan_hoc_phi.count() > 0:
        df_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.toDF()
        flag = df_ghi_nhan_hoc_phi.agg({"_key": "max"}).collect()[0][0]
        prinDev(df_ghi_nhan_hoc_phi)
        dyf_khoa_hoc_poss = connectGlue(
            database="poss",
            table_name="khoa_hoc",
            select_fields=["makh", "mahv", "goi_sanpham_id"],
            fillter=["makh", "mahv", "goi_sanpham_id"],
            duplicates=["makh", "mahv", "goi_sanpham_id"]
        ).rename_field("mahv", "ma_hv").rename_field("makh", "ma_kh")
        df_khoa_hoc_poss = dyf_khoa_hoc_poss.toDF()
        df_ghi_nhan_hoc_phi = df_khoa_hoc_poss.join(
            df_ghi_nhan_hoc_phi,
            df_khoa_hoc_poss["ma_kh"] == df_ghi_nhan_hoc_phi["khoa_hoc_makh"])
        df_ghi_nhan_hoc_phi = df_ghi_nhan_hoc_phi \
            .withColumn("student_behavior_date",
                        f.unix_timestamp(df_ghi_nhan_hoc_phi.ngay_tao, "yyyy-MM-dd HH:mm:ss"))
        dyf_ghi_nhan_hoc_phi = DynamicFrame.fromDF(df_ghi_nhan_hoc_phi, glueContext,
                                                   "dyf_ghi_nhan_hoc_phi")
        dyf_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.resolveChoice(
            specs=[("student_behavior_date", "cast:long")])
        df_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.toDF()
        prinDev(df_ghi_nhan_hoc_phi)
        df_ghi_nhan_hoc_phi = df_ghi_nhan_hoc_phi.drop("khoa_hoc_makh")
        dyf_hoc_vien_poss = connectGlue(
            database="poss",
            table_name="hoc_vien",
            select_fields=["mahv", "crm_id"],
            fillter=["mahv", "crm_id"],
            duplicates=["mahv", "crm_id"]).rename_field("crm_id", "contact_id")
        df_hoc_vien_poss = dyf_hoc_vien_poss.toDF()
        df_khoa_hoc_contact = df_ghi_nhan_hoc_phi.join(
            df_hoc_vien_poss,
            df_ghi_nhan_hoc_phi["ma_hv"] == df_hoc_vien_poss["mahv"],
            "left")
        df_khoa_hoc_contact = df_khoa_hoc_contact.drop("mahv")
        if is_dev:
            print("df_khoa_hoc_contact")
            df_khoa_hoc_contact.show(10)
        # ------------------------------------------------------------------#
        df_package_code = package_code()
        df_khoa_hoc_contact_package_code = df_khoa_hoc_contact.join(
            df_package_code,
            df_khoa_hoc_contact["goi_sanpham_id"] == df_package_code["id"])
        # drop returns a new DataFrame; the original discarded this result
        df_khoa_hoc_contact_package_code = df_khoa_hoc_contact_package_code.drop("goi_sanpham_id", "id")
        # ------------------------------------------------------------------#
        dyf_test_dauvao_poss = connectGlue(
            database="poss",
            table_name="test_dauvao",
            select_fields=["mahv", "trinhdo_dauvao"],
            duplicates=["mahv", "trinhdo_dauvao"],
            fillter=["mahv", "trinhdo_dauvao"])
        df_test_dauvao_poss = dyf_test_dauvao_poss.toDF()
        df_join_level_code = df_khoa_hoc_contact_package_code.join(
            df_test_dauvao_poss,
            df_khoa_hoc_contact_package_code["ma_hv"] == df_test_dauvao_poss["mahv"],
            "left")
        df_join_level_code = df_join_level_code.drop("mahv", "ma_hv")
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id", "contact_id"]).rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_join_level_code = df_join_level_code.join(
            df_student_contact,
            df_student_contact["contact_id_contact"] == df_join_level_code["contact_id"])
        df_join_level_code = df_join_level_code.drop("contact_id_contact")
        df_join_level_code = set_package_advisor_level(df_join_level_code)
        prinDev(df_join_level_code, "end data")
        for k, v in ADD_COLLUM.items():
            df_join_level_code = df_join_level_code.withColumn(k, v)
        convertAndSaveS3(df_join_level_code)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_ghi_nhan_hoc_phi.parquet",
            mode="overwrite")
def smile_H2472_care_rating():
    dyf_tblreply_rating_key = connectGlue(
        database="native_smile",
        table_name="tblreply_rating",
        select_fields=["_key", "userid", "ratingid", "time_rating"],
        fillter=['userid']
    ).rename_field("time_rating", "student_behavior_date").rename_field("userid", "student_id")
    dyf_tblreply_rating_key = Filter.apply(frame=dyf_tblreply_rating_key,
                                           f=lambda x: x["ratingid"] > 13)
    dyf_tblreply_rating_key = dyf_tblreply_rating_key.resolveChoice(specs=[("_key", "cast:long")])
    try:
        df_flag_H2472 = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_smile_H2472_care_rating.parquet")
        max_key = df_flag_H2472.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_tblreply_rating_key = Filter.apply(frame=dyf_tblreply_rating_key,
                                               f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_tblreply_rating_key.count() > 0:
        df_tblreply_rating_key = dyf_tblreply_rating_key.toDF()
        flag = df_tblreply_rating_key.agg({"_key": "max"}).collect()[0][0]
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["contact_id", "student_id"],
            fillter=["contact_id", "student_id"],
            duplicates=["contact_id", "student_id"]
        ).rename_field("student_id", "student_id_contact")
        dyf_tma_dm_tu_dien = connectGlue(
            database="native_smile",
            table_name="tma_dm_tu_dien",
            select_fields=["id", "ma_tu_dien", "id_dm_loai_tu_dien"],
            fillter=["id", "id_dm_loai_tu_dien"]
        )
        dyf_tma_dm_tu_dien = Filter.apply(
            frame=dyf_tma_dm_tu_dien,
            f=lambda x: x["id_dm_loai_tu_dien"] == 7 and x["id"] in rangeid)
        df_tma_dm_tu_dien = dyf_tma_dm_tu_dien.toDF()
        join_rating_user01 = df_tblreply_rating_key.join(
            df_tma_dm_tu_dien,
            df_tblreply_rating_key["ratingid"] == df_tma_dm_tu_dien["id"])
        # drop returns a new DataFrame; the original discarded this result
        join_rating_user01 = join_rating_user01.drop("id")
        df_student_contact = dyf_student_contact.toDF()
        join_rating_user01 = join_rating_user01.join(
            df_student_contact,
            join_rating_user01["student_id"] == df_student_contact["student_id_contact"])
        join_rating_user01 = join_rating_user01.drop("student_id_contact")
        if is_dev:
            join_rating_user01.printSchema()
        join_rating_user01 = join_rating_user01.dropDuplicates()
        # value_rating = ratingid - 13
        join_rating_user01 = join_rating_user01.withColumn("rating_type", f.lit("rating_native_smile_h2472")) \
            .withColumn("comment", f.lit("")) \
            .withColumn("rating_about", f.lit(None)) \
            .withColumn("number_rating", f.lit(1)) \
            .withColumn("value_rating", (join_rating_user01.ratingid - f.lit(13))) \
            .withColumn("behavior_id", f.lit(27)) \
            .withColumn("transformed_at", f.lit(d4))
        join_rating_user01 = set_package_advisor_level(join_rating_user01)
        convertAndSaveS3(join_rating_user01)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_smile_H2472_care_rating.parquet",
                         mode="overwrite")
def phone_rating():
    dyf_advisorcall_key = connectGlue(
        database="callcenter",
        table_name="advisorcall",
        select_fields=["_key", "calldate", "phonenumber", "rating", "device",
                       "hanguptvts", "status"]
    )
    dyf_advisorcall_key = Filter.apply(
        frame=dyf_advisorcall_key,
        f=lambda x: x["rating"] in ratings and x["device"] == "3CX"
        and x["hanguptvts"] == 1 and x["status"] == "ANSWER")
    dyf_advisorcall_key = dyf_advisorcall_key.resolveChoice(specs=[("_key", "cast:long")])
    try:
        df_flag_phone_rating = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_phone_rating.parquet")
        max_key = df_flag_phone_rating.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_advisorcall_key = Filter.apply(frame=dyf_advisorcall_key,
                                           f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_advisorcall_key.count() > 0:
        df_advisorcall_key = dyf_advisorcall_key.toDF()
        flag = df_advisorcall_key.agg({"_key": "max"}).collect()[0][0]
        df_advisorcall = df_advisorcall_key \
            .withColumn("student_behavior_date",
                        f.unix_timestamp(df_advisorcall_key.calldate, "yyyy-MM-dd HH:mm:ss"))
        dyf_advisorcall = DynamicFrame.fromDF(df_advisorcall, glueContext, "dyf_advisorcall")
        dyf_advisorcall = dyf_advisorcall.resolveChoice(specs=[("student_behavior_date", "cast:int")])
        dyf_advisorcall = dyf_advisorcall.select_fields(
            ["_key", "student_behavior_date", "phonenumber", "rating", "device",
             "hanguptvts", "status"]) \
            .rename_field("rating", "value_rating")
        df_advisorcall = dyf_advisorcall.toDF()
        dyf_student_contact_phone = connectGlue(
            database="tig_advisor",
            table_name="student_contact_phone",
            select_fields=["phone", "contact_id", "user_id"]
        )
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"]
        ).rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_student_contact_phone = dyf_student_contact_phone.toDF()
        df_advisorcall = df_advisorcall.join(
            df_student_contact_phone,
            df_advisorcall["phonenumber"] == df_student_contact_phone["phone"])
        df_advisorcall = df_advisorcall.join(
            df_student_contact,
            df_advisorcall["contact_id"] == df_student_contact["contact_id_contact"])
        df_advisorcall = df_advisorcall.drop("phonenumber", "phone", "contact_id_contact")
        df_advisorcall = df_advisorcall.withColumn("comment", f.lit("")) \
            .withColumn("rating_about", f.lit(None)) \
            .withColumn("rating_type", f.lit("rating_hotline")) \
            .withColumn("number_rating", f.lit(1))
        df_rating_phone = df_advisorcall \
            .withColumn("behavior_id", f.lit(25)) \
            .withColumn("transformed_at", f.lit(d4))
        df_rating_phone = set_package_advisor_level(df_rating_phone)
        convertAndSaveS3(df_rating_phone)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_phone_rating.parquet",
                         mode="overwrite")
def etl_ktkt():
    dyf_technical_test = connectGlue(
        database="technical_test",
        table_name="student_technical_test",
        select_fields=["_key", "trinhdohocvien", "studentid", "thoigianhenktkt", "ketluan"],
        fillter=["studentid", "thoigianhenktkt"],
        duplicates=["trinhdohocvien", "studentid", "thoigianhenktkt", "ketluan"])
    dyf_technical_test = dyf_technical_test.resolveChoice(
        specs=[("_key", "cast:long"), ("studentid", "cast:string")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_ktkt.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_technical_test = Filter.apply(frame=dyf_technical_test,
                                          f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_technical_test.count() > 0:
        df_technical_test = dyf_technical_test.toDF()
        flag = df_technical_test.agg({"_key": "max"}).collect()[0][0]
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["contact_id", "student_id", "user_name"],
            fillter=["contact_id", "student_id"],
            duplicates=["contact_id", "student_id"])
        df_student_contact = dyf_student_contact.toDF()
        df_technical_test = df_technical_test.withColumn(
            "date",
            f.unix_timestamp(df_technical_test.thoigianhenktkt, "yyyy-MM-dd HH:mm:ss"))
        dyf_technical_test = DynamicFrame.fromDF(df_technical_test, glueContext, "dyf_technical_test")
        dyf_technical_test = dyf_technical_test.resolveChoice(specs=[("date", "cast:long")])
        df_technical_test = dyf_technical_test.toDF()
        df_technical_test_min = df_technical_test.select("trinhdohocvien", "studentid", "date")
        df_technical_test_min = df_technical_test_min.groupBy("studentid", "trinhdohocvien").agg(
            f.min(df_technical_test_min.date).alias("student_behavior_date"))
        df_join_min = df_student_contact.join(
            df_technical_test_min,
            df_technical_test_min["studentid"] == df_student_contact["student_id"])
        df_select_min = df_join_min.select(
            "contact_id", "student_id", "student_behavior_date",
            df_join_min.trinhdohocvien.alias("student_level_code"))
        # ------------------------------------------------------------------#
        df_technical_test_pass = df_technical_test.where(df_technical_test.ketluan == "Pass")
        df_technical_test_pass = df_technical_test_pass.groupBy("studentid", "trinhdohocvien").agg(
            f.min(df_technical_test_pass.date).alias("student_behavior_date"))
        df_join_pass = df_student_contact.join(
            df_technical_test_pass,
            df_technical_test_pass["studentid"] == df_student_contact["student_id"])
        # the original selected df_join_min.trinhdohocvien here (an unrelated
        # frame) and concatenated a stray "" literal; both fixed
        df_select_pass = df_join_pass.select(
            "contact_id", "student_id", "student_behavior_date",
            df_join_pass.trinhdohocvien.alias("student_level_code"))
        prinDev(df=df_select_pass, df_name="pass")
        prinDev(df=df_select_min, df_name="min")
        df_join_level_code_min = set_package_advisor_level(df_select_min)
        for k, v in ADD_COLLUM_HEN_KTKT.items():
            df_join_level_code_min = df_join_level_code_min.withColumn(k, v)
        prinDev(df_join_level_code_min, "end data min")
        convertAndSaveS3(df_join_level_code_min)
        # ------------------------------------------------------------------
        df_join_level_code_pass = set_package_advisor_level(df_select_pass)
        for k, v in ADD_COLLUM_KTKT_TC.items():
            df_join_level_code_pass = df_join_level_code_pass.withColumn(k, v)
        prinDev(df_join_level_code_pass, "end data pass")
        convertAndSaveS3(df_join_level_code_pass)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_ktkt.parquet",
            mode="overwrite")
def etl_hoc_vien_duoc_chao_mung():
    dyf_av_care_call = connectGlue(
        database="tig_advisor",
        table_name="care_call",
        select_fields=["_key", "phone", "duration", "call_status", "time_created"],
        fillter=["phone", "duration", "call_status"],
        duplicates=["phone", "call_status", "time_created"],
    ).rename_field("phone", "phone1")
    # keep successful welcome calls longer than 30 seconds
    dyf_av_care_call = Filter.apply(
        frame=dyf_av_care_call,
        f=lambda x: x["call_status"] in ("success", "call_success") and x["duration"] > 30)
    dyf_av_care_call = dyf_av_care_call.resolveChoice(specs=[("_key", "cast:long")])
    try:
        df_flag_1 = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_chao_mung_hoc_vien.parquet")
        max_key = df_flag_1.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_av_care_call = Filter.apply(frame=dyf_av_care_call,
                                        f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_av_care_call.count() > 0:
        df_av_care_call = dyf_av_care_call.toDF()
        flag = df_av_care_call.agg({"_key": "max"}).collect()[0][0]
        dyf_student_contact_phone = connectGlue(
            database="tig_advisor",
            table_name="student_contact_phone",
            select_fields=["contact_id", "phone"],
            fillter=["contact_id", "phone"],
            duplicates=["contact_id", "phone"],
        )
        df_av_care_call = df_av_care_call \
            .withColumn("student_behavior_date",
                        f.unix_timestamp(df_av_care_call.time_created, "yyyy-MM-dd HH:mm:ss"))
        dyf_av_care_call = DynamicFrame.fromDF(df_av_care_call, glueContext, "dyf_av_care_call")
        dyf_av_care_call = dyf_av_care_call.resolveChoice(specs=[("student_behavior_date", "cast:int")])
        df_av_care_call = dyf_av_care_call.toDF()
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id", "contact_id"]).rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_student_contact_phone = dyf_student_contact_phone.toDF()
        df_student_care = df_av_care_call.join(
            df_student_contact_phone,
            df_av_care_call["phone1"] == df_student_contact_phone["phone"],
            "left")
        df_student_care = df_student_care.drop("phone1", "phone")
        # ------------------------------------------------------------------#
        df_student_care = df_student_care.join(
            df_student_contact,
            df_student_care["contact_id"] == df_student_contact["contact_id_contact"],
            "left")
        df_student_care = df_student_care.drop("contact_id_contact")
        df_package_code = khoa_hoc()
        df_behavior_join_2 = df_student_care.join(
            df_package_code,
            df_student_care["contact_id"] == df_package_code["crm_id"])
        # the original dropped a misspelled "crm_ id" column, a silent no-op
        df_behavior_join_2 = df_behavior_join_2.drop("crm_id")
        df_behavior_join_2 = set_package_advisor_level(df_behavior_join_2)
        for k, v in ADD_COLLUM.items():
            df_behavior_join_2 = df_behavior_join_2.withColumn(k, v)
        prinDev(df_behavior_join_2, "end_code")
        # (student_behavior_date, behavior_id, student_id, user_id, contact_id,
        #  package_code, student_level_code, package_status_code, transformed)
        convertAndSaveS3(df_behavior_join_2)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_chao_mung_hoc_vien.parquet",
            mode="overwrite")
def etl_hoc_viec_kich_hoat_goi_hoc():
    dyf_tig_market = connectGlue(
        database="tig_market",
        table_name="tpe_enduser_used_product_history",
        select_fields=["_key", "contact_id", "timecreated", "used_product_id"],
        fillter=["contact_id", "used_product_id"],
        duplicates=["contact_id", "timecreated", "used_product_id"])
    dyf_tig_market = dyf_tig_market.resolveChoice(specs=[("_key", "cast:long")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_kich_hoat_tai_khoan_khoa_hoc.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_tig_market = Filter.apply(frame=dyf_tig_market,
                                      f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    # the original compared the DynamicFrame itself to 0; count() is what was meant
    if dyf_tig_market.count() > 0:
        df_tig_market = dyf_tig_market.toDF()
        flag = df_tig_market.agg({"_key": "max"}).collect()[0][0]
        print("flag: ", flag)
        df_tig_market = df_tig_market.groupby("used_product_id", "contact_id").agg(
            f.min('timecreated').alias("student_behavior_date"))
        df_tig_market = df_tig_market.drop("used_product_id")
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id", "contact_id"]).rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_tig_market_contact = df_tig_market.join(
            df_student_contact,
            df_tig_market["contact_id"] == df_student_contact["contact_id_contact"],
            "left")
        df_tig_market_contact = df_tig_market_contact.drop("contact_id_contact")
        # ------------------------------------------------------------------#
        prinDev(df_tig_market_contact, "df_khoa_hoc_contact")
        df_join_level_code = set_package_advisor_level(df_tig_market_contact)
        prinDev(df_join_level_code, "end data")
        for k, v in ADD_COLLUM.items():
            df_join_level_code = df_join_level_code.withColumn(k, v)
        convertAndSaveS3(df_join_level_code)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_kich_hoat_tai_khoan_khoa_hoc.parquet",
            mode="overwrite")
def etl_home_work(df_student_level=None, df_student_package=None, df_student_advisor=None):
    dyf_mdl_le_exam_attemp = connectGlue(
        database="home_work_basic_production",
        table_name="mdl_le_exam_attemp",
        select_fields=["_key", "user_id", "created_at", "le_attemp_id", "le_id", "point"],
    )
    dyf_mdl_le_exam_attemp = Filter.apply(
        frame=dyf_mdl_le_exam_attemp,
        f=lambda x: x["user_id"] is not None and x["user_id"] != ""
        and x["le_attemp_id"] is not None and x["le_attemp_id"] != ""
        and x["created_at"] is not None and x["created_at"] != "")
    dyf_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.resolveChoice(specs=[("_key", "cast:long")])
    if is_dev:
        print('dyf_mdl_le_exam_attemp')
        dyf_mdl_le_exam_attemp.printSchema()
        dyf_mdl_le_exam_attemp.show(3)
    if not is_load_full:
        dyf_mdl_le_exam_attemp = filter_flag(dyf_mdl_le_exam_attemp, FLAG_HW_FILE)
    number_dyf_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.count()
    if number_dyf_mdl_le_exam_attemp < 1:
        return
    df_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.toDF()
    flag = df_mdl_le_exam_attemp.agg({"_key": "max"}).collect()[0][0]
    df_mdl_le_exam_attemp = df_mdl_le_exam_attemp.drop_duplicates(["user_id", "created_at"])
    df_mdl_le_exam_attemp = df_mdl_le_exam_attemp.select(
        'user_id', 'le_attemp_id', 'le_id', 'point',
        f.unix_timestamp(df_mdl_le_exam_attemp.created_at, "yyyy-MM-dd HH:mm:ss")
        .cast("long").alias("created_at"))
    if is_dev:
        print('df_mdl_le_exam_attemp')
        df_mdl_le_exam_attemp.printSchema()
        df_mdl_le_exam_attemp.show(3)
    # ------------------------------------------------------------------#
    dyf_mdl_le_attemp = connectGlue(
        database="home_work_basic_production",
        table_name="mdl_le_attemp",
        select_fields=["id", "created_at"],
    )
    dyf_mdl_le_attemp = Filter.apply(
        frame=dyf_mdl_le_attemp,
        f=lambda x: x["id"] is not None and x["id"] != ""
        and x["created_at"] is not None and x["created_at"] != "")
    df_mdl_le_attemp = dyf_mdl_le_attemp.toDF()
    df_mdl_le_attemp = df_mdl_le_attemp.drop_duplicates(["id"])
    df_mdl_le_attemp = df_mdl_le_attemp.select(
        "id",
        f.unix_timestamp(df_mdl_le_attemp.created_at, "yyyy-MM-dd HH:mm:ss")
        .cast("long").alias("created_at_le"))
    # ------------------------------------------------------------------#
    df_mdl_le_exam_attemp_detail = df_mdl_le_exam_attemp.join(
        other=df_mdl_le_attemp,
        on=df_mdl_le_exam_attemp.le_attemp_id == df_mdl_le_attemp.id,
        how='inner')
    df_mdl_le_exam_attemp_detail = df_mdl_le_exam_attemp_detail.select(
        'user_id', 'le_attemp_id', 'le_id', 'created_at', 'point',
        (df_mdl_le_exam_attemp_detail.created_at
         - df_mdl_le_exam_attemp_detail.created_at_le).alias('learning_duration'),
        f.from_unixtime(f.col('created_at'), "yyyy-MM-dd").cast("string").alias("learning_date"))
    if is_dev:
        print('df_mdl_le_exam_attemp_detail')
        df_mdl_le_exam_attemp_detail.printSchema()
        df_mdl_le_exam_attemp_detail.show(3)
    # ---------------- get learning turn by le_id ----------------
    # target schema:
    # ("student_id", "string", "student_id", "long"),
    # ("contact_id", "string", "contact_id", "string"),
    # ("class_type", "string", "class_type", "string"),
    # ("learning_date", "string", "learning_date", "string"),
    # ("total", "long", "total_learing", "long"),
    # ("total_duration", "long", "total_duration", "long"),
    # ("year_month_id", "string", "year_month_id", "string"),
    # ("transformed_at", "string", "transformed_at", "long")
    df_mdl_le_exam_attemp_detail = df_mdl_le_exam_attemp_detail.orderBy(
        f.asc('user_id'), f.asc('le_id'), f.asc('learning_date'), f.asc('created_at'))
    df_mdl_le_exam_attemp_learning_turn = df_mdl_le_exam_attemp_detail \
        .groupBy('user_id', 'le_id', 'learning_date') \
        .agg(f.first('created_at').alias('created_at'),
             f.first('learning_duration').alias('learning_duration'),
             f.first('point').alias('point'))
    df_mdl_le_exam_attemp_learning_turn = df_mdl_le_exam_attemp_learning_turn.select(
        "user_id", "created_at", 'learning_duration', 'point', 'le_id', 'learning_date',
        f.from_unixtime(f.col('created_at'), "yyyy-MM-dd HH:mm:ss").cast("string").alias("learning_date_time"))
    # a learning turn counts as successful after at least 600 seconds
    df_mdl_le_exam_attemp_learning_turn_success = df_mdl_le_exam_attemp_learning_turn \
        .filter(f.col('learning_duration') >= 600)
    df_learning_turn_success_total = df_mdl_le_exam_attemp_learning_turn_success \
        .groupby("user_id", "learning_date") \
        .agg(f.count("created_at").cast('long').alias("total"),
             f.sum('learning_duration').cast('long').alias('total_duration'))
    if is_dev:
        print('df_learning_turn_success_total')
        df_learning_turn_success_total.printSchema()
        df_learning_turn_success_total.show(3)
    dyf_mdl_user = connectGlue(
        database="home_work_basic_production",
        table_name="mdl_user",
        select_fields=["id", "username"],
    ).rename_field("username", "email")
    dyf_mdl_user = Filter.apply(
        frame=dyf_mdl_user,
        f=lambda x: x["id"] is not None and x["id"] != ""
        and x["email"] is not None and x["email"] != "")
    df_mdl_user = dyf_mdl_user.toDF()
    df_mdl_user = df_mdl_user.drop_duplicates(["id"])
    dyf_student_contact = connectGlue(
        database="tig_advisor",
        table_name="student_contact",
        select_fields=["student_id", "contact_id", "user_name"],
    )
    # ------------------------------------------------------------------#
    dyf_student_contact = Filter.apply(
        frame=dyf_student_contact,
        f=lambda x: x["student_id"] is not None and x["student_id"] != ""
        and x["contact_id"] is not None and x["contact_id"] != ""
        and x["user_name"] is not None and x["user_name"] != "")
    df_student_contact = dyf_student_contact.toDF()
    df_student_contact = df_student_contact.drop_duplicates(["student_id", "contact_id", "user_name"])
    # ------------------------------------------------------------------#
    # the original joined on df_mdl_le_exam_attemp.user_id, a column from an
    # earlier frame; the aggregated frame's own user_id is what was meant
    df_join = df_learning_turn_success_total \
        .join(other=df_mdl_user,
              on=df_learning_turn_success_total.user_id == df_mdl_user.id,
              how='inner') \
        .join(other=df_student_contact,
              on=df_mdl_user.email == df_student_contact.user_name,
              how='inner')
    df_result = df_join \
        .withColumn("transformed_at", f.lit(d4)) \
        .withColumn("class_type", f.lit("HOME_WORK"))
    convertAndSaveS3(df_result)
    flag_data = [flag]
    df = spark.createDataFrame(flag_data, "long").toDF("flag")
    # overwrite the _key checkpoint on S3
    df.write.parquet(FLAG_HW_SAVE, mode="overwrite")