def connectGlue(database="", table_name="", select_fields=None, fillter=None, cache=False,
                duplicates=None, push_down_predicate=""):
    # Avoid mutable default arguments
    select_fields = select_fields or []
    fillter = fillter or []
    duplicates = duplicates or []
    if database == "" and table_name == "":
        return
    dyf = glueContext.create_dynamic_frame.from_catalog(
        database=database,
        table_name=table_name,
        push_down_predicate=push_down_predicate)
    if is_dev:
        print("full schema of database: ", database, " and table: ", table_name)
        dyf.printSchema()
    # select fields
    if len(select_fields) != 0:
        dyf = dyf.select_fields(select_fields)
    if len(fillter) != 0:
        dyf = fillterOutNull(dyf, fillter)
    df = dyf.toDF()
    if is_limit and cache:
        df = checkDumplicate(df.limit(20000), duplicates).cache()
    elif cache:
        df = checkDumplicate(df, duplicates).cache()
    elif is_limit:
        df = checkDumplicate(df, duplicates).limit(20000)
    else:
        df = checkDumplicate(df, duplicates)
    dyf = DynamicFrame.fromDF(df, glueContext, table_name)
    if is_dev:
        print("full schema of database: ", database, " and table: ", table_name, " after select")
        dyf.printSchema()
    return dyf
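# NOTE: `fillterOutNull` and `checkDumplicate` are called above but not defined in this
# listing. The sketches below are assumptions inferred from the call sites: drop rows
# whose key fields are null, and drop duplicate rows over the given columns.
def fillterOutNull(dyf, fields):
    # Keep only records where every listed field is present and non-null.
    return Filter.apply(frame=dyf,
                        f=lambda x: all(x[field] is not None for field in fields))


def checkDumplicate(df, duplicates):
    # Deduplicate on the given columns; with no columns, return the frame unchanged.
    if duplicates:
        return df.dropDuplicates(duplicates)
    return df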
def etl_user_profile(dyf_crm_concat):
    print_is_dev('etl_user_profile___****************************************************************************************')
    # ETL for profile: collapse the collected lists into single values per contact
    myUdf = f.udf(myFunc, StringType())
    df_crm_concat_pf = dyf_crm_concat.toDF()
    df_crm_concat_pf = df_crm_concat_pf.groupBy('Id') \
        .agg(f.collect_list('Gender').alias('Gender'),
             f.collect_list('Birthday').alias('Birthday'),
             f.collect_list('Job').alias('Job')) \
        .withColumn('Gender', myUdf('Gender')) \
        .withColumn('Birthday', myUdf('Birthday')) \
        .withColumn('Job', myUdf('Job'))
    dyf_crm_concat_pf = DynamicFrame.fromDF(df_crm_concat_pf, glueContext, 'dyf_crm_contacts')
    dyf_crm_concat_pf = mappingForAll(dyf_crm_concat_pf, profile, profile_add_collum)
    if is_dev:
        print('dyf_crm_concat_pf')
        dyf_crm_concat_pf.printSchema()
        dyf_crm_concat_pf.show(10)
    ######## save to s3 ######
    # parquetToS3(dyf_crm_concat_pf, path="s3://dtsodin/student_behavior/up/user_profile")
    ############################################################
    # save to redshift
    glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dyf_crm_concat_pf,
        catalog_connection="glue_redshift",
        connection_options={
            "dbtable": "up_user_profile",
            "database": "transaction_log"
        },
        redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/user_profile/up_user_profile",
        transformation_ctx="datasink4")
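# NOTE: `myFunc` is not defined in this listing. A minimal sketch, assuming it reduces a
# collect_list() array to one representative string (the first non-empty element):
def myFunc(values):
    for value in values:
        if value is not None and value != '':
            return str(value)
    return None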
def processBatch(data_frame, batchId):
    if data_frame.count() > 0:
        dynamic_frame = DynamicFrame.fromDF(data_frame, glueContext, "from_data_frame")
        apply_mapping = ApplyMapping.apply(
            frame=dynamic_frame,
            mappings=[
                ("uuid", "string", "uuid", "string"),
                ("country", "string", "country", "string"),
                ("itemtype", "string", "itemtype", "string"),
                ("saleschannel", "string", "saleschannel", "string"),
                ("orderpriority", "string", "orderpriority", "string"),
                ("orderdate", "string", "orderdate", "string"),
                ("region", "string", "region", "string"),
                ("shipdate", "string", "shipdate", "string"),
                ("unitssold", "string", "unitssold", "string"),
                ("unitprice", "string", "unitprice", "string"),
                ("unitcost", "string", "unitcost", "string"),
                ("totalrevenue", "string", "totalrevenue", "string"),
                ("totalcost", "string", "totalcost", "string"),
                ("totalprofit", "string", "totalprofit", "string")],
            transformation_ctx="apply_mapping")
        final_frame = Join.apply(apply_mapping, country_lookup_frame, 'country', 'CountryName').drop_fields(
            ['CountryName', 'country', 'unitprice', 'unitcost', 'totalrevenue', 'totalcost', 'totalprofit'])
        s3sink = glueContext.write_dynamic_frame.from_options(
            frame=final_frame,
            connection_type="s3",
            connection_options={"path": s3_target},
            format="csv",
            transformation_ctx="s3sink")
def etl_user_profile(dyf_crm_concat):
    print_is_dev(
        'etl_user_profile___****************************************************************************************'
    )
    # ETL for profile
    myUdf = f.udf(myFunc, StringType())
    df_crm_concat_pf = dyf_crm_concat.toDF()
    df_crm_concat_pf = df_crm_concat_pf.groupBy('Id') \
        .agg(f.collect_list('Gender').alias('Gender'),
             f.collect_list('Birthday').alias('Birthday'),
             f.collect_list('Job').alias('Job')) \
        .withColumn('Gender', myUdf('Gender')) \
        .withColumn('Birthday', myUdf('Birthday')) \
        .withColumn('Job', myUdf('Job'))
    dyf_crm_concat_pf = DynamicFrame.fromDF(df_crm_concat_pf, glueContext, 'dyf_crm_contacts')
    dyf_crm_concat_pf = mappingForAll(dyf_crm_concat_pf, profile, profile_add_collum)
    dyf_crm_concat_pf.printSchema()
    dyf_crm_concat_pf.show(10)
    ######## save to s3 ######
    parquetToS3(dyf_crm_concat_pf, path="s3://dtsodin/student_behavior/up/user_profile")
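# NOTE: `parquetToS3` is not defined in this listing. A minimal sketch under the
# assumption that it writes a DynamicFrame to S3 as Parquet via the standard Glue sink:
def parquetToS3(dyf, path):
    glueContext.write_dynamic_frame.from_options(
        frame=dyf,
        connection_type="s3",
        connection_options={"path": path},
        format="parquet")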
def convertAndSaveS3(df):
    df = df.withColumn("year_month_id",
                       f.from_unixtime('student_behavior_date', format="yyyyMM"))
    df = df.withColumn('student_behavior_id',
                       f.md5(concaText(
                           df.student_behavior_date,
                           df.behavior_id,
                           df.student_id,
                           df.contact_id,
                           df.package_code,
                           df.package_status_code,
                           df.student_level_code,
                           df.transformed_at)))
    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    if is_dev:
        print('dyf____________________________')
        dyf.printSchema()
        dyf.show(10)
    behavior_mapping = mappingForAll(dyf, MAPPING)
    if behavior_mapping.count() > 0:
        parquetToS3(dyf=behavior_mapping,
                    path="s3://toxd-olap/transaction_log/student_behavior/sb_student_behavior")
    rate_mapping = mappingForAll(dyf, RATE_MAPPING)
    if rate_mapping.count() > 0:
        parquetToS3(dyf=rate_mapping,
                    path="s3://toxd-olap/transaction_log/student_behavior/sb_student_rating")
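# NOTE: `concaText` is not defined in this listing. A minimal sketch, assuming it builds
# the string fed to f.md5() by concatenating the id columns with a separator (null
# columns are skipped by concat_ws):
def concaText(*columns):
    return f.concat_ws('_', *[c.cast('string') for c in columns])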
def get_hw_basic():
    dyf_mdl_le_exam_attemp = connectGlue(database="home_work_basic_production",
                                         table_name="mdl_le_exam_attemp",
                                         select_fields=["user_id", "created_at"],
                                         fillter=["user_id"],
                                         duplicates=["user_id", "created_at"])
    df_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.toDF()
    df_mdl_le_exam_attemp = df_mdl_le_exam_attemp.withColumn(
        "created_at_min",
        f.unix_timestamp(df_mdl_le_exam_attemp.created_at, "yyyy-MM-dd HH:mm:ss"))
    dyf_mdl_le_exam_attemp = DynamicFrame.fromDF(df_mdl_le_exam_attemp, glueContext,
                                                 "dyf_mdl_le_exam_attemp")
    dyf_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.resolveChoice(
        specs=[("created_at_min", "cast:long")])
    df_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.toDF()
    df_mdl_le_exam_attemp = df_mdl_le_exam_attemp.groupby("user_id").agg(
        f.min(df_mdl_le_exam_attemp.created_at_min).alias("student_behavior_date_hw"))
    dyf_mdl_user = connectGlue(database="home_work_basic_production",
                               table_name="mdl_user",
                               select_fields=["id", "username"],
                               fillter=["id"],
                               duplicates=["id", "username"]).rename_field("username", "email_hw")
    df_mdl_user = dyf_mdl_user.toDF()
    join = df_mdl_user.join(df_mdl_le_exam_attemp,
                            df_mdl_user.id == df_mdl_le_exam_attemp.user_id)
    join = join.drop("id", "user_id")
    return join
def convertAndSaveS3(df):
    df = df.dropDuplicates()
    prinDev(df, "before convert")
    df = df.withColumn(
        "year_month_id",
        f.from_unixtime('student_behavior_date', format="yyyyMM"))
    prinDev(df, "before mapping")
    df = df.withColumn(
        'student_behavior_id',
        f.md5(concaText(df.student_behavior_date, df.behavior_id, df.student_id,
                        df.contact_id, df.package_code, df.package_status_code,
                        df.student_level_code, df.transformed_at)))
    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    behavior_mapping = mappingForAll(dyf, MAPPING)
    prinDev(behavior_mapping, "after mapping")
    if behavior_mapping.count() > 0:
        parquetToS3(dyf=behavior_mapping,
                    path="s3://toxd-olap/transaction_log/student_behavior/sb_student_behavior")
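# NOTE: `prinDev` is not defined in this listing. A minimal sketch, assuming it is a
# dev-only debug printer that works for both DataFrames and DynamicFrames (both expose
# printSchema() and show()):
def prinDev(df, df_name="print"):
    if is_dev:
        print(df_name)
        df.printSchema()
        df.show(3)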
def get_voxy():
    dyf_voxy = connectGlue(
        database="voxy",
        table_name="voxy_api",
        select_fields=["email", "time_created", "total_hours_studied"],
        fillter=["email"],
        duplicates=["email", "time_created"]).rename_field("email", "email_voxy")
    dyf_voxy = Filter.apply(frame=dyf_voxy, f=lambda x: x["total_hours_studied"] > 0)
    df_voxy = dyf_voxy.toDF()
    df_voxy = df_voxy.withColumn(
        "time_created_new",
        f.unix_timestamp(df_voxy.time_created, "yyyy-MM-dd"))
    dyf_voxy = DynamicFrame.fromDF(df_voxy, glueContext, "dyf_voxy")
    dyf_voxy = dyf_voxy.resolveChoice(specs=[("time_created_new", "cast:long")])
    df_voxy = dyf_voxy.toDF()
    # drop() returns a new DataFrame, so the result must be reassigned
    df_voxy = df_voxy.drop("time_created")
    df_voxy = df_voxy.groupby("email_voxy").agg(
        f.min(df_voxy.time_created_new).alias("student_behavior_date_voxy"))
    return df_voxy
def get_native_talk():
    dyf_native_talk_history_log = connectGlue(database="native_talk",
                                              table_name="native_talk_history_log_api",
                                              select_fields=["learning_date", "username",
                                                             "speaking_dialog_score"],
                                              fillter=["username"],
                                              duplicates=["username", "learning_date"])
    dyf_native_talk_history_log = Filter.apply(frame=dyf_native_talk_history_log,
                                               f=lambda x: x["speaking_dialog_score"] > 0)
    df_native_talk_history_log = dyf_native_talk_history_log.toDF()
    df_native_talk_history_log = df_native_talk_history_log.drop("speaking_dialog_score")
    df_native_talk_history_log = df_native_talk_history_log.withColumn(
        "learning_date_int",
        f.unix_timestamp(df_native_talk_history_log.learning_date, "yyyy-MM-dd"))
    dyf_native_talk_history_log = DynamicFrame.fromDF(df_native_talk_history_log, glueContext,
                                                      "dyf_native_talk_history_log")
    dyf_native_talk_history_log = dyf_native_talk_history_log.resolveChoice(
        specs=[("learning_date_int", "cast:long")])
    df_native_talk_history_log = dyf_native_talk_history_log.toDF()
    df_native_talk_history_log = df_native_talk_history_log.groupby("username").agg(
        f.min(df_native_talk_history_log.learning_date_int).alias("student_behavior_date_nt"))
    dyf_native_talk_account_mapping = connectGlue(database="native_talk",
                                                  table_name="native_talk_account_mapping",
                                                  select_fields=["username", "contact_id"],
                                                  fillter=["username", "contact_id"],
                                                  duplicates=["username", "contact_id"]) \
        .rename_field("username", "username_mapping") \
        .rename_field("contact_id", "contact_id_nt")
    df_native_talk_account_mapping = dyf_native_talk_account_mapping.toDF()
    join = df_native_talk_account_mapping.join(
        df_native_talk_history_log,
        df_native_talk_account_mapping.username_mapping == df_native_talk_history_log.username)
    join = join.drop("username_mapping", "username")
    return join
def main():
    # ========== init
    glue_context = GlueContext(SparkContext.getOrCreate())
    # ========== retrieve dynamic frame
    df_advisor = retrieve_dynamic_frame(glue_context, 'tig_advisor', 'advisor_account',
                                        ['user_id', 'user_name', 'user_email'])
    display(df_advisor, "df_advisor")
    df_advisor = df_advisor.withColumnRenamed('user_id', 'advisor_id') \
        .withColumnRenamed('user_email', 'email')
    display(df_advisor, "df_advisor renamed")
    dyf_advisor = DynamicFrame.fromDF(df_advisor, glue_context, "dyf_advisor")
    display(dyf_advisor, "dyf_advisor")
    # ========== save dynamic frame to redshift
    save_data_to_redshift(
        glue_context,
        dyf_advisor,
        'student_learning_fact',
        'advisor_dim',
        's3://datashine-dev-redshift-backup/student_learning_fact/advisor_dim',
        'student_learning_dim')
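# NOTE: `display` and `save_data_to_redshift` are not defined in this listing. Minimal
# sketches; the parameter meanings for save_data_to_redshift are inferred from the call
# above and are assumptions, as is the "glue_redshift" connection name (borrowed from
# the sibling jobs in this listing):
def display(frame, name):
    # Works for both DataFrame and DynamicFrame, which share these two methods.
    print(name)
    frame.printSchema()
    frame.show(5)


def save_data_to_redshift(glue_context, dynamic_frame, schema, table, redshift_tmp_dir, database):
    glue_context.write_dynamic_frame.from_jdbc_conf(
        frame=dynamic_frame,
        catalog_connection="glue_redshift",
        connection_options={"dbtable": schema + "." + table, "database": database},
        redshift_tmp_dir=redshift_tmp_dir)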
def convertAndSaveS3(df):
    df = df.withColumn(
        "year_month_id",
        f.from_unixtime(
            f.unix_timestamp(df.learning_date, "yyyy-MM-dd").cast("long"),
            "yyyyMM").cast("string"))
    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    behavior_mapping = mappingForAll(dyf, MAPPING)
    if behavior_mapping.count() > 0:
        parquetToS3(dyf=behavior_mapping, path="s3://toxd-olap/trasaction_le2/le2_history")
def retrieve_dynamic_frame(glue_context, spark, data, header, name, mapping):
    data_frame = spark.createDataFrame(data, header)
    dynamic_frame = DynamicFrame.fromDF(data_frame, glue_context, name)
    display(dynamic_frame, "retrieve " + name)
    apply_dynamic_frame = ApplyMapping.apply(frame=dynamic_frame, mappings=mapping)
    dyf_result = ResolveChoice.apply(frame=apply_dynamic_frame,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")
    # Show the mapped result rather than the pre-mapping frame
    display(dyf_result, "apply " + name)
    return dyf_result
def convertAndSaveS3(df):
    df = df.withColumn(
        "year_month_id",
        f.from_unixtime('student_behavior_date', format="yyyyMM"))
    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    if dyf.count() > 0:
        if is_dev:
            print('dyf____________________________')
            dyf.printSchema()
            dyf.show(10)
        behavior_mapping = mappingForAll(dyf, MAPPING)
        parquetToS3(dyf=behavior_mapping,
                    path="s3://toxd-olap/transaction_log/student_care_advisor")
def mappingForAll(dynamicFrame, mapping, add_collum):
    df_communication = dynamicFrame.toDF()
    df_communication = df_communication.dropDuplicates()
    for k, v in add_collum.items():
        df_communication = df_communication.withColumn(k, v)
    dyf_communication = DynamicFrame.fromDF(df_communication, glueContext, 'dyf_communication')
    applymapping2 = ApplyMapping.apply(frame=dyf_communication, mappings=mapping)
    resolvechoice2 = ResolveChoice.apply(frame=applymapping2,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dyf_communication_after_mapping = DropNullFields.apply(frame=resolvechoice2,
                                                           transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping
def mappingForAll(dynamicFrame, mapping):
    df = dynamicFrame.toDF()
    df = df.dropDuplicates()
    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    print("-------------------------------------------------")
    dyf.printSchema()
    print(mapping)
    applymapping2 = ApplyMapping.apply(frame=dyf, mappings=mapping)
    resolvechoice2 = ResolveChoice.apply(frame=applymapping2,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dyf_mapping = DropNullFields.apply(frame=resolvechoice2,
                                       transformation_ctx="dropnullfields2")
    return dyf_mapping
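# Usage sketch: ApplyMapping mappings are 4-tuples of
# (source_name, source_type, target_name, target_type). The names below are
# illustrative, not taken from this listing:
EXAMPLE_MAPPING = [
    ("Id", "string", "user_id", "string"),
    ("student_behavior_date", "long", "student_behavior_date", "long"),
]
# dyf_mapped = mappingForAll(dyf_source, EXAMPLE_MAPPING)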
def process_paintings(paintings: DynamicFrame) -> DynamicFrame:
    # Cast all "bit" fields (LongTypes) into booleans.
    # It's easier to list the non-bit fields, as the majority of imported fields are bit fields.
    non_bit_fields = ["episode", "title"]
    bit_fields_specs = [
        (field.name, "cast:boolean")
        for field in paintings.schema()
        if field.name not in non_bit_fields and field.dataType.typeName() == 'long'
    ]
    if not bit_fields_specs:
        return paintings
    paintings_with_bool_fields = ResolveChoice.apply(
        paintings,
        specs=bit_fields_specs,
        transformation_ctx="paintings_with_bool_fields")
    paintings_with_parsed_episodes = Map.apply(
        frame=paintings_with_bool_fields,
        f=parse_episode,
        transformation_ctx="paintings_with_parsed_episodes")
    return paintings_with_parsed_episodes
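# NOTE: `parse_episode` is not defined in this listing. A minimal sketch, assuming each
# record carries an `episode` code like "S01E02" (Map.apply passes dict-like records);
# the `season` and `episode_number` field names are illustrative:
import re

def parse_episode(record):
    match = re.match(r"S(\d+)E(\d+)", str(record["episode"]))
    if match:
        record["season"] = int(match.group(1))
        record["episode_number"] = int(match.group(2))
    return record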
def etl_advisor():
    dyf_advisor_account = connectGlue(database="tig_advisor",
                                      table_name="advisor_account",
                                      select_fields=["user_id", "user_name", "user_display_name",
                                                     "user_email", "level", "type_manager"],
                                      fillter=["user_id", "type_manager"])
    dyf_advisor_account.show(10)
    # ------------------------------------------------------------------------------------------------------------------#
    number_advisor_account = dyf_advisor_account.count()
    if is_dev:
        print("number_advisor_account: ", number_advisor_account)
    if number_advisor_account < 1:
        return
    print_is_dev(
        "etl_advisor_profile___****************************************************************************************"
    )
    # ETL for profile
    myUdf = f.udf(myFunc, StringType())
    df_advisor_account = dyf_advisor_account.toDF()
    df_advisor_account = df_advisor_account.groupBy("user_id") \
        .agg(f.collect_list("user_name").alias("user_name"),
             f.collect_list("user_email").alias("user_email"),
             f.first("level").alias("level"),
             f.first("type_manager").alias("type_manager"),
             f.collect_list("user_display_name").alias("user_display_name")) \
        .withColumn("user_name", myUdf("user_name")) \
        .withColumn("user_email", myUdf("user_email")) \
        .withColumn("user_display_name", myUdf("user_display_name"))
    dyf_advisor_account = DynamicFrame.fromDF(df_advisor_account, glueContext,
                                              "dyf_advisor_account")
    dyf_advisor_account_mapping = mappingForAll(dyf_advisor_account, mapping=ADVISOR_MAPPING)
    dyf_advisor_account_mapping.printSchema()
    dyf_advisor_account_mapping.show(10)
def processBatch(data_frame, batchId):
    if data_frame.count() > 0:
        datasource0 = DynamicFrame.fromDF(data_frame, glueContext, "from_data_frame")
        now = datetime.datetime.now()
        year = now.year
        month = now.month
        day = now.day
        hour = now.hour
        minute = now.minute
        path_datasink1 = (
            f"s3://{args['datalake_bkt_name']}/{args['datalake_bkt_prefix']}"
            + "/ingest_year=" + "{:0>4}".format(str(year))
            + "/ingest_month=" + "{:0>2}".format(str(month))
            + "/ingest_day=" + "{:0>2}".format(str(day))
            + "/ingest_hour=" + "{:0>2}".format(str(hour))
            + "/")
        datasink1 = glueContext.write_dynamic_frame.from_options(
            frame=datasource0,
            connection_type="s3",
            connection_options={"path": path_datasink1},
            format="parquet",
            transformation_ctx="datasink1")
        logger.info('{"batch_process_successful":True}')
def processBatch(data_frame, batchId):
    now = datetime.datetime.now()
    year = now.year
    month = now.month
    day = now.day
    hour = now.hour
    minute = now.minute
    if data_frame.count() > 0:
        dynamic_frame = DynamicFrame.fromDF(data_frame, glueContext, "from_data_frame")
        apply_mapping = ApplyMapping.apply(
            frame=dynamic_frame,
            mappings=[
                ("ventilatorid", "long", "ventilatorid", "long"),
                ("eventtime", "string", "eventtime", "timestamp"),
                ("serialnumber", "string", "serialnumber", "string"),
                ("pressurecontrol", "long", "pressurecontrol", "long"),
                ("o2stats", "long", "o2stats", "long"),
                ("minutevolume", "long", "minutevolume", "long"),
                ("manufacturer", "string", "manufacturer", "string"),
            ],
            transformation_ctx="apply_mapping",
        )
        dynamic_frame.printSchema()
        # Write to S3 Sink
        s3path = (s3_target
                  + "/ingest_year=" + "{:0>4}".format(str(year))
                  + "/ingest_month=" + "{:0>2}".format(str(month))
                  + "/ingest_day=" + "{:0>2}".format(str(day))
                  + "/ingest_hour=" + "{:0>2}".format(str(hour))
                  + "/")
        s3sink = glueContext.write_dynamic_frame.from_options(
            frame=apply_mapping,
            connection_type="s3",
            connection_options={"path": s3path},
            format="parquet",
            transformation_ctx="s3sink",
        )
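# Usage sketch: in a Glue streaming job, a processBatch function like the ones above is
# typically wired up with GlueContext.forEachBatch. The source frame and option values
# below are illustrative, not taken from this listing:
glueContext.forEachBatch(
    frame=streaming_data_frame,  # e.g. from glueContext.create_data_frame.from_catalog(...)
    batch_function=processBatch,
    options={"windowSize": "100 seconds", "checkpointLocation": checkpoint_path})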
def etl_ktkt():
    dyf_technical_test = connectGlue(database="technical_test",
                                     table_name="student_technical_test",
                                     select_fields=["_key", "trinhdohocvien", "studentid",
                                                    "thoigianhenktkt", "ketluan"],
                                     fillter=["studentid", "thoigianhenktkt"],
                                     duplicates=["trinhdohocvien", "studentid",
                                                 "thoigianhenktkt", "ketluan"])
    dyf_technical_test = dyf_technical_test.resolveChoice(
        specs=[("_key", "cast:long"), ("studentid", "cast:string")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_ktkt.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_technical_test = Filter.apply(frame=dyf_technical_test,
                                          f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    if dyf_technical_test.count() > 0:
        df_technical_test = dyf_technical_test.toDF()
        flag = df_technical_test.agg({"_key": "max"}).collect()[0][0]
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["contact_id", "student_id", "user_name"],
            fillter=["contact_id", "student_id"],
            duplicates=["contact_id", "student_id"])
        df_student_contact = dyf_student_contact.toDF()
        df_technical_test = df_technical_test.withColumn(
            "date",
            f.unix_timestamp(df_technical_test.thoigianhenktkt, "yyyy-MM-dd HH:mm:ss"))
        dyf_technical_test = DynamicFrame.fromDF(df_technical_test, glueContext,
                                                 "dyf_technical_test")
        dyf_technical_test = dyf_technical_test.resolveChoice(specs=[("date", "cast:long")])
        df_technical_test = dyf_technical_test.toDF()
        df_technical_test_min = df_technical_test.select("trinhdohocvien", "studentid", "date")
        df_technical_test_min = df_technical_test_min.groupBy("studentid", "trinhdohocvien").agg(
            f.min(df_technical_test_min.date).alias("student_behavior_date"))
        df_join_min = df_student_contact.join(
            df_technical_test_min,
            df_technical_test_min["studentid"] == df_student_contact["student_id"])
        df_select_min = df_join_min.select(
            "contact_id", "student_id", "student_behavior_date",
            df_join_min.trinhdohocvien.alias("student_level_code"))
        # -----------------------------------------------------------------------------------------------------------------#
        df_technical_test_pass = df_technical_test.where(df_technical_test.ketluan == "Pass")
        df_technical_test_pass = df_technical_test_pass.groupBy("studentid", "trinhdohocvien").agg(
            f.min(df_technical_test_pass.date).alias("student_behavior_date"))
        df_join_pass = df_student_contact.join(
            df_technical_test_pass,
            df_technical_test_pass["studentid"] == df_student_contact["student_id"])
        df_select_pass = df_join_pass.select(
            "contact_id", "student_id", "student_behavior_date",
            df_join_pass.trinhdohocvien.alias("student_level_code"))
        prinDev(df=df_select_pass, df_name="pass")
        prinDev(df=df_select_min, df_name="min")
        df_join_level_code_min = set_package_advisor_level(df_select_min)
        for k, v in ADD_COLLUM_HEN_KTKT.items():
            df_join_level_code_min = df_join_level_code_min.withColumn(k, v)
        prinDev(df_join_level_code_min, "end data min")
        convertAndSaveS3(df_join_level_code_min)
        # ----------------------------------------------------------------------------------------------------------------
        df_join_level_code_pass = set_package_advisor_level(df_select_pass)
        for k, v in ADD_COLLUM_KTKT_TC.items():
            df_join_level_code_pass = df_join_level_code_pass.withColumn(k, v)
        prinDev(df_join_level_code_pass, "end data pass")
        convertAndSaveS3(df_join_level_code_pass)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag back to S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_ktkt.parquet",
            mode="overwrite")
def smile_care_rating():
    dyf_smile_care_key = connectGlue(database="native_smile",
                                     table_name="ticket_log_5450ed3d8cb5a34974310b6b26e451fa",
                                     select_fields=["_key", "requester_email", "satisfaction",
                                                    "satisfaction_at", "created_at"],
                                     fillter=['requester_email']
                                     ).rename_field("satisfaction", "value_rating")
    dyf_smile_care_key = Filter.apply(frame=dyf_smile_care_key,
                                      f=lambda x: x["value_rating"] in satisfaction)
    dyf_smile_care_key = dyf_smile_care_key.resolveChoice(specs=[("_key", "cast:long")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_smile_care_rating.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_smile_care_key = Filter.apply(frame=dyf_smile_care_key,
                                          f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    if dyf_smile_care_key.count() > 0:
        df_smile_care_key = dyf_smile_care_key.toDF()
        flag = df_smile_care_key.agg({"_key": "max"}).collect()[0][0]
        df_smile_care = df_smile_care_key \
            .withColumn("student_behavior_date",
                        f.unix_timestamp(df_smile_care_key.created_at, "yyyy-MM-dd HH:mm:ss"))
        dyf_smile_care = DynamicFrame.fromDF(df_smile_care, glueContext, "dyf_smile_care")
        dyf_smile_care = dyf_smile_care.resolveChoice(
            specs=[("student_behavior_date", "cast:int")])
        dyf_smile_care = dyf_smile_care.select_fields(
            ["_key", "requester_email", "value_rating", "satisfaction_at",
             "student_behavior_date"])
        df_smile_care = dyf_smile_care.toDF()
        dyf_student_contact_email = connectGlue(database="tig_advisor",
                                                table_name="student_contact_email",
                                                select_fields=["email", "contact_id", "user_id"])
        dyf_student_contact_ = connectGlue(database="tig_advisor",
                                           table_name="student_contact",
                                           select_fields=["student_id", "contact_id"]) \
            .rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact_.toDF()
        df_student_contact_email = dyf_student_contact_email.toDF()
        df_smile_care = df_smile_care.join(
            df_student_contact_email,
            (df_smile_care["requester_email"] == df_student_contact_email["email"]))
        df_smile_care = df_smile_care.join(
            df_student_contact,
            (df_smile_care["contact_id"] == df_student_contact["contact_id_contact"]))
        # drop() returns a new DataFrame, so the result must be reassigned
        df_smile_care = df_smile_care.drop("email", "requester_email", "contact_id_contact")
        df_smile_care = df_smile_care.withColumn("rating_type", f.lit("rating_native_smile_caresoft")) \
            .withColumn("comment", f.lit("")) \
            .withColumn("rating_about", f.lit(None)) \
            .withColumn("number_rating", f.lit(1)) \
            .withColumn("behavior_id", f.lit(26)) \
            .withColumn("transformed_at", f.lit(d4))
        df_smile_care = set_package_advisor_level(df_smile_care)
        convertAndSaveS3(df_smile_care)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag back to S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_smile_care_rating.parquet",
                         mode="overwrite")
def phone_rating():
    dyf_advisorcall_key = connectGlue(database="callcenter",
                                      table_name="advisorcall",
                                      select_fields=["_key", "calldate", "phonenumber", "rating",
                                                     "device", "hanguptvts", "status"])
    dyf_advisorcall_key = Filter.apply(
        frame=dyf_advisorcall_key,
        f=lambda x: x["rating"] in ratings and x["device"] == "3CX"
        and x["hanguptvts"] == 1 and x["status"] == "ANSWER")
    dyf_advisorcall_key = dyf_advisorcall_key.resolveChoice(specs=[("_key", "cast:long")])
    try:
        df_flag_phone_rating = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_phone_rating.parquet")
        max_key = df_flag_phone_rating.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_advisorcall_key = Filter.apply(frame=dyf_advisorcall_key,
                                           f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    if dyf_advisorcall_key.count() > 0:
        df_advisorcall_key = dyf_advisorcall_key.toDF()
        flag = df_advisorcall_key.agg({"_key": "max"}).collect()[0][0]
        df_advisorcall = df_advisorcall_key \
            .withColumn("student_behavior_date",
                        f.unix_timestamp(df_advisorcall_key.calldate, "yyyy-MM-dd HH:mm:ss"))
        dyf_advisorcall = DynamicFrame.fromDF(df_advisorcall, glueContext, "dyf_advisorcall")
        dyf_advisorcall = dyf_advisorcall.resolveChoice(
            specs=[("student_behavior_date", "cast:int")])
        dyf_advisorcall = dyf_advisorcall.select_fields(
            ["_key", "student_behavior_date", "phonenumber", "rating", "device",
             "hanguptvts", "status"]) \
            .rename_field("rating", "value_rating")
        df_advisorcall = dyf_advisorcall.toDF()
        dyf_student_contact_phone = connectGlue(database="tig_advisor",
                                                table_name="student_contact_phone",
                                                select_fields=["phone", "contact_id", "user_id"])
        dyf_student_contact = connectGlue(database="tig_advisor",
                                          table_name="student_contact",
                                          select_fields=["student_id", "contact_id"]) \
            .rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_student_contact_phone = dyf_student_contact_phone.toDF()
        df_advisorcall = df_advisorcall.join(
            df_student_contact_phone,
            (df_advisorcall["phonenumber"] == df_student_contact_phone["phone"]))
        df_advisorcall = df_advisorcall.join(
            df_student_contact,
            (df_advisorcall["contact_id"] == df_student_contact["contact_id_contact"]))
        df_advisorcall = df_advisorcall.drop("phonenumber", "phone", "contact_id_contact")
        df_advisorcall = df_advisorcall.withColumn("comment", f.lit("")) \
            .withColumn("rating_about", f.lit(None)) \
            .withColumn("rating_type", f.lit("rating_hotline")) \
            .withColumn("number_rating", f.lit(1))
        df_rating_phone = df_advisorcall \
            .withColumn("behavior_id", f.lit(25)) \
            .withColumn("transformed_at", f.lit(d4))
        df_rating_phone = set_package_advisor_level(df_rating_phone)
        convertAndSaveS3(df_rating_phone)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag back to S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_phone_rating.parquet",
                         mode="overwrite")
def etl_hoc_vien_duoc_chao_mung():
    dyf_av_care_call = connectGlue(
        database="tig_advisor",
        table_name="care_call",
        select_fields=["_key", "phone", "duration", "call_status", "time_created"],
        fillter=["phone", "duration", "call_status"],
        duplicates=["phone", "call_status", "time_created"],
    ).rename_field("phone", "phone1")
    dyf_av_care_call = Filter.apply(
        frame=dyf_av_care_call,
        f=lambda x: x["call_status"] in ("success", "call_success") and x["duration"] > 30)
    dyf_av_care_call = dyf_av_care_call.resolveChoice(specs=[("_key", "cast:long")])
    try:
        df_flag_1 = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_chao_mung_hoc_vien.parquet")
        max_key = df_flag_1.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_av_care_call = Filter.apply(frame=dyf_av_care_call,
                                        f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    if dyf_av_care_call.count() > 0:
        df_av_care_call = dyf_av_care_call.toDF()
        flag = df_av_care_call.agg({"_key": "max"}).collect()[0][0]
        dyf_student_contact_phone = connectGlue(
            database="tig_advisor",
            table_name="student_contact_phone",
            select_fields=["contact_id", "phone"],
            fillter=["contact_id", "phone"],
            duplicates=["contact_id", "phone"],
        )
        df_av_care_call = df_av_care_call \
            .withColumn("student_behavior_date",
                        f.unix_timestamp(df_av_care_call.time_created, "yyyy-MM-dd HH:mm:ss"))
        dyf_av_care_call = DynamicFrame.fromDF(df_av_care_call, glueContext, "dyf_av_care_call")
        dyf_av_care_call = dyf_av_care_call.resolveChoice(
            specs=[("student_behavior_date", "cast:int")])
        df_av_care_call = dyf_av_care_call.toDF()
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id", "contact_id"]).rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_student_contact_phone = dyf_student_contact_phone.toDF()
        df_student_care = df_av_care_call.join(
            df_student_contact_phone,
            (df_av_care_call["phone1"] == df_student_contact_phone["phone"]),
            "left")
        df_student_care = df_student_care.drop("phone1", "phone")
        # -----------------------------------------------------------------------------------------------------------------#
        df_student_care = df_student_care.join(
            df_student_contact,
            (df_student_care["contact_id"] == df_student_contact["contact_id_contact"]),
            "left")
        df_student_care = df_student_care.drop("contact_id_contact")
        df_package_code = khoa_hoc()
        df_behavior_join_2 = df_student_care.join(
            df_package_code,
            df_student_care["contact_id"] == df_package_code["crm_id"])
        df_behavior_join_2 = df_behavior_join_2.drop("crm_id")
        df_behavior_join_2 = set_package_advisor_level(df_behavior_join_2)
        for k, v in ADD_COLLUM.items():
            df_behavior_join_2 = df_behavior_join_2.withColumn(k, v)
        prinDev(df_behavior_join_2, "end_code")
        # (student_behavior_date, behavior_id, student_id, user_id, contact_id,
        #  package_code, student_level_code, package_status_code, transformed)
        convertAndSaveS3(df_behavior_join_2)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag back to S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_chao_mung_hoc_vien.parquet",
            mode="overwrite")
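# The try/except flag pattern above repeats across several of these jobs: read the last
# processed max `_key` from a one-row parquet "flag" file, filter new records past it,
# and overwrite the flag afterwards. A sketch of that checkpoint pattern as helpers
# (the function names are illustrative, not from this listing):
def read_flag(flag_path):
    # Returns the stored max _key, or None when no flag file exists yet.
    try:
        return spark.read.parquet(flag_path).collect()[0]["flag"]
    except Exception:
        print("read flag file error")
        return None


def write_flag(flag_path, flag):
    # Overwrite the one-row flag file; the jobs above use the s3a scheme for this write.
    spark.createDataFrame([flag], "long").toDF("flag") \
        .write.parquet(flag_path, mode="overwrite")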
def display_dynamic_frame_schema(dynamic_frame: DynamicFrame):
    dynamic_frame.printSchema()


def display_dynamic_frame_data(dynamic_frame: DynamicFrame):
    dynamic_frame.show()
def main():
    # ----------test------------------
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    # int() replaces the Python 2 long() builtin used originally
    today_id = int(today.strftime("%Y%m%d"))
    print('today_id: ', today_id)
    start_date = today - timedelta(5)
    start_date_focus = int(start_date.strftime("%Y%m%d"))
    end_date_focus = today_id
    df_student_contact = get_df_student_contact()
    df_student_contact.cache()
    if is_dev:
        print('df_student_contact')
        df_student_contact.printSchema()
        df_student_contact.show(3)
    if is_checking:
        df_student_contact_test = df_student_contact.filter(
            f.col('contact_id') == checking_contact_id)
        print('is_checking::df_student_contact_test')
        df_student_contact_test.show(30)
    df_collect_lssclt = etl_collect_lssclt(start_date_focus, end_date_focus, df_student_contact)
    if is_dev:
        df_collect_lssclt.cache()
        print('df_collect_lssclt')
        df_collect_lssclt.printSchema()
        df_collect_lssclt.show(3)
    if is_checking:
        df_collect_lssclt_test = df_collect_lssclt.filter(
            (f.col('contact_id') == checking_contact_id)
            & (f.col('date_id') == checking_date_id))
        print('is_checking::df_collect_lssclt_test')
        df_collect_lssclt_test.show(30)
    df_collect_ai_stater = etl_ai_stater(start_date_focus, end_date_focus, df_student_contact)
    if is_dev:
        df_collect_ai_stater.cache()
        print('df_collect_ai_stater')
        df_collect_ai_stater.printSchema()
        df_collect_ai_stater.show(3)
    if is_checking:
        df_collect_ai_stater_test = df_collect_ai_stater.filter(
            (f.col('contact_id') == checking_contact_id)
            & (f.col('date_id') == checking_date_id))
        print('is_checking::df_collect_ai_stater_test')
        df_collect_ai_stater_test.show(30)
    df_le2_history = etl_collet_le2(start_date_focus, end_date_focus)
    if is_checking:
        df_le2_history_test = df_le2_history.filter(
            (f.col('contact_id') == checking_contact_id)
            & (f.col('date_id') == checking_date_id))
        print('is_checking::df_le2_history_test')
        df_le2_history_test.show(30)
    if is_dev:
        print('df_le2_history')
        df_le2_history.printSchema()
        df_le2_history.show(3)
    df_le_total = df_collect_lssclt \
        .join(other=df_collect_ai_stater, on=['contact_id', 'date_id'], how='outer') \
        .join(other=df_le2_history, on=['contact_id', 'date_id'], how='outer') \
        .join(df_student_contact, on=['contact_id'], how='inner')
    df_le_total = df_le_total.withColumn("transformed_at", f.lit(d4))
    df_le_total = df_le_total.fillna(0)
    if is_checking:
        df_le_total_test = df_le_total.filter(
            (f.col('contact_id') == checking_contact_id)
            & (f.col('date_id') == checking_date_id))
        print('is_checking::df_le_total_test')
        df_le_total_test.show(30)
    if is_dev:
        print('df_le_total')
        df_le_total.printSchema()
        df_le_total.show(3)
    dyf = DynamicFrame.fromDF(df_le_total, glueContext, "dyf")
    preactions = ("DELETE student_detail.student_behavior_aggregation_day_v1 "
                  "where date_id >= " + str(start_date_focus))
    glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dyf,
        catalog_connection="glue_redshift",
        connection_options={
            "preactions": preactions,
            "dbtable": "student_detail.student_behavior_aggregation_day_v1",
            "database": "student_native_report"
        },
        redshift_tmp_dir="s3n://dts-odin/temp/bc200/student_detail/student_behavior_aggregation_day_v1",
        transformation_ctx="datasink4")
    df_student_contact.unpersist()
    if is_dev:
        df_collect_lssclt.unpersist()
        df_collect_ai_stater.unpersist()
        df_le2_history.unpersist()
def etl_student_advisor():
    print_is_dev(
        "etl_student_advisor___****************************************************************************************"
    )
    # ETL for advisor history
    # dyf_student_advisor = connectGlue(database="tig_advisor", table_name="log_change_assignment_advisor",
    #                                   select_fields=["id", "contact_id", "advisor_id_old", "advisor_id_new", "created_at", "updated_at"],
    #                                   fillter=["id"],
    #                                   duplicates=["id"])
    # ------------------------------------------------------------------------------------------------------------------#
    df_student_advisor = get_dyf_student_advisor(glueContext)
    number_student_advisor = df_student_advisor.count()
    if is_dev:
        print("number_student_advisor: ", number_student_advisor)
        df_student_advisor.printSchema()
        df_student_advisor.show(3)
    if number_student_advisor < 1:
        return
    # ------------------------------------------------------------------------------------------------------------------#
    # process contact_id with advisor from log_change_assignment_advisor and student_contact
    df_contact_id_list = df_student_advisor.select('contact_id')
    df_contact_id_list = df_contact_id_list.dropDuplicates(['contact_id'])
    df_change_advisor_new = get_df_change_advisor_new(df_student_advisor)
    # df_change_advisor_init = get_df_change_advisor_init(df_student_advisor)
    # df_advisor_from_student_contact = get_advisor_from_student_contact(glueContext, df_contact_id_list)
    df_change_advisor_total = df_change_advisor_new
    # .union(df_change_advisor_init)
    # .union(df_advisor_from_student_contact)
    if is_dev:
        print('df_change_advisor_total')
        df_change_advisor_total.printSchema()
        df_change_advisor_total.show(3)
    # -------------------------------------------------------------------------------------------------------------------#
    df_change_advisor_total = df_change_advisor_total \
        .withColumn('created_at', f.unix_timestamp('created_at', format='yyyy-MM-dd HH:mm:ss'))
    df_change_advisor_total = df_change_advisor_total.orderBy("contact_id", "created_at")
    if is_dev:
        print('df_change_advisor_total_after_order')
        df_change_advisor_total.printSchema()
        df_change_advisor_total.show(3)
    df_change_advisor_total = df_change_advisor_total.select(
        "contact_id",
        f.struct('advisor_id', 'created_at').alias('end_start'))
    print('df_change_advisor_total')
    df_change_advisor_total.printSchema()
    df_change_advisor_total.show(3)
    df_change_advisor_group_contact = df_change_advisor_total.groupBy("contact_id") \
        .agg(f.collect_list("end_start").alias("l_created_at"))
    print('df_change_advisor_group_contact_______________')
    df_change_advisor_group_contact.printSchema()
    df_change_advisor_group_contact.show(3)
    df_change_advisor_group_contact = df_change_advisor_group_contact \
        .withColumn("l_created_at", transpose_start_end_udf("l_created_at"))
    df_change_advisor_group_contact.printSchema()
    df_change_advisor_explore = df_change_advisor_group_contact \
        .select("contact_id", f.explode("l_created_at").alias("start_end_transpose"))
    df_change_advisor_explore.printSchema()
    df_change_advisor_explore = df_change_advisor_explore.select(
        "contact_id",
        f.col("start_end_transpose").getItem("advisor_id").alias("advisor_id"),
        f.col("start_end_transpose").getItem("start").alias("start"),
        f.col("start_end_transpose").getItem("end").alias("end"))
    df_change_advisor_explore.printSchema()
    df_change_advisor_explore = addCollum(df_change_advisor_explore, STUDENT_ADVISOR_ADD_COL)
    dyf_change_advisor_explore = DynamicFrame.fromDF(
        df_change_advisor_explore, glueContext, "dyf_change_advisor_explore")
    # dyf_change_advisor_explore = mappingForAll(dyf_change_advisor_explore, mapping=STUDENT_ADVISOR_MAPPING)
    print('dyf_change_advisor_explore')
    dyf_change_advisor_explore.printSchema()
    dyf_change_advisor_explore.show(10)
    ######## save to s3 ######
    # parquetToS3(dyf_student_advisor_mapping, path="s3://dtsodin/student_behavior/advisor_thangvm/advisor_history")
    ############################################################
    # ------------------------------------------------------------------------------------------------------------------#
    # Target schema:
    # | -- student_advisor_id: string
    # | -- contact_id: string
    # | -- advisor_id: string
    # | -- start_date: string
    # | -- end_date: string
    # | -- created_at: int
    # | -- updated_at: string
    apply_output = ApplyMapping.apply(
        frame=dyf_change_advisor_explore,
        mappings=[("contact_id", "string", "contact_id", "string"),
                  ("advisor_id", "string", "advisor_id", "long"),
                  ("start", "string", "start_date", "long"),
                  ("end", "string", "end_date", "long"),
                  ("created_at", "int", "created_at", "long")])
    # ResolveChoice must run so dyf_output is defined for the Redshift sink below
    dyf_output = ResolveChoice.apply(frame=apply_output,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")
    glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dyf_output,
        catalog_connection="glue_redshift",
        connection_options={
            # "preactions": """TRUNCATE TABLE ad_student_advisor""",
            "dbtable": "ad_student_advisor",
            "database": "transaction_log"
        },
        redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/user_advisor/ad_student_package",
        transformation_ctx="datasink4")
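# NOTE: `transpose_start_end_udf` is not defined in this listing. A minimal sketch,
# assuming it turns each contact's time-ordered list of (advisor_id, created_at)
# structs into (advisor_id, start, end) intervals where each end is the next change's
# start and the last interval is open-ended:
from pyspark.sql.types import ArrayType, LongType, StringType, StructField, StructType

def transpose_start_end(changes):
    intervals = []
    for i, change in enumerate(changes):
        end = changes[i + 1]['created_at'] if i + 1 < len(changes) else None
        intervals.append((change['advisor_id'], change['created_at'], end))
    return intervals

transpose_start_end_udf = f.udf(transpose_start_end, ArrayType(StructType([
    StructField('advisor_id', StringType()),
    StructField('start', LongType()),
    StructField('end', LongType()),
])))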
def etl_user_communication(dyf_crm_concats):
    print_is_dev('etl_user_user_communication___****************************************************************************')
    # Cache all data (extra data)
    # ----------------------------------------------------------------------------------------------------------------#
    dyf_adv_phones = connectGlue(database='tig_advisor',
                                 table_name='student_contact_phone',
                                 select_fields=["contact_id", "phone", "deleted"],
                                 fillter=['contact_id', 'phone'])
    dyf_adv_phones = RenameField.apply(dyf_adv_phones, "deleted", "is_deleted")
    count_adv_phones = dyf_adv_phones.count()
    if count_adv_phones < 1:
        return
    # ----------------------------------------------------------------------------------------------------------------#
    dyf_adv_advisor = connectGlue(database='tig_advisor',
                                  table_name='student_contact',
                                  select_fields=["contact_id", "student_id"],
                                  fillter=['contact_id', 'student_id'])
    count_adv_advisor = dyf_adv_advisor.count()
    if count_adv_advisor < 1:
        return
    # ----------------------------------------------------------------------------------------------------------------#
    dyf_toa_enterprise = connectGlue(database='dm_toa',
                                     table_name='toa_mapping_student_enterprise',
                                     select_fields=["lms_id", "enterprise_id"],
                                     fillter=['lms_id', 'enterprise_id'])
    count_toa_enterprise = dyf_toa_enterprise.count()
    if count_toa_enterprise < 1:
        return
    dyf_toa_enterprise.printSchema()
    # --------------------------------------------------------------------------------------------------------------#
    # Start transform
    print_is_dev("id and code----------------------------------------------------")
    print_is_dev(dyf_crm_concats.count())
    df_crm_concats = dyf_crm_concats.toDF()
    df_concat_enterprise_clear = df_crm_concats.dropDuplicates(['Id', 'Code'])
    dyf_concat_enterprise_clear = DynamicFrame.fromDF(df_concat_enterprise_clear, glueContext,
                                                      'dyf_concat_enterprise_clear')
    print_is_dev(dyf_concat_enterprise_clear.count())
    dyf_toa_enterprise = dyf_toa_enterprise.resolveChoice(specs=[('enterprise_id', 'cast:string')])
    dyf_jon_contact_enterprise = Join.apply(dyf_concat_enterprise_clear, dyf_adv_advisor, 'Code', 'contact_id')
    dyf_jon_contact_enterprise = Join.apply(dyf_toa_enterprise, dyf_jon_contact_enterprise, 'lms_id', 'student_id')
    dyf_jon_contact_enterprise = RenameField.apply(dyf_jon_contact_enterprise, "enterprise_id", "comunication")
    comunication_add_collum['communication_type'] = f.lit(8)
    dyf_concat_enterprise = mappingForAll(dyf_jon_contact_enterprise, comunication, comunication_add_collum)
    if is_dev:
        print('dyf_concat_enterprise')
        dyf_concat_enterprise.printSchema()
        dyf_concat_enterprise.show(10)
    ######## save to s3 ######
    # parquetToS3(dyf_concat_enterprise, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_enterprise)
    ##########################################
    # ETL for phone comunication
    dyf_jon_contact_phones = Join.apply(dyf_concat_enterprise_clear, dyf_adv_phones, 'Code', 'contact_id')
    dyf_jon_contact_phones = RenameField.apply(dyf_jon_contact_phones, "phone", "comunication")
    comunication_add_collum['communication_type'] = f.lit(1)
    dyf_concat_phone = mappingForAll(dyf_jon_contact_phones, comunication, comunication_add_collum_for_phone)
    if is_dev:
        print('dyf_concat_phone')
        dyf_concat_phone.printSchema()
        dyf_concat_phone.show(10)
    ######## save to s3 ######
    # parquetToS3(dyf_concat_phone, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_phone)
    ##########################################
    # ETL for username comunication
    dyf_crm_concat_name = fillterOutNull(dyf_crm_concats, ['Fullname'])
    df_crm_concat_name = dyf_crm_concat_name.toDF()
    df_concat_name_clear = df_crm_concat_name.dropDuplicates(['Id', 'Code', 'Fullname'])
    dyf_concat_name_clear = DynamicFrame.fromDF(df_concat_name_clear, glueContext, 'dyf_concat_name_clear')
    dyf_crm_concat_name = RenameField.apply(dyf_concat_name_clear, "Fullname", "comunication")
    # set comunication type
    comunication_add_collum['communication_type'] = f.lit(4)
    dyf_concat_name = mappingForAll(dyf_crm_concat_name, comunication, comunication_add_collum)
    if is_dev:
        print('dyf_concat_name')
        dyf_concat_name.printSchema()
        dyf_concat_name.show(10)
    ######## save to s3 ######
    # parquetToS3(dyf_concat_name, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_name)
    ##########################################
    # ETL for email comunication
    dyf_crm_concat_emails = fillterOutNull(dyf_crm_concats, ['Email'])
    df_crm_concat_emails = dyf_crm_concat_emails.toDF()
    df_crm_concat_emails = df_crm_concat_emails.dropDuplicates(['Id', 'Code', 'Email'])
    dyf_crm_concat_emails = DynamicFrame.fromDF(df_crm_concat_emails, glueContext, 'dyf_crm_concat_emails')
    dyf_crm_concat_email = RenameField.apply(dyf_crm_concat_emails, "Email", "comunication")
    comunication_add_collum['communication_type'] = f.lit(2)
    dyf_concat_email = mappingForAll(dyf_crm_concat_email, comunication, comunication_add_collum)
    if is_dev:
        print('dyf_concat_email')
        dyf_concat_email.printSchema()
        dyf_concat_email.show(10)
    ######## save to s3 ######
    # parquetToS3(dyf_concat_email, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_email)
    ##########################################
    # ETL for email2 comunication
    dyf_crm_concat_email_2 = fillterOutNull(dyf_crm_concats, ['Email2'])
    df_crm_concat_email_2 = dyf_crm_concat_email_2.toDF()
    df_crm_concat_email_2 = df_crm_concat_email_2.dropDuplicates(['Id', 'Code', 'Email2'])
    dyf_crm_concat_email_2 = DynamicFrame.fromDF(df_crm_concat_email_2, glueContext, 'dyf_crm_concat_email_2')
    # rename the Email2 field (the original renamed "Email", which does not exist here)
    dyf_crm_concat_email_2 = RenameField.apply(dyf_crm_concat_email_2, "Email2", "comunication")
    comunication_add_collum['is_primary'] = f.lit(0)
    dyf_concat_email2 = mappingForAll(dyf_crm_concat_email_2, comunication, comunication_add_collum)
    if is_dev:
        print('dyf_concat_email2')
        dyf_concat_email2.printSchema()
        dyf_concat_email2.show(10)
    ######## save to s3 ######
    # parquetToS3(dyf_concat_email2, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_email2)
    ##########################################
    # ETL for address comunication
    dyf_crm_concat_address = fillterOutNull(dyf_crm_concats, ['Address'])
    df_crm_concat_address = dyf_crm_concat_address.toDF()
    df_crm_concat_address = df_crm_concat_address.dropDuplicates(['Id', 'Code', 'Address'])
    dyf_crm_concat_address = DynamicFrame.fromDF(df_crm_concat_address, glueContext, 'dyf_crm_concat_address')
    dyf_crm_concat_address = RenameField.apply(dyf_crm_concat_address, "Address", "comunication")
    comunication_add_collum['is_primary'] = f.lit(1)
    comunication_add_collum['communication_type'] = f.lit(6)
    dyf_concat_address = mappingForAll(dyf_crm_concat_address, comunication, comunication_add_collum)
    if is_dev:
        print('dyf_concat_address')
        dyf_concat_address.printSchema()
        dyf_concat_address.show(10)
    ######## save to s3 ######
    # parquetToS3(dyf_concat_address, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_address)
def etl_user_map(dyf_crm_concats):
    print_is_dev('etl_user_map___*******************************************************************************************')
    # ETL for mapper
    df_crm_concats = dyf_crm_concats.toDF()
    df_crm_concat_clear = df_crm_concats.dropDuplicates(['Id', 'Code'])
    dyf_crm_concat_clear = DynamicFrame.fromDF(df_crm_concat_clear, glueContext, 'dyf_crm_concat_clear')
    dyf_crm_concat_map = RenameField.apply(dyf_crm_concat_clear, "Code", "source_id")
    dyf_crm_concat_map = mappingForAll(dyf_crm_concat_map, mapper, mapper_add_collum)
    if is_dev:
        print('dyf_crm_concat_map')
        dyf_crm_concat_map.printSchema()
        dyf_crm_concat_map.show(10)
    ######## save to s3 ######
    # parquetToS3(dyf_crm_concat_map, path="s3://dtsodin/student_behavior/up/user_mapper")
    glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dyf_crm_concat_map,
        catalog_connection="glue_redshift",
        connection_options={
            "dbtable": "up_user_map",
            "database": "transaction_log"
        },
        redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/user_profile/up_user_map",
        transformation_ctx="datasink4")
    ############################################################
    print_is_dev('etl_user_map___********************************************************************************************')
    # ---------------------------------------------------------------------------------------------------------------#
    dyf_adv_student = connectGlue(database='tig_advisor',
                                  table_name='student_contact',
                                  select_fields=["contact_id", "student_id"],
                                  fillter=['contact_id', 'student_id'])
    count_adv_student = dyf_adv_student.count()
    if count_adv_student < 1:
        return
    dyf_adv_student.show(10)
    # -----------------------------------------------------------------------------------------------------------------#
    # Start transform
    # ETL for student_id mapping
    dyf_crm_concat_clear = DynamicFrame.fromDF(df_crm_concat_clear, glueContext, 'dyf_crm_concat_clear')
    dyf_join_contact_student = Join.apply(dyf_adv_student, dyf_crm_concat_clear, 'contact_id', 'Code')
    dyf_adv_student = RenameField.apply(dyf_join_contact_student, "student_id", "source_id")
    mapper_add_collum['source_type'] = f.lit(2)
    dyf_adv_student = mappingForAll(dyf_adv_student, mapper, mapper_add_collum)
    dyf_adv_student.show(10)
    ######## save to s3 ######
    # parquetToS3(dyf_adv_student, path="s3://dtsodin/student_behavior/up/user_mapper")
    ############################################################
    glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dyf_adv_student,
        catalog_connection="glue_redshift",
        connection_options={
            "dbtable": "up_user_map",
            "database": "transaction_log"
        },
        redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/user_profile/up_user_map",
        transformation_ctx="datasink4")
def main():
    dyf_advisorcall_key = connectGlue(database="callcenter",
                                      table_name="advisorcall",
                                      select_fields=["_key", "calldate", "idcall", "rating", "device",
                                                     "hanguphv", "totaltime", "answertime"],
                                      duplicates=["calldate", "idcall", "rating", "device",
                                                  "hanguphv", "totaltime", "answertime"],
                                      fillter=["idcall"]).rename_field("rating", "value_rating")
    dyf_advisorcall_key = dyf_advisorcall_key.resolveChoice(specs=[("_key", "cast:long")])
    try:
        df_flag_phone_rating = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_phone_care_answertime.parquet")
        max_key = df_flag_phone_rating.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_advisorcall_key = Filter.apply(frame=dyf_advisorcall_key,
                                           f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    count = dyf_advisorcall_key.count()
    print(count)
    if count > 0:
        df_advisorcall_key = dyf_advisorcall_key.toDF()
        df_advisorcall = df_advisorcall_key \
            .withColumn("student_behavior_date",
                        f.unix_timestamp(df_advisorcall_key.calldate, "yyyy-MM-dd HH:mm:ss"))
        dyf_advisorcall = DynamicFrame.fromDF(df_advisorcall, glueContext, "dyf_advisorcall")
        dyf_advisorcall = dyf_advisorcall.resolveChoice(
            specs=[("student_behavior_date", "cast:int")])
        df_advisorcall = dyf_advisorcall.toDF()
        dyf_cdr = connectGlue(database="callcenter",
                              table_name="cdr",
                              select_fields=["ip_phone", "call_id", "student_phone", "status"],
                              fillter=["call_id"])
        df_cdr = dyf_cdr.toDF()
        df_advisorcall = df_advisorcall.join(df_cdr,
                                             (df_advisorcall["idcall"] == df_cdr["call_id"]),
                                             "right")
        dyf_student_contact_phone = connectGlue(database="tig_advisor",
                                                table_name="student_contact_phone",
                                                select_fields=["phone", "contact_id"],
                                                fillter=["phone", "contact_id"],
                                                duplicates=["phone", "contact_id"])
        df_student_contact_phone = dyf_student_contact_phone.toDF()
        df_advisorcall = df_advisorcall.join(
            df_student_contact_phone,
            (df_advisorcall["student_phone"] == df_student_contact_phone["phone"]))
        dyf_student_contact = connectGlue(database="tig_advisor",
                                          table_name="student_contact",
                                          select_fields=["student_id", "contact_id"],
                                          fillter=["student_id", "contact_id"],
                                          duplicates=["student_id", "contact_id"]) \
            .rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_advisorcall = df_advisorcall.join(
            df_student_contact,
            (df_advisorcall["contact_id"] == df_student_contact["contact_id_contact"]))
        df_advisorcall = df_advisorcall.drop("contact_id_contact", "phone", "call_id")
        df_rating_phone = df_advisorcall.withColumn("transformed_at", f.lit(d4))
        flag = df_rating_phone.agg({"_key": "max"}).collect()[0][0]
        prinDev(df_rating_phone)
        convertAndSaveS3(df_rating_phone)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag back to S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_phone_care_answertime.parquet",
            mode="overwrite")