Code example #1
def cara_rating():

    dyf_mdl_rating_class_key = connectGlue(database="topicalms", table_name="mdl_rating_class",
                                           select_fields=["_key", "points", "vote", "student_id",
                                                          "room_id", "opinion", "timecreated", "timemodified"],
                                           fillter=["points", "student_id", "opinion", "timecreated"]
                                           ).rename_field("points", "value_rating") \
        .rename_field("opinion", "comment") \
        .rename_field("vote", "rating_about") \
        .rename_field("timecreated", "student_behavior_date")

    dyf_mdl_rating_class_key = dyf_mdl_rating_class_key.resolveChoice(specs=[("_key", "cast:long")])
    try:
        df_flag_1 = spark.read.parquet("s3://toxd-olap/transaction_log/flag/flag_cara_rating.parquet")
        max_key = df_flag_1.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_mdl_rating_class_key = Filter.apply(frame=dyf_mdl_rating_class_key, f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    dyf_mdl_rating_class_key = Filter.apply(frame=dyf_mdl_rating_class_key, f=lambda x: x["value_rating"] in points)
    if dyf_mdl_rating_class_key.count() > 0:

        df_mdl_rating_class_key = dyf_mdl_rating_class_key.toDF()
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["contact_id", "student_id"],
            fillter=["contact_id", "student_id"],
            duplicates=["contact_id", "student_id"]
        ).rename_field("student_id", "student_id_contact")

        df_student_contact = dyf_student_contact.toDF()
        df_mdl_rating_class = df_mdl_rating_class_key.join(
            df_student_contact,
            df_mdl_rating_class_key["student_id"] == df_student_contact["student_id_contact"])

        df_mdl_rating_class = df_mdl_rating_class.drop("student_id_contact")
        df_mdl_rating_class = df_mdl_rating_class.withColumn("number_rating", f.lit(1)) \
            .withColumn("rating_type", f.lit("rating_cara"))
        if is_dev:
            df_mdl_rating_class.show(10)

        #     free run
        df_mdl_rating_class = df_mdl_rating_class \
            .withColumn("behavior_id", f.lit(24)) \
            .withColumn("transformed_at", f.lit(d4))

        df_mdl_rating_class = set_package_advisor_level(df_mdl_rating_class)
        convertAndSaveS3(df_mdl_rating_class)

        flag = df_mdl_rating_class_key.agg({"_key": "max"}).collect()[0][0]

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag on S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_cara_rating.parquet", mode="overwrite")
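Note: every job in this listing repeats the checkpoint idiom shown above: read the last processed _key from a one-row parquet "flag" file, filter the incoming frame to rows above it, and, once the save succeeds, overwrite the flag with the new maximum. A minimal reusable sketch of that pair, assuming the same module-level spark session these jobs already use (the helper names are hypothetical):

from awsglue.transforms import Filter

def read_flag_and_filter(dyf, flag_path):
    # On any read error (e.g. first run, flag file absent) process the full frame.
    try:
        max_key = spark.read.parquet(flag_path).collect()[0]["flag"]
        return Filter.apply(frame=dyf, f=lambda x: x["_key"] > max_key)
    except Exception:
        return dyf

def write_flag(flag_value, flag_path):
    # Overwrite the one-row checkpoint with the new maximum _key.
    spark.createDataFrame([flag_value], "long").toDF("flag") \
        .write.parquet(flag_path, mode="overwrite")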
Code example #2
def get_df_student_advisor():
    dyf_student_package = glueContext.create_dynamic_frame.from_options(
        connection_type="redshift",
        connection_options={
            "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/transaction_log",
            "user": REDSHIFT_USERNAME,
            "password": REDSHIFT_PASSWORD,
            "dbtable": "ad_student_advisor",
            "redshiftTmpDir": "s3n://datashine-dev-redshift-backup/translation_log/user_advisor/ad_student_level"
        })
    dyf_student_package = Filter.apply(
        frame=dyf_student_package,
        f=lambda x: x["contact_id"] is not None and x["contact_id"] != "")
    if is_dev:
        dyf_student_package.printSchema()
        dyf_student_package.show(10)

    dyf_student_package = dyf_student_package.select_fields(
        ["contact_id", "advisor_id", "end_date", "start_date"]) \
        .rename_field("contact_id", "contact_id_advisor")

    df_student_package = dyf_student_package.toDF()
    prinDev(df_student_package, "df_student_package")
    return df_student_package
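Note: the frame returned above keeps start_date and end_date, which suggests it is matched against a behavior date downstream. A sketch of that join, assuming a hypothetical df_behavior frame carrying contact_id and student_behavior_date:

df_student_advisor = get_df_student_advisor()

# Hypothetical: keep the advisor whose assignment window covers the behavior date.
df_with_advisor = df_behavior.join(
    df_student_advisor,
    (df_behavior["contact_id"] == df_student_advisor["contact_id_advisor"])
    & (df_behavior["student_behavior_date"] >= df_student_advisor["start_date"])
    & (df_behavior["student_behavior_date"] <= df_student_advisor["end_date"]),
    "left")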
Code example #3
def get_voxy():
    dyf_voxy = connectGlue(
        database="voxy",
        table_name="voxy_api",
        select_fields=["email", "time_created", "total_hours_studied"],
        fillter=["email"],
        duplicates=["email",
                    "time_created"]).rename_field("email", "email_voxy")
    dyf_voxy = Filter.apply(frame=dyf_voxy,
                            f=lambda x: x["total_hours_studied"] > 0)
    df_voxy = dyf_voxy.toDF()

    df_voxy = df_voxy.withColumn(
        "time_created_new", f.unix_timestamp(df_voxy.time_created,
                                             "yyyy-MM-dd"))
    dyf_voxy = DynamicFrame.fromDF(df_voxy, glueContext, "dyf_voxy")

    dyf_voxy = dyf_voxy.resolveChoice(specs=[("time_created_new",
                                              "cast:long")])
    df_voxy = dyf_voxy.toDF()
    df_voxy = df_voxy.drop("time_created")
    df_voxy = df_voxy.groupby("email_voxy").agg(
        f.min(df_voxy.time_created_new).alias("student_behavior_date_voxy"))

    return df_voxy
def get_native_talk():
    dyf_native_talk_history_log = connectGlue(database="native_talk", table_name="native_talk_history_log_api",
                                         select_fields=["learning_date", "username","speaking_dialog_score"],
                                         fillter=["username"],
                                         duplicates=["username", "learning_date"]
                                         )
    dyf_native_talk_history_log = Filter.apply(frame=dyf_native_talk_history_log,
                            f=lambda x: x["speaking_dialog_score"] > 0
                            )

    df_native_talk_history_log = dyf_native_talk_history_log.toDF()
    df_native_talk_history_log = df_native_talk_history_log.drop("speaking_dialog_score")

    df_native_talk_history_log = df_native_talk_history_log.withColumn("learning_date_int",
                                                             f.unix_timestamp(df_native_talk_history_log.learning_date,
                                                                              "yyyy-MM-dd"))
    dyf_native_talk_history_log = DynamicFrame.fromDF(df_native_talk_history_log, glueContext, "dyf_native_talk_history_log")

    dyf_native_talk_history_log = dyf_native_talk_history_log.resolveChoice(specs=[("learning_date_int", "cast:long")])
    df_native_talk_history_log = dyf_native_talk_history_log.toDF()
    df_native_talk_history_log = df_native_talk_history_log.groupby("username").agg(
        f.min(df_native_talk_history_log.learning_date_int).alias("student_behavior_date_nt"))

    dyf_native_talk_account_mapping = connectGlue(database="native_talk", table_name="native_talk_account_mapping",
                                                  select_fields=["username", "contact_id"],
                                                  fillter=["username", "contact_id"],
                                                  duplicates=["username", "contact_id"]
                                                  ).rename_field("username", "username_mapping").\
                                                    rename_field("contact_id","contact_id_nt")
    df_native_talk_account_mapping = dyf_native_talk_account_mapping.toDF()
    join = df_native_talk_account_mapping.join(
        df_native_talk_history_log,
        df_native_talk_account_mapping.username_mapping == df_native_talk_history_log.username)
    join = join.drop("username_mapping", "username")
    return join
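Note: both helpers above reduce a raw activity log to one row per student carrying the earliest activity timestamp for that source. Downstream (see etl_hoc_vien_trai_nghiem_mot_trong_cac_thanh_to_hoc in example #15) they are left-joined onto the student contact frame, so a student with no activity in a source keeps NULL in that column. A sketch, assuming df_student_contact carries student_id, contact_id and email as in the other examples:

df_voxy = get_voxy()
df_nt = get_native_talk()

df_joined = df_student_contact \
    .join(df_voxy, df_voxy["email_voxy"] == df_student_contact["email"], "left") \
    .join(df_nt, df_nt["contact_id_nt"] == df_student_contact["contact_id"], "left")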
Code example #5
def filter_flag(dyf, config_file):
    try:
        flag_smile_care = spark.read.parquet(config_file)
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf = Filter.apply(frame=dyf, f=lambda x: x["_key"] > max_key)
        return dyf
    except Exception:
        print("read flag file error")
        return dyf
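Note: this helper wraps the read-flag-and-filter step that most jobs here inline. Usage is a single line, and when the flag file is missing or unreadable the frame passes through unfiltered, as in etl_ncsbasic below:

dyf_results = filter_flag(dyf_results, FLAG_NCSB_FILE)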
Code example #6
def etl_dang_ki_tai_khoan():
    dyf_lms_user = connectGlue(database="topicalms", table_name="mdl_user",
                               select_fields=["_key", "id", "timecreated"],
                               fillter=["id", "timecreated"],
                               duplicates=["id", "timecreated"]
                               ).rename_field("id", "student_id") \
        .rename_field("timecreated", "student_behavior_date")

    dyf_lms_user = dyf_lms_user.resolveChoice(specs=[("_key", "cast:long")])
    try:
        flag_behavior_hoc_vien_nhap_hoc = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_hoc_vien_nhap_hoc.parquet"
        )
        max_key = flag_behavior_hoc_vien_nhap_hoc.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_lms_user = Filter.apply(frame=dyf_lms_user,
                                    f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_lms_user.count() > 0:
        df_lms_user = dyf_lms_user.toDF()

        flag = df_lms_user.agg({"_key": "max"}).collect()[0][0]
        print("flag: ", flag)

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id",
                     "contact_id"]).rename_field("student_id",
                                                 "student_id_contact")
        df_student_contact = dyf_student_contact.toDF()

        df_lms_user_contact = df_lms_user.join(
            df_student_contact, df_lms_user["student_id"] ==
            df_student_contact["student_id_contact"], "left")

        # -----------------------------------------------------------------------------------------------------------------#

        df_join_level_code = set_package_advisor_level(df_lms_user_contact)
        for k, v in ADD_COLLUM.items():
            df_join_level_code = df_join_level_code.withColumn(k, v)
        prinDev(df_join_level_code, "end data")
        convertAndSaveS3(df_join_level_code)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_hoc_vien_nhap_hoc.parquet",
            mode="overwrite")
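Note: several jobs fold a dict of constant columns into the result via "for k, v in ADD_COLLUM.items()". ADD_COLLUM itself is not shown in these excerpts; since each value goes straight into withColumn, it must map column names to Column expressions. A purely illustrative shape:

from pyspark.sql import functions as f

# Hypothetical example only; the real ADD_COLLUM is defined elsewhere in the project.
ADD_COLLUM = {
    "behavior_id": f.lit(3),        # assumed: constant behavior id for the job
    "comment": f.lit(""),
    "number_rating": f.lit(1),
    "transformed_at": f.lit(d4),    # d4 is the module-level run timestamp used throughout
}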
Code example #7
File: utils.py Project: 01662024622/dts-odin-etl
def filter_latest(spark, dynamic_frame, config_file):
    try:
        df_flag = spark.read.parquet(config_file)
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)

        result = Filter.apply(frame=dynamic_frame,
                              f=lambda x: x['_key'] > start_read)
        return result
    except Exception:
        print('read flag file error')
        return dynamic_frame
Code example #8
def main():

    dyf_student_care_advisor = connectGlue(
        database="callcenter",
        table_name="student_care_advisor",
        select_fields=[
            "transformed_at", "idcall", "student_behavior_date", "student_id",
            "answer_duration", "total_duration", "requested_rating",
            "value_rating", "ip_phone", "call_status"
        ],
        duplicates=[
            "student_behavior_date", "student_id", "answer_duration",
            "total_duration", "requested_rating", "value_rating"
        ],
        fillter=["student_id",
                 "ip_phone"]).rename_field("transformed_at", "_key")

    dyf_student_care_advisor = dyf_student_care_advisor.resolveChoice(
        specs=[("_key", "cast:long")])

    try:
        df_flag_phone_rating = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_student_care_advisor_fact.parquet"
        )
        max_key = df_flag_phone_rating.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_student_care_advisor = Filter.apply(
            frame=dyf_student_care_advisor, f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    count = dyf_student_care_advisor.count()
    print(count)
    if count > 0:
        df_student_care_advisor = dyf_student_care_advisor.toDF()

        df_student_care_advisor = df_student_care_advisor.withColumn(
            "transformed_at", f.lit(d4))
        prinDev(df_student_care_advisor)

        flag = df_student_care_advisor.agg({"_key": "max"}).collect()[0][0]

        convertAndSaveS3(df_student_care_advisor)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_student_care_advisor_fact.parquet",
            mode="overwrite")
Code example #9
def get_df_student_contact(glueContext):
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")

    dyf_student_contact = dyf_student_contact.select_fields(
        ['contact_id', 'student_id', 'advisor_id'])

    # dyf_student_contact = dyf_student_contact.resolveChoice(specs=[('time_lms_created', 'cast:long')])

    dyf_student_contact = Filter.apply(
        frame=dyf_student_contact,
        f=lambda x: x['student_id'] is not None and x['contact_id'] is not None
        and x['advisor_id'] is not None and x['advisor_id'] != '')

    df_student_contact = dyf_student_contact.toDF()
    return df_student_contact
def get_ls_sc():
    mdl_logsservice_in_out = connectGlue(database="topicalms", table_name="mdl_logsservice_in_out_cutoff",
                               select_fields=["userid", "time_in", "time_out"],
                               fillter=["userid"],
                               duplicates=["userid","time_in"]
                               ).rename_field("userid","student_id_ls_sc")
    mdl_logsservice_in_out = Filter.apply(
        frame=mdl_logsservice_in_out,
        f=lambda x: x["time_in"] >= 1483203600
                    and x["time_out"] > 1483203600
                    and (x["time_out"] - x["time_in"]) > 2100)
    mdl_logsservice_in_out = mdl_logsservice_in_out.toDF()
    mdl_logsservice_in_out = mdl_logsservice_in_out.drop("time_out")
    mdl_logsservice_in_out = mdl_logsservice_in_out.groupby("student_id_ls_sc").agg(
        f.min(mdl_logsservice_in_out.time_in).cast("long").alias("student_behavior_date_ls_sc"))
    return mdl_logsservice_in_out
def get_lt():
    dyf_native_livestream = connectGlue(database="native_livestream", table_name="log_in_out",
                               select_fields=["student_id", "time_in", "thoigianhoc"],
                               fillter=["student_id","thoigianhoc","time_in"],
                               duplicates=["student_id","time_in"]
                               ).rename_field("student_id","student_id_lt")

    dyf_native_livestream = dyf_native_livestream.resolveChoice(specs=[("time_in", "cast:long")])
    dyf_native_livestream = dyf_native_livestream.resolveChoice(specs=[("thoigianhoc", "cast:int")])


    dyf_native_livestream = Filter.apply(
        frame=dyf_native_livestream,
        f=lambda x: x["time_in"] > 1483203600 and x["thoigianhoc"] > 59)
    df_native_livestream = dyf_native_livestream.toDF()
    df_native_livestream = df_native_livestream.drop("thoigianhoc")
    df_native_livestream = df_native_livestream.groupby("student_id_lt").agg(
        f.min(expr("time_in div 1000")).cast("long").alias("student_behavior_date_lt"))
    return df_native_livestream
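Note: the constant 1483203600 in the two filters above is the Unix timestamp of 2017-01-01 00:00 in UTC+7 (Vietnam time), so both sources are trimmed to activity from 2017 onward. A quick check:

from datetime import datetime, timezone, timedelta

ict = timezone(timedelta(hours=7))
print(datetime.fromtimestamp(1483203600, tz=ict))  # 2017-01-01 00:00:00+07:00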
Code example #12
def get_dyf_student_advisor(glueContext):
    dyf_student_advisor = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_change_assignment_advisor")

    dyf_student_advisor = dyf_student_advisor.select_fields([
        "id", "contact_id", "advisor_id_old", "advisor_id_new", "created_at",
        "updated_at"
    ])

    dyf_student_advisor = Filter.apply(
        frame=dyf_student_advisor,
        f=lambda x: x['contact_id'] is not None and x[
            'contact_id'] != '' and x['advisor_id_new'] is not None and x[
                'advisor_id_new'] != '' and x['created_at'] is not None)

    df_student_advisor = dyf_student_advisor.toDF()
    return df_student_advisor
Code example #13
def fillterOutNull(dynamicFrame, fields):
    for field in fields:
        dynamicFrame = Filter.apply(
            frame=dynamicFrame,
            # bind `field` per iteration: Glue filters evaluate lazily, and a
            # plain closure over the loop variable would see only its last value
            f=lambda x, field=field: x[field] is not None and x[field] != "")
    return dynamicFrame
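Note: a usage sketch, replacing the inline non-null filters seen in the other examples (frame name hypothetical):

dyf_student_contact = fillterOutNull(dyf_student_contact, ["student_id", "contact_id", "user_name"])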
Code example #14
def etl_native_talk(df_student_level=None, df_student_package=None, df_student_advisor=None):
    dyf_native_talk_history = connectGlue(
        database="native_talk",
        table_name="native_talk_history_log_api_cutoff_2020",
        select_fields=["_key", "username", "learning_date", "speaking_completed_dialog_name",
                       "speaking_dialog_score", "time_of_completingadialog"],
    )

    dyf_native_talk_history = Filter.apply(
        frame=dyf_native_talk_history,
        f=lambda x: x["username"] is not None and x["username"] != ""
                    and x["learning_date"] is not None
                    and x["learning_date"] != ""
                    and x["speaking_completed_dialog_name"] != ""
                    and x["speaking_dialog_score"] > 0)

    dyf_native_talk_history = dyf_native_talk_history.resolveChoice(specs=[("_key", "cast:float")])

    if not is_load_full:
        # dyf_native_talk_history = filter_flag(dyf_native_talk_history, FLAG_NATIVE_TALK_FILE)
        try:
            df_flag = spark.read.parquet(FLAG_NATIVE_TALK_FILE)
            read_from_index = df_flag.collect()[0]['flag']
            print('read from index: ', read_from_index)
            dyf_native_talk_history = Filter.apply(frame=dyf_native_talk_history,
                                                  f=lambda x: x["learning_date"] > read_from_index)
        except Exception:
            print('read flag file error')

    df_native_talk_history = dyf_native_talk_history.toDF()

    number_native_talk_history = df_native_talk_history.count()

    print("native_talk_history")
    print("number_native_talk_history_number: ", number_native_talk_history)
    prinDev(df_native_talk_history)

    if number_native_talk_history < 1:
        return
    df_native_talk_history = df_native_talk_history \
        .drop_duplicates(["username", "learning_date",
                          "speaking_completed_dialog_name",
                          "speaking_dialog_score", "time_of_completingadialog"])
    # ---------------------
    flag = df_native_talk_history.agg({"learning_date": "max"}).collect()[0][0]
    # -----------------------------------------------------------------------------------------------------

    df_native_talk_history = df_native_talk_history \
        .groupby("username", "learning_date") \
        .agg(f.count("speaking_dialog_score").alias("total"))
    df_native_talk_history = df_native_talk_history.select("username", "learning_date", "total")

    # -------------process duplicate ------------------------

    dyf_native_talk_mapping = connectGlue(
        database="native_talk",
        table_name="native_talk_account_mapping",
        select_fields=["username", "contact_id"],
    ).rename_field("username", "user_name_mapping")

    dyf_native_talk_mapping = Filter.apply(
        frame=dyf_native_talk_mapping,
        f=lambda x: x["user_name_mapping"] is not None and x["user_name_mapping"] != ""
                    and x["contact_id"] is not None
                    and x["contact_id"] != "")

    dyf_student_contact = connectGlue(
        database="tig_advisor",
        table_name="student_contact",
        select_fields=["student_id", "contact_id"],
    ).rename_field("contact_id", "contact_id_contact")
    df_native_talk_mapping = dyf_native_talk_mapping.toDF()
    df_native_talk_mapping = df_native_talk_mapping.drop_duplicates(["user_name_mapping", "contact_id"])

    df_student_contact = dyf_student_contact.toDF()
    df_student_contact = df_student_contact.drop_duplicates(["student_id", "contact_id_contact"])

    df_result = df_native_talk_history \
        .join(df_native_talk_mapping,
              df_native_talk_history["username"] == df_native_talk_mapping["user_name_mapping"]) \
        .join(df_student_contact, df_native_talk_mapping["contact_id"] == df_student_contact["contact_id_contact"])

    df_result = df_result \
        .withColumn("total_duration", f.col('total') * f.lit(300)) \
        .withColumn("transformed_at", f.lit(d4)) \
        .withColumn("class_type", f.lit("NATIVE_TALK"))

    prinDev(df_result)
    # df_result = set_package_advisor_level(df_result, df_student_level=df_student_level,
    #                                       df_student_package=df_student_package, df_student_advisor=df_student_advisor)

    convertAndSaveS3(df_result)

    flag_data = [flag]
    df = spark.createDataFrame(flag_data, "string").toDF("flag")
    # overwrite the flag value on S3
    df.write.parquet(FLAG_NATIVE_TALK_SAVE, mode="overwrite")
Code example #15
def etl_voxy(df_student_level=None, df_student_package=None, df_student_advisor=None):
    dyf_voxy = connectGlue(database="voxy", table_name="voxy_api_cutoff_2020",
                           select_fields=["_key", "email", "last_login", "total_activities_completed",
                                          'total_hours_studied'],
                           )

    dyf_voxy = Filter.apply(
        frame=dyf_voxy,
        f=lambda x:
        # x["total_activities_completed"] > 0
        x["email"] is not None and x["email"] != ""
        and x['last_login'] is not None
    )

    if is_dev:
        print('dyf_voxy')
        dyf_voxy.printSchema()
        dyf_voxy.show(3)

    dyf_voxy = dyf_voxy.resolveChoice(specs=[("_key", "cast:long")])

    print('is_load_full___: ' + str(is_load_full))
    print('dyf_voxy before filter: ' + str(dyf_voxy.count()))
    if not is_load_full:
        print('not load full-----------------')
        dyf_voxy = filter_flag(dyf_voxy, FLAG_VOXY_FILE)

    number_dyf_voxy = dyf_voxy.count()
    print("number_dyf_voxy after filter :", number_dyf_voxy)

    if number_dyf_voxy > 0:
        df_voxy = dyf_voxy.toDF()

        flag = df_voxy.agg({"_key": "max"}).collect()[0][0]

        df_voxy = df_voxy.filter(~df_voxy.email.startswith("vip_"))

        # df_voxy = df_voxy.dropDuplicates(['email', 'last_login'])

        df_voxy = df_voxy.withColumn('learning_date',
                                     f.from_unixtime(timestamp=f.unix_timestamp(f.col('last_login'),
                                                                                format="yyyy-MM-dd'T'HH:mm:ss"),
                                                     format="yyyy-MM-dd")) \
            .withColumn('learning_date_id',
                        f.from_unixtime(timestamp=f.unix_timestamp(f.col('last_login'),
                                                                   format="yyyy-MM-dd'T'HH:mm:ss"),
                                        format="yyyyMMdd").cast('long'))

        df_voxy = df_voxy.filter(f.col('learning_date_id') < today_id)

        if is_dev:
            print('df_voxy__2')
            df_voxy.printSchema()
            df_voxy.show(3)

        df_voxy = df_voxy \
            .groupby("email", "learning_date") \
            .agg(f.sum("total_activities_completed").alias("total"),
                 f.sum("total_hours_studied").cast('double').alias("total_duration")
                 )
        df_voxy = df_voxy.select("email",
                                 "learning_date",
                                 "total",
                                 f.round(f.col('total_duration') * 3600).cast('long').alias('total_duration'))

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id", "user_name"],
        )
        dyf_student_contact = Filter.apply(
            frame=dyf_student_contact,
            f=lambda x: x["student_id"] is not None and x["student_id"] != ""
                        and x["contact_id"] is not None and x["contact_id"] != ""
                        and x["user_name"] is not None and x["user_name"] != ""
        )

        df_student_contact = dyf_student_contact.toDF()
        df_student_contact = df_student_contact.drop_duplicates(["student_id", "contact_id", "user_name"])
        df_result = df_voxy.join(df_student_contact, (df_voxy["email"]).endswith(df_student_contact["user_name"]))

        df_result = df_result \
            .withColumn("transformed_at", f.lit(d4)) \
            .withColumn("class_type", f.lit("VOXY"))

        prinDev(df_result)
        df_result = set_package_advisor_level(
            df_result,
            df_student_level=df_student_level,
            df_student_package=df_student_package,
            df_student_advisor=df_student_advisor)

        convertAndSaveS3(df_result)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF("flag")
        print('flag_data : ---')
        df.show()
        # overwrite the max _key flag on S3
        df.write.parquet(FLAG_VOXY_SAVE, mode="overwrite")
def etl_hoc_vien_trai_nghiem_mot_trong_cac_thanh_to_hoc():

    dyf_student_contact = connectGlue(database="tig_advisor",table_name="student_contact",
                                    select_fields=["_key","contact_id","student_id","user_name"],
                                    fillter=["contact_id","student_id","user_name"],
                                    duplicates=["contact_id","user_name"]
                                    ).rename_field("user_name","email")

    dyf_student_contact = dyf_student_contact.resolveChoice(specs=[("_key", "cast:long")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_trai_nghiem_1_trong_7_thanh_to.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_student_contact = Filter.apply(frame=dyf_student_contact, f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    if dyf_student_contact.count() > 0:
        df_student_contact = dyf_student_contact.toDF()
        flag = df_student_contact.agg({"_key": "max"}).collect()[0][0]
        print(flag)
        prinDev(df_student_contact)
        df_ls_sc = get_ls_sc()
        df_lt = get_lt()
        df_hw = get_hw_basic()
        df_nt = get_native_talk()
        df_voxy = get_voxy()
        df_ncsb = get_ncsbasic()
        df_join = df_student_contact \
            .join(df_ls_sc, df_ls_sc["student_id_ls_sc"] == df_student_contact["student_id"], "left") \
            .join(df_lt, df_lt["student_id_lt"] == df_student_contact["student_id"], "left") \
            .join(df_hw, df_hw["email_hw"] == df_student_contact["email"], "left") \
            .join(df_nt, df_nt["contact_id_nt"] == df_student_contact["contact_id"], "left") \
            .join(df_voxy, df_voxy["email_voxy"] == df_student_contact["email"], "left") \
            .join(df_ncsb, df_ncsb["email_ncbs"] == df_student_contact["email"], "left")
        prinDev(df_join)
        df_fillter = df_join.select("contact_id","student_id",
            get_behavior_date("student_behavior_date_ls_sc",
                              "student_behavior_date_lt",
                              "student_behavior_date_voxy",
                              "student_behavior_date_hw",
                              "student_behavior_date_nt",
                              "student_behavior_date_ncsb").cast("long").alias("student_behavior_date"))

        df_fillter = df_fillter.filter(df_fillter.student_behavior_date < 1999999999)
        prinDev(df_fillter)
        # -----------------------------------------------------------------------------------------------------------------#


        df_join_level_code = set_package_advisor_level(df_fillter)
        for k, v in ADD_COLLUM.items():
            df_join_level_code = df_join_level_code.withColumn(k, v)
        prinDev(df_join_level_code,"end data")
        # return
        convertAndSaveS3(df_join_level_code)
        # (student_behavior_date, behavior_id, student_id, user_id, contact_id,
        #  package_code, student_level_code, package_status_code, transformed)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag on S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_behavior_trai_nghiem_1_trong_7_thanh_to.parquet",
                         mode="overwrite")
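Note: get_behavior_date is not shown in these excerpts. It receives six nullable "first activity" column names and must yield a single earliest timestamp, so a plausible sketch is a thin wrapper over Spark's least(), which returns the smallest non-null value across columns (hypothetical implementation):

from pyspark.sql import functions as f

def get_behavior_date(*cols):
    # Earliest non-null per-source timestamp; NULL only if every source is NULL.
    return f.least(*[f.col(c) for c in cols])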
Code example #17
def smile_care_rating():
    dyf_smile_care_key = connectGlue(database="native_smile",
                                 table_name="ticket_log_5450ed3d8cb5a34974310b6b26e451fa",
                                 select_fields=["_key", "requester_email", "satisfaction", "satisfaction_at","created_at"],
                                 fillter=['requester_email']
                                 ).rename_field("satisfaction", "value_rating")

    dyf_smile_care_key = Filter.apply(frame=dyf_smile_care_key, f=lambda x: x["value_rating"] in satisfaction)
    dyf_smile_care_key = dyf_smile_care_key.resolveChoice(specs=[("_key", "cast:long")])
    try:
        flag_smile_care = spark.read.parquet("s3://toxd-olap/transaction_log/flag/flag_smile_care_rating.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_smile_care_key = Filter.apply(frame=dyf_smile_care_key, f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_smile_care_key.count() > 0:
        df_smile_care_key = dyf_smile_care_key.toDF()
        flag = df_smile_care_key.agg({"_key": "max"}).collect()[0][0]
        df_smile_care = df_smile_care_key \
            .withColumn("student_behavior_date", f.unix_timestamp(df_smile_care_key.created_at, "yyyy-MM-dd HH:mm:ss"))
        dyf_smile_care = DynamicFrame.fromDF(df_smile_care, glueContext, "dyf_smile_care")

        dyf_smile_care = dyf_smile_care.resolveChoice(specs=[("student_behavior_date", "cast:int")])
        dyf_smile_care = dyf_smile_care.select_fields(
            ["_key", "requester_email", "value_rating", "satisfaction_at", "student_behavior_date"])
        df_smile_care = dyf_smile_care.toDF()

        dyf_student_contact_email = connectGlue(
            database="tig_advisor",
            table_name="student_contact_email",
            select_fields=["email", "contact_id", "user_id"]
        )
        dyf_student_contact_ = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"]
        ).rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact_.toDF()
        df_student_contact_email = dyf_student_contact_email.toDF()

        df_smile_care = df_smile_care.join(df_student_contact_email,
                                           (df_smile_care["requester_email"] == df_student_contact_email["email"]))
        df_smile_care = df_smile_care.join(df_student_contact,
                                           (df_smile_care["contact_id"] == df_student_contact["contact_id_contact"]))

        df_smile_care = df_smile_care.drop("email", "requester_email", "contact_id_contact")

        df_smile_care = df_smile_care.withColumn("rating_type", f.lit("rating_native_smile_caresoft")) \
            .withColumn("comment", f.lit("")) \
            .withColumn("rating_about", f.lit(None)) \
            .withColumn("number_rating", f.lit(1)) \
            .withColumn("behavior_id", f.lit(26)) \
            .withColumn("transformed_at", f.lit(d4))

        df_smile_care = set_package_advisor_level(df_smile_care)

        convertAndSaveS3(df_smile_care)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag on S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_smile_care_rating.parquet", mode="overwrite")
Code example #18
def main():

    dyf_advisorcall_key = connectGlue(
        database="callcenter",
        table_name="advisorcall",
        select_fields=[
            "_key", "calldate", "idcall", "rating", "device", "hanguphv",
            "totaltime", "answertime"
        ],
        duplicates=[
            "calldate", "idcall", "rating", "device", "hanguphv", "totaltime",
            "answertime"
        ],
        fillter=["idcall"]).rename_field("rating", "value_rating")

    dyf_advisorcall_key = dyf_advisorcall_key.resolveChoice(
        specs=[("_key", "cast:long")])

    try:
        df_flag_phone_rating = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_phone_care_answertime.parquet"
        )
        max_key = df_flag_phone_rating.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_advisorcall_key = Filter.apply(frame=dyf_advisorcall_key,
                                           f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    count = dyf_advisorcall_key.count()
    print(count)
    if count > 0:
        df_advisorcall_key = dyf_advisorcall_key.toDF()
        df_advisorcall = df_advisorcall_key \
            .withColumn("student_behavior_date", f.unix_timestamp(df_advisorcall_key.calldate, "yyyy-MM-dd HH:mm:ss"))
        dyf_advisorcall = DynamicFrame.fromDF(df_advisorcall, glueContext,
                                              "dyf_advisorcall")

        dyf_advisorcall = dyf_advisorcall.resolveChoice(
            specs=[("student_behavior_date", "cast:int")])

        df_advisorcall = dyf_advisorcall.toDF()

        dyf_cdr = connectGlue(
            database="callcenter",
            table_name="cdr",
            select_fields=["ip_phone", "call_id", "student_phone", "status"],
            fillter=["call_id"])
        df_cdr = dyf_cdr.toDF()
        df_advisorcall = df_advisorcall.join(
            df_cdr, (df_advisorcall["idcall"] == df_cdr["call_id"]), "right")

        dyf_student_contact_phone = connectGlue(
            database="tig_advisor",
            table_name="student_contact_phone",
            select_fields=["phone", "contact_id"],
            fillter=["phone", "contact_id"],
            duplicates=["phone", "contact_id"])
        df_student_contact_phone = dyf_student_contact_phone.toDF()

        df_advisorcall = df_advisorcall.join(
            df_student_contact_phone, (df_advisorcall["student_phone"]
                                       == df_student_contact_phone["phone"]))

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            fillter=["student_id", "contact_id"],
            duplicates=["student_id",
                        "contact_id"]).rename_field("contact_id",
                                                    "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()

        df_advisorcall = df_advisorcall.join(
            df_student_contact, (df_advisorcall["contact_id"]
                                 == df_student_contact["contact_id_contact"]))

        df_advisorcall = df_advisorcall.drop("contact_id_contact", "phone",
                                             "call_id")

        df_rating_phone = df_advisorcall.withColumn("transformed_at",
                                                    f.lit(d4))

        flag = df_rating_phone.agg({"_key": "max"}).collect()[0][0]
        prinDev(df_rating_phone)

        convertAndSaveS3(df_rating_phone)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_phone_care_answertime.parquet",
            mode="overwrite")
Code example #19
def etl_ncsbasic(df_student_level=None, df_student_package=None, df_student_advisor=None):
    dyf_results = connectGlue(database="ncsbasic", table_name="results_cutoff_2020",
                              select_fields=["_key", "user_id", "time_created", "time_end"],
                              )

    dyf_results = Filter.apply(
        frame=dyf_results,
        f=lambda x: x["time_created"] is not None and x["time_created"] != ""
                    and x["time_end"] is not None and x["time_end"] != ""
                    and x["user_id"] is not None and x["user_id"] != "")

    dyf_results = dyf_results.resolveChoice(specs=[("time_created", "cast:long")])
    dyf_results = dyf_results.resolveChoice(specs=[("time_end", "cast:long")])
    dyf_results = dyf_results.resolveChoice(specs=[("_key", "cast:long")])

    if not is_load_full:
        dyf_results = filter_flag(dyf_results, FLAG_NCSB_FILE)

    if dyf_results.count() > 0:
        df_results = dyf_results.toDF()
        flag = df_results.agg({"_key": "max"}).collect()[0][0]
        df_results = df_results.drop_duplicates(["user_id", "time_created"])
        df_results = df_results.withColumn("learning_date",
                                           f.from_unixtime('time_created', format="yyyy-MM-dd").cast("string")) \
            .withColumn('total_duration', f.round(f.col('time_end') - f.col('time_created')).cast('long'))
        df_results = df_results \
            .groupby("user_id", "learning_date") \
            .agg(f.count("time_end").alias("total"),
                 f.sum('total_duration').alias('total_duration')
                 )
        dyf_native_talk_account_mapping = connectGlue(database="ncsbasic", table_name="users",
                                                      select_fields=["_id", "email"],
                                                      )
        dyf_native_talk_account_mapping = Filter.apply(
            frame=dyf_native_talk_account_mapping,
            f=lambda x: x["_id"] is not None and x["_id"] != ""
                        and x["email"] is not None and x["email"] != "")

        df_native_talk_account_mapping = dyf_native_talk_account_mapping.toDF()
        df_native_talk_account_mapping = df_native_talk_account_mapping.drop_duplicates(["_id"])

        df_join = df_results.join(df_native_talk_account_mapping,
                                  df_results.user_id == df_native_talk_account_mapping._id)

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id", "user_name"],
        )
        dyf_student_contact = Filter.apply(
            frame=dyf_student_contact,
            f=lambda x: x["student_id"] is not None and x["student_id"] != ""
                        and x["contact_id"] is not None and x["contact_id"] != ""
                        and x["user_name"] is not None and x["user_name"] != "")

        df_student_contact = dyf_student_contact.toDF()
        df_student_contact = df_student_contact.drop_duplicates(["student_id", "contact_id", "user_name"])

        df_result = df_join.join(df_student_contact, df_join["email"] == df_student_contact["user_name"])

        df_result = df_result.filter(df_result.total > 0)

        df_result = df_result \
            .withColumn("transformed_at", f.lit(d4)) \
            .withColumn("class_type", f.lit("NCSBASIC"))

        prinDev(df_result)
        # df_result = set_package_advisor_level(df_result, df_student_level=df_student_level,
        #                                       df_student_package=df_student_package,
        #                                       df_student_advisor=df_student_advisor)
        convertAndSaveS3(df_result)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF("flag")
        # overwrite the max _key flag on S3
        df.write.parquet(FLAG_NCSB_SAVE, mode="overwrite")
Code example #20
def etl_gia_han_goi_hoc():
    dyf_ghi_nhan_hoc_phi = connectGlue(
        database="poss",
        table_name="ghinhan_hp",
        select_fields=["_key", "khoa_hoc_makh", "ngay_tao"],
        fillter=["khoa_hoc_makh", "ngay_tao"])

    dyf_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.resolveChoice(
        specs=[("_key", "cast:long")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_ghi_nhan_hoc_phi.parquet"
        )
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_ghi_nhan_hoc_phi = Filter.apply(frame=dyf_ghi_nhan_hoc_phi,
                                            f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    if dyf_ghi_nhan_hoc_phi.count() > 0:
        df_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.toDF()
        flag = df_ghi_nhan_hoc_phi.agg({"_key": "max"}).collect()[0][0]
        prinDev(df_ghi_nhan_hoc_phi)
        dyf_khoa_hoc_poss = connectGlue(
            database="poss",
            table_name="khoa_hoc",
            select_fields=["makh", "mahv", "goi_sanpham_id"],
            fillter=["makh", "mahv", "goi_sanpham_id"],
            duplicates=["makh", "mahv", "goi_sanpham_id"
                        ]).rename_field("mahv",
                                        "ma_hv").rename_field("makh", "ma_kh")

        df_khoa_hoc_poss = dyf_khoa_hoc_poss.toDF()

        df_ghi_nhan_hoc_phi = df_khoa_hoc_poss.join(
            df_ghi_nhan_hoc_phi, (df_khoa_hoc_poss["ma_kh"]
                                  == df_ghi_nhan_hoc_phi["khoa_hoc_makh"]))

        df_ghi_nhan_hoc_phi = df_ghi_nhan_hoc_phi \
            .withColumn("student_behavior_date", f.unix_timestamp(df_ghi_nhan_hoc_phi.ngay_tao, "yyyy-MM-dd HH:mm:ss"))
        dyf_ghi_nhan_hoc_phi = DynamicFrame.fromDF(df_ghi_nhan_hoc_phi,
                                                   glueContext,
                                                   "dyf_ghi_nhan_hoc_phi")
        dyf_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.resolveChoice(
            specs=[("student_behavior_date", "cast:long")])

        df_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.toDF()
        prinDev(df_ghi_nhan_hoc_phi)
        df_ghi_nhan_hoc_phi = df_ghi_nhan_hoc_phi.drop("khoa_hoc_makh")
        dyf_hoc_vien_poss = connectGlue(database="poss",
                                        table_name="hoc_vien",
                                        select_fields=["mahv", "crm_id"],
                                        fillter=["mahv", "crm_id"],
                                        duplicates=["mahv",
                                                    "crm_id"]).rename_field(
                                                        "crm_id", "contact_id")

        df_hoc_vien_poss = dyf_hoc_vien_poss.toDF()

        df_khoa_hoc_contact = df_ghi_nhan_hoc_phi.join(
            df_hoc_vien_poss,
            (df_ghi_nhan_hoc_phi["ma_hv"] == df_hoc_vien_poss["mahv"]), "left")
        df_khoa_hoc_contact = df_khoa_hoc_contact.drop("mahv")
        if is_dev:
            print("df_khoa_hoc_contact")
            df_khoa_hoc_contact.show(10)
        # -----------------------------------------------------------------------------------------------------------------#
        df_package_code = package_code()

        df_khoa_hoc_contact_package_code = df_khoa_hoc_contact.join(
            df_package_code,
            (df_khoa_hoc_contact["goi_sanpham_id"] == df_package_code["id"]))

        df_khoa_hoc_contact_package_code = df_khoa_hoc_contact_package_code.drop("goi_sanpham_id", "id")

        # -----------------------------------------------------------------------------------------------------------------#
        dyf_test_dauvao_poss = connectGlue(
            database="poss",
            table_name="test_dauvao",
            select_fields=["mahv", "trinhdo_dauvao"],
            duplicates=["mahv", "trinhdo_dauvao"],
            fillter=["mahv", "trinhdo_dauvao"])
        df_test_dauvao_poss = dyf_test_dauvao_poss.toDF()

        df_join_level_code = df_khoa_hoc_contact_package_code.join(
            df_test_dauvao_poss, (df_khoa_hoc_contact_package_code["ma_hv"]
                                  == df_test_dauvao_poss["mahv"]), "left")
        df_join_level_code = df_join_level_code.drop("mahv", "ma_hv")

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id",
                     "contact_id"]).rename_field("contact_id",
                                                 "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_join_level_code = df_join_level_code.join(
            df_student_contact, (df_student_contact["contact_id_contact"]
                                 == df_join_level_code["contact_id"]))
        df_join_level_code = df_join_level_code.drop("contact_id_contact")
        df_join_level_code = set_package_advisor_level(df_join_level_code)
        prinDev(df_join_level_code, "end data")
        for k, v in ADD_COLLUM.items():
            df_join_level_code = df_join_level_code.withColumn(k, v)

        convertAndSaveS3(df_join_level_code)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_ghi_nhan_hoc_phi.parquet",
            mode="overwrite")
Code example #21
def smile_H2472_care_rating():
    dyf_tblreply_rating_key = connectGlue(
        database="native_smile",
        table_name="tblreply_rating",
        select_fields=["_key", "userid", "ratingid", "time_rating"],
        fillter=['userid']
    ).rename_field("time_rating", "student_behavior_date").rename_field("userid", "student_id")

    dyf_tblreply_rating_key = Filter.apply(frame=dyf_tblreply_rating_key, f=lambda x: x["ratingid"] > 13)
    dyf_tblreply_rating_key = dyf_tblreply_rating_key.resolveChoice(specs=[("_key", "cast:long")])

    try:
        df_flag_H2472 = spark.read.parquet("s3://toxd-olap/transaction_log/flag/flag_smile_H2472_care_rating.parquet")
        max_key = df_flag_H2472.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_tblreply_rating_key = Filter.apply(frame=dyf_tblreply_rating_key, f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    if dyf_tblreply_rating_key.count() > 0:
        df_tblreply_rating_key = dyf_tblreply_rating_key.toDF()

        flag = df_tblreply_rating_key.agg({"_key": "max"}).collect()[0][0]

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["contact_id", "student_id"],
            fillter=["contact_id", "student_id"],
            duplicates=["contact_id", "student_id"]
        ).rename_field("student_id", "student_id_contact")

        dyf_tma_dm_tu_dien = connectGlue(
            database="native_smile",
            table_name="tma_dm_tu_dien",
            select_fields=["id", "ma_tu_dien", "id_dm_loai_tu_dien"],
            fillter=["id", "id_dm_loai_tu_dien"]
        )

        dyf_tma_dm_tu_dien = Filter.apply(frame=dyf_tma_dm_tu_dien,
                                          f=lambda x: x["id_dm_loai_tu_dien"] == 7
                                                      and x["id"] in rangeid)
        # df_mdl_user=dyf_mdl_user.toDF()
        df_tma_dm_tu_dien = dyf_tma_dm_tu_dien.toDF()
        ################
        # df_tblreply_rating = df_tblreply_rating.join(df_mdl_user,(df_tblreply_rating["userid"]== df_mdl_user["id"]),"left")
        # join_rating_user.drop("id","userid")
        join_rating_user01 = df_tblreply_rating_key.join(
            df_tma_dm_tu_dien,
            df_tblreply_rating_key["ratingid"] == df_tma_dm_tu_dien["id"])
        join_rating_user01 = join_rating_user01.drop("id")
        df_student_contact = dyf_student_contact.toDF()
        join_rating_user01 = join_rating_user01.join(df_student_contact,
                                                     (join_rating_user01["student_id"] == df_student_contact[
                                                         "student_id_contact"]))
        join_rating_user01 = join_rating_user01.drop("student_id_contact")
        if is_dev:
            join_rating_user01.printSchema()
        join_rating_user01 = join_rating_user01.dropDuplicates()
        join_rating_user01 = join_rating_user01.withColumn("rating_type", f.lit("rating_native_smile_h2472")) \
            .withColumn("comment", f.lit("")) \
            .withColumn("rating_about", f.lit(None)) \
            .withColumn("number_rating", f.lit(1)) \
            .withColumn("value_rating", (join_rating_user01.ratingid - f.lit(13))) \
            .withColumn("behavior_id", f.lit(27)) \
            .withColumn("transformed_at", f.lit(d4))

        join_rating_user01 = set_package_advisor_level(join_rating_user01)
        convertAndSaveS3(join_rating_user01)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag on S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_smile_H2472_care_rating.parquet", mode="overwrite")
Code example #22
def phone_rating():
    dyf_advisorcall_key = connectGlue(
        database="callcenter",
        table_name="advisorcall",
        select_fields=["_key", "calldate", "phonenumber", "rating", "device", "hanguptvts", "status"]
    )
    dyf_advisorcall_key = Filter.apply(frame=dyf_advisorcall_key, f=lambda x: x["rating"] in ratings
                                                                              and x["device"] == "3CX"
                                                                              and x["hanguptvts"] == 1
                                                                              and x["status"] == "ANSWER")

    dyf_advisorcall_key = dyf_advisorcall_key.resolveChoice(specs=[("_key", "cast:long")])

    try:
        df_flag_phone_rating = spark.read.parquet("s3://toxd-olap/transaction_log/flag/flag_phone_rating.parquet")
        max_key = df_flag_phone_rating.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_advisorcall_key = Filter.apply(frame=dyf_advisorcall_key, f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_advisorcall_key.count() > 0:

        df_advisorcall_key = dyf_advisorcall_key.toDF()
        flag = df_advisorcall_key.agg({"_key": "max"}).collect()[0][0]
        df_advisorcall = df_advisorcall_key \
            .withColumn("student_behavior_date", f.unix_timestamp(df_advisorcall_key.calldate, "yyyy-MM-dd HH:mm:ss"))
        dyf_advisorcall = DynamicFrame.fromDF(df_advisorcall, glueContext, "dyf_advisorcall")

        dyf_advisorcall = dyf_advisorcall.resolveChoice(specs=[("student_behavior_date", "cast:int")])
        dyf_advisorcall = dyf_advisorcall.select_fields(
            ["_key", "student_behavior_date", "phonenumber", "rating", "device", "hanguptvts", "status"]) \
            .rename_field("rating", "value_rating")

        df_advisorcall = dyf_advisorcall.toDF()


        dyf_student_contact_phone = connectGlue(
            database="tig_advisor",
            table_name="student_contact_phone",
            select_fields=["phone", "contact_id", "user_id"]
        )
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"]
        ).rename_field("contact_id", "contact_id_contact")

        df_student_contact = dyf_student_contact.toDF()
        df_student_contact_phone = dyf_student_contact_phone.toDF()
        df_advisorcall = df_advisorcall.join(df_student_contact_phone,
                                             (df_advisorcall["phonenumber"] == df_student_contact_phone["phone"]))
        df_advisorcall = df_advisorcall.join(df_student_contact,
                                             (df_advisorcall["contact_id"] == df_student_contact["contact_id_contact"]))

        df_advisorcall = df_advisorcall.drop("phonenumber", "phone", "contact_id_contact")

        df_advisorcall = df_advisorcall.withColumn("comment", f.lit("")).withColumn("rating_about", f.lit(None)) \
            .withColumn("rating_type", f.lit("rating_hotline")) \
            .withColumn("number_rating", f.lit(1))

        df_rating_phone = df_advisorcall \
            .withColumn("behavior_id", f.lit(25)) \
            .withColumn("transformed_at", f.lit(d4))

        df_rating_phone = set_package_advisor_level(df_rating_phone)

        convertAndSaveS3(df_rating_phone)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag on S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_phone_rating.parquet", mode="overwrite")
Code example #23
def etl_ktkt():
    dyf_technical_test = connectGlue(database="technical_test",
                                     table_name="student_technical_test",
                                     select_fields=[
                                         "_key", "trinhdohocvien", "studentid",
                                         "thoigianhenktkt", "ketluan"
                                     ],
                                     fillter=["studentid", "thoigianhenktkt"],
                                     duplicates=[
                                         "trinhdohocvien", "studentid",
                                         "thoigianhenktkt", "ketluan"
                                     ])

    dyf_technical_test = dyf_technical_test.resolveChoice(
        specs=[("_key", "cast:long"), ("studentid", "cast:string")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_ktkt.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_technical_test = Filter.apply(frame=dyf_technical_test,
                                          f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_technical_test.count() > 0:
        df_technical_test = dyf_technical_test.toDF()

        flag = df_technical_test.agg({"_key": "max"}).collect()[0][0]

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["contact_id", "student_id", "user_name"],
            fillter=["contact_id", "student_id"],
            duplicates=["contact_id", "student_id"])

        df_student_contact = dyf_student_contact.toDF()

        df_technical_test = df_technical_test.withColumn(
            "date",
            f.unix_timestamp(df_technical_test.thoigianhenktkt,
                             "yyyy-MM-dd HH:mm:ss"))
        dyf_technical_test = DynamicFrame.fromDF(df_technical_test,
                                                 glueContext,
                                                 "dyf_technical_test")
        dyf_technical_test = dyf_technical_test.resolveChoice(
            specs=[("date", "cast:long")])

        df_technical_test = dyf_technical_test.toDF()

        df_technical_test_min = df_technical_test.select(
            "trinhdohocvien", "studentid", "date")
        df_technical_test_min = df_technical_test_min.groupBy(
            "studentid", "trinhdohocvien").agg(
                f.min(
                    df_technical_test_min.date).alias("student_behavior_date"))

        df_join_min = df_student_contact.join(
            df_technical_test_min, df_technical_test_min["studentid"] ==
            df_student_contact["student_id"])
        df_select_min = df_join_min.select(
            "contact_id", "student_id", "student_behavior_date",
            df_join_min.trinhdohocvien.alias("student_level_code"))

        # -----------------------------------------------------------------------------------------------------------------#
        df_technical_test_pass = df_technical_test.where(
            df_technical_test.ketluan == "Pass")

        df_technical_test_pass = df_technical_test_pass.groupBy(
            "studentid", "trinhdohocvien").agg(
                f.min(df_technical_test_pass.date).alias(
                    "student_behavior_date"))

        df_join_pass = df_student_contact.join(
            df_technical_test_pass, df_technical_test_pass["studentid"] ==
            df_student_contact["student_id"])
        df_select_pass = df_join_pass.select(
            "contact_id", "student_id",
            "student_behavior_date",
            df_join_pass.trinhdohocvien.alias("student_level_code"))

        prinDev(df=df_select_pass, df_name="pass")
        prinDev(df=df_select_min, df_name="min")

        df_join_level_code_min = set_package_advisor_level(df_select_min)
        for k, v in ADD_COLLUM_HEN_KTKT.items():
            df_join_level_code_min = df_join_level_code_min.withColumn(k, v)
        prinDev(df_join_level_code_min, "end data min")
        convertAndSaveS3(df_join_level_code_min)

        # ----------------------------------------------------------------------------------------------------------------
        df_join_level_code_pass = set_package_advisor_level(df_select_pass)
        for k, v in ADD_COLLUM_KTKT_TC.items():
            df_join_level_code_pass = df_join_level_code_pass.withColumn(k, v)
        prinDev(df_join_level_code_pass, "end data pass")
        convertAndSaveS3(df_join_level_code_pass)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag in S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_ktkt.parquet",
            mode="overwrite")
コード例 #24
def etl_hoc_vien_duoc_chao_mung():
    dyf_av_care_call = connectGlue(
        database="tig_advisor",
        table_name="care_call",
        select_fields=[
            "_key", "phone", "duration", "call_status", "time_created"
        ],
        fillter=["phone", "duration", "call_status"],
        duplicates=["phone", "call_status", "time_created"],
    ).rename_field("phone", "phone1")

    dyf_av_care_call = Filter.apply(
        frame=dyf_av_care_call,
        f=lambda x: x["call_status"] in
        ("success", "call_success") and x["duration"] > 30)

    dyf_av_care_call = dyf_av_care_call.resolveChoice(specs=[("_key",
                                                              "cast:long")])

    try:
        df_flag_1 = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_chao_mung_hoc_vien.parquet"
        )
        max_key = df_flag_1.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_av_care_call = Filter.apply(frame=dyf_av_care_call,
                                        f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    if dyf_av_care_call.count() > 0:
        df_av_care_call = dyf_av_care_call.toDF()
        flag = df_av_care_call.agg({"_key": "max"}).collect()[0][0]

        dyf_student_contact_phone = connectGlue(
            database="tig_advisor",
            table_name="student_contact_phone",
            select_fields=["contact_id", "phone"],
            fillter=["contact_id", "phone"],
            duplicates=["contact_id", "phone"],
        )

        df_av_care_call = df_av_care_call.withColumn(
            "student_behavior_date",
            f.unix_timestamp(df_av_care_call.time_created, "yyyy-MM-dd HH:mm:ss"))
        dyf_av_care_call = DynamicFrame.fromDF(df_av_care_call, glueContext,
                                               "dyf_av_care_call")

        dyf_av_care_call = dyf_av_care_call.resolveChoice(
            specs=[("student_behavior_date", "cast:int")])
        df_av_care_call = dyf_av_care_call.toDF()

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id",
                     "contact_id"]).rename_field("contact_id",
                                                 "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()

        df_student_contact_phone = dyf_student_contact_phone.toDF()
        df_student_care = df_av_care_call.join(
            df_student_contact_phone,
            (df_av_care_call["phone1"] == df_student_contact_phone["phone"]),
            "left")
        df_student_care = df_student_care.drop("phone1", "phone")
        # -----------------------------------------------------------------------------------------------------------------#
        df_student_care = df_student_care.join(
            df_student_contact, (df_student_care["contact_id"]
                                 == df_student_contact["contact_id_contact"]),
            "left")
        df_student_care = df_student_care.drop("contact_id_contact")

        df_package_code = khoa_hoc()
        df_behavior_join_2 = df_student_care.join(
            df_package_code,
            df_student_care["contact_id"] == df_package_code["crm_id"])
        df_behavior_join_2 = df_behavior_join_2.drop("crm_id")
        df_behavior_join_2 = set_package_advisor_level(df_behavior_join_2)

        for k, v in ADD_COLLUM.items():
            df_behavior_join_2 = df_behavior_join_2.withColumn(k, v)
        prinDev(df_behavior_join_2, "end_code")
        # (student_behavior_date, behavior_id, student_id, user_id, contact_id,
        #  package_code, student_level_code, package_status_code, transformed)

        convertAndSaveS3(df_behavior_join_2)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag in S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_chao_mung_hoc_vien.parquet",
            mode="overwrite")
コード例 #25
def etl_hoc_viec_kich_hoat_goi_hoc():
    dyf_tig_market = connectGlue(
        database="tig_market",
        table_name="tpe_enduser_used_product_history",
        select_fields=["_key", "contact_id", "timecreated", "used_product_id"],
        fillter=["contact_id", "used_product_id"],
        duplicates=["contact_id", "timecreated", "used_product_id"])
    dyf_tig_market = dyf_tig_market.resolveChoice(specs=[("_key",
                                                          "cast:long")])
    try:
        df_flag = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_kich_hoat_tai_khoan_khoa_hoc.parquet"
        )
        max_key = df_flag.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_tig_market = Filter.apply(frame=dyf_tig_market,
                                      f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    if dyf_tig_market.count() > 0:
        df_tig_market = dyf_tig_market.toDF()

        flag = df_tig_market.agg({"_key": "max"}).collect()[0][0]
        print("flag: ", flag)
        df_tig_market = df_tig_market.groupby(
            "used_product_id", "contact_id").agg(
                f.min('timecreated').alias("student_behavior_date"))
        df_tig_market = df_tig_market.drop("used_product_id")

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id",
                     "contact_id"]).rename_field("contact_id",
                                                 "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()

        df_tig_market_contact = df_tig_market.join(
            df_student_contact, df_tig_market["contact_id"] ==
            df_student_contact["contact_id_contact"], "left")
        df_tig_market_contact = df_tig_market_contact.drop(
            "contact_id_contact")
        # -----------------------------------------------------------------------------------------------------------------#

        prinDev(df_tig_market_contact, "df_khoa_hoc_contact")
        # -----------------------------------------------------------------------------------------------------------------#

        # prinDev(df_kich_hoat_goi_hoc,"end data")
        df_join_level_code = set_package_advisor_level(df_tig_market_contact)
        prinDev(df_join_level_code, "end data")
        for k, v in ADD_COLLUM.items():
            df_join_level_code = df_join_level_code.withColumn(k, v)
        convertAndSaveS3(df_join_level_code)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag in S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_kich_hoat_tai_khoan_khoa_hoc.parquet",
            mode="overwrite")
コード例 #26
def etl_home_work(df_student_level=None, df_student_package=None, df_student_advisor=None):
    dyf_mdl_le_exam_attemp = connectGlue(database="home_work_basic_production", table_name="mdl_le_exam_attemp",
                                         select_fields=["_key", "user_id", "created_at", "le_attemp_id", 'le_id',
                                                        'point'],
                                         )

    dyf_mdl_le_exam_attemp = Filter.apply(
        frame=dyf_mdl_le_exam_attemp,
        f=lambda x: x["user_id"] is not None and x["user_id"] != ""
                    and x["le_attemp_id"] is not None and x["le_attemp_id"] != ""
                    and x["created_at"] is not None and x["created_at"] != ""
    )

    dyf_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.resolveChoice(specs=[("_key", "cast:long")])

    if is_dev:
        print('dyf_mdl_le_exam_attemp')
        dyf_mdl_le_exam_attemp.printSchema()
        dyf_mdl_le_exam_attemp.show(3)

    if not is_load_full:
        dyf_mdl_le_exam_attemp = filter_flag(dyf_mdl_le_exam_attemp, FLAG_HW_FILE)

    number_dyf_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.count()
    if number_dyf_mdl_le_exam_attemp < 1:
        return

    df_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.toDF()

    flag = df_mdl_le_exam_attemp.agg({"_key": "max"}).collect()[0][0]

    df_mdl_le_exam_attemp = df_mdl_le_exam_attemp.drop_duplicates(["user_id", "created_at"])

    df_mdl_le_exam_attemp = df_mdl_le_exam_attemp \
        .select('user_id',
                'le_attemp_id',
                'le_id',
                'point',
                f.unix_timestamp(df_mdl_le_exam_attemp.created_at, "yyyy-MM-dd HH:mm:ss").cast("long").alias(
                    "created_at")
                )

    if is_dev:
        print('df_mdl_le_exam_attemp')
        df_mdl_le_exam_attemp.printSchema()
        df_mdl_le_exam_attemp.show(3)

    # ------------------------------------------------------------------------------------------------------------------#
    dyf_mdl_le_attemp = connectGlue(database="home_work_basic_production", table_name="mdl_le_attemp",
                                    select_fields=["id", "created_at"],
                                    )
    dyf_mdl_le_attemp = Filter.apply(
        frame=dyf_mdl_le_attemp,
        f=lambda x: x["id"] is not None and x["id"] != ""
                    and x["created_at"] is not None and x["created_at"] != "")

    df_mdl_le_attemp = dyf_mdl_le_attemp.toDF()
    df_mdl_le_attemp = df_mdl_le_attemp.drop_duplicates(["id"])
    df_mdl_le_attemp = df_mdl_le_attemp.select("id", f.unix_timestamp(df_mdl_le_attemp.created_at,
                                                                      "yyyy-MM-dd HH:mm:ss").cast("long")
                                               .alias("created_at_le"))

    # ------------------------------------------------------------------------------------------------------------------#

    df_mdl_le_exam_attemp_detail = df_mdl_le_exam_attemp \
        .join(other=df_mdl_le_attemp,
              on=df_mdl_le_exam_attemp.le_attemp_id == df_mdl_le_attemp.id,
              how='inner'
              )

    df_mdl_le_exam_attemp_detail = df_mdl_le_exam_attemp_detail \
        .select(
        'user_id',
        'le_attemp_id',
        'le_id',
        'created_at',
        'point',
        (df_mdl_le_exam_attemp_detail.created_at - df_mdl_le_exam_attemp_detail.created_at_le).alias(
            'learning_duration'),
        f.from_unixtime(f.col('created_at'), "yyyy-MM-dd").cast("string").alias("learning_date")
    )

    if is_dev:
        print('df_mdl_le_exam_attemp_detail')
        df_mdl_le_exam_attemp_detail.printSchema()
        df_mdl_le_exam_attemp_detail.show(3)

    # ----------------get learning turn by le_id

    # ("student_id", "string", "student_id", "long"),
    # ("contact_id", "string", "contact_id", "string"),
    #
    # ("class_type", "string", "class_type", "string"),
    # ("learning_date", "string", "learning_date", "string"),
    # ("total", "long", "total_learing", "long"),
    # ("total_duration", "long", "total_duration", "long"),
    #
    # ("year_month_id", "string", "year_month_id", "string"),
    # ("transformed_at", "string", "transformed_at", "long")]

    df_mdl_le_exam_attemp_detail = df_mdl_le_exam_attemp_detail \
        .orderBy(f.asc('user_id'), f.asc('le_id'), f.asc('learning_date'), f.asc('created_at'))

    df_mdl_le_exam_attemp_learning_turn = df_mdl_le_exam_attemp_detail \
        .groupBy('user_id', 'le_id', 'learning_date') \
        .agg(
        f.first('created_at').alias('created_at'),
        f.first('learning_duration').alias('learning_duration'),
        f.first('point').alias('point')
    )

    df_mdl_le_exam_attemp_learning_turn = df_mdl_le_exam_attemp_learning_turn \
        .select("user_id",
                "created_at",
                'learning_duration',
                'point',
                'le_id',
                'learning_date',
                f.from_unixtime(f.col('created_at'), "yyyy-MM-dd HH:mm:ss").cast("string").alias("learning_date_time")
                )

    df_mdl_le_exam_attemp_learning_turn_success = df_mdl_le_exam_attemp_learning_turn \
        .filter(f.col('learning_duration') >= 600)

    df_learning_turn_success_total = df_mdl_le_exam_attemp_learning_turn_success \
        .groupby("user_id", "learning_date") \
        .agg(f.count("created_at").cast('long').alias("total"),
             f.sum('learning_duration').cast('long').alias('total_duration')
             )

    if is_dev:
        print('df_learning_turn_success_total')
        df_learning_turn_success_total.printSchema()
        df_learning_turn_success_total.show(3)

    dyf_mdl_user = connectGlue(database="home_work_basic_production", table_name="mdl_user",
                               select_fields=["id", "username"],
                               ).rename_field("username", "email")
    dyf_mdl_user = Filter.apply(
        frame=dyf_mdl_user,
        f=lambda x: x["id"] is not None and x["id"] != ""
                    and x["email"] is not None and x["email"] != "")

    df_mdl_user = dyf_mdl_user.toDF()
    df_mdl_user = df_mdl_user.drop_duplicates(["id"])

    dyf_student_contact = connectGlue(
        database="tig_advisor",
        table_name="student_contact",
        select_fields=["student_id", "contact_id", "user_name"],
    )

    # ------------------------------------------------------------------------------------------------------------------#

    dyf_student_contact = Filter.apply(
        frame=dyf_student_contact,
        f=lambda x: x["student_id"] is not None and x["student_id"] != ""
                    and x["contact_id"] is not None and x["contact_id"] != ""
                    and x["user_name"] is not None and x["user_name"] != "")
    df_student_contact = dyf_student_contact.toDF()
    df_student_contact = df_student_contact.drop_duplicates(["student_id", "contact_id", "user_name"])

    # ------------------------------------------------------------------------------------------------------------------#

    df_join = df_learning_turn_success_total \
        .join(other=df_mdl_user,
              on=df_mdl_user.id == df_learning_turn_success_total.user_id,
              how='inner') \
        .join(other=df_student_contact,
              on=df_mdl_user.email == df_student_contact.user_name,
              how='inner')

    df_result = df_join \
        .withColumn("transformed_at", f.lit(d4)) \
        .withColumn("class_type", f.lit("HOME_WORK"))

    convertAndSaveS3(df_result)
    flag_data = [flag]
    df = spark.createDataFrame(flag_data, "long").toDF("flag")
    # overwrite the _key flag in S3
    df.write.parquet(FLAG_HW_SAVE, mode="overwrite")
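Note: one caveat in the homework job above: orderBy followed by groupBy().agg(f.first(...)) does not guarantee which row f.first sees after the shuffle, so the "earliest attempt" can vary between runs. A window with row_number is the deterministic alternative; a sketch against the same df_mdl_le_exam_attemp_detail frame:

import pyspark.sql.functions as f
from pyspark.sql.window import Window

# Rank attempts within each (user, lesson, date) group by creation time.
w = Window.partitionBy("user_id", "le_id", "learning_date").orderBy(f.asc("created_at"))

df_first_attempt = (
    df_mdl_le_exam_attemp_detail
    .withColumn("rn", f.row_number().over(w))
    .filter(f.col("rn") == 1)   # keep only the earliest attempt per group
    .drop("rn")
)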