def connectGlue(database="",
                table_name="",
                select_fields=[],
                fillter=[],
                cache=False,
                duplicates=[],
                push_down_predicate=""):
    if database == "" and table_name == "":
        return
    dyf = glueContext.create_dynamic_frame.from_catalog(
        database=database,
        table_name=table_name,
        push_down_predicate=push_down_predicate)

    if is_dev:
        print("full schema of table: ", database, "and table: ", table_name)
        dyf.printSchema()
    #     select felds
    if len(select_fields) != 0:
        dyf = dyf.select_fields(select_fields)

    if len(fillter) != 0:
        dyf = fillterOutNull(dyf, fillter)

    if is_limit and cache:
        df = dyf.toDF()
        df = df.limit(20000)
        df = checkDumplicate(df, duplicates)
        df = df.cache()
        dyf = DynamicFrame.fromDF(df, glueContext, table_name)
    elif is_limit == False and cache:
        df = dyf.toDF()
        df = checkDumplicate(df, duplicates)
        df = df.cache()
        dyf = DynamicFrame.fromDF(df, glueContext, table_name)
    elif is_limit and cache == False:
        df = dyf.toDF()
        df = checkDumplicate(df, duplicates)
        df = df.limit(20000)
        dyf = DynamicFrame.fromDF(df, glueContext, table_name)
    else:
        df = dyf.toDF()
        df = checkDumplicate(df, duplicates)
        dyf = DynamicFrame.fromDF(df, glueContext, table_name)

    if is_dev:
        print("full schema of table: ", database, " and table: ", table_name,
              " after select")
        dyf.printSchema()
    return dyf
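
A minimal usage sketch, assuming the job-level globals (glueContext, is_dev, is_limit) and helpers (fillterOutNull, checkDumplicate) defined elsewhere in these jobs; the field lists mirror the student_contact reads that appear later in this listing:

# Hypothetical call: read two columns from the catalog, drop rows whose
# contact_id is null, de-duplicate on both columns, and cache the result.
dyf_contacts = connectGlue(database="tig_advisor",
                           table_name="student_contact",
                           select_fields=["contact_id", "student_id"],
                           fillter=["contact_id"],
                           duplicates=["contact_id", "student_id"],
                           cache=True)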
def etl_user_profile(dyf_crm_concat):
    print_is_dev('etl_user_profile___****************************************************************************************')
    # ETL for profile
    myUdf = f.udf(myFunc, StringType())

    df_crm_concat_pf = dyf_crm_concat.toDF()
    df_crm_concat_pf = df_crm_concat_pf.groupBy('Id')\
        .agg(f.collect_list('Gender').alias('Gender'),
             f.collect_list('Birthday').alias('Birthday'),
             f.collect_list('Job').alias('Job'))\
        .withColumn('Gender', myUdf('Gender'))\
        .withColumn('Birthday', myUdf('Birthday'))\
        .withColumn('Job', myUdf('Job'))
    dyf_crm_concat_pf = DynamicFrame.fromDF(df_crm_concat_pf, glueContext, 'dyf_crm_contacts')
    dyf_crm_concat_pf = mappingForAll(dyf_crm_concat_pf, profile, profile_add_collum)

    if is_dev:
        print('dyf_crm_concat_pf')
        dyf_crm_concat_pf.printSchema()
        dyf_crm_concat_pf.show(10)
    ########  save to s3######
    #parquetToS3(dyf_crm_concat_pf, path="s3://dtsodin/student_behavior/up/user_profile")
    ############################################################

    # save to redshift

    glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_crm_concat_pf,
                                                   catalog_connection="glue_redshift",
                                                   connection_options={
                                                       "dbtable": "up_user_profile",
                                                       "database": "transaction_log"
                                                   },
                                                   redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/user_profile/up_user_profile",
                                                   transformation_ctx="datasink4")
Example #3
def processBatch(data_frame, batchId):
    if (data_frame.count() > 0):
        dynamic_frame = DynamicFrame.fromDF(data_frame, glueContext,
                                            "from_data_frame")
        apply_mapping = ApplyMapping.apply(frame = dynamic_frame, mappings = [ \
            ("uuid", "string", "uuid", "string"), \
            ("country", "string", "country", "string"), \
            ("itemtype", "string", "itemtype", "string"), \
            ("saleschannel", "string", "saleschannel", "string"), \
            ("orderpriority", "string", "orderpriority", "string"), \
            ("orderdate", "string", "orderdate", "string"), \
            ("region", "string", "region", "string"), \
            ("shipdate", "string", "shipdate", "string"), \
            ("unitssold", "string", "unitssold", "string"), \
            ("unitprice", "string", "unitprice", "string"), \
            ("unitcost", "string", "unitcost", "string"), \
            ("totalrevenue", "string", "totalrevenue", "string"), \
            ("totalcost", "string", "totalcost", "string"), \
            ("totalprofit", "string", "totalprofit", "string")],\
            transformation_ctx = "apply_mapping")

        final_frame = Join.apply(apply_mapping, country_lookup_frame, 'country', 'CountryName').drop_fields( \
                ['CountryName', 'country', 'unitprice', 'unitcost', 'totalrevenue', 'totalcost', 'totalprofit'])

        s3sink = glueContext.write_dynamic_frame.from_options(  frame = final_frame, \
                                                                connection_type = "s3", \
                                                                connection_options = {"path": s3_target}, \
                                                                format = "csv", \
                                                                transformation_ctx = "s3sink")
Example #4
def etl_user_profile(dyf_crm_concat):
    print_is_dev(
        'etl_user_profile___****************************************************************************************'
    )
    # ETL for profile
    myUdf = f.udf(myFunc, StringType())

    df_crm_concat_pf = dyf_crm_concat.toDF()
    df_crm_concat_pf = df_crm_concat_pf.groupBy('Id') \
        .agg(f.collect_list('Gender').alias('Gender'),
             f.collect_list('Birthday').alias('Birthday'),
             f.collect_list('Job').alias('Job')) \
        .withColumn('Gender', myUdf('Gender')) \
        .withColumn('Birthday', myUdf('Birthday')) \
        .withColumn('Job', myUdf('Job'))
    dyf_crm_concat_pf = DynamicFrame.fromDF(df_crm_concat_pf, glueContext,
                                            'dyf_crm_contacts')
    dyf_crm_concat_pf = mappingForAll(dyf_crm_concat_pf, profile,
                                      profile_add_collum)

    dyf_crm_concat_pf.printSchema()
    dyf_crm_concat_pf.show(10)
    ########  save to s3######
    parquetToS3(dyf_crm_concat_pf,
                path="s3://dtsodin/student_behavior/up/user_profile")
Example #5
def convertAndSaveS3(df):
    df = df.withColumn("year_month_id", f.from_unixtime('student_behavior_date', format="yyyyMM"))
    df = df.withColumn('student_behavior_id',
                       f.md5(concaText(
                           df.student_behavior_date,
                           df.behavior_id,
                           df.student_id,
                           df.contact_id,
                           df.package_code,
                           df.package_status_code,
                           df.student_level_code,
                           df.transformed_at)))

    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")

    if is_dev:
        print('dyf____________________________')
        dyf.printSchema()
        dyf.show(10)

    behavior_mapping = mappingForAll(dyf, MAPPING)

    if behavior_mapping.count() > 0:
        parquetToS3(dyf=behavior_mapping, path="s3://toxd-olap/transaction_log/student_behavior/sb_student_behavior")
    rate_mapping = mappingForAll(dyf, RATE_MAPPING)
    if rate_mapping.count() > 0:
        parquetToS3(dyf=rate_mapping, path="s3://toxd-olap/transaction_log/student_behavior/sb_student_rating")
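
concaText feeds f.md5 above but is not shown in this listing; a plausible sketch, assuming it only joins the id-relevant columns into one delimited string column, is:

from pyspark.sql import functions as f

def concaText(*cols):
    # Hypothetical helper: cast each column to string, substitute '' for
    # nulls (so one null does not null the whole hash input), and join
    # everything with '_' for f.md5 to digest.
    return f.concat_ws('_', *[f.coalesce(c.cast('string'), f.lit('')) for c in cols])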
def get_hw_basic():
    dyf_mdl_le_exam_attemp = connectGlue(database="home_work_basic_production", table_name="mdl_le_exam_attemp",
                               select_fields=["user_id", "created_at"],
                               fillter=["user_id"],
                               duplicates=["user_id","created_at"]
                               )

    df_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.toDF()

    df_mdl_le_exam_attemp = df_mdl_le_exam_attemp.withColumn("created_at_min",
                                 f.unix_timestamp(df_mdl_le_exam_attemp.created_at, "yyyy-MM-dd HH:mm:ss"))
    dyf_mdl_le_exam_attemp = DynamicFrame.fromDF(df_mdl_le_exam_attemp, glueContext, "dyf_mdl_le_exam_attemp")

    dyf_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.resolveChoice(specs=[("created_at_min", "cast:long")])
    df_mdl_le_exam_attemp = dyf_mdl_le_exam_attemp.toDF()

    df_mdl_le_exam_attemp = df_mdl_le_exam_attemp.groupby("user_id").agg(
        f.min(df_mdl_le_exam_attemp.created_at_min).alias("student_behavior_date_hw"))

    dyf_mdl_user = connectGlue(database="home_work_basic_production", table_name="mdl_user",
                                         select_fields=["id", "username"],
                                         fillter=["id"],
                                         duplicates=["id", "username"]
                                         ).rename_field("username","email_hw")
    df_mdl_user = dyf_mdl_user.toDF()
    join = df_mdl_user.join(df_mdl_le_exam_attemp, df_mdl_user.id == df_mdl_le_exam_attemp.user_id)
    join = join.drop("id", "user_id")
    return join
Example #7
def convertAndSaveS3(df):
    df = df.dropDuplicates()
    prinDev(df, "befor convert")
    df = df.withColumn(
        "year_month_id",
        f.from_unixtime('student_behavior_date', format="yyyyMM"))
    prinDev(df, "befor mapping")
    df = df.withColumn(
        'student_behavior_id',
        f.md5(
            concaText(df.student_behavior_date, df.behavior_id, df.student_id,
                      df.contact_id, df.package_code, df.package_status_code,
                      df.student_level_code, df.transformed_at)))

    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")

    behavior_mapping = mappingForAll(dyf, MAPPING)

    prinDev(behavior_mapping, "after mapping")
    if behavior_mapping.count() > 0:
        parquetToS3(
            dyf=behavior_mapping,
            path=
            "s3://toxd-olap/transaction_log/student_behavior/sb_student_behavior"
        )
Example #8
def get_voxy():
    dyf_voxy = connectGlue(
        database="voxy",
        table_name="voxy_api",
        select_fields=["email", "time_created", "total_hours_studied"],
        fillter=["email"],
        duplicates=["email",
                    "time_created"]).rename_field("email", "email_voxy")
    dyf_voxy = Filter.apply(frame=dyf_voxy,
                            f=lambda x: x["total_hours_studied"] > 0)
    df_voxy = dyf_voxy.toDF()

    df_voxy = df_voxy.withColumn(
        "time_created_new", f.unix_timestamp(df_voxy.time_created,
                                             "yyyy-MM-dd"))
    dyf_voxy = DynamicFrame.fromDF(df_voxy, glueContext, "dyf_voxy")

    dyf_voxy = dyf_voxy.resolveChoice(specs=[("time_created_new",
                                              "cast:long")])
    df_voxy = dyf_voxy.toDF()
    df_voxy.drop("time_created")
    df_voxy = df_voxy.groupby("email_voxy").agg(
        f.min(df_voxy.time_created_new).alias("student_behavior_date_voxy"))

    return df_voxy
def get_native_talk():
    dyf_native_talk_history_log = connectGlue(database="native_talk", table_name="native_talk_history_log_api",
                                         select_fields=["learning_date", "username","speaking_dialog_score"],
                                         fillter=["username"],
                                         duplicates=["username", "learning_date"]
                                         )
    dyf_native_talk_history_log = Filter.apply(frame=dyf_native_talk_history_log,
                            f=lambda x: x["speaking_dialog_score"] > 0
                            )

    df_native_talk_history_log = dyf_native_talk_history_log.toDF()
    df_native_talk_history_log = df_native_talk_history_log.drop("speaking_dialog_score")

    df_native_talk_history_log = df_native_talk_history_log.withColumn("learning_date_int",
                                                             f.unix_timestamp(df_native_talk_history_log.learning_date,
                                                                              "yyyy-MM-dd"))
    dyf_native_talk_history_log = DynamicFrame.fromDF(df_native_talk_history_log, glueContext, "dyf_native_talk_history_log")

    dyf_native_talk_history_log = dyf_native_talk_history_log.resolveChoice(specs=[("learning_date_int", "cast:long")])
    df_native_talk_history_log = dyf_native_talk_history_log.toDF()
    df_native_talk_history_log = df_native_talk_history_log.groupby("username").agg(
        f.min(df_native_talk_history_log.learning_date_int).alias("student_behavior_date_nt"))

    dyf_native_talk_account_mapping = connectGlue(database="native_talk", table_name="native_talk_account_mapping",
                                                  select_fields=["username", "contact_id"],
                                                  fillter=["username", "contact_id"],
                                                  duplicates=["username", "contact_id"]
                                                  ).rename_field("username", "username_mapping").\
                                                    rename_field("contact_id","contact_id_nt")
    df_native_talk_account_mapping = dyf_native_talk_account_mapping.toDF()
    join = df_native_talk_account_mapping.join(
        df_native_talk_history_log,
        df_native_talk_account_mapping.username_mapping == df_native_talk_history_log.username)
    join = join.drop("username_mapping", "username")
    return join
Example #10
def main():
    # ========== init
    glue_context = GlueContext(SparkContext.getOrCreate())

    # ========== retrieve dynamic frame
    df_advisor = retrieve_dynamic_frame(
        glue_context,
        'tig_advisor',
        'advisor_account',
        ['user_id', 'user_name', 'user_email']
    )
    display(df_advisor, "df_advisor")

    df_advisor = df_advisor.withColumnRenamed('user_id', 'advisor_id').withColumnRenamed('user_email', 'email')
    display(df_advisor, "df_advisor renamed")

    dyf_advisor = DynamicFrame.fromDF(
        df_advisor,
        glue_context,
        "dyf_advisor"
    )
    display(dyf_advisor, "dyf_advisor")

    # ========== save dynamic frame to redshift
    save_data_to_redshift(
        glue_context,
        dyf_advisor,
        'student_learning_fact',
        'advisor_dim',
        's3://datashine-dev-redshift-backup/student_learning_fact/advisor_dim',
        'student_learning_dim'
    )
Example #11
def convertAndSaveS3(df):
    df = df.withColumn("year_month_id",
                       f.from_unixtime(
                           f.unix_timestamp(
                               df.learning_date, "yyyy-MM-dd").cast("long"), "yyyyMM").cast("string"))

    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")

    behavior_mapping = mappingForAll(dyf, MAPPING)

    if behavior_mapping.count() > 0:
        parquetToS3(dyf=behavior_mapping, path="s3://toxd-olap/trasaction_le2/le2_history")
Example #12
def retrieve_dynamic_frame(glue_context, spark, data, header, name, mapping):
    data_frame = spark.createDataFrame(data, header)
    dynamic_frame = DynamicFrame.fromDF(data_frame, glue_context, name)

    display(dynamic_frame, "retrieve" + name)

    apply_dynamic_frame = ApplyMapping.apply(frame=dynamic_frame,
                                             mappings=mapping)
    dyf_result = ResolveChoice.apply(frame=apply_dynamic_frame,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")

    display(dynamic_frame, "apply" + name)

    return dyf_result
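
A minimal usage sketch for this helper, assuming a SparkSession spark and a GlueContext glue_context are already initialized; the sample rows and mapping tuples are placeholders:

data = [("u1", "Alice"), ("u2", "Bob")]
header = ["user_id", "user_name"]
mapping = [("user_id", "string", "user_id", "string"),
           ("user_name", "string", "display_name", "string")]

dyf_users = retrieve_dynamic_frame(glue_context, spark, data, header,
                                   "users", mapping)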
Example #13
def convertAndSaveS3(df):
    df = df.withColumn(
        "year_month_id",
        f.from_unixtime('student_behavior_date', format="yyyyMM"))

    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")

    if dyf.count() > 0:
        if is_dev:
            print('dyf____________________________')
            dyf.printSchema()
            dyf.show(10)

        behavior_mapping = mappingForAll(dyf, MAPPING)
        parquetToS3(dyf=behavior_mapping,
                    path="s3://toxd-olap/transaction_log/student_care_advisor")
def mappingForAll(dynamicFrame, mapping, add_collum):
    df_communication = dynamicFrame.toDF()
    df_communication = df_communication.dropDuplicates()

    for k, v in add_collum.items():
        df_communication = df_communication.withColumn(k, v)

    dyf_communication = DynamicFrame.fromDF(df_communication, glueContext, 'dyf_communication')
    applymapping2 = ApplyMapping.apply(frame=dyf_communication,
                                       mappings=mapping)

    resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dyf_communication_after_mapping = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping
def mappingForAll(dynamicFrame, mapping):
    df = dynamicFrame.toDF()
    df = df.dropDuplicates()
    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    print("-------------------------------------------------")
    dyf.printSchema()
    print(mapping)

    applymapping2 = ApplyMapping.apply(frame=dyf, mappings=mapping)

    resolvechoice2 = ResolveChoice.apply(frame=applymapping2,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dyf_mapping = DropNullFields.apply(frame=resolvechoice2,
                                       transformation_ctx="dropnullfields2")
    return dyf_mapping
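
For reference, the mapping argument in both variants is a list of ApplyMapping 4-tuples (source field, source type, target field, target type), and add_collum in the three-argument variant maps new column names to Column expressions. The values below are illustrative only; the real profile and profile_add_collum used earlier are not shown in this listing:

from pyspark.sql import functions as f

# Hypothetical shapes for the arguments passed to mappingForAll above.
profile = [("Id", "string", "user_id", "string"),
           ("Gender", "string", "gender", "string"),
           ("Birthday", "string", "birthday", "string"),
           ("Job", "string", "job", "string")]

profile_add_collum = {"transformed_at": f.lit(d4)}  # d4: job-level timestamp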
def process_paintings(paintings: DynamicFrame) -> DynamicFrame:
  # Cast all "bit" fields (LongTypes) into booleans
  # It's easier to use a list of non-bit fields as the majority of fields imported are bit fields
  non_bit_fields = ["episode", "title"]
  bit_fields_specs = [
      (field.name, "cast:boolean")
      for field in paintings.schema()
      if field.name not in non_bit_fields and field.dataType.typeName() == 'long'
  ]
  if not bit_fields_specs:
    return paintings
  paintings_with_bool_fields = ResolveChoice.apply(paintings,
                                                   specs = bit_fields_specs,
                                                   transformation_ctx = "paintings_with_bool_fields")
  paintings_with_parsed_episodes = Map.apply(frame = paintings_with_bool_fields,
                                             f = parse_episode,
                                             transformation_ctx = "paintings_with_parsed_episodes")
  return paintings_with_parsed_episodes
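
parse_episode is not included in this listing; a hypothetical sketch, assuming it derives season and episode numbers from an "S01E02"-style episode code on each record, could be:

import re

def parse_episode(record):
    # Hypothetical Map function: Glue passes each record as a dict.
    # Extract season/episode numbers when the code matches, otherwise
    # leave the record untouched.
    match = re.match(r"S(\d+)E(\d+)", record.get("episode") or "")
    if match:
        record["season"] = int(match.group(1))
        record["episode_number"] = int(match.group(2))
    return record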
def etl_advisor():
    dyf_advisor_account = connectGlue(database="tig_advisor",
                                      table_name="advisor_account",
                                      select_fields=[
                                          "user_id", "user_name",
                                          "user_display_name", "user_email",
                                          "level", "type_manager"
                                      ],
                                      fillter=["user_id", "type_manager"])

    dyf_advisor_account.show(10)
    # ------------------------------------------------------------------------------------------------------------------#
    number_advisor_account = dyf_advisor_account.count()

    if is_dev:
        print("number_advisor_account: ", number_advisor_account)
    if number_advisor_account < 1:
        return
    print_is_dev(
        "etl_advisor_profile___****************************************************************************************"
    )
    # ETL for profile
    myUdf = f.udf(myFunc, StringType())

    df_advisor_account = dyf_advisor_account.toDF()
    df_advisor_account = df_advisor_account.groupBy("user_id") \
        .agg(f.collect_list("user_name").alias("user_name"),
             f.collect_list("user_email").alias("user_email"),
             f.first("level").alias("level"),
             f.first("type_manager").alias("type_manager"),
             f.collect_list("user_display_name").alias("user_display_name")) \
        .withColumn("user_name", myUdf("user_name")) \
        .withColumn("user_email", myUdf("user_email")) \
        .withColumn("user_display_name", myUdf("user_display_name"))
    dyf_advisor_account = DynamicFrame.fromDF(df_advisor_account, glueContext,
                                              "dyf_advisor_account")
    dyf_advisor_account_mapping = mappingForAll(dyf_advisor_account,
                                                mapping=ADVISOR_MAPPING)

    dyf_advisor_account_mapping.printSchema()
    dyf_advisor_account_mapping.show(10)
Example #18
def processBatch(data_frame, batchId):
    if (data_frame.count() > 0):
        datasource0 = DynamicFrame.fromDF(data_frame, glueContext,
                                          "from_data_frame")
        now = datetime.datetime.now()
        year = now.year
        month = now.month
        day = now.day
        hour = now.hour
        minute = now.minute
        path_datasink1 = f"s3://{args['datalake_bkt_name']}/{args['datalake_bkt_prefix']}" + "/ingest_year=" + "{:0>4}".format(
            str(year)) + "/ingest_month=" + "{:0>2}".format(
                str(month)) + "/ingest_day=" + "{:0>2}".format(
                    str(day)) + "/ingest_hour=" + "{:0>2}".format(
                        str(hour)) + "/"
        datasink1 = glueContext.write_dynamic_frame.from_options(
            frame=datasource0,
            connection_type="s3",
            connection_options={"path": path_datasink1},
            format="parquet",
            transformation_ctx="datasink1")
        logger.info(f'{{"batch_process_successful":True}}')
Example #19
def processBatch(data_frame, batchId):
    now = datetime.datetime.now()
    year = now.year
    month = now.month
    day = now.day
    hour = now.hour
    minute = now.minute
    if data_frame.count() > 0:
        dynamic_frame = DynamicFrame.fromDF(data_frame, glueContext,
                                            "from_data_frame")
        apply_mapping = ApplyMapping.apply(
            frame=dynamic_frame,
            mappings=[
                ("ventilatorid", "long", "ventilatorid", "long"),
                ("eventtime", "string", "eventtime", "timestamp"),
                ("serialnumber", "string", "serialnumber", "string"),
                ("pressurecontrol", "long", "pressurecontrol", "long"),
                ("o2stats", "long", "o2stats", "long"),
                ("minutevolume", "long", "minutevolume", "long"),
                ("manufacturer", "string", "manufacturer", "string"),
            ],
            transformation_ctx="apply_mapping",
        )

        dynamic_frame.printSchema()

        # Write to S3 Sink
        s3path = (s3_target + "/ingest_year=" + "{:0>4}".format(str(year)) +
                  "/ingest_month=" + "{:0>2}".format(str(month)) +
                  "/ingest_day=" + "{:0>2}".format(str(day)) +
                  "/ingest_hour=" + "{:0>2}".format(str(hour)) + "/")
        s3sink = glueContext.write_dynamic_frame.from_options(
            frame=apply_mapping,
            connection_type="s3",
            connection_options={"path": s3path},
            format="parquet",
            transformation_ctx="s3sink",
        )
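
Neither processBatch listing shows how the function is wired up; a sketch of the usual Glue streaming pattern, assuming a streaming DynamicFrame sourceData read from a Kinesis or Kafka catalog table and a checkpoint directory of your choosing:

glueContext.forEachBatch(
    frame=sourceData,
    batch_function=processBatch,
    options={
        "windowSize": "100 seconds",
        "checkpointLocation": args["TempDir"] + "/checkpoint/",
    })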
Example #20
def etl_ktkt():
    dyf_technical_test = connectGlue(database="technical_test",
                                     table_name="student_technical_test",
                                     select_fields=[
                                         "_key", "trinhdohocvien", "studentid",
                                         "thoigianhenktkt", "ketluan"
                                     ],
                                     fillter=["studentid", "thoigianhenktkt"],
                                     duplicates=[
                                         "trinhdohocvien", "studentid",
                                         "thoigianhenktkt", "ketluan"
                                     ])

    dyf_technical_test = dyf_technical_test.resolveChoice(
        specs=[("_key", "cast:long"), ("studentid", "cast:string")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_ktkt.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_technical_test = Filter.apply(frame=dyf_technical_test,
                                          f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_technical_test.count() > 0:
        df_technical_test = dyf_technical_test.toDF()

        flag = df_technical_test.agg({"_key": "max"}).collect()[0][0]

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["contact_id", "student_id", "user_name"],
            fillter=["contact_id", "student_id"],
            duplicates=["contact_id", "student_id"])

        df_student_contact = dyf_student_contact.toDF()

        df_technical_test = df_technical_test.withColumn(
            "date",
            f.unix_timestamp(df_technical_test.thoigianhenktkt,
                             "yyyy-MM-dd HH:mm:ss"))
        dyf_technical_test = DynamicFrame.fromDF(df_technical_test,
                                                 glueContext,
                                                 "dyf_technical_test")
        dyf_technical_test = dyf_technical_test.resolveChoice(
            specs=[("date", "cast:long")])

        df_technical_test = dyf_technical_test.toDF()

        df_technical_test_min = df_technical_test.select(
            "trinhdohocvien", "studentid", "date")
        df_technical_test_min = df_technical_test_min.groupBy(
            "studentid", "trinhdohocvien").agg(
                f.min(
                    df_technical_test_min.date).alias("student_behavior_date"))

        df_join_min = df_student_contact.join(
            df_technical_test_min, df_technical_test_min["studentid"] ==
            df_student_contact["student_id"])
        df_select_min = df_join_min.select(
            "contact_id", "student_id", "student_behavior_date",
            df_join_min.trinhdohocvien.alias("student_level_code"))

        # -----------------------------------------------------------------------------------------------------------------#
        df_technical_test_pass = df_technical_test.where(
            df_technical_test.ketluan == "Pass")

        df_technical_test_pass = df_technical_test_pass.groupBy(
            "studentid", "trinhdohocvien").agg(
                f.min(df_technical_test_pass.date).alias(
                    "student_behavior_date"))

        df_join_pass = df_student_contact.join(
            df_technical_test_pass, df_technical_test_pass["studentid"] ==
            df_student_contact["student_id"])
        df_select_pass = df_join_pass.select(
            "contact_id", "student_id", "student_behavior_date",
            df_join_pass.trinhdohocvien.alias("student_level_code"))

        prinDev(df=df_select_pass, df_name="pass")
        prinDev(df=df_select_pass, df_name="min")

        df_join_level_code_min = set_package_advisor_level(df_select_min)
        for k, v in ADD_COLLUM_HEN_KTKT.items():
            df_join_level_code_min = df_join_level_code_min.withColumn(k, v)
        prinDev(df_join_level_code_min, "end data min")
        convertAndSaveS3(df_join_level_code_min)

        # ----------------------------------------------------------------------------------------------------------------
        df_join_level_code_pass = set_package_advisor_level(df_select_pass)
        for k, v in ADD_COLLUM_KTKT_TC.items():
            df_join_level_code_pass = df_join_level_code_pass.withColumn(k, v)
        prinDev(df_join_level_code_pass, "end data pass")
        convertAndSaveS3(df_join_level_code_pass)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag in S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_ktkt.parquet",
            mode="overwrite")
Example #21
def smile_care_rating():
    dyf_smile_care_key = connectGlue(database="native_smile",
                                 table_name="ticket_log_5450ed3d8cb5a34974310b6b26e451fa",
                                 select_fields=["_key", "requester_email", "satisfaction", "satisfaction_at","created_at"],
                                 fillter=['requester_email']
                                 ).rename_field("satisfaction", "value_rating")

    dyf_smile_care_key = Filter.apply(frame=dyf_smile_care_key, f=lambda x: x["value_rating"] in satisfaction)
    dyf_smile_care_key = dyf_smile_care_key.resolveChoice(specs=[("_key", "cast:long")])
    try:
        flag_smile_care = spark.read.parquet("s3://toxd-olap/transaction_log/flag/flag_smile_care_rating.parquet")
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_smile_care_key = Filter.apply(frame=dyf_smile_care_key, f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_smile_care_key.count() > 0:
        df_smile_care_key = dyf_smile_care_key.toDF()
        flag = df_smile_care_key.agg({"_key": "max"}).collect()[0][0]
        df_smile_care = df_smile_care_key \
            .withColumn("student_behavior_date", f.unix_timestamp(df_smile_care_key.created_at, "yyyy-MM-dd HH:mm:ss"))
        dyf_smile_care = DynamicFrame.fromDF(df_smile_care, glueContext, "dyf_smile_care")

        dyf_smile_care = dyf_smile_care.resolveChoice(specs=[("student_behavior_date", "cast:int")])
        dyf_smile_care = dyf_smile_care.select_fields(
            ["_key", "requester_email", "value_rating", "satisfaction_at", "student_behavior_date"])
        df_smile_care = dyf_smile_care.toDF()

        dyf_student_contact_email = connectGlue(
            database="tig_advisor",
            table_name="student_contact_email",
            select_fields=["email", "contact_id", "user_id"]
        )
        dyf_student_contact_ = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"]
        ).rename_field("contact_id", "contact_id_contact")
        df_student_contact = dyf_student_contact_.toDF()
        df_student_contact_email = dyf_student_contact_email.toDF()

        df_smile_care = df_smile_care.join(df_student_contact_email,
                                           (df_smile_care["requester_email"] == df_student_contact_email["email"]))
        df_smile_care = df_smile_care.join(df_student_contact,
                                           (df_smile_care["contact_id"] == df_student_contact["contact_id_contact"]))

        df_smile_care = df_smile_care.drop("email", "requester_email", "contact_id_contact")

        df_smile_care = df_smile_care.withColumn("rating_type", f.lit("rating_native_smile_caresoft")) \
            .withColumn("comment", f.lit("")) \
            .withColumn("rating_about", f.lit(None)) \
            .withColumn("number_rating", f.lit(1)) \
            .withColumn("behavior_id", f.lit(26)) \
            .withColumn("transformed_at", f.lit(d4))

        df_smile_care = set_package_advisor_level(df_smile_care)

        convertAndSaveS3(df_smile_care)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag in S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_smile_care_rating.parquet", mode="overwrite")
Example #22
def phone_rating():
    dyf_advisorcall_key = connectGlue(
        database="callcenter",
        table_name="advisorcall",
        select_fields=["_key", "calldate", "phonenumber", "rating", "device", "hanguptvts", "status"]
    )
    dyf_advisorcall_key = Filter.apply(frame=dyf_advisorcall_key, f=lambda x: x["rating"] in ratings
                                                                              and x["device"] == "3CX"
                                                                              and x["hanguptvts"] == 1
                                                                              and x["status"] == "ANSWER")

    dyf_advisorcall_key = dyf_advisorcall_key.resolveChoice(specs=[("_key", "cast:long")])

    try:
        df_flag_phone_rating = spark.read.parquet("s3://toxd-olap/transaction_log/flag/flag_phone_rating.parquet")
        max_key = df_flag_phone_rating.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_advisorcall_key = Filter.apply(frame=dyf_advisorcall_key, f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_advisorcall_key.count() > 0:

        df_advisorcall_key = dyf_advisorcall_key.toDF()
        flag = df_advisorcall_key.agg({"_key": "max"}).collect()[0][0]
        df_advisorcall = df_advisorcall_key \
            .withColumn("student_behavior_date", f.unix_timestamp(df_advisorcall_key.calldate, "yyyy-MM-dd HH:mm:ss"))
        dyf_advisorcall = DynamicFrame.fromDF(df_advisorcall, glueContext, "dyf_advisorcall")

        dyf_advisorcall = dyf_advisorcall.resolveChoice(specs=[("student_behavior_date", "cast:int")])
        dyf_advisorcall = dyf_advisorcall.select_fields(
            ["_key", "student_behavior_date", "phonenumber", "rating", "device", "hanguptvts", "status"]) \
            .rename_field("rating", "value_rating")

        df_advisorcall = dyf_advisorcall.toDF()


        dyf_student_contact_phone = connectGlue(
            database="tig_advisor",
            table_name="student_contact_phone",
            select_fields=["phone", "contact_id", "user_id"]
        )
        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"]
        ).rename_field("contact_id", "contact_id_contact")

        df_student_contact = dyf_student_contact.toDF()
        df_student_contact_phone = dyf_student_contact_phone.toDF()
        df_advisorcall = df_advisorcall.join(df_student_contact_phone,
                                             (df_advisorcall["phonenumber"] == df_student_contact_phone["phone"]))
        df_advisorcall = df_advisorcall.join(df_student_contact,
                                             (df_advisorcall["contact_id"] == df_student_contact["contact_id_contact"]))

        df_advisorcall = df_advisorcall.drop("phonenumber", "phone", "contact_id_contact")

        df_advisorcall = df_advisorcall.withColumn("comment", f.lit("")).withColumn("rating_about", f.lit(None)) \
            .withColumn("rating_type", f.lit("rating_hotline")) \
            .withColumn("number_rating", f.lit(1))

        df_rating_phone = df_advisorcall \
            .withColumn("behavior_id", f.lit(25)) \
            .withColumn("transformed_at", f.lit(d4))

        df_rating_phone = set_package_advisor_level(df_rating_phone)

        convertAndSaveS3(df_rating_phone)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag in S3
        df.write.parquet("s3a://toxd-olap/transaction_log/flag/flag_phone_rating.parquet", mode="overwrite")
Example #23
def etl_hoc_vien_duoc_chao_mung():
    dyf_av_care_call = connectGlue(
        database="tig_advisor",
        table_name="care_call",
        select_fields=[
            "_key", "phone", "duration", "call_status", "time_created"
        ],
        fillter=["phone", "duration", "call_status"],
        duplicates=["phone", "call_status", "time_created"],
    ).rename_field("phone", "phone1")

    dyf_av_care_call = Filter.apply(
        frame=dyf_av_care_call,
        f=lambda x: x["call_status"] in
        ("success", "call_success") and x["duration"] > 30)

    dyf_av_care_call = dyf_av_care_call.resolveChoice(specs=[("_key",
                                                              "cast:long")])

    try:
        df_flag_1 = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_chao_mung_hoc_vien.parquet"
        )
        max_key = df_flag_1.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_av_care_call = Filter.apply(frame=dyf_av_care_call,
                                        f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    if dyf_av_care_call.count() > 0:
        df_av_care_call = dyf_av_care_call.toDF()
        flag = df_av_care_call.agg({"_key": "max"}).collect()[0][0]

        dyf_student_contact_phone = connectGlue(
            database="tig_advisor",
            table_name="student_contact_phone",
            select_fields=["contact_id", "phone"],
            fillter=["contact_id", "phone"],
            duplicates=["contact_id", "phone"],
        )

        df_av_care_call = df_av_care_call \
            .withColumn("student_behavior_date",
                        f.unix_timestamp(df_av_care_call.time_created, "yyyy-MM-dd HH:mm:ss"))
        dyf_av_care_call = DynamicFrame.fromDF(df_av_care_call, glueContext,
                                               "dyf_av_care_call")

        dyf_av_care_call = dyf_av_care_call.resolveChoice(
            specs=[("student_behavior_date", "cast:int")])
        df_av_care_call = dyf_av_care_call.toDF()

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id",
                     "contact_id"]).rename_field("contact_id",
                                                 "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()

        df_student_contact_phone = dyf_student_contact_phone.toDF()
        df_student_care = df_av_care_call.join(
            df_student_contact_phone,
            (df_av_care_call["phone1"] == df_student_contact_phone["phone"]),
            "left")
        df_student_care = df_student_care.drop("phone1", "phone")
        # -----------------------------------------------------------------------------------------------------------------#

        # -----------------------------------------------------------------------------------------------------------------#
        df_student_care = df_student_care.join(
            df_student_contact, (df_student_care["contact_id"]
                                 == df_student_contact["contact_id_contact"]),
            "left")
        df_student_care = df_student_care.drop("contact_id_contact")

        df_package_code = khoa_hoc()
        df_behavior_join_2 = df_student_care.join(
            df_package_code,
            df_student_care["contact_id"] == df_package_code["crm_id"])
        df_behavior_join_2 = df_behavior_join_2.drop("crm_ id")
        df_behavior_join_2 = set_package_advisor_level(df_behavior_join_2)

        for k, v in ADD_COLLUM.items():
            df_behavior_join_2 = df_behavior_join_2.withColumn(k, v)
        prinDev(df_behavior_join_2, "end_code")
        # (student_behavior_date, behavior_id, student_id, user_id, contact_id,
        #  package_code, student_level_code, package_status_code, transformed)

        convertAndSaveS3(df_behavior_join_2)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag in S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_chao_mung_hoc_vien.parquet",
            mode="overwrite")
Example #24
def display_dynamic_frame_schema(dynamic_frame: DynamicFrame):
    dynamic_frame.printSchema()
Example #25
def display_dynamic_frame_data(dynamic_frame: DynamicFrame):
    dynamic_frame.show()
def main():
    # ----------test------------------

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_id = int(today.strftime("%Y%m%d"))
    print('today_id: ', today_id)

    start_date = today - timedelta(5)
    start_date_focus = int(start_date.strftime("%Y%m%d"))
    end_date_focus = today_id

    df_student_contact = get_df_student_contact()
    df_student_contact.cache()
    if is_dev:
        print('df_student_contact')
        df_student_contact.printSchema()
        df_student_contact.show(3)

    if is_checking:
        df_student_contact_test = df_student_contact.filter(
            f.col('contact_id') == checking_contact_id)
        print('is_checking::df_student_contact_test')
        df_student_contact_test.show(30)

    df_collect_lssclt = etl_collect_lssclt(start_date_focus, end_date_focus,
                                           df_student_contact)
    if is_dev:
        df_collect_lssclt.cache()
        print('df_collect_lssclt')
        df_collect_lssclt.printSchema()
        df_collect_lssclt.show(3)

    if is_checking:
        df_collect_lssclt_test = df_collect_lssclt.filter(
            (f.col('contact_id') == checking_contact_id)
            & (f.col('date_id') == checking_date_id))
        print('is_checking::df_collect_lssclt_test')
        df_collect_lssclt_test.show(30)

    df_collect_ai_stater = etl_ai_stater(start_date_focus, end_date_focus,
                                         df_student_contact)

    if is_dev:
        df_collect_ai_stater.cache()
        print('df_collect_ai_stater')
        df_collect_ai_stater.printSchema()
        df_collect_ai_stater.show(3)

    if is_checking:
        df_collect_ai_stater_test = df_collect_ai_stater.filter(
            (f.col('contact_id') == checking_contact_id)
            & (f.col('date_id') == checking_date_id))
        print('is_checking::df_collect_ai_stater_test')
        df_collect_ai_stater_test.show(30)

    df_le2_history = etl_collet_le2(start_date_focus, end_date_focus)

    if is_checking:
        df_le2_history_test = df_le2_history.filter(
            (f.col('contact_id') == checking_contact_id)
            & (f.col('date_id') == checking_date_id))
        print('is_checking::df_le2_history_test')
        df_le2_history_test.show(30)

    if is_dev:
        print('df_le2_history')
        df_le2_history.printSchema()
        df_le2_history.show(3)

    df_le_total = df_collect_lssclt\
        .join(other=df_collect_ai_stater,
              on=['contact_id', 'date_id'],
              how='outer')\
        .join(other=df_le2_history,
              on=['contact_id', 'date_id'],
              how='outer')\
        .join(df_student_contact,
              on=['contact_id'],
              how='inner')

    df_le_total = df_le_total.withColumn("transformed_at", f.lit(d4))
    df_le_total = df_le_total.fillna(0)

    if is_checking:
        df_le_total_test = df_le_total.filter(
            (f.col('contact_id') == checking_contact_id)
            & (f.col('date_id') == checking_date_id))
        print('is_checking::df_le_total_test')
        df_le_total_test.show(30)

    if is_dev:
        print('df_le_total')
        df_le_total.printSchema()
        df_le_total.show(3)

    dyf = DynamicFrame.fromDF(df_le_total, glueContext, "dyf")

    preactions = "DELETE student_detail.student_behavior_aggregation_day_v1 where date_id >= " + str(
        start_date_focus)

    glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dyf,
        catalog_connection="glue_redshift",
        connection_options={
            "preactions": preactions,
            "dbtable": "student_detail.student_behavior_aggregation_day_v1",
            "database": "student_native_report"
        },
        redshift_tmp_dir=
        "s3n://dts-odin/temp/bc200/student_detail/student_behavior_aggregation_day_v1",
        transformation_ctx="datasink4")
    df_student_contact.unpersist()
    if is_dev:
        df_collect_lssclt.unpersist()
        df_collect_ai_stater.unpersist()
        df_le2_history.unpersist()
def etl_student_advisor():
    print_is_dev(
        "etl_advisor_profile___****************************************************************************************"
    )
    # ETL for profile
    # dyf_student_advisor = connectGlue(database="tig_advisor", table_name="log_change_assignment_advisor",
    #                                   select_fields=["id", "contact_id", "advisor_id_old", "advisor_id_new", "created_at", "updated_at"],
    #                                   fillter=["id"],
    #                                   duplicates=["id"])

    # ------------------------------------------------------------------------------------------------------------------#
    df_student_advisor = get_dyf_student_advisor(glueContext)

    number_student_advisor = df_student_advisor.count()
    if is_dev:
        print("number_advisor_account: ", number_student_advisor)
        df_student_advisor.printSchema()
        df_student_advisor.show(3)

    if number_student_advisor < 1:
        return

    # ------------------------------------------------------------------------------------------------------------------#
    # process contact_id with advisor from log_change_assignment_advisor and student_contact

    df_contact_id_list = df_student_advisor.select('contact_id')

    df_contact_id_list = df_contact_id_list.dropDuplicates(['contact_id'])

    df_change_advisor_new = get_df_change_advisor_new(df_student_advisor)
    #df_change_advisor_init = get_df_change_advisor_init(df_student_advisor)

    #df_advisor_from_student_contact = get_advisor_from_student_contact(glueContext, df_contact_id_list)

    df_change_advisor_total = df_change_advisor_new
    # .union(df_change_advisor_init)
    #.union(df_advisor_from_student_contact)

    if is_dev:
        print('df_change_advisor_total')
        df_change_advisor_total.printSchema()
        df_change_advisor_total.show(3)
    #-------------------------------------------------------------------------------------------------------------------#


    df_change_advisor_total = df_change_advisor_total\
        .withColumn('created_at',
                    f.unix_timestamp('created_at', format='yyyy-MM-dd HH:mm:ss'))

    df_change_advisor_total = df_change_advisor_total.orderBy(
        "contact_id", "created_at")

    if is_dev:
        print('df_change_advisor_total_after_order')
        df_change_advisor_total.printSchema()
        df_change_advisor_total.show(3)

    df_change_advisor_total = df_change_advisor_total.select(
        "contact_id",
        f.struct('advisor_id', 'created_at').alias('end_start'))

    print('df_change_advisor_total')
    df_change_advisor_total.printSchema()
    df_change_advisor_total.show(3)

    df_change_advisor_group_contact = df_change_advisor_total.groupBy("contact_id") \
        .agg(f.collect_list("end_start").alias("l_created_at"))

    print('df_change_advisor_group_contact_______________')
    df_change_advisor_group_contact.printSchema()
    df_change_advisor_group_contact.show(3)

    df_change_advisor_group_contact = df_change_advisor_group_contact\
        .withColumn("l_created_at", transpose_start_end_udf("l_created_at"))

    df_change_advisor_group_contact.printSchema()
    df_change_advisor_explore = df_change_advisor_group_contact\
        .select("contact_id",
        f.explode("l_created_at").alias("start_end_transpose")
    )

    df_change_advisor_explore.printSchema()
    df_change_advisor_explore = df_change_advisor_explore.select(
        "contact_id",
        f.col("start_end_transpose").getItem("advisor_id").alias("advisor_id"),
        f.col("start_end_transpose").getItem("start").alias("start"),
        f.col("start_end_transpose").getItem("end").alias("end"))
    df_change_advisor_explore.printSchema()
    df_change_advisor_explore = addCollum(df_change_advisor_explore,
                                          STUDENT_ADVISOR_ADD_COL)
    dyf_change_advisor_explore = DynamicFrame.fromDF(
        df_change_advisor_explore, glueContext, "dyf_change_advisor_explore")
    # dyf_change_advisor_explore = mappingForAll(dyf_change_advisor_explore, mapping = STUDENT_ADVISOR_MAPPING)

    print('dyf_change_advisor_explore')
    dyf_change_advisor_explore.printSchema()
    dyf_change_advisor_explore.show(10)

    ########  save to s3######
    # parquetToS3(dyf_student_advisor_mapping, path="s3://dtsodin/student_behavior/advisor_thangvm/advisor_history")
    ############################################################
    # ------------------------------------------------------------------------------------------------------------------#

    # | -- student_advisor_id: string
    # | -- contact_id: string
    # | -- advisor_id: string
    # | -- start_date: string
    # | -- end_date: string
    # | -- created_at: int
    # | -- updated_at: string

    apply_output = ApplyMapping.apply(
        frame=dyf_change_advisor_explore,
        mappings=[("contact_id", "string", "contact_id", "string"),
                  ("advisor_id", "string", "advisor_id", "long"),
                  ("start", "string", "start_date", "long"),
                  ("end", "string", "end_date", "long"),
                  ("created_at", "int", "created_at", "long")])
    dyf_output = ResolveChoice.apply(frame=apply_output,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")

    glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dyf_output,
        catalog_connection="glue_redshift",
        connection_options={
            # "preactions": """TRUNCATE TABLE ad_student_advisor """,
            "dbtable": "ad_student_advisor",
            "database": "transaction_log"
        },
        redshift_tmp_dir=
        "s3n://datashine-dev-redshift-backup/translation_log/user_advisor/ad_student_package",
        transformation_ctx="datasink4")
def etl_user_communication(dyf_crm_concats):
    print_is_dev('etl_user_user_communication___****************************************************************************')
    # Cache all data (extra data)
    # ----------------------------------------------------------------------------------------------------------------#
    dyf_adv_phones = connectGlue(database='tig_advisor', table_name='student_contact_phone',
                                  select_fields=["contact_id", "phone","deleted"],
                                  fillter=['contact_id', 'phone'])
    dyf_adv_phones = RenameField.apply(dyf_adv_phones, "deleted", "is_deleted")
    count_adv_phones = dyf_adv_phones.count()
    if count_adv_phones < 1:
        return
    # ----------------------------------------------------------------------------------------------------------------#
    dyf_adv_advisor = connectGlue(database='tig_advisor', table_name='student_contact',
                                  select_fields=["contact_id", "student_id"],
                                  fillter=['contact_id', 'student_id'])
    count_adv_advisor = dyf_adv_advisor.count()
    if count_adv_advisor < 1:
        return
    # ----------------------------------------------------------------------------------------------------------------#
    dyf_toa_enterprise = connectGlue(database='dm_toa', table_name='toa_mapping_student_enterprise',
                                  select_fields=["lms_id", "enterprise_id"],
                                  fillter=['lms_id', 'enterprise_id'])

    count_toa_enterprise = dyf_toa_enterprise.count()
    if count_toa_enterprise < 1:
        return

    dyf_toa_enterprise.printSchema()
    # --------------------------------------------------------------------------------------------------------------#
    # Start Transform

    print_is_dev("id and code----------------------------------------------------")
    print_is_dev(dyf_crm_concats.count())

    df_crm_concats = dyf_crm_concats.toDF()
    df_concat_enterprise_clear = df_crm_concats.dropDuplicates(['Id', 'Code'])
    dyf_concat_enterprise_clear = DynamicFrame.fromDF(df_concat_enterprise_clear, glueContext, 'dyf_concat_enterprise_clear')

    print_is_dev(dyf_concat_enterprise_clear.count())

    dyf_toa_enterprise = dyf_toa_enterprise.resolveChoice(specs=[('enterprise_id', 'cast:string')])
    dyf_jon_contact_enterprise = Join.apply(dyf_concat_enterprise_clear, dyf_adv_advisor, 'Code', 'contact_id')
    dyf_jon_contact_enterprise = Join.apply(dyf_toa_enterprise, dyf_jon_contact_enterprise, 'lms_id', 'student_id')
    dyf_jon_contact_enterprise = RenameField.apply(dyf_jon_contact_enterprise, "enterprise_id", "comunication")
    comunication_add_collum['communication_type'] = f.lit(8)
    dyf_concat_enterprise = mappingForAll(dyf_jon_contact_enterprise, comunication, comunication_add_collum)

    if is_dev:
        print('dyf_concat_enterprise')
        dyf_concat_enterprise.printSchema()
        dyf_concat_enterprise.show(10)
    ########  save to s3######
    # parquetToS3(dyf_concat_enterprise, path="s3://dtsodin/student_behavior/up/user_comunication")

    save_communication_redshift(dyf_concat_enterprise)
    ##########################################

    # ETL for phone comunication
    dyf_jon_contact_phones = Join.apply(dyf_concat_enterprise_clear, dyf_adv_phones, 'Code', 'contact_id')
    dyf_jon_contact_phones = RenameField.apply(dyf_jon_contact_phones, "phone", "comunication")
    comunication_add_collum['communication_type'] = f.lit(1)
    dyf_concat_phone = mappingForAll(dyf_jon_contact_phones, comunication, comunication_add_collum_for_phone)

    if is_dev:
        print('dyf_concat_phone')
        dyf_concat_phone.printSchema()
        dyf_concat_phone.show(10)

    ########  save to s3######
    #parquetToS3(dyf_concat_phone, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_phone)
    ##########################################


    # ETL for username comunication
    dyf_crm_concat_name = fillterOutNull(dyf_crm_concats, ['Fullname'])
    df_crm_concat_name = dyf_crm_concat_name.toDF()
    df_concat_name_clear = df_crm_concat_name.dropDuplicates(['Id', 'Code', 'Fullname'])
    dyf_concat_name_clear = DynamicFrame.fromDF(df_concat_name_clear, glueContext, 'dyf_concat_name_clear')
    dyf_crm_concat_name = RenameField.apply(dyf_concat_name_clear, "Fullname", "comunication")
    # set comunication type
    comunication_add_collum['communication_type'] = f.lit(4)

    dyf_concat_name = mappingForAll(dyf_crm_concat_name, comunication, comunication_add_collum)


    if is_dev:
        print('dyf_concat_name')
        dyf_concat_name.printSchema()
        dyf_concat_name.show(10)

    ########  save to s3######
    #parquetToS3(dyf_concat_name, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_name)
    ##########################################

    # ETL for email comunication

    dyf_crm_concat_emails = fillterOutNull(dyf_crm_concats, ['Email'])
    df_crm_concat_emails = dyf_crm_concat_emails.toDF()
    df_crm_concat_emails = df_crm_concat_emails.dropDuplicates(['Id', 'Code', 'Email'])
    dyf_crm_concat_emails = DynamicFrame.fromDF(df_crm_concat_emails, glueContext, 'dyf_crm_concat_emails')
    dyf_crm_concat_email = RenameField.apply(dyf_crm_concat_emails, "Email", "comunication")

    comunication_add_collum['communication_type'] = f.lit(2)
    dyf_concat_email = mappingForAll(dyf_crm_concat_email, comunication, comunication_add_collum)

    if is_dev:
        print('dyf_concat_email')
        dyf_concat_email.printSchema()
        dyf_concat_email.show(10)
    ########  save to s3######
    #parquetToS3(dyf_concat_email, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_email)
    ##########################################


    # ETL for email2 comunication
    dyf_crm_concat_email_2 = fillterOutNull(dyf_crm_concats, ['Email2'])
    df_crm_concat_email_2 = dyf_crm_concat_email_2.toDF()
    df_crm_concat_email_2 = df_crm_concat_email_2.dropDuplicates(['Id', 'Code', 'Email2'])
    dyf_crm_concat_email_2 = DynamicFrame.fromDF(df_crm_concat_email_2, glueContext, 'dyf_crm_concat_email_2')

    # This frame was filtered and de-duplicated on Email2, so rename Email2 (not Email)
    dyf_crm_concat_email_2 = RenameField.apply(dyf_crm_concat_email_2, "Email2", "comunication")
    comunication_add_collum['is_primary'] = f.lit(0)

    dyf_concat_email2 = mappingForAll(dyf_crm_concat_email_2, comunication, comunication_add_collum)

    if is_dev:
        print('dyf_concat_email2')
        dyf_concat_email2.printSchema()
        dyf_concat_email2.show(10)
    ########  save to s3######
    #parquetToS3(dyf_concat_email2, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_email2)
    ##########################################


    # ETL for address communication
    dyf_crm_concat_address = fillterOutNull(dyf_crm_concats, ['Address'])
    df_crm_concat_address = dyf_crm_concat_address.toDF()
    df_crm_concat_address = df_crm_concat_address.dropDuplicates(['Id', 'Code', 'Address'])
    dyf_crm_concat_address = DynamicFrame.fromDF(df_crm_concat_address, glueContext, 'dyf_crm_concat_address')

    dyf_crm_concat_address = RenameField.apply(dyf_crm_concat_address, "Address", "comunication")
    comunication_add_collum['is_primary'] = f.lit(1)
    comunication_add_collum['communication_type'] = f.lit(6)
    dyf_concat_address = mappingForAll(dyf_crm_concat_address, comunication, comunication_add_collum)

    if is_dev:
        print('dyf_concat_address')
        dyf_concat_address.printSchema()
        dyf_concat_address.show(10)

    ########  save to s3######
    #parquetToS3(dyf_concat_address, path="s3://dtsodin/student_behavior/up/user_comunication")
    save_communication_redshift(dyf_concat_address)
def etl_user_map(dyf_crm_concats):
    print_is_dev('etl_user_map___*******************************************************************************************')

    # ETL for mapper
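    # one mapper row per contact: dedupe on the (Id, Code) pair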
    df_crm_concats = dyf_crm_concats.toDF()
    df_crm_concat_clear = df_crm_concats.dropDuplicates(['Id', 'Code'])
    dyf_crm_concat_clear = DynamicFrame.fromDF(df_crm_concat_clear, glueContext, 'dyf_crm_concat_clear')

    dyf_crm_concat_map = RenameField.apply(dyf_crm_concat_clear, "Code", "source_id")
    dyf_crm_concat_map = mappingForAll(dyf_crm_concat_map, mapper, mapper_add_collum)

    if is_dev:
        print('dyf_crm_concat_map')
        dyf_crm_concat_map.printSchema()
        dyf_crm_concat_map.show(10)

    ########  save to s3######
    #parquetToS3(dyf_crm_concat_map,path="s3://dtsodin/student_behavior/up/user_mapper")

    glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_crm_concat_map,
                                                   catalog_connection="glue_redshift",
                                                   connection_options={
                                                       "dbtable": "up_user_map",
                                                       "database": "transaction_log"
                                                   },
                                                   redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/user_profile/up_user_map",
                                                   transformation_ctx="datasink4")

    ############################################################

    print_is_dev('etl_user_map___********************************************************************************************')
    # ---------------------------------------------------------------------------------------------------------------#
    dyf_adv_student = connectGlue(database='tig_advisor', table_name='student_contact',
                                   select_fields=["contact_id", "student_id"],
                                   fillter=['contact_id', 'student_id'])
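    # nothing to map: skip the student_id mapping entirely when the source is empty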
    count_adv_student = dyf_adv_student.count()
    if count_adv_student < 1:
        return

    dyf_adv_student.show(10)
    # -----------------------------------------------------------------------------------------------------------------#
    # Start Tranform

    # ETL for student_id-based mapper rows
    dyf_crm_concat_clear = DynamicFrame.fromDF(df_crm_concat_clear, glueContext, 'dyf_crm_concat_clear')

    dyf_join_contact_student = Join.apply(dyf_adv_student, dyf_crm_concat_clear, 'contact_id', 'Code')
    dyf_adv_student = RenameField.apply(dyf_join_contact_student, "student_id", "source_id")
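    # source_type = 2 tags these rows as student_id-based mappings, distinguishing
    # them from the Code-based mapper rows written above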
    mapper_add_collum['source_type'] = f.lit(2)
    dyf_adv_student = mappingForAll(dyf_adv_student, mapper, mapper_add_collum)
    dyf_adv_student.show(10)
    ########  save to s3######
    #parquetToS3(dyf_adv_student, path="s3://dtsodin/student_behavior/up/user_mapper")
    ############################################################
    glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_adv_student,
                                                   catalog_connection="glue_redshift",
                                                   connection_options={
                                                       "dbtable": "up_user_map",
                                                       "database": "transaction_log"
                                                   },
                                                   redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/user_profile/up_user_map",
                                                   transformation_ctx="datasink4")
Beispiel #30
def main():

    dyf_advisorcall_key = connectGlue(
        database="callcenter",
        table_name="advisorcall",
        select_fields=[
            "_key", "calldate", "idcall", "rating", "device", "hanguphv",
            "totaltime", "answertime"
        ],
        duplicates=[
            "calldate", "idcall", "rating", "device", "hanguphv", "totaltime",
            "answertime"
        ],
        fillter=["idcall"]).rename_field("rating", "value_rating")

    dyf_advisorcall_key = dyf_advisorcall_key.resolveChoice(
        specs=[("_key", "cast:long")])

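    # incremental-load flag: the parquet file stores the max _key handled by the
    # previous run, so only newer rows are processed; on the first run the read
    # fails and the whole table is taken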
    try:
        df_flag_phone_rating = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_phone_care_answertime.parquet"
        )
        max_key = df_flag_phone_rating.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_advisorcall_key = Filter.apply(frame=dyf_advisorcall_key,
                                           f=lambda x: x["_key"] > max_key)
    except Exception:
        # no flag yet (e.g. first run) or unreadable flag file: process all rows
        print("read flag file error")
    count = dyf_advisorcall_key.count()
    print("advisorcall rows to process: ", count)
    if count > 0:
        df_advisorcall_key = dyf_advisorcall_key.toDF()
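        # parse the calldate string into epoch seconds; resolveChoice below
        # casts the result to int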
        df_advisorcall = df_advisorcall_key \
            .withColumn("student_behavior_date", f.unix_timestamp(df_advisorcall_key.calldate, "yyyy-MM-dd HH:mm:ss"))
        dyf_advisorcall = DynamicFrame.fromDF(df_advisorcall, glueContext,
                                              "dyf_advisorcall")

        dyf_advisorcall = dyf_advisorcall.resolveChoice(
            specs=[("student_behavior_date", "cast:int")])

        df_advisorcall = dyf_advisorcall.toDF()

        dyf_cdr = connectGlue(
            database="callcenter",
            table_name="cdr",
            select_fields=["ip_phone", "call_id", "student_phone", "status"],
            fillter=["call_id"])
        df_cdr = dyf_cdr.toDF()
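        # right join: keep every cdr row, even calls without an advisorcall match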
        df_advisorcall = df_advisorcall.join(
            df_cdr, (df_advisorcall["idcall"] == df_cdr["call_id"]), "right")

        dyf_student_contact_phone = connectGlue(
            database="tig_advisor",
            table_name="student_contact_phone",
            select_fields=["phone", "contact_id"],
            fillter=["phone", "contact_id"],
            duplicates=["phone", "contact_id"])
        df_student_contact_phone = dyf_student_contact_phone.toDF()

        df_advisorcall = df_advisorcall.join(
            df_student_contact_phone, (df_advisorcall["student_phone"]
                                       == df_student_contact_phone["phone"]))

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            fillter=["student_id", "contact_id"],
            duplicates=["student_id",
                        "contact_id"]).rename_field("contact_id",
                                                    "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()

        df_advisorcall = df_advisorcall.join(
            df_student_contact, (df_advisorcall["contact_id"]
                                 == df_student_contact["contact_id_contact"]))

        df_advisorcall = df_advisorcall.drop("contact_id_contact", "phone",
                                             "call_id")

        df_rating_phone = df_advisorcall.withColumn("transformed_at",
                                                    f.lit(d4))

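        # remember the max _key of this batch; it becomes the next run's flag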
        flag = df_rating_phone.agg({"_key": "max"}).collect()[0][0]
        prinDev(df_rating_phone)

        convertAndSaveS3(df_rating_phone)
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the max _key flag in S3 for the next run
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_phone_care_answertime.parquet",
            mode="overwrite")