Example #1
def main():
    # ========== init
    glue_context = GlueContext(SparkContext.getOrCreate())

    # ========== retrieve dynamic frame
    df_advisor = retrieve_dynamic_frame(
        glue_context,
        'tig_advisor',
        'advisor_account',
        ['user_id', 'user_name', 'user_email']
    )
    display(df_advisor, "df_advisor")

    df_advisor = df_advisor.withColumnRenamed('user_id', 'advisor_id').withColumnRenamed('user_email', 'email')
    display(df_advisor, "df_advisor renamed")

    dyf_advisor = DynamicFrame.fromDF(
        df_advisor,
        glue_context,
        "dyf_advisor"
    )
    display(dyf_advisor, "dyf_advisor")

    # ========== save dynamic frame to redshift
    save_data_to_redshift(
        glue_context,
        dyf_advisor,
        'student_learning_fact',
        'advisor_dim',
        's3://datashine-dev-redshift-backup/student_learning_fact/advisor_dim',
        'student_learning_dim'
    )
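
The helpers retrieve_dynamic_frame, display and save_data_to_redshift are project-specific and not shown in the example. A minimal sketch of what two of them might look like, assuming they wrap the standard GlueContext catalog reader and Redshift writer (the select_fields projection, the DataFrame return type and the catalog-backed Redshift table are assumptions, not confirmed by the snippet):

def retrieve_dynamic_frame(glue_context, database, table_name, fields=None):
    # Read the table from the Glue Data Catalog as a DynamicFrame.
    dyf = glue_context.create_dynamic_frame.from_catalog(
        database=database,
        table_name=table_name,
        transformation_ctx=f"retrieve_{table_name}")
    if fields:
        # Keep only the requested columns.
        dyf = dyf.select_fields(fields)
    # The caller chains DataFrame methods (withColumnRenamed), so hand back a DataFrame.
    return dyf.toDF()


def save_data_to_redshift(glue_context, dynamic_frame, database, table_name,
                          redshift_tmp_dir, transformation_ctx):
    # Write the DynamicFrame to a catalog table backed by a Redshift connection;
    # the data is staged in the given S3 temp directory before the load.
    glue_context.write_dynamic_frame.from_catalog(
        frame=dynamic_frame,
        database=database,
        table_name=table_name,
        redshift_tmp_dir=redshift_tmp_dir,
        transformation_ctx=transformation_ctx)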
Example #2
def processBatch(data_frame, batchId):
    if data_frame.count() > 0:
        datasource0 = DynamicFrame.fromDF(data_frame, glueContext,
                                          "from_data_frame")
        now = datetime.datetime.now()
        year = now.year
        month = now.month
        day = now.day
        hour = now.hour
        minute = now.minute
        path_datasink1 = (
            f"s3://{args['datalake_bkt_name']}/{args['datalake_bkt_prefix']}"
            f"/ingest_year={year:0>4}/ingest_month={month:0>2}"
            f"/ingest_day={day:0>2}/ingest_hour={hour:0>2}/")
        datasink1 = glueContext.write_dynamic_frame.from_options(
            frame=datasource0,
            connection_type="s3",
            connection_options={"path": path_datasink1},
            format="parquet",
            transformation_ctx="datasink1")
        logger.info(f'{{"batch_process_successful":True}}')
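
processBatch receives one micro-batch at a time; the snippet does not show how it is registered. In an AWS Glue streaming job it is typically handed to GlueContext.forEachBatch, roughly as sketched below (the source DataFrame name and the checkpoint bucket key are assumptions):

glueContext.forEachBatch(
    frame=data_frame_source,  # streaming DataFrame, e.g. from create_data_frame.from_catalog
    batch_function=processBatch,
    options={
        "windowSize": "100 seconds",  # micro-batch trigger interval
        "checkpointLocation": f"s3://{args['checkpoint_bkt_name']}/checkpoint/",
    },
)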
Example #3
def processBatch(data_frame, batchId):
    now = datetime.datetime.now()
    year = now.year
    month = now.month
    day = now.day
    hour = now.hour
    minute = now.minute
    if data_frame.count() > 0:
        dynamic_frame = DynamicFrame.fromDF(data_frame, glueContext,
                                            "from_data_frame")
        apply_mapping = ApplyMapping.apply(
            frame=dynamic_frame,
            mappings=[
                ("ventilatorid", "long", "ventilatorid", "long"),
                ("eventtime", "string", "eventtime", "timestamp"),
                ("serialnumber", "string", "serialnumber", "string"),
                ("pressurecontrol", "long", "pressurecontrol", "long"),
                ("o2stats", "long", "o2stats", "long"),
                ("minutevolume", "long", "minutevolume", "long"),
                ("manufacturer", "string", "manufacturer", "string"),
            ],
            transformation_ctx="apply_mapping",
        )

        dynamic_frame.printSchema()

        # Write to S3 Sink
        s3path = (f"{s3_target}/ingest_year={year:0>4}"
                  f"/ingest_month={month:0>2}"
                  f"/ingest_day={day:0>2}"
                  f"/ingest_hour={hour:0>2}/")
        s3sink = glueContext.write_dynamic_frame.from_options(
            frame=apply_mapping,
            connection_type="s3",
            connection_options={"path": s3path},
            format="parquet",
            transformation_ctx="s3sink",
        )
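
The hand-built ingest_year/.../ingest_hour path produces a Hive-style partition layout. The same layout can be obtained by adding those values as columns and letting the S3 sink partition the output via partitionKeys; a sketch of that variant (the added columns and names are illustrative, not part of the original job):

from pyspark.sql import functions as F

# Add the ingest timestamp parts as string columns so the sink can partition on them.
partitioned_df = data_frame \
    .withColumn("ingest_year", F.lit(f"{year:0>4}")) \
    .withColumn("ingest_month", F.lit(f"{month:0>2}")) \
    .withColumn("ingest_day", F.lit(f"{day:0>2}")) \
    .withColumn("ingest_hour", F.lit(f"{hour:0>2}"))

dyf_partitioned = DynamicFrame.fromDF(partitioned_df, glueContext, "dyf_partitioned")
glueContext.write_dynamic_frame.from_options(
    frame=dyf_partitioned,
    connection_type="s3",
    connection_options={
        "path": s3_target,
        "partitionKeys": ["ingest_year", "ingest_month", "ingest_day", "ingest_hour"],
    },
    format="parquet",
    transformation_ctx="s3sink_partitioned",
)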
Example #4
def main():

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = int(today.strftime("%s"))
    print('today_id: ', today_second)

    # ------------------------------------------------------------------------------------------------------------------#
    start_year_month_id = 201900
    end_year_month_id = today.strftime("%Y%m")

    try:
        df_flag = spark.read.parquet(FLAG_BC200_ADVISOR_FILE)
        display(df_flag, "df_flag")

        start_year_month_id = df_flag.collect()[0]['flag']
    except Exception:
        print('read flag file error')

    if int(start_year_month_id) >= int(end_year_month_id):
        # Data for this period has already been ETLed; nothing to do.
        print('The data was already ETLed for this period:',
              start_year_month_id, end_year_month_id)
        return

    print('start_year_month_id: ', start_year_month_id)
    print('end_year_month_id: ', end_year_month_id)

    # ------------------------------------------------------------------------------------------------------------------#
    push_down_predicate = "( year_month_id >= '" + str(start_year_month_id) + "' " \
                          + " and year_month_id <= '" + str(end_year_month_id) + "') "

    df_student_care_advisor = retrieve_data_frame(
        glue_context,
        database='callcenter',
        table_name='student_care_advisor',
        push_down_predicate=push_down_predicate,
        fields=[
            'idcall', 'student_behavior_date', 'ip_phone', 'student_id',
            'contact_id', 'call_status', 'answer_duration', 'requested_rating',
            'value_rating'
        ])

    if df_student_care_advisor.count() <= 0:
        return

    # -----------------------------------------------------------------------------------------------------------------#
    df_call = calculate_call(df_student_care_advisor)
    df_call.persist(StorageLevel.DISK_ONLY_2)

    df_student_advisor = calculate_advisor(glue_context=glue_context)
    df_student_advisor.persist(StorageLevel.DISK_ONLY_2)

    # -----------------------------------------------------------------------------------------------------------------#
    df_result = df_call.join(df_student_advisor,
                             on=['ip_phone', 'month_id'],
                             how='inner')

    # -----------------------------------------------------------------------------------------------------------------#
    df_result = df_result \
        .withColumn('period_id', f.lit(2)) \
        .withColumnRenamed('month_id', 'time_id')

    # -----------------------------------------------------------------------------------------------------------------#
    df_result = df_result.dropDuplicates()

    # -----------------------------------------------------------------------------------------------------------------#
    dyf_result = DynamicFrame.fromDF(df_result, glue_context, 'test')
    dyf_result = select_student_advisor_fact(dyf_result)
    save_data_to_redshift(glue_context, dyf_result, 'student_native_report',
                          'bc200_advisor.advisor_care_fact', REDSHIFT_TMP_DIR,
                          'advisor_care_fact')

    # -----------------------------------------------------------------------------------------------------------------#
    df_flag = get_flag(spark=spark, data_frame=df_result)
    if df_flag.collect()[0]['flag'] is not None:
        print('save_flag done')
        save_flag(df_flag, FLAG_BC200_ADVISOR_FILE)

    # -----------------------------------------------------------------------------------------------------------------#
    df_call.unpersist()
    df_student_advisor.unpersist()
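
get_flag and save_flag implement a simple high-water mark: the flag parquet stores the largest time id that has been processed, so the next run can skip older data. A minimal sketch of the two helpers under that assumption (only the 'flag' column name comes from the snippet; the rest is illustrative):

from pyspark.sql import functions as f

def get_flag(spark, data_frame):
    # Take the largest time_id just processed as the new high-water mark.
    max_time_id = data_frame.agg(f.max("time_id")).collect()[0][0]
    return spark.createDataFrame([(max_time_id,)], ["flag"])

def save_flag(df_flag, flag_path):
    # Overwrite the flag file so the next run resumes from this point.
    df_flag.write.parquet(flag_path, mode="overwrite")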
Example #5
def etl_gia_han_goi_hoc():
    dyf_ghi_nhan_hoc_phi = connectGlue(
        database="poss",
        table_name="ghinhan_hp",
        select_fields=["_key", "khoa_hoc_makh", "ngay_tao"],
        fillter=["khoa_hoc_makh", "ngay_tao"])

    dyf_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.resolveChoice(
        specs=[("_key", "cast:long")])
    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_ghi_nhan_hoc_phi.parquet"
        )
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key:  ", max_key)
        dyf_ghi_nhan_hoc_phi = Filter.apply(frame=dyf_ghi_nhan_hoc_phi,
                                            f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")
    if dyf_ghi_nhan_hoc_phi.count() > 0:
        df_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.toDF()
        flag = df_ghi_nhan_hoc_phi.agg({"_key": "max"}).collect()[0][0]
        prinDev(df_ghi_nhan_hoc_phi)
        dyf_khoa_hoc_poss = connectGlue(
            database="poss",
            table_name="khoa_hoc",
            select_fields=["makh", "mahv", "goi_sanpham_id"],
            fillter=["makh", "mahv", "goi_sanpham_id"],
            duplicates=["makh", "mahv", "goi_sanpham_id"
                        ]).rename_field("mahv",
                                        "ma_hv").rename_field("makh", "ma_kh")

        df_khoa_hoc_poss = dyf_khoa_hoc_poss.toDF()

        df_ghi_nhan_hoc_phi = df_khoa_hoc_poss.join(
            df_ghi_nhan_hoc_phi, (df_khoa_hoc_poss["ma_kh"]
                                  == df_ghi_nhan_hoc_phi["khoa_hoc_makh"]))

        df_ghi_nhan_hoc_phi = df_ghi_nhan_hoc_phi \
            .withColumn("student_behavior_date", f.unix_timestamp(df_ghi_nhan_hoc_phi.ngay_tao, "yyyy-MM-dd HH:mm:ss"))
        dyf_ghi_nhan_hoc_phi = DynamicFrame.fromDF(df_ghi_nhan_hoc_phi,
                                                   glueContext,
                                                   "dyf_ghi_nhan_hoc_phi")
        dyf_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.resolveChoice(
            specs=[("student_behavior_date", "cast:long")])

        df_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.toDF()
        prinDev(df_ghi_nhan_hoc_phi)
        df_ghi_nhan_hoc_phi = df_ghi_nhan_hoc_phi.drop("khoa_hoc_makh")
        dyf_hoc_vien_poss = connectGlue(database="poss",
                                        table_name="hoc_vien",
                                        select_fields=["mahv", "crm_id"],
                                        fillter=["mahv", "crm_id"],
                                        duplicates=["mahv",
                                                    "crm_id"]).rename_field(
                                                        "crm_id", "contact_id")

        df_hoc_vien_poss = dyf_hoc_vien_poss.toDF()

        df_khoa_hoc_contact = df_ghi_nhan_hoc_phi.join(
            df_hoc_vien_poss,
            (df_ghi_nhan_hoc_phi["ma_hv"] == df_hoc_vien_poss["mahv"]), "left")
        df_khoa_hoc_contact = df_khoa_hoc_contact.drop("mahv")
        if is_dev:
            print "df_khoa_hoc_contact"
            df_khoa_hoc_contact.show(10)
        # -----------------------------------------------------------------------------------------------------------------#
        df_package_code = package_code()

        df_khoa_hoc_contact_package_code = df_khoa_hoc_contact.join(
            df_package_code,
            (df_khoa_hoc_contact["goi_sanpham_id"] == df_package_code["id"]))

        df_khoa_hoc_contact_package_code = df_khoa_hoc_contact_package_code.drop(
            "goi_sanpham_id", "id")

        # -----------------------------------------------------------------------------------------------------------------#
        dyf_test_dauvao_poss = connectGlue(
            database="poss",
            table_name="test_dauvao",
            select_fields=["mahv", "trinhdo_dauvao"],
            duplicates=["mahv", "trinhdo_dauvao"],
            fillter=["mahv", "trinhdo_dauvao"])
        df_test_dauvao_poss = dyf_test_dauvao_poss.toDF()

        df_join_level_code = df_khoa_hoc_contact_package_code.join(
            df_test_dauvao_poss, (df_khoa_hoc_contact_package_code["ma_hv"]
                                  == df_test_dauvao_poss["mahv"]), "left")
        df_join_level_code = df_join_level_code.drop("mahv", "ma_hv")

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id",
                     "contact_id"]).rename_field("contact_id",
                                                 "contact_id_contact")
        df_student_contact = dyf_student_contact.toDF()
        df_join_level_code = df_join_level_code.join(
            df_student_contact, (df_student_contact["contact_id_contact"]
                                 == df_join_level_code["contact_id"]))
        df_join_level_code = df_join_level_code.drop("contact_id_contact")
        df_join_level_code = set_package_advisor_level(df_join_level_code)
        prinDev(df_join_level_code, "end data")
        for k, v in ADD_COLLUM.items():
            df_join_level_code = df_join_level_code.withColumn(k, v)

        convertAndSaveS3(df_join_level_code)

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag file on S3
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_ghi_nhan_hoc_phi.parquet",
            mode="overwrite")