def transform_df_to_catalog_import_schema(sql_context, glue_context, df_databases, df_tables, df_partitions):
    df_databases_array = df_databases.select(df_databases['type'], array(df_databases['item']).alias('items'))
    df_tables_array = df_tables.select(df_tables['type'], df_tables['database'],
                                       array(df_tables['item']).alias('items'))
    df_partitions_array_batched = batch_metastore_partitions(sql_context=sql_context, df_parts=df_partitions)
    dyf_databases = DynamicFrame.fromDF(
        dataframe=df_databases_array, glue_ctx=glue_context, name='dyf_databases')
    dyf_tables = DynamicFrame.fromDF(
        dataframe=df_tables_array, glue_ctx=glue_context, name='dyf_tables')
    dyf_partitions = DynamicFrame.fromDF(
        dataframe=df_partitions_array_batched, glue_ctx=glue_context, name='dyf_partitions')
    return dyf_databases, dyf_tables, dyf_partitions
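The select above wraps each `item` record into a single-element `items` array, which is the nested shape the catalog import format expects. A minimal, self-contained sketch of that transform in isolation (the miniature input rows are hypothetical):

from pyspark.sql import SparkSession
from pyspark.sql.functions import array

spark = SparkSession.builder.getOrCreate()

# Hypothetical miniature input: one row per database, with a 'type' tag and an 'item' payload.
df = spark.createDataFrame([("DATABASE", "db1")], ["type", "item"])
df_items = df.select(df["type"], array(df["item"]).alias("items"))
df_items.printSchema()  # 'items' is now an array column (array<struct> for real metastore rows)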
Example #2
def write_df_to_catalog(data_frame, entity_type, glue_context, options):
    # Check whether the data frame is empty. DataFrame exposes no "empty" method, so rdd.isEmpty() is the closest we get.
    if data_frame.rdd.isEmpty():
        return # nothing to do
    database_name = options['catalog.database']
    nested_data_frame = nest_data_frame(data_frame, database_name, entity_type)
    dynamic_frame = DynamicFrame.fromDF(nested_data_frame, glue_context, entity_type)
    sink = glue_context.getSink('catalog', **options)
    sink.write(dynamic_frame)
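A note on the emptiness check above: `rdd.isEmpty()` works, but converting to an RDD can be relatively expensive. A commonly used alternative that stays on the DataFrame API is to fetch at most one row; a minimal sketch:

def data_frame_is_empty(data_frame):
    # head(1) returns a list of at most one Row; an empty list means no rows.
    return len(data_frame.head(1)) == 0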
Example #3
glueContext = GlueContext(sparkContext)
sparkSession = glueContext.spark_session

glueJob = Job(glueContext)
glueJob.init(args['JOB_NAME'], args)

collections_input = "COLLECTIONS_REPLACE"
collections = collections_input.split(",")
dfs = []

# Loop over each collection, read it, and append the resulting DynamicFrame to the dfs list
for collection in collections:
    source_df = sparkSession.read.format("jdbc").option(
        "url", jdbc_url).option("dbtable", collection).option(
            "driver", "cdata.jdbc.mongodb.MongoDBDriver").load()
    dynamic_dframe = DynamicFrame.fromDF(source_df, glueContext,
                                         "dynamic_df_{}".format(collection))
    dfs.append({"dynamic_frame": dynamic_dframe, "collection": collection})

# Write each DynamicFrame to S3
for df in dfs:
    glueContext.write_dynamic_frame.from_options(
        frame=df["dynamic_frame"],
        connection_type="s3",
        connection_options={
            "path": "TARGET_BUCKET{}".format(df["collection"])
        },
        format="csv",
        transformation_ctx="datasink4")

glueJob.commit()
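COLLECTIONS_REPLACE and TARGET_BUCKET above are template placeholders that are presumably substituted into the script before it is deployed. A hedged sketch of such a deployment-time substitution; the file names and values are hypothetical:

# Hypothetical deployment step: fill the placeholders in the job template.
with open("glue_job_template.py") as f:
    script = f.read()

script = script.replace("COLLECTIONS_REPLACE", "customers,orders,invoices") \
               .replace("TARGET_BUCKET", "s3://example-bucket/mongodb-export/")

with open("glue_job.py", "w") as f:
    f.write(script)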
Example #4
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # date_now = datetime.now()
    # preday = date_now + timedelta(days=-1)
    # d1 = preday.strftime("%Y%m%d")
    # print("d1 =", d1)
    #
    # now = datetime.now()  # current date and time
    # year = now.strftime("%Y%m%d")
    # print("year:", year)

    dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="mapping_lo_student_history"
    )
    print('Count:', dyf_mapping_lo_student_history.count())
    # Keep records from the previous day and records where at least one score is non-zero
    # dyf_mapping_lo_student_history = Filter.apply(frame=dyf_mapping_lo_student_history, f=lambda x: x['date_id'] is not None)
    dyf_mapping_lo_student_history = Filter.apply(frame=dyf_mapping_lo_student_history,
                                                  f=lambda x: x['date_id'] is not None and
                                                              (x['knowledge'] != 0 or x['comprehension'] != 0 or x[
                                                                  'application'] != 0 or x['analysis'] != 0 or x[
                                                                   'synthesis'] != 0 or x['evaluation'] != 0))
    if dyf_mapping_lo_student_history.count() > 0:
        print('START JOB---------------')
        df_mapping_lo_student_history = dyf_mapping_lo_student_history.toDF()
        df_mapping_lo_student_history = df_mapping_lo_student_history.groupby('date_id', 'student_id',
                                                                              'learning_object_id').agg(
            f.sum("knowledge").alias("knowledge"),
            f.sum("comprehension").alias("comprehension"), f.sum("application").alias("application"),
            f.sum("analysis").alias("analysis"), f.sum("synthesis").alias("synthesis"),
            f.sum("evaluation").alias("evaluation"))
        df_mapping_lo_student_history.printSchema()
        df_mapping_lo_student_history.show()
        print('END JOB---------------')

        dyf_mapping_lo_student_used = DynamicFrame.fromDF(df_mapping_lo_student_history, glueContext,
                                                          "dyf_student_lo_init")
        # print('COUNT:', dyf_student_lo_init.count())
        # dyf_student_lo_init.printSchema()
        # dyf_student_lo_init.show()

        dyf_mapping_lo_student_used = ApplyMapping.apply(frame=dyf_mapping_lo_student_used,
                                                         mappings=[("student_id", "long", "student_id", "long"),
                                                                   ("learning_object_id", "long", "learning_object_id",
                                                                    "long"),
                                                                   ("date_id", "int", "date_id", "long"),
                                                                   ("knowledge", 'long', 'knowledge', 'long'),
                                                                   ("comprehension", 'long', 'comprehension', 'long'),
                                                                   ("application", 'long', 'application', 'long'),
                                                                   ("analysis", 'long', 'analysis', 'long'),
                                                                   ("synthesis", 'long', 'synthesis', 'long'),
                                                                   ("evaluation", 'long', 'evaluation', 'long')])
        dyf_mapping_lo_student_used = ResolveChoice.apply(frame=dyf_mapping_lo_student_used, choice="make_cols",
                                                          transformation_ctx="resolvechoice2")
        dyf_mapping_lo_student_used = DropNullFields.apply(frame=dyf_mapping_lo_student_used,
                                                           transformation_ctx="dyf_mapping_lo_student_used")
        datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_mapping_lo_student_used,
                                                                   catalog_connection="glue_redshift",
                                                                   connection_options={
                                                                       "dbtable": "mapping_lo_student_used",
                                                                       "database": "dts_odin",
                                                                       "postactions": """ call proc_insert_tbhv();
                                                                       INSERT INTO mapping_lo_student_history SELECT * FROM mapping_lo_student_used;
                                                                       DROP TABLE IF EXISTS mapping_lo_student_used """
                                                                   },
                                                                   redshift_tmp_dir="s3n://dts-odin/temp1/dyf_student_lo_init",
                                                                   transformation_ctx="datasink5")
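The write above relies on a common Redshift loading idiom: write into a temporary staging table, then use `postactions` SQL to merge the rows into the permanent table and drop the staging table after the load. A hedged sketch of that idiom as a reusable helper; the table names and the temp directory are placeholders:

def write_via_staging_table(glue_context, dyf, staging_table, target_table, extra_sql=""):
    # Load into a staging table, then copy its rows into the target table and drop it.
    postactions = """
        INSERT INTO {target} SELECT * FROM {staging};
        DROP TABLE IF EXISTS {staging};
        {extra}
    """.format(target=target_table, staging=staging_table, extra=extra_sql)
    glue_context.write_dynamic_frame.from_jdbc_conf(
        frame=dyf,
        catalog_connection="glue_redshift",  # connection name taken from the example above
        connection_options={"dbtable": staging_table,
                            "database": "dts_odin",
                            "postactions": postactions},
        redshift_tmp_dir="s3://example-bucket/temp/")  # hypothetical temp dir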
Example #5
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # Daily TBHV E
    # LO_TYPE: 1: Vocabulary; 2: Pronunciation; 3: Listening; 4: Grammar
    # Custom function
    def get_length(array_str):
        json_obj = json.loads(array_str)
        # index = 0;
        # for item in json_obj:
        #     index += 1
        length = 0
        if json_obj is not None:
            length = len(json_obj)
        return length

    udf_get_length = udf(get_length, IntegerType())

    arr_aip_tu_vung = ['3', '4', '5', '17']
    arr_aip_ngu_phap = ['6', '7', '8', '9', '18']
    arr_aip_ngu_am = ['16']
    arr_aip_nghe = ['10', '11', '12', '13', '14', '15']

    arr_knowledge = ['3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18']
    arr_comprehension = ['8', '9', '14']

    def do_add_lo_type(code):
        lo_type = -1
        code = str(code)
        for x in arr_aip_tu_vung:
            if x == code:
                lo_type = 1
        for x in arr_aip_ngu_am:
            if x == code:
                lo_type = 2
        for x in arr_aip_nghe:
            if x == code:
                lo_type = 3
        for x in arr_aip_ngu_phap:
            if x == code:
                lo_type = 4
        return lo_type

    add_lo_type = udf(do_add_lo_type, IntegerType())

    def do_add_score_aip(code, type, lo_type, correct_answer, student_answer):
        code = str(code)
        score = 0
        arr = []
        # pronunciation score
        if lo_type == 2 and correct_answer == student_answer:
            score = 2
        if lo_type == 2 and correct_answer != student_answer:
            score = -1
        # other cases: scores that are not pronunciation
        if lo_type != 2 and correct_answer == student_answer:
            score = 10
        if lo_type != 2 and correct_answer != student_answer:
            score = -5

        if type == 'knowledge':
            arr = arr_knowledge
        for x in arr:
            if x == code:
                return score

        return 0

    add_score_aip = udf(do_add_score_aip, IntegerType())

    def do_add_score_micro(code, type, lo_type, total_step, count_step):
        code = str(code)
        score = 0
        arr = []
        percent_success = 0.7
        if count_step / total_step >= percent_success:
            score = 10
        else:
            score = -5
        if type == 'knowledge':
            arr = arr_knowledge
        if type == 'comprehension':
            arr = arr_comprehension
        for x in arr:
            if x == code:
                return score
        return 0

    add_score_micro = udf(do_add_score_micro, IntegerType())

    def do_add_score_ait(total_step, max_step, received_point, length_answer):
        score = 0
        if total_step == max_step:
            if length_answer <= 3 and received_point >= 3:
                score = 30
            if length_answer <= 3 and received_point <= 2:
                score = 10
            if length_answer >= 4 and received_point <= 2:
                score = -15
        return score

    add_score_ait = udf(do_add_score_ait, IntegerType())

    ########## dyf_ai_study_step
    dyf_ai_study_step = glueContext.create_dynamic_frame.from_catalog(
        database="moodlestarter",
        table_name="ai_study_step"
    )
    dyf_ai_study_step = dyf_ai_study_step.select_fields(
        ['_key', 'user_id', 'lesson_id', 'tag', 'current_step', 'total_step', 'learning_object', 'learning_object_type',
         'correct_answer', 'student_answer', 'student_answer_details', 'max_point', 'received_point', 'created_at',
         'page_style', 'session_id'])


    try:
        # read the checkpoint flag from S3
        df_flag = spark.read.parquet("s3://dts-odin/flag/flag_ai_study_step.parquet")
        max_key = df_flag.collect()[0]['flag']
        print('read from index: ', max_key)

        # compare the datasource _key with the flag and keep only rows with key > flag
        # dyf_ai_study_step = Filter.apply(frame=dyf_ai_study_step, f=lambda x: x['_key'] > max_key)
    except:
        print('read flag error ')

    if dyf_ai_study_step.count() > 0:
        try:
            ## Process records with tag 'aip'
            dyf_aip = Filter.apply(frame=dyf_ai_study_step,
                                   f=lambda x: x['tag'] == 'aip')

            df_aip = dyf_aip.toDF()

            def random_code():
                return random.randint(1, 16)

            add_code = udf(random_code, IntegerType())
            df_aip = df_aip.withColumn("code", add_code())
            df_aip.printSchema()
            df_aip = df_aip.withColumn("lo_type", add_lo_type(df_aip.code))
            df_aip = df_aip.withColumn("knowledge", add_score_aip(df_aip.code, f.lit('knowledge'), df_aip.lo_type,
                                                                  df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("comprehension",
                            add_score_aip(df_aip.code, f.lit('comprehension'), df_aip.lo_type, df_aip.correct_answer,
                                          df_aip.student_answer)) \
                .withColumn("application",
                            add_score_aip(df_aip.code, f.lit('application'), df_aip.lo_type, df_aip.correct_answer,
                                          df_aip.student_answer)) \
                .withColumn("analysis",
                            add_score_aip(df_aip.code, f.lit('analysis'), df_aip.lo_type, df_aip.correct_answer,
                                          df_aip.student_answer)) \
                .withColumn("synthesis",
                            add_score_aip(df_aip.code, f.lit('synthesis'), df_aip.lo_type, df_aip.correct_answer,
                                          df_aip.student_answer)) \
                .withColumn("evaluation",
                            add_score_aip(df_aip.code, f.lit('evaluation'), df_aip.lo_type, df_aip.correct_answer,
                                          df_aip.student_answer)) \
                .withColumn("date_id",
                            from_unixtime(unix_timestamp(df_aip.created_at, "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))

            df_aip.printSchema()
            df_aip.show()
            dyf_aip = DynamicFrame.fromDF(df_aip, glueContext, "dyf_aip")

            applymapping = ApplyMapping.apply(frame=dyf_aip,
                                              mappings=[("created_at", "string", "created_at", "timestamp"),
                                                        ("user_id", 'string', 'student_id', 'long'),
                                                        ("correct_answer", "string", "learning_object", "string"),
                                                        ("date_id", "string", "date_id", "int"),
                                                        ("knowledge", "int", "knowledge", "int"),
                                                        ("comprehension", "int", "comprehension", "int"),
                                                        ("application", "int", "application", "int"),
                                                        ("analysis", "int", "analysis", "int"),
                                                        ("synthesis", "int", "synthesis", "int"),
                                                        ("evaluation", "int", "evaluation", "int")])
            resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                                transformation_ctx="resolvechoice2")
            dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
            dropnullfields.printSchema()
            dropnullfields.show()
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                                                                       catalog_connection="glue_redshift",
                                                                       connection_options={
                                                                           "dbtable": "mapping_lo_student_starter_1",
                                                                           "database": "dts_odin"
                                                                       },
                                                                       redshift_tmp_dir="s3n://dts-odin/ai_study_step/",
                                                                       transformation_ctx="datasink5")
        except Exception as e:
            print("###################### Exception ##########################")
            print(e)
        try:
            ## Process records with tag 'micro'
            dyf_micro = Filter.apply(frame=dyf_ai_study_step, f=lambda x: x['tag'] == 'micro')

            df_micro = dyf_micro.toDF()

            df_micro_max_step = df_micro.groupby('user_id', 'lesson_id', 'session_id').agg(
                f.max('current_step').alias("max_step"))
            df_micro_max_step = df_micro_max_step.where("max_step >= 4")
            df_micro_max_step = df_micro_max_step.withColumnRenamed('user_id', 'user_id1') \
                .withColumnRenamed('lesson_id', 'lesson_id1') \
                .withColumnRenamed('session_id', 'session_id1')

            df_micro_received_point = df_micro.where("max_point = received_point")
            df_micro_received_point = df_micro_received_point.groupby('user_id', 'lesson_id', 'session_id').agg(
                f.count('received_point').alias("count_received_point"))
            df_micro_received_point = df_micro_received_point.withColumnRenamed('user_id', 'user_id2') \
                .withColumnRenamed('lesson_id', 'lesson_id2') \
                .withColumnRenamed('session_id', 'session_id2')

            df_micro = df_micro.join(df_micro_max_step, (df_micro['user_id'] == df_micro_max_step['user_id1'])
                                     & (df_micro['lesson_id'] == df_micro_max_step['lesson_id1'])
                                     & (df_micro['session_id'] == df_micro_max_step['session_id1']))

            df_micro = df_micro.join(df_micro_received_point,
                                     (df_micro['user_id'] == df_micro_received_point['user_id2'])
                                     & (df_micro['lesson_id'] == df_micro_received_point['lesson_id2'])
                                     & (df_micro['session_id'] == df_micro_received_point['session_id2']))

            def random_code1():
                return random.randint(17, 18)

            add_code1 = udf(random_code1, IntegerType())
            df_micro = df_micro.withColumn("code", add_code1())
            df_micro = df_micro.withColumn("lo_type", add_lo_type(df_micro.code))
            df_micro = df_micro.withColumn("knowledge",
                                           add_score_micro(df_micro.code, f.lit('knowledge'), df_micro.lo_type,
                                                           df_micro.total_step, df_micro.count_received_point)) \
                .withColumn("comprehension", add_score_micro(df_micro.code, f.lit('comprehension'), df_micro.lo_type,
                                                             df_micro.total_step, df_micro.count_received_point)) \
                .withColumn("application",
                            add_score_micro(df_micro.code, f.lit('application'), df_micro.lo_type, df_micro.total_step,
                                            df_micro.count_received_point)) \
                .withColumn("analysis",
                            add_score_micro(df_micro.code, f.lit('analysis'), df_micro.lo_type, df_micro.total_step,
                                            df_micro.count_received_point)) \
                .withColumn("synthesis",
                            add_score_micro(df_micro.code, f.lit('synthesis'), df_micro.lo_type, df_micro.total_step,
                                            df_micro.count_received_point)) \
                .withColumn("evaluation",
                            add_score_micro(df_micro.code, f.lit('evaluation'), df_micro.lo_type, df_micro.total_step,
                                            df_micro.count_received_point)) \
                .withColumn("date_id",
                            from_unixtime(unix_timestamp(df_micro.created_at, "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))
            df_micro.printSchema()
            df_micro.show()
            dyf_micro = DynamicFrame.fromDF(df_micro, glueContext, "dyf_micro")
            applymapping = ApplyMapping.apply(frame=dyf_micro,
                                              mappings=[("created_at", "string", "created_at", "timestamp"),
                                                        ("user_id", 'string', 'student_id', 'long'),
                                                        ("learning_object", "string", "learning_object", "string"),
                                                        ("date_id", "string", "date_id", "int"),
                                                        ("knowledge", "int", "knowledge", "int"),
                                                        ("comprehension", "int", "comprehension", "int"),
                                                        ("application", "int", "application", "int"),
                                                        ("analysis", "int", "analysis", "int"),
                                                        ("synthesis", "int", "synthesis", "int"),
                                                        ("evaluation", "int", "evaluation", "int")])
            resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                                transformation_ctx="resolvechoice2")
            dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
            dropnullfields.printSchema()
            dropnullfields.show()
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                                                                       catalog_connection="glue_redshift",
                                                                       connection_options={
                                                                           "dbtable": "mapping_lo_student_starter_2",
                                                                           "database": "dts_odin"
                                                                       },
                                                                       redshift_tmp_dir="s3n://dts-odin/ai_study_step/",
                                                                       transformation_ctx="datasink5")
        except Exception as e:
            print("###################### Exception ##########################")
            print(e)


        try:
            ## Process records with tag 'ait'
            # dyf_ai_study_step.show(5)
            dyf_ait = Filter.apply(frame=dyf_ai_study_step,
                                   f=lambda x: x['tag'] == 'ait')
            # dyf_ait = Filter.apply(frame=dyf_ai_study_step,
            #                        f=lambda x: x['tag'] == 'ait'
            #                                    and x['student_answer_details'] is not None
            #                                    and x['student_answer_details'] != 'null'
            #                                    and x['correct_answer'] is not None)

            df_ait = dyf_ait.toDF()

            # udf_parse_json = udf(lambda str: parse_json(str), json_schema)

            # age_list = df_ait["student_answer_details"].tolist()
            # print ('list', age_list)

            df_ait = df_ait.withColumn('len_answer', udf_get_length(df_ait["student_answer_details"]))
            # df_ait.printSchema()
            # df_ait.show()

            df_ait_max_step = df_ait.groupby('user_id', 'lesson_id', 'total_step').agg(
                f.max('current_step').alias("max_step"))
            df_ait_max_step = df_ait_max_step.where('total_step = max_step')
            df_ait_max_step = df_ait_max_step.withColumnRenamed('user_id', 'user_id1').withColumnRenamed('lesson_id',
                                                                                                         'lesson_id1').withColumnRenamed(
                'total_step', 'total_step1')
            # df_ait_max_step.printSchema()
            # df_ait_max_step.show()

            df_ait_received_point = df_ait.where(
                "student_answer_details IS NOT NULL AND max_point = received_point AND page_style like '%ait_practice%'")
            df_ait_received_point = df_ait_received_point.groupby('user_id', 'lesson_id').agg(
                f.count('received_point').alias("count_received_point"))
            df_ait_received_point = df_ait_received_point.withColumnRenamed('user_id', 'user_id2').withColumnRenamed(
                'lesson_id',
                'lesson_id2')
            # df_ait_received_point.printSchema()
            # df_ait_received_point.show()

            # ait_pronunciation
            df_ait = df_ait.where("max_point = received_point AND page_style like '%ait_pronunciation%'")
            df_ait = df_ait.join(df_ait_received_point, (
                    df_ait['user_id'] == df_ait_received_point['user_id2']) & (
                                         df_ait['lesson_id'] ==
                                         df_ait_received_point[
                                             'lesson_id2']))
            df_ait = df_ait.join(df_ait_max_step, (
                    df_ait['user_id'] == df_ait_max_step['user_id1']) & (
                                         df_ait['lesson_id'] ==
                                         df_ait_max_step[
                                             'lesson_id1']))
            # print('SCHEMA:::')
            # df_ait.printSchema()
            # df_ait.show()
            df_ait = df_ait.withColumn("knowledge",
                                       add_score_ait(df_ait.total_step, df_ait.max_step, df_ait.count_received_point,
                                                     df_ait.len_answer)) \
                .withColumn("comprehension",
                            add_score_ait(df_ait.total_step, df_ait.max_step, df_ait.count_received_point,
                                          df_ait.len_answer)) \
                .withColumn("application",
                            add_score_ait(df_ait.total_step, df_ait.max_step, df_ait.count_received_point,
                                          df_ait.len_answer)) \
                .withColumn("analysis", f.lit(0)) \
                .withColumn("synthesis", f.lit(0)) \
                .withColumn("evaluation", f.lit(0)) \
                .withColumn("lo_type", f.lit(1)) \
                .withColumn("date_id",
                            from_unixtime(unix_timestamp(df_ait.created_at, "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))
            # df_ait.printSchema()
            # df_ait.show()

            dyf_ait = DynamicFrame.fromDF(df_ait, glueContext, "dyf_ait")

            applymapping = ApplyMapping.apply(frame=dyf_ait,
                                              mappings=[("created_at", "string", "created_at", "timestamp"),
                                                        ("user_id", 'string', 'student_id', 'long'),
                                                        ("correct_answer", "string", "learning_object", "string"),
                                                        ("date_id", "string", "date_id", "int"),
                                                        ("knowledge", "int", "knowledge", "int"),
                                                        ("comprehension", "int", "comprehension", "int"),
                                                        ("application", "int", "application", "int"),
                                                        ("analysis", "int", "analysis", "int"),
                                                        ("synthesis", "int", "synthesis", "int"),
                                                        ("evaluation", "int", "evaluation", "int")])
            resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                                transformation_ctx="resolvechoice2")
            dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
            dropnullfields.printSchema()
            dropnullfields.show()
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                                                                       catalog_connection="glue_redshift",
                                                                       connection_options={
                                                                           "dbtable": "mapping_lo_student_starter",
                                                                           "database": "dts_odin"
                                                                       },
                                                                       redshift_tmp_dir="s3n://dts-odin/ai_study_step/",
                                                                       transformation_ctx="datasink5")

        except Exception as e:
            print("###################### Exception ##########################")
            print(e)

        df_temp = dyf_ai_study_step.toDF()
        flag = df_temp.agg({"_key": "max"}).collect()[0][0]

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag in S3
        df.write.parquet("s3a://dts-odin/flag/flag_ai_study_step.parquet", mode="overwrite")
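Several of these jobs repeat the same checkpoint ("flag") pattern: read the last processed _key from a small parquet file in S3, filter the source down to newer rows, and overwrite the flag after a successful write. A hedged sketch of that pattern as two small helpers; the flag path is a placeholder:

def read_flag(spark, flag_path):
    # Return the last processed key, or None when the flag file does not exist yet.
    try:
        return spark.read.parquet(flag_path).collect()[0]['flag']
    except Exception:
        return None

def write_flag(spark, flag_path, max_key):
    # Overwrite the flag file with the newest processed key.
    df_flag = spark.createDataFrame([max_key], "long").toDF('flag')
    df_flag.write.parquet(flag_path, mode="overwrite")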
Example #6
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor",
        table_name="advisor_account",
        transformation_ctx="datasource0")

    datasource0 = datasource0.select_fields([
        '_key', 'user_id', 'user_name', 'user_display_name', 'user_email',
        'user_phone', 'ip_phone_number', 'level', 'advisor_deleted'
    ]).rename_field('user_id',
                    'id').rename_field('user_name', 'ten').rename_field(
                        'advisor_deleted', 'advisor_deleted_tmp')

    # read the flag from S3
    df_flag = spark.read.parquet("s3://dts-odin/flag/flag_CVHD.parquet")

    # compare the datasource _key with the flag and keep only rows with key > flag
    data = datasource0.toDF()
    data = data.where(data['_key'] > df_flag.collect()[0]['flag'])
    data = data.withColumn('type_eg', f.lit(None))
    data = data.withColumn('advisor_type', f.lit(None))
    data = data.withColumn(
        'advisor_deleted',
        when(data.advisor_deleted_tmp, f.lit(1)).otherwise(f.lit(0)))
    data.printSchema()

    datasource0 = DynamicFrame.fromDF(data, glueContext, "datasource0")
    # datasource0.show()
    if (datasource0.count() > 0):
        try:
            # select the desired fields
            applymapping1 = ApplyMapping.apply(
                frame=datasource0,
                mappings=[("id", "int", "id", "bigint"),
                          ("ten", "string", "username", "string"),
                          ("user_display_name", "string", "name", "string"),
                          ("user_email", "string", "email", "string"),
                          ("level", "int", "level", "int"),
                          ("advisor_deleted", "int", "advisor_deleted", "int"),
                          ("type_eg", "int", "type_eg", "string"),
                          ("advisor_type", "int", "advisor_type", "string")],
                transformation_ctx="applymapping1")

            resolvechoice2 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice2")

            dropnullfields3 = DropNullFields.apply(
                frame=resolvechoice2, transformation_ctx="dropnullfields3")

            # write the data to Redshift
            datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields3,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "dim_advisor",
                    "database": "dts_odin"
                },
                redshift_tmp_dir="s3n://dts-odin/backup/advisor_account/",
                transformation_ctx="datasink4")

            # get the max _key from the datasource
            datasource = datasource0.toDF()
            flag = datasource.agg({"_key": "max"}).collect()[0][0]

            # build the flag data frame
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')

            # overwrite the new flag in S3
            df.write.parquet("s3a://dts-odin/flag/flag_CVHD.parquet",
                             mode="overwrite")
        except Exception:  # handle the case where the datasource has rows but none satisfy the filter
            # still write the flag
            datasource = datasource0.toDF()
            flag = datasource.agg({"_key": "max"}).collect()[0][0]

            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')

            df.write.parquet("s3a://dts-odin/flag/flag_CVHD.parquet",
                             mode="overwrite")

    # EG
    datasource = glueContext.create_dynamic_frame.from_catalog(
        database="dm_toa", table_name="advisor_eg")

    # Select the required fields
    datasource = datasource.select_fields(
        ['_key', 'advisor_id', 'bo_phan', 'eg'])

    datasource = datasource.resolveChoice(specs=[('_key', 'cast:long')])

    data = datasource.toDF()
    # data = data.where(data['_key'] > df_flag.collect()[0]['flag'])
    # data = data.where(data['_key'] < 276961)
    datasource = DynamicFrame.fromDF(data, glueContext, "datasource")

    if (datasource.count() > 0):
        applymapping1 = ApplyMapping.apply(
            frame=datasource,
            mappings=[("advisor_id", "string", "advisor_id", "string"),
                      ("bo_phan", "string", "bo_phan", "string"),
                      ("eg", "string", "eg", "string")])
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")

        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable":
                "dim_advisor_eg",
                "database":
                "dts_odin",
                "postactions":
                """update dim_advisor set type_eg = eg, advisor_type = bo_phan
                                                                                            from dim_advisor_eg
                                                                                            where id=advisor_id;
                                                                                            DROP TABLE IF EXISTS public.dim_advisor_eg"""
            },
            redshift_tmp_dir="s3n://dts-odin/backup/advisor_account/",
            transformation_ctx="datasink4")
Example #7
print('database_name is: ', database_name)
print('driver is: ', driver)
print('username is: ', username)
print('password is: ', password)
print('bucket_name is: ', bucket_name)
print('partition_Keys is: ', partition_Keys)

path = 's3://' + str(bucket_name) + "/" + str(database_name) + "/" + str(
    table_name)
print('Path is: ', path)

# Read data from the database using the JDBC driver into a DataFrame
source_df = spark.read.format("jdbc").option("url", url).option(
    "dbtable", db_table_name).option("driver", driver).option(
        "user", username).option("password", password).load()

job.init(args['JOB_NAME'], args)

# Convert the DataFrame to an AWS Glue DynamicFrame
dynamic_dframe = DynamicFrame.fromDF(source_df, glueContext, "dynamic_df")

glueContext.write_dynamic_frame.from_options(frame=dynamic_dframe,
                                             connection_type="s3",
                                             connection_options={
                                                 "path": path,
                                                 "partitionKeys":
                                                 partition_Keys
                                             },
                                             format="parquet")
job.commit()
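The variables used above (url, db_table_name, bucket_name, partition_Keys, and so on) are assumed to arrive as job parameters. A hedged sketch of resolving them with getResolvedOptions; the parameter names are hypothetical:

import sys
from awsglue.utils import getResolvedOptions

# Hypothetical job parameters; partition_keys is passed as a comma-separated list.
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'db_table_name', 'bucket_name', 'partition_keys'])
db_table_name = args['db_table_name']
bucket_name = args['bucket_name']
partition_Keys = args['partition_keys'].split(',')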
Example #8
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # ----------------------------------------------DYF-----------------------------------------------------------------#
    dyf_tpe_enduser_used_product = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product")

    dyf_tpe_enduser_used_product = dyf_tpe_enduser_used_product.select_fields(
        ['contact_id', 'product_id', 'timecreated'])

    # -----------------------------------------DYF-----------------------------------------------------------------------#

    dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product_details")

    dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
        ['id', 'cat_code'])
    # ----------------------------------------------DYF-----------------------------------------------------------------#
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")

    dyf_student_contact = dyf_student_contact.select_fields(
        ['contact_id', 'student_id']).rename_field('contact_id', 'ct_id')

    # dyf_student_contact = Filter.apply(frame=dyf_student_contact,
    #                                    f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
    #                                                and x["student_id"] is not None and x["student_id"] != ''
    #                                                )

    df_student_contact = dyf_student_contact.toDF()
    print('df_student_contact')
    df_student_contact.show()

    #-------------------------------------------------------------------------------------------------------------------#
    df_tpe_invoice_product_details = dyf_tpe_invoice_product_details.toDF()
    df_tpe_invoice_product_details = df_tpe_invoice_product_details.\
        where("cat_code like 'TAAM%' OR cat_code like 'TENUP%' ")
    df_tpe_invoice_product_details = df_tpe_invoice_product_details.withColumn(
        'to_status_id',
        f.when(df_tpe_invoice_product_details.cat_code.like('TAAM%'),
               f.lit(999).cast('long')).when(
                   df_tpe_invoice_product_details.cat_code.like('TENUP%'),
                   f.lit(998).cast('long')).otherwise(f.lit(999999999).cast('long')))

    df_tpe_invoice_product_details.show(2)

    df_tpe_enduser_used_product = dyf_tpe_enduser_used_product.toDF()

    #-----------------------------------------------____JOIN______------------------------------------------------------#

    df_join = df_tpe_invoice_product_details.join(
        df_tpe_enduser_used_product, df_tpe_invoice_product_details.id ==
        df_tpe_enduser_used_product.product_id)
    df_join.printSchema()
    print('df_join ::', df_join.count())

    df_join1 = df_join.join(df_student_contact,
                            df_student_contact.ct_id == df_join.contact_id)
    df_join1 = df_join1.withColumn(
        'change_status_date_id', from_unixtime(df_join1.timecreated,
                                               "yyyyMMdd"))
    df_join1.printSchema()
    print('df_join1 ::', df_join1.count())
    #-------------------------------------------------------------------------------------------------------------------#
    df_result = df_join1.select('student_id', 'change_status_date_id',
                                'to_status_id', 'contact_id')

    df_result.printSchema()
    df_result.show(3)
    df_result = df_result.drop_duplicates()
    df_result.cache()
    print('count df_result::', df_result.count())
    dyf_result = DynamicFrame.fromDF(df_result, glueContext, "dyf_result")
    dyf_result = Filter.apply(
        frame=dyf_result,
        f=lambda x: x["student_id"] is not None and x[
            "change_status_date_id"] is not None and x[
                "to_status_id"] is not None and x["contact_id"] is not None)
    apply_output = ApplyMapping.apply(
        frame=dyf_result,
        mappings=[
            ("student_id", "string", "student_id", "long"),
            # ("user_id", "long", "user_id", "long"),
            ("change_status_date_id", "string", "change_status_date_id", "long"
             ),
            # ("from_status_id", "long", "from_status_id", "long"),
            ("to_status_id", "long", "to_status_id", "long"),
            # ("measure1", "double", "measure1", "double"),
            # ("measure2", "double", "measure2", "double"),
            # ("description", "string", "description", "string"),
            # ("timestamp1", "string", "timestamp1", "string"),
            ("contact_id", "string", "contact_id", "string"),

            # ("teacher_id", "long", "teacher_id", "long"),
            # ("contact_id1", "string", "contact_id1", "string"),
            # ("measure1_int", "int", "measure1_int", "int"),
            # ("measure2_int", "int", "measure2_int", "int"),
            # ("contact_id_str", "string", "contact_id_str", "string"),
            # ("lc", "string", "lc", "string"),
            # ("student_id_string", "string", "student_id_string", "string")
        ])
    df_apply_output = apply_output.toDF()
    df_apply_output = df_apply_output.drop_duplicates()  # assign the result; drop_duplicates() is not in-place
    print('df_apply_output.count', df_apply_output.count())
    dyf_apply_output = DynamicFrame.fromDF(df_apply_output, glueContext,
                                           "dyf_apply_output")

    resolve_choice = ResolveChoice.apply(frame=dyf_apply_output,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dropnullfields = DropNullFields.apply(frame=resolve_choice,
                                          transformation_ctx="dropnullfields")

    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields,
        catalog_connection="glue_redshift",
        connection_options={
            "dbtable":
            "temp_mapping_status",
            "database":
            "dts_odin",
            "postactions":
            """ insert into mapping_changed_status_student_v1(student_id, change_status_date_id, to_status_id, contact_id)
                                                                                                    select student_id, change_status_date_id, to_status_id, contact_id from temp_mapping_status;
                                                                                                    update mapping_changed_status_student_v1 set user_id = (select user_id from user_map where source_type = 2 and source_id = student_id)
                                                                                                        where user_id is null;
                                                                                                    DROP TABLE IF EXISTS temp_mapping_status
                                                                                                         """
        },
        redshift_tmp_dir="s3n://datashine-dwh/temp1/",
        transformation_ctx="datasink4")

    df_result.unpersist()
    df_student_contact.unpersist()
    print(
        '------------------------>___complete__________------------------------------>'
    )
Example #9
# Note: show() is an action. Actions force execution of the data frame plan.
# With big data the slowdown would be significant without caching.

data_frame_aggerafated.show(10)

###################################################################
########## LOAD (WRITE DATA)
###################################################################

# Repartition into a small number of partitions, because there is little data

data_frame_aggerafated = data_frame_aggerafated.repartition(10)

#Convert back to dynamic frame

dynamic_frame_write = DynamicFrame.fromDF(data_frame_aggerafated,glue_context,"dynamic_frame_write")

# Write the data back to S3
glue_context.write_dynamic_frame.from_options(
    frame=dynamic_frame_write,
    connection_type="s3",
    connection_options={
        "path": s3_write_path,
        # here you could create S3 prefixes according to the values of a separate column,
        # e.g. "partitionKeys": ["decade"]
    },
    format="csv"
)

#log end time
dt_end = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def writeCsvFile(datasource, path):
    dataframe = DynamicFrame.toDF(datasource).repartition(1)
    datasource = DynamicFrame.fromDF(dataframe, glueContext, 'write-csv')
    glueContext.write_dynamic_frame.from_options(frame = datasource, connection_type = "s3", connection_options = {"path": path}, format = "csv", transformation_ctx = "write-csv")  
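A one-line usage sketch for the helper above; it assumes a datasource DynamicFrame and the global glueContext already exist, and the output path is hypothetical:

# Write the DynamicFrame as a single CSV part file under the given S3 prefix.
writeCsvFile(datasource, "s3://example-bucket/exports/my_table/")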
Example #11
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    dyf_care_call = glueContext.create_dynamic_frame.from_catalog(
        database='tig_advisor', table_name='care_call')

    dyf_care_call = dyf_care_call.resolveChoice(specs=[('_key', 'cast:long')])
    # print schema and select fields
    print('original schema')
    dyf_care_call.printSchema()
    dyf_care_call.show(10)

    # try:
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet")
    #     read_from_index = df_flag.collect()[0]['flag']
    #     print('read from index: ', read_from_index)
    #     dyf_care_call = Filter.apply(frame=dyf_care_call,
    #                                            f=lambda x: x["_key"] > read_from_index)
    # except:
    #     print('read flag file error ')
    # print('the number of new contacts: ', dyf_care_call.count())

    dyf_care_call = dyf_care_call.select_fields(
        ['_key', 'id', 'phone', 'duration', 'call_status',
         'time_created']).rename_field('time_created', 'call_date')

    dy_source_care_call_cache = dyf_care_call.toDF()
    dy_source_care_call_cache = dy_source_care_call_cache.dropDuplicates(
        ['id'])
    dy_source_care_call_cache = dy_source_care_call_cache.cache()
    dyf_care_call = DynamicFrame.fromDF(dy_source_care_call_cache, glueContext,
                                        'dyf_care_call')
    #
    if (dyf_care_call.count() > 0):
        dyf_care_call = Filter.apply(
            frame=dyf_care_call,
            f=lambda x: x["phone"] is not None and x["phone"] != '' and
            (x["call_status"] == 'success' or x["call_status"] ==
             'call_success') and x["call_date"] is not None and x["call_date"]
            != '' and x["duration"] is not None and x["duration"] > 30)
        #
        print('dyf_care_call::correct')
        print('dyf_care_call number', dyf_care_call.count())
        if (dyf_care_call.count() > 0):

            dyf_ad_contact_phone = glueContext.create_dynamic_frame.from_catalog(
                database='tig_advisor', table_name='student_contact_phone')

            dyf_ad_contact_phone = dyf_ad_contact_phone.select_fields(
                ['phone', 'contact_id'])

            dyf_ad_contact_phone = Filter.apply(
                frame=dyf_ad_contact_phone,
                f=lambda x: x["phone"] is not None and x["phone"] != '' and x[
                    "contact_id"] is not None and x["contact_id"] != '')

            print('dyf_ad_contact_phone::schema')
            dyf_ad_contact_phone.printSchema()

            #         dyf_advisor_ip_phone = glueContext.create_dynamic_frame.from_catalog(database='callcenter',
            #                                                                              table_name='advisor_ip_phone')
            #
            #         dyf_advisor_ip_phone = Filter.apply(frame=dyf_advisor_ip_phone,
            #                                             f=lambda x: x["ip_phone"] is not None and x["ip_phone"] != '')
            #
            #
            #
            #
            #
            #
            #-----------------------------------------------------------------------------------------------------------#

            join_call_contact = Join.apply(dyf_care_call, dyf_ad_contact_phone,
                                           'phone', 'phone')
            # join_call_contact = join_call_contact.select_fields(['id_time', 'answertime', 'calldate', 'phonenumber_correct', 'calldate', 'ipphone', 'contact_id'])
            # print('join_call_contact::schema------------')
            join_call_contact.printSchema()
            join_call_contact.show(2)
            print('join: ', join_call_contact.count())
            #
            #
            #         #-----------------------------------------------------------------------------------------------------------#
            #
            dyf_source_ls_dong_tien = glueContext.create_dynamic_frame.from_catalog(
                database='poss', table_name='nvn_poss_lich_su_dong_tien')

            dyf_source_ls_dong_tien = Filter.apply(
                frame=dyf_source_ls_dong_tien,
                f=lambda x: x["contact_id"] is not None and x["contact_id"] !=
                '' and x["ngay_thanhtoan"] is not None and x["ngay_thanhtoan"
                                                             ] != '')

            dyf_source_ls_dong_tien = dyf_source_ls_dong_tien.select_fields([
                '_key', 'id', 'contact_id', 'ngay_thanhtoan', 'ngay_tao',
                'makh'
            ]).rename_field('ngay_tao', 'ngay_a0')

            dy_source_ls_dt_cache = dyf_source_ls_dong_tien.toDF()
            dy_source_ls_dt_cache = dy_source_ls_dt_cache.dropDuplicates(
                ['id'])
            dy_source_ls_dt_cache = dy_source_ls_dt_cache.cache()
            dyf_source_ls_dong_tien = DynamicFrame.fromDF(
                dy_source_ls_dt_cache, glueContext, 'dyf_source_ls_dong_tien')
            #
            join_call_contact_ao = Join.apply(join_call_contact,
                                              dyf_source_ls_dong_tien,
                                              'contact_id', 'contact_id')
            #
            print('join_call_contact_ao::schema------------')
            join_call_contact_ao.printSchema()
            join_call_contact_ao.show(2)
            print('join: ', join_call_contact_ao.count())
            #
            #         # join_call_contact_ao = join_call_contact_ao.resolveChoice(specs=[('calldate', 'cast:timestamp'),
            #         #                                                                  ('ngay_a0', 'cast:timestamp')])
            #
            #
            join_call_contact_ao = Filter.apply(
                frame=join_call_contact_ao,
                f=lambda x: x["call_date"] is not None and x[
                    "ngay_a0"] is not None and x["call_date"] > x["ngay_a0"])
            #
            print(
                'join_call_contact_ao::after filter calldate > ngay_a0------------'
            )
            # join_call_contact_ao.printSchema()
            join_call_contact_ao.show(2)
            print('join_call_contact_ao: ', join_call_contact_ao.count())
            #
            #         # get the history of successful welcome calls
            df_join_call_contact_ao = join_call_contact_ao.toDF()
            df_join_call_contact_ao = df_join_call_contact_ao.groupby(
                'contact_id', 'makh').agg(f.min('call_date').alias("ngay_a1"))

            df_join_call_contact_ao = df_join_call_contact_ao.withColumn(
                'id_time',
                from_unixtime(
                    unix_timestamp(df_join_call_contact_ao.ngay_a1,
                                   "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))
            dyf_result = DynamicFrame.fromDF(df_join_call_contact_ao,
                                             glueContext, 'dyf_result')
            #
            #         print('dyf_result------------')
            # join_call_contact_ao.printSchema()
            dyf_result.show(2)
            print('dyf_result: ', dyf_result.count())
            #
            #
            #
            #
            #         # select fields
            applymapping1 = ApplyMapping.apply(
                frame=dyf_result,
                mappings=[("contact_id", "string", "contact_id", "string"),
                          ("id_time", "string", "id_time", "bigint"),
                          ("makh", "int", "makh", "int"),
                          ("ngay_a1", "string", "ngay_a1", "timestamp")])
            #
            resolvechoice2 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice2")
            dropnullfields3 = DropNullFields.apply(
                frame=resolvechoice2, transformation_ctx="dropnullfields3")

            # print('dropnullfields3::printSchema')
            # dropnullfields3.printSchema()
            # dropnullfields3.show(2)

            # write the data to Redshift
            datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields3,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable":
                    "temp_ls_dong_tien_a1_v3",
                    "database":
                    "dts_odin",
                    "postactions":
                    """
                                                                                        INSERT into mapping_changed_status_student(description, user_id, change_status_date_id, to_status_id, timestamp1)
                                                                                        SELECT 'contact_id: ' + temp_a1.contact_id +' - makh: ' + temp_a1.makh, um.user_id ,temp_a1.id_time, 2, temp_a1.ngay_a1
                                                                                        FROM temp_ls_dong_tien_a1_v3 temp_a1
                                                                                        LEFT JOIN user_map um
                                                                                             ON um.source_type = 1
                                                                                             AND um.source_id = temp_a1.contact_id
                                                                                        ;
                                                                                        DROP TABLE IF EXISTS public.temp_ls_dong_tien_a1_v3;
                                                                                        CALL update_a1_exception_from_eg()
                                                                           """
                },
                redshift_tmp_dir="s3n://dts-odin/temp/temp_ls_dong_tien/v2",
                transformation_ctx="datasink4")
            df_datasource = dyf_care_call.toDF()
            flag = df_datasource.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            df.write.parquet(
                "s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet",
                mode="overwrite")
            dy_source_care_call_cache.unpersist()
Example #12
import boto3

from awsglue.transforms import *
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

session = boto3.Session(region_name='ap-northeast-2')
glue_client = session.client(service_name='glue')
s3Bucket = "s3://forecast-demogo-bucket"
s3Folder = "/forecast_data"

# Read the source data (raw_data table, backed by S3) from the Data Catalog
datasource = glueContext.create_dynamic_frame.from_catalog(
    database="forecast_raw_db", table_name="raw_data")

df1 = datasource.toDF()

# RenameField.apply(frame = df, old_name = "sales_quantity", new_name = "target_value")
df2 = df1.withColumnRenamed("sales_quantity", "target_value")
data_frame = DynamicFrame.fromDF(df2, glueContext, "data_frame")

glueContext.write_dynamic_frame.from_options(
    frame=data_frame,
    connection_type="s3",
    connection_options={"path": s3Bucket + s3Folder},
    format="csv")
Ejemplo n.º 13
logger.info(f'Dumping features and labels for training...')
dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')

features_graph_df = features_df.withColumn(
    'props_values:String',
    to_json(
        struct(
            list(
                filter(lambda x: (x != TRANSACTION_ID),
                       features_df.schema.names)))))
features_graph_df = features_graph_df.select('TransactionID',
                                             'props_values:String')

logger.info(f'Creating glue dynamic frame from spark dataframe...')
features_graph_dynamic_df = DynamicFrame.fromDF(features_graph_df, glueContext,
                                                'FeaturesDF')
features_graph_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    features_graph_dynamic_df, [('~id', TRANSACTION_ID, 't')])
features_graph_dynamic_df = GlueGremlinCsvTransforms.addLabel(
    features_graph_dynamic_df, 'Transaction')
features_graph_dynamic_df = SelectFields.apply(
    frame=features_graph_dynamic_df,
    paths=["~id", '~label', 'props_values:String'])
logger.info(f'Dumping transaction data as graph data...')
dump_df_to_s3(features_graph_dynamic_df.toDF(), f'transaction', graph=True)

relational_edges = get_relations_and_edgelist(transactions.toDF(),
                                              identities.toDF(), id_cols)
for name, df in relational_edges.items():
    if name != TRANSACTION_ID:
        logger.info(f'Dumping edge {name} for training...')
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source

    #------------------------------------------------------------------------------------------------------------------#
    dyf_native_talk = glueContext.create_dynamic_frame.from_catalog(database='native_talk',
                                                                table_name='native_talk_history_log_api')

    dyf_native_talk = dyf_native_talk.resolveChoice(specs=[('id', 'cast:long')])

    try:
        df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                       f=lambda x: x["id"] > read_from_index)
    except:
        print('read flag file error ')

    dyf_native_talk = dyf_native_talk.select_fields(
        ['id', 'learning_date', 'speaking_dialog_score', 'username', 'updated_time'])

    dy_cache = dyf_native_talk.toDF()
    dy_cache = dy_cache.cache()
    dyf_native_talk = DynamicFrame.fromDF(dy_cache, glueContext, 'dyf_native_talk')

    print('dy_cache------------')
    dy_cache.printSchema()
    print('dy_cache: ', dy_cache.count())
    dy_cache.show(2)

    #------------------------------------------------------------------------------------------------------------------#

    if (dyf_native_talk.count() > 0):

        #---------------------------------------------------------datasource0-----------------------------------------------------#
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                              f=lambda x: x["username"] is not None and x["username"] != ''
                                                          and x["speaking_dialog_score"] is not None
                                                          and x["learning_date"] is not None and x["learning_date"] != '')
        # ----------------------------------datasource1---------------------------------------------------------------------------#
        if (dyf_native_talk.count() > 0):
            dyf_nt_account_mapping = glueContext.create_dynamic_frame.from_catalog(database='native_talk',
                                                                        table_name='native_talk_account_mapping')

            dyf_nt_account_mapping = dyf_nt_account_mapping.select_fields(['contact_id', 'username']).rename_field('username', 'nativetalk_user')
            dy_cache_2 = dyf_nt_account_mapping.toDF()
            dy_cache_2 = dy_cache_2.cache()
            dyf_nt_account_mapping = DynamicFrame.fromDF(dy_cache_2, glueContext, 'dyf_nt_account_mapping')

            dyf_nt_account_mapping = Filter.apply(frame=dyf_nt_account_mapping,
                                                  f=lambda x: x["nativetalk_user"] is not None and x["nativetalk_user"] != '')
            # ----------------------------------datasource1---------------------------------------------------------------------------#

            # -------------------------------------------------------------------------------------------------------------#
            join = Join.apply(dyf_native_talk, dyf_nt_account_mapping, 'username', 'nativetalk_user')
            if(join.count() > 0):
                df_nativetalk = join.toDF()
                df_nativetalk = df_nativetalk.withColumn('sogio', f.lit(0.083333))  # 5 minutes, expressed in hours
                df_nativetalk = df_nativetalk.withColumn('id_time',
                                                         from_unixtime(
                                                             unix_timestamp(df_nativetalk.learning_date, "yyyy-MM-dd"),
                                                             "yyyyMMdd"))
                df_nativetalk = df_nativetalk.where("contact_id IS NOT NULL")

                data_nativetalk = DynamicFrame.fromDF(df_nativetalk, glueContext, 'data_nativetalk')
                data_nativetalk = data_nativetalk.resolveChoice(specs=[('sogio', 'cast:float')])
                # -------------------------------------------------------------------------------------------------------------#
                print('data_nativetalk----------')
                data_nativetalk.printSchema()


                # build the "fact_hieusuathoctap" (learning-performance fact) table
                df_hieusuathoctap = data_nativetalk.toDF()
                # count study sessions and total study hours per student per id_time day
                df_hieusuathoctap = df_hieusuathoctap.groupby('contact_id', 'id_time').agg(f.sum('sogio'),
                                                                                               f.count('contact_id'))

                df_hieusuathoctap = df_hieusuathoctap.withColumn('tu_hoc_type_id', f.lit(400))
                data_hieusuathoctap = DynamicFrame.fromDF(df_hieusuathoctap, glueContext, 'data_hieusuathoctap')
                data_hieusuathoctap = data_hieusuathoctap.resolveChoice(specs=[('sum(sogio)', 'cast:double')])

                print('data_hieusuathoctap::data_hieusuathoctap::data_hieusuathoctap------------------------------------------')
                data_hieusuathoctap.printSchema()

                applymapping2 = ApplyMapping.apply(frame=data_hieusuathoctap,
                                                   mappings=[("contact_id", "string", "contact_id", "string"),
                                                             ("id_time", 'string', 'id_time', 'bigint'),
                                                             ("count(contact_id)", 'long', 'soca', 'int'),
                                                             ("sum(sogio)", 'double', 'sogio', 'double'),
                                                             ("tu_hoc_type_id", 'int', "tu_hoc_type_id", "int")])


                resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                                     transformation_ctx="resolvechoice2")
                dropnullfields2 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")

                print('dropnullfields2 number: ', dropnullfields2.count())

                datasink2 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields2,
                                                                           catalog_connection="glue_redshift",
                                                                           connection_options={"dbtable": "temp_staging_lich_su_tu_hoc_native_talk",
                                                                                               "database": "dts_odin",
                                                                                               "postactions": """INSERT into mapping_changed_status_student(user_id, change_status_date_id, to_status_id, measure1, measure2)
                                                                                                                            SELECT um.user_id, hwb.id_time, 53, hwb.soca, round(hwb.sogio, 4)
                                                                                                                            FROM temp_staging_lich_su_tu_hoc_native_talk hwb
                                                                                                                            LEFT JOIN user_map um
                                                                                                                                ON um.source_type = 1
                                                                                                                                AND um.source_id = hwb.contact_id;
                                                                                                                 DROP TABLE IF EXISTS public.temp_staging_lich_su_tu_hoc_native_talk    
                                                                                                                """
                                                                                               },
                                                                           redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/",
                                                                           transformation_ctx="datasink2")
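                # Added note: the load goes to a staging table first; the "postactions"
                # SQL then copies the rows into mapping_changed_status_student and drops
                # the staging table, so only the target table remains after each run.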

                df_datasource = dyf_native_talk.toDF()
                flag = df_datasource.agg({"id": "max"}).collect()[0][0]
                print('flag: ', flag)
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')
                df.write.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet", mode="overwrite")
                dy_cache.unpersist()
                dy_cache_2.unpersist()
Ejemplo n.º 15
ch.last_pos_orientation, \
ch.last_pos_name, \
ch.last_pos_bin, \
ch.last_pos_tier, \
ch.last_pos_anchor, \
ch.last_pos_orientation_degrees, \
ch.last_ops_pos_id, \
ch.last_pos_slot_on_carriage, \
ch.deleted_dt, \
ch.is_deleted \
FROM distxpsche ch \
INNER JOIN maxche mc ON ch.gkey = mc.gkey \
and coalesce(ch.last_time,cast('1900-01-01' as timestamp)) = mc.last_time \
and coalesce(ch.time_dispatch,cast('1900-01-01' as timestamp)) = mc.time_dispatch \
where status = 1")
xpsche_dynDF = DynamicFrame.fromDF(xpsche_distDF, glueContext, "nested")
				
## xps_ecevent connection	
xpsecevent_DS = glueContext.create_dynamic_frame.from_catalog(database = "staging_combined", table_name = "xps_ecevent", transformation_ctx = "xpsecevent_DS")
xpsecevent_regDF = xpsecevent_DS.toDF()
xpsecevent_regDF.createOrReplaceTempView("distxpsecevent")
xpsecevent_distDF = spark.sql("SELECT sourcesystem, \
gkey, \
yard, \
pkey, \
max(timestamp) ectimestamp, \
type, \
che_id, \
che_name, \
operator_name, \
sub_type, \
# The `provider id` field will be choice between long and string

# Cast choices into integers, those values that cannot cast result in null
medicare_res = medicare_dyf.resolveChoice(specs = [('provider id','cast:long')])

# Remove erroneous records
medicare_df = medicare_res.toDF()
medicare_df = medicare_df.where("`provider id` is NOT NULL")

# Apply a lambda to remove the '$'
chop_f = udf(lambda x: x[1:], StringType())
medicare_df = medicare_df.withColumn("ACC", chop_f(medicare_df["average covered charges"])).withColumn("ATP", chop_f(medicare_df["average total payments"])).withColumn("AMP", chop_f(medicare_df["average medicare payments"]))

# Turn it back to a dynamic frame
medicare_tmp = DynamicFrame.fromDF(medicare_df, glueContext, "nested")

# Rename, cast, and nest with apply_mapping
medicare_nest = medicare_tmp.apply_mapping([('drg definition', 'string', 'drg', 'string'), 
                             ('provider id', 'long', 'provider.id', 'long'),
                             ('provider name', 'string', 'provider.name', 'string'),
                             ('provider city', 'string', 'provider.city', 'string'),
                             ('provider state', 'string', 'provider.state', 'string'),
                             ('provider zip code', 'long', 'provider.zip', 'long'),
                             ('hospital referral region description', 'string','rr', 'string'),
                             ('ACC', 'string', 'charges.covered', 'double'),
                             ('ATP', 'string', 'charges.total_pay', 'double'),
                             ('AMP', 'string', 'charges.medicare_pay', 'double')])

# Write it out in Parquet
glueContext.write_dynamic_frame.from_options(frame = medicare_nest, connection_type = "s3", connection_options = {"path": output_dir}, format = "parquet")
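Because the target names in apply_mapping contain dots, Glue nests those fields under struct columns. A quick check (not part of the original snippet) is to print the schema of the result; it is expected to group the provider and charges fields:

# Sketch only: inspect the nested layout produced by apply_mapping.
# Expected shape (approximate):
#   drg: string
#   provider: struct<id, name, city, state, zip>
#   rr: string
#   charges: struct<covered, total_pay, medicare_pay>
medicare_nest.printSchema()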
# Cast choices into integers, those values that cannot cast result in null
medicare_res = medicare_dyf.resolveChoice(specs=[('provider id', 'cast:long')])

# Remove erroneous records
medicare_df = medicare_res.toDF()
medicare_df = medicare_df.where("`provider id` is NOT NULL")

# Apply a lambda to remove the '$'
chop_f = udf(lambda x: x[1:], StringType())
medicare_df = medicare_df.withColumn(
    "ACC", chop_f(medicare_df["average covered charges"])).withColumn(
        "ATP", chop_f(medicare_df["average total payments"])).withColumn(
            "AMP", chop_f(medicare_df["average medicare payments"]))

# Turn it back to a dynamic frame
medicare_tmp = DynamicFrame.fromDF(medicare_df, glueContext, "nested")

# Rename, cast, and nest with apply_mapping
medicare_nest = medicare_tmp.apply_mapping([
    ('drg definition', 'string', 'drg', 'string'),
    ('provider id', 'long', 'provider.id', 'long'),
    ('provider name', 'string', 'provider.name', 'string'),
    ('provider city', 'string', 'provider.city', 'string'),
    ('provider state', 'string', 'provider.state', 'string'),
    ('provider zip code', 'long', 'provider.zip', 'long'),
    ('hospital referral region description', 'string', 'rr', 'string'),
    ('ACC', 'string', 'charges.covered', 'double'),
    ('ATP', 'string', 'charges.total_pay', 'double'),
    ('AMP', 'string', 'charges.medicare_pay', 'double')
])
# Note: this example assumes the usual Glue imports plus `import hashlib`.
def hash_cc(s):
    return hashlib.sha256(s.encode('utf-8')).hexdigest()

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "serverless-datalake", table_name = "user-profile", transformation_ctx = "datasource0")


## @convert glue DynamicFrame to DataFrame to manipulate the columns
dataframe0 = DynamicFrame.toDF(datasource0)

hash_cc_f = udf(lambda x: hash_cc(x), StringType())

dataframe0 = dataframe0.withColumn("hash_cc", hash_cc_f(dataframe0["cc"])).withColumn("hash_ssn", hash_cc_f(dataframe0["ssn"]))
dataframe0 = dataframe0.drop('cc').drop('ssn').drop('password')

## @convert dataframe to glue DynamicFrame and write the output in parquet format
datasource1 = DynamicFrame.fromDF(dataframe0, glueContext, "name1")


datasink4 = glueContext.write_dynamic_frame.from_options(frame = datasource1, connection_type = "s3", connection_options = {"path": "s3://serverless-datalake-ingestionbucket-1jiyskijz5i03/prepared/userprofile-secure"}, format = "parquet", transformation_ctx = "datasink4")

job.commit()
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # only process data from 2019-10-01 onwards
    timestamp = 1569888000

    # ETL TBHV
    # Custom function
    def doSplitWord(word):
        size = len(word)
        rs = [word[i:i + 2] for i in range(0, size, 1)]
        rs1 = [word[i:i + 1] for i in range(0, size, 1)]
        rs.extend(rs1)
        return rs

    state_right = 'state_right'
    state_wrong = 'state_wrong'

    # knowledge points are granted by default for:
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D1; P3_D2; P4_D1; P4_D2
    knowledge = ''
    # comprehension points:
    # list of question names that earn comprehension points:
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    comprehension = [
        'P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2', 'P4_D1',
        'P4_D2'
    ]
    # application points:
    # list of question names that earn application points:
    # P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    application = [
        'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2', 'P4_D1', 'P4_D2'
    ]
    # analysis points:
    # list of question names that earn analysis points:
    # P2_D3; P3_D2; P4_D1; P4_D2
    analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2']
    # synthesis points:
    # list of question names that earn synthesis points:
    # P4_D1; P4_D2
    synthesis = ['P4_D1', 'P4_D2']
    # evaluation points:
    # list of question names that earn evaluation points (none)
    evaluation = ''

    def doAddScore(name, state, type):
        arr = ['']
        score = 0
        if type == 'comprehension':
            arr = comprehension

        if type == 'application':
            arr = application

        if type == 'analysis':
            arr = analysis

        if type == 'synthesis':
            arr = synthesis

        if state == state_right:
            score = 10
        if state == state_wrong:
            score = -5

        # guard against null names before lower-casing
        if name is not None:
            name = name.lower()
            for x in arr:
                if x.lower() in name:
                    return score
        return 0
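    # Added note: with the lists above, doAddScore('P2_D3 quiz', 'state_right', 'analysis')
    # returns 10, doAddScore('P2_D3 quiz', 'state_wrong', 'analysis') returns -5, and
    # doAddScore('P1_D1 quiz', 'state_right', 'analysis') returns 0 (name not in the list).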

    addScore = udf(doAddScore, IntegerType())

    def doAddScoreAll(plus, minus):
        if plus is None and minus is not None:
            return minus
        if minus is None and plus is not None:
            return plus
        if minus is not None and plus is not None:
            return plus + minus
        return 0

    addScoreAll = udf(doAddScoreAll, IntegerType())

    def do_check_null(val1, val2):
        if val1 is None and val2 is not None:
            return val2
        if val2 is None and val1 is not None:
            return val1
        if val1 is not None and val2 is not None:
            return val1
        return 0

    check_data_null = udf(do_check_null, StringType())

    # special characters to strip out
    special_str = '["] ;'

    splitWord = udf(lambda x: doSplitWord(x))

    ########## top_quiz_attempts
    dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_quiz_attempts")
    dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(
        ['_key', 'id', 'timestart', 'quiz'])

    dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(
        specs=[('_key', 'cast:long')])

    print(dyf_top_quiz_attempts.count())
    dyf_top_quiz_attempts.show(2)

    # try:
    #     # read the flag checkpoint from S3
    #     df_flag = spark.read.parquet("s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read from index: ', start_read)
    #
    #     # compare the datasource _key against the flag and keep only records with key > flag
    #     dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x['_key'] > start_read)
    # except:
    #     print('read flag file error ')

    dyf_top_quiz_attempts = Filter.apply(
        frame=dyf_top_quiz_attempts, f=lambda x: x["timestart"] >= timestamp)

    print(dyf_top_quiz_attempts.count())
    dyf_top_quiz_attempts.show()

    if dyf_top_quiz_attempts.count() > 0:
        ########## dyf_top_user
        dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="do_top_user")
        dyf_top_user = dyf_top_user.select_fields(['id',
                                                   'student_id']).rename_field(
                                                       'id', 'top_user_id')
        ######### top_question
        dyf_top_question = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_question")
        dyf_top_question = dyf_top_question.select_fields(
            ['id', 'name']).rename_field('id', 'quest_id')
        # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')])

        ######### top_result_ai
        dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_result_ai")
        dyf_top_result_ai = dyf_top_result_ai.select_fields([
            'question_id', 'attempt_id', 'user_id', 'ratio', 'right_word',
            'wrong_word'
        ])

        # JOIN and FILTER the tables by the required conditions
        dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question,
                                'question_id', 'quest_id')
        dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts,
                                'attempt_id', 'id')

        dyf_join02 = Filter.apply(frame=dyf_join02,
                                  f=lambda x: x["quiz"] not in [7, 9, 918])
        dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id',
                                'top_user_id')

        # dyf_join02.show()
        df_study = dyf_join02.toDF()
        df_study.cache()
        if (df_study.count() > 0):
            try:
                # print("COUNT 1:", df_study.count())
                # Loc cac ky tu dac biet [ ] ",
                # Hien data co dang nhu sau: ["house","her","to","how","get","long"] hoac "environmental", ...
                # df_study = df_study.select(
                #     'quiz', 'name', 'user_id', 'timestart', 'right_word', 'wrong_word', f.translate(df_study.right_word,
                #                                                                                     special_str, ''), f.translate(df_study.wrong_word,
                #                                        special_str, ''))
                df_study = df_study.select('quiz', 'name', 'student_id',
                                           'timestart', 'right_word',
                                           'wrong_word')
                df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \
                                   .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, ''))

                # Split the string into an array of words:
                # house, her => [house, her]
                # Analyze the correct words
                df_study_right = df_study.withColumn(
                    "right_word_list", f.split(df_study.right_word_new, ','))

                # Explode the array column into multiple rows
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_right = df_study_right.withColumn(
                    "right", f.explode(df_study_right.right_word_list))
                df_study_right = df_study_right.select('quiz', 'name',
                                                       'student_id',
                                                       'timestart', 'right')
                df_study_right = df_study_right.withColumn(
                    "right", f.lower(f.col("right")))
                # print("COUNT 2:", df_study_right.count())
                # df_study_right.printSchema()
                # df_study_right.show()
                dyf_study_right = DynamicFrame.fromDF(df_study_right,
                                                      glueContext,
                                                      "dyf_study_right")
                ## Learning Object
                dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                    database="nvn_knowledge", table_name="learning_object")
                dyf_learning_object = dyf_learning_object.select_fields(
                    ['learning_object_id', 'learning_object_name'])

                df_learning_object = dyf_learning_object.toDF()
                # convert to lowercase
                df_learning_object = df_learning_object.withColumn(
                    "learning_object_name",
                    f.lower(f.col("learning_object_name")))
                dyf_learning_object = DynamicFrame.fromDF(
                    df_learning_object, glueContext, "dyf_learning_object")

                dyf_knowledge_right = Join.apply(dyf_study_right,
                                                 dyf_learning_object, 'right',
                                                 'learning_object_name')

                # print("COUNT 3:", dyf_knowledge_right.count())
                # dyf_knowledge_right.printSchema()
                # print("COUNT 4:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()
                # Add points for the correct words
                df_knowledge_right = dyf_knowledge_right.toDF()
                df_knowledge_right.cache()

                df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(10)) \
                        .withColumn("comprehension", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('comprehension'))) \
                        .withColumn("application", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \
                        .withColumn("analysis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \
                        .withColumn("synthesis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \
                        .withColumn("evaluation", f.lit(0)) \
                        .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd'))

                df_knowledge_right = df_knowledge_right.groupby(
                    'student_id', 'date_id', 'learning_object_id').agg(
                        f.count('knowledge').alias("count_plus"),
                        f.sum('knowledge').alias("knowledge_plus"),
                        f.sum('comprehension').alias("comprehension_plus"),
                        f.sum('application').alias("application_plus"),
                        f.sum('analysis').alias("analysis_plus"),
                        f.sum('synthesis').alias("synthesis_plus"),
                        f.sum('evaluation').alias("evaluation_plus"))
                df_knowledge_right = df_knowledge_right.where(
                    'student_id is not null')
                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()

                # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right")
                #
                # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right,
                #                                   mappings=[("timestart", "long", "timestart", "long"),
                #                                             ("student_id", 'int', 'student_id', 'long'),
                #                                             ("learning_object_id", "int", "learning_object_id", "int"),
                #                                             ("date_id", "string", "date_id", "int"),
                #                                             ("knowledge", "int", "knowledge", "int"),
                #                                             ("comprehension", "int", "comprehension", "int"),
                #                                             ("application", "int", "application", "int"),
                #                                             ("analysis", "int", "analysis", "int"),
                #                                             ("synthesis", "int", "synthesis", "int"),
                #                                             ("evaluation", "int", "evaluation", "int")])
                # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                #                                     transformation_ctx="resolvechoice2")
                # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
                #
                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "temp_right_wrong_learning_object",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/",
                #                                                            transformation_ctx="datasink5")

                # END adding points for correct words

                #################################################
                # Subtract points for wrong words: handled the same way as correct words.
                # Penalty rule: -5 points per wrong word

                df_study_wrong = df_study.withColumn(
                    "wrong_word_list", f.split(df_study.wrong_word_new, ','))

                # Explode the array column into multiple rows
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.explode(df_study_wrong.wrong_word_list))
                #convert to lowercase
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.lower(f.col("wrong")))

                df_study_wrong = df_study_wrong.select('quiz', 'name',
                                                       'student_id',
                                                       'timestart', 'wrong')
                # print("COUNT 2:", df_study_wrong.count())
                # df_study_wrong.printSchema()
                # df_study_wrong.show()

                dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong,
                                                      glueContext,
                                                      "dyf_study_wrong")
                ## Learning Object
                dyf_knowledge_wrong = Join.apply(dyf_study_wrong,
                                                 dyf_learning_object, 'wrong',
                                                 'learning_object_name')

                # print("COUNT 3:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()
                # print("COUNT 4:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()
                # Aggregate the point deductions for the wrong words
                df_knowledge_wrong = dyf_knowledge_wrong.toDF()
                df_knowledge_wrong.cache()

                df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-5)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \
                    .withColumn("analysis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \
                    .withColumn("synthesis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd'))

                df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id',
                                                                'learning_object_id').agg(
                    f.count('knowledge').alias("count_minus"),
                    f.sum('knowledge').alias("knowledge_minus"),
                    f.sum('comprehension').alias("comprehension_minus"),
                    f.sum('application').alias("application_minus"),
                    f.sum('analysis').alias("analysis_minus"),
                    f.sum('synthesis').alias("synthesis_minus"),
                    f.sum('evaluation').alias("evaluation_minus"))\
                    .withColumnRenamed('student_id', 'student_id_wrong') \
                    .withColumnRenamed('date_id', 'date_id_wrong') \
                    .withColumnRenamed('learning_object_id', 'learning_object_id_wrong')

                df_knowledge_wrong = df_knowledge_wrong.where(
                    'student_id_wrong is not null')
                # df_study_all = df_study.select('student_id').withColumnRenamed('student_id', 'student_id_all')

                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()
                df_knowledge = df_knowledge_right.join(
                    df_knowledge_wrong,
                    (df_knowledge_right['student_id']
                     == df_knowledge_wrong['student_id_wrong']) &
                    (df_knowledge_right['date_id']
                     == df_knowledge_wrong['date_id_wrong']) &
                    (df_knowledge_right['learning_object_id']
                     == df_knowledge_wrong['learning_object_id_wrong']),
                    'outer')

                df_knowledge = df_knowledge.withColumn("user_id",
                                check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \
                    .withColumn("learning_object_id",
                                check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \
                    .withColumn("created_date_id",
                                check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \
                    .withColumn("source_system", f.lit('top_result_ai')) \
                    .withColumn("lu_id", f.lit(0))

                dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext,
                                                    "df_knowledge")

                applymapping2 = ApplyMapping.apply(
                    frame=dyf_knowledge,
                    mappings=[
                        ("user_id", 'string', 'student_id', 'long'),
                        ("learning_object_id", "string", "learning_object_id",
                         "long"),
                        # ("knowledge", "int", "knowledge", "long"),
                        # ("comprehension", "int", "comprehension", "long"),
                        # ("application", "int", "application", "long"),
                        # ("analysis", "int", "analysis", "long"),
                        # ("synthesis", "int", "synthesis", "long"),
                        # ("evaluation", "int", "evaluation", "long"),
                        ("knowledge_plus", "long", "knowledge_plus", "long"),
                        ("comprehension_plus", "long", "comprehension_plus",
                         "long"),
                        ("application_plus", "long", "application_plus",
                         "long"),
                        ("analysis_plus", "long", "analysis_plus", "long"),
                        ("synthesis_plus", "long", "synthesis_plus", "long"),
                        ("evaluation_plus", "long", "evaluation_plus", "long"),
                        ("knowledge_minus", "long", "knowledge_minus", "long"),
                        ("comprehension_minus", "long", "comprehension_minus",
                         "long"),
                        ("application_minus", "long", "application_minus",
                         "long"),
                        ("analysis_minus", "long", "analysis_minus", "long"),
                        ("synthesis_minus", "long", "synthesis_minus", "long"),
                        ("evaluation_minus", "long", "evaluation_minus",
                         "long"),
                        ("count_plus", "long", "plus_number", "long"),
                        ("count_minus", "long", "minus_number", "long"),
                        # ("lo_type", "string", "lo_type", "long"),
                        ("source_system", "string", "source_system", "string"),
                        ("created_date_id", "string", "created_date_id",
                         "long"),
                        ("lu_id", "int", "lu_type", "long")
                        # ("student_level", "string", "student_level", "string"),
                        # ("advisor_id", "string", "advisor_id", "long"),
                        # ("package_code", "string", "package_code", "string")
                    ])

                applymapping2.printSchema()
                applymapping2.show(20)

                resolvechoice2 = ResolveChoice.apply(
                    frame=applymapping2,
                    choice="make_cols",
                    transformation_ctx="resolvechoice3")
                dropnullfields2 = DropNullFields.apply(
                    frame=resolvechoice2, transformation_ctx="dropnullfields2")

                print('COUNT df_knowledge: ', dropnullfields2.count())
                dropnullfields2.printSchema()
                dropnullfields2.show(2)

                print('START WRITE TO S3-------------------------')

                datasink6 = glueContext.write_dynamic_frame.from_options(
                    frame=dropnullfields2,
                    connection_type="s3",
                    connection_options={
                        "path":
                        "s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/",
                        "partitionKeys": ["created_date_id", "source_system"]
                    },
                    format="parquet",
                    transformation_ctx="datasink6")
                print('END WRITE TO S3-------------------------')
                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields2,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "mapping_lo_student_history",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/top_result_ai/",
                #                                                            transformation_ctx="datasink5")

                # END Tru diem cac tu sai

                # clear the cached DataFrames
                df_study.unpersist()
                df_knowledge_right.unpersist()
                df_knowledge_wrong.unpersist()
                # df_knowledge_right.unpersist()
            except Exception as e:
                print(
                    "###################### Exception ##########################"
                )
                print(e)

            # write the flag checkpoint
            # take the max _key from the data source
            mdl_dyf_top_quiz_attempts = dyf_top_quiz_attempts.toDF()
            flag = mdl_dyf_top_quiz_attempts.agg({
                "_key": "max"
            }).collect()[0][0]

            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')

            # overwrite the _key flag in S3
            df.write.parquet(
                "s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet",
                mode="overwrite")
Ejemplo n.º 20
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

datasource = glueContext.create_dynamic_frame.from_catalog(
    database=args['GLUE_DB_NAME'], table_name=args['GLUE_TABLE_NAME'])

sourcedata = datasource.toDF()

split_col = split(sourcedata["quarter"], " ")
sourcedata = sourcedata.withColumn("quarter new", split_col.getItem(0))
sourcedata = sourcedata.withColumn("profit",
                                   col("revenue") * col("gross margin"))
sourcedata = sourcedata.withColumn("current date", current_date())

# Convert back to Glue Dynamic Frame
datasource = DynamicFrame.fromDF(sourcedata, glueContext, "datasource")

applymapping = ApplyMapping.apply(
    frame=datasource,
    mappings=[
        ("retailer country", "string", "retailer_country", "varchar(20)"),
        ("order method type", "string", "order_method_type", "varchar(15)"),
        ("retailer type", "string", "retailer_type", "varchar(30)"),
        ("product line", "string", "product_line", "varchar(30)"),
        ("product type", "string", "product_type", "varchar(30)"),
        ("product", "string", "product", "varchar(50)"),
        ("year", "bigint", "year", "varchar(4)"),
        ("quarter new", "string", "quarter", "varchar(2)"),
        ("revenue", "double", "revenue", "numeric"),
        ("quantity", "bigint", "quantity", "integer"),
        ("gross margin", "double", "gross_margin", "decimal(15,10)"),
Ejemplo n.º 21
## @inputs: [frame = applymapping1]
resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                     choice="make_struct",
                                     transformation_ctx="resolvechoice2")
## @type: DropNullFields
## @args: [transformation_ctx = "dropnullfields3"]
## @return: dropnullfields3
## @inputs: [frame = resolvechoice2]
dropnullfields3 = DropNullFields.apply(frame=resolvechoice2,
                                       transformation_ctx="dropnullfields3")
## @type: DataSink
## @args: [connection_type = "s3", connection_options = {"path": "s3://go-lambda-bucket/Taxi_Data"}, format = "parquet", transformation_ctx = "datasink4"]
## @return: datasink4
## @inputs: [frame = dropnullfields3]
##----------------------------------
#convert to a Spark DataFrame...
customDF = datasource0.toDF()

#add a new column for "type"
customDF = customDF.withColumn("type", lit('yellow'))

# Convert back to a DynamicFrame for further processing.
customDynamicFrame = DynamicFrame.fromDF(customDF, glueContext, "customDF_df")
##----------------------------------
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=customDynamicFrame,
    connection_type="s3",
    connection_options={"path": "s3://go-lambda-bucket"},
    format="parquet",
    transformation_ctx="datasink4")
job.commit()
         None).otherwise(df['Province-State']).alias('Province-State'),
    'Country-Region', 'Lat', 'Long',
    when(df['Recovered_int'].isNull(),
         0).otherwise(df['Recovered_int']).alias('Recovered'),
    when(df['Confirmed_int'].isNull(),
         0).otherwise(df['Confirmed_int']).alias('Confirmed'),
    when(df['Deaths_int'].isNull(),
         0).otherwise(df['Deaths_int']).alias('Deaths'),
    when(
        to_date(col("Date"), "yyyy-MM-dd").isNotNull(),
        to_date(col("Date"), "yyyy-MM-dd")).when(
            to_date(col("Date"), "yyyy/MM/dd").isNotNull(),
            to_date(col("Date"), "yyyy/MM/dd")).when(
                to_date(col("Date"), "yyyy-MMM-dd").isNotNull(),
                to_date(col("Date"), "yyyy-MMM-dd")).when(
                    to_date(col("Date"), "yyyy/MMMM/dd").isNotNull(),
                    to_date(col("Date"), "yyyy/MMMM/dd")).when(
                        to_date(col("Date"), "yyyy, MMMM, dd").isNotNull(),
                        to_date(col("Date"), "yyyy, MMMM, dd")).otherwise(
                            "Unknown Format").alias("Date"), 'id')

datasource_transformed = DynamicFrame.fromDF(df, glueContext, "ds0")

datasink2 = glueContext.write_dynamic_frame.from_options(
    frame=datasource_transformed,
    connection_type="s3",
    connection_options={"path": "s3://pochetti-covid-19-output"},
    format="json",
    transformation_ctx="datasink2")

job.commit()
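The chained when/to_date calls above try several formats and fall back to the literal string "Unknown Format", which mixes strings and dates in the Date column. A minimal alternative sketch (not from the original job; it assumes the same column name) keeps the column as a date and leaves unparseable values null:

# Sketch only: take the first format that parses successfully.
from pyspark.sql.functions import coalesce, to_date, col

parsed_date = coalesce(
    to_date(col("Date"), "yyyy-MM-dd"),
    to_date(col("Date"), "yyyy/MM/dd"),
    to_date(col("Date"), "yyyy-MMM-dd"),
    to_date(col("Date"), "yyyy/MMMM/dd"),
    to_date(col("Date"), "yyyy, MMMM, dd")).alias("Date")
# parsed_date could replace the long when(...) chain inside the select above.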
Ejemplo n.º 23
def main():
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)

    start_date_id = 20200101
    end_date_id = 20200305
    print('start_date_id: ', start_date_id)
    print('end_date_id: ', end_date_id)
    #
    start_year_month_id, end_year_month_id = get_year_month_id_from_date(start_date_id, end_date_id)
    start_year_week_id, end_year_week_id = get_year_week_id_from_date(start_date_id, end_date_id)
    #
    print('start_year_month_id: ', start_year_month_id)
    print('end_year_month_id: ', end_year_month_id)

    print('start_year_week_id: ', start_year_week_id)
    print('end_year_week_id: ', end_year_week_id)

    print('start_year_week_id: ', start_year_week_id)
    print('end_year_week_id: ', end_year_week_id)

    # ------------------------------------------------------------------------------------------------------------------#
    df_student_package_status_by_date = get_student_package_adivsor_level(start_date_id, end_date_id)
    df_student_package_status_by_date.cache()

    df_student_learning_and_duration_by_date = get_total_student_lerning_and_duration_by_date(glueContext,
                                                                              start_year_month_id,
                                                                              end_year_month_id)

    df_student_package_status_by_date_learning = df_student_package_status_by_date\
        .join(df_student_learning_and_duration_by_date,
              on=['contact_id', 'date_id'],
              how='left')

    df_student_package_status_by_date_learning = df_student_package_status_by_date_learning.na.fill({
        'total_learning_ls_sc_lt_le2': 0L,
        'total_learning_ls_sc_lt_le2_success': 0L,

        'total_learning_ls_sc_lt': 0L,
        'total_learning_ls_sc_lt_success': 0L,

        'total_learning_ls_success': 0L,
        'total_learning_sc_success': 0L,
        'total_learning_lt_success': 0L,

        'total_duration_ls_sc_lt': 0L,

        'total_learning_le2': 0L,
        'total_learning_le2_success': 0L,

        'total_learning_voxy_success': 0L,
        'total_learning_native_talk_success': 0L,
        'total_learning_home_work_success': 0L,
        'total_learning_ncsbasic_success': 0L,

        'total_duration_le2': 0L,
        'total_duration_voxy': 0L,
        'total_duration_native_talk': 0L,
        'total_duration_home_work': 0L,
        'total_duration_ncsbasic': 0L
    })

    df_student_package_status_by_date_learning.cache()

    print('df_student_package_status_by_date_learning')
    df_student_package_status_by_date_learning.printSchema()
    df_student_package_status_by_date_learning.show(3)

    if is_dev:
        dyf_student_package_status_by_date_learning = DynamicFrame \
            .fromDF(df_student_package_status_by_date_learning, glueContext, 'dyf_student_package_status_by_date_learning')
        datasink4 = glueContext.write_dynamic_frame \
            .from_jdbc_conf(frame=dyf_student_package_status_by_date_learning,
                            catalog_connection="glue_redshift",
                            connection_options={
                                "dbtable": "dev.df_student_package_status_by_date_learning",
                                "database": "student_native_report"
                            },
                            redshift_tmp_dir="s3://dts-odin/temp/nvn/knowledge/student/df_student_package_status_by_date_learning",
                            transformation_ctx="datasink4")


    #-------------- save to bc200_fact

    df_student_package_status_by_date_learning = df_student_package_status_by_date_learning \
        .select('date_id', 'package_id', 'student_level_id', 'contact_id', 'advisor_id',
            'is_activated',

            f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_le2'] > 0L, 1L)
            .otherwise(0L).alias('is_ls_sc_lt_le2'),
            f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_le2_success'] > 0L, 1L)
            .otherwise(0L).alias('is_ls_sc_lt_le2_success'),

            f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt'] > 0L, 1L)
            .otherwise(0L).alias('is_ls_sc_lt'),
            f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_success'] > 0L, 1L)
            .otherwise(0L).alias('is_ls_sc_lt_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_ls_success'] > 0L, 1L)
            .otherwise(0L).alias('is_ls_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_sc_success'] > 0L, 1L)
            .otherwise(0L).alias('is_sc_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_lt_success'] > 0L, 1L)
            .otherwise(0L).alias('is_lt_success'),

            f.when(df_student_package_status_by_date_learning['total_learning_le2'] > 0L, 1L)
            .otherwise(0L).alias('is_le2'),
            f.when(df_student_package_status_by_date_learning['total_learning_le2_success'] > 0L, 1L)
            .otherwise(0L).alias('is_le2_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_voxy_success'] > 0L, 1L)
            .otherwise(0L).alias('is_voxy_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_native_talk_success'] > 0L, 1L)
            .otherwise(0L).alias('is_native_talk_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_home_work_success'] > 0L, 1L)
            .otherwise(0L).alias('is_home_work_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_ncsbasic_success'] > 0L, 1L)
            .otherwise(0L).alias('is_ncsbasic_success'),

            'total_learning_ls_sc_lt_le2',
            'total_learning_ls_sc_lt_le2_success',

            'total_learning_ls_sc_lt',
            'total_learning_ls_sc_lt_success',
            'total_learning_ls_success',
            'total_learning_sc_success',
            'total_learning_lt_success',

            'total_duration_ls_sc_lt',

            'total_learning_le2',
            'total_learning_le2_success',
            'total_learning_voxy_success',
            'total_learning_native_talk_success',
            'total_learning_home_work_success',
            'total_learning_ncsbasic_success',

            'total_duration_le2',
            'total_duration_voxy',
            'total_duration_native_talk',
            'total_duration_home_work',
            'total_duration_ncsbasic'
        )

    df_student_package_status_group_week = df_student_package_status_by_date_learning \
        .groupBy('date_id', 'package_id', 'student_level_id', 'advisor_id') \
        .agg(f.count('contact_id').alias('total_student'),
             f.sum('is_activated').alias('total_student_active'),

             f.sum('is_ls_sc_lt_le2').alias('total_student_ls_sc_lt_le2'),
             f.sum('is_ls_sc_lt_le2_success').alias('total_student_ls_sc_lt_le2_success'),

             f.sum('total_learning_ls_sc_lt_le2').alias('total_learning_ls_sc_lt_le2'),
             f.sum('total_learning_ls_sc_lt_le2_success').alias('total_learning_ls_sc_lt_le2_success'),

             f.sum('is_ls_sc_lt').alias('total_student_ls_sc_lt'),
             f.sum('is_ls_sc_lt_success').alias('total_student_ls_sc_lt_success'),
             f.sum('is_ls_success').alias('total_student_ls_success'),
             f.sum('is_sc_success').alias('total_student_sc_success'),
             f.sum('is_lt_success').alias('total_student_lt_success'),

             f.sum('total_learning_ls_sc_lt').alias('total_learning_ls_sc_lt'),
             f.sum('total_learning_ls_sc_lt_success').alias('total_learning_ls_sc_lt_success'),
             f.sum('total_learning_ls_success').alias('total_learning_ls_success'),
             f.sum('total_learning_sc_success').alias('total_learning_sc_success'),
             f.sum('total_learning_lt_success').alias('total_learning_lt_success'),

             f.sum('total_duration_ls_sc_lt').alias('total_duration_ls_sc_lt'),

             f.sum('is_le2').alias('total_student_le2'),
             f.sum('is_le2_success').alias('total_student_le2_success'),
             f.sum('is_voxy_success').alias('total_student_voxy_success'),
             f.sum('is_native_talk_success').alias('total_student_native_talk_success'),
             f.sum('is_home_work_success').alias('total_student_home_work_success'),
             f.sum('is_ncsbasic_success').alias('total_student_ncsbasic_success'),

             f.sum('total_learning_le2').alias('total_learning_le2'),
             f.sum('total_learning_le2_success').alias('total_learning_le2_success'),
             f.sum('total_learning_voxy_success').alias('total_learning_voxy__success'),
             f.sum('total_learning_native_talk_success').alias('total_learning_native_talk_success'),
             f.sum('total_learning_home_work_success').alias('total_learning_home_work_success'),
             f.sum('total_learning_ncsbasic_success').alias('total_learning_ncsbasic_success'),

             f.sum('total_duration_le2').alias('total_duration_le2'),
             f.sum('total_duration_voxy').alias('total_duration_voxy'),
             f.sum('total_duration_native_talk').alias('total_duration_native_talk'),
             f.sum('total_duration_home_work').alias('total_duration_home_work'),
             f.sum('total_duration_ncsbasic').alias('total_duration_ncsbasic')
             ) \
        .withColumn('period_id', f.lit(DAILY_PERIOD_ID)) \
        .withColumn('report_role_id', f.lit(REPORT_ROLE_MANAGER_ID))

    # display(df_student_package_status_group_week, "df_student_package_status_group_week")

    dyf_student_package_status_group_week = DynamicFrame.fromDF(df_student_package_status_group_week,
                                                                glueContext,
                                                                'dyf_student_package_status_group_week')

    apply_ouput = ApplyMapping \
        .apply(frame=dyf_student_package_status_group_week,
               mappings=[("report_role_id", "long", "report_role_id", "long"),
                         ("period_id", "long", "period_id", "long"),
                         ("date_id", "long", "time_id", "long"),

                         ("package_id", "long", "package_id", "long"),
                         ("student_level_id", "long", "student_level_id", "long"),
                         ("advisor_id", "long", "advisor_id", "long"),

                         ("total_student", "long", "total_student", "long"),
                         ("total_student_active", "long", "total_student_active", "long"),

                         ("total_student_ls_sc_lt_le2", "long", "total_student_ls_sc_lt_le2", "long"),
                         ("total_student_ls_sc_lt_le2_success", "long", "total_student_ls_sc_lt_le2_success", "long"),
                         ("total_learning_ls_sc_lt_le2", "long", "total_learning_ls_sc_lt_le2", "long"),
                         ("total_learning_ls_sc_lt_le2_success", "long", "total_learning_ls_sc_lt_le2_success", "long"),

                         ("total_student_ls_sc_lt", "long", "total_student_ls_sc_lt", "long"),
                         ("total_student_ls_sc_lt_success", "long", "total_student_ls_sc_lt_success", "long"),
                         ("total_student_ls_success", "long", "total_student_ls_success", "long"),
                         ("total_student_sc_success", "long", "total_student_sc_success", "long"),
                         ("total_student_lt_success", "long", "total_student_lt_success", "long"),

                         ("total_learning_ls_sc_lt", "long", "total_learning_ls_sc_lt", "long"),
                         ("total_learning_ls_sc_lt_success", "long", "total_learning_ls_sc_lt_success", "long"),
                         ("total_learning_ls_success", "long", "total_learning_ls_success", "long"),
                         ("total_learning_sc_success", "long", "total_learning_sc_success", "long"),
                         ("total_learning_lt_success", "long", "total_learning_lt_success", "long"),

                         ("total_duration_ls_sc_lt", "long", "total_duration_ls_sc_lt", "long"),

                         ("total_student_le2", "long", "total_student_le2", "long"),
                         ("total_student_le2_success", "long", "total_student_le2_success", "long"),
                         ("total_student_voxy_success", "long", "total_student_voxy_success", "long"),
                         ("total_student_native_talk_success", "long", "total_student_native_talk_success", "long"),
                         ("total_student_home_work_success", "long", "total_student_home_work_success", "long"),
                         ("total_student_ncsbasic_success", "long", "total_student_ncsbasic_success", "long"),

                         ("total_learning_le2", "long", "total_learning_le2", "long"),
                         ("total_learning_le2_success", "long", "total_learning_le2_success", "long"),
                         ("total_learning_voxy__success", "long", "total_learning_voxy__success", "long"),
                         ("total_learning_native_talk_success", "long", "total_learning_native_talk_success", "long"),
                         ("total_learning_home_work_success", "long", "total_learning_home_work_success", "long"),
                         ("total_learning_ncsbasic_success", "long", "total_learning_ncsbasic_success", "long"),

                         ("total_duration_le2", "long", "total_duration_le2", "long"),
                         ("total_duration_voxy", "long", "total_duration_voxy", "long"),
                         ("total_duration_native_talk", "long", "total_duration_native_talk", "long"),
                         ("total_duration_home_work", "long", "total_duration_home_work", "long"),
                         ("total_duration_ncsbasic", "long", "total_duration_ncsbasic", "long")
                         ])

    dfy_output = ResolveChoice.apply(frame=apply_ouput, choice="make_cols", transformation_ctx="resolvechoice2")

    display(dfy_output, "dfy_output")

    # save_data_to_redshift(
    #     glueContext,
    #     dfy_output,
    #     'student_native_report',
    #     'bc200.bc200_fact_v2_1',
    #     "s3n://dts-odin/temp/bc200/bc200_fact_v2_1",
    #     "datasink4")
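    # A minimal sketch (assumption: the helper is defined elsewhere in this job) of what
    # the commented-out save_data_to_redshift call above could wrap; it would simply
    # forward to write_dynamic_frame.from_jdbc_conf with the catalog connection used below.
    # def save_data_to_redshift(glue_context, frame, database, dbtable, tmp_dir, ctx):
    #     glue_context.write_dynamic_frame.from_jdbc_conf(frame=frame,
    #                                                     catalog_connection="glue_redshift",
    #                                                     connection_options={"dbtable": dbtable,
    #                                                                         "database": database},
    #                                                     redshift_tmp_dir=tmp_dir,
    #                                                     transformation_ctx=ctx)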

    preactions = "DELETE from bc200.bc200_fact_v2_1 WHERE period_id = " + str(DAILY_PERIOD_ID) + " and time_id >= " + str(start_date_id)
    glueContext.write_dynamic_frame.from_jdbc_conf(frame=dfy_output,
                                                   catalog_connection="glue_redshift",
                                                   connection_options={
                                                       "preactions": preactions,
                                                       "dbtable": "bc200.bc200_fact_v2_1",
                                                       "database": "student_native_report"
                                                   },
                                                   redshift_tmp_dir="s3n://dts-odin/temp/bc200/bc200_fact_v2",
                                                   transformation_ctx="datasink4")




    #-------------------------------------------------------

    df_student_package_status_by_date_learning.unpersist()
    df_student_package_status_by_date.unpersist()
Ejemplo n.º 24
0
              ("body", "string", "body", "string"),
              ("stocktwitssentiment", "string", "stocktwitssentiment",
               "string")])

#convert aws glue dynamicframes to spark dataframes
stw = dynframe_stocktwits.toDF()

#transform time format
#from e.g. 2019-10-25T00:11:11Z to 2019-10-25 00:11:11
stw = stw.withColumn("createdat",
                     f.regexp_replace(f.col("createdat"), "[T]", " "))
stw = stw.withColumn("createdat",
                     f.regexp_replace(f.col("createdat"), "[Z]", ""))

#remove [\\n\\t\$#]
stw = stw.withColumn("body", f.regexp_replace(f.col("body"), "[\\n\\t\$#]",
                                              ""))

#convert spark dataframes back to aws glue dynamicframes
dynframe_stocktwits = DynamicFrame.fromDF(stw, glueContext, "nested")

#partition to 1 to get a single s3 file as output
dynframe_output = dynframe_stocktwits.repartition(1)

datasink = glueContext.write_dynamic_frame.from_options(
    frame=dynframe_output,
    connection_type="s3",
    connection_options={"path": "s3://541304926041-stocktwits"},
    format="csv")
job.commit()
		return True
	else:
		return False

# Apply filter function to dynamic frame
interactions = Filter.apply(frame = datasource0, f = filter_function, transformation_ctx = "interactions")
print("Filtered record count: ", interactions.count())

# Map only the fields we want in the output CSV, changing names to match target schema.
applymapping1 = ApplyMapping.apply(frame = interactions, mappings = [ \
	("anonymousId", "string", "ANONYMOUS_ID", "string"), \
	("userId", "string", "USER_ID", "string"), \
	("properties.sku", "string", "ITEM_ID", "string"), \
	("event", "string", "EVENT_TYPE", "string"), \
	("timestamp", "string", "TIMESTAMP_ISO", "string")], \
	transformation_ctx = "applymapping1")

# Repartition to a single file since that is what is required by Personalize
onepartitionDF = applymapping1.toDF().repartition(1)
# Coalesce timestamp into unix timestamp
onepartitionDF = onepartitionDF.withColumn("TIMESTAMP", \
	unix_timestamp(onepartitionDF['TIMESTAMP_ISO'], "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))
# Convert back to dynamic frame
onepartition = DynamicFrame.fromDF(onepartitionDF, glueContext, "onepartition_df")

# Write output back to S3 as a CSV
glueContext.write_dynamic_frame.from_options(frame = onepartition, connection_type = "s3", \
	connection_options = {"path": args['S3_CSV_OUTPUT_PATH']}, \
	format = "csv", transformation_ctx = "datasink2")

job.commit()
Ejemplo n.º 26
0
## @return: resolvechoice2
## @inputs: [frame = applymapping1]
resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                     choice="make_struct",
                                     transformation_ctx="resolvechoice2")

filtered_dyDF = Filter.apply(
    frame=resolvechoice2,
    f=lambda x: x["pickup_longitude"] != 0 and x["pickup_latitude"] != 0 and x[
        "dropoff_longitude"] != 0 and x["dropoff_latitude"] != 0 and x[
            "tpep_dropoff_datetime"] > x["tpep_pickup_datetime"])
yellow_DF = filtered_dyDF.toDF()
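# Tag each record as a yellow-cab trip and add empty (null) pickup/dropoff location-id
# columns, presumably so the schema lines up with other cab types written to the shared
# "staging/trips" path below.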
yellow_DF = yellow_DF.withColumn('cab_type', lit('yellow').astype('string')) \
    .withColumn('pickup_location_id', lit(None).astype('byte')) \
    .withColumn('dropoff_location_id', lit(None).astype('byte'))
target_df = DynamicFrame.fromDF(yellow_DF, glueContext, "target_df")
## @type: DropNullFields
## @args: [transformation_ctx = "dropnullfields3"]
## @return: dropnullfields3
## @inputs: [frame = resolvechoice2]
# dropnullfields3 = DropNullFields.apply(frame = resolvechoice2, transformation_ctx = "dropnullfields3")
## @type: DataSink
## @args: [connection_type = "s3", connection_options = {"path": "s3://taxi-data-etl/staging/yellow"}, format = "parquet", transformation_ctx = "datasink4"]
## @return: datasink4
## @inputs: [frame = dropnullfields3]
sink = glueContext.write_dynamic_frame.from_options(
    frame=target_df,
    connection_type="s3",
    connection_options={"path": "s3://taxi-data-etl/staging/trips"},
    format="parquet",
    transformation_ctx="sink")
Ejemplo n.º 27
0
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)

    #------------------------------------------------------------------------------------------------------------------#

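    # Returns the smaller of solan_baoluu and songay_baoluu (apparently the deferral
    # count and the number of deferral days), treating nulls as 0.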
    def getSolanBaoLuu(solan_baoluu, songay_baoluu):
        if solan_baoluu is None:
            solan_baoluu = 0
        if songay_baoluu is None:
            songay_baoluu = 0
        if solan_baoluu > songay_baoluu:
            return songay_baoluu
        return solan_baoluu

    getSolanBaoLuu = udf(getSolanBaoLuu, LongType())

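    # Returns the larger of the two values, treating nulls as 0.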
    def getSoNgayBaoLuu(solan_baoluu, songay_baoluu):
        if solan_baoluu is None:
            solan_baoluu = 0
        if songay_baoluu is None:
            songay_baoluu = 0
        if songay_baoluu > solan_baoluu:
            return songay_baoluu
        return solan_baoluu

    getSoNgayBaoLuu = udf(getSoNgayBaoLuu, LongType())

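    # Prefer the CRM contact code when it is present; otherwise fall back to the
    # contact_id coming from tig_advisor.student_contact.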
    def getContactId(code, contact_id_advisor):
        if code is not None:
            return code
        return contact_id_advisor

    getContactId = udf(getContactId, StringType())

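    # Concatenate every non-null behaviour field into one string; its MD5 hash is used
    # further down as the student_behavior_id surrogate key.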
    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = udf(concaText, StringType())

    # ------------------------------------------------------------------------------------------------------------------#

    #------------------------------------------------------------------------------------------------------------------#
    dyf_poss_ghinhan_hp = glueContext.create_dynamic_frame.from_catalog(
        database='poss', table_name='ghinhan_hp')

    dyf_poss_ghinhan_hp = dyf_poss_ghinhan_hp.select_fields([
        '_key', 'id', 'ngay_thanhtoan', 'so_tien', 'khoa_hoc_makh',
        'trang_thai'
    ])
    dyf_poss_ghinhan_hp = dyf_poss_ghinhan_hp.resolveChoice(
        specs=[('_key', 'cast:long')])

    try:
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/student_behavior/sb1_dong_tien.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_poss_ghinhan_hp = Filter.apply(
            frame=dyf_poss_ghinhan_hp, f=lambda x: x["_key"] > read_from_index)
    except:
        print('read flag file error ')

    dyf_poss_ghinhan_hp_number = dyf_poss_ghinhan_hp.count()
    print('dyf_poss_ghinhan_hp_number: ', dyf_poss_ghinhan_hp_number)
    if dyf_poss_ghinhan_hp_number < 1:
        return

    #-------------------------------------------------------------------------------------------------------------------#
    dyf_poss_khoa_hoc = glueContext.create_dynamic_frame.from_catalog(
        database='poss', table_name='khoa_hoc')

    dyf_poss_khoa_hoc = dyf_poss_khoa_hoc.select_fields(
        ['makh', 'mahv', 'goi_sanpham_id', 'trang_thai'])

    # -------------------------------------------------------------------------------------------------------------------#
    dyf_poss_hoc_vien = glueContext.create_dynamic_frame.from_catalog(
        database='poss', table_name='hoc_vien')

    dyf_poss_hoc_vien = dyf_poss_hoc_vien.select_fields(
        ['mahv', 'crm_id', 'trang_thai']).rename_field('mahv', 'mahv_hv')

    # -------------------------------------------------------------------------------------------------------------------#
    dyf_poss_goi_sanpham = glueContext.create_dynamic_frame.from_catalog(
        database='poss', table_name='goi_sanpham')

    dyf_poss_goi_sanpham = dyf_poss_goi_sanpham.select_fields(
        ['ma', 'id', 'solan_baoluu', 'songay_baoluu', 'trang_thai'])

    # -------------------------------------------------------------------------------------------------------------------#

    # -------------------------------------------------------------------------------------------------------------------#
    dyf_crm_goi_contacts = glueContext.create_dynamic_frame.from_catalog(
        database='crm_native', table_name='contacts')

    # print('dyf_crm_goi_contacts::full')
    #     # dyf_crm_goi_contacts.printSchema()

    dyf_crm_goi_contacts = dyf_crm_goi_contacts.select_fields(
        ['Code']).rename_field('Code', 'code')
    dyf_crm_goi_contacts = Filter.apply(
        frame=dyf_crm_goi_contacts,
        f=lambda x: x["code"] is not None and x["code"] != '')
    dy_crm_goi_contacts = dyf_crm_goi_contacts.toDF()
    dy_crm_goi_contacts = dy_crm_goi_contacts.dropDuplicates()
    # print('dy_crm_goi_contacts')
    # dy_crm_goi_contacts.printSchema()

    # -------------------------------------------------------------------------------------------------------------------#

    dyf_advisor_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database='tig_advisor', table_name='student_contact')

    dyf_advisor_student_contact = dyf_advisor_student_contact.select_fields(
        ['student_id', 'contact_id'])
    dyf_advisor_student_contact = Filter.apply(frame=dyf_advisor_student_contact,
                                        f=lambda x: x["student_id"] is not None and x["student_id"] != ''
                                               and x["contact_id"] is not None and x["contact_id"] != '')\
                                        .rename_field('student_id', 'student_id_advisor')\
                                        .rename_field('contact_id', 'contact_id_advisor')

    dy_advisor_student_contact = dyf_advisor_student_contact.toDF()
    dy_advisor_student_contact = dy_advisor_student_contact.dropDuplicates(
        ['student_id_advisor'])

    # print('dy_advisor_student_contact')
    # dy_advisor_student_contact.printSchema()

    # -------------------------------------------------------------------------------------------------------------------#

    # print('dyf_poss_ghinhan_hp')
    # dyf_poss_ghinhan_hp.printSchema()
    #
    # print('dyf_poss_khoa_hoc')
    # dyf_poss_khoa_hoc.printSchema()
    #
    # print('dyf_poss_hoc_vien')
    # dyf_poss_hoc_vien.printSchema()
    #
    # print('dyf_poss_goi_sanpham')
    # dyf_poss_goi_sanpham.printSchema()

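    # Convert each catalog frame to a Spark DataFrame and drop duplicates on its join key
    # before joining them together.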
    dy_poss_ghinhan_hp = dyf_poss_ghinhan_hp.toDF()
    dy_poss_ghinhan_hp = dy_poss_ghinhan_hp.dropDuplicates(['id'])

    dy_poss_khoa_hoc = dyf_poss_khoa_hoc.toDF()
    dy_poss_khoa_hoc = dy_poss_khoa_hoc.dropDuplicates(['makh', 'mahv'])

    dy_poss_hoc_vien = dyf_poss_hoc_vien.toDF()
    dy_poss_hoc_vien = dy_poss_hoc_vien.dropDuplicates(['mahv_hv'])

    dy_poss_goi_sanpham = dyf_poss_goi_sanpham.toDF()
    dy_poss_goi_sanpham = dy_poss_goi_sanpham.dropDuplicates(['id'])

    poss_ghinhan_hp_number = dy_poss_ghinhan_hp.count()
    # print('poss_ghinhan_hp_number: ', poss_ghinhan_hp_number)

    if poss_ghinhan_hp_number < 1:
        return

    df_dong_tien = dy_poss_ghinhan_hp.join(dy_poss_khoa_hoc,
                                           dy_poss_ghinhan_hp.khoa_hoc_makh == dy_poss_khoa_hoc.makh, 'left')\
        .join(dy_poss_hoc_vien, dy_poss_hoc_vien.mahv_hv == dy_poss_khoa_hoc.mahv, 'left')\
        .join(dy_poss_goi_sanpham, dy_poss_goi_sanpham.id == dy_poss_khoa_hoc.goi_sanpham_id, 'left')

    df_dong_tien = df_dong_tien.select(
        'ngay_thanhtoan', 'ma', 'crm_id', 'so_tien',
        getSolanBaoLuu(df_dong_tien['solan_baoluu'],
                       df_dong_tien['songay_baoluu']).alias('solan_baoluu_t'),
        getSoNgayBaoLuu(
            df_dong_tien['solan_baoluu'],
            df_dong_tien['songay_baoluu']).alias('songay_baoluu_t'))

    # print('df_dong_tien')
    # df_dong_tien.printSchema()

    #check lms_id and contact_id

    df_dong_tien_student = df_dong_tien.join(dy_crm_goi_contacts, df_dong_tien.crm_id == dy_crm_goi_contacts.code, 'left')\
        .join(dy_advisor_student_contact, df_dong_tien.crm_id == dy_advisor_student_contact.student_id_advisor, 'left')

    # print('df_dong_tien_student-----')
    # df_dong_tien_student.printSchema()

    df_dong_tien_student = df_dong_tien_student.filter(
        df_dong_tien_student.code.isNotNull()
        | (df_dong_tien_student.contact_id_advisor.isNotNull()))

    df_dong_tien_student = df_dong_tien_student.limit(100)

    student_id_unavailable = 0L
    package_endtime_unavailable = 0L
    package_starttime_unavailable = 0L
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'
    measure1_unavailable = 0
    measure2_unavailable = 0
    measure3_unavailable = 0
    measure4_unavailable = float(0.0)

    df_dong_tien_student = df_dong_tien_student.select(
        f.unix_timestamp(df_dong_tien_student.ngay_thanhtoan,
                         'yyyy-MM-dd').alias('student_behavior_date'),
        f.lit(1L).alias('behavior_id'),
        f.lit(student_id_unavailable).cast('long').alias('student_id'),
        getContactId(
            df_dong_tien_student.code,
            df_dong_tien_student.contact_id_advisor).alias('contact_id'),
        df_dong_tien_student.ma.alias('package_code'),
        f.lit(package_endtime_unavailable).cast('long').alias(
            'package_endtime'),
        f.lit(package_starttime_unavailable).cast('long').alias(
            'package_starttime'),
        f.lit(student_level_code_unavailable).cast('string').alias(
            'student_level_code'),
        f.lit(student_status_code_unavailable).cast('string').alias(
            'student_status_code'),
        f.lit(today_second).alias('transformed_at'), 'so_tien',
        'solan_baoluu_t', 'songay_baoluu_t',
        f.lit(measure4_unavailable).alias('measure4'))

    print('df_dong_tien_student--1')
    df_dong_tien_student.printSchema()
    df_dong_tien_student.show(1)

    df_dong_tien_student2 = df_dong_tien_student.withColumn(
        'student_behavior_id',
        f.md5(
            concaText(df_dong_tien_student.student_behavior_date,
                      df_dong_tien_student.behavior_id,
                      df_dong_tien_student.student_id,
                      df_dong_tien_student.contact_id,
                      df_dong_tien_student.package_code,
                      df_dong_tien_student.package_endtime,
                      df_dong_tien_student.package_starttime,
                      df_dong_tien_student.student_level_code,
                      df_dong_tien_student.student_status_code,
                      df_dong_tien_student.transformed_at)))

    print('df_dong_tien_student2--2')
    df_dong_tien_student2.printSchema()
    df_dong_tien_student2.show(5)

    dyf_dong_tien_student = DynamicFrame.fromDF(df_dong_tien_student2,
                                                glueContext,
                                                'dyf_dong_tien_student')

    dyf_dong_tien_student = Filter.apply(
        frame=dyf_dong_tien_student,
        f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

    apply_ouput = ApplyMapping.apply(
        frame=dyf_dong_tien_student,
        mappings=[
            ("student_behavior_id", "string", "student_behavior_id", "string"),
            ("student_behavior_date", "long", "student_behavior_date", "long"),
            ("behavior_id", "long", "behavior_id", "long"),
            ("student_id", "long", "student_id", "long"),
            ("contact_id", "string", "contact_id", "string"),
            ("package_code", "long", "package_code", "string"),
            ("package_endtime", "long", "package_endtime", "long"),
            ("package_starttime", "long", "package_starttime", "long"),
            ("student_level_code", "string", "student_level_code", "string"),
            ("student_status_code", "string", "student_status_code", "string"),
            ("transformed_at", "long", "transformed_at", "long")
        ])

    dfy_output = ResolveChoice.apply(frame=apply_ouput,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")

    glueContext.write_dynamic_frame.from_options(
        frame=dfy_output,
        connection_type="s3",
        connection_options={
            "path": "s3://dtsodin/student_behavior/student_behavior",
            "partitionKeys": ["behavior_id"]
        },
        format="parquet")

    apply_general = ApplyMapping.apply(
        frame=dyf_dong_tien_student,
        mappings=[("student_behavior_id", "string", "student_behavior_id",
                   "string"), ("so_tien", "double", "measure1", "float"),
                  ("solan_baoluu_t", "long", "measure2", "float"),
                  ("songay_baoluu_t", "long", "measure3", "float"),
                  ("measure4", "float", "measure4", "float"),
                  ("behavior_id", "long", "behavior_id", "long")])

    dfy_output2 = ResolveChoice.apply(frame=apply_general,
                                      choice="make_cols",
                                      transformation_ctx="resolvechoice2")

    print('dfy_output2::')
    dfy_output2.show(5)

    glueContext.write_dynamic_frame.from_options(
        frame=dfy_output2,
        connection_type="s3",
        connection_options={
            "path": "s3://dtsodin/student_behavior/student_general_behavior",
            "partitionKeys": ["behavior_id"]
        },
        format="parquet")

    flag = dy_poss_ghinhan_hp.agg({"_key": "max"}).collect()[0][0]
    flag_data = [flag]
    df = spark.createDataFrame(flag_data, "long").toDF('flag')
    df.write.parquet(
        "s3a://dtsodin/flag/student_behavior/sb1_dong_tien.parquet",
        mode="overwrite")
Ejemplo n.º 28
0
        table_columns = table['Table']['StorageDescriptor']['Columns']
        s3_destination = str(table['Table']['StorageDescriptor']['Location'])

        # Create Dynamic Frame from S3 CSV Object
        dynamicFrame = glueContext.create_dynamic_frame_from_options(connection_type = "s3", connection_options = {"paths": [s3_source_path]}, format_options={"withHeader": True,"separator": ","}, format = "csv")

        # Convert to Spark Data Frame
        dataFrame = dynamicFrame.toDF()

        # Cast Column types from Glue Table into Spark Data Frame
        for column in table_columns:
            dataFrame = dataFrame.withColumn(column['Name'], dataFrame[column['Name']].cast(column['Type']))

        # Convert back to Glue Dynamic Frame for S3 upload
        final_dynamicFrame = DynamicFrame.fromDF(dataFrame, glueContext, "final_dynamicFrame")

        # Delete any unnecessary columns
        final_dynamicFrame = final_dynamicFrame.drop_fields(['col4', 'col5', 'col6'])

        # Send dynamic frame to S3 as parquet files. S3 location specified by the given Glue table
        glueContext.write_dynamic_frame.from_options(frame = final_dynamicFrame, connection_type = "s3", connection_options = {"path":s3_destination}, format = "parquet")

        # Successfully converted CSV file. Move CSV file to processed folder.
        s3_resource.Object(bucket, "processed/"+key).copy_from( CopySource=bucket+"/"+key)
        s3_resource.Object(bucket, key).delete()

    except Exception as e:
        print("Conversion failed. Moving object to error folder. error message: "+str(e))
        s3_resource.Object(bucket, "error/"+key).copy_from( CopySource=bucket+"/"+key)
        s3_resource.Object(bucket, key).delete()
                    endpoint_url='https://glue.us-west-2.amazonaws.com')

######################################
####        CONNECTION BLOCK      ####
######################################

## ref_bizunit_scoped connection
refBizScopedCon_ds = glueContext.create_dynamic_frame.from_catalog(
    database="staging_incremental",
    table_name="ref_bizunit_scoped",
    transformation_ctx="refBizScopedCon_ds")
refBizScopedCon_regDF = refBizScopedCon_ds.toDF()
refBizScopedCon_regDF = refBizScopedCon_regDF.withColumn(
    "sourcesystem", lit("PNCT")).withColumn("audtdateadded",
                                            current_timestamp())
refBizScopedCon_dynDF = DynamicFrame.fromDF(refBizScopedCon_regDF, glueContext,
                                            "nested")

## ref_carrier_itinerary connection
refCarItinCon_ds = glueContext.create_dynamic_frame.from_catalog(
    database="staging_incremental",
    table_name="ref_carrier_itinerary",
    transformation_ctx="refCarItinCon_ds")
refCarItinCon_regDF = refCarItinCon_ds.toDF()
refCarItinCon_regDF = refCarItinCon_regDF.withColumn(
    "sourcesystem", lit("PNCT")).withColumn("audtdateadded",
                                            current_timestamp())
refCarItinCon_dynDF = DynamicFrame.fromDF(refCarItinCon_regDF, glueContext,
                                          "nested")

## ref_carrier_service connection
refCarServCon_ds = glueContext.create_dynamic_frame.from_catalog(
Ejemplo n.º 30
0
    # Returns a List['Natural_Key']
    NATURAL_KEY = FINAL_TUPLE_WITH_DF_AND_MD5[1]

    # Taking the natural key that was passed in the JSON file.
    NATURAL_KEY_1 = NATURAL_KEY[0]

    # Taking the value from SOURCE_NAME column (example : "HR PERSON") from
    # FINAL_MD5_DF
    POST_QUERY_SOURCE_NAME = FINAL_MD5_DF.select(
        "source_name").limit(1).rdd.map(lambda a: a[0]).collect()[0]
    print('#######>>>>>>>POST_QUERY_SOURCE_NAME', POST_QUERY_SOURCE_NAME)

    # Final Data frame is converted to Dynamic frame
    # Final Dynamic Frame will be written to Stage Table
    FINAL_DYNAMIC_FRAME = DynamicFrame.fromDF(FINAL_MD5_DF,
                                              GLUECONTEXT,
                                              "Final_dynamic_frame")

    # Updates, Inserts and Deletes count logic here
    # 1. Create a DF with counts and op_val, grouped by JobId and op_val
    # 2. Extract inserts, updates and deletes
    # 3. Add them to the CloudWatch logs.

    COUNT_DF = FINAL_MD5_DF.withColumn('JobRunId', F.lit(str(RUN_ID)))\
                           .withColumn('JobName', F.lit(str(RUN_ID)))
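    # A minimal sketch of the counts logic described above (assumption: an 'op_val'
    # column marks each row as an insert/update/delete); anything printed here ends up
    # in the job's CloudWatch log stream.
    # op_counts = COUNT_DF.groupBy('JobRunId', 'op_val').count().collect()
    # for row in op_counts:
    #     print('JobRunId={}, op={}, count={}'.format(row['JobRunId'], row['op_val'], row['count']))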

    # Truncating the stage table
    PRE_QUERY = """begin;
    truncate table {stage_database_name}.{stage_table};
    end;""".format(stage_database_name=STAGE_DATABASE_NAME,
                   stage_table=STAGE_TABLE)
Ejemplo n.º 31
0
    .withColumn("queued_time", expr("CAST(qtime AS LONG)")) \
    .withColumn("start_time", expr("CAST(start AS LONG)")) \
    .withColumn("created_time", expr("CAST(ctime AS LONG)")) \
    .withColumn("etime", expr("CAST(etime AS LONG)")) \
    .withColumn("end_time", expr("CAST(end AS LONG)")) \
    .withColumn("exit_status", expr("CAST(exit_status AS INTEGER)")) \
    .withColumnRenamed("group", "group_name") \
    .withColumnRenamed("jobname", "job_name") \
    .withColumnRenamed("resource_list_gpu_type", "gpu_type") \
    .withColumn("num_cores", expr("CAST(node_ct as LONG) * CAST(num_cpus as INTEGER)")) \
    .withColumn("walltime_hrs", expr("cast(round((walltime_secs / 60.00 / 60.00), 3) as float)")) \
    .withColumn("cpu_time_hrs", expr("cast(round((cpu_time / 60.00 / 60.00), 3) as float)")) \
    .drop('resources_used_vmem', 'kvs', 'session', 'exec_host', 'resource_list_neednodes', 'resource_list_walltime', 'detail',
          'resources_used_walltime', 'resources_used_cput', 'resources_used_mem', 'resource_list_nodect', 'resource_list_cpu',
          'resource_list_gpu', 'qtime', 'start', 'ctime', 'etime', 'end', 'o_dt', 'date', 'resource_list_mem', 'resource_list_nodes')
# Eventually drop 'detail' and the requested-resource columns so that only the resources actually used are kept

torq = DynamicFrame.fromDF(with_map, glueContext, "joined")

datasink5 = glueContext.write_dynamic_frame.from_options(
    frame=torq,
    connection_type="s3",
    connection_options={
        "path": args['S3_OUTPUT_PATH'],
        "partitionKeys": ["year", "month", "day"]
    },
    format="parquet",
    transformation_ctx="datasink5")

job.commit()
Ejemplo n.º 32
0
#convert to spark dataframe
df = dynamic_frame.toDF()
df.show()

# convert date columns to day & month
df = df.withColumn("date_added", to_date(split(df["date"], " ").getItem(0).cast("string"), 'MM/dd/yyyy')) \
    .withColumn("month", split(col("date_added"), "-").getItem(1)) \
    .withColumn("day", split(col("date_added"), "-").getItem(2)) \
    .orderBy('date_added')
print("Dataframe sorted")

partitioned_dataframe = df.repartition("day")

# Convert back to dynamic frame
dynamic_frame2 = DynamicFrame.fromDF(partitioned_dataframe,
                                     glue_context,
                                     "dynamic_frame_write",
                                     transformation_ctx="applymapping1")
#resolve discrepancy in column data types
resolvechoice = ResolveChoice.apply(frame=dynamic_frame2,
                                    choice="make_struct",
                                    transformation_ctx="resolvechoice2")


#transformation function
def ReplaceValue(rec):
    for field in rec:
        if rec[field] == '999' or rec[field] == 999.0 or rec[
                field] == 'nan' or rec[field] == 0 or rec[field] == '0':
            rec[field] = None
    rec["category_a"] = False
    rec["category_b"] = False
Ejemplo n.º 33
0
    collect_list("tag").alias("tags"))
tags_dataset_agg.printSchema()

tedx_dataset_agg = tedx_dataset.join(
    tags_dataset_agg, tedx_dataset.idx == tags_dataset_agg.idx_ref,
    "left").drop("idx_ref").select(col("idx").alias("_id"),
                                   col("*")).drop("idx")
tedx_dataset_agg.printSchema()

##### CONNECT TO MONGODB ATLAS
# change uri
mongo_uri = "xxxx"
# change username and password
write_mongo_options = {
    "uri": mongo_uri,
    "database": "unibg_tedx",
    "collection": "tedx_data",
    "username": "******",
    "password": "******",
    "ssl": "true",
    "ssl.domain_match": "false"
}

from awsglue.dynamicframe import DynamicFrame
tedx_dataset_dynamic_frame = DynamicFrame.fromDF(tedx_dataset_agg, glueContext,
                                                 "nested")
glueContext.write_dynamic_frame.from_options(
    tedx_dataset_dynamic_frame,
    connection_type="mongodb",
    connection_options=write_mongo_options)
Ejemplo n.º 34
0
# s3 output directories
medicare_cast = "s3://glue-sample-target/output-dir/medicare_json_cast"
medicare_project = "s3://glue-sample-target/output-dir/medicare_json_project"
medicare_cols = "s3://glue-sample-target/output-dir/medicare_json_make_cols"
medicare_struct = "s3://glue-sample-target/output-dir/medicare_json_make_struct"
medicare_sql = "s3://glue-sample-target/output-dir/medicare_json_sql"

# Read data into a dynamic frame
medicare_dyf = glueContext.create_dynamic_frame.from_catalog(database = db_name, table_name = tbl_name)

# The `provider id` field will be a choice between long and string

# Cast choices into long; values that cannot be cast result in null
medicare_res_cast = medicare_dyf.resolveChoice(specs = [('provider id','cast:long')])
medicare_res_project = medicare_dyf.resolveChoice(specs = [('provider id','project:long')])
medicare_res_make_cols = medicare_dyf.resolveChoice(specs = [('provider id','make_cols')])
medicare_res_make_struct = medicare_dyf.resolveChoice(specs = [('provider id','make_struct')])

# Spark SQL on a Spark dataframe
medicare_df = medicare_dyf.toDF()
medicare_df.createOrReplaceTempView("medicareTable")
medicare_sql_df = spark.sql("SELECT * FROM medicareTable WHERE `total discharges` > 30")
medicare_sql_dyf = DynamicFrame.fromDF(medicare_sql_df, glueContext, "medicare_sql_dyf")

# Write it out in Json
glueContext.write_dynamic_frame.from_options(frame = medicare_res_cast, connection_type = "s3", connection_options = {"path": medicare_cast}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_res_project, connection_type = "s3", connection_options = {"path": medicare_project}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_res_make_cols, connection_type = "s3", connection_options = {"path": medicare_cols}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_res_make_struct, connection_type = "s3", connection_options = {"path": medicare_struct}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_sql_dyf, connection_type = "s3", connection_options = {"path": medicare_sql}, format = "json")
Ejemplo n.º 35
0
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    mdl_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(database="tig_market",
                                                                                         table_name="tpe_enduser_used_product_history")
    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.select_fields(
        ['_key', 'id', 'used_product_id', 'contact_id', 'status_new', 'status_old', 'timecreated'])

    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(specs=[('_key', 'cast:long')])
    # handle the case where start_read is null
    try:
        # read the flag checkpoint from S3
        df_flag = spark.read.parquet("s3a://dts-odin/flag/fact_flag_suspended.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)

        # compare the datasource _key with the flag and keep only records whose _key is greater than the flag
        mdl_tpe_enduser_used_product_history = Filter.apply(frame=mdl_tpe_enduser_used_product_history, f=lambda x: x['_key'] > start_read)
    except:
        print('read flag file error ')

    print('the number of new contacts: ', mdl_tpe_enduser_used_product_history.count())


    # df_flag = spark.read.parquet("s3a://dts-odin/flag/flag_LS_A3.parquet")
    #
    # max_key = df_flag.collect()[0]['flag']
    #
    # mdl_tpe_enduser_used_product_history = Filter.apply(frame=mdl_tpe_enduser_used_product_history,
    #                                                     f=lambda x: x["_key"] > max_key)

    if (mdl_tpe_enduser_used_product_history.count() > 0):
        mdl_tpe_enduser_used_product_history = Filter.apply(frame=mdl_tpe_enduser_used_product_history,
                                                            f=lambda x: x["timecreated"] is not None and x[
                                                                "contact_id"] is not None and x[
                                                                            "used_product_id"] is not None
                                                                        and (x["status_old"] == 'ACTIVED' and x["status_new"] in ['SUSPENDED','SUPPENDED']))

        # print(mdl_tpe_enduser_used_product_history.count())

        mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(
            specs=[('timecreated', 'cast:long')])
        df_mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.toDF()

        df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history \
            .withColumn('change_status_date_id',
                        from_unixtime(df_mdl_tpe_enduser_used_product_history['timecreated'], "yyyyMMdd")) \
            .withColumn('to_status_id', f.lit(55)) \
            .withColumn('timestamp1',
                        df_mdl_tpe_enduser_used_product_history['timecreated'] * f.lit(1000))

        # df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.select('used_product_id',
        #                                                                                    'contact_id',
        #                                                                                    'ngay_kich_hoat',
        #                                                                                    'id').withColumnRenamed(
        #     'used_product_id', 'id_product_buy')
        data_mdl_tpe_enduser_used_product_history = DynamicFrame.fromDF(df_mdl_tpe_enduser_used_product_history,
                                                                        glueContext,
                                                                        "data_mdl_tpe_enduser_used_product_history")


        data_mdl_tpe_enduser_used_product_history.printSchema()
        data_mdl_tpe_enduser_used_product_history.show(3)

        applymapping1 = ApplyMapping.apply(frame=data_mdl_tpe_enduser_used_product_history,
                                           mappings=[("contact_id", "string", "contact_id", "string"),
                                                     ("change_status_date_id", "string", "change_status_date_id", "long"),
                                                     ("timestamp1", "long", "timestamp1", "timestamp"),
                                                     ('to_status_id','int','to_status_id','long')])

        resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                             transformation_ctx="resolvechoice2")

        dropnullfields3 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields3")

        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields3,
                                                                   catalog_connection="glue_redshift",
                                                                   connection_options={
                                                                       "dbtable": "mapping_changed_status_student",
                                                                       "database": "dts_odin",
                                                                        "postactions":"""UPDATE mapping_changed_status_student
		                                                                                 SET user_id = ( SELECT user_id FROM user_map WHERE source_type = 1 AND source_id = mapping_changed_status_student.contact_id LIMIT 1 )
	                                                                                     WHERE user_id IS NULL and to_status_id=55"""
                                                                   },
                                                                   redshift_tmp_dir="s3n://datashine-dwh/temp1/",
                                                                   transformation_ctx="datasink4")

        # write the data to S3
        datasink5 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields3, connection_type="s3",
                                                                 connection_options={
                                                                     "path": "s3://datashine-dev-redshift-backup/A_55_tam_dung_goi"},
                                                                 format="parquet", transformation_ctx="datasink5")
        # write the flag
        # get the max _key from the data source
        datasourceTmp = mdl_tpe_enduser_used_product_history.toDF()
        flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')

        # overwrite the _key flag in S3
        df.write.parquet("s3a://dts-odin/flag/fact_flag_suspended.parquet", mode="overwrite")
Ejemplo n.º 36
0
def write_df_to_s3(glue_context, data_frame, backup_location):
    dynamic_frame = DynamicFrame.fromDF(data_frame, glue_context, "toS3")
    sink = glue_context.getSink("s3", path=backup_location)
    sink.setFormat("json")
    sink.write(dynamic_frame)
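
# Example usage of the helper above (the backup location is hypothetical):
# write_df_to_s3(glue_context, some_data_frame, "s3://example-backup-bucket/backups/")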
trimmedLEOriginRequestLogs = DropFields.apply(frame = labdaEdgeOriginRequestLogs, paths=["executionregion", "distributionid", "distributionname", "requestdata", "customtraceid", "eventtype", "year", "month", "date", "hour"], transformation_ctx ="trimmedLEOriginRequestLogs")

## Rename the requestid field for Lambda@Edge origin request logs to origin requestid
modifiedLEOriginRequestLogs = RenameField.apply(frame = trimmedLEOriginRequestLogs, old_name = "requestid", new_name = "origin_requestid", transformation_ctx ="modifiedLEOriginRequestLogs" )

## Convert to DataFrame
modifiedLEOriginRequestLogsDF = modifiedLEOriginRequestLogs.toDF()

## Convert to DataFrame
modifiedLEViewerRequestLogsDF = modifiedLEViewerRequestLogs.toDF()

## Join(left outer join) the Lambda@Edge viewer-request logs with the origin-request logs based on the requestid
combinedLambdaEdgeLogsDF = modifiedLEViewerRequestLogsDF.join(modifiedLEOriginRequestLogsDF, modifiedLEViewerRequestLogsDF["requestid"] == modifiedLEOriginRequestLogsDF["origin_requestid"], "left_outer")

## Convert to DynamicFrame
combinedLambdaEdgeLogs = DynamicFrame.fromDF(combinedLambdaEdgeLogsDF, glueContext, "combinedLambdaEdgeLogs")

## Join the Lambda@Edge viewer-request logs with the origin-request logs based on the requestid
#combinedLambdaEdgeLogs = Join.apply(modifiedLEViewerRequestLogs, modifiedLEOriginRequestLogs, 'requestid', 'origin_requestid')

## Drop the origin_requestid field
lambdaEdgeLogs = DropFields.apply(frame = combinedLambdaEdgeLogs, paths=["origin_requestid"], transformation_ctx ="lambdaEdgeLogs")

## Drop the "year", "month", "date", "hour" fields
trimmedLambdaEdgeLogs = DropFields.apply(frame =lambdaEdgeLogs, paths=["year", "month", "date", "hour", "useragentstring"], transformation_ctx ="trimmedLambdaEdgeLogs")

## Convert to DataFrame
trimmedLambdaEdgeLogsDF = trimmedLambdaEdgeLogs.toDF()

#Destination S3 location for combined Lambda@Edge logs
leLogDestPath = "s3://" + args['target_s3_bucket'] + "/combined/lelogs"