Python DynamicFrame Exemples, awsglue.dynamicframe.DynamicFrame Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : import_into_datacatalog.py Projet : mazelx/aws-glue-samples

def transform_df_to_catalog_import_schema(sql_context, glue_context, df_databases, df_tables, df_partitions):
    df_databases_array = df_databases.select(df_databases['type'], array(df_databases['item']).alias('items'))
    df_tables_array = df_tables.select(df_tables['type'], df_tables['database'],
                                       array(df_tables['item']).alias('items'))
    df_partitions_array_batched = batch_metastore_partitions(sql_context=sql_context, df_parts=df_partitions)
    dyf_databases = DynamicFrame.fromDF(
        dataframe=df_databases_array, glue_ctx=glue_context, name='dyf_databases')
    dyf_tables = DynamicFrame.fromDF(
        dataframe=df_tables_array, glue_ctx=glue_context, name='dyf_tables')
    dyf_partitions = DynamicFrame.fromDF(
        dataframe=df_partitions_array_batched, glue_ctx=glue_context, name='dyf_partitions')
    return dyf_databases, dyf_tables, dyf_partitions

Exemple #2

0

Afficher le fichier

Fichier : scripts_utils.py Projet : mazelx/aws-glue-samples

def write_df_to_catalog(data_frame, entity_type, glue_context, options):
    # Check if data frame is empty. There is no "empty" method for data frame, this is the closest we get.
    if data_frame.rdd.isEmpty():
        return # nothing to do
    database_name = options['catalog.database']
    nested_data_frame = nest_data_frame(data_frame, database_name, entity_type)
    dynamic_frame = DynamicFrame.fromDF(nested_data_frame, glue_context, entity_type)
    sink = glue_context.getSink('catalog', **options)
    sink.write(dynamic_frame)

Exemple #3

0

Afficher le fichier

Fichier : DO_L3150_trang_thai_tai_khoan_ACTIVE.py Projet : 01662024622/dts-odin-etl

def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999L
    package_starttime_unavailable = 0L
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'

    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'

    ACTIVED = 'ACTIVED'

    dyf_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market",
        table_name="tpe_enduser_used_product_history"
    )
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.select_fields(
        ['_key', 'contact_id', 'used_product_id', 'status_old', 'status_new', 'status_description', 'timecreated'])
        # .rename_field('contact_id', 'contactid')

    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.resolveChoice(specs=[('_key', 'cast:long')])
    # try:
    #     df_flag = spark.read.parquet("s3://dtsodin/flag/flag_trang_thai_tai_khoan_active.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print("max_key:  ", max_key)
    #     # Chi lay nhung ban ghi lon hon max_key da luu, ko load full
    #     dyf_tpe_enduser_used_product_history = Filter.apply(frame=dyf_tpe_enduser_used_product_history, f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')
    print dyf_tpe_enduser_used_product_history.count()
    if dyf_tpe_enduser_used_product_history.count() > 0:
        try:
            dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
                database="tig_market",
                table_name="tpe_invoice_product_details"
            )
            dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
                ['id', 'cat_code'])

            dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
                database="tig_advisor",
                table_name="student_contact"
            )
            dyf_student_contact = dyf_student_contact.select_fields(
                ['contact_id', 'student_id']).rename_field('contact_id', 'contactid')

            ##################### Join and Filter data
            df_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.toDF()
            df_tpe_used_product_history_step1 = df_tpe_enduser_used_product_history.groupby('contact_id',
                                                                                            'used_product_id').agg(
                f.max("timecreated").alias("max_timecreated")) \
                .withColumnRenamed("contact_id", "contact_id_temp")
            print df_tpe_used_product_history_step1.count()
            df_tpe_used_product_history_step1.show()

            df_tpe_used_product_history_step2 = df_tpe_used_product_history_step1.groupby('contact_id_temp').agg(
                f.max("max_timecreated").alias("max_timecreated"),
                f.count("used_product_id").alias("count_used_product_id"))
            print df_tpe_used_product_history_step2.count()
            df_tpe_used_product_history_step2.show()
            print "EEEEEEEEEEEEEEEEEEEEEEEEE"

            dyf_tpe_used_product_history = DynamicFrame.fromDF(df_tpe_used_product_history_step2, glueContext,
                                                               "dyf_tpe_used_product_history")

            dyf_part_one = Filter.apply(frame=dyf_tpe_used_product_history,
                                        f=lambda x: x["count_used_product_id"] <= 1)

            # dyf_part_two = Filter.apply(frame=df_tpe_enduser_used_product_history,
            #                             f=lambda x: x["used_product_id"] > 1)
            df_part_one = dyf_part_one.toDF()
            df_part_one = df_part_one.join(df_tpe_enduser_used_product_history,
                                           (df_part_one.contact_id_temp == df_tpe_enduser_used_product_history.contact_id)
                                           & (df_part_one.max_timecreated == df_tpe_enduser_used_product_history.timecreated))

            dyf_part_one = DynamicFrame.fromDF(df_part_one, glueContext, "dyf_part_one")
            dyf_part_one = dyf_part_one.select_fields(['contact_id', 'used_product_id', 'status_old',
                                                       'status_new', 'status_description', 'timecreated'])


            dyf_join_part_one_product_details = Join.apply(dyf_part_one,
                                                           dyf_tpe_invoice_product_details, 'used_product_id', 'id')

            dyf_join_part_one_product_details.printSchema()
            print "total 01: ", dyf_join_part_one_product_details.count()
            dyf_join_part_one_product_details.toDF().show(2)

            dyf_join_part_one_contact = Join.apply(dyf_join_part_one_product_details,
                                                   dyf_student_contact, 'contact_id', 'contactid')
            dyf_join_part_one_contact = dyf_join_part_one_contact \
                .select_fields(['contact_id', 'student_id', 'status_new', 'status_description', 'timecreated'])


            dyf_join_part_one_contact.printSchema()
            print "total 02: ", dyf_join_part_one_contact.count()
            dyf_join_part_one_contact.toDF().show(2)
            # df_join_part_one = dyf_join_part_one_contact.toDF()

            ######################################
            ######## START active
            dyf_join_active_status = Filter.apply(frame=dyf_join_part_one_contact,
                                                     f=lambda x: x["status_new"] == ACTIVED)
            print "dyf_join_active_status ", dyf_join_active_status.count()
            dyf_join_active_status.toDF().show(2)
            df_join_active_status = dyf_join_active_status.toDF()

            df_join_active_status = df_join_active_status \
                .withColumn("change_status_date_id",
                            from_unixtime(df_join_active_status.timecreated, 'yyyyMMdd').cast("long")) \
                .withColumn("from_status_id", f.lit(None).cast("long")) \
                .withColumn("to_status_id", f.lit(206).cast("long")) \
                .withColumn("measure1", f.lit(None).cast("long")) \
                .withColumn("measure2", f.lit(None).cast("long")) \
                .withColumn("description", df_join_active_status.status_description) \
                .withColumn("timestamp1", f.lit(None).cast("long"))
            df_join_active_status.show(3)
            dyf_join_active_status = DynamicFrame.fromDF(df_join_active_status, glueContext,
                                                            "dyf_join_active_status")

            dyf_join_active_status = dyf_join_active_status \
                .select_fields(['contact_id', 'student_id', 'change_status_date_id', 'from_status_id',
                                'to_status_id', 'measure1', 'measure2', 'description', 'timestamp1'])
            dyf_join_active_status.printSchema()
            df_join_active_status = dyf_join_active_status.toDF()
            ####### END

            df_join_active_status = df_join_active_status.withColumn("user_id", f.lit(None).cast("long"))

            dyf_join_status = DynamicFrame.fromDF(df_join_active_status, glueContext, "dyf_join_status")

            applymapping1 = ApplyMapping.apply(frame=dyf_join_status,
                                               mappings=[
                                                   ("student_id", "string", "student_id", "long"),
                                                   ("user_id", "long", "user_id", "long"),
                                                   ("change_status_date_id", "long", "change_status_date_id", "long"),
                                                   ("from_status_id", "long", "from_status_id", "long"),
                                                   ("to_status_id", "long", "to_status_id", "long"),
                                                   ("measure1", "long", "measure1", "double"),
                                                   ("measure2", "long", "measure2", "double"),
                                                   ("description", "string", "description", "string"),
                                                   ("timestamp1", "long", "timestamp1", "long"),
                                                   ("contact_id", "string", "contact_id", "string")
                                               ])

            resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                                 transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields1")
            print resolvechoice1.count()
            resolvechoice1.printSchema()
            resolvechoice1.show(5)
            print('START WRITE TO REDSHIFT -------------------------')
            datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1,
                                                                       catalog_connection="glue_redshift",
                                                                       connection_options={
                                                                           "dbtable": "mapping_changed_status_student",
                                                                           "database": "dts_odin"
                                                                       },
                                                                       redshift_tmp_dir="s3a://dtsodin/temp/mapping_changed_status_student/",
                                                                       transformation_ctx="datasink1")

            print('START WRITE TO S3-------------------------')
            # datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3",
            #                                                          connection_options={
            #                                                              "path": "s3://dtsodin/student_behavior/student_behavior/",
            #                                                              "partitionKeys": ["behavior_id"]},
            #                                                          format="parquet",
            #                                                          transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')

            df_temp = dyf_tpe_enduser_used_product_history.toDF()
            flag = df_temp.agg({"_key": "max"}).collect()[0][0]

            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # ghi de _key vao s3
            df.write.parquet("s3a://dtsodin/flag/flag_trang_thai_tai_khoan_active.parquet", mode="overwrite")
        except Exception as e:
            print "Something was wrong ",e

Exemple #4

0

Afficher le fichier

Fichier : glue_job_dd_v0.3.py Projet : angeloschionis/DataDictionary

    format=file_format,  ## "csv",
    format_options={
        "withHeader": True
    },
    transformation_ctx="data_df").toDF()
data_df.show(10)

## read data from input table to a data frame
#data_df=glueContext.create_dynamic_frame.from_catalog(database=database,table_name=table_name).toDF()

## running sql query on the dataframe created with input dataset
data_df.createOrReplaceTempView('data_df')
data_df = spark.sql('{} from data_df'.format(querySql))

query_columns = ['werk', 'spj', 'knr', 'result', 'probability', 'time']
data_df.toDF(*query_columns)

## convert the dataframe made by transformed dataset to dynamic frame again
data_df = DynamicFrame.fromDF(data_df, glueContext, "data_df")

## Define target s3 output location

rtp_dd_output = "s3://" + s3_output_data_folder + "/" + "plant=" + plant + "/" + "appid=" + applicationId + "/"

# Store the output/final dynamicFrame to the target s3 location
outputGDF = glueContext.write_dynamic_frame.from_options(
    frame=data_df,
    connection_type="s3",
    connection_options={"path": rtp_dd_output},
    format="csv")

Exemple #5

0

Afficher le fichier

Fichier : dim_user_job.py Projet : ccarcamog/as-tech-test

                                     choice="MATCH_CATALOG",
                                     database="as-redshift-dw",
                                     table_name="as_tech_test_public_dim_user",
                                     transformation_ctx="resolvechoice3")
## @type: ResolveChoice
## @args: [choice = "make_cols", transformation_ctx = "resolvechoice4"]
## @return: resolvechoice4
## @inputs: [frame = resolvechoice3]
resolvechoice4 = ResolveChoice.apply(frame=resolvechoice3,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice4")

##get Insert Date
timestampedDf = resolvechoice4.toDF().withColumn("dim_user_insert_dt",
                                                 current_timestamp())

#Back to DynamicFrame
cleaned_datasource = DynamicFrame.fromDF(timestampedDf, glueContext,
                                         "cleaned_datasource")

## @type: DataSink
## @args: [database = "as-redshift-dw", table_name = "as_tech_test_public_dim_user", redshift_tmp_dir = TempDir, transformation_ctx = "datasink5"]
## @return: datasink5
## @inputs: [frame = resolvechoice4]
datasink5 = glueContext.write_dynamic_frame.from_catalog(
    frame=cleaned_datasource,
    database="as-redshift-dw",
    table_name="as_tech_test_public_dim_user",
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="datasink5")
job.commit()

Exemple #6

0

Afficher le fichier

Fichier : P_ManageForecastdata.py Projet : akashdan/antlr

            		source_cd, forecast_dt, hour_num, usage_factor, esiid_cnt, unadj_load, distrib_loss_load, 
            		transmission_loss_load, ufe_loss_load, ancillary_loss_load, deration_loss_load, cap_ob, 
            		tran_ob, crdt, batch_dt, batch_hr
            		from ams__iw_growth_stnorm_hourly__df
            		order by forecast_dt ,hour_num""")
            select__df.createOrReplaceTempView('select__df')
            rowcount_df = select__df
            
    
    except Exception as e:
        raise
    
        
        errormessage = str(spark.sql("""select error_message()
              ,{} = error_severity()
              ,{} = error_state()""".format(errorseverity, errorstate)).collect()[0][0])


#Write modified data frames to target
if __name__ == '__main__':
    p_manageforecastdata(*sys.argv[1:])
    try:
        for tab_df in mod_df.keys():
            if mod_df[tab_df] == org_df[tab_df]:
                continue
            dym__trans__df = DynamicFrame.fromDF(mod_df[tab_df],glueContext,'dym__trans__df')
            glueContext.write_dynamic_frame.from_options(frame = dym__trans__df, connection_type = 's3', connection_options = {'path': 's3://target/s3tables'}, format = 'csv')
    except:
        raise

Exemple #7

0

Afficher le fichier

def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    class_topica_id = 1

    now = datetime.now()  # current date and time
    year = now.strftime("%Y%m%d")
    year = '20190901'
    print("year:", int(year))
    cur_date = int(year)
    pre_date = cur_date - 1
    print("year:", pre_date)
    ########## dyf_mapping_lo_student
    dyf_mapping_lo_student = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="mapping_lo_student"
    )

    # try:
    #     # # doc moc flag tu s3
    #     df_flag = spark.read.parquet("s3://dts-odin/flag/flag_mapping_lo_student.parquet")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read from index: ', start_read)
    #     # so sanh _key datasource voi flag, lay nhung gia tri co key > flag
    #     # dyf_student_contact = Filter.apply(frame=dyf_student_contact, f=lambda x: x['time_lms_created'] > start_read)
    # except:
    #     print('read flag file error ')
    # dyf_mapping_lo_student = Filter.apply(frame=dyf_mapping_lo_student, f=lambda x: x['knowledge_pass_date_id'] >= f.lit(int(year)))

    print('df_student_contact count 1:', dyf_mapping_lo_student.count())
    if dyf_mapping_lo_student.count() > 0:
        try:
            print("START......................")
            ########## dyf_mapping_lo_student
            dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                database="nvn_knowledge",
                table_name="learning_object"
            )

            ########## dyf_learning_object_class
            dyf_learning_object_class = glueContext.create_dynamic_frame.from_catalog(
                database="nvn_knowledge",
                table_name="learning_object_class"
            )
            dyf_learning_object_class = dyf_learning_object_class.select_fields(['class_id', 'class_parent_id'])
            dyf_learning_object_class = Filter.apply(frame=dyf_learning_object_class, f=lambda x: x["class_parent_id"] == class_topica_id)

            ########## dyf_mapping_lo_class
            dyf_mapping_lo_class = glueContext.create_dynamic_frame.from_catalog(
                database="nvn_knowledge",
                table_name="mapping_lo_class"
            )
            dyf_mapping_lo_class = dyf_mapping_lo_class.select_fields(['class_id', 'learning_object_id'])\
                .rename_field('class_id', 'map_class_id').rename_field('learning_object_id', 'map_lo_id')
            ## JOIN chi lay nhung trinh do cua TOPICA
            dyf_mapping_lo_class = Join.apply(dyf_mapping_lo_class, dyf_learning_object_class, 'map_class_id', 'class_id')


            dyf_learning_object = dyf_learning_object.select_fields(
                ['learning_object_id', 'learning_object_type']).rename_field('learning_object_id', 'lo_id')
            dyf_mapping_lo_student = Join.apply(dyf_mapping_lo_student, dyf_learning_object, 'learning_object_id',
                                                'lo_id')
            dyf_mapping_lo_student = Join.apply(dyf_mapping_lo_student, dyf_mapping_lo_class, 'learning_object_id',
                                                'map_lo_id')

            # dyf_mapping_lo_student.printSchema()
            # dyf_mapping_lo_student.show()

            df_mapping_lo_student = dyf_mapping_lo_student.toDF()

            df_mapping_lo_student = df_mapping_lo_student.groupby('student_id', 'learning_object_type', 'class_id').agg(
                f.count('knowledge_pass_date_id').alias("knowledge_number"),
                f.count('comprehension_pass_date_id').alias("comprehension_number"),
                f.count('application_pass_date_id').alias("application_number"),
                f.count('analysis_pass_date_id').alias("analysis_number"),
                f.count('synthesis_pass_date_id').alias("synthesis_number"),
                f.count('evaluation_pass_date_id').alias("evaluation_number"))
            df_mapping_lo_student = df_mapping_lo_student.withColumn("created_date_id", f.lit(str(year)))
            # print('Count:' , df_mapping_lo_student.count())
            # df_mapping_lo_student.printSchema()
            # df_mapping_lo_student.show(5)

            dyf_mapping_lo_student = DynamicFrame.fromDF(df_mapping_lo_student, glueContext,
                                                         "dyf_mapping_lo_student")
            applymapping = ApplyMapping.apply(frame=dyf_mapping_lo_student,
                                              mappings=[("student_id", "long", "student_id", "long"),
                                                        ("user_id", "long", "user_id", "long"),
                                                        ("class_id", "long", "class_id", "long"),
                                                        ("knowledge_number", "long", "knowledge_number", "long"),
                                                        (
                                                        "comprehension_number", 'long', 'comprehension_number', 'long'),
                                                        ("application_number", 'long', 'application_number', 'long'),
                                                        ("analysis_number", 'long', 'analysis_number', 'long'),
                                                        ("synthesis_number", 'long', 'synthesis_number', 'long'),
                                                        ("evaluation_number", 'long', 'evaluation_number', 'long'),
                                                        ("created_date_id", 'string', 'created_date_id', 'long'),
                                                        ("learning_object_type", 'string', 'learning_object_type',
                                                         'string')])
            resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                                transformation_ctx="resolvechoice2")
            dyf_student_lo_init = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dyf_student_lo_init")
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_student_lo_init,
                                                                       catalog_connection="glue_redshift",
                                                                       connection_options={
                                                                           "dbtable": "mapping_lo_student_number",
                                                                           "database": "dts_odin"
                                                                       },
                                                                       redshift_tmp_dir="s3n://dts-odin/temp1/dyf_student_lo_number",
                                                                       transformation_ctx="datasink5")

            print("END......................")
        except Exception as e:
            print("###################### Exception ##########################")
            print(e)

Exemple #8

0

Afficher le fichier

vote_dataset_agg4 = vote_dataset.groupBy(col("idx_tedx")).agg(
    collect_list(
        struct(col("date"), col("time"), col("mail_user"),
               col("vote"))).alias("vote_user"))
vote_dataset_agg4.printSchema()

tedx_dataset_agg4 = tedx_dataset_agg3.join(vote_dataset_agg4, tedx_dataset_agg3._id == vote_dataset_agg4.idx_tedx, "left") \
    .drop("idx_tedx")

tedx_dataset_agg4.printSchema()

mongo_uri = "mongodb://mycluster-shard-00-00-wo6at.mongodb.net:27017,mycluster-shard-00-01-wo6at.mongodb.net:27017,mycluster-shard-00-02-wo6at.mongodb.net:27017"

write_mongo_options = {
    "uri": mongo_uri,
    "database": "unibg_tedx",
    "collection": "tedz_data",
    "username": "******",
    "password": "******",
    "ssl": "true",
    "ssl.domain_match": "false"
}
from awsglue.dynamicframe import DynamicFrame
tedx_dataset_dynamic_frame = DynamicFrame.fromDF(tedx_dataset_agg4,
                                                 glueContext, "nested")

glueContext.write_dynamic_frame.from_options(
    tedx_dataset_dynamic_frame,
    connection_type="mongodb",
    connection_options=write_mongo_options)

Exemple #9

0

Afficher le fichier

Fichier : h2472_backup.py Projet : 01662024622/dts-odin-etl

def back_kup_h2472_question_type():
    dyf_jh2472_question_type = glueContext \
        .create_dynamic_frame.from_catalog(database="do_h2472",
                                           table_name="question_type")

    if is_dev:
        print('dyf_jh2472_question_type')
        dyf_jh2472_question_type.printSchema()
        dyf_jh2472_question_type.show(3)

    # root
    # | -- id: string
    # | -- created_date: string
    # | -- description: string
    # | -- group_type: string

    # | -- modified_date: string
    # | -- name: string
    # | -- active: boolean
    # | -- parent_id: string
    # | -- _key: long
    # | -- _table: string
    # | -- _schema: string

    dyf_jh2472_question_type = dyf_jh2472_question_type.resolveChoice(
        specs=[('id', 'cast:long')])
    #
    dyf_jh2472_question_type = Filter.apply(frame=dyf_jh2472_question_type,
                                            f=lambda x: x["id"] > 54)

    df_jh2472_question_type = dyf_jh2472_question_type.toDF()
    df_jh2472_question_type = df_jh2472_question_type.dropDuplicates(['id'])
    df_jh2472_question_type = df_jh2472_question_type.withColumn(
        'name', f.concat('name', f.lit('_'), 'id'))
    dyf_jh2472_question_type = DynamicFrame.fromDF(df_jh2472_question_type,
                                                   glueContext,
                                                   'dyf_jh2472_question_type')

    # #
    applymapping1 = ApplyMapping.apply(
        frame=dyf_jh2472_question_type,
        mappings=[("id", 'long', 'id', 'long'),
                  ("created_date", "string", "created_date", "timestamp"),
                  ("description", "string", "description", "string"),
                  ("group_type", "string", "group_type", "string"),
                  ("modified_date", 'string', 'modified_date', 'timestamp'),
                  ("name", "string", "name", "string"),
                  ("active", "boolean", "active", "boolean")])
    # #
    resolvechoice1 = ResolveChoice.apply(frame=applymapping1,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice1")

    if is_dev:
        print('resolvechoice1')
        resolvechoice1.printSchema()
        resolvechoice1.show(3)
    # #
    # #
    datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=resolvechoice1,
        catalog_connection="h2474_backup",
        connection_options={
            "dbtable": "question_type",
            "database": "topicaH2472"
        },
        redshift_tmp_dir="s3a://dts-odin/topicaH2472/question_type",
        transformation_ctx="datasink5")

Exemple #10

0

Afficher le fichier

Fichier : Lambda-[usecase-invoice-sandbox]-SparkSQL.py Projet : veena-LINE/Training

def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)

Exemple #11

0

Afficher le fichier

Fichier : scripts_utils.py Projet : mazelx/aws-glue-samples

def write_df_to_s3(glue_context, data_frame, backup_location):
    dynamic_frame = DynamicFrame.fromDF(data_frame, glue_context, "toS3")
    sink = glue_context.getSink("s3", path=backup_location)
    sink.setFormat("json")
    sink.write(dynamic_frame)

Exemple #12

0

Afficher le fichier

Fichier : resolve_choice.py Projet : mazelx/aws-glue-samples

# s3 output directories
medicare_cast = "s3://glue-sample-target/output-dir/medicare_json_cast"
medicare_project = "s3://glue-sample-target/output-dir/medicare_json_project"
medicare_cols = "s3://glue-sample-target/output-dir/medicare_json_make_cols"
medicare_struct = "s3://glue-sample-target/output-dir/medicare_json_make_struct"
medicare_sql = "s3://glue-sample-target/output-dir/medicare_json_sql"

# Read data into a dynamic frame
medicare_dyf = glueContext.create_dynamic_frame.from_catalog(database = db_name, table_name = tbl_name)

# The `provider id` field will be choice between long and string

# Cast choices into integers, those values that cannot cast result in null
medicare_res_cast = medicare_dyf.resolveChoice(specs = [('provider id','cast:long')])
medicare_res_project = medicare_dyf.resolveChoice(specs = [('provider id','project:long')])
medicare_res_make_cols = medicare_dyf.resolveChoice(specs = [('provider id','make_cols')])
medicare_res_make_struct = medicare_dyf.resolveChoice(specs = [('provider id','make_struct')])

# Spark SQL on a Spark dataframe
medicare_df = medicare_dyf.toDF()
medicare_df.createOrReplaceTempView("medicareTable")
medicare_sql_df = spark.sql("SELECT * FROM medicareTable WHERE `total discharges` > 30")
medicare_sql_dyf = DynamicFrame.fromDF(medicare_sql_df, glueContext, "medicare_sql_dyf")

# Write it out in Json
glueContext.write_dynamic_frame.from_options(frame = medicare_res_cast, connection_type = "s3", connection_options = {"path": medicare_cast}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_res_project, connection_type = "s3", connection_options = {"path": medicare_project}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_res_make_cols, connection_type = "s3", connection_options = {"path": medicare_cols}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_res_make_struct, connection_type = "s3", connection_options = {"path": medicare_struct}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_sql_dyf, connection_type = "s3", connection_options = {"path": medicare_sql}, format = "json")

Exemple #13

0

Afficher le fichier

Fichier : data_cleaning_and_lambda.py Projet : mazelx/aws-glue-samples

# The `provider id` field will be choice between long and string

# Cast choices into integers, those values that cannot cast result in null
medicare_res = medicare_dyf.resolveChoice(specs = [('provider id','cast:long')])

# Remove erroneous records
medicare_df = medicare_res.toDF()
medicare_df = medicare_df.where("`provider id` is NOT NULL")

# Apply a lambda to remove the '$'
chop_f = udf(lambda x: x[1:], StringType())
medicare_df = medicare_df.withColumn("ACC", chop_f(medicare_df["average covered charges"])).withColumn("ATP", chop_f(medicare_df["average total payments"])).withColumn("AMP", chop_f(medicare_df["average medicare payments"]))

# Turn it back to a dynamic frame
medicare_tmp = DynamicFrame.fromDF(medicare_df, glueContext, "nested")

# Rename, cast, and nest with apply_mapping
medicare_nest = medicare_tmp.apply_mapping([('drg definition', 'string', 'drg', 'string'), 
                             ('provider id', 'long', 'provider.id', 'long'),
                             ('provider name', 'string', 'provider.name', 'string'),
                             ('provider city', 'string', 'provider.city', 'string'),
                             ('provider state', 'string', 'provider.state', 'string'),
                             ('provider zip code', 'long', 'provider.zip', 'long'),
                             ('hospital referral region description', 'string','rr', 'string'),
                             ('ACC', 'string', 'charges.covered', 'double'),
                             ('ATP', 'string', 'charges.total_pay', 'double'),
                             ('AMP', 'string', 'charges.medicare_pay', 'double')])

# Write it out in Parquet
glueContext.write_dynamic_frame.from_options(frame = medicare_nest, connection_type = "s3", connection_options = {"path": output_dir}, format = "parquet")

Exemple #14

0

Afficher le fichier

Fichier : data-insert-Redshift.py Projet : AshleyYt/Calls-AWS-ETL-POC

input_file_path = "s3://xxxxx"

df = spark.read.option("header","true")\
 .option("inferSchema","true")\
 .option("quote","\"")\
 .option("escape","\"").csv(input_file_path)

df = df.withColumn(
    'event_timestamp',
    f.to_timestamp('event_timestamp', format='MM/dd/yyyy HH:mm'))


df= df.withColumn('year',f.year(f.col('event_timestamp')))\
 .withColumn('month',f.month(f.col('event_timestamp')))

dynamic_df = DynamicFrame.fromDF(df, glueContext, "dynamic_df")

mapped_df = ResolveChoice.apply(frame=dynamic_df,
                                choice="make_cols",
                                transformation_ctx="mapped_df")

datasink = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=mapped_df,
    catalog_connection="xxxxxxx",
    connection_options={
        "dbtable": "external_data_schema.xxxxxx",
        "database": "dev"
    },
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="datasink")

Exemple #15

0

Afficher le fichier

Fichier : job_mrr_fnd_prt_persn_d_must-173.py Projet : kavyaktj1/GE_CICD_Migration_Boto3

NATURAL_KEY = FINAL_TUPLE_WITH_DF_AND_MD5[1]

## Taking the natual key that passed in Json File.
NATURAL_KEY_1 = NATURAL_KEY[0]

##Taking the value from SOURCE_NAME column (example : "HR PERSON") from FINAL_MD5_DF
POST_QUERY_SOURCE_NAME = FINAL_MD5_DF.select("source_name").limit(1).rdd.map(
    lambda a: a[0]).collect()[0]
print('#######>>>>>>>POST_QUERY_SOURCE_NAME', POST_QUERY_SOURCE_NAME)
print("finalmd5")

FINAL_MD5_DF1 = FINAL_MD5_DF.drop_duplicates()

# Final Data frame is converted to Dynamic frame
# Final Dynamic Frame will be written to Stage Table
FINAL_DYNAMIC_FRAME = DynamicFrame.fromDF(FINAL_MD5_DF1, GLUECONTEXT,
                                          "Final_dynamic_frame")

#Updates,Inserts and Deletes counts logic here
# 1. Create a DF with counts and op_val, Group by JobId,op_val
# 2. Extract inserts, updates and deletes
# 3. Add it to Cloud Watch Logs.

COUNT_DF = FINAL_MD5_DF.withColumn('JobRunId', F.lit(str(RUN_ID)))\
                       .withColumn('JobName', F.lit(str(RUN_ID)))

## Truncating the stage table
PRE_QUERY = """begin;
truncate table {stage_database_name}.{stage_table};
end;""".format(stage_database_name=STAGE_DATABASE_NAME,
               stage_table=STAGE_TABLE)

Exemple #16

0

Afficher le fichier

    *["*"] + [col("kvs").getItem(k).alias(k) for k in keys])

# change the data types and column names to be easier to query later
with_map = with_map \
    .withColumn("id", monotonically_increasing_id()) \
    .withColumn("resources_used_walltime_secs", get_sec("resources_used_walltime")) \
    .withColumn("resources_used_cput", get_sec("resources_used_cput")) \
    .withColumn("resources_used_mem_gb", convert_to_gb("resources_used_mem")) \
    .withColumn("resource_list_nodect", expr("CAST(resource_list_nodect AS INTEGER)")) \
    .withColumn("resource_list_cpu", expr("CAST(resource_list_cpu AS INTEGER)")) \
    .withColumn("resource_list_gpu", expr("CAST(resource_list_gpu AS INTEGER)")) \
    .withColumn("qtime", expr("CAST(qtime AS LONG)")) \
    .withColumn("start", expr("CAST(start AS LONG)")) \
    .withColumn("ctime", expr("CAST(qtime AS LONG)")) \
    .withColumn("etime", expr("CAST(qtime AS LONG)")) \
    .withColumn("end", expr("CAST(qtime AS LONG)")) \
    .withColumn("exit_status", expr("CAST(exit_status AS INTEGER)")) \
    .withColumnRenamed("group", "group_name") \
    .withColumn("resource_list_cores", expr("CAST(resource_list_nodes as LONG) * CAST(resource_list_cpu as INTEGER)")) \
    .withColumn("resources_used_walltime_hrs", expr("cast(round((resources_used_walltime_secs / 60.00 / 60.00), 3) as float)")) \
    .withColumn("resources_used_cput_hrs", expr("cast(round((resources_used_walltime_secs / 60.00 / 60.00), 3) as float)")) \
    .drop('resources_used_vmem', 'kvs', 'session', 'exec_host', 'resource_list_neednodes', 'resource_list_walltime', 'detail')
# eventually drop detail and the asked resources to only use actually used

torq = DynamicFrame.fromDF(with_map, glueContext, "joined")

datasink5 = glueContext.write_dynamic_frame.from_options(frame=torq, connection_type="s3", connection_options={
                                                         "path": args['S3_OUTPUT_PATH'], "partitionKeys": ["year", "month", "day"]}, format="parquet", transformation_ctx="datasink5")

job.commit()

Exemple #17

0

Afficher le fichier

    "zipcode", 'size_of_adjusted_gross_income', 'num_of_returns',
    'num_of_single_returns', 'num_of_joint_returns',
    'num_of_head_of_household_returns', 'num_with_paid_preparers_signature',
    'num_of_exemptions', 'num_of_dependents',
    'num_of_volunteer_prepared_returns_Total',
    'num_of_volunteer_prepared_returns_Num_of_volunteer_income_tax_assistance_prepared_returns',
    'num_of_volunteer_prepared_returns_Num_of_tax_counseling_for_the_elderly_prepared_returns'
]

#rename the columns
for c, n in zip(income_ny_df.columns, new_cols):
    income_ny_df = income_ny_df.withColumnRenamed(c, n)

print("new columns:   ", income_ny_df.columns)

income_ny_DyF = DynamicFrame.fromDF(income_ny_df, glueContext, "income_ny_DyF")

income_ny_DyF.printSchema()

# Print out information about this data
print("Parks Count:  ", parks_DyF.count())
parks_DyF.printSchema()

# Print out information about this data.
print("Playground Count:  ", playgrounds_DyF.count())
playgrounds_DyF.printSchema()
# Convert to Spark DataFrame for left outer join
playgrounds_df = playgrounds_DyF.toDF()
# Drop duplicate columns in parks dataframe
columns_to_drop = ['Location', 'Name', 'year', 'month', 'day']
playgrounds_df = playgrounds_df.drop(*columns_to_drop)

Exemple #18

0

Afficher le fichier

Fichier : udf.py Projet : imyoungyang/amazon-serverless-datalake-workshop

def hash_cc(s):
    return hashlib.sha256(s).hexdigest()

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "serverless-datalake", table_name = "user-profile", transformation_ctx = "datasource0")


## @convert glue DynamicFrame to DataFrame to manipulate the columns
dataframe0 = DynamicFrame.toDF(datasource0)

hash_cc_f = udf(lambda x: hash_cc(x), StringType())

dataframe0 = dataframe0.withColumn("hash_cc", hash_cc_f(dataframe0["cc"])).withColumn("hash_ssn", hash_cc_f(dataframe0["ssn"]))
dataframe0 = dataframe0.drop('cc').drop('ssn').drop('password')

## @convert dataframe to glue DynamicFrame and write the output in parquet format
datasource1 = DynamicFrame.fromDF(dataframe0, glueContext, "name1")


datasink4 = glueContext.write_dynamic_frame.from_options(frame = datasource1, connection_type = "s3", connection_options = {"path": "s3://serverless-datalake-ingestionbucket-1jiyskijz5i03/prepared/userprofile-secure"}, format = "parquet", transformation_ctx = "datasink4")

job.commit()

Exemple #19

0

Afficher le fichier

Fichier : job_etl_user_c_fullname_address.py Projet : 01662024622/dts-odin-etl

def main():

    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source
    dyf_crm_contacts = glueContext.create_dynamic_frame.from_catalog(
        database='crm_native', table_name='contacts')

    dyf_crm_contacts = dyf_crm_contacts.select_fields(
        ['_key', 'Id', 'Code', 'Fullname', 'Address'])
    dyf_crm_contacts = dyf_crm_contacts.resolveChoice(specs=[('_key',
                                                              'cast:long')])

    dy_source_voxy_cache = dyf_crm_contacts.toDF()
    dy_source_voxy_cache = dy_source_voxy_cache.cache()
    dyf_crm_contacts = DynamicFrame.fromDF(dy_source_voxy_cache, glueContext,
                                           'dyf_crm_contacts')

    # try:
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/flag_user_communication_full_name.parquet")
    #     read_from_index = df_flag.collect()[0]['flag']
    #     print('read from index: ', read_from_index)
    #     dyf_crm_contacts = Filter.apply(frame=dyf_crm_contacts,
    #                                     f=lambda x: x["_key"] > read_from_index)
    # except:
    #     print('read flag file error ')

    print('the number of new contacts: ', dyf_crm_contacts.count())

    if (dyf_crm_contacts.count() > 0):

        # print('Chay vao day nhe------------------')
        # print('dyf_crm_contacts::----------------')
        # dyf_crm_contacts.printSchema()
        # try:
        #--------------------------------------------------------------------------------------------------------------#
        dyf_crm_contacts = Filter.apply(
            frame=dyf_crm_contacts,
            f=lambda x: x["Id"] is not None and x["Id"] != '' and x[
                "Code"] is not None and x["Code"] != '' and x[
                    "Fullname"] is not None and x["Fullname"] != '')
        # --------------------------------------------------------------------------------------------------------------#

        # --------------------------------------------------------------------------------------------------------------#
        # today = date.today()
        # today_timestamp = today.timestamp();
        # print("Today's date:", today_timestamp)

        dy_crm_contacts = dyf_crm_contacts.toDF()
        dy_crm_contacts = dy_crm_contacts.dropDuplicates(['Code'])
        dy_crm_contacts = dy_crm_contacts.withColumn(
            'communication_type_full_name', f.lit(4))
        dy_crm_contacts = dy_crm_contacts.withColumn(
            'communication_type_address', f.lit(6))
        dy_crm_contacts = dy_crm_contacts.withColumn('is_primary', f.lit(1))
        dy_crm_contacts = dy_crm_contacts.withColumn('is_deleted', f.lit(0))
        dy_crm_contacts = dy_crm_contacts.withColumn(
            'last_update_date', f.lit('2019-08-28 00:00:00'))
        dyf_crm_contacts = DynamicFrame.fromDF(dy_crm_contacts, glueContext,
                                               'dyf_crm_contacts')

        dyf_crm_contacts = dyf_crm_contacts.resolveChoice(
            specs=[('last_update_date', 'cast:long')])

        applymapping2 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("communication_type_full_name", 'int',
                       'communication_type', 'int'),
                      ("is_primary", 'int', 'is_primary', 'int'),
                      ("is_deleted", 'int', 'is_deleted', 'int'),
                      ("Fullname", 'string', 'comunication', 'string'),
                      ("last_update_date", 'string', 'last_update_date',
                       'timestamp')])

        #
        #
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping2,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields6 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields2")

        datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields6,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "user_communication",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/user/communication/fullname/",
            transformation_ctx="datasink4")

        dyf_crm_contacts = Filter.apply(
            frame=dyf_crm_contacts,
            f=lambda x: x["Address"] is not None and x["Address"] != '')
        #--------------------------------------------------------------------------------------------------------------#
        applymapping3 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("communication_type_address", 'int',
                       'communication_type', 'int'),
                      ("is_primary", 'int', 'is_primary', 'int'),
                      ("is_deleted", 'int', 'is_deleted', 'int'),
                      ("Address", 'string', 'comunication', 'string'),
                      ("last_update_date", 'string', 'last_update_date',
                       'timestamp')])
        #
        #
        resolvechoice3 = ResolveChoice.apply(
            frame=applymapping3,
            choice="make_cols",
            transformation_ctx="resolvechoice3")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice3, transformation_ctx="dropnullfields3")

        datasink3 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "user_communication",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/user/communication/address/",
            transformation_ctx="datasink3")
        # --------------------------------------------------------------------------------------------------------------#

        #insert into source_id

        # lay max _key tren datasource
        datasource = dyf_crm_contacts.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]

        # ghi de flag moi vao s3
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')

        df.write.parquet(
            "s3a://dts-odin/flag/flag_user_communication_full_name.parquet",
            mode="overwrite")

Exemple #20

0

Afficher le fichier

Fichier : mysqldbingestion.py Projet : mbahra/AWS_Data_Engineer

## @args: [f = lambda row : (bool(re.match("Match Finished", row["status"]))), transformation_ctx = "Transform2"]
## @return: Transform2
## @inputs: [frame = Transform7]
Transform2 = Filter.apply(frame = Transform7, f = lambda row : (bool(re.match("Match Finished", row["status"]))), transformation_ctx = "Transform2")
## @type: ApplyMapping
## @args: [mappings = [("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"), ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"), ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"), ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"), ("totalshotshometeam", "int", "totalshotshometeam", "int"), ("totalshotsawayteam", "int", "totalshotsawayteam", "int"), ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"), ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string"), ("idfixture", "long", "idfixture", "int"), ("date", "string", "date", "string"), ("time", "string", "time", "string"), ("idhometeam", "long", "idhometeam", "int"), ("idawayteam", "long", "idawayteam", "int"), ("goalshometeam", "long", "goalshometeam", "int"), ("goalsawayteam", "long", "goalsawayteam", "int")], transformation_ctx = "Transform6"]
## @return: Transform6
## @inputs: [frame = Transform2]
Transform6 = ApplyMapping.apply(frame = Transform2, mappings = [("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"), ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"), ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"), ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"), ("totalshotshometeam", "int", "totalshotshometeam", "int"), ("totalshotsawayteam", "int", "totalshotsawayteam", "int"), ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"), ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string"), ("idfixture", "long", "idfixture", "int"), ("date", "string", "date", "string"), ("time", "string", "time", "string"), ("idhometeam", "long", "idhometeam", "int"), ("idawayteam", "long", "idawayteam", "int"), ("goalshometeam", "long", "goalshometeam", "int"), ("goalsawayteam", "long", "goalsawayteam", "int")], transformation_ctx = "Transform6")
## @type: Join
## @args: [columnConditions = ["=", "="], joinType = right, keys2 = ["idfixture", "idhometeam"], keys1 = ["(predictions) idfixture", "(predictions) idteam"], transformation_ctx = "Transform4"]
## @return: Transform4
## @inputs: [frame1 = Transform1, frame2 = Transform6]
Transform1DF = Transform1.toDF()
Transform6DF = Transform6.toDF()
Transform4 = DynamicFrame.fromDF(Transform1DF.join(Transform6DF, (Transform1DF['(predictions) idfixture'] == Transform6DF['idfixture']) & (Transform1DF['(predictions) idteam'] == Transform6DF['idhometeam']), "right"), glueContext, "Transform4")
## @type: ApplyMapping
## @args: [mappings = [("(predictions) xgoals", "double", "xgoalshometeam", "double"), ("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"), ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"), ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"), ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"), ("totalshotshometeam", "int", "totalshotshometeam", "int"), ("totalshotsawayteam", "int", "totalshotsawayteam", "int"), ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"), ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string"), ("idfixture", "int", "idfixture", "int"), ("date", "string", "date", "string"), ("time", "string", "time", "string"), ("idhometeam", "int", "idhometeam", "int"), ("idawayteam", "int", "idawayteam", "int"), ("goalshometeam", "int", "goalshometeam", "int"), ("goalsawayteam", "int", "goalsawayteam", "int")], transformation_ctx = "Transform5"]
## @return: Transform5
## @inputs: [frame = Transform4]
Transform5 = ApplyMapping.apply(frame = Transform4, mappings = [("(predictions) xgoals", "double", "xgoalshometeam", "double"), ("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"), ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"), ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"), ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"), ("totalshotshometeam", "int", "totalshotshometeam", "int"), ("totalshotsawayteam", "int", "totalshotsawayteam", "int"), ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"), ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string"), ("idfixture", "int", "idfixture", "int"), ("date", "string", "date", "string"), ("time", "string", "time", "string"), ("idhometeam", "int", "idhometeam", "int"), ("idawayteam", "int", "idawayteam", "int"), ("goalshometeam", "int", "goalshometeam", "int"), ("goalsawayteam", "int", "goalsawayteam", "int")], transformation_ctx = "Transform5")
## @type: Join
## @args: [columnConditions = ["=", "="], joinType = left, keys2 = ["(predictions) idfixture", "(predictions) idteam"], keys1 = ["idfixture", "idawayteam"], transformation_ctx = "Transform8"]
## @return: Transform8
## @inputs: [frame1 = Transform5, frame2 = Transform1]
Transform5DF = Transform5.toDF()
Transform1DF = Transform1.toDF()
Transform8 = DynamicFrame.fromDF(Transform5DF.join(Transform1DF, (Transform5DF['idfixture'] == Transform1DF['(predictions) idfixture']) & (Transform5DF['idawayteam'] == Transform1DF['(predictions) idteam']), "left"), glueContext, "Transform8")
## @type: ApplyMapping
## @args: [mappings = [("date", "string", "date", "string"), ("(predictions) xgoals", "double", "xgoalsawayteam", "decimal"), ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"), ("totalshotsawayteam", "int", "totalshotsawayteam", "int"), ("totalshotshometeam", "int", "totalshotshometeam", "int"), ("xgoalshometeam", "double", "xgoalshometeam", "decimal"), ("idfixture", "int", "idfixture", "int"), ("goalshometeam", "int", "goalshometeam", "int"), ("idawayteam", "int", "idawayteam", "int"), ("goalsawayteam", "int", "goalsawayteam", "int"), ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"), ("idhometeam", "int", "idhometeam", "int"), ("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"), ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"), ("time", "string", "time", "string"), ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"), ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string")], transformation_ctx = "Transform0"]
## @return: Transform0

Exemple #21

0

Afficher le fichier

Fichier : QATest312.py Projet : vijayanand-88/DIQA

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

## @params: [JOB_NAME]
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

ds0 = glueContext.create_dynamic_frame.from_catalog(
    database="autoglues3lineage",
    table_name="train_sm_s2adb_csv",
    transformation_ctx="ds0")

ds3 = ds0.toDF()
ds3.createOrReplaceTempView("train_sm_s2adb_csv_temp2")
ds4 = spark.sql("SELECT * FROM train_sm_s2adb_csv_temp2 WHERE age > 30")
ds5 = DynamicFrame.fromDF(ds4, glueContext, "ds5")

ds6 = glueContext.write_dynamic_frame.from_options(
    frame=ds5,
    connection_type="redshift",
    connection_options={
        "url":
        "jdbc:redshift://redshift-cluster-1.csvp5wcqqxvw.us-east-1.redshift.amazonaws.com:5439/world",
        "dbtable": "atn.gluetable312"
    },
    transformation_ctx="ds6")
ds7 = glueContext.write_dynamic_frame.from_options(
    frame=ds5,
    connection_type="s3",
    connection_options={"path": "s3://asgqatestautomation4/Targetdata312"},
    format="json",

Exemple #22

0

Afficher le fichier

current_timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

######################################
####        CONNECTION BLOCK      ####
######################################

## argo_carrier_visit connection
argoCV_ds = glueContext.create_dynamic_frame.from_catalog(
    database="staging_initial",
    table_name="argo_carrier_visit",
    transformation_ctx="argoCV_ds")
argoCV_regDF = argoCV_ds.toDF()
argoCV_regDF = argoCV_regDF.withColumn("sourcesystem", lit("PNCT")).withColumn(
    "dboperationtype", lit("L")).withColumn("audtdateadded",
                                            lit(current_timestamp))
argoCV_dynDF = DynamicFrame.fromDF(argoCV_regDF, glueContext, "nested")

## argo_chargeable_unit_events connection
argoCUE_ds = glueContext.create_dynamic_frame.from_catalog(
    database="staging_initial",
    table_name="argo_chargeable_unit_events",
    transformation_ctx="argoCUE_ds")
argoCUE_regDF = argoCUE_ds.toDF()
argoCUE_regDF = argoCUE_regDF.withColumn(
    "sourcesystem",
    lit("PNCT")).withColumn("dboperationtype",
                            lit("L")).withColumn("audtdateadded",
                                                 lit(current_timestamp))
argoCUE_dynDF = DynamicFrame.fromDF(argoCUE_regDF, glueContext, "nested")

## argo_visit_details connection

Exemple #23

0

Afficher le fichier

Fichier : partition-inventory.py Projet : awslabs/amazon-s3-glacier-refreezer

    database=DATABASE, table_name=INVENTORY_TABLE).toDF()
filelist = glueContext.create_dynamic_frame.from_catalog(
    database=DATABASE, table_name=FILENAME_TABLE)
mapped = filelist.apply_mapping([
    ("archiveid", "string", "archiveid", "string"),
    ("override", "string", "override", "string")
]).toDF().dropDuplicates(['archiveid'])

rownum = inventory.withColumn(
    "row_num",
    row_number().over(
        Window.orderBy(inventory['creationdate'],
                       inventory['archiveid'])).cast("long"))
merged = rownum.join(mapped, "archiveid", how='left_outer')

frame = DynamicFrame.fromDF(merged, glueContext, "merged")


def transform(rec):
    rec["part"] = rec["row_num"] // partiton_size
    rec["archivedescription"] = rec["override"] if rec["override"] and rec[
        "override"].strip() else rec["archivedescription"]
    rec.pop('override', None)
    return rec


trans0 = Map.apply(frame=frame, f=transform)

sink = glueContext.getSink(connection_type="s3",
                           path='s3://' + STAGING_BUCKET + '/partitioned/',
                           enableUpdateCatalog=True,

Exemple #24

0

Afficher le fichier

Fichier : etl-yellow.py Projet : groth00/DATA228

              ("passenger_count", "long", "passenger_count", "long"),
              ("trip_distance", "double", "trip_distance", "double"),
              ("pulocationid", "long", "pulocationid", "long"),
              ("dolocationid", "long", "dolocationid", "long"),
              ("fare_amount", "double", "fare_amount", "double"),
              ("tip_amount", "double", "tip_amount", "double"),
              ("total_amount", "double", "total_amount", "double")],
    transformation_ctx="applymapping1")

resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")
sparkdf = resolvechoice2.toDF()
transform1 = sparkdf.where(
    func.col('tpep_pickup_datetime').between('2019-01-01', '2020-12-31'))
transform2 = transform1.dropna(subset=['passenger_count', 'trip_distance'])
result = DynamicFrame.fromDF(dataframe=transform2,
                             glue_ctx=glueContext,
                             name='result')

datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=result,
    catalog_connection="redshift-east",
    connection_options={
        "dbtable": "yellow",
        "database": "dev"
    },
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="datasink4")

job.commit()

Exemple #25

0

Afficher le fichier

def _find_row(paintings: DynamicFrame, episode_text: str):
    """ Assert a given row exists in the dynamic frame and that it contains the expected values """
    matches = paintings.filter(
        lambda x: x['season_episode_text'] == episode_text).toDF().collect()
    assert len(matches) == 1
    return matches[0]

Exemple #26

0

Afficher le fichier

def main():
    def checknull(level_modified, level_study):
        if level_modified is not None:
            return level_modified
        else:
            return level_study

    checknull_ = udf(checknull, StringType())

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_package_status_code,
                  transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_package_status_code is not None:
            text_concat += str(student_package_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = udf(concaText, StringType())
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")

    dyf_student_contact = dyf_student_contact.select_fields(
        ['student_id', 'contact_id', 'level_study'])

    dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_level_study")

    dyf_log_student_level_study = dyf_log_student_level_study.select_fields([
        'contact_id', 'level_current', 'level_modified', 'package_code',
        'time_created'
    ])
    dyf_log_student_level_study = dyf_log_student_level_study.resolveChoice(
        specs=[('_key', 'cast:int')])

    dyf_tpe_invoice_product = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product")
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.select_fields([
        '_key', 'timecreated', 'user_id', 'buyer_id', 'invoice_packages_price',
        'invoice_price', 'invoice_code'
    ])
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.resolveChoice(
        specs=[('_key', 'cast:long')])
    dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product_details")

    dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
        ['cat_code', 'package_time', 'invoice_code'])

    dyf_student_package = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_package")

    # chon cac field
    dyf_student_package = dyf_student_package.select_fields(
        ['student_id', 'start_time', 'end_time',
         'package_code']).rename_field('student_id', 'student_id1')
    dyf_student_package.printSchema()
    dyf_student_package.show(2)
    # # doc flag tu s3
    try:
        # # doc moc flag tu s3
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet"
        )
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)

        # so sanh _key datasource voi flag, lay nhung gia tri co key > flag
        dyf_tpe_invoice_product = Filter.apply(
            frame=dyf_tpe_invoice_product, f=lambda x: x['_key'] > start_read)
    except:
        print('read flag file error ')

    print('the number of new contacts: ', dyf_tpe_invoice_product.count())

    if (dyf_tpe_invoice_product.count() > 0):
        df_log_student_level_study = dyf_log_student_level_study.toDF()
        df_log_student_level_study = df_log_student_level_study.groupby(
            'contact_id', 'level_current', 'level_modified',
            'package_code').agg(f.max('time_created').alias('time_created'))

        dyf_join0 = Join.apply(dyf_tpe_invoice_product,
                               dyf_tpe_invoice_product_details, 'invoice_code',
                               'invoice_code')
        print("@@@@@@@@@@@@")
        dyf_join0.printSchema()
        dyf_join0.show(2)
        dyf_log_student_level_study = DynamicFrame.fromDF(
            df_log_student_level_study, glueContext,
            "dyf_log_student_level_study")

        dyf_join1 = Join.apply(dyf_student_contact, dyf_join0, "contact_id",
                               "user_id")
        dyf_join = Join.apply(dyf_join1, dyf_log_student_level_study,
                              "user_id", "contact_id")
        print("@@@@@@@@@@@@")
        dyf_join.printSchema()
        dyf_join.show(2)
        dyf_join = Filter.apply(
            frame=dyf_join, f=lambda x: x['time_created'] <= x['timecreated'])

        dyf_data_join3 = Join.apply(dyf_join, dyf_student_package,
                                    "student_id", "student_id1")
        dyf_data_join3 = Filter.apply(
            frame=dyf_data_join3,
            f=lambda x: x['package_code'] == x['cat_code'])
        df_data_join3 = dyf_data_join3.toDF()
        df_data_join3 = df_data_join3.withColumn("student_level_code", checknull_(df_data_join3.level_modified, df_data_join3.level_study))\
        .withColumn("behavior_id", f.lit(3))\
        .withColumn("student_package_status_code", f.lit("DEACTIVED"))\
        .withColumn("student_behavior_date", from_unixtime(df_data_join3.timecreated))\
        .withColumn("package_starttime", df_data_join3['start_time'])\
        .withColumn("package_endtime", df_data_join3['end_time']) \
            .withColumn("transformed_at", f.lit(None))
        df_data_join3 = df_data_join3.withColumn(
            'student_behavior_id',
            f.md5(
                concaText(df_data_join3.student_behavior_date,
                          df_data_join3.behavior_id, df_data_join3.student_id,
                          df_data_join3.contact_id, df_data_join3.package_code,
                          df_data_join3.package_endtime,
                          df_data_join3.package_starttime,
                          df_data_join3.student_level_code,
                          df_data_join3.student_package_status_code,
                          df_data_join3.transformed_at)))
        df_data_join3 = df_data_join3.dropDuplicates()
        dyf_data_join3 = DynamicFrame.fromDF(df_data_join3, glueContext,
                                             "dyf_data_join3")
        dyf_data_join3 = dyf_data_join3.resolveChoice(
            specs=[('behavior_id',
                    'cast:int'), ('student_behavior_date', 'cast:timestamp')])
        dyf_data_join3.printSchema()
        dyf_data_join3.show(2)
        applymapping = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("student_behavior_id", "string", "student_behavior_id",
                       "string"),
                      ("contact_id", "string", "contact_id", "string"),
                      ("student_behavior_date", "timestamp",
                       "student_behavior_date", "long"),
                      ("student_id", "string", "student_id", "long"),
                      ("cat_code", "string", "package_code", "string"),
                      ("package_starttime", "int", "package_starttime",
                       "long"),
                      ("package_endtime", "int", "package_endtime", "long"),
                      ("student_package_status_code", "string",
                       "student_status_code", "string"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      ("student_level_code", "string", "student_level_code",
                       "string")])

        resolvechoice = ResolveChoice.apply(frame=applymapping,
                                            choice="make_cols",
                                            transformation_ctx="resolvechoice")

        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields")

        print(dropnullfields.count())
        dropnullfields.toDF().show()

        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        applymapping1 = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("invoice_packages_price", "int", "measure1", "long"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      ("invoice_price", "int", "measure2 ", "long")])

        resolvechoice1 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice1")

        dropnullfields1 = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields1")

        print(dropnullfields1.count())
        dropnullfields1.toDF().show()
        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields,
            connection_type="s3",
            connection_options={
                "path":
                "s3://dtsodin/student_behavior/student_general_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        dyf_tpe_invoice_product = dyf_tpe_invoice_product.toDF()
        flag = dyf_tpe_invoice_product.agg({"_key": "max"}).collect()[0][0]

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')

        # ghi de _key vao s3
        df.write.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet",
            mode="overwrite")

Exemple #27

0

Afficher le fichier

# extract out transactions for test/validation
n_train = int(transactions.count() * train_data_ratio)
test_ids = transactions.select_fields(TRANSACTION_ID)
get_fraud_frac = lambda series: 100 * sum(series) / len(series)
isfraud_df: DynamicFrame = transactions.select_fields("isFraud")
logger.info("Percent fraud for train transactions: {}".format(
    sum_col(transactions.toDF(), "isFraud")))
dump_df_to_s3(test_ids.toDF(), 'test', header=False)

id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols,
                                                 cat_cols)

# Creating glue dynamic frame from spark dataframe
features_dynamic_df = DynamicFrame.fromDF(features_df, glueContext,
                                          'FeaturesDF')
features_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    features_dynamic_df, [('~id', TRANSACTION_ID, 't')])
logger.info(f'Upserting transactions as vertices of graph...')
features_dynamic_df.toDF().foreachPartition(
    gremlin_client.upsert_vertices('Transaction', batch_size=50))
logger.info(f'Creating glue DF from labels dataframe')
labels_dynamic_df = DynamicFrame.fromDF(labels_df, glueContext, 'LabelsDF')
labels_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    labels_dynamic_df, [('~id', TRANSACTION_ID, 't')])
logger.info(f'Upserting transactions with isFraud property...')
labels_dynamic_df.toDF().foreachPartition(
    gremlin_client.upsert_vertices('Transaction', batch_size=100))

dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')

Exemple #28

0

Afficher le fichier

Fichier : glue-job.py Projet : asimjalis/aws-demos

glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read DynamicFrame.
dynf = glueContext.create_dynamic_frame.from_catalog(database="default",
                                                     table_name="sales",
                                                     transformation_ctx="dynf")

# Convert to DataFrame.
df = dynf.toDF()

# Put table on DataFrame.
df.createOrReplaceTempView("sales_tmp")

# Run SQL.
sql_df = spark.sql(
    "SELECT id, date, store, state, product, amount * 2.1 from sales_tmp")

# Convert back to DynamicFrame.
dynf_new = DynamicFrame.fromDF(sql_df, glueContext, "df")
datasink4 = glueContext.write_dynamic_frame.from_catalog(
    frame=dynf_new,
    database="default",
    table_name="sales1",
    transformation_ctx="datasink4")

# Commit.
job.commit()

Exemple #29

0

Afficher le fichier

specid ,
systemcreationdate ,
udblistingid,
to_date(effectiveto) AS effectiveto_date
FROM edw_listings WHERE row_number_seq = 1
            ''')
#df_joined.cache()
df_joined.describe()
df_joined.printSchema()
#print  df_joined.count()
s3_location_target = 's3://move-dataeng-temp-dev/glue-etl/parquet_data/listingdim_pdt_deduped_pq'

output_folder = s3_location_target  # With absolute path
print 'output_folder= %s' % (output_folder)
#----  PySpark section ----

#df_joined.write.mode('overwrite').parquet(output_folder)
#df_joined.write.mode('overwrite').save(output_folder)
new_dynamic_frame = DynamicFrame.fromDF(df_joined, glueContext,
                                        "new_dynamic_frame")
codec = 'snappy'
#glueContext.write_dynamic_frame.from_options(frame = m_df, connection_type = "s3", connection_options = {"path":  child_output_dir}, format = "parquet", compression=codec)
glueContext.write_dynamic_frame.from_options(
    frame=new_dynamic_frame,
    connection_type="s3",
    connection_options={"path": output_folder},
    format="parquet",
    compression=codec)

print 'Done Parquet Conversion !'

Exemple #30

0

Afficher le fichier

Fichier : context.py Projet : yuua/aws-glue-libs

 def create_dynamic_frame_from_rdd(self, data, name, schema=None, sample_ratio=None, transformation_ctx=""):
     """Creates a DynamicFrame from an RDD.
     """
     df = super(GlueContext, self).createDataFrame(data, schema, sample_ratio)
     return DynamicFrame.fromDF(df, self, name)

Exemple #31

0

Afficher le fichier

Fichier : IngestREST.py Projet : taalba/DataDirect-Code-Samples

args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

##Read Data from REST API using DataDirect Autonomous REST Connector JDBC driver in to DataFrame
source_df = spark.read.format("jdbc").option(
    "url",
    "jdbc:datadirect:autorest:config=yelp.rest;AuthenticationMethod=HttpHeader;AuthHeader=Authorization;SecurityToken='Bearer JcMUtuWfaqJdWJBqqLrgBxfbYh6GIUGv3zUyXOG4zsfe6wnOtlZBeroFb8rpRM-dESFzcSAUd1YDAtQm2yl0hrJwfldvHp2AdEzRXThZku69r-w4wTv80Cj7d08ZXHYx'"
).option("dbtable", "AUTOREST.BUSINESSES").option(
    "driver", "com.ddtek.jdbc.autorest.AutoRESTDriver").load()

job.init(args['JOB_NAME'], args)

print(source_df)

##Convert DataFrames to AWS Glue's DynamicFrames Object
dynamic_dframe = DynamicFrame.fromDF(source_df, glueContext, "dynamic_df")

##Write Dynamic Frames to S3 in CSV format. You can write it to any rds/redshift, by using the connection that you have defined previously in Glue
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=dynamic_dframe,
    connection_type="s3",
    connection_options={"path": "s3://glueuserdata"},
    format="csv",
    transformation_ctx="datasink4")

job.commit()

Exemple #32

0

Afficher le fichier

####        CONNECTION BLOCK      ####
######################################

## ref_bizunit_scoped connection
refBizScopedCon_ds = glueContext.create_dynamic_frame.from_catalog(
    database="nola_staging_initial",
    table_name="ref_bizunit_scoped",
    transformation_ctx="refBizScopedCon_ds")
refBizScopedCon_regDF = refBizScopedCon_ds.toDF()
refBizScopedCon_regDF = refBizScopedCon_regDF.withColumn(
    "sourcesystem",
    lit("NOLA")).withColumn("dboperationtype",
                            lit("L")).withColumn("audtdateadded",
                                                 lit(current_timestamp))
refBizScopedCon_distDF = refBizScopedCon_regDF.distinct()
refBizScopedCon_dynDF = DynamicFrame.fromDF(refBizScopedCon_distDF,
                                            glueContext, "nested")

## ref_carrier_itinerary connection
refCarItinCon_ds = glueContext.create_dynamic_frame.from_catalog(
    database="nola_staging_initial",
    table_name="ref_carrier_itinerary",
    transformation_ctx="refCarItinCon_ds")
refCarItinCon_regDF = refCarItinCon_ds.toDF()
refCarItinCon_regDF = refCarItinCon_regDF.withColumn(
    "sourcesystem",
    lit("NOLA")).withColumn("dboperationtype",
                            lit("L")).withColumn("audtdateadded",
                                                 lit(current_timestamp))
refCarItinCon_distDF = refCarItinCon_regDF.distinct()
refCarItinCon_dynDF = DynamicFrame.fromDF(refCarItinCon_distDF, glueContext,
                                          "nested")

Exemple #33

0

Afficher le fichier

# Cast choices into integers, those values that cannot cast result in null
medicare_res_cast = medicare_dyf.resolveChoice(specs=[('provider id',
                                                       'cast:long')])
medicare_res_project = medicare_dyf.resolveChoice(specs=[('provider id',
                                                          'project:long')])
medicare_res_make_cols = medicare_dyf.resolveChoice(specs=[('provider id',
                                                            'make_cols')])
medicare_res_make_struct = medicare_dyf.resolveChoice(specs=[('provider id',
                                                              'make_struct')])

# Spark SQL on a Spark dataframe
medicare_df = medicare_dyf.toDF()
medicare_df.createOrReplaceTempView("medicareTable")
medicare_sql_df = spark.sql(
    "SELECT * FROM medicareTable WHERE `total discharges` > 30")
medicare_sql_dyf = DynamicFrame.fromDF(medicare_sql_df, glueContext,
                                       "medicare_sql_dyf")

# Write it out in Json
glueContext.write_dynamic_frame.from_options(
    frame=medicare_res_cast,
    connection_type="s3",
    connection_options={"path": medicare_cast},
    format="json")
glueContext.write_dynamic_frame.from_options(
    frame=medicare_res_project,
    connection_type="s3",
    connection_options={"path": medicare_project},
    format="json")
glueContext.write_dynamic_frame.from_options(
    frame=medicare_res_make_cols,
    connection_type="s3",

Exemple #34

0

Afficher le fichier

Fichier : job_etl_user_profile.py Projet : 01662024622/dts-odin-etl

def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source
    is_dev = True
    limit = True
    # information database???????
    dyf_crm_contacts = glueContext.create_dynamic_frame.from_catalog(
        database='crm_native', table_name='contacts')

    # dyf_crm_contacts = Filter.apply(frame=dyf_crm_contacts,
    #                                 f=lambda x: x["Id"] < 1102)

    # print('dyf_crm_contacts::fdfdfdfdfdfdfd----------------')
    # dyf_crm_contacts.printSchema()
    dyf_crm_contacts = dyf_crm_contacts.resolveChoice(specs=[('Id',
                                                              'cast:int')])

    print('dyf_crm_contacts')
    dyf_crm_contacts.printSchema()

    #doc moc flag tu s3
    try:
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/flag_user_profile.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_crm_contacts = Filter.apply(frame=dyf_crm_contacts,
                                        f=lambda x: x["Id"] > read_from_index)
    except:
        print('read flag file error ')
    print('the number of new contacts: ', dyf_crm_contacts.count())

    crm_contacts_number = dyf_crm_contacts.count()
    print('crm_contacts_number: ', crm_contacts_number)
    if crm_contacts_number < 1:
        print('Stopping--- crm_contacts_number < 1')
        return

    dyf_crm_contacts = dyf_crm_contacts.select_fields(
        ['_key', 'Id', 'Code', 'Birthday', 'Gender', 'Job', 'CreatedDate'])
    dy_crm_contacts_cache = dyf_crm_contacts.toDF()
    dy_crm_contacts_cache = dy_crm_contacts_cache.dropDuplicates(['Code'])
    dy_crm_contacts_cache = dy_crm_contacts_cache.cache()
    dyf_crm_contacts = DynamicFrame.fromDF(dy_crm_contacts_cache, glueContext,
                                           'dyf_crm_contacts')

    today = date.today()
    d4 = today.strftime("%Y-%m-%d")
    print("d4 =", d4)

    # print('Chay vao day nhe------------------')
    # print('dyf_crm_contacts::----------------')
    # dyf_crm_contacts.printSchema()
    # try:
    #--------------------------------------------------------------------------------------------------------------#
    dyf_crm_contacts = Filter.apply(
        frame=dyf_crm_contacts,
        f=lambda x: x["Id"] is not None and x["Id"] != '' and x[
            "Code"] is not None and x["Code"] != '')
    # --------------------------------------------------------------------------------------------------------------#

    # --------------------------------------------------------------------------------------------------------------#
    if (dyf_crm_contacts.count() > 0):
        dy_crm_contacts = dyf_crm_contacts.toDF()
        # dy_crm_contacts = dy_crm_contacts.dropDuplicates(['Code'])
        dy_crm_contacts = dy_crm_contacts.withColumn('source_type', f.lit(1))
        dy_crm_contacts = dy_crm_contacts.withColumn('is_root', f.lit(1))
        dy_crm_contacts = dy_crm_contacts.withColumn('description', f.lit(d4))
        dy_crm_contacts = dy_crm_contacts.withColumn('last_update_date',
                                                     f.lit(d4))
        dy_crm_contacts_cache_2 = dy_crm_contacts.cache()
        dyf_crm_contacts = DynamicFrame.fromDF(dy_crm_contacts_cache_2,
                                               glueContext, 'dyf_crm_contacts')

        applymapping2 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("Gender", 'int', 'gender', 'string'),
                      ("is_root", 'int', 'is_root', 'int'),
                      ("Birthday", 'string', 'birthday', 'date'),
                      ("Job", 'string', 'job', 'string'),
                      ("last_update_date", 'string', 'last_update_date',
                       'timestamp')])
        #
        #
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping2,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields6 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields2")

        datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields6,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "user_profile",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/user/profile/",
            transformation_ctx="datasink4")

        #insert into source_id

        # print('dyf_crm_contacts::-------source_type---------')
        # dyf_crm_contacts.printSchema()

        applymapping3 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("source_type", 'int', 'source_type', 'int'),
                      ("Code", 'string', 'source_id', 'string'),
                      ("description", 'string', 'description', 'string')])

        resolvechoice3 = ResolveChoice.apply(
            frame=applymapping3,
            choice="make_cols",
            transformation_ctx="resolvechoice3")
        dropnullfields7 = DropNullFields.apply(
            frame=resolvechoice3, transformation_ctx="resolvechoice3")

        # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields7,
        #                                                            catalog_connection="glue_redshift",
        #                                                            connection_options={"dbtable": "user_map",
        #                                                                                "database": "dts_odin"},
        #                                                            redshift_tmp_dir="s3n://dts-odin/temp/user/map/",
        #                                                            transformation_ctx="datasink5")

        #lay max _key tren datasource
        flag = dy_crm_contacts_cache.agg({"Id": "max"}).collect()[0][0]

        # ghi de flag moi vao s3
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "int").toDF('flag')

        df.write.parquet("s3a://dtsodin/flag/flag_user_profile.parquet",
                         mode="overwrite")
        dy_crm_contacts_cache_2.unpersist()
        dy_crm_contacts_cache.unpersist()

Exemple #35

0

Afficher le fichier

def main():
    today = datetime.now(ho_chi_minh_timezone)
    print('today: ', today)
    yesterday = today - timedelta(1)
    today_id = int(today.strftime("%Y%m%d"))
    yesterday_id = int(yesterday.strftime("%Y%m%d"))
    print('today_id: ', today_id)
    print('yesterday_id: ', yesterday_id)

    lastest_number_days = 30
    chosen_word_number = 24

    yesterday = date.today() - timedelta(1)
    yesterday_id = int(yesterday.strftime("%Y%m%d"))

    lasted_30_day = today - timedelta(lastest_number_days)
    lasted_30_day_id = int(lasted_30_day.strftime("%Y%m%d"))

    StructPlusNumber = StructType([
        StructField("lo_plus_number", LongType(), False),
        StructField("learning_object_id", LongType(), False),
        StructField("learning_last_date_id", LongType(), False)
    ])

    def getBestWords(plus_number_pair_list):
        plus_number_pair_list = \
            sorted(plus_number_pair_list, key=lambda x: x['lo_plus_number'], reverse=True)
        a = plus_number_pair_list[0:chosen_word_number]
        return a

    getBestWords = udf(getBestWords, ArrayType(StructPlusNumber))

    #--------------------------------------------------------
    StructMiniNumber = StructType([
        StructField("lo_minus_number", LongType(), False),
        StructField("learning_object_id", LongType(), False),
        StructField("learning_last_date_id", LongType(), False)
    ])

    def getWorstWords(minus_number_pair_list):
        minus_number_pair_list = \
            sorted(minus_number_pair_list, key=lambda x: x['lo_minus_number'], reverse=True)
        a = minus_number_pair_list[0:chosen_word_number]
        return a

    getWorstWords = udf(getWorstWords, ArrayType(StructMiniNumber))
    #----------------------------------------

    if IS_DEV:
        dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url":
                "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user":
                "******",
                "password":
                "******",
                "dbtable":
                "mapping_lo_student_history_test",
                "redshiftTmpDir":
                "s3://dts-odin/temp1/mapping_lo_student_history_test/v9"
            })
    else:
        # dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
        #     database="nvn_knowledge",
        #     table_name="mapping_lo_student_history",
        #     additional_options={"path": "s3://dts-odin/nvn_knowledge/mapping_lo_student_history/*/*"},
        #     push_down_predicate="(partition_0=='starter_ait' or partition_0=='starter_micro')"
        # )

        dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
            database="nvn_knowledge",
            table_name="mapping_lo_student_history",
            additional_options={
                "path":
                "s3://dts-odin/nvn_knowledge/mapping_lo_student_history/*/*"
            },
            push_down_predicate="(partition_0=='starter_micro')")

    dyf_mapping_lo_student_history = dyf_mapping_lo_student_history.select_fields(
        [
            'student_id', 'learning_object_id', 'minus_number', 'plus_number',
            'lu_type', 'created_date_id'
        ])

    if not IS_DEV:
        dyf_mapping_lo_student_history = Filter.apply(
            frame=dyf_mapping_lo_student_history,
            f=lambda x: x["student_id"] is not None and x["student_id"] != 0
            and x["learning_object_id"] is not None and x[
                "created_date_id"] >= lasted_30_day_id and x["lu_type"] == 1)

    if IS_DEV:
        print('dyf_mapping_lo_student_history')
        # dyf_mapping_lo_student_history.printSchema()
        # dyf_mapping_lo_student_history.show(3)

    df_mapping_lo_student_history = dyf_mapping_lo_student_history.toDF()
    df_mapping_lo_student_history = df_mapping_lo_student_history.cache()

    # print('df_mapping_lo_student_history: ', df_mapping_lo_student_history.count())

    if df_mapping_lo_student_history.count() < 1:
        return

    df_group_plus_minus_number = df_mapping_lo_student_history.groupby(
        'student_id', 'learning_object_id').agg(
            f.sum('plus_number').alias('lo_plus_number'),
            f.sum('minus_number').alias('lo_minus_number'),
            f.max('created_date_id').alias('learning_last_date_id'))

    # print('df_group_plus_minus_number')
    df_group_plus_minus_number.printSchema()
    df_group_plus_minus_number.show(3)

    df_group_plus_minus_number = df_group_plus_minus_number.na.fill({
        'lo_plus_number':
        0,
        'lo_minus_number':
        0
    })

    # xu ly de plus va minus khong trung nhau
    df_group_plus_minus_number = df_group_plus_minus_number\
        .select(
            'student_id', 'learning_object_id',
            f.when(f.col('lo_plus_number') >= f.col('lo_minus_number'), f.col('lo_plus_number'))
                .otherwise(0).alias('lo_plus_number'),
            f.when(f.col('lo_plus_number') < f.col('lo_minus_number'), f.col('lo_minus_number'))
                .otherwise(0).alias('lo_minus_number'),

            'learning_last_date_id'
    )

    df_group_plus_minus_number = df_group_plus_minus_number.select(
        'student_id',
        f.struct('lo_plus_number', 'learning_object_id',
                 'learning_last_date_id').alias('plus_number_pair'),
        f.struct('lo_minus_number', 'learning_object_id',
                 'learning_last_date_id').alias('minus_number_pair'))

    df_group_l2 = df_group_plus_minus_number.groupby('student_id').agg(
        f.collect_list('plus_number_pair').alias('plus_number_pair_list'),
        f.collect_list('minus_number_pair').alias('minus_number_pair_list'))

    print('df_group_l2')
    df_group_l2.printSchema()
    df_group_l2.show(2)

    df_group_l2 = df_group_l2.withColumn('right_list', getBestWords(df_group_l2.plus_number_pair_list))\
            .withColumn('wrong_list', getWorstWords(df_group_l2.minus_number_pair_list))

    print('df_group_l2---')
    df_group_l2.printSchema()
    df_group_l2.show(1)

    df_group_l2_right = df_group_l2.select(
        'student_id',
        f.explode('right_list').alias('str_right_item'))
    df_group_l2_wrong = df_group_l2.select(
        'student_id',
        f.explode('wrong_list').alias('str_wrong_item'))

    df_group_l2_right = df_group_l2_right.select(
        'student_id',
        f.col('str_right_item').getItem("lo_plus_number").alias(
            "learning_object_number"),
        f.col('str_right_item').getItem("learning_object_id").alias(
            "learning_object_id"),
        f.col('str_right_item').getItem("learning_last_date_id").alias(
            "learning_last_date_id"),
        f.lit(1).cast('long').alias("number_type"))

    df_group_l2_right = df_group_l2_right.filter(
        df_group_l2_right.learning_object_number.isNotNull())

    df_group_l2_wrong = df_group_l2_wrong.select(
        'student_id',
        f.col('str_wrong_item').getItem("lo_minus_number").alias(
            "learning_object_number"),
        f.col('str_wrong_item').getItem("learning_object_id").alias(
            "learning_object_id"),
        f.col('str_wrong_item').getItem("learning_last_date_id").alias(
            "learning_last_date_id"),
        f.lit(-1).cast('long').alias("number_type"))

    df_group_l2_wrong = df_group_l2_wrong.filter(
        (df_group_l2_wrong.learning_object_number.isNotNull())
        & (df_group_l2_wrong.learning_object_number != 0))
    print('df_group_l2_right')
    df_group_l2_right.printSchema()
    df_group_l2_right.show(2)

    print('df_group_l2_wrong')
    df_group_l2_wrong.printSchema()
    df_group_l2_wrong.show(2)

    total_plus_minus = df_group_l2_right.union(df_group_l2_wrong)

    #add created_date_id
    total_plus_minus = total_plus_minus.withColumn(
        'created_date_ids', udf_get_date_list(f.lit(yesterday)))
    total_plus_minus = total_plus_minus\
        .select(
            'student_id',
            'learning_object_number',
            'learning_object_id',
            'learning_last_date_id',
            'number_type',
            f.explode('created_date_ids').alias('created_date_id')
        )

    print('total_plus_minus')
    total_plus_minus.printSchema()

    dyf_total_plus_minus = DynamicFrame.fromDF(total_plus_minus, glueContext,
                                               'dyf_total_plus_minus')

    clear_before_saving = 'DELETE student_phonetic_number_history where created_date_id >= ' + str(
        yesterday_id)

    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dyf_total_plus_minus,
        catalog_connection="glue_redshift",
        connection_options={
            "preactions": clear_before_saving,
            "dbtable": "student_phonetic_number_history",
            "database": "dts_odin"
        },
        redshift_tmp_dir=
        "s3://dts-odin/temp/nvn/knowledge/student_phonetic_number_history/v4",
        transformation_ctx="datasink4")

    df_mapping_lo_student_history.unpersist()

Exemple #36

0

Afficher le fichier

Fichier : h2472_backup.py Projet : 01662024622/dts-odin-etl

def back_kup_h2472_rating_answer():
    dyf_jh2472_rating_answer = glueContext \
        .create_dynamic_frame.from_catalog(database="do_h2472",
                                           table_name="rating_answer")

    if is_dev:
        print('dyf_jh2472_rating_answer')
        dyf_jh2472_rating_answer.printSchema()
        dyf_jh2472_rating_answer.show(3)

    # return

    # root
    # | -- id: string
    # | -- rating: float
    # | -- rating_date: string
    # | -- rating_user: string
    # | -- answer_id: string
    # | -- _key: string
    # | -- _table: string
    # | -- _schema: string

    dyf_jh2472_rating_answer = dyf_jh2472_rating_answer.resolveChoice(
        specs=[('id', 'cast:long'), ('rating', 'cast:double')])
    #
    dyf_jh2472_rating_answer = Filter.apply(frame=dyf_jh2472_rating_answer,
                                            f=lambda x: x["id"] > 26139)

    df_jh2472_rating_answer = dyf_jh2472_rating_answer.toDF()
    df_jh2472_rating_answer = df_jh2472_rating_answer.dropDuplicates(['id'])
    dyf_jh2472_rating_answer = DynamicFrame.fromDF(df_jh2472_rating_answer,
                                                   glueContext,
                                                   'dyf_jh2472_rating_answer')

    # #
    applymapping1 = ApplyMapping.apply(
        frame=dyf_jh2472_rating_answer,
        mappings=[("id", 'long', 'id', 'long'),
                  ("rating", "double", "rating", "double"),
                  ("rating_date", "string", "rating_date", "timestamp"),
                  ("rating_user", "string", "rating_user", "string"),
                  ("answer_id", 'string', 'answer_id', 'long')])
    # # #
    resolvechoice1 = ResolveChoice.apply(frame=applymapping1,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice1")
    #
    #
    if is_dev:
        print('resolvechoice1')
        resolvechoice1.printSchema()
        resolvechoice1.show(3)

    # #
    # #
    datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=resolvechoice1,
        catalog_connection="h2474_backup",
        connection_options={
            "dbtable": "rating_answer",
            "database": "topicaH2472"
        },
        redshift_tmp_dir="s3a://dts-odin/topicaH2472/rating_answer",
        transformation_ctx="datasink5")

Exemple #37

0

Afficher le fichier

Fichier : log-combiner-glue-script.py Projet : partnercloudsupport/amazon-cloudfront-log-analysis

trimmedLEOriginRequestLogs = DropFields.apply(frame = labdaEdgeOriginRequestLogs, paths=["executionregion", "distributionid", "distributionname", "requestdata", "customtraceid", "eventtype", "year", "month", "date", "hour"], transformation_ctx ="trimmedLEOriginRequestLogs")

## Rename the requestid field for Lambda@Edge origin request logs to origin requestid
modifiedLEOriginRequestLogs = RenameField.apply(frame = trimmedLEOriginRequestLogs, old_name = "requestid", new_name = "origin_requestid", transformation_ctx ="modifiedLEOriginRequestLogs" )

## Convert to DataFrame
modifiedLEOriginRequestLogsDF = modifiedLEOriginRequestLogs.toDF()

## Convert to DataFrame
modifiedLEViewerRequestLogsDF = modifiedLEViewerRequestLogs.toDF()

## Join(left outer join) the Lambda@Edge viewer-request logs with the origin-request logs based on the requestid
combinedLambdaEdgeLogsDF = modifiedLEViewerRequestLogsDF.join(modifiedLEOriginRequestLogsDF, modifiedLEViewerRequestLogsDF["requestid"] == modifiedLEOriginRequestLogsDF["origin_requestid"], "left_outer")

## Convert to DynamicFrame
combinedLambdaEdgeLogs = DynamicFrame.fromDF(combinedLambdaEdgeLogsDF, glueContext, "combinedLambdaEdgeLogs")

## Join the Lambda@Edge viewer-request logs with the origin-request logs based on the requestid
#combinedLambdaEdgeLogs = Join.apply(modifiedLEViewerRequestLogs, modifiedLEOriginRequestLogs, 'requestid', 'origin_requestid')

## Drop the origin_requestid field
lambdaEdgeLogs = DropFields.apply(frame = combinedLambdaEdgeLogs, paths=["origin_requestid"], transformation_ctx ="lambdaEdgeLogs")

## Drop the "year", "month", "date", "hour" fields
trimmedLambdaEdgeLogs = DropFields.apply(frame =lambdaEdgeLogs, paths=["year", "month", "date", "hour", "useragentstring"], transformation_ctx ="trimmedLambdaEdgeLogs")

## Convert to DataFrame
trimmedLambdaEdgeLogsDF = trimmedLambdaEdgeLogs.toDF()

#Destnation S3 loaction for combine Lambda@Edge logs
leLogDestPath = "s3://" + args['target_s3_bucket'] + "/combined/lelogs"