def transform_df_to_catalog_import_schema(sql_context, glue_context, df_databases, df_tables, df_partitions):
    df_databases_array = df_databases.select(df_databases['type'], array(df_databases['item']).alias('items'))
    df_tables_array = df_tables.select(df_tables['type'], df_tables['database'], array(df_tables['item']).alias('items'))
    df_partitions_array_batched = batch_metastore_partitions(sql_context=sql_context, df_parts=df_partitions)
    dyf_databases = DynamicFrame.fromDF(
        dataframe=df_databases_array, glue_ctx=glue_context, name='dyf_databases')
    dyf_tables = DynamicFrame.fromDF(
        dataframe=df_tables_array, glue_ctx=glue_context, name='dyf_tables')
    dyf_partitions = DynamicFrame.fromDF(
        dataframe=df_partitions_array_batched, glue_ctx=glue_context, name='dyf_partitions')
    return dyf_databases, dyf_tables, dyf_partitions
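# A hypothetical call of the helper above, assuming `array` is imported from
# pyspark.sql.functions, that batch_metastore_partitions() is defined elsewhere in
# the job, and that the input DataFrames carry the (type, database, item) columns
# the selects expect. Variable names here are illustrative only.
from pyspark.sql.functions import array

dyf_databases, dyf_tables, dyf_partitions = transform_df_to_catalog_import_schema(
    sql_context, glue_context, df_databases, df_tables, df_partitions)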
def write_df_to_catalog(data_frame, entity_type, glue_context, options):
    # Check if the data frame is empty. There is no "empty" method for a data frame;
    # checking the underlying RDD is the closest we get.
    if data_frame.rdd.isEmpty():
        return  # nothing to do
    database_name = options['catalog.database']
    nested_data_frame = nest_data_frame(data_frame, database_name, entity_type)
    dynamic_frame = DynamicFrame.fromDF(nested_data_frame, glue_context, entity_type)
    sink = glue_context.getSink('catalog', **options)
    sink.write(dynamic_frame)
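# A minimal sketch of calling write_df_to_catalog. Only 'catalog.database' is read
# by the function itself; the other keys are assumptions about what the 'catalog'
# sink accepts and are illustrative, not documented values.
catalog_options = {
    'catalog.name': 'datacatalog',
    'catalog.database': 'my_database',
    'catalog.region': 'us-east-1',
}
write_df_to_catalog(df_tables, 'table', glue_context, catalog_options)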
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999
    package_starttime_unavailable = 0
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'

    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'
    ACTIVED = 'ACTIVED'

    dyf_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market",
        table_name="tpe_enduser_used_product_history"
    )
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.select_fields(
        ['_key', 'contact_id', 'used_product_id', 'status_old', 'status_new',
         'status_description', 'timecreated'])
    # .rename_field('contact_id', 'contactid')

    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    # try:
    #     df_flag = spark.read.parquet("s3://dtsodin/flag/flag_trang_thai_tai_khoan_active.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print("max_key: ", max_key)
    #     # Only keep records with a key greater than the saved max_key; do not load the full table.
    #     dyf_tpe_enduser_used_product_history = Filter.apply(
    #         frame=dyf_tpe_enduser_used_product_history,
    #         f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')

    print(dyf_tpe_enduser_used_product_history.count())
    if dyf_tpe_enduser_used_product_history.count() > 0:
        try:
            dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
                database="tig_market",
                table_name="tpe_invoice_product_details"
            )
            dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
                ['id', 'cat_code'])

            dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
                database="tig_advisor",
                table_name="student_contact"
            )
            dyf_student_contact = dyf_student_contact.select_fields(
                ['contact_id', 'student_id']).rename_field('contact_id', 'contactid')

            ##################### Join and filter data
            df_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.toDF()
            df_tpe_used_product_history_step1 = df_tpe_enduser_used_product_history \
                .groupby('contact_id', 'used_product_id') \
                .agg(f.max("timecreated").alias("max_timecreated")) \
                .withColumnRenamed("contact_id", "contact_id_temp")
            print(df_tpe_used_product_history_step1.count())
            df_tpe_used_product_history_step1.show()

            df_tpe_used_product_history_step2 = df_tpe_used_product_history_step1 \
                .groupby('contact_id_temp') \
                .agg(f.max("max_timecreated").alias("max_timecreated"),
                     f.count("used_product_id").alias("count_used_product_id"))
            print(df_tpe_used_product_history_step2.count())
            df_tpe_used_product_history_step2.show()

            dyf_tpe_used_product_history = DynamicFrame.fromDF(
                df_tpe_used_product_history_step2, glueContext, "dyf_tpe_used_product_history")
            dyf_part_one = Filter.apply(frame=dyf_tpe_used_product_history,
                                        f=lambda x: x["count_used_product_id"] <= 1)
            # dyf_part_two = Filter.apply(frame=df_tpe_enduser_used_product_history,
            #                             f=lambda x: x["used_product_id"] > 1)

            df_part_one = dyf_part_one.toDF()
            df_part_one = df_part_one.join(
                df_tpe_enduser_used_product_history,
                (df_part_one.contact_id_temp == df_tpe_enduser_used_product_history.contact_id) &
                (df_part_one.max_timecreated == df_tpe_enduser_used_product_history.timecreated))
            dyf_part_one = DynamicFrame.fromDF(df_part_one, glueContext, "dyf_part_one")
            dyf_part_one = dyf_part_one.select_fields(
                ['contact_id', 'used_product_id', 'status_old', 'status_new',
                 'status_description', 'timecreated'])

            dyf_join_part_one_product_details = Join.apply(
                dyf_part_one, dyf_tpe_invoice_product_details, 'used_product_id', 'id')
            dyf_join_part_one_product_details.printSchema()
            print("total 01: ", dyf_join_part_one_product_details.count())
            dyf_join_part_one_product_details.toDF().show(2)

            dyf_join_part_one_contact = Join.apply(
                dyf_join_part_one_product_details, dyf_student_contact, 'contact_id', 'contactid')
            dyf_join_part_one_contact = dyf_join_part_one_contact \
                .select_fields(['contact_id', 'student_id', 'status_new',
                                'status_description', 'timecreated'])
            dyf_join_part_one_contact.printSchema()
            print("total 02: ", dyf_join_part_one_contact.count())
            dyf_join_part_one_contact.toDF().show(2)
            # df_join_part_one = dyf_join_part_one_contact.toDF()

            ######################################
            ######## START active
            dyf_join_active_status = Filter.apply(frame=dyf_join_part_one_contact,
                                                  f=lambda x: x["status_new"] == ACTIVED)
            print("dyf_join_active_status ", dyf_join_active_status.count())
            dyf_join_active_status.toDF().show(2)

            df_join_active_status = dyf_join_active_status.toDF()
            df_join_active_status = df_join_active_status \
                .withColumn("change_status_date_id",
                            from_unixtime(df_join_active_status.timecreated, 'yyyyMMdd').cast("long")) \
                .withColumn("from_status_id", f.lit(None).cast("long")) \
                .withColumn("to_status_id", f.lit(206).cast("long")) \
                .withColumn("measure1", f.lit(None).cast("long")) \
                .withColumn("measure2", f.lit(None).cast("long")) \
                .withColumn("description", df_join_active_status.status_description) \
                .withColumn("timestamp1", f.lit(None).cast("long"))
            df_join_active_status.show(3)

            dyf_join_active_status = DynamicFrame.fromDF(df_join_active_status, glueContext,
                                                         "dyf_join_active_status")
            dyf_join_active_status = dyf_join_active_status \
                .select_fields(['contact_id', 'student_id', 'change_status_date_id',
                                'from_status_id', 'to_status_id', 'measure1', 'measure2',
                                'description', 'timestamp1'])
            dyf_join_active_status.printSchema()
            df_join_active_status = dyf_join_active_status.toDF()
            ####### END active

            df_join_active_status = df_join_active_status.withColumn("user_id", f.lit(None).cast("long"))
            dyf_join_status = DynamicFrame.fromDF(df_join_active_status, glueContext, "dyf_join_status")

            applymapping1 = ApplyMapping.apply(frame=dyf_join_status, mappings=[
                ("student_id", "string", "student_id", "long"),
                ("user_id", "long", "user_id", "long"),
                ("change_status_date_id", "long", "change_status_date_id", "long"),
                ("from_status_id", "long", "from_status_id", "long"),
                ("to_status_id", "long", "to_status_id", "long"),
                ("measure1", "long", "measure1", "double"),
                ("measure2", "long", "measure2", "double"),
                ("description", "string", "description", "string"),
                ("timestamp1", "long", "timestamp1", "long"),
                ("contact_id", "string", "contact_id", "string")
            ])
            resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                                 transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(frame=resolvechoice1,
                                                   transformation_ctx="dropnullfields1")
            print(resolvechoice1.count())
            resolvechoice1.printSchema()
            resolvechoice1.show(5)

            print('START WRITE TO REDSHIFT -------------------------')
            datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields1,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "mapping_changed_status_student",
                    "database": "dts_odin"
                },
                redshift_tmp_dir="s3a://dtsodin/temp/mapping_changed_status_student/",
                transformation_ctx="datasink1")

            print('START WRITE TO S3-------------------------')
            # datasink6 = glueContext.write_dynamic_frame.from_options(
            #     frame=dropnullfields1, connection_type="s3",
            #     connection_options={
            #         "path": "s3://dtsodin/student_behavior/student_behavior/",
            #         "partitionKeys": ["behavior_id"]},
            #     format="parquet",
            #     transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')

            df_temp = dyf_tpe_enduser_used_product_history.toDF()
            flag = df_temp.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # overwrite the saved _key high-water mark in s3
            df.write.parquet("s3a://dtsodin/flag/flag_trang_thai_tai_khoan_active.parquet",
                             mode="overwrite")
        except Exception as e:
            print("Something was wrong ", e)
    format=file_format,  ## e.g. "csv"
    format_options={"withHeader": True},
    transformation_ctx="data_df").toDF()
data_df.show(10)

## read data from the input table to a data frame
# data_df = glueContext.create_dynamic_frame.from_catalog(database=database, table_name=table_name).toDF()

## run the SQL query on the dataframe created from the input dataset
data_df.createOrReplaceTempView('data_df')
data_df = spark.sql('{} from data_df'.format(querySql))
query_columns = ['werk', 'spj', 'knr', 'result', 'probability', 'time']
data_df = data_df.toDF(*query_columns)

## convert the dataframe holding the transformed dataset back to a dynamic frame
data_df = DynamicFrame.fromDF(data_df, glueContext, "data_df")

## define the target s3 output location
rtp_dd_output = "s3://" + s3_output_data_folder + "/" + "plant=" + plant + "/" + "appid=" + applicationId + "/"

# store the output/final dynamicFrame to the target s3 location
outputGDF = glueContext.write_dynamic_frame.from_options(
    frame=data_df,
    connection_type="s3",
    connection_options={"path": rtp_dd_output},
    format="csv")
choice="MATCH_CATALOG", database="as-redshift-dw", table_name="as_tech_test_public_dim_user", transformation_ctx="resolvechoice3") ## @type: ResolveChoice ## @args: [choice = "make_cols", transformation_ctx = "resolvechoice4"] ## @return: resolvechoice4 ## @inputs: [frame = resolvechoice3] resolvechoice4 = ResolveChoice.apply(frame=resolvechoice3, choice="make_cols", transformation_ctx="resolvechoice4") ##get Insert Date timestampedDf = resolvechoice4.toDF().withColumn("dim_user_insert_dt", current_timestamp()) #Back to DynamicFrame cleaned_datasource = DynamicFrame.fromDF(timestampedDf, glueContext, "cleaned_datasource") ## @type: DataSink ## @args: [database = "as-redshift-dw", table_name = "as_tech_test_public_dim_user", redshift_tmp_dir = TempDir, transformation_ctx = "datasink5"] ## @return: datasink5 ## @inputs: [frame = resolvechoice4] datasink5 = glueContext.write_dynamic_frame.from_catalog( frame=cleaned_datasource, database="as-redshift-dw", table_name="as_tech_test_public_dim_user", redshift_tmp_dir=args["TempDir"], transformation_ctx="datasink5") job.commit()
            source_cd, forecast_dt, hour_num, usage_factor, esiid_cnt, unadj_load,
            distrib_loss_load, transmission_loss_load, ufe_loss_load, ancillary_loss_load,
            deration_loss_load, cap_ob, tran_ob, crdt, batch_dt, batch_hr
            from ams__iw_growth_stnorm_hourly__df
            order by forecast_dt, hour_num""")
        select__df.createOrReplaceTempView('select__df')
        rowcount_df = select__df
    except Exception as e:
        errormessage = str(spark.sql("""select error_message()
            ,{} = error_severity()
            ,{} = error_state()""".format(errorseverity, errorstate)).collect()[0][0])
        raise

    # Write the modified data frames to the target
    try:
        for tab_df in mod_df.keys():
            if mod_df[tab_df] == org_df[tab_df]:
                continue
            dym__trans__df = DynamicFrame.fromDF(mod_df[tab_df], glueContext, 'dym__trans__df')
            glueContext.write_dynamic_frame.from_options(
                frame=dym__trans__df,
                connection_type='s3',
                connection_options={'path': 's3://target/s3tables'},
                format='csv')
    except:
        raise

if __name__ == '__main__':
    p_manageforecastdata(*sys.argv[1:])
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    class_topica_id = 1
    now = datetime.now()  # current date and time
    year = now.strftime("%Y%m%d")
    year = '20190901'  # hard-coded override of the run date
    print("year:", int(year))
    cur_date = int(year)
    pre_date = cur_date - 1
    print("year:", pre_date)

    ########## dyf_mapping_lo_student
    dyf_mapping_lo_student = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge", table_name="mapping_lo_student")

    # try:
    #     # read the flag marker from s3
    #     df_flag = spark.read.parquet("s3://dts-odin/flag/flag_mapping_lo_student.parquet")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read from index: ', start_read)
    #     # compare the datasource _key with the flag; keep values with key > flag
    #     # dyf_student_contact = Filter.apply(frame=dyf_student_contact, f=lambda x: x['time_lms_created'] > start_read)
    # except:
    #     print('read flag file error ')

    # dyf_mapping_lo_student = Filter.apply(frame=dyf_mapping_lo_student,
    #                                       f=lambda x: x['knowledge_pass_date_id'] >= f.lit(int(year)))
    print('df_student_contact count 1:', dyf_mapping_lo_student.count())
    if dyf_mapping_lo_student.count() > 0:
        try:
            print("START......................")
            ########## dyf_learning_object
            dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                database="nvn_knowledge", table_name="learning_object")

            ########## dyf_learning_object_class
            dyf_learning_object_class = glueContext.create_dynamic_frame.from_catalog(
                database="nvn_knowledge", table_name="learning_object_class")
            dyf_learning_object_class = dyf_learning_object_class.select_fields(
                ['class_id', 'class_parent_id'])
            dyf_learning_object_class = Filter.apply(
                frame=dyf_learning_object_class,
                f=lambda x: x["class_parent_id"] == class_topica_id)

            ########## dyf_mapping_lo_class
            dyf_mapping_lo_class = glueContext.create_dynamic_frame.from_catalog(
                database="nvn_knowledge", table_name="mapping_lo_class")
            dyf_mapping_lo_class = dyf_mapping_lo_class.select_fields(['class_id', 'learning_object_id']) \
                .rename_field('class_id', 'map_class_id').rename_field('learning_object_id', 'map_lo_id')

            ## JOIN to keep only the TOPICA levels
            dyf_mapping_lo_class = Join.apply(dyf_mapping_lo_class, dyf_learning_object_class,
                                              'map_class_id', 'class_id')
            dyf_learning_object = dyf_learning_object.select_fields(
                ['learning_object_id', 'learning_object_type']).rename_field('learning_object_id', 'lo_id')
            dyf_mapping_lo_student = Join.apply(dyf_mapping_lo_student, dyf_learning_object,
                                                'learning_object_id', 'lo_id')
            dyf_mapping_lo_student = Join.apply(dyf_mapping_lo_student, dyf_mapping_lo_class,
                                                'learning_object_id', 'map_lo_id')
            # dyf_mapping_lo_student.printSchema()
            # dyf_mapping_lo_student.show()

            df_mapping_lo_student = dyf_mapping_lo_student.toDF()
            df_mapping_lo_student = df_mapping_lo_student.groupby(
                'student_id', 'learning_object_type', 'class_id').agg(
                f.count('knowledge_pass_date_id').alias("knowledge_number"),
                f.count('comprehension_pass_date_id').alias("comprehension_number"),
                f.count('application_pass_date_id').alias("application_number"),
                f.count('analysis_pass_date_id').alias("analysis_number"),
                f.count('synthesis_pass_date_id').alias("synthesis_number"),
                f.count('evaluation_pass_date_id').alias("evaluation_number"))
            df_mapping_lo_student = df_mapping_lo_student.withColumn("created_date_id", f.lit(str(year)))
            # print('Count:', df_mapping_lo_student.count())
            # df_mapping_lo_student.printSchema()
            # df_mapping_lo_student.show(5)

            dyf_mapping_lo_student = DynamicFrame.fromDF(df_mapping_lo_student, glueContext,
                                                         "dyf_mapping_lo_student")
            applymapping = ApplyMapping.apply(
                frame=dyf_mapping_lo_student,
                mappings=[("student_id", "long", "student_id", "long"),
                          ("user_id", "long", "user_id", "long"),
                          ("class_id", "long", "class_id", "long"),
                          ("knowledge_number", "long", "knowledge_number", "long"),
                          ("comprehension_number", 'long', 'comprehension_number', 'long'),
                          ("application_number", 'long', 'application_number', 'long'),
                          ("analysis_number", 'long', 'analysis_number', 'long'),
                          ("synthesis_number", 'long', 'synthesis_number', 'long'),
                          ("evaluation_number", 'long', 'evaluation_number', 'long'),
                          ("created_date_id", 'string', 'created_date_id', 'long'),
                          ("learning_object_type", 'string', 'learning_object_type', 'string')])
            resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                                transformation_ctx="resolvechoice2")
            dyf_student_lo_init = DropNullFields.apply(frame=resolvechoice,
                                                       transformation_ctx="dyf_student_lo_init")
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dyf_student_lo_init,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "mapping_lo_student_number",
                    "database": "dts_odin"
                },
                redshift_tmp_dir="s3n://dts-odin/temp1/dyf_student_lo_number",
                transformation_ctx="datasink5")
            print("END......................")
        except Exception as e:
            print("###################### Exception ##########################")
            print(e)
vote_dataset_agg4 = vote_dataset.groupBy(col("idx_tedx")).agg(
    collect_list(
        struct(col("date"), col("time"), col("mail_user"), col("vote"))).alias("vote_user"))
vote_dataset_agg4.printSchema()

tedx_dataset_agg4 = tedx_dataset_agg3.join(
    vote_dataset_agg4, tedx_dataset_agg3._id == vote_dataset_agg4.idx_tedx, "left") \
    .drop("idx_tedx")
tedx_dataset_agg4.printSchema()

mongo_uri = "mongodb://mycluster-shard-00-00-wo6at.mongodb.net:27017,mycluster-shard-00-01-wo6at.mongodb.net:27017,mycluster-shard-00-02-wo6at.mongodb.net:27017"

write_mongo_options = {
    "uri": mongo_uri,
    "database": "unibg_tedx",
    "collection": "tedz_data",
    "username": "******",
    "password": "******",
    "ssl": "true",
    "ssl.domain_match": "false"
}

from awsglue.dynamicframe import DynamicFrame
tedx_dataset_dynamic_frame = DynamicFrame.fromDF(tedx_dataset_agg4, glueContext, "nested")

glueContext.write_dynamic_frame.from_options(
    tedx_dataset_dynamic_frame,
    connection_type="mongodb",
    connection_options=write_mongo_options)
def back_kup_h2472_question_type():
    dyf_jh2472_question_type = glueContext \
        .create_dynamic_frame.from_catalog(database="do_h2472",
                                           table_name="question_type")
    if is_dev:
        print('dyf_jh2472_question_type')
        dyf_jh2472_question_type.printSchema()
        dyf_jh2472_question_type.show(3)
        # root
        # |-- id: string
        # |-- created_date: string
        # |-- description: string
        # |-- group_type: string
        # |-- modified_date: string
        # |-- name: string
        # |-- active: boolean
        # |-- parent_id: string
        # |-- _key: long
        # |-- _table: string
        # |-- _schema: string

    dyf_jh2472_question_type = dyf_jh2472_question_type.resolveChoice(
        specs=[('id', 'cast:long')])

    # dyf_jh2472_question_type = Filter.apply(frame=dyf_jh2472_question_type,
    #                                         f=lambda x: x["id"] > 54)

    df_jh2472_question_type = dyf_jh2472_question_type.toDF()
    df_jh2472_question_type = df_jh2472_question_type.dropDuplicates(['id'])
    df_jh2472_question_type = df_jh2472_question_type.withColumn(
        'name', f.concat('name', f.lit('_'), 'id'))
    dyf_jh2472_question_type = DynamicFrame.fromDF(df_jh2472_question_type,
                                                   glueContext,
                                                   'dyf_jh2472_question_type')

    applymapping1 = ApplyMapping.apply(
        frame=dyf_jh2472_question_type,
        mappings=[("id", 'long', 'id', 'long'),
                  ("created_date", "string", "created_date", "timestamp"),
                  ("description", "string", "description", "string"),
                  ("group_type", "string", "group_type", "string"),
                  ("modified_date", 'string', 'modified_date', 'timestamp'),
                  ("name", "string", "name", "string"),
                  ("active", "boolean", "active", "boolean")])

    resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                         transformation_ctx="resolvechoice1")
    if is_dev:
        print('resolvechoice1')
        resolvechoice1.printSchema()
        resolvechoice1.show(3)

    datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=resolvechoice1,
        catalog_connection="h2474_backup",
        connection_options={
            "dbtable": "question_type",
            "database": "topicaH2472"
        },
        redshift_tmp_dir="s3a://dts-odin/topicaH2472/question_type",
        transformation_ctx="datasink5")
def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    # Register each DynamicFrame in the mapping as a temp view under its alias.
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)  # relies on a module-level `spark` session
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)
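# A hedged usage sketch for sparkSqlQuery: each mapping key becomes a temp view the
# query can reference. `orders_dyf` and `customers_dyf` are placeholder DynamicFrames.
joined_dyf = sparkSqlQuery(
    glueContext,
    query="SELECT o.*, c.name FROM orders o JOIN customers c ON o.customer_id = c.id",
    mapping={"orders": orders_dyf, "customers": customers_dyf},
    transformation_ctx="joined_dyf")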
def write_df_to_s3(glue_context, data_frame, backup_location):
    dynamic_frame = DynamicFrame.fromDF(data_frame, glue_context, "toS3")
    sink = glue_context.getSink("s3", path=backup_location)
    sink.setFormat("json")
    sink.write(dynamic_frame)
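# Illustrative call of write_df_to_s3, assuming `df` is a Spark DataFrame and the
# bucket/prefix below is writable; one JSON file is produced per partition of df.
write_df_to_s3(glue_context, df, "s3://my-backup-bucket/backups/2020-01-01/")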
# s3 output directories
medicare_cast = "s3://glue-sample-target/output-dir/medicare_json_cast"
medicare_project = "s3://glue-sample-target/output-dir/medicare_json_project"
medicare_cols = "s3://glue-sample-target/output-dir/medicare_json_make_cols"
medicare_struct = "s3://glue-sample-target/output-dir/medicare_json_make_struct"
medicare_sql = "s3://glue-sample-target/output-dir/medicare_json_sql"

# Read data into a dynamic frame
medicare_dyf = glueContext.create_dynamic_frame.from_catalog(database=db_name, table_name=tbl_name)

# The `provider id` field will be a choice between long and string.
# Cast choices into longs; values that cannot be cast become null.
medicare_res_cast = medicare_dyf.resolveChoice(specs=[('provider id', 'cast:long')])
medicare_res_project = medicare_dyf.resolveChoice(specs=[('provider id', 'project:long')])
medicare_res_make_cols = medicare_dyf.resolveChoice(specs=[('provider id', 'make_cols')])
medicare_res_make_struct = medicare_dyf.resolveChoice(specs=[('provider id', 'make_struct')])

# Spark SQL on a Spark dataframe
medicare_df = medicare_dyf.toDF()
medicare_df.createOrReplaceTempView("medicareTable")
medicare_sql_df = spark.sql("SELECT * FROM medicareTable WHERE `total discharges` > 30")
medicare_sql_dyf = DynamicFrame.fromDF(medicare_sql_df, glueContext, "medicare_sql_dyf")

# Write it out in JSON
glueContext.write_dynamic_frame.from_options(frame=medicare_res_cast, connection_type="s3",
                                             connection_options={"path": medicare_cast}, format="json")
glueContext.write_dynamic_frame.from_options(frame=medicare_res_project, connection_type="s3",
                                             connection_options={"path": medicare_project}, format="json")
glueContext.write_dynamic_frame.from_options(frame=medicare_res_make_cols, connection_type="s3",
                                             connection_options={"path": medicare_cols}, format="json")
glueContext.write_dynamic_frame.from_options(frame=medicare_res_make_struct, connection_type="s3",
                                             connection_options={"path": medicare_struct}, format="json")
glueContext.write_dynamic_frame.from_options(frame=medicare_sql_dyf, connection_type="s3",
                                             connection_options={"path": medicare_sql}, format="json")
# The `provider id` field will be a choice between long and string.
# Cast choices into longs; values that cannot be cast become null.
medicare_res = medicare_dyf.resolveChoice(specs=[('provider id', 'cast:long')])

# Remove erroneous records
medicare_df = medicare_res.toDF()
medicare_df = medicare_df.where("`provider id` is NOT NULL")

# Apply a lambda to remove the leading '$'
chop_f = udf(lambda x: x[1:], StringType())
medicare_df = medicare_df \
    .withColumn("ACC", chop_f(medicare_df["average covered charges"])) \
    .withColumn("ATP", chop_f(medicare_df["average total payments"])) \
    .withColumn("AMP", chop_f(medicare_df["average medicare payments"]))

# Turn it back into a dynamic frame
medicare_tmp = DynamicFrame.fromDF(medicare_df, glueContext, "nested")

# Rename, cast, and nest with apply_mapping
medicare_nest = medicare_tmp.apply_mapping([
    ('drg definition', 'string', 'drg', 'string'),
    ('provider id', 'long', 'provider.id', 'long'),
    ('provider name', 'string', 'provider.name', 'string'),
    ('provider city', 'string', 'provider.city', 'string'),
    ('provider state', 'string', 'provider.state', 'string'),
    ('provider zip code', 'long', 'provider.zip', 'long'),
    ('hospital referral region description', 'string', 'rr', 'string'),
    ('ACC', 'string', 'charges.covered', 'double'),
    ('ATP', 'string', 'charges.total_pay', 'double'),
    ('AMP', 'string', 'charges.medicare_pay', 'double')])

# Write it out in Parquet
glueContext.write_dynamic_frame.from_options(frame=medicare_nest, connection_type="s3",
                                             connection_options={"path": output_dir}, format="parquet")
input_file_path = "s3://xxxxx" df = spark.read.option("header","true")\ .option("inferSchema","true")\ .option("quote","\"")\ .option("escape","\"").csv(input_file_path) df = df.withColumn( 'event_timestamp', f.to_timestamp('event_timestamp', format='MM/dd/yyyy HH:mm')) df= df.withColumn('year',f.year(f.col('event_timestamp')))\ .withColumn('month',f.month(f.col('event_timestamp'))) dynamic_df = DynamicFrame.fromDF(df, glueContext, "dynamic_df") mapped_df = ResolveChoice.apply(frame=dynamic_df, choice="make_cols", transformation_ctx="mapped_df") datasink = glueContext.write_dynamic_frame.from_jdbc_conf( frame=mapped_df, catalog_connection="xxxxxxx", connection_options={ "dbtable": "external_data_schema.xxxxxx", "database": "dev" }, redshift_tmp_dir=args["TempDir"], transformation_ctx="datasink")
NATURAL_KEY = FINAL_TUPLE_WITH_DF_AND_MD5[1]  ## the natural key passed in the JSON file
NATURAL_KEY_1 = NATURAL_KEY[0]

## Take the value of the SOURCE_NAME column (example: "HR PERSON") from FINAL_MD5_DF
POST_QUERY_SOURCE_NAME = FINAL_MD5_DF.select("source_name").limit(1).rdd.map(
    lambda a: a[0]).collect()[0]
print('#######>>>>>>>POST_QUERY_SOURCE_NAME', POST_QUERY_SOURCE_NAME)
print("finalmd5")
FINAL_MD5_DF1 = FINAL_MD5_DF.drop_duplicates()

# The final data frame is converted to a dynamic frame,
# which will be written to the stage table.
FINAL_DYNAMIC_FRAME = DynamicFrame.fromDF(FINAL_MD5_DF1, GLUECONTEXT, "Final_dynamic_frame")

# Updates, inserts, and deletes counts logic:
# 1. Create a DF with counts and op_val, grouped by JobId and op_val
# 2. Extract inserts, updates, and deletes
# 3. Add it to CloudWatch Logs.
COUNT_DF = FINAL_MD5_DF.withColumn('JobRunId', F.lit(str(RUN_ID))) \
    .withColumn('JobName', F.lit(str(RUN_ID)))

## Truncate the stage table
PRE_QUERY = """begin;
truncate table {stage_database_name}.{stage_table};
end;""".format(stage_database_name=STAGE_DATABASE_NAME, stage_table=STAGE_TABLE)
*["*"] + [col("kvs").getItem(k).alias(k) for k in keys]) # change the data types and column names to be easier to query later with_map = with_map \ .withColumn("id", monotonically_increasing_id()) \ .withColumn("resources_used_walltime_secs", get_sec("resources_used_walltime")) \ .withColumn("resources_used_cput", get_sec("resources_used_cput")) \ .withColumn("resources_used_mem_gb", convert_to_gb("resources_used_mem")) \ .withColumn("resource_list_nodect", expr("CAST(resource_list_nodect AS INTEGER)")) \ .withColumn("resource_list_cpu", expr("CAST(resource_list_cpu AS INTEGER)")) \ .withColumn("resource_list_gpu", expr("CAST(resource_list_gpu AS INTEGER)")) \ .withColumn("qtime", expr("CAST(qtime AS LONG)")) \ .withColumn("start", expr("CAST(start AS LONG)")) \ .withColumn("ctime", expr("CAST(qtime AS LONG)")) \ .withColumn("etime", expr("CAST(qtime AS LONG)")) \ .withColumn("end", expr("CAST(qtime AS LONG)")) \ .withColumn("exit_status", expr("CAST(exit_status AS INTEGER)")) \ .withColumnRenamed("group", "group_name") \ .withColumn("resource_list_cores", expr("CAST(resource_list_nodes as LONG) * CAST(resource_list_cpu as INTEGER)")) \ .withColumn("resources_used_walltime_hrs", expr("cast(round((resources_used_walltime_secs / 60.00 / 60.00), 3) as float)")) \ .withColumn("resources_used_cput_hrs", expr("cast(round((resources_used_walltime_secs / 60.00 / 60.00), 3) as float)")) \ .drop('resources_used_vmem', 'kvs', 'session', 'exec_host', 'resource_list_neednodes', 'resource_list_walltime', 'detail') # eventually drop detail and the asked resources to only use actually used torq = DynamicFrame.fromDF(with_map, glueContext, "joined") datasink5 = glueContext.write_dynamic_frame.from_options(frame=torq, connection_type="s3", connection_options={ "path": args['S3_OUTPUT_PATH'], "partitionKeys": ["year", "month", "day"]}, format="parquet", transformation_ctx="datasink5") job.commit()
"zipcode", 'size_of_adjusted_gross_income', 'num_of_returns', 'num_of_single_returns', 'num_of_joint_returns', 'num_of_head_of_household_returns', 'num_with_paid_preparers_signature', 'num_of_exemptions', 'num_of_dependents', 'num_of_volunteer_prepared_returns_Total', 'num_of_volunteer_prepared_returns_Num_of_volunteer_income_tax_assistance_prepared_returns', 'num_of_volunteer_prepared_returns_Num_of_tax_counseling_for_the_elderly_prepared_returns' ] #rename the columns for c, n in zip(income_ny_df.columns, new_cols): income_ny_df = income_ny_df.withColumnRenamed(c, n) print("new columns: ", income_ny_df.columns) income_ny_DyF = DynamicFrame.fromDF(income_ny_df, glueContext, "income_ny_DyF") income_ny_DyF.printSchema() # Print out information about this data print("Parks Count: ", parks_DyF.count()) parks_DyF.printSchema() # Print out information about this data. print("Playground Count: ", playgrounds_DyF.count()) playgrounds_DyF.printSchema() # Convert to Spark DataFrame for left outer join playgrounds_df = playgrounds_DyF.toDF() # Drop duplicate columns in parks dataframe columns_to_drop = ['Location', 'Name', 'year', 'month', 'day'] playgrounds_df = playgrounds_df.drop(*columns_to_drop)
import hashlib
import sys

from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def hash_cc(s):
    # sha256 operates on bytes, so encode the incoming string first.
    return hashlib.sha256(s.encode('utf-8')).hexdigest()

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="serverless-datalake", table_name="user-profile",
    transformation_ctx="datasource0")

## Convert the Glue DynamicFrame to a DataFrame to manipulate the columns
dataframe0 = DynamicFrame.toDF(datasource0)
hash_cc_f = udf(lambda x: hash_cc(x), StringType())
dataframe0 = dataframe0.withColumn("hash_cc", hash_cc_f(dataframe0["cc"])) \
    .withColumn("hash_ssn", hash_cc_f(dataframe0["ssn"]))
dataframe0 = dataframe0.drop('cc').drop('ssn').drop('password')

## Convert the DataFrame back to a Glue DynamicFrame and write the output in parquet format
datasource1 = DynamicFrame.fromDF(dataframe0, glueContext, "name1")
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=datasource1,
    connection_type="s3",
    connection_options={"path": "s3://serverless-datalake-ingestionbucket-1jiyskijz5i03/prepared/userprofile-secure"},
    format="parquet",
    transformation_ctx="datasink4")
job.commit()
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # get the dynamic frame source
    dyf_crm_contacts = glueContext.create_dynamic_frame.from_catalog(
        database='crm_native', table_name='contacts')
    dyf_crm_contacts = dyf_crm_contacts.select_fields(
        ['_key', 'Id', 'Code', 'Fullname', 'Address'])
    dyf_crm_contacts = dyf_crm_contacts.resolveChoice(specs=[('_key', 'cast:long')])

    dy_source_voxy_cache = dyf_crm_contacts.toDF()
    dy_source_voxy_cache = dy_source_voxy_cache.cache()
    dyf_crm_contacts = DynamicFrame.fromDF(dy_source_voxy_cache, glueContext, 'dyf_crm_contacts')

    # try:
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/flag_user_communication_full_name.parquet")
    #     read_from_index = df_flag.collect()[0]['flag']
    #     print('read from index: ', read_from_index)
    #     dyf_crm_contacts = Filter.apply(frame=dyf_crm_contacts,
    #                                     f=lambda x: x["_key"] > read_from_index)
    # except:
    #     print('read flag file error ')

    print('the number of new contacts: ', dyf_crm_contacts.count())
    if (dyf_crm_contacts.count() > 0):
        # print('dyf_crm_contacts::----------------')
        # dyf_crm_contacts.printSchema()
        # try:
        # ------------------------------------------------------------------------------------------------------------#
        dyf_crm_contacts = Filter.apply(
            frame=dyf_crm_contacts,
            f=lambda x: x["Id"] is not None and x["Id"] != ''
                        and x["Code"] is not None and x["Code"] != ''
                        and x["Fullname"] is not None and x["Fullname"] != '')
        # ------------------------------------------------------------------------------------------------------------#
        # today = date.today()
        # today_timestamp = today.timestamp()
        # print("Today's date:", today_timestamp)

        dy_crm_contacts = dyf_crm_contacts.toDF()
        dy_crm_contacts = dy_crm_contacts.dropDuplicates(['Code'])
        dy_crm_contacts = dy_crm_contacts.withColumn('communication_type_full_name', f.lit(4))
        dy_crm_contacts = dy_crm_contacts.withColumn('communication_type_address', f.lit(6))
        dy_crm_contacts = dy_crm_contacts.withColumn('is_primary', f.lit(1))
        dy_crm_contacts = dy_crm_contacts.withColumn('is_deleted', f.lit(0))
        dy_crm_contacts = dy_crm_contacts.withColumn('last_update_date', f.lit('2019-08-28 00:00:00'))

        dyf_crm_contacts = DynamicFrame.fromDF(dy_crm_contacts, glueContext, 'dyf_crm_contacts')
        dyf_crm_contacts = dyf_crm_contacts.resolveChoice(
            specs=[('last_update_date', 'cast:long')])

        applymapping2 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("communication_type_full_name", 'int', 'communication_type', 'int'),
                      ("is_primary", 'int', 'is_primary', 'int'),
                      ("is_deleted", 'int', 'is_deleted', 'int'),
                      ("Fullname", 'string', 'comunication', 'string'),
                      ("last_update_date", 'string', 'last_update_date', 'timestamp')])
        resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                             transformation_ctx="resolvechoice2")
        dropnullfields6 = DropNullFields.apply(frame=resolvechoice2,
                                               transformation_ctx="dropnullfields2")
        datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields6,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "user_communication",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/user/communication/fullname/",
            transformation_ctx="datasink4")

        dyf_crm_contacts = Filter.apply(
            frame=dyf_crm_contacts,
            f=lambda x: x["Address"] is not None and x["Address"] != '')
        # ------------------------------------------------------------------------------------------------------------#
        applymapping3 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("communication_type_address", 'int', 'communication_type', 'int'),
                      ("is_primary", 'int', 'is_primary', 'int'),
                      ("is_deleted", 'int', 'is_deleted', 'int'),
                      ("Address", 'string', 'comunication', 'string'),
                      ("last_update_date", 'string', 'last_update_date', 'timestamp')])
        resolvechoice3 = ResolveChoice.apply(frame=applymapping3, choice="make_cols",
                                             transformation_ctx="resolvechoice3")
        dropnullfields3 = DropNullFields.apply(frame=resolvechoice3,
                                               transformation_ctx="dropnullfields3")
        datasink3 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "user_communication",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/user/communication/address/",
            transformation_ctx="datasink3")
        # ------------------------------------------------------------------------------------------------------------#

        # get the max _key from the datasource
        datasource = dyf_crm_contacts.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        # overwrite the new flag to s3
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        df.write.parquet(
            "s3a://dts-odin/flag/flag_user_communication_full_name.parquet",
            mode="overwrite")
## @type: Filter
## @args: [f = lambda row : (bool(re.match("Match Finished", row["status"]))), transformation_ctx = "Transform2"]
## @return: Transform2
## @inputs: [frame = Transform7]
Transform2 = Filter.apply(frame=Transform7,
                          f=lambda row: bool(re.match("Match Finished", row["status"])),
                          transformation_ctx="Transform2")

## @type: ApplyMapping
## @args: [mappings = <same list as in the call below>, transformation_ctx = "Transform6"]
## @return: Transform6
## @inputs: [frame = Transform2]
Transform6 = ApplyMapping.apply(frame=Transform2, mappings=[
    ("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"),
    ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"),
    ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"),
    ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"),
    ("totalshotshometeam", "int", "totalshotshometeam", "int"),
    ("totalshotsawayteam", "int", "totalshotsawayteam", "int"),
    ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"),
    ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string"),
    ("idfixture", "long", "idfixture", "int"),
    ("date", "string", "date", "string"),
    ("time", "string", "time", "string"),
    ("idhometeam", "long", "idhometeam", "int"),
    ("idawayteam", "long", "idawayteam", "int"),
    ("goalshometeam", "long", "goalshometeam", "int"),
    ("goalsawayteam", "long", "goalsawayteam", "int")],
    transformation_ctx="Transform6")

## @type: Join
## @args: [columnConditions = ["=", "="], joinType = right, keys2 = ["idfixture", "idhometeam"], keys1 = ["(predictions) idfixture", "(predictions) idteam"], transformation_ctx = "Transform4"]
## @return: Transform4
## @inputs: [frame1 = Transform1, frame2 = Transform6]
Transform1DF = Transform1.toDF()
Transform6DF = Transform6.toDF()
Transform4 = DynamicFrame.fromDF(
    Transform1DF.join(Transform6DF,
                      (Transform1DF['(predictions) idfixture'] == Transform6DF['idfixture']) &
                      (Transform1DF['(predictions) idteam'] == Transform6DF['idhometeam']),
                      "right"),
    glueContext, "Transform4")

## @type: ApplyMapping
## @args: [mappings = <same list as in the call below>, transformation_ctx = "Transform5"]
## @return: Transform5
## @inputs: [frame = Transform4]
Transform5 = ApplyMapping.apply(frame=Transform4, mappings=[
    ("(predictions) xgoals", "double", "xgoalshometeam", "double"),
    ("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"),
    ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"),
    ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"),
    ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"),
    ("totalshotshometeam", "int", "totalshotshometeam", "int"),
    ("totalshotsawayteam", "int", "totalshotsawayteam", "int"),
    ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"),
    ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string"),
    ("idfixture", "int", "idfixture", "int"),
    ("date", "string", "date", "string"),
    ("time", "string", "time", "string"),
    ("idhometeam", "int", "idhometeam", "int"),
    ("idawayteam", "int", "idawayteam", "int"),
    ("goalshometeam", "int", "goalshometeam", "int"),
    ("goalsawayteam", "int", "goalsawayteam", "int")],
    transformation_ctx="Transform5")

## @type: Join
## @args: [columnConditions = ["=", "="], joinType = left, keys2 = ["(predictions) idfixture", "(predictions) idteam"], keys1 = ["idfixture", "idawayteam"], transformation_ctx = "Transform8"]
## @return: Transform8
## @inputs: [frame1 = Transform5, frame2 = Transform1]
Transform5DF = Transform5.toDF()
Transform1DF = Transform1.toDF()
Transform8 = DynamicFrame.fromDF(
    Transform5DF.join(Transform1DF,
                      (Transform5DF['idfixture'] == Transform1DF['(predictions) idfixture']) &
                      (Transform5DF['idawayteam'] == Transform1DF['(predictions) idteam']),
                      "left"),
    glueContext, "Transform8")

## @type: ApplyMapping
## @args: [mappings = [("date", "string", "date", "string"), ("(predictions) xgoals", "double", "xgoalsawayteam", "decimal"), ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"), ("totalshotsawayteam", "int", "totalshotsawayteam", "int"), ("totalshotshometeam", "int", "totalshotshometeam", "int"), ("xgoalshometeam", "double", "xgoalshometeam", "decimal"), ("idfixture", "int", "idfixture", "int"), ("goalshometeam", "int", "goalshometeam", "int"), ("idawayteam", "int", "idawayteam", "int"), ("goalsawayteam", "int", "goalsawayteam", "int"), ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"), ("idhometeam", "int", "idhometeam", "int"), ("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"), ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"), ("time", "string", "time", "string"), ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"), ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string")], transformation_ctx = "Transform0"]
## @return: Transform0
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

## @params: [JOB_NAME]
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

ds0 = glueContext.create_dynamic_frame.from_catalog(
    database="autoglues3lineage",
    table_name="train_sm_s2adb_csv",
    transformation_ctx="ds0")
ds3 = ds0.toDF()
ds3.createOrReplaceTempView("train_sm_s2adb_csv_temp2")
ds4 = spark.sql("SELECT * FROM train_sm_s2adb_csv_temp2 WHERE age > 30")
ds5 = DynamicFrame.fromDF(ds4, glueContext, "ds5")

ds6 = glueContext.write_dynamic_frame.from_options(
    frame=ds5,
    connection_type="redshift",
    connection_options={
        "url": "jdbc:redshift://redshift-cluster-1.csvp5wcqqxvw.us-east-1.redshift.amazonaws.com:5439/world",
        "dbtable": "atn.gluetable312"
    },
    transformation_ctx="ds6")
ds7 = glueContext.write_dynamic_frame.from_options(
    frame=ds5,
    connection_type="s3",
    connection_options={"path": "s3://asgqatestautomation4/Targetdata312"},
    format="json",
current_timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

######################################
####      CONNECTION BLOCK        ####
######################################
## argo_carrier_visit connection
argoCV_ds = glueContext.create_dynamic_frame.from_catalog(
    database="staging_initial",
    table_name="argo_carrier_visit",
    transformation_ctx="argoCV_ds")
argoCV_regDF = argoCV_ds.toDF()
argoCV_regDF = argoCV_regDF.withColumn("sourcesystem", lit("PNCT")) \
    .withColumn("dboperationtype", lit("L")) \
    .withColumn("audtdateadded", lit(current_timestamp))
argoCV_dynDF = DynamicFrame.fromDF(argoCV_regDF, glueContext, "nested")

## argo_chargeable_unit_events connection
argoCUE_ds = glueContext.create_dynamic_frame.from_catalog(
    database="staging_initial",
    table_name="argo_chargeable_unit_events",
    transformation_ctx="argoCUE_ds")
argoCUE_regDF = argoCUE_ds.toDF()
argoCUE_regDF = argoCUE_regDF.withColumn("sourcesystem", lit("PNCT")) \
    .withColumn("dboperationtype", lit("L")) \
    .withColumn("audtdateadded", lit(current_timestamp))
argoCUE_dynDF = DynamicFrame.fromDF(argoCUE_regDF, glueContext, "nested")

## argo_visit_details connection
inventory = glueContext.create_dynamic_frame.from_catalog(
    database=DATABASE, table_name=INVENTORY_TABLE).toDF()
filelist = glueContext.create_dynamic_frame.from_catalog(
    database=DATABASE, table_name=FILENAME_TABLE)
mapped = filelist.apply_mapping([
    ("archiveid", "string", "archiveid", "string"),
    ("override", "string", "override", "string")
]).toDF().dropDuplicates(['archiveid'])

rownum = inventory.withColumn(
    "row_num",
    row_number().over(
        Window.orderBy(inventory['creationdate'], inventory['archiveid'])).cast("long"))
merged = rownum.join(mapped, "archiveid", how='left_outer')
frame = DynamicFrame.fromDF(merged, glueContext, "merged")

def transform(rec):
    rec["part"] = rec["row_num"] // partiton_size
    rec["archivedescription"] = rec["override"] if rec["override"] and rec["override"].strip() \
        else rec["archivedescription"]
    rec.pop('override', None)
    return rec

trans0 = Map.apply(frame=frame, f=transform)
sink = glueContext.getSink(connection_type="s3",
                           path='s3://' + STAGING_BUCKET + '/partitioned/',
                           enableUpdateCatalog=True,
("passenger_count", "long", "passenger_count", "long"), ("trip_distance", "double", "trip_distance", "double"), ("pulocationid", "long", "pulocationid", "long"), ("dolocationid", "long", "dolocationid", "long"), ("fare_amount", "double", "fare_amount", "double"), ("tip_amount", "double", "tip_amount", "double"), ("total_amount", "double", "total_amount", "double")], transformation_ctx="applymapping1") resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice2") sparkdf = resolvechoice2.toDF() transform1 = sparkdf.where( func.col('tpep_pickup_datetime').between('2019-01-01', '2020-12-31')) transform2 = transform1.dropna(subset=['passenger_count', 'trip_distance']) result = DynamicFrame.fromDF(dataframe=transform2, glue_ctx=glueContext, name='result') datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=result, catalog_connection="redshift-east", connection_options={ "dbtable": "yellow", "database": "dev" }, redshift_tmp_dir=args["TempDir"], transformation_ctx="datasink4") job.commit()
def _find_row(paintings: DynamicFrame, episode_text: str):
    """Assert a given row exists in the dynamic frame and that it contains the expected values."""
    matches = paintings.filter(
        lambda x: x['season_episode_text'] == episode_text).toDF().collect()
    assert len(matches) == 1
    return matches[0]
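# A sketch of how the test helper above might be used, assuming a `paintings`
# DynamicFrame with a season_episode_text column; the episode value is made up.
row = _find_row(paintings, "S01E01")
assert row["season_episode_text"] == "S01E01"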
def main():
    def checknull(level_modified, level_study):
        if level_modified is not None:
            return level_modified
        else:
            return level_study

    checknull_ = udf(checknull, StringType())

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_package_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_package_status_code is not None:
            text_concat += str(student_package_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = udf(concaText, StringType())

    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")
    dyf_student_contact = dyf_student_contact.select_fields(
        ['student_id', 'contact_id', 'level_study'])

    dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_level_study")
    dyf_log_student_level_study = dyf_log_student_level_study.select_fields([
        'contact_id', 'level_current', 'level_modified', 'package_code', 'time_created'
    ])
    dyf_log_student_level_study = dyf_log_student_level_study.resolveChoice(
        specs=[('_key', 'cast:int')])

    dyf_tpe_invoice_product = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product")
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.select_fields([
        '_key', 'timecreated', 'user_id', 'buyer_id', 'invoice_packages_price',
        'invoice_price', 'invoice_code'
    ])
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.resolveChoice(
        specs=[('_key', 'cast:long')])

    dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product_details")
    dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
        ['cat_code', 'package_time', 'invoice_code'])

    dyf_student_package = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_package")
    # select the fields
    dyf_student_package = dyf_student_package.select_fields(
        ['student_id', 'start_time', 'end_time', 'package_code']).rename_field('student_id', 'student_id1')
    dyf_student_package.printSchema()
    dyf_student_package.show(2)

    # read the flag marker from s3
    try:
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)
        # compare the datasource _key with the flag; keep records with key > flag
        dyf_tpe_invoice_product = Filter.apply(
            frame=dyf_tpe_invoice_product, f=lambda x: x['_key'] > start_read)
    except:
        print('read flag file error ')

    print('the number of new contacts: ', dyf_tpe_invoice_product.count())
    if (dyf_tpe_invoice_product.count() > 0):
        df_log_student_level_study = dyf_log_student_level_study.toDF()
        df_log_student_level_study = df_log_student_level_study.groupby(
            'contact_id', 'level_current', 'level_modified',
            'package_code').agg(f.max('time_created').alias('time_created'))

        dyf_join0 = Join.apply(dyf_tpe_invoice_product,
                               dyf_tpe_invoice_product_details,
                               'invoice_code', 'invoice_code')
        print("@@@@@@@@@@@@")
        dyf_join0.printSchema()
        dyf_join0.show(2)

        dyf_log_student_level_study = DynamicFrame.fromDF(
            df_log_student_level_study, glueContext, "dyf_log_student_level_study")
        dyf_join1 = Join.apply(dyf_student_contact, dyf_join0, "contact_id", "user_id")
        dyf_join = Join.apply(dyf_join1, dyf_log_student_level_study, "user_id", "contact_id")
        print("@@@@@@@@@@@@")
        dyf_join.printSchema()
        dyf_join.show(2)

        dyf_join = Filter.apply(frame=dyf_join,
                                f=lambda x: x['time_created'] <= x['timecreated'])
        dyf_data_join3 = Join.apply(dyf_join, dyf_student_package, "student_id", "student_id1")
        dyf_data_join3 = Filter.apply(frame=dyf_data_join3,
                                      f=lambda x: x['package_code'] == x['cat_code'])

        df_data_join3 = dyf_data_join3.toDF()
        df_data_join3 = df_data_join3 \
            .withColumn("student_level_code",
                        checknull_(df_data_join3.level_modified, df_data_join3.level_study)) \
            .withColumn("behavior_id", f.lit(3)) \
            .withColumn("student_package_status_code", f.lit("DEACTIVED")) \
            .withColumn("student_behavior_date", from_unixtime(df_data_join3.timecreated)) \
            .withColumn("package_starttime", df_data_join3['start_time']) \
            .withColumn("package_endtime", df_data_join3['end_time']) \
            .withColumn("transformed_at", f.lit(None))
        df_data_join3 = df_data_join3.withColumn(
            'student_behavior_id',
            f.md5(
                concaText(df_data_join3.student_behavior_date,
                          df_data_join3.behavior_id,
                          df_data_join3.student_id,
                          df_data_join3.contact_id,
                          df_data_join3.package_code,
                          df_data_join3.package_endtime,
                          df_data_join3.package_starttime,
                          df_data_join3.student_level_code,
                          df_data_join3.student_package_status_code,
                          df_data_join3.transformed_at)))
        df_data_join3 = df_data_join3.dropDuplicates()

        dyf_data_join3 = DynamicFrame.fromDF(df_data_join3, glueContext, "dyf_data_join3")
        dyf_data_join3 = dyf_data_join3.resolveChoice(
            specs=[('behavior_id', 'cast:int'),
                   ('student_behavior_date', 'cast:timestamp')])
        dyf_data_join3.printSchema()
        dyf_data_join3.show(2)

        applymapping = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("student_behavior_id", "string", "student_behavior_id", "string"),
                      ("contact_id", "string", "contact_id", "string"),
                      ("student_behavior_date", "timestamp", "student_behavior_date", "long"),
                      ("student_id", "string", "student_id", "long"),
                      ("cat_code", "string", "package_code", "string"),
                      ("package_starttime", "int", "package_starttime", "long"),
                      ("package_endtime", "int", "package_endtime", "long"),
                      ("student_package_status_code", "string", "student_status_code", "string"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      ("student_level_code", "string", "student_level_code", "string")])
        resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                            transformation_ctx="resolvechoice")
        dropnullfields = DropNullFields.apply(frame=resolvechoice,
                                              transformation_ctx="dropnullfields")
        print(dropnullfields.count())
        dropnullfields.toDF().show()

        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        applymapping1 = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("invoice_packages_price", "int", "measure1", "long"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      ("invoice_price", "int", "measure2", "long")])
        resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                             transformation_ctx="resolvechoice1")
        dropnullfields1 = DropNullFields.apply(frame=resolvechoice1,
                                               transformation_ctx="dropnullfields1")
        print(dropnullfields1.count())
        dropnullfields1.toDF().show()

        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields1,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_general_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        df_tpe_invoice_product = dyf_tpe_invoice_product.toDF()
        flag = df_tpe_invoice_product.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key high-water mark in s3
        df.write.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet",
            mode="overwrite")
# extract out transactions for test/validation
n_train = int(transactions.count() * train_data_ratio)
test_ids = transactions.select_fields(TRANSACTION_ID)
get_fraud_frac = lambda series: 100 * sum(series) / len(series)
isfraud_df: DynamicFrame = transactions.select_fields("isFraud")
logger.info("Percent fraud for train transactions: {}".format(
    sum_col(transactions.toDF(), "isFraud")))
dump_df_to_s3(test_ids.toDF(), 'test', header=False)

id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols, cat_cols)

# Creating a glue dynamic frame from the spark dataframe
features_dynamic_df = DynamicFrame.fromDF(features_df, glueContext, 'FeaturesDF')
features_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    features_dynamic_df, [('~id', TRANSACTION_ID, 't')])
logger.info(f'Upserting transactions as vertices of graph...')
features_dynamic_df.toDF().foreachPartition(
    gremlin_client.upsert_vertices('Transaction', batch_size=50))

logger.info(f'Creating glue DF from labels dataframe')
labels_dynamic_df = DynamicFrame.fromDF(labels_df, glueContext, 'LabelsDF')
labels_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    labels_dynamic_df, [('~id', TRANSACTION_ID, 't')])
logger.info(f'Upserting transactions with isFraud property...')
labels_dynamic_df.toDF().foreachPartition(
    gremlin_client.upsert_vertices('Transaction', batch_size=100))

dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read a DynamicFrame.
dynf = glueContext.create_dynamic_frame.from_catalog(database="default",
                                                     table_name="sales",
                                                     transformation_ctx="dynf")
# Convert to a DataFrame.
df = dynf.toDF()
# Register the DataFrame as a temp table.
df.createOrReplaceTempView("sales_tmp")
# Run SQL.
sql_df = spark.sql(
    "SELECT id, date, store, state, product, amount * 2.1 FROM sales_tmp")
# Convert back to a DynamicFrame.
dynf_new = DynamicFrame.fromDF(sql_df, glueContext, "df")
datasink4 = glueContext.write_dynamic_frame.from_catalog(
    frame=dynf_new,
    database="default",
    table_name="sales1",
    transformation_ctx="datasink4")
# Commit.
job.commit()
                   specid,
                   systemcreationdate,
                   udblistingid,
                   to_date(effectiveto) AS effectiveto_date
            FROM edw_listings
            WHERE row_number_seq = 1
            ''')

#df_joined.cache()
df_joined.describe()
df_joined.printSchema()
#print(df_joined.count())

s3_location_target = 's3://move-dataeng-temp-dev/glue-etl/parquet_data/listingdim_pdt_deduped_pq'
output_folder = s3_location_target  # With absolute path
print('output_folder= %s' % output_folder)

#---- PySpark section ----
#df_joined.write.mode('overwrite').parquet(output_folder)
#df_joined.write.mode('overwrite').save(output_folder)
new_dynamic_frame = DynamicFrame.fromDF(df_joined, glueContext, "new_dynamic_frame")
codec = 'snappy'
#glueContext.write_dynamic_frame.from_options(frame = m_df, connection_type = "s3", connection_options = {"path": child_output_dir}, format = "parquet", compression=codec)
glueContext.write_dynamic_frame.from_options(
    frame=new_dynamic_frame,
    connection_type="s3",
    connection_options={"path": output_folder},
    format="parquet",
    compression=codec)
print('Done Parquet Conversion !')
def create_dynamic_frame_from_rdd(self, data, name, schema=None,
                                  sample_ratio=None, transformation_ctx=""):
    """Creates a DynamicFrame from an RDD."""
    # Build a Spark DataFrame from the RDD, then wrap it as a DynamicFrame.
    df = super(GlueContext, self).createDataFrame(data, schema, sample_ratio)
    return DynamicFrame.fromDF(df, self, name)
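# A minimal usage sketch for the helper above; the RDD contents and schema are
# illustrative assumptions, not taken from the surrounding code:
#
# from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# rdd = sc.parallelize([(1, 'a'), (2, 'b')])
# schema = StructType([StructField('id', IntegerType(), False),
#                      StructField('label', StringType(), True)])
# dyf = glueContext.create_dynamic_frame_from_rdd(rdd, 'example_dyf', schema=schema)
# dyf.printSchema()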
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

## Read data from the REST API into a DataFrame using the DataDirect Autonomous REST Connector JDBC driver
source_df = spark.read.format("jdbc").option(
    "url",
    "jdbc:datadirect:autorest:config=yelp.rest;AuthenticationMethod=HttpHeader;AuthHeader=Authorization;SecurityToken='Bearer JcMUtuWfaqJdWJBqqLrgBxfbYh6GIUGv3zUyXOG4zsfe6wnOtlZBeroFb8rpRM-dESFzcSAUd1YDAtQm2yl0hrJwfldvHp2AdEzRXThZku69r-w4wTv80Cj7d08ZXHYx'"
).option("dbtable", "AUTOREST.BUSINESSES").option(
    "driver", "com.ddtek.jdbc.autorest.AutoRESTDriver").load()
job.init(args['JOB_NAME'], args)
print(source_df)

## Convert the DataFrame to an AWS Glue DynamicFrame
dynamic_dframe = DynamicFrame.fromDF(source_df, glueContext, "dynamic_df")

## Write the DynamicFrame to S3 in CSV format. You can write it to any RDS/Redshift target by using a connection defined previously in Glue
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=dynamic_dframe,
    connection_type="s3",
    connection_options={"path": "s3://glueuserdata"},
    format="csv",
    transformation_ctx="datasink4")
job.commit()
#### CONNECTION BLOCK ####
######################################

## ref_bizunit_scoped connection
refBizScopedCon_ds = glueContext.create_dynamic_frame.from_catalog(
    database="nola_staging_initial",
    table_name="ref_bizunit_scoped",
    transformation_ctx="refBizScopedCon_ds")
refBizScopedCon_regDF = refBizScopedCon_ds.toDF()
# Add audit columns; current_timestamp() (not lit(current_timestamp)) yields the load time.
refBizScopedCon_regDF = refBizScopedCon_regDF.withColumn(
    "sourcesystem", lit("NOLA")).withColumn(
    "dboperationtype", lit("L")).withColumn(
    "audtdateadded", current_timestamp())
refBizScopedCon_distDF = refBizScopedCon_regDF.distinct()
refBizScopedCon_dynDF = DynamicFrame.fromDF(refBizScopedCon_distDF, glueContext, "nested")

## ref_carrier_itinerary connection
refCarItinCon_ds = glueContext.create_dynamic_frame.from_catalog(
    database="nola_staging_initial",
    table_name="ref_carrier_itinerary",
    transformation_ctx="refCarItinCon_ds")
refCarItinCon_regDF = refCarItinCon_ds.toDF()
refCarItinCon_regDF = refCarItinCon_regDF.withColumn(
    "sourcesystem", lit("NOLA")).withColumn(
    "dboperationtype", lit("L")).withColumn(
    "audtdateadded", current_timestamp())
refCarItinCon_distDF = refCarItinCon_regDF.distinct()
refCarItinCon_dynDF = DynamicFrame.fromDF(refCarItinCon_distDF, glueContext, "nested")
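# The two connection blocks above differ only in the table name; a hedged
# refactor sketch (the helper name load_nola_table is hypothetical, everything
# else reuses the calls already made above):
#
# from pyspark.sql.functions import lit, current_timestamp
#
# def load_nola_table(table_name, ctx_name):
#     dyf = glueContext.create_dynamic_frame.from_catalog(
#         database="nola_staging_initial", table_name=table_name,
#         transformation_ctx=ctx_name)
#     df = (dyf.toDF()
#           .withColumn("sourcesystem", lit("NOLA"))
#           .withColumn("dboperationtype", lit("L"))
#           .withColumn("audtdateadded", current_timestamp())
#           .distinct())
#     return DynamicFrame.fromDF(df, glueContext, "nested")
#
# refBizScopedCon_dynDF = load_nola_table("ref_bizunit_scoped", "refBizScopedCon_ds")
# refCarItinCon_dynDF = load_nola_table("ref_carrier_itinerary", "refCarItinCon_ds")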
# Cast choice values into integers; values that cannot be cast become null.
medicare_res_cast = medicare_dyf.resolveChoice(specs=[('provider id', 'cast:long')])
medicare_res_project = medicare_dyf.resolveChoice(specs=[('provider id', 'project:long')])
medicare_res_make_cols = medicare_dyf.resolveChoice(specs=[('provider id', 'make_cols')])
medicare_res_make_struct = medicare_dyf.resolveChoice(specs=[('provider id', 'make_struct')])

# Spark SQL on a Spark dataframe
medicare_df = medicare_dyf.toDF()
medicare_df.createOrReplaceTempView("medicareTable")
medicare_sql_df = spark.sql(
    "SELECT * FROM medicareTable WHERE `total discharges` > 30")
medicare_sql_dyf = DynamicFrame.fromDF(medicare_sql_df, glueContext, "medicare_sql_dyf")

# Write it out in Json
glueContext.write_dynamic_frame.from_options(
    frame=medicare_res_cast,
    connection_type="s3",
    connection_options={"path": medicare_cast},
    format="json")
glueContext.write_dynamic_frame.from_options(
    frame=medicare_res_project,
    connection_type="s3",
    connection_options={"path": medicare_project},
    format="json")
glueContext.write_dynamic_frame.from_options(
    frame=medicare_res_make_cols,
    connection_type="s3",
    # The source snippet is truncated here; the path variable below is an
    # assumed parallel of medicare_cast/medicare_project.
    connection_options={"path": medicare_make_cols},
    format="json")
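# For reference, per the AWS Glue DynamicFrame documentation, the four
# resolveChoice strategies used above behave as follows:
# - cast:long    casts every value to long; values that cannot be cast become null.
# - project:long keeps only values already of type long; other-typed values become null.
# - make_cols    splits the choice column into one column per type,
#                e.g. `provider id_long` and `provider id_string`.
# - make_struct  replaces the column with a struct holding one field per type.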
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # get dynamic frame source
    is_dev = True
    limit = True

    # source database
    dyf_crm_contacts = glueContext.create_dynamic_frame.from_catalog(
        database='crm_native',
        table_name='contacts')
    # dyf_crm_contacts = Filter.apply(frame=dyf_crm_contacts,
    #                                 f=lambda x: x["Id"] < 1102)

    dyf_crm_contacts = dyf_crm_contacts.resolveChoice(specs=[('Id', 'cast:int')])
    print('dyf_crm_contacts')
    dyf_crm_contacts.printSchema()

    # Read the checkpoint flag from S3 and keep only contacts newer than it.
    try:
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/flag_user_profile.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_crm_contacts = Filter.apply(frame=dyf_crm_contacts,
                                        f=lambda x: x["Id"] > read_from_index)
    except:
        print('read flag file error ')

    crm_contacts_number = dyf_crm_contacts.count()
    print('the number of new contacts: ', crm_contacts_number)
    if crm_contacts_number < 1:
        print('Stopping--- crm_contacts_number < 1')
        return

    dyf_crm_contacts = dyf_crm_contacts.select_fields(
        ['_key', 'Id', 'Code', 'Birthday', 'Gender', 'Job', 'CreatedDate'])

    dy_crm_contacts_cache = dyf_crm_contacts.toDF()
    dy_crm_contacts_cache = dy_crm_contacts_cache.dropDuplicates(['Code'])
    dy_crm_contacts_cache = dy_crm_contacts_cache.cache()
    dyf_crm_contacts = DynamicFrame.fromDF(dy_crm_contacts_cache, glueContext,
                                           'dyf_crm_contacts')

    today = date.today()
    d4 = today.strftime("%Y-%m-%d")
    print("d4 =", d4)

    # Keep only rows with a non-empty Id and Code.
    dyf_crm_contacts = Filter.apply(
        frame=dyf_crm_contacts,
        f=lambda x: x["Id"] is not None and x["Id"] != ''
                    and x["Code"] is not None and x["Code"] != '')

    if dyf_crm_contacts.count() > 0:
        dy_crm_contacts = dyf_crm_contacts.toDF()
        dy_crm_contacts = dy_crm_contacts.withColumn('source_type', f.lit(1))
        dy_crm_contacts = dy_crm_contacts.withColumn('is_root', f.lit(1))
        dy_crm_contacts = dy_crm_contacts.withColumn('description', f.lit(d4))
        dy_crm_contacts = dy_crm_contacts.withColumn('last_update_date', f.lit(d4))
        dy_crm_contacts_cache_2 = dy_crm_contacts.cache()
        dyf_crm_contacts = DynamicFrame.fromDF(dy_crm_contacts_cache_2, glueContext,
                                               'dyf_crm_contacts')

        applymapping2 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("Gender", 'int', 'gender', 'string'),
                      ("is_root", 'int', 'is_root', 'int'),
                      ("Birthday", 'string', 'birthday', 'date'),
                      ("Job", 'string', 'job', 'string'),
                      ("last_update_date", 'string', 'last_update_date', 'timestamp')])
        resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                             transformation_ctx="resolvechoice2")
        dropnullfields6 = DropNullFields.apply(frame=resolvechoice2,
                                               transformation_ctx="dropnullfields2")
        datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields6,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "user_profile",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/user/profile/",
            transformation_ctx="datasink1")

        # insert into source_id
        applymapping3 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("source_type", 'int', 'source_type', 'int'),
                      ("Code", 'string', 'source_id', 'string'),
                      ("description", 'string', 'description', 'string')])
        resolvechoice3 = ResolveChoice.apply(frame=applymapping3, choice="make_cols",
                                             transformation_ctx="resolvechoice3")
        dropnullfields7 = DropNullFields.apply(frame=resolvechoice3,
                                               transformation_ctx="dropnullfields3")
        # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(
        #     frame=dropnullfields7,
        #     catalog_connection="glue_redshift",
        #     connection_options={"dbtable": "user_map", "database": "dts_odin"},
        #     redshift_tmp_dir="s3n://dts-odin/temp/user/map/",
        #     transformation_ctx="datasink5")

        # Take the max Id from the data source as the new checkpoint flag.
        flag = dy_crm_contacts_cache.agg({"Id": "max"}).collect()[0][0]
        # Overwrite the new flag on S3.
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "int").toDF('flag')
        df.write.parquet("s3a://dtsodin/flag/flag_user_profile.parquet",
                         mode="overwrite")

        dy_crm_contacts_cache_2.unpersist()
        dy_crm_contacts_cache.unpersist()
def main():
    today = datetime.now(ho_chi_minh_timezone)
    print('today: ', today)
    yesterday = today - timedelta(1)
    today_id = int(today.strftime("%Y%m%d"))
    yesterday_id = int(yesterday.strftime("%Y%m%d"))
    print('today_id: ', today_id)
    print('yesterday_id: ', yesterday_id)

    lastest_number_days = 30
    chosen_word_number = 24
    # yesterday is rebound here from date.today(), a plain date, which is the
    # value used by udf_get_date_list further down.
    yesterday = date.today() - timedelta(1)
    yesterday_id = int(yesterday.strftime("%Y%m%d"))
    lasted_30_day = today - timedelta(lastest_number_days)
    lasted_30_day_id = int(lasted_30_day.strftime("%Y%m%d"))

    StructPlusNumber = StructType([
        StructField("lo_plus_number", LongType(), False),
        StructField("learning_object_id", LongType(), False),
        StructField("learning_last_date_id", LongType(), False)
    ])

    def getBestWords(plus_number_pair_list):
        # Sort descending by plus number and keep the top chosen_word_number pairs.
        plus_number_pair_list = sorted(plus_number_pair_list,
                                       key=lambda x: x['lo_plus_number'],
                                       reverse=True)
        return plus_number_pair_list[0:chosen_word_number]

    getBestWords = udf(getBestWords, ArrayType(StructPlusNumber))

    # --------------------------------------------------------
    StructMinusNumber = StructType([
        StructField("lo_minus_number", LongType(), False),
        StructField("learning_object_id", LongType(), False),
        StructField("learning_last_date_id", LongType(), False)
    ])

    def getWorstWords(minus_number_pair_list):
        # Sort descending by minus number and keep the top chosen_word_number pairs.
        minus_number_pair_list = sorted(minus_number_pair_list,
                                        key=lambda x: x['lo_minus_number'],
                                        reverse=True)
        return minus_number_pair_list[0:chosen_word_number]

    getWorstWords = udf(getWorstWords, ArrayType(StructMinusNumber))

    # ----------------------------------------
    if IS_DEV:
        dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "mapping_lo_student_history_test",
                "redshiftTmpDir": "s3://dts-odin/temp1/mapping_lo_student_history_test/v9"
            })
    else:
        # dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
        #     database="nvn_knowledge",
        #     table_name="mapping_lo_student_history",
        #     additional_options={"path": "s3://dts-odin/nvn_knowledge/mapping_lo_student_history/*/*"},
        #     push_down_predicate="(partition_0=='starter_ait' or partition_0=='starter_micro')"
        # )
        dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
            database="nvn_knowledge",
            table_name="mapping_lo_student_history",
            additional_options={
                "path": "s3://dts-odin/nvn_knowledge/mapping_lo_student_history/*/*"
            },
            push_down_predicate="(partition_0=='starter_micro')")

    dyf_mapping_lo_student_history = dyf_mapping_lo_student_history.select_fields([
        'student_id', 'learning_object_id', 'minus_number', 'plus_number',
        'lu_type', 'created_date_id'
    ])

    if not IS_DEV:
        dyf_mapping_lo_student_history = Filter.apply(
            frame=dyf_mapping_lo_student_history,
            f=lambda x: x["student_id"] is not None and x["student_id"] != 0
                        and x["learning_object_id"] is not None
                        and x["created_date_id"] >= lasted_30_day_id
                        and x["lu_type"] == 1)

    if IS_DEV:
        print('dyf_mapping_lo_student_history')
        # dyf_mapping_lo_student_history.printSchema()
        # dyf_mapping_lo_student_history.show(3)

    df_mapping_lo_student_history = dyf_mapping_lo_student_history.toDF()
    df_mapping_lo_student_history = df_mapping_lo_student_history.cache()
    # print('df_mapping_lo_student_history: ', df_mapping_lo_student_history.count())
    if df_mapping_lo_student_history.count() < 1:
        return

    df_group_plus_minus_number = df_mapping_lo_student_history.groupby(
        'student_id', 'learning_object_id').agg(
            f.sum('plus_number').alias('lo_plus_number'),
            f.sum('minus_number').alias('lo_minus_number'),
            f.max('created_date_id').alias('learning_last_date_id'))

    # print('df_group_plus_minus_number')
    df_group_plus_minus_number.printSchema()
    df_group_plus_minus_number.show(3)

    df_group_plus_minus_number = df_group_plus_minus_number.na.fill({
        'lo_plus_number': 0,
        'lo_minus_number': 0
    })

    # Make sure plus and minus do not overlap: keep whichever side is larger
    # and zero out the other.
    df_group_plus_minus_number = df_group_plus_minus_number \
        .select(
            'student_id',
            'learning_object_id',
            f.when(f.col('lo_plus_number') >= f.col('lo_minus_number'),
                   f.col('lo_plus_number')).otherwise(0).alias('lo_plus_number'),
            f.when(f.col('lo_plus_number') < f.col('lo_minus_number'),
                   f.col('lo_minus_number')).otherwise(0).alias('lo_minus_number'),
            'learning_last_date_id'
        )

    df_group_plus_minus_number = df_group_plus_minus_number.select(
        'student_id',
        f.struct('lo_plus_number', 'learning_object_id',
                 'learning_last_date_id').alias('plus_number_pair'),
        f.struct('lo_minus_number', 'learning_object_id',
                 'learning_last_date_id').alias('minus_number_pair'))

    df_group_l2 = df_group_plus_minus_number.groupby('student_id').agg(
        f.collect_list('plus_number_pair').alias('plus_number_pair_list'),
        f.collect_list('minus_number_pair').alias('minus_number_pair_list'))
    print('df_group_l2')
    df_group_l2.printSchema()
    df_group_l2.show(2)

    df_group_l2 = df_group_l2.withColumn('right_list', getBestWords(df_group_l2.plus_number_pair_list)) \
        .withColumn('wrong_list', getWorstWords(df_group_l2.minus_number_pair_list))
    print('df_group_l2---')
    df_group_l2.printSchema()
    df_group_l2.show(1)

    df_group_l2_right = df_group_l2.select(
        'student_id', f.explode('right_list').alias('str_right_item'))
    df_group_l2_wrong = df_group_l2.select(
        'student_id', f.explode('wrong_list').alias('str_wrong_item'))

    df_group_l2_right = df_group_l2_right.select(
        'student_id',
        f.col('str_right_item').getItem("lo_plus_number").alias("learning_object_number"),
        f.col('str_right_item').getItem("learning_object_id").alias("learning_object_id"),
        f.col('str_right_item').getItem("learning_last_date_id").alias("learning_last_date_id"),
        f.lit(1).cast('long').alias("number_type"))
    df_group_l2_right = df_group_l2_right.filter(
        df_group_l2_right.learning_object_number.isNotNull())

    df_group_l2_wrong = df_group_l2_wrong.select(
        'student_id',
        f.col('str_wrong_item').getItem("lo_minus_number").alias("learning_object_number"),
        f.col('str_wrong_item').getItem("learning_object_id").alias("learning_object_id"),
        f.col('str_wrong_item').getItem("learning_last_date_id").alias("learning_last_date_id"),
        f.lit(-1).cast('long').alias("number_type"))
    df_group_l2_wrong = df_group_l2_wrong.filter(
        (df_group_l2_wrong.learning_object_number.isNotNull())
        & (df_group_l2_wrong.learning_object_number != 0))

    print('df_group_l2_right')
    df_group_l2_right.printSchema()
    df_group_l2_right.show(2)
    print('df_group_l2_wrong')
    df_group_l2_wrong.printSchema()
    df_group_l2_wrong.show(2)

    total_plus_minus = df_group_l2_right.union(df_group_l2_wrong)

    # add created_date_id
    total_plus_minus = total_plus_minus.withColumn(
        'created_date_ids', udf_get_date_list(f.lit(yesterday)))
    total_plus_minus = total_plus_minus \
        .select(
            'student_id',
            'learning_object_number',
            'learning_object_id',
            'learning_last_date_id',
            'number_type',
            f.explode('created_date_ids').alias('created_date_id')
        )
    print('total_plus_minus')
    total_plus_minus.printSchema()

    dyf_total_plus_minus = DynamicFrame.fromDF(total_plus_minus, glueContext,
                                               'dyf_total_plus_minus')

    # Clear yesterday-and-later rows on Redshift before re-inserting them.
    clear_before_saving = 'DELETE student_phonetic_number_history where created_date_id >= ' + str(yesterday_id)

    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dyf_total_plus_minus,
        catalog_connection="glue_redshift",
        connection_options={
            "preactions": clear_before_saving,
            "dbtable": "student_phonetic_number_history",
            "database": "dts_odin"
        },
        redshift_tmp_dir="s3://dts-odin/temp/nvn/knowledge/student_phonetic_number_history/v4",
        transformation_ctx="datasink4")

    df_mapping_lo_student_history.unpersist()
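# A note on the write above: "preactions" is a documented connection option of
# the Glue Redshift writer, a semicolon-separated list of SQL statements that
# run on Redshift before the data is loaded. Deleting rows with
# created_date_id >= yesterday_id before re-inserting them is what makes this
# daily job safe to re-run for the same day.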
def backup_h2472_rating_answer():
    dyf_jh2472_rating_answer = glueContext \
        .create_dynamic_frame.from_catalog(database="do_h2472",
                                           table_name="rating_answer")
    if is_dev:
        print('dyf_jh2472_rating_answer')
        dyf_jh2472_rating_answer.printSchema()
        dyf_jh2472_rating_answer.show(3)

    # Source schema:
    # root
    # |-- id: string
    # |-- rating: float
    # |-- rating_date: string
    # |-- rating_user: string
    # |-- answer_id: string
    # |-- _key: string
    # |-- _table: string
    # |-- _schema: string

    dyf_jh2472_rating_answer = dyf_jh2472_rating_answer.resolveChoice(
        specs=[('id', 'cast:long'), ('rating', 'cast:double')])
    # dyf_jh2472_rating_answer = Filter.apply(frame=dyf_jh2472_rating_answer,
    #                                         f=lambda x: x["id"] > 26139)

    df_jh2472_rating_answer = dyf_jh2472_rating_answer.toDF()
    df_jh2472_rating_answer = df_jh2472_rating_answer.dropDuplicates(['id'])
    dyf_jh2472_rating_answer = DynamicFrame.fromDF(df_jh2472_rating_answer,
                                                   glueContext,
                                                   'dyf_jh2472_rating_answer')

    applymapping1 = ApplyMapping.apply(
        frame=dyf_jh2472_rating_answer,
        mappings=[("id", 'long', 'id', 'long'),
                  ("rating", "double", "rating", "double"),
                  ("rating_date", "string", "rating_date", "timestamp"),
                  ("rating_user", "string", "rating_user", "string"),
                  ("answer_id", 'string', 'answer_id', 'long')])

    resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                         transformation_ctx="resolvechoice1")
    if is_dev:
        print('resolvechoice1')
        resolvechoice1.printSchema()
        resolvechoice1.show(3)

    datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=resolvechoice1,
        catalog_connection="h2474_backup",
        connection_options={
            "dbtable": "rating_answer",
            "database": "topicaH2472"
        },
        redshift_tmp_dir="s3a://dts-odin/topicaH2472/rating_answer",
        transformation_ctx="datasink5")
trimmedLEOriginRequestLogs = DropFields.apply(
    frame=labdaEdgeOriginRequestLogs,
    paths=["executionregion", "distributionid", "distributionname",
           "requestdata", "customtraceid", "eventtype", "year", "month",
           "date", "hour"],
    transformation_ctx="trimmedLEOriginRequestLogs")

## Rename the requestid field of the Lambda@Edge origin-request logs to origin_requestid
modifiedLEOriginRequestLogs = RenameField.apply(
    frame=trimmedLEOriginRequestLogs,
    old_name="requestid",
    new_name="origin_requestid",
    transformation_ctx="modifiedLEOriginRequestLogs")

## Convert both sides to DataFrames
modifiedLEOriginRequestLogsDF = modifiedLEOriginRequestLogs.toDF()
modifiedLEViewerRequestLogsDF = modifiedLEViewerRequestLogs.toDF()

## Join (left outer) the Lambda@Edge viewer-request logs with the origin-request logs on the request id
combinedLambdaEdgeLogsDF = modifiedLEViewerRequestLogsDF.join(
    modifiedLEOriginRequestLogsDF,
    modifiedLEViewerRequestLogsDF["requestid"] == modifiedLEOriginRequestLogsDF["origin_requestid"],
    "left_outer")

## Convert back to a DynamicFrame
combinedLambdaEdgeLogs = DynamicFrame.fromDF(combinedLambdaEdgeLogsDF,
                                             glueContext, "combinedLambdaEdgeLogs")
## Join.apply would equi-join (inner), dropping viewer requests with no matching origin request:
#combinedLambdaEdgeLogs = Join.apply(modifiedLEViewerRequestLogs, modifiedLEOriginRequestLogs, 'requestid', 'origin_requestid')

## Drop the origin_requestid field
lambdaEdgeLogs = DropFields.apply(frame=combinedLambdaEdgeLogs,
                                  paths=["origin_requestid"],
                                  transformation_ctx="lambdaEdgeLogs")

## Drop the "year", "month", "date", "hour" fields
trimmedLambdaEdgeLogs = DropFields.apply(
    frame=lambdaEdgeLogs,
    paths=["year", "month", "date", "hour", "useragentstring"],
    transformation_ctx="trimmedLambdaEdgeLogs")

## Convert to a DataFrame
trimmedLambdaEdgeLogsDF = trimmedLambdaEdgeLogs.toDF()

# Destination S3 location for the combined Lambda@Edge logs
leLogDestPath = "s3://" + args['target_s3_bucket'] + "/combined/lelogs"
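# The snippet ends after computing leLogDestPath; a hedged sketch of the write
# that would presumably follow (the parquet format and the conversion back to
# a DynamicFrame are assumptions, not from the source):
#
# combinedLogsDyf = DynamicFrame.fromDF(trimmedLambdaEdgeLogsDF, glueContext,
#                                       "combinedLogsDyf")
# glueContext.write_dynamic_frame.from_options(
#     frame=combinedLogsDyf,
#     connection_type="s3",
#     connection_options={"path": leLogDestPath},
#     format="parquet")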