def main(args, yesterday):
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)

    database = args['database_name']
    table_name = '{0}kinesis_data'.format(args['table_prefix'])
    partition_predicate = \
        'partition_0="{0}" and partition_1="{1:02}" and partition_2="{2:02}"'.format(
            yesterday.year, yesterday.month, yesterday.day)

    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database=database,
        table_name=table_name,
        push_down_predicate=partition_predicate,
        transformation_ctx='datasource0')

    if datasource0.count() > 0:
        applymapping1 = ApplyMapping.apply(
            frame=datasource0,
            mappings=[('device_id', 'string', 'device_id', 'string'),
                      ('timestamp', 'long', 'timestamp', 'long')],
            transformation_ctx='applymapping1')

        df = applymapping1.toDF()
        dev = df.drop_duplicates(['device_id'])
        device_list = dev.collect()

        result_list = []
        for device in device_list:
            device_id = device['device_id']
            df2 = df[df.device_id == device_id]
            df_list = df2.sort('timestamp', ascending=True).collect()
            result_list.extend(create_diff_timestamp_list(device_id, df_list))

        df = spark.createDataFrame(result_list)
        dyf = DynamicFrame.fromDF(df, glueContext, 'dyf')

        # Hive-style partitioned output path
        output_path = os.path.join(args['target_bucket'],
                                   'year={0}'.format(yesterday.year),
                                   'month={0:02}'.format(yesterday.month),
                                   'day={0:02}'.format(yesterday.day))

        glueContext.write_dynamic_frame.from_options(
            frame=dyf,
            connection_type='s3',
            connection_options={'path': output_path},
            format='parquet',
            transformation_ctx='datasink2')

    job.commit()
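# main() above calls a helper, create_diff_timestamp_list, that is not shown in
# this snippet. The sketch below is an assumption of what it might look like: it
# walks one device's records (already sorted by timestamp) and emits rows that
# carry the gap to the previous record. The 'diff_timestamp' key is hypothetical;
# only device_id and timestamp come from the snippet above.
def create_diff_timestamp_list(device_id, df_list):
    result = []
    prev_timestamp = None
    for row in df_list:
        current = row['timestamp']
        # Gap to the previous record for the same device; 0 for the first record.
        diff = current - prev_timestamp if prev_timestamp is not None else 0
        result.append({'device_id': device_id,
                       'timestamp': current,
                       'diff_timestamp': diff})
        prev_timestamp = current
    return result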
def mappingForAll(dynamicFrame, mapping):
    applymapping2 = ApplyMapping.apply(frame=dynamicFrame, mappings=mapping)
    resolvechoice2 = ResolveChoice.apply(frame=applymapping2,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dyf_communication_after_mapping = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping
def mappingForAll(dynamicFrame, mapping):
    # for k, v in add_collum.items():
    #     df_communication = df_communication.withColumn(k, v)
    applymapping2 = ApplyMapping.apply(frame=dynamicFrame, mappings=mapping)
    resolvechoice2 = ResolveChoice.apply(frame=applymapping2,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dyf_communication_after_mapping = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping
def retrieve_dynamic_frame(glue_context, spark, data, header, name, mapping):
    data_frame = spark.createDataFrame(data, header)
    dynamic_frame = DynamicFrame.fromDF(data_frame, glue_context, name)
    display(dynamic_frame, "retrieve" + name)
    apply_dynamic_frame = ApplyMapping.apply(frame=dynamic_frame, mappings=mapping)
    dyf_result = ResolveChoice.apply(frame=apply_dynamic_frame,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")
    display(dyf_result, "apply" + name)
    return dyf_result
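# retrieve_dynamic_frame() calls a display() helper that is not defined in this
# snippet. A minimal sketch, assuming it only labels and previews a DynamicFrame:
def display(dynamic_frame, label):
    print(label)
    dynamic_frame.printSchema()
    dynamic_frame.toDF().show(5)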
def mappingForAll(dynamicFrame, mapping, add_collum):
    df_communication = dynamicFrame.toDF()
    df_communication = df_communication.dropDuplicates()
    for k, v in add_collum.items():
        df_communication = df_communication.withColumn(k, v)
    dyf_communication = DynamicFrame.fromDF(df_communication, glueContext,
                                            'dyf_communication')
    applymapping2 = ApplyMapping.apply(frame=dyf_communication, mappings=mapping)
    resolvechoice2 = ResolveChoice.apply(frame=applymapping2,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dyf_communication_after_mapping = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping
def mappingForAll(dynamicFrame, mapping):
    df = dynamicFrame.toDF()
    df = df.dropDuplicates()
    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    print("-------------------------------------------------")
    dyf.printSchema()
    print(mapping)
    applymapping2 = ApplyMapping.apply(frame=dyf, mappings=mapping)
    resolvechoice2 = ResolveChoice.apply(frame=applymapping2,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dyf_mapping = DropNullFields.apply(frame=resolvechoice2,
                                       transformation_ctx="dropnullfields2")
    return dyf_mapping
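# A minimal usage sketch for the mappingForAll variants above. The database,
# table, and mapping below are made-up examples; the helpers assume a
# module-level glueContext, so the caller only supplies the frame and mapping.
example_mapping = [
    # (source name, source type, target name, target type)
    ('device_id', 'string', 'device_id', 'string'),
    ('timestamp', 'long', 'event_time', 'long'),
]
dyf_source = glueContext.create_dynamic_frame.from_catalog(
    database='example_db', table_name='example_table')  # hypothetical catalog entries
dyf_mapped = mappingForAll(dyf_source, example_mapping)
dyf_mapped.printSchema()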
## @type: ApplyMapping
## @args: [mapping = [("col0", "string", "col0", "string"), ("col1", "string", "col1", "string"), ("col2", "string", "col2", "string"), ("col3", "string", "col3", "string"), ("col4", "string", "col4", "string"), ("col5", "string", "col5", "string"), ("col6", "string", "col6", "string"), ("col7", "string", "col7", "string"), ("col8", "string", "col8", "string"), ("col9", "string", "col9", "string"), ("col10", "string", "col10", "string"), ("col11", "string", "col11", "string"), ("col12", "string", "col12", "string"), ("col13", "string", "col13", "string"), ("col14", "string", "col14", "string"), ("col15", "string", "col15", "string"), ("col16", "string", "col16", "string")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[
        ("col0", "string", "col0", "string"),
        ("col1", "string", "col1", "string"),
        ("col2", "string", "col2", "string"),
        ("col3", "string", "col3", "string"),
        ("col4", "string", "col4", "string"),
        ("col5", "string", "col5", "string"),
        ("col6", "string", "col6", "string"),
        ("col7", "string", "col7", "string"),
        ("col8", "string", "col8", "string"),
        ("col9", "string", "col9", "string"),
        ("col10", "string", "col10", "string"),
        ("col11", "string", "col11", "string"),
        ("col12", "string", "col12", "string"),
        ("col13", "string", "col13", "string"),
        ("col14", "string", "col14", "string"),
        ("col15", "string", "col15", "string"),
        ("col16", "string", "col16", "string"),
    ],
    transformation_ctx="applymapping1",
)
## @type: ResolveChoice
## @args: [choice = "make_cols", transformation_ctx = "resolvechoice2"]
## @return: resolvechoice2
## @inputs: [frame = applymapping1]
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="dotz_challenge_raw",
    table_name="price_quote_csv",
    transformation_ctx="datasource0",
)
## @type: ApplyMapping
## @args: [mapping = [("tube_assembly_id", "string", "tube_assembly_id", "string"), ("supplier", "string", "supplier", "string"), ("quote_date", "string", "quote_date", "string"), ("annual_usage", "long", "annual_usage", "long"), ("min_order_quantity", "long", "min_order_quantity", "long"), ("bracket_pricing", "string", "bracket_pricing", "string"), ("quantity", "long", "quantity", "long"), ("cost", "double", "cost", "double")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[
        ("tube_assembly_id", "string", "tube_assembly_id", "string"),
        ("supplier", "string", "supplier", "string"),
        ("quote_date", "string", "quote_date", "string"),
        ("annual_usage", "long", "annual_usage", "long"),
        ("min_order_quantity", "long", "min_order_quantity", "long"),
        ("bracket_pricing", "string", "bracket_pricing", "string"),
        ("quantity", "long", "quantity", "long"),
        ("cost", "double", "cost", "double"),
    ],
    transformation_ctx="applymapping1",
)
## @type: ResolveChoice
## @args: [choice = "make_cols", transformation_ctx = "resolvechoice2"]
## @return: resolvechoice2
## @inputs: [frame = applymapping1]
resolvechoice2 = ResolveChoice.apply(
    frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice2"
)
## @type: DropNullFields
## @args: [transformation_ctx = "dropnullfields3"]
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="demo-mysql",
    table_name="salesdb_supplier",
    transformation_ctx="datasource0")
print("MySQL Rows read : " + str(datasource0.count()))
datasource0.toDF().show()
## @type: ApplyMapping
## @args: [mapping = [("country", "string", "country", "string"), ("address", "string", "address", "string"), ("name", "string", "name", "string"), ("state", "string", "state", "string"), ("supplier_id", "int", "supplier_id", "int"), ("city", "string", "city", "string"), ("phone", "string", "phone", "string")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[("COUNTRY", "string", "country", "string"),
              ("ADDRESS", "string", "address", "string"),
              ("NAME", "string", "name", "string"),
              ("STATE", "string", "state", "string"),
              ("SUPPLIER_ID", "int", "supplier_id", "int"),
              ("CITY", "string", "city", "string"),
              ("PHONE", "string", "phone", "string")],
    transformation_ctx="applymapping1")
print("Rows after ApplyMapping: " + str(applymapping1.count()))
applymapping1.toDF().show()
## @type: SelectFields
## @args: [paths = ["country", "address", "city", "phone", "name", "state", "supplier_id"], transformation_ctx = "selectfields2"]
## @return: selectfields2
## @inputs: [frame = applymapping1]
selectfields2 = SelectFields.apply(
    frame=applymapping1,
    paths=["country", "address", "city", "phone", "name", "state", "supplier_id"],
    transformation_ctx="selectfields2")
def main(): """This script will load data from the supplied DynamoDB Table to S3 so it can be analyzed with Athena""" if (JOB_TYPE == "issues"): DDB_TABLE_NAME = DDB_ISSUES_TABLE_NAME GLUE_TABLE_NAME = GLUE_ISSUES_TABLE_NAME FIELD_PATHS = [ "eventid", "acknowledged", "created", "sitename", "issuesource", "priority", "areaname#status#processname#eventdescription#stationname#devicename#created", "version", "devicename", "devicename#eventid", "createdat", "areaname", "processname", "createddateutc", "eventdescription", "areaname#status#processname#stationname#devicename#created", "stationname", "id", "acknowledgedtime", "status", "updatedat", "closed", "resolutiontime", "createdby", "acknowledgedby", "closedby", "rejectedby", "additionaldetails" ] elif (JOB_TYPE == "hierarchy"): DDB_TABLE_NAME = DDB_DATA_HIERARCHY_TABLE_NAME GLUE_TABLE_NAME = GLUE_DATA_HIERARCHY_TABLE_NAME FIELD_PATHS = [ "createdat", "name", "description", "id", "devicestationid", "type", "version", "parentid", "updatedat", "areasiteid", "eventprocessid", "eventtype", "priority", "rootcauses", "sms", "eventimgkey", "email", "protocol", "endpoint", "filterpolicy", "subscriptionarn", "stationareaid", "processareaid", "alias" ] else: raise JobInputException( f"JOB_TYPE was invalid ({JOB_TYPE}). Expecting either \"issues\" or \"hierarchy\"" ) log_message([ "Running with the following context:", f"DDB_TABLE_NAME: {DDB_TABLE_NAME}", f"GLUE_TABLE_NAME: {GLUE_TABLE_NAME}", f"GLUE_DB_NAME: {GLUE_DB_NAME}", f"GLUE_OUTPUT_BUCKET: {GLUE_OUTPUT_BUCKET}" ]) DDB_TABLE_NAME_FORMATTED = DDB_TABLE_NAME.lower().replace('-', '_') log_message("Mapping columns") COLUMN_MAPPINGS = list(map(lambda x: get_column_mapping(x), FIELD_PATHS)) log_message("Creating a Dynamic Frame from the DynamoDB table schema") datasource0 = glue_context.create_dynamic_frame.from_catalog( database=GLUE_DB_NAME, table_name=DDB_TABLE_NAME_FORMATTED, transformation_ctx="datasource0") log_message("Applying column mappings") applymapping1 = ApplyMapping.apply(frame=datasource0, mappings=COLUMN_MAPPINGS, transformation_ctx="applymapping1") log_message("Selecting fields") selectfields2 = SelectFields.apply(frame=applymapping1, paths=FIELD_PATHS, transformation_ctx="selectfields2") log_message("Resolving") resolvechoice3 = ResolveChoice.apply(frame=selectfields2, choice="MATCH_CATALOG", database=GLUE_DB_NAME, table_name=GLUE_TABLE_NAME, transformation_ctx="resolvechoice3") resolvechoice4 = ResolveChoice.apply(frame=resolvechoice3, choice="make_struct", transformation_ctx="resolvechoice4") log_message("Persisting data in S3") glue_context.write_dynamic_frame.from_catalog( frame=resolvechoice4, database=GLUE_DB_NAME, table_name=GLUE_TABLE_NAME, transformation_ctx="datasink5") job.commit() log_message("Done")
transformation_ctx="datasource0") ## @type: ApplyMapping ## @args: [mapping = [("component_id", "string", "component_id", "string"), ("component_type_id", "string", "component_type_id", "string"), ("type", "string", "type", "string"), ("connection_type_id", "string", "connection_type_id", "string"), ("outside_shape", "string", "outside_shape", "string"), ("base_type", "string", "base_type", "string"), ("height_over_tube", "double", "height_over_tube", "double"), ("bolt_pattern_long", "string", "bolt_pattern_long", "string"), ("bolt_pattern_wide", "string", "bolt_pattern_wide", "string"), ("groove", "string", "groove", "string"), ("base_diameter", "string", "base_diameter", "string"), ("shoulder_diameter", "string", "shoulder_diameter", "string"), ("unique_feature", "string", "unique_feature", "string"), ("orientation", "string", "orientation", "string"), ("weight", "string", "weight", "string")], transformation_ctx = "applymapping1"] ## @return: applymapping1 ## @inputs: [frame = datasource0] applymapping1 = ApplyMapping.apply( frame=datasource0, mappings=[ ("component_id", "string", "component_id", "string"), ("component_type_id", "string", "component_type_id", "string"), ("type", "string", "type", "string"), ("connection_type_id", "string", "connection_type_id", "string"), ("outside_shape", "string", "outside_shape", "string"), ("base_type", "string", "base_type", "string"), ("height_over_tube", "double", "height_over_tube", "double"), ("bolt_pattern_long", "string", "bolt_pattern_long", "string"), ("bolt_pattern_wide", "string", "bolt_pattern_wide", "string"), ("groove", "string", "groove", "string"), ("base_diameter", "string", "base_diameter", "string"), ("shoulder_diameter", "string", "shoulder_diameter", "string"), ("unique_feature", "string", "unique_feature", "string"), ("orientation", "string", "orientation", "string"), ("weight", "string", "weight", "string"), ], transformation_ctx="applymapping1", ) ## @type: ResolveChoice ## @args: [choice = "make_cols", transformation_ctx = "resolvechoice2"] ## @return: resolvechoice2 ## @inputs: [frame = applymapping1] resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
def etl_student_advisor():
    print_is_dev(
        "etl_advisor_profile___****************************************************************************************"
    )
    # ETL for profile
    # dyf_student_advisor = connectGlue(database="tig_advisor", table_name="log_change_assignment_advisor",
    #                                   select_fields=["id", "contact_id", "advisor_id_old", "advisor_id_new", "created_at", "updated_at"],
    #                                   fillter=["id"],
    #                                   duplicates=["id"])
    # ------------------------------------------------------------------------------------------------------------------#
    df_student_advisor = get_dyf_student_advisor(glueContext)
    number_student_advisor = df_student_advisor.count()
    if is_dev:
        print("number_advisor_account: ", number_student_advisor)
        df_student_advisor.printSchema()
        df_student_advisor.show(3)
    if number_student_advisor < 1:
        return

    # ------------------------------------------------------------------------------------------------------------------#
    # Process contact_id with advisor from log_change_assignment_advisor and student_contact
    df_contact_id_list = df_student_advisor.select('contact_id')
    df_contact_id_list = df_contact_id_list.dropDuplicates(['contact_id'])

    df_change_advisor_new = get_df_change_advisor_new(df_student_advisor)
    # df_change_advisor_init = get_df_change_advisor_init(df_student_advisor)
    # df_advisor_from_student_contact = get_advisor_from_student_contact(glueContext, df_contact_id_list)

    df_change_advisor_total = df_change_advisor_new  # .union(df_change_advisor_init) .union(df_advisor_from_student_contact)

    if is_dev:
        print('df_change_advisor_total')
        df_change_advisor_total.printSchema()
        df_change_advisor_total.show(3)

    # -------------------------------------------------------------------------------------------------------------------#
    df_change_advisor_total = df_change_advisor_total \
        .withColumn('created_at', f.unix_timestamp('created_at', format='yyyy-MM-dd HH:mm:ss'))
    df_change_advisor_total = df_change_advisor_total.orderBy("contact_id", "created_at")

    if is_dev:
        print('df_change_advisor_total_after_order')
        df_change_advisor_total.printSchema()
        df_change_advisor_total.show(3)

    df_change_advisor_total = df_change_advisor_total.select(
        "contact_id",
        f.struct('advisor_id', 'created_at').alias('end_start'))
    print('df_change_advisor_total')
    df_change_advisor_total.printSchema()
    df_change_advisor_total.show(3)

    df_change_advisor_group_contact = df_change_advisor_total.groupBy("contact_id") \
        .agg(f.collect_list("end_start").alias("l_created_at"))
    print('df_change_advisor_group_contact_______________')
    df_change_advisor_group_contact.printSchema()
    df_change_advisor_group_contact.show(3)

    df_change_advisor_group_contact = df_change_advisor_group_contact \
        .withColumn("l_created_at", transpose_start_end_udf("l_created_at"))
    df_change_advisor_group_contact.printSchema()

    df_change_advisor_explore = df_change_advisor_group_contact \
        .select("contact_id", f.explode("l_created_at").alias("start_end_transpose"))
    df_change_advisor_explore.printSchema()

    df_change_advisor_explore = df_change_advisor_explore.select(
        "contact_id",
        f.col("start_end_transpose").getItem("advisor_id").alias("advisor_id"),
        f.col("start_end_transpose").getItem("start").alias("start"),
        f.col("start_end_transpose").getItem("end").alias("end"))
    df_change_advisor_explore.printSchema()

    df_change_advisor_explore = addCollum(df_change_advisor_explore, STUDENT_ADVISOR_ADD_COL)

    dyf_change_advisor_explore = DynamicFrame.fromDF(
        df_change_advisor_explore, glueContext, "dyf_change_advisor_explore")
    # The mappingForAll result is not reassigned; dyf_change_advisor_explore
    # keeps its original field names for the ApplyMapping below.
    mappingForAll(dyf_change_advisor_explore, mapping=STUDENT_ADVISOR_MAPPING)

    print('dyf_change_advisor_explore')
    dyf_change_advisor_explore.printSchema()
    dyf_change_advisor_explore.show(10)

    ######## save to s3 ######
    # parquetToS3(dyf_student_advisor_mapping, path="s3://dtsodin/student_behavior/advisor_thangvm/advisor_history")
    ############################################################

    # ------------------------------------------------------------------------------------------------------------------#
    # Target schema:
    # | -- student_advisor_id: string
    # | -- contact_id: string
    # | -- advisor_id: string
    # | -- start_date: string
    # | -- end_date: string
    # | -- created_at: int
    # | -- updated_at: string
    apply_output = ApplyMapping.apply(
        frame=dyf_change_advisor_explore,
        mappings=[("contact_id", "string", "contact_id", "string"),
                  ("advisor_id", "string", "advisor_id", "long"),
                  ("start", "string", "start_date", "long"),
                  ("end", "string", "end_date", "long"),
                  ("created_at", "int", "created_at", "long")])

    dfy_output = ResolveChoice.apply(frame=apply_output,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")

    glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dfy_output,
        catalog_connection="glue_redshift",
        connection_options={
            # "preactions": """TRUNCATE TABLE ad_student_advisor """,
            "dbtable": "ad_student_advisor",
            "database": "transaction_log"
        },
        redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/user_advisor/ad_student_package",
        transformation_ctx="datasink4")
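# etl_student_advisor() above uses an addCollum helper and a STUDENT_ADVISOR_ADD_COL
# dict that are defined elsewhere. A minimal sketch of addCollum, assuming the dict
# maps new column names to Spark Column expressions (mirroring the withColumn loop
# in the mappingForAll variant earlier in this file):
def addCollum(data_frame, add_collum):
    for column_name, column_expr in add_collum.items():
        data_frame = data_frame.withColumn(column_name, column_expr)
    return data_frame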
DYNAMODB_INPUT_TABLE_NAME = 'aws-glue-local-test-table'
S3_OUTPUT_BUCKET_NAME = 'aws-glue-local-test-bucket'

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
JOB_NAME = args['JOB_NAME']

sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)
job.init(JOB_NAME, args)

datasource = glueContext.create_dynamic_frame.from_options(
    connection_type='dynamodb',
    connection_options={'dynamodb.input.tableName': DYNAMODB_INPUT_TABLE_NAME})

applymapping = ApplyMapping.apply(
    frame=datasource,
    mappings=[('Id', 'string', 'Id', 'string'),
              ('Column1', 'string', 'Column1', 'string'),
              ('Column2', 'string', 'Column2', 'string'),
              ('Column3', 'string', 'Column3', 'string')])

# Write the mapped frame (not the raw datasource) to S3 as CSV.
glueContext.write_dynamic_frame.from_options(
    frame=applymapping,
    connection_type='s3',
    connection_options={'path': 's3://' + S3_OUTPUT_BUCKET_NAME},
    format='csv')

job.commit()
relationalized_dynamic_frames = Relationalize.apply(
    frame=ec2s,
    staging_path=tempDir,
    transformation_ctx="relationalized_dynamic_frames")

# 'roottable' is the default prefix of relationalization
rdf = relationalized_dynamic_frames.select('roottable_Reservations.val.Instances')
rdf.show(1)

# 2. Map fields to bulk load CSV column headings format
applymapping1 = ApplyMapping.apply(
    frame=rdf,
    mappings=[("`Reservations.val.Instances.val.InstanceId`", "string",
               "instanceId:String", "string"),
              ("`Reservations.val.Instances.val.InstanceType`", "string",
               "instanceType:String", "string")],
    transformation_ctx="applymapping1")
applymapping1.show(1)

# 3. Append prefixes to values in ID columns (ensures vertices for different types have unique IDs across the graph)
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'instanceId:String', 'ec2')])

# 4. Select fields for upsert
selectfields1 = SelectFields.apply(
    frame=applymapping2,
    paths=["~id", 'instanceId:String', 'instanceType:String'],
    transformation_ctx="selectfields1")
print "Creating Order vertices..." datasource0 = glueContext.create_dynamic_frame.from_catalog( database=database, table_name=order_table, transformation_ctx="datasource0") df0 = datasource0.toDF().filter(col("ORDER_DATE") == checkpoint) datasource1 = DynamicFrame.fromDF(df0, glueContext, 'datasource1') print "Total orders : " + str(datasource0.count()) print "Orders for checkpoint: " + str(datasource1.count()) applymapping1 = ApplyMapping.apply( frame=datasource1, mappings=[("ORDER_DATE", "timestamp", "orderDate", "string"), ("SHIP_MODE", "string", "shipMode", "string"), ("SITE_ID", "double", "siteId", "int"), ("ORDER_ID", "int", "orderId", "int")], transformation_ctx="applymapping1") applymapping1 = GremlinCsvTransforms.create_prefixed_columns( applymapping1, [('~id', 'orderId', 'o')]) selectfields1 = SelectFields.apply(frame=applymapping1, paths=["~id", "orderDate", "shipMode"], transformation_ctx="selectfields1") selectfields1.toDF().foreachPartition(neptune.add_vertices('Order')) print "Creating OrderDetail vertices..." datasource2 = glueContext.create_dynamic_frame.from_catalog( database=database,
"Select a.*, bround(a.QUANTITY*a.UNIT_PRICE,2) as EXTENDED_PRICE, \ bround(QUANTITY*(UNIT_PRICE-SUPPLY_COST) ,2) as PROFIT, \ DATE_FORMAT(ORDER_DATE,'yyyyMMdd') as DATE_KEY \ from (Select * from tbl0) a") df1.show(5) datasource4 = DynamicFrame.fromDF(df1, glueContext, 'datasource4') applymapping1 = ApplyMapping.apply( frame=datasource4, mappings=[("DISCOUNT", "decimal(10,2)", "discount", "decimal(10,2)"), ("UNIT_PRICE", "decimal(10,2)", "unit_price", "decimal(10,2)"), ("TAX", "decimal(10,2)", "tax", "decimal(10,2)"), ("SUPPLY_COST", "decimal(10,2)", "supply_cost", "decimal(10,2)"), ("PRODUCT_ID", "int", "product_id", "int"), ("QUANTITY", "int", "quantity", "int"), ("LINE_ID", "int", "line_id", "int"), ("LINE_NUMBER", "int", "line_number", "int"), ("ORDER_DATE", "timestamp", "order_date", "timestamp"), ("SHIP_MODE", "string", "ship_mode", "string"), ("SITE_ID", "double", "site_id", "int"), ("PROFIT", "decimal(10,2)", "profit", "decimal(10,2)"), ("EXTENDED_PRICE", "decimal(10,2)", "extended_price", "decimal(10,2)"), ("DATE_KEY", "string", "date_key", "string"), ("ORDER_ID", "int", "order_id", "int")], transformation_ctx="applymapping1") print "After ApplyMapping :" applymapping1.printSchema() applymapping1.toDF().show(5, False) #selectfields2 = SelectFields.apply(frame = applymapping1, paths = ["quantity", "discount", "tax", "unit_price", "line_id", "date_key", "order_date", "extended_price", "ship_mode", "line_number", "product_id", "site_id", "supply_cost", "order_id", "profit"], transformation_ctx = "selectfields2") selectfields2 = SelectFields.apply(frame=applymapping1,
product_table = '{}salesdb_product'.format(args['TABLE_PREFIX'])
product_category_table = '{}salesdb_product_category'.format(args['TABLE_PREFIX'])
supplier_table = '{}salesdb_supplier'.format(args['TABLE_PREFIX'])

gremlin_endpoints = GlueNeptuneConnectionInfo(
    args['AWS_REGION'],
    args['CONNECT_TO_NEPTUNE_ROLE_ARN']).neptune_endpoints(args['NEPTUNE_CONNECTION_NAME'])
gremlin_client = GlueGremlinClient(gremlin_endpoints)

# Product vertices
print("Creating Product vertices...")

datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database=database, table_name=product_table, transformation_ctx="datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(
    database=database, table_name=product_category_table, transformation_ctx="datasource1")
datasource2 = datasource0.join(["CATEGORY_ID"], ["CATEGORY_ID"], datasource1,
                               transformation_ctx="join")

applymapping1 = ApplyMapping.apply(
    frame=datasource2,
    mappings=[("NAME", "string", "name:String", "string"),
              ("UNIT_PRICE", "decimal(10,2)", "unitPrice", "string"),
              ("PRODUCT_ID", "int", "productId", "int"),
              ("QUANTITY_PER_UNIT", "int", "quantityPerUnit:Int", "int"),
              ("CATEGORY_ID", "int", "category_id", "int"),
              ("SUPPLIER_ID", "int", "supplierId", "int"),
              ("CATEGORY_NAME", "string", "category:String", "string"),
              ("DESCRIPTION", "string", "description:String", "string"),
              ("IMAGE_URL", "string", "imageUrl:String", "string")],
    transformation_ctx="applymapping1")
applymapping1 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'productId', 'p'), ('~to', 'supplierId', 's')])
selectfields1 = SelectFields.apply(
    frame=applymapping1,
    paths=["~id", "name:String", "category:String", "description:String",
           "unitPrice", "quantityPerUnit:Int", "imageUrl:String"],
    transformation_ctx="selectfields1")
selectfields1.toDF().foreachPartition(gremlin_client.upsert_vertices('Product', batch_size=100))

# Supplier vertices
print("Creating Supplier vertices...")

datasource3 = glueContext.create_dynamic_frame.from_catalog(
    database=database, table_name=supplier_table, transformation_ctx="datasource3")
applymapping2 = ApplyMapping.apply(
    frame=datasource3,
    mappings=[("COUNTRY", "string", "country:String", "string"),
              ("ADDRESS", "string", "address:String", "string"),
              ("NAME", "string", "name:String", "string"),
              ("STATE", "string", "state:String", "string"),
              ("SUPPLIER_ID", "int", "supplierId", "int"),
              ("CITY", "string", "city:String", "string"),
              ("PHONE", "string", "phone:String", "string")],
    transformation_ctx="applymapping2")
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping2, [('~id', 'supplierId', 's')])
selectfields3 = SelectFields.apply(
    frame=applymapping2,
    paths=["~id", "country:String", "address:String", "city:String",
           "phone:String", "name:String", "state:String"],
    transformation_ctx="selectfields3")
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "demo-mysql", table_name = "salesdb_product", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="demo-mysql", table_name="salesdb_product", transformation_ctx="datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(
    database="demo-mysql", table_name="salesdb_product_category", transformation_ctx="datasource1")
datasource3 = datasource0.join(["CATEGORY_ID"], ["CATEGORY_ID"], datasource1,
                               transformation_ctx="join")
print("Joined Rows : " + str(datasource3.count()))
datasource3.printSchema()
## @type: ApplyMapping
## @args: [mapping = [("name", "string", "name", "string"), ("unit_price", "decimal(10,2)", "unit_price", "decimal(10,2)"), ("product_id", "double", "product_id", "int"), ("quantity_per_unit", "double", "quantity_per_unit", "int"), ("category_id", "double", "category_id", "int"), ("supplier_id", "double", "supplier_id", "int")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource3]
applymapping1 = ApplyMapping.apply(
    frame=datasource3,
    mappings=[("NAME", "string", "name", "string"),
              ("UNIT_PRICE", "decimal(10,2)", "unit_price", "decimal(10,2)"),
              ("PRODUCT_ID", "int", "product_id", "int"),
              ("QUANTITY_PER_UNIT", "int", "quantity_per_unit", "int"),
              ("CATEGORY_ID", "int", "category_id", "int"),
              ("SUPPLIER_ID", "int", "supplier_id", "int"),
              ("CATEGORY_NAME", "string", "category_name", "string"),
              ("DESCRIPTION", "string", "description", "string"),
              ("IMAGE_URL", "string", "image_url", "string")],
    transformation_ctx="applymapping1")
# applymapping1.toDF().show()
## @type: SelectFields
## @args: [paths = ["category_name", "category_id", "image_url", "product_id", "name", "description", "quantity_per_unit", "unit_price", "supplier_id"], transformation_ctx = "selectfields2"]
## @return: selectfields2
## @inputs: [frame = applymapping1]
selectfields2 = SelectFields.apply(
    frame=applymapping1,
    paths=["category_name", "category_id", "image_url", "product_id", "name",
           "description", "quantity_per_unit", "unit_price", "supplier_id"],
    transformation_ctx="selectfields2")
print("selectfields2 Rows : " + str(selectfields2.count()))
selectfields2.toDF().show()
## @type: DataSink
## @args: [database = "demo-redshift", table_name = "redshiftdb_public_product_dim", redshift_tmp_dir = TempDir, transformation_ctx = "datasink3"]
## @return: datasink3
## @inputs: [frame = selectfields2]
datasink3 = glueContext.write_dynamic_frame.from_catalog(
    frame=selectfields2,
    database="demo-redshift",
    table_name="redshiftdb_public_product_dim",
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="datasink3")
job.commit()
sdf_row_data = raw_dynamic_frame.toDF()
sdf_row_data.select(column_sort_values).show()

sdf_converted_column_to_sort_data = sdf_row_data.withColumn(
    column_sort_values, col(column_sort_values).cast("timestamp"))
sdf_sorted_data = sdf_converted_column_to_sort_data.orderBy(
    desc(column_sort_values))
sdf_deduplicate_data = sdf_sorted_data.drop_duplicates(
    [column_drop_duplicates])

types_mapping_dict = load_mapping_types(jsonObject, sdf_deduplicate_data.dtypes)

df_deduplicate_data = DynamicFrame.fromDF(sdf_deduplicate_data, glueContext,
                                          "df_deduplicate_data")
df_converted_type_data = ApplyMapping.apply(
    frame=df_deduplicate_data,
    mappings=types_mapping_dict,
    transformation_ctx="df_converted_type_data")

datasink = glueContext.write_dynamic_frame.from_options(
    frame=df_converted_type_data,
    connection_type="s3",
    connection_options={"path": "s3://glue-terraform/data/output"},
    format="parquet",
    transformation_ctx="datasink")

job.commit()
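# The snippet above relies on load_mapping_types, which is not shown. A minimal
# sketch, assuming jsonObject is a dict of {column_name: target_type} loaded from
# a mapping file and dtypes is the (name, spark_type) list from DataFrame.dtypes;
# it returns the list of 4-tuples that ApplyMapping.apply expects as mappings.
def load_mapping_types(jsonObject, dtypes):
    mappings = []
    for column_name, source_type in dtypes:
        # Fall back to the current type when the column has no entry in the config.
        target_type = jsonObject.get(column_name, source_type)
        mappings.append((column_name, source_type, column_name, target_type))
    return mappings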