import sys
from pyspark.context import SparkContext
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.ml.feature import StringIndexer

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

args = getResolvedOptions(sys.argv, ['s3_bucket'])
s3_bucket = args['s3_bucket']
print(s3_bucket)

input_dir = f's3://{s3_bucket}/2020/'
print(input_dir)
output_dir = f's3://{s3_bucket}/output-dir'
print(output_dir)

df = glueContext.create_dynamic_frame_from_options(
    's3',
    {'paths': [input_dir], 'recurse': True,
     'groupFiles': 'inPartition', 'groupSize': '1048576'},
    format="json")
df.printSchema()

# Encode the GENDER column as a numeric index with Spark ML.
df1 = df.toDF()
indexer = StringIndexer(inputCol="GENDER", outputCol="GENDER_INDEX")
indexed = indexer.fit(df1).transform(df1)
indexed.show()

# Convert back to a DynamicFrame so the GENDER_INDEX column is available to apply_mapping.
df = DynamicFrame.fromDF(indexed, glueContext, "indexed")
df = df.apply_mapping([
    ('FIRST', 'string', 'FIRST', 'string'),
    ('LAST', 'string', 'LAST', 'string'),
    ('AGE', 'integer', 'AGE', 'integer'),
    ('GENDER_INDEX', 'double', 'GENDER', 'integer'),
    ('LATITUDE', 'string', 'LATITUDE', 'string'),
    ('LONGITUDE', 'string', 'LONGITUDE', 'string'),
])

glueContext.write_dynamic_frame.from_options(
    frame=df, connection_type="s3",
    connection_options={"path": output_dir}, format="csv")
import sys
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext
from pyspark.sql.functions import unix_timestamp

## @params: [JOB_NAME,S3_CSV_OUTPUT_PATH]
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'S3_CSV_OUTPUT_PATH'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Load JSON files into a dynamic frame.
datasource0 = glueContext.create_dynamic_frame_from_options(
    "s3",
    {'paths': ["s3://segment-personalize-workshop/segment-logs"], 'recurse': True},
    format="json")
print("Input file total record count: ", datasource0.count())

# Filters the JSON documents that we want included in the output CSV.
# These are the event types we're interested in for our dataset.
supported_events = ['Product Added', 'Order Completed', 'Product Clicked']

def filter_function(dynamicRecord):
    if ('userId' in dynamicRecord
            and 'properties' in dynamicRecord
            and 'sku' in dynamicRecord["properties"]
            and 'event' in dynamicRecord
            and dynamicRecord['event'] in supported_events):
        return True
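
# (Not in the original excerpt) A minimal sketch of how filter_function might be applied,
# assuming the job continues by filtering datasource0 with Glue's Filter transform:
from awsglue.transforms import Filter

filtered_events = Filter.apply(frame=datasource0,
                               f=filter_function,
                               transformation_ctx="filtered_events")
print("Filtered record count: ", filtered_events.count())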
    (query_df1.ApplicationId == applicationId))
##query_df_filter2.toDF()
querySql_final = query_df_final.select(col("query"))
querySql = querySql_final.first()["query"]

## read the data from s3
#var bucketname = s3_input_data_folder
paths = "s3://" + s3_input_data_folder + "/"
data_df = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={
        "paths": [paths],
        "useS3ListImplementation": True,
        "recurse": True
    },
    format=file_format,  ## "csv"
    format_options={"withHeader": True},
    transformation_ctx="data_df").toDF()
data_df.show(10)

## read data from input table to a data frame
#data_df = glueContext.create_dynamic_frame.from_catalog(database=database, table_name=table_name).toDF()

## running sql query on the dataframe created with input dataset
data_df.createOrReplaceTempView('data_df')
data_df = spark.sql('{} from data_df'.format(querySql))

query_columns = ['werk', 'spj', 'knr', 'result', 'probability', 'time']
from awsglue.context import GlueContext

glueContext = GlueContext(spark)

# Read from the Data Catalog.
dyf = glueContext.create_dynamic_frame_from_catalog(
    database='<database>',
    table_name='<table>'
)

# Read from SQL Server over JDBC.
dyf = glueContext.create_dynamic_frame_from_options(
    connection_type='sqlserver',
    connection_options={
        "url": "jdbc:sqlserver://database-2.xxx.eu-west-1.rds.amazonaws.com:1433/test",
        "user": "******",
        "password": "******",
        "dbtable": "<table>"
    }
)

# With recurse=True, all nested prefixes are read:
"""
in S3:
a/b1/c1/files.csv
a/b1/c2/files.csv
"""
dyf = glueContext.create_dynamic_frame.from_options(
    's3',
    connection_options={"paths": ["s3://a/"], 'recurse': True},
    format='json',
)
# FilterPredicateTest
# for JDBC connector only
options_filterPredicateTest = {
    "query": "select NumberOfEmployees, CreatedDate from Account where",
    "url": "jdbc:some:url:user=${user};Password=${Password};SecurityToken=${SecurityToken};",
    "secretId": "test-partner-driver",
    "className": "partner.jdbc.some.Driver",
    # test parameter
    "filterPredicate": "BillingState='CA'"
}

##################################### read data from data source ######################################
datasource0 = glueContext.create_dynamic_frame_from_options(
    connection_type="marketplace.jdbc",
    connection_options=options_filterPredicateTest)  # pick up the right test connection options

######################################## validate data reading ########################################
## validate data schema and count
# more data types: https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-crawler-pyspark-extensions-types.html
expected_schema = StructType([Field("NumberOfEmployees", IntegerType()),
                              Field("CreatedDate", TimestampType())])
expected_count = 2

assert datasource0.schema() == expected_schema

print("expected schema: " + str(expected_schema.jsonValue()))
print("result schema: " + str(datasource0.schema().jsonValue()))
print("result schema in tree structure: ")
datasource0.printSchema()
spark = glueContext.spark_session
job = Job(glueContext)
job.init(job_name, args)

df = sqlContext.createDataFrame([
    (7, "ishan", "kompressor", "mbenz"),
    (14, "john", "wrangler", "jeep"),
], ["HOUR", "NAME", "car", "brand"])
df.write.parquet("s3://xx-xx-xx/Glue/oge/cars/")

datasource0 = glueContext.create_dynamic_frame_from_options(
    "s3", {'paths': ["s3://xx-xx-xx/Glue/ogenew/"], "recurse": True},
    format="parquet", transformation_ctx="datasource0")

datasink2 = glueContext.write_dynamic_frame.from_options(
    frame=datasource0, connection_type="s3",
    connection_options={"path": "s3://xx-xx-xx/Glue/ogenew_pp/",
                        "partitionKeys": ["brand", "car"]},
    format="parquet", transformation_ctx="datasink2")

df = sqlContext.createDataFrame([
    (2019, 10, 7, 7, "ishan", "chicago"),
    (2018, 11, 8, 9, "james", "italy"),
    (2017, 12, 9, 14, "john", "plano"),
    (2016, 1, 10, 13, "adam", "texas"),
    (2015, 2, 11, 12, "chris", "mexico"),
    (2014, 3, 12, 22, "niel", "portland"),
], ["YEAR", "MONTH", "DAY", "HOUR", "NAME", "CITY"])
df.write.parquet("/aws-xx-logs/Glue/glue_bookmark_issue_non_partitioned/")
spark = glue_context.spark_session
job = Job(glue_context)
job.init(args['JOB_NAME'], args)

# Job input parameters
project = args['project']
stage = args['stage']
env = Env.get_config(project, stage)

incoming_path = "s3://{}/{}/food".format(env["s3_data_lake"], env["incoming_folder"])
primary_path = "s3://{}/{}/food".format(env["s3_data_lake"], env["primary_folder"])

# Read Data
datasource0 = glue_context.create_dynamic_frame_from_options(
    connection_type='s3',
    connection_options={"paths": [incoming_path]},
    format="csv",
    format_options={"withHeader": True},
    transformation_ctx="readctx")
datasource0.printSchema()
datasource0.show(2)

# mask numbers in data with _number_
bpSamplePySpark = BpSamplePySpark(spark, env)
df = bpSamplePySpark.process_data(datasource0.toDF())
dyf = DynamicFrame.fromDF(df, glue_context, 'dyf')

# Write data
glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type="s3",
    connection_options={"path": primary_path},
    format="parquet",
    transformation_ctx="writectx")

job.commit()
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

schema = StructType([
    StructField('category_id', IntegerType(), False),
    StructField('sum_value', IntegerType(), False),
    StructField('userid', IntegerType(), False),
])

dyf = glueContext.create_dynamic_frame_from_options(
    's3', {'paths': [dir_path]},
    format='csv', format_options={'withHeader': True})

# Convert the DynamicFrame to a DataFrame
df = dyf.toDF()
df = df.withColumn(target_column, f.col(target_column).cast(IntegerType()))
df = df.withColumn("sum_value", f.col("sum_value").cast(IntegerType()))
df = df.withColumn("userid", f.col("userid").cast(IntegerType()))
print("size of dataframe")
print((df.count(), len(df.columns)))

# Pivot to wide format on category_id
df2 = df.groupby('userid').pivot(target_column).sum('sum_value').fillna(0)
print("pivot")
print((df2.count(), len(df2.columns)))
    ddb_split = 8 * (int(num_workers) - 1)
else:
    num_executers = (int(num_workers) - 1) * 2 - 1
    ddb_split = 4 * num_executers
print(str(ddb_split))

args = getResolvedOptions(sys.argv, ["JOB_NAME"])
glue_context = GlueContext(SparkContext.getOrCreate())
job = Job(glue_context)
job.init(args["JOB_NAME"], args)

dyf = glue_context.create_dynamic_frame_from_options(
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": region,
        "dynamodb.splits": str(ddb_split),
        "dynamodb.throughput.read.percent": "1.2",
        "dynamodb.input.tableName": source_ddb_name
    })
dyf.show()

sts_client = boto3.client('sts')
sts_response = sts_client.assume_role(RoleArn=target_role_arn,
                                      RoleSessionName='assume-role')
ddb_key = sts_response['Credentials']['AccessKeyId']
ddb_secret = sts_response['Credentials']['SecretAccessKey']
ddb_token = sts_response['Credentials']['SessionToken']

glue_context.write_dynamic_frame_from_options(
    frame=dyf,
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Tables with important attributes
# ozone = glueContext.create_dynamic_frame.from_catalog(database = "1strategy-insights-dm", table_name = "insights_dm_stage_cached_applications", transformation_ctx = "applications")
ozone = glueContext.create_dynamic_frame_from_options(
    "mysql",
    connection_options={
        "url": "jdbc:mysql://air-quality.crwizazpv2rg.us-west-2.rds.amazonaws.com:3306/airquality",
        "user": "******",
        "password": "******",
        "dbtable": "ozone"
    },
    format=None)

# Create Data Frames and Views to query
ozone_df = ozone.toDF()
ozone_df.createOrReplaceTempView("ozone_view")

utah_subset_query = """
select *
from ozone_view
where state = 'utah'
"""
utah_ozone = spark.sql(utah_subset_query)
import boto3
import pandas as pd
import numpy as np

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

### create temp view of S3 cafe dynamic frame
cafeFLData = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    format="parquet",
    connection_options={"paths": ["s3://cafe/cafe_FL_PoC"]})
cafeFLData_SDF = cafeFLData.toDF()
print(" S3 cafeFLData_SDF Count: ", cafeFLData_SDF.count())
cafeFLData_SDF.createOrReplaceTempView("cafe_temp_view")

cafeRefreshData = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    format="parquet",
    connection_options={"paths": ["s3://cafe_refresh"]})
cafeRefreshData_SDF = cafeRefreshData.toDF()
cafeRefreshData_SDF.createOrReplaceTempView("cafe_refresh_temp_view")

###################### INSERTION #######################################
#print('Starting to INSERT')
df_insert = spark.sql("""
s3_object_exists = False
try:
    s3_resource.Object(bucket, key).load()
    s3_object_exists = True
except Exception as e:
    print("S3 object not found. error message: " + str(e))

if s3_object_exists:
    try:
        table = glue_client.get_table(DatabaseName=glue_db, Name=glue_table)
        table_columns = table['Table']['StorageDescriptor']['Columns']
        s3_destination = str(table['Table']['StorageDescriptor']['Location'])

        # Create Dynamic Frame from S3 CSV Object
        dynamicFrame = glueContext.create_dynamic_frame_from_options(
            connection_type="s3",
            connection_options={"paths": [s3_source_path]},
            format_options={"withHeader": True, "separator": ","},
            format="csv")

        # Convert to Spark Data Frame
        dataFrame = dynamicFrame.toDF()

        # Cast Column types from Glue Table into Spark Data Frame
        for column in table_columns:
            dataFrame = dataFrame.withColumn(column['Name'],
                                             dataFrame[column['Name']].cast(column['Type']))

        # Convert back to Glue Dynamic Frame for S3 upload
        final_dynamicFrame = DynamicFrame.fromDF(dataFrame, glueContext, "final_dynamicFrame")

        # Delete any unnecessary columns
        final_dynamicFrame = final_dynamicFrame.drop_fields(['col4', 'col5', 'col6'])

        # Send dynamic frame to S3 as parquet files. S3 location specified by the given Glue table
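        # (Not in the original excerpt) A minimal sketch of the write described by the comment
        # above, assuming the target path is the s3_destination taken from the Glue table:
        glueContext.write_dynamic_frame.from_options(
            frame=final_dynamicFrame,
            connection_type="s3",
            connection_options={"path": s3_destination},
            format="parquet")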
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import datetime
import boto3

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

datasource0 = glueContext.create_dynamic_frame_from_options(
    "s3", {'paths': ["s3://xxx-xx-logs/Glue/parquet_sample_dataset/"]},
    format="parquet", transformation_ctx="datasource0")

datasink3 = glueContext.write_dynamic_frame.from_options(
    frame=datasource0,
    connection_type="s3",
    connection_options={"path": "s3://xx-xx-logs/Glue/glue_bm_issue_11_12/"},
    format="parquet",
    transformation_ctx="datasink3")

job.commit()
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

## @type: DataSource
## @args: [database = "green__tripdata_staging", table_name = "stagingfhv_tripdata_2015_01_csv", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
origin = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={
        "paths": ["s3://taxi-data-etl/fhv_trippdata_test.csv"]
    },
    format="csv",
    format_options={
        "withHeader": True,
        "separator": ","
    })
# origin = glueContext.create_dynamic_frame.from_catalog(database="green__tripdata_staging",
#                                                        table_name="stagingfhv_tripdata_2015_01_csv",
#                                                        transformation_ctx="origin")

## @type: ApplyMapping
## @args: [mapping = [("dispatching_base_num", "string", "dispatching_base_num", "string"), ("pickup_date", "string", "pickup_datetime", "timestamp"), ("locationid", "string", "pickup_location_id", "long")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(
    frame=origin,
    mappings=[("dispatching_base_num", "string", "dispatching_base_num",
path_input = "s3://" + bucket + "/" + key
path_output = "s3://" + bucket + "/pyspark_output/" + output_name

print("bucket: " + bucket)
print("key: " + key)
print("output_name: " + output_name)
print("path_input: " + path_input)
print("path_output: " + path_output)

glueContext = GlueContext(SparkContext.getOrCreate())

df = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={"paths": [path_input]},
    format="csv",
    format_options={
        "withHeader": True,
        "separator": ","
    },
)

glueContext.write_dynamic_frame.from_options(
    frame=df,
    connection_type="s3",
    connection_options={
        "path": path_output,
        "partitionKeys": ["col3"]
    },
    format="parquet",
)
prefix = "s3://" + workflow_params['jsonlines_bucket'] + "/" + workflow_params['jsonlines_object']
output = "s3://" + workflow_params['jsonlines_bucket'] + "/abcstations"

logger = glueContext.get_logger()
logger.info("prefix: {}".format(prefix))
logger.info("output: {}".format(output))

## @params: [JOB_NAME]
job.init(args['JOB_NAME'], args)

dynamic_frame = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    format="json",
    connection_options={
        # "paths": ["s3://mig-test-bucket/input"]
        "paths": [prefix]
    },
    transformation_ctx="dynamic_frame"
)  # optional - needed if you are using bookmarks

# printSchema() doesn't save to a variable
# myschema = dynamic_frame.printSchema()
myschema = dynamic_frame._jdf.schema().treeString()
logger.info("{}".format(myschema))

# Perform transformations here if you need to.
#
# Glue just adds a new parquet file to the output with the data - in this case that leads to
# duplicate data, so use a Spark DataFrame with overwrite mode instead.
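
# (Not in the original snippet) A minimal sketch of the overwrite approach suggested above,
# assuming the frame to write is still dynamic_frame and the target prefix is output:
df = dynamic_frame.toDF()
df.write.mode("overwrite").parquet(output)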
datepath = "/{}/{}/{}".format(year, month, day)
inputpath = inputpath + datepath

## database source
jdbcurl = "jdbc:postgresql://preventech-cn-stage.cmusx9j6p12i.rds.cn-north-1.amazonaws.com.cn:5432/da_nxg_chinadev"
dbusername = "DA_NGCA_DEV_APP"
dbpassword = "******"
grouptable = "master_data.groups"
customertable = "master_data.customer"

print("===================== start ETL step 1 =====================")
print(" ER json data: " + inputpath)

## DataSource: raw JSON data
erjsondata = glueContext.create_dynamic_frame_from_options(
    "s3", {'paths': [inputpath], 'recurse': True},
    format="json", transformation_ctx="erjsondata")

## group & subgroup data
groupdata = glueContext.create_dynamic_frame_from_options(
    "postgresql", {
        'url': jdbcurl,
        'user': dbusername,
        'password': dbpassword,
        'dbtable': grouptable
    },
    transformation_ctx="groupdata")

## customer data
customerdata = glueContext.create_dynamic_frame_from_options(
    "postgresql", {
spark_session = glueContext.spark_session
sqlContext = SQLContext(spark_session.sparkContext, spark_session)

args = getResolvedOptions(
    sys.argv, ['JOB_NAME', 'bucket', 'path', 'outputBucket', 'staticCSVPath'])
bucket = args['bucket']
csvPath = args['path']
outputBucket = args['outputBucket']
staticCSVPath = args['staticCSVPath']
print(bucket, csvPath)

newFile = glueContext.create_dynamic_frame_from_options(
    "s3", {'paths': ["s3://" + bucket + "/" + csvPath]},
    format="csv",
    format_options={'withHeader': True}).drop_fields(['name']).toDF()

staticFile = glueContext.create_dynamic_frame_from_options(
    "s3", {'paths': [staticCSVPath]},
    format="csv",
    format_options={'withHeader': True}).toDF()

newFile.registerTempTable("newCSV")
staticFile.registerTempTable("staticCSV")
    'output_path', 'number_of_partitions'
])

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

inputPath = args['input_path']
outputPath = args['output_path']
numberOfPartitions = int(args.get('number_of_partitions', 1))

input_dyf = glueContext.create_dynamic_frame_from_options(
    "s3",
    {"paths": [inputPath], "recurse": True, "groupFiles": "inPartition"},
    format="parquet")

repartitionedDYF = input_dyf.repartition(numberOfPartitions)

glueContext.write_dynamic_frame.from_options(
    frame=repartitionedDYF,
    connection_type="s3",
    connection_options={"path": outputPath},
    format="glueparquet")

job.commit()
def count_null(c):
    """Use conversion between boolean and integer
    - False -> 0
    - True  -> 1
    """
    pred = col(c).isNull()
    return sum(pred.cast("integer")).alias(c)

def null_perc(df):
    exprs = [((count_null(c) / count("*")) * 100).alias(c) for c in df.columns]
    return df.agg(*exprs).collect()

## Creating glue dynamic frame, map respective columns, drop null value columns and drop fields for feature selection
patients = glueContext.create_dynamic_frame_from_options(
    's3', connection_options={'paths': [input_dir + '/patients/*/*/*']},
    format="parquet", transformation_ctx="patients_csv")
patients_dropcol = patients.drop_fields([
    'patient_county', 'patient_deathdate', 'patient_drivers', 'patient_ssn',
    'patient_passport', 'patient_suffix', 'patient_birthplace', 'patient_prefix',
    'patient_maiden', 'patient_state', 'patient_zip', 'patient_first', 'patient_last',
    'patient_city', 'patient_address', 'patient_lon', 'patient_lat',
    'partition_0', 'partition_1', 'partition_2'])

## Dropping fields for feature engineering
conditions = glueContext.create_dynamic_frame_from_options(
    's3', connection_options={'paths': [input_dir + '/conditions/*/*/*']},
    format="parquet", transformation_ctx="conditions_csv")
conditions_dropcol = conditions.drop_fields([
    'conditions_stop', 'conditions_start', 'conditions_description',
    'partition_0', 'partition_1', 'partition_2'])

encounters = glueContext.create_dynamic_frame_from_options(
    's3', connection_options={'paths': [input_dir + '/encounters/*/*/*']},
    format="parquet", transformation_ctx="encounters_csv")
encounters_dropcol = encounters.drop_fields([
    'encounters_description', 'encounters_reasondescription',
    'partition_0', 'partition_1', 'partition_2'])

immunizations = glueContext.create_dynamic_frame_from_options(
    's3', connection_options={'paths': [input_dir + '/immunizations/*/*/*']},
    format="parquet", transformation_ctx="immunizations_csv")
immunizations_dropcol = immunizations.drop_fields([
    'immunizations_patient', 'immunizations_date', 'immunizations_code',
    'immunizations_description', 'immunizations_base_cost',
    'partition_0', 'partition_1', 'partition_2'])

medications = glueContext.create_dynamic_frame_from_options(
    's3', connection_options={'paths': [input_dir + '/medications/*/*/*']},
    format="parquet", transformation_ctx="medications_csv")
medications_dropcol = medications.drop_fields([
    'medications_dispenses', 'medications_start', 'medications_payer',
    'medications_code', 'medications_stop', 'medications_reasoncode',
    'medications_reasondescription', 'medications_description',
    'partition_0', 'partition_1', 'partition_2'])
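
# (Illustrative, not in the original) The null-percentage helpers above can be used to check
# how sparse a table is before deciding which fields to drop, for example:
patients_null_pct = null_perc(patients.toDF())
print(patients_null_pct)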
# Copyright 2016-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
from pyspark.context import SparkContext
from awsglue.context import GlueContext

glue_context = GlueContext(SparkContext())
logger = glue_context.get_logger()

connection_options = {
    "table": "customer",
    "scale": "1",
    "numPartitions": "1",
    "connectionName": "GlueTPCDSConnection"
}

# read data from data source
datasource0 = glue_context.create_dynamic_frame_from_options(
    connection_type="marketplace.spark",
    connection_options=connection_options)

# validate data reading and row count
expected_count = 100000
result_count = datasource0.count()
assert result_count == expected_count
logger.info(
    f'Expected record count: {expected_count}, result record count: {result_count}'
)
class generate_testdata():
    def __init__(self):
        self.glueContext = None
        self.json_obj = None
        self.new_json_obj = None
        self.df_readschema = None
        self.sqlContext = None
        self.df_tgt = None
        self.spark = None
        self.df_customer_table = None
        self.df_order_table = None

    def initialize(self):
        spark_context = SparkContext()
        self.glueContext = GlueContext(spark_context)
        self.sqlContext = SQLContext(spark_context)
        self.spark = self.glueContext.spark_session

    def extract(self):
        args_new = getResolvedOptions(sys.argv, ['cfg_file_path_new'])
        cfg_file_path_new = args_new['cfg_file_path_new']
        file_res = urlparse(cfg_file_path_new)
        s3_resource = boto3.resource('s3')
        file_obj = s3_resource.Object(file_res.netloc, file_res.path.lstrip('/'))
        content = file_obj.get()['Body'].read()
        content_new = content
        self.new_json_obj = json.loads(content_new)
        print('===================================Second Json=======================================')
        print(content_new)

        self.df_order_table = self.glueContext.create_dynamic_frame_from_options(
            connection_type='s3',
            connection_options={"paths": [self.new_json_obj['Schema']['src_file_path']]},
            format='csv',
            format_options={"withHeader": True}).toDF()

        self.df_customer_table = self.glueContext.create_dynamic_frame_from_options(
            connection_type='s3',
            connection_options={"paths": [self.new_json_obj['Schema']['src_file_path_2']]},
            format=self.new_json_obj['Schema']['format'],
            format_options={"withHeader": True}).toDF()

        self.df_order_table.show()
        self.df_customer_table.show()

    def transform(self):
        self.df_order_table = self.df_order_table.withColumn(
            'order_date', to_date(col('order_datetime'), 'yyyy-MM-dd'))
        self.df_order_table = self.df_order_table.withColumn(
            'order_month', func.month(col('order_datetime')))

        df_filter_cust = self.df_customer_table.where(col('age') > 18)

        ### inner join
        df_order_customer = self.df_order_table.join(
            df_filter_cust,
            on=(self.df_order_table['customer_id'] == df_filter_cust['customer_id']),
            how='inner').select(df_filter_cust['customer_id'],
                                self.df_order_table['order_id'],
                                self.df_order_table['order_month'],
                                self.df_order_table['amount'])

        # total sales amount for each month for each customer older than 18
        wind = Window.partitionBy('customer_id', 'order_month')
        df_order_customer = df_order_customer.withColumn(
            'total_sale', func.sum(col('amount')).over(wind))
        df_order_customer.distinct()
        df_order_customer.show()

        ### list the customer_id and the second order_id of customers who placed more than 2 orders in the last 20 days
        wind = Window.partitionBy('customer_id', 'order_date').orderBy(func.col('order_id').asc())
        df_temp = self.df_order_table.withColumn('row', func.row_number().over(wind))
        df_temp = df_temp.withColumn('current_date', to_date(func.current_timestamp(), 'yyyy-MM-dd'))
        df_temp = df_temp.withColumn('diff_days', func.datediff('current_date', 'order_date'))
        df_temp = df_temp.withColumn(
            "diff", when((col('diff_days') <= lit(20)), lit(1)).otherwise(0))
        df_temp = df_temp.where(col('diff') == 1)
        wind = Window.partitionBy('customer_id')
        df_temp = df_temp.withColumn('count', func.count('order_id').over(wind))
        df_temp = df_temp.where((col('count') > 2) & (col('row') == 2))
        df_temp.show()

    def load(self):
        # self.df_tgt.coalesce(1)
        # self.df_tgt.write.mode('Overwrite').format('csv').save(
        #     's3 path',
        #     header='true')
        dynf = DynamicFrame.fromDF(self.df_tgt, self.glueContext, "glue_job")
        self.glueContext.write_dynamic_frame_from_options(
            frame=dynf,
            connection_type="s3",
            connection_options={
                "path": 's3://slf-ca-dev-glue/amp/test/testing_schema/output',
            },
            format="parquet",
            format_options={},
            transformation_ctx="")
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
import pyspark.sql.functions as f
from functools import reduce
import boto3
import json

#initiate glue context
glueContext = GlueContext(SparkContext.getOrCreate())

#read the source file---RPA
rpa_dyf = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={
        "paths": ["s3://smart-ingest-bucket/RPA-source-file/"]
    },
    format="csv",
    format_options={'withHeader': True})

#read the lookup file---RPA-NAVIXA-AE RO
rpa_AE_RO_dyf = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={
        "paths": [
            "s3://smart-ingest-bucket/Dimension-files/Navixa Employee History_RPA.csv"
        ]
    },
    format="csv",
    format_options={'withHeader': True})
def main(args):
    if args.verbose:
        print("Got arguments: %s" % (args))

    glue_client = boto3.client("glue", region_name=args.region)
    s3_client = boto3.client("s3", region_name=args.region)

    # Verify source DB and table exist
    if validate_db(args.athenaDatabase, glue_client=glue_client) and args.verbose:
        print("Validated source database %s exists." % (args.athenaDatabase))
    if (validate_table(args.athenaDatabase, args.athenaTable, glue_client=glue_client)
            and args.verbose):
        print("Validated source table %s exists." % (args.athenaTable))

    # Verify input and output buckets exist and are accessible.
    for bucket in [args.outputBucket, args.inputBucket]:
        if validate_bucket(bucket, s3_client=s3_client) and args.verbose:
            print("Verified bucket s3://%s exists and is accessible." % (args.outputBucket))

    # Use latest file in bucket that matches prefix string as input.
    input_csv, latest_dt = get_latest_file(args.inputBucket,
                                           prefix=args.inputPrefix,
                                           s3_client=s3_client)
    if not input_csv:
        raise Exception(
            "Found no candidate CSV files in bucket %s with prefix %s." % (
                args.inputBucket,
                "(no prefix)" if not args.inputPrefix else args.inputPrefix,
            ))
    print("Got latest CSV file s3://%s/%s with write time %s." %
          (args.inputBucket, input_csv, latest_dt))

    sc = SparkContext()
    gc = GlueContext(sc)
    sparkSession = gc.spark_session
    job = Job(gc)
    job.init(args.JOB_NAME, vars(args))
    sparkSession.udf.register("json_clean", json_clean)

    # For requester-pays buckets
    sparkSession._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true")
    gc._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true")

    # Get minimum hour for existing data
    prelim_min_hour = 0
    aligner = DAILY_ALIGNER if args.fullDays else HOURLY_ALIGNER
    # * 1000000 needed since start_time is in microseconds
    prelim_min_hour = (((START_TIME - (3600 * args.maxHoursAgo)) // aligner) * aligner) * 1000000

    # Get CSV file contents, filtering out lines that don't fall within --maxHoursAgo.
    input_df = (gc.create_dynamic_frame_from_options(
        "s3",
        {"paths": ["s3://%s/%s" % (args.inputBucket, input_csv)]},
        "csv",
        {"withHeader": True},
    ).toDF().select("*"))
    print("Input DF count: %s" % (input_df.count()))
    csv_prelim_df = input_df.filter("cast(start_time as bigint) >= %s" % (prelim_min_hour))
    print("Date-bounded DF count: %s" % (csv_prelim_df.count()))

    # If --hourly is not set, query for policy hits for the current hour
    # and the previous 24 hours.
    # When --hourly is set, query for policy hits only for the current
    # and previous hour. (Querying for the previous hour is necessary
    # as the previous CSV generation job may have run in the middle of the
    # hour, meaning new events for that hour may exist.)
    # (now - (now % 3600)) = start of the current hour
    # ((now - (now % 3600)) - 3600) = start of the previous hour
    hours_ago = 1 if args.hourly else 24
    increm_min_hour_sec = (START_TIME - (START_TIME % 3600)) - (3600 * hours_ago)
    increm_min_hour_micro = increm_min_hour_sec * 1000000

    # When nonStrict is set, query for the latest hourly epoch in the input file
    # as well as all subsequent hourly epochs. This query may be more expensive
    # to complete, but allows for the possibility of filling in gaps if previous
    # runs of the job failed or were paused.
    if args.nonStrict:
        # We may need to re-query for transactions in the latest epoch depending on
        # when the previous query was run. Filter these rows out as well, then
        # set newest_epoch to our minimum hour epoch.
        last_seen_epoch = (input_df.selectExpr("cast(start_time as bigint) as ts")
                           .agg({"ts": "max"}).collect().pop()[0])
        last_seen_hour = ((last_seen_epoch // 1000000) // aligner) * aligner

        # Make sure we don't go outside our --maxHoursAgo range.
        # This check is necessary as increm_min_hour_sec is used in the pushdown
        # predicate when querying against the Athena source table/view.
        increm_min_hour_sec = max(last_seen_hour, increm_min_hour_sec)
        increm_min_hour_micro = increm_min_hour_sec * 1000000

    print("Set minimum hour in seconds: %s" % (increm_min_hour_sec))

    csv_hits_in_range = csv_prelim_df.filter(
        "cast(start_time as bigint) < %s" % (increm_min_hour_micro))
    print("Total CSV hits in timeframe of interest: %s" % (csv_hits_in_range.count()))

    # Set timeframe for pushdown predicate
    pushdown = "(hour >= %s)" % (increm_min_hour_sec)

    # Get table contents starting from minimum hour settings.
    raw_data = gc.create_dynamic_frame.from_catalog(
        database=args.athenaDatabase,
        table_name=args.athenaTable,
        transformation_ctx="raw_data",
        push_down_predicate=pushdown,
    )

    # Determine what fields we should extract based on table definition.
    # Deriving the fields from the table schema is preferable to inferring
    # it from the underlying parquet because the two may differ.
    tmp_schema = get_table_schema(args.athenaDatabase, args.athenaTable,
                                  glue_client=glue_client)

    # Verify we only have one matching column on which to explode.
    if not [col[0] for col in tmp_schema].count(args.policyType) == 1:
        raise Exception(
            "Wrong number of matching %s columns found in schema: %s" %
            (args.policyType, tmp_schema))

    # Sub out the policies/parent_policies field for an explode expression.
    # Lightly modify other fields to promote clean and consistent writes to CSV file.
    schema = []
    for col in tmp_schema:
        if col[0] == args.policyType:
            schema.append("explode(%s) as policy" % (col[0]))
        elif "map" in col[1].lower() or "struct" in col[1].lower():
            schema.append("json_clean(%s) as %s" % (col[0], col[0]))
        else:
            schema.append("cast(%s as string)" % (col[0]))

    # Get targeted policies/parent_policies.
    # Should be passed to SparkSQL as quoted strings.
    pol_arr = ", ".join([
        "'%s'" % (pol) for pol in args.policyStrings.split(args.delimiter) if pol
    ])

    # Filter table contents according to policy hits.
    # A non-null policies field implies a non-null parent_policies field
    # (and vice versa), so it's OK to filter on just one.
    new_hits = (raw_data.toDF().filter("policies is not NULL")
                .selectExpr(*schema).filter("policy in (%s)" % (pol_arr)))
    print("New policy triggers found since last job: %s" % (new_hits.count()))

    # Combine newly collected policy hits with dataframe of previous CSV contents.
    write_df = new_hits.union(csv_hits_in_range).orderBy("start_time").coalesce(1)

    uniq = hash_key(args.salt, args.ordinal, args.subscriber, args.receiver)
    s3_loc = "s3://%s" % (os.path.join(args.outputBucket, args.outputDir, uniq))
    if args.verbose:
        print("S3 Results Location: %s" % s3_loc)
    write_df.write.option("quoteAll", True).csv(s3_loc, header=True)

    # Rename output file, if requested.
    if args.outputFilename:
        rename_resp = rename_file(
            args.outputBucket,
            args.outputFilename,
            prefix=args.outputDir,
            dont_preserve_dir=args.dontPreserveOutputDir,
            keep_orig=args.keepOrigOnRename,
        )
        if rename_resp and args.verbose:
            print("Renamed file to %s/%s." % (args.outputDir, args.outputFilename))

    job.commit()
from awsglue.transforms import *
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)

# Please update the values in the options to connect to your own data source
options = {
    "tableName": "all_log_streams",
    "schemaName": "/aws-glue/jobs/output",
    "connectionName": "my-connection"
}

datasource = glueContext.create_dynamic_frame_from_options(
    connection_type="custom.dynamodb",
    connection_options=options,
    transformation_ctx="datasource")

datasource.show()
job.commit()
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from pyspark.sql.functions import lit
from datetime import datetime
from pyspark.sql.functions import col, udf, trim, split, substring, to_date
from pyspark.sql.types import DateType
import pyspark.sql.functions as f
from functools import reduce
from pyspark.sql.types import *
import boto3
import json

#initiate glue context
glueContext = GlueContext(SparkContext.getOrCreate())

##Loading the source files from s3 bucket and converting the corresponding dynamic dataframes to apache spark data frames
Jas_Labour_Paid_dyf = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://smart-ingest-bucket/Quantum-source-file/jas_labor_paid_export.csv"]},
    format="csv", format_options={'withHeader': True})
Jas_Labour_Paid_df = Jas_Labour_Paid_dyf.toDF()

Jas_Labour_Production_dyf = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://smart-ingest-bucket/Quantum-source-file/jas_labor_production_export.csv"]},
    format="csv", format_options={'withHeader': True})
Jas_Labour_Production_df = Jas_Labour_Production_dyf.toDF()

##Performing Transformations on Jas_Labour_Paid_df
##Jas_Labour_Paid_df.printSchema()
Jas_Labour_Paid_df = Jas_Labour_Paid_df.select(
    col('EXTERNAL_ID').alias('EMPLOYEE_NUMBER'),
    col('USER_NAME').alias('EMPLOYEE_NAME'), 'ATTENTION',
    trim(split(col('DEPT_NAME'), '-')[0]).alias('PROGRAM_DESC'),
    to_date(substring(col('TIME_START'), 1, 9), 'dd-MMM-yy').alias('TRANSACTION_DATE'),
    col('HOURS_OVER_TIME').cast(DoubleType()).alias('AVAIL_OT_HRS'),
    col('HOURS_TOTAL').cast(DoubleType()).alias('HOURS_TOTAL'), 'STATUS',
    col('HOURS_INDIRECT').cast(DoubleType()).alias('HOURS_INDIRECT'),
    col('HOURS_TIMED').cast(DoubleType()).alias('HOURS_TIMED'), 'TAC_CODE')
## @params: [TempDir, JOB_NAME]
args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME', 'db_name',
                                     'redshift_connection', 'cis_bucket', 'geo_bucket'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

"""
Copy CIS Demo Data to Redshift
"""
cis_datasource = glueContext.create_dynamic_frame_from_options(
    "s3",
    {"paths": ["s3://{}".format(args['cis_bucket'])], "recurse": True},
    format="csv",
    format_options={
        "withHeader": False,
        "separator": ";"
    })

cis_mapping = ApplyMapping.apply(
    frame=cis_datasource,
    mappings=[("col0", "string", "customer_id", "string"),
              ("col1", "string", "name", "string"),
              ("col2", "long", "zip", "long"),
              ("col3", "string", "city", "string"),
              ("col4", "string", "state", "string"),
              ("col5", "string", "street", "string"),
              ("col6", "string", "phone", "string"),
              ("col7", "string", "meter_id", "string"),
              ("col8", "double", "lat", "double"),
              ("col9", "double", "long", "double")],
    transformation_ctx="cis_mapping")
import sys
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

args = getResolvedOptions(sys.argv, ["JOB_NAME", "table_name", "role_arn"])
table_name = args["table_name"]
role_arn = args["role_arn"]

glue_context = GlueContext(SparkContext.getOrCreate())
job = Job(glue_context)
job.init(args["JOB_NAME"], args)

dyf = glue_context.create_dynamic_frame_from_options(
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": "us-west-2",
        "dynamodb.input.tableName": table_name,
        "dynamodb.sts.roleArn": role_arn
    })
dyf.show()

glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.output.tableName": table_name,
        "dynamodb.throughput.write.percent": "0.5"
    })

job.commit()
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
from awsglue.transforms import *
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)

# Please update the values in the options to connect to your own data source
options = {
    "sfDatabase": "snowflake_sample_data",
    "sfSchema": "PUBLIC",
    "sfWarehouse": "WORKSHOP_123",
    "dbtable": "lineitem",
    "connectionName": "my-connection"  # please refer to Glue Studio Create Custom Connector doc to create a connection
}

datasource = glueContext.create_dynamic_frame_from_options(
    connection_type="custom.spark",  # for marketplace workflow, use marketplace.spark
    connection_options=options,
    transformation_ctx="datasource")

datasource.show()
job.commit()
import sys
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.sql.functions import *
from awsglue.dynamicframe import DynamicFrame
import boto3

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

### Print the changed records
cafePrintData = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    format="parquet",
    connection_options={"paths": ["s3://cafe-poc/cafe_FL_PoC/"]})
cafePrintData_SDF = cafePrintData.toDF()
cafePrintData_SDF.createOrReplaceTempView("cafe_new_view")

cafe_df = spark.sql("""
    SELECT * FROM cafe_new_view AS p
    WHERE p.cafeKey IN (2642, 2645, 2647)
""")
# WHERE p.cafeKey in (2642,2644,2645,2646)
print(" S3 cafe Count: ", cafe_df.count())
cafe_df.show(truncate=False)