def write(database, table, df, jdbc_conf, cut_off_percentage, repartition=False, s3path='', format='csv'):
    """
    Python wrapper function for the Scala class CustomJDBCUtils.

    @params:
    database - string name of the database to write to
    table - string name of the table being written to (format = 'schema.table')
    df - pyspark.sql.DataFrame to be written
    jdbc_conf - JDBC configuration for the connection, extracted from glueContext.extract_jdbc_conf()
    cut_off_percentage - integer representing the bad-records threshold (e.g. 50 = 50%)
    repartition - reserved flag, currently unused (default = False)
    s3path - path in S3 where the bad records should be written (optional)
    format - file format to write out the bad records as (default = 'csv')
    """
    sc = SparkContext.getOrCreate()
    glueContext = GlueContext(sc)
    jvm = sc._jvm
    jsc = sc._jsc
    jdf = df._jdf
    if s3path != "":
        df = DataFrame(
            jvm.com.slf.CustomJDBCUtils.write(database, table, jdf, jdbc_conf, cut_off_percentage),
            jsc)
        try:
            df.repartition(1).write.format(format).save(s3path)
        except Exception:
            # Spark write failed; fall back to the GlueContext writer.
            glueContext.write_dynamic_frame_from_options(
                frame=DynamicFrame.fromDF(df, glueContext, 'dynamic_frame'),
                connection_type='s3',
                connection_options={'path': s3path},
                format=format)
        return df
    else:
        return DataFrame(
            jvm.com.slf.CustomJDBCUtils.write(database, table, jdf, jdbc_conf, cut_off_percentage),
            jsc)
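# A minimal usage sketch of the wrapper above (assumes the same imports as the
# wrapper itself). The connection name, database, table, DataFrame, and S3 path
# are hypothetical placeholders, not from the original source.
sc = SparkContext.getOrCreate()
glue_ctx = GlueContext(sc)
orders_df = glue_ctx.spark_session.createDataFrame(
    [(1, 'widget', 9.99)], ['order_id', 'item', 'amount'])
jdbc_conf = glue_ctx.extract_jdbc_conf('my-jdbc-connection')  # assumed Glue connection name
result_df = write(
    database='sales_db',                   # hypothetical target database
    table='public.orders',                 # hypothetical 'schema.table'
    df=orders_df,
    jdbc_conf=jdbc_conf,
    cut_off_percentage=50,                 # bad-records threshold: 50 = 50%
    s3path='s3://my-bucket/bad-records/')  # rejected records written here as CSV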
def test_duplicate_rows_in_target_are_deduped(self, mock_job):
    glue_context = GlueContext(self.spark)
    self.__mock_staging(glue_context, [
        {"id": "02", "firstname": "Bob from staging", "modifieddate": "2019-01-02T00:40:32Z"}
    ])
    self.__mock_existing_target(glue_context, [
        {"id": "01", "first_name": "John", "modified_date": datetime.fromisoformat("2019-01-01T00:40:32+00:00")},
        {"id": "02", "first_name": "Bob", "modified_date": datetime.fromisoformat("2019-01-01T00:40:32+00:00")},
        {"id": "01", "first_name": "Bill", "modified_date": datetime.fromisoformat("2019-01-02T00:40:32+00:00")}
    ])
    glue_context.write_dynamic_frame_from_options = MagicMock()
    glue_context.purge_s3_path = MagicMock()

    merge_into_customer_dim.main(self.argv, glue_context, mock_job)

    expected_df = self.spark.createDataFrame([
        ["01", "Bill", None, None, None, datetime.fromisoformat("2019-01-02T00:40:32+00:00")],
        ["02", "Bob from staging", None, None, None, datetime.fromisoformat("2019-01-02T00:40:32+00:00")]
    ], schema=self.output_schema)
    write_args, write_kwargs = glue_context.write_dynamic_frame_from_options.call_args
    self.assert_dataframe_equal(write_kwargs['frame'].toDF(), expected_df, ["id"])
def test_the_target_path_is_purged(self, mock_job):
    glue_context = GlueContext(self.spark)
    self.__mock_staging(glue_context, [
        {
            "id": "01",
            "firstname": "John",
            "lastname": "Smith",
            "birthdate": "1990-01-01",
            "zipcode": "12345",
            "modifieddate": "2019-01-01T00:40:32Z",
        }
    ])
    self.__mock_existing_target(glue_context, [])
    glue_context.write_dynamic_frame_from_options = MagicMock()
    glue_context.purge_s3_path = MagicMock()

    merge_into_customer_dim.main(self.argv, glue_context, mock_job)

    glue_context.purge_s3_path.assert_called_with(
        s3_path="s3://ut_target_path",
        options={"retentionPeriod": 0}
    )
def test_no_existing_output(self, mock_job):
    glue_context = GlueContext(self.spark)
    self.__mock_staging(glue_context, [
        {
            "id": "01",
            "firstname": "John",
            "lastname": "Smith",
            "birthdate": "1990-01-01",
            "zipcode": "12345",
            "modifieddate": "2019-01-01T00:40:32Z",
        }
    ])
    self.__mock_existing_target(glue_context, [])
    glue_context.write_dynamic_frame_from_options = MagicMock()
    glue_context.purge_s3_path = MagicMock()

    merge_into_customer_dim.main(self.argv, glue_context, mock_job)

    expected_df = self.spark.createDataFrame([
        ["01", "John", "Smith", date(1990, 1, 1), "12345", datetime.fromisoformat("2019-01-01T00:40:32+00:00")]
    ], schema=self.output_schema)
    write_args, write_kwargs = glue_context.write_dynamic_frame_from_options.call_args
    self.assert_dataframe_equal(write_kwargs['frame'].toDF(), expected_df, ["id"])
job = Job(glue_context)
job.init(args['JOB_NAME'], args)

# Job input parameters
project = args['project']
stage = args['stage']
env = Env.get_config(project, stage)
incoming_path = "s3://{}/{}/food".format(env["s3_data_lake"], env["incoming_folder"])
primary_path = "s3://{}/{}/food".format(env["s3_data_lake"], env["primary_folder"])

# Read data
datasource0 = glue_context.create_dynamic_frame_from_options(
    connection_type='s3',
    connection_options={"paths": [incoming_path]},
    format="csv",
    format_options={"withHeader": True},
    transformation_ctx="readctx")
datasource0.printSchema()
datasource0.show(2)

# Mask numbers in the data with _number_
bpSamplePySpark = BpSamplePySpark(spark, env)
df = bpSamplePySpark.process_data(datasource0.toDF())
dyf = DynamicFrame.fromDF(df, glue_context, 'dyf')

# Write data
glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type="s3",
    connection_options={"path": primary_path},
    format="parquet",
    transformation_ctx="writectx")

job.commit()
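# BpSamplePySpark.process_data is defined elsewhere; the following is only a
# minimal sketch of a masking step like the one described above (replacing
# digit runs with the literal token _number_), assuming it applies to every
# string column. The helper name mask_numbers is illustrative, not from the
# original source.
from pyspark.sql.functions import regexp_replace

def mask_numbers(df):
    # Replace each run of digits with _number_ in all string-typed columns.
    for name, dtype in df.dtypes:
        if dtype == 'string':
            df = df.withColumn(name, regexp_replace(name, r'\d+', '_number_'))
    return df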
import sys

from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

args = getResolvedOptions(sys.argv, ["JOB_NAME", "table_name", "role_arn"])
table_name = args["table_name"]
role_arn = args["role_arn"]

glue_context = GlueContext(SparkContext.getOrCreate())
job = Job(glue_context)
job.init(args["JOB_NAME"], args)

# Read from the DynamoDB table, assuming the given cross-account role.
dyf = glue_context.create_dynamic_frame_from_options(
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": "us-west-2",
        "dynamodb.input.tableName": table_name,
        "dynamodb.sts.roleArn": role_arn
    })
dyf.show()

# Write back to a table of the same name, throttled to 50% of write capacity.
glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.output.tableName": table_name,
        "dynamodb.throughput.write.percent": "0.5"
    })

job.commit()
class generate_testdata():

    def __init__(self):
        self.glueContext = None
        self.json_obj = None
        self.new_json_obj = None
        self.df_readschema = None
        self.sqlContext = None
        self.df_tgt = None
        self.spark = None
        self.df_customer_table = None
        self.df_order_table = None

    def initialize(self):
        spark_context = SparkContext()
        self.glueContext = GlueContext(spark_context)
        self.sqlContext = SQLContext(spark_context)
        self.spark = self.glueContext.spark_session

    def extract(self):
        args_new = getResolvedOptions(sys.argv, ['cfg_file_path_new'])
        cfg_file_path_new = args_new['cfg_file_path_new']
        file_res = urlparse(cfg_file_path_new)
        s3_resource = boto3.resource('s3')
        file_obj = s3_resource.Object(file_res.netloc, file_res.path.lstrip('/'))
        content = file_obj.get()['Body'].read()
        self.new_json_obj = json.loads(content)
        print('=================================== Second JSON =======================================')
        print(content)

        self.df_order_table = self.glueContext.create_dynamic_frame_from_options(
            connection_type='s3',
            connection_options={"paths": [self.new_json_obj['Schema']['src_file_path']]},
            format='csv',
            format_options={"withHeader": True}).toDF()
        self.df_customer_table = self.glueContext.create_dynamic_frame_from_options(
            connection_type='s3',
            connection_options={"paths": [self.new_json_obj['Schema']['src_file_path_2']]},
            format=self.new_json_obj['Schema']['format'],
            format_options={"withHeader": True}).toDF()
        self.df_order_table.show()
        self.df_customer_table.show()

    def transform(self):
        self.df_order_table = self.df_order_table.withColumn(
            'order_date', to_date(col('order_datetime'), 'yyyy-MM-dd'))
        self.df_order_table = self.df_order_table.withColumn(
            'order_month', func.month(col('order_datetime')))
        df_filter_cust = self.df_customer_table.where(col('age') > 18)

        # Inner join orders to customers older than 18
        df_order_customer = self.df_order_table.join(
            df_filter_cust,
            on=(self.df_order_table['customer_id'] == df_filter_cust['customer_id']),
            how='inner').select(df_filter_cust['customer_id'],
                                self.df_order_table['order_id'],
                                self.df_order_table['order_month'],
                                self.df_order_table['amount'])

        # Total sales amount per month for each customer older than 18
        wind = Window.partitionBy('customer_id', 'order_month')
        df_order_customer = df_order_customer.withColumn(
            'total_sale', func.sum(col('amount')).over(wind))
        df_order_customer = df_order_customer.distinct()
        df_order_customer.show()

        # List the customer_id and second order_id of customers who placed
        # more than 2 orders in the last 20 days
        wind = Window.partitionBy('customer_id', 'order_date').orderBy(func.col('order_id').asc())
        df_temp = self.df_order_table.withColumn('row', func.row_number().over(wind))
        df_temp = df_temp.withColumn('current_date', to_date(func.current_timestamp(), 'yyyy-MM-dd'))
        df_temp = df_temp.withColumn('diff_days', func.datediff('current_date', 'order_date'))
        df_temp = df_temp.withColumn('diff', when((col('diff_days') <= lit(20)), lit(1)).otherwise(0))
        df_temp = df_temp.where(col('diff') == 1)
        wind = Window.partitionBy('customer_id')
        df_temp = df_temp.withColumn('count', func.count('order_id').over(wind))
        df_temp = df_temp.where((col('count') > 2) & (col('row') == 2))
        df_temp.show()

    def load(self):
        # self.df_tgt.coalesce(1)
        # self.df_tgt.write.mode('Overwrite').format('csv').save(
        #     's3 path',
        #     header='true')
        dynf = DynamicFrame.fromDF(self.df_tgt, self.glueContext, "glue_job")
        self.glueContext.write_dynamic_frame_from_options(
            frame=dynf,
            connection_type="s3",
            connection_options={
                "path": 's3://slf-ca-dev-glue/amp/test/testing_schema/output',
            },
            format="parquet",
            format_options={},
            transformation_ctx="")
job.init(args["JOB_NAME"], args)

dyf = glue_context.create_dynamic_frame_from_options(
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": region,
        "dynamodb.splits": str(ddb_split),
        "dynamodb.throughput.read.percent": "1.2",
        "dynamodb.input.tableName": source_ddb_name
    })
dyf.show()

# Assume a role in the target account to obtain temporary credentials for the write.
sts_client = boto3.client('sts')
sts_response = sts_client.assume_role(RoleArn=target_role_arn,
                                      RoleSessionName='assume-role')
ddb_key = sts_response['Credentials']['AccessKeyId']
ddb_secret = sts_response['Credentials']['SecretAccessKey']
ddb_token = sts_response['Credentials']['SessionToken']

glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": region,
        "dynamodb.output.tableName": target_ddb_name,
        "dynamodb.awsAccessKeyId": ddb_key,
        "dynamodb.awsSecretAccessKey": ddb_secret,
        "dynamodb.awsSessionToken": ddb_token
    })

job.commit()
# Import necessary dependencies
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import sys

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Extract the source S3 file as a Glue dynamic frame using the Data Catalog.
dynf_s3_src = glueContext.create_dynamic_frame.from_catalog(
    database='ctg_db_s3',
    table_name='ctg_tbl_nm_corresponding_to_s3_file',
    transformation_ctx="s3_src_extract")

# Load the S3 file using the Glue API.
glueContext.write_dynamic_frame_from_options(
    frame=dynf_s3_src,
    connection_type="s3",
    connection_options={"path": 's3://bucket_name/path'},
    format="parquet",
    format_options={},
    transformation_ctx="")

job.commit()
import sys

from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

args = getResolvedOptions(sys.argv, ["JOB_NAME"])
glue_context = GlueContext(SparkContext.getOrCreate())
job = Job(glue_context)
job.init(args["JOB_NAME"], args)

dyf = glue_context.create_dynamic_frame.from_options(
    connection_type="dynamodb",
    connection_options={
        "dynamodb.input.tableName": "logfile_scan",
        "dynamodb.throughput.read.percent": "1.0",
        "dynamodb.splits": "100"
    })
print(dyf.getNumPartitions())

glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.output.tableName": "logfile_scan_2",
        "dynamodb.throughput.write.percent": "1.0"
    })

job.commit()
dyf = glue_context.create_dynamic_frame.from_options(
    connection_type='dynamodb',
    connection_options={
        'dynamodb.input.tableName': 'NatureRemo',
        'dynamodb.splits': '72'  # Should be (DPU-1)*8, e.g. 72 for a 10-DPU job.
    })
dyf.printSchema()
dyf = dyf.resolveChoice(specs=[('value', 'cast:double')])
dyf = dyf.repartition(1)  # Modify this number to your preferred file count.
glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type='s3',
    connection_options={'path': "s3://example-bucket/glue/ddb2s3/"},
    format='parquet')

# Specify partitionKeys if you want to write data with Hive partitioning
# based on the 'year', 'month', and 'day' columns:
# glue_context.write_dynamic_frame_from_options(
#     frame=dyf,
#     connection_type='s3',
#     connection_options={
#         'path': "s3://example-bucket/glue/ddb2s3/",
#         'partitionKeys': ["year", "month", "day"]
#     },
#     format='parquet')

job.commit()
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Initialize the dynamic frame using the Glue Data Catalog DB and table.
# Replace <GlueDatabaseName> and <GlueTableName> with your Glue DB and table names.
Source = glueContext.create_dynamic_frame.from_catalog(
    database="<GlueDatabaseName>",
    table_name="<GlueTableName>",
    transformation_ctx="Source")

# Map the source field names and data types to target values. The target values
# should match the source DynamoDB table values exactly.
Mapped = ApplyMapping.apply(
    frame=Source,
    mappings=[
        ("item.ID.S", "string", "ID", "string"),
        ("item.date.M", "map", "date", "map"),
        ("item.location.M.lng.S", "string", "location.lng", "string"),
        ("item.location.M.lat.S", "string", "location.lat", "string")],
    transformation_ctx="Mapped")

# Write to the target DynamoDB table. Replace <TargetRegion> and
# <TargetDynamoDBTable> with your region and table name.
glueContext.write_dynamic_frame_from_options(
    frame=Mapped,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": "<TargetRegion>",
        "dynamodb.output.tableName": "<TargetDynamoDBTable>",
        "dynamodb.throughput.write.percent": "1.0"
    })

job.commit()
# "N": "0" # } # } # } mapped = ApplyMapping.apply( frame=source, mappings=[("item.Code.S", "string", "Code", "string"), ("item.Title.S", "string", "Title", "string"), ("item.Date.S", "string", "Date", "string"), ("item.Comments.S", "string", "Comments", "string"), ("item.Author.S", "string", "Author", "string"), ("item.LastPushDateTime.S", "string", "LastPushDateTime", "string"), ("item.Link.S", "string", "Link", "string"), ("item.ID.N", "integer", "ID", "integer"), ("item.Board.S", "string", "Board", "string"), ("item.PushSum.N", "integer", "PushSum", "integer")], transformation_ctx="mapped") mapped.show() glueContext.write_dynamic_frame_from_options( frame=mapped, connection_type="dynamodb", connection_options={ "dynamodb.output.tableName": "articles", "dynamodb.throughput.write.percent": "1.0" }) job.commit()