import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

## @type: DataSource
## @args: [connection_type = "s3", format = "parquet", connection_options = {"paths": ["s3://bond-s3-forspark/invoice-application/"], "recurse": True}, transformation_ctx = "DataSource0"]
## @return: DataSource0
## @inputs: []
DataSource0 = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    format="parquet",
    connection_options={"paths": ["s3://bond-s3-forspark/invoice-application/"], "recurse": True},
    transformation_ctx="DataSource0")

## @type: SqlCode
## @args: [sqlAliases = {"myDataSource": DataSource0}, sqlName = SqlQuery0, transformation_ctx = "Transform0"]
## @return: Transform0
## @inputs: [dfc = DataSource0]
Transform0 = sparkSqlQuery(
    glueContext,
    query=SqlQuery0,
    mapping={"myDataSource": DataSource0},
    transformation_ctx="Transform0")

## @type: DataSink
## @args: [connection_type = "s3", catalog_database_name = "frillmart", format = "glueparquet", connection_options = {"path": "s3://bond-s3-forspark/invoice-sandbox/", "partitionKeys": ["Country"], "enableUpdateCatalog": True, "updateBehavior": "UPDATE_IN_DATABASE"}, catalog_table_name = "invoice_sandbox", transformation_ctx = "DataSink0"]
## @return: DataSink0
## @inputs: [frame = Transform0]
DataSink0 = glueContext.getSink(
    path="s3://bond-s3-forspark/invoice-sandbox/",
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    partitionKeys=["Country"],
    enableUpdateCatalog=True,
    transformation_ctx="DataSink0")
DataSink0.setCatalogInfo(catalogDatabase="frillmart", catalogTableName="invoice_sandbox")
DataSink0.setFormat("glueparquet")
DataSink0.writeFrame(Transform0)
job.commit()
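# Note: the script above calls sparkSqlQuery() and references SqlQuery0 without defining
# either. The sketch below is a minimal, hedged reconstruction of the helper in the shape
# Glue Studio normally generates; the SQL string assigned to SqlQuery0 is only a
# placeholder, since the actual query text does not appear in the original.
from awsglue.dynamicframe import DynamicFrame

SqlQuery0 = "SELECT * FROM myDataSource"  # placeholder query, not from the original


def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    # Register each DynamicFrame under its SQL alias, run the query with Spark SQL,
    # and wrap the result back into a DynamicFrame.
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)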
]).toDF().dropDuplicates(['archiveid'])

rownum = inventory.withColumn(
    "row_num",
    row_number().over(
        Window.orderBy(inventory['creationdate'], inventory['archiveid'])).cast("long"))
merged = rownum.join(mapped, "archiveid", how='left_outer')
frame = DynamicFrame.fromDF(merged, glueContext, "merged")


def transform(rec):
    # Assign a partition bucket from the row number and prefer a non-empty
    # override description when one is present.
    rec["part"] = rec["row_num"] // partition_size
    if rec["override"] and rec["override"].strip():
        rec["archivedescription"] = rec["override"]
    rec.pop('override', None)
    return rec


trans0 = Map.apply(frame=frame, f=transform)

sink = glueContext.getSink(connection_type="s3",
                           path='s3://' + STAGING_BUCKET + '/partitioned/',
                           enableUpdateCatalog=True,
                           partitionKeys=["part"])
sink.setFormat("glueparquet")
sink.setCatalogInfo(catalogDatabase=DATABASE, catalogTableName=OUTPUT_TABLE)
sink.writeFrame(trans0)
job.commit()
field.dataType.typeName()))

dyf = dyf.apply_mapping(mapping)

# Add partition columns derived from the timestamp column
df = dyf.toDF()
if 'year' in partition_keys:
    df = df.withColumn('year', year(timestamp_column_name))
if 'month' in partition_keys:
    df = df.withColumn('month', month(timestamp_column_name))
if 'day' in partition_keys:
    df = df.withColumn('day', dayofmonth(timestamp_column_name))
if 'hour' in partition_keys:
    df = df.withColumn('hour', hour(timestamp_column_name))
if 'minute' in partition_keys:
    df = df.withColumn('minute', minute(timestamp_column_name))
# drop() returns a new DataFrame, so reassign to actually remove the temp column
df = df.drop(col(tmp_timestamp_column_name))
dyf = DynamicFrame.fromDF(df, glue_context, "add_partitions")

# Write DynamicFrame to S3 in glueparquet format
sink = glue_context.getSink(connection_type="s3",
                            path=output_path,
                            enableUpdateCatalog=True,
                            partitionKeys=partition_keys)
sink.setFormat("glueparquet")
sink.setCatalogInfo(catalogDatabase=output_database, catalogTableName=output_table)
sink.writeFrame(dyf)
job.commit()
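# Note: the previous snippet is cut off mid-expression; its truncated head presumably
# builds the "mapping" list passed to apply_mapping() from the incoming schema. A minimal,
# hedged sketch of one common way to build such a spec (an identity mapping over every
# field) is shown below; only apply_mapping itself comes from the snippet, the
# construction is an illustration.
mapping = [(field.name, field.dataType.typeName(),
            field.name, field.dataType.typeName())
           for field in dyf.toDF().schema.fields]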
try:
    glue.create_table(DatabaseName=output_database, TableInput=table_to_create)
except Exception as e:
    print(
        f"Failed to create destination table. If {output_database}.{output_table} table exists, please remove it."
    )
    print(e)
    raise

# Begin Lake Formation transaction
tx_id = glue_context.start_transaction(read_only=False)

# Write DynamicFrame into Lake Formation governed table using transaction
sink = glue_context.getSink(connection_type="s3",
                            path=output_path,
                            enableUpdateCatalog=True,
                            partitionKeys=partition_key_names,
                            transactionId=tx_id)
sink.setFormat("glueparquet")
sink.setCatalogInfo(catalogDatabase=output_database, catalogTableName=output_table)
try:
    sink.writeFrame(dyf)
    glue_context.commit_transaction(tx_id)
except Exception:
    glue_context.cancel_transaction(tx_id)
    raise

job.commit()
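# Note: the table_to_create payload passed to glue.create_table() above is not shown in
# the original. A hedged sketch of what a Lake Formation governed-table TableInput might
# look like for boto3's glue.create_table; the column list is hypothetical, while
# output_table, output_path and partition_key_names come from the snippet above.
import boto3

glue = boto3.client("glue")
table_to_create = {
    "Name": output_table,
    "TableType": "GOVERNED",
    "StorageDescriptor": {
        "Columns": [{"Name": "id", "Type": "string"}],  # hypothetical column list
        "Location": output_path,
        "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
        "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
        "SerdeInfo": {
            "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
        },
    },
    "PartitionKeys": [{"Name": k, "Type": "string"} for k in partition_key_names],
}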
)

# Write a DynamicFrame to S3 as Parquet
glueContext.write_dynamic_frame.from_options(
    frame=l_history,
    connection_type="s3",
    connection_options={"path": "s3://leeys-virginia/output-dir/legislator_history"},
    format="parquet")

# Write a DynamicFrame to Redshift through a catalog connection
response = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=df_mapped,
    catalog_connection="red1",
    connection_options={"dbtable": "public.test1", "database": "leeysdb"},
    redshift_tmp_dir="s3://leeys-virginia/glue/temp-dir/")

# Same write, but clear the target table first and tolerate up to 2 load errors
response = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=df_mapped,
    catalog_connection="red1",
    connection_options={"dbtable": "public.test1",
                        "database": "leeysdb",
                        "preactions": "delete from public.test1;",
                        "extracopyoptions": "MAXERROR 2"},
    redshift_tmp_dir="s3://leeys-virginia/glue/temp-dir/")

# Partitioned Parquet write to S3
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=addpartitionfield,
    connection_type="s3",
    connection_options={"path": "s3://t1-saas-insights-dev-ap-southeast-2/glue/processed",
                        "partitionKeys": ["date_p"]},
    format="parquet",
    transformation_ctx="datasink4")

#### Use getSink rather than write_dynamic_frame
data_sink = glueContext.getSink(connection_type="s3",
                                path="s3://leeys-virginia/glue/temp/",
                                format="json")
# Alternatively, set the format afterwards: data_sink.setFormat("json")
data_sink.writeFrame(myFrame)
import sys
from datetime import datetime, timezone

from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import col, from_unixtime, to_date, to_timestamp, unix_timestamp


def main():
    ## @params: [JOB_NAME, raw_db_name, clean_db_name, source_entity_name, target_entity_name, datetime_column, date_column, partition_column, output_bucket_name]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'datetime_column', 'date_column',
        'partition_column', 'output_bucket_name'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    datetime_column = args['datetime_column']
    date_column = args['date_column']
    output_bucket_name = args['output_bucket_name']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name
    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c)
        if dict(input_data.dtypes)[c] == 'double' else col(c)
        for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn(
        'etl_processing_datetime',
        unix_timestamp(f.lit(processing_start_datetime),
                       'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column,
                    f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    ## @type: DataSink
    ## @args: [connection_type = "s3", connection_options = {"path": "s3://<output_bucket_name>/clean/<entity_name>", "enableUpdateCatalog": "True", "updateBehavior": "UPDATE_IN_DATABASE", "partitionKeys": "[<partition_key>]"}, format = "glueparquet"]
    ## @return: sink
    ## @inputs: [frame = enriched_data]
    sink = glue_context.getSink(connection_type="s3",
                                path="s3://" + output_bucket_name + "/" + clean_table_name,
                                enableUpdateCatalog=True,
                                updateBehavior="UPDATE_IN_DATABASE",
                                partitionKeys=[partition_column])
    sink.setFormat("glueparquet")
    sink.setCatalogInfo(catalogDatabase=clean_db_name,
                        catalogTableName=clean_table_name)
    sink.writeFrame(DynamicFrame.fromDF(enriched_data, glue_context, 'result'))
    job.commit()
# Maps a transformation function over each record to re-build date partitions using the timestamp
# rather than the Firehose ingestion timestamp
transformed_pose_data = Map.apply(frame=resolved_pose_data, f=applyTransform)

print("---- Processed data schema: ----")
transformed_pose_data.printSchema()

record_count = transformed_pose_data.count()
print("Processed record count: {}".format(record_count))

# Avoid errors if Glue Job Bookmark detects no new data to process and records = 0.
if record_count > 0:
    try:
        sink = glueContext.getSink(
            connection_type="s3",
            path=analytics_bucket_output,
            enableUpdateCatalog=True,
            updateBehavior="UPDATE_IN_DATABASE",
            partitionKeys=["year", "month", "day", "hour"],
            transformation_ctx="output")
        sink.setFormat("glueparquet")
        sink.setCatalogInfo(catalogDatabase=db_name,
                            catalogTableName=processed_pose_data_table)
        sink.writeFrame(transformed_pose_data)
    except Exception as e:
        print("There was an error writing out the results to S3")
        print(e)
    else:
        print("Partition saved.")
else:
    print("Glue Job Bookmark detected no new files to process")

job.commit()
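# Note: the applyTransform callback used by Map.apply() above is not shown in the
# original. A minimal, hedged sketch of what it might look like, assuming each record
# carries an event-time field named "timestamp" in ISO-8601 form; the field name and
# format are assumptions, not taken from the original.
from datetime import datetime


def applyTransform(rec):
    # Derive the partition columns from the event timestamp rather than the
    # Firehose ingestion time.
    event_time = datetime.fromisoformat(rec["timestamp"])
    rec["year"] = event_time.strftime("%Y")
    rec["month"] = event_time.strftime("%m")
    rec["day"] = event_time.strftime("%d")
    rec["hour"] = event_time.strftime("%H")
    return rec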