class JobRunner(object):
    """Manages the orchestration of new jobs, including parsing arguments and control flow."""

    SERVICE_DEFINITIONS = {
        'alb': [ALBRawCatalog, ALBConvertedCatalog],
        'elb': [ELBRawCatalog, ELBConvertedCatalog],
        'cloudtrail': [CloudTrailRawCatalog, CloudTrailConvertedCatalog],
        'cloudfront': [CloudFrontRawCatalog, CloudFrontConvertedCatalog],
        's3_access': [S3AccessRawCatalog, S3AccessConvertedCatalog],
        'vpc_flow': [VPCFlowRawCatalog, VPCFlowConvertedCatalog],
    }

    def __init__(self, service_name):
        args = getResolvedOptions(sys.argv, ['JOB_NAME'] + self._job_arguments())

        # Validate the service name
        if not self.is_valid_service(service_name):
            raise ValueError("'%s' is not yet a supported service." % service_name)

        self.glue_context = self._init_glue_context()
        self.job = Job(self.glue_context)
        region = self.get_instance_region()

        # Create data catalog references
        raw_klass, converted_klass = self.SERVICE_DEFINITIONS[service_name]
        self.raw_catalog = raw_klass(
            region, args['raw_database_name'], args['raw_table_name'], args['s3_source_location']
        )
        self.optimized_catalog = converted_klass(
            region, args['converted_database_name'], args['converted_table_name'], args['s3_converted_target']
        )

        # Assume that if the raw table does not exist, this is our first run
        self.initial_run = not self.raw_catalog.does_table_exist()

        # Create a converter object and initialize the Glue job
        self.converter = DataConverter(self.glue_context, self.raw_catalog, self.optimized_catalog)
        self.job.init(args['JOB_NAME'], args)

    @staticmethod
    def is_valid_service(service_name):
        """Determine whether the given service_name is a supported service."""
        return service_name in JobRunner.SERVICE_DEFINITIONS

    def get_instance_region(self):
        """Retrieve the current AWS Region from the instance metadata."""
        contents = urllib2.urlopen("http://169.254.169.254/latest/dynamic/instance-identity/document").read()
        return json.loads(contents).get('region')

    def create_tables_if_needed(self):
        """If this is the initial run of the job, create both the raw and optimized tables in the Data Catalog."""
        if self.initial_run is True:
            # TODO: Fail if the table already exists, or for converted tables if the S3 path already exists
            LOGGER.info("Initial run, scanning S3 for partitions.")
            self.raw_catalog.initialize_table_from_s3()
            # Note that if the source table is partitionless, this is a no-op.
            self.optimized_catalog.initialize_with_partitions(self.raw_catalog.partitioner.build_partitions_from_s3())

    def add_new_raw_partitions(self):
        """For the raw catalog, check whether any new partitions exist for UTC today.
        Continue this check for every previous day until we reach a day where a partition exists."""
        if self.initial_run is not True:
            LOGGER.info("Recurring run, only looking for recent partitions on raw catalog.")
            self.raw_catalog.add_recent_partitions()

    def add_new_optimized_partitions(self):
        """For the optimized catalog, check whether any new partitions exist for UTC today.
        Continue this check for every previous day until we reach a day where a partition exists.
        If this is the initial run, add whatever partitions we can find.
        """
        if self.initial_run and isinstance(self.raw_catalog.partitioner, NullPartitioner):
            LOGGER.info("Initial run with source NullPartitioner, adding all partitions from S3.")
            self.optimized_catalog.get_and_create_partitions()
        else:
            self.optimized_catalog.add_recent_partitions()

    def trigger_conversion(self):
        """Trigger the DataConverter."""
        self.converter.run()

    def finish(self):
        """Take any actions necessary to finish the job."""
        self.job.commit()

    def convert_and_partition(self):
        """A wrapper for the most common operations of these jobs.

        This provides a simple one-line interface for the consumer, while still
        allowing use of the more specific methods if need be.
        """
        self.create_tables_if_needed()
        self.add_new_raw_partitions()
        self.trigger_conversion()
        self.add_new_optimized_partitions()
        self.finish()

    @staticmethod
    def _job_arguments():
        return [
            'raw_database_name', 'raw_table_name',
            'converted_database_name', 'converted_table_name',
            's3_source_location', 's3_converted_target'
        ]

    @staticmethod
    def _init_glue_context():
        # Imports are done here so we can isolate the configuration of this job
        from awsglue.context import GlueContext
        from pyspark.context import SparkContext
        spark_context = SparkContext.getOrCreate()
        spark_context._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")  # noqa pylint: disable=protected-access
        spark_context._jsc.hadoopConfiguration().set("parquet.enable.summary-metadata", "false")  # noqa pylint: disable=protected-access
        return GlueContext(spark_context)
# Tail of the CSV-combining step: concatenate all page-level CSVs and split the
# 'Trade Name:' suffix out of column '0' into its own 'Trade' column.
raw_df = pd.concat([pd.read_csv(i) for i in all_filenames], join="outer")
data = raw_df['0'].str.split(r'Trade Name:', expand=True)
raw_df['Trade'] = data[1]
raw_df['0'] = data[0]
raw_df.to_csv(destination_file, index=False, encoding='utf-8-sig')
# raw_df = wr.s3.read_csv(path=f, path_suffix=['.csv'], dataset=True)
# wr.s3.delete_objects(f"s3://{dest_bucket}/{destination_file}")
# raw_df.to_csv(f, index=False, encoding='utf-8-sig')

# Calling the function with the latest available files
today = datetime.today()
date = today.strftime('%Y%m%d')
year, month, day = today.strftime('%Y-%m-%d').split('-')

filenames = ['growers', 'processor', 'transporter', 'dispensaries', 'laboratory', 'waste_disposal']
for file in filenames:
    raw_path_dir = 'US/OK/CannabisLifecycle/' + file + '/' + year + '/' + month + '/' + day + '/'
    source_filename = 'US/OK/CannabisLifecycle/' + file + '_' + date + '.pdf'
    dest_filename = raw_path_dir + file + '_' + date
    combined = raw_path_dir + file + '_final_' + date + '.csv'
    pdf_to_csv(source_bucket, source_filename, dest_bucket, dest_filename)
    csvToCombinedcsv(dest_bucket, combined, dest_bucket, raw_path_dir)
    # Remove the intermediate per-page CSVs once they have been combined
    for j in all_filenames:
        if dest_filename in j:
            s3.rm(j)

job.commit()
latestTimestampVal = functions.getMaxValue(returnDF, "endtime", job_configs)
functions.updateLastProcessedTSValue(clusterId + "_stl_utilitytext", latestTimestampVal[0], job_configs)

#### Alert Event Log ####
stlAlertEventLogProcessedTSValue = functions.getLastProcessedTSValue(clusterId + "_stl_alert_event_log", job_configs)
returnDF = functions.runQuery(
    "select '{}' as clusterId, trunc(event_time) as startDate, * from stl_alert_event_log where event_time > '{}'".format(clusterId, stlAlertEventLogProcessedTSValue),
    "stl_alert_event_log", job_configs)
functions.saveToS3(returnDF, s3Prefix, "stl_alert_event_log", ["clusterid", "startdate"], job_configs)
latestTimestampVal = functions.getMaxValue(returnDF, "event_time", job_configs)
functions.updateLastProcessedTSValue(clusterId + "_stl_alert_event_log", latestTimestampVal[0], job_configs)

#### STL_SCAN ####
stlScanLastProcessedTSValue = functions.getLastProcessedTSValue(clusterId + "_stl_scan", job_configs)
returnDF = functions.runQuery(
    "select '{}' as clusterId, trunc(starttime) as startDate, * from stl_scan where endtime > '{}'".format(clusterId, stlScanLastProcessedTSValue),
    "stl_scan", job_configs)
functions.saveToS3(returnDF, s3Prefix, "stl_scan", ["clusterid", "startdate"], job_configs)
latestTimestampVal = functions.getMaxValue(returnDF, "endtime", job_configs)
functions.updateLastProcessedTSValue(clusterId + "_stl_scan", latestTimestampVal[0], job_configs)

#### STL_WLM_QUERY ####
stlWLMQueryLastProcessedTSValue = functions.getLastProcessedTSValue(clusterId + "_stl_wlm_query", job_configs)
returnDF = functions.runQuery(
    "select '{}' as clusterId, trunc(queue_start_time) as startDate, * from stl_wlm_query where queue_end_time > '{}'".format(clusterId, stlWLMQueryLastProcessedTSValue),
    "stl_wlm_query", job_configs)
functions.saveToS3(returnDF, s3Prefix, "stl_wlm_query", ["clusterid", "startdate"], job_configs)
latestTimestampVal = functions.getMaxValue(returnDF, "queue_end_time", job_configs)
functions.updateLastProcessedTSValue(clusterId + "_stl_wlm_query", latestTimestampVal[0], job_configs)

job.commit()
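# The `functions` module used above is not shown in this snippet. A minimal
# sketch of what its two checkpoint helpers could look like, assuming an
# S3-backed high-water-mark store (the real implementation may use DynamoDB or
# Glue job bookmarks instead); the bucket name below is hypothetical.
import boto3

CHECKPOINT_BUCKET = "my-etl-checkpoints"  # hypothetical bucket

def getLastProcessedTSValue(key, job_configs):
    """Return the last processed timestamp for `key`, or a floor value on the first run."""
    s3 = boto3.client("s3")
    try:
        obj = s3.get_object(Bucket=CHECKPOINT_BUCKET, Key="checkpoints/" + key)
        return obj["Body"].read().decode("utf-8")
    except s3.exceptions.NoSuchKey:
        return "1970-01-01 00:00:00"

def updateLastProcessedTSValue(key, timestamp_value, job_configs):
    """Persist the high-water mark so the next run only pulls newer rows."""
    s3 = boto3.client("s3")
    s3.put_object(Bucket=CHECKPOINT_BUCKET, Key="checkpoints/" + key,
                  Body=str(timestamp_value).encode("utf-8"))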
    .withColumn('Records_frm_src', F.lit(INPUT_RECORDS)) \
    .withColumn('Datasink_Elapsed_Time', F.lit(DATASINK_ELAPSED_TIME)) \
    .withColumn('Job_Elapsed_Time', F.lit(JOB_ELAPSED_TIME))

# Creating a temp view for COUNTS_DF2
COUNTS_DF2.createOrReplaceTempView("final_counts_dataframe")

# One audit-log DataFrame written to CloudWatch Logs.
# Flat files don't have an op_val column, so insert/update/delete counts can't be calculated.
AUDITING_COUNTS_DF = SPARK.sql("""
    select JobName as JobName,
           JobRunId as JobId,
           Job_Start_Time,
           Job_End_Time,
           Job_Elapsed_Time,
           --coalesce(InsertsVal, 0) as InsertVal,
           --coalesce(UpdatesVal, 0) as UpdateVal,
           --coalesce(DeletesVal, 0) as DeleteVal,
           Datasink_Elapsed_Time,
           Records_frm_src,
           StepLog as StepLog
    from final_counts_dataframe
""")

AUDITING_COUNTS_DF.show(10, False)

JOB.commit()
def main():
    ## @params: [JOB_NAME, db_name, entity_name, partition_column, output_bucket_name, datetime_column, date_column]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME',
        'raw_db_name',
        'clean_db_name',
        'source_entity_name',
        'target_entity_name',
        'partition_column',
        'output_bucket_name',
        'primary_key',
        'parallelism',
        'date_column',
        'datetime_column'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    date_column = args['date_column']
    datetime_column = args['datetime_column']
    hudi_primary_key = args['primary_key']
    output_bucket_name = args['output_bucket_name']
    parallelism = args['parallelism']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name
    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    spark = SparkSession.builder.config(
        'spark.serializer', 'org.apache.spark.serializer.KryoSerializer').getOrCreate()
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)
    logger = glue_context.get_logger()
    logger.info('Initialization.')

    # Initialization of the Glue client used to connect to the Glue Catalog and retrieve table information
    glueClient = boto3.client('glue')

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c) if dict(input_data.dtypes)[c] == 'double' else col(c)
        for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn('etl_processing_datetime', unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    isTableExists = False
    try:
        glueClient.get_table(DatabaseName=clean_db_name, Name=target_entity_name)
        isTableExists = True
        logger.info(clean_db_name + '.' + target_entity_name + ' exists.')
    except ClientError as e:
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            isTableExists = False
            logger.info(clean_db_name + '.' + target_entity_name + ' does not exist. Table will be created.')

    partition_path = '' if partition_column == 'None' else partition_column
    common_config = {
        'className': 'org.apache.hudi',
        'hoodie.datasource.hive_sync.use_jdbc': 'false',
        'hoodie.index.type': 'GLOBAL_BLOOM',
        'hoodie.datasource.write.precombine.field': datetime_column,
        'hoodie.datasource.write.recordkey.field': hudi_primary_key,
        'hoodie.table.name': target_entity_name,
        'hoodie.consistency.check.enabled': 'true',
        'hoodie.datasource.hive_sync.database': clean_db_name,
        'hoodie.datasource.hive_sync.table': target_entity_name,
        'hoodie.datasource.hive_sync.enable': 'true',
        'hoodie.datasource.write.partitionpath.field': partition_path,
        'hoodie.datasource.hive_sync.partition_fields': partition_path,
        'hoodie.datasource.hive_sync.partition_extractor_class':
            'org.apache.hudi.hive.NonPartitionedExtractor' if partition_column == 'None' else 'org.apache.hudi.MultiPartKeysValueExtractor',
        'hoodie.datasource.write.hive_style_partitioning': 'false' if partition_column == 'None' else 'true',
        'hoodie.datasource.write.keygenerator.class':
            'org.apache.hudi.keygen.NonpartitionedKeyGenerator' if partition_column == 'None' else 'org.apache.hudi.keygen.SimpleKeyGenerator'
    }
    incremental_config = {
        'hoodie.upsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert',
        'hoodie.cleaner.policy': 'KEEP_LATEST_COMMITS',
        'hoodie.cleaner.commits.retained': 10
    }
    initLoad_config = {
        'hoodie.bulkinsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert'
    }

    if isTableExists:
        logger.info('Incremental upsert.')
        combinedConf = {**common_config, **incremental_config}
        enriched_data.write.format('org.apache.hudi').options(
            **combinedConf).mode('Append').save("s3://" + output_bucket_name + "/" + clean_table_name)
    else:
        logger.info('Initial load.')
        combinedConf = {**common_config, **initLoad_config}
        enriched_data.write.format('org.apache.hudi').options(
            **combinedConf).mode('Overwrite').save("s3://" + output_bucket_name + "/" + clean_table_name)

    job.commit()
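# A minimal sketch of launching the Hudi job above from outside Glue, assuming
# it has been deployed under a hypothetical name 'clean-hudi-job'; the argument
# keys mirror the getResolvedOptions list in main(), and all values here are
# illustrative placeholders.
import boto3

glue = boto3.client('glue')
glue.start_job_run(
    JobName='clean-hudi-job',  # hypothetical job name
    Arguments={
        '--raw_db_name': 'raw',
        '--clean_db_name': 'clean',
        '--source_entity_name': 'orders',
        '--target_entity_name': 'orders',
        # 'None' selects the non-partitioned Hudi key generator and extractor
        '--partition_column': 'None',
        '--output_bucket_name': 'my-clean-bucket',
        '--primary_key': 'order_id',
        '--parallelism': '8',
        '--date_column': 'order_date',
        '--datetime_column': 'order_datetime',
    },
)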
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from awsglue.job import Job
import sys
from awsglue.utils import getResolvedOptions
from awsglue.dynamicframe import DynamicFrame

glueContext = GlueContext(SparkContext.getOrCreate())
glueJob = Job(glueContext)
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
glueJob.init(args['JOB_NAME'], args)

#sparkSession = glueContext.sparkSession
spark = glueContext.spark_session

#df = sparkSession.read.csv("s3a://pkm")
#df.show()
#dfnew = spark.read.option("header","true").option("delimiter", ",").csv("s3a://pkm")
df = spark.read.option("header", "true").format("csv").load("s3a://pkm")
#inputGDF = glueContext.create_dynamic_frame_from_options(connection_type="s3", connection_options={"paths": ["s3://pkm"], "recurse": True}, format="csv")
#df = inputGDF.toDF()
df.show(2)

dynamic_dframe = DynamicFrame.fromDF(df, glueContext, "dynamic_df")

## Write the DynamicFrame as a file in CSV format to a folder in an S3 bucket.
## It is possible to write to any Amazon data store (SQL Server, Redshift, etc.) by using any previously defined connections.
retDatasink4 = glueContext.write_dynamic_frame.from_options(
    frame=dynamic_dframe,
    connection_type="s3",
    connection_options={"path": "s3://pkm-target"},
    format="csv",
    transformation_ctx="datasink4")

glueJob.commit()
def main(args):
    if args.verbose:
        print("Got arguments: %s" % (args))

    glue_client = boto3.client("glue", region_name=args.region)
    s3_client = boto3.client("s3", region_name=args.region)

    # Verify source DB and table exist
    if validate_db(args.athenaDatabase, glue_client=glue_client) and args.verbose:
        print("Validated source database %s exists." % (args.athenaDatabase))
    if (validate_table(args.athenaDatabase, args.athenaTable, glue_client=glue_client) and args.verbose):
        print("Validated source table %s exists." % (args.athenaTable))

    # Verify input and output buckets exist and are accessible.
    for bucket in [args.outputBucket, args.inputBucket]:
        if validate_bucket(bucket, s3_client=s3_client) and args.verbose:
            print("Verified bucket s3://%s exists and is accessible." % (bucket))

    # Use the latest file in the bucket that matches the prefix string as input.
    input_csv, latest_dt = get_latest_file(args.inputBucket, prefix=args.inputPrefix, s3_client=s3_client)
    if not input_csv:
        raise Exception(
            "Found no candidate CSV files in bucket %s with prefix %s." % (
                args.inputBucket,
                "(no prefix)" if not args.inputPrefix else args.inputPrefix,
            ))
    print("Got latest CSV file s3://%s/%s with write time %s." % (args.inputBucket, input_csv, latest_dt))

    sc = SparkContext()
    gc = GlueContext(sc)
    sparkSession = gc.spark_session
    job = Job(gc)
    job.init(args.JOB_NAME, vars(args))
    sparkSession.udf.register("json_clean", json_clean)

    # For requester-pays buckets
    sparkSession._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true")
    gc._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true")

    # Get minimum hour for existing data
    prelim_min_hour = 0
    aligner = DAILY_ALIGNER if args.fullDays else HOURLY_ALIGNER
    # * 1000000 needed since start_time is in microseconds
    prelim_min_hour = (((START_TIME - (3600 * args.maxHoursAgo)) // aligner) * aligner) * 1000000

    # Get CSV file contents, filtering out lines that don't fall within --maxHoursAgo.
    input_df = (gc.create_dynamic_frame_from_options(
        "s3",
        {"paths": ["s3://%s/%s" % (args.inputBucket, input_csv)]},
        "csv",
        {"withHeader": True},
    ).toDF().select("*"))
    print("Input DF count: %s" % (input_df.count()))
    csv_prelim_df = input_df.filter("cast(start_time as bigint) >= %s" % (prelim_min_hour))
    print("Date-bounded DF count: %s" % (csv_prelim_df.count()))

    # If --hourly is not set, query for policy hits for the current hour
    # and the previous 24 hours.
    # When --hourly is set, query for policy hits only for the current
    # and previous hour. (Querying for the previous hour is necessary
    # as the previous CSV generation job may have run in the middle of the
    # hour, meaning new events for that hour may exist.)
    # (now - (now % 3600)) = start of the current hour
    # ((now - (now % 3600)) - 3600) = start of the previous hour
    hours_ago = 1 if args.hourly else 24
    increm_min_hour_sec = (START_TIME - (START_TIME % 3600)) - (3600 * hours_ago)
    increm_min_hour_micro = increm_min_hour_sec * 1000000

    # When nonStrict is set, query for the latest hourly epoch in the input file
    # as well as all subsequent hourly epochs. This query may be more expensive
    # to complete, but allows for the possibility of filling in gaps if previous
    # runs of the job failed or were paused.
    if args.nonStrict:
        # We may need to re-query for transactions in the latest epoch depending on
        # when the previous query was run. Filter these rows out as well, then
        # set newest_epoch to our minimum hour epoch.
        last_seen_epoch = (
            input_df.selectExpr("cast(start_time as bigint) as ts").agg({
                "ts": "max"
            }).collect().pop()[0])
        last_seen_hour = ((last_seen_epoch // 1000000) // aligner) * aligner
        # Make sure we don't go outside our --maxHoursAgo range.
        # This check is necessary as increm_min_hour_sec is used in the pushdown
        # predicate when querying against the Athena source table/view.
        increm_min_hour_sec = max(last_seen_hour, increm_min_hour_sec)
        increm_min_hour_micro = increm_min_hour_sec * 1000000

    print("Set minimum hour in seconds: %s" % (increm_min_hour_sec))
    csv_hits_in_range = csv_prelim_df.filter("cast(start_time as bigint) < %s" % (increm_min_hour_micro))
    print("Total CSV hits in timeframe of interest: %s" % (csv_hits_in_range.count()))

    # Set timeframe for pushdown predicate
    pushdown = "(hour >= %s)" % (increm_min_hour_sec)

    # Get table contents starting from minimum hour settings.
    raw_data = gc.create_dynamic_frame.from_catalog(
        database=args.athenaDatabase,
        table_name=args.athenaTable,
        transformation_ctx="raw_data",
        push_down_predicate=pushdown,
    )

    # Determine what fields we should extract based on the table definition.
    # Deriving the fields from the table schema is preferable to inferring
    # them from the underlying parquet because the two may differ.
    tmp_schema = get_table_schema(args.athenaDatabase, args.athenaTable, glue_client=glue_client)

    # Verify we only have one matching column on which to explode.
    if not [col[0] for col in tmp_schema].count(args.policyType) == 1:
        raise Exception(
            "Wrong number of matching %s columns found in schema: %s" % (args.policyType, tmp_schema))

    # Sub out the policies/parent_policies field for an explode expression.
    # Lightly modify other fields to promote clean and consistent writes to the CSV file.
    schema = []
    for col in tmp_schema:
        if col[0] == args.policyType:
            schema.append("explode(%s) as policy" % (col[0]))
        elif "map" in col[1].lower() or "struct" in col[1].lower():
            schema.append("json_clean(%s) as %s" % (col[0], col[0]))
        else:
            schema.append("cast(%s as string)" % (col[0]))

    # Get targeted policies/parent_policies.
    # These should be passed to SparkSQL as quoted strings.
    pol_arr = ", ".join(["'%s'" % (pol) for pol in args.policyStrings.split(args.delimiter) if pol])

    # Filter table contents according to policy hits.
    # A non-null policies field implies a non-null parent_policies field
    # (and vice versa), so it's OK to filter on just one.
    new_hits = (raw_data.toDF().filter("policies is not NULL").selectExpr(
        *schema).filter("policy in (%s)" % (pol_arr)))
    print("New policy triggers found since last job: %s" % (new_hits.count()))

    # Combine newly collected policy hits with the dataframe of previous CSV contents.
    write_df = new_hits.union(csv_hits_in_range).orderBy("start_time").coalesce(1)

    uniq = hash_key(args.salt, args.ordinal, args.subscriber, args.receiver)
    s3_loc = "s3://%s" % (os.path.join(args.outputBucket, args.outputDir, uniq))
    if args.verbose:
        print("S3 Results Location: %s" % s3_loc)
    write_df.write.option("quoteAll", True).csv(s3_loc, header=True)

    # Rename output file, if requested.
    if args.outputFilename:
        rename_resp = rename_file(
            args.outputBucket,
            args.outputFilename,
            prefix=args.outputDir,
            dont_preserve_dir=args.dontPreserveOutputDir,
            keep_orig=args.keepOrigOnRename,
        )
        if rename_resp and args.verbose:
            print("Renamed file to %s/%s." % (args.outputDir, args.outputFilename))

    job.commit()
#now = datetime.datetime.now(timezone('Asia/Seoul'))
now = datetime.datetime.utcnow()
print('current time : ' + str(now))
target_date_hour = now + datetime.timedelta(hours=-1)
print('target time : ' + str(target_date_hour))
timepartition = target_date_hour.strftime('_year=%Y/_month=%m/_day=%d/_hour=%H/')

for work in targetlist:
    src = 's3://{bucket}/{path}/{timepart}'.format(bucket=bucketname, path=work['src'], timepart=timepartition)
    dest = 's3://{bucket}/{path}/{timepart}'.format(bucket=bucketname, path=work['dest'], timepart=timepartition)
    print('source src : ' + src)
    print('target dest : ' + dest)

    dir_exist = checkdir_ifexist(bucketname, '{path}/{timepart}'.format(path=work['src'], timepart=timepartition))
    if not dir_exist:
        print('JSON path not yet created: {0}'.format(src))
        continue

    # if checkdir_ifexist(bucketname, '{path}/{timepart}'.format(path=work['dest'], timepart=timepartition)):
    #     print('Destination path already exists, so delete all parquet files before recreating it: ' + dest)
    #     delete_destfile(bucketname, '{path}/{timepart}'.format(path=work['dest'], timepart=timepartition))

    print('transforming work started for source src : ' + src)
    json2parquet(src, dest)
    print('transforming work ended in dest path : ' + dest)

print("############ json2parquet job ended ########")
# When running on Glue, call Job.commit()
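# json2parquet is not defined in this snippet; a minimal sketch of what it
# might look like, assuming a Spark session named `spark` is available in the
# job (the write mode is also an assumption):
def json2parquet(src, dest):
    """Read JSON from the source prefix and rewrite it as Parquet at the destination."""
    df = spark.read.json(src)
    df.write.mode('overwrite').parquet(dest)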
def main():
    ## @params: [JOB_NAME, db_name, entity_name, datetime_column, date_column, partition_column, output_bucket_name]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME',
        'raw_db_name',
        'clean_db_name',
        'source_entity_name',
        'target_entity_name',
        'datetime_column',
        'date_column',
        'partition_column',
        'output_bucket_name'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    datetime_column = args['datetime_column']
    date_column = args['date_column']
    output_bucket_name = args['output_bucket_name']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name
    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c) if dict(input_data.dtypes)[c] == 'double' else col(c)
        for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn('etl_processing_datetime', unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    ## @type: DataSink
    ## @args: [connection_type = "s3", connection_options = {"path": "s3://<output_bucket_name>/clean/<entity_name>", "enableUpdateCatalog": "True", "updateBehavior": "UPDATE_IN_DATABASE", "partitionKeys" : "[<partition_key>]"}, format = "glueparquet"]
    ## @return: sink
    ## @inputs: [frame = enriched_data]
    sink = glue_context.getSink(
        connection_type="s3",
        path="s3://" + output_bucket_name + "/" + clean_table_name,
        enableUpdateCatalog=True,
        updateBehavior="UPDATE_IN_DATABASE",
        partitionKeys=[partition_column])
    sink.setFormat("glueparquet")
    sink.setCatalogInfo(catalogDatabase=clean_db_name, catalogTableName=clean_table_name)
    sink.writeFrame(DynamicFrame.fromDF(enriched_data, glue_context, 'result'))

    job.commit()