def main(input_path, output_path, conn_type, module_name, function_name, glue_config=None):
    """Generic Glue job driver entry point.

    Kept generic so it can be reused to run multiple Glue jobs: it validates
    and wires input parameters and invokes the requested job via reflection.
    It contains no business logic of its own.

    @param input_path: path containing the input files for the job
    @param output_path: path used to write the Glue job results
    @param conn_type: 'file' to read from a local file system, 's3' to read from S3
    @param module_name: module containing the job function (loaded via importlib)
    @param function_name: name of the job function inside module_name
    @param glue_config: optional additional Glue configuration parameters

    Parameters example:
    --JOB_NAME=movie_analytics_job
    --CONN_TYPE=file
    --INPUT_PATH=/input
    --OUTPUT_PATH=/results
    --MODULE_NAME=tasks
    --FUNCTION_NAME=create_and_stage_average_rating
    """
    try:
        # Local import keeps the driver importable outside a Glue runtime.
        from awsglue.utils import getResolvedOptions

        # BUG FIX: the original referenced an undefined name `args` below;
        # resolve the Glue job arguments explicitly before using them.
        args = getResolvedOptions(sys.argv, ['JOB_NAME'])

        glue_context = create_glue_context()
        module = importlib.import_module(module_name)
        module.init_glue(glue_context)
        module.init_spark(glue_context.spark_session)
        f = getattr(module, function_name)
        f(input_path, output_path, conn_type)
        job = Job(glue_context)
        job.init(args["JOB_NAME"], args)
        job.commit()
    except Exception as e:
        # BUG FIX: `sys.error` does not exist; write the failure to stderr.
        print(
            f"Couldn't execute job due to {str(e)} for module: {module_name} and function: {function_name}",
            file=sys.stderr,
        )
        # Bare `raise` preserves the original traceback.
        raise
def __init__(self, service_name):
    """Wire up the Glue job, data-catalog references and converter for *service_name*."""
    resolved = getResolvedOptions(sys.argv, ['JOB_NAME'] + self._job_arguments())

    # Refuse unsupported services up front.
    if not self.is_valid_service(service_name):
        raise Exception("'%s' is not yet a supported service." % service_name)

    self.glue_context = self._init_glue_context()
    self.job = Job(self.glue_context)
    region = self.get_instance_region()

    # Data catalog references: raw source table and converted target table.
    definitions = self.SERVICE_DEFINITIONS[service_name]
    raw_cls = definitions[0]
    converted_cls = definitions[1]
    self.raw_catalog = raw_cls(
        region,
        resolved['raw_database_name'],
        resolved['raw_table_name'],
        resolved['s3_source_location'],
    )
    self.optimized_catalog = converted_cls(
        region,
        resolved['converted_database_name'],
        resolved['converted_table_name'],
        resolved['s3_converted_target'],
    )

    # If the raw table does not exist yet, treat this as the first run.
    self.initial_run = not self.raw_catalog.does_table_exist()

    # Build the converter and initialize the Glue job.
    self.converter = DataConverter(self.glue_context, self.raw_catalog,
                                   self.optimized_catalog)
    self.job.init(resolved['JOB_NAME'], resolved)
def processMain():
    """Read a catalog table, resolve ambiguous types, and write it to Redshift and S3 (JSON)."""
    ## @params: [JOB_NAME]
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
    spark_ctx = SparkContext()
    glue_ctx = GlueContext(spark_ctx)
    session = glue_ctx.spark_session
    job = Job(glue_ctx)
    job.init(args['JOB_NAME'], args)

    # Source table from the Glue catalog.
    source = glue_ctx.create_dynamic_frame.from_catalog(
        database="autoglues3lineage",
        table_name="train_sm_s2adb_csv",
        transformation_ctx="ds0")

    # Resolve ambiguous column types into structs.
    resolved = ResolveChoice.apply(frame=source, choice="make_struct",
                                   transformation_ctx="rc1")

    # Sink 1: Redshift table.
    glue_ctx.write_dynamic_frame.from_options(
        frame=resolved,
        connection_type="redshift",
        connection_options={
            "url": "jdbc:redshift://redshift-cluster-1.csvp5wcqqxvw.us-east-1.redshift.amazonaws.com:5439/world",
            "dbtable": "atn.gluetable211"},
        transformation_ctx="ds3")

    # Sink 2: JSON files on S3.
    glue_ctx.write_dynamic_frame.from_options(
        frame=resolved,
        connection_type="s3",
        connection_options={"path": "s3://asgqatestautomation4/Targetdata211"},
        format="json",
        transformation_ctx="ds7")

    job.commit()
def processMain():
    """Catalog source -> ResolveChoice -> DropNullFields -> CSV and Parquet sinks on S3."""
    ## @params: [JOB_NAME]
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
    spark_ctx = SparkContext()
    glue_ctx = GlueContext(spark_ctx)
    session = glue_ctx.spark_session
    job = Job(glue_ctx)
    job.init(args['JOB_NAME'], args)

    # Source table from the Glue catalog.
    source = glue_ctx.create_dynamic_frame.from_catalog(
        database="autoglues3lineage",
        table_name="train_sm_s2adb_csv",
        transformation_ctx="datasource0")

    # Resolve ambiguous column types, then drop null fields.
    resolved = ResolveChoice.apply(frame=source,
                                   choice="make_struct",
                                   transformation_ctx="resolvechoice2")
    no_nulls = DropNullFields.apply(frame=resolved,
                                    transformation_ctx="dropnullfields3")

    # Write the same frame twice: once as CSV, once as Parquet.
    glue_ctx.write_dynamic_frame.from_options(
        frame=no_nulls,
        connection_type="s3",
        connection_options={"path": "s3://asgqatestautomation4/Targetdata1A"},
        format="csv",
        transformation_ctx="datasink4")
    glue_ctx.write_dynamic_frame.from_options(
        frame=no_nulls,
        connection_type="s3",
        connection_options={"path": "s3://asgqatestautomation4/Targetdata1B"},
        format="parquet",
        transformation_ctx="datasink5")

    job.commit()
def _get_spark_context():
    """Return a (GlueContext, Job) pair, with the job initialized from resolved arguments."""
    ctx = GlueContext(SparkContext.getOrCreate())
    glue_job = Job(ctx)
    options = _get_arguments({})
    glue_job.init(options['JOB_NAME'], options)
    return ctx, glue_job
def create_glue_context():
    """Create a GlueContext-backed Spark session plus an initialized Glue Job.

    Resolves the 'JOB_NAME' and 'input' job arguments from sys.argv.
    """
    ctx = GlueContext(SparkContext.getOrCreate())
    session = ctx.spark_session
    glue_job = Job(ctx)
    options = getResolvedOptions(sys.argv, ['JOB_NAME', 'input'])
    glue_job.init(options['JOB_NAME'], options)
    return session, glue_job
def get_spark_session_and_glue_job(
    glue_args: Dict,
    conf=None,
    py_files: Iterable[str] = None,
    extra_jars: List[str] = None,
):
    """Create a Spark session and an initialized AWS Glue job.

    Parameters
    ----------
    glue_args
        Dictionary of Argument Name: Argument value; must include 'JOB_NAME'.
    conf : Union[pyspark.SparkConf, Dict[str, str]]
        Spark config, either a SparkConf object or a dictionary of options.
    py_files
        Paths to python files (.py, .zip, .egg) to distribute to executors.
    extra_jars
        Paths to dependent jar files (set as spark.jars.packages).

    Returns
    -------
    pyspark.sql.SparkSession, awsglue.job.Job
    """
    from awsglue.context import GlueContext
    from awsglue.job import Job
    from pyspark import SparkContext, SparkConf

    LOGGER.debug("Creating spark session with parameters")
    LOGGER.debug("conf=%s", conf)
    LOGGER.debug("py_files=%s", py_files)
    LOGGER.debug("extra_jars=%s", extra_jars)

    # Normalize `conf` into a SparkConf instance (or None when not provided).
    spark_conf = None
    if isinstance(conf, dict):
        spark_conf = SparkConf()
        spark_conf.setAll(conf.items())
    elif isinstance(conf, SparkConf):
        spark_conf = conf

    # Extra jars only take effect when we actually have a SparkConf to set them on.
    if extra_jars and spark_conf:
        spark_conf.set("spark.jars.packages", ",".join(extra_jars))

    sc = SparkContext.getOrCreate(conf=spark_conf)

    if py_files:
        LOGGER.debug("Adding PYFILEs: %s", py_files)
        for py_file in py_files:
            sc.addPyFile(py_file)

    glue_context = GlueContext(sparkContext=sc)
    glue_job = Job(glue_context=glue_context)
    glue_job.init(glue_args["JOB_NAME"], glue_args)
    return glue_context.spark_session, glue_job
def main(args, yesterday):
    """Compute per-device timestamp diffs for yesterday's Kinesis data and write Parquet to S3.

    :param args: resolved Glue job arguments; reads 'JOB_NAME', 'database_name',
        'table_prefix' and 'target_bucket'.
    :param yesterday: date whose year/month/day select the catalog partition to read.
    """
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)
    database = args['database_name']
    table_name = '{0}kinesis_data'.format(args['table_prefix'])
    # partition_0/1/2 encode year/month/day (months and days zero-padded);
    # the predicate prunes partitions at read time.
    partition_predicate = \
        'partition_0="{0}" and partition_1="{1:02}" and partition_2="{2:02}"'.format(
            yesterday.year, yesterday.month, yesterday.day)
    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database=database,
        table_name=table_name,
        push_down_predicate=partition_predicate,
        transformation_ctx='datasource0')
    # Only transform/write when the partition actually contains rows.
    if datasource0.count() > 0:
        applymapping1 = ApplyMapping.apply(
            frame=datasource0,
            mappings=[('device_id', 'string', 'device_id', 'string'),
                      ('timestamp', 'long', 'timestamp', 'long')],
            transformation_ctx='applymapping1')
        df = applymapping1.toDF()
        # One row per device; collect() brings the device list to the driver.
        dev = df.drop_duplicates(['device_id'])
        device_list = dev.collect()
        result_list = []
        # NOTE(review): the per-device sort+collect runs on the driver and assumes
        # each device's row count fits in driver memory — confirm for large days.
        for device in device_list:
            device_id = device['device_id']
            df2 = df[df.device_id == device_id]
            df_list = df2.sort('timestamp', ascending=True).collect()
            result_list.extend(create_diff_timestamp_list(device_id, df_list))
        df = spark.createDataFrame(result_list)
        dyf = DynamicFrame.fromDF(df, glueContext, 'dyf')
        # Hive-style partition layout: year=/month=/day= under the target bucket.
        output_path = os.path.join(args['target_bucket'],
                                   'year={0}'.format(yesterday.year),
                                   'month={0:02}'.format(yesterday.month),
                                   'day={0:02}'.format(yesterday.day))
        glueContext.write_dynamic_frame.from_options(
            frame=dyf,
            connection_type='s3',
            connection_options={'path': output_path},
            format='parquet',
            transformation_ctx='datasink2')
    # Commit advances the job bookmark even when the partition was empty.
    job.commit()
def _get_spark_session_and_glue_job(glue_args):
    """Build a Glue job on the current (or a new) SparkContext; return (SparkSession, Job)."""
    from awsglue.context import GlueContext
    from awsglue.job import Job

    context = GlueContext(sparkContext=SparkContext.getOrCreate())
    glue_job = Job(glue_context=context)
    glue_job.init(glue_args["JOB_NAME"], glue_args)
    return context.spark_session, glue_job
def create_glue_context():
    """Create the Glue context / Spark session and an initialized Job; return (session, job)."""
    options = getResolvedOptions(sys.argv, ['JOB_NAME'])
    glue_ctx = GlueContext(SparkContext())
    session = glue_ctx.spark_session
    glue_job = Job(glue_ctx)
    glue_job.init(options['JOB_NAME'], options)
    return session, glue_job
def main():
    """Export every unprocessed Athena table in the database to Parquet on S3,
    clearing the target bucket first."""
    console("Starting Job")
    ## @params: [job_name]

    # 1-2. Start the Glue context and initialize the job.
    glue_ctx = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_ctx)
    args = get_args()
    job.init(args["JOB_NAME"], args)

    glue_client = boto3.client("glue", region_name="us-east-1")
    table_list = glue_client.get_tables(DatabaseName=args["athena_database"])["TableList"]

    # 3. Clear the target bucket contents before re-exporting.
    console(f"Excluding S3 files for: {args['target_bucket']}")
    s3 = boto3.resource("s3")
    s3.Bucket(args["target_bucket"]).objects.delete()

    for entry in table_list:
        table_name = entry["Name"]
        # Tables whose name contains digits are Athena tables already processed.
        if re.search("[0-9]+", table_name):
            continue
        console("Processing Table {}".format(table_name))

        # 4. Load the table as a dynamic frame.
        source = glue_ctx.create_dynamic_frame.from_catalog(
            database=args["athena_database"],
            table_name=table_name,
            transformation_ctx="datasource0",
        )

        # 5. Drop null fields, then save as Parquet under s3://<bucket>/<table>/.
        no_nulls = DropNullFields.apply(frame=source,
                                        transformation_ctx="dropnullfields1")
        glue_ctx.write_dynamic_frame.from_options(
            frame=no_nulls,
            connection_type="s3",
            connection_options={
                "path": "s3://{}/{}/".format(args["target_bucket"], table_name)
            },
            format="parquet",
            transformation_ctx="datasink2",
        )

    job.commit()
def main():
    """Anonymize data-lake datasets according to ANONYMIZATION_CONF, then clean
    up temporary S3 objects."""
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION',
        'GLUE_SRC_DATABASE', 'ANONYMIZATION_CONF'
    ])

    sc = SparkContext()
    os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
    # Avoid Spark creating $folders$ markers in S3.
    hadoop_conf = sc._jsc.hadoopConfiguration()
    # hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
    glue_ctx = GlueContext(sc)
    session = glue_ctx.spark_session
    # Needed to reliably work between parquet, spark, and pandas dataframes.
    session.conf.set("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
    session.conf.set("spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")

    job = Job(glue_ctx)
    job.init(args['JOB_NAME'], args)

    conf_json = args["ANONYMIZATION_CONF"]
    print('input_conf_str_json:', conf_json)
    anonymization_conf_dict = json.loads(conf_json)
    print('anonimization_conf_dict:', anonymization_conf_dict)

    # Map each dataset's table name to its index in the configuration list.
    position_references_in_conf = {
        dataset['table']: idx
        for idx, dataset in enumerate(anonymization_conf_dict['datasets'])
    }
    print('position_references_in_conf:', position_references_in_conf)

    data_lake_bucket = args["DL_BUCKET"]
    data_lake_prefix = args["DL_PREFIX"]
    aws_region = args["DL_REGION"]
    glue_database = args["GLUE_SRC_DATABASE"]

    process(job, data_lake_bucket, data_lake_prefix, anonymization_conf_dict,
            position_references_in_conf, glue_ctx, glue_database, aws_region)
    delete_temporary_s3_objects(data_lake_bucket, aws_region)
def run():
    """Starting point of the Glue job's actual logic.

    Joins the aggregate output with the IMDb basics and ratings files plus the
    per-metric calc configs, then writes the result as Parquet for Athena.
    """
    args = getResolvedOptions(sys.argv, ['JOB_NAME', 's3_bucket', 'job_id'])
    bucket = args['s3_bucket']
    job_id = args['job_id']

    glue_context = GlueContext(SparkContext.getOrCreate())
    spark = glue_context.spark_session
    job = Job(glue_context)
    job.init(args['JOB_NAME'], args)

    # Aggregate results produced by an earlier pipeline step.
    aggregate_df = spark.read.json(f's3://{bucket}/output/{job_id}/rnorm-aggregate.json')

    # title.basics: the join key is renamed to tconstTmp so it can be dropped after the join.
    basics_df = (spark.read.format("csv")
                 .option("delimiter", "\t")
                 .option("header", "true")
                 .load(f's3://{bucket}/input/title.basics.tsv.gz'))
    basics_df = basics_df.toDF("tconstTmp", "titleType", "primaryTitle",
                               "originalTitle", "isAdult", "startYear",
                               "endYear", "runtimeMinutes", "genres")
    aggregate_df = (aggregate_df
                    .join(basics_df,
                          aggregate_df.tconst == basics_df.tconstTmp,
                          how='left')
                    .drop('tconstTmp'))

    # title.ratings joined the same way.
    ratings_df = (spark.read.format("csv")
                  .option("delimiter", "\t")
                  .option("header", "true")
                  .load(f's3://{bucket}/input/title.ratings.tsv.gz'))
    ratings_df = ratings_df.toDF("tconstTmp", "averageRating", "numVotes")
    aggregate_df = (aggregate_df
                    .join(ratings_df,
                          aggregate_df.tconst == ratings_df.tconstTmp,
                          how='left')
                    .drop('tconstTmp'))

    # Merge the per-metric calculation configs, in the original order.
    for metric in ('ynorm', 'yp90', 'yp10'):
        aggregate_df = merge_calc_config(spark, bucket, job_id, aggregate_df, metric)

    aggregate_df.repartition(10).write.parquet(
        f's3://{bucket}/athena/job_id={job_id}/', mode="overwrite")
    job.commit()
def main():
    """Initialize Glue/Spark, then hand processing off to process()."""
    ## @params: [JOB_NAME, DL_BUCKET, DL_PREFIX, DL_REGION, GLUE_SRC_DATABASE]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE'
    ])

    sc = SparkContext()
    # Avoid Spark creating $folders$ markers in S3.
    hconf = sc._jsc.hadoopConfiguration()
    hconf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    hconf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")

    glue_ctx = GlueContext(sc)
    session = glue_ctx.spark_session
    job = Job(glue_ctx)
    job.init(args['JOB_NAME'], args)

    process(job, glue_ctx, args["GLUE_SRC_DATABASE"], args["DL_BUCKET"],
            args["DL_PREFIX"], args["DL_REGION"])
def create_job(glueContext, args):
    """Return a Glue Job bound to *glueContext*, initialized with *args* (must hold 'JOB_NAME')."""
    glue_job = Job(glueContext)
    glue_job.init(args["JOB_NAME"], args)
    return glue_job
# Glue script: load a CSV, rename its multi-word columns, and log the schema.
# BUG FIX: `sys` was used below (sys.argv) without being imported.
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import col

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glue_context = GlueContext(sc)
spark = glue_context.spark_session
logger = glue_context.get_logger()
job = Job(glue_context)

# Placeholder S3 locations — fill in before running.
input_path = "s3://"
output_path = "s3://"

# Load the test CSV file and rename the multi-word columns.
df1 = spark.read.format("csv").option("header", "true").load(input_path)
df1 = df1.withColumnRenamed("Semiotic Class", "semioticclass") \
    .withColumnRenamed("Input Token", "inputtoken") \
    .withColumnRenamed("Output Token", "outputtoken")

# BUG FIX: printSchema() prints to stdout and returns None, so the original
# logged "None"; use the DataFrame's schema object for the log line instead.
schema = df1.schema
logger.info(f'the schema before convertion is {schema}')
email_id,source_update_dtm, source_creation_dtm,pohne_nbr, full_nam, last_nam, fe_nam, fe_id, fe_idn, source_name, posting_agent, data_origin, load_dtm, update_dtm, versn_flg, hr_sso, hr_nam, mgr_sso, mgr_nam, prt_persn_d_md5, prt_persn_d_md5_scd1, start_dtm, end_dtm, current_flag_ind, deleted_flag_ind""" # Initialize the GlueContext and SparkContext SC = SparkContext() GLUECONTEXT = GlueContext(SC) SPARK = GLUECONTEXT.spark_session JOB = Job(GLUECONTEXT) JOB.init(ARGS['JOB_NAME'], ARGS) RUN_ID = ARGS['JOB_RUN_ID'] JOB_NAME = ARGS['JOB_NAME'] TEMPDIR = ARGS['TempDir'] #CurrentTimeStamp DATETIMESTAMP = datetime.now().strftime('%Y-%m-%d %H:%M:%S') END_DTM = '9999-12-31 23:59:59' JOB_START_TIME = timeit.default_timer() # creating DataFrames from source glue crawlers #SRC_NOTEMPTY = True
def main():
    """Incrementally upsert a raw Glue catalog entity into a clean Hudi table on S3.

    Reads the raw entity, normalizes epoch/timestamp/date/zip/double columns,
    enriches with ETL processing metadata, then writes through Hudi: a bulk
    'Overwrite' on first load, an 'Append' upsert (with Hive sync) afterwards.
    """
    ## @params: [JOB_NAME, db_name, entity_name, partition_column, output_bucket_name, datetime_column, date_column]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'partition_column', 'output_bucket_name',
        'primary_key', 'parallelism', 'date_column', 'datetime_column'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    date_column = args['date_column']
    datetime_column = args['datetime_column']
    hudi_primary_key = args['primary_key']
    output_bucket_name = args['output_bucket_name']
    parallelism = args['parallelism']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name
    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job.
    # Kryo serialization must be configured before the Hudi write.
    spark = SparkSession.builder.config(
        'spark.serializer',
        'org.apache.spark.serializer.KryoSerializer').getOrCreate()
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)
    logger = glue_context.get_logger()
    logger.info('Initialization.')

    # Glue client used below to check whether the clean table already exists.
    glueClient = boto3.client('glue')

    ## @type: DataSource — raw entity from the Glue catalog.
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process.
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    input_data = raw_data.toDF()
    # processing_datetime arrives as a unix epoch — convert to a timestamp string first.
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    # *_datetime columns -> timestamp; *_date columns -> date.
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    # zip is kept as a string (preserves leading zeros).
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    # doubles -> fixed-precision decimal for stable storage.
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c)
        if dict(input_data.dtypes)[c] == 'double' else col(c)
        for c in input_data.columns
    ])

    ## @type: EnrichDataset — stamp processing time and derive the date column.
    enriched_data = cleaned_data.withColumn(
        'etl_processing_datetime',
        unix_timestamp(f.lit(processing_start_datetime),
                       'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column,
                    f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    # Does the clean table already exist in the catalog?
    isTableExists = False
    try:
        glueClient.get_table(DatabaseName=clean_db_name, Name=target_entity_name)
        isTableExists = True
        logger.info(clean_db_name + '.' + target_entity_name + ' exists.')
    except ClientError as e:
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            isTableExists = False
            logger.info(clean_db_name + '.' + target_entity_name +
                        ' does not exist. Table will be created.')
        # NOTE(review): ClientErrors with any other code are silently swallowed
        # here — confirm that is intended.

    # The literal string 'None' means unpartitioned output.
    partition_path = '' if partition_column == 'None' else partition_column

    # Hudi options shared by both the initial load and incremental upserts.
    common_config = {
        'className': 'org.apache.hudi',
        'hoodie.datasource.hive_sync.use_jdbc': 'false',
        'hoodie.index.type': 'GLOBAL_BLOOM',
        'hoodie.datasource.write.precombine.field': datetime_column,
        'hoodie.datasource.write.recordkey.field': hudi_primary_key,
        'hoodie.table.name': target_entity_name,
        'hoodie.consistency.check.enabled': 'true',
        'hoodie.datasource.hive_sync.database': clean_db_name,
        'hoodie.datasource.hive_sync.table': target_entity_name,
        'hoodie.datasource.hive_sync.enable': 'true',
        'hoodie.datasource.write.partitionpath.field': partition_path,
        'hoodie.datasource.hive_sync.partition_fields': partition_path,
        'hoodie.datasource.hive_sync.partition_extractor_class':
            'org.apache.hudi.hive.NonPartitionedExtractor'
            if partition_column == 'None'
            else 'org.apache.hudi.MultiPartKeysValueExtractor',
        'hoodie.datasource.write.hive_style_partitioning':
            'false' if partition_column == 'None' else 'true',
        'hoodie.datasource.write.keygenerator.class':
            'org.apache.hudi.keygen.NonpartitionedKeyGenerator'
            if partition_column == 'None'
            else 'org.apache.hudi.keygen.SimpleKeyGenerator'
    }
    incremental_config = {
        'hoodie.upsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert',
        'hoodie.cleaner.policy': 'KEEP_LATEST_COMMITS',
        'hoodie.cleaner.commits.retained': 10
    }
    initLoad_config = {
        'hoodie.bulkinsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert'
    }

    if (isTableExists):
        logger.info('Incremental upsert.')
        combinedConf = {**common_config, **incremental_config}
        enriched_data.write.format('org.apache.hudi').options(
            **combinedConf).mode('Append').save("s3://" + output_bucket_name +
                                                "/" + clean_table_name)
    else:
        logger.info('Inital load.')
        combinedConf = {**common_config, **initLoad_config}
        enriched_data.write.format('org.apache.hudi').options(
            **combinedConf).mode('Overwrite').save("s3://" + output_bucket_name +
                                                   "/" + clean_table_name)

    job.commit()
# Glue script: read a CSV from S3, filter by department, and write to DynamoDB.
# The CSV is read through the Spark session (rather than the Glue context) just
# to illustrate converting a plain DataFrame into a DynamicFrame.
import sys
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark import SparkContext

args = getResolvedOptions(sys.argv, ['JOB_NAME', 's3_file_path', 'dynamodb_table'])

spark_ctx = SparkContext()
glue_ctx = GlueContext(spark_ctx)
spark = glue_ctx.spark_session
job = Job(glue_ctx)
job.init(args['JOB_NAME'], args)

source_path = args['s3_file_path']
target_table = args['dynamodb_table']

# Load the CSV and keep only rows for department 101.
source_df = spark.read.format("csv").option("header", "true").load(source_path)
filtered_df = source_df.filter(source_df.dept_no == 101)

# Convert to a DynamicFrame and write to DynamoDB.
result_frame = DynamicFrame.fromDF(filtered_df, glue_ctx, "dyf_result")
glue_ctx.write_dynamic_frame_from_options(
    frame=result_frame,
    connection_type="dynamodb",
    connection_options={"dynamodb.output.tableName": target_table,
                        "dynamodb.throughput.write.percent": "1.0"})
# NOTE(review): the three statements below reference result_df / glueContext /
# target_path, which are not defined at this level — this looks like the tail
# of a helper (likely __repartition_and_store) whose `def` line lies outside
# this chunk. Indentation reconstructed; confirm against the full file.
repartitioned_stream = DynamicFrame.fromDF(result_df, glueContext, 'result_df') \
    .repartition(2)
# Remove any previous output before writing the fresh result.
glueContext.purge_s3_path(s3_path=target_path, options={"retentionPeriod": 0})
glueContext.write_dynamic_frame_from_options(
    frame=repartitioned_stream,
    connection_type="s3",
    connection_options={"path": target_path},
    format="glueparquet")


def main(argv, glueContext, job):
    """Glue entry point: load staging data, merge with existing data, store the result.

    :param argv: raw CLI arguments (sys.argv); resolved for JOB_NAME,
        database_name, source_table and target_path.
    :param glueContext: the GlueContext to operate in.
    :param job: an (uninitialized) awsglue Job; initialized and committed here.
    """
    args = getResolvedOptions(
        argv, ["JOB_NAME", "database_name", "source_table", "target_path"])
    job.init(args['JOB_NAME'], args)
    staging_df = __load_staging_data(args, glueContext)
    mapped_staging = __map_staging_data(staging_df)
    unioned_data = __union_with_existing_data(mapped_staging, args, glueContext)
    merged_result = __merge_rows(unioned_data)
    __repartition_and_store(merged_result, args, glueContext)
    job.commit()


if __name__ == '__main__':
    glueContext = GlueContext(SparkContext.getOrCreate())
    main(sys.argv, glueContext, Job(glueContext))
def main(args):
    """Incrementally extract policy-hit rows from an Athena table and merge them
    into a rolling CSV on S3.

    Reads the most recent CSV produced by a previous run, queries the Athena
    source for events newer than the CSV's coverage window, explodes the
    policies/parent_policies array, filters to the requested policy strings,
    and writes the combined, time-ordered result back to S3 as a single CSV.

    :param args: argparse-style namespace (attribute access, not a Glue args dict).
    """
    if args.verbose:
        print("Got arguments: %s" % (args))
    glue_client = boto3.client("glue", region_name=args.region)
    s3_client = boto3.client("s3", region_name=args.region)

    # Verify source DB and table exist.
    if validate_db(args.athenaDatabase, glue_client=glue_client) and args.verbose:
        print("Validated source database %s exists." % (args.athenaDatabase))
    if (validate_table(args.athenaDatabase,
                       args.athenaTable,
                       glue_client=glue_client) and args.verbose):
        print("Validated source table %s exists." % (args.athenaTable))

    # Verify input and output buckets exist and are accessible.
    for bucket in [args.outputBucket, args.inputBucket]:
        if validate_bucket(bucket, s3_client=s3_client) and args.verbose:
            # NOTE(review): this message always names outputBucket, even on the
            # iteration that validated inputBucket — confirm intent.
            print("Verified bucket s3://%s exists and is accessible." %
                  (args.outputBucket))

    # Use latest file in bucket that matches prefix string as input.
    input_csv, latest_dt = get_latest_file(args.inputBucket,
                                           prefix=args.inputPrefix,
                                           s3_client=s3_client)
    if not input_csv:
        raise Exception(
            "Found no candidate CSV files in bucket %s with prefix %s." % (
                args.inputBucket,
                "(no prefix)" if not args.inputPrefix else args.inputPrefix,
            ))
    print("Got latest CSV file s3://%s/%s with write time %s." %
          (args.inputBucket, input_csv, latest_dt))

    sc = SparkContext()
    gc = GlueContext(sc)
    sparkSession = gc.spark_session
    job = Job(gc)
    job.init(args.JOB_NAME, vars(args))
    sparkSession.udf.register("json_clean", json_clean)

    # For requester-pays buckets.
    sparkSession._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true")
    gc._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true")

    # Minimum hour for existing data, aligned to a day or hour boundary.
    prelim_min_hour = 0
    aligner = DAILY_ALIGNER if args.fullDays else HOURLY_ALIGNER
    # * 1000000 needed since start_time is in microseconds.
    prelim_min_hour = ((
        (START_TIME - (3600 * args.maxHoursAgo)) // aligner) * aligner) * 1000000

    # Get CSV file contents, filtering out lines that don't fall within --maxHoursAgo.
    input_df = (gc.create_dynamic_frame_from_options(
        "s3",
        {"paths": ["s3://%s/%s" % (args.inputBucket, input_csv)]},
        "csv",
        {"withHeader": True},
    ).toDF().select("*"))
    print("Input DF count: %s" % (input_df.count()))
    csv_prelim_df = input_df.filter("cast(start_time as bigint) >= %s" %
                                    (prelim_min_hour))
    print("Date-bounded DF count: %s" % (csv_prelim_df.count()))

    # Without --hourly: query policy hits for the current hour and the previous
    # 24 hours. With --hourly: only the current and previous hour. (Querying the
    # previous hour is necessary because the previous CSV generation job may have
    # run mid-hour, so new events for that hour may exist.)
    # (now - (now % 3600))          = start of the current hour
    # ((now - (now % 3600)) - 3600) = start of the previous hour
    hours_ago = 1 if args.hourly else 24
    increm_min_hour_sec = (START_TIME - (START_TIME % 3600)) - (3600 * hours_ago)
    increm_min_hour_micro = increm_min_hour_sec * 1000000

    # With --nonStrict: query from the latest hourly epoch present in the input
    # file onwards — potentially more expensive, but fills gaps left by failed
    # or paused runs.
    if args.nonStrict:
        last_seen_epoch = (
            input_df.selectExpr("cast(start_time as bigint) as ts").agg({
                "start_time": "max"
            }).collect().pop()[0])
        # NOTE(review): the aggregation keys on "start_time" after the column
        # was aliased to "ts" — verify this resolves against the source column.
        last_seen_hour = ((last_seen_epoch // 1000000) // aligner) * aligner
        # Stay within the --maxHoursAgo range: increm_min_hour_sec feeds the
        # pushdown predicate against the Athena source table/view.
        increm_min_hour_sec = max(last_seen_hour, increm_min_hour_sec)
        increm_min_hour_micro = increm_min_hour_sec * 1000000

    print("Set minimum hour in seconds: %s" % (increm_min_hour_sec))
    # Keep only CSV rows older than the re-query window (the window is re-read
    # from the Athena source below).
    csv_hits_in_range = csv_prelim_df.filter(
        "cast(start_time as bigint) < %s" % (increm_min_hour_micro))
    print("Total CSV hits in timeframe of interest: %s" %
          (csv_hits_in_range.count()))

    # Timeframe for the pushdown predicate.
    pushdown = "(hour >= %s)" % (increm_min_hour_sec)

    # Get table contents starting from the minimum hour.
    raw_data = gc.create_dynamic_frame.from_catalog(
        database=args.athenaDatabase,
        table_name=args.athenaTable,
        transformation_ctx="raw_data",
        push_down_predicate=pushdown,
    )

    # Determine the fields to extract from the table definition. Deriving them
    # from the table schema is preferable to inferring from the underlying
    # parquet because the two may differ.
    tmp_schema = get_table_schema(args.athenaDatabase,
                                  args.athenaTable,
                                  glue_client=glue_client)
    # Verify we only have one matching column on which to explode.
    if not [col[0] for col in tmp_schema].count(args.policyType) == 1:
        raise Exception(
            "Wrong number of matching %s columns found in schema: %s" %
            (args.policyType, tmp_schema))

    # Sub out the policies/parent_policies field for an explode expression, and
    # lightly massage other fields for clean, consistent CSV output.
    schema = []
    for col in tmp_schema:
        if col[0] == args.policyType:
            schema.append("explode(%s) as policy" % (col[0]))
        elif "map" in col[1].lower() or "struct" in col[1].lower():
            schema.append("json_clean(%s) as %s" % (col[0], col[0]))
        else:
            schema.append("cast(%s as string)" % (col[0]))

    # Targeted policies/parent_policies, quoted for SparkSQL.
    pol_arr = ", ".join([
        "'%s'" % (pol) for pol in args.policyStrings.split(args.delimiter) if pol
    ])

    # Filter table contents by policy hits. A non-null policies field implies a
    # non-null parent_policies field (and vice versa), so filtering on one is OK.
    new_hits = (raw_data.toDF().filter("policies is not NULL").selectExpr(
        *schema).filter("policy in (%s)" % (pol_arr)))
    print("New policy triggers found since last job: %s" % (new_hits.count()))

    # Combine new hits with the previous CSV contents; coalesce(1) yields a
    # single output file.
    write_df = new_hits.union(csv_hits_in_range).orderBy(
        "start_time").coalesce(1)
    uniq = hash_key(args.salt, args.ordinal, args.subscriber, args.receiver)
    s3_loc = "s3://%s" % (os.path.join(args.outputBucket, args.outputDir, uniq))
    if args.verbose:
        print("S3 Results Location: %s" % s3_loc)
    write_df.write.option("quoteAll", True).csv(s3_loc, header=True)

    # Rename output file, if requested.
    if args.outputFilename:
        rename_resp = rename_file(
            args.outputBucket,
            args.outputFilename,
            prefix=args.outputDir,
            dont_preserve_dir=args.dontPreserveOutputDir,
            keep_orig=args.keepOrigOnRename,
        )
        if rename_resp and args.verbose:
            print("Renamed file to %s/%s." %
                  (args.outputDir, args.outputFilename))
    job.commit()
# Setup section for the SFMC-migration Glue job: resolve arguments, build the
# Glue/Spark contexts, and derive the S3/Glue locations used downstream.
# BUG FIX: `sys`, `SparkContext`, `GlueContext` and `Job` were all referenced
# below without being imported.
import sys
import itertools
import time

import boto3
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import NullType, StructType, StructField
from pyspark.sql.types import StringType, DateType, LongType, ArrayType
from pyspark.sql.functions import *

args = getResolvedOptions(sys.argv, ['JOB_NAME', 'bucketName', 'workspace',
                                     'jdbc_url', 'username', 'pswd'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# ccn-alerts-104328-{workspace}-sfmc-migration
workspace = args['workspace']
bucketName = args['bucketName']
jdbc_url = args['jdbc_url']
username = args['username']
pswd = args['pswd']

# The password is deliberately omitted from this log line.
print('args passed are: {}, {}, {}, {}'.format(workspace, bucketName,
                                               jdbc_url, username))

s3Location = 's3://{}'.format(bucketName)
glue_database = '{}_sfmc_migration'.format(workspace)
outputLocation = ''+s3Location+'/athena-results-ccn-data/'
def main():
    """Clean a raw Glue catalog entity and write it as glueparquet to the clean bucket.

    Normalizes epoch/timestamp/date/zip/double columns, adds ETL processing
    metadata, then writes through a catalog-updating S3 sink.
    """
    ## @params: [JOB_NAME, db_name, entity_name, datetime_column, date_column, partition_column, output_bucket_name]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'datetime_column', 'date_column',
        'partition_column', 'output_bucket_name'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    datetime_column = args['datetime_column']
    date_column = args['date_column']
    output_bucket_name = args['output_bucket_name']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name
    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)

    ## @type: DataSource — raw entity from the Glue catalog.
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    input_data = raw_data.toDF()
    # processing_datetime arrives as a unix epoch — convert it first so the
    # generic *_datetime pass below can parse it.
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    # *_datetime columns -> timestamp; *_date columns -> date.
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    # zip is kept as a string (preserves leading zeros).
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    # doubles -> fixed-precision decimal for stable storage.
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c)
        if dict(input_data.dtypes)[c] == 'double' else col(c)
        for c in input_data.columns
    ])

    ## @type: EnrichDataset — stamp processing time and derive the date column.
    enriched_data = cleaned_data.withColumn(
        'etl_processing_datetime',
        unix_timestamp(f.lit(processing_start_datetime),
                       'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column,
                    f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    ## @type: DataSink — glueparquet sink that also updates the Glue catalog.
    ## connection_options: {"path": "s3://<output_bucket_name>/clean/<entity_name>",
    ##   "enableUpdateCatalog": True, "updateBehavior": "UPDATE_IN_DATABASE",
    ##   "partitionKeys": [<partition_key>]}
    sink = glue_context.getSink(connection_type="s3",
                                path="s3://" + output_bucket_name + "/" +
                                clean_table_name,
                                enableUpdateCatalog=True,
                                updateBehavior="UPDATE_IN_DATABASE",
                                partitionKeys=[partition_column])
    sink.setFormat("glueparquet")
    sink.setCatalogInfo(catalogDatabase=clean_db_name,
                        catalogTableName=clean_table_name)
    sink.writeFrame(DynamicFrame.fromDF(enriched_data, glue_context, 'result'))
    job.commit()