def processMain(): ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) ds0 = glueContext.create_dynamic_frame.from_catalog(database="autoglues3lineage", table_name="train_sm_s2adb_csv", transformation_ctx="ds0") rc1 = ResolveChoice.apply(frame=ds0, choice="make_struct", transformation_ctx="rc1") ds3 = glueContext.write_dynamic_frame.from_options(frame=rc1, connection_type="redshift", connection_options={ "url": "jdbc:redshift://redshift-cluster-1.csvp5wcqqxvw.us-east-1.redshift.amazonaws.com:5439/world", "dbtable": "atn.gluetable211"}, transformation_ctx="ds3") ds7 = glueContext.write_dynamic_frame.from_options(frame=rc1, connection_type="s3", connection_options={ "path": "s3://asgqatestautomation4/Targetdata211"}, format="json", transformation_ctx="ds7") # ds3 = glueContext.write_dynamic_frame.from_options(frame=rc1, connection_type="postgresql", # connection_options={ # "url": "jdbc:postgresql://decheqaperf01v.asg.com:5432/postgres", # "dbtable": "public.gluetable26"}, # transformation_ctx="ds3") job.commit()
def _get_spark_context(): spark_context = GlueContext(SparkContext.getOrCreate()) job = Job(spark_context) args = _get_arguments({}) job.init(args['JOB_NAME'], args) return (spark_context, job)
def main( input_path, output_path, conn_type, module_name, function_name, glue_config=None ): """ There is a main function of Glue job driver. The idea is to keep it generic to be able to reuse for running multiple Glue jobs. Driver is responsible to validate input parameters and invoke a proper Glue job. It does not contain any specific business logic. @param input_path: input path contains input files to the job @param output_path: output path used to write Glue job results @param conn_type: use 'file' to read files from a local file system or 's3' for reading files from S3 @param module_name and function_name: are used to call Glue job using reflection @param glue_config: optional additional Glue configuration parameters Parameters example: --JOB_NAME=movie_analytics_job --CONN_TYPE=file --INPUT_PATH=/input --OUTPUT_PATH=/results --MODULE_NAME=tasks --FUNCTION_NAME=create_and_stage_average_rating """ try: glue_context = create_glue_context() module = importlib.import_module(module_name) module.init_glue(glue_context) module.init_spark(glue_context.spark_session) f = getattr(module, function_name) f(input_path, output_path, conn_type) job = Job(glue_context) job.init(args["JOB_NAME"], args) job.commit() except Exception as e: sys.error( f"Couldn't execute job due to {str(e)} for module: {module_name} and function: {function_name}" ) raise e
def processMain(): ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) datasource0 = glueContext.create_dynamic_frame.from_catalog( database="autoglues3lineage", table_name="train_sm_s2adb_csv", transformation_ctx="datasource0") resolvechoice2 = ResolveChoice.apply(frame=datasource0, choice="make_struct", transformation_ctx="resolvechoice2") dropnullfields3 = DropNullFields.apply( frame=resolvechoice2, transformation_ctx="dropnullfields3") datasink4 = glueContext.write_dynamic_frame.from_options( frame=dropnullfields3, connection_type="s3", connection_options={"path": "s3://asgqatestautomation4/Targetdata1A"}, format="csv", transformation_ctx="datasink4") datasink5 = glueContext.write_dynamic_frame.from_options( frame=dropnullfields3, connection_type="s3", connection_options={"path": "s3://asgqatestautomation4/Targetdata1B"}, format="parquet", transformation_ctx="datasink5") job.commit()
def create_glue_context(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session job = Job(glueContext) args = getResolvedOptions(sys.argv, ['JOB_NAME', 'input']) job.init(args['JOB_NAME'], args) return spark, job
def get_spark_session_and_glue_job( glue_args: Dict, conf=None, py_files: Iterable[str] = None, extra_jars: List[str] = None, ): """ Get spark session and AWS glue job. Parameters ---------- glue_args Dictionary of Argument Name: Argument value extra_jars Path to dependent jar files conf : Union[pyspark.SparkConf, Dict[str, str]] Spark config, either object or dictionary of config options. py_files Paths to python files (.py, .zip, .egg) Returns ------- pyspark.sql.SparkSession, awsglue.job.Job """ from awsglue.context import GlueContext from awsglue.job import Job from pyspark import SparkContext, SparkConf LOGGER.debug("Creating spark session with parameters") LOGGER.debug("conf=%s", conf) LOGGER.debug("py_files=%s", py_files) LOGGER.debug("extra_jars=%s", extra_jars) if isinstance(conf, dict): spark_conf = SparkConf() spark_conf.setAll(conf.items()) elif isinstance(conf, SparkConf): spark_conf = conf else: spark_conf = None if extra_jars and spark_conf: spark_dependencies = ",".join(extra_jars) spark_conf.set("spark.jars.packages", spark_dependencies) sc = SparkContext.getOrCreate(conf=spark_conf) if py_files: LOGGER.debug("Adding PYFILEs: %s", py_files) for py_file in py_files: sc.addPyFile(py_file) glue_context = GlueContext(sparkContext=sc) job = Job(glue_context=glue_context) job.init(glue_args["JOB_NAME"], glue_args) # .py, .zip or .egg return glue_context.spark_session, job
def main(args, yesterday): sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) database = args['database_name'] table_name = '{0}kinesis_data'.format(args['table_prefix']) partition_predicate = \ 'partition_0="{0}" and partition_1="{1:02}" and partition_2="{2:02}"'.format( yesterday.year, yesterday.month, yesterday.day) datasource0 = glueContext.create_dynamic_frame.from_catalog( database=database, table_name=table_name, push_down_predicate=partition_predicate, transformation_ctx='datasource0') if datasource0.count() > 0: applymapping1 = ApplyMapping.apply(frame=datasource0, mappings=[('device_id', 'string', 'device_id', 'string'), ('timestamp', 'long', 'timestamp', 'long')], transformation_ctx='applymapping1') df = applymapping1.toDF() dev = df.drop_duplicates(['device_id']) device_list = dev.collect() result_list = [] for device in device_list: device_id = device['device_id'] df2 = df[df.device_id == device_id] df_list = df2.sort('timestamp', ascending=True).collect() result_list.extend(create_diff_timestamp_list(device_id, df_list)) df = spark.createDataFrame(result_list) dyf = DynamicFrame.fromDF(df, glueContext, 'dyf') # Hive format output_path = os.path.join(args['target_bucket'], 'year={0}'.format(yesterday.year), 'month={0:02}'.format(yesterday.month), 'day={0:02}'.format(yesterday.day)) glueContext.write_dynamic_frame.from_options( frame=dyf, connection_type='s3', connection_options={'path': output_path}, format='parquet', transformation_ctx='datasink2') job.commit()
def _get_spark_session_and_glue_job(glue_args): from awsglue.context import GlueContext from awsglue.job import Job sc = SparkContext.getOrCreate() glue_context = GlueContext(sparkContext=sc) job = Job(glue_context=glue_context) job.init(glue_args["JOB_NAME"], glue_args) return glue_context.spark_session, job
def create_glue_context(): """ Creates Glue Context / Spark session """ args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) return spark, job
def main(): console("Starting Job") ## @params: [job_name] # 1. Start Glue Context glueContext = GlueContext(SparkContext.getOrCreate()) # 2. Initialize Job job = Job(glueContext) args = get_args() job.init(args["JOB_NAME"], args) client = boto3.client("glue", region_name="us-east-1") Tables = client.get_tables(DatabaseName=args["athena_database"]) tableList = Tables["TableList"] # 3. Clear bucket contents console(f"Excluding S3 files for: {args['target_bucket']}") s3 = boto3.resource("s3") bucket = s3.Bucket(args["target_bucket"]) bucket.objects.delete() for table in tableList: tableName = table["Name"] if re.search(f"[0-9]+", tableName): # Ignoring Athena Tables already processed... continue else: console("Processing Table {}".format(tableName)) # 4. Load dynamic dataframe datasource0 = glueContext.create_dynamic_frame.from_catalog( database=args["athena_database"], table_name=tableName, transformation_ctx="datasource0", ) # 5. Process Dynamo database # Drop null fields dropnullfields1 = DropNullFields.apply( frame=datasource0, transformation_ctx="dropnullfields1") # Save Dynamic Frame on S3 using Glue glueContext.write_dynamic_frame.from_options( frame=dropnullfields1, connection_type="s3", connection_options={ "path": "s3://{}/{}/".format(args["target_bucket"], tableName) }, format="parquet", transformation_ctx="datasink2", ) job.commit()
def main(): args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE', 'ANONYMIZATION_CONF' ]) sc = SparkContext() os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1" # avoiding spark creates $folders$ in S3 hadoop_conf = sc._jsc.hadoopConfiguration() # hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") # hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") glueContext = GlueContext(sc) spark = glueContext.spark_session # Needed to reliably work between parquet, spark, and pandas dataframes. spark.conf.set("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1") spark.conf.set("spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT", "1") job = Job(glueContext) job.init(args['JOB_NAME'], args) input_conf_str_json = args["ANONYMIZATION_CONF"] print('input_conf_str_json:', input_conf_str_json) anonymization_conf_dict = json.loads(input_conf_str_json) print('anonimization_conf_dict:', anonymization_conf_dict) position_references_in_conf = {} for i in range(len(anonymization_conf_dict['datasets'])): position_references_in_conf[anonymization_conf_dict['datasets'][i] ['table']] = i print('position_references_in_conf:', position_references_in_conf) dataLakeBucket = args["DL_BUCKET"] dataLakePrefix = args["DL_PREFIX"] aws_region = args["DL_REGION"] glue_database = args["GLUE_SRC_DATABASE"] process(job, dataLakeBucket, dataLakePrefix, anonymization_conf_dict, position_references_in_conf, glueContext, glue_database, aws_region) delete_temporary_s3_objects(dataLakeBucket, aws_region)
def run(): """ Run method. This method is the starting point of the actual logic of the Glue job """ args = getResolvedOptions(sys.argv, ['JOB_NAME', 's3_bucket', 'job_id']) bucket = args['s3_bucket'] job_id = args['job_id'] glue_context = GlueContext(SparkContext.getOrCreate()) spark = glue_context.spark_session job = Job(glue_context) job.init(args['JOB_NAME'], args) aggregate_key = 's3://' + bucket + '/output/' + job_id + '/rnorm-aggregate.json' aggregate_dataframe = spark.read.json(aggregate_key) basics_key = 's3://' + bucket + '/input/title.basics.tsv.gz' basics_dataframe = spark.read.format("csv").option("delimiter", "\t").option("header", "true").load(basics_key) basics_dataframe = basics_dataframe.toDF("tconstTmp", "titleType", "primaryTitle", "originalTitle", "isAdult", "startYear", "endYear", "runtimeMinutes", "genres") aggregate_dataframe = aggregate_dataframe.join(basics_dataframe, aggregate_dataframe.tconst == basics_dataframe.tconstTmp, how='left').drop('tconstTmp') ratings_key = 's3://' + bucket + '/input/title.ratings.tsv.gz' ratings_dataframe = spark.read.format("csv").option("delimiter", "\t").option("header", "true").load(ratings_key) ratings_dataframe = ratings_dataframe.toDF("tconstTmp", "averageRating", "numVotes") aggregate_dataframe = aggregate_dataframe.join(ratings_dataframe, aggregate_dataframe.tconst == ratings_dataframe.tconstTmp, how='left').drop('tconstTmp') aggregate_dataframe = merge_calc_config(spark, bucket, job_id, aggregate_dataframe, 'ynorm') aggregate_dataframe = merge_calc_config(spark, bucket, job_id, aggregate_dataframe, 'yp90') aggregate_dataframe = merge_calc_config(spark, bucket, job_id, aggregate_dataframe, 'yp10') results_folder = 's3://' + bucket + '/athena/job_id=' + job_id + '/' aggregate_dataframe.repartition(10).write.parquet(results_folder, mode="overwrite") job.commit()
def main(): ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE' ]) sc = SparkContext() # avoiding spark creates $folders$ in S3 hadoop_conf = sc._jsc.hadoopConfiguration() hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) dataLakeBucket = args["DL_BUCKET"] dataLakePrefix = args["DL_PREFIX"] aws_region = args["DL_REGION"] glue_database = args["GLUE_SRC_DATABASE"] process(job, glueContext, glue_database, dataLakeBucket, dataLakePrefix, aws_region)
from awsglue.job import Job import time import pg8000 import boto3 import re from decimal import * import extract_rs_query_logs_functions as functions ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME','REGION','CLUSTER_ENDPOINT']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) job_configs={} job_configs.update(args) clusterId= re.search('jdbc:redshift://(.+?)\..*',args['CLUSTER_ENDPOINT']).group(1) job_configs.update(functions.getJobConfigurations(clusterId,job_configs)) job_configs['CLUSTER_ID']=clusterId tempDir=args['TempDir'] s3Prefix=job_configs['s3_prefix'] credentials=boto3.Session().get_credentials() job_configs['aws_access_key_id'] = credentials.access_key job_configs['aws_secret_access_key'] = credentials.secret_key job_configs['aws_session_token'] = credentials.token job_configs.update(args) job_configs['jdbcURL']="{}?user={}&password={}".format(args['CLUSTER_ENDPOINT'],job_configs['user'],job_configs['password']) job_configs['region_name']=boto3.session.Session().region_name job_configs['spark_session']=spark
def main(): ## @params: [JOB_NAME, db_name, entity_name, partition_column, output_bucket_name, datetime_column,date_column] args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name', 'target_entity_name', 'partition_column', 'output_bucket_name', 'primary_key', 'parallelism', 'date_column', 'datetime_column' ]) job_name = args['JOB_NAME'] raw_db_name = args['raw_db_name'] clean_db_name = args['clean_db_name'] source_entity_name = args['source_entity_name'] target_entity_name = args['target_entity_name'] partition_column = args['partition_column'] date_column = args['date_column'] datetime_column = args['datetime_column'] hudi_primary_key = args['primary_key'] output_bucket_name = args['output_bucket_name'] parallelism = args['parallelism'] # Constants derived from parameters raw_table_name = source_entity_name clean_table_name = target_entity_name processing_start_datetime = datetime.now(timezone.utc) # Initialization of contexts and job spark = SparkSession.builder.config( 'spark.serializer', 'org.apache.spark.serializer.KryoSerializer').getOrCreate() glue_context = GlueContext(SparkContext.getOrCreate()) job = Job(glue_context) job.init(job_name, args) logger = glue_context.get_logger() logger.info('Initialization.') # Initialization of Glue client to connect to Glue Catalog and retrieve table information glueClient = boto3.client('glue') ## @type: DataSource ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"] ## @return: raw_data ## @inputs: [] raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog( database=raw_db_name, table_name=raw_table_name, transformation_ctx="raw_data") # Terminate early if there is no data to process if raw_data.toDF().head() is None: job.commit() return ## @type: CleanDataset ## @args: [] ## @return: cleaned_data ## @inputs: [frame = raw_data] input_data = raw_data.toDF() cleaned_data = input_data.select(*[ from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c) for c in input_data.columns ]) cleaned_data = cleaned_data.select(*[ to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c) for c in input_data.columns ]) cleaned_data = cleaned_data.select(*[ to_date(c).alias(c) if c.endswith('_date') else col(c) for c in input_data.columns ]) cleaned_data = cleaned_data.select(*[ col(c).cast('string').alias(c) if c == 'zip' else col(c) for c in input_data.columns ]) cleaned_data = cleaned_data.select(*[ col(c).cast('decimal(15,2)').alias(c) if dict(input_data.dtypes)[c] == 'double' else col(c) for c in input_data.columns ]) ## @type: EnrichDataset ## @args: [] ## @return: enriched_data ## @inputs: [frame = cleaned_data] enriched_data = cleaned_data.withColumn('etl_processing_datetime', unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \ .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date")) isTableExists = False try: glueClient.get_table(DatabaseName=clean_db_name, Name=target_entity_name) isTableExists = True logger.info(clean_db_name + '.' + target_entity_name + ' exists.') except ClientError as e: if e.response['Error']['Code'] == 'EntityNotFoundException': isTableExists = False logger.info(clean_db_name + '.' + target_entity_name + ' does not exist. Table will be created.') partition_path = '' if partition_column == 'None' else partition_column common_config = { 'className': 'org.apache.hudi', 'hoodie.datasource.hive_sync.use_jdbc': 'false', 'hoodie.index.type': 'GLOBAL_BLOOM', 'hoodie.datasource.write.precombine.field': datetime_column, 'hoodie.datasource.write.recordkey.field': hudi_primary_key, 'hoodie.table.name': target_entity_name, 'hoodie.consistency.check.enabled': 'true', 'hoodie.datasource.hive_sync.database': clean_db_name, 'hoodie.datasource.hive_sync.table': target_entity_name, 'hoodie.datasource.hive_sync.enable': 'true', 'hoodie.datasource.write.partitionpath.field': partition_path, 'hoodie.datasource.hive_sync.partition_fields': partition_path, 'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.NonPartitionedExtractor' if partition_column == 'None' else 'org.apache.hudi.MultiPartKeysValueExtractor', 'hoodie.datasource.write.hive_style_partitioning': 'false' if partition_column == 'None' else 'true', 'hoodie.datasource.write.keygenerator.class': 'org.apache.hudi.keygen.NonpartitionedKeyGenerator' if partition_column == 'None' else 'org.apache.hudi.keygen.SimpleKeyGenerator' } incremental_config = { 'hoodie.upsert.shuffle.parallelism': parallelism, 'hoodie.datasource.write.operation': 'upsert', 'hoodie.cleaner.policy': 'KEEP_LATEST_COMMITS', 'hoodie.cleaner.commits.retained': 10 } initLoad_config = { 'hoodie.bulkinsert.shuffle.parallelism': parallelism, 'hoodie.datasource.write.operation': 'upsert' } if (isTableExists): logger.info('Incremental upsert.') combinedConf = {**common_config, **incremental_config} enriched_data.write.format('org.apache.hudi').options( **combinedConf).mode('Append').save("s3://" + output_bucket_name + "/" + clean_table_name) else: logger.info('Inital load.') combinedConf = {**common_config, **initLoad_config} enriched_data.write.format('org.apache.hudi').options( **combinedConf).mode('Overwrite').save("s3://" + output_bucket_name + "/" + clean_table_name) job.commit()
from awsglue.context import GlueContext from pyspark.context import SparkContext from awsglue.job import Job import sys from awsglue.utils import getResolvedOptions from awsglue.dynamicframe import DynamicFrame glueContext = GlueContext(SparkContext.getOrCreate()) glueJob = Job(glueContext) args = getResolvedOptions(sys.argv,['JOB_NAME']) glueJob.init(args['JOB_NAME'],args) #sparkSession = glueContext.sparkSession spark = glueContext.spark_session #df = sparkSession.read.csv("s3a://pkm") #df.show() #dfnew = spark.read.option("header","true").option("delimiter", ",").csv("s3a://pkm") df = spark.read.option("header","true").format("csv").load("s3a://pkm #inputGDF = glueContext.create_dynamic_frame_from_options(connection_type = "s3", connection_options = {"paths": ["s3://pkm"], "recurse":True}, format = "csv") #df=inputGDF.toDF() df.show(2) dynamic_dframe = DynamicFrame.fromDF(df, glueContext, "dynamic_df") ##Write the DynamicFrame as a file in CSV format to a folder in an S3 bucket. ##It is possible to write to any Amazon data store (SQL Server, Redshift, etc) by using any previously defined connections. retDatasink4 = glueContext.write_dynamic_frame.from_options(frame = dynamic_dframe, connection_type = "s3", connection_options = {"path": "s3://pkm-target"}, format = "csv", transformation_ctx = "datasink4") glueJob.commit()
args = getResolvedOptions(sys.argv, ['BucketName']) s3 = boto3.resource('s3') s3_client = boto3.client('s3') athena_client = boto3.client(service_name='athena', region_name='us-west-2') bucket_name = args['BucketName'] BucketName = args['BucketName'] database = 'c360view_stage' sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init('n1_c360_dispositions') def run_query(client, query): response = client.start_query_execution( QueryString=query, QueryExecutionContext={'Database': database}, ResultConfiguration={ 'OutputLocation': 's3://{}/athenaoutput/'.format(bucket_name) }, ) return response def validate_query(client, query_id): resp = ["FAILED", "SUCCEEDED", "CANCELLED"]
from pyspark.sql.functions import * from pyspark.context import SparkContext from awsglue.transforms import * from awsglue.context import GlueContext from awsglue.job import Job glue_context = GlueContext(SparkContext.getOrCreate()) spark = glue_context.spark_session job = Job(glue_context) job.init('ddb2s3') dyf = glue_context.create_dynamic_frame.from_options( connection_type='dynamodb', connection_options={ 'dynamodb.input.tableName': 'NatureRemo', 'dynamodb.splits': '72' # Should be (DPU-1)*8 }) dyf.printSchema() dyf = dyf.resolveChoice(specs=[('value', 'cast:double')]) dyf = dyf.repartition( 1) # You need to modify this number to your preferred file number. glue_context.write_dynamic_frame_from_options( frame=dyf, connection_type='s3', connection_options={'path': "s3://example-bucket/glue/ddb2s3/"}, format='parquet') # You will need to specify partitionKeys if you want to write data with hive partitioning based on column 'year', 'month', and 'day #glue_context.write_dynamic_frame_from_options(
# get the arguments kwargs = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'instruments', 'columns', 'test_weeks', 'n_tests', 'holdout_weeks', 'length', 's3_bucket', 's3_path', 'user', 'labid' ]) kwargs['instruments'] = kwargs['instruments'].split(',') kwargs['columns'] = kwargs['columns'].split(',') kwargs['test_weeks'] = int(kwargs['test_weeks']) kwargs['n_tests'] = int(kwargs['n_tests']) kwargs['holdout_weeks'] = int(kwargs['holdout_weeks']) kwargs['length'] = int(kwargs['length']) # register the udf spark.udf.register( "create_tests", create_tests, ArrayType( StructType([ StructField('ts_start', TimestampType(), True), StructField('target', ArrayType(DoubleType()), True) ]))) # register the udf spark.udf.register("to_delta", to_delta, ArrayType(DoubleType())) job = Job(glueContext) job.init(kwargs['JOB_NAME'], kwargs) create_all_features(**kwargs) job.commit()
import datetime import boto3 args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'region', 'source_database', 'target_location', 'target_prefix' ]) job_name = args['JOB_NAME'] region = args['region'] source_db = args['source_database'] target_bucket = args['target_location'] target_prefix = args['target_prefix'] sc = SparkContext() glueContext = GlueContext(sc) job = Job(glueContext) job.init(job_name, args) client = boto3.client('glue', region_name=region) # Make this a global variable so it can't change if we run near midnight UTC # Plus I'm not creating a new object for every record! now = datetime.datetime.now() def AddPartitions(rec): rec["yyyy"] = now.year rec["mm"] = now.month rec["day"] = now.day return rec def transform(source_db,
source_creation_dtm,pohne_nbr, full_nam, last_nam, fe_nam, fe_id, fe_idn, source_name, posting_agent, data_origin, load_dtm, update_dtm, versn_flg, hr_sso, hr_nam, mgr_sso, mgr_nam, prt_persn_d_md5, prt_persn_d_md5_scd1, start_dtm, end_dtm, current_flag_ind, deleted_flag_ind""" # Initialize the GlueContext and SparkContext SC = SparkContext() GLUECONTEXT = GlueContext(SC) SPARK = GLUECONTEXT.spark_session JOB = Job(GLUECONTEXT) JOB.init(ARGS['JOB_NAME'], ARGS) RUN_ID = ARGS['JOB_RUN_ID'] JOB_NAME = ARGS['JOB_NAME'] TEMPDIR = ARGS['TempDir'] #CurrentTimeStamp DATETIMESTAMP = datetime.now().strftime('%Y-%m-%d %H:%M:%S') END_DTM = '9999-12-31 23:59:59' JOB_START_TIME = timeit.default_timer() # creating DataFrames from source glue crawlers #SRC_NOTEMPTY = True PRT_PERSN_D_SOURCE_1 = GLUECONTEXT.create_dynamic_frame. \
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions ##Use Pandas library to execute Pivot function, popular Python libraries have already been integrated by default. Please check https://docs.amazonaws.cn/en_us/glue/latest/dg/aws-glue-programming-python-libraries.html import pandas as pd from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.dynamicframe import DynamicFrame from awsglue.job import Job args = getResolvedOptions(sys.argv, ['Target']) glueContext = GlueContext(SparkContext.getOrCreate()) sc = glueContext.spark_session job = Job(glueContext) job.init(args['Target'], args) #Read the table from preset database in Glue data source, change the value of database and table_name accordingly datasource0 = glueContext.create_dynamic_frame.from_catalog( database="default", table_name="jsonpivottest_json", transformation_ctx="datasource0") datasource0.printSchema() applymapping1 = ApplyMapping.apply( frame=datasource0, mappings=[("dimensions", "array", "dimensions", "array"), ("measurename", "string", "measurename", "string"), ("measurevalue", "string", "measurevalue", "string"), ("time", "string", "time", "string"), ("timeunit", "string", "timeunit", "string")],
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['JOB_NAME']) #args = getResolvedOptions(sys.argv, ['JOB_NAME','VAL1','VAL2','VAL3','DEST_FOLDER']) #v_list=[{"VAL1":args['rohit_name']}] sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME']) freshers_data = spark.read.format("csv").option("header", "true").option( "inferSchema", "true").load('s3://bucketname-report.csv.gz') freshers_data.printSchema() freshers_data.show(20, False) #print('v_list') #print(v_list)
class JobRunner(object): """This class manages the orchestration of new jobs, including parsing arguments and control flow""" SERVICE_DEFINITIONS = { 'alb': [ALBRawCatalog, ALBConvertedCatalog], 'elb': [ELBRawCatalog, ELBConvertedCatalog], 'cloudtrail': [CloudTrailRawCatalog, CloudTrailConvertedCatalog], 'cloudfront': [CloudFrontRawCatalog, CloudFrontConvertedCatalog], 's3_access': [S3AccessRawCatalog, S3AccessConvertedCatalog], 'vpc_flow': [VPCFlowRawCatalog, VPCFlowConvertedCatalog], } def __init__(self, service_name): args = getResolvedOptions(sys.argv, ['JOB_NAME'] + self._job_arguments()) # Validate the service name if not self.is_valid_service(service_name): raise Exception("'%s' is not yet a supported service." % service_name) self.glue_context = self._init_glue_context() self.job = Job(self.glue_context) region = self.get_instance_region() # Create data catalog references raw_klas = self.SERVICE_DEFINITIONS[service_name][0] converted_klas = self.SERVICE_DEFINITIONS[service_name][1] self.raw_catalog = raw_klas( region, args['raw_database_name'], args['raw_table_name'], args['s3_source_location'] ) self.optimized_catalog = converted_klas( region, args['converted_database_name'], args['converted_table_name'], args['s3_converted_target'] ) # Assume that if the raw table does not exist, this is our first run self.initial_run = not self.raw_catalog.does_table_exist() # Create a converter object and initialize the glue job! self.converter = DataConverter(self.glue_context, self.raw_catalog, self.optimized_catalog) self.job.init(args['JOB_NAME'], args) @staticmethod def is_valid_service(service_name): """Determines whether the given service_name is a supported service or not""" return service_name in JobRunner.SERVICE_DEFINITIONS def get_instance_region(self): """Retrieve the current AWS Region from the Instance Metadata""" contents = urllib2.urlopen("http://169.254.169.254/latest/dynamic/instance-identity/document").read() return json.loads(contents).get('region') def create_tables_if_needed(self): """If this is the initial run of the Job, create both the raw and optmized tables in the Data Catalog""" if self.initial_run is True: # TODO: Fail if the table already exists, or for converted tables if the S3 path already exists LOGGER.info("Initial run, scanning S3 for partitions.") self.raw_catalog.initialize_table_from_s3() # Note that if the source table is partitionless, this is a null-op. self.optimized_catalog.initialize_with_partitions(self.raw_catalog.partitioner.build_partitions_from_s3()) def add_new_raw_partitions(self): """For the raw catalog, check and see if any new partitions exist for UTC today. Continue this check for every day previous until we reach a day where a partition exists.""" if self.initial_run is not True: LOGGER.info("Recurring run, only looking for recent partitions on raw catalog.") self.raw_catalog.add_recent_partitions() def add_new_optimized_partitions(self): """For the optimized catalog, check and see if any new partitions exist for UTC today. Continue this check for every day previous until we reach a day where a partition exists. If this is the initial run, add whatever partitions we can find. """ if self.initial_run and isinstance(self.raw_catalog.partitioner, NullPartitioner): LOGGER.info("Initial run with source NullPartitioner, adding all partitions from S3.") self.optimized_catalog.get_and_create_partitions() else: self.optimized_catalog.add_recent_partitions() def trigger_conversion(self): """Trigger the DataConverter""" self.converter.run() def finish(self): """Take any actions necessary to finish the job""" self.job.commit() def convert_and_partition(self): """A wrapper for the most common operations of these jobs. This allows for a simple one-line interface to the consumer, but allows them to use more-specific methods if need be. """ self.create_tables_if_needed() self.add_new_raw_partitions() self.trigger_conversion() self.add_new_optimized_partitions() self.finish() @staticmethod def _job_arguments(): return [ 'raw_database_name', 'raw_table_name', 'converted_database_name', 'converted_table_name', 's3_source_location', 's3_converted_target' ] @staticmethod def _init_glue_context(): # Imports are done here so we can isolate the configuration of this job from awsglue.context import GlueContext from pyspark.context import SparkContext spark_context = SparkContext.getOrCreate() spark_context._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") # noqa pylint: disable=protected-access spark_context._jsc.hadoopConfiguration().set("parquet.enable.summary-metadata", "false") # noqa pylint: disable=protected-access return GlueContext(spark_context)
def main(args): if args.verbose: print("Got arguments: %s" % (args)) glue_client = boto3.client("glue", region_name=args.region) s3_client = boto3.client("s3", region_name=args.region) # Verify source DB and table exist if validate_db(args.athenaDatabase, glue_client=glue_client) and args.verbose: print("Validated source database %s exists." % (args.athenaDatabase)) if (validate_table( args.athenaDatabase, args.athenaTable, glue_client=glue_client) and args.verbose): print("Validated source table %s exists." % (args.athenaTable)) # Verify input and output buckets exist and are accessible. for bucket in [args.outputBucket, args.inputBucket]: if validate_bucket(bucket, s3_client=s3_client) and args.verbose: print("Verified bucket s3://%s exists and is accessible." % (args.outputBucket)) # Use latest file in bucket that matches prefix string as input. input_csv, latest_dt = get_latest_file(args.inputBucket, prefix=args.inputPrefix, s3_client=s3_client) if not input_csv: raise Exception( "Found no candidate CSV files in bucket %s with prefix %s." % ( args.inputBucket, "(no prefix)" if not args.inputPrefix else args.inputPrefix, )) print("Got latest CSV file s3://%s/%s with write time %s." % (args.inputBucket, input_csv, latest_dt)) sc = SparkContext() gc = GlueContext(sc) sparkSession = gc.spark_session job = Job(gc) job.init(args.JOB_NAME, vars(args)) sparkSession.udf.register("json_clean", json_clean) # For requester payer sparkSession._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true") gc._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true") # Get minimum hour for existing data prelim_min_hour = 0 aligner = DAILY_ALIGNER if args.fullDays else HOURLY_ALIGNER # * 1000000 needed since start_time is in microseconds prelim_min_hour = (( (START_TIME - (3600 * args.maxHoursAgo)) // aligner) * aligner) * 1000000 # Get CSV file contents, filtering out lines that don't fall within --maxHoursAgo. input_df = (gc.create_dynamic_frame_from_options( "s3", { "paths": ["s3://%s/%s" % (args.inputBucket, input_csv)] }, "csv", { "withHeader": True }, ).toDF().select("*")) print("Input DF count: %s" % (input_df.count())) csv_prelim_df = input_df.filter("cast(start_time as bigint) >= %s" % (prelim_min_hour)) print("Date-bounded DF count: %s" % (csv_prelim_df.count())) # If --hourly is not set, query for policy hits for current hour # and previous 24 hours. # When --hourly is set, query for policy hits only for the current # and previous hour. (Querying for the previous hour is necessary) # as the previous CSV generation job may have run in the middle of the # hour, meaning new events for that hour may exist.) # (now - (now % 3600)) = start of the current hour # ((now - (now % 3600)) - 3600) = start of the previous hour hours_ago = 1 if args.hourly else 24 increm_min_hour_sec = (START_TIME - (START_TIME % 3600)) - (3600 * hours_ago) increm_min_hour_micro = increm_min_hour_sec * 1000000 # When nonStrict is set, query for the latest hourly epoch in the input file # as well as all subsequent hourly epochs. This query may be more expensive # to complete, but allows for the possibility of filling in gaps if previous # runs of the job failed or were paused. if args.nonStrict: # We may need to re-query for transactions in the latest epoch depending on # when the previous query was run. Filter these rows out as well, then # set newest_epoch to our minimum hour epoch. last_seen_epoch = ( input_df.selectExpr("cast(start_time as bigint) as ts").agg({ "start_time": "max" }).collect().pop()[0]) last_seen_hour = ((last_seen_epoch // 1000000) // aligner) * aligner # Make sure we don't go outside our --maxHoursAgo range. # This check is necessary as increm_min_hour_sec is used in the pushdown # predicate when querying against the Athena source table/view. increm_min_hour_sec = max(last_seen_hour, increm_min_hour_sec) increm_min_hour_micro = increm_min_hour_sec * 1000000 print("Set minimum hour in seconds: %s" % (increm_min_hour_sec)) csv_hits_in_range = csv_prelim_df.filter( "cast(start_time as bigint) < %s" % (increm_min_hour_micro)) print("Total CSV hits in timeframe of interest: %s" % (csv_hits_in_range.count())) # Set timeframe for pushdown predicate pushdown = "(hour >= %s)" % (increm_min_hour_sec) # Get table contents starting from minimum hour settings. raw_data = gc.create_dynamic_frame.from_catalog( database=args.athenaDatabase, table_name=args.athenaTable, transformation_ctx="raw_data", push_down_predicate=pushdown, ) # Determine what fields we should extract based on table definition. # Deriving the fields from the table schema is preferable to inferring # it from the underlying parquet because the two may differ. tmp_schema = get_table_schema(args.athenaDatabase, args.athenaTable, glue_client=glue_client) # Verify we only have one matching column on which to explode. if not [col[0] for col in tmp_schema].count(args.policyType) == 1: raise Exception( "Wrong number of matching %s columns found in schema: %s" % (args.policyType, tmp_schema)) # Sub out the policies/parent_policies field for an explode expression. # Lightly modify other fields to promote clean and consistent writes to CSV file. schema = [] for col in tmp_schema: if col[0] == args.policyType: schema.append("explode(%s) as policy" % (col[0])) elif "map" in col[1].lower() or "struct" in col[1].lower(): schema.append("json_clean(%s) as %s" % (col[0], col[0])) else: schema.append("cast(%s as string)" % (col[0])) # Get targeted policies/parent_policies. # Should be passed to SparkSQL as quoted strings. pol_arr = ", ".join([ "'%s'" % (pol) for pol in args.policyStrings.split(args.delimiter) if pol ]) # Filter table contents according to policy hits. # A non-null policies field implies a non-null parent_policies field # (and vice versa), so it's OK to filter on just one. new_hits = (raw_data.toDF().filter("policies is not NULL").selectExpr( *schema).filter("policy in (%s)" % (pol_arr))) print("New policy triggers found since last job: %s" % (new_hits.count())) # Combine newly collected policy hits with dataframe of previous CSV contents. write_df = new_hits.union(csv_hits_in_range).orderBy( "start_time").coalesce(1) uniq = hash_key(args.salt, args.ordinal, args.subscriber, args.receiver) s3_loc = "s3://%s" % (os.path.join(args.outputBucket, args.outputDir, uniq)) if args.verbose: print("S3 Results Location: %s" % s3_loc) write_df.write.option("quoteAll", True).csv(s3_loc, header=True) # Rename output file, if requested. if args.outputFilename: rename_resp = rename_file( args.outputBucket, args.outputFilename, prefix=args.outputDir, dont_preserve_dir=args.dontPreserveOutputDir, keep_orig=args.keepOrigOnRename, ) if rename_resp and args.verbose: print("Renamed file to %s/%s." % (args.outputDir, args.outputFilename)) job.commit()
# compress string stream using gzip with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file: gz_file.write(bytes(csv_buffer.getvalue(), 'utf-8')) # write stream to S3 client.put_object(Bucket=bucket, Key=key, Body=gz_buffer.getvalue()) job_args = getResolvedOptions( sys.argv, ['JOB_NAME', 'bucket', 'input-key', 'output-key', 'output-filename']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(job_args['JOB_NAME'], job_args) client = boto3.client('s3') data = spark.read.format('csv').options( header='true').load(f"s3://{job_args['bucket']}/{job_args['input_key']}") # order columns for export ordered_data = data \ .select(data.ticker, data.market) now = datetime.now() date_time = now.strftime("%Y-%m-%d_%H:%M:%S") destination = f"{job_args['output_key']}{date_time}" # create destination folder
from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.dynamicframe import DynamicFrame from awsglue.transforms import * from awsglue.job import Job from pyspark.sql import functions as F from pyspark.sql.functions import udf from datetime import datetime # Create a Glue context glueContext = GlueContext(SparkContext.getOrCreate()) # Create a Glue Job job = Job(glueContext) job.init("infrav-demo") # Create a dynamic frame from the catalog infrav_DyF = glueContext.create_dynamic_frame.from_catalog(database = "infrav", table_name = "vcenter") infrav_DyF.printSchema() # Apply schema applymapping1 = ApplyMapping.apply(frame = infrav_DyF, mappings = [("fields.usage_average", "double", "usage_average", "double"), ("name", "string", "name", "string"), ("tags.clustername", "string", "clustername", "string"), ("tags.cpu", "string", "cpu", "string"), ("tags.dcname", "string", "dcname", "string"), ("tags.esxhostname", "string", "esxhostname", "string"),("tags.host", "string", "host", "string"), ("tags.moid", "string", "moid", "string"), ("tags.source", "string", "source", "string"), ("tags.vcenter", "string", "vcenter", "string"), ("timestamp", "int", "timestamp", "int"), ("year", "string", "year", "string"), ("month", "string", "month", "string"), ("day", "string", "day", "string"), ("hour", "string", "hour", "string")]) #Print new Schema applymapping1.printSchema() # Convert DyF to DF df=applymapping1.toDF() # Create a function that returns the desired string from a timestamp def format_timestamp(ts):
import sys from awsglue.context import GlueContext from awsglue.job import Job from awsglue.transforms import ApplyMapping, DropNullFields, ResolveChoice from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ["JOB_NAME"]) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args["JOB_NAME"], args) ## @type: DataSource ## @args: [database = "dotz_challenge_raw", table_name = "comp_boss_csv", transformation_ctx = "datasource0"] ## @return: datasource0 ## @inputs: [] datasource0 = glueContext.create_dynamic_frame.from_catalog( database="dotz_challenge_raw", table_name="comp_boss_csv", transformation_ctx="datasource0") ## @type: ApplyMapping ## @args: [mapping = [("component_id", "string", "component_id", "string"), ("component_type_id", "string", "component_type_id", "string"), ("type", "string", "type", "string"), ("connection_type_id", "string", "connection_type_id", "string"), ("outside_shape", "string", "outside_shape", "string"), ("base_type", "string", "base_type", "string"), ("height_over_tube", "double", "height_over_tube", "double"), ("bolt_pattern_long", "string", "bolt_pattern_long", "string"), ("bolt_pattern_wide", "string", "bolt_pattern_wide", "string"), ("groove", "string", "groove", "string"), ("base_diameter", "string", "base_diameter", "string"), ("shoulder_diameter", "string", "shoulder_diameter", "string"), ("unique_feature", "string", "unique_feature", "string"), ("orientation", "string", "orientation", "string"), ("weight", "string", "weight", "string")], transformation_ctx = "applymapping1"] ## @return: applymapping1 ## @inputs: [frame = datasource0] applymapping1 = ApplyMapping.apply( frame=datasource0, mappings=[
fact_table_database = 'highereducation-dw-edudirectdb-staging-dev' fact_table_v1 = '' fact_table_name = args['DATABASE_STAG'] database = args[ 'DATABASE_PARQUET'] # possible values = ['highereducation-dw-edudirectdb-parquet-current', 'highereducation-dw-edudirectdb-parquet'] if args['TYPE'] == 'current_day': fact_table_v1 = 'lead_fact_table_' + ENVIRONMENT + '_current_v1' fact_table_name = 'lead_fact_table_' + ENVIRONMENT + '_current' else: fact_table_v1 = 'lead_fact_table_' + ENVIRONMENT + '_v1' fact_table_name = 'lead_fact_table_' + ENVIRONMENT spark = glueContext.spark_session job = Job(glueContext) job.init(JOB_NAME, args) # Required tables lead = dfu.get_dyf_frame(database=database, tbl='cddirect_production_lead').toDF() school_provider_program = dfu.get_dyf_frame( database=database, tbl='cddirect_production_school_provider_program').toDF() school_program = dfu.get_dyf_frame( database=database, tbl='cddirect_production_school_program').toDF() widget_degree = dfu.get_dyf_frame( database=database, tbl='cddirect_production_widget_degree').toDF() widget_subject = dfu.get_dyf_frame( database=database, tbl='cddirect_production_widget_subject').toDF() widget_category = dfu.get_dyf_frame( database=database, tbl='cddirect_production_widget_category').toDF()
from pyspark.sql import SparkSession from pyspark.sql.types import NullType, StructType, StructField from pyspark.sql.types import StringType, DateType, LongType, ArrayType from pyspark.sql.functions import * import itertools import boto3 import time args = getResolvedOptions(sys.argv, ['JOB_NAME', 'bucketName' ,'workspace', 'jdbc_url', 'username', 'pswd']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) #ccn-alerts-104328-{workspace}-sfmc-migration workspace = args['workspace'] bucketName = args['bucketName'] jdbc_url = args['jdbc_url'] username = args['username'] pswd = args['pswd'] print('args passed are: {}, {}, {}, {}'.format(workspace, bucketName, jdbc_url, username) ) s3Location = 's3://{}'.format(bucketName) glue_database = '{}_sfmc_migration'.format(workspace) outputLocation = ''+s3Location+'/athena-results-ccn-data/' print("workspace: ", workspace, ", s3Location: ", s3Location, ", glue_database: ", glue_database, ", outputLocation: ", outputLocation)
def create_job(glueContext, args): job = Job(glueContext) job.init(args["JOB_NAME"], args) return job