import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
import pyspark.sql.functions as F
from pyspark.sql import Row, Window, SparkSession
from pyspark.sql.types import *
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
spark._jsc.hadoopConfiguration().set("parquet.enable.summary-metadata", "false")

## Read in the raw Parquet data from S3 with an explicit schema
schema = StructType() \
    .add('source', StringType()) \
    .add('type', StringType()) \
    .add('data', StringType()) \
    .add('ts', StringType())

src = spark.read.load('s3://demoanalyticsapp-output/*/*/*/*/',
                      format='parquet',
                      schema=schema)

## Build out some new columns with data extracted from the JSON string
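# Not part of the original snippet: a minimal sketch of the "new columns" step,
# assuming the `data` column holds a JSON string with hypothetical fields
# `device_id` and `value`.
extracted = src \
    .withColumn('device_id', F.get_json_object(F.col('data'), '$.device_id')) \
    .withColumn('value', F.get_json_object(F.col('data'), '$.value').cast('double')) \
    .withColumn('event_ts', F.to_timestamp(F.col('ts')))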
import sys
import time
import re
from decimal import *

import boto3
import pg8000
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

import extract_rs_query_logs_functions as functions

## @params: [TempDir, JOB_NAME, REGION, CLUSTER_ENDPOINT]
args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME', 'REGION', 'CLUSTER_ENDPOINT'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

job_configs = {}
job_configs.update(args)
clusterId = re.search(r'jdbc:redshift://(.+?)\..*', args['CLUSTER_ENDPOINT']).group(1)
job_configs.update(functions.getJobConfigurations(clusterId, job_configs))
job_configs['CLUSTER_ID'] = clusterId
tempDir = args['TempDir']
s3Prefix = job_configs['s3_prefix']
credentials = boto3.Session().get_credentials()
job_configs['aws_access_key_id'] = credentials.access_key
job_configs['aws_secret_access_key'] = credentials.secret_key
job_configs['aws_session_token'] = credentials.token
job_configs.update(args)
job_configs['jdbcURL'] = "{}?user={}&password={}".format(args['CLUSTER_ENDPOINT'],
                                                         job_configs['user'],
                                                         job_configs['password'])
job_configs['region_name'] = boto3.session.Session().region_name
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.types import Row, StringType, IntegerType, ArrayType, StructType, DoubleType, BooleanType, DateType
from pyspark.sql.functions import input_file_name, concat, col
from pyspark.sql.functions import first, last
from pyspark.sql.functions import udf, struct, explode

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

## Get the argument list
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'project', 'workflow_type', 'output_bucket'])

## The GDC endpoint for files and the NCI endpoint to query for the S3 URL
files_endpt = 'https://api.gdc.cancer.gov/files'
indexd_endpt = 'https://nci-crdc.datacommons.io/index/index/'
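# Not part of the original snippet: a minimal sketch of how these endpoints
# might be queried, assuming the GDC files API accepts a JSON "filters"
# expression and that indexd exposes records under <indexd_endpt><file_id>.
# The filter field, the requested fields, and the use of `requests` are
# illustrative assumptions.
import json
import requests

filters = {"op": "=", "content": {"field": "cases.project.project_id", "value": args['project']}}
params = {"filters": json.dumps(filters), "fields": "file_id,file_name", "format": "JSON", "size": "100"}
files_response = requests.get(files_endpt, params=params).json()

for hit in files_response.get("data", {}).get("hits", []):
    # Look up the indexd record for each file to find its S3 URL.
    record = requests.get(indexd_endpt + hit["file_id"]).json()
    s3_urls = [u for u in record.get("urls", []) if u.startswith("s3://")]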
import sys

from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

##### FROM FILES
tedx_dataset_path = "s3://unibg-tedx-data-bucket/tedx_dataset.csv"

###### READ PARAMETERS
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

##### START JOB CONTEXT AND JOB
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session  # the Spark session is obtained from Glue
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

#### READ INPUT FILES TO CREATE AN INPUT DATASET
# For a JSON source you would use .json instead of .csv; the quote/escape
# options make Spark treat quoted text (including embedded delimiters) as a
# single value.
tedx_dataset = spark.read \
    .option("header", "true") \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .csv(tedx_dataset_path)

tedx_dataset.printSchema()

#### FILTER ITEMS WITH NULL POSTING KEY
count_items = tedx_dataset.count()
# Count the rows whose posting key (idx) is not null
count_items_null = tedx_dataset.filter("idx is not null").count()
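# Illustrative continuation (not in the original snippet): report both counts
# and keep only the rows whose posting key is not null.
print("Number of items from RAW DATA {}".format(count_items))
print("Number of items from RAW DATA with NOT NULL KEY {}".format(count_items_null))
tedx_dataset = tedx_dataset.filter("idx is not null")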
import sys

import boto3
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

args = getResolvedOptions(sys.argv, ['BucketName'])
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
athena_client = boto3.client(service_name='athena', region_name='us-west-2')
bucket_name = args['BucketName']
BucketName = args['BucketName']
database = 'c360view_stage'

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init('n1_c360_dispositions')


def run_query(client, query):
    response = client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': database},
        ResultConfiguration={
            'OutputLocation': 's3://{}/athenaoutput/'.format(bucket_name)
        },
    )
    return response


def validate_query(client, query_id):
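    # The original body is truncated here; the lines below are a hedged sketch
    # that checks the query status once via the Athena get_query_execution
    # call. A production version would poll until the state is terminal
    # (SUCCEEDED, FAILED, or CANCELLED).
    response = client.get_query_execution(QueryExecutionId=query_id)
    return response['QueryExecution']['Status']['State']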
import sys

from awsglue.utils import getResolvedOptions
from awsglue.transforms import ApplyMapping
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame

DYNAMODB_INPUT_TABLE_NAME = 'aws-glue-local-test-table'
S3_OUTPUT_BUCKET_NAME = 'aws-glue-local-test-bucket'

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
JOB_NAME = args['JOB_NAME']

sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)
job.init(JOB_NAME, args)

# Read the DynamoDB table into a DynamicFrame
datasource = glueContext.create_dynamic_frame.from_options(
    connection_type='dynamodb',
    connection_options={'dynamodb.input.tableName': DYNAMODB_INPUT_TABLE_NAME})

# Rename/retype the DynamoDB attributes
applymapping = ApplyMapping.apply(
    frame=datasource,
    mappings=[('Id', 'string', 'Id', 'string'),
              ('Column1', 'string', 'Column1', 'string'),
              ('Column2', 'string', 'Column2', 'string'),
              ('Column3', 'string', 'Column3', 'string')])

glueContext.write_dynamic_frame.from_options(
    frame=datasource,
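    # The call is truncated in the original; the remaining arguments below are
    # a hedged sketch (the S3 prefix and output format are assumptions, and the
    # original passes `datasource` rather than the `applymapping` frame).
    connection_type='s3',
    connection_options={'path': 's3://{}/output/'.format(S3_OUTPUT_BUCKET_NAME)},
    format='json')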
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Script generated for node Amazon S3
AmazonS3_node1641389272287 = glueContext.create_dynamic_frame.from_options(
    format_options={"quoteChar": '"', "withHeader": True, "separator": ","},
    connection_type="s3",
    format="csv",
    connection_options={
        "paths": [
            "s3://db-mirgration-2112/migration-0501/NEXTDB/MARKS/LOAD00000001.csv"
        ],
        "recurse": True,
    },
)
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

job_name = args["hca_job_name"]
raw_location = args['raw_location']
datamart_location = args['datamart_location']
delta_location = args['delta_location']
archive_location = args['archive_location']
aws_region = args['aws_region']

#
# initialize glue job with glue security config supplied in params
# to encrypt cloudwatch logs
#
job = Job(glueContext)
job.init(job_name, args)

#
# read the catalog
#
catalog: Dict[str, Any] = common.utils.get_catalog()

#
# get the jdbc connection from the glue catalog
#
client = boto3.client('glue', region_name=aws_region)
glue_job = client.get_job(JobName=args['JOB_NAME'])['Job']
db_name = next(iter(glue_job['Connections']['Connections'])) if 'Connections' in glue_job else None
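# Not in the original excerpt: a hedged sketch of resolving the JDBC URL from
# the connection name obtained above, using the Glue get_connection API; the
# JDBC_CONNECTION_URL property name follows the standard Glue JDBC connection
# layout.
if db_name:
    connection = client.get_connection(Name=db_name)['Connection']
    jdbc_url = connection['ConnectionProperties'].get('JDBC_CONNECTION_URL')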
import sys
import logging

from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

logging.getLogger().setLevel(logging.INFO)

args = getResolvedOptions(sys.argv, ['JOB_NAME', 'json_input'])
json_input = args['json_input']
job_name = args['JOB_NAME']

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(job_name, args)


def get_parquet_output(json_input):
    # Map a delivery-bucket object key to the corresponding convert-bucket
    # Parquet path, carrying over the landing-time partitions.
    path_parts = json_input.split('/')
    convert_bucket = path_parts[2].replace('-delivery-', '-convert-')
    stream_name = path_parts[3]
    batch_id = '-'.join(path_parts[8][:-3].split('-')[10:])
    partitions = 'landed_year={}/landed_month={}/landed_day={}/landed_hour={}/landed_batch_id={}'.format(
        path_parts[4], path_parts[5], path_parts[6], path_parts[7], batch_id)
    convert_path = '/'.join(['s3:', '', convert_bucket, stream_name, partitions])
    return convert_path
"""
to the export Glue job. The Glue job runs this script to export data from DDB to S3.
"""
import sys
import boto3
import re
import json

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from datetime import datetime

glueContext = GlueContext(SparkContext.getOrCreate())
job = Job(glueContext)

args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'jobId', 'exportType', 'transactionTime', 'since', 'outputFormat',
    'ddbTableName', 'workerType', 'numberWorkers', 's3OutputBucket'
])

# type, groupId, and tenantId are optional parameters
type = None
if ('--{}'.format('type') in sys.argv):
    type = getResolvedOptions(sys.argv, ['type'])['type']

groupId = None
if ('--{}'.format('groupId') in sys.argv):
    groupId = getResolvedOptions(sys.argv, ['groupId'])['groupId']

tenantId = None
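# The snippet is truncated after `tenantId = None`; by the same pattern used
# for `type` and `groupId`, the tenantId branch presumably continues as:
if ('--{}'.format('tenantId') in sys.argv):
    tenantId = getResolvedOptions(sys.argv, ['tenantId'])['tenantId']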
output_database = args['output_database']
tmp_table = args['tmp_table']
output_table = args['output_table']
output_path = args['output_path']

# Retrieve partition key information from tmp_table
partition_keys = []
glue = boto3.client('glue')
res = glue.get_table(DatabaseName=output_database, Name=tmp_table)
for partition_key in res['Table']['PartitionKeys']:
    partition_keys.append(partition_key['Name'])

# getOrCreate allows this to run as a job or in a notebook.
glue_context = GlueContext(SparkContext.getOrCreate())
spark = glue_context.spark_session
job = Job(glue_context)
job.init(args['JOB_NAME'], args)

# Create DynamicFrame from Data Catalog
dyf = glue_context.create_dynamic_frame.from_catalog(database=output_database,
                                                     table_name=tmp_table,
                                                     transformation_ctx='dyf')

# Resolve choice type with make_struct
dyf = ResolveChoice.apply(frame=dyf, choice='make_struct', transformation_ctx='resolvechoice')

# Drop null fields
dyf = DropNullFields.apply(frame=dyf, transformation_ctx='dropnullfields')
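# Not part of the original excerpt: a hedged sketch of the write-out step that
# the variables above (output_path, output_table, partition_keys) suggest,
# using the Glue getSink API in the same way as other jobs in this collection.
sink = glue_context.getSink(connection_type='s3',
                            path=output_path,
                            enableUpdateCatalog=True,
                            updateBehavior='UPDATE_IN_DATABASE',
                            partitionKeys=partition_keys)
sink.setFormat('glueparquet')
sink.setCatalogInfo(catalogDatabase=output_database, catalogTableName=output_table)
sink.writeFrame(dyf)
job.commit()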
"glue_db_name", "glue_issues_table_name", "glue_data_hierarchy_table_name", "glue_output_bucket" ] args = getResolvedOptions(sys.argv, args_list) # NOSONAR: python:S4823 JOB_TYPE = args["job_type"] DDB_ISSUES_TABLE_NAME = args["ddb_issues_table_name"] DDB_DATA_HIERARCHY_TABLE_NAME = args["ddb_data_hierarchy_table_name"] GLUE_ISSUES_TABLE_NAME = args["glue_issues_table_name"] GLUE_DATA_HIERARCHY_TABLE_NAME = args["glue_data_hierarchy_table_name"] GLUE_DB_NAME = args["glue_db_name"] GLUE_OUTPUT_BUCKET = args["glue_output_bucket"] # Sets Glue context and logging spark_context = SparkContext() glue_context = GlueContext(spark_context) job = Job(glue_context) class JobInputException(Exception): """Raised when input to the job is not valid""" pass def log_message(msg): msg_arr = [f'****** LOG_MSG {datetime.now()} ******'] if not isinstance(msg, list): msg = [msg] # Add some preceding whitespace to each line for the log message. # This makes it easier to read in the Glue logs on Cloudwatch
def getResolvedOptions(args, options):
    parser = GlueArgumentParser()

    if Job.continuation_options()[0][2:] in options:
        raise RuntimeError("Using reserved arguments " + Job.continuation_options()[0][2:])
    if Job.job_bookmark_options()[0][2:] in options:
        raise RuntimeError("Using reserved arguments " + Job.job_bookmark_options()[0][2:])

    parser.add_argument(Job.job_bookmark_options()[0],
                        choices=Job.job_bookmark_options()[1:],
                        required=False)
    parser.add_argument(Job.continuation_options()[0],
                        choices=Job.continuation_options()[1:],
                        required=False)

    for option in Job.job_bookmark_range_options():
        if option[2:] in options:
            raise RuntimeError("Using reserved arguments " + option)
        parser.add_argument(option, required=False)

    for option in Job.id_params()[1:]:
        if option in options:
            raise RuntimeError("Using reserved arguments " + option)
        # TODO: Make these mandatory. For now, for backward compatibility, they are
        # optional; JOB_NAME is also not included in the reserved parameters list.
        parser.add_argument(option, required=False)

    if Job.encryption_type_options()[0] in options:
        raise RuntimeError("Using reserved arguments " + Job.encryption_type_options()[0])
    parser.add_argument(Job.encryption_type_options()[0],
                        choices=Job.encryption_type_options()[1:])

    # TODO: Remove special handling for 'RedshiftTempDir' and 'TempDir' after
    # TempDir is made mandatory for all jobs.
    # Remove 'RedshiftTempDir' and 'TempDir' from the list of user-supplied options.
    options = [opt for opt in options if opt not in {'RedshiftTempDir', 'TempDir'}]
    parser.add_argument('--RedshiftTempDir', required=False)
    parser.add_argument('--TempDir', required=False)

    for option in options:
        parser.add_argument('--' + option, required=True)

    parsed, extra = parser.parse_known_args(args[1:])
    parsed_dict = vars(parsed)

    # TODO: remove special handling after TempDir is made mandatory for all jobs
    if 'TempDir' in parsed_dict and parsed_dict['TempDir'] is not None:
        # TODO: Remove special handling for 'RedshiftTempDir' and 'TempDir'
        parsed_dict['RedshiftTempDir'] = parsed_dict['TempDir']
    elif 'RedshiftTempDir' in parsed and parsed_dict['RedshiftTempDir'] is not None:
        parsed_dict['TempDir'] = parsed_dict['RedshiftTempDir']

    # Special handling for continuations. If --job-bookmark-option is set we
    # use that, regardless of whether --continuation-option is set. If
    # --job-bookmark-option is not set but --continuation-option is set, fall
    # back to that.
    bookmark_value = parsed_dict.pop("continuation_option", None)
    if 'job_bookmark_option' not in parsed_dict or parsed_dict['job_bookmark_option'] is None:
        if bookmark_value is None:
            bookmark_value = Job.job_bookmark_options()[3]
        else:
            # translate old-style continuation options into job-bookmark options
            option_index = Job.continuation_options().index(bookmark_value)
            bookmark_value = Job.job_bookmark_options()[option_index]
        parsed_dict['job_bookmark_option'] = bookmark_value

    absent_range_option = []
    for option in Job.job_bookmark_range_options():
        key = option[2:].replace('-', '_')
        if key not in parsed_dict or parsed_dict[key] is None:
            absent_range_option.append(option)

    if parsed_dict['job_bookmark_option'] == 'job-bookmark-pause':
        if len(absent_range_option) == 1:
            raise RuntimeError("Missing option or value for " + absent_range_option[0])
    else:
        if len(absent_range_option) == 0:
            raise RuntimeError("Invalid option(s) " + ' '.join(Job.job_bookmark_range_options()))

    _global_args.update(parsed_dict)

    return parsed_dict
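# Usage sketch (not part of the library source): a job submitted with
# "--JOB_NAME myjob --source_path s3://bucket/prefix" would be resolved as
# below; the parameter name source_path is hypothetical.
import sys

args = getResolvedOptions(sys.argv, ['JOB_NAME', 'source_path'])
print(args['JOB_NAME'], args['source_path'])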
def main():
    ## @params: [JOB_NAME, raw_db_name, clean_db_name, source_entity_name, target_entity_name, datetime_column, date_column, partition_column, output_bucket_name]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'datetime_column', 'date_column',
        'partition_column', 'output_bucket_name'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    datetime_column = args['datetime_column']
    date_column = args['date_column']
    output_bucket_name = args['output_bucket_name']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name
    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c) if dict(input_data.dtypes)[c] == 'double' else col(c)
        for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn(
        'etl_processing_datetime',
        unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    ## @type: DataSink
    ## @args: [connection_type = "s3", connection_options = {"path": "s3://<output_bucket_name>/clean/<entity_name>", "enableUpdateCatalog": "True", "updateBehavior": "UPDATE_IN_DATABASE", "partitionKeys" : "[<partition_key>]"}, format = "glueparquet"]
    ## @return: sink
    ## @inputs: [frame = enriched_data]
    sink = glue_context.getSink(connection_type="s3",
                                path="s3://" + output_bucket_name + "/" + clean_table_name,
                                enableUpdateCatalog=True,
                                updateBehavior="UPDATE_IN_DATABASE",
                                partitionKeys=[partition_column])
    sink.setFormat("glueparquet")
    sink.setCatalogInfo(catalogDatabase=clean_db_name, catalogTableName=clean_table_name)
    sink.writeFrame(DynamicFrame.fromDF(enriched_data, glue_context, 'result'))

    job.commit()
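# Not shown in the excerpt: the usual entry-point guard such a script would end
# with (an assumption, added for completeness).
if __name__ == '__main__':
    main()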
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from boto3 import client
from pyspark.sql.functions import lit

# ENVIRONMENT VARIABLES
# args = getResolvedOptions(sys.argv, ['JOB_NAME', 'DATABASES', 'S3_RAW_PARQUET_PATH'])

# Loading the contexts
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
# job.init(args['JOB_NAME'], args)
job.init("billing_wiseOCS_accounts_csv_to_parquet")

# Where the script will write the files to
# s3_destiny = args['S3_RAW_PARQUET_PATH']

# Sources
# list_databases = args['DATABASES'].split(',')
list_databases = ["billing_wiseocs"]

# Glue object to list the tables within each database
client = client('glue', region_name='us-east-2')

for database in list_databases:
    response = client.get_tables(DatabaseName=database.lower())
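    # The loop body is truncated in the original; the lines below are a hedged
    # sketch of one plausible continuation: read each catalog table and write
    # it out as Parquet under a hypothetical destination prefix.
    for table in response['TableList']:
        dyf = glueContext.create_dynamic_frame.from_catalog(
            database=database, table_name=table['Name'])
        glueContext.write_dynamic_frame.from_options(
            frame=dyf,
            connection_type='s3',
            connection_options={'path': 's3://example-raw-parquet/{}/{}'.format(database, table['Name'])},
            format='parquet')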