def main(): spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate() args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET', 'S3_INPUT_KEY_PREFIX', 'S3_OUTPUT_BUCKET', 'S3_OUTPUT_KEY_PREFIX', 'S3_MODEL_BUCKET', 'S3_MODEL_KEY_PREFIX']) # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter") # Defining the schema corresponding to the input data. The input data does not contain the headers schema = StructType([StructField("label", IntegerType(), True), StructField("title", StringType(), True), StructField("abstract", StringType(), True)]) # Download the data from S3 into two separate Dataframes traindf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'], 'train.csv')), header=False, schema=schema, encoding='UTF-8') validationdf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'], 'test.csv')), header=False, schema=schema, encoding='UTF-8') # Tokenize the abstract column which contains the input text tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract") # Save transformed training data to CSV in S3 by converting to RDD. transformed_traindf = tokenizer.transform(traindf) transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.tokenized_abstract)) lines = transformed_train_rdd.map(csv_line) lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train')) # Similar data processing for validation dataset. transformed_validation = tokenizer.transform(validationdf) transformed_validation_rdd = transformed_validation.rdd.map(lambda x: (x.label, x.tokenized_abstract)) lines = transformed_validation_rdd.map(csv_line) lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'validation')) # Serialize the tokenizer via MLeap and upload to S3 SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation) # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file. import zipfile with zipfile.ZipFile("/tmp/model.zip") as zf: zf.extractall("/tmp/model") # Write back the content as a .tar.gz file import tarfile with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar: tar.add("/tmp/model/bundle.json", arcname='bundle.json') tar.add("/tmp/model/root", arcname='root') s3 = boto3.resource('s3') file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz') s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
quantile = smallSamples.stat.approxQuantile("timestampLong", [0.8], 0.05) splitTimestamp = quantile[0] training = smallSamples.where( F.col("timestampLong") <= splitTimestamp).drop("timestampLong") test = smallSamples.where( F.col("timestampLong") > splitTimestamp).drop("timestampLong") trainingSavePath = file_path + '/trainingSamples' testSavePath = file_path + '/testSamples' training.repartition(1).write.option("header", "true").mode('overwrite') \ .csv(trainingSavePath) test.repartition(1).write.option("header", "true").mode('overwrite') \ .csv(testSavePath) # Get the parameters passed in to the Glue job args = getResolvedOptions(sys.argv, ['JOB_NAME', 'SOURCE_PATH', 'OUTPUT_PATH']) # Get the Spark context and build the Glue context from it glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session # Start the job job = Job(glueContext) job.init(args['JOB_NAME'], args) source_path = args['SOURCE_PATH'] output_path = args['OUTPUT_PATH'] movieResourcesPath = source_path + "movies.csv" ratingsResourcesPath = source_path + "ratings.csv" movieSamples = spark.read.format('csv').option('header', 'true').load(movieResourcesPath)
# By sticking with standard Spark, we can avoid having to deal with Glue dependencies locally # If developing outside of the Dev Container, don't forget to set the environment variable: ENVIRONMENT=local ENVIRONMENT = os.getenv(key="ENVIRONMENT", default="aws") if ENVIRONMENT not in ["local", "aws"]: raise ValueError("""ENVIRONMENT must be "local" or "aws" only""") elif ENVIRONMENT == "aws": try: from awsglue.utils import getResolvedOptions # Provide these arguments in your AWS Glue Job/JobRun definition job_parameters = getResolvedOptions(sys.argv, [ "temperatures_country_input_path", "temperatures_country_output_path", "temperatures_global_input_path", "temperatures_global_output_path", "co2_input_path", "co2_output_path" ]) except ModuleNotFoundError: raise ModuleNotFoundError(""" No module named 'awsglue' ******** Are you developing outside of the Dev Container? If so, don't forget to set the environment variable: ENVIRONMENT=local ******** """) # EDIT HERE - set the output paths for your local Spark jobs as desired elif ENVIRONMENT == "local":
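    # A minimal sketch of what the "local" branch body might look like, continuing the elif above.
    # The file paths below are hypothetical placeholders, not taken from the original job —
    # point them at wherever your local input/output data lives.
    job_parameters = {
        "temperatures_country_input_path": "data/input/temperatures_country.csv",
        "temperatures_country_output_path": "data/output/temperatures_country",
        "temperatures_global_input_path": "data/input/temperatures_global.csv",
        "temperatures_global_output_path": "data/output/temperatures_global",
        "co2_input_path": "data/input/co2.csv",
        "co2_output_path": "data/output/co2",
    }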
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext, DynamicFrame from awsglue.job import Job from pyspark.sql.functions import to_date ## @params: [TempDir, JOB_NAME] args = getResolvedOptions(sys.argv, ['TempDir','JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) ## @type: DataSource ## @args: [database = "marketdata", table_name = "bbg", transformation_ctx = "datasource0"] ## @return: datasource0 ## @inputs: [] datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "marketdata", table_name = "bbg", transformation_ctx = "datasource0") def NonEmptyTicker(rec): if not rec["ticker"]: return False return True def FixDate(rec): dt = str(rec["settle_dt"]) dt = '/'.join( (dt[4:6], dt[6:], dt[:4]) ) rec["settle_dt_rs"] = dt return rec
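# A hedged sketch (not part of the original script) of how the record-level helpers above
# would typically be wired into Glue's Filter and Map transforms against datasource0.
filtered0 = Filter.apply(frame=datasource0, f=NonEmptyTicker, transformation_ctx="filtered0")
mapped0 = Map.apply(frame=filtered0, f=FixDate, transformation_ctx="mapped0")
# mapped0 now carries settle_dt_rs reformatted as MM/DD/YYYY alongside the original fields.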
import sys from pyspark.context import SparkContext from pyspark.sql.types import DoubleType from pyspark.sql.functions import col, year, month, dayofmonth from awsglue.transforms import * from awsglue.utils import getResolvedOptions from awsglue.context import GlueContext from awsglue.job import Job from awsglue.dynamicframe import DynamicFrame args = getResolvedOptions( sys.argv, ['JOB_NAME', 's3_output_path', 'database_name', 'table_name']) s3_output_path = args['s3_output_path'] database_name = args['database_name'] table_name = args['table_name'] sc = SparkContext.getOrCreate() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) datasource0 = glueContext.create_dynamic_frame\ .from_catalog(database=database_name, table_name=table_name, transformation_ctx="datasource0") applymapping1 = ApplyMapping.apply( frame=datasource0, mappings=[
import sys from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.utils import getResolvedOptions args = getResolvedOptions(sys.argv, ["workload", "bucket"]) key = args.get("workload") bucket = args.get("bucket") output_name = key.replace(".csv", "") path_input = "s3://" + bucket + "/" + key path_output = "s3://" + bucket + "/pyspark_output/" + output_name print("bucket: " + bucket) print("key: " + key) print("output_name: " + output_name) print("path_input: " + path_input) print("path_output: " + path_output) glueContext = GlueContext(SparkContext.getOrCreate()) df = glueContext.create_dynamic_frame_from_options( connection_type="s3", connection_options={"paths": [path_input]}, format="csv", format_options={ "withHeader": True, "separator": "," }, )
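# A hedged sketch of a possible sink step (assumed, not shown in the original snippet):
# write the CSV DynamicFrame back out as Parquet under path_output.
glueContext.write_dynamic_frame.from_options(
    frame=df,
    connection_type="s3",
    connection_options={"path": path_output},
    format="parquet",
)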
import sys from awsglue.transforms import ApplyMapping, ResolveChoice, DropNullFields from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ["JOB_NAME"]) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args["JOB_NAME"], args) ## @type: DataSource ## @args: [database = "dotz_challenge_raw", table_name = "price_quote_csv", transformation_ctx = "datasource0"] ## @return: datasource0 ## @inputs: [] datasource0 = glueContext.create_dynamic_frame.from_catalog( database="dotz_challenge_raw", table_name="price_quote_csv", transformation_ctx="datasource0", ) ## @type: ApplyMapping ## @args: [mapping = [("tube_assembly_id", "string", "tube_assembly_id", "string"), ("supplier", "string", "supplier", "string"), ("quote_date", "string", "quote_date", "string"), ("annual_usage", "long", "annual_usage", "long"), ("min_order_quantity", "long", "min_order_quantity", "long"), ("bracket_pricing", "string", "bracket_pricing", "string"), ("quantity", "long", "quantity", "long"), ("cost", "double", "cost", "double")], transformation_ctx = "applymapping1"] ## @return: applymapping1 ## @inputs: [frame = datasource0] applymapping1 = ApplyMapping.apply( frame=datasource0, mappings=[
import sys import boto3 from awsglue.job import Job from awsglue.transforms import * from awsglue.context import GlueContext from awsglue.utils import getResolvedOptions import pyspark.sql.functions as F from pyspark.sql import Row, Window, SparkSession from pyspark.sql.types import * from pyspark.conf import SparkConf from pyspark.context import SparkContext args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") spark._jsc.hadoopConfiguration().set("parquet.enable.summary-metadata", "false") ## Read in data by pointing to its table name in Glue Data Catalog schema = StructType() \ .add('source', StringType()) \ .add('type', StringType()) \ .add('data', StringType()) \
import sys from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job from features.subscription import spark_job from features.utils import yesterday_dt args = getResolvedOptions(sys.argv, ['JOB_NAME', 'dt', 'input_path', 'output_path']) if args['dt'] == 'yesterday': args['dt'] = yesterday_dt() sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) spark_job.run(spark, args) job.commit()
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job ## @params: [JOB_NAME] args = getResolvedOptions( sys.argv, [ 'JOB_NAME', 'database_name', # array of arguments you want to grab. 'table_name', 'downstream_bucket', 'data_partition' ]) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) print("arguments: ", args) downstreamBucket = 's3://' + args['downstream_bucket'] + '/' + args[ 'data_partition'] + '/' datasource0 = glueContext.create_dynamic_frame.from_catalog( database=args['database_name'],
# flake8: noqa import sys from hashlib import md5 from awsglue.transforms import Map from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'destination_table', 'source_table', 'number_of_workers', 'external_role_arn' ]) FAKE_DOMAIN = 'mig.ef-cms.ustaxcourt.gov' # Options for DynamoDB connections here: # https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-connect.html OUTPUT_OPTIONS = { "dynamodb.output.tableName": args['destination_table'], "dynamodb.output.retry": 35, "dynamodb.throughput.write.percent": 0.1, "dynamodb.sts.roleArn": args['external_role_arn'], "dynamodb.region": "us-east-1" } # Per: https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-connect.html#aws-glue-programming-etl-connect-dynamodb # with g1.x worker type
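# A minimal sketch (assumed, not taken from the original job) of reading the source table and
# writing it back out with OUTPUT_OPTIONS; the read options mirror the write options above.
glueContext = GlueContext(SparkContext.getOrCreate())
records = glueContext.create_dynamic_frame.from_options(
    connection_type="dynamodb",
    connection_options={"dynamodb.input.tableName": args['source_table']})
glueContext.write_dynamic_frame_from_options(
    frame=records,
    connection_type="dynamodb",
    connection_options=OUTPUT_OPTIONS)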
def main(): spark = SparkSession.builder.appName("PySparkTitanic").getOrCreate() args = getResolvedOptions(sys.argv, ['s3_input_data_location', 's3_output_bucket', 's3_output_bucket_prefix', 's3_model_bucket', 's3_model_bucket_prefix']) # This is needed to write RDDs to file which is the only way to write nested Dataframes into CSV. spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter") train = spark.read.csv(args['s3_input_data_location'], header=False) oldColumns = train.schema.names newColumns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'cat'] train = reduce(lambda train, idx: train.withColumnRenamed(oldColumns[idx], newColumns[idx]), xrange(len(oldColumns)), train) # dropping null values train = train.dropna() # Target label catIndexer = StringIndexer(inputCol="cat", outputCol="label") labelIndexModel = catIndexer.fit(train) train = labelIndexModel.transform(train) converter = IndexToString(inputCol="label", outputCol="cat") # Spliting in train and test set. Beware : It sorts the dataset (traindf, validationdf) = train.randomSplit([0.8, 0.2]) # Index labels, adding metadata to the label column. # Fit on whole dataset to include all labels in index. buyingIndexer = StringIndexer(inputCol="buying", outputCol="indexedBuying") maintIndexer = StringIndexer(inputCol="maint", outputCol="indexedMaint") doorsIndexer = StringIndexer(inputCol="doors", outputCol="indexedDoors") personsIndexer = StringIndexer(inputCol="persons", outputCol="indexedPersons") lug_bootIndexer = StringIndexer(inputCol="lug_boot", outputCol="indexedLug_boot") safetyIndexer = StringIndexer(inputCol="safety", outputCol="indexedSafety") # One Hot Encoder on indexed features buyingEncoder = OneHotEncoder(inputCol="indexedBuying", outputCol="buyingVec") maintEncoder = OneHotEncoder(inputCol="indexedMaint", outputCol="maintVec") doorsEncoder = OneHotEncoder(inputCol="indexedDoors", outputCol="doorsVec") personsEncoder = OneHotEncoder(inputCol="indexedPersons", outputCol="personsVec") lug_bootEncoder = OneHotEncoder(inputCol="indexedLug_boot", outputCol="lug_bootVec") safetyEncoder = OneHotEncoder(inputCol="indexedSafety", outputCol="safetyVec") # Create the vector structured data (label,features(vector)) assembler = VectorAssembler(inputCols=["buyingVec", "maintVec", "doorsVec", "personsVec", "lug_bootVec", "safetyVec"], outputCol="features") # Chain featurizers in a Pipeline pipeline = Pipeline(stages=[buyingIndexer, maintIndexer, doorsIndexer, personsIndexer, lug_bootIndexer, safetyIndexer, buyingEncoder, maintEncoder, doorsEncoder, personsEncoder, lug_bootEncoder, safetyEncoder, assembler]) # Train model. This also runs the indexers. model = pipeline.fit(traindf) # Delete previous data from output s3 = boto3.resource('s3') bucket = s3.Bucket(args['s3_output_bucket']) bucket.objects.filter(Prefix=args['s3_output_bucket_prefix']).delete() # Save transformed training data to CSV in S3 by converting to RDD. transformed_traindf = model.transform(traindf) transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.features)) lines = transformed_train_rdd.map(toCSVLine) lines.saveAsTextFile('s3a://' + args['s3_output_bucket'] + '/' +args['s3_output_bucket_prefix'] + '/' + 'train') # Similar data processing for validation dataset. 
predictions = model.transform(validationdf) transformed_train_rdd = predictions.rdd.map(lambda x: (x.label, x.features)) lines = transformed_train_rdd.map(toCSVLine) lines.saveAsTextFile('s3a://' + args['s3_output_bucket'] + '/' +args['s3_output_bucket_prefix'] + '/' + 'validation') # Serialize and store via MLeap SimpleSparkSerializer().serializeToBundle(model, "jar:file:/tmp/model.zip", predictions) # Unzipping as SageMaker expects a .tar.gz file but MLeap produces a .zip file. import zipfile with zipfile.ZipFile("/tmp/model.zip") as zf: zf.extractall("/tmp/model") # Writing back the content as a .tar.gz file import tarfile with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar: tar.add("/tmp/model/bundle.json", arcname='bundle.json') tar.add("/tmp/model/root", arcname='root') s3 = boto3.resource('s3') file_name = args['s3_model_bucket_prefix'] + '/' + 'model.tar.gz' s3.Bucket(args['s3_model_bucket']).upload_file('/tmp/model.tar.gz', file_name) os.remove('/tmp/model.zip') os.remove('/tmp/model.tar.gz') shutil.rmtree('/tmp/model') # Save postprocessor SimpleSparkSerializer().serializeToBundle(converter, "jar:file:/tmp/postprocess.zip", predictions) with zipfile.ZipFile("/tmp/postprocess.zip") as zf: zf.extractall("/tmp/postprocess") # Writing back the content as a .tar.gz file import tarfile with tarfile.open("/tmp/postprocess.tar.gz", "w:gz") as tar: tar.add("/tmp/postprocess/bundle.json", arcname='bundle.json') tar.add("/tmp/postprocess/root", arcname='root') file_name = args['s3_model_bucket_prefix'] + '/' + 'postprocess.tar.gz' s3.Bucket(args['s3_model_bucket']).upload_file('/tmp/postprocess.tar.gz', file_name) os.remove('/tmp/postprocess.zip') os.remove('/tmp/postprocess.tar.gz') shutil.rmtree('/tmp/postprocess')
def main(): spark = SparkSession.builder.appName("PySparkAbalone").getOrCreate() args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET', 'S3_INPUT_KEY_PREFIX', 'S3_OUTPUT_BUCKET', 'S3_OUTPUT_KEY_PREFIX', 'S3_MODEL_BUCKET', 'S3_MODEL_KEY_PREFIX']) # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter") # Defining the schema corresponding to the input data. The input data does not contain the headers schema = StructType([StructField("sex", StringType(), True), StructField("length", DoubleType(), True), StructField("diameter", DoubleType(), True), StructField("height", DoubleType(), True), StructField("whole_weight", DoubleType(), True), StructField("shucked_weight", DoubleType(), True), StructField("viscera_weight", DoubleType(), True), StructField("shell_weight", DoubleType(), True), StructField("rings", DoubleType(), True)]) # Downloading the data from S3 into a Dataframe total_df = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'], 'abalone.csv')), header=False, schema=schema) #StringIndexer on the sex column which has categorical value sex_indexer = StringIndexer(inputCol="sex", outputCol="indexed_sex") #one-hot-encoding is being performed on the string-indexed sex column (indexed_sex) sex_encoder = OneHotEncoder(inputCol="indexed_sex", outputCol="sex_vec") #vector-assembler will bring all the features to a 1D vector for us to save easily into CSV format assembler = VectorAssembler(inputCols=["sex_vec", "length", "diameter", "height", "whole_weight", "shucked_weight", "viscera_weight", "shell_weight"], outputCol="features") # The pipeline comprises of the steps added above pipeline = Pipeline(stages=[sex_indexer, sex_encoder, assembler]) # This step trains the feature transformers. 
# We need to serialize this model with MLeap and save it to S3 model = pipeline.fit(total_df) # This step transforms the dataset with information obtained from the previous fit transformed_total_df = model.transform(total_df) # Split the overall dataset into 80-20 training and validation (train_df, validation_df) = transformed_total_df.randomSplit([0.8, 0.2]) # Convert the train dataframe to RDD to save in CSV format and upload to S3 train_rdd = train_df.rdd.map(lambda x: (x.rings, x.features)) train_lines = train_rdd.map(csv_line) train_lines.saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train')) # Convert the validation dataframe to RDD to save in CSV format and upload to S3 validation_rdd = validation_df.rdd.map(lambda x: (x.rings, x.features)) validation_lines = validation_rdd.map(csv_line) validation_lines.saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'validation')) # Serialize and store the model via MLeap SimpleSparkSerializer().serializeToBundle(model, "jar:file:/tmp/model.zip", validation_df) # Unzip the model as SageMaker expects a .tar.gz file but MLeap produces a .zip file import zipfile with zipfile.ZipFile("/tmp/model.zip") as zf: zf.extractall("/tmp/model") # Write back the content as a .tar.gz file import tarfile with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar: tar.add("/tmp/model/bundle.json", arcname='bundle.json') tar.add("/tmp/model/root", arcname='root') # Upload the model in tar.gz format to S3 so that it can be used with SageMaker for inference later s3 = boto3.resource('s3') file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz') s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
import sys import boto3 from pyspark.sql import SparkSession from awsglue.utils import getResolvedOptions # Arguments args = getResolvedOptions(sys.argv, [ 'parentFile', 'childFile', 'joinCol', 's3Bucket', 'inputLoc', 'stageLoc', 'outputLoc', 'outputFileName', 'kmsKey' ]) parentFile = args['parentFile'] childFile = args['childFile'] joinCol = args['joinCol'] s3Bucket = args['s3Bucket'] inputLoc = args['inputLoc'] stageLoc = args['stageLoc'] outputLoc = args['outputLoc'] outputFileName = args['outputFileName'] kmsKey = args['kmsKey'] # Create sparksession variable spark = SparkSession.builder.getOrCreate() # Create emp and dept dataframe and join them using deptid parentDF = spark.read.format("csv").option("header", "true").option( "mode", "DROPMALFORMED").load(parentFile) childDF = spark.read.format("csv").option("header", "true").option( "mode", "DROPMALFORMED").load(childFile) joinedDF = parentDF.join(childDF, (joinCol))
# or implied. See the License for the specific language governing permissions# # and limitations under the License. # ############################################################################### import gzip, re import pandas as pd import csv import sys from awsglue.utils import getResolvedOptions import boto3 s3 = boto3.client('s3') s3_resource = boto3.resource('s3') args = getResolvedOptions(sys.argv, [ 'input_bucket', 'clinvar_input_key', 'clinvar_annotated_input_key', 'output_bucket', 'output_key' ]) def download_to_local(filename): new_filename = filename.split('/')[-1] s3_resource.meta.client.download_file(args['input_bucket'], filename, '/tmp/' + new_filename) return new_filename def list_to_dict(l): """Convert list to dict.""" return {k: v for k, v in (x.split("=") for x in l)}
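# Illustrative use of list_to_dict on a semicolon-delimited "key=value" field such as a
# VCF INFO column (the values below are made up for the example):
example_info = "ALLELEID=12345;CLNSIG=Benign;GENEINFO=BRCA1:672"
print(list_to_dict(example_info.split(";")))
# {'ALLELEID': '12345', 'CLNSIG': 'Benign', 'GENEINFO': 'BRCA1:672'}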
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job from pyspark.sql.functions import * from awsglue.dynamicframe import DynamicFrame ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) ## @type: DataSource ## @args: [database = "flat_employee_db", table_name = "flat_data1_csv", transformation_ctx = "datasource0"] ## @return: datasource0 ## @inputs: [] empinfoDF = glueContext.create_dynamic_frame.from_catalog(database = "flat_employee_db", table_name = "flat_data1_csv").toDF() empDF = empinfoDF.withColumnRenamed('employee_id','id').select('id','first_name','last_name','salary','department_id') deptDF = empinfoDF.withColumnRenamed('department_id','id').withColumnRenamed('department_name','name').select('id','name','salary_increment') dynamic_emp_frame_write = DynamicFrame.fromDF(empDF,glueContext,'dynamic_emp_frame_write') dynamic_dept_frame_write = DynamicFrame.fromDF(deptDF,glueContext,'dynamic_dept_frame_write') ## @type: DataSink ## @args: [catalog_connection = "PostgresConnection", connection_options = {"database" : "postgres", "dbtable" : "Employee"}, redshift_tmp_dir = args["TempDir"], transformation_ctx = "<transformation_ctx>"]
import sys import re import json from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job from awsglue.dynamicframe import DynamicFrame from datetime import datetime glueContext = GlueContext(SparkContext.getOrCreate()) job = Job(glueContext) args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'jobId', 'exportType', 'transactionTime', 'since', 'outputFormat', 'ddbTableName', 'workerType', 'numberWorkers', 's3OutputBucket' ]) # type, groupId and tenantId are optional parameters type = None if ('--{}'.format('type') in sys.argv): type = getResolvedOptions(sys.argv, ['type'])['type'] groupId = None if ('--{}'.format('groupId') in sys.argv): groupId = getResolvedOptions(sys.argv, ['groupId'])['groupId'] tenantId = None if ('--{}'.format('tenantId') in sys.argv): tenantId = getResolvedOptions(sys.argv, ['tenantId'])['tenantId'] # the following parameters are only needed for group export
import sys import boto3 from pyspark.sql import SparkSession from awsglue.utils import getResolvedOptions # Arguments args = getResolvedOptions(sys.argv, ['parentFile','childFile','joinCol','s3Bucket','inputLoc','stageLoc','outputLoc','outputFileName','kmsKey']) parentFile = args['parentFile'] childFile = args['childFile'] joinCol = args['joinCol'] s3Bucket = args['s3Bucket'] inputLoc = args['inputLoc'] stageLoc = args['stageLoc'] outputLoc = args['outputLoc'] outputFileName = args['outputFileName'] kmsKey = args['kmsKey'] # Create sparksession variable spark = SparkSession.builder.getOrCreate() # Create emp and dept dataframe and join them using deptid parentDF = spark.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").load(parentFile) childDF = spark.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").load(childFile) joinedDF = parentDF.join(childDF, (joinCol)) # Write csv file to S3 staging area s3StageLoc = "s3://" + s3Bucket + "/" + inputLoc + "/" + stageLoc joinedDF.write.format('com.databricks.spark.csv').save(s3StageLoc)
import sys import boto3 import pprint import pandas as pd import re import json from awsglue.utils import getResolvedOptions glue_client = boto3.client("glue") args = getResolvedOptions(sys.argv, [ 'WORKFLOW_NAME', 'WORKFLOW_RUN_ID', 'S3_OUTPUT_BUCKET', 'S3_OUTPUT_KEY', 'REGION' ]) workflow_name = args['WORKFLOW_NAME'] workflow_run_id = args['WORKFLOW_RUN_ID'] # if workflow_name: # workflow_params = glue_client.get_workflow_run_properties( # Name=workflow_name, # RunId=workflow_run_id # )["RunProperties"] # # target_database = workflow_params['target_database'] # # target_s3_location = workflow_params['target_s3_location'] region = args['REGION'] availability_zones = [] product_descriptions = ['Linux/UNIX (Amazon VPC)'] numbers = re.compile('\d+(?:\.\d+)?') pricing = boto3.client('pricing', region_name='us-east-1') ec2 = boto3.client('ec2', region_name=region)
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job import boto3 import time ## @params: [TempDir, JOB_NAME] args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME', 'DB_NAME', 'TABLE_NAME', 'DEST_BUCKET']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) datasource0 = glueContext.create_dynamic_frame.from_catalog( database=args["DB_NAME"], table_name=args["TABLE_NAME"], transformation_ctx="datasource0") resolvechoice1 = ResolveChoice.apply(frame=datasource0, choice="make_struct", transformation_ctx="resolvechoice1") relationalized1 = resolvechoice1.relationalize("trail", args["TempDir"]).select("trail") datasink = glueContext.write_dynamic_frame.from_options( frame=relationalized1, connection_type="s3",
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job import boto3 target_format = "parquet" ## @params: [JOB_NAME] args = getResolvedOptions( sys.argv, ['JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) dataLakeBucket = args["DL_BUCKET"] dataLakePrefix = args["DL_PREFIX"] aws_region = args["DL_REGION"] glue_database = args["GLUE_SRC_DATABASE"] job.init(args['JOB_NAME'], args) client = boto3.client(service_name='glue', region_name=aws_region) responseGetTables = client.get_tables(DatabaseName=glue_database) tableList = responseGetTables['TableList']
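# A hedged sketch (assumed continuation, not the original code) of converting every table in
# the source database to Parquet under the data lake bucket/prefix; the S3 path layout is a guess.
for table in tableList:
    tableName = table['Name']
    datasource = glueContext.create_dynamic_frame.from_catalog(
        database=glue_database, table_name=tableName, transformation_ctx="datasource_" + tableName)
    glueContext.write_dynamic_frame.from_options(
        frame=datasource,
        connection_type="s3",
        connection_options={"path": "s3://" + dataLakeBucket + "/" + dataLakePrefix + "/" + tableName},
        format=target_format)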
import sys from awsglue.utils import getResolvedOptions from awsglue.context import GlueContext from awsglue.job import Job from pyspark.context import SparkContext from pyspark.sql.functions import format_string from pyspark.sql.functions import col from gremlin_python import statics from gremlin_python.structure.graph import Graph from gremlin_python.process.graph_traversal import __ from gremlin_python.process.strategies import * from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection from gremlin_python.process.traversal import * from neptune_python_utils.glue_neptune_connection_info import GlueNeptuneConnectionInfo from neptune_python_utils.glue_gremlin_client import GlueGremlinClient from neptune_python_utils.glue_gremlin_csv_transforms import GlueGremlinCsvTransforms from neptune_python_utils.endpoints import Endpoints from neptune_python_utils.gremlin_utils import GremlinUtils args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'DATABASE_NAME', 'TABLE_PREFIX', 'NEPTUNE_CONNECTION_NAME', 'AWS_REGION', 'CONNECT_TO_NEPTUNE_ROLE_ARN' ]) sc = SparkContext() glueContext = GlueContext(sc) job = Job(glueContext) job.init(args['JOB_NAME'], args) database = args['DATABASE_NAME'] order_table = '{}salesdb_sales_order'.format(args['TABLE_PREFIX']) order_detail_table = '{}salesdb_sales_order_detail'.format( args['TABLE_PREFIX']) gremlin_endpoints = GlueNeptuneConnectionInfo( args['AWS_REGION'], args['CONNECT_TO_NEPTUNE_ROLE_ARN']).neptune_endpoints(
con_params['host'], con_params['port'], con_params['dbname'], con_params['username'], con_params['password']) rs_conn = pg.connect(dbname=rs_conn_string) rs_conn.query("set statement_timeout = 1200000") return rs_conn # Submits a query to the cluster @staticmethod def query(con, statement): res = con.query(statement) return res # Get job args args = getResolvedOptions(sys.argv, ['db_creds', 'glue_db']) db_creds = args['db_creds'] glue_db = args['glue_db'] sql = ''' BEGIN; CREATE TEMP TABLE staging_station_detail(LIKE public.station_detail); INSERT INTO staging_station_detail WITH cte AS ( SELECT ROW_NUMBER() OVER (PARTITION BY station_id ORDER BY last_updated DESC) AS rn ,station_id ,"name" AS station_name ,capacity
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job from pyspark.sql.functions import current_timestamp, input_file_name print("===========================================================================================") print("===========================================================================================") ## @params: [TempDir, JOB_NAME, input_file_path] args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME', 'input_file_path']) print("The input file path is: ", args['input_file_path']) print("The TempDir is: ", args['TempDir']) input_file = args['input_file_path'] sc = SparkContext() glueContext = GlueContext(SparkContext.getOrCreate()) #glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) #job.init(args['JOB_NAME'], args) ## @type: DataSource ## @args: [database = "redshift-intg", table_name = "checks", transformation_ctx = "datasource0"] ## @return: datasource0 ## @inputs: [] #datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "redshift-intg", table_name = "checks", transformation_ctx = "datasource0")
import sys import time from awsglue.context import GlueContext from awsglue.dynamicframe import DynamicFrame from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from pyspark.sql.types import * glue_context = GlueContext(SparkContext.getOrCreate()) spark = glue_context.spark_session args = getResolvedOptions(sys.argv, ['JOB_NAME', 'database_name']) database_name = args["database_name"] def save_table(df, table_name): print(f"Writing to DynamoDB table: {table_name}") glue_context.write_dynamic_frame_from_options( frame=DynamicFrame.fromDF(df, glue_context, table_name), connection_type="dynamodb", connection_options={"dynamodb.output.tableName": table_name}) print(f"Table {table_name} updated") spark.sql(f""" use {database_name} """) current_date = spark.sql( "select max(creation_date) from posts").collect()[0][0]
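# Illustrative call (the aggregation and target table name are assumptions, not from the
# original job): compute per-day post counts and push them to DynamoDB via save_table.
daily_counts = spark.sql(
    "select creation_date, count(*) as post_count from posts group by creation_date")
save_table(daily_counts, "posts_daily_counts")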
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job from awsglue.dynamicframe import DynamicFrame import boto3 #Retrieve parameters for the Glue job. args = getResolvedOptions( sys.argv, ['BUCKET', 'JOB_NAME', 'S3_SOURCE', 'S3_DEST', 'TRAIN_KEY', 'VAL_KEY']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) logger = glueContext.get_logger() #Create a PySpark dataframe from the source table. source_data_frame = spark.read.load(args['S3_SOURCE'], format='csv', inferSchema=True, header=False) #Split the dataframe in to training and validation dataframes. train_data, val_data = source_data_frame.randomSplit([.7, .3])
import json import boto3 import sys from awsglue.utils import getResolvedOptions import logging import pgdb from urllib.parse import urlparse logging.basicConfig() logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) # Required Inputs args = getResolvedOptions(sys.argv, ['SQLScript', 'Secret']) script = args['SQLScript'] secret = args['Secret'] print('Secret is: %s' % secret) print('Script is: %s' % script) # Connect to the cluster try: print('Getting Connection Info') secmgr = boto3.client('secretsmanager') secret = secmgr.get_secret_value(SecretId=secret) secretString = json.loads(secret["SecretString"]) user = secretString["user"] password = secretString["password"] host = secretString["host"]
import sys import re import random from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job from awsglue.dynamicframe import DynamicFrame from datetime import datetime, timedelta glueContext = GlueContext(SparkContext.getOrCreate()) args = getResolvedOptions( sys.argv, ['JOB_NAME', 'GLUE_DB', 'SOURCE_TABLE', 'TEMP_BUCKET', 'DEST_BUCKET']) job = Job(glueContext) job.init(args['JOB_NAME'], args) # push-down predicate aimed at reducing cost by only fetching 48 hours of data # the job will run by default every day # By default the partitions created by the crawler are named partition_0, partition_1, partition_2 # these are the names we need to use in our predicate push down expression # partition_0 => year # partition_1 => month # partition_2 => day today = datetime.now() daysInPast = 2 predicates = [] for x in range(daysInPast):
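    # A hedged sketch of the loop body and of how the predicate might be used downstream —
    # the exact partition-value formatting and the read call are assumptions, not the original code.
    d = today - timedelta(days=x)
    predicates.append(
        "(partition_0 = '{:04d}' and partition_1 = '{:02d}' and partition_2 = '{:02d}')".format(
            d.year, d.month, d.day))
push_down_predicate = " or ".join(predicates)
source_dyf = glueContext.create_dynamic_frame.from_catalog(
    database=args['GLUE_DB'],
    table_name=args['SOURCE_TABLE'],
    push_down_predicate=push_down_predicate,
    transformation_ctx="source_dyf")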
def mexico_filter(latitude, longitude): geo = reverse_geocoder.RGeocoder(mode=1, verbose=True, stream=obj_data) coordinates = (latitude, longitude), (latitude, longitude) results = geo.query(coordinates) for response_dictionary in results: if [x for x in response_dictionary if response_dictionary[x] == 'US']: #logger.info("####################### MX RECORD ######################################################") return True return False start_time = datetime.datetime.now() # Reading glue job parameters args = getResolvedOptions(sys.argv, [ 'Mexico_Geo_Filter_V1_Source', 'Mexico_Geo_Filter_V1_Target', 'Archive_Start_Date', 'Archive_End_Date' ]) # Represents source bucket source_s3_bucket = args['Mexico_Geo_Filter_V1_Source'] # Represents target bucket target_s3_bucket = args['Mexico_Geo_Filter_V1_Target'] filtered_keys = [] # Represents start date input_start_date = args['Archive_Start_Date'] # Represents end date input_end_date = args['Archive_End_Date'] # Creating S3 resource s3_client = boto3.resource('s3')
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # ######################################################################################### import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job import boto3 ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'bucket_name', 'folder_name', 'environment_name', 'application_name' ]) bucket_name = args['bucket_name'] folder_name = args['folder_name'] environment_name = args['environment_name'] application_name = args['application_name'] s3 = boto3.resource('s3') bucket = s3.Bucket(bucket_name) for obj in bucket.objects.filter(Prefix=folder_name + '/'): s3.Object(bucket.name, obj.key).delete() sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext)
# glue_etl script import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job import boto3 # Glue/Spark context set up args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'glue_db', 's3_bucket', 'svr', 'db', 'sch', 'tbl', 'partition_by', 'job-bookmark-option' ]) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) logger = glueContext.get_logger() # Table reference data glue_db = args['glue_db'] s3_bucket = args['s3_bucket'] svr = args['svr'] db = args['db'] sch = args['sch'] tbl = args['tbl'] partition_by = args['partition_by'] bookmark = args['job-bookmark-option'] db_adj = db.replace("-", "_")
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from awsglue.context import GlueContext from awsglue.job import Job from pyspark.context import SparkContext from pyspark.sql import SQLContext from pyspark.sql.functions import * from pyspark.sql.window import Window import settings import dynamic_frame_util as dfu import date_util as du glueContext = GlueContext(SparkContext.getOrCreate()) args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'ENVIRONMENT', 'TYPE', 'START_DATE', 'END_DATE', 'DATABASE_PARQUET', 'DATABASE_STAG' ]) JOB_NAME = args['JOB_NAME'] ENVIRONMENT = args['ENVIRONMENT'] TYPE = None START_DATE = args['START_DATE'] END_DATE = args['END_DATE'] fact_table_database = 'highereducation-dw-edudirectdb-staging-dev' fact_table_v1 = '' fact_table_name = args['DATABASE_STAG'] database = args[ 'DATABASE_PARQUET'] # possible values = ['highereducation-dw-edudirectdb-parquet-current', 'highereducation-dw-edudirectdb-parquet']
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job from awsglue.dynamicframe import DynamicFrame #Retrieve parameters for the Glue job. args = getResolvedOptions( sys.argv, ['JOB_NAME', 'S3_SOURCE', 'S3_DEST', 'TRAIN_KEY', 'TEST_KEY']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) #Create a PySpark dataframe from the source table. source_data_frame = spark.read.load(args['S3_SOURCE'], format='csv', inferSchema=True, header=False) #Split the dataframe in to training and validation dataframes. train_data, val_data = source_data_frame.randomSplit([.7, .3]) #Write both dataframes to the destination datastore. train_path = args['S3_DEST'] + args['TRAIN_KEY'] val_path = args['S3_DEST'] + args['TEST_KEY']
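# A hedged sketch of the write step that would follow (format and save mode assumed; the
# original snippet stops after computing the paths).
train_data.write.mode('overwrite').csv(train_path)
val_data.write.mode('overwrite').csv(val_path)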
def main(): spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate() args = getResolvedOptions( sys.argv, [ "S3_INPUT_BUCKET", "S3_INPUT_KEY_PREFIX", "S3_OUTPUT_BUCKET", "S3_OUTPUT_KEY_PREFIX", "S3_MODEL_BUCKET", "S3_MODEL_KEY_PREFIX", ], ) # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format spark.sparkContext._jsc.hadoopConfiguration().set( "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter") # Defining the schema corresponding to the input data. The input data does not contain the headers schema = StructType([ StructField("label", IntegerType(), True), StructField("title", StringType(), True), StructField("abstract", StringType(), True), ]) # Download the data from S3 into two separate Dataframes traindf = spark.read.csv( ("s3://" + os.path.join(args["S3_INPUT_BUCKET"], args["S3_INPUT_KEY_PREFIX"], "train.csv")), header=False, schema=schema, encoding="UTF-8", ) validationdf = spark.read.csv( ("s3://" + os.path.join(args["S3_INPUT_BUCKET"], args["S3_INPUT_KEY_PREFIX"], "test.csv")), header=False, schema=schema, encoding="UTF-8", ) # Tokenize the abstract column which contains the input text tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract") # Save transformed training data to CSV in S3 by converting to RDD. transformed_traindf = tokenizer.transform(traindf) transformed_train_rdd = transformed_traindf.rdd.map( lambda x: (x.label, x.tokenized_abstract)) lines = transformed_train_rdd.map(csv_line) lines.coalesce(1).saveAsTextFile("s3://" + os.path.join( args["S3_OUTPUT_BUCKET"], args["S3_OUTPUT_KEY_PREFIX"], "train")) # Similar data processing for validation dataset. transformed_validation = tokenizer.transform(validationdf) transformed_validation_rdd = transformed_validation.rdd.map( lambda x: (x.label, x.tokenized_abstract)) lines = transformed_validation_rdd.map(csv_line) lines.coalesce(1).saveAsTextFile("s3://" + os.path.join( args["S3_OUTPUT_BUCKET"], args["S3_OUTPUT_KEY_PREFIX"], "validation")) # Serialize the tokenizer via MLeap and upload to S3 SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation) # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file. import zipfile with zipfile.ZipFile("/tmp/model.zip") as zf: zf.extractall("/tmp/model") # Write back the content as a .tar.gz file import tarfile with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar: tar.add("/tmp/model/bundle.json", arcname="bundle.json") tar.add("/tmp/model/root", arcname="root") s3 = boto3.resource("s3") file_name = os.path.join(args["S3_MODEL_KEY_PREFIX"], "model.tar.gz") s3.Bucket(args["S3_MODEL_BUCKET"]).upload_file("/tmp/model.tar.gz", file_name)
import sys from awsglue.job import Job from pyspark.context import SparkContext from awsglue.context import GlueContext from pyspark.sql import SQLContext from pyspark.sql import SparkSession from pyspark.sql.functions import * from pyspark.sql.window import Window from awsglue.utils import getResolvedOptions from botocore.exceptions import ClientError import boto3 from urllib.parse import urlparse, unquote ddbconn = boto3.client('dynamodb', region_name=getResolvedOptions( sys.argv, ['aws_region'])['aws_region']) s3conn = boto3.client('s3') sparkContext = SparkContext.getOrCreate() glueContext = GlueContext(sparkContext) spark = glueContext.spark_session job = Job(glueContext) args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 'bucket', 'prefix', 'folder', 'out_path', 'lastIncrementalFile', 'newIncrementalFile', 'primaryKey', 'partitionKey', 'env' ]) job.init(args['JOB_NAME'], args) s3_inputpath = 's3://' + args['bucket'] + '/' + args['prefix'] + args['folder']
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job import time import pg8000 import boto3 import re from decimal import * import extract_rs_query_logs_functions as functions ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME','REGION','CLUSTER_ENDPOINT']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) job_configs={} job_configs.update(args) clusterId= re.search('jdbc:redshift://(.+?)\..*',args['CLUSTER_ENDPOINT']).group(1) job_configs.update(functions.getJobConfigurations(clusterId,job_configs)) job_configs['CLUSTER_ID']=clusterId tempDir=args['TempDir'] s3Prefix=job_configs['s3_prefix'] credentials=boto3.Session().get_credentials() job_configs['aws_access_key_id'] = credentials.access_key job_configs['aws_secret_access_key'] = credentials.secret_key
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from pyspark.sql.functions import split from awsglue.context import GlueContext from awsglue.dynamicframe import DynamicFrame from awsglue.job import Job ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['JOB_NAME', 'target_s3_bucket']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) ################################################################ # Combining Lambda@Edge Logs -[origin-request, viewer-request] # ################################################################ ## Create dynamic frame from optimized (Parquet format) Amazon Lambda@Edge viewer request logs as the datasource. Glue Data Catalog = {database = "reInvent2018_aws_service_logs", table_name = "lambdaedge_logs_viewer_request_optimized"} labdaEdgeViewerRequestLogs = glueContext.create_dynamic_frame.from_catalog(database = "reInvent2018_aws_service_logs", table_name = "lambdaedge_logs_viewer_request_optimized", transformation_ctx = "labdaEdgeViewerRequest") ## Drop the fields that are duplicate between Lambda@Edge viewer request logs and Lambda@Edge origin request logs modifiedLEViewerRequestLogs = DropFields.apply(frame = labdaEdgeViewerRequestLogs, paths=["eventtype"], transformation_ctx ="modifiedLEViewerRequestLogs") ## Create dynamic frame from optimized (Parquet format) Amazon Lambda@Edge origin request logs as the datasource. Glue Data Catalog = {database = "reInvent2018_aws_service_logs", table_name = "lambdaedge_logs_origin_request_optimized"} labdaEdgeOriginRequestLogs = glueContext.create_dynamic_frame.from_catalog(database = "reInvent2018_aws_service_logs", table_name = "lambdaedge_logs_origin_request_optimized", transformation_ctx = "labdaEdgeOriginRequest")
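# A hedged sketch (not the original continuation) of combining the viewer- and origin-request
# frames with Glue's Join transform; the join key "requestid" is an assumption.
combinedLELogs = Join.apply(modifiedLEViewerRequestLogs, labdaEdgeOriginRequestLogs,
                            'requestid', 'requestid')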