Code example #1
import sys

from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# load the source tables from the Glue Data Catalog (S3-backed CSV)
fod = glueContext.create_dynamic_frame.from_catalog(
    database="demo-db-dblp-acm", table_name="fodors_zagats_csv")
fodm = glueContext.create_dynamic_frame.from_catalog(
    database="demo-db-dblp-acm", table_name="fodors_marked_csv")

# register the DynamicFrames as Spark temporary views
fod.toDF().createOrReplaceTempView("fodors_zagats")
fodm.toDF().createOrReplaceTempView("fodors_marked_csv")

logger = glueContext.get_logger()

# execute fodors_zagats CR rule
# fodrule1 = foda.fromDF(spark.sql("SELECT * FROM fodors_zagats LIMIT 5"), glueContext, "fodrule1")
logger.info(str(spark.sql("SELECT * FROM fodors_zagats LIMIT 5").collect()))

# fodrule1 = foda.fromDF(spark.sql("SELECT t0.id,t0.name FROM fodors_zagats AS t0 INNER JOIN fodors_zagat_tablea AS t1 ON t0.city = t1.city"), glueContext, "fodrule1")
# glueContext.write_dynamic_frame.from_options(frame = fodrule1, connection_type = "s3", connection_options = {"path": "s3://podrulediscoveranalysis/result/CR3/fodors_zagats"}, format = "csv")

job.commit()
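
The commented-out lines above sketch the intended CR rule: join fodors_zagats against a fodors_zagat_tablea table and write the result to S3 as CSV. Below is a minimal sketch of enabling that step (it would run before job.commit()), taking the table name and S3 path from the comments as-is and using DynamicFrame.fromDF in place of the undefined foda helper; none of these names are verified here.

from awsglue.dynamicframe import DynamicFrame

# Join copied from the commented-out CR rule; fodors_zagat_tablea must exist
# as a temporary view or catalog table for this query to run.
fodrule1_df = spark.sql(
    "SELECT t0.id, t0.name FROM fodors_zagats AS t0 "
    "INNER JOIN fodors_zagat_tablea AS t1 ON t0.city = t1.city")
fodrule1 = DynamicFrame.fromDF(fodrule1_df, glueContext, "fodrule1")

# S3 path copied from the commented-out write above.
glueContext.write_dynamic_frame.from_options(
    frame=fodrule1,
    connection_type="s3",
    connection_options={"path": "s3://podrulediscoveranalysis/result/CR3/fodors_zagats"},
    format="csv")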
Code example #2
import sys
from datetime import datetime, timezone

import boto3
from botocore.exceptions import ClientError

from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import col, from_unixtime, to_date, to_timestamp, unix_timestamp


def main():
    ## @params: [JOB_NAME, raw_db_name, clean_db_name, source_entity_name, target_entity_name,
    ##           partition_column, output_bucket_name, primary_key, parallelism, date_column, datetime_column]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'partition_column', 'output_bucket_name',
        'primary_key', 'parallelism', 'date_column', 'datetime_column'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    date_column = args['date_column']
    datetime_column = args['datetime_column']
    hudi_primary_key = args['primary_key']
    output_bucket_name = args['output_bucket_name']
    parallelism = args['parallelism']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name

    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    spark = SparkSession.builder.config(
        'spark.serializer',
        'org.apache.spark.serializer.KryoSerializer').getOrCreate()
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)
    logger = glue_context.get_logger()
    logger.info('Initialization.')

    # Initialization of Glue client to connect to Glue Catalog and retrieve table information
    glueClient = boto3.client('glue')

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    # processing_datetime arrives as a unix epoch; convert it to a timestamp string
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    # cast *_datetime columns to timestamp and *_date columns to date
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    # cast the zip column to string
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    # cast double columns to fixed-precision decimal(15,2)
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c)
        if dict(input_data.dtypes)[c] == 'double' else col(c)
        for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn('etl_processing_datetime', unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    isTableExists = False
    try:
        glueClient.get_table(DatabaseName=clean_db_name,
                             Name=target_entity_name)
        isTableExists = True
        logger.info(clean_db_name + '.' + target_entity_name + ' exists.')
    except ClientError as e:
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            isTableExists = False
            logger.info(clean_db_name + '.' + target_entity_name +
                        ' does not exist. Table will be created.')
        else:
            raise

    partition_path = '' if partition_column == 'None' else partition_column

    common_config = {
        'className': 'org.apache.hudi',
        'hoodie.datasource.hive_sync.use_jdbc': 'false',
        'hoodie.index.type': 'GLOBAL_BLOOM',
        'hoodie.datasource.write.precombine.field': datetime_column,
        'hoodie.datasource.write.recordkey.field': hudi_primary_key,
        'hoodie.table.name': target_entity_name,
        'hoodie.consistency.check.enabled': 'true',
        'hoodie.datasource.hive_sync.database': clean_db_name,
        'hoodie.datasource.hive_sync.table': target_entity_name,
        'hoodie.datasource.hive_sync.enable': 'true',
        'hoodie.datasource.write.partitionpath.field': partition_path,
        'hoodie.datasource.hive_sync.partition_fields': partition_path,
        'hoodie.datasource.hive_sync.partition_extractor_class':
            'org.apache.hudi.hive.NonPartitionedExtractor'
            if partition_column == 'None'
            else 'org.apache.hudi.hive.MultiPartKeysValueExtractor',
        'hoodie.datasource.write.hive_style_partitioning':
            'false' if partition_column == 'None' else 'true',
        'hoodie.datasource.write.keygenerator.class':
            'org.apache.hudi.keygen.NonpartitionedKeyGenerator'
            if partition_column == 'None'
            else 'org.apache.hudi.keygen.SimpleKeyGenerator'
    }

    incremental_config = {
        'hoodie.upsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert',
        'hoodie.cleaner.policy': 'KEEP_LATEST_COMMITS',
        'hoodie.cleaner.commits.retained': 10
    }

    initLoad_config = {
        'hoodie.bulkinsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert'
    }

    if isTableExists:
        logger.info('Incremental upsert.')
        combinedConf = {**common_config, **incremental_config}
        enriched_data.write.format('org.apache.hudi').options(
            **combinedConf).mode('Append').save("s3://" + output_bucket_name +
                                                "/" + clean_table_name)
    else:
        logger.info('Initial load.')
        combinedConf = {**common_config, **initLoad_config}
        enriched_data.write.format('org.apache.hudi').options(
            **combinedConf).mode('Overwrite').save("s3://" +
                                                   output_bucket_name + "/" +
                                                   clean_table_name)

    job.commit()


if __name__ == '__main__':
    main()
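
For context, this job reads its parameters from Glue job arguments. Below is a hedged sketch of launching it with the boto3 Glue client; the job name and every argument value are illustrative placeholders, not values from the original script.

import boto3

glue = boto3.client('glue')
# All names and values below are placeholders for illustration only.
glue.start_job_run(
    JobName='hudi-clean-load-job',
    Arguments={
        '--raw_db_name': 'raw_db',
        '--clean_db_name': 'clean_db',
        '--source_entity_name': 'orders',
        '--target_entity_name': 'orders',
        '--partition_column': 'None',
        '--output_bucket_name': 'example-clean-bucket',
        '--primary_key': 'order_id',
        '--parallelism': '8',
        '--date_column': 'order_date',
        '--datetime_column': 'order_datetime',
    })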
Code example #3
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import col

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glue_context = GlueContext(sc)
spark = glue_context.spark_session

logger = glue_context.get_logger()
job = Job(glue_context)

input_path = "s3://"
output_path = "s3://"

# load the test CSV file and rename several columns in Spark
df1 = spark.read.format("csv").option("header", "true").load(input_path)
df1 = (df1.withColumnRenamed("Semiotic Class", "semioticclass")
          .withColumnRenamed("Input Token", "inputtoken")
          .withColumnRenamed("Output Token", "outputtoken"))

# print the schema to the driver log (printSchema() returns None)
df1.printSchema()
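
The script above creates a Job and defines output_path but uses neither. A minimal completion sketch, assuming the intent was to persist the renamed DataFrame as CSV to output_path (which must point at a real S3 location):

# job.init would normally follow the Job(glue_context) call above
job.init(args['JOB_NAME'], args)

# write the renamed DataFrame back out as headered CSV
df1.write.mode("overwrite").option("header", "true").csv(output_path)

job.commit()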
Code example #4
import sys
import boto3
import time
import pyspark.sql.functions as F
from datetime import datetime, timedelta
from collections import OrderedDict
from awsglue.transforms import *
from pyspark.sql.window import Window
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame

# context initiation
glueContext = GlueContext(SparkContext.getOrCreate())
log = glueContext.get_logger()
args = getResolvedOptions(sys.argv, ['DATASET', 'JOB_NAME', 'start_date_string', 'end_date_string'])
sc = glueContext
spark = glueContext.spark_session

job = Job(glueContext)
job.init(args['JOB_NAME'], args)


# properties handler
class PropertiesHandler(object):

    def __init__(self, startDateStr='', endDateStr='', startYearStr='', endYearStr='', startMonthStr='', endMonthStr='',
                 startDayStr='', endDayStr='', targetTable='',
                 targetTableBucketLocation='', targetTablePathLocation='', targetTableFQDN='',
                 sourcePartitionYearField='',