Example #1
def main(
    input_path, output_path, conn_type, module_name, function_name, glue_config=None
):
    """
    There is a main function of Glue job driver. The idea is to keep it generic to be able to reuse for running
    multiple Glue jobs. Driver is responsible to validate input parameters and invoke a proper Glue job. It does not
    contain any specific business logic.

    @param input_path: input path contains input files to the job
    @param output_path: output path used to write Glue job results
    @param conn_type: use 'file' to read files from a local file system or 's3' for reading files from S3
    @param module_name and function_name: are used to call Glue job using reflection
    @param glue_config: optional additional Glue configuration parameters

    Parameters example: --JOB_NAME=movie_analytics_job --CONN_TYPE=file --INPUT_PATH=/input --OUTPUT_PATH=/results --MODULE_NAME=tasks --FUNCTION_NAME=create_and_stage_average_rating
    """
    try:
        glue_context = create_glue_context()

        module = importlib.import_module(module_name)
        module.init_glue(glue_context)
        module.init_spark(glue_context.spark_session)
        f = getattr(module, function_name)
        f(input_path, output_path, conn_type)

        job = Job(glue_context)
        job.init(args["JOB_NAME"], args)

        job.commit()

    except Exception as e:
        # sys has no `error` attribute; write the message to stderr instead.
        sys.stderr.write(
            f"Couldn't execute job due to {str(e)} for module: {module_name} and function: {function_name}\n"
        )
        raise
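The driver expects the module-level `args` used by `job.init` to already hold the resolved job arguments. A minimal sketch of such an entry point, assuming the parameter names listed in the docstring above and `getResolvedOptions` from `awsglue.utils` (the actual wiring in the original file is not shown here):

import sys
from awsglue.utils import getResolvedOptions

if __name__ == "__main__":
    # Resolve the parameters listed in the docstring above.
    args = getResolvedOptions(
        sys.argv,
        ["JOB_NAME", "CONN_TYPE", "INPUT_PATH", "OUTPUT_PATH", "MODULE_NAME", "FUNCTION_NAME"],
    )
    main(
        input_path=args["INPUT_PATH"],
        output_path=args["OUTPUT_PATH"],
        conn_type=args["CONN_TYPE"],
        module_name=args["MODULE_NAME"],
        function_name=args["FUNCTION_NAME"],
    )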
Example #2
    def __init__(self, service_name):
        args = getResolvedOptions(sys.argv, ['JOB_NAME'] + self._job_arguments())

        # Validate the service name
        if not self.is_valid_service(service_name):
            raise Exception("'%s' is not yet a supported service." % service_name)

        self.glue_context = self._init_glue_context()
        self.job = Job(self.glue_context)
        region = self.get_instance_region()

        # Create data catalog references
        raw_klas = self.SERVICE_DEFINITIONS[service_name][0]
        converted_klas = self.SERVICE_DEFINITIONS[service_name][1]

        self.raw_catalog = raw_klas(
            region,
            args['raw_database_name'],
            args['raw_table_name'],
            args['s3_source_location']
        )
        self.optimized_catalog = converted_klas(
            region,
            args['converted_database_name'],
            args['converted_table_name'],
            args['s3_converted_target']
        )

        # Assume that if the raw table does not exist, this is our first run
        self.initial_run = not self.raw_catalog.does_table_exist()

        # Create a converter object and initialize the glue job!
        self.converter = DataConverter(self.glue_context, self.raw_catalog, self.optimized_catalog)
        self.job.init(args['JOB_NAME'], args)
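The constructor above relies on a class-level SERVICE_DEFINITIONS mapping and an is_valid_service helper that are not part of this snippet. A self-contained sketch of the shape they appear to have (the service name and catalog class names here are placeholders, not taken from the original project):

from collections import namedtuple

# Hypothetical placeholders for the raw/converted catalog reference classes used
# above; the real classes are not included in this snippet. Both are constructed
# as cls(region, database_name, table_name, s3_location).
RawCatalogRef = namedtuple('RawCatalogRef', ['region', 'database', 'table', 's3_location'])
ConvertedCatalogRef = namedtuple('ConvertedCatalogRef', ['region', 'database', 'table', 's3_location'])

# Illustrative shape of the class-level lookup used in __init__:
# service name -> (raw catalog class, converted/optimized catalog class).
SERVICE_DEFINITIONS = {
    'cloudtrail': (RawCatalogRef, ConvertedCatalogRef),
}

def is_valid_service(service_name):
    # A service is supported if a catalog pair is registered for it.
    return service_name in SERVICE_DEFINITIONS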
Example #3
def processMain():
    ## @params: [JOB_NAME]
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)
    ds0 = glueContext.create_dynamic_frame.from_catalog(database="autoglues3lineage", table_name="train_sm_s2adb_csv", transformation_ctx="ds0")
    rc1 = ResolveChoice.apply(frame=ds0, choice="make_struct", transformation_ctx="rc1")

    ds3 = glueContext.write_dynamic_frame.from_options(frame=rc1, connection_type="redshift",
                                                       connection_options={
                                                           "url": "jdbc:redshift://redshift-cluster-1.csvp5wcqqxvw.us-east-1.redshift.amazonaws.com:5439/world",
                                                           "dbtable": "atn.gluetable211"},
                                                       transformation_ctx="ds3")
    ds7 = glueContext.write_dynamic_frame.from_options(frame=rc1, connection_type="s3",
                                                       connection_options={
                                                           "path": "s3://asgqatestautomation4/Targetdata211"},
                                                       format="json", transformation_ctx="ds7")

    # ds3 = glueContext.write_dynamic_frame.from_options(frame=rc1, connection_type="postgresql",
    #                                                    connection_options={
    #                                                        "url": "jdbc:postgresql://decheqaperf01v.asg.com:5432/postgres",
    #                                                        "dbtable": "public.gluetable26"},
    #                                                    transformation_ctx="ds3")
    job.commit()
Example #4
def processMain():
    ## @params: [JOB_NAME]
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)
    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database="autoglues3lineage",
        table_name="train_sm_s2adb_csv",
        transformation_ctx="datasource0")
    resolvechoice2 = ResolveChoice.apply(frame=datasource0,
                                         choice="make_struct",
                                         transformation_ctx="resolvechoice2")
    dropnullfields3 = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields3")
    datasink4 = glueContext.write_dynamic_frame.from_options(
        frame=dropnullfields3,
        connection_type="s3",
        connection_options={"path": "s3://asgqatestautomation4/Targetdata1A"},
        format="csv",
        transformation_ctx="datasink4")
    datasink5 = glueContext.write_dynamic_frame.from_options(
        frame=dropnullfields3,
        connection_type="s3",
        connection_options={"path": "s3://asgqatestautomation4/Targetdata1B"},
        format="parquet",
        transformation_ctx="datasink5")
    job.commit()
Example #5
    def _get_spark_context():
        spark_context = GlueContext(SparkContext.getOrCreate())
        job = Job(spark_context)
        args = _get_arguments({})
        job.init(args['JOB_NAME'], args)

        return (spark_context, job)
Example #6
def create_glue_context():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    job = Job(glueContext)
    args = getResolvedOptions(sys.argv, ['JOB_NAME', 'input'])
    job.init(args['JOB_NAME'], args)

    return spark, job
Example #7
def get_spark_session_and_glue_job(
    glue_args: Dict,
    conf=None,
    py_files: Iterable[str] = None,
    extra_jars: List[str] = None,
):
    """
    Get spark session and AWS glue job.

    Parameters
    ----------
    glue_args
        Dictionary of Argument Name: Argument value
    extra_jars
        Path to dependent jar files
    conf : Union[pyspark.SparkConf, Dict[str, str]]
        Spark config, either object or dictionary of config options.
    py_files
        Paths to python files (.py, .zip, .egg)

    Returns
    -------
    pyspark.sql.SparkSession, awsglue.job.Job

    """
    from awsglue.context import GlueContext
    from awsglue.job import Job
    from pyspark import SparkContext, SparkConf

    LOGGER.debug("Creating spark session with parameters")
    LOGGER.debug("conf=%s", conf)
    LOGGER.debug("py_files=%s", py_files)
    LOGGER.debug("extra_jars=%s", extra_jars)
    if isinstance(conf, dict):
        spark_conf = SparkConf()
        spark_conf.setAll(conf.items())
    elif isinstance(conf, SparkConf):
        spark_conf = conf
    else:
        spark_conf = None

    if extra_jars and spark_conf:
        spark_dependencies = ",".join(extra_jars)
        spark_conf.set("spark.jars.packages", spark_dependencies)

    sc = SparkContext.getOrCreate(conf=spark_conf)

    if py_files:
        LOGGER.debug("Adding PYFILEs: %s", py_files)
        for py_file in py_files:
            sc.addPyFile(py_file)

    glue_context = GlueContext(sparkContext=sc)
    job = Job(glue_context=glue_context)
    job.init(glue_args["JOB_NAME"], glue_args)

    # .py, .zip or .egg
    return glue_context.spark_session, job
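A minimal usage sketch for this helper, assuming the job parameters are resolved with getResolvedOptions and that 'input_path' is an extra, purely illustrative job argument:

import sys
from awsglue.utils import getResolvedOptions

glue_args = getResolvedOptions(sys.argv, ["JOB_NAME", "input_path"])  # 'input_path' is hypothetical
spark, job = get_spark_session_and_glue_job(
    glue_args,
    conf={"spark.sql.session.timeZone": "UTC"},  # dict form is accepted, see above
)
spark.read.json(glue_args["input_path"]).show()
job.commit()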
Example #8
def main(args, yesterday):
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)

    database = args['database_name']
    table_name = '{0}kinesis_data'.format(args['table_prefix'])

    partition_predicate = \
        'partition_0="{0}" and partition_1="{1:02}" and partition_2="{2:02}"'.format(
            yesterday.year, yesterday.month, yesterday.day)

    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database=database,
        table_name=table_name,
        push_down_predicate=partition_predicate,
        transformation_ctx='datasource0')

    if datasource0.count() > 0:
        applymapping1 = ApplyMapping.apply(frame=datasource0,
                                           mappings=[('device_id', 'string',
                                                      'device_id', 'string'),
                                                     ('timestamp', 'long',
                                                      'timestamp', 'long')],
                                           transformation_ctx='applymapping1')

        df = applymapping1.toDF()
        dev = df.drop_duplicates(['device_id'])
        device_list = dev.collect()
        result_list = []

        for device in device_list:
            device_id = device['device_id']
            df2 = df[df.device_id == device_id]
            df_list = df2.sort('timestamp', ascending=True).collect()

            result_list.extend(create_diff_timestamp_list(device_id, df_list))

        df = spark.createDataFrame(result_list)
        dyf = DynamicFrame.fromDF(df, glueContext, 'dyf')

        # Hive format
        output_path = os.path.join(args['target_bucket'],
                                   'year={0}'.format(yesterday.year),
                                   'month={0:02}'.format(yesterday.month),
                                   'day={0:02}'.format(yesterday.day))

        glueContext.write_dynamic_frame.from_options(
            frame=dyf,
            connection_type='s3',
            connection_options={'path': output_path},
            format='parquet',
            transformation_ctx='datasink2')

    job.commit()
Example #9
def _get_spark_session_and_glue_job(glue_args):
    from awsglue.context import GlueContext
    from awsglue.job import Job

    sc = SparkContext.getOrCreate()
    glue_context = GlueContext(sparkContext=sc)
    job = Job(glue_context=glue_context)
    job.init(glue_args["JOB_NAME"], glue_args)
    return glue_context.spark_session, job
Example #10
def create_glue_context():
    """ Creates Glue Context / Spark session """
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)

    return spark, job
Example #11
def main():
    console("Starting Job")

    ## @params: [job_name]
    # 1. Start Glue Context
    glueContext = GlueContext(SparkContext.getOrCreate())

    # 2. Initialize Job
    job = Job(glueContext)
    args = get_args()
    job.init(args["JOB_NAME"], args)

    client = boto3.client("glue", region_name="us-east-1")

    Tables = client.get_tables(DatabaseName=args["athena_database"])
    tableList = Tables["TableList"]

    # 3. Clear bucket contents
    console(f"Excluding S3 files for: {args['target_bucket']}")
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(args["target_bucket"])
    bucket.objects.delete()

    for table in tableList:
        tableName = table["Name"]
        if re.search(r"[0-9]+", tableName):
            # Ignoring Athena tables already processed...
            continue
        else:
            console("Processing Table {}".format(tableName))

        # 4. Load dynamic dataframe
        datasource0 = glueContext.create_dynamic_frame.from_catalog(
            database=args["athena_database"],
            table_name=tableName,
            transformation_ctx="datasource0",
        )
        # 5. Process Dynamo database
        # Drop null fields
        dropnullfields1 = DropNullFields.apply(
            frame=datasource0, transformation_ctx="dropnullfields1")

        # Save Dynamic Frame on S3 using Glue
        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields1,
            connection_type="s3",
            connection_options={
                "path": "s3://{}/{}/".format(args["target_bucket"], tableName)
            },
            format="parquet",
            transformation_ctx="datasink2",
        )
    job.commit()
Example #12
def main():
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE',
        'ANONYMIZATION_CONF'
    ])
    sc = SparkContext()

    os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

    # avoid Spark creating $folder$ marker objects in S3
    hadoop_conf = sc._jsc.hadoopConfiguration()
    # hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session

    # Needed to reliably work between parquet, spark, and pandas dataframes.
    spark.conf.set("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
    spark.conf.set("spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")

    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)

    input_conf_str_json = args["ANONYMIZATION_CONF"]

    print('input_conf_str_json:', input_conf_str_json)
    anonymization_conf_dict = json.loads(input_conf_str_json)
    print('anonymization_conf_dict:', anonymization_conf_dict)
    position_references_in_conf = {}
    for i, dataset in enumerate(anonymization_conf_dict['datasets']):
        position_references_in_conf[dataset['table']] = i
    print('position_references_in_conf:', position_references_in_conf)

    dataLakeBucket = args["DL_BUCKET"]
    dataLakePrefix = args["DL_PREFIX"]
    aws_region = args["DL_REGION"]

    glue_database = args["GLUE_SRC_DATABASE"]
    process(job, dataLakeBucket, dataLakePrefix, anonymization_conf_dict,
            position_references_in_conf, glueContext, glue_database,
            aws_region)
    delete_temporary_s3_objects(dataLakeBucket, aws_region)
Example #13
def run():
    """
    Run method.

    This method is the starting point of the actual logic of the Glue job
    """
    args = getResolvedOptions(sys.argv, ['JOB_NAME', 's3_bucket', 'job_id'])
    bucket = args['s3_bucket']
    job_id = args['job_id']

    glue_context = GlueContext(SparkContext.getOrCreate())
    spark = glue_context.spark_session
    job = Job(glue_context)
    job.init(args['JOB_NAME'], args)

    aggregate_key = 's3://' + bucket + '/output/' + job_id + '/rnorm-aggregate.json'
    aggregate_dataframe = spark.read.json(aggregate_key)

    basics_key = 's3://' + bucket + '/input/title.basics.tsv.gz'
    basics_dataframe = spark.read.format("csv").option("delimiter", "\t").option("header", "true").load(basics_key)
    basics_dataframe = basics_dataframe.toDF("tconstTmp", "titleType", "primaryTitle", "originalTitle", "isAdult",
                                             "startYear", "endYear", "runtimeMinutes", "genres")
    aggregate_dataframe = aggregate_dataframe.join(basics_dataframe,
                                                   aggregate_dataframe.tconst == basics_dataframe.tconstTmp,
                                                   how='left').drop('tconstTmp')

    ratings_key = 's3://' + bucket + '/input/title.ratings.tsv.gz'
    ratings_dataframe = spark.read.format("csv").option("delimiter", "\t").option("header", "true").load(ratings_key)
    ratings_dataframe = ratings_dataframe.toDF("tconstTmp", "averageRating", "numVotes")
    aggregate_dataframe = aggregate_dataframe.join(ratings_dataframe,
                                                   aggregate_dataframe.tconst == ratings_dataframe.tconstTmp,
                                                   how='left').drop('tconstTmp')

    aggregate_dataframe = merge_calc_config(spark, bucket, job_id, aggregate_dataframe, 'ynorm')
    aggregate_dataframe = merge_calc_config(spark, bucket, job_id, aggregate_dataframe, 'yp90')
    aggregate_dataframe = merge_calc_config(spark, bucket, job_id, aggregate_dataframe, 'yp10')

    results_folder = 's3://' + bucket + '/athena/job_id=' + job_id + '/'
    aggregate_dataframe.repartition(10).write.parquet(results_folder, mode="overwrite")

    job.commit()
Example #14
def main():
    ## @params: [JOB_NAME]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE'
    ])

    sc = SparkContext()
    # avoid Spark creating $folder$ marker objects in S3
    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs",
                    "false")
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)
    job.init(args['JOB_NAME'], args)

    dataLakeBucket = args["DL_BUCKET"]
    dataLakePrefix = args["DL_PREFIX"]
    aws_region = args["DL_REGION"]
    glue_database = args["GLUE_SRC_DATABASE"]
    process(job, glueContext, glue_database, dataLakeBucket, dataLakePrefix,
            aws_region)
Example #15
def create_job(glueContext, args):
    job = Job(glueContext)
    job.init(args["JOB_NAME"], args)
    return job
Example #16
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import col

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glue_context = GlueContext(sc)
spark = glue_context.spark_session

logger = glue_context.get_logger()
job = Job(glue_context)

input_path = "s3://"
output_path = "s3://"

# Load the test CSV file for our operations
# Rename multiple columns in Spark
df1 = spark.read.format("csv").option("header", "true").load(input_path)
df1 = df1.withColumnRenamed("Semiotic Class",
                            "semioticclass").withColumnRenamed(
                                "Input Token", "inputtoken").withColumnRenamed(
                                    "Output Token", "outputtoken")

# Log the schema before conversion (printSchema() prints and returns None, so use the schema object instead)
schema = df1.schema.simpleString()
logger.info(f'the schema before conversion is {schema}')
Example #17
email_id,source_update_dtm,
source_creation_dtm,pohne_nbr, 
full_nam, last_nam, fe_nam, 
fe_id, fe_idn, source_name,
posting_agent, data_origin,
load_dtm, update_dtm, versn_flg, 
hr_sso, hr_nam, mgr_sso,
mgr_nam, prt_persn_d_md5, prt_persn_d_md5_scd1,
start_dtm, end_dtm, current_flag_ind,
deleted_flag_ind"""

# Initialize the GlueContext and SparkContext
SC = SparkContext()
GLUECONTEXT = GlueContext(SC)
SPARK = GLUECONTEXT.spark_session
JOB = Job(GLUECONTEXT)
JOB.init(ARGS['JOB_NAME'], ARGS)
RUN_ID = ARGS['JOB_RUN_ID']
JOB_NAME = ARGS['JOB_NAME']
TEMPDIR = ARGS['TempDir']

#CurrentTimeStamp
DATETIMESTAMP = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
END_DTM = '9999-12-31 23:59:59'
JOB_START_TIME = timeit.default_timer()

# creating DataFrames from source glue crawlers

#SRC_NOTEMPTY = True

Example #18
def main():
    ## @params: [JOB_NAME, db_name, entity_name, partition_column, output_bucket_name, datetime_column,date_column]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'partition_column', 'output_bucket_name',
        'primary_key', 'parallelism', 'date_column', 'datetime_column'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    date_column = args['date_column']
    datetime_column = args['datetime_column']
    hudi_primary_key = args['primary_key']
    output_bucket_name = args['output_bucket_name']
    parallelism = args['parallelism']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name

    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    spark = SparkSession.builder.config(
        'spark.serializer',
        'org.apache.spark.serializer.KryoSerializer').getOrCreate()
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)
    logger = glue_context.get_logger()
    logger.info('Initialization.')

    # Initialization of Glue client to connect to Glue Catalog and retrieve table information
    glueClient = boto3.client('glue')

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c) if dict(input_data.dtypes)[c] ==
        'double' else col(c) for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn('etl_processing_datetime', unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    isTableExists = False
    try:
        glueClient.get_table(DatabaseName=clean_db_name,
                             Name=target_entity_name)
        isTableExists = True
        logger.info(clean_db_name + '.' + target_entity_name + ' exists.')
    except ClientError as e:
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            isTableExists = False
            logger.info(clean_db_name + '.' + target_entity_name +
                        ' does not exist. Table will be created.')

    partition_path = '' if partition_column == 'None' else partition_column

    common_config = {
        'className': 'org.apache.hudi',
        'hoodie.datasource.hive_sync.use_jdbc': 'false',
        'hoodie.index.type': 'GLOBAL_BLOOM',
        'hoodie.datasource.write.precombine.field': datetime_column,
        'hoodie.datasource.write.recordkey.field': hudi_primary_key,
        'hoodie.table.name': target_entity_name,
        'hoodie.consistency.check.enabled': 'true',
        'hoodie.datasource.hive_sync.database': clean_db_name,
        'hoodie.datasource.hive_sync.table': target_entity_name,
        'hoodie.datasource.hive_sync.enable': 'true',
        'hoodie.datasource.write.partitionpath.field': partition_path,
        'hoodie.datasource.hive_sync.partition_fields': partition_path,
        'hoodie.datasource.hive_sync.partition_extractor_class':
            'org.apache.hudi.hive.NonPartitionedExtractor'
            if partition_column == 'None'
            else 'org.apache.hudi.MultiPartKeysValueExtractor',
        'hoodie.datasource.write.hive_style_partitioning':
            'false' if partition_column == 'None' else 'true',
        'hoodie.datasource.write.keygenerator.class':
            'org.apache.hudi.keygen.NonpartitionedKeyGenerator'
            if partition_column == 'None'
            else 'org.apache.hudi.keygen.SimpleKeyGenerator',
    }

    incremental_config = {
        'hoodie.upsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert',
        'hoodie.cleaner.policy': 'KEEP_LATEST_COMMITS',
        'hoodie.cleaner.commits.retained': 10
    }

    initLoad_config = {
        'hoodie.bulkinsert.shuffle.parallelism': parallelism,
        'hoodie.datasource.write.operation': 'upsert'
    }

    if isTableExists:
        logger.info('Incremental upsert.')
        combinedConf = {**common_config, **incremental_config}
        enriched_data.write.format('org.apache.hudi').options(**combinedConf) \
            .mode('Append').save("s3://" + output_bucket_name + "/" + clean_table_name)
    else:
        logger.info('Initial load.')
        combinedConf = {**common_config, **initLoad_config}
        enriched_data.write.format('org.apache.hudi').options(**combinedConf) \
            .mode('Overwrite').save("s3://" + output_bucket_name + "/" + clean_table_name)

    job.commit()
Example #19
# Glue script to read from S3, filter the data and write it to DynamoDB.
# The S3 data is first read with the Spark session (a GlueContext read could be used as well); Spark is used here
# just to illustrate that a DataFrame can be converted to a DynamicFrame.
import sys
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark import SparkContext

args = getResolvedOptions(sys.argv,
                          ['JOB_NAME', 's3_file_path', 'dynamodb_table'])
sc = SparkContext()
glue_ctx = GlueContext(sc)
spark = glue_ctx.spark_session
job = Job(glue_ctx)
job.init(args['JOB_NAME'], args)

s3_file_path = args['s3_file_path']
dynamodb_table = args['dynamodb_table']

df_src = spark.read.format("csv").option("header", "true").load(s3_file_path)
df_fil_dept = df_src.filter(df_src.dept_no == 101)

dyf_result = DynamicFrame.fromDF(df_fil_dept, glue_ctx, "dyf_result")

glue_ctx.write_dynamic_frame_from_options(
    frame=dyf_result,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.output.tableName": dynamodb_table,
        "dynamodb.throughput.write.percent": "1.0",
    })
Example #20
def __repartition_and_store(result_df, args, glueContext):
    # The original def line is missing from this excerpt; the signature is inferred
    # from the call in main() below, and target_path is assumed to come from the
    # resolved 'target_path' job argument.
    target_path = args['target_path']
    repartitioned_stream = DynamicFrame.fromDF(result_df, glueContext, 'result_df') \
        .repartition(2)

    glueContext.purge_s3_path(s3_path=target_path,
                              options={"retentionPeriod": 0})
    glueContext.write_dynamic_frame_from_options(
        frame=repartitioned_stream,
        connection_type="s3",
        connection_options={"path": target_path},
        format="glueparquet")


def main(argv, glueContext, job):
    args = getResolvedOptions(
        argv, ["JOB_NAME", "database_name", "source_table", "target_path"])
    job.init(args['JOB_NAME'], args)

    staging_df = __load_staging_data(args, glueContext)
    mapped_staging = __map_staging_data(staging_df)
    unioned_data = __union_with_existing_data(mapped_staging, args,
                                              glueContext)
    merged_result = __merge_rows(unioned_data)

    __repartition_and_store(merged_result, args, glueContext)
    job.commit()


if __name__ == '__main__':
    glueContext = GlueContext(SparkContext.getOrCreate())
    main(sys.argv, glueContext, Job(glueContext))
Example #21
def main(args):
    if args.verbose:
        print("Got arguments: %s" % (args))

    glue_client = boto3.client("glue", region_name=args.region)
    s3_client = boto3.client("s3", region_name=args.region)

    # Verify source DB and table exist
    if validate_db(args.athenaDatabase,
                   glue_client=glue_client) and args.verbose:
        print("Validated source database %s exists." % (args.athenaDatabase))
    if (validate_table(
            args.athenaDatabase, args.athenaTable, glue_client=glue_client)
            and args.verbose):
        print("Validated source table %s exists." % (args.athenaTable))

    # Verify input and output buckets exist and are accessible.
    for bucket in [args.outputBucket, args.inputBucket]:
        if validate_bucket(bucket, s3_client=s3_client) and args.verbose:
            print("Verified bucket s3://%s exists and is accessible." %
                  (args.outputBucket))

    # Use latest file in bucket that matches prefix string as input.
    input_csv, latest_dt = get_latest_file(args.inputBucket,
                                           prefix=args.inputPrefix,
                                           s3_client=s3_client)

    if not input_csv:
        raise Exception(
            "Found no candidate CSV files in bucket %s with prefix %s." % (
                args.inputBucket,
                "(no prefix)" if not args.inputPrefix else args.inputPrefix,
            ))

    print("Got latest CSV file s3://%s/%s with write time %s." %
          (args.inputBucket, input_csv, latest_dt))

    sc = SparkContext()
    gc = GlueContext(sc)
    sparkSession = gc.spark_session
    job = Job(gc)
    job.init(args.JOB_NAME, vars(args))
    sparkSession.udf.register("json_clean", json_clean)

    # For requester payer
    sparkSession._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader",
                                                "true")
    gc._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true")

    # Get minimum hour for existing data
    prelim_min_hour = 0
    aligner = DAILY_ALIGNER if args.fullDays else HOURLY_ALIGNER
    # * 1000000 needed since start_time is in microseconds
    prelim_min_hour = ((
        (START_TIME -
         (3600 * args.maxHoursAgo)) // aligner) * aligner) * 1000000
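    # Worked example (values assumed for illustration): with START_TIME = 1700003000,
    # maxHoursAgo = 24 and an hourly aligner of 3600 seconds, this is
    # ((1700003000 - 86400) // 3600) * 3600 = 1699916400, the hour boundary in seconds,
    # and the trailing * 1000000 converts it to microseconds to match start_time.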

    # Get CSV file contents, filtering out lines that don't fall within --maxHoursAgo.
    input_df = (gc.create_dynamic_frame_from_options(
        "s3",
        {
            "paths": ["s3://%s/%s" % (args.inputBucket, input_csv)]
        },
        "csv",
        {
            "withHeader": True
        },
    ).toDF().select("*"))
    print("Input DF count: %s" % (input_df.count()))
    csv_prelim_df = input_df.filter("cast(start_time as bigint) >= %s" %
                                    (prelim_min_hour))
    print("Date-bounded DF count: %s" % (csv_prelim_df.count()))

    # If --hourly is not set, query for policy hits for current hour
    # and previous 24 hours.
    # When --hourly is set, query for policy hits only for the current
    # and previous hour. (Querying for the previous hour is necessary)
    # as the previous CSV generation job may have run in the middle of the
    # hour, meaning new events for that hour may exist.)
    # (now - (now % 3600)) = start of the current hour
    # ((now - (now % 3600)) - 3600) = start of the previous hour
    hours_ago = 1 if args.hourly else 24
    increm_min_hour_sec = (START_TIME -
                           (START_TIME % 3600)) - (3600 * hours_ago)
    increm_min_hour_micro = increm_min_hour_sec * 1000000

    # When nonStrict is set, query for the latest hourly epoch in the input file
    # as well as all subsequent hourly epochs. This query may be more expensive
    # to complete, but allows for the possibility of filling in gaps if previous
    # runs of the job failed or were paused.
    if args.nonStrict:
        # We may need to re-query for transactions in the latest epoch depending on
        # when the previous query was run. Filter these rows out as well, then
        # set newest_epoch to our minimum hour epoch.
        last_seen_epoch = (
            input_df.selectExpr("cast(start_time as bigint) as ts")
            .agg({"ts": "max"})
            .collect()
            .pop()[0])
        last_seen_hour = ((last_seen_epoch // 1000000) // aligner) * aligner
        # Make sure we don't go outside our --maxHoursAgo range.
        # This check is necessary as increm_min_hour_sec is used in the pushdown
        # predicate when querying against the Athena source table/view.
        increm_min_hour_sec = max(last_seen_hour, increm_min_hour_sec)
        increm_min_hour_micro = increm_min_hour_sec * 1000000

    print("Set minimum hour in seconds: %s" % (increm_min_hour_sec))

    csv_hits_in_range = csv_prelim_df.filter(
        "cast(start_time as bigint) < %s" % (increm_min_hour_micro))
    print("Total CSV hits in timeframe of interest: %s" %
          (csv_hits_in_range.count()))

    # Set timeframe for pushdown predicate
    pushdown = "(hour >= %s)" % (increm_min_hour_sec)

    # Get table contents starting from minimum hour settings.
    raw_data = gc.create_dynamic_frame.from_catalog(
        database=args.athenaDatabase,
        table_name=args.athenaTable,
        transformation_ctx="raw_data",
        push_down_predicate=pushdown,
    )

    # Determine what fields we should extract based on table definition.
    # Deriving the fields from the table schema is preferable to inferring
    # it from the underlying parquet because the two may differ.
    tmp_schema = get_table_schema(args.athenaDatabase,
                                  args.athenaTable,
                                  glue_client=glue_client)
    # Verify we only have one matching column on which to explode.
    if not [col[0] for col in tmp_schema].count(args.policyType) == 1:
        raise Exception(
            "Wrong number of matching %s columns found in schema: %s" %
            (args.policyType, tmp_schema))
    # Sub out the policies/parent_policies field for an explode expression.
    # Lightly modify other fields to promote clean and consistent writes to CSV file.
    schema = []
    for col in tmp_schema:
        if col[0] == args.policyType:
            schema.append("explode(%s) as policy" % (col[0]))
        elif "map" in col[1].lower() or "struct" in col[1].lower():
            schema.append("json_clean(%s) as %s" % (col[0], col[0]))
        else:
            schema.append("cast(%s as string)" % (col[0]))

    # Get targeted policies/parent_policies.
    # Should be passed to SparkSQL as quoted strings.
    pol_arr = ", ".join([
        "'%s'" % (pol) for pol in args.policyStrings.split(args.delimiter)
        if pol
    ])

    # Filter table contents according to policy hits.
    # A non-null policies field implies a non-null parent_policies field
    # (and vice versa), so it's OK to filter on just one.

    new_hits = (raw_data.toDF().filter("policies is not NULL").selectExpr(
        *schema).filter("policy in (%s)" % (pol_arr)))
    print("New policy triggers found since last job: %s" % (new_hits.count()))

    # Combine newly collected policy hits with dataframe of previous CSV contents.
    write_df = new_hits.union(csv_hits_in_range).orderBy(
        "start_time").coalesce(1)

    uniq = hash_key(args.salt, args.ordinal, args.subscriber, args.receiver)
    s3_loc = "s3://%s" % (os.path.join(args.outputBucket, args.outputDir,
                                       uniq))
    if args.verbose:
        print("S3 Results Location: %s" % s3_loc)

    write_df.write.option("quoteAll", True).csv(s3_loc, header=True)

    # Rename output file, if requested.
    if args.outputFilename:
        rename_resp = rename_file(
            args.outputBucket,
            args.outputFilename,
            prefix=args.outputDir,
            dont_preserve_dir=args.dontPreserveOutputDir,
            keep_orig=args.keepOrigOnRename,
        )

        if rename_resp and args.verbose:
            print("Renamed file to %s/%s." %
                  (args.outputDir, args.outputFilename))

    job.commit()
Example #22
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import NullType, StructType, StructField
from pyspark.sql.types import StringType, DateType, LongType, ArrayType
from pyspark.sql.functions import *
import itertools
import boto3
import time


args = getResolvedOptions(sys.argv, ['JOB_NAME', 'bucketName' ,'workspace', 'jdbc_url', 'username', 'pswd'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

#ccn-alerts-104328-{workspace}-sfmc-migration
workspace = args['workspace']
bucketName = args['bucketName']
jdbc_url = args['jdbc_url']
username = args['username']
pswd = args['pswd']

print('args passed are: {}, {}, {}, {}'.format(workspace, bucketName, jdbc_url, username))

s3Location = 's3://{}'.format(bucketName)
glue_database = '{}_sfmc_migration'.format(workspace)
outputLocation = s3Location + '/athena-results-ccn-data/'
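The rest of this script is not included in the snippet. A sketch of how the JDBC settings resolved above might typically be used with the Spark session created earlier (the table name is a placeholder, not from the original job):

# Hypothetical usage of the JDBC settings resolved above; 'some_schema.some_table'
# is a placeholder table name used only for illustration.
jdbc_df = (spark.read.format('jdbc')
           .option('url', jdbc_url)
           .option('dbtable', 'some_schema.some_table')
           .option('user', username)
           .option('password', pswd)
           .load())
print('row count: {}'.format(jdbc_df.count()))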
Example #23
def main():
    ## @params: [JOB_NAME, db_name, entity_name, datetime_column, date_column, partition_column, output_bucket_name]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'datetime_column', 'date_column',
        'partition_column', 'output_bucket_name'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    datetime_column = args['datetime_column']
    date_column = args['date_column']
    output_bucket_name = args['output_bucket_name']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name

    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c) if dict(input_data.dtypes)[c] ==
        'double' else col(c) for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn('etl_processing_datetime', unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    ## @type: DataSink
    ## @args: [connection_type = "s3", connection_options = {"path": "s3://<output_bucket_name>/clean/<entity_name>", "enableUpdateCatalog": "True", "updateBehavior": "UPDATE_IN_DATABASE", "partitionKeys" : "[<partition_key>]"}, format = "glueparquet"]
    ## @return: sink
    ## @inputs: [frame = enriched_data]

    sink = glue_context.getSink(connection_type="s3",
                                path="s3://" + output_bucket_name + "/" +
                                clean_table_name,
                                enableUpdateCatalog=True,
                                updateBehavior="UPDATE_IN_DATABASE",
                                partitionKeys=[partition_column])
    sink.setFormat("glueparquet")
    sink.setCatalogInfo(catalogDatabase=clean_db_name,
                        catalogTableName=clean_table_name)
    sink.writeFrame(DynamicFrame.fromDF(enriched_data, glue_context, 'result'))

    job.commit()